Read metadata scriptlet
parent
d69fd04acd
commit
4705e4ffb7
|
@ -0,0 +1,26 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Parroted with code assist from Phind-CodeLlama-34B-v2
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
# Directory that is scanned for parquet files
the_stack_meta_path = '/srv/ml/huggingface/datasets/bigcode/the-stack-metadata/data/943_944'

# The script only reports headers when at least this many parquet files exist
MIN_PARQUET_FILES = 3


def list_parquet_files(directory):
    """Return the names of the ``.parquet`` entries in *directory*, sorted.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to make the report order reproducible between runs.
    """
    return sorted(
        name for name in os.listdir(directory) if name.endswith('.parquet')
    )


def format_headers(file_name, columns):
    """Build the report line listing *columns* for *file_name*.

    Column labels are passed through ``str`` so that a non-string label
    cannot make ``str.join`` raise ``TypeError``.
    """
    return "Headers of file {0}: {1}".format(
        file_name, ', '.join(map(str, columns))
    )


def main(directory=the_stack_meta_path):
    """Print the column headers of every parquet file in *directory*.

    Prints a message and returns early when *directory* does not exist or
    holds fewer than ``MIN_PARQUET_FILES`` parquet files.
    """
    # Check if the directory exists
    if not os.path.isdir(directory):
        print("Directory does not exist")
        return

    # Check if there are enough parquet files in the directory
    parquet_files = list_parquet_files(directory)
    if len(parquet_files) < MIN_PARQUET_FILES:
        print("Less than {0} parquet files in the directory".format(MIN_PARQUET_FILES))
        return

    # Read headers of each parquet file and print them
    for file_name in parquet_files:
        # NOTE(review): this loads the whole file just to read its column
        # names; a schema-only read would be cheaper but needs a parquet
        # library that this file does not import.
        df = pd.read_parquet(os.path.join(directory, file_name))
        print(format_headers(file_name, df.columns))


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,4 @@
|
|||
datasets
|
||||
tqdm
|
||||
pandas
|
||||
pathlib
|
Loading…
Reference in New Issue