Read metadata scriptlet

main
Jeff Moe 2023-11-24 15:42:20 -07:00
parent d69fd04acd
commit 4705e4ffb7
2 changed files with 30 additions and 0 deletions

View File

@ -0,0 +1,26 @@
#!/usr/bin/env python3
#
# Parroted with code assist from Phind-CodeLlama-34B-v2
import os
import pandas as pd
# Directory holding the parquet metadata files to inspect
the_stack_meta_path = '/srv/ml/huggingface/datasets/bigcode/the-stack-metadata/data/943_944'


def print_parquet_headers(directory, min_files=3):
    """Print the column names of every ``.parquet`` file in *directory*.

    Args:
        directory: Path of the directory to scan (not recursive).
        min_files: Minimum number of parquet files that must be present
            before any of them is read. Defaults to 3.

    Prints a one-line message and returns without reading anything when
    *directory* is not a directory or holds fewer than *min_files* parquet
    files. Errors raised by ``pd.read_parquet`` are not caught.
    """
    if not os.path.isdir(directory):
        print("Directory does not exist")
        return

    # os.listdir() returns names in arbitrary order; sort them so the output
    # is the same from run to run.
    parquet_files = sorted(
        name for name in os.listdir(directory) if name.endswith('.parquet')
    )
    if len(parquet_files) < min_files:
        print("Less than {0} parquet files in the directory".format(min_files))
        return

    for name in parquet_files:
        # NOTE: read_parquet loads the whole file even though only the
        # column labels are used below.
        df = pd.read_parquet(os.path.join(directory, name))
        # Column labels are passed through str() so that join() cannot fail
        # on a non-string label.
        headers = ', '.join(map(str, df.columns))
        print("Headers of file {0}: {1}".format(name, headers))


print_parquet_headers(the_stack_meta_path)

View File

@ -0,0 +1,4 @@
datasets
tqdm
pandas
pathlib