parrot-datasets/src/the_smack/the_stack_headers.py

#!/usr/bin/env python3
#
# Parrot with code assist from Phind-CodeLlama-34B-v2
"""Script to read the headers of The Stack metadata parquet files.

The script expects the main directory of The Stack metadata as a command-line
argument. If no directory is provided, it defaults to
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata'."""
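# A typical invocation, assuming the dataset has been downloaded either to the
# default location above or to some other local path (the directory argument is
# optional; the second path below is just illustrative):
#
#   ./the_stack_headers.py
#   ./the_stack_headers.py /path/to/the-stack-metadata
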
import argparse
import os
import pandas as pd
from termcolor import colored


def read_parquet(file):
    """Return the column names of a parquet file."""
    df = pd.read_parquet(file)
    return df.columns
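# Note: pd.read_parquet loads the whole file just to report its header. A
# lighter-weight sketch, assuming pyarrow is installed, would read only the
# parquet schema instead:
#
#   import pyarrow.parquet as pq
#
#   def read_parquet_columns(file):
#       return pq.read_schema(file).names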


def main():
    # Create argument parser
    parser = argparse.ArgumentParser(description="Specify the directory to use")
    # Add an argument for specifying the directory
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    # Parse the arguments
    args = parser.parse_args()
    the_stack_meta_path = args.directory
    # Check if the directory exists
    if not os.path.isdir(the_stack_meta_path):
        print("Directory does not exist")
    else:
        subdirs = os.listdir(os.path.join(the_stack_meta_path, "data"))
        # If there are no subdirectories
        if not subdirs:
            print("No subdirectories in the directory")
        else:
            # Descend into the first subdirectory under data/
            the_stack_meta_path = os.path.join(the_stack_meta_path, "data", subdirs[0])
            files = os.listdir(the_stack_meta_path)
            parquet_files = [f for f in files if f.endswith(".parquet")]
            if len(parquet_files) < 3:
                print("Less than 3 parquet files in the directory")
            else:
                # Collect the column names of each parquet file
                columns = []
                for file in parquet_files:
                    col = read_parquet(os.path.join(the_stack_meta_path, file))
                    columns.append((file, col))
                # Print each file name in bold, followed by its columns
                for file, col in columns:
                    print("\n" + colored(file, attrs=["bold"]) + ":")
                    for column in col:
                        print(column)


if __name__ == "__main__":
    main()