71 lines
2.1 KiB
Python
Executable File
71 lines
2.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Parrot with code assist from Phind-CodeLlama-34B-v2
|
|
|
|
"""Script to read headers of The Stack metadata parquet files.
|
|
The script expects the main directory of The Stack metadata as a command line argument.
|
|
If no directory is provided, it uses
|
|
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
|
|
|
|
import argparse
|
|
import os
|
|
import pandas as pd
|
|
from termcolor import colored
|
|
|
|
|
|
def read_parquet(file):
|
|
df = pd.read_parquet(file)
|
|
return df.columns
|
|
|
|
|
|
def main():
    """Print the column names of the parquet files in one metadata subdirectory.

    The directory to inspect is taken from the optional positional command
    line argument (a built-in default path is used when it is omitted).
    The first subdirectory, in sorted order, of ``<directory>/data`` is
    selected, and for every ``*.parquet`` file in it the file name (in bold)
    followed by its column names is printed.

    A short message is printed, and nothing else is done, when the
    directory does not exist, has no ``data`` directory, ``data`` contains
    no subdirectories, or the selected subdirectory holds fewer than three
    parquet files.
    """
    parser = argparse.ArgumentParser(description="Specify the directory to use")
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    args = parser.parse_args()

    the_stack_meta_path = args.directory
    if not os.path.isdir(the_stack_meta_path):
        print("Directory does not exist")
        return

    # Without this check os.listdir() below would raise FileNotFoundError.
    data_path = os.path.join(the_stack_meta_path, "data")
    if not os.path.isdir(data_path):
        print("No data directory in the directory")
        return

    # os.listdir() returns plain files too and in arbitrary order: keep only
    # real directories and sort them so the same one is picked on every run.
    subdirs = sorted(
        entry
        for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )
    if not subdirs:
        print("No subdirectories in the directory")
        return

    subset_path = os.path.join(data_path, subdirs[0])
    parquet_files = sorted(
        name for name in os.listdir(subset_path) if name.endswith(".parquet")
    )
    if len(parquet_files) < 3:
        print("Less than 3 parquet files in the directory")
        return

    # Read every file before printing so the report is not interleaved with
    # anything emitted while the files are being loaded.
    columns = [
        (file, read_parquet(os.path.join(subset_path, file)))
        for file in parquet_files
    ]

    for file, col in columns:
        # ANSI escape codes: bold file name, then reset.
        print("\n\033[1m{0}\033[0m:".format(file))
        for column in col:
            print(column)
|
|
|
|
|
|
# Run the command line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
|