parrot-datasets/src/the_smack/the_stack_headers.py

#!/usr/bin/env python3
#
# Parrot with code assist from Phind-CodeLlama-34B-v2
"""Script to read the headers of The Stack metadata parquet files.

The script expects the main directory of The Stack metadata as a command-line
argument. If no directory is provided, it defaults to
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata'."""
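# A typical invocation, assuming the dataset has been downloaded either to the
# default location above or to some other local path (the directory argument is
# optional; the second path below is just illustrative):
#
#   ./the_stack_headers.py
#   ./the_stack_headers.py /path/to/the-stack-metadata
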
import argparse
import os
import pandas as pd
from termcolor import colored


def read_parquet(file):
    """Return the column names of a parquet file."""
    df = pd.read_parquet(file)
    return df.columns
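# Note: pd.read_parquet loads the whole file just to report its header. A
# lighter-weight sketch, assuming pyarrow is installed, would read only the
# parquet schema instead:
#
#   import pyarrow.parquet as pq
#
#   def read_parquet_columns(file):
#       return pq.read_schema(file).names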


def main():
    # Create argument parser
    parser = argparse.ArgumentParser(description="Specify the directory to use")
    # Add an argument for specifying the directory
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    # Parse the arguments
    args = parser.parse_args()
    the_stack_meta_path = args.directory
    # Check if the directory exists
    if not os.path.isdir(the_stack_meta_path):
        print("Directory does not exist")
    else:
        subdirs = os.listdir(os.path.join(the_stack_meta_path, "data"))
        # If there are no subdirectories
        if not subdirs:
            print("No subdirectories in the directory")
        else:
            # Descend into the first subdirectory under data/
            the_stack_meta_path = os.path.join(the_stack_meta_path, "data", subdirs[0])
            files = os.listdir(the_stack_meta_path)
            parquet_files = [f for f in files if f.endswith(".parquet")]
            if len(parquet_files) < 3:
                print("Less than 3 parquet files in the directory")
            else:
                # Collect the column names of each parquet file
                columns = []
                for file in parquet_files:
                    col = read_parquet(os.path.join(the_stack_meta_path, file))
                    columns.append((file, col))
                # Print each file name in bold, followed by its columns
                for file, col in columns:
                    print("\n" + colored(file, attrs=["bold"]) + ":")
                    for column in col:
                        print(column)


if __name__ == "__main__":
    main()