# parrot-datasets/src/the_stack/the_stack_licenses.py

#!/usr/bin/env python3
"""
This script is designed to read and print specific records from the lic.parquet file in a numbered directory under the data/ subdirectory.

Example usage: python3 script.py --records 1-5 -c

Command-line options:
  -h, --help            show this help message and exit
  --version             show program's version number and exit
  -r RANGE, --records=RANGE
                        record number or range to print (e.g., 1, 5-7)
  -c, --color           colorize the output
  -l, --list_licenses   list unique licenses in the file
"""

import argparse
import os
import pandas as pd
import re
from termcolor import colored


def get_records(dataframe, args):
    """
    Extract records from a DataFrame based on a user-specified number or range.

    Record numbers are 1-based and ranges are inclusive, so "5-7" selects the
    fifth through seventh rows by position. A range that extends past the end
    of the DataFrame is clipped to the rows that exist.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame to extract records from.
        args (Namespace): A namespace object containing parsed command line
            arguments; only ``args.records`` ("N" or "N-M") is read.

    Returns:
        DataFrame: The extracted records as a new DataFrame.

    Raises:
        ValueError: If the specification is not "N" or "N-M" with integer
            parts, or if a record number is less than 1.
        IndexError: If a single record number is past the end of the DataFrame.
    """
    spec = args.records.strip()
    is_range = "-" in spec
    if is_range:
        start, end = map(int, spec.split("-"))
    else:
        start = end = int(spec)

    # Without this check, record 0 would turn into position -1 and silently
    # return rows counted from the end of the DataFrame.
    if start < 1 or end < 1:
        raise ValueError(
            f"record numbers start at 1, got {args.records!r}"
        )

    if is_range:
        # Positional slice: convert the 1-based inclusive range to 0-based half-open.
        return dataframe.iloc[start - 1 : end]
    # The list indexer keeps the result a DataFrame and raises IndexError
    # when the requested record does not exist.
    return dataframe.iloc[[start - 1]]


def print_records(dataframe, color):
    """
    Write every row of a DataFrame to standard output.

    Without color, each row is printed using its default pandas
    representation. With color, each field is printed on its own line as
    "name: value" (name in red, value in blue), followed by a blank line
    after each row.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame to print.
        color (bool): If True, colorize the output.
    """
    for _, record in dataframe.iterrows():
        if not color:
            print(record)
            continue

        for field in record.index:
            label = colored(f"{field}: ", "red")
            value = colored(f"{record[field]}", "blue")
            print(label + value)
        # Blank line separates consecutive records.
        print()


def print_unique_licenses(dataframe):
    """
    Print the unique licenses in a DataFrame, one per line, in natural order.

    Natural order compares runs of ASCII digits numerically, so "GPL-2" is
    listed before "GPL-10". Missing values in the "license" column are
    skipped.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame to extract licenses from;
            it must have a "license" column.
    """
    split_digits = re.compile(r"([0-9]+)").split

    def natural_key(text):
        # Splitting on a capturing group puts the digit runs at odd positions.
        # Converting by position (rather than str.isdigit) avoids int() failing
        # on characters such as "²" that isdigit() accepts but int() rejects.
        return [
            int(part) if position % 2 else part
            for position, part in enumerate(split_digits(text))
        ]

    # dropna() keeps None/NaN entries out of the sort key, which needs strings.
    names = dataframe["license"].dropna().unique().tolist()
    for name in sorted(names, key=natural_key):
        print(name)


def main():
    """
    Parse command line arguments and print records or licenses from lic.parquet.

    The file is looked up as <dir>/data/<subdir>/lic.parquet, where <subdir>
    is the first subdirectory of <dir>/data/ in sorted order. When --records
    is given, the selected records are printed. The unique licenses are
    listed when --list-licenses is given or when --records is omitted.

    Exits with a non-zero status and a message on standard error when the
    directory layout is not as expected or the --records value is invalid.
    """
    import sys  # local import: only this entry point needs it

    parser = argparse.ArgumentParser(
        description="Specify the directory and record range to use"
    )
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    parser.add_argument(
        "--records",
        "-r",
        metavar="R",
        type=str,
        nargs="?",
        help="record number or range to print (e.g., 1, 5-7)",
    )
    parser.add_argument(
        "--color", "-c", action="store_true", help="Colorize the output"
    )
    parser.add_argument(
        "--list-licenses",
        "-l",
        action="store_true",
        help="List unique licenses in the file",
    )
    args = parser.parse_args()

    data_dir = os.path.join(args.directory, "data/")
    if not os.path.isdir(data_dir):
        sys.exit("Directory does not exist")

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of subdirectory the same on every run.
    subdirs = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    if not subdirs:
        sys.exit("No subdirectories in the directory")

    parquet_path = os.path.join(data_dir, subdirs[0], "lic.parquet")
    if not os.path.exists(parquet_path):
        sys.exit("No lic.parquet file found")

    df = pd.read_parquet(parquet_path)

    if args.records:
        try:
            records = get_records(df, args)
        except (ValueError, IndexError) as exc:
            # parser.error prints the usage text plus this message and exits.
            parser.error(f"invalid --records value {args.records!r}: {exc}")
        print_records(records, args.color)

    # Honor an explicit --list-licenses; listing is also the default action
    # when no records were requested.
    if args.list_licenses or not args.records:
        print_unique_licenses(df)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()