#!/usr/bin/env python3
"""
Read and print specific records from the lic.parquet file found in a
numbered directory under the data/ subdirectory of a dataset root.

Example usage:
    python3 script.py --records 1-5 -c

Command-line options:
    dir                    dataset root directory (optional positional)
    -h, --help             show this help message and exit
    -r R, --records R      record number or range to print (e.g., 1, 5-7)
    -c, --color            colorize the output (requires termcolor)
    -l, --list-licenses    list unique licenses in the file

When --records is not given, the unique licenses are listed.
"""

import argparse
import os
import re

import pandas as pd

try:
    from termcolor import colored
except ImportError:  # termcolor is only needed when --color is requested
    colored = None

# Dataset root used when no directory is given on the command line.
DEFAULT_DATASET_DIR = "/srv/ml/huggingface/datasets/bigcode/the-stack-metadata"

# Splits a string into alternating non-digit / digit pieces; because of the
# capturing group, the digit runs land at the odd indices of the result.
_DIGIT_RUN = re.compile(r"([0-9]+)")


def _natural_key(text):
    """Return a sort key that orders embedded ASCII digit runs numerically."""
    # Only the captured digit runs (odd indices) are converted, so int() is
    # never applied to characters that merely satisfy str.isdigit().
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(_DIGIT_RUN.split(text))
    ]


def get_records(dataframe: pd.DataFrame, args: argparse.Namespace) -> pd.DataFrame:
    """
    Extract records from a DataFrame based on a user-specified range.

    Records are numbered from 1 and selected by position, regardless of the
    DataFrame's index labels.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame to extract records from.
    args (Namespace): Parsed command line arguments; ``args.records`` holds
        either a single record number ("3") or an inclusive range ("5-7").

    Returns:
    DataFrame: The extracted records as a new DataFrame. A range that runs
        past the end of the data is truncated to the rows that exist.

    Raises:
    ValueError: If the specification is not ``N`` or ``N-M``, if a record
        number is below 1, or if the range is reversed.
    IndexError: If a single record number lies beyond the last record.
    """
    spec = str(args.records)
    first, dash, last = spec.partition("-")
    try:
        start = int(first)
        end = int(last) if dash else start
    except ValueError:
        raise ValueError(
            f"invalid record specification {spec!r}; expected N or N-M"
        ) from None

    # Without this check "0" would become position -1 and silently select
    # the last row, and "0-5" would produce an empty slice.
    if start < 1 or end < start:
        raise ValueError(
            f"invalid record specification {spec!r}; "
            "records are numbered from 1 and a range must not be reversed"
        )

    if not dash:
        return dataframe.iloc[[start - 1]]
    return dataframe.iloc[start - 1 : end]


def print_records(dataframe: pd.DataFrame, color: bool) -> None:
    """
    Print the records in a DataFrame with optional colorization.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame to print.
    color (bool): If True, print each field as a red "name: " followed by a
        blue value, with a blank line after every record. If False, print
        each row using pandas' default Series representation.

    Raises:
    RuntimeError: If color is requested but termcolor is not installed.
    """
    if color and colored is None:
        raise RuntimeError("colorized output requires the 'termcolor' package")

    for _, row in dataframe.iterrows():
        if not color:
            print(row)
            continue
        for col in row.index:
            print(colored(f"{col}: ", "red") + colored(f"{row[col]}", "blue"))
        print()


def print_unique_licenses(dataframe: pd.DataFrame) -> None:
    """
    Print the unique licenses in a DataFrame in natural sort order.

    Natural order means digit runs compare numerically. Null entries in the
    "license" column are skipped.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame to extract licenses from;
        it must have a "license" column.
    """
    names = dataframe["license"].dropna().unique().tolist()
    for name in sorted(names, key=_natural_key):
        print(name)


def main(argv=None):
    """
    Parse command line arguments and run the script.

    Parameters:
    argv (list of str, optional): Arguments to parse instead of
        ``sys.argv[1:]``; mainly useful for calling the script from code.

    Raises:
    SystemExit: With an error message if the data directory, a
        sub-directory, or lic.parquet cannot be found, or (via argparse) if
        the arguments are invalid.
    """
    parser = argparse.ArgumentParser(
        description="Specify the directory and record range to use"
    )
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default=DEFAULT_DATASET_DIR,
        help="directory to use",
    )
    parser.add_argument(
        "--records",
        "-r",
        metavar="R",
        type=str,
        nargs="?",
        help="record number or range to print (e.g., 1, 5-7)",
    )
    parser.add_argument(
        "--color", "-c", action="store_true", help="Colorize the output"
    )
    parser.add_argument(
        "--list-licenses",
        "-l",
        action="store_true",
        help="List unique licenses in the file",
    )
    args = parser.parse_args(argv)

    if args.color and colored is None:
        parser.error("--color requires the 'termcolor' package")

    data_dir = os.path.join(args.directory, "data/")
    if not os.path.isdir(data_dir):
        raise SystemExit("Directory does not exist")

    subdirs = [
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    ]
    if not subdirs:
        raise SystemExit("No subdirectories in the directory")

    # os.listdir order is arbitrary; pick the natural-order first entry so
    # the same sub-directory is used on every run.
    chosen_dir = os.path.join(data_dir, min(subdirs, key=_natural_key))
    parquet_path = os.path.join(chosen_dir, "lic.parquet")
    if not os.path.exists(parquet_path):
        raise SystemExit("No lic.parquet file found")

    df = pd.read_parquet(parquet_path)

    if args.records:
        try:
            records = get_records(df, args)
        except (ValueError, IndexError) as exc:
            parser.error(f"--records {args.records}: {exc}")
        print_records(records, args.color)

    # Listing licenses stays the default action when no records are requested.
    if args.list_licenses or not args.records:
        print_unique_licenses(df)


if __name__ == "__main__":
    main()