parrot-datasets/src/the_stack/the_stack_licenses.py

141 lines
4.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
This script is designed to read and print specific records from the lic.parquet file in a numbered directory under the data/ subdirectory.
Example usage: python3 script.py --records 1-5 -c
Command-line options:
-h, --help show this help message and exit
--version show program's version number and exit
-r RANGE, --records=RANGE
record number or range to print (e.g., 1, 5-7)
-c, --color colorize the output
-l, --list-licenses list unique licenses in the file
"""
import argparse
import os
import pandas as pd
import re
from termcolor import colored
def get_records(dataframe, args):
    """
    Extract records from a DataFrame based on a user-specified number or range.

    Record numbers are 1-based and ranges are inclusive, so "5-7" selects the
    fifth, sixth and seventh rows by position.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame to extract records from.
        args (Namespace): Parsed command line arguments; only ``args.records``
            is read, holding either a single number ("3") or a range ("5-7").

    Returns:
        DataFrame: The selected rows as a new DataFrame.

    Raises:
        ValueError: If the specification is not numeric, or if it names a
            record number below 1.
        IndexError: If a single record number lies past the end of the data.
    """
    spec = args.records
    is_range = "-" in spec
    if is_range:
        start, end = map(int, spec.split("-"))
    else:
        start = end = int(spec)

    # Without this check, record 0 would become index -1 and silently select
    # the last row instead of reporting the mistake.
    if start < 1:
        raise ValueError(f"record numbers start at 1, got {spec!r}")

    if is_range:
        # Positional slice: 1-based inclusive range -> 0-based half-open slice.
        return dataframe.iloc[start - 1 : end]
    return dataframe.iloc[[start - 1]]
def print_records(dataframe, color):
    """
    Write every row of a DataFrame to standard output.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame whose rows are printed.
        color (bool): When true, each field is printed on its own line as a
            red "name: " label followed by the value in blue, and a blank
            line separates rows. When false, each row is printed using its
            default pandas representation.
    """
    for _, record in dataframe.iterrows():
        if not color:
            print(record)
            continue

        for field in record.index:
            label = colored(f"{field}: ", "red")
            value = colored(f"{record[field]}", "blue")
            print(label + value, end="\n")
        # Blank line between consecutive records.
        print()
def print_unique_licenses(dataframe):
    """
    Print the unique licenses in a DataFrame, one per line, in natural order.

    Natural order compares runs of ASCII digits as numbers, so "GPL-2" is
    listed before "GPL-10". Missing license values are skipped.

    Parameters:
        dataframe (DataFrame): The pandas DataFrame to extract licenses from;
            its "license" column is read.
    """

    def natural_key(name):
        # re.split with a capturing group alternates text and digit runs, so
        # odd positions are always the captured digits. Deciding by position
        # (rather than str.isdigit) keeps every position a consistent type and
        # avoids int() on non-ASCII digit characters the pattern did not match.
        parts = re.split(r"([0-9]+)", name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    # dropna: a missing value is not a license and cannot be split by re.
    unique_names = dataframe["license"].dropna().unique().tolist()
    for name in sorted(unique_names, key=natural_key):
        print(name)
def main():
    """
    Parse command line arguments and print records or licenses from lic.parquet.

    The file is looked up as ``<dir>/data/<subdir>/lic.parquet``, where
    ``<subdir>`` is the first subdirectory of ``<dir>/data`` in sorted name
    order. With ``--records`` the selected records are printed; with
    ``--list-licenses``, or when no records are requested, the unique
    licenses are listed.

    Raises:
        SystemExit: With an explanatory message (non-zero exit status) when
            the data directory, a subdirectory, or lic.parquet is missing;
            argparse also raises it for invalid arguments.
    """
    import sys

    parser = argparse.ArgumentParser(
        description="Specify the directory and record range to use"
    )
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    parser.add_argument(
        "--records",
        "-r",
        metavar="R",
        type=str,
        nargs="?",
        help="record number or range to print (e.g., 1, 5-7)",
    )
    parser.add_argument(
        "--color", "-c", action="store_true", help="Colorize the output"
    )
    parser.add_argument(
        "--list-licenses",
        "-l",
        action="store_true",
        help="List unique licenses in the file",
    )
    args = parser.parse_args()

    data_dir = os.path.join(args.directory, "data")
    if not os.path.isdir(data_dir):
        sys.exit("Directory does not exist")

    # os.listdir returns entries in arbitrary order; sort so the same
    # subdirectory is chosen on every run.
    subdirs = sorted(
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )
    if not subdirs:
        sys.exit("No subdirectories in the directory")

    # exists() rather than isfile(): any entry named lic.parquet is accepted,
    # including a directory.
    parquet_path = os.path.join(data_dir, subdirs[0], "lic.parquet")
    if not os.path.exists(parquet_path):
        sys.exit("No lic.parquet file found")

    df = pd.read_parquet(parquet_path)
    if args.records:
        print_records(get_records(df, args), args.color)
    # Honour --list-licenses explicitly; listing also remains the default
    # action when no records were requested.
    if args.list_licenses or not args.records:
        print_unique_licenses(df)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()