141 lines
4.0 KiB
Python
Executable File
141 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
This script is designed to read and print specific records from the lic.parquet file in a numbered directory under the data/ subdirectory.
|
|
|
|
Example usage: python3 script.py --records 1-5 -c
|
|
|
|
Command-line options:
|
|
-h, --help show this help message and exit
|
|
--version show program's version number and exit
|
|
-r RANGE, --records=RANGE
|
|
record number or range to print (e.g., 1, 5-7)
|
|
-c, --color colorize the output
|
|
-l, --list-licenses list unique licenses in the file
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import pandas as pd
|
|
import re
|
|
from termcolor import colored
|
|
|
|
|
|
def get_records(dataframe, args):
    """
    Extract records from a DataFrame based on a user-specified range.

    Record numbers are 1-based. A range such as "5-7" is inclusive on both
    ends; a range that runs past the last row is truncated to the rows that
    exist.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame to extract records from.
    args (Namespace): A namespace object containing parsed command line
        arguments. Only ``args.records`` is read; it must be a string holding
        either a single number ("3") or a range ("5-7").

    Returns:
    DataFrame: The extracted records as a new DataFrame.

    Raises:
    ValueError: If the specification is not a number or "start-end" pair, or
        if a record number is smaller than 1.
    IndexError: If a single record number is past the end of the DataFrame.
    """
    spec = args.records.strip()
    first, dash, last = spec.partition("-")
    try:
        start = int(first)
        end = int(last) if dash else start
    except ValueError:
        raise ValueError(
            f"invalid record specification {args.records!r}; "
            "expected a number or a range such as 5-7"
        ) from None

    # Without this check, record 0 would turn into index -1 and silently
    # return the last row instead of reporting the mistake.
    if start < 1 or end < 1:
        raise ValueError("record numbers start at 1")

    if dash:
        # Convert the 1-based inclusive range to a 0-based half-open slice.
        return dataframe.iloc[start - 1 : end]
    # A list indexer keeps the result a DataFrame rather than a Series.
    return dataframe.iloc[[start - 1]]
|
|
|
|
|
|
def print_records(dataframe, color):
    """
    Write every record of a DataFrame to standard output.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame whose rows are printed.
    color (bool): If True, each field is printed on its own line as
        "name: value" with the name in red and the value in blue, and a blank
        line follows each record. If False, each row is printed using the
        default pandas representation.
    """
    for _, record in dataframe.iterrows():
        if not color:
            print(record)
            continue

        for field in record.index:
            label = colored(f"{field}: ", "red")
            value = colored(f"{record[field]}", "blue")
            print(label + value)
        # Blank line separates consecutive records.
        print()
|
|
|
|
|
|
def print_unique_licenses(dataframe):
    """
    Print the unique licenses in a DataFrame, one per line, in natural order.

    Natural order compares runs of ASCII digits numerically, so "GPL-2" is
    listed before "GPL-10". Text is compared case-sensitively. Missing values
    in the column are skipped.

    Parameters:
    dataframe (DataFrame): The pandas DataFrame to extract licenses from. It
        must have a "license" column; its non-missing values are expected to
        be strings.
    """

    def natural_key(name):
        # Splitting on a capturing group always yields text, digits, text, ...
        # so odd positions are exactly the digit runs. Converting by position
        # keeps every key's type layout identical and therefore comparable.
        pieces = re.split(r"([0-9]+)", name)
        return [int(p) if pos % 2 else p for pos, p in enumerate(pieces)]

    # Drop missing values first: the sort key can only split strings.
    names = dataframe["license"].dropna().unique().tolist()
    for name in sorted(names, key=natural_key):
        print(name)
|
|
|
|
|
|
def main():
    """
    Main function to parse command line arguments and run the script.

    Looks for ``data/<subdir>/lic.parquet`` under the given directory, using
    the first subdirectory of ``data/`` in sorted name order so the choice is
    reproducible. Prints the requested records when ``--records`` is given,
    and lists the unique licenses when ``--list-licenses`` is given or when no
    records were requested.

    Returns:
    int: 0 on success, 1 if the directory, a subdirectory, or the
        lic.parquet file could not be found.

    An invalid ``--records`` value is reported through the argument parser,
    which prints a usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Specify the directory and record range to use"
    )
    parser.add_argument(
        "directory",
        metavar="dir",
        type=str,
        nargs="?",
        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
        help="directory to use",
    )
    parser.add_argument(
        "--records",
        "-r",
        metavar="R",
        type=str,
        nargs="?",
        help="record number or range to print (e.g., 1, 5-7)",
    )
    parser.add_argument(
        "--color", "-c", action="store_true", help="Colorize the output"
    )
    parser.add_argument(
        "--list-licenses",
        "-l",
        action="store_true",
        help="List unique licenses in the file",
    )
    args = parser.parse_args()

    data_dir = os.path.join(args.directory, "data/")
    if not os.path.isdir(data_dir):
        print("Directory does not exist")
        return 1

    # os.listdir returns entries in arbitrary order; sort so the same
    # subdirectory is picked on every run.
    subdirs = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    if not subdirs:
        print("No subdirectories in the directory")
        return 1

    parquet_path = os.path.join(data_dir, subdirs[0], "lic.parquet")
    if not os.path.isfile(parquet_path):
        print("No lic.parquet file found")
        return 1

    df = pd.read_parquet(parquet_path)

    if args.records:
        try:
            records = get_records(df, args)
        except (ValueError, IndexError) as exc:
            # Report a bad record spec as a usage error, not a traceback.
            parser.error(f"invalid --records value {args.records!r}: {exc}")
        print_records(records, args.color)

    # Listing licenses stays the default action when no records are requested.
    if args.list_licenses or not args.records:
        print_unique_licenses(df)
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())
|