From df6a972c299dafb4d5b49edcc40d70a04fd0cc09 Mon Sep 17 00:00:00 2001 From: Jeff Moe Date: Fri, 24 Nov 2023 17:23:47 -0700 Subject: [PATCH] Add option to print unique licenses used by The Stack --- datasets/the-smack/the-stack-licenses | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/datasets/the-smack/the-stack-licenses b/datasets/the-smack/the-stack-licenses index 4199cdd..cdf4afb 100755 --- a/datasets/the-smack/the-stack-licenses +++ b/datasets/the-smack/the-stack-licenses @@ -29,6 +29,12 @@ def print_records(dataframe, color): print(row) +def print_unique_licenses(dataframe): + licenses = dataframe["license"].unique().tolist() + for license in licenses: + print(license) + + def main(): parser = argparse.ArgumentParser( description="Specify the directory and record range to use" @@ -53,6 +59,12 @@ def main(): parser.add_argument( "--color", "-c", action="store_true", help="Colorize the output" ) + parser.add_argument( + "--list-licenses", + "-l", + action="store_true", + help="List unique licenses in the file", + ) args = parser.parse_args() directory = os.path.join(args.directory, "data/") @@ -76,8 +88,11 @@ def main(): else: df = pd.read_parquet(os.path.join(directory, "lic.parquet")) - records = get_records(df, args) - print_records(records, args.color) + if args.list_licenses: + print_unique_licenses(df) + else: + records = get_records(df, args) + print_records(records, args.color) if __name__ == "__main__":