Compare commits

...

4 Commits

Author SHA1 Message Date
Jeff Moe 44f43a5fcc Print panda dataframe types of headers 2023-11-24 16:43:42 -07:00
Jeff Moe 3debab7105 marginally faster without multiprocessing 2023-11-24 16:38:58 -07:00
Jeff Moe 7003b5766c dont iterate all dirs, header reader 2023-11-24 16:36:55 -07:00
Jeff Moe f5ed65602f format with black 2023-11-24 16:34:11 -07:00
1 changed file with 27 additions and 18 deletions


@@ -2,28 +2,35 @@
 #
 # Parrot with code assist from Phind-CodeLlama-34B-v2
-"""Script to read headers of each parquet file in the lowest numbered subdirectory of a given directory.
-The script expects the directory as a command line argument. If no directory is provided, it uses
+"""Script to read headers of The Stack metadata parquet files.
+The script expects The Stack metadata main directory as a command line argument.
+If no directory is provided, it uses
 '/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
 import argparse
 import os
 import pandas as pd
 from termcolor import colored
-from multiprocessing import Pool
 def read_parquet(file):
     df = pd.read_parquet(file)
-    return df.columns
+    return df
 def main():
     # Create argument parser
-    parser = argparse.ArgumentParser(description='Specify the directory to use')
+    parser = argparse.ArgumentParser(description="Specify the directory to use")
     # Add an argument for specifying the directory
-    parser.add_argument('directory', metavar='dir', type=str, nargs='?',
-                        default='/srv/ml/huggingface/datasets/bigcode/the-stack-metadata',
-                        help='directory to use')
+    parser.add_argument(
+        "directory",
+        metavar="dir",
+        type=str,
+        nargs="?",
+        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
+        help="directory to use",
+    )
     # Parse the arguments
     args = parser.parse_args()
@@ -34,28 +41,30 @@ def main():
     if not os.path.isdir(the_stack_meta_path):
         print("Directory does not exist")
     else:
-        subdirs = sorted([d for d in os.listdir(os.path.join(the_stack_meta_path, "data")) if
-                          os.path.isdir(os.path.join(the_stack_meta_path, "data", d))])
+        subdirs = [d for d in os.listdir(os.path.join(the_stack_meta_path, "data"))]
         # If there are no subdirectories
         if not subdirs:
             print("No subdirectories in the directory")
         else:
-            the_stack_meta_path = os.path.join(the_stack_meta_path, 'data/', subdirs[0])
+            the_stack_meta_path = os.path.join(the_stack_meta_path, "data/", subdirs[0])
             files = os.listdir(the_stack_meta_path)
-            parquet_files = [f for f in files if f.endswith('.parquet')]
+            parquet_files = [f for f in files if f.endswith(".parquet")]
             if len(parquet_files) < 3:
                 print("Less than 3 parquet files in the directory")
             else:
-                with Pool() as pool:
-                    columns = pool.map(read_parquet, [os.path.join(the_stack_meta_path, file) for file in parquet_files])
+                dfs = []
+                for file in parquet_files:
+                    df = read_parquet(os.path.join(the_stack_meta_path, file))
+                    dfs.append(df)
+                final_df = pd.concat(dfs)
+                print("\n\033[1mFinal DataFrame:\033[0m")
+                print(final_df.info())
-                for i, col in enumerate(columns):
-                    print("\n\033[1m{0}\033[0m:".format(parquet_files[i]))
-                    for column in col:
-                        print(column)
 if __name__ == "__main__":
     main()
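
Taken together, the four commits drop the multiprocessing Pool (the commit message notes the script is marginally faster without it), read each parquet file sequentially, concatenate the dataframes, and print the combined column names and dtypes via DataFrame.info(). Since only the headers matter here, a header-only read is also possible with pyarrow.parquet.read_schema, which parses just the parquet footer instead of loading row data. A minimal sketch of that alternative (not part of the diff above; assumes pyarrow is installed and uses the script's default dataset path):

import os
import pyarrow.parquet as pq

# Default path from the script above; adjust to your local checkout
meta = "/srv/ml/huggingface/datasets/bigcode/the-stack-metadata"
# Sorted for determinism; the script itself takes the first os.listdir() entry
subdir = os.path.join(meta, "data", sorted(os.listdir(os.path.join(meta, "data")))[0])

for name in sorted(os.listdir(subdir)):
    if name.endswith(".parquet"):
        # read_schema() reads only the file footer, so no row data is loaded
        schema = pq.read_schema(os.path.join(subdir, name))
        print(f"\n{name}:")
        print(schema)  # one "column: type" line per column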