Compare commits

...

4 Commits

Author SHA1 Message Date
Jeff Moe 44f43a5fcc Print panda dataframe types of headers 2023-11-24 16:43:42 -07:00
Jeff Moe 3debab7105 marginally faster without multiprocessing 2023-11-24 16:38:58 -07:00
Jeff Moe 7003b5766c dont iterate all dirs, header reader 2023-11-24 16:36:55 -07:00
Jeff Moe f5ed65602f format with black 2023-11-24 16:34:11 -07:00
1 changed file with 27 additions and 18 deletions


@@ -2,28 +2,35 @@
 #
 # Parrot with code assist from Phind-CodeLlama-34B-v2
-"""Script to read headers of each parquet file in the lowest numbered subdirectory of a given directory.
-The script expects the directory as a command line argument. If no directory is provided, it uses
+"""Script to read headers of The Stack metadata parquet files.
+The script expects The Stack metadata main directory as a command line argument.
+If no directory is provided, it uses
 '/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
 import argparse
 import os
 import pandas as pd
 from termcolor import colored
-from multiprocessing import Pool
 def read_parquet(file):
     df = pd.read_parquet(file)
-    return df.columns
+    return df
 def main():
     # Create argument parser
-    parser = argparse.ArgumentParser(description='Specify the directory to use')
+    parser = argparse.ArgumentParser(description="Specify the directory to use")
     # Add an argument for specifying the directory
-    parser.add_argument('directory', metavar='dir', type=str, nargs='?',
-                        default='/srv/ml/huggingface/datasets/bigcode/the-stack-metadata',
-                        help='directory to use')
+    parser.add_argument(
+        "directory",
+        metavar="dir",
+        type=str,
+        nargs="?",
+        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
+        help="directory to use",
+    )
     # Parse the arguments
     args = parser.parse_args()
@@ -34,28 +41,30 @@ def main():
     if not os.path.isdir(the_stack_meta_path):
         print("Directory does not exist")
     else:
-        subdirs = sorted([d for d in os.listdir(os.path.join(the_stack_meta_path, "data")) if
-                          os.path.isdir(os.path.join(the_stack_meta_path, "data", d))])
+        subdirs = [d for d in os.listdir(os.path.join(the_stack_meta_path, "data"))]
         # If there are no subdirectories
         if not subdirs:
             print("No subdirectories in the directory")
         else:
-            the_stack_meta_path = os.path.join(the_stack_meta_path, 'data/', subdirs[0])
+            the_stack_meta_path = os.path.join(the_stack_meta_path, "data/", subdirs[0])
             files = os.listdir(the_stack_meta_path)
-            parquet_files = [f for f in files if f.endswith('.parquet')]
+            parquet_files = [f for f in files if f.endswith(".parquet")]
             if len(parquet_files) < 3:
                 print("Less than 3 parquet files in the directory")
             else:
-                with Pool() as pool:
-                    columns = pool.map(read_parquet, [os.path.join(the_stack_meta_path, file) for file in parquet_files])
+                dfs = []
+                for file in parquet_files:
+                    df = read_parquet(os.path.join(the_stack_meta_path, file))
+                    dfs.append(df)
+                final_df = pd.concat(dfs)
+                print("\n\033[1mFinal DataFrame:\033[0m")
+                print(final_df.info())
-                for i, col in enumerate(columns):
-                    print("\n\033[1m{0}\033[0m:".format(parquet_files[i]))
-                    for column in col:
-                        print(column)
 if __name__ == "__main__":
     main()
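
Taken together, the four commits drop the multiprocessing Pool (the commit message notes the script is marginally faster without it), read each parquet file sequentially, concatenate the dataframes, and print the combined column names and dtypes via DataFrame.info(). Since only the headers matter here, a header-only read is also possible with pyarrow.parquet.read_schema, which parses just the parquet footer instead of loading row data. A minimal sketch of that alternative (not part of the diff above; assumes pyarrow is installed and uses the script's default dataset path):

import os
import pyarrow.parquet as pq

# Default path from the script above; adjust to your local checkout
meta = "/srv/ml/huggingface/datasets/bigcode/the-stack-metadata"
# Sorted for determinism; the script itself takes the first os.listdir() entry
subdir = os.path.join(meta, "data", sorted(os.listdir(os.path.join(meta, "data")))[0])

for name in sorted(os.listdir(subdir)):
    if name.endswith(".parquet"):
        # read_schema() reads only the file footer, so no row data is loaded
        schema = pq.read_schema(os.path.join(subdir, name))
        print(f"\n{name}:")
        print(schema)  # one "column: type" line per column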