Compare commits

...

4 Commits

Author SHA1 Message Date
Jeff Moe 44f43a5fcc Print panda dataframe types of headers 2023-11-24 16:43:42 -07:00
Jeff Moe 3debab7105 marginally faster without multiprocessing 2023-11-24 16:38:58 -07:00
Jeff Moe 7003b5766c dont iterate all dirs, header reader 2023-11-24 16:36:55 -07:00
Jeff Moe f5ed65602f format with black 2023-11-24 16:34:11 -07:00
1 changed file with 27 additions and 18 deletions


@@ -2,28 +2,35 @@
 #
 # Parrot with code assist from Phind-CodeLlama-34B-v2
-"""Script to read headers of each parquet file in the lowest numbered subdirectory of a given directory.
-The script expects the directory as a command line argument. If no directory is provided, it uses
+"""Script to read headers of The Stack metadata parquet files.
+The script expects The Stack metadata main directory as a command line argument.
+If no directory is provided, it uses
 '/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
 import argparse
 import os
 import pandas as pd
 from termcolor import colored
-from multiprocessing import Pool


 def read_parquet(file):
     df = pd.read_parquet(file)
-    return df.columns
+    return df


 def main():
     # Create argument parser
-    parser = argparse.ArgumentParser(description='Specify the directory to use')
+    parser = argparse.ArgumentParser(description="Specify the directory to use")
     # Add an argument for specifying the directory
-    parser.add_argument('directory', metavar='dir', type=str, nargs='?',
-                        default='/srv/ml/huggingface/datasets/bigcode/the-stack-metadata',
-                        help='directory to use')
+    parser.add_argument(
+        "directory",
+        metavar="dir",
+        type=str,
+        nargs="?",
+        default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
+        help="directory to use",
+    )
     # Parse the arguments
     args = parser.parse_args()
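The reflowed add_argument call keeps the same behavior: nargs="?" makes the positional directory optional, and default supplies the path when it is omitted. A minimal sketch of that argparse pattern in isolation (argument names copied from the diff; the /tmp/meta path is just an illustration):

import argparse

parser = argparse.ArgumentParser(description="Specify the directory to use")
# nargs="?" turns the positional into an optional one; `default` is used
# when no directory is given on the command line.
parser.add_argument(
    "directory",
    nargs="?",
    default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
)

print(parser.parse_args([]).directory)             # prints the default path
print(parser.parse_args(["/tmp/meta"]).directory)  # prints /tmp/meta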
@@ -34,28 +41,30 @@ def main():
     if not os.path.isdir(the_stack_meta_path):
         print("Directory does not exist")
     else:
-        subdirs = sorted([d for d in os.listdir(os.path.join(the_stack_meta_path, "data")) if
-                          os.path.isdir(os.path.join(the_stack_meta_path, "data", d))])
+        subdirs = [d for d in os.listdir(os.path.join(the_stack_meta_path, "data"))]
         # If there are no subdirectories
         if not subdirs:
             print("No subdirectories in the directory")
         else:
-            the_stack_meta_path = os.path.join(the_stack_meta_path, 'data/', subdirs[0])
+            the_stack_meta_path = os.path.join(the_stack_meta_path, "data/", subdirs[0])
             files = os.listdir(the_stack_meta_path)
-            parquet_files = [f for f in files if f.endswith('.parquet')]
+            parquet_files = [f for f in files if f.endswith(".parquet")]
             if len(parquet_files) < 3:
                 print("Less than 3 parquet files in the directory")
             else:
-                with Pool() as pool:
-                    columns = pool.map(read_parquet, [os.path.join(the_stack_meta_path, file) for file in parquet_files])
-                for i, col in enumerate(columns):
-                    print("\n\033[1m{0}\033[0m:".format(parquet_files[i]))
-                    for column in col:
-                        print(column)
+                dfs = []
+                for file in parquet_files:
+                    df = read_parquet(os.path.join(the_stack_meta_path, file))
+                    dfs.append(df)
+                final_df = pd.concat(dfs)
+                print("\n\033[1mFinal DataFrame:\033[0m")
+                print(final_df.info())


 if __name__ == "__main__":
     main()
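One caveat in the new code: pandas' DataFrame.info() writes its summary to stdout and returns None, so print(final_df.info()) prints a trailing "None" after the dtype listing; calling final_df.info() on its own avoids that. And since only the headers and dtypes are of interest, the same information can be read from the parquet footers without loading any row data, for example via pyarrow (a sketch, assuming the pyarrow package that pandas' parquet support typically uses; the file path is a hypothetical stand-in for one of the metadata files):

import pyarrow.parquet as pq

# Read only the schema stored in the parquet footer; no row data is
# loaded, which is far cheaper than pd.read_parquet on large files.
schema = pq.read_schema(
    "/srv/ml/huggingface/datasets/bigcode/the-stack-metadata/data/0000/example.parquet"
)
for name, dtype in zip(schema.names, schema.types):
    print(f"{name}: {dtype}")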