Compare commits
4 Commits
8cf313a5de
...
44f43a5fcc
Author | SHA1 | Date |
---|---|---|
![]() |
44f43a5fcc | |
![]() |
3debab7105 | |
![]() |
7003b5766c | |
![]() |
f5ed65602f |
|
@ -2,28 +2,35 @@
|
|||
#
|
||||
# Parrot with code assist from Phind-CodeLlama-34B-v2
|
||||
|
||||
"""Script to read headers of each parquet file in the lowest numbered subdirectory of a given directory.
|
||||
The script expects the directory as a command line argument. If no directory is provided, it uses
|
||||
"""Script to read headers of The Stack metadata parquet files.
|
||||
The script expects the main directory of The Stack metadata as a command line argument.
|
||||
If no directory is provided, it uses
|
||||
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import pandas as pd
|
||||
from termcolor import colored
|
||||
from multiprocessing import Pool
|
||||
|
||||
|
||||
def read_parquet(file):
|
||||
df = pd.read_parquet(file)
|
||||
return df.columns
|
||||
return df
|
||||
|
||||
|
||||
def main():
|
||||
# Create argument parser
|
||||
parser = argparse.ArgumentParser(description='Specify the directory to use')
|
||||
parser = argparse.ArgumentParser(description="Specify the directory to use")
|
||||
|
||||
# Add an argument for specifying the directory
|
||||
parser.add_argument('directory', metavar='dir', type=str, nargs='?',
|
||||
default='/srv/ml/huggingface/datasets/bigcode/the-stack-metadata',
|
||||
help='directory to use')
|
||||
parser.add_argument(
|
||||
"directory",
|
||||
metavar="dir",
|
||||
type=str,
|
||||
nargs="?",
|
||||
default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
|
||||
help="directory to use",
|
||||
)
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
@ -34,28 +41,30 @@ def main():
|
|||
if not os.path.isdir(the_stack_meta_path):
|
||||
print("Directory does not exist")
|
||||
else:
|
||||
subdirs = sorted([d for d in os.listdir(os.path.join(the_stack_meta_path, "data")) if
|
||||
os.path.isdir(os.path.join(the_stack_meta_path, "data", d))])
|
||||
subdirs = [d for d in os.listdir(os.path.join(the_stack_meta_path, "data"))]
|
||||
|
||||
# If there are no subdirectories
|
||||
if not subdirs:
|
||||
print("No subdirectories in the directory")
|
||||
else:
|
||||
the_stack_meta_path = os.path.join(the_stack_meta_path, 'data/', subdirs[0])
|
||||
the_stack_meta_path = os.path.join(the_stack_meta_path, "data/", subdirs[0])
|
||||
|
||||
files = os.listdir(the_stack_meta_path)
|
||||
|
||||
parquet_files = [f for f in files if f.endswith('.parquet')]
|
||||
parquet_files = [f for f in files if f.endswith(".parquet")]
|
||||
if len(parquet_files) < 3:
|
||||
print("Less than 3 parquet files in the directory")
|
||||
else:
|
||||
with Pool() as pool:
|
||||
columns = pool.map(read_parquet, [os.path.join(the_stack_meta_path, file) for file in parquet_files])
|
||||
dfs = []
|
||||
for file in parquet_files:
|
||||
df = read_parquet(os.path.join(the_stack_meta_path, file))
|
||||
dfs.append(df)
|
||||
|
||||
final_df = pd.concat(dfs)
|
||||
|
||||
print("\n\033[1mFinal DataFrame:\033[0m")
|
||||
print(final_df.info())
|
||||
|
||||
for i, col in enumerate(columns):
|
||||
print("\n\033[1m{0}\033[0m:".format(parquet_files[i]))
|
||||
for column in col:
|
||||
print(column)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
Loading…
Reference in New Issue