Compare commits
4 Commits
8cf313a5de
...
44f43a5fcc
Author | SHA1 | Date |
---|---|---|
Jeff Moe | 44f43a5fcc | |
Jeff Moe | 3debab7105 | |
Jeff Moe | 7003b5766c | |
Jeff Moe | f5ed65602f | |
|
@ -2,28 +2,35 @@
|
||||||
#
|
#
|
||||||
# Parrot with code assist from Phind-CodeLlama-34B-v2
|
# Parrot with code assist from Phind-CodeLlama-34B-v2
|
||||||
|
|
||||||
"""Script to read headers of each parquet file in the lowest numbered subdirectory of a given directory.
|
"""Script to read headers of The Stack metadata parquet files.
|
||||||
The script expects the directory as a command line argument. If no directory is provided, it uses
|
The script expects the The Stack metadata main directory as a command line argument.
|
||||||
|
If no directory is provided, it uses
|
||||||
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
|
'/srv/ml/huggingface/datasets/bigcode/the-stack-metadata' as the default directory."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from termcolor import colored
|
from termcolor import colored
|
||||||
from multiprocessing import Pool
|
|
||||||
|
|
||||||
def read_parquet(file):
|
def read_parquet(file):
|
||||||
df = pd.read_parquet(file)
|
df = pd.read_parquet(file)
|
||||||
return df.columns
|
return df
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Create argument parser
|
# Create argument parser
|
||||||
parser = argparse.ArgumentParser(description='Specify the directory to use')
|
parser = argparse.ArgumentParser(description="Specify the directory to use")
|
||||||
|
|
||||||
# Add an argument for specifying the directory
|
# Add an argument for specifying the directory
|
||||||
parser.add_argument('directory', metavar='dir', type=str, nargs='?',
|
parser.add_argument(
|
||||||
default='/srv/ml/huggingface/datasets/bigcode/the-stack-metadata',
|
"directory",
|
||||||
help='directory to use')
|
metavar="dir",
|
||||||
|
type=str,
|
||||||
|
nargs="?",
|
||||||
|
default="/srv/ml/huggingface/datasets/bigcode/the-stack-metadata",
|
||||||
|
help="directory to use",
|
||||||
|
)
|
||||||
|
|
||||||
# Parse the arguments
|
# Parse the arguments
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
@ -34,28 +41,30 @@ def main():
|
||||||
if not os.path.isdir(the_stack_meta_path):
|
if not os.path.isdir(the_stack_meta_path):
|
||||||
print("Directory does not exist")
|
print("Directory does not exist")
|
||||||
else:
|
else:
|
||||||
subdirs = sorted([d for d in os.listdir(os.path.join(the_stack_meta_path, "data")) if
|
subdirs = [d for d in os.listdir(os.path.join(the_stack_meta_path, "data"))]
|
||||||
os.path.isdir(os.path.join(the_stack_meta_path, "data", d))])
|
|
||||||
|
|
||||||
# If there are no subdirectories
|
# If there are no subdirectories
|
||||||
if not subdirs:
|
if not subdirs:
|
||||||
print("No subdirectories in the directory")
|
print("No subdirectories in the directory")
|
||||||
else:
|
else:
|
||||||
the_stack_meta_path = os.path.join(the_stack_meta_path, 'data/', subdirs[0])
|
the_stack_meta_path = os.path.join(the_stack_meta_path, "data/", subdirs[0])
|
||||||
|
|
||||||
files = os.listdir(the_stack_meta_path)
|
files = os.listdir(the_stack_meta_path)
|
||||||
|
|
||||||
parquet_files = [f for f in files if f.endswith('.parquet')]
|
parquet_files = [f for f in files if f.endswith(".parquet")]
|
||||||
if len(parquet_files) < 3:
|
if len(parquet_files) < 3:
|
||||||
print("Less than 3 parquet files in the directory")
|
print("Less than 3 parquet files in the directory")
|
||||||
else:
|
else:
|
||||||
with Pool() as pool:
|
dfs = []
|
||||||
columns = pool.map(read_parquet, [os.path.join(the_stack_meta_path, file) for file in parquet_files])
|
for file in parquet_files:
|
||||||
|
df = read_parquet(os.path.join(the_stack_meta_path, file))
|
||||||
|
dfs.append(df)
|
||||||
|
|
||||||
|
final_df = pd.concat(dfs)
|
||||||
|
|
||||||
|
print("\n\033[1mFinal DataFrame:\033[0m")
|
||||||
|
print(final_df.info())
|
||||||
|
|
||||||
for i, col in enumerate(columns):
|
|
||||||
print("\n\033[1m{0}\033[0m:".format(parquet_files[i]))
|
|
||||||
for column in col:
|
|
||||||
print(column)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
Loading…
Reference in New Issue