-
Notifications
You must be signed in to change notification settings - Fork 0
/
dropDupes.py
executable file
·32 lines (24 loc) · 975 Bytes
/
dropDupes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python
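# Script to drop rows whose value in a given column repeats an earlier row.
# Reads a space-separated file and writes the result as a gzip-compressed file.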
import argparse
import sys

import numpy as np
import pandas as pd
def drop_duplicate_rows(raw_data, key_column, new_filename):
    """Drop rows with a repeated value in one column and write the rest gzipped.

    The input is read as a space-separated table with a header row. For every
    distinct value in ``key_column`` only the first row is kept; the original
    row order is preserved (no sorting is done).

    Args:
        raw_data: Path of the space-separated input file.
        key_column: Name of the column whose values must be unique.
        new_filename: Path of the gzip-compressed, space-separated output file.

    Returns:
        The number of rows that were dropped as duplicates.

    Raises:
        KeyError: If ``key_column`` is not a column of the input file.
    """
    df = pd.read_csv(raw_data, sep=' ')

    # Fail early with a readable message instead of pandas' terse KeyError.
    if key_column not in df.columns:
        raise KeyError(
            "column {!r} not found in {!r}; available columns: {}".format(
                key_column, raw_data, ", ".join(map(str, df.columns))
            )
        )

    rows_before = len(df)
    deduplicated = df.drop_duplicates(subset=key_column, keep="first")

    # NOTE: sorting by the key column was tried and left out on purpose; it
    # was extremely slow on the author's machine
    # (MacBook, 2.4 GHz Quad-Core Intel Core i5, 16Gb).

    # Output is always gzip-compressed, whatever extension the file name has.
    deduplicated.to_csv(new_filename, sep=' ', compression="gzip", index=False)
    return rows_before - len(deduplicated)


def main(argv=None):
    """Parse the command line, de-duplicate the file and report the count.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Returns:
        Process exit status (0 on success). Invalid or missing arguments make
        argparse print a usage message and exit with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Drop rows with duplicate values in one column of a "
                    "space-separated file and write the result gzip-compressed."
    )
    # Same three positional arguments, in the same order, as the original
    # sys.argv[1], sys.argv[2] and sys.argv[3].
    parser.add_argument("raw_data", help="space-separated input file")
    parser.add_argument("column_name",
                        help="column whose values must be unique")
    parser.add_argument("new_filename", help="gzip-compressed output file")
    args = parser.parse_args(argv)

    dropped = drop_duplicate_rows(args.raw_data, args.column_name,
                                  args.new_filename)
    # Report on stderr so stdout stays clean for pipelines.
    sys.stderr.write("dropped {} duplicate row(s)\n".format(dropped))
    return 0


if __name__ == "__main__":
    sys.exit(main())

# improvement:
# - get header in place
# - get copyright in place