-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine_ebv_and_mutation_vcfs_to_stata_format.py
53 lines (39 loc) · 2.02 KB
/
combine_ebv_and_mutation_vcfs_to_stata_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
# Default destination for the Stata variable-name list; an optional third
# command-line argument overrides it.
DEFAULT_VARNAMES_PATH = "/Users/isaacekimjr/Desktop/231115_varnames.txt"


def _read_table(csv_path):
    """Read one input CSV and cast its 'Study ID' column to str."""
    # The context manager closes the handle as soon as pandas has parsed it.
    with open(csv_path, 'r') as handle:
        table = pd.read_csv(handle)
    # Both tables are merged on 'Study ID', so give the key the same dtype.
    table['Study ID'] = table['Study ID'].astype(str)
    return table


def _to_numeric_or_unchanged(column):
    """Return *column* converted by pd.to_numeric, or unchanged if that fails.

    Replaces pd.to_numeric(..., errors='ignore'), which pandas has deprecated.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _write_varnames(columns, out_path):
    """Write each column name, lower-cased and cut to 34 characters, one per line."""
    # NOTE(review): Stata variable names are limited to 32 characters; confirm
    # that truncating at 34 is intentional.
    with open(out_path, 'w') as out:
        for varname in columns:
            if "Unnamed" not in varname:
                out.write(varname.lower()[:34])
                out.write('\n')


def main(argv=None):
    """Merge the EBV and human mutation tables, filter, impute and export names.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. ``argv[0]`` is the EBV CSV, ``argv[1]`` the human
            CSV, and the optional ``argv[2]`` is the path of the variable-name
            list (defaults to ``DEFAULT_VARNAMES_PATH``).

    Returns:
        The merged, filtered and median-imputed DataFrame.

    Raises:
        SystemExit: If fewer than two input paths are supplied.
        KeyError: If 'Study ID', 'Geography' or 'Location' is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit(
            "usage: combine_ebv_and_mutation_vcfs_to_stata_format.py "
            "EBV_CSV HUMAN_CSV [VARNAMES_TXT]"
        )
    varnames_path = args[2] if len(args) > 2 else DEFAULT_VARNAMES_PATH

    df_ebv_file = _read_table(args[0])
    df_human_file = _read_table(args[1])
    combined_df = pd.merge(df_ebv_file, df_human_file, on='Study ID')

    # Keep only Geography != 0 rows (African samples, per the original author)
    # and remove the Belo Horizonte (Brazilian) patients.
    combined_df = combined_df.drop(combined_df[combined_df['Geography'] == 0].index)
    combined_df = combined_df.drop(combined_df[combined_df['Location'] == 'Belo Horizonte'].index)

    # Drop columns whose values are 0 in more than 95% of the remaining rows.
    mostly_zero = combined_df.eq(0).mean() > 0.95
    combined_df = combined_df.drop(columns=combined_df.columns[mostly_zero])

    # Keep only columns that are non-missing in at least int(0.98 * n_rows) rows.
    min_present = int(0.98 * combined_df.shape[0])
    combined_df.dropna(axis=1, thresh=min_present, inplace=True)

    # Convert every column that parses as numeric, then impute the remaining
    # gaps in numeric columns with that column's median.
    combined_df[combined_df.columns] = combined_df[combined_df.columns].apply(_to_numeric_or_unchanged)
    combined_df.fillna(combined_df.median(numeric_only=True), inplace=True)

    # Per-column count of values still missing after imputation.
    # NOTE(review): index=False omits the column names, so the file holds only
    # the counts; kept as-is to leave the existing output unchanged.
    na_column_df = combined_df.isna().sum()
    na_column_df.to_csv('df_na_column.csv', index=False)

    # Saved-index columns from the two inputs carry no data; tolerate inputs
    # that were written without an index column.
    combined_df.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1, inplace=True, errors='ignore')

    _write_varnames(list(combined_df.columns), varnames_path)
    return combined_df


if __name__ == "__main__":
    main()