-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSeq_Analysis_grouping.py
67 lines (59 loc) · 3.88 KB
/
Seq_Analysis_grouping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python3.8
# Copyright [2020] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from merger import Merger
import argparse, hashlib, os, subprocess, sys, time
import pandas as pd
# Banner printed after the option list in --help output; the raw formatter
# below keeps its line breaks intact.
_EPILOG = """
+ ============================================================ +
| European Nucleotide Archive (ENA) Dashboard Tool |
| |
| |
+ =========================================================== +
"""

# Command-line interface: two mandatory directory options.
parser = argparse.ArgumentParser(
    prog='Seq_Analysis_grouping.py',
    epilog=_EPILOG,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
    '-o', '--output',
    type=str,
    required=True,
    help='output directory',
)
parser.add_argument(
    '-f', '--file',
    type=str,
    required=True,
    help='Analysis and Sequence directory',
)

# Parsed at module level: the rest of the script reads `args.output` and
# `args.file` as globals.
args = parser.parse_args()
def stat_dataframe_sequences(df_analysis, df_sequences, database):
    """Join the analysis and sequence tables and write the grouped CSV reports.

    Both frames are cast to ``str`` and merged on the ``Analysis ID`` column
    twice: once keeping every analysis row (left join) and once keeping every
    sequence row (right join). The two merged frames are handed to the
    project-local ``Merger`` together with ``database``; the first three items
    of its result are written as CSV files into the directory given by the
    module-level ``args.output``.

    Args:
        df_analysis: DataFrame of analysis records containing an
            ``Analysis ID`` column.
        df_sequences: DataFrame of sequence records containing an
            ``Analysis ID`` column.
        database: Label forwarded to ``Merger`` and embedded in the output
            file names (``SQL-API.<database>_<kind>.log.csv``).

    Returns:
        None. The left-joined frame is printed to stdout and three CSV files
        are written.
    """
    analysis_as_text = df_analysis.astype(str)
    sequences_as_text = df_sequences.astype(str)

    # Left join: every analysis row, with sequence columns where they exist.
    analysis_seq_df = analysis_as_text.merge(
        sequences_as_text, on=['Analysis ID'], how='left'
    )
    # Right join: every sequence row, with analysis columns where they exist.
    GCA_formatting_data = analysis_as_text.merge(
        sequences_as_text, on=['Analysis ID'], how='right'
    )
    print(analysis_seq_df)

    # NOTE(review): Merger lives in merger.py; its result is only indexed at
    # positions 0-2 here. The file-name suffixes suggest grouping by webin
    # account, project and center name respectively - confirm in merger.py.
    grouped_tables = Merger(analysis_seq_df, database, GCA_formatting_data)

    # Positions 0, 1, 2 of the Merger result, each paired with its file suffix.
    for position, suffix in enumerate(("webin", "project", "center_names")):
        table = grouped_tables[position]
        table.to_csv(
            f"{args.output}/SQL-API.{database}_{suffix}.log.csv",
            index=False,
        )
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # MAIN: load the two CSV exports found under --file, then merge and
    # group them via stat_dataframe_sequences().
    # ------------------------------------------------------------------

    # Explicit column types for the analysis export.
    analysis_dtypes = {
        "Webin Account": "string",
        "Project ID": "string",
        "Project Status ID": int,
        "Sample ID": "string",
        "Sample Status ID": int,
        "Analysis ID": "string",
        "Analysis Status ID": int,
        "Country": "string",
        "last_updated": "string",
    }
    # Explicit column types for the sequence export.
    sequence_dtypes = {
        "Analysis ID": "string",
        "GC_ID": "string",
        "GC Status ID": int,
        "Sequence Accessions Range": "string",
        "WGS ID": "string",
    }

    # fetching analysis data
    sql_output_analysis = pd.read_csv(
        f"{args.file}/SQL.Analysis.log.csv",
        sep=",",
        dtype=analysis_dtypes,
    )
    # fetching sequence data
    sql_output_seq = pd.read_csv(
        f"{args.file}/SQL.Sequence.log.csv",
        sep=",",
        dtype=sequence_dtypes,
    )

    stat_dataframe_sequences(sql_output_analysis, sql_output_seq, 'sequence')

    # Completion marker on stderr.
    sys.stderr.write("*************END*************\n")