diff --git a/XICRA_pip/XICRA/modules/database.py b/XICRA_pip/XICRA/modules/database.py index 5db9edc..915f557 100644 --- a/XICRA_pip/XICRA/modules/database.py +++ b/XICRA_pip/XICRA/modules/database.py @@ -66,7 +66,7 @@ def miRNA_db(options): miRBase_files_dict[file_req] = "" else: file_retrieved = HCGB_main.retrieve_matching_files(options.miRNA_db, file_req, options.debug, starts=False) - if HCGB_main.is_non_zero_file(file_retrieved[0]): + if HCGB_files.is_non_zero_file(file_retrieved[0]): miRBase_files_dict[file_req] = file_retrieved[0] else: miRBase_files_dict[file_req] = "" diff --git a/XICRA_pip/XICRA/scripts/generate_DE.py b/XICRA_pip/XICRA/scripts/generate_DE.py index aef6499..a730bab 100644 --- a/XICRA_pip/XICRA/scripts/generate_DE.py +++ b/XICRA_pip/XICRA/scripts/generate_DE.py @@ -162,7 +162,32 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): ## header of tsv files: ## UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp sRNAbench + ## add NA if any data['Variant'].fillna('NA', inplace=True) + + + ## some variants are more complex and are denoted by several variants separated by comma: + ## e.g. 
iso_3p:+3,iso_add3p:1 + + ## These variants can be listed in different orders, generating erroneous duplicated hits later: + ## e.g.: + ## "hsa-miR-383-3p&iso_3p:+3,iso_add3p:1 & iso-22-0JEVN3JBF" + ## "hsa-miR-383-3p&iso_add3p:1,iso_3p:+3 & iso-22-0JEVN3JBF" + + ## "hsa-miR-9500 & iso_add3p:1,iso_snv,iso_3p:+3 & iso-22-DKDERUKIQ" + ## "hsa-miR-9500 & iso_snv,iso_add3p:1,iso_3p:+3 & iso-22-DKDERUKIQ" + + ## let's sort several entries if any and avoid this artifact + + ## get variants that contain several types and sort them + #print(data[data['Variant'].str.contains(",")]) + for i, row in data.iterrows(): + if (',' in row.Variant): + list_of_variants = row['Variant'].split(',') + list_of_variants.sort() + data.at[i,'Variant'] = ",".join(list_of_variants) + + ## create unique_id merging miRNA & variants & UID data['unique_id'] = data.apply(lambda data: data['miRNA'] + '&' + data['Variant'] + '&' + data['UID'], axis=1) ## parse according to software