forked from AgResearch/gbs_prism
-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_sample_sheet_header.py
executable file
·106 lines (78 loc) · 3.62 KB
/
add_sample_sheet_header.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/env python
from __future__ import print_function
import csv
import sys
import re
import string
import argparse
import datetime
import itertools
import os
def add_header(options):
    """
    Write a "harmonised" version of the sample sheet read from stdin to stdout.

    hiseq:
        If the sample sheet has no [Header] record, the records of
        options["header_file"] are written first, and every row written is
        right-padded with empty fields to the width of the widest sample
        sheet row. A %(today)s placeholder in the second field of the
        header's Date record is replaced by today's date (dd/mm/YYYY).
        A sample sheet that already has a [Header] is copied unchanged.

    any other platform (novaseq):
        The sheet is copied line by line; the first Adapter line after the
        [Settings] line is renamed, e.g.
            Adapter,CTGTCTCTTATACACATCT,,,,,,,,,
        becomes
            AdapterRead1,CTGTCTCTTATACACATCT,,,,,,,,,

    options: dict with key "sequencing_platform" and, for hiseq, "header_file".

    Raises Exception if a hiseq sample sheet has a [Header] record but no
    Adapter record.
    """
    if options["sequencing_platform"] == "hiseq":
        _write_hiseq_sample_sheet(options["header_file"])
    else:
        _write_adapter_read1_sample_sheet()


def _write_hiseq_sample_sheet(header_file):
    """Copy the stdin sample sheet to stdout, prepending header_file's records if no header is present."""
    with open(header_file, "r") as header:
        header_records = list(csv.reader(header))
    today = datetime.date.today().strftime("%d/%m/%Y")
    for record in header_records:
        # the length guard skips blank lines and a Date record without a value
        if len(record) > 1 and record[0] == 'Date':
            record[1] = record[1] % {"today": today}

    sample_sheet_records = list(csv.reader(sys.stdin))

    # csv.reader yields [] for blank lines, so only non-empty records are inspected
    first_fields = [record[0] for record in sample_sheet_records if record]
    header_present = '[Header]' in first_fields
    adapter_config_present = 'Adapter' in first_fields
    if header_present and not adapter_config_present:
        raise Exception(" error , header in the sample sheet supplied does not specify adapter")

    csvwriter = csv.writer(sys.stdout)
    if header_present:
        for record in sample_sheet_records:
            csvwriter.writerow(record)
        return

    # "or [0]" keeps max() from failing when stdin is empty
    sample_sheet_numcol = max([len(record) for record in sample_sheet_records] or [0])
    for record in header_records + sample_sheet_records:
        # rows wider than the sample sheet are left as they are (negative padding is empty)
        csvwriter.writerow(record + (sample_sheet_numcol - len(record)) * [""])


def _write_adapter_read1_sample_sheet():
    """Copy stdin to stdout, renaming the first Adapter line after [Settings] to AdapterRead1."""
    settings_section = False
    for record in sys.stdin:
        if re.match(r"\[Settings\]", record, re.IGNORECASE) is not None:
            settings_section = True
        elif settings_section and re.match("Adapter,", record, re.IGNORECASE) is not None:
            # NOTE: the test above ignores case but the substitution does not,
            # so a line starting e.g. "adapter," ends the search without being renamed
            record = re.sub("^Adapter,", "AdapterRead1,", record)
            settings_section = False
        print(record, end="")
def get_options():
    """
    Parse the command line and return the options as a dict.

    Returned keys:
        "header_file"         : value of -H (None if not given)
        "sequencing_platform" : "novaseq" (default) or "hiseq"

    Raises Exception if the platform is hiseq and the header file was not
    supplied or does not exist.
    """
    description = """
adds header to a sample sheet if it needs it
"""
    long_description = """
examples :
cat /dataset/hiseq/active/191021_D00390_0510_BCE3UBANXX/SampleSheet.csv | ./add_sample_sheet_header.py --sequencing_platform hiseq -H /dataset/gseq_processing/active/bin/gbs_prism/etc/sample_sheet_header.csv
cat /dataset/2024_illumina_sequencing_d/scratch/220426_A01439_0069_BHNFW2DRXY/HNFW2DRXY.csv | ./add_sample_sheet_header.py
"""
    parser = argparse.ArgumentParser(description=description, epilog=long_description, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-H', dest='header_file', required=False, help="header to add")
    parser.add_argument('--sequencing_platform', dest='sequencing_platform', type=str, choices=["novaseq", "hiseq"], default="novaseq", help="sequencing platform")

    args = vars(parser.parse_args())

    # only the hiseq path reads the header file, so only validate it there
    if args["sequencing_platform"] == "hiseq":
        # os.path.isfile(None) raises an opaque TypeError, so report a missing -H explicitly
        if args["header_file"] is None:
            raise Exception("a header file (-H) must be supplied for the hiseq sequencing platform")
        if not os.path.isfile(args["header_file"]):
            raise Exception("header file %s does not exist"%args["header_file"])

    return args
def main():
    """Script entry point: parse the command line, then write the harmonised sample sheet."""
    add_header(get_options())


# run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()