-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
67 lines (54 loc) · 1.71 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#################################################################
# Directory holding the raw genome FASTA files. Keep the trailing
# slash: file patterns below are built by string concatenation.
#RAW_GENOMES_PATH = "/mnt/moria/rylan/phenores_data/raw/genomes/"
RAW_GENOMES_PATH = "genomes/raw/"

# Excel spreadsheet containing the MIC data.
MIC_DATA_FILE = "amr_data/no_ecoli_GenotypicAMR_Master.xlsx"

# Number of input genomes. It must match the number of rows in the
# MIC data file. Genome names must also be consistent with that
# file, but need not appear in the same order.
NUM_INPUT_FILES = 2260

# Length of the kmers to count.
KMER_SIZE = 11

# Data type of the resulting kmer matrix: uint8 if every count is
# under 256, otherwise uint16 (counts under 65536).
MATRIX_DTYPE = "uint8"
#################################################################

# One entry per "<id>.fasta" file found under RAW_GENOMES_PATH.
ids = glob_wildcards(RAW_GENOMES_PATH + "{id}.fasta").id
# Default target (first rule in the file): requests the marker file
# that the make_matrix rule touches when it finishes.
rule all:
    input: "touchfile.txt"
# Run clean.py on the raw FASTA file for genome {id}.
# clean.py receives the input file path and the output directory;
# it is presumably responsible for writing genomes/clean/{id}.fasta
# (NOTE(review): confirm against clean.py).
rule clean:
    input:
        RAW_GENOMES_PATH + "{id}.fasta"
    output:
        "genomes/clean/{id}.fasta"
    # A plain `shell:` directive replaces the former `run:` block that
    # only wrapped a single shell() call. `:q` shell-quotes the path so
    # genome file names with spaces or metacharacters are passed intact.
    shell:
        "python clean.py {input:q} genomes/clean/"
# Count kmers of length KMER_SIZE in one cleaned genome with jellyfish.
# The .jf output is wrapped in temp(), so Snakemake removes it once no
# remaining job needs it.
rule kmer_count:
    input: "genomes/clean/{id}.fasta"
    output: temp("results/{id}.jf")
    threads: 2
    shell:
        "jellyfish count -m {KMER_SIZE} -s 100M -t {threads} "
        "{input} -o {output}"
# Dump the jellyfish count database for genome {id} into results/{id}.fa
# by redirecting the standard output of `jellyfish dump`.
rule fa_dump:
    input: "results/{id}.jf"
    output: "results/{id}.fa"
    shell: "jellyfish dump {input} > {output}"
# Final aggregation step: needs the dumped kmer counts of every
# discovered genome, then runs the downstream Python scripts in a
# fixed order. touch() creates touchfile.txt once the run block
# completes, which is the file requested by rule all.
rule make_matrix:
    input:
        expand("results/{id}.fa", id=ids)
    output:
        touch("touchfile.txt")
    run:
        # Commands run strictly in sequence; shell() raises on a
        # non-zero exit status, so a failing step aborts the rest.
        pipeline_steps = (
            "python parallel_matrix.py {NUM_INPUT_FILES} {KMER_SIZE} {MATRIX_DTYPE} results/",
            "python convert_dict.py",
            "python bin_mics.py {MIC_DATA_FILE}",
            "python filter.py",
            "python amr_prep.py",
        )
        # A plain for-loop (not a comprehension) keeps shell() called
        # from this frame, so the {NAME} placeholders still resolve
        # against the Snakefile-level constants.
        for command in pipeline_steps:
            shell(command)