-
Notifications
You must be signed in to change notification settings - Fork 1
/
SalmonellaGroups.snakefile
75 lines (53 loc) · 2.26 KB
/
SalmonellaGroups.snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
#################################################################################
# FUNCTIONS #
#################################################################################
#################################################################################
# GLOBALS #
#################################################################################
#################################################################################
# RULES #
#################################################################################
rule all:
input:
"data/interim/mash_population_groups.csv"
rule mash:
# Compute mash distance between all genomes
input:
"data/interim/fasta_files_list_no_ecoli.txt"
output:
"data/interim/mash_distances.txt"
params:
sketchfile="data/interim/mash_sketch",
k=21, # K-mer length
p=12, # Threads
S=42, # Seed
s=10000 # Sketch size
shell:
"""
mash sketch -k {params.k} -p {params.p} -s {params.s} -S {params.S} -l {input} -o {params.sketchfile} && \
mash dist -p {params.p} {params.sketchfile}.msh {params.sketchfile}.msh > {output}
"""
rule dist:
# Convert mash distance output into matrix
input:
"data/interim/mash_distances.txt"
output:
"data/interim/mash_distance_matrix.csv"
run:
import pandas as pd
import os
ddf = pd.read_csv(input[0], sep='\t', header=None, names=['f1','f2','d','p','h'])
ddf['s1'] = ddf.f1.apply(lambda x: os.path.splitext(os.path.os.path.basename(x))[0])
ddf['s2'] = ddf.f2.apply(lambda x: os.path.splitext(os.path.os.path.basename(x))[0])
dist = ddf.pivot(index='s1', columns='s2', values='d')
dist.to_csv(output[0])
rule groups:
# Assign genomes to clusters based on mash distances
input:
"data/interim/mash_distance_matrix.csv",
output:
"data/interim/mash_population_groups.csv"
params:
k=9 # Number of clusters
script: "src/data/make_mash_salmonella_groups.R"