-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess_pdb_seqres.py
47 lines (37 loc) · 1.16 KB
/
preprocess_pdb_seqres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
Preprocess the PDB sequence data.
Original source: https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz
(decompress it to data/pdb_seqres.txt before running this script).
"""
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
# Path of the (already decompressed) sequence dump that this script reads.
SEQ_FILE = "data/pdb_seqres.txt"

with open(SEQ_FILE) as f:
    lines = f.readlines()

# We keep `mol:protein` types, and only one of each name (the first one)
names = set()
name_to_seq = dict()

# Bound before the loop so that any line appearing ahead of the first ">"
# header is skipped instead of raising NameError on `keep` / `name`.
keep = False
name = None

for line in tqdm(lines):
    if line.startswith(">"):
        # A new record starts: drop it unless it is a protein whose name has
        # not been seen yet.
        keep = False
        if "mol:protein" in line:
            # The name is everything after the first three
            # whitespace-separated header fields.
            name = " ".join(line.split()[3:])
            if name not in names:
                names.add(name)
                keep = True
    elif keep:
        # Append rather than overwrite, so a blank or wrapped continuation
        # line cannot clobber the sequence already stored for this record.
        # For one-line-per-record input the result is unchanged.
        name_to_seq[name] = name_to_seq.get(name, "") + line.strip()

# Write file: one kept sequence per line, no trailing newline.
with open("data/prot_seqs.txt", "w") as f:
    f.write("\n".join(name_to_seq.values()))

print("number sequences:", len(name_to_seq))
# "tokens" here means the distinct characters used across all kept sequences.
print("number tokens:", len(set("".join(name_to_seq.values()))))

# Histogram of sequence lengths, with a log-scaled count axis.
lengths = [len(seq) for seq in name_to_seq.values()]
plt.hist(lengths, bins=100, log=True)
plt.title(
    # Raw string: "\m" is not a valid escape sequence in a normal string
    # literal; the backslash must reach matplotlib's mathtext untouched.
    r"distribution of protein sequence lengths ($\mu$={:.2f})".format(np.mean(lengths))
)
plt.xlabel("sequence length")
plt.ylabel("number of sequences")
plt.show()