forked from google-deepmind/alphafold3
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathjson_from_uniprot.py
84 lines (74 loc) · 2.56 KB
/
json_from_uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import json
import requests
import string
import itertools
#TODO: ChatGPT, revise and test this script
def id_generator():
    """Yield an endless stream of unique uppercase letter IDs.

    IDs come out in spreadsheet-column order: ``A`` .. ``Z``, then
    ``AA`` .. ``ZZ``, then ``AAA`` and so on. The sequence never ends, so a
    caller pulling IDs with ``next()`` cannot exhaust it no matter how many
    IDs it needs.

    Yields:
        str: The next unused ID, built only from ``string.ascii_uppercase``.
    """
    alphabet = string.ascii_uppercase
    # itertools.count(1) grows the ID length without an upper bound; a fixed
    # range(1, 3) would stop after "ZZ" (702 IDs) and make next() raise
    # StopIteration in the caller.
    for length in itertools.count(1):
        for letters in itertools.product(alphabet, repeat=length):
            yield "".join(letters)
def fetch_sequence(uniprot_id, timeout=30):
    """Fetch the protein sequence for a UniProt ID from the UniProt REST API.

    Downloads ``https://rest.uniprot.org/uniprotkb/<uniprot_id>.fasta``,
    drops the first line (the FASTA header) and joins the remaining lines
    into one string.

    Args:
        uniprot_id: UniProt identifier inserted into the request URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection fails instead of hanging
            forever.

    Returns:
        The sequence as a single string without line breaks (empty if the
        response has no lines after the header), or ``None`` if the request
        failed; in that case the error is printed.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike.
        print(f"Error fetching sequence for {uniprot_id}: {e}")
        return None

    # splitlines() handles both "\n" and "\r\n" endings, and strip() removes
    # stray whitespace so no control characters leak into the sequence.
    lines = response.text.splitlines()
    sequence = "".join(line.strip() for line in lines[1:])  # Skip the FASTA header
    return sequence
def construct_json(uniprot_ids, job_name):
    """Build the job dictionary for the given UniProt IDs.

    Each ID whose sequence can be fetched becomes one ``"protein"`` entry
    carrying the next ID from ``id_generator()`` and the fetched sequence.
    IDs without a sequence are reported and left out.

    Args:
        uniprot_ids: UniProt IDs to look up, in the order they should appear.
        job_name: Value for the ``"name"`` field; when falsy, the UniProt IDs
            joined with ``"_"`` are used instead.

    Returns:
        dict: The structure with ``name``, ``modelSeeds``, ``sequences``,
        ``dialect`` and ``version`` keys.
    """
    chain_ids = id_generator()
    entries = []
    for accession in uniprot_ids:
        residues = fetch_sequence(accession)
        if not residues:
            print(f"Skipping {accession} due to missing sequence.")
            continue
        # An ID is only consumed for entries that are actually emitted.
        protein = {"id": next(chain_ids), "sequence": residues}
        entries.append({"protein": protein})

    name = job_name or "_".join(uniprot_ids)
    return {
        "name": name,
        "modelSeeds": [1],
        "sequences": entries,
        "dialect": "alphafold3",
        "version": 1,
    }
def main():
    """Command-line entry point: parse arguments, build the JSON, write it.

    Reads one or more UniProt IDs plus the optional ``--job_name`` and
    ``--output`` options, passes the IDs and job name to
    ``construct_json``, and dumps the result to the output file with
    4-space indentation.
    """
    cli = argparse.ArgumentParser(
        description="Generate JSON for a list of UniProt IDs with sequences."
    )
    cli.add_argument("uniprot_ids", nargs="+", help="List of UniProt IDs")
    cli.add_argument(
        "--job_name",
        default=None,
        help="Optional job name (default is UniProt IDs joined with '_')",
    )
    cli.add_argument(
        "--output",
        default="output.json",
        help="Output JSON file name (default is 'output.json')",
    )
    options = cli.parse_args()

    payload = construct_json(options.uniprot_ids, options.job_name)

    destination = options.output
    with open(destination, "w") as handle:
        json.dump(payload, handle, indent=4)
    print(f"JSON written to {destination}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()