Skip to content

Commit

Permalink
Refactoring fetch_pdb into program template. Also added ability to do…
Browse files Browse the repository at this point in the history
…wnload EM map.
  • Loading branch information
olegsobolev committed Oct 16, 2024
1 parent 037b7e8 commit 13541a9
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 247 deletions.
4 changes: 2 additions & 2 deletions iotbx/bioinformatics/pdb_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def tst_pdb_info_local():
except requests.exceptions.ReadTimeout:
print("Skipped test: transient read timeout, can't run test right now")
return
assert rlist == ans_list_2
assert rdict == ans_dict_2
assert rlist == ans_list_2, rlist
assert rdict == ans_dict_2, rdict


def run(args):
Expand Down
6 changes: 3 additions & 3 deletions iotbx/examples/pdbx_mmcif_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os
from six.moves import range
from libtbx.utils import Sorry
from iotbx.pdb.fetch import valid_pdb_id, get_pdb
from iotbx.pdb.fetch import valid_pdb_id, fetch_and_write

def run(args):
if len(args) == 0:
Expand All @@ -25,8 +25,8 @@ def run(args):
# download pdbx/mmcif file from the PDB
pdb_id = arg
mirror = "pdbe"
mmcif_file = get_pdb(
pdb_id, data_type="pdb", mirror=mirror, log=sys.stdout, format="cif")
mmcif_file = fetch_and_write(
pdb_id, entity="model_cif", mirror=mirror, log=sys.stdout)

# read the cif file and get an iotbx.cif object
import iotbx.cif
Expand Down
65 changes: 25 additions & 40 deletions iotbx/pdb/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,29 @@
import os


def get_link(mirror, file_type, pdb_id=None, emdb_number=None):
def get_link(mirror, entity, pdb_id=None, emdb_number=None):

all_links_dict = {
'rcsb': {
'model_pdb': 'https://files.rcsb.org/pub/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
'model_cif': 'https://files.rcsb.org/pub/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
'sequence': 'https://www.rcsb.org/fasta/entry/{pdb_id}',
'sf': 'https://files.rcsb.org/download/{pdb_id}-sf.cif.gz',
'map': 'https://files.rcsb.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
'em_map': 'https://files.rcsb.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
},
'pdbe': {
'model_pdb': 'https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
'model_cif': 'https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
'sequence': 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}/fasta',
'sf': 'https://www.ebi.ac.uk/pdbe/entry-files/download/r{pdb_id}sf.ent',
'map': 'https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
'em_map': 'https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
},
'pdbj': {
'model_pdb': 'https://ftp.pdbj.org/pub/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
'model_cif': 'https://ftp.pdbj.org/pub/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
'sequence': 'https://pdbj.org/rest/newweb/fetch/file?cat=pdb&type=fasta&id={pdb_id}',
'sf': 'https://data.pdbjpw1.pdbj.org/pub/pdb/data/structures/divided/structure_factors/{mid_id}/r{pdb_id}sf.ent.gz',
'map': 'https://ftp.pdbj.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
'em_map': 'https://ftp.pdbj.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
},
# 'pdb-redo': {
# 'model_pdb': 'https://pdb-redo.eu/db/{pdb_id}/{pdb_id}_final.pdb',
Expand All @@ -68,62 +68,42 @@ def get_link(mirror, file_type, pdb_id=None, emdb_number=None):
}

assert mirror in ['rcsb', 'pdbe', 'pdbj']
assert file_type in ['model_pdb', 'model_cif', 'sequence', 'sf', 'map']
if file_type == 'map':
assert entity in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']
if entity == 'map':
assert emdb_number
else:
assert pdb_id
mid_pdb_id = pdb_id[1:3]
return all_links_dict[mirror][file_type].format(mid_id=mid_pdb_id, pdb_id=pdb_id, emdb_number=emdb_number)
return all_links_dict[mirror][entity].format(mid_id=mid_pdb_id, pdb_id=pdb_id, emdb_number=emdb_number)

def valid_pdb_id(id):
return len(id) == 4 and re.match("[1-9]{1}[a-zA-Z0-9]{3}", id)

def get_file_type_from_params(data_type, format):
""" Temporary, during refactoring. """
conversion_dict = {
"pdb":'model',
"xray": 'sf',
"fasta": 'sequence',
"map": 'map',
}
result = conversion_dict[data_type]
if result == 'model':
if format == 'pdb':
result = 'model_pdb'
else:
result = 'model_cif'
return result

def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None):
def fetch(id, entity='model_pdb', mirror="rcsb", emdb_number=None):
"""
Locate and open a data file for the specified PDB ID and format, either in a
local mirror or online.
:param id: 4-character PDB ID (e.g. '1hbb')
:param data_type: type of content to download: pdb, xray, or fasta
:param format: format of data: cif, pdb, or xml (or cif_or_pdb)
:param entity - one of 'model_pdb', 'model_cif', 'sequence', 'sf', 'em_map'
:param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
:returns: a filehandle-like object (with read() method)
"""
assert data_type in ["pdb", "xray", "fasta"]
assert format in ["cif", "pdb", "xml", "cif_or_pdb"]
assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
assert entity in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']
assert mirror in ["rcsb", "pdbe", "pdbj"]
id = id.lower()
if not valid_pdb_id(id):
raise Sorry("Invalid pdb id %s. Must be 4 characters, 1st is a number 1-9." % pdb_id)
if (log is None) : log = null_out()

file_type = get_file_type_from_params(data_type, format)
url = get_link(mirror, file_type, pdb_id=id, emdb_number=None)
need_to_decompress = url.split('.')[-1] == 'gz' and file_type != 'map'
url = get_link(mirror, entity, pdb_id=id, emdb_number=emdb_number)
need_to_decompress = url.split('.')[-1] == 'gz' and entity != 'em_map'

try :
data = libtbx.utils.urlopen(url)
except HTTPError as e :
if e.getcode() == 404 :
raise RuntimeError("Couldn't download %s for %s at %s." % (file_type, id, url))
raise RuntimeError("Couldn't download %s for %s at %s." % (entity, id, url))
else :
raise
if need_to_decompress:
Expand All @@ -134,20 +114,25 @@ def write_data_to_disc(fname, data):
with open(fname, "wb") as f:
f.write(data.read())

def get_pdb(id, data_type, mirror, log, format="pdb"):
def fetch_and_write(id, entity='model_pdb', mirror='rcsb', emdb_number=None, log=None):
"""
Frontend for fetch(...), writes resulting data to disk.
"""
try :
data = fetch(id, data_type, mirror=mirror, format=format, log=log)
data = fetch(id, entity, mirror=mirror, emdb_number=emdb_number)
except RuntimeError as e :
raise Sorry(str(e))
print(str(e),file=log)
return None
if (log is None) : log = null_out()
default_value = (os.path.join(os.getcwd(), "{}.{}".format(id, format)), "Model")
file_names_titles = defaultdict(lambda: default_value, {
"xray": (os.path.join(os.getcwd(), "{}-sf.cif".format(id)), "Structure factors"),
"fasta": (os.path.join(os.getcwd(), "{}.fa".format(id)), "Sequence"),
"model_pdb": (os.path.join(os.getcwd(), "{}.pdb".format(id)), "Model in PDB format"),
"model_cif": (os.path.join(os.getcwd(), "{}.cif".format(id)), "Model in mmCIF format"),
"sf": (os.path.join(os.getcwd(), "{}-sf.cif".format(id)), "Structure factors"),
"sequence": (os.path.join(os.getcwd(), "{}.fa".format(id)), "Sequence"),
"em_map": (os.path.join(os.getcwd(), "emd_{}.tar.gz".format(emdb_number)), "Cryo-EM map"),
})
file_name, title = file_names_titles[data_type]
file_name, title = file_names_titles[entity]
write_data_to_disc(file_name, data)
print("%s saved to %s" % (title, file_name), file=log)
return file_name
Expand Down
6 changes: 3 additions & 3 deletions iotbx/regression/tst_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def exercise_1():
""")

def exercise_2():
string_1yjp_sf = fetch(id = "1yjp", data_type='xray').read().decode()
# print("%s" % string_1yjp_sf)
string_1yjp_sf = fetch(id = "1yjp", entity='sf').read().decode()
print("%s" % string_1yjp_sf)
assert_lines_in_text(string_1yjp_sf, """\
loop_
_refln.crystal_id
Expand All @@ -35,7 +35,7 @@ def exercise_2():

def exercise_get_link():
r = []
for ft in ['model_pdb', 'model_cif', 'sequence', 'sf', 'map']:
for ft in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']:
r.append(get_link('rcsb', ft, '1ab2', emdb_number="1111"))
# print (r)
assert r == ['https://files.rcsb.org/pub/pdb/data/structures/divided/pdb/ab/pdb1ab2.ent.gz',
Expand Down
201 changes: 3 additions & 198 deletions mmtbx/command_line/fetch_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,202 +8,7 @@
# LIBTBX_SET_DISPATCHER_NAME phenix.fetch_pdb
# LIBTBX_SET_DISPATCHER_NAME iotbx.fetch_pdb

# XXX most of this code is unused when run from the command line, but the
# PHENIX GUI includes a simple frontend that uses the phil interface.
from mmtbx.programs import fetch
from iotbx.cli_parser import run_program

import libtbx.phil
from libtbx.utils import Sorry, Usage
from libtbx import easy_run
from iotbx.pdb.fetch import get_pdb
import sys
import os

master_phil = libtbx.phil.parse("""
fetch_pdb
.caption = PHENIX can automatically retrieve data from the PDB via the RCSB \
web server. If you intend to re-refine or re-build the structure we \
recommend creating a new project, but this is not required. Note that \
you may also use this tool to generate an MTZ file from the mmCIF \
structure factors (if available), but the options \
are more limited than what is available in the phenix.cif_as_mtz.
.style = auto_align caption_img:icons/custom/pdb_import64.png \
caption_width:400
{
pdb_ids = None
.type = strings
.short_caption = PDB ID(s)
.input_size = 400
.style = bold
action = *pdb_only all_data all_plus_mtz
.type = choice
.caption = Download_PDB_file(s) Download_all_data Download_all_data_and_convert_CIF_to_MTZ
.style = bold
site = *rcsb pdbe pdbj
.type = choice
.caption = RCSB PDBe PDBj
.short_caption = Mirror site
.style = bold
}""")

# XXX for use in the PHENIX GUI only - run2 is used from the command line
def run(args=(), params=None, out=sys.stdout):
"""
For use in PHENIX GUI only, fetches pdb filesand/or
reflection data from the PDB.
Parameters
----------
args : list of str, optional
params : libtbx.phil.scope_extract, optional
out : file, optional
Returns
-------
output_files : list of str
errors : list of str
"""
assert (params is not None)
output_files = []
errors = []
mirror = "--mirror=%s" % params.fetch_pdb.site
for id in params.fetch_pdb.pdb_ids :
args = [mirror, id]
if (params.fetch_pdb.action in ["all_data","all_plus_mtz"]):
args.insert(0, "--all")
if (params.fetch_pdb.action == "all_plus_mtz"):
args.insert(1, "--mtz")
try :
data_files = run2(args=args, log=out)
print("\n".join(data_files), file=out)
output_files.extend(data_files)
except Exception as e:
errors.append(str(e))
else :
pdb_file = run2(args=[mirror,id], log=out)
print(pdb_file, file=out)
output_files.append(pdb_file)
return output_files, errors

def run2(args, log=sys.stdout):
"""
Fetches pdb files and/or reflection data from the PDB.
Parameters
----------
args : list of str
log : file, optional
Returns
-------
str or list of str
List of file names that were downloaded.
"""
if len(args) < 1 :
raise Usage("""\
phenix.fetch_pdb [-x|-f|--all] [--mtz] [-q] ID1 [ID2, ...]
Command-line options:
-x Get structure factors (mmCIF file)
-c Get model file in mmCIF format
-f Get sequence (FASTA file)
--all Download all available data
--mtz Download structure factors and PDB file, and generate MTZ
-q suppress printed output
""")
quiet = False
convert_to_mtz = False
data_type = "pdb"
format = "pdb"
mirror = "rcsb"
ids = []
for arg in args :
if (arg == "--all"):
data_type = "all"
elif (arg == "-x"):
data_type = "xray"
elif (arg == "-f"):
data_type = "fasta"
elif (arg == "-q"):
quiet = True
elif (arg == "--mtz"):
convert_to_mtz = True
data_type = "all"
elif (arg == "-c"):
format = "cif"
elif (arg.startswith("--mirror=")):
mirror = arg.split("=")[1]
if (not mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]):
raise Sorry(
"Unrecognized mirror site '%s' (choices: rcsb, pdbe, pdbj, pdb-redo)" %
mirror)
else :
ids.append(arg)
if (len(ids) == 0):
raise Sorry("No PDB IDs specified.")
if (data_type != "all"):
#mirror = "rcsb"
files = []
for id in ids :
try:
files.append(get_pdb(id, data_type, mirror, log, format=format))
except Sorry as e:
if data_type == "pdb" and format == "pdb":
# try with mmCIF
print(" PDB format is not available, trying to get mmCIF", file=log)
files.append(get_pdb(id, data_type, mirror, log, format="cif"))
if (len(files) == 1):
return files[0]
return files
else :
files = []
for id in ids :
for data_type_, data_format in [("pdb", "pdb"), ("fasta", "pdb"),
("xray", "pdb"), ("pdb", "cif")] :
try:
files.append(get_pdb(id, data_type_, mirror, log, format=data_format))
except Sorry as e:
print (str(e))
if (convert_to_mtz):
misc_args = ["--merge", "--map_to_asu", "--extend_flags",
"--ignore_bad_sigmas"]
easy_run.call("phenix.cif_as_mtz %s-sf.cif %s" %
(id, " ".join(misc_args)))
if os.path.isfile("%s-sf.mtz" % id):
os.rename("%s-sf.mtz" % id, "%s.mtz" % id)
print("Converted structure factors saved to %s.mtz" % id, file=log)
# os.remove("%s-sf.cif" % id)
files[-1] = os.path.abspath("%s.mtz" % id)
if (not os.path.isfile("%s.mtz" % id)):
raise Sorry("MTZ conversion failed - try running phenix.cif_as_mtz "+
"manually (and check %s-sf.cif for format errors)." % id)
from iotbx.file_reader import any_file
mtz_in = any_file("%s.mtz" % id)
mtz_in.assert_file_type("hkl")
for array in mtz_in.file_server.miller_arrays :
if (array.anomalous_flag()):
print(" %s is anomalous" % array.info().label_string(), file=log)
return files

def validate_params(params):
"""
Validates that the input parameters specify a PDB file to download.
Parameters
----------
params : libtbx.phil.scope_extract
Returns
-------
bool
Raises
------
Sorry
Raised if a pdb file is not specified to be fetched within params.
"""
if (params.fetch_pdb.pdb_ids is None) or (len(params.fetch_pdb.pdb_ids)==0):
raise Sorry("No PDB IDs specified!")
return True

if __name__ == "__main__" :
run2(sys.argv[1:])
result = run_program(fetch.Program)
Loading

0 comments on commit 13541a9

Please sign in to comment.