Refactoring fetch_pdb into program template. Also added ability to do…

…wnload EM map.
cctbx · Oct 16, 2024 · 13541a9 · 13541a9
1 parent 037b7e8
commit 13541a9
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 247 deletions.
diff --git a/iotbx/bioinformatics/pdb_info.py b/iotbx/bioinformatics/pdb_info.py
@@ -164,8 +164,8 @@ def tst_pdb_info_local():
   except requests.exceptions.ReadTimeout:
     print("Skipped test: transient read timeout, can't run test right now")
     return
-  assert rlist == ans_list_2
-  assert rdict == ans_dict_2
+  assert rlist == ans_list_2, rlist
+  assert rdict == ans_dict_2, rdict
 
 
 def run(args):

diff --git a/iotbx/examples/pdbx_mmcif_tutorial.py b/iotbx/examples/pdbx_mmcif_tutorial.py
@@ -9,7 +9,7 @@
 import os
 from six.moves import range
 from libtbx.utils import Sorry
-from iotbx.pdb.fetch import valid_pdb_id, get_pdb
+from iotbx.pdb.fetch import valid_pdb_id, fetch_and_write
 
 def run(args):
   if len(args) == 0:
@@ -25,8 +25,8 @@ def run(args):
       # download pdbx/mmcif file from the PDB
       pdb_id = arg
       mirror = "pdbe"
-      mmcif_file = get_pdb(
-        pdb_id, data_type="pdb", mirror=mirror, log=sys.stdout, format="cif")
+      mmcif_file = fetch_and_write(
+        pdb_id, entity="model_cif", mirror=mirror, log=sys.stdout)
 
     # read the cif file and get an iotbx.cif object
     import iotbx.cif

diff --git a/iotbx/pdb/fetch.py b/iotbx/pdb/fetch.py
@@ -33,29 +33,29 @@
 import os
 
 
-def get_link(mirror, file_type, pdb_id=None, emdb_number=None):
+def get_link(mirror, entity, pdb_id=None, emdb_number=None):
 
   all_links_dict = {
       'rcsb': {
           'model_pdb': 'https://files.rcsb.org/pub/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
           'model_cif': 'https://files.rcsb.org/pub/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
           'sequence': 'https://www.rcsb.org/fasta/entry/{pdb_id}',
           'sf': 'https://files.rcsb.org/download/{pdb_id}-sf.cif.gz',
-          'map': 'https://files.rcsb.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
+          'em_map': 'https://files.rcsb.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
           },
       'pdbe': {
           'model_pdb': 'https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
           'model_cif': 'https://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
           'sequence': 'https://www.ebi.ac.uk/pdbe/entry/pdb/{pdb_id}/fasta',
           'sf': 'https://www.ebi.ac.uk/pdbe/entry-files/download/r{pdb_id}sf.ent',
-          'map': 'https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
+          'em_map': 'https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
           },
       'pdbj': {
           'model_pdb': 'https://ftp.pdbj.org/pub/pdb/data/structures/divided/pdb/{mid_id}/pdb{pdb_id}.ent.gz',
           'model_cif': 'https://ftp.pdbj.org/pub/pdb/data/structures/divided/mmCIF/{mid_id}/{pdb_id}.cif.gz',
           'sequence': 'https://pdbj.org/rest/newweb/fetch/file?cat=pdb&type=fasta&id={pdb_id}',
           'sf': 'https://data.pdbjpw1.pdbj.org/pub/pdb/data/structures/divided/structure_factors/{mid_id}/r{pdb_id}sf.ent.gz',
-          'map': 'https://ftp.pdbj.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
+          'em_map': 'https://ftp.pdbj.org/pub/emdb/structures/EMD-{emdb_number}/map/emd_{emdb_number}.map.gz',
           },
       # 'pdb-redo': {
       #     'model_pdb': 'https://pdb-redo.eu/db/{pdb_id}/{pdb_id}_final.pdb',
@@ -68,62 +68,42 @@ def get_link(mirror, file_type, pdb_id=None, emdb_number=None):
   }
 
   assert mirror in ['rcsb', 'pdbe', 'pdbj']
-  assert file_type in ['model_pdb', 'model_cif', 'sequence', 'sf', 'map']
-  if file_type == 'map':
+  assert entity in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']
+  if entity == 'map':
     assert emdb_number
   else:
     assert pdb_id
   mid_pdb_id = pdb_id[1:3]
-  return all_links_dict[mirror][file_type].format(mid_id=mid_pdb_id, pdb_id=pdb_id, emdb_number=emdb_number)
+  return all_links_dict[mirror][entity].format(mid_id=mid_pdb_id, pdb_id=pdb_id, emdb_number=emdb_number)
 
 def valid_pdb_id(id):
   return len(id) == 4 and re.match("[1-9]{1}[a-zA-Z0-9]{3}", id)
 
-def get_file_type_from_params(data_type, format):
-  """ Temporary, during refactoring. """
-  conversion_dict = {
-      "pdb":'model',
-      "xray": 'sf',
-      "fasta": 'sequence',
-      "map": 'map',
-      }
-  result = conversion_dict[data_type]
-  if result == 'model':
-    if format == 'pdb':
-      result = 'model_pdb'
-    else:
-      result = 'model_cif'
-  return result
-
-def fetch(id, data_type="pdb", format="pdb", mirror="rcsb", log=None):
+def fetch(id, entity='model_pdb', mirror="rcsb", emdb_number=None):
   """
   Locate and open a data file for the specified PDB ID and format, either in a
   local mirror or online.
 
   :param id: 4-character PDB ID (e.g. '1hbb')
-  :param data_type: type of content to download: pdb, xray, or fasta
-  :param format: format of data: cif, pdb, or xml (or cif_or_pdb)
+  :param entity - one of 'model_pdb', 'model_cif', 'sequence', 'sf', 'em_map'
   :param mirror: remote site to use, either rcsb, pdbe, pdbj or pdb-redo
 
   :returns: a filehandle-like object (with read() method)
   """
-  assert data_type in ["pdb", "xray", "fasta"]
-  assert format in ["cif", "pdb", "xml", "cif_or_pdb"]
-  assert mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]
+  assert entity in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']
+  assert mirror in ["rcsb", "pdbe", "pdbj"]
   id = id.lower()
   if not valid_pdb_id(id):
     raise Sorry("Invalid pdb id %s. Must be 4 characters, 1st is a number 1-9." % pdb_id)
-  if (log is None) : log = null_out()
 
-  file_type = get_file_type_from_params(data_type, format)
-  url = get_link(mirror, file_type, pdb_id=id, emdb_number=None)
-  need_to_decompress = url.split('.')[-1] == 'gz' and file_type != 'map'
+  url = get_link(mirror, entity, pdb_id=id, emdb_number=emdb_number)
+  need_to_decompress = url.split('.')[-1] == 'gz' and entity != 'em_map'
 
   try :
     data = libtbx.utils.urlopen(url)
   except HTTPError as e :
     if e.getcode() == 404 :
-      raise RuntimeError("Couldn't download %s for %s at %s." % (file_type, id, url))
+      raise RuntimeError("Couldn't download %s for %s at %s." % (entity, id, url))
     else :
       raise
   if need_to_decompress:
@@ -134,20 +114,25 @@ def write_data_to_disc(fname, data):
     with open(fname, "wb") as f:
       f.write(data.read())
 
-def get_pdb(id, data_type, mirror, log, format="pdb"):
+def fetch_and_write(id, entity='model_pdb', mirror='rcsb', emdb_number=None, log=None):
   """
   Frontend for fetch(...), writes resulting data to disk.
   """
   try :
-    data = fetch(id, data_type, mirror=mirror, format=format, log=log)
+    data = fetch(id, entity, mirror=mirror, emdb_number=emdb_number)
   except RuntimeError as e :
-    raise Sorry(str(e))
+    print(str(e),file=log)
+    return None
+  if (log is None) : log = null_out()
   default_value = (os.path.join(os.getcwd(), "{}.{}".format(id, format)), "Model")
   file_names_titles = defaultdict(lambda: default_value, {
-      "xray":  (os.path.join(os.getcwd(), "{}-sf.cif".format(id)), "Structure factors"),
-      "fasta": (os.path.join(os.getcwd(), "{}.fa".format(id)), "Sequence"),
+      "model_pdb":  (os.path.join(os.getcwd(), "{}.pdb".format(id)), "Model in PDB format"),
+      "model_cif":  (os.path.join(os.getcwd(), "{}.cif".format(id)), "Model in mmCIF format"),
+      "sf":  (os.path.join(os.getcwd(), "{}-sf.cif".format(id)), "Structure factors"),
+      "sequence": (os.path.join(os.getcwd(), "{}.fa".format(id)), "Sequence"),
+      "em_map": (os.path.join(os.getcwd(), "emd_{}.tar.gz".format(emdb_number)), "Cryo-EM map"),
   })
-  file_name, title = file_names_titles[data_type]
+  file_name, title = file_names_titles[entity]
   write_data_to_disc(file_name, data)
   print("%s saved to %s" % (title, file_name), file=log)
   return file_name

diff --git a/iotbx/regression/tst_fetch.py b/iotbx/regression/tst_fetch.py
@@ -16,8 +16,8 @@ def exercise_1():
 """)
 
 def exercise_2():
-  string_1yjp_sf = fetch(id = "1yjp", data_type='xray').read().decode()
-  # print("%s" % string_1yjp_sf)
+  string_1yjp_sf = fetch(id = "1yjp", entity='sf').read().decode()
+  print("%s" % string_1yjp_sf)
   assert_lines_in_text(string_1yjp_sf, """\
 loop_
 _refln.crystal_id
@@ -35,7 +35,7 @@ def exercise_2():
 
 def exercise_get_link():
   r = []
-  for ft in ['model_pdb', 'model_cif', 'sequence', 'sf', 'map']:
+  for ft in ['model_pdb', 'model_cif', 'sequence', 'sf', 'em_map']:
     r.append(get_link('rcsb', ft, '1ab2', emdb_number="1111"))
   # print (r)
   assert r == ['https://files.rcsb.org/pub/pdb/data/structures/divided/pdb/ab/pdb1ab2.ent.gz',

diff --git a/mmtbx/command_line/fetch_pdb.py b/mmtbx/command_line/fetch_pdb.py
@@ -8,202 +8,7 @@
 # LIBTBX_SET_DISPATCHER_NAME phenix.fetch_pdb
 # LIBTBX_SET_DISPATCHER_NAME iotbx.fetch_pdb
 
-# XXX most of this code is unused when run from the command line, but the
-# PHENIX GUI includes a simple frontend that uses the phil interface.
+from mmtbx.programs import fetch
+from iotbx.cli_parser import run_program
 
-import libtbx.phil
-from libtbx.utils import Sorry, Usage
-from libtbx import easy_run
-from iotbx.pdb.fetch import get_pdb
-import sys
-import os
-
-master_phil = libtbx.phil.parse("""
-fetch_pdb
-  .caption = PHENIX can automatically retrieve data from the PDB via the RCSB \
-    web server.  If you intend to re-refine or re-build the structure we \
-    recommend creating a new project, but this is not required.  Note that \
-    you may also use this tool to generate an MTZ file from the mmCIF \
-    structure factors (if available), but the options \
-    are more limited than what is available in the phenix.cif_as_mtz.
-  .style = auto_align caption_img:icons/custom/pdb_import64.png \
-    caption_width:400
-{
-  pdb_ids = None
-    .type = strings
-    .short_caption = PDB ID(s)
-    .input_size = 400
-    .style = bold
-  action = *pdb_only all_data all_plus_mtz
-    .type = choice
-    .caption = Download_PDB_file(s) Download_all_data Download_all_data_and_convert_CIF_to_MTZ
-    .style = bold
-  site = *rcsb pdbe pdbj
-    .type = choice
-    .caption = RCSB PDBe PDBj
-    .short_caption = Mirror site
-    .style = bold
-}""")
-
-# XXX for use in the PHENIX GUI only - run2 is used from the command line
-def run(args=(), params=None, out=sys.stdout):
-  """
-  For use in PHENIX GUI only, fetches pdb filesand/or
-  reflection data from the PDB.
-
-  Parameters
-  ----------
-  args : list of str, optional
-  params : libtbx.phil.scope_extract, optional
-  out : file, optional
-
-  Returns
-  -------
-  output_files : list of str
-  errors : list of str
-  """
-  assert (params is not None)
-  output_files = []
-  errors = []
-  mirror = "--mirror=%s" % params.fetch_pdb.site
-  for id in params.fetch_pdb.pdb_ids :
-    args = [mirror, id]
-    if (params.fetch_pdb.action in ["all_data","all_plus_mtz"]):
-      args.insert(0, "--all")
-      if (params.fetch_pdb.action == "all_plus_mtz"):
-        args.insert(1, "--mtz")
-      try :
-        data_files = run2(args=args, log=out)
-        print("\n".join(data_files), file=out)
-        output_files.extend(data_files)
-      except Exception as e:
-        errors.append(str(e))
-    else :
-      pdb_file = run2(args=[mirror,id], log=out)
-      print(pdb_file, file=out)
-      output_files.append(pdb_file)
-  return output_files, errors
-
-def run2(args, log=sys.stdout):
-  """
-  Fetches pdb files and/or reflection data from the PDB.
-
-  Parameters
-  ----------
-  args : list of str
-  log : file, optional
-
-  Returns
-  -------
-  str or list of str
-      List of file names that were downloaded.
-  """
-  if len(args) < 1 :
-    raise Usage("""\
-phenix.fetch_pdb [-x|-f|--all] [--mtz] [-q] ID1 [ID2, ...]
-
-Command-line options:
-  -x      Get structure factors (mmCIF file)
-  -c      Get model file in mmCIF format
-  -f      Get sequence (FASTA file)
-  --all   Download all available data
-  --mtz   Download structure factors and PDB file, and generate MTZ
-  -q      suppress printed output
-""")
-  quiet = False
-  convert_to_mtz  = False
-  data_type = "pdb"
-  format = "pdb"
-  mirror = "rcsb"
-  ids = []
-  for arg in args :
-    if (arg == "--all"):
-      data_type = "all"
-    elif (arg == "-x"):
-      data_type = "xray"
-    elif (arg == "-f"):
-      data_type = "fasta"
-    elif (arg == "-q"):
-      quiet = True
-    elif (arg == "--mtz"):
-      convert_to_mtz = True
-      data_type = "all"
-    elif (arg == "-c"):
-      format = "cif"
-    elif (arg.startswith("--mirror=")):
-      mirror = arg.split("=")[1]
-      if (not mirror in ["rcsb", "pdbe", "pdbj", "pdb-redo"]):
-        raise Sorry(
-          "Unrecognized mirror site '%s' (choices: rcsb, pdbe, pdbj, pdb-redo)" %
-          mirror)
-    else :
-      ids.append(arg)
-  if (len(ids) == 0):
-    raise Sorry("No PDB IDs specified.")
-  if (data_type != "all"):
-    #mirror = "rcsb"
-    files = []
-    for id in ids :
-      try:
-        files.append(get_pdb(id, data_type, mirror, log, format=format))
-      except Sorry as e:
-        if data_type == "pdb" and format == "pdb":
-          # try with mmCIF
-          print("  PDB format is not available, trying to get mmCIF", file=log)
-          files.append(get_pdb(id, data_type, mirror, log, format="cif"))
-    if (len(files) == 1):
-      return files[0]
-    return files
-  else :
-    files = []
-    for id in ids :
-      for data_type_, data_format in [("pdb", "pdb"), ("fasta", "pdb"),
-                                      ("xray", "pdb"), ("pdb", "cif")] :
-        try:
-          files.append(get_pdb(id, data_type_, mirror, log, format=data_format))
-        except Sorry as e:
-          print (str(e))
-      if (convert_to_mtz):
-        misc_args = ["--merge", "--map_to_asu", "--extend_flags",
-                     "--ignore_bad_sigmas"]
-        easy_run.call("phenix.cif_as_mtz %s-sf.cif %s" %
-          (id, " ".join(misc_args)))
-        if os.path.isfile("%s-sf.mtz" % id):
-          os.rename("%s-sf.mtz" % id, "%s.mtz" % id)
-          print("Converted structure factors saved to %s.mtz" % id, file=log)
-        #  os.remove("%s-sf.cif" % id)
-        files[-1] = os.path.abspath("%s.mtz" % id)
-        if (not os.path.isfile("%s.mtz" % id)):
-          raise Sorry("MTZ conversion failed - try running phenix.cif_as_mtz "+
-            "manually (and check %s-sf.cif for format errors)." % id)
-        from iotbx.file_reader import any_file
-        mtz_in = any_file("%s.mtz" % id)
-        mtz_in.assert_file_type("hkl")
-        for array in mtz_in.file_server.miller_arrays :
-          if (array.anomalous_flag()):
-            print("  %s is anomalous" % array.info().label_string(), file=log)
-    return files
-
-def validate_params(params):
-  """
-  Validates that the input parameters specify a PDB file to download.
-
-  Parameters
-  ----------
-  params : libtbx.phil.scope_extract
-
-  Returns
-  -------
-  bool
-
-  Raises
-  ------
-  Sorry
-      Raised if a pdb file is not specified to be fetched within params.
-  """
-  if (params.fetch_pdb.pdb_ids is None) or (len(params.fetch_pdb.pdb_ids)==0):
-    raise Sorry("No PDB IDs specified!")
-  return True
-
-if __name__ == "__main__" :
-  run2(sys.argv[1:])
+result = run_program(fetch.Program)