sb-ncbr · JakubOrsula · Nov 23, 2023 · Nov 24, 2023 · Nov 24, 2023 · Nov 24, 2023
diff --git a/app/static/attributions.html b/app/static/attributions.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Project Contributors</title>
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3" crossorigin="anonymous">
+</head>
+<body>
+    <div class="container my-4">
+        <h1 class="text-center mb-4">Project Contributors</h1>
+
+        <!-- Contributor 1 -->
+        <div class="card mb-3">
+            <div class="card-body">
+                <h5 class="card-title">Contributor Name</h5>
+                <p class="card-text">Description of the role and work. For example, development of search algorithms. Mention of "binary sketches in the Hamming space, index, PPP-codes of David Novák and Pavel Zezula".</p>
+                <a href="link-to-relevant-paper" class="card-link">Relevant Paper Title</a>
+            </div>
+        </div>
+
+        <!-- Contributor 2 -->
+        <div class="card mb-3">
+            <div class="card-body">
+                <h5 class="card-title">Contributor Name</h5>
+                <p class="card-text">Brief description of their work within the project.</p>
+                <!-- Add paper link if applicable -->
+            </div>
+        </div>
+
+        <!-- Additional contributors here -->
+
+        <!-- References Section -->
+        <h2 class="mt-5 mb-3">References</h2>
+        <ul class="list-group list-group-flush">
+            <li class="list-group-item">Similarity search for an extreme application: experience and implementation</li>
+            <li class="list-group-item">Designing sketches for similarity filtering</li>
+            <li class="list-group-item">Binary sketches for secondary filtering</li>
+            <li class="list-group-item">PPP-codes for large-scale similarity searching</li>
+        </ul>
+    </div>
+
+    <!-- Bootstrap JS Bundle -->
+    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/js/bootstrap.bundle.min.js"></script>
+</body>
+</html>
diff --git a/app/templates/index.html b/app/templates/index.html
@@ -53,6 +53,7 @@ <h5 class="modal-title">Select protein</h5>
                 <p class="lead">
                     Search for the most similar protein chains to a given query chain. <br>
                 </p>
+                See <a href="/static/attributions.html">attributions page</a> for information about project authors.<br>
                 Indexed {{ chain_count }} chains from {{ protein_count }} proteins downloaded from <a
                     href="http://pdbe.org/" rel="noreferrer" target="_blank">PDBe</a>.
                 Last update: {{ updated }}.
@@ -291,6 +292,8 @@ <h5 class="card-title">Detected chains {% if selected %} of
             <input type="hidden" name="uploaded" id="uploaded" value="{{ uploaded }}">
 
         </form>
+    <hr>
+
     </div>
 
 {% endblock body %}

diff --git a/utils/get_stats.py b/utils/get_stats.py
@@ -1,8 +1,44 @@
 import python_distance
+import gemmi
 
-protein = "/mnt/data-ssd/PDBe_raw/xh/2xhc.cif"
+protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
+protein = '/mnt/data/PDBe_raw/as/1asj.cif'
+protein = '/mnt/data/PDBe_raw/pc/3pcc.cif'
 
-print(python_distance.save_chains(f'/mnt/data-ssd/PDBe_raw/{protein}.cif', '/tmp', 'test'))
+protein_out = '/tmp/stripped.bin'
+
+
+def strip_file(filename):
+    with open(filename, 'r') as fin:
+        contents = fin.read()
+        doc = gemmi.cif.read_string(contents)
+        block = doc.sole_block()
+        pdb_id = block.find_pair('_struct.entry_id')
+        pdb_title = block.find_pair('_struct.title')
+
+        lines = []
+
+        for line in contents.splitlines(keepends=True):
+            if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+                lines.append(line)
+        for i, line in enumerate(lines[:-1]):
+            if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+                lines[i] = None
+        lines = [line for line in lines if line is not None]
+        for i in range(1, len(lines)):
+            if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+                lines[i - 1] = None
+        lines = [line for line in lines if line is not None]
+
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n' + f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+with open(protein_out, 'w') as fout:
+        fout.writelines(strip_file(protein))
+
+
+
+print(python_distance.save_chains(protein_out, '/tmp', 'test'))
 
 
 def get_raw_from_gesamt(strid):

diff --git a/utils/strip_mmcif.py b/utils/strip_mmcif.py
@@ -0,0 +1,10 @@
+import io
+import python_distance
+
+def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
+    for line in fin:
+        ...
+
+def test_stripping():
+    python_distance.distance("test", "test")
+
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
@@ -33,10 +33,17 @@ def is_updated(filename: str, mirror_dir: str, raw_dir: str) -> Tuple[str, bool]
     gzip_path = Path(mirror_dir) / get_dir(filename) / f'{filename}.gz'
     raw_path = Path(raw_dir) / get_dir(filename) / filename
     with gzip.open(gzip_path, 'rt') as f_gzip, open(raw_path, 'r') as f_raw:
-        if f_gzip.read() != f_raw.read():
-            return filename, True
+        gzip_contents = f_gzip.read()
+        raw_contents = f_raw.read()
+        stripped = strip_cif(gzip_contents)
+        stripped = ''.join(stripped)
+        # this if is troublesome
+        # the first part is present just for legacy bins
+        # if all old bins are regenarated, it can be dropped
+        if gzip_contents == raw_contents or stripped == raw_contents:
+            return filename, False
 
-    return filename, False
+    return filename, True
 
 
 def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecutor) -> Tuple[
@@ -79,7 +86,6 @@ def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecut
 
 def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariadb.connection') -> None:
     cursor = conn.cursor()
-    print(files)
     for file in files:
         pdb_id = Path(file).with_suffix('').name.upper()
         cursor.execute('DELETE FROM protein WHERE pdbId = %s', (pdb_id,))
@@ -93,18 +99,50 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
             cursor.execute(f'UPDATE proteinChain SET indexedAsDataObject = 0 WHERE intId IN ({ids_format})', int_ids)
 
             for chain_id in chain_ids:
-                (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+                try:
+                    (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+                except FileNotFoundError:
+                    pass  # the .bin chain might have never existed if gesamt was unable to read the cif file
 
         (Path(raw_dir) / dirpath / file).unlink()
 
     conn.commit()
     cursor.close()
 
 
+def strip_cif(cif_content: str):
+    # parse the info we care about
+    doc = gemmi.cif.read_string(cif_content)
+    block = doc.sole_block()
+    pdb_id = block.find_pair('_struct.entry_id')
+    pdb_title = block.find_pair('_struct.title')
+
+    lines = []
+
+    # strip everything else
+    for line in cif_content.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    # put the info back in
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n' + f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as f_in:
-        with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as f_out:
-            shutil.copyfileobj(f_in, f_out)
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+    lines = strip_cif(contents)
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)
 
 
 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
@@ -137,7 +175,6 @@ def read_protein_title(filename: str) -> Tuple[str, Optional[str]]:
 def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str, conn: 'mariadb.connection',
                executor: ProcessPoolExecutor) -> None:
     cursor = conn.cursor()
-    print(files)
 
     # Decompress gzipped CIFs
     jobs = [executor.submit(decompress_file, filename, mirror_dir, raw_dir) for filename in files]
@@ -177,88 +214,18 @@ def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str,
     cursor.close()
 
 
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database # todo shouldn't it be with binary?
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, default='/etc/protein_search.ini', help='File with configuration of DB')
     parser.add_argument('--mirror-directory', type=str, default=True, help='Directory with rsynced files')
     parser.add_argument('--binary-directory', type=str, required=True, help='Directory to store binaries')
     parser.add_argument('--raw-directory', type=str, required=True, help='Directory with uncompressed files')
     parser.add_argument('--workers', type=int, default=1, help='Number of workers ')
-    parser.add_argument('--consistency-check', type=bool, default=False, help='Should a consistency check with DB be performed')
     args = parser.parse_args()
 
     config = configparser.ConfigParser()
     config.read(args.config)
 
-    if args.consistency_check:
-        print("performing consistency check")
-        conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
-                               database=config['db']['database'])
-        consistency_check(args.raw_directory, conn)
-        return
-
     executor = ProcessPoolExecutor(args.workers)
 
     print('*** Updating directories ***')
@@ -273,11 +240,13 @@ def main():
     print(f'Removed files: {stats["removed"]}')
     print(f'Up-to-date files: {stats["ok"]}')
 
+    if len(removed_files) > 10000 or len(modified_files) > 10000:
+        print('Too many files to remove, aborting...')
+        exit(1)
 
     conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
                            database=config['db']['database'])
 
-
     print('*** Processing new entries ***')
     add_chains(new_files, args.mirror_directory, args.raw_directory, args.binary_directory, conn, executor)