From 699bc4f38922981d6d171191940ac307c8d2b44d Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Thu, 23 Nov 2023 21:15:24 +0100
Subject: [PATCH 1/4] added stripping of the cif files

---
 app/static/attributions.html   | 47 ++++++++++++++++++++++++++++++++++
 app/templates/index.html       |  3 +++
 utils/get_stats.py             | 27 +++++++++++++++++--
 utils/strip_mmcif.py           | 10 ++++++++
 utils/update_binary_archive.py | 28 +++++++++++++++++---
 5 files changed, 109 insertions(+), 6 deletions(-)
 create mode 100644 app/static/attributions.html
 create mode 100644 utils/strip_mmcif.py

diff --git a/app/static/attributions.html b/app/static/attributions.html
new file mode 100644
index 0000000..445aafb
--- /dev/null
+++ b/app/static/attributions.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Project Contributors</title>
+</head>
+<body>
+<div class="container">
+    <h1>Project Contributors</h1>
+
+    <div class="contributor">
+        <h2>Contributor Name</h2>
+        <p>
+            Description of the role and work. For example, development of search algorithms.
+            Mention of "binary sketches in the Hamming space, index, PPP-codes of David Novák and Pavel Zezula".
+        </p>
+        <a href="#">Relevant Paper Title</a>
+    </div>
+
+    <div class="contributor">
+        <h2>Contributor Name</h2>
+        <p>
+            Brief description of their work within the project.
+        </p>
+    </div>
+
+    <h2>References</h2>
+    <ul>
+    </ul>
+</div>
+</body>
+</html>
diff --git a/app/templates/index.html b/app/templates/index.html
index 6a4e985..982eb76 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -53,6 +53,7 @@
     <p>
         Search for the most similar protein chains to a given query chain.
+        See the <a href="/static/attributions.html">attributions page</a> for information about project authors.
         Indexed {{ chain_count }} chains from {{ protein_count }} proteins downloaded from PDBe.
         Last update: {{ updated }}.
     </p>
@@ -291,6 +292,8 @@
         Detected chains
     {% if selected %}
         of
+
+
 {% endblock body %}
diff --git a/utils/get_stats.py b/utils/get_stats.py
index 162e5ac..650d969 100644
--- a/utils/get_stats.py
+++ b/utils/get_stats.py
@@ -1,8 +1,31 @@
 import python_distance
 
-protein = "/mnt/data-ssd/PDBe_raw/xh/2xhc.cif"
+protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
+protein = '/mnt/data/PDBe_raw/as/1asj.cif'
+protein = '/mnt/data/PDBe_raw/pc/3pcc.cif'
 
-print(python_distance.save_chains(f'/mnt/data-ssd/PDBe_raw/{protein}.cif', '/tmp', 'test'))
+protein_out = '/tmp/stripped.bin'
+
+with open(protein, 'r') as fin:
+    lines = []
+    for line in fin:
+        if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i+1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i-1] = None
+    lines = [line for line in lines if line is not None]
+
+with open(protein_out, 'w') as fout:
+    fout.writelines(lines)
+
+
+
+print(python_distance.save_chains(protein_out, '/tmp', 'test'))
 
 
 def get_raw_from_gesamt(strid):
diff --git a/utils/strip_mmcif.py b/utils/strip_mmcif.py
new file mode 100644
index 0000000..491d0d6
--- /dev/null
+++ b/utils/strip_mmcif.py
@@ -0,0 +1,10 @@
+import io
+import python_distance
+
+def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
+    for line in fin:
+        ...
+
+def test_stripping():
+    python_distance.distance("test", "test")
+
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index f0cacf8..8de70f4 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -93,7 +93,10 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
         cursor.execute(f'UPDATE proteinChain SET indexedAsDataObject = 0 WHERE intId IN ({ids_format})', int_ids)
 
         for chain_id in chain_ids:
-            (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+            try:
+                (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+            except FileNotFoundError:
+                pass  # the .bin chain might have never existed if gesamt was unable to read the cif file
 
         (Path(raw_dir) / dirpath / file).unlink()
 
@@ -102,16 +105,33 @@
 
 
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as f_in:
-        with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as f_out:
-            shutil.copyfileobj(f_in, f_out)
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        lines = []
+        for line in fin:
+            if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+                lines.append(line)
+        for i, line in enumerate(lines[:-1]):
+            if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+                lines[i] = None
+        lines = [line for line in lines if line is not None]
+        for i in range(1, len(lines)):
+            if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+                lines[i - 1] = None
+        lines = [line for line in lines if line is not None]
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)
 
 
 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
+    empty_cifs = 0
     file = Path(src_dir) / get_dir(filename) / filename
     dirname = get_dir(filename)
     pdb_id = file.name[:4].upper()
     results = python_distance.save_chains(str(file), str(Path(dest_dir) / dirname), pdb_id)
+    if not results:
+        empty_cifs += 1
+        print("No chains extracted from file: ", file)
+    print('empty_cifs', empty_cifs)
     return [(filename, f'{pdb_id}:{chain_id}', size) for chain_id, size in results]
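
A note on the approach in this first patch: the stripping is a line-prefix heuristic rather than a real mmCIF parse. It keeps the data block header, the _atom_site loop with its ATOM/HETATM records, and the '#' section separators; it then drops loop_ headers that do not introduce the _atom_site table and collapses runs of consecutive '#' lines. A self-contained sketch of the same heuristic (the strip_lines helper and the sample input are illustrative, not part of the patch):

    KEEP_PREFIXES = ('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')

    def strip_lines(cif_lines):
        # keep only the line families needed to read the structure
        lines = [line for line in cif_lines if line.startswith(KEEP_PREFIXES)]
        # drop loop_ headers that do not introduce the _atom_site table
        lines = [line for i, line in enumerate(lines)
                 if not (line.startswith('loop_') and i + 1 < len(lines)
                         and not lines[i + 1].startswith('_atom_site.group_PDB'))]
        # collapse runs of consecutive '#' section separators
        lines = [line for i, line in enumerate(lines)
                 if not (line.startswith('#') and i + 1 < len(lines)
                         and lines[i + 1].startswith('#'))]
        return lines

    sample = [
        'data_1ABC\n',
        'loop_\n',                 # dropped: the loop it opens is not _atom_site
        '_citation.id\n',          # dropped: matches no kept prefix
        '#\n',
        '#\n',                     # the first of the two separators is collapsed
        'loop_\n',
        '_atom_site.group_PDB\n',
        'ATOM 1 N N . MET\n',
        '#\n',
    ]
    print(''.join(strip_lines(sample)))

The same drop-then-filter passes appear twice in this patch (once in utils/get_stats.py, once in decompress_file); patch 4 eventually factors them into a single function.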
From c884ac71fd7a99dd950309d8ddff57e09acc1f81 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 17:02:10 +0100
Subject: [PATCH 2/4] improved stripping

---
 utils/get_stats.py             | 41 ++++++++++-----
 utils/update_binary_archive.py | 94 ++++++----------------------------
 2 files changed, 42 insertions(+), 93 deletions(-)

diff --git a/utils/get_stats.py b/utils/get_stats.py
index 650d969..857ce44 100644
--- a/utils/get_stats.py
+++ b/utils/get_stats.py
@@ -1,4 +1,5 @@
 import python_distance
+import gemmi
 
 protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
 protein = '/mnt/data/PDBe_raw/as/1asj.cif'
@@ -6,22 +7,34 @@
 
 protein_out = '/tmp/stripped.bin'
 
-with open(protein, 'r') as fin:
-    lines = []
-    for line in fin:
-        if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
-            lines.append(line)
-    for i, line in enumerate(lines[:-1]):
-        if line.startswith('loop_') and not lines[i+1].startswith('_atom_site.group_PDB'):
-            lines[i] = None
-    lines = [line for line in lines if line is not None]
-    for i in range(1, len(lines)):
-        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
-            lines[i-1] = None
-    lines = [line for line in lines if line is not None]
+
+def strip_file(filename):
+    with open(filename, 'r') as fin:
+        contents = fin.read()
+    doc = gemmi.cif.read_string(contents)
+    block = doc.sole_block()
+    pdb_id = block.find_pair('_struct.entry_id')
+    pdb_title = block.find_pair('_struct.title')
+
+    lines = []
+
+    for line in contents.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+              f"_struct.title {pdb_title}\n", '#\n')
+    return lines
 
 with open(protein_out, 'w') as fout:
-    fout.writelines(lines)
+    fout.writelines(strip_file(protein))
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index 8de70f4..0b7df3e 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -79,7 +79,6 @@ def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecut
 
 def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariadb.connection') -> None:
     cursor = conn.cursor()
-    print(files)
     for file in files:
         pdb_id = Path(file).with_suffix('').name.upper()
         cursor.execute('DELETE FROM protein WHERE pdbId = %s', (pdb_id,))
@@ -106,9 +105,19 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
 
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
     with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+
+        # parse the info we care about
+        doc = gemmi.cif.read_string(contents)
+        block = doc.sole_block()
+        pdb_id = block.find_pair('_struct.entry_id')
+        pdb_title = block.find_pair('_struct.title')
+
         lines = []
-        for line in fin:
-            if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+
+        # strip everything else
+        for line in contents.splitlines(keepends=True):
+            if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
                 lines.append(line)
         for i, line in enumerate(lines[:-1]):
             if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
@@ -118,20 +127,18 @@ def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
             if lines[i].startswith('#') and lines[i - 1].startswith('#'):
                 lines[i - 1] = None
         lines = [line for line in lines if line is not None]
+
+        # put the info back in
+        lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+                  f"_struct.title {pdb_title}\n", '#\n')
     with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
         fout.writelines(lines)
 
 
 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
-    empty_cifs = 0
     file = Path(src_dir) / get_dir(filename) / filename
     dirname = get_dir(filename)
     pdb_id = file.name[:4].upper()
     results = python_distance.save_chains(str(file), str(Path(dest_dir) / dirname), pdb_id)
-    if not results:
-        empty_cifs += 1
-        print("No chains extracted from file: ", file)
-    print('empty_cifs', empty_cifs)
     return [(filename, f'{pdb_id}:{chain_id}', size) for chain_id, size in results]
 
@@ -157,7 +164,6 @@ def read_protein_title(filename: str) -> Tuple[str, Optional[str]]:
 def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str, conn: 'mariadb.connection',
                executor: ProcessPoolExecutor) -> None:
     cursor = conn.cursor()
-    print(files)
 
     # Decompress gzipped CIFs
     jobs = [executor.submit(decompress_file, filename, mirror_dir, raw_dir) for filename in files]
@@ -197,68 +203,6 @@ def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str,
     cursor.close()
 
 
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database  # todo shouldn't it be with binary?
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, default='/etc/protein_search.ini', help='File with configuration of DB')
@@ -266,19 +210,11 @@ def main():
     parser.add_argument('--binary-directory', type=str, required=True, help='Directory to store binaries')
     parser.add_argument('--raw-directory', type=str, required=True, help='Directory with uncompressed files')
     parser.add_argument('--workers', type=int, default=1, help='Number of workers ')
-    parser.add_argument('--consistency-check', type=bool, default=False, help='Should a consistency check with DB be performed')
 
     args = parser.parse_args()
 
     config = configparser.ConfigParser()
     config.read(args.config)
 
-    if args.consistency_check:
-        print("performing consistency check")
-        conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
-                               database=config['db']['database'])
-        consistency_check(args.raw_directory, conn)
-        return
-
     executor = ProcessPoolExecutor(args.workers)
 
     print('*** Updating directories ***')
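
Patch 2 pulls the entry id and title out with gemmi before stripping, and appends them back at the end so the information survives. One gemmi detail worth knowing here: Block.find_pair returns the whole (tag, value) pair, while Block.find_value returns just the value string, so interpolating the result of find_pair into "_struct.entry_id {pdb_id}" reproduces the tag inside the value. A minimal sketch of the value-only variant (illustrative, assumes a recent gemmi; not the code the patch uses):

    import gemmi

    def entry_id_and_title(cif_content: str):
        # sole_block() raises if the file has more than one data block
        block = gemmi.cif.read_string(cif_content).sole_block()
        pdb_id = block.find_value('_struct.entry_id')   # e.g. '2XHC', or None if absent
        pdb_title = block.find_value('_struct.title')   # raw value, may still be quoted
        return pdb_id, pdb_title

    def tail_lines(pdb_id, pdb_title):
        # the two pairs appended after the stripped atom records
        return [f'_struct.entry_id {pdb_id}\n', '#\n',
                f'_struct.title {pdb_title}\n', '#\n']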
From fb614b2620ca15c11aff52e5e9595c968c9aef48 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 17:27:36 +0100
Subject: [PATCH 3/4] safety check for too many removals

---
 utils/update_binary_archive.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index 0b7df3e..acd8135 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -229,11 +229,13 @@ def main():
     print(f'Removed files: {stats["removed"]}')
     print(f'Up-to-date files: {stats["ok"]}')
 
+    if len(removed_files) > 10000 or len(modified_files) > 10000:
+        print('Too many files to remove, aborting...')
+        exit(1)
     conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
                            database=config['db']['database'])
-
     print('*** Processing new entries ***')
     add_chains(new_files, args.mirror_directory, args.raw_directory, args.binary_directory, conn, executor)
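
The 10000 threshold added in patch 3 is a hard-coded safety valve: a botched mirror sync could otherwise delete most of the archive in one run. If the limit ever needs to vary per deployment, a configurable variant might look like this (hypothetical helper, not part of the patch):

    import sys

    def check_change_budget(removed_files, modified_files, max_changes=10000):
        # refuse to proceed when the mirror diff looks implausibly large
        if len(removed_files) > max_changes or len(modified_files) > max_changes:
            print(f'Too many changes: {len(removed_files)} removed, '
                  f'{len(modified_files)} modified (limit {max_changes}), aborting...')
            sys.exit(1)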
From 4d483994ddb46f86bc556b53f8b4d5b469e50587 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 21:43:44 +0100
Subject: [PATCH 4/4] used the strip version and full contents for comparison

---
 utils/update_binary_archive.py | 63 ++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index acd8135..2a05ccb 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -33,10 +33,17 @@ def is_updated(filename: str, mirror_dir: str, raw_dir: str) -> Tuple[str, bool]
     gzip_path = Path(mirror_dir) / get_dir(filename) / f'{filename}.gz'
     raw_path = Path(raw_dir) / get_dir(filename) / filename
     with gzip.open(gzip_path, 'rt') as f_gzip, open(raw_path, 'r') as f_raw:
-        if f_gzip.read() != f_raw.read():
-            return filename, True
+        gzip_contents = f_gzip.read()
+        raw_contents = f_raw.read()
+        stripped = strip_cif(gzip_contents)
+        stripped = ''.join(stripped)
+        # this condition is troublesome
+        # the first part is present just for legacy bins
+        # if all old bins are regenerated, it can be dropped
+        if gzip_contents == raw_contents or stripped == raw_contents:
+            return filename, False
 
-    return filename, False
+    return filename, True
 
 
 def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecutor) -> Tuple[
@@ -103,33 +110,37 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
     cursor.close()
 
 
-def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
-        contents = fin.read()
-
-        # parse the info we care about
-        doc = gemmi.cif.read_string(contents)
-        block = doc.sole_block()
-        pdb_id = block.find_pair('_struct.entry_id')
-        pdb_title = block.find_pair('_struct.title')
-
-        lines = []
-
-        # strip everything else
-        for line in contents.splitlines(keepends=True):
-            if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
-                lines.append(line)
-        for i, line in enumerate(lines[:-1]):
-            if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
-                lines[i] = None
-        lines = [line for line in lines if line is not None]
-        for i in range(1, len(lines)):
-            if lines[i].startswith('#') and lines[i - 1].startswith('#'):
-                lines[i - 1] = None
-        lines = [line for line in lines if line is not None]
-
-        # put the info back in
-        lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
-                  f"_struct.title {pdb_title}\n", '#\n')
-    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
-        fout.writelines(lines)
+def strip_cif(cif_content: str):
+    # parse the info we care about
+    doc = gemmi.cif.read_string(cif_content)
+    block = doc.sole_block()
+    pdb_id = block.find_pair('_struct.entry_id')
+    pdb_title = block.find_pair('_struct.title')
+
+    lines = []
+
+    # strip everything else
+    for line in cif_content.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    # put the info back in
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+              f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+
+def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+    lines = strip_cif(contents)
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)
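
After patch 4, a raw file counts as up to date in two cases: legacy files that still hold the full decompressed CIF, and newer files that hold the stripped version written by decompress_file. Since is_updated recomputes strip_cif on the same gzipped input that decompress_file consumed, a freshly written raw file must never be flagged as modified. A regression test sketch for that round trip (hypothetical pytest and fixture path; assumes the functions are importable, and that get_dir('2xhc.cif') yields 'xh' as the example paths suggest):

    import gzip
    from pathlib import Path

    from utils.update_binary_archive import decompress_file, is_updated

    def test_fresh_raw_file_is_up_to_date(tmp_path):
        # layout mirrors PDBe_raw: <dir>/xh/2xhc.cif(.gz)
        mirror, raw = tmp_path / 'mirror', tmp_path / 'raw'
        (mirror / 'xh').mkdir(parents=True)
        (raw / 'xh').mkdir(parents=True)
        with gzip.open(mirror / 'xh' / '2xhc.cif.gz', 'wt') as f:
            f.write(Path('tests/data/2xhc.cif').read_text())  # hypothetical fixture

        decompress_file('2xhc.cif', str(mirror), str(raw))
        filename, updated = is_updated('2xhc.cif', str(mirror), str(raw))
        assert filename == '2xhc.cif'
        assert not updated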
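
One loose end: the strip_stream stub added to utils/strip_mmcif.py in patch 1 is never filled in by this series. With the final strip_cif in place, one possible completion plus a concrete test body (illustrative only; assumes strip_cif is importable from utils.update_binary_archive):

    import io

    from utils.update_binary_archive import strip_cif

    def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
        # delegate to the shared strip_cif() so the two code paths cannot drift apart
        fout.writelines(strip_cif(fin.read()))

    def test_stripping(tmp_path):
        src = tmp_path / '2xhc.cif'
        src.write_text('data_2XHC\n#\nloop_\n_atom_site.group_PDB\nATOM 1 N N . MET\n#\n')
        dst = tmp_path / '2xhc.stripped.cif'
        with open(src) as fin, open(dst, 'w') as fout:
            strip_stream(fin, fout)
        assert dst.read_text().startswith('data_2XHC')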