From 699bc4f38922981d6d171191940ac307c8d2b44d Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Thu, 23 Nov 2023 21:15:24 +0100
Subject: [PATCH 1/4] added stripping of the cif files

---
 app/static/attributions.html   | 47 ++++++++++++++++++++++++++++++++++
 app/templates/index.html       |  3 +++
 utils/get_stats.py             | 27 +++++++++++++++++--
 utils/strip_mmcif.py           | 10 ++++++++
 utils/update_binary_archive.py | 28 +++++++++++++++++---
 5 files changed, 109 insertions(+), 6 deletions(-)
 create mode 100644 app/static/attributions.html
 create mode 100644 utils/strip_mmcif.py

diff --git a/app/static/attributions.html b/app/static/attributions.html
new file mode 100644
index 0000000..445aafb
--- /dev/null
+++ b/app/static/attributions.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Project Contributors</title>
+</head>
+<body>
+<div class="container">
+    <h1>Project Contributors</h1>
+
+    <div class="contributor">
+        <h2>Contributor Name</h2>
+        <p>
+            Description of the role and work. For example, development of search algorithms.
+            Mention of "binary sketches in the Hamming space, index, PPP-codes of David Novák and Pavel Zezula".
+        </p>
+        <a href="#">Relevant Paper Title</a>
+    </div>
+
+    <div class="contributor">
+        <h2>Contributor Name</h2>
+        <p>
+            Brief description of their work within the project.
+        </p>
+    </div>
+
+    <h2>References</h2>
+    <ul>
+    </ul>
+</div>
+</body>
+</html>
diff --git a/app/templates/index.html b/app/templates/index.html
index 6a4e985..982eb76 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -53,6 +53,7 @@
     <p>
         Search for the most similar protein chains to a given query chain.
+        See the <a href="/static/attributions.html">attributions page</a> for information about project authors.
         Indexed {{ chain_count }} chains from {{ protein_count }} proteins downloaded from PDBe.
         Last update: {{ updated }}.
     </p>
@@ -291,6 +292,8 @@
         Detected chains
     {% if selected %}
         of
+
+
 {% endblock body %}
diff --git a/utils/get_stats.py b/utils/get_stats.py
index 162e5ac..650d969 100644
--- a/utils/get_stats.py
+++ b/utils/get_stats.py
@@ -1,8 +1,31 @@
 import python_distance
 
-protein = "/mnt/data-ssd/PDBe_raw/xh/2xhc.cif"
+protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
+protein = '/mnt/data/PDBe_raw/as/1asj.cif'
+protein = '/mnt/data/PDBe_raw/pc/3pcc.cif'
 
-print(python_distance.save_chains(f'/mnt/data-ssd/PDBe_raw/{protein}.cif', '/tmp', 'test'))
+protein_out = '/tmp/stripped.bin'
+
+with open(protein, 'r') as fin:
+    lines = []
+    for line in fin:
+        if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i+1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i-1] = None
+    lines = [line for line in lines if line is not None]
+
+with open(protein_out, 'w') as fout:
+    fout.writelines(lines)
+
+
+
+print(python_distance.save_chains(protein_out, '/tmp', 'test'))
 
 
 def get_raw_from_gesamt(strid):
diff --git a/utils/strip_mmcif.py b/utils/strip_mmcif.py
new file mode 100644
index 0000000..491d0d6
--- /dev/null
+++ b/utils/strip_mmcif.py
@@ -0,0 +1,10 @@
+import io
+import python_distance
+
+def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
+    for line in fin:
+        ...
+
+def test_stripping():
+    python_distance.distance("test", "test")
+
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index f0cacf8..8de70f4 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -93,7 +93,10 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
         cursor.execute(f'UPDATE proteinChain SET indexedAsDataObject = 0 WHERE intId IN ({ids_format})', int_ids)
 
         for chain_id in chain_ids:
-            (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+            try:
+                (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+            except FileNotFoundError:
+                pass  # the .bin chain might have never existed if gesamt was unable to read the cif file
 
         (Path(raw_dir) / dirpath / file).unlink()
 
@@ -102,16 +105,33 @@
 
 
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as f_in:
-        with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as f_out:
-            shutil.copyfileobj(f_in, f_out)
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        lines = []
+        for line in fin:
+            if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+                lines.append(line)
+        for i, line in enumerate(lines[:-1]):
+            if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+                lines[i] = None
+        lines = [line for line in lines if line is not None]
+        for i in range(1, len(lines)):
+            if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+                lines[i - 1] = None
+        lines = [line for line in lines if line is not None]
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)
 
 
 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
+    empty_cifs = 0
     file = Path(src_dir) / get_dir(filename) / filename
     dirname = get_dir(filename)
     pdb_id = file.name[:4].upper()
     results = python_distance.save_chains(str(file), str(Path(dest_dir) / dirname), pdb_id)
+    if not results:
+        empty_cifs += 1
+        print("No chains extracted from file: ", file)
+    print('empty_cifs', empty_cifs)
     return [(filename, f'{pdb_id}:{chain_id}', size) for chain_id, size in results]
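
A note on the approach in this first patch: the stripping is a line-prefix heuristic rather than a real mmCIF parse. It keeps the data block header, the _atom_site loop with its ATOM/HETATM records, and the '#' section separators; it then drops loop_ headers that do not introduce the _atom_site table and collapses runs of consecutive '#' lines. A self-contained sketch of the same heuristic (the strip_lines helper and the sample input are illustrative, not part of the patch):

    KEEP_PREFIXES = ('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')

    def strip_lines(cif_lines):
        # keep only the line families needed to read the structure
        lines = [line for line in cif_lines if line.startswith(KEEP_PREFIXES)]
        # drop loop_ headers that do not introduce the _atom_site table
        lines = [line for i, line in enumerate(lines)
                 if not (line.startswith('loop_') and i + 1 < len(lines)
                         and not lines[i + 1].startswith('_atom_site.group_PDB'))]
        # collapse runs of consecutive '#' section separators
        lines = [line for i, line in enumerate(lines)
                 if not (line.startswith('#') and i + 1 < len(lines)
                         and lines[i + 1].startswith('#'))]
        return lines

    sample = [
        'data_1ABC\n',
        'loop_\n',                 # dropped: the loop it opens is not _atom_site
        '_citation.id\n',          # dropped: matches no kept prefix
        '#\n',
        '#\n',                     # the first of the two separators is collapsed
        'loop_\n',
        '_atom_site.group_PDB\n',
        'ATOM 1 N N . MET\n',
        '#\n',
    ]
    print(''.join(strip_lines(sample)))

The same drop-then-filter passes appear twice in this patch (once in utils/get_stats.py, once in decompress_file); patch 4 eventually factors them into a single function.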
From c884ac71fd7a99dd950309d8ddff57e09acc1f81 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 17:02:10 +0100
Subject: [PATCH 2/4] improved stripping

---
 utils/get_stats.py             | 41 ++++++++++-----
 utils/update_binary_archive.py | 94 ++++++----------------------------
 2 files changed, 42 insertions(+), 93 deletions(-)

diff --git a/utils/get_stats.py b/utils/get_stats.py
index 650d969..857ce44 100644
--- a/utils/get_stats.py
+++ b/utils/get_stats.py
@@ -1,4 +1,5 @@
 import python_distance
+import gemmi
 
 protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
 protein = '/mnt/data/PDBe_raw/as/1asj.cif'
@@ -6,22 +7,34 @@
 
 protein_out = '/tmp/stripped.bin'
 
-with open(protein, 'r') as fin:
-    lines = []
-    for line in fin:
-        if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
-            lines.append(line)
-    for i, line in enumerate(lines[:-1]):
-        if line.startswith('loop_') and not lines[i+1].startswith('_atom_site.group_PDB'):
-            lines[i] = None
-    lines = [line for line in lines if line is not None]
-    for i in range(1, len(lines)):
-        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
-            lines[i-1] = None
-    lines = [line for line in lines if line is not None]
+
+def strip_file(filename):
+    with open(filename, 'r') as fin:
+        contents = fin.read()
+    doc = gemmi.cif.read_string(contents)
+    block = doc.sole_block()
+    pdb_id = block.find_pair('_struct.entry_id')
+    pdb_title = block.find_pair('_struct.title')
+
+    lines = []
+
+    for line in contents.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+              f"_struct.title {pdb_title}\n", '#\n')
+    return lines
 
 with open(protein_out, 'w') as fout:
-    fout.writelines(lines)
+    fout.writelines(strip_file(protein))
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index 8de70f4..0b7df3e 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -79,7 +79,6 @@ def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecut
 
 def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariadb.connection') -> None:
     cursor = conn.cursor()
-    print(files)
     for file in files:
         pdb_id = Path(file).with_suffix('').name.upper()
         cursor.execute('DELETE FROM protein WHERE pdbId = %s', (pdb_id,))
@@ -106,9 +105,19 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
 
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
     with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+
+        # parse the info we care about
+        doc = gemmi.cif.read_string(contents)
+        block = doc.sole_block()
+        pdb_id = block.find_pair('_struct.entry_id')
+        pdb_title = block.find_pair('_struct.title')
+
         lines = []
-        for line in fin:
-            if line.startswith(('data_', '_entry', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+
+        # strip everything else
+        for line in contents.splitlines(keepends=True):
+            if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
                 lines.append(line)
         for i, line in enumerate(lines[:-1]):
             if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
@@ -118,20 +127,18 @@ def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
             if lines[i].startswith('#') and lines[i - 1].startswith('#'):
                 lines[i - 1] = None
         lines = [line for line in lines if line is not None]
+
+        # put the info back in
+        lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+                  f"_struct.title {pdb_title}\n", '#\n')
     with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
         fout.writelines(lines)
 
 
 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
-    empty_cifs = 0
     file = Path(src_dir) / get_dir(filename) / filename
     dirname = get_dir(filename)
     pdb_id = file.name[:4].upper()
     results = python_distance.save_chains(str(file), str(Path(dest_dir) / dirname), pdb_id)
-    if not results:
-        empty_cifs += 1
-        print("No chains extracted from file: ", file)
-    print('empty_cifs', empty_cifs)
     return [(filename, f'{pdb_id}:{chain_id}', size) for chain_id, size in results]
 
@@ -157,7 +164,6 @@ def read_protein_title(filename: str) -> Tuple[str, Optional[str]]:
 def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str, conn: 'mariadb.connection',
                executor: ProcessPoolExecutor) -> None:
     cursor = conn.cursor()
-    print(files)
 
     # Decompress gzipped CIFs
     jobs = [executor.submit(decompress_file, filename, mirror_dir, raw_dir) for filename in files]
@@ -197,68 +203,6 @@ def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str,
     cursor.close()
 
 
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database  # todo shouldn't it be with binary?
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, default='/etc/protein_search.ini', help='File with configuration of DB')
@@ -266,19 +210,11 @@ def main():
     parser.add_argument('--binary-directory', type=str, required=True, help='Directory to store binaries')
     parser.add_argument('--raw-directory', type=str, required=True, help='Directory with uncompressed files')
     parser.add_argument('--workers', type=int, default=1, help='Number of workers ')
-    parser.add_argument('--consistency-check', type=bool, default=False, help='Should a consistency check with DB be performed')
 
     args = parser.parse_args()
 
     config = configparser.ConfigParser()
     config.read(args.config)
 
-    if args.consistency_check:
-        print("performing consistency check")
-        conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
-                               database=config['db']['database'])
-        consistency_check(args.raw_directory, conn)
-        return
-
     executor = ProcessPoolExecutor(args.workers)
 
     print('*** Updating directories ***')
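
Patch 2 pulls the entry id and title out with gemmi before stripping, and appends them back at the end so the information survives. One gemmi detail worth knowing here: Block.find_pair returns the whole (tag, value) pair, while Block.find_value returns just the value string, so interpolating the result of find_pair into "_struct.entry_id {pdb_id}" reproduces the tag inside the value. A minimal sketch of the value-only variant (illustrative, assumes a recent gemmi; not the code the patch uses):

    import gemmi

    def entry_id_and_title(cif_content: str):
        # sole_block() raises if the file has more than one data block
        block = gemmi.cif.read_string(cif_content).sole_block()
        pdb_id = block.find_value('_struct.entry_id')   # e.g. '2XHC', or None if absent
        pdb_title = block.find_value('_struct.title')   # raw value, may still be quoted
        return pdb_id, pdb_title

    def tail_lines(pdb_id, pdb_title):
        # the two pairs appended after the stripped atom records
        return [f'_struct.entry_id {pdb_id}\n', '#\n',
                f'_struct.title {pdb_title}\n', '#\n']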
From fb614b2620ca15c11aff52e5e9595c968c9aef48 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 17:27:36 +0100
Subject: [PATCH 3/4] safety check for too many removals

---
 utils/update_binary_archive.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index 0b7df3e..acd8135 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -229,11 +229,13 @@ def main():
     print(f'Removed files: {stats["removed"]}')
     print(f'Up-to-date files: {stats["ok"]}')
 
+    if len(removed_files) > 10000 or len(modified_files) > 10000:
+        print('Too many files to remove, aborting...')
+        exit(1)
     conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
                            database=config['db']['database'])
-
     print('*** Processing new entries ***')
     add_chains(new_files, args.mirror_directory, args.raw_directory, args.binary_directory, conn, executor)
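
The 10000 threshold added in patch 3 is a hard-coded safety valve: a botched mirror sync could otherwise delete most of the archive in one run. If the limit ever needs to vary per deployment, a configurable variant might look like this (hypothetical helper, not part of the patch):

    import sys

    def check_change_budget(removed_files, modified_files, max_changes=10000):
        # refuse to proceed when the mirror diff looks implausibly large
        if len(removed_files) > max_changes or len(modified_files) > max_changes:
            print(f'Too many changes: {len(removed_files)} removed, '
                  f'{len(modified_files)} modified (limit {max_changes}), aborting...')
            sys.exit(1)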
From 4d483994ddb46f86bc556b53f8b4d5b469e50587 Mon Sep 17 00:00:00 2001
From: Jakub Orsula
Date: Fri, 24 Nov 2023 21:43:44 +0100
Subject: [PATCH 4/4] used the strip version and full contents for comparison

---
 utils/update_binary_archive.py | 63 ++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index acd8135..2a05ccb 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -33,10 +33,17 @@ def is_updated(filename: str, mirror_dir: str, raw_dir: str) -> Tuple[str, bool]
     gzip_path = Path(mirror_dir) / get_dir(filename) / f'{filename}.gz'
     raw_path = Path(raw_dir) / get_dir(filename) / filename
     with gzip.open(gzip_path, 'rt') as f_gzip, open(raw_path, 'r') as f_raw:
-        if f_gzip.read() != f_raw.read():
-            return filename, True
+        gzip_contents = f_gzip.read()
+        raw_contents = f_raw.read()
+        stripped = strip_cif(gzip_contents)
+        stripped = ''.join(stripped)
+        # this condition is troublesome
+        # the first part is present just for legacy bins
+        # if all old bins are regenerated, it can be dropped
+        if gzip_contents == raw_contents or stripped == raw_contents:
+            return filename, False
 
-    return filename, False
+    return filename, True
 
 
 def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecutor) -> Tuple[
@@ -103,33 +110,37 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
     cursor.close()
 
 
-def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
-        contents = fin.read()
-
-        # parse the info we care about
-        doc = gemmi.cif.read_string(contents)
-        block = doc.sole_block()
-        pdb_id = block.find_pair('_struct.entry_id')
-        pdb_title = block.find_pair('_struct.title')
-
-        lines = []
-
-        # strip everything else
-        for line in contents.splitlines(keepends=True):
-            if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
-                lines.append(line)
-        for i, line in enumerate(lines[:-1]):
-            if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
-                lines[i] = None
-        lines = [line for line in lines if line is not None]
-        for i in range(1, len(lines)):
-            if lines[i].startswith('#') and lines[i - 1].startswith('#'):
-                lines[i - 1] = None
-        lines = [line for line in lines if line is not None]
-
-        # put the info back in
-        lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
-                  f"_struct.title {pdb_title}\n", '#\n')
-    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
-        fout.writelines(lines)
+def strip_cif(cif_content: str):
+    # parse the info we care about
+    doc = gemmi.cif.read_string(cif_content)
+    block = doc.sole_block()
+    pdb_id = block.find_pair('_struct.entry_id')
+    pdb_title = block.find_pair('_struct.title')
+
+    lines = []
+
+    # strip everything else
+    for line in cif_content.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    # put the info back in
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n'
+              f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+
+def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+    lines = strip_cif(contents)
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)
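
After patch 4, a raw file counts as up to date in two cases: legacy files that still hold the full decompressed CIF, and newer files that hold the stripped version written by decompress_file. Since is_updated recomputes strip_cif on the same gzipped input that decompress_file consumed, a freshly written raw file must never be flagged as modified. A regression test sketch for that round trip (hypothetical pytest and fixture path; assumes the functions are importable, and that get_dir('2xhc.cif') yields 'xh' as the example paths suggest):

    import gzip
    from pathlib import Path

    from utils.update_binary_archive import decompress_file, is_updated

    def test_fresh_raw_file_is_up_to_date(tmp_path):
        # layout mirrors PDBe_raw: <dir>/xh/2xhc.cif(.gz)
        mirror, raw = tmp_path / 'mirror', tmp_path / 'raw'
        (mirror / 'xh').mkdir(parents=True)
        (raw / 'xh').mkdir(parents=True)
        with gzip.open(mirror / 'xh' / '2xhc.cif.gz', 'wt') as f:
            f.write(Path('tests/data/2xhc.cif').read_text())  # hypothetical fixture

        decompress_file('2xhc.cif', str(mirror), str(raw))
        filename, updated = is_updated('2xhc.cif', str(mirror), str(raw))
        assert filename == '2xhc.cif'
        assert not updated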
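
One loose end: the strip_stream stub added to utils/strip_mmcif.py in patch 1 is never filled in by this series. With the final strip_cif in place, one possible completion plus a concrete test body (illustrative only; assumes strip_cif is importable from utils.update_binary_archive):

    import io

    from utils.update_binary_archive import strip_cif

    def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
        # delegate to the shared strip_cif() so the two code paths cannot drift apart
        fout.writelines(strip_cif(fin.read()))

    def test_stripping(tmp_path):
        src = tmp_path / '2xhc.cif'
        src.write_text('data_2XHC\n#\nloop_\n_atom_site.group_PDB\nATOM 1 N N . MET\n#\n')
        dst = tmp_path / '2xhc.stripped.cif'
        with open(src) as fin, open(dst, 'w') as fout:
            strip_stream(fin, fout)
        assert dst.read_text().startswith('data_2XHC')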