diff --git a/app/static/attributions.html b/app/static/attributions.html
new file mode 100644
index 0000000..445aafb
--- /dev/null
+++ b/app/static/attributions.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>Project Contributors</title>
+</head>
+<body>
+
+<h1>Project Contributors</h1>
+
+<!-- Placeholder card for a contributor with an associated publication -->
+<div>
+    <h2>Contributor Name</h2>
+    <p>
+        Description of the role and the work done, for example development of
+        the search algorithms: binary sketches in the Hamming space, the index,
+        and the PPP-codes of David Novák and Pavel Zezula.
+    </p>
+    <a href="#">Relevant Paper Title</a>
+</div>
+
+<!-- Placeholder card for a contributor without a publication -->
+<div>
+    <h2>Contributor Name</h2>
+    <p>Brief description of their work within the project.</p>
+</div>
+
+<h2>References</h2>
+<ul>
+</ul>
+
+</body>
+</html>
diff --git a/app/templates/index.html b/app/templates/index.html
index 6a4e985..982eb76 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -53,6 +53,7 @@
     Search for the most similar protein chains to a given query chain.
+    See the attributions page for information about the project authors.
     Indexed {{ chain_count }} chains from {{ protein_count }} proteins downloaded from PDBe.
     Last update: {{ updated }}.
@@ -291,6 +292,8 @@
     Detected chains
     {% if selected %}
     of
+
+
 {% endblock body %}
diff --git a/utils/get_stats.py b/utils/get_stats.py
index 162e5ac..857ce44 100644
--- a/utils/get_stats.py
+++ b/utils/get_stats.py
@@ -1,8 +1,44 @@
 import python_distance
+import gemmi

-protein = "/mnt/data-ssd/PDBe_raw/xh/2xhc.cif"
+protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
+protein = '/mnt/data/PDBe_raw/as/1asj.cif'
+protein = '/mnt/data/PDBe_raw/pc/3pcc.cif'

-print(python_distance.save_chains(f'/mnt/data-ssd/PDBe_raw/{protein}.cif', '/tmp', 'test'))
+protein_out = '/tmp/stripped.bin'
+
+
+def strip_file(filename):
+    with open(filename, 'r') as fin:
+        contents = fin.read()
+    doc = gemmi.cif.read_string(contents)
+    block = doc.sole_block()
+    pdb_id = block.find_value('_struct.entry_id')
+    pdb_title = block.find_value('_struct.title')
+
+    lines = []
+
+    for line in contents.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n', f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+
+with open(protein_out, 'w') as fout:
+    fout.writelines(strip_file(protein))
+
+
+print(python_distance.save_chains(protein_out, '/tmp', 'test'))


 def get_raw_from_gesamt(strid):
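For context, strip_file() keeps only the data_ header, the # section separators, and the _atom_site loop with its ATOM/HETATM records, then re-appends the entry id and title parsed by gemmi. A minimal illustration of the first filtering pass on an invented miniature mmCIF (only the prefix filter is shown; the gemmi round-trip and the two clean-up passes are omitted):

# Hypothetical miniature mmCIF, invented only to illustrate the line filter.
MINI_CIF = """\
data_1ABC
#
_struct.entry_id   1ABC
#
loop_
_citation.id
primary
#
loop_
_atom_site.group_PDB
_atom_site.id
ATOM 1
HETATM 2
#
"""

KEEP_PREFIXES = ('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')

# First pass: keep only lines that can belong to the atom records.
kept = [line for line in MINI_CIF.splitlines(keepends=True)
        if line.startswith(KEEP_PREFIXES)]
print(''.join(kept), end='')

On this input the first pass leaves behind the dangling loop_ of the removed _citation loop and a doubled # separator; the second and third passes in strip_file() exist precisely to delete those.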
diff --git a/utils/strip_mmcif.py b/utils/strip_mmcif.py
new file mode 100644
index 0000000..491d0d6
--- /dev/null
+++ b/utils/strip_mmcif.py
@@ -0,0 +1,10 @@
+import io
+import python_distance
+
+def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
+    for line in fin:
+        ...
+
+def test_stripping():
+    python_distance.distance("test", "test")
+
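strip_stream() is committed here as a stub. If it is intended as a streaming counterpart of the stripping logic above, one possible shape is sketched below; the one-line lookahead is an assumption rather than the committed design, and re-appending _struct.entry_id and _struct.title is deliberately left out, since recovering them requires a parsed document rather than a stream.

import io

KEEP_PREFIXES = ('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')


def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
    # Buffer one kept line: a 'loop_' header is written only once the next
    # kept line proves it opens the _atom_site loop, and consecutive '#'
    # separators collapse into a single one.
    pending = None
    for line in fin:
        if not line.startswith(KEEP_PREFIXES):
            continue
        if pending is not None:
            dangling_loop = (pending.startswith('loop_')
                             and not line.startswith('_atom_site.group_PDB'))
            doubled_hash = pending.startswith('#') and line.startswith('#')
            if not (dangling_loop or doubled_hash):
                fout.write(pending)
        pending = line
    # Flush the final buffered line unless it is a dangling 'loop_'.
    if pending is not None and not pending.startswith('loop_'):
        fout.write(pending)

The gain over the in-memory version is constant memory use on large CIFs; the cost is that the entry id and title cannot be restored in the same pass.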
diff --git a/utils/update_binary_archive.py b/utils/update_binary_archive.py
index f0cacf8..2a05ccb 100644
--- a/utils/update_binary_archive.py
+++ b/utils/update_binary_archive.py
@@ -33,10 +33,17 @@ def is_updated(filename: str, mirror_dir: str, raw_dir: str) -> Tuple[str, bool]
     gzip_path = Path(mirror_dir) / get_dir(filename) / f'{filename}.gz'
     raw_path = Path(raw_dir) / get_dir(filename) / filename
     with gzip.open(gzip_path, 'rt') as f_gzip, open(raw_path, 'r') as f_raw:
-        if f_gzip.read() != f_raw.read():
-            return filename, True
+        gzip_contents = f_gzip.read()
+        raw_contents = f_raw.read()
+        stripped = strip_cif(gzip_contents)
+        stripped = ''.join(stripped)
+        # this first comparison is kept only for legacy raw files written
+        # before stripping was introduced; once all old files have been
+        # regenerated, it can be dropped
+        if gzip_contents == raw_contents or stripped == raw_contents:
+            return filename, False

-    return filename, False
+    return filename, True


 def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecutor) -> Tuple[
@@ -79,7 +86,6 @@
 def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariadb.connection') -> None:
     cursor = conn.cursor()
-    print(files)
     for file in files:
         pdb_id = Path(file).with_suffix('').name.upper()
         cursor.execute('DELETE FROM protein WHERE pdbId = %s', (pdb_id,))
@@ -93,7 +99,10 @@
     cursor.execute(f'UPDATE proteinChain SET indexedAsDataObject = 0 WHERE intId IN ({ids_format})', int_ids)

     for chain_id in chain_ids:
-        (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+        try:
+            (Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
+        except FileNotFoundError:
+            pass  # the .bin might never have existed if gesamt was unable to read the CIF file

     (Path(raw_dir) / dirpath / file).unlink()

@@ -101,10 +110,39 @@
     cursor.close()


+def strip_cif(cif_content: str):
+    # parse the entry id and title we want to keep (find_value returns just the value)
+    doc = gemmi.cif.read_string(cif_content)
+    block = doc.sole_block()
+    pdb_id = block.find_value('_struct.entry_id')
+    pdb_title = block.find_value('_struct.title')
+
+    lines = []
+
+    # strip everything except the atom records and the mmCIF skeleton around them
+    for line in cif_content.splitlines(keepends=True):
+        if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
+            lines.append(line)
+    for i, line in enumerate(lines[:-1]):
+        if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
+            lines[i] = None
+    lines = [line for line in lines if line is not None]
+    for i in range(1, len(lines)):
+        if lines[i].startswith('#') and lines[i - 1].startswith('#'):
+            lines[i - 1] = None
+    lines = [line for line in lines if line is not None]
+
+    # put the parsed values back in
+    lines += (f"_struct.entry_id {pdb_id}\n", '#\n', f"_struct.title {pdb_title}\n", '#\n')
+    return lines
+
+
 def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
-    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as f_in:
-        with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as f_out:
-            shutil.copyfileobj(f_in, f_out)
+    with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
+        contents = fin.read()
+    lines = strip_cif(contents)
+    with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
+        fout.writelines(lines)


 def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
@@ -137,7 +175,6 @@ def read_protein_title(filename: str) -> Tuple[str, Optional[str]]:

 def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str, conn: 'mariadb.connection',
                executor: ProcessPoolExecutor) -> None:
     cursor = conn.cursor()
-    print(files)

     # Decompress gzipped CIFs
     jobs = [executor.submit(decompress_file, filename, mirror_dir, raw_dir) for filename in files]
@@ -177,68 +214,6 @@
     cursor.close()


-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database # todo shouldn't it be with binary?
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
-def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
-    '''
-    performs a consistency check between raw directory and database
-    '''
-    gesamt_ids = set()
-    num_top_level_folders = len(
-        [name for name in os.listdir(Path(raw_dir))])
-
-    with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
-        for dirpath, _, fnames in os.walk(Path(raw_dir)):
-            for filename in fnames:
-                file = Path(raw_dir) / get_dir(filename) / filename
-                pdb_id = file.name[:4].upper()
-                gesamt_ids.add(pdb_id)
-            pbar.update(1)
-
-    cur = conn.cursor()
-    cur.execute("select gesamtId from proteinChain")
-    gesamt_ids_db = set()
-    for gid in cur:
-        gid = gid[0]
-        gesamt_ids_db.add(gid.split(':')[0])
-
-    diff = gesamt_ids - gesamt_ids_db
-    print(gesamt_ids - gesamt_ids_db)
-    print(f"ids in fs {len(gesamt_ids)}")
-    print(f"ids in db {len(gesamt_ids_db)}")
-    print(f"got {len(diff)} more ids in the raw_dir than db")
-    print("Consistency check for raw directories failed")
-
-
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, default='/etc/protein_search.ini', help='File with configuration of DB')
@@ -246,19 +221,11 @@
     parser.add_argument('--binary-directory', type=str, required=True, help='Directory to store binaries')
     parser.add_argument('--raw-directory', type=str, required=True, help='Directory with uncompressed files')
     parser.add_argument('--workers', type=int, default=1, help='Number of workers')
-    parser.add_argument('--consistency-check', type=bool, default=False, help='Should a consistency check with DB be performed')
     args = parser.parse_args()

     config = configparser.ConfigParser()
     config.read(args.config)

-    if args.consistency_check:
-        print("performing consistency check")
-        conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
-                               database=config['db']['database'])
-        consistency_check(args.raw_directory, conn)
-        return
-
     executor = ProcessPoolExecutor(args.workers)

     print('*** Updating directories ***')
@@ -273,11 +240,13 @@
     print(f'Removed files: {stats["removed"]}')
     print(f'Up-to-date files: {stats["ok"]}')

+    if len(removed_files) > 10000 or len(modified_files) > 10000:
+        print('Too many files to remove or update, aborting...')
+        exit(1)

     conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
                            database=config['db']['database'])
-
     print('*** Processing new entries ***')
     add_chains(new_files, args.mirror_directory, args.raw_directory, args.binary_directory, conn, executor)
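After this change, is_updated() reports a file as current when the raw copy matches either the mirror copy verbatim (the legacy case) or the stripped mirror copy. That only works if strip_cif() is stable, i.e. stripping an already-stripped file reproduces it byte for byte; otherwise every entry would be flagged as modified on each run. A test sketch pinning this down (the miniature CIF is invented and the import path is assumed):

from update_binary_archive import strip_cif  # assumed import path

# Invented miniature mmCIF containing the fields strip_cif() cares about.
MINI_CIF = """\
data_1ABC
#
_struct.entry_id   1ABC
#
_struct.title      'An example structure'
#
loop_
_atom_site.group_PDB
_atom_site.id
ATOM 1
HETATM 2
#
"""


def test_strip_cif_is_stable():
    once = ''.join(strip_cif(MINI_CIF))
    twice = ''.join(strip_cif(once))
    # Stripping an already-stripped file must be a no-op, otherwise
    # is_updated() would keep reporting unchanged entries as modified.
    assert once == twice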