diff --git a/.gitignore b/.gitignore index 96058e0..bfe71ae 100644 --- a/.gitignore +++ b/.gitignore @@ -119,3 +119,4 @@ test/file/.DS_Store .DS_Store ALLCools/_version.py schicluster/_version.py +*.scool diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz new file mode 100644 index 0000000..3d5a569 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz new file mode 100644 index 0000000..ec99498 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz new file mode 100644 index 0000000..e9afd99 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz new file mode 100644 index 0000000..c505d9e Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz new file mode 100644 index 0000000..4153194 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz new file mode 100644 index 0000000..31d2a37 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz new file mode 100644 index 0000000..203e0cf Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz new file mode 100644 index 0000000..f764795 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz new file mode 100644 index 0000000..e042774 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz new file mode 100644 index 0000000..0b6c38e Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz differ diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz new file mode 100644 index 0000000..eee43d9 Binary files /dev/null and b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz differ diff --git a/example/contacts/contact_table.txt b/example/contacts/contact_table.txt new file mode 100644 index 0000000..8824700 --- /dev/null +++ b/example/contacts/contact_table.txt @@ -0,0 +1,11 @@ +CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006 CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007 CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008 CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010 CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012 CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz +CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010 CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz \ No newline at end of file diff --git a/example/contacts/mm10.main.nochrM.chrom.sizes b/example/contacts/mm10.main.nochrM.chrom.sizes new file mode 100644 index 0000000..fcc0e40 --- /dev/null +++ b/example/contacts/mm10.main.nochrM.chrom.sizes @@ -0,0 +1,21 @@ +chr1 195471971 +chr2 182113224 +chr3 160039680 +chr4 156508116 +chr5 151834684 +chr6 149736546 +chr7 145441459 +chr8 129401213 +chr9 124595110 +chr10 130694993 +chr11 122082543 +chr12 120129022 +chr13 120421639 +chr14 124902244 +chr15 104043685 +chr16 98207768 +chr17 94987271 +chr18 90702639 +chr19 61431566 +chrX 171031299 +chrY 91744698 diff --git a/example/contacts/test_scool.sh b/example/contacts/test_scool.sh new file mode 100644 index 0000000..03c05c8 --- /dev/null +++ b/example/contacts/test_scool.sh @@ -0,0 +1 @@ +hicluster scool --contacts_table contact_table.txt --output_prefix test.scool --chrom_size_path mm10.main.nochrM.chrom.sizes --resolutions 10000 100000 --cpu 4 --batch_n 2 diff --git a/schicluster/__main__.py b/schicluster/__main__.py index ac1fc34..60c5153 100644 --- a/schicluster/__main__.py +++ b/schicluster/__main__.py @@ -276,6 +276,32 @@ def merge_cell_register_subparser(subparser): return +def generate_scool_register_subparser(subparser): + parser = subparser.add_parser('generate-scool', + aliases=['scool'], + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + help="") + + parser_req = parser.add_argument_group("required arguments") + parser_req.add_argument('--contacts_table', type=str, required=True, default=None, + help='tab-separated table containing tow columns, 1) cell id, ' + '2) cell contact file path (juicer-pre format). No header') + parser_req.add_argument('--output_prefix', type=str, required=True, default=None, + help='Output prefix of the scool files. ' + 'Output path will be {output_prefix}.{resolution_str}.scool') + parser_req.add_argument('--chrom_size_path', type=str, required=True, default=None, + help='Path to the chromosome size file, this file should be in UCSC chrom.sizes format. ' + 'We will use this file as the reference to create matrix. ' + 'It is recommended to remove small contigs or chrM from this file.') + parser_req.add_argument('--resolutions', type=int, nargs='+', required=True, default=[], + help='Resolutions to generate the matrix. ' + 'You can provide multiple resolutions separated by space, ' + '(e.g., "--resolutions 10000 100000"). ' + 'Each resolution will be stored in a separate file.') + parser.add_argument('--cpu', type=int, default=1, help='number of cpus to parallel.') + parser.add_argument('--batch_n', type=int, default=50, help='number of cells to deal with in each cpu process.') + + def main(): parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, @@ -329,6 +355,8 @@ def main(): from .dev.loop_sc import loop_sc as func elif cur_command in ['merge-cell']: from .dev.merge_cell import merge_cell as func + elif cur_command in ['generate-scool', 'scool']: + from .scool import generate_scool as func else: log.debug(f'{cur_command} is not an valid sub-command') parser.parse_args(["-h"]) diff --git a/schicluster/scool/__init__.py b/schicluster/scool/__init__.py new file mode 100644 index 0000000..1e9e633 --- /dev/null +++ b/schicluster/scool/__init__.py @@ -0,0 +1,177 @@ +import pandas as pd +from cooler import create_scool +from collections import Counter, defaultdict +from concurrent.futures import ProcessPoolExecutor, as_completed +import warnings +import subprocess +import re + + +def get_chrom_offsets(chrom_sizes_series, resolution): + cur_offset = 0 + chrom_offset = {} + for chrom, length in chrom_sizes_series.items(): + chrom_offset[chrom] = cur_offset + n_bins = length // resolution + 1 * (length % resolution != 0) + cur_offset += n_bins + return chrom_offset + + +def generate_bins_df_from_chrom_sizes(chrom_sizes, resolution): + bins_dfs = [] + for chrom, length in chrom_sizes.items(): + chrom_bins_df = pd.DataFrame({ + 'start': + list(range(0, length, resolution)), + 'end': + list(range(resolution, length + resolution, resolution)) + }) + chrom_bins_df.iloc[-1, 1] = length + chrom_bins_df['chrom'] = chrom + bins_dfs.append(chrom_bins_df) + bins_df = pd.concat(bins_dfs) + bins_df = bins_df[['chrom', 'start', + 'end']].sort_values(['chrom', + 'start']).reset_index(drop=True) + return bins_df + + +def generate_scool_batch_data(cell_path_dict, resolution, chrom_offset, + output_path): + def single_cell_pixel(path): + contacts = pd.read_csv(path, sep='\t', header=None) + + # filter + contacts = contacts[contacts[1].isin(chrom_offset) + & contacts[5].isin(chrom_offset) + & (contacts[2] > 0) + & (contacts[6] > 0)] + + # calculate pixel + bin1_id = contacts[1].map(chrom_offset) + (contacts[2] - + 1) // resolution + bin2_id = contacts[5].map(chrom_offset) + (contacts[6] - + 1) // resolution + counter = Counter() + for x, y in zip(bin1_id, bin2_id): + if x > y: + x, y = y, x + counter[(x, y)] += 1 + pixel_df = pd.Series(counter).reset_index() + pixel_df.columns = pd.Index(['bin1_id', 'bin2_id', 'count']) + pixel_df = pixel_df.sort_values(['bin1_id', + 'bin2_id']).reset_index(drop=True) + return pixel_df + + with warnings.catch_warnings(): + # ignore the + warnings.simplefilter("ignore") + with pd.HDFStore(output_path) as hdf: + for cell_id, path in cell_path_dict.items(): + hdf[cell_id] = single_cell_pixel(path) + return + + +def generate_scool_single_resolution(cell_path_dict, + chrom_size_path, + resolution, + output_path, + batch_n=20, + cpu=1): + # parse chromosome sizes, prepare bin_df + chrom_sizes = pd.read_csv(chrom_size_path, + header=None, + index_col=0, + sep='\t', + squeeze=True).sort_index() + chrom_offset = get_chrom_offsets(chrom_sizes, resolution) + bins_df = generate_bins_df_from_chrom_sizes(chrom_sizes, resolution) + + chunk_dicts = defaultdict(dict) + for i, (cell, path) in enumerate(cell_path_dict.items()): + chunk_dicts[i // batch_n][cell] = path + + with ProcessPoolExecutor(cpu) as exe: + futures = {} + for batch, cell_path_dict in chunk_dicts.items(): + batch_output = output_path + f'_{batch}' + f = exe.submit(generate_scool_batch_data, + cell_path_dict=cell_path_dict, + resolution=resolution, + chrom_offset=chrom_offset, + output_path=batch_output) + futures[f] = batch_output + + for future in as_completed(futures): + # batch finished + batch_output = futures[future] + future.result() + + # dump batch result into scool + cell_pixel_dict = {} + with pd.HDFStore(batch_output, mode='r') as hdf: + for cell_id in hdf.keys(): + cell_id = cell_id[1:] # remove '/' + cell_pixel_dict[cell_id] = hdf[cell_id] + create_scool(output_path, + bins=bins_df, + cell_name_pixels_dict=cell_pixel_dict, + ordered=True, + mode='a') + subprocess.run(['rm', '-f', batch_output], check=True) + return + + +def generate_scool(contacts_table, + output_prefix, + chrom_size_path, + resolutions, + cpu=1, + batch_n=50): + """ + Generate single-resolution scool files from single-cell contact files recorded in contacts_table + + Parameters + ---------- + contacts_table + tab-separated table containing tow columns, 1) cell id, 2) cell contact file path (juicer-pre format) + No header + output_prefix + Output prefix of the scool files. Output path will be {output_prefix}.{resolution_str}.scool + chrom_size_path + Path to the chromosome size file, this file should be in UCSC chrom.sizes format. We will use this file as + the reference to create matrix. It is recommended to remove small contigs or chrM from this file. + resolutions + Resolutions to generate the matrix. Each resolution will be stored in a separate file. + cpu + number of cpus to parallel. + batch_n + number of cells to deal with in each cpu process. + + Returns + ------- + + """ + # read cell paths + cell_path_dict = pd.read_csv(contacts_table, + header=None, + index_col=0, + squeeze=True, + sep='\t').to_dict() + print(f'{len(cell_path_dict)} cells to process.') + + for resolution in resolutions: + resolution_str = str(resolution) + resolution_str = re.sub(r'000000$', 'M', resolution_str) + resolution_str = re.sub(r'000$', 'K', resolution_str) + output_path = f'{output_prefix}.{resolution_str}.scool' + + print('Generating', output_path) + generate_scool_single_resolution(cell_path_dict=cell_path_dict, + chrom_size_path=chrom_size_path, + resolution=resolution, + output_path=output_path, + batch_n=batch_n, + cpu=cpu) + print('Finished', output_path) + return diff --git a/setup.py b/setup.py index 1b9f6dc..96c6caa 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ long_description_content_type='text/markdown', url='https://github.com/zhoujt1994/scHiCluster', include_package_data=True, - install_requires=['numpy', 'scipy', 'scikit-learn', 'h5py', 'opencv-python'], + install_requires=['numpy', 'scipy', 'scikit-learn', 'h5py', 'opencv-python', 'tables', 'cooler'], entry_points={ 'console_scripts': ['hicluster=schicluster.__main__:main'], }