Merge pull request #4 from lhqing/master

add CLI "hicluster scool"
zhoujt1994 · Mar 15, 2021 · caf58c5 · caf58c5
2 parents 7f2a556 + 01d5e9e
commit caf58c5
Show file tree

Hide file tree

Showing 18 changed files with 240 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -119,3 +119,4 @@ test/file/.DS_Store
 .DS_Store
 ALLCools/_version.py
 schicluster/_version.py
+*.scool
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz
diff --git a/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz b/example/contacts/CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz
diff --git a/example/contacts/contact_table.txt b/example/contacts/contact_table.txt
@@ -0,0 +1,11 @@
+CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006	CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD006.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007	CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD007.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008	CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD008.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010	CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD010.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012	CEMBA191126_9J_1-CEMBA191126_9J_2-A10-AD012.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD001.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD002.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD004.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD006.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD008.3C.contact.tsv.gz
+CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010	CEMBA191126_9J_1-CEMBA191126_9J_2-A11-AD010.3C.contact.tsv.gz
diff --git a/example/contacts/mm10.main.nochrM.chrom.sizes b/example/contacts/mm10.main.nochrM.chrom.sizes
@@ -0,0 +1,21 @@
+chr1	195471971
+chr2	182113224
+chr3	160039680
+chr4	156508116
+chr5	151834684
+chr6	149736546
+chr7	145441459
+chr8	129401213
+chr9	124595110
+chr10	130694993
+chr11	122082543
+chr12	120129022
+chr13	120421639
+chr14	124902244
+chr15	104043685
+chr16	98207768
+chr17	94987271
+chr18	90702639
+chr19	61431566
+chrX	171031299
+chrY	91744698
diff --git a/example/contacts/test_scool.sh b/example/contacts/test_scool.sh
@@ -0,0 +1 @@
+hicluster scool --contacts_table contact_table.txt --output_prefix test.scool --chrom_size_path mm10.main.nochrM.chrom.sizes --resolutions 10000 100000 --cpu 4 --batch_n 2
diff --git a/schicluster/__main__.py b/schicluster/__main__.py
@@ -276,6 +276,32 @@ def merge_cell_register_subparser(subparser):
     return
 
 
+def generate_scool_register_subparser(subparser):
+    parser = subparser.add_parser('generate-scool',
+                                  aliases=['scool'],
+                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                  help="")
+
+    parser_req = parser.add_argument_group("required arguments")
+    parser_req.add_argument('--contacts_table', type=str, required=True, default=None,
+                            help='tab-separated table containing tow columns, 1) cell id, '
+                                 '2) cell contact file path (juicer-pre format). No header')
+    parser_req.add_argument('--output_prefix', type=str, required=True, default=None,
+                            help='Output prefix of the scool files. '
+                                 'Output path will be {output_prefix}.{resolution_str}.scool')
+    parser_req.add_argument('--chrom_size_path', type=str, required=True, default=None,
+                            help='Path to the chromosome size file, this file should be in UCSC chrom.sizes format. '
+                                 'We will use this file as the reference to create matrix. '
+                                 'It is recommended to remove small contigs or chrM from this file.')
+    parser_req.add_argument('--resolutions', type=int, nargs='+', required=True, default=[],
+                            help='Resolutions to generate the matrix. '
+                                 'You can provide multiple resolutions separated by space, '
+                                 '(e.g., "--resolutions 10000 100000"). '
+                                 'Each resolution will be stored in a separate file.')
+    parser.add_argument('--cpu', type=int, default=1, help='number of cpus to parallel.')
+    parser.add_argument('--batch_n', type=int, default=50, help='number of cells to deal with in each cpu process.')
+
+
 def main():
     parser = argparse.ArgumentParser(description=DESCRIPTION,
                                      epilog=EPILOG,
@@ -329,6 +355,8 @@ def main():
         from .dev.loop_sc import loop_sc as func
     elif cur_command in ['merge-cell']:
         from .dev.merge_cell import merge_cell as func
+    elif cur_command in ['generate-scool', 'scool']:
+        from .scool import generate_scool as func
     else:
         log.debug(f'{cur_command} is not an valid sub-command')
         parser.parse_args(["-h"])

diff --git a/schicluster/scool/__init__.py b/schicluster/scool/__init__.py
@@ -0,0 +1,177 @@
+import pandas as pd
+from cooler import create_scool
+from collections import Counter, defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import warnings
+import subprocess
+import re
+
+
+def get_chrom_offsets(chrom_sizes_series, resolution):
+    cur_offset = 0
+    chrom_offset = {}
+    for chrom, length in chrom_sizes_series.items():
+        chrom_offset[chrom] = cur_offset
+        n_bins = length // resolution + 1 * (length % resolution != 0)
+        cur_offset += n_bins
+    return chrom_offset
+
+
+def generate_bins_df_from_chrom_sizes(chrom_sizes, resolution):
+    bins_dfs = []
+    for chrom, length in chrom_sizes.items():
+        chrom_bins_df = pd.DataFrame({
+            'start':
+                list(range(0, length, resolution)),
+            'end':
+                list(range(resolution, length + resolution, resolution))
+        })
+        chrom_bins_df.iloc[-1, 1] = length
+        chrom_bins_df['chrom'] = chrom
+        bins_dfs.append(chrom_bins_df)
+    bins_df = pd.concat(bins_dfs)
+    bins_df = bins_df[['chrom', 'start',
+                       'end']].sort_values(['chrom',
+                                            'start']).reset_index(drop=True)
+    return bins_df
+
+
+def generate_scool_batch_data(cell_path_dict, resolution, chrom_offset,
+                              output_path):
+    def single_cell_pixel(path):
+        contacts = pd.read_csv(path, sep='\t', header=None)
+
+        # filter
+        contacts = contacts[contacts[1].isin(chrom_offset)
+                            & contacts[5].isin(chrom_offset)
+                            & (contacts[2] > 0)
+                            & (contacts[6] > 0)]
+
+        # calculate pixel
+        bin1_id = contacts[1].map(chrom_offset) + (contacts[2] -
+                                                   1) // resolution
+        bin2_id = contacts[5].map(chrom_offset) + (contacts[6] -
+                                                   1) // resolution
+        counter = Counter()
+        for x, y in zip(bin1_id, bin2_id):
+            if x > y:
+                x, y = y, x
+            counter[(x, y)] += 1
+        pixel_df = pd.Series(counter).reset_index()
+        pixel_df.columns = pd.Index(['bin1_id', 'bin2_id', 'count'])
+        pixel_df = pixel_df.sort_values(['bin1_id',
+                                         'bin2_id']).reset_index(drop=True)
+        return pixel_df
+
+    with warnings.catch_warnings():
+        # ignore the
+        warnings.simplefilter("ignore")
+        with pd.HDFStore(output_path) as hdf:
+            for cell_id, path in cell_path_dict.items():
+                hdf[cell_id] = single_cell_pixel(path)
+    return
+
+
+def generate_scool_single_resolution(cell_path_dict,
+                                     chrom_size_path,
+                                     resolution,
+                                     output_path,
+                                     batch_n=20,
+                                     cpu=1):
+    # parse chromosome sizes, prepare bin_df
+    chrom_sizes = pd.read_csv(chrom_size_path,
+                              header=None,
+                              index_col=0,
+                              sep='\t',
+                              squeeze=True).sort_index()
+    chrom_offset = get_chrom_offsets(chrom_sizes, resolution)
+    bins_df = generate_bins_df_from_chrom_sizes(chrom_sizes, resolution)
+
+    chunk_dicts = defaultdict(dict)
+    for i, (cell, path) in enumerate(cell_path_dict.items()):
+        chunk_dicts[i // batch_n][cell] = path
+
+    with ProcessPoolExecutor(cpu) as exe:
+        futures = {}
+        for batch, cell_path_dict in chunk_dicts.items():
+            batch_output = output_path + f'_{batch}'
+            f = exe.submit(generate_scool_batch_data,
+                           cell_path_dict=cell_path_dict,
+                           resolution=resolution,
+                           chrom_offset=chrom_offset,
+                           output_path=batch_output)
+            futures[f] = batch_output
+
+        for future in as_completed(futures):
+            # batch finished
+            batch_output = futures[future]
+            future.result()
+
+            # dump batch result into scool
+            cell_pixel_dict = {}
+            with pd.HDFStore(batch_output, mode='r') as hdf:
+                for cell_id in hdf.keys():
+                    cell_id = cell_id[1:]  # remove '/'
+                    cell_pixel_dict[cell_id] = hdf[cell_id]
+            create_scool(output_path,
+                         bins=bins_df,
+                         cell_name_pixels_dict=cell_pixel_dict,
+                         ordered=True,
+                         mode='a')
+            subprocess.run(['rm', '-f', batch_output], check=True)
+    return
+
+
+def generate_scool(contacts_table,
+                   output_prefix,
+                   chrom_size_path,
+                   resolutions,
+                   cpu=1,
+                   batch_n=50):
+    """
+    Generate single-resolution scool files from single-cell contact files recorded in contacts_table
+
+    Parameters
+    ----------
+    contacts_table
+        tab-separated table containing tow columns, 1) cell id, 2) cell contact file path (juicer-pre format)
+        No header
+    output_prefix
+        Output prefix of the scool files. Output path will be {output_prefix}.{resolution_str}.scool
+    chrom_size_path
+        Path to the chromosome size file, this file should be in UCSC chrom.sizes format. We will use this file as
+        the reference to create matrix. It is recommended to remove small contigs or chrM from this file.
+    resolutions
+        Resolutions to generate the matrix. Each resolution will be stored in a separate file.
+    cpu
+        number of cpus to parallel.
+    batch_n
+        number of cells to deal with in each cpu process.
+
+    Returns
+    -------
+
+    """
+    # read cell paths
+    cell_path_dict = pd.read_csv(contacts_table,
+                                 header=None,
+                                 index_col=0,
+                                 squeeze=True,
+                                 sep='\t').to_dict()
+    print(f'{len(cell_path_dict)} cells to process.')
+
+    for resolution in resolutions:
+        resolution_str = str(resolution)
+        resolution_str = re.sub(r'000000$', 'M', resolution_str)
+        resolution_str = re.sub(r'000$', 'K', resolution_str)
+        output_path = f'{output_prefix}.{resolution_str}.scool'
+
+        print('Generating', output_path)
+        generate_scool_single_resolution(cell_path_dict=cell_path_dict,
+                                         chrom_size_path=chrom_size_path,
+                                         resolution=resolution,
+                                         output_path=output_path,
+                                         batch_n=batch_n,
+                                         cpu=cpu)
+        print('Finished', output_path)
+    return
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
     long_description_content_type='text/markdown',
     url='https://github.com/zhoujt1994/scHiCluster',
     include_package_data=True,
-    install_requires=['numpy', 'scipy', 'scikit-learn', 'h5py', 'opencv-python'],
+    install_requires=['numpy', 'scipy', 'scikit-learn', 'h5py', 'opencv-python', 'tables', 'cooler'],
     entry_points={
         'console_scripts': ['hicluster=schicluster.__main__:main'],
     }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		hicluster scool --contacts_table contact_table.txt --output_prefix test.scool --chrom_size_path mm10.main.nochrM.chrom.sizes --resolutions 10000 100000 --cpu 4 --batch_n 2