From c672be7ca859f44690af921df113e25c94fc2589 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 19 Feb 2024 13:35:09 +0000 Subject: [PATCH] v0.5.0 (#78) * draft streaming with generators * set up effect types * profiling improvements * fix output * check for duplicates * add liftover * update dependencies and set up pre-commit * complain when linting fails * fix linting * support wide files * add log * fix tests and liftover * fix test * sqlite support and add log data * fix tests * fix tests * fixes to make old and new output consistent * update tests * drop parallel gzip and --threads * create ScoreVariant and EffectType classes * review comments * add type hints * remove coordinates from mandatory fields * fix old scoring files * check effect alleles and compelx scoring files * don't access __annotations__ directly * remove logger * warn about complex files and variant mismatch * refactor scorevariant from userdict to class with __slots__ * fix __repr__ and type hints * add pyarrow support * add license data to log * add custom exceptions * add custom exit code * move class definitions * rename * update effect allele class * tidy up docstring * add docstrings to pytest * fix pyproject * Make sure that IID isn't converted to numeric during aggreation Signed-off-by: smlmbrt * bump minor version * dynamically set is_snp * remove samplesheet package * delete samplesheet tests * fix liftover * set up local venv * fix liftover test * improve comment --------- Signed-off-by: smlmbrt Co-authored-by: smlmbrt --- .gitignore | 3 +- .pre-commit-config.yaml | 8 + conftest.py | 221 +++-- .../aggregate/aggregate_scores.py | 2 +- pgscatalog_utils/download/GenomeBuild.py | 25 +- pgscatalog_utils/pgsexceptions.py | 127 +++ pgscatalog_utils/samplesheet/Config.py | 7 - pgscatalog_utils/samplesheet/check.py | 370 -------- .../scorefile/combine_scorefiles.py | 272 +++--- pgscatalog_utils/scorefile/config.py | 16 + pgscatalog_utils/scorefile/effect_type.py | 34 - 
pgscatalog_utils/scorefile/effect_weight.py | 49 - pgscatalog_utils/scorefile/effectallele.py | 60 ++ pgscatalog_utils/scorefile/effecttype.py | 14 + pgscatalog_utils/scorefile/genome_build.py | 24 - pgscatalog_utils/scorefile/harmonised.py | 30 - pgscatalog_utils/scorefile/liftover.py | 138 ++- pgscatalog_utils/scorefile/qc.py | 294 ++++-- pgscatalog_utils/scorefile/read.py | 79 -- pgscatalog_utils/scorefile/scorevariant.py | 137 +++ pgscatalog_utils/scorefile/scoringfile.py | 188 ++++ .../scorefile/scoringfileheader.py | 92 ++ pgscatalog_utils/scorefile/write.py | 212 ++++- poetry.lock | 158 +++- poetry.toml | 3 + pyproject.toml | 7 +- tests/data/combine/PGS001229_22.txt | 850 ++++++++++++++++++ tests/data/combine/scorefile.txt | 838 +++++++++++++++++ .../scorefile_dominant_and_recessive.txt | 838 +++++++++++++++++ tests/test_combine.py | 145 ++- tests/test_liftover.py | 40 +- tests/test_samplesheet.py | 101 --- 32 files changed, 4200 insertions(+), 1182 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pgscatalog_utils/pgsexceptions.py delete mode 100644 pgscatalog_utils/samplesheet/Config.py delete mode 100755 pgscatalog_utils/samplesheet/check.py create mode 100644 pgscatalog_utils/scorefile/config.py delete mode 100644 pgscatalog_utils/scorefile/effect_type.py delete mode 100644 pgscatalog_utils/scorefile/effect_weight.py create mode 100644 pgscatalog_utils/scorefile/effectallele.py create mode 100644 pgscatalog_utils/scorefile/effecttype.py delete mode 100644 pgscatalog_utils/scorefile/genome_build.py delete mode 100644 pgscatalog_utils/scorefile/harmonised.py delete mode 100644 pgscatalog_utils/scorefile/read.py create mode 100644 pgscatalog_utils/scorefile/scorevariant.py create mode 100644 pgscatalog_utils/scorefile/scoringfile.py create mode 100644 pgscatalog_utils/scorefile/scoringfileheader.py create mode 100644 poetry.toml create mode 100644 tests/data/combine/PGS001229_22.txt create mode 100644 
tests/data/combine/scorefile.txt create mode 100644 tests/data/combine/scorefile_dominant_and_recessive.txt delete mode 100644 tests/test_samplesheet.py diff --git a/.gitignore b/.gitignore index b0b6f3a..5ee9a36 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ \ No newline at end of file +.idea/ +.DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f7d0c74 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.3 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/conftest.py b/conftest.py index a4a55c6..e01a719 100644 --- a/conftest.py +++ b/conftest.py @@ -1,11 +1,9 @@ import glob import importlib.resources import os -import pathlib import shutil from unittest.mock import patch -import pandas as pd import polars as pl import pytest import requests as req @@ -13,21 +11,53 @@ from pgscatalog_utils.download.download_scorefile import download_scorefile from pgscatalog_utils.match.preprocess import complement_valid_alleles from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant + +from tests.data import combine pl.toggle_string_cache(True) @pytest.fixture(scope="session") def pgs_accessions(): - return ['PGS001229', 'PGS000922'] + return ["PGS001229", "PGS000922"] + + +@pytest.fixture(scope="session") +def mini_score_path(tmp_path_factory): + path = importlib.resources.files(combine) / "PGS001229_22.txt" + return str(path) + + +@pytest.fixture(scope="session") +def 
mini_scorefile(mini_score_path, tmp_path_factory): + # The mini scorefile overlaps well with cineca synthetic subset + out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [mini_score_path] + + ["-o", str(out_path.resolve())] + ) + + with patch("sys.argv", args): + combine_scorefiles() + + return str(out_path.resolve()) @pytest.fixture(scope="session") def scorefiles(tmp_path_factory, pgs_accessions): fn = tmp_path_factory.mktemp("scorefiles") - args: list[str] = ['download_scorefiles', '-b', 'GRCh37', '-o', str(fn.resolve()), '-i'] + pgs_accessions - - with patch('sys.argv', args): + args: list[str] = [ + "download_scorefiles", + "-b", + "GRCh37", + "-o", + str(fn.resolve()), + "-i", + ] + pgs_accessions + + with patch("sys.argv", args): download_scorefile() return glob.glob(os.path.join(fn.resolve(), "*.txt.gz")) @@ -37,8 +67,9 @@ def scorefiles(tmp_path_factory, pgs_accessions): def target_path(tmp_path_factory): try: bim = req.get( - 'https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim', - timeout=5) + "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim", + timeout=5, + ) except (req.exceptions.ConnectionError, req.Timeout): bim = [] @@ -46,129 +77,125 @@ def target_path(tmp_path_factory): pytest.skip("Couldn't get test data from network") else: fn = tmp_path_factory.mktemp("target") / "data.bim" - with open(fn, 'wb') as f: + with open(fn, "wb") as f: f.write(bim.content) return str(fn.resolve()) -@pytest.fixture(scope="session") -def mini_score_path(tmp_path_factory): - try: - score = req.get('https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/PGS001229_22.txt', - timeout=5) - except (req.exceptions.ConnectionError, req.Timeout): - score = [] - - if not score: - pytest.skip("Couldn't get test data from network") - else: - fn = tmp_path_factory.mktemp("score") / 
"PGS001229_22.txt" - with open(fn, 'wb') as f: - f.write(score.content) - - return str(fn.resolve()) - - -@pytest.fixture(scope="session") -def mini_scorefile(mini_score_path, tmp_path_factory): - # The mini scorefile overlaps well with cineca synthetic subset - out_path = tmp_path_factory.mktemp("scores") / "mini_score.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + [mini_score_path] + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - - return str(out_path.resolve()) - - -@pytest.fixture(scope="session") -def combined_scorefile(scorefiles, tmp_path_factory): - # The combined scorefile overlaps poorly with cineca synthetic subset - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh37', '-s'] + scorefiles + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - - return str(out_path.resolve()) - - @pytest.fixture(scope="session") def chain_files(tmp_path_factory): - chain_dir = tmp_path_factory.mktemp('chain_dir') + chain_dir = tmp_path_factory.mktemp("chain_dir") shutil.copy2("tests/data/hg19ToHg38.over.chain.gz", chain_dir) shutil.copy2("tests/data/hg38ToHg19.over.chain.gz", chain_dir) - - return str(chain_dir.resolve()) - - -@pytest.fixture(scope="session") -def lifted_scorefiles(mini_score_path, chain_files, tmp_path_factory): - out_path = tmp_path_factory.mktemp("scores") / "lifted.txt" - args: list[str] = ['combine_scorefiles', '-s'] + [mini_score_path] + ['--liftover', '-c', chain_files, '-t', - 'GRCh38', - '-m', '0.8'] + ['-o', str(out_path.resolve())] - - with patch('sys.argv', args): - combine_scorefiles() - return str(out_path.resolve()) + return str(chain_dir.resolve()) @pytest.fixture(scope="session") def hg38_coords(): - d = {'rsid': ['rs11903757', 'rs6061231'], 'chr_name': ['2', '20'], 'chr_position': [191722478, 62381861]} - df = pd.DataFrame(d) - df['accession'] = 'dummy' - 
df['genome_build'] = 'GRCh38' - return df + rs11903757 = ScoreVariant( + **{ + "rsid": "rs11903757", + "chr_name": "2", + "chr_position": 191722478, + "row_nr": 0, + "effect_weight": 1, + "accession": "test", + "effect_allele": "A", + } + ) + rs6061231 = ScoreVariant( + **{ + "rsid": "rs6061231", + "chr_name": "20", + "chr_position": 62381861, + "row_nr": 1, + "effect_weight": 1, + "accession": "test", + "effect_allele": "A", + } + ) + return (x for x in [rs11903757, rs6061231]) @pytest.fixture(scope="session") -def hg19_coords(hg38_coords): +def hg19_coords(): # hg38_coords in GRCh37, from dbSNP - d = {'lifted_chr': ['2', '20'], 'lifted_pos': [192587204, 60956917], 'liftover': [True, True]} - return pd.DataFrame(d) + rs11903757 = ScoreVariant( + **{ + "rsid": "rs11903757", + "chr_name": "2", + "chr_position": 192587204, + "row_nr": 0, + "effect_weight": 1, + "accession": "test", + "effect_allele": "A", + } + ) + rs6061231 = ScoreVariant( + **{ + "rsid": "rs6061231", + "chr_name": "20", + "chr_position": 60956917, + "row_nr": 1, + "effect_weight": 1, + "accession": "test", + "effect_allele": "A", + } + ) + return (x for x in [rs11903757, rs6061231]) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_flipped_scorefile(small_scorefile): # simulate a scorefile on the wrong strand - return (complement_valid_alleles(small_scorefile, ['effect_allele', 'other_allele']) - .drop(['effect_allele', 'other_allele']) - .rename({'effect_allele_FLIP': 'effect_allele', 'other_allele_FLIP': 'other_allele'}) - .pipe(complement_valid_alleles, ['effect_allele', 'other_allele'])) + return ( + complement_valid_alleles(small_scorefile, ["effect_allele", "other_allele"]) + .drop(["effect_allele", "other_allele"]) + .rename( + {"effect_allele_FLIP": "effect_allele", "other_allele_FLIP": "other_allele"} + ) + .pipe(complement_valid_alleles, ["effect_allele", "other_allele"]) + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def 
small_target(): - return pl.DataFrame({"#CHROM": [1, 2, 3], - "POS": [1, 2, 3], - "REF": ["A", "T", "T"], - "ALT": ["C", "A", "G"], - "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], - "is_multiallelic": [False, False, False]}) + return pl.DataFrame( + { + "#CHROM": [1, 2, 3], + "POS": [1, 2, 3], + "REF": ["A", "T", "T"], + "ALT": ["C", "A", "G"], + "ID": ["1:1:A:C", "2:2:T:A", "3:3:T:G"], + "is_multiallelic": [False, False, False], + } + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_scorefile(): - df = pl.DataFrame({"accession": ["test", "test", "test"], - "row_nr": [1, 2, 3], - "chr_name": [1, 2, 3], - "chr_position": [1, 2, 3], - "effect_allele": ["A", "A", "G"], - "other_allele": ["C", "T", "T"], - "effect_weight": [1, 2, 3], - "effect_type": ["additive", "additive", "additive"]}) + df = pl.DataFrame( + { + "accession": ["test", "test", "test"], + "row_nr": [1, 2, 3], + "chr_name": [1, 2, 3], + "chr_position": [1, 2, 3], + "effect_allele": ["A", "A", "G"], + "other_allele": ["C", "T", "T"], + "effect_weight": [1, 2, 3], + "effect_type": ["additive", "additive", "additive"], + } + ) return complement_valid_alleles(df, ["effect_allele", "other_allele"]) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def small_scorefile_no_oa(small_scorefile): - return small_scorefile.with_column(pl.lit(None).alias('other_allele')) + return small_scorefile.with_column(pl.lit(None).alias("other_allele")) def _get_timeout(url): diff --git a/pgscatalog_utils/aggregate/aggregate_scores.py b/pgscatalog_utils/aggregate/aggregate_scores.py index aa1de89..d57943d 100644 --- a/pgscatalog_utils/aggregate/aggregate_scores.py +++ b/pgscatalog_utils/aggregate/aggregate_scores.py @@ -33,7 +33,7 @@ def aggregate(scorefiles: list[str]): for i, path in enumerate(scorefiles): logger.debug(f"Reading {path}") # pandas can automatically detect zst compression, neat! 
- df = (pd.read_table(path) + df = (pd.read_table(path, converters={"#IID": str}, header=0) .assign(sampleset=path.split('_')[0]) .set_index(['sampleset', '#IID'])) diff --git a/pgscatalog_utils/download/GenomeBuild.py b/pgscatalog_utils/download/GenomeBuild.py index 419c3f2..69fd8ab 100644 --- a/pgscatalog_utils/download/GenomeBuild.py +++ b/pgscatalog_utils/download/GenomeBuild.py @@ -1,6 +1,25 @@ -from enum import Enum, auto +from enum import Enum class GenomeBuild(Enum): - GRCh37 = auto() - GRCh38 = auto() + GRCh37 = "GRCh37" + GRCh38 = "GRCh38" + # just included to handle older files, incompatible unless harmonised: + NCBI36 = "NCBI36" # ew + + def __str__(self): + return str(self.value) + + @classmethod + def from_string(cls, build): + match build: + case "GRCh37" | "hg19": + return cls(GenomeBuild.GRCh37) + case "GRCh38" | "hg38": + return cls(GenomeBuild.GRCh38) + case "NR": + return None + case "NCBI36" | "hg18": + return cls(GenomeBuild.NCBI36) + case _: + raise Exception(f"Can't match {build=}") diff --git a/pgscatalog_utils/pgsexceptions.py b/pgscatalog_utils/pgsexceptions.py new file mode 100644 index 0000000..224025e --- /dev/null +++ b/pgscatalog_utils/pgsexceptions.py @@ -0,0 +1,127 @@ +""" This module defines a custom PGS exception hierarchy. There's a lot of exceptions for specific failure states, +which can be a bad approach and too complex. However, we did this anyway for a few reasons: + +1. There's only a few types of common errors (around a dozen, with 3-4 very common) +2. Want to exit the program with custom exit codes to simplify communicating program +state with external processes (e.g. PGS Catalog Calculator, web platforms) without doing +complicated things like logging to an external location +3. 
This approach should make maintaining exit codes simple + +So the plan is to override sys.excepthook, intercept errors defined here, and map them +to custom exit codes defined below +""" +import sys +from types import MappingProxyType + + +class BasePGSException(Exception): + """The base class from which all PGS errors must inherit. + The purpose of this class is to simplify finding PGS exceptions and exiting python + with a matching custom exit code.""" + + +class MatchError(BasePGSException): + """The base class for errors that are raised during variant matching""" + + +class DuplicateMatchError(MatchError): + """Raised when a matched variant has been duplicated, so that a variant with the same ID + would be split across two rows in an output scoring file. + """ + + +class MatchRateError(MatchError): + """Raised when match rate is below match threshold for one or more scoring files""" + + +class ZeroMatchesError(MatchError): + """Raised when zero matches are found for one or more scoring files. + + Distinct from MatchRateError because it's very common, and caused by bad input data or parameters.""" + + +class MatchValueError(MatchError): + """Raised when a match function receives inappropriate values. 
+ + e.g., Multiple chromosomes detected in variant data but data is split per-chromosome""" + + +class CombineError(BasePGSException): + """The base class for errors that are raised when combining scorefiles""" + + +class BuildError(CombineError): + """Raised when there's a problem with a scoring file genome build.""" + + +class ScoreFormatError(CombineError): + """Raised when there's a problem with a scoring file.""" + + +class CatalogError(BasePGSException): + """The base class for errors when querying or downloading from the PGS Catalog""" + + +class ScoreDownloadError(CatalogError): + """Raised when a scoring file can't be downloaded""" + + +class ScoreChecksumError(CatalogError): + """Raised when a scoring file fails checksum validation""" + + +class QueryError(CatalogError): + """Raised when the Catalog API doesn't return a valid response""" + + +class InvalidAccessionError(CatalogError): + """Raised when an invalid term is used to query the Catalog""" + + +class SamplesheetError(BasePGSException): + """The base class for errors related to samplesheet parsing""" + + +class GenomesNotFound(SamplesheetError): + """Raised when FileNotFound""" + + +class SamplesheetFormatError(SamplesheetError): + """Raised when a samplesheet is badly formatted""" + + +class ExceptionExitCodeMap: + """A read only map to get exit codes for custom exceptions""" + + # https://unix.stackexchange.com/a/604262 + _mapping = { + ScoreDownloadError: 8, + ScoreFormatError: 9, + ScoreChecksumError: 10, + QueryError: 11, + InvalidAccessionError: 12, + DuplicateMatchError: 13, + MatchRateError: 14, + ZeroMatchesError: 15, + MatchValueError: 16, + BuildError: 17, + GenomesNotFound: 19, + SamplesheetFormatError: 20, + } + + code_map = MappingProxyType(_mapping) + + def __getitem__(self, exception_type): + # if an exception can't be found in the map, return an error code (> 0) but default + # max possible value 255 + return self.code_map.get(exception_type, 255) + + +def 
handle_uncaught_exception(exctype, value, trace): + code_map = ExceptionExitCodeMap() + oldHook(exctype, value, trace) + if isinstance(value, BasePGSException): + sys.exit(code_map[exctype]) + + +sys.excepthook, oldHook = handle_uncaught_exception, sys.excepthook diff --git a/pgscatalog_utils/samplesheet/Config.py b/pgscatalog_utils/samplesheet/Config.py deleted file mode 100644 index 1f4bddb..0000000 --- a/pgscatalog_utils/samplesheet/Config.py +++ /dev/null @@ -1,7 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class Config: - input_path: str - output_path: str diff --git a/pgscatalog_utils/samplesheet/check.py b/pgscatalog_utils/samplesheet/check.py deleted file mode 100755 index b1ff16b..0000000 --- a/pgscatalog_utils/samplesheet/check.py +++ /dev/null @@ -1,370 +0,0 @@ -import argparse -import logging -import math -import pathlib -from pathlib import Path - -import pandas as pd - -from pgscatalog_utils import config -from pgscatalog_utils.samplesheet.Config import Config - -logger = logging.getLogger(__name__) - - -def _parse_args(args=None) -> argparse.Namespace: - d: ( - str - ) = "Convert pgscatalog/pgsc_calc samplesheet file to JSON and check its contents." - e: str = "Example usage: python check.py " - - parser: argparse.ArgumentParser = argparse.ArgumentParser(description=d, epilog=e) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help=" Extra logging information", - ) - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def _truncate_chrom(chrom): - match chrom: - case _ if chrom.isdigit(): - return int(chrom) - case _ if chrom.startswith("chr"): - logger.critical("Please remove chr prefix from samplesheet chromosome column e.g. 
chr1 -> 1, chrX -> X") - raise ValueError("chr prefix detected") - case _: - return chrom - - -def _check_colnames(df: pd.DataFrame): - mandatory: list[str] = ["sampleset", "path_prefix", "chrom", "format"] - optional: list[str] = ["vcf_genotype_field"] - - if not set(mandatory) == set(df.columns): - if set(mandatory + optional) == set(df.columns): - # this is fine - return - else: - logger.critical("Samplesheet has invalid header row") - logger.critical(f"Column names must only include: {mandatory}") - [ - logger.critical(f"Invalid column name: {col}") - for col in df - if col not in mandatory - ] - raise Exception - - -def _check_unique_paths(df: pd.DataFrame): - """Each row in a samplesheet should have a unique path""" - duplicated: pd.Series = df["path_prefix"].duplicated() - for idx, duplicate in duplicated.items(): - if duplicate: - bad_record = df.iloc[:idx] - logger.critical(f"Duplicated path found in samplesheet:\n{bad_record}") - - -def _check_empty_paths(df: pd.DataFrame): - """Paths are mandatory""" - empty_paths: pd.Series = df["path_prefix"].isnull() - for idx, empty in empty_paths.items(): - if empty: - logger.critical(f"Empty path found in samplesheet:\n {df.iloc[[idx]]}") - raise Exception - - -def _read_samplesheet(path: str) -> pd.DataFrame: - csv: pd.DataFrame = pd.read_csv(path, sep=",", header=0, converters={"chrom": str}) - csv["chrom"] = csv["chrom"].apply(_truncate_chrom) - return csv - - -def _check_paths(df: pd.DataFrame) -> None: - _check_empty_paths(df) - _check_unique_paths(df) - - -def _get_chrom_list(df: pd.DataFrame) -> dict[str, list[str | None]]: - chrom_dict = {} - for idx, row in df.iterrows(): - key = row["sampleset"] - value = row["chrom"] - try: - if math.isnan(value): - value = None - except TypeError: - pass - chroms = chrom_dict.get(key, []) - chroms.append(value) - chrom_dict.update({key: chroms}) - - return chrom_dict - - -def _check_chrom_duplicates(sampleset: str, chrom_list: dict) -> None: - seen = set() - 
duplicate_chromosomes: list[str] = [ - str(x) for x in chrom_list if x in seen or seen.add(x) - ] - if duplicate_chromosomes: - logger.critical(f"Duplicate chromosomes detected in sampleset {sampleset}") - logger.critical(f"Duplicate chromosomes: {duplicate_chromosomes}") - raise Exception - - -def _check_multiple_missing_chrom(sampleset: str, chrom_list: dict) -> None: - for chrom in chrom_list: - if chrom is None and len(chrom_list) != 1: - logger.critical( - f"Sampleset {sampleset} has rows with multiple missing chromosomes" - ) - logger.critical( - "If you have file with multiple chromosomes, delete the duplicate rows" - ) - logger.critical( - "If your data are split per chromosome, then chromosomes must be set for all rows" - ) - raise Exception - - -def _check_chrom(df: pd.DataFrame) -> None: - # get a list of chroms per sampleset and check them for some basic errors - chroms: dict = _get_chrom_list(df) - - for sampleset, chrom_list in chroms.items(): - _check_chrom_duplicates(sampleset, chrom_list) - _check_multiple_missing_chrom(sampleset, chrom_list) - - -def _check_format(df: pd.DataFrame): - """Make sure the file format is a valid choice""" - for idx, row in df.iterrows(): - valid_formats: list[str] = ["vcf", "pfile", "bfile"] - if row["format"] not in valid_formats: - logger.critical( - f"Invalid format: {row['format']} must be one of {valid_formats}" - ) - logger.critical(f"\n{df.iloc[[idx]]}") - raise Exception - - -def _setup_paths(df: pd.DataFrame) -> pd.DataFrame: - """Add suffix to path prefixes depending on file format / type""" - paths: list[pd.Series] = [] - for idx, row in df.iterrows(): - suffix: list[str] - match row["format"]: - case "vcf": - logger.info("Setting VCF input") - suffix = [".vcf.gz"] - case "bfile": - logger.info("Setting plink1 binary fileset (bfile) input") - suffix = [".bed", ".bim", ".fam"] - case "pfile": - logger.info("Setting plink2 binary fileset (pfile) input") - suffix = [".pgen", ".pvar", ".psam"] - case _: - raise 
Exception - - resolved_paths: list[str] = _resolve_paths( - [row["path_prefix"] + x for x in suffix], row["format"] - ) - paths.append(pd.Series(data=[resolved_paths], index=[idx])) - - df["path"] = pd.concat(paths) - return df - - -def _resolve_compressed_variant_path(path: str) -> pathlib.Path: - # .bim.zst | .bim -> OK - # .pvar.zst | .pvar -> OK - # anything else not OK - zstd_ext: str = ".zst" - compressed_path: pathlib.Path = pathlib.Path(path + zstd_ext).resolve() - uncompressed_path: pathlib.Path = pathlib.Path(path).resolve() - - # prefer compressed data - if compressed_path.exists(): - logger.info(f"Found compressed variant information file {compressed_path.name}") - return compressed_path - elif uncompressed_path.exists(): - logger.info( - f"Couldn't find compressed variant information file, trying {uncompressed_path.name}" - ) - return uncompressed_path - else: - logger.critical(f"{compressed_path} doesn't exist") - logger.critical(f"{uncompressed_path} doesn't exist") - logger.critical( - "Couldn't find variant information files, please check samplesheet path_prefix and try again" - ) - raise Exception - - -def _resolve_paths(path_list: list[str], filetype: str) -> list[str]: - resolved_list: list[str] = [] - - # always resolve the input samplesheet - base_dir: Path = Path(Config.input_path).resolve().parent - if (path := Path(Config.input_path)).is_symlink(): - logger.info( - f"Input file {path} is symlinked, resolving to absolute path {path.resolve()}" - ) - - for path in path_list: - if path.startswith("https://") | path.startswith("s3://"): - logger.info("Remote path detected, skipping resolve") - resolved_list.append(str(path)) - continue - elif path.startswith("http://"): - logger.critical("HTTP download is insecure! Did you mean https:// ?") - raise Exception("Insecure path detected") - else: - p: Path = Path(path) - if not p.is_absolute(): - logger.warning( - "Relative path detected in samplesheet. Set absolute paths to silence this warning." 
- ) - logger.warning( - "Assuming input samplesheet is a symlinked file in a nextflow working directory" - ) - logger.warning( - "Following symlink and attempting to resolve path relative to input file" - ) - logger.warning(f"Resolving paths relative to: {base_dir}") - resolved = _resolve_filetypes( - path=str(base_dir.joinpath(path)), filetype=filetype - ) - else: - logger.info("Absolute path detected") - resolved = _resolve_filetypes(filetype=filetype, path=str(p)) - - if resolved.exists(): - logger.info(f"{resolved} exists") - resolved_list.append(str(resolved)) - else: - logger.critical( - f"{resolved} doesn't exist, please check samplesheet path_prefix and try again" - ) - logger.critical( - "If you're 100% sure this file exists and you're confused by this error, please check https://pgsc-calc.readthedocs.io/en/latest/how-to/mount.html" - ) - raise FileNotFoundError - - return resolved_list - - -def _resolve_filetypes(filetype: str, path: str) -> Path: - match filetype: - case "pfile" | "bfile": - if path.endswith(".bim") or path.endswith(".pvar"): - resolved = _resolve_compressed_variant_path(path) - else: - # bed / pgen | fam / psam - resolved = pathlib.Path(path).resolve() - case "vcf": - resolved = pathlib.Path(path).resolve() - case _: - logger.critical(f"Unsupported filetype {filetype}") - raise Exception - - return resolved - - -def _check_genotype_field(df: pd.DataFrame) -> pd.DataFrame: - df["vcf_import_dosage"] = False # (dosage off by default) - if "vcf_genotype_field" in df.columns: - logger.debug("vcf_genotype_field detected") - for index, row in df.iterrows(): - if row["vcf_genotype_field"] not in ["GT", "DS"]: - missing: bool # missing dosage is OK - try: - missing = math.isnan(row["vcf_genotype_field"]) - except TypeError: - missing = False - - if not missing: - logger.critical( - f"Invalid entry in vcf_genotype_field: {row['vcf_genotype_field']}" - ) - logger.critical(f"\n {row}") - raise Exception - - df.loc[df["vcf_genotype_field"] == "DS", 
"vcf_import_dosage"] = True - else: - logger.info("no vcf_genotype_field detected") - - return df - - -def _check_reserved_names(df: pd.DataFrame): - if any(df["sampleset"] == "reference"): - logger.critical( - "Samplesets must not be named 'reference', please rename in the sample sheet" - ) - raise Exception - - # Check whether reference contains reserved tokens from nextflow channels - badnames = [x for x in df["sampleset"] if ("." in x or "_" in x)] - if len(badnames) > 0: - logger.critical( - "Samplesets must not contain any reserved characters ( '_' , '.'), " - "please rename the following samples in the sample sheet: {}".format( - badnames - ) - ) - raise Exception - - -def _check_one_sampleset(df: pd.DataFrame): - samplesets = set(df["sampleset"].to_list()) - if len(samplesets) > 1: - logger.critical(f"Multiple samplesets defined in the samplesheet {samplesets}") - sampleset_error = """ Only one sampleset per samplesheet is supported - Your genomic data should _only_ be split by chromosome - pgsc_calc works best with cohorts - Individual VCFs should be merged into a multi-sample VCF - If you want to process multiple cohorts, please run pgsc_calc multiple times with different samplesheets. 
""" - [logger.critical(x.strip()) for x in sampleset_error.split("\n")] - raise Exception("Multiple samplesets") - - -def check_samplesheet() -> None: - """ - This function checks that the samplesheet follows the following structure: - sampleset,vcf_path,bfile_path,chrom,chunk - cineca_synthetic_subset,cineca_synthetic_subset.vcf.gz,,22, - """ - args = _parse_args() - config.set_logging_level(args.verbose) - - Config.input_path = args.FILE_IN - Config.output_path = args.FILE_OUT - - df = _read_samplesheet(Config.input_path) - - # check df for errors - _check_one_sampleset(df) - _check_reserved_names(df) - _check_colnames(df) - _check_paths(df) - _check_chrom(df) - _check_format(df) - - # add information to df - df = _setup_paths(df) - df = _check_genotype_field(df) # dosages - - logger.info("Samplesheet checks complete") - (df.drop(["path_prefix"], axis=1).to_json(Config.output_path, orient="records")) - logger.info(f"JSON file successfully written to {Config.output_path}") - - -if __name__ == "__main__": - check_samplesheet() diff --git a/pgscatalog_utils/scorefile/combine_scorefiles.py b/pgscatalog_utils/scorefile/combine_scorefiles.py index 9465484..ffef5e4 100644 --- a/pgscatalog_utils/scorefile/combine_scorefiles.py +++ b/pgscatalog_utils/scorefile/combine_scorefiles.py @@ -1,39 +1,17 @@ import argparse +import json import logging -import os +import pathlib import sys import textwrap -import json from pgscatalog_utils.config import set_logging_level -from pgscatalog_utils.scorefile.effect_type import set_effect_type -from pgscatalog_utils.scorefile.effect_weight import melt_effect_weights -from pgscatalog_utils.scorefile.genome_build import build2GRC -from pgscatalog_utils.scorefile.harmonised import remap_harmonised -from pgscatalog_utils.scorefile.liftover import liftover -from pgscatalog_utils.scorefile.qc import quality_control -from pgscatalog_utils.scorefile.read import load_scorefile, get_scorefile_basename -from pgscatalog_utils.scorefile.write import 
write_scorefile - - -headers2logs = [ - 'pgs_id', - 'pgp_id', - 'pgs_name', - 'genome_build', - 'variants_number', - 'trait_reported', - 'trait_efo', - 'trait_mapped', - 'weight_type', - 'citation' -] -headers2logs_harmonisation = [ - 'HmPOS_build', - 'HmPOS_date', - 'HmPOS_match_chr', - 'HmPOS_match_pos' -] +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.liftover import create_liftover +from pgscatalog_utils.scorefile.scoringfile import ScoringFile +from pgscatalog_utils.scorefile.write import write_combined + def combine_scorefiles(): args = _parse_args() @@ -41,110 +19,48 @@ def combine_scorefiles(): logger = logging.getLogger(__name__) set_logging_level(args.verbose) + Config.batch_size = 100000 + Config.drop_missing = args.drop_missing + Config.target_build = GenomeBuild.from_string(args.target_build) + Config.liftover = args.liftover + Config.min_lift = args.min_lift + + if args.chain_dir: + Config.chain_dir = args.chain_dir + Config.lo = create_liftover() + + if pathlib.Path(args.outfile).exists(): + raise FileExistsError(f"{args.outfile}") + paths: list[str] = list(set(args.scorefiles)) # unique paths only logger.debug(f"Input scorefiles: {paths}") - if os.path.exists(args.outfile): - logger.critical(f"Output file {args.outfile} already exists") - raise Exception - - # Score header logs - init - score_logs = {} - dir_output = os.path.dirname(args.outfile) - if dir_output == '': - dir_output = './' - elif dir_output.endswith('/') is False: - dir_output += '/' - json_logs_file = dir_output + args.logfile - - for x in paths: - # Read scorefile df and header - h, score = load_scorefile(x) - score_shape_original = score.shape - - if score.empty: - logger.critical(f"Empty scorefile {x} detected! 
Please check the input data") - raise Exception + sfs = [ScoringFile.from_path(x) for x in paths] - # Check if we should use the harmonized positions - use_harmonised = False - current_build = None - if h.get('HmPOS_build') is not None: - if h.get('HmPOS_build') == args.target_build: - use_harmonised = True - current_build = h.get('HmPOS_build') - else: - logger.error( - f"Cannot combine {x} (harmonized to {h.get('HmPOS_build')}) in target build {args.target_build}") - raise Exception - - # Process/QC score and check variant columns - score = (score.pipe(remap_harmonised, use_harmonised=use_harmonised) - .pipe(quality_control, drop_missing=args.drop_missing) - .pipe(melt_effect_weights) - .pipe(set_effect_type)) - - # Annotate score with the genome_build (in GRCh notation) - if current_build is None: - current_build = build2GRC(h.get('genome_build')) - if current_build is None: - logger.error("Scorefile has no build information, " - "please add the build to the header with " - "('#genome_build=[insert variant build]") - raise Exception - - score = score.assign(genome_build=current_build) - - if (current_build != args.target_build) and (args.liftover is False): - logger.error( - f"Cannot combine {x} (build={h.get('genome_build')}) with target build {args.target_build} without liftover") - logger.error("Try running with --liftover and specifying the --chain_dir") - raise Exception + target_build = GenomeBuild.from_string(args.target_build) + bad_builds = [x.accession for x in sfs if x.genome_build != target_build] - if args.liftover: - logger.debug("Annotating scorefile with liftover parameters") - score = liftover(score, args.chain_dir, args.min_lift, args.target_build) - - if score.empty and (args.drop_missing is False): - logger.critical("Empty output score detected, something went wrong while combining") + if not args.liftover: + for bad_file in bad_builds: + logger.critical(f"{bad_file} doesn't match {target_build}, can't combine") + if len(bad_builds) > 0: 
raise Exception - write_scorefile(score, args.outfile) - - # Build Score header logs - score_id = get_scorefile_basename(x) - score_header = score_logs[score_id] = {} - # Scoring file header information - for header in headers2logs: - header_val = h.get(header) - if (header in ['trait_efo', 'trait_mapped']) and (header_val is not None): - header_val = header_val.split('|') - score_header[header] = header_val - # Other header information - score_header['columns'] = list(score.columns) - score_header['use_liftover'] = False - if args.liftover: - score_header['use_liftover'] = True - # Harmonized header information - score_header['use_harmonised'] = use_harmonised - if use_harmonised: - score_header['sources'] = sorted(score['hm_source'].unique().tolist()) - for hm_header in headers2logs_harmonisation: - hm_header_val = h.get(hm_header) - if hm_header_val: - if hm_header.startswith('HmPOS_match'): - hm_header_val = json.loads(hm_header_val) - score_header[hm_header] = hm_header_val - if score_header['variants_number'] is None: - score_header['variants_number'] = score_shape_original[0] - - # Write Score header logs file - with open(json_logs_file, 'w') as fp: - json.dump(score_logs, fp, indent=4) + # provide line counts when making the scoring files + logs: dict[str, int] = write_combined(sfs, args.outfile) + json_log = [] + for (k, v), sf in zip(logs.items(), sfs): + json_log.append(sf.generate_log(v)) + + log_out_path = pathlib.Path(args.outfile).parent / args.logfile + with open(log_out_path, "w") as f: + logger.info(f"Writing log to {f.name}") + json.dump(json_log, f, indent=4) def _description_text() -> str: - return textwrap.dedent('''\ + return textwrap.dedent( + """\ Combine multiple scoring files in PGS Catalog format (see https://www.pgscatalog.org/downloads/ for details) to a 'long' table of columns needed for variant matching and subsequent calculation. @@ -153,43 +69,93 @@ def _description_text() -> str: unharmonised and harmonised PGS Catalog data. 
By default all variants are output (including positions with duplicated data [often caused by rsID/liftover collions across builds]) and variants with missing positions. - ''') + """ + ) def _epilog_text() -> str: - return textwrap.dedent('''\ + return textwrap.dedent( + """\ The long table is used to simplify intersecting variants in target genotyping datasets and the scoring files with the match_variants program. - ''') + """ + ) def _parse_args(args=None) -> argparse.Namespace: - parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(), - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('-s', '--scorefiles', dest='scorefiles', nargs='+', - help=' Scorefile path (wildcard * is OK)', required=True) - parser.add_argument('--liftover', dest='liftover', - help=' Convert scoring file variants to target genome build?', action='store_true') - parser.add_argument('-t', '--target_build', dest='target_build', - choices=['GRCh37', 'GRCh38'], help=' Build of target genome', - required=True) - parser.add_argument('-c', '--chain_dir', dest='chain_dir', help='Path to directory containing chain files', - required="--liftover" in sys.argv) - parser.add_argument('-m', '--min_lift', dest='min_lift', - help=' If liftover, minimum proportion of variants lifted over', - required="--liftover" in sys.argv, default=0.95, type=float) - parser.add_argument('--drop_missing', dest='drop_missing', action='store_true', - help=' Drop variants with missing information (chr/pos) and ' - 'non-standard alleles (e.g. HLA=P/N) from the output file.') - parser.add_argument('-o', '--outfile', dest='outfile', required=True, - default='combined.txt', - help=' Output path to combined long scorefile ' - '[ will compress output if filename ends with .gz ]') - parser.add_argument('-l', '--logfile', dest='logfile', default='log_combined.json', - help=' Name for the log file (score metadata) for combined scores.' 
- '[ will write to identical directory as combined scorefile]') - parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', - help=' Extra logging information') + parser = argparse.ArgumentParser( + description=_description_text(), + epilog=_epilog_text(), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "-s", + "--scorefiles", + dest="scorefiles", + nargs="+", + help=" Scorefile path (wildcard * is OK)", + required=True, + ) + parser.add_argument( + "--liftover", + dest="liftover", + help=" Convert scoring file variants to target genome build?", + action="store_true", + ) + parser.add_argument( + "-t", + "--target_build", + dest="target_build", + choices=["GRCh37", "GRCh38"], + help=" Build of target genome", + required=True, + ) + parser.add_argument( + "-c", + "--chain_dir", + dest="chain_dir", + help="Path to directory containing chain files", + required="--liftover" in sys.argv, + ) + parser.add_argument( + "-m", + "--min_lift", + dest="min_lift", + help=" If liftover, minimum proportion of variants lifted over", + default=0.95, + type=float, + ) + parser.add_argument( + "--drop_missing", + dest="drop_missing", + action="store_true", + help=" Drop variants with missing information (chr/pos) and " + "non-standard alleles (e.g. HLA=P/N) from the output file.", + ) + parser.add_argument( + "-o", + "--outfile", + dest="outfile", + required=True, + default="combined.txt", + help=" Output path to combined long scorefile " + "[ will compress output if filename ends with .gz ]", + ) + parser.add_argument( + "-l", + "--logfile", + dest="logfile", + default="log_combined.json", + help=" Name for the log file (score metadata) for combined scores." 
+ "[ will write to identical directory as combined scorefile]", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + action="store_true", + help=" Extra logging information", + ) return parser.parse_args(args) diff --git a/pgscatalog_utils/scorefile/config.py b/pgscatalog_utils/scorefile/config.py new file mode 100644 index 0000000..a7540fc --- /dev/null +++ b/pgscatalog_utils/scorefile/config.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass + +import pyliftover + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild + + +@dataclass +class Config: + drop_missing: bool + liftover: bool + lo: pyliftover.liftover + chain_dir: str + min_lift: float + batch_size: int + target_build: GenomeBuild diff --git a/pgscatalog_utils/scorefile/effect_type.py b/pgscatalog_utils/scorefile/effect_type.py deleted file mode 100644 index 50c8c73..0000000 --- a/pgscatalog_utils/scorefile/effect_type.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def set_effect_type(df: pd.DataFrame) -> pd.DataFrame: - if {'is_recessive', 'is_dominant'}.issubset(df.columns): - _check_effect_types(df) - return (df.assign(additive=lambda x: ~x["is_recessive"] & ~x["is_dominant"]) - .assign(effect_type=lambda x: x[["is_recessive", "is_dominant", "additive"]].idxmax(1))) - else: - return _set_default_effect_type(df) - - -def _check_effect_types(df: pd.DataFrame): - """ Check that only one effect type is set per variant """ - bad_rows: pd.DataFrame = df[['is_dominant', 'is_recessive']].all(axis=1).any() - - error = ''' ERROR: Bad variants in scorefile - is_recessive and is_dominant columns are both TRUE for a variant - These columns are mutually exclusive (both can't be true) - However, both can be FALSE for additive variant scores - ''' - if bad_rows: - logger.error(error) - logger.error(bad_rows) - raise Exception - - -def _set_default_effect_type(df: pd.DataFrame, effect_type: str = "additive") -> 
pd.DataFrame: - logger.debug(f'No effect types set, using default ({effect_type})') - return df.assign(effect_type=effect_type) diff --git a/pgscatalog_utils/scorefile/effect_weight.py b/pgscatalog_utils/scorefile/effect_weight.py deleted file mode 100644 index 4b95e0f..0000000 --- a/pgscatalog_utils/scorefile/effect_weight.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -import re - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def melt_effect_weights(df: pd.DataFrame) -> pd.DataFrame: - """ Ensure all dataframes are in long format, with one effect weight column and a score accession column """ - elongate = _detect_multiple_weight_columns(df) - - if elongate: - logger.debug("Melting effect weights") - return _melt(df) - else: - logger.debug("Skipping melt") - df['accession'] = df['filename'] - return df - - -def _detect_multiple_weight_columns(df: pd.DataFrame) -> bool: - """ Detect if multiple effect weight columns are present - - Single weight format: - | chr_name | chr_pos | effect_allele | effect_weight - - Multiple weight format: - | chr_name | chr_pos | effect_allele | effect_weight_score_1 | ... 
| effect_weight_score_n - """ - columns: list[re.match | None] = [re.search("^effect_weight$", x) for x in df.columns.to_list()] - columns_suffix: list[re.match | None] = [re.search("^effect_weight_[A-Za-z0-9]+$", x) for x - in df.columns.to_list()] - - if any([col for col in columns]): - logger.debug("Single effect weight column detected") - return False - elif any([col for col in columns_suffix]): - logger.debug("Multiple weight weight columns detected") - return True - else: - logger.error("ERROR: Missing valid effect weight columns") - raise Exception("Bad effect weights") - - -def _melt(df: pd.DataFrame) -> pd.DataFrame: - """ Melt a multiple effect weight format """ - ew_cols: list[str] = df.filter(regex="effect_weight_*").columns.to_list() - return df.melt(value_vars=ew_cols, value_name="effect_weight", var_name="accession") diff --git a/pgscatalog_utils/scorefile/effectallele.py b/pgscatalog_utils/scorefile/effectallele.py new file mode 100644 index 0000000..0dffd04 --- /dev/null +++ b/pgscatalog_utils/scorefile/effectallele.py @@ -0,0 +1,60 @@ +class EffectAllele: + """A class that represents an effect allele found in PGS Catalog scoring files + + The allele that's dosage is counted (e.g. {0, 1, 2}) and multiplied by the variant's + weight (effect_weight) when calculating score. The effect allele is also known as + the 'risk allele'. 
+ >>> simple_ea = EffectAllele("A") + >>> simple_ea + EffectAllele("A") + >>> simple_ea.is_snp + True + >>> str(simple_ea) + 'A' + >>> EffectAllele("AG") + EffectAllele("AG") + >>> hla_example = EffectAllele("+") + >>> hla_example + EffectAllele("+") + >>> hla_example.is_snp + False + """ + + _valid_snp_bases = frozenset({"A", "C", "T", "G"}) + __slots__ = ("_allele", "_is_snp") + + def __init__(self, allele): + self._allele = str(allele) + self._is_snp = None # computed when accessed + + def __repr__(self): + return f'{type(self).__name__}("{self.allele}")' + + def __str__(self): + return self.allele + + @property + def allele(self): + return self._allele + + @allele.setter + def allele(self, value): + self._allele = str(value) + self._is_snp = None # reset _is_snp when allele is changed + + @property + def is_snp(self) -> bool: + """SNPs are the most common type of effect allele in PGS Catalog scoring + files. More complex effect alleles, like HLAs or APOE genes, often require + extra work to represent in genomes. Users should be warned about complex + effect alleles. 
+ >>> ea = EffectAllele("+") + >>> ea.is_snp + False + >>> ea.allele = "A" + >>> ea.is_snp + True + """ + if self._is_snp is None: + self._is_snp = not frozenset(self.allele) - self._valid_snp_bases + return self._is_snp diff --git a/pgscatalog_utils/scorefile/effecttype.py b/pgscatalog_utils/scorefile/effecttype.py new file mode 100644 index 0000000..4878072 --- /dev/null +++ b/pgscatalog_utils/scorefile/effecttype.py @@ -0,0 +1,14 @@ +from enum import Enum + + +class EffectType(Enum): + RECESSIVE = "recessive" + DOMINANT = "dominant" + ADDITIVE = "additive" + + def __str__(self): + return str(self.value) + + def __repr__(self): + # pasting __repr__ output should be sufficient to construct the class + return f"{type(self).__name__}.{self.name}" diff --git a/pgscatalog_utils/scorefile/genome_build.py b/pgscatalog_utils/scorefile/genome_build.py deleted file mode 100644 index 7ea4f09..0000000 --- a/pgscatalog_utils/scorefile/genome_build.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def annotate_build(df: pd.DataFrame, target_build: str) -> pd.DataFrame: - """ Annotate the dataframe with genome build data """ - logger.debug(f"Annotating target build: {target_build}") - build_dict: dict = {'GRCh37': 'hg19', 'GRCh38': 'hg38', 'hg19': 'hg19', 'hg38': 'hg38'} # standardise build names - df['chain_target_build'] = build_dict[target_build] - df = df.assign(chain_genome_build=[build_dict[x] for x in df['genome_build']]) - return df - - -def build2GRC(build): - """Map build names so they can be compared with GRCh37 and 38""" - build_2_GRC_dict = {'GRCh37': 'GRCh37', 'GRCh38': 'GRCh38', 'hg19': 'GRCh37', - 'hg38': 'GRCh38'} # standardise build names - if pd.isnull(build): - return None - else: - return build_2_GRC_dict.get(build) diff --git a/pgscatalog_utils/scorefile/harmonised.py b/pgscatalog_utils/scorefile/harmonised.py deleted file mode 100644 index b56fb93..0000000 --- 
a/pgscatalog_utils/scorefile/harmonised.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging -import re - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def remap_harmonised(df: pd.DataFrame, use_harmonised) -> pd.DataFrame: - """ Replace original columns with harmonised data, if available and appropriate """ - - if any([re.match("hm_\\w+", x) for x in df.columns]) and use_harmonised: - logger.debug("Harmonised columns detected and used") - hm_colnames: dict[str: str] = {'hm_chr': 'chr_name', 'hm_pos': 'chr_position', - 'hm_inferOtherAllele': 'other_allele'} - - if 'other_allele' not in df or all(df['other_allele'].isnull()): - logger.debug("other_allele column contains no information, replacing with hm_inferOtherAllele") - return (df.drop(['chr_name', 'chr_position', 'other_allele'], axis=1, errors='ignore') - .rename(hm_colnames, axis=1)) - else: - logger.debug("other_allele column contains information, dropping hm_inferOtherAllele") - return (df.drop(['chr_name', 'chr_position', 'hm_inferOtherAllele'], axis=1, errors='ignore') - .rename(hm_colnames, axis=1)) - elif any([re.match("hm_\\w+", x) for x in df.columns]) and not use_harmonised: - logger.debug(f"Harmonised columns detected but not used (use_harmonised={use_harmonised})") - return df - else: - logger.debug("Harmonised columns not detected") - return df diff --git a/pgscatalog_utils/scorefile/liftover.py b/pgscatalog_utils/scorefile/liftover.py index 45258b1..24b9194 100644 --- a/pgscatalog_utils/scorefile/liftover.py +++ b/pgscatalog_utils/scorefile/liftover.py @@ -1,103 +1,75 @@ import logging import os +import typing -import pandas as pd import pyliftover -from pgscatalog_utils.scorefile.genome_build import annotate_build +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant logger = logging.getLogger(__name__) -def liftover(df: pd.DataFrame, chain_dir: str, 
min_lift: float, target_build: str) -> pd.DataFrame: - """ Liftover genomic coordinates to a different genome build """ - df = annotate_build(df, target_build) # get chain_target_build (e.g. in hg notation to match chain files) - - mapped, unmapped = pd.DataFrame(), pd.DataFrame() - no_liftover: pd.DataFrame = df.query('chain_target_build == chain_genome_build') - to_liftover: pd.DataFrame = df.query('chain_target_build != chain_genome_build') - - if no_liftover.empty: - logger.debug("Liftover required for all scorefile variants") +def liftover( + variants: typing.Generator[ScoreVariant, None, None], + harmonised: bool, + current_build: GenomeBuild, + target_build: GenomeBuild, +) -> typing.Generator[ScoreVariant, None, None]: + if harmonised: + skip_lo = True + elif target_build == current_build: + skip_lo = True else: - logger.debug("Skipping liftover for scorefiles with same build as target genome") - no_liftover.loc[:, ['lifted_chr', 'lifted_pos']] = no_liftover[ - ['chr_name', 'chr_position']] # assume col structure - no_liftover.assign(liftover=None) + skip_lo = False - if to_liftover.empty: - logger.debug("Liftover skipped because no variants required it") + if skip_lo: + logger.info("Skipping liftover") + for variant in variants: + yield variant else: - lo: dict[str, pyliftover.LiftOver] = _create_liftover(chain_dir) # loads chain files - logger.debug("Lifting over scoring files") - lifted: pd.DataFrame = to_liftover.apply(_convert_coordinates, axis=1, lo_dict=lo) - to_liftover = pd.concat([to_liftover, lifted], axis=1) - logger.debug("Liftover complete") - - mapped: pd.DataFrame = (to_liftover[~to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] - .assign(liftover=True)) - unmapped: pd.DataFrame = (to_liftover[to_liftover[['lifted_chr', 'lifted_pos']].isnull().any(axis=1)] \ - .assign(liftover=False)) - _check_min_liftover(mapped, unmapped, min_lift) - - return pd.concat([mapped, unmapped, no_liftover]) - - -def _check_min_liftover(mapped: 
pd.DataFrame, unmapped: pd.DataFrame, min_lift: float) -> None: - """ Check that liftover process met minimum parameters""" - df = pd.concat([mapped, unmapped]) - n_variants: pd.DataFrame = (pd.DataFrame(df.groupby('accession')['liftover'].count()) - .reset_index() - .rename({'liftover': 'n_var'}, axis=1)) - lo_counts = (pd.DataFrame(df.groupby(['accession', 'liftover'])['liftover'].count()) \ - .rename_axis(['accession', 'liftover_status']) - .reset_index()) - summary: pd.DataFrame = lo_counts.merge(n_variants, on='accession') - summary['proportion'] = summary['liftover'] / summary['n_var'] - - for row in summary.query('liftover_status == True')[['accession', 'proportion']].itertuples(): - if row.proportion < min_lift: - logger.error(f'Liftover failed for scorefile {row.accession}') - logger.error(f'{row.proportion} of variants lifted over, less than min_lift parameter ({min_lift})') + logger.info("Starting liftover") + if current_build == GenomeBuild.GRCh37 and target_build == GenomeBuild.GRCh38: + lo: pyliftover.LiftOver = Config.lo["hg19hg38"] + elif current_build == GenomeBuild.GRCh38 and target_build == GenomeBuild.GRCh37: + lo: pyliftover.LiftOver = Config.lo["hg38hg19"] + else: + raise Exception("Can't get pyliftover object") + + n_lifted = 0 + n = 0 + + for variant in variants: + chrom = "chr" + variant.chr_name + pos = int(variant.chr_position) - 1 # VCF -> 1 based, UCSC -> 0 based + lifted = lo.convert_coordinate(chrom, pos) + if lifted: + variant.chr_name = lifted[0][0][3:].split("_")[0] + variant.chr_position = lifted[0][1] + 1 # reverse 0 indexing + yield variant + n_lifted += 1 + else: + variant.chr_name = None + variant.chr_position = None + yield variant + n += 1 + + if (n_lifted / n) < Config.min_lift: + logger.error("Liftover failed for variant {variant}") raise Exception else: - logger.debug(f'Minimum liftover threshold passed for scorefile {row.accession}') - - -def _convert_coordinates(df: pd.Series, lo_dict: dict[str, pyliftover.LiftOver]) -> 
pd.Series: - """ Convert genomic coordinates to different build """ - converted: list[tuple[str, int, str, int]] | None - - if df[['chr_name', 'chr_position']].isnull().values.any(): - converted = None - else: - lo = lo_dict[df['chain_genome_build'] + df['chain_target_build']] # extract lo object from dict - chrom: str = 'chr' + str(df['chr_name']) - pos: int = int(df['chr_position']) - 1 # liftOver is 0 indexed, VCF is 1 indexed - # converted example: [('chr22', 15460378, '+', 3320966530)] or None - converted = lo.convert_coordinate(chrom, pos) - - if converted: - lifted_chrom: str = _parse_lifted_chrom(converted[0][0][3:]) # return first matching liftover - lifted_pos: int = int(converted[0][1]) + 1 # reverse 0 indexing - return pd.Series([lifted_chrom, lifted_pos], index=['lifted_chr', 'lifted_pos']) - else: - return pd.Series([None, None], index=['lifted_chr', 'lifted_pos']) - - -def _parse_lifted_chrom(i: str) -> str: - """ Convert lifted chromosomes to tidy integers - - liftover needs chr suffix for chromosome input (1 -> chr1), and it also - returns weird chromosomes sometimes (chr22 -> 22_KI270879v1_alt) - """ - return i.split('_')[0] + logger.info("Liftover successful") -def _create_liftover(chain_dir: str) -> dict['str': pyliftover.LiftOver]: - """ Create LiftOver objects that can remap genomic coordinates """ +def create_liftover() -> dict["str" : pyliftover.LiftOver]: + """Create LiftOver objects that can remap genomic coordinates""" + chain_dir: str = Config.chain_dir builds: list[str] = ["hg19hg38", "hg38hg19"] - chains: list[str] = [os.path.join(chain_dir, x) for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"]] + chains: list[str] = [ + os.path.join(chain_dir, x) + for x in ["hg19ToHg38.over.chain.gz", "hg38ToHg19.over.chain.gz"] + ] lo: list[pyliftover.LiftOver] = [pyliftover.LiftOver(x) for x in chains] logger.debug("Chain files loaded for liftover") return dict(zip(builds, lo)) diff --git a/pgscatalog_utils/scorefile/qc.py 
b/pgscatalog_utils/scorefile/qc.py index 68e511c..526fda2 100644 --- a/pgscatalog_utils/scorefile/qc.py +++ b/pgscatalog_utils/scorefile/qc.py @@ -1,92 +1,226 @@ import logging +import typing -import pandas as pd -logger = logging.getLogger(__name__) - - -def quality_control(df: pd.DataFrame, drop_missing: bool) -> pd.DataFrame: - """ Do quality control checks on a scorefile """ - _check_shape(df) - _check_columns(df) - logger.debug("Quality control: checking for bad variants") - if drop_missing is True: - return (df.pipe(_drop_hla) - .pipe(_drop_missing_variants) - .pipe(_check_duplicate_identifiers) - .pipe(_drop_multiple_oa)) - else: - return (df.pipe(_check_duplicate_identifiers) - .pipe(_drop_multiple_oa)) - - -def _drop_multiple_oa(df: pd.DataFrame) -> pd.DataFrame: - """ Set alleles to None in hm_inferOtherAllele if they contain multiple alleles +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.effecttype import EffectType +from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader +from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant - e.g. 
A / C / T -> None; A -> A; A / C -> None - """ - if 'other_allele' in df: - if df['other_allele'].str.contains('/').any(): - logger.debug("Multiple inferred other alleles detected, dropping other alleles for ambiguous variants") - df['other_allele'] = df['other_allele'].replace(regex='.+\\/.+', value=None) - return df - else: - logger.debug("Only single other alleles detected.") - return df - else: - logger.warning("No other allele data detected, skipping QC of other allele") - return df +logger = logging.getLogger(__name__) -def _drop_missing_variants(df: pd.DataFrame) -> pd.DataFrame: - no_na: pd.DataFrame = df.dropna(subset=['chr_name', 'chr_position', 'effect_weight']) - n_dropped = df.shape[0] - no_na.shape[0] +def quality_control( + variants: typing.Generator[ScoreVariant, None, None], + header: ScoringFileHeader, + harmonised: bool, + wide: bool, +) -> typing.Generator[ScoreVariant, None, None]: + # order is important for: + # 1. liftover non-harmonised data (quite rare), failed lifts get None'd + # 2. remap harmonised data, failed harmonisations get None'd + # 3. 
check and optionally drop bad variants + # where a bad variant has None in a mandatory ScoreVariant field + # then continue with other QC + if Config.liftover: + variants = liftover( + variants, + harmonised=harmonised, + current_build=header.genome_build, + target_build=Config.target_build, + ) + + variants = remap_harmonised(variants, harmonised) + variants = check_bad_variant(variants) + + if Config.drop_missing: + variants = drop_hla(variants) + + variants = assign_effect_type(variants) + variants = check_effect_weight(variants) + variants = assign_other_allele(variants) + variants = check_effect_allele(variants) + variants = detect_complex(variants) + + if wide: + # wide data must be sorted because check_duplicates requires sorted input + variants = (x for x in sorted(variants, key=lambda x: x["accession"])) + + variants = check_duplicates(variants) + + return variants + + +def check_duplicates( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + seen_ids: dict = {} + current_accession: typing.Union[str, None] = None + n_duplicates: int = 0 + n_variants: int = 0 + for variant in variants: + accession: str = variant.accession + + if accession != current_accession: + seen_ids = {} + current_accession = accession + + # None other allele -> empty string + variant_id: str = ":".join( + [ + str(getattr(variant, k) or "") + for k in ["chr_name", "chr_position", "effect_allele", "other_allele"] + ] + ) + + if variant_id in seen_ids: + variant.is_duplicated = True + n_duplicates += 1 + + seen_ids[variant_id] = True + + yield variant + n_variants += 1 + + if n_duplicates > 0: + logger.warning( + f"{n_duplicates} of {n_variants} variants are duplicated in: {current_accession}" + ) + + +def drop_hla( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + n_dropped = 0 + for variant in variants: + match variant: + case {"effect_allele": "P"} | {"effect_allele": "N"}: 
+ n_dropped += 1 + continue + case _: + yield variant + + logger.warning(f"{n_dropped} HLA alleles detected and dropped") + + +def check_effect_weight( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + for variant in variants: + try: + float(variant.effect_weight) + yield variant + except ValueError: + logger.critical(f"{variant} has bad effect weight") + raise ValueError + + +def assign_other_allele( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + n_dropped = 0 + for variant in variants: + if "/" in variant.other_allele: + n_dropped += 1 + variant.other_allele = None + + yield variant if n_dropped > 0: - logger.warning(f"{n_dropped} variants with missing values detected and dropped from scoring file") - - return no_na - - -def _drop_hla(df: pd.DataFrame) -> pd.DataFrame: - """ Drop HLA effect alleles with present / absent encoding """ - - no_hla: pd.DataFrame = df.query('effect_allele != "P" | effect_allele != "N"') - - if df.shape[0] > no_hla.shape[0]: - logger.debug("HLA alleles detected and dropped") - - return no_hla - - -def _check_duplicate_identifiers(df: pd.DataFrame) -> pd.DataFrame: - if 'other_allele' in df: - logger.debug("Other allele column detected, including other_allele in variant identifier") - group_cols = ['chr_name', 'chr_position', 'effect_allele', 'other_allele'] + logger.warning(f"Multiple other_alleles detected in {n_dropped} variants") + logger.warning("Other allele for these variants is set to missing") + + +def assign_effect_type( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + for variant in variants: + match (variant.is_recessive, variant.is_dominant): + case (None, None) | ("FALSE", "FALSE"): + pass # default value is additive, pass to break match and yield + case ("FALSE", "TRUE"): + variant.effect_type = EffectType.DOMINANT + case ("TRUE", "FALSE"): + 
variant.effect_type = EffectType.RECESSIVE + case _: + logger.critical(f"Bad effect type setting: {variant}") + raise Exception + yield variant + + +def remap_harmonised( + variants: typing.Generator[ScoreVariant, None, None], harmonised: bool +) -> typing.Generator[ScoreVariant, None, None]: + if harmonised: + for variant in variants: + # using the harmonised field in the header to make sure we don't accidentally overwrite + # positions with empty data (e.g. in an unharmonised file) + # if harmonisation has failed we _always_ want to use that information + variant.chr_name = variant.hm_chr + variant.chr_position = variant.hm_pos + if variant.other_allele is None: + variant.other_allele = variant.hm_inferOtherAllele + yield variant else: - logger.warning("Other allele column not detected, dropping other_allele from variant identifier.") - group_cols = ['chr_name', 'chr_position', 'effect_allele'] - - u_count: pd.Series = df.groupby(group_cols).size() - - if all(u_count == 1): - return df.assign(is_duplicated=False) - else: - logger.warning("Duplicate variants in scoring file: {}".format(df['filename_prefix'].unique())) - u_count = u_count > 1 - u_count.name = 'is_duplicated' - df = pd.merge(df, u_count, how='left', left_on=group_cols, right_index=True) - df.loc[df.is_duplicated.isnull(), 'is_duplicated'] = False # handles variants with null chr/pos - return df - + for variant in variants: + # can't remap, so don't try + yield variant + + +def check_bad_variant( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + n_bad = 0 + for variant in variants: + match variant: + case ( + ScoreVariant(chr_name=None) + | ScoreVariant(chr_position=None) + | ScoreVariant(effect_allele=None) + ): + # (effect weight checked separately) + n_bad += 1 + if not Config.drop_missing: + yield variant + case _: + yield variant + + if n_bad > 1: + logger.warning(f"{n_bad} bad variants") + + +def check_effect_allele( + variants: 
typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + n_bad = 0 + for variant in variants: + if not variant.effect_allele.is_snp: + n_bad += 1 + + yield variant + + if n_bad > 1: + logger.warning(f"{n_bad} variants have invalid effect alleles (not ACTG)") + + +def detect_complex( + variants: typing.Generator[ScoreVariant, None, None] +) -> typing.Generator[ScoreVariant, None, None]: + """Some older scoring files in the PGS Catalog are complicated. + They often require bespoke set up to support interaction terms, etc + """ + is_complex = False -def _check_shape(df: pd.DataFrame) -> None: - assert len(df.columns) > 1, "ERROR: scorefile not formatted correctly (0 columns)" - assert df.shape[0] > 0, "ERROR: No variants detected in input file (0 rows)" + for variant in variants: + if not is_complex: + if variant.is_complex: + is_complex = True + yield variant -def _check_columns(df: pd.DataFrame) -> None: - assert {'chr_name', 'chr_position'}.issubset(df.columns), "Missing chromosomal positions. If you're " \ - "using PGS Catalog files with rsIDs you should request " \ - "harmonised data files (HmPOS) instead." 
- assert 'effect_allele' in df, "ERROR: Missing effect allele column" + if is_complex: + logger.warning("Complex scoring file detected") + logger.warning( + "Complex files are difficult to calculate properly and may require manual intervention" + ) diff --git a/pgscatalog_utils/scorefile/read.py b/pgscatalog_utils/scorefile/read.py deleted file mode 100644 index dbd559b..0000000 --- a/pgscatalog_utils/scorefile/read.py +++ /dev/null @@ -1,79 +0,0 @@ -import gzip -import io -import logging -import os - -import pandas as pd - -logger = logging.getLogger(__name__) - - -def load_scorefile(path: str) -> tuple[dict, pd.DataFrame]: - logger.debug(f'Reading scorefile {path}') - df = pd.read_table(path, dtype=_scorefile_dtypes(), comment='#', na_values=['None'], low_memory=False) - return (_read_header(path), - df.assign(filename_prefix=get_scorefile_basename(path), filename=path, row_nr=df.index)) - - -def _read_header(path: str) -> dict: - """Parses the header of a PGS Catalog format scorefle into a dictionary""" - f = io.TextIOWrapper(gzip.open(path, 'r')) - try: - f.readline() - except gzip.BadGzipFile: - f = open(path, 'r') - - header = {} - lastline = '#' - while lastline.startswith('#'): - lastline = f.readline() - line = lastline.strip() - if line.startswith('#'): - if '=' in line: - line = line[1:].split('=') - field, val = [x.strip() for x in line] - if field in remap_header: - header[remap_header[field]] = val - else: - header[field] = val - - if ('genome_build' in header) and (header['genome_build'] == 'NR'): - header['genome_build'] = None - f.close() - return header - - -def _scorefile_dtypes() -> dict[str]: - """ Data types for columns that might be found in a scorefile """ - return {'rsID': str, 'chr_name': str, 'chr_position': pd.UInt64Dtype(), 'effect_allele': 'str', - 'effect_weight': float, 'locus_name': str, 'OR': float, 'hm_source': str, 'hm_rsID': str, - 'hm_chr': str, 'hm_pos': pd.UInt64Dtype(), 'hm_inferOtherAllele': str} - - -def 
get_scorefile_basename(path: str) -> str: - """ Return the basename of a scoring file without extension """ - filename = os.path.basename(path) - if filename.endswith('.txt.gz'): - filename = filename.replace('.txt.gz', '') - elif filename.endswith('.txt'): - filename = filename.replace('.txt', '') - return filename - - -remap_header = { - 'PGS ID': 'pgs_id', - 'PGS Name': 'pgs_name', - 'Reported Trait': 'trait_reported', - 'Original Genome Build': 'genome_build', - 'Number of Variants': 'variants_number', - 'PGP ID': 'pgp_id', - 'Citation': 'citation', - 'LICENSE': 'license', - # Harmonization related - 'HmPOS Build': 'HmPOS_build', - 'HmPOS Date': 'HmPOS_date', - 'HmVCF Reference': 'HmVCF_ref', - 'HmVCF Date': 'HmVCF_date', - 'HmVCF N Matched Variants': 'HmVCF_n_matched', - 'HmVCF N Unmapped Variants': 'HmVCF_n_unmapped' -} # Used to maintain reverse compatibility to old scoring files diff --git a/pgscatalog_utils/scorefile/scorevariant.py b/pgscatalog_utils/scorefile/scorevariant.py new file mode 100644 index 0000000..38135dc --- /dev/null +++ b/pgscatalog_utils/scorefile/scorevariant.py @@ -0,0 +1,137 @@ +from typing import Optional + +from pgscatalog_utils.scorefile.effectallele import EffectAllele +from pgscatalog_utils.scorefile.effecttype import EffectType + + +class ScoreVariant: + mandatory_fields: tuple[str] = ( + "effect_allele", + "effect_weight", + "accession", + "row_nr", + ) + optional_fields: tuple[str] = ( + "chr_name", + "chr_position", + "rsID", + "other_allele", + "hm_chr", + "hm_pos", + "hm_inferOtherAllele", + "hm_source", + "is_dominant", + "is_recessive", + "hm_rsID", + "hm_match_chr", + "hm_match_pos", + "is_duplicated", + "effect_type", + ) + complex_fields: tuple[str] = ("is_haplotype", "is_diplotype", "is_interaction") + + # column names for output are used by __iter__ and when writing out + output_fields: tuple[str] = ( + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "effect_type", + 
"is_duplicated", + "accession", + "row_nr", + ) + + # slots uses magic to improve speed and memory when making millions of objects + __slots__ = mandatory_fields + optional_fields + ("is_complex",) + + # __init__ is intentionally verbose and avoids using loops or trickery to work: + # - attributes won't change often + # - class accepts keyword parameters only to init (not positional) + # - type hints are helpful in parameters + # - setting sensible defaults for optional fields is clear + # - being verbose helps prevent IDE warnings + # extra kwargs are silently ignored + # (yes, effect_weight is treated as a str, want to avoid rounding errors at this stage) + def __init__( + self, + *, + effect_allele: str, + effect_weight: str, + accession: str, + row_nr: int, + chr_name: str = None, + chr_position: int = None, + rsID: str = None, + other_allele: str = None, + hm_chr: str = None, + hm_pos: int = None, + hm_inferOtherAllele: str = None, + hm_source: str = None, + is_dominant: str = None, + is_recessive: str = None, + hm_rsID: str = None, + hm_match_chr: str = None, + hm_match_pos: str = None, + is_duplicated: bool = False, + effect_type: EffectType = EffectType.ADDITIVE, + is_complex: bool = False, + **kwargs, + ): + # start with mandatory attributes + self.effect_allele: EffectAllele = EffectAllele(effect_allele) + self.effect_weight: str = effect_weight + self.accession: str = accession + self.row_nr: int = int(row_nr) + + # now set optional fields + self.chr_name: Optional[str] = chr_name + + # casting to int is important for arrow export + try: + self.chr_position: Optional[int] = int(chr_position) + except (ValueError, TypeError): + self.chr_position = None + + self.rsID: Optional[str] = rsID + self.other_allele: Optional[str] = other_allele + self.hm_chr: Optional[str] = hm_chr + + # casting to int is important when harmonised data may replace chr_position + try: + self.hm_pos: Optional[int] = int(hm_pos) + except (ValueError, TypeError): + self.hm_pos = None 
+ + self.hm_inferOtherAllele: Optional[str] = hm_inferOtherAllele + self.hm_source: Optional[str] = hm_source + self.is_dominant: Optional[bool] = is_dominant + self.is_recessive: Optional[bool] = is_recessive + self.hm_rsID: Optional[str] = hm_rsID + self.hm_match_chr: Optional[str] = hm_match_chr + self.hm_match_pos: Optional[str] = hm_match_pos + self.is_duplicated: Optional[bool] = is_duplicated + self.effect_type: EffectType = effect_type + + # these fields are important to check if variants are complex + if any([x in kwargs for x in self.complex_fields]): + is_complex = True + self.is_complex: bool = is_complex + + def __repr__(self): + class_name = type(self).__name__ + values = {} + + for key in ScoreVariant.__slots__: + values[key] = getattr(self, key, None) + + # extract str parameter for effect allele + values["effect_allele"] = values["effect_allele"].allele + + params = ",".join([f"{k}={repr(v)}" for k, v in values.items()]) + return f"{class_name}({params})" + + def __iter__(self): + for attr in self.output_fields: + yield getattr(self, attr) diff --git a/pgscatalog_utils/scorefile/scoringfile.py b/pgscatalog_utils/scorefile/scoringfile.py new file mode 100644 index 0000000..bb3aaa2 --- /dev/null +++ b/pgscatalog_utils/scorefile/scoringfile.py @@ -0,0 +1,188 @@ +import csv +import logging +import os +import pathlib +import typing +from dataclasses import dataclass +from itertools import islice + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.scoringfileheader import ScoringFileHeader, auto_open +from pgscatalog_utils.scorefile.qc import quality_control +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant + +logger = logging.getLogger(__name__) + + +@dataclass +class ScoringFile: + path: pathlib.Path + accession: str + header: typing.Union[ScoringFileHeader, None] + genome_build: typing.Union[GenomeBuild, None] + harmonised: bool + 
fields: list[str] + variants: typing.Generator[ScoreVariant, None, None] + + def __post_init__(self): + if self.header.HmPOS_build: + logger.info( + f"{self.path} harmonised data detected: {self.header.HmPOS_build}" + ) + self.genome_build = self.header.HmPOS_build + + mandatory_columns = {"chr_name", "effect_allele", "effect_weight"} + if not mandatory_columns.issubset(self.fields) not in self.fields: + err_msg = f"{self.path} missing fields" + raise Exception(err_msg) + + @classmethod + def from_path(cls, path: pathlib.Path): + header = ScoringFileHeader.from_path(path) + name = os.path.basename(path).split(".")[0] + if header: + if header.HmPOS_build: + harmonised = True + genome_build = header.HmPOS_build + else: + harmonised = False + genome_build = header.genome_build + else: + harmonised = False + genome_build = None + + start_line, cols = get_columns(path) + is_wide = detect_wide(cols) + + logger.info(f"Lazily reading variants from {path}") + variants: typing.Generator[ + ScoreVariant, None, None + ] = ScoringFile.read_variants( + path=path, start_line=start_line, fields=cols, name=name, is_wide=is_wide + ) + + # the quality_control function normalises a list of variants to have a standard representation + # attributes are overwritten using harmonised data, etc. 
+ variants: typing.Generator[ScoreVariant, None, None] = quality_control( + variants, header=header, harmonised=harmonised, wide=is_wide + ) + + return cls( + path=path, + header=header, + genome_build=genome_build, + harmonised=harmonised, + fields=cols, + variants=variants, + accession=name, + ) + + def generate_log(self, counted: typing.Counter): + log = { + key: str(value) if value is not None else None + for key, value in self.header.__dict__.items() + } + + if log["variants_number"] is None: + # custom scoring files might not have this information + log["variants_number"] = counted["n_variants"] + + if ( + int(log["variants_number"]) != counted["n_variants"] + and not Config.drop_missing + ): + logger.warning( + f"Mismatch between header ({log['variants_number']}) and output row count ({counted['n_variants']}) for {self.accession}" + ) + logger.warning( + "This can happen with older scoring files in the PGS Catalog (e.g. PGS000028)" + ) + + # multiple terms may be separated with a pipe + if log["trait_mapped"]: + log["trait_mapped"] = log["trait_mapped"].split("|") + + if log["trait_efo"]: + log["trait_efo"] = log["trait_efo"].split("|") + + log["columns"] = self.fields + log["use_liftover"] = Config.liftover + log["use_harmonised"] = self.harmonised + log["sources"] = [k for k, v in counted.items() if k != "n_variants"] + + return {self.accession: log} + + @staticmethod + def read_variants( + path, fields, start_line, name: str, is_wide: bool + ) -> typing.Generator[ScoreVariant, None, None]: + open_function = auto_open(path) + row_nr = 0 + + with open_function(path, mode="rt") as f: + for _ in range(start_line + 1): + # skip header + next(f) + + while True: + batch = list(islice(f, Config.batch_size)) + if not batch: + break + + csv_reader = csv.reader(batch, delimiter="\t") + yield from read_rows(csv_reader, fields, name, is_wide, row_nr) + # this is important because row_nr resets for each batch + row_nr += len(batch) + + +def read_rows( + csv_reader, 
fields: list[str], name: str, wide: bool, row_nr: int +) -> typing.Generator[ScoreVariant, None, None]: + for row in csv_reader: + variant = dict(zip(fields, row)) + + if wide: + ew_col_idxs: list[int] = [ + i for i, x in enumerate(["effect_weight_" in x for x in fields]) if x + ] + for i, weight_name in zip(ew_col_idxs, [fields[i] for i in ew_col_idxs]): + yield ScoreVariant( + **variant, + **{ + "accession": weight_name, + "row_nr": row_nr, + "effect_weight": variant[weight_name], + }, + ) + else: + yield ScoreVariant(**variant, **{"accession": name, "row_nr": row_nr}) + + row_nr += 1 + + +def get_columns(path) -> tuple[int, list[str]]: + open_function = auto_open(path) + with open_function(path, mode="rt") as f: + for i, line in enumerate(f): + if line.startswith("#"): + continue + line_no, cols = i, line.strip().split("\t") + if len(set(cols)) != len(cols): + logger.critical(f"Duplicated column names: {cols}") + raise ValueError + + return line_no, cols + + +def detect_wide(cols: list[str]) -> bool: + """ + Check columns to see if multiple effect weights are present. 
Multiple effect weights must be present in the form: + effect_weight_suffix1 + effect_weight_suffix2 + """ + if any(["effect_weight_" in x for x in cols]): + logger.info("Wide scoring file detected with multiple effect weights") + return True + else: + return False diff --git a/pgscatalog_utils/scorefile/scoringfileheader.py b/pgscatalog_utils/scorefile/scoringfileheader.py new file mode 100644 index 0000000..06d7f10 --- /dev/null +++ b/pgscatalog_utils/scorefile/scoringfileheader.py @@ -0,0 +1,92 @@ +import gzip +import inspect +import pathlib +from dataclasses import dataclass + +from pgscatalog_utils.download.GenomeBuild import GenomeBuild + + +@dataclass +class ScoringFileHeader: + pgs_id: str + pgp_id: str + pgs_name: str + genome_build: GenomeBuild + variants_number: int + trait_reported: str + trait_efo: str + trait_mapped: str + weight_type: str + citation: str + HmPOS_build: GenomeBuild + HmPOS_date: str + format_version: str + license: str = ( + "PGS obtained from the Catalog should be cited appropriately, and " + "used in accordance with any licensing restrictions set by the authors. See EBI " + "Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional details." + ) + + def __post_init__(self): + if self.variants_number: + self.variants_number = int(self.variants_number) + + self.genome_build = GenomeBuild.from_string(self.genome_build) + if self.HmPOS_build: + self.HmPOS_build = GenomeBuild.from_string(self.HmPOS_build) + + @classmethod + def from_path(cls, path: pathlib.Path): + raw_header: dict = raw_header_to_dict(read_header(path)) + # only keep keys needed by class but support partial headers with None values + keep_keys = inspect.get_annotations(ScoringFileHeader).keys() + header_dict = {k: raw_header.get(k) for k in keep_keys} + # ... 
so we can unpack the dict into a dataclass + + if header_dict.get("license") is None: + # missing license data in header means default license + # (this may change in the future) + header_dict["license"] = cls.license + + if "HmPOS_build" not in header_dict: + # working with pgs catalog formatted header but unharmonised data + header_dict["HmPOS_build"] = None + + if not all([v is None for _, v in header_dict.items()]): + return ScoringFileHeader(**header_dict) + else: + # no header available + raise Exception(f"No header detected in scoring file {path=}") + + +def raw_header_to_dict(header): + d = {} + for item in header: + key, value = item.split("=") + d[key[1:]] = value # drop # character from key + return d + + +def read_header(path: pathlib.Path): + """Parses the header of a PGS Catalog format scorefile into a dictionary""" + open_function = auto_open(path) + with open_function(path, "rt") as f: + yield from _gen_header_lines(f) + + +def _gen_header_lines(f): + for line in f: + if line.startswith("#"): + if "=" in line: + yield line.strip() + else: + # stop reading lines + break + + +def auto_open(filepath): + with open(filepath, "rb") as test_f: + if test_f.read(2) == b"\x1f\x8b": + return gzip.open + else: + return open diff --git a/pgscatalog_utils/scorefile/write.py b/pgscatalog_utils/scorefile/write.py index 8a3233b..1e43594 100644 --- a/pgscatalog_utils/scorefile/write.py +++ b/pgscatalog_utils/scorefile/write.py @@ -1,43 +1,183 @@ +import csv +import functools +import gzip import logging import os +import sqlite3 +import typing +from collections import Counter +from itertools import islice -import pandas as pd +from pgscatalog_utils.scorefile.config import Config +from pgscatalog_utils.scorefile.scorevariant import ScoreVariant +from pgscatalog_utils.scorefile.scoringfile import ScoringFile + +try: + import pyarrow as pa + + PYARROW_AVAILABLE = True +except ImportError: + PYARROW_AVAILABLE = False logger = logging.getLogger(__name__) -def 
write_scorefile(df: pd.DataFrame, path: str) -> None: - cols: list[str] = ['chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession', 'row_nr'] - - if os.path.exists(path): - logger.debug("Output file exists: setting write mode to append") - write_mode = 'a' - header = False - else: - logger.debug("Output file doesn't exist: setting write mode to write (create new file)") - write_mode = 'w' - header = True - - out_df: pd.DataFrame = (df.drop('accession', axis=1) - .rename({'filename_prefix': 'accession'}, axis=1) - .pipe(_filter_failed_liftover)) - - if 'other_allele' not in out_df: - logger.warning("No other allele information detected, writing out as missing data") - out_df['other_allele'] = None - - if path.endswith('.gz'): - logger.debug("Writing out gzip-compressed combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", compression='gzip', mode=write_mode, header=header) - else: - logger.debug("Writing out combined scorefile") - out_df[cols].to_csv(path, index=False, sep="\t", mode=write_mode, header=header) - - -def _filter_failed_liftover(df: pd.DataFrame) -> pd.DataFrame: - if 'liftover' in df: - logger.debug("Filtering variants that failed liftover") - return df.query('liftover == True') - else: - return df +class DataWriter: + def __init__(self, filename): + self.filename = filename + self.fieldnames = [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "effect_type", + "is_duplicated", + "accession", + "row_nr", + ] + logger.info(f"Output filename: {filename}") + + def write(self, batch): + pass + + +class TextFileWriter(DataWriter): + def __init__(self, compress, filename): + super().__init__(filename) + self.compress = compress + + if self.compress: + logger.info("Writing with gzip") + self.open_function = functools.partial(gzip.open, compresslevel=6) + else: + logger.info("Writing text file") + self.open_function = open + + def 
write(self, batch): + mode = "at" if os.path.exists(self.filename) else "wt" + with self.open_function(self.filename, mode) as f: + writer = csv.writer( + f, + delimiter="\t", + lineterminator="\n", + ) + if mode == "wt": + writer.writerow(ScoreVariant.output_fields) + + writer.writerows(batch) + + +class SqliteWriter(DataWriter): + def __init__(self, filename): + super().__init__(filename) + + def write(self, batch): + conn = sqlite3.connect(self.filename) + cursor = conn.cursor() + placeholders = ", ".join("?" for _ in self.fieldnames) + + values = [ + tuple(row[key] for key in self.fieldnames if key in row) for row in batch + ] + + cursor.execute( + f"CREATE TABLE IF NOT EXISTS variants ({', '.join(self.fieldnames)})" + ) + cursor.executemany(f"INSERT INTO variants VALUES ({placeholders})", values) + conn.commit() + conn.close() + + +class PyarrowWriter(DataWriter): + if PYARROW_AVAILABLE: + schema = pa.schema( + [ + pa.field("chr_name", pa.string()), + pa.field("chr_position", pa.uint64()), + pa.field("effect_allele", pa.string()), + pa.field("other_allele", pa.string()), + pa.field("effect_weight", pa.string()), + pa.field("effect_type", pa.string()), + pa.field("is_duplicated", pa.bool_()), + pa.field("accession", pa.string()), + pa.field("row_nr", pa.uint64()), + ] + ) + + def __init__(self, filename): + if not PYARROW_AVAILABLE: + # TODO: provide a pip command + raise ImportError( + "pyarrow output not available, please install pyarrow as listed in the pyproject.toml extras section" + ) + super().__init__(filename) + + self._sink = pa.OSFile(self.filename, "wb") + self._writer: pa.RecordBatchFileWriter = pa.ipc.new_file( + self._sink, self.schema + ) + + def write(self, batch: list[ScoreVariant]): + batch_dict = { + "chr_name": [x.chr_name for x in batch], + "chr_position": [x.chr_position for x in batch], + "effect_allele": [str(x.effect_allele) for x in batch], + "other_allele": [x.other_allele for x in batch], + "effect_weight": [x.effect_weight for x in 
batch], + "effect_type": [str(x.effect_type) for x in batch], + "is_duplicated": [x.is_duplicated for x in batch], + "accession": [x.accession for x in batch], + "row_nr": [x.row_nr for x in batch], + } + + record_batch = pa.RecordBatch.from_pydict(batch_dict, schema=self.schema) + self._writer.write(record_batch) + + def __del__(self): + # it's very important to close the writer and file, or it gets corrupted + # can't use a with statement, so close when the object gets deleted + self._writer.close() + if not self._sink.closed: + self._sink.close() + + +def write_combined( + scoring_files: list[ScoringFile], out_path: str +) -> dict[str : typing.Counter]: + # compresslevel can be really slow, default is 9 + match fn := out_path.lower(): + case _ if fn.endswith("gz"): + writer = TextFileWriter(compress=True, filename=out_path) + case _ if fn.endswith("txt"): + writer = TextFileWriter(compress=False, filename=out_path) + case _ if fn.endswith("sqlite"): + writer = SqliteWriter(filename=out_path) + case _ if fn.endswith("ipc"): + writer = PyarrowWriter(filename=out_path) + case _: + raise ValueError(f"Unsupported file extension: {out_path}") + + counts = [] + log = {} + for scoring_file in scoring_files: + logger.info(f"Writing {scoring_file.accession} variants") + while True: + batch = list(islice(scoring_file.variants, Config.batch_size)) + if not batch: + break + writer.write(batch=batch) + counts = calculate_log(batch, counts) + + log[scoring_file.accession] = sum(counts, Counter()) + counts = [] + + return log + + +def calculate_log(batch: list[ScoreVariant], log: list[Counter]) -> list[Counter]: + # these statistics can only be generated while iterating through variants + n_variants = Counter("n_variants" for item in batch) + hm_source = Counter(getattr(item, "hm_source") for item in batch) + log.extend([n_variants + hm_source]) + return log diff --git a/poetry.lock b/poetry.lock index dac6b3d..05b2c77 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ 
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "anyio" @@ -297,6 +297,17 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -671,6 +682,17 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "distlib" +version = "0.3.7" +description = "Distribution utilities" +optional = false +python-versions = "*" +files = [ + {file = "distlib-0.3.7-py2.py3-none-any.whl", hash = "sha256:2e24928bc811348f0feb63014e97aaae3037f2cf48712d51ae61df7fd6075057"}, + {file = "distlib-0.3.7.tar.gz", hash = "sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8"}, +] + [[package]] name = "exceptiongroup" version = "1.1.3" @@ -713,6 +735,22 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "filelock" +version = "3.13.1" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.13.1-py3-none-any.whl", hash = "sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c"}, + {file = "filelock-3.13.1.tar.gz", hash = "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e"}, +] + +[package.extras] +docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +typing = ["typing-extensions (>=4.8)"] + [[package]] name = "fonttools" version = "4.42.1" @@ -781,6 +819,20 @@ files = [ {file = "fqdn-1.5.1.tar.gz", hash = "sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f"}, ] +[[package]] +name = "identify" +version = "2.5.31" +description = "File identification library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "identify-2.5.31-py2.py3-none-any.whl", hash = "sha256:90199cb9e7bd3c5407a9b7e81b4abec4bb9d249991c79439ec8af740afc6293d"}, + {file = "identify-2.5.31.tar.gz", hash = "sha256:7736b3c7a28233637e3c36550646fc6389bedd74ae84cb788200cc8e2dd60b75"}, +] + +[package.extras] +license = ["ukkonen"] + [[package]] name = "idna" version = "3.4" @@ -1737,6 +1789,20 @@ files = [ {file = "nest_asyncio-1.5.8.tar.gz", hash = "sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb"}, ] +[[package]] +name = "nodeenv" +version = "1.8.0" +description = "Node.js virtual environment builder" +optional = false +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" +files = [ + {file = "nodeenv-1.8.0-py2.py3-none-any.whl", hash = "sha256:df865724bb3c3adc86b3876fa209771517b0cfe596beff01a92700e0e8be4cec"}, + {file = "nodeenv-1.8.0.tar.gz", hash = "sha256:d51e0c37e64fbf47d017feac3145cdbb58836d7eee8c6f6d3b6880c5456227d2"}, +] + +[package.dependencies] +setuptools = "*" + 
[[package]] name = "notebook" version = "7.0.4" @@ -1871,8 +1937,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -2082,6 +2148,24 @@ pyarrow = ["pyarrow (>=4.0.0)"] timezone = ["backports.zoneinfo", "tzdata"] xlsx2csv = ["xlsx2csv (>=0.8.0)"] +[[package]] +name = "pre-commit" +version = "3.5.0" +description = "A framework for managing and maintaining multi-language pre-commit hooks." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pre_commit-3.5.0-py2.py3-none-any.whl", hash = "sha256:841dc9aef25daba9a0238cd27984041fa0467b4199fc4852e27950664919f660"}, + {file = "pre_commit-3.5.0.tar.gz", hash = "sha256:5804465c675b659b0862f07907f96295d490822a450c4c40e747d0b1c6ebcb32"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + [[package]] name = "prometheus-client" version = "0.17.1" @@ -2161,6 +2245,54 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "pyarrow" +version = "14.0.1" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pycparser" version = "2.21" @@ -3031,6 +3163,26 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "virtualenv" +version = "20.24.6" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.7" +files = [ + {file = "virtualenv-20.24.6-py3-none-any.whl", hash = "sha256:520d056652454c5098a00c0f073611ccbea4c79089331f60bf9d7ba247bb7381"}, + {file = "virtualenv-20.24.6.tar.gz", hash = "sha256:02ece4f56fbf939dbbc33c0715159951d6bf14aaf5457b092e4548e1382455af"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<4" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] + [[package]] name = "wcwidth" version = "0.2.6" @@ -3157,4 +3309,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2859497817dfd52518f4fa2ba527c716a5bb5e4354175f791b314e80a033edf2" +content-hash = "397df0f3e64b00fabebb36bf3c3576d94c2f34c2f34dcec223973a19e525d2e6" diff --git a/poetry.toml b/poetry.toml new file mode 100644 index 0000000..53b35d3 --- /dev/null +++ b/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +create = true +in-project = true diff --git a/pyproject.toml b/pyproject.toml index 571ed03..098283f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pgscatalog_utils" -version = "0.4.3" +version = "0.5.0" description = "Utilities for working with PGS Catalog API and scoring files" homepage = "https://github.com/PGScatalog/pgscatalog_utils" authors = ["Benjamin Wingfield ", "Samuel 
Lambert ", "Laurent Gil "] @@ -16,7 +16,6 @@ aggregate_scores = "pgscatalog_utils.aggregate.aggregate_scores:aggregate_scores validate_scorefiles = "pgscatalog_utils.validate.validate_scorefile:validate_scorefile" relabel_ids = "pgscatalog_utils.relabel.relabel_ids:relabel_ids" ancestry_analysis = "pgscatalog_utils.ancestry.ancestry_analysis:ancestry_analysis" -samplesheet_to_json = "pgscatalog_utils.samplesheet.check:check_samplesheet" [tool.poetry.dependencies] python = "^3.10" @@ -30,6 +29,8 @@ polars = "^0.15.0" zstandard = "^0.18.0" pgzip = "^0.3.2" scikit-learn = "^1.2.1" +pre-commit = "^3.5.0" +pyarrow = "^14.0.1" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" @@ -43,3 +44,5 @@ seaborn = "^0.12.2" requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" +[tool.pytest.ini_options] +addopts = "--doctest-modules" \ No newline at end of file diff --git a/tests/data/combine/PGS001229_22.txt b/tests/data/combine/PGS001229_22.txt new file mode 100644 index 0000000..4084c13 --- /dev/null +++ b/tests/data/combine/PGS001229_22.txt @@ -0,0 +1,850 @@ +###PGS CATALOG SCORING FILE - see https://https://www.pgscatalog.org/downloads/#dl_ftp_scoring_scoring for additional information +#format_version=2.0 +##POLYGENIC SCORE (PGS) INFORMATION +#pgs_id=PGS001229 +#pgs_name=GBE_INI50 +#trait_reported=Standing height +#trait_mapped=body height +#trait_efo=EFO_0004339 +#weight_type=NR +#genome_build=GRCh37 +#variants_number=835 +##SOURCE INFORMATION +#pgp_id=PGP000244 +#citation=Tanigawa Y et al. medRxiv (2021). 
doi:10.1101/2021.09.02.21262942 +rsID chr_name chr_position effect_allele other_allele effect_weight is_haplotype imputation_method locus_name variant_description +rs5746679 22 17080378 G A 1.045457e-02 False +rs2192430 22 17300230 A G 1.411475e-04 False +rs165636 22 17318864 A C 8.166266e-03 False +rs165808 22 17327595 T C 7.791641e-03 False +rs5748743 22 17409813 A G 3.108784e-04 False +rs11703655 22 17450952 G A -3.033983e-02 False +rs2192155 22 17492533 G A 3.889990e-03 False +rs2845402 22 17542810 C T 8.036290e-03 False +rs4819958 22 17565013 G A 2.135621e-02 False +rs879577 22 17589209 T C 3.026491e-03 False +rs5994165 22 17600977 A G 1.581277e-02 False +rs35665085 22 17625915 A G -1.172964e-01 False +rs1034859 22 17630486 A C 1.012909e-02 False +rs738032 22 17633785 C T 2.325500e-03 False +rs5994185 22 17643689 A G 3.361814e-03 False +rs2231495 22 17669306 C T 2.145060e-02 False +rs5747018 22 17677699 T C -7.031384e-04 False +rs17807317 22 17680519 C A 1.079236e-03 False +rs9606655 22 17701234 G A 4.477145e-03 False +rs78718739 22 17703119 A T 7.771872e-04 False +rs73153427 22 17718699 C A -1.320632e-02 False +rs4266110 22 17721595 C T 9.480363e-03 False +rs4819982 22 17727648 T C 7.811685e-03 False +rs5749000 22 17738177 G A -4.719812e-03 False +rs5749002 22 17749096 A G -5.244795e-03 False +rs11704699 22 17770181 G T -3.101703e-02 False +rs5749032 22 17793969 G A 1.774444e-02 False +rs5994272 22 17815696 G C -5.516090e-03 False +rs4820001 22 17827684 G A -5.944752e-03 False +rs2040692 22 17831813 T C 1.061587e-02 False +rs9606701 22 17844929 T G 1.717643e-03 False +rs73391753 22 17850661 T C -2.805489e-02 False +rs4819581 22 17887534 A G 7.723542e-04 False +rs2522310 22 17887725 A G 7.472703e-03 False +rs5747199 22 17958221 C A -2.098647e-02 False +rs174346 22 18036253 G A -1.772981e-02 False +rs174351 22 18038786 A G -2.119071e-03 False +rs9605406 22 18262301 A T -5.065485e-03 False +rs389496 22 18289204 A G 5.306345e-03 False +rs399757 22 18295575 C T 
2.300129e-02 False +rs1550663 22 18296238 G A -5.665446e-03 False +rs439231 22 18319179 T C 3.440642e-02 False +rs2401424 22 18393534 A C 1.764269e-02 False +rs1076540 22 18439958 T C -2.261707e-03 False +rs4819654 22 18483388 G A 3.318724e-02 False +rs9617650 22 18488883 C G -1.919180e-02 False +rs397709 22 18489048 C A 1.233198e-02 False +rs452579 22 18495470 A G -5.804926e-03 False +rs1992576 22 18537145 G A -4.930116e-03 False +rs464385 22 18571008 A G -8.844726e-05 False +rs3827281 22 18584433 C T -1.169893e-03 False +rs9618216 22 18631365 T C -1.551714e-03 False +rs3180408 22 18650682 T C -1.313784e-02 False +rs2010694 22 18890037 A G 5.968921e-02 False +rs454534 22 18891398 G A 6.891943e-03 False +rs2080346 22 18892575 A G -2.244470e-03 False +rs2016108 22 18915963 A G 3.719756e-03 False +rs2518810 22 18959581 T C 6.464581e-03 False +rs2019061 22 18963340 A G -1.397565e-02 False +rs11089247 22 18970915 T C -1.507131e-03 False +rs2073776 22 19024651 T C -3.505750e-03 False +rs712965 22 19121872 A G 1.644046e-02 False +rs2275901 22 19135603 A G -2.970077e-02 False +rs11089259 22 19190143 T C 3.268027e-03 False +rs361787 22 19263698 T C 2.057255e-02 False +rs8135222 22 19292446 G T 1.153989e-02 False +rs34292276 22 19371052 T C 1.055134e-02 False +rs1128399 22 19420109 C T -8.628228e-03 False +rs5748218 22 19451186 A C 2.141029e-02 False +rs5748260 22 19518079 C T 5.372247e-03 False +rs5993713 22 19581331 T C 1.686942e-02 False +rs9606090 22 19593854 C A 6.544249e-04 False +rs8135254 22 19606703 G A 2.070121e-02 False +rs9617823 22 19649005 A G 2.868601e-03 False +rs6518580 22 19735854 C T 6.262962e-03 False +rs1005133 22 19738355 T C 4.973840e-05 False +rs9680615 22 19770886 A G -1.013929e-02 False +rs2871043 22 19781823 T C 2.481609e-02 False +rs2073750 22 19873357 T C 1.163020e-02 False +rs5748469 22 19907099 A C -2.676450e-02 False +rs9618723 22 19968597 T C -2.203945e-02 False +rs5748515 22 20046344 G A -9.801428e-03 False +rs59528277 22 20084821 C T 
-2.232886e-02 False +rs625704 22 20185457 A G 6.892171e-03 False +rs672570 22 20189077 T C 1.738215e-02 False +rs7293032 22 20219648 A G 9.307625e-03 False +rs855050 22 20248391 A G -5.405845e-03 False +rs855061 22 20267213 A G 6.713242e-03 False +rs741413 22 20286099 G T 1.574758e-02 False +rs35012563 22 20749042 G A 6.603339e-03 False +rs361860 22 20754039 A G -1.181141e-02 False +rs1771145 22 20775167 T C 1.160113e-02 False +rs9680797 22 20780296 A G 6.735311e-02 False +rs1005640 22 20789074 C T 2.844307e-02 False +rs12628193 22 20791438 A C 4.734740e-02 False +rs1035239 22 20793914 C T 7.009781e-03 False +rs75179603 22 20839810 T G 3.947346e-03 False +rs738092 22 20860931 T C 5.613511e-04 False +rs10427922 22 20979980 G A 3.231665e-03 False +rs2080195 22 20991771 G A 4.226765e-03 False +rs5751800 22 21075537 C A -2.096453e-03 False +rs361979 22 21154393 G T -4.297086e-03 False +rs756878 22 21323357 C T -6.041745e-03 False +rs178275 22 21331918 G C -2.280912e-03 False +rs105034 22 21334924 C G -2.031369e-02 False +rs28372939 22 21356824 A G 1.476577e-02 False +rs2072550 22 21386019 A G 1.435557e-02 False +rs431319 22 21449028 G A -1.537701e-02 False +rs2845419 22 21463515 A G -1.335614e-02 False +rs2298428 22 21982892 T C -6.373335e-02 False +rs62235077 22 22001704 T G 2.809584e-02 False +rs76940365 22 22062480 T C 5.291130e-02 False +rs10427813 22 22080735 G A -1.394260e-02 False +rs78907487 22 22151939 C A -8.287849e-03 False +rs9607287 22 22163425 G A 5.518983e-02 False +rs412050 22 22307519 C G -3.486191e-03 False +rs79165737 22 22351283 G A -7.483763e-04 False +rs5844480 22 22394291 AG A 4.320583e-03 False +rs2213141 22 22395754 T C 2.587971e-03 False +rs6519111 22 22424302 A C 1.140800e-03 False +rs77010661 22 22473905 C A 1.226009e-02 False +rs2073447 22 22550450 G C 1.773244e-02 False +rs5757417 22 22561610 C T -6.207024e-03 False +rs6001482 22 22581369 G A -6.272413e-03 False +rs5757569 22 22584678 A G -2.176470e-03 False +rs736898 22 22711786 T C 
7.779875e-03 False +rs738881 22 22726372 T C 3.496320e-03 False +rs2051490 22 22762771 C T 1.252501e-02 False +rs433766 22 22769923 G A -1.103632e-02 False +rs361959 22 22869742 A C -2.412657e-03 False +rs362168 22 22871922 A G -2.769974e-03 False +rs4462880 22 22929268 T C -7.035723e-03 False +rs456455 22 23001481 A G 7.524178e-03 False +rs11703025 22 23022520 T C 2.175257e-03 False +rs10854762 22 23064982 A C -1.255076e-02 False +rs2856876 22 23249440 A C 2.085816e-02 False +rs58555503 22 23268677 A G 1.337349e-02 False +rs17514179 22 23279456 C G -1.371401e-02 False +rs468884 22 23282286 C T 4.994329e-03 False +rs9623992 22 23325722 C T 8.506657e-04 False +rs3788338 22 23412058 A G -9.545553e-03 False +rs140504 22 23627369 G A -1.900175e-02 False +rs12168342 22 23644425 G A -9.106953e-04 False +rs131693 22 23649242 G T 1.061643e-03 False +rs3827368 22 23794844 G A -1.198736e-02 False +rs11090252 22 23804670 G T -1.119846e-03 False +rs2330498 22 23819697 T G -1.028722e-02 False +rs5759884 22 23873076 T C 9.509027e-03 False +rs179303 22 23892145 T C 1.351280e-02 False +rs131429 22 23925779 C T -4.127647e-03 False +rs6003815 22 23960187 T C -8.475905e-03 False +rs2070446 22 24035970 T C -1.334318e-03 False +rs5759985 22 24086107 G A -1.652957e-02 False +rs73396542 22 24105789 A G 1.813091e-02 False +rs2298375 22 24106448 A G 1.834095e-03 False +rs6003915 22 24186809 C T -1.426541e-02 False +rs4822446 22 24235360 G A 3.168635e-04 False +rs4822455 22 24255296 T C 1.624252e-02 False +rs144128236 22 24300540 T C -3.225760e-03 False +rs144686326 22 24376584 A G -6.223068e-03 False +rs422674 22 24406778 A C 3.046540e-03 False +rs5996675 22 24618331 G A -6.506681e-04 False +rs5751862 22 24802564 A G -6.695797e-03 False +rs6004171 22 24912232 T C -1.536303e-02 False +rs762283 22 24943582 A G -1.687764e-03 False +rs2006092 22 24995668 G A -3.537331e-02 False +rs5760609 22 25123505 C T -1.600990e-02 False +rs5760620 22 25145094 T C -5.584047e-03 False +rs1892723 22 25145453 
T C -1.388536e-03 False +rs5760661 22 25185823 A G -9.228375e-03 False +rs11703103 22 25265972 A G 1.088906e-02 False +rs139766 22 25309448 A G -2.238693e-03 False +rs5752027 22 25363411 A G 4.035775e-03 False +rs34259162 22 25410895 G A 9.720734e-04 False +rs16979472 22 25442369 C T 1.660527e-02 False +rs9612844 22 25454658 C A 1.200285e-02 False +rs6004418 22 25465065 C T 1.320801e-02 False +rs4627697 22 25524916 C T 1.147501e-02 False +rs13055430 22 25603008 T C -1.262741e-02 False +rs7286982 22 25619025 G T -1.212511e-02 False +rs5752084 22 25621591 T C 1.051851e-02 False +rs11703955 22 25643483 T G 1.373474e-02 False +rs9612921 22 25661725 A G -5.936431e-03 False +rs6004519 22 25667883 G A 1.547775e-02 False +rs5996879 22 25668730 A C 2.616493e-02 False +rs67839603 22 25678577 T C 3.040180e-02 False +rs79854676 22 25761309 T C -1.760112e-03 False +rs713847 22 25761936 T C -5.171998e-03 False +rs571663 22 25938977 T C 1.966116e-02 False +rs1008673 22 25994013 A G 6.268228e-04 False +rs718163 22 26081873 T C 5.232603e-02 False +rs10212011 22 26132612 A G -6.457239e-03 False +rs133847 22 26133775 T C -1.181527e-03 False +rs133885 22 26159289 A G -8.399401e-03 False +rs3859870 22 26181767 C T 1.044769e-02 False +rs5761201 22 26190915 G A 4.287533e-03 False +rs5761256 22 26218164 G A -2.803502e-03 False +rs17704912 22 26231312 C G 6.105629e-03 False +rs2269632 22 26237826 C T 4.981479e-03 False +rs5761268 22 26239850 A C 4.144037e-03 False +rs4822668 22 26273893 C G 5.616213e-03 False +rs695809 22 26278128 G T -3.965338e-03 False +rs2157538 22 26280462 T C -8.324497e-04 False +rs6004814 22 26290588 T C -1.307320e-02 False +rs973523 22 26292659 G A 4.294309e-05 False +rs2072006 22 26343593 G A 7.813758e-03 False +rs9306419 22 26369358 T C -4.836650e-03 False +rs2331198 22 26390964 A G -7.849451e-03 False +rs5752254 22 26415475 T C -1.219281e-03 False +rs5752262 22 26456367 G A -1.285326e-02 False +rs56116806 22 26460519 T C -8.695338e-03 False +rs78711257 22 
26528054 A G 1.973023e-02 False +rs5752282 22 26617260 T A -1.384025e-02 False +rs4438594 22 26638906 G T 1.229772e-02 False +rs5761484 22 26735648 A G 7.879673e-04 False +rs5752316 22 26782251 G A 5.096459e-04 False +rs7289238 22 26812632 C T -1.850814e-02 False +rs732933 22 26939781 C T -9.222796e-04 False +rs2267091 22 26960648 A C -5.679255e-03 False +rs5752371 22 27038865 T G -1.487706e-04 False +rs5752372 22 27042828 A G 2.957737e-02 False +rs1476035 22 27161060 A G 2.844558e-03 False +rs56278657 22 27191643 T C 8.953731e-03 False +rs739226 22 27216426 G A 9.120990e-03 False +rs4822804 22 27217018 A G 1.510616e-02 False +rs136511 22 27240025 T G -2.971740e-02 False +rs136516 22 27242642 G A -9.822927e-03 False +rs136535 22 27246070 C T -1.554199e-03 False +rs9306427 22 27252454 C T -6.560251e-03 False +rs5761797 22 27264880 G T -1.323094e-02 False +rs4822824 22 27337886 A G -9.600014e-03 False +rs739257 22 27339284 T C -9.944488e-03 False +rs5761864 22 27353810 T C -2.171555e-03 False +rs5761885 22 27370273 T C -9.798478e-03 False +rs7288253 22 27378884 A G 5.145072e-02 False +rs7287426 22 27398749 C T 1.012263e-03 False +rs9613339 22 27403571 C T -1.745865e-02 False +rs2516086 22 27405012 T C -5.425419e-03 False +rs17343637 22 27415255 C T -1.499362e-02 False +rs60259956 22 27426628 G C 2.289460e-02 False +rs9620654 22 27430724 A G -7.068064e-03 False +rs760526 22 27435577 C T -8.632412e-03 False +rs4822847 22 27487580 G A 3.691502e-03 False +rs5761976 22 27498426 A G -6.801544e-03 False +rs11704703 22 27526095 G A -8.086267e-04 False +rs9625170 22 27563274 C A 1.369650e-02 False +rs9306437 22 27584680 A G -2.139188e-03 False +rs546339 22 27628151 C G 2.130389e-02 False +rs134786 22 27652290 T G 4.815735e-03 False +rs760593 22 27660675 A G 4.899654e-03 False +rs134810 22 27674832 G T 1.248065e-04 False +rs736950 22 27718775 A G 2.292384e-02 False +rs568561 22 27729742 G A 4.951261e-03 False +rs6519705 22 27762155 C T 4.856660e-03 False +rs6005412 22 27781736 
A C -8.336242e-03 False +rs5752545 22 27829565 G A 2.854090e-03 False +rs16984654 22 27832985 G C -1.668955e-02 False +rs4822878 22 27836311 G A -7.756250e-03 False +rs7288006 22 27839704 T C -2.492106e-02 False +rs5762173 22 27864471 A C 2.189950e-03 False +rs10439912 22 27873024 G A 2.721729e-03 False +rs5762194 22 27883265 G A 2.961735e-02 False +rs6005471 22 27890684 A G -8.057355e-03 False +rs761596 22 27927298 T C 2.054268e-02 False +rs5997265 22 27934290 G A 4.751755e-03 False +rs5762249 22 27951176 A G -4.329547e-04 False +rs762064 22 27974819 C A 1.439093e-02 False +rs4822917 22 27975451 G A -3.648208e-02 False +rs6005524 22 28007741 C T -1.635917e-02 False +rs1885362 22 28016883 C A 8.564085e-03 False +rs8135014 22 28046561 T C 1.535905e-02 False +rs9608638 22 28060034 A G 3.097228e-02 False +rs134110 22 28076058 C T 2.848654e-02 False +rs1885364 22 28094845 G A -2.659077e-02 False +rs7291248 22 28130130 C T -1.640387e-02 False +rs2079095 22 28136977 A C -3.962775e-03 False +rs4822935 22 28150109 G A 6.071392e-04 False +rs2283844 22 28150815 A G 1.604724e-02 False +rs2267106 22 28151825 A G -5.390282e-03 False +rs2267113 22 28155404 T C 5.030388e-03 False +rs4822939 22 28172577 G T 5.704168e-03 False +rs12166473 22 28185452 G T -6.896853e-03 False +rs5752639 22 28200176 G A -6.474674e-03 False +rs11705555 22 28206912 C A -6.175542e-03 False +rs5997320 22 28270372 G T -6.768204e-04 False +rs742547 22 28412908 G T 1.763639e-02 False +rs77885044 22 28501414 T C -2.304747e-01 False +rs1884816 22 29106733 C T -1.074749e-02 False +rs132549 22 29318724 T C 1.743333e-03 False +rs17518058 22 29378610 C T 6.690876e-04 False +rs134620 22 29478760 C T -3.029428e-02 False +rs34920087 22 29533572 G C -1.269604e-02 False +rs111625211 22 29626515 A G -1.171130e-02 False +rs3950176 22 29630337 A G 2.658049e-02 False +rs4820803 22 29669648 C G -8.550535e-03 False +rs131190 22 29692497 T G 1.234896e-03 False +rs3804076 22 29837537 C T 1.321112e-02 False +rs467768 22 
29961986 T G 1.878853e-03 False +rs140130 22 30151687 C T 3.418302e-03 False +rs76013375 22 30163526 G A 1.576261e-02 False +rs2412971 22 30494371 A G 7.959801e-03 False +rs713875 22 30592487 G C -1.047403e-01 False +rs76168543 22 30621613 A C -1.382104e-02 False +rs55816744 22 30658082 C T -3.794014e-02 False +rs4823086 22 30688659 T C 2.257140e-02 False +rs740223 22 30762140 A G 2.079806e-02 False +rs757660 22 30793137 A G -4.609306e-03 False +rs9608956 22 30901592 C T -8.334040e-03 False +rs5749118 22 30927975 T C 3.226189e-03 False +rs2267161 22 30953295 T C -7.685790e-03 False +rs4820875 22 30992651 G A -2.565800e-02 False +rs1131603 22 31018975 C T 4.241226e-02 False +rs5997714 22 31032920 G A -2.311985e-02 False +rs34597012 22 31063804 G GT -2.081808e-04 False +rs136382 22 31114086 G T 2.825476e-02 False +rs5753303 22 31139653 A G 2.640129e-06 False +rs136230 22 31214382 G A 1.137657e-02 False +rs57527354 22 31216506 C T 5.531311e-03 False +rs67441859 22 31272930 T C -1.056118e-03 False +rs3747151 22 31333631 C T -1.235089e-02 False +rs4820921 22 31378447 A G 1.020507e-02 False +rs715297 22 31442308 A G -2.479126e-03 False +rs11089487 22 31477361 C G -1.263667e-02 False +rs5753465 22 31514348 G A 5.803240e-03 False +rs2240432 22 31521404 A G 1.097391e-02 False +rs5749244 22 31659495 C T 2.663412e-02 False +rs7289941 22 31884405 C T -3.950834e-04 False +rs41311139 22 32200849 T C 1.585735e-02 False +rs7290696 22 32341684 T C -2.960328e-02 False +rs8139657 22 32559835 G A -2.170436e-02 False +rs7291990 22 32569263 C T -1.296006e-03 False +rs5998321 22 32624139 C T 5.619574e-03 False +rs5753956 22 32702816 A G -1.534023e-02 False +rs201161881 22 32756652 G A 2.512177e-02 False +rs2076050 22 32831540 T C 1.868495e-03 False +rs2076054 22 32832874 T C 6.028815e-05 False +rs9609559 22 32853660 G A 1.382210e-02 False +rs62241183 22 32854391 C A 1.960825e-04 False +rs11107 22 32875190 A G -6.426637e-03 False +rs11341975 22 32934713 C CT -9.057754e-03 False +rs2157189 
22 32952012 A C -3.802480e-03 False +rs2710386 22 32954443 G A 2.210369e-03 False +rs62232741 22 32993032 C T -2.429979e-03 False +rs966964 22 32997766 T C -8.424246e-03 False +rs62234573 22 33045573 T C -3.107145e-02 False +rs762899 22 33046110 G C -6.954732e-02 False +rs80186738 22 33048039 T C 1.138346e-02 False +rs4821083 22 33056341 C T -6.477198e-02 False +rs997120 22 33108536 T C -3.426392e-02 False +rs7286819 22 33108981 T C -7.404035e-02 False +rs743743 22 33116435 T C 6.542471e-02 False +rs2157133 22 33143528 G A 2.195059e-02 False +rs58039541 22 33146363 A G 8.105390e-04 False +rs5749529 22 33259625 C T 2.309793e-02 False +rs137560 22 33336039 T G -2.554387e-02 False +rs9609680 22 33408519 T C -7.556300e-03 False +rs4821137 22 33660345 C G 2.190743e-03 False +rs117531661 22 33804893 C T 6.680774e-03 False +rs5754555 22 33844303 C T 8.923314e-03 False +rs9609802 22 33846914 T C 6.295378e-03 False +rs62225321 22 33898906 A C 1.958759e-05 False +rs86487 22 34022284 A G -2.579330e-03 False +rs239333 22 34137784 G A 4.460828e-03 False +rs5999111 22 34208570 T C -3.365869e-03 False +rs9941961 22 34217757 T C 9.289431e-03 False +rs10854640 22 34256923 A C 1.439384e-02 False +rs79442817 22 34265402 G A -1.636610e-02 False +rs5754747 22 34284173 G A -2.315559e-02 False +rs2157153 22 34296093 C A -4.688326e-03 False +rs12169215 22 34378012 A G 2.276664e-03 False +rs242898 22 34436795 C T 1.337033e-04 False +rs2413215 22 34488452 A G -4.288310e-04 False +rs135198 22 34501541 A G 2.763614e-03 False +rs243001 22 34514810 C A 3.976601e-03 False +rs130668 22 34526428 C T 1.088864e-02 False +rs5999246 22 34583078 A G 1.802495e-03 False +rs753600 22 34620754 T C 1.466546e-02 False +rs2097307 22 34691035 A G -2.082615e-04 False +rs411451 22 34758540 T C 5.165532e-03 False +rs2609850 22 34851377 A C 1.371180e-02 False +rs737821 22 35371707 T C -4.985554e-04 False +rs35433006 22 35382268 A C -4.931336e-03 False +rs7292124 22 35419122 C T -1.077953e-02 False +rs8140287 22 
35478529 A G 1.760523e-04 False +rs61735502 22 35481493 T C 1.056439e-02 False +rs80730 22 35526281 G A -2.766891e-03 False +rs61134707 22 35603836 A G -1.783939e-04 False +rs1053593 22 35660875 T G 3.988231e-02 False +rs6518950 22 35745196 G T 1.750545e-04 False +rs17793276 22 35750980 A G -7.651136e-03 False +rs2071749 22 35783413 G A 1.649791e-03 False +rs35806646 22 35918270 C T 6.918713e-03 False +rs5750115 22 35959242 A G 1.697538e-02 False +rs738368 22 35962060 G A 5.181476e-03 False +rs926338 22 35964158 G C 2.769931e-03 False +rs5995124 22 35984385 A G -1.280623e-02 False +rs4820205 22 36001258 C T 1.342405e-02 False +rs4327313 22 36072262 T C 4.895490e-03 False +rs6000004 22 36180535 G A -3.250252e-02 False +rs78188544 22 36517307 C T 1.366076e-02 False +rs6000142 22 36519596 A C -3.499560e-03 False +rs9610403 22 36532058 A G -1.214487e-02 False +rs3788518 22 36543489 C G 7.838149e-03 False +rs6000190 22 36600841 G A 2.644389e-02 False +rs2010659 22 36629633 C A -6.871468e-03 False +rs136145 22 36635967 G A -2.634742e-02 False +rs78188930 22 36655735 A G -5.385142e-03 False +rs136176 22 36661646 A G -1.560741e-02 False +rs2269529 22 36684354 C T -5.170111e-03 False +rs75138027 22 36705622 A G 1.713234e-02 False +rs3842715 22 36708049 C CTCCTGTGA -5.187051e-02 False +rs11089788 22 36751101 A C -2.440650e-02 False +rs16996704 22 36764788 G A 2.784116e-02 False +rs5756223 22 36897427 C T 2.603792e-02 False +rs760718 22 36900806 G A 7.366207e-03 False +rs6000293 22 36923144 T C -1.875563e-03 False +rs5995298 22 36924714 G A -3.632594e-03 False +rs140020 22 36946643 T G 1.333137e-02 False +rs4821501 22 36954939 T C 1.105894e-02 False +rs5756255 22 36998907 T C -6.084687e-04 False +rs9622429 22 37001495 G T -1.224147e-02 False +rs2267348 22 37013167 G C 1.866849e-02 False +rs6000386 22 37077364 C T 7.294257e-03 False +rs738514 22 37080738 C G -4.873355e-03 False +rs2746971 22 37101890 C T 3.991764e-02 False +rs933229 22 37118535 A G -1.713909e-03 False 
+rs62230508 22 37184521 G A 6.515894e-03 False +rs4820254 22 37206341 G T 2.566936e-04 False +rs11089806 22 37256262 A G 1.152626e-03 False +rs4821544 22 37258503 C T -9.761102e-03 False +rs909486 22 37323988 T C -7.318200e-03 False +rs1534882 22 37329545 G A 5.775806e-03 False +rs131843 22 37337409 T C -2.534399e-02 False +rs2093380 22 37343000 A C -4.011777e-04 False +rs743749 22 37398195 T C -1.001198e-02 False +rs2413447 22 37401532 A G -3.244795e-03 False +rs11554714 22 37407109 C G 4.335972e-02 False +rs2543523 22 37477732 T C 3.669548e-04 False +rs1861947 22 37507019 A G -9.259451e-04 False +rs28450477 22 37513316 A G 1.153887e-03 False +rs3218297 22 37532441 A G 1.802306e-02 False +rs2543529 22 37571497 G A -5.785311e-03 False +rs7290488 22 37581383 T C 3.172492e-02 False +rs9798725 22 37621269 C A 4.460405e-03 False +rs10212068 22 37644621 T C -8.386907e-03 False +rs730422 22 37671896 A G 2.303688e-02 False +rs1041895 22 37679763 G A -2.658396e-03 False +rs1008184 22 37720268 G A 2.120184e-02 False +rs2069221 22 37753256 C T 8.984539e-03 False +rs4821645 22 37757099 G A -1.560347e-02 False +rs9610727 22 37780522 C G -1.496708e-02 False +rs9607459 22 37800175 T C -5.510833e-03 False +rs6000739 22 37846448 G A 1.152963e-02 False +rs742152 22 37896749 C T 5.447068e-03 False +rs6000756 22 37908435 C T 1.909131e-03 False +rs12167061 22 37977481 T C 1.465308e-02 False +rs75937893 22 37992699 G A 8.339179e-04 False +rs36120988 22 38032762 G GA 1.693041e-02 False +rs9622677 22 38054262 C A 4.354146e-02 False +rs4820295 22 38083101 C T -2.092117e-02 False +rs12628603 22 38119213 A G 3.948165e-02 False +rs5756795 22 38122122 C T 4.377277e-02 False +rs79849571 22 38204089 T C 2.977743e-02 False +rs117267625 22 38435786 T G -7.684278e-03 False +rs2284063 22 38544298 G A 5.090446e-02 False +rs4608623 22 38597378 T G -1.997927e-02 False +rs4444637 22 38606780 G A -9.182016e-03 False +rs5995554 22 38630272 C T 7.393137e-03 False +rs135720 22 38663819 G A -6.392021e-03 
False +rs135730 22 38673234 A G -1.106705e-02 False +rs35336050 22 38685131 C T -4.493352e-03 False +rs5750581 22 38695406 T C -1.155972e-02 False +rs56182369 22 38708506 A G 1.701713e-02 False +rs5757057 22 38744184 C T -2.112956e-02 False +rs743942 22 38819613 A G -5.625806e-03 False +rs12004 22 38877461 G T 1.108728e-03 False +rs5750616 22 38918894 G T -8.094286e-03 False +rs112010490 22 38928269 G T -2.114917e-02 False +rs35069730 22 39027286 C CAG 3.840735e-03 False +rs3747172 22 39067524 G A 1.200232e-02 False +rs5757275 22 39159201 C T 3.096214e-03 False +rs760482 22 39178701 G A 2.148449e-03 False +rs735306 22 39260032 T C 3.574634e-02 False +rs760481 22 39268785 T G 9.377414e-03 False +rs5750691 22 39281774 G T 3.816951e-02 False +rs5757355 22 39300265 C T 3.540156e-02 False +rs1014971 22 39332623 T C -4.449842e-03 False +rs5757424 22 39415780 G A 1.479946e-02 False +rs35860424 22 39448465 A G 3.065974e-03 False +rs2011869 22 39480697 G A -4.005617e-02 False +rs139272 22 39487665 G A -1.218988e-04 False +rs55989856 22 39493294 C T -3.115929e-02 False +rs738469 22 39510995 G A -2.069106e-02 False +rs877529 22 39542292 A G 9.653575e-03 False +rs73884827 22 39543000 T C -4.069841e-03 False +rs7287160 22 39573724 A C 2.683694e-02 False +rs5750761 22 39575692 A C 1.451305e-02 False +rs738470 22 39581277 A C 1.766406e-02 False +rs13053714 22 39626572 A G -2.901981e-02 False +rs5757580 22 39658626 C T 4.177065e-03 False +rs1569497 22 39665395 G A 1.264611e-02 False +rs54211 22 39687484 G A 5.418141e-03 False +rs6519183 22 39708279 A G -4.281532e-02 False +rs5757611 22 39708357 T C 8.605574e-03 False +rs5750811 22 39793066 G T 3.658209e-02 False +rs34026806 22 39798127 G A 2.302129e-03 False +rs5757678 22 39843409 T C 1.065699e-02 False +rs6001601 22 39865475 G A 1.588501e-03 False +rs5757703 22 39932516 A G -1.179841e-02 False +rs62228477 22 39963426 G A -1.503908e-02 False +rs11704409 22 40023636 C T 6.443146e-03 False +rs136829 22 40046176 C T -7.416552e-04 
False +rs5757764 22 40067818 T C 4.559360e-03 False +rs5757777 22 40092864 G A 2.400297e-02 False +rs5757783 22 40127293 T C -8.870038e-04 False +rs7285609 22 40358148 T C -1.079902e-02 False +rs8139715 22 40420786 G C -8.092115e-03 False +rs7291691 22 40454069 G T 7.898880e-03 False +rs732384 22 40541981 G A 1.742640e-02 False +rs12484776 22 40652873 G A 5.853057e-03 False +rs28360630 22 40676672 G T -1.894274e-03 False +rs470113 22 40729614 G A 1.959940e-02 False +rs5757949 22 40820151 C T -1.628066e-02 False +rs35898643 22 40986372 G C -1.983507e-02 False +rs12165625 22 41494925 A G -2.918069e-02 False +rs11703267 22 41646738 G A 3.521847e-04 False +rs8139705 22 41680898 T C 1.402732e-02 False +rs34011394 22 41704872 T C 6.681484e-05 False +rs2073167 22 41791536 C T -5.572333e-05 False +rs2076196 22 41895409 A G -4.407217e-02 False +rs2076198 22 41929175 G T -3.186844e-02 False +rs739134 22 42089623 C T 5.322340e-03 False +rs147348682 22 42095658 G T 3.846131e-02 False +rs139568 22 42210985 C T -3.139710e-03 False +rs13055841 22 42279653 G A -6.596336e-03 False +rs7293091 22 42341308 G A -6.862491e-04 False +rs35742686 22 42524243 C CT -1.181191e-02 False +rs762995 22 42672124 G A -5.278171e-03 False +rs1548304 22 42691238 T C -1.642396e-02 False +rs8139063 22 42813753 C T -3.867750e-03 False +rs5758742 22 42867898 G A -1.352327e-03 False +rs11553441 22 42912097 T C -7.295657e-04 False +rs4822160 22 42932317 A G -5.768556e-02 False +rs28627172 22 43010817 A G 1.722077e-02 False +rs130370 22 43080028 T C -5.527551e-04 False +rs6002910 22 43096507 T C -5.556102e-03 False +rs738526 22 43112475 T C -1.350273e-02 False +rs8138149 22 43114824 G A -1.963192e-02 False +rs5758896 22 43115576 C T -1.880097e-02 False +rs9623692 22 43154299 G A -1.621113e-03 False +rs9611885 22 43159948 T C -7.980584e-03 False +rs1018448 22 43206950 C A -5.783037e-03 False +rs9607957 22 43218397 C T -3.976636e-03 False +rs2267463 22 43283255 C A -1.426668e-02 False +rs4822220 22 43290583 C 
T -3.955775e-02 False +rs8140884 22 43333156 A G -3.127845e-02 False +rs6003002 22 43426262 G A -3.668040e-03 False +rs8141749 22 43483242 T C -2.540203e-02 False +rs4988388 22 43515108 C T -1.570749e-02 False +rs13815 22 43529314 C G 1.738127e-02 False +rs5759199 22 43551513 G A 2.565386e-02 False +rs6972 22 43558972 A G -1.962819e-02 False +rs4822262 22 43577214 T C -2.270478e-02 False +rs13058467 22 43579049 C T -1.193909e-03 False +rs138993 22 43610207 G A -7.621661e-03 False +rs129415 22 43623395 G C -4.852519e-02 False +rs11703272 22 43640512 C T -5.533207e-03 False +rs139027 22 43649701 C T 7.724845e-02 False +rs5751462 22 43661080 T C -4.251741e-02 False +rs739306 22 43683088 A G -3.582388e-03 False +rs4820518 22 43707996 A G -2.547044e-02 False +rs6519367 22 43711080 C G -5.784446e-03 False +rs6003156 22 43721519 C A 3.658850e-04 False +rs1894717 22 43729401 C T 8.557013e-03 False +rs4820525 22 43763757 T G -1.789810e-02 False +rs28673361 22 43836198 G T 2.427697e-03 False +rs9614382 22 43976396 A G -1.277457e-02 False +rs137731 22 44031042 C T 3.593107e-03 False +rs9614187 22 44193626 C A -6.865434e-03 False +rs138057 22 44221247 G A 1.833991e-02 False +rs4823156 22 44296372 T C 6.169212e-03 False +rs6006453 22 44298838 A G 7.441756e-03 False +rs2294918 22 44342116 G A 2.810328e-02 False +rs3761472 22 44368122 G A 1.299680e-02 False +rs8418 22 44379838 G A 1.648422e-03 False +rs6006598 22 44380033 C T -2.136788e-03 False +rs1007863 22 44395451 C T -6.698507e-03 False +rs7285340 22 44419871 C T 1.816130e-02 False +rs6006622 22 44424108 T C 1.036733e-02 False +rs130313 22 44467899 C T -2.592364e-03 False +rs9614325 22 44498134 T C 7.281423e-03 False +rs1535009 22 44522312 C T -2.636447e-04 False +rs4823194 22 44526130 G A -3.882980e-03 False +rs2267613 22 44530286 A G 2.528159e-02 False +rs2267614 22 44530420 C T -1.233654e-02 False +rs10483222 22 44548944 G A -3.947209e-03 False +rs77120395 22 44551755 G A 1.262458e-02 False +rs9614359 22 44566434 A G 
-4.290306e-03 False +rs139131 22 44581046 T C -1.479950e-02 False +rs9626137 22 44643161 C T 1.439493e-02 False +rs135400 22 44677081 C T -1.030513e-02 False +rs135388 22 44681612 G A -1.269762e-03 False +rs3935378 22 44695088 T C 6.324859e-03 False +rs6519840 22 44707716 G T 2.288939e-03 False +rs62228577 22 44725343 G A 3.534678e-03 False +rs6519897 22 44738406 G A 2.320049e-02 False +rs7289501 22 44746729 A G -1.754216e-02 False +rs5764718 22 44751158 G A -6.539695e-03 False +rs9614538 22 44757439 A G 2.480295e-02 False +rs9614823 22 44759519 G A 2.111274e-03 False +rs5765809 22 44761797 A T -5.311720e-03 False +rs5764921 22 44763352 C G 1.452737e-02 False +rs19985 22 44783779 G A 9.142699e-03 False +rs2071820 22 44791807 C T -2.371876e-02 False +rs2746583 22 44818986 C T -6.740622e-03 False +rs5765690 22 44894913 G A -5.179871e-05 False +rs4508 22 45058431 C T 1.098259e-02 False +rs6006845 22 45066035 A G -1.484374e-02 False +rs9614870 22 45069410 T C 1.530441e-02 False +rs41515447 22 45081330 G A 1.350120e-03 False +rs28460735 22 45082168 C A 3.663354e-03 False +rs4823364 22 45090008 G A 2.811861e-03 False +rs6006857 22 45116664 C T 1.247728e-02 False +rs2269543 22 45244930 T C -1.450041e-02 False +rs8881 22 45258457 G A -3.500519e-03 False +rs9614987 22 45323989 T C 1.111338e-03 False +rs140556 22 45415987 A G -1.398184e-02 False +rs132067 22 45451355 G A -5.566982e-03 False +rs5765155 22 45471607 C T 1.148978e-02 False +rs5765167 22 45497738 C T -5.029327e-03 False +rs7292035 22 45502829 C T -3.893521e-02 False +rs2018928 22 45519040 T G 2.377071e-03 False +rs6006941 22 45523391 A G 1.318997e-02 False +rs17548742 22 45573450 C A 4.385600e-03 False +rs1125398 22 45589490 G A -8.350439e-03 False +rs58667 22 45668012 T C 1.286879e-02 False +rs5765242 22 45671343 G A -2.940682e-06 False +rs2742648 22 45672574 T C 5.743608e-03 False +rs5765250 22 45693923 A G -2.675069e-03 False +rs7290139 22 45718743 G A -2.092804e-02 False +rs11556482 22 45723807 C G 
1.670159e-03 False +rs6007594 22 45728370 A G 1.879231e-04 False +rs56343022 22 45741537 G T 1.420045e-02 False +rs5764698 22 45749983 T G -4.591012e-02 False +rs2272804 22 45809624 A C 2.185772e-03 False +rs2142662 22 45821935 A G 2.250782e-02 False +rs6007041 22 45837410 G A -2.756449e-03 False +rs11090631 22 45846371 T C 7.910102e-02 False +rs713975 22 45864934 T C 8.535181e-03 False +rs10483228 22 45871507 G C -7.764056e-03 False +rs5765426 22 45892656 G T -3.885653e-03 False +rs3810631 22 45897997 C T 3.935204e-04 False +rs105199 22 45929577 C T -2.532217e-02 False +rs136755 22 45936350 A G -8.001698e-03 False +rs5765463 22 45942726 T G -1.415551e-02 False +rs13268 22 45996298 G A 5.643525e-02 False +rs17564843 22 46009063 G A 6.464843e-03 False +rs5765546 22 46022070 G A 2.246740e-02 False +rs2239398 22 46155548 G C -3.247470e-02 False +rs136018 22 46207955 C T -1.354554e-03 False +rs136029 22 46236425 A G 8.398423e-02 False +rs57514815 22 46275529 T C 2.264300e-03 False +rs75427302 22 46287720 A G -2.237482e-02 False +rs28473346 22 46289699 T C 1.872124e-02 False +rs9697736 22 46303347 T C -1.283734e-02 False +rs28663466 22 46316057 A G 2.312579e-02 False +rs9286453 22 46337043 G C 1.701173e-02 False +rs75862558 22 46347519 C T 1.574289e-02 False +rs9330813 22 46364161 A G -4.466341e-02 False +rs62228062 22 46381234 G A 4.730559e-02 False +rs28628653 22 46396925 G A 1.783944e-03 False +rs28698504 22 46403715 A G -2.132589e-02 False +rs78358349 22 46406782 A C 8.439466e-02 False +rs9627368 22 46445002 G C -7.613496e-02 False +rs7292297 22 46458123 G T 3.328073e-02 False +rs9626891 22 46482948 C T 4.241879e-02 False +rs12160757 22 46486508 C T -9.684390e-03 False +rs3747243 22 46493852 T C -6.758580e-03 False +rs9616125 22 46499120 C G -9.873118e-03 False +rs12170325 22 46502870 T C -1.792140e-02 False +rs76755807 22 46561713 G A 2.604703e-02 False +rs4253701 22 46586110 A G -1.256735e-03 False +rs59842914 22 46592168 C T 1.417055e-02 False +rs1800206 22 
46614274 G C -5.854014e-02 False +rs4253772 22 46627603 T C 8.004024e-02 False +rs35364389 22 46760086 T C 3.229515e-03 False +rs34267201 22 46782382 T C -2.470821e-02 False +rs9627450 22 46807234 C T 2.324176e-03 False +rs9306514 22 46837114 G A 9.440730e-04 False +rs5768830 22 46888399 T C 9.911095e-03 False +rs9615374 22 46907779 G A 6.531440e-03 False +rs4823838 22 46909355 T G -4.780494e-03 False +rs12484501 22 46914277 A C 9.689535e-03 False +rs3810636 22 46943687 G A -1.303660e-02 False +rs9627514 22 46985917 A G 1.893397e-02 False +rs9615396 22 47021226 G A -1.322949e-02 False +rs13057352 22 47095235 A C -1.156013e-01 False +rs13054785 22 47109621 C T 4.322858e-04 False +rs34301321 22 47125474 G A -1.746025e-02 False +rs17221476 22 47147117 T C -2.418349e-02 False +rs5769136 22 47156703 C T 2.628970e-02 False +rs6008990 22 47245836 A G 1.880575e-03 False +rs140535 22 47271747 C T 1.055264e-03 False +rs5767397 22 47301822 C T 3.032158e-03 False +rs9616173 22 47345487 T C -2.945945e-03 False +rs470059 22 47372368 T C 2.067644e-02 False +rs136120 22 47380606 C T 4.041426e-02 False +rs5769300 22 47437808 C T 1.683027e-03 False +rs131924 22 47450911 A G 1.624479e-02 False +rs910541 22 47511864 A C -4.226735e-03 False +rs2295246 22 47519476 T C -3.954111e-03 False +rs13055207 22 47529458 A G -3.602848e-04 False +rs4823597 22 47531320 T C -6.899703e-03 False +rs738669 22 47548321 T C 4.925401e-03 False +rs2337244 22 47568291 C T 7.726693e-03 False +rs15646 22 47571203 A G -9.744751e-03 False +rs135368 22 47574009 C T -5.327010e-03 False +rs136618 22 47642100 T C 6.976251e-03 False +rs136636 22 47657635 T C 1.798943e-03 False +rs6008118 22 47683805 C T -3.475544e-02 False +rs36008375 22 47720973 T C -7.868172e-03 False +rs17763944 22 47821952 G A -8.854280e-04 False +rs2301382 22 47893053 A G -2.449056e-02 False +rs5767784 22 47935365 C T -1.599879e-03 False +rs2285093 22 47961708 G T -3.593525e-03 False +rs131114 22 47986332 T C -3.976592e-03 False +rs9615626 22 
48154645 C T 7.608639e-03 False +rs5845816 22 48165452 C CT 2.039503e-03 False +rs16994709 22 48207318 T C -9.725168e-03 False +rs4823698 22 48213904 G C -1.220367e-02 False +rs9615649 22 48215904 A G -2.488244e-05 False +rs738739 22 48220460 T C -2.702163e-03 False +rs738743 22 48230941 C A -1.129522e-03 False +rs4823717 22 48271961 A G -5.053446e-03 False +rs2338258 22 48284025 T C -3.344182e-03 False +rs5768135 22 48297953 C T -1.046958e-02 False +rs1028528 22 48362290 G A -2.367254e-02 False +rs28537386 22 48362914 C A -3.167719e-03 False +rs5768244 22 48387670 A G -8.243989e-03 False +rs7289071 22 48415446 C T 2.130715e-03 False +rs135271 22 48460730 T C 2.682476e-03 False +rs5768344 22 48491160 T C 1.257794e-03 False +rs4823512 22 48519794 C T 3.680757e-03 False +rs6007807 22 48537775 G A 2.134692e-03 False +rs106953 22 48543566 T C 7.314089e-03 False +rs133534 22 48593037 C T 9.084708e-03 False +rs34776844 22 48687509 C T -2.771960e-02 False +rs5768510 22 48692033 T C -2.126264e-02 False +rs62223851 22 48699617 T C 5.093107e-04 False +rs34080684 22 48717568 T C -8.190281e-04 False +rs1475987 22 48811946 C T 7.916515e-03 False +rs7293013 22 48823357 G A 1.464317e-02 False +rs2071750 22 48840428 A C 3.711229e-03 False +rs9615896 22 48851612 T C -5.887765e-03 False +rs13056230 22 48874310 T C -1.106607e-02 False +rs761793 22 48968070 C T 1.280691e-02 False +rs28658383 22 48991385 T C -1.234119e-02 False +rs34694572 22 49004050 G A 2.290755e-02 False +rs28406241 22 49014565 A G 1.555565e-03 False +rs7288241 22 49086481 T C -6.196369e-03 False +rs4989008 22 49107173 T C 1.277272e-02 False +rs131032 22 49180915 A G 6.346977e-03 False +rs4076042 22 49262579 A G 2.657134e-02 False +rs28726380 22 49270317 C T 1.447665e-03 False +rs2024695 22 49313196 A G -7.055532e-03 False +rs1467436 22 49335230 T C -6.548281e-03 False +rs4824067 22 49366123 T C 1.136486e-02 False +rs738596 22 49372356 G C -2.420841e-02 False +rs17178683 22 49443666 T C 1.581736e-02 False 
+rs55898343 22 49496835 G A -1.355414e-02 False +rs1981477 22 49524428 A G -4.228482e-03 False +rs135257 22 49530553 G C 8.197389e-03 False +rs9627875 22 49537845 T C 1.112550e-02 False +rs5769975 22 49557457 G A 9.401926e-03 False +rs5769981 22 49562666 C A 1.271701e-02 False +rs2318943 22 49574509 C T 4.703177e-04 False +rs5769446 22 49579141 A G 2.448619e-02 False +rs7288983 22 49650863 T C 6.739571e-03 False +rs5770154 22 49662549 T G -5.769464e-03 False +rs1880009 22 49665841 T C -7.037069e-04 False +rs62220604 22 49677464 A G -2.177735e-02 False +rs6009594 22 49696067 C T -3.309682e-03 False +rs5770223 22 49700272 T G -2.541948e-03 False +rs1124544 22 49706433 T C -1.719402e-02 False +rs73173197 22 49713835 G A -1.370754e-02 False +rs848761 22 49719264 A C -1.067852e-02 False +rs848721 22 49743627 G A -5.970581e-04 False +rs9628005 22 49800265 C T 3.098582e-02 False +rs6009666 22 49806863 A G 3.940447e-03 False +rs136795 22 49830851 C T -2.742706e-03 False +rs11705513 22 49834624 G A -2.820163e-03 False +rs6009703 22 49843235 G C -4.458281e-04 False +rs9616311 22 49847501 T G 2.235016e-03 False +rs4823938 22 49861033 C T 1.721243e-02 False +rs5770489 22 49881321 A G -5.127800e-04 False +rs9628037 22 49908804 G A -9.455892e-03 False +rs134474 22 49911222 G T -1.389666e-02 False +rs17779492 22 49925268 A G 1.679984e-02 False +rs134447 22 49927332 T C 3.929800e-04 False +rs111392589 22 50109212 T C 1.610819e-02 False +rs6009846 22 50118149 G C 7.024666e-03 False +rs138844 22 50184484 G T 1.222581e-02 False +rs117613664 22 50219447 T C 5.091891e-02 False +rs910799 22 50278568 G A -2.340672e-02 False +rs78676969 22 50319170 G A 1.669806e-02 False +rs28372448 22 50350971 A G 2.640160e-02 False +rs4077129 22 50356693 C T 3.851499e-03 False +rs5771069 22 50435480 G A 1.663630e-02 False +rs9617098 22 50439626 A G -2.722154e-03 False +rs137890 22 50466542 C T -2.560094e-03 False +rs11101958 22 50470516 T C -1.621986e-02 False +rs5771133 22 50491150 G A 1.828674e-02 
False +rs6010164 22 50515270 C T 1.439904e-02 False +rs56144269 22 50529850 C T 2.054628e-02 False +rs75570992 22 50570755 C G 7.077514e-03 False +rs2272837 22 50582626 G A -3.588854e-03 False +rs17836662 22 50672154 A G 7.660848e-03 False +rs11547731 22 50722134 C T -1.747164e-02 False +rs79966207 22 50722408 C T -1.063465e-03 False +rs28379706 22 50728062 C T 2.159223e-02 False +rs11553142 22 50750481 T C 1.877272e-02 False +rs62241237 22 50758873 T C 4.001731e-03 False +rs9628184 22 50835040 A G -6.374259e-03 False +rs9616997 22 50859049 C T 3.480749e-04 False +rs1053744 22 50885775 G A -1.358311e-02 False +rs2232883 22 50926768 T C 1.798498e-03 False +rs2232885 22 50928026 A G 4.775504e-03 False +rs140522 22 50971266 C T 2.160893e-02 False +rs41281529 22 50989197 T C -1.328884e-02 False +rs131778 22 50989326 G A 1.037054e-02 False +rs5770892 22 50999681 G A -1.226224e-02 False +rs35826039 22 51046163 T C -2.754002e-02 False +rs9616915 22 51117580 C T 3.573542e-02 False +rs2301584 22 51171497 A G -1.951606e-02 False +rs73174435 22 51174939 T C -6.178519e-03 False diff --git a/tests/data/combine/scorefile.txt b/tests/data/combine/scorefile.txt new file mode 100644 index 0000000..1043a68 --- /dev/null +++ b/tests/data/combine/scorefile.txt @@ -0,0 +1,838 @@ +#pgs_name=PGS001229_22 +#genome_build=GRCh37 +chr_name chr_position effect_allele other_allele effect_weight +22 17080378 G A 0.01045457 +22 17300230 A G 0.0001411475 +22 17318864 A C 0.008166266 +22 17327595 T C 0.007791641 +22 17409813 A G 0.0003108784 +22 17450952 G A -0.03033983 +22 17492533 G A 0.00388999 +22 17542810 C T 0.00803629 +22 17565013 G A 0.02135621 +22 17589209 T C 0.003026491 +22 17600977 A G 0.01581277 +22 17625915 A G -0.1172964 +22 17630486 A C 0.01012909 +22 17633785 C T 0.0023255 +22 17643689 A G 0.003361814 +22 17669306 C T 0.0214506 +22 17677699 T C -0.0007031384 +22 17680519 C A 0.001079236 +22 17701234 G A 0.004477145 +22 17703119 A T 0.0007771872 +22 17718699 C A -0.01320632 +22 
17721595 C T 0.009480363 +22 17727648 T C 0.007811685 +22 17738177 G A -0.004719812 +22 17749096 A G -0.005244795 +22 17770181 G T -0.03101703 +22 17793969 G A 0.01774444 +22 17815696 G C -0.00551609 +22 17827684 G A -0.005944752 +22 17831813 T C 0.01061587 +22 17844929 T G 0.001717643 +22 17850661 T C -0.02805489 +22 17887534 A G 0.0007723542 +22 17887725 A G 0.007472703 +22 17958221 C A -0.02098647 +22 18036253 G A -0.01772981 +22 18038786 A G -0.002119071 +22 18262301 A T -0.005065485 +22 18289204 A G 0.005306345 +22 18295575 C T 0.02300129 +22 18296238 G A -0.005665446 +22 18319179 T C 0.03440642 +22 18393534 A C 0.01764269 +22 18439958 T C -0.002261707 +22 18483388 G A 0.03318724 +22 18488883 C G -0.0191918 +22 18489048 C A 0.01233198 +22 18495470 A G -0.005804926 +22 18537145 G A -0.004930116 +22 18571008 A G -8.844726E-05 +22 18584433 C T -0.001169893 +22 18631365 T C -0.001551714 +22 18650682 T C -0.01313784 +22 18890037 A G 0.05968921 +22 18891398 G A 0.006891943 +22 18892575 A G -0.00224447 +22 18915963 A G 0.003719756 +22 18959581 T C 0.006464581 +22 18963340 A G -0.01397565 +22 18970915 T C -0.001507131 +22 19024651 T C -0.00350575 +22 19121872 A G 0.01644046 +22 19135603 A G -0.02970077 +22 19190143 T C 0.003268027 +22 19263698 T C 0.02057255 +22 19292446 G T 0.01153989 +22 19371052 T C 0.01055134 +22 19420109 C T -0.008628228 +22 19451186 A C 0.02141029 +22 19518079 C T 0.005372247 +22 19581331 T C 0.01686942 +22 19593854 C A 0.0006544249 +22 19606703 G A 0.02070121 +22 19649005 A G 0.002868601 +22 19735854 C T 0.006262962 +22 19738355 T C 4.97384E-05 +22 19770886 A G -0.01013929 +22 19781823 T C 0.02481609 +22 19873357 T C 0.0116302 +22 19907099 A C -0.0267645 +22 19968597 T C -0.02203945 +22 20046344 G A -0.009801428 +22 20084821 C T -0.02232886 +22 20185457 A G 0.006892171 +22 20189077 T C 0.01738215 +22 20219648 A G 0.009307625 +22 20248391 A G -0.005405845 +22 20267213 A G 0.006713242 +22 20286099 G T 0.01574758 +22 20749042 G A 0.006603339 +22 
20754039 A G -0.01181141 +22 20775167 T C 0.01160113 +22 20780296 A G 0.06735311 +22 20789074 C T 0.02844307 +22 20791438 A C 0.0473474 +22 20793914 C T 0.007009781 +22 20839810 T G 0.003947346 +22 20860931 T C 0.0005613511 +22 20979980 G A 0.003231665 +22 20991771 G A 0.004226765 +22 21075537 C A -0.002096453 +22 21154393 G T -0.004297086 +22 21323357 C T -0.006041745 +22 21331918 G C -0.002280912 +22 21334924 C G -0.02031369 +22 21356824 A G 0.01476577 +22 21386019 A G 0.01435557 +22 21449028 G A -0.01537701 +22 21463515 A G -0.01335614 +22 21982892 T C -0.06373335 +22 22001704 T G 0.02809584 +22 22062480 T C 0.0529113 +22 22080735 G A -0.0139426 +22 22151939 C A -0.008287849 +22 22163425 G A 0.05518983 +22 22307519 C G -0.003486191 +22 22351283 G A -0.0007483763 +22 22394291 AG A 0.004320583 +22 22395754 T C 0.002587971 +22 22424302 A C 0.0011408 +22 22473905 C A 0.01226009 +22 22550450 G C 0.01773244 +22 22561610 C T -0.006207024 +22 22581369 G A -0.006272413 +22 22584678 A G -0.00217647 +22 22711786 T C 0.007779875 +22 22726372 T C 0.00349632 +22 22762771 C T 0.01252501 +22 22769923 G A -0.01103632 +22 22869742 A C -0.002412657 +22 22871922 A G -0.002769974 +22 22929268 T C -0.007035723 +22 23001481 A G 0.007524178 +22 23022520 T C 0.002175257 +22 23064982 A C -0.01255076 +22 23249440 A C 0.02085816 +22 23268677 A G 0.01337349 +22 23279456 C G -0.01371401 +22 23282286 C T 0.004994329 +22 23325722 C T 0.0008506657 +22 23412058 A G -0.009545553 +22 23627369 G A -0.01900175 +22 23644425 G A -0.0009106953 +22 23649242 G T 0.001061643 +22 23794844 G A -0.01198736 +22 23804670 G T -0.001119846 +22 23819697 T G -0.01028722 +22 23873076 T C 0.009509027 +22 23892145 T C 0.0135128 +22 23925779 C T -0.004127647 +22 23960187 T C -0.008475905 +22 24035970 T C -0.001334318 +22 24086107 G A -0.01652957 +22 24105789 A G 0.01813091 +22 24106448 A G 0.001834095 +22 24186809 C T -0.01426541 +22 24235360 G A 0.0003168635 +22 24255296 T C 0.01624252 +22 24300540 T C -0.00322576 
+22 24376584 A G -0.006223068 +22 24406778 A C 0.00304654 +22 24618331 G A -0.0006506681 +22 24802564 A G -0.006695797 +22 24912232 T C -0.01536303 +22 24943582 A G -0.001687764 +22 24995668 G A -0.03537331 +22 25123505 C T -0.0160099 +22 25145094 T C -0.005584047 +22 25145453 T C -0.001388536 +22 25185823 A G -0.009228375 +22 25265972 A G 0.01088906 +22 25309448 A G -0.002238693 +22 25363411 A G 0.004035775 +22 25410895 G A 0.0009720734 +22 25442369 C T 0.01660527 +22 25454658 C A 0.01200285 +22 25465065 C T 0.01320801 +22 25524916 C T 0.01147501 +22 25603008 T C -0.01262741 +22 25619025 G T -0.01212511 +22 25621591 T C 0.01051851 +22 25643483 T G 0.01373474 +22 25661725 A G -0.005936431 +22 25667883 G A 0.01547775 +22 25668730 A C 0.02616493 +22 25678577 T C 0.0304018 +22 25761309 T C -0.001760112 +22 25761936 T C -0.005171998 +22 25938977 T C 0.01966116 +22 25994013 A G 0.0006268228 +22 26081873 T C 0.05232603 +22 26132612 A G -0.006457239 +22 26133775 T C -0.001181527 +22 26159289 A G -0.008399401 +22 26181767 C T 0.01044769 +22 26190915 G A 0.004287533 +22 26218164 G A -0.002803502 +22 26231312 C G 0.006105629 +22 26237826 C T 0.004981479 +22 26239850 A C 0.004144037 +22 26273893 C G 0.005616213 +22 26278128 G T -0.003965338 +22 26280462 T C -0.0008324497 +22 26290588 T C -0.0130732 +22 26292659 G A 4.294309E-05 +22 26343593 G A 0.007813758 +22 26369358 T C -0.00483665 +22 26390964 A G -0.007849451 +22 26415475 T C -0.001219281 +22 26456367 G A -0.01285326 +22 26460519 T C -0.008695338 +22 26528054 A G 0.01973023 +22 26617260 T A -0.01384025 +22 26638906 G T 0.01229772 +22 26735648 A G 0.0007879673 +22 26782251 G A 0.0005096459 +22 26812632 C T -0.01850814 +22 26939781 C T -0.0009222796 +22 26960648 A C -0.005679255 +22 27038865 T G -0.0001487706 +22 27042828 A G 0.02957737 +22 27161060 A G 0.002844558 +22 27191643 T C 0.008953731 +22 27216426 G A 0.00912099 +22 27217018 A G 0.01510616 +22 27240025 T G -0.0297174 +22 27242642 G A -0.009822927 +22 27246070 C T 
-0.001554199 +22 27252454 C T -0.006560251 +22 27264880 G T -0.01323094 +22 27337886 A G -0.009600014 +22 27339284 T C -0.009944488 +22 27353810 T C -0.002171555 +22 27370273 T C -0.009798478 +22 27378884 A G 0.05145072 +22 27398749 C T 0.001012263 +22 27403571 C T -0.01745865 +22 27405012 T C -0.005425419 +22 27415255 C T -0.01499362 +22 27426628 G C 0.0228946 +22 27430724 A G -0.007068064 +22 27435577 C T -0.008632412 +22 27487580 G A 0.003691502 +22 27498426 A G -0.006801544 +22 27526095 G A -0.0008086267 +22 27563274 C A 0.0136965 +22 27584680 A G -0.002139188 +22 27628151 C G 0.02130389 +22 27652290 T G 0.004815735 +22 27660675 A G 0.004899654 +22 27674832 G T 0.0001248065 +22 27718775 A G 0.02292384 +22 27729742 G A 0.004951261 +22 27762155 C T 0.00485666 +22 27781736 A C -0.008336242 +22 27829565 G A 0.00285409 +22 27832985 G C -0.01668955 +22 27836311 G A -0.00775625 +22 27839704 T C -0.02492106 +22 27864471 A C 0.00218995 +22 27873024 G A 0.002721729 +22 27883265 G A 0.02961735 +22 27890684 A G -0.008057355 +22 27927298 T C 0.02054268 +22 27934290 G A 0.004751755 +22 27951176 A G -0.0004329547 +22 27974819 C A 0.01439093 +22 27975451 G A -0.03648208 +22 28007741 C T -0.01635917 +22 28016883 C A 0.008564085 +22 28046561 T C 0.01535905 +22 28060034 A G 0.03097228 +22 28076058 C T 0.02848654 +22 28094845 G A -0.02659077 +22 28130130 C T -0.01640387 +22 28136977 A C -0.003962775 +22 28150109 G A 0.0006071392 +22 28150815 A G 0.01604724 +22 28151825 A G -0.005390282 +22 28155404 T C 0.005030388 +22 28172577 G T 0.005704168 +22 28185452 G T -0.006896853 +22 28200176 G A -0.006474674 +22 28206912 C A -0.006175542 +22 28270372 G T -0.0006768204 +22 28412908 G T 0.01763639 +22 28501414 T C -0.2304747 +22 29106733 C T -0.01074749 +22 29318724 T C 0.001743333 +22 29378610 C T 0.0006690876 +22 29478760 C T -0.03029428 +22 29533572 G C -0.01269604 +22 29626515 A G -0.0117113 +22 29630337 A G 0.02658049 +22 29669648 C G -0.008550535 +22 29692497 T G 0.001234896 +22 
29837537 C T 0.01321112 +22 29961986 T G 0.001878853 +22 30151687 C T 0.003418302 +22 30163526 G A 0.01576261 +22 30494371 A G 0.007959801 +22 30592487 G C -0.1047403 +22 30621613 A C -0.01382104 +22 30658082 C T -0.03794014 +22 30688659 T C 0.0225714 +22 30762140 A G 0.02079806 +22 30793137 A G -0.004609306 +22 30901592 C T -0.00833404 +22 30927975 T C 0.003226189 +22 30953295 T C -0.00768579 +22 30992651 G A -0.025658 +22 31018975 C T 0.04241226 +22 31032920 G A -0.02311985 +22 31063804 G GT -0.0002081808 +22 31114086 G T 0.02825476 +22 31139653 A G 2.640129E-06 +22 31214382 G A 0.01137657 +22 31216506 C T 0.005531311 +22 31272930 T C -0.001056118 +22 31333631 C T -0.01235089 +22 31378447 A G 0.01020507 +22 31442308 A G -0.002479126 +22 31477361 C G -0.01263667 +22 31514348 G A 0.00580324 +22 31521404 A G 0.01097391 +22 31659495 C T 0.02663412 +22 31884405 C T -0.0003950834 +22 32200849 T C 0.01585735 +22 32341684 T C -0.02960328 +22 32559835 G A -0.02170436 +22 32569263 C T -0.001296006 +22 32624139 C T 0.005619574 +22 32702816 A G -0.01534023 +22 32756652 G A 0.02512177 +22 32831540 T C 0.001868495 +22 32832874 T C 6.028815E-05 +22 32853660 G A 0.0138221 +22 32854391 C A 0.0001960825 +22 32875190 A G -0.006426637 +22 32934713 C CT -0.009057754 +22 32952012 A C -0.00380248 +22 32954443 G A 0.002210369 +22 32993032 C T -0.002429979 +22 32997766 T C -0.008424246 +22 33045573 T C -0.03107145 +22 33046110 G C -0.06954732 +22 33048039 T C 0.01138346 +22 33056341 C T -0.06477198 +22 33108536 T C -0.03426392 +22 33108981 T C -0.07404035 +22 33116435 T C 0.06542471 +22 33143528 G A 0.02195059 +22 33146363 A G 0.000810539 +22 33259625 C T 0.02309793 +22 33336039 T G -0.02554387 +22 33408519 T C -0.0075563 +22 33660345 C G 0.002190743 +22 33804893 C T 0.006680774 +22 33844303 C T 0.008923314 +22 33846914 T C 0.006295378 +22 33898906 A C 1.958759E-05 +22 34022284 A G -0.00257933 +22 34137784 G A 0.004460828 +22 34208570 T C -0.003365869 +22 34217757 T C 0.009289431 +22 
34256923 A C 0.01439384 +22 34265402 G A -0.0163661 +22 34284173 G A -0.02315559 +22 34296093 C A -0.004688326 +22 34378012 A G 0.002276664 +22 34436795 C T 0.0001337033 +22 34488452 A G -0.000428831 +22 34501541 A G 0.002763614 +22 34514810 C A 0.003976601 +22 34526428 C T 0.01088864 +22 34583078 A G 0.001802495 +22 34620754 T C 0.01466546 +22 34691035 A G -0.0002082615 +22 34758540 T C 0.005165532 +22 34851377 A C 0.0137118 +22 35371707 T C -0.0004985554 +22 35382268 A C -0.004931336 +22 35419122 C T -0.01077953 +22 35478529 A G 0.0001760523 +22 35481493 T C 0.01056439 +22 35526281 G A -0.002766891 +22 35603836 A G -0.0001783939 +22 35660875 T G 0.03988231 +22 35745196 G T 0.0001750545 +22 35750980 A G -0.007651136 +22 35783413 G A 0.001649791 +22 35918270 C T 0.006918713 +22 35959242 A G 0.01697538 +22 35962060 G A 0.005181476 +22 35964158 G C 0.002769931 +22 35984385 A G -0.01280623 +22 36001258 C T 0.01342405 +22 36072262 T C 0.00489549 +22 36180535 G A -0.03250252 +22 36517307 C T 0.01366076 +22 36519596 A C -0.00349956 +22 36532058 A G -0.01214487 +22 36543489 C G 0.007838149 +22 36600841 G A 0.02644389 +22 36629633 C A -0.006871468 +22 36635967 G A -0.02634742 +22 36655735 A G -0.005385142 +22 36661646 A G -0.01560741 +22 36684354 C T -0.005170111 +22 36705622 A G 0.01713234 +22 36708049 C CTCCTGTGA -0.05187051 +22 36751101 A C -0.0244065 +22 36764788 G A 0.02784116 +22 36897427 C T 0.02603792 +22 36900806 G A 0.007366207 +22 36923144 T C -0.001875563 +22 36924714 G A -0.003632594 +22 36946643 T G 0.01333137 +22 36954939 T C 0.01105894 +22 36998907 T C -0.0006084687 +22 37001495 G T -0.01224147 +22 37013167 G C 0.01866849 +22 37077364 C T 0.007294257 +22 37080738 C G -0.004873355 +22 37101890 C T 0.03991764 +22 37118535 A G -0.001713909 +22 37184521 G A 0.006515894 +22 37206341 G T 0.0002566936 +22 37256262 A G 0.001152626 +22 37258503 C T -0.009761102 +22 37323988 T C -0.0073182 +22 37329545 G A 0.005775806 +22 37337409 T C -0.02534399 +22 37343000 A C 
-0.0004011777 +22 37398195 T C -0.01001198 +22 37401532 A G -0.003244795 +22 37407109 C G 0.04335972 +22 37477732 T C 0.0003669548 +22 37507019 A G -0.0009259451 +22 37513316 A G 0.001153887 +22 37532441 A G 0.01802306 +22 37571497 G A -0.005785311 +22 37581383 T C 0.03172492 +22 37621269 C A 0.004460405 +22 37644621 T C -0.008386907 +22 37671896 A G 0.02303688 +22 37679763 G A -0.002658396 +22 37720268 G A 0.02120184 +22 37753256 C T 0.008984539 +22 37757099 G A -0.01560347 +22 37780522 C G -0.01496708 +22 37800175 T C -0.005510833 +22 37846448 G A 0.01152963 +22 37896749 C T 0.005447068 +22 37908435 C T 0.001909131 +22 37977481 T C 0.01465308 +22 37992699 G A 0.0008339179 +22 38032762 G GA 0.01693041 +22 38054262 C A 0.04354146 +22 38083101 C T -0.02092117 +22 38119213 A G 0.03948165 +22 38122122 C T 0.04377277 +22 38204089 T C 0.02977743 +22 38435786 T G -0.007684278 +22 38544298 G A 0.05090446 +22 38597378 T G -0.01997927 +22 38606780 G A -0.009182016 +22 38630272 C T 0.007393137 +22 38663819 G A -0.006392021 +22 38673234 A G -0.01106705 +22 38685131 C T -0.004493352 +22 38695406 T C -0.01155972 +22 38708506 A G 0.01701713 +22 38744184 C T -0.02112956 +22 38819613 A G -0.005625806 +22 38877461 G T 0.001108728 +22 38918894 G T -0.008094286 +22 38928269 G T -0.02114917 +22 39027286 C CAG 0.003840735 +22 39067524 G A 0.01200232 +22 39159201 C T 0.003096214 +22 39178701 G A 0.002148449 +22 39260032 T C 0.03574634 +22 39268785 T G 0.009377414 +22 39281774 G T 0.03816951 +22 39300265 C T 0.03540156 +22 39332623 T C -0.004449842 +22 39415780 G A 0.01479946 +22 39448465 A G 0.003065974 +22 39480697 G A -0.04005617 +22 39487665 G A -0.0001218988 +22 39493294 C T -0.03115929 +22 39510995 G A -0.02069106 +22 39542292 A G 0.009653575 +22 39543000 T C -0.004069841 +22 39573724 A C 0.02683694 +22 39575692 A C 0.01451305 +22 39581277 A C 0.01766406 +22 39626572 A G -0.02901981 +22 39658626 C T 0.004177065 +22 39665395 G A 0.01264611 +22 39687484 G A 0.005418141 +22 39708279 A 
G -0.04281532 +22 39708357 T C 0.008605574 +22 39793066 G T 0.03658209 +22 39798127 G A 0.002302129 +22 39843409 T C 0.01065699 +22 39865475 G A 0.001588501 +22 39932516 A G -0.01179841 +22 39963426 G A -0.01503908 +22 40023636 C T 0.006443146 +22 40046176 C T -0.0007416552 +22 40067818 T C 0.00455936 +22 40092864 G A 0.02400297 +22 40127293 T C -0.0008870038 +22 40358148 T C -0.01079902 +22 40420786 G C -0.008092115 +22 40454069 G T 0.00789888 +22 40541981 G A 0.0174264 +22 40652873 G A 0.005853057 +22 40676672 G T -0.001894274 +22 40729614 G A 0.0195994 +22 40820151 C T -0.01628066 +22 40986372 G C -0.01983507 +22 41494925 A G -0.02918069 +22 41646738 G A 0.0003521847 +22 41680898 T C 0.01402732 +22 41704872 T C 6.681484E-05 +22 41791536 C T -5.572333E-05 +22 41895409 A G -0.04407217 +22 41929175 G T -0.03186844 +22 42089623 C T 0.00532234 +22 42095658 G T 0.03846131 +22 42210985 C T -0.00313971 +22 42279653 G A -0.006596336 +22 42341308 G A -0.0006862491 +22 42524243 C CT -0.01181191 +22 42672124 G A -0.005278171 +22 42691238 T C -0.01642396 +22 42813753 C T -0.00386775 +22 42867898 G A -0.001352327 +22 42912097 T C -0.0007295657 +22 42932317 A G -0.05768556 +22 43010817 A G 0.01722077 +22 43080028 T C -0.0005527551 +22 43096507 T C -0.005556102 +22 43112475 T C -0.01350273 +22 43114824 G A -0.01963192 +22 43115576 C T -0.01880097 +22 43154299 G A -0.001621113 +22 43159948 T C -0.007980584 +22 43206950 C A -0.005783037 +22 43218397 C T -0.003976636 +22 43283255 C A -0.01426668 +22 43290583 C T -0.03955775 +22 43333156 A G -0.03127845 +22 43426262 G A -0.00366804 +22 43483242 T C -0.02540203 +22 43515108 C T -0.01570749 +22 43529314 C G 0.01738127 +22 43551513 G A 0.02565386 +22 43558972 A G -0.01962819 +22 43577214 T C -0.02270478 +22 43579049 C T -0.001193909 +22 43610207 G A -0.007621661 +22 43623395 G C -0.04852519 +22 43640512 C T -0.005533207 +22 43649701 C T 0.07724845 +22 43661080 T C -0.04251741 +22 43683088 A G -0.003582388 +22 43707996 A G -0.02547044 
+22 43711080 C G -0.005784446 +22 43721519 C A 0.000365885 +22 43729401 C T 0.008557013 +22 43763757 T G -0.0178981 +22 43836198 G T 0.002427697 +22 43976396 A G -0.01277457 +22 44031042 C T 0.003593107 +22 44193626 C A -0.006865434 +22 44221247 G A 0.01833991 +22 44296372 T C 0.006169212 +22 44298838 A G 0.007441756 +22 44342116 G A 0.02810328 +22 44368122 G A 0.0129968 +22 44379838 G A 0.001648422 +22 44380033 C T -0.002136788 +22 44395451 C T -0.006698507 +22 44419871 C T 0.0181613 +22 44424108 T C 0.01036733 +22 44467899 C T -0.002592364 +22 44498134 T C 0.007281423 +22 44522312 C T -0.0002636447 +22 44526130 G A -0.00388298 +22 44530286 A G 0.02528159 +22 44530420 C T -0.01233654 +22 44548944 G A -0.003947209 +22 44551755 G A 0.01262458 +22 44566434 A G -0.004290306 +22 44581046 T C -0.0147995 +22 44643161 C T 0.01439493 +22 44677081 C T -0.01030513 +22 44681612 G A -0.001269762 +22 44695088 T C 0.006324859 +22 44707716 G T 0.002288939 +22 44725343 G A 0.003534678 +22 44738406 G A 0.02320049 +22 44746729 A G -0.01754216 +22 44751158 G A -0.006539695 +22 44757439 A G 0.02480295 +22 44759519 G A 0.002111274 +22 44761797 A T -0.00531172 +22 44763352 C G 0.01452737 +22 44783779 G A 0.009142699 +22 44791807 C T -0.02371876 +22 44818986 C T -0.006740622 +22 44894913 G A -5.179871E-05 +22 45058431 C T 0.01098259 +22 45066035 A G -0.01484374 +22 45069410 T C 0.01530441 +22 45081330 G A 0.00135012 +22 45082168 C A 0.003663354 +22 45090008 G A 0.002811861 +22 45116664 C T 0.01247728 +22 45244930 T C -0.01450041 +22 45258457 G A -0.003500519 +22 45323989 T C 0.001111338 +22 45415987 A G -0.01398184 +22 45451355 G A -0.005566982 +22 45471607 C T 0.01148978 +22 45497738 C T -0.005029327 +22 45502829 C T -0.03893521 +22 45519040 T G 0.002377071 +22 45523391 A G 0.01318997 +22 45573450 C A 0.0043856 +22 45589490 G A -0.008350439 +22 45668012 T C 0.01286879 +22 45671343 G A -2.940682E-06 +22 45672574 T C 0.005743608 +22 45693923 A G -0.002675069 +22 45718743 G A -0.02092804 
+22 45723807 C G 0.001670159 +22 45728370 A G 0.0001879231 +22 45741537 G T 0.01420045 +22 45749983 T G -0.04591012 +22 45809624 A C 0.002185772 +22 45821935 A G 0.02250782 +22 45837410 G A -0.002756449 +22 45846371 T C 0.07910102 +22 45864934 T C 0.008535181 +22 45871507 G C -0.007764056 +22 45892656 G T -0.003885653 +22 45897997 C T 0.0003935204 +22 45929577 C T -0.02532217 +22 45936350 A G -0.008001698 +22 45942726 T G -0.01415551 +22 45996298 G A 0.05643525 +22 46009063 G A 0.006464843 +22 46022070 G A 0.0224674 +22 46155548 G C -0.0324747 +22 46207955 C T -0.001354554 +22 46236425 A G 0.08398423 +22 46275529 T C 0.0022643 +22 46287720 A G -0.02237482 +22 46289699 T C 0.01872124 +22 46303347 T C -0.01283734 +22 46316057 A G 0.02312579 +22 46337043 G C 0.01701173 +22 46347519 C T 0.01574289 +22 46364161 A G -0.04466341 +22 46381234 G A 0.04730559 +22 46396925 G A 0.001783944 +22 46403715 A G -0.02132589 +22 46406782 A C 0.08439466 +22 46445002 G C -0.07613496 +22 46458123 G T 0.03328073 +22 46482948 C T 0.04241879 +22 46486508 C T -0.00968439 +22 46493852 T C -0.00675858 +22 46499120 C G -0.009873118 +22 46502870 T C -0.0179214 +22 46561713 G A 0.02604703 +22 46586110 A G -0.001256735 +22 46592168 C T 0.01417055 +22 46614274 G C -0.05854014 +22 46627603 T C 0.08004024 +22 46760086 T C 0.003229515 +22 46782382 T C -0.02470821 +22 46807234 C T 0.002324176 +22 46837114 G A 0.000944073 +22 46888399 T C 0.009911095 +22 46907779 G A 0.00653144 +22 46909355 T G -0.004780494 +22 46914277 A C 0.009689535 +22 46943687 G A -0.0130366 +22 46985917 A G 0.01893397 +22 47021226 G A -0.01322949 +22 47095235 A C -0.1156013 +22 47109621 C T 0.0004322858 +22 47125474 G A -0.01746025 +22 47147117 T C -0.02418349 +22 47156703 C T 0.0262897 +22 47245836 A G 0.001880575 +22 47271747 C T 0.001055264 +22 47301822 C T 0.003032158 +22 47345487 T C -0.002945945 +22 47372368 T C 0.02067644 +22 47380606 C T 0.04041426 +22 47437808 C T 0.001683027 +22 47450911 A G 0.01624479 +22 47511864 A C 
-0.004226735 +22 47519476 T C -0.003954111 +22 47529458 A G -0.0003602848 +22 47531320 T C -0.006899703 +22 47548321 T C 0.004925401 +22 47568291 C T 0.007726693 +22 47571203 A G -0.009744751 +22 47574009 C T -0.00532701 +22 47642100 T C 0.006976251 +22 47657635 T C 0.001798943 +22 47683805 C T -0.03475544 +22 47720973 T C -0.007868172 +22 47821952 G A -0.000885428 +22 47893053 A G -0.02449056 +22 47935365 C T -0.001599879 +22 47961708 G T -0.003593525 +22 47986332 T C -0.003976592 +22 48154645 C T 0.007608639 +22 48165452 C CT 0.002039503 +22 48207318 T C -0.009725168 +22 48213904 G C -0.01220367 +22 48215904 A G -2.488244E-05 +22 48220460 T C -0.002702163 +22 48230941 C A -0.001129522 +22 48271961 A G -0.005053446 +22 48284025 T C -0.003344182 +22 48297953 C T -0.01046958 +22 48362290 G A -0.02367254 +22 48362914 C A -0.003167719 +22 48387670 A G -0.008243989 +22 48415446 C T 0.002130715 +22 48460730 T C 0.002682476 +22 48491160 T C 0.001257794 +22 48519794 C T 0.003680757 +22 48537775 G A 0.002134692 +22 48543566 T C 0.007314089 +22 48593037 C T 0.009084708 +22 48687509 C T -0.0277196 +22 48692033 T C -0.02126264 +22 48699617 T C 0.0005093107 +22 48717568 T C -0.0008190281 +22 48811946 C T 0.007916515 +22 48823357 G A 0.01464317 +22 48840428 A C 0.003711229 +22 48851612 T C -0.005887765 +22 48874310 T C -0.01106607 +22 48968070 C T 0.01280691 +22 48991385 T C -0.01234119 +22 49004050 G A 0.02290755 +22 49014565 A G 0.001555565 +22 49086481 T C -0.006196369 +22 49107173 T C 0.01277272 +22 49180915 A G 0.006346977 +22 49262579 A G 0.02657134 +22 49270317 C T 0.001447665 +22 49313196 A G -0.007055532 +22 49335230 T C -0.006548281 +22 49366123 T C 0.01136486 +22 49372356 G C -0.02420841 +22 49443666 T C 0.01581736 +22 49496835 G A -0.01355414 +22 49524428 A G -0.004228482 +22 49530553 G C 0.008197389 +22 49537845 T C 0.0111255 +22 49557457 G A 0.009401926 +22 49562666 C A 0.01271701 +22 49574509 C T 0.0004703177 +22 49579141 A G 0.02448619 +22 49650863 T C 
0.006739571 +22 49662549 T G -0.005769464 +22 49665841 T C -0.0007037069 +22 49677464 A G -0.02177735 +22 49696067 C T -0.003309682 +22 49700272 T G -0.002541948 +22 49706433 T C -0.01719402 +22 49713835 G A -0.01370754 +22 49719264 A C -0.01067852 +22 49743627 G A -0.0005970581 +22 49800265 C T 0.03098582 +22 49806863 A G 0.003940447 +22 49830851 C T -0.002742706 +22 49834624 G A -0.002820163 +22 49843235 G C -0.0004458281 +22 49847501 T G 0.002235016 +22 49861033 C T 0.01721243 +22 49881321 A G -0.00051278 +22 49908804 G A -0.009455892 +22 49911222 G T -0.01389666 +22 49925268 A G 0.01679984 +22 49927332 T C 0.00039298 +22 50109212 T C 0.01610819 +22 50118149 G C 0.007024666 +22 50184484 G T 0.01222581 +22 50219447 T C 0.05091891 +22 50278568 G A -0.02340672 +22 50319170 G A 0.01669806 +22 50350971 A G 0.0264016 +22 50356693 C T 0.003851499 +22 50435480 G A 0.0166363 +22 50439626 A G -0.002722154 +22 50466542 C T -0.002560094 +22 50470516 T C -0.01621986 +22 50491150 G A 0.01828674 +22 50515270 C T 0.01439904 +22 50529850 C T 0.02054628 +22 50570755 C G 0.007077514 +22 50582626 G A -0.003588854 +22 50672154 A G 0.007660848 +22 50722134 C T -0.01747164 +22 50722408 C T -0.001063465 +22 50728062 C T 0.02159223 +22 50750481 T C 0.01877272 +22 50758873 T C 0.004001731 +22 50835040 A G -0.006374259 +22 50859049 C T 0.0003480749 +22 50885775 G A -0.01358311 +22 50926768 T C 0.001798498 +22 50928026 A G 0.004775504 +22 50971266 C T 0.02160893 +22 50989197 T C -0.01328884 +22 50989326 G A 0.01037054 +22 50999681 G A -0.01226224 +22 51046163 T C -0.02754002 +22 51117580 C T 0.03573542 +22 51171497 A G -0.01951606 +22 51174939 T C -0.006178519 diff --git a/tests/data/combine/scorefile_dominant_and_recessive.txt b/tests/data/combine/scorefile_dominant_and_recessive.txt new file mode 100644 index 0000000..bbf23f0 --- /dev/null +++ b/tests/data/combine/scorefile_dominant_and_recessive.txt @@ -0,0 +1,838 @@ +#pgs_name=PGS001229_22_DominantRecessiveExample +#genome_build=GRCh37 
+chr_name chr_position effect_allele other_allele effect_weight is_dominant is_recessive +22 17080378 G A 0.01045457 TRUE FALSE +22 17300230 A G 0.0001411475 FALSE TRUE +22 17318864 A C 0.008166266 FALSE FALSE +22 17327595 T C 0.007791641 FALSE FALSE +22 17409813 A G 0.0003108784 FALSE FALSE +22 17450952 G A -0.03033983 FALSE FALSE +22 17492533 G A 0.00388999 FALSE FALSE +22 17542810 C T 0.00803629 FALSE FALSE +22 17565013 G A 0.02135621 FALSE FALSE +22 17589209 T C 0.003026491 FALSE FALSE +22 17600977 A G 0.01581277 FALSE FALSE +22 17625915 A G -0.1172964 FALSE FALSE +22 17630486 A C 0.01012909 FALSE FALSE +22 17633785 C T 0.0023255 FALSE FALSE +22 17643689 A G 0.003361814 FALSE FALSE +22 17669306 C T 0.0214506 FALSE FALSE +22 17677699 T C -0.0007031384 FALSE FALSE +22 17680519 C A 0.001079236 FALSE FALSE +22 17701234 G A 0.004477145 FALSE FALSE +22 17703119 A T 0.0007771872 FALSE FALSE +22 17718699 C A -0.01320632 FALSE FALSE +22 17721595 C T 0.009480363 FALSE FALSE +22 17727648 T C 0.007811685 FALSE FALSE +22 17738177 G A -0.004719812 FALSE FALSE +22 17749096 A G -0.005244795 FALSE FALSE +22 17770181 G T -0.03101703 FALSE FALSE +22 17793969 G A 0.01774444 FALSE FALSE +22 17815696 G C -0.00551609 FALSE FALSE +22 17827684 G A -0.005944752 FALSE FALSE +22 17831813 T C 0.01061587 FALSE FALSE +22 17844929 T G 0.001717643 FALSE FALSE +22 17850661 T C -0.02805489 FALSE FALSE +22 17887534 A G 0.0007723542 FALSE FALSE +22 17887725 A G 0.007472703 FALSE FALSE +22 17958221 C A -0.02098647 FALSE FALSE +22 18036253 G A -0.01772981 FALSE FALSE +22 18038786 A G -0.002119071 FALSE FALSE +22 18262301 A T -0.005065485 FALSE FALSE +22 18289204 A G 0.005306345 FALSE FALSE +22 18295575 C T 0.02300129 FALSE FALSE +22 18296238 G A -0.005665446 FALSE FALSE +22 18319179 T C 0.03440642 FALSE FALSE +22 18393534 A C 0.01764269 FALSE FALSE +22 18439958 T C -0.002261707 FALSE FALSE +22 18483388 G A 0.03318724 FALSE FALSE +22 18488883 C G -0.0191918 FALSE FALSE +22 18489048 C A 0.01233198 
FALSE FALSE +22 18495470 A G -0.005804926 FALSE FALSE +22 18537145 G A -0.004930116 FALSE FALSE +22 18571008 A G -8.844726E-05 FALSE FALSE +22 18584433 C T -0.001169893 FALSE FALSE +22 18631365 T C -0.001551714 FALSE FALSE +22 18650682 T C -0.01313784 FALSE FALSE +22 18890037 A G 0.05968921 FALSE FALSE +22 18891398 G A 0.006891943 FALSE FALSE +22 18892575 A G -0.00224447 FALSE FALSE +22 18915963 A G 0.003719756 FALSE FALSE +22 18959581 T C 0.006464581 FALSE FALSE +22 18963340 A G -0.01397565 FALSE FALSE +22 18970915 T C -0.001507131 FALSE FALSE +22 19024651 T C -0.00350575 FALSE FALSE +22 19121872 A G 0.01644046 FALSE FALSE +22 19135603 A G -0.02970077 FALSE FALSE +22 19190143 T C 0.003268027 FALSE FALSE +22 19263698 T C 0.02057255 FALSE FALSE +22 19292446 G T 0.01153989 FALSE FALSE +22 19371052 T C 0.01055134 FALSE FALSE +22 19420109 C T -0.008628228 FALSE FALSE +22 19451186 A C 0.02141029 FALSE FALSE +22 19518079 C T 0.005372247 FALSE FALSE +22 19581331 T C 0.01686942 FALSE FALSE +22 19593854 C A 0.0006544249 FALSE FALSE +22 19606703 G A 0.02070121 FALSE FALSE +22 19649005 A G 0.002868601 FALSE FALSE +22 19735854 C T 0.006262962 FALSE FALSE +22 19738355 T C 4.97384E-05 FALSE FALSE +22 19770886 A G -0.01013929 FALSE FALSE +22 19781823 T C 0.02481609 FALSE FALSE +22 19873357 T C 0.0116302 FALSE FALSE +22 19907099 A C -0.0267645 FALSE FALSE +22 19968597 T C -0.02203945 FALSE FALSE +22 20046344 G A -0.009801428 FALSE FALSE +22 20084821 C T -0.02232886 FALSE FALSE +22 20185457 A G 0.006892171 FALSE FALSE +22 20189077 T C 0.01738215 FALSE FALSE +22 20219648 A G 0.009307625 FALSE FALSE +22 20248391 A G -0.005405845 FALSE FALSE +22 20267213 A G 0.006713242 FALSE FALSE +22 20286099 G T 0.01574758 FALSE FALSE +22 20749042 G A 0.006603339 FALSE FALSE +22 20754039 A G -0.01181141 FALSE FALSE +22 20775167 T C 0.01160113 FALSE FALSE +22 20780296 A G 0.06735311 FALSE FALSE +22 20789074 C T 0.02844307 FALSE FALSE +22 20791438 A C 0.0473474 FALSE FALSE +22 20793914 C T 
0.007009781 FALSE FALSE +22 20839810 T G 0.003947346 FALSE FALSE +22 20860931 T C 0.0005613511 FALSE FALSE +22 20979980 G A 0.003231665 FALSE FALSE +22 20991771 G A 0.004226765 FALSE FALSE +22 21075537 C A -0.002096453 FALSE FALSE +22 21154393 G T -0.004297086 FALSE FALSE +22 21323357 C T -0.006041745 FALSE FALSE +22 21331918 G C -0.002280912 FALSE FALSE +22 21334924 C G -0.02031369 FALSE FALSE +22 21356824 A G 0.01476577 FALSE FALSE +22 21386019 A G 0.01435557 FALSE FALSE +22 21449028 G A -0.01537701 FALSE FALSE +22 21463515 A G -0.01335614 FALSE FALSE +22 21982892 T C -0.06373335 FALSE FALSE +22 22001704 T G 0.02809584 FALSE FALSE +22 22062480 T C 0.0529113 FALSE FALSE +22 22080735 G A -0.0139426 FALSE FALSE +22 22151939 C A -0.008287849 FALSE FALSE +22 22163425 G A 0.05518983 FALSE FALSE +22 22307519 C G -0.003486191 FALSE FALSE +22 22351283 G A -0.0007483763 FALSE FALSE +22 22394291 AG A 0.004320583 FALSE FALSE +22 22395754 T C 0.002587971 FALSE FALSE +22 22424302 A C 0.0011408 FALSE FALSE +22 22473905 C A 0.01226009 FALSE FALSE +22 22550450 G C 0.01773244 FALSE FALSE +22 22561610 C T -0.006207024 FALSE FALSE +22 22581369 G A -0.006272413 FALSE FALSE +22 22584678 A G -0.00217647 FALSE FALSE +22 22711786 T C 0.007779875 FALSE FALSE +22 22726372 T C 0.00349632 FALSE FALSE +22 22762771 C T 0.01252501 FALSE FALSE +22 22769923 G A -0.01103632 FALSE FALSE +22 22869742 A C -0.002412657 FALSE FALSE +22 22871922 A G -0.002769974 FALSE FALSE +22 22929268 T C -0.007035723 FALSE FALSE +22 23001481 A G 0.007524178 FALSE FALSE +22 23022520 T C 0.002175257 FALSE FALSE +22 23064982 A C -0.01255076 FALSE FALSE +22 23249440 A C 0.02085816 FALSE FALSE +22 23268677 A G 0.01337349 FALSE FALSE +22 23279456 C G -0.01371401 FALSE FALSE +22 23282286 C T 0.004994329 FALSE FALSE +22 23325722 C T 0.0008506657 FALSE FALSE +22 23412058 A G -0.009545553 FALSE FALSE +22 23627369 G A -0.01900175 FALSE FALSE +22 23644425 G A -0.0009106953 FALSE FALSE +22 23649242 G T 0.001061643 FALSE FALSE +22 
23794844 G A -0.01198736 FALSE FALSE +22 23804670 G T -0.001119846 FALSE FALSE +22 23819697 T G -0.01028722 FALSE FALSE +22 23873076 T C 0.009509027 FALSE FALSE +22 23892145 T C 0.0135128 FALSE FALSE +22 23925779 C T -0.004127647 FALSE FALSE +22 23960187 T C -0.008475905 FALSE FALSE +22 24035970 T C -0.001334318 FALSE FALSE +22 24086107 G A -0.01652957 FALSE FALSE +22 24105789 A G 0.01813091 FALSE FALSE +22 24106448 A G 0.001834095 FALSE FALSE +22 24186809 C T -0.01426541 FALSE FALSE +22 24235360 G A 0.0003168635 FALSE FALSE +22 24255296 T C 0.01624252 FALSE FALSE +22 24300540 T C -0.00322576 FALSE FALSE +22 24376584 A G -0.006223068 FALSE FALSE +22 24406778 A C 0.00304654 FALSE FALSE +22 24618331 G A -0.0006506681 FALSE FALSE +22 24802564 A G -0.006695797 FALSE FALSE +22 24912232 T C -0.01536303 FALSE FALSE +22 24943582 A G -0.001687764 FALSE FALSE +22 24995668 G A -0.03537331 FALSE FALSE +22 25123505 C T -0.0160099 FALSE FALSE +22 25145094 T C -0.005584047 FALSE FALSE +22 25145453 T C -0.001388536 FALSE FALSE +22 25185823 A G -0.009228375 FALSE FALSE +22 25265972 A G 0.01088906 FALSE FALSE +22 25309448 A G -0.002238693 FALSE FALSE +22 25363411 A G 0.004035775 FALSE FALSE +22 25410895 G A 0.0009720734 FALSE FALSE +22 25442369 C T 0.01660527 FALSE FALSE +22 25454658 C A 0.01200285 FALSE FALSE +22 25465065 C T 0.01320801 FALSE FALSE +22 25524916 C T 0.01147501 FALSE FALSE +22 25603008 T C -0.01262741 FALSE FALSE +22 25619025 G T -0.01212511 FALSE FALSE +22 25621591 T C 0.01051851 FALSE FALSE +22 25643483 T G 0.01373474 FALSE FALSE +22 25661725 A G -0.005936431 FALSE FALSE +22 25667883 G A 0.01547775 FALSE FALSE +22 25668730 A C 0.02616493 FALSE FALSE +22 25678577 T C 0.0304018 FALSE FALSE +22 25761309 T C -0.001760112 FALSE FALSE +22 25761936 T C -0.005171998 FALSE FALSE +22 25938977 T C 0.01966116 FALSE FALSE +22 25994013 A G 0.0006268228 FALSE FALSE +22 26081873 T C 0.05232603 FALSE FALSE +22 26132612 A G -0.006457239 FALSE FALSE +22 26133775 T C -0.001181527 
FALSE FALSE +22 26159289 A G -0.008399401 FALSE FALSE +22 26181767 C T 0.01044769 FALSE FALSE +22 26190915 G A 0.004287533 FALSE FALSE +22 26218164 G A -0.002803502 FALSE FALSE +22 26231312 C G 0.006105629 FALSE FALSE +22 26237826 C T 0.004981479 FALSE FALSE +22 26239850 A C 0.004144037 FALSE FALSE +22 26273893 C G 0.005616213 FALSE FALSE +22 26278128 G T -0.003965338 FALSE FALSE +22 26280462 T C -0.0008324497 FALSE FALSE +22 26290588 T C -0.0130732 FALSE FALSE +22 26292659 G A 4.294309E-05 FALSE FALSE +22 26343593 G A 0.007813758 FALSE FALSE +22 26369358 T C -0.00483665 FALSE FALSE +22 26390964 A G -0.007849451 FALSE FALSE +22 26415475 T C -0.001219281 FALSE FALSE +22 26456367 G A -0.01285326 FALSE FALSE +22 26460519 T C -0.008695338 FALSE FALSE +22 26528054 A G 0.01973023 FALSE FALSE +22 26617260 T A -0.01384025 FALSE FALSE +22 26638906 G T 0.01229772 FALSE FALSE +22 26735648 A G 0.0007879673 FALSE FALSE +22 26782251 G A 0.0005096459 FALSE FALSE +22 26812632 C T -0.01850814 FALSE FALSE +22 26939781 C T -0.0009222796 FALSE FALSE +22 26960648 A C -0.005679255 FALSE FALSE +22 27038865 T G -0.0001487706 FALSE FALSE +22 27042828 A G 0.02957737 FALSE FALSE +22 27161060 A G 0.002844558 FALSE FALSE +22 27191643 T C 0.008953731 FALSE FALSE +22 27216426 G A 0.00912099 FALSE FALSE +22 27217018 A G 0.01510616 FALSE FALSE +22 27240025 T G -0.0297174 FALSE FALSE +22 27242642 G A -0.009822927 FALSE FALSE +22 27246070 C T -0.001554199 FALSE FALSE +22 27252454 C T -0.006560251 FALSE FALSE +22 27264880 G T -0.01323094 FALSE FALSE +22 27337886 A G -0.009600014 FALSE FALSE +22 27339284 T C -0.009944488 FALSE FALSE +22 27353810 T C -0.002171555 FALSE FALSE +22 27370273 T C -0.009798478 FALSE FALSE +22 27378884 A G 0.05145072 FALSE FALSE +22 27398749 C T 0.001012263 FALSE FALSE +22 27403571 C T -0.01745865 FALSE FALSE +22 27405012 T C -0.005425419 FALSE FALSE +22 27415255 C T -0.01499362 FALSE FALSE +22 27426628 G C 0.0228946 FALSE FALSE +22 27430724 A G -0.007068064 FALSE FALSE +22 
27435577 C T -0.008632412 FALSE FALSE +22 27487580 G A 0.003691502 FALSE FALSE +22 27498426 A G -0.006801544 FALSE FALSE +22 27526095 G A -0.0008086267 FALSE FALSE +22 27563274 C A 0.0136965 FALSE FALSE +22 27584680 A G -0.002139188 FALSE FALSE +22 27628151 C G 0.02130389 FALSE FALSE +22 27652290 T G 0.004815735 FALSE FALSE +22 27660675 A G 0.004899654 FALSE FALSE +22 27674832 G T 0.0001248065 FALSE FALSE +22 27718775 A G 0.02292384 FALSE FALSE +22 27729742 G A 0.004951261 FALSE FALSE +22 27762155 C T 0.00485666 FALSE FALSE +22 27781736 A C -0.008336242 FALSE FALSE +22 27829565 G A 0.00285409 FALSE FALSE +22 27832985 G C -0.01668955 FALSE FALSE +22 27836311 G A -0.00775625 FALSE FALSE +22 27839704 T C -0.02492106 FALSE FALSE +22 27864471 A C 0.00218995 FALSE FALSE +22 27873024 G A 0.002721729 FALSE FALSE +22 27883265 G A 0.02961735 FALSE FALSE +22 27890684 A G -0.008057355 FALSE FALSE +22 27927298 T C 0.02054268 FALSE FALSE +22 27934290 G A 0.004751755 FALSE FALSE +22 27951176 A G -0.0004329547 FALSE FALSE +22 27974819 C A 0.01439093 FALSE FALSE +22 27975451 G A -0.03648208 FALSE FALSE +22 28007741 C T -0.01635917 FALSE FALSE +22 28016883 C A 0.008564085 FALSE FALSE +22 28046561 T C 0.01535905 FALSE FALSE +22 28060034 A G 0.03097228 FALSE FALSE +22 28076058 C T 0.02848654 FALSE FALSE +22 28094845 G A -0.02659077 FALSE FALSE +22 28130130 C T -0.01640387 FALSE FALSE +22 28136977 A C -0.003962775 FALSE FALSE +22 28150109 G A 0.0006071392 FALSE FALSE +22 28150815 A G 0.01604724 FALSE FALSE +22 28151825 A G -0.005390282 FALSE FALSE +22 28155404 T C 0.005030388 FALSE FALSE +22 28172577 G T 0.005704168 FALSE FALSE +22 28185452 G T -0.006896853 FALSE FALSE +22 28200176 G A -0.006474674 FALSE FALSE +22 28206912 C A -0.006175542 FALSE FALSE +22 28270372 G T -0.0006768204 FALSE FALSE +22 28412908 G T 0.01763639 FALSE FALSE +22 28501414 T C -0.2304747 FALSE FALSE +22 29106733 C T -0.01074749 FALSE FALSE +22 29318724 T C 0.001743333 FALSE FALSE +22 29378610 C T 0.0006690876 
FALSE FALSE +22 29478760 C T -0.03029428 FALSE FALSE +22 29533572 G C -0.01269604 FALSE FALSE +22 29626515 A G -0.0117113 FALSE FALSE +22 29630337 A G 0.02658049 FALSE FALSE +22 29669648 C G -0.008550535 FALSE FALSE +22 29692497 T G 0.001234896 FALSE FALSE +22 29837537 C T 0.01321112 FALSE FALSE +22 29961986 T G 0.001878853 FALSE FALSE +22 30151687 C T 0.003418302 FALSE FALSE +22 30163526 G A 0.01576261 FALSE FALSE +22 30494371 A G 0.007959801 FALSE FALSE +22 30592487 G C -0.1047403 FALSE FALSE +22 30621613 A C -0.01382104 FALSE FALSE +22 30658082 C T -0.03794014 FALSE FALSE +22 30688659 T C 0.0225714 FALSE FALSE +22 30762140 A G 0.02079806 FALSE FALSE +22 30793137 A G -0.004609306 FALSE FALSE +22 30901592 C T -0.00833404 FALSE FALSE +22 30927975 T C 0.003226189 FALSE FALSE +22 30953295 T C -0.00768579 FALSE FALSE +22 30992651 G A -0.025658 FALSE FALSE +22 31018975 C T 0.04241226 FALSE FALSE +22 31032920 G A -0.02311985 FALSE FALSE +22 31063804 G GT -0.0002081808 FALSE FALSE +22 31114086 G T 0.02825476 FALSE FALSE +22 31139653 A G 2.640129E-06 FALSE FALSE +22 31214382 G A 0.01137657 FALSE FALSE +22 31216506 C T 0.005531311 FALSE FALSE +22 31272930 T C -0.001056118 FALSE FALSE +22 31333631 C T -0.01235089 FALSE FALSE +22 31378447 A G 0.01020507 FALSE FALSE +22 31442308 A G -0.002479126 FALSE FALSE +22 31477361 C G -0.01263667 FALSE FALSE +22 31514348 G A 0.00580324 FALSE FALSE +22 31521404 A G 0.01097391 FALSE FALSE +22 31659495 C T 0.02663412 FALSE FALSE +22 31884405 C T -0.0003950834 FALSE FALSE +22 32200849 T C 0.01585735 FALSE FALSE +22 32341684 T C -0.02960328 FALSE FALSE +22 32559835 G A -0.02170436 FALSE FALSE +22 32569263 C T -0.001296006 FALSE FALSE +22 32624139 C T 0.005619574 FALSE FALSE +22 32702816 A G -0.01534023 FALSE FALSE +22 32756652 G A 0.02512177 FALSE FALSE +22 32831540 T C 0.001868495 FALSE FALSE +22 32832874 T C 6.028815E-05 FALSE FALSE +22 32853660 G A 0.0138221 FALSE FALSE +22 32854391 C A 0.0001960825 FALSE FALSE +22 32875190 A G 
-0.006426637 FALSE FALSE +22 32934713 C CT -0.009057754 FALSE FALSE +22 32952012 A C -0.00380248 FALSE FALSE +22 32954443 G A 0.002210369 FALSE FALSE +22 32993032 C T -0.002429979 FALSE FALSE +22 32997766 T C -0.008424246 FALSE FALSE +22 33045573 T C -0.03107145 FALSE FALSE +22 33046110 G C -0.06954732 FALSE FALSE +22 33048039 T C 0.01138346 FALSE FALSE +22 33056341 C T -0.06477198 FALSE FALSE +22 33108536 T C -0.03426392 FALSE FALSE +22 33108981 T C -0.07404035 FALSE FALSE +22 33116435 T C 0.06542471 FALSE FALSE +22 33143528 G A 0.02195059 FALSE FALSE +22 33146363 A G 0.000810539 FALSE FALSE +22 33259625 C T 0.02309793 FALSE FALSE +22 33336039 T G -0.02554387 FALSE FALSE +22 33408519 T C -0.0075563 FALSE FALSE +22 33660345 C G 0.002190743 FALSE FALSE +22 33804893 C T 0.006680774 FALSE FALSE +22 33844303 C T 0.008923314 FALSE FALSE +22 33846914 T C 0.006295378 FALSE FALSE +22 33898906 A C 1.958759E-05 FALSE FALSE +22 34022284 A G -0.00257933 FALSE FALSE +22 34137784 G A 0.004460828 FALSE FALSE +22 34208570 T C -0.003365869 FALSE FALSE +22 34217757 T C 0.009289431 FALSE FALSE +22 34256923 A C 0.01439384 FALSE FALSE +22 34265402 G A -0.0163661 FALSE FALSE +22 34284173 G A -0.02315559 FALSE FALSE +22 34296093 C A -0.004688326 FALSE FALSE +22 34378012 A G 0.002276664 FALSE FALSE +22 34436795 C T 0.0001337033 FALSE FALSE +22 34488452 A G -0.000428831 FALSE FALSE +22 34501541 A G 0.002763614 FALSE FALSE +22 34514810 C A 0.003976601 FALSE FALSE +22 34526428 C T 0.01088864 FALSE FALSE +22 34583078 A G 0.001802495 FALSE FALSE +22 34620754 T C 0.01466546 FALSE FALSE +22 34691035 A G -0.0002082615 FALSE FALSE +22 34758540 T C 0.005165532 FALSE FALSE +22 34851377 A C 0.0137118 FALSE FALSE +22 35371707 T C -0.0004985554 FALSE FALSE +22 35382268 A C -0.004931336 FALSE FALSE +22 35419122 C T -0.01077953 FALSE FALSE +22 35478529 A G 0.0001760523 FALSE FALSE +22 35481493 T C 0.01056439 FALSE FALSE +22 35526281 G A -0.002766891 FALSE FALSE +22 35603836 A G -0.0001783939 FALSE FALSE 
+22 35660875 T G 0.03988231 FALSE FALSE +22 35745196 G T 0.0001750545 FALSE FALSE +22 35750980 A G -0.007651136 FALSE FALSE +22 35783413 G A 0.001649791 FALSE FALSE +22 35918270 C T 0.006918713 FALSE FALSE +22 35959242 A G 0.01697538 FALSE FALSE +22 35962060 G A 0.005181476 FALSE FALSE +22 35964158 G C 0.002769931 FALSE FALSE +22 35984385 A G -0.01280623 FALSE FALSE +22 36001258 C T 0.01342405 FALSE FALSE +22 36072262 T C 0.00489549 FALSE FALSE +22 36180535 G A -0.03250252 FALSE FALSE +22 36517307 C T 0.01366076 FALSE FALSE +22 36519596 A C -0.00349956 FALSE FALSE +22 36532058 A G -0.01214487 FALSE FALSE +22 36543489 C G 0.007838149 FALSE FALSE +22 36600841 G A 0.02644389 FALSE FALSE +22 36629633 C A -0.006871468 FALSE FALSE +22 36635967 G A -0.02634742 FALSE FALSE +22 36655735 A G -0.005385142 FALSE FALSE +22 36661646 A G -0.01560741 FALSE FALSE +22 36684354 C T -0.005170111 FALSE FALSE +22 36705622 A G 0.01713234 FALSE FALSE +22 36708049 C CTCCTGTGA -0.05187051 FALSE FALSE +22 36751101 A C -0.0244065 FALSE FALSE +22 36764788 G A 0.02784116 FALSE FALSE +22 36897427 C T 0.02603792 FALSE FALSE +22 36900806 G A 0.007366207 FALSE FALSE +22 36923144 T C -0.001875563 FALSE FALSE +22 36924714 G A -0.003632594 FALSE FALSE +22 36946643 T G 0.01333137 FALSE FALSE +22 36954939 T C 0.01105894 FALSE FALSE +22 36998907 T C -0.0006084687 FALSE FALSE +22 37001495 G T -0.01224147 FALSE FALSE +22 37013167 G C 0.01866849 FALSE FALSE +22 37077364 C T 0.007294257 FALSE FALSE +22 37080738 C G -0.004873355 FALSE FALSE +22 37101890 C T 0.03991764 FALSE FALSE +22 37118535 A G -0.001713909 FALSE FALSE +22 37184521 G A 0.006515894 FALSE FALSE +22 37206341 G T 0.0002566936 FALSE FALSE +22 37256262 A G 0.001152626 FALSE FALSE +22 37258503 C T -0.009761102 FALSE FALSE +22 37323988 T C -0.0073182 FALSE FALSE +22 37329545 G A 0.005775806 FALSE FALSE +22 37337409 T C -0.02534399 FALSE FALSE +22 37343000 A C -0.0004011777 FALSE FALSE +22 37398195 T C -0.01001198 FALSE FALSE +22 37401532 A G 
-0.003244795 FALSE FALSE +22 37407109 C G 0.04335972 FALSE FALSE +22 37477732 T C 0.0003669548 FALSE FALSE +22 37507019 A G -0.0009259451 FALSE FALSE +22 37513316 A G 0.001153887 FALSE FALSE +22 37532441 A G 0.01802306 FALSE FALSE +22 37571497 G A -0.005785311 FALSE FALSE +22 37581383 T C 0.03172492 FALSE FALSE +22 37621269 C A 0.004460405 FALSE FALSE +22 37644621 T C -0.008386907 FALSE FALSE +22 37671896 A G 0.02303688 FALSE FALSE +22 37679763 G A -0.002658396 FALSE FALSE +22 37720268 G A 0.02120184 FALSE FALSE +22 37753256 C T 0.008984539 FALSE FALSE +22 37757099 G A -0.01560347 FALSE FALSE +22 37780522 C G -0.01496708 FALSE FALSE +22 37800175 T C -0.005510833 FALSE FALSE +22 37846448 G A 0.01152963 FALSE FALSE +22 37896749 C T 0.005447068 FALSE FALSE +22 37908435 C T 0.001909131 FALSE FALSE +22 37977481 T C 0.01465308 FALSE FALSE +22 37992699 G A 0.0008339179 FALSE FALSE +22 38032762 G GA 0.01693041 FALSE FALSE +22 38054262 C A 0.04354146 FALSE FALSE +22 38083101 C T -0.02092117 FALSE FALSE +22 38119213 A G 0.03948165 FALSE FALSE +22 38122122 C T 0.04377277 FALSE FALSE +22 38204089 T C 0.02977743 FALSE FALSE +22 38435786 T G -0.007684278 FALSE FALSE +22 38544298 G A 0.05090446 FALSE FALSE +22 38597378 T G -0.01997927 FALSE FALSE +22 38606780 G A -0.009182016 FALSE FALSE +22 38630272 C T 0.007393137 FALSE FALSE +22 38663819 G A -0.006392021 FALSE FALSE +22 38673234 A G -0.01106705 FALSE FALSE +22 38685131 C T -0.004493352 FALSE FALSE +22 38695406 T C -0.01155972 FALSE FALSE +22 38708506 A G 0.01701713 FALSE FALSE +22 38744184 C T -0.02112956 FALSE FALSE +22 38819613 A G -0.005625806 FALSE FALSE +22 38877461 G T 0.001108728 FALSE FALSE +22 38918894 G T -0.008094286 FALSE FALSE +22 38928269 G T -0.02114917 FALSE FALSE +22 39027286 C CAG 0.003840735 FALSE FALSE +22 39067524 G A 0.01200232 FALSE FALSE +22 39159201 C T 0.003096214 FALSE FALSE +22 39178701 G A 0.002148449 FALSE FALSE +22 39260032 T C 0.03574634 FALSE FALSE +22 39268785 T G 0.009377414 FALSE FALSE +22 
39281774 G T 0.03816951 FALSE FALSE +22 39300265 C T 0.03540156 FALSE FALSE +22 39332623 T C -0.004449842 FALSE FALSE +22 39415780 G A 0.01479946 FALSE FALSE +22 39448465 A G 0.003065974 FALSE FALSE +22 39480697 G A -0.04005617 FALSE FALSE +22 39487665 G A -0.0001218988 FALSE FALSE +22 39493294 C T -0.03115929 FALSE FALSE +22 39510995 G A -0.02069106 FALSE FALSE +22 39542292 A G 0.009653575 FALSE FALSE +22 39543000 T C -0.004069841 FALSE FALSE +22 39573724 A C 0.02683694 FALSE FALSE +22 39575692 A C 0.01451305 FALSE FALSE +22 39581277 A C 0.01766406 FALSE FALSE +22 39626572 A G -0.02901981 FALSE FALSE +22 39658626 C T 0.004177065 FALSE FALSE +22 39665395 G A 0.01264611 FALSE FALSE +22 39687484 G A 0.005418141 FALSE FALSE +22 39708279 A G -0.04281532 FALSE FALSE +22 39708357 T C 0.008605574 FALSE FALSE +22 39793066 G T 0.03658209 FALSE FALSE +22 39798127 G A 0.002302129 FALSE FALSE +22 39843409 T C 0.01065699 FALSE FALSE +22 39865475 G A 0.001588501 FALSE FALSE +22 39932516 A G -0.01179841 FALSE FALSE +22 39963426 G A -0.01503908 FALSE FALSE +22 40023636 C T 0.006443146 FALSE FALSE +22 40046176 C T -0.0007416552 FALSE FALSE +22 40067818 T C 0.00455936 FALSE FALSE +22 40092864 G A 0.02400297 FALSE FALSE +22 40127293 T C -0.0008870038 FALSE FALSE +22 40358148 T C -0.01079902 FALSE FALSE +22 40420786 G C -0.008092115 FALSE FALSE +22 40454069 G T 0.00789888 FALSE FALSE +22 40541981 G A 0.0174264 FALSE FALSE +22 40652873 G A 0.005853057 FALSE FALSE +22 40676672 G T -0.001894274 FALSE FALSE +22 40729614 G A 0.0195994 FALSE FALSE +22 40820151 C T -0.01628066 FALSE FALSE +22 40986372 G C -0.01983507 FALSE FALSE +22 41494925 A G -0.02918069 FALSE FALSE +22 41646738 G A 0.0003521847 FALSE FALSE +22 41680898 T C 0.01402732 FALSE FALSE +22 41704872 T C 6.681484E-05 FALSE FALSE +22 41791536 C T -5.572333E-05 FALSE FALSE +22 41895409 A G -0.04407217 FALSE FALSE +22 41929175 G T -0.03186844 FALSE FALSE +22 42089623 C T 0.00532234 FALSE FALSE +22 42095658 G T 0.03846131 FALSE FALSE 
+22 42210985 C T -0.00313971 FALSE FALSE +22 42279653 G A -0.006596336 FALSE FALSE +22 42341308 G A -0.0006862491 FALSE FALSE +22 42524243 C CT -0.01181191 FALSE FALSE +22 42672124 G A -0.005278171 FALSE FALSE +22 42691238 T C -0.01642396 FALSE FALSE +22 42813753 C T -0.00386775 FALSE FALSE +22 42867898 G A -0.001352327 FALSE FALSE +22 42912097 T C -0.0007295657 FALSE FALSE +22 42932317 A G -0.05768556 FALSE FALSE +22 43010817 A G 0.01722077 FALSE FALSE +22 43080028 T C -0.0005527551 FALSE FALSE +22 43096507 T C -0.005556102 FALSE FALSE +22 43112475 T C -0.01350273 FALSE FALSE +22 43114824 G A -0.01963192 FALSE FALSE +22 43115576 C T -0.01880097 FALSE FALSE +22 43154299 G A -0.001621113 FALSE FALSE +22 43159948 T C -0.007980584 FALSE FALSE +22 43206950 C A -0.005783037 FALSE FALSE +22 43218397 C T -0.003976636 FALSE FALSE +22 43283255 C A -0.01426668 FALSE FALSE +22 43290583 C T -0.03955775 FALSE FALSE +22 43333156 A G -0.03127845 FALSE FALSE +22 43426262 G A -0.00366804 FALSE FALSE +22 43483242 T C -0.02540203 FALSE FALSE +22 43515108 C T -0.01570749 FALSE FALSE +22 43529314 C G 0.01738127 FALSE FALSE +22 43551513 G A 0.02565386 FALSE FALSE +22 43558972 A G -0.01962819 FALSE FALSE +22 43577214 T C -0.02270478 FALSE FALSE +22 43579049 C T -0.001193909 FALSE FALSE +22 43610207 G A -0.007621661 FALSE FALSE +22 43623395 G C -0.04852519 FALSE FALSE +22 43640512 C T -0.005533207 FALSE FALSE +22 43649701 C T 0.07724845 FALSE FALSE +22 43661080 T C -0.04251741 FALSE FALSE +22 43683088 A G -0.003582388 FALSE FALSE +22 43707996 A G -0.02547044 FALSE FALSE +22 43711080 C G -0.005784446 FALSE FALSE +22 43721519 C A 0.000365885 FALSE FALSE +22 43729401 C T 0.008557013 FALSE FALSE +22 43763757 T G -0.0178981 FALSE FALSE +22 43836198 G T 0.002427697 FALSE FALSE +22 43976396 A G -0.01277457 FALSE FALSE +22 44031042 C T 0.003593107 FALSE FALSE +22 44193626 C A -0.006865434 FALSE FALSE +22 44221247 G A 0.01833991 FALSE FALSE +22 44296372 T C 0.006169212 FALSE FALSE +22 44298838 A G 
0.007441756 FALSE FALSE +22 44342116 G A 0.02810328 FALSE FALSE +22 44368122 G A 0.0129968 FALSE FALSE +22 44379838 G A 0.001648422 FALSE FALSE +22 44380033 C T -0.002136788 FALSE FALSE +22 44395451 C T -0.006698507 FALSE FALSE +22 44419871 C T 0.0181613 FALSE FALSE +22 44424108 T C 0.01036733 FALSE FALSE +22 44467899 C T -0.002592364 FALSE FALSE +22 44498134 T C 0.007281423 FALSE FALSE +22 44522312 C T -0.0002636447 FALSE FALSE +22 44526130 G A -0.00388298 FALSE FALSE +22 44530286 A G 0.02528159 FALSE FALSE +22 44530420 C T -0.01233654 FALSE FALSE +22 44548944 G A -0.003947209 FALSE FALSE +22 44551755 G A 0.01262458 FALSE FALSE +22 44566434 A G -0.004290306 FALSE FALSE +22 44581046 T C -0.0147995 FALSE FALSE +22 44643161 C T 0.01439493 FALSE FALSE +22 44677081 C T -0.01030513 FALSE FALSE +22 44681612 G A -0.001269762 FALSE FALSE +22 44695088 T C 0.006324859 FALSE FALSE +22 44707716 G T 0.002288939 FALSE FALSE +22 44725343 G A 0.003534678 FALSE FALSE +22 44738406 G A 0.02320049 FALSE FALSE +22 44746729 A G -0.01754216 FALSE FALSE +22 44751158 G A -0.006539695 FALSE FALSE +22 44757439 A G 0.02480295 FALSE FALSE +22 44759519 G A 0.002111274 FALSE FALSE +22 44761797 A T -0.00531172 FALSE FALSE +22 44763352 C G 0.01452737 FALSE FALSE +22 44783779 G A 0.009142699 FALSE FALSE +22 44791807 C T -0.02371876 FALSE FALSE +22 44818986 C T -0.006740622 FALSE FALSE +22 44894913 G A -5.179871E-05 FALSE FALSE +22 45058431 C T 0.01098259 FALSE FALSE +22 45066035 A G -0.01484374 FALSE FALSE +22 45069410 T C 0.01530441 FALSE FALSE +22 45081330 G A 0.00135012 FALSE FALSE +22 45082168 C A 0.003663354 FALSE FALSE +22 45090008 G A 0.002811861 FALSE FALSE +22 45116664 C T 0.01247728 FALSE FALSE +22 45244930 T C -0.01450041 FALSE FALSE +22 45258457 G A -0.003500519 FALSE FALSE +22 45323989 T C 0.001111338 FALSE FALSE +22 45415987 A G -0.01398184 FALSE FALSE +22 45451355 G A -0.005566982 FALSE FALSE +22 45471607 C T 0.01148978 FALSE FALSE +22 45497738 C T -0.005029327 FALSE FALSE +22 
45502829 C T -0.03893521 FALSE FALSE +22 45519040 T G 0.002377071 FALSE FALSE +22 45523391 A G 0.01318997 FALSE FALSE +22 45573450 C A 0.0043856 FALSE FALSE +22 45589490 G A -0.008350439 FALSE FALSE +22 45668012 T C 0.01286879 FALSE FALSE +22 45671343 G A -2.940682E-06 FALSE FALSE +22 45672574 T C 0.005743608 FALSE FALSE +22 45693923 A G -0.002675069 FALSE FALSE +22 45718743 G A -0.02092804 FALSE FALSE +22 45723807 C G 0.001670159 FALSE FALSE +22 45728370 A G 0.0001879231 FALSE FALSE +22 45741537 G T 0.01420045 FALSE FALSE +22 45749983 T G -0.04591012 FALSE FALSE +22 45809624 A C 0.002185772 FALSE FALSE +22 45821935 A G 0.02250782 FALSE FALSE +22 45837410 G A -0.002756449 FALSE FALSE +22 45846371 T C 0.07910102 FALSE FALSE +22 45864934 T C 0.008535181 FALSE FALSE +22 45871507 G C -0.007764056 FALSE FALSE +22 45892656 G T -0.003885653 FALSE FALSE +22 45897997 C T 0.0003935204 FALSE FALSE +22 45929577 C T -0.02532217 FALSE FALSE +22 45936350 A G -0.008001698 FALSE FALSE +22 45942726 T G -0.01415551 FALSE FALSE +22 45996298 G A 0.05643525 FALSE FALSE +22 46009063 G A 0.006464843 FALSE FALSE +22 46022070 G A 0.0224674 FALSE FALSE +22 46155548 G C -0.0324747 FALSE FALSE +22 46207955 C T -0.001354554 FALSE FALSE +22 46236425 A G 0.08398423 FALSE FALSE +22 46275529 T C 0.0022643 FALSE FALSE +22 46287720 A G -0.02237482 FALSE FALSE +22 46289699 T C 0.01872124 FALSE FALSE +22 46303347 T C -0.01283734 FALSE FALSE +22 46316057 A G 0.02312579 FALSE FALSE +22 46337043 G C 0.01701173 FALSE FALSE +22 46347519 C T 0.01574289 FALSE FALSE +22 46364161 A G -0.04466341 FALSE FALSE +22 46381234 G A 0.04730559 FALSE FALSE +22 46396925 G A 0.001783944 FALSE FALSE +22 46403715 A G -0.02132589 FALSE FALSE +22 46406782 A C 0.08439466 FALSE FALSE +22 46445002 G C -0.07613496 FALSE FALSE +22 46458123 G T 0.03328073 FALSE FALSE +22 46482948 C T 0.04241879 FALSE FALSE +22 46486508 C T -0.00968439 FALSE FALSE +22 46493852 T C -0.00675858 FALSE FALSE +22 46499120 C G -0.009873118 FALSE FALSE +22 
46502870 T C -0.0179214 FALSE FALSE +22 46561713 G A 0.02604703 FALSE FALSE +22 46586110 A G -0.001256735 FALSE FALSE +22 46592168 C T 0.01417055 FALSE FALSE +22 46614274 G C -0.05854014 FALSE FALSE +22 46627603 T C 0.08004024 FALSE FALSE +22 46760086 T C 0.003229515 FALSE FALSE +22 46782382 T C -0.02470821 FALSE FALSE +22 46807234 C T 0.002324176 FALSE FALSE +22 46837114 G A 0.000944073 FALSE FALSE +22 46888399 T C 0.009911095 FALSE FALSE +22 46907779 G A 0.00653144 FALSE FALSE +22 46909355 T G -0.004780494 FALSE FALSE +22 46914277 A C 0.009689535 FALSE FALSE +22 46943687 G A -0.0130366 FALSE FALSE +22 46985917 A G 0.01893397 FALSE FALSE +22 47021226 G A -0.01322949 FALSE FALSE +22 47095235 A C -0.1156013 FALSE FALSE +22 47109621 C T 0.0004322858 FALSE FALSE +22 47125474 G A -0.01746025 FALSE FALSE +22 47147117 T C -0.02418349 FALSE FALSE +22 47156703 C T 0.0262897 FALSE FALSE +22 47245836 A G 0.001880575 FALSE FALSE +22 47271747 C T 0.001055264 FALSE FALSE +22 47301822 C T 0.003032158 FALSE FALSE +22 47345487 T C -0.002945945 FALSE FALSE +22 47372368 T C 0.02067644 FALSE FALSE +22 47380606 C T 0.04041426 FALSE FALSE +22 47437808 C T 0.001683027 FALSE FALSE +22 47450911 A G 0.01624479 FALSE FALSE +22 47511864 A C -0.004226735 FALSE FALSE +22 47519476 T C -0.003954111 FALSE FALSE +22 47529458 A G -0.0003602848 FALSE FALSE +22 47531320 T C -0.006899703 FALSE FALSE +22 47548321 T C 0.004925401 FALSE FALSE +22 47568291 C T 0.007726693 FALSE FALSE +22 47571203 A G -0.009744751 FALSE FALSE +22 47574009 C T -0.00532701 FALSE FALSE +22 47642100 T C 0.006976251 FALSE FALSE +22 47657635 T C 0.001798943 FALSE FALSE +22 47683805 C T -0.03475544 FALSE FALSE +22 47720973 T C -0.007868172 FALSE FALSE +22 47821952 G A -0.000885428 FALSE FALSE +22 47893053 A G -0.02449056 FALSE FALSE +22 47935365 C T -0.001599879 FALSE FALSE +22 47961708 G T -0.003593525 FALSE FALSE +22 47986332 T C -0.003976592 FALSE FALSE +22 48154645 C T 0.007608639 FALSE FALSE +22 48165452 C CT 0.002039503 
FALSE FALSE +22 48207318 T C -0.009725168 FALSE FALSE +22 48213904 G C -0.01220367 FALSE FALSE +22 48215904 A G -2.488244E-05 FALSE FALSE +22 48220460 T C -0.002702163 FALSE FALSE +22 48230941 C A -0.001129522 FALSE FALSE +22 48271961 A G -0.005053446 FALSE FALSE +22 48284025 T C -0.003344182 FALSE FALSE +22 48297953 C T -0.01046958 FALSE FALSE +22 48362290 G A -0.02367254 FALSE FALSE +22 48362914 C A -0.003167719 FALSE FALSE +22 48387670 A G -0.008243989 FALSE FALSE +22 48415446 C T 0.002130715 FALSE FALSE +22 48460730 T C 0.002682476 FALSE FALSE +22 48491160 T C 0.001257794 FALSE FALSE +22 48519794 C T 0.003680757 FALSE FALSE +22 48537775 G A 0.002134692 FALSE FALSE +22 48543566 T C 0.007314089 FALSE FALSE +22 48593037 C T 0.009084708 FALSE FALSE +22 48687509 C T -0.0277196 FALSE FALSE +22 48692033 T C -0.02126264 FALSE FALSE +22 48699617 T C 0.0005093107 FALSE FALSE +22 48717568 T C -0.0008190281 FALSE FALSE +22 48811946 C T 0.007916515 FALSE FALSE +22 48823357 G A 0.01464317 FALSE FALSE +22 48840428 A C 0.003711229 FALSE FALSE +22 48851612 T C -0.005887765 FALSE FALSE +22 48874310 T C -0.01106607 FALSE FALSE +22 48968070 C T 0.01280691 FALSE FALSE +22 48991385 T C -0.01234119 FALSE FALSE +22 49004050 G A 0.02290755 FALSE FALSE +22 49014565 A G 0.001555565 FALSE FALSE +22 49086481 T C -0.006196369 FALSE FALSE +22 49107173 T C 0.01277272 FALSE FALSE +22 49180915 A G 0.006346977 FALSE FALSE +22 49262579 A G 0.02657134 FALSE FALSE +22 49270317 C T 0.001447665 FALSE FALSE +22 49313196 A G -0.007055532 FALSE FALSE +22 49335230 T C -0.006548281 FALSE FALSE +22 49366123 T C 0.01136486 FALSE FALSE +22 49372356 G C -0.02420841 FALSE FALSE +22 49443666 T C 0.01581736 FALSE FALSE +22 49496835 G A -0.01355414 FALSE FALSE +22 49524428 A G -0.004228482 FALSE FALSE +22 49530553 G C 0.008197389 FALSE FALSE +22 49537845 T C 0.0111255 FALSE FALSE +22 49557457 G A 0.009401926 FALSE FALSE +22 49562666 C A 0.01271701 FALSE FALSE +22 49574509 C T 0.0004703177 FALSE FALSE +22 49579141 
A G 0.02448619 FALSE FALSE +22 49650863 T C 0.006739571 FALSE FALSE +22 49662549 T G -0.005769464 FALSE FALSE +22 49665841 T C -0.0007037069 FALSE FALSE +22 49677464 A G -0.02177735 FALSE FALSE +22 49696067 C T -0.003309682 FALSE FALSE +22 49700272 T G -0.002541948 FALSE FALSE +22 49706433 T C -0.01719402 FALSE FALSE +22 49713835 G A -0.01370754 FALSE FALSE +22 49719264 A C -0.01067852 FALSE FALSE +22 49743627 G A -0.0005970581 FALSE FALSE +22 49800265 C T 0.03098582 FALSE FALSE +22 49806863 A G 0.003940447 FALSE FALSE +22 49830851 C T -0.002742706 FALSE FALSE +22 49834624 G A -0.002820163 FALSE FALSE +22 49843235 G C -0.0004458281 FALSE FALSE +22 49847501 T G 0.002235016 FALSE FALSE +22 49861033 C T 0.01721243 FALSE FALSE +22 49881321 A G -0.00051278 FALSE FALSE +22 49908804 G A -0.009455892 FALSE FALSE +22 49911222 G T -0.01389666 FALSE FALSE +22 49925268 A G 0.01679984 FALSE FALSE +22 49927332 T C 0.00039298 FALSE FALSE +22 50109212 T C 0.01610819 FALSE FALSE +22 50118149 G C 0.007024666 FALSE FALSE +22 50184484 G T 0.01222581 FALSE FALSE +22 50219447 T C 0.05091891 FALSE FALSE +22 50278568 G A -0.02340672 FALSE FALSE +22 50319170 G A 0.01669806 FALSE FALSE +22 50350971 A G 0.0264016 FALSE FALSE +22 50356693 C T 0.003851499 FALSE FALSE +22 50435480 G A 0.0166363 FALSE FALSE +22 50439626 A G -0.002722154 FALSE FALSE +22 50466542 C T -0.002560094 FALSE FALSE +22 50470516 T C -0.01621986 FALSE FALSE +22 50491150 G A 0.01828674 FALSE FALSE +22 50515270 C T 0.01439904 FALSE FALSE +22 50529850 C T 0.02054628 FALSE FALSE +22 50570755 C G 0.007077514 FALSE FALSE +22 50582626 G A -0.003588854 FALSE FALSE +22 50672154 A G 0.007660848 FALSE FALSE +22 50722134 C T -0.01747164 FALSE FALSE +22 50722408 C T -0.001063465 FALSE FALSE +22 50728062 C T 0.02159223 FALSE FALSE +22 50750481 T C 0.01877272 FALSE FALSE +22 50758873 T C 0.004001731 FALSE FALSE +22 50835040 A G -0.006374259 FALSE FALSE +22 50859049 C T 0.0003480749 FALSE FALSE +22 50885775 G A -0.01358311 FALSE FALSE +22 
50926768 T C 0.001798498 FALSE FALSE +22 50928026 A G 0.004775504 FALSE FALSE +22 50971266 C T 0.02160893 FALSE FALSE +22 50989197 T C -0.01328884 FALSE FALSE +22 50989326 G A 0.01037054 FALSE FALSE +22 50999681 G A -0.01226224 FALSE FALSE +22 51046163 T C -0.02754002 FALSE FALSE +22 51117580 C T 0.03573542 FALSE FALSE +22 51171497 A G -0.01951606 FALSE FALSE +22 51174939 T C -0.006178519 FALSE FALSE diff --git a/tests/test_combine.py b/tests/test_combine.py index db92cc9..bc82faf 100644 --- a/tests/test_combine.py +++ b/tests/test_combine.py @@ -1,39 +1,136 @@ +import csv +import importlib.resources +import json from unittest.mock import patch -import jq -import pandas as pd import pytest -from pgscatalog_utils.download.Catalog import CatalogQuery, CatalogResult -from pgscatalog_utils.download.CatalogCategory import CatalogCategory from pgscatalog_utils.scorefile.combine_scorefiles import combine_scorefiles +from tests.data import combine -def test_combine_scorefiles(combined_scorefile, _n_variants): - df = pd.read_table(combined_scorefile) - cols = {'chr_name', 'chr_position', 'effect_allele', 'other_allele', 'effect_weight', 'effect_type', - 'is_duplicated', 'accession', 'row_nr'} - assert set(df.columns).issubset(cols) - assert df.shape[0] == _n_variants +def test_pgscatalog_combine(pgscatalog_path, tmp_path, combine_output_header): + out_path = tmp_path / "combined.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(pgscatalog_path)] + + ["-o", str(out_path.resolve())] + ) + with patch("sys.argv", args): + combine_scorefiles() -def test_liftover(lifted_scorefiles): - df = pd.read_table(lifted_scorefiles) - assert df.shape[0] == 832 # approx size + n = -1 # skip header line + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + assert not set(cols).difference(set(combine_output_header)) + n += 1 + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] 
+ assert header["PGS001229_22"]["pgs_id"] == "PGS001229" + assert header["PGS001229_22"]["pgs_name"] == "GBE_INI50" + assert header["PGS001229_22"]["genome_build"] == "GRCh37" + assert int(header["PGS001229_22"]["variants_number"]) == n + assert not header["PGS001229_22"]["use_harmonised"] -def test_fail_combine(scorefiles, tmp_path_factory): + +def test_effect_type_combine(effect_type_path, tmp_path, combine_output_header): + # this scoring file is in build GRCh37, matching the -t GRCh37 target, so combining should succeed + out_path = tmp_path / "combined.txt" + args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(effect_type_path)] + + ["-o", str(out_path.resolve())] + ) + with patch("sys.argv", args): + combine_scorefiles() + + with open(out_path) as f: + n = 0 + for line in csv.DictReader(f, delimiter="\t"): + cols = list(line.keys()) + + if int(line["row_nr"]) == 0: + assert line["effect_type"] == "dominant" + + if int(line["row_nr"]) == 1: + assert line["effect_type"] == "recessive" + + n += 1 + + assert not set(cols).difference(set(combine_output_header)) + + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert ( + header["scorefile_dominant_and_recessive"]["pgs_name"] + == "PGS001229_22_DominantRecessiveExample" + ) + assert header["scorefile_dominant_and_recessive"]["genome_build"] == "GRCh37" + assert header["scorefile_dominant_and_recessive"]["variants_number"] == n + assert not header["scorefile_dominant_and_recessive"]["use_harmonised"] + + +def test_custom_combine(custom_score_path, tmp_path, combine_output_header): # these genomes are in build GRCh37, so combining with -t GRCh38 will raise an exception - with pytest.raises(Exception): - out_path = tmp_path_factory.mktemp("scores") / "combined.txt" - args: list[str] = ['combine_scorefiles', '-t', 'GRCh38', '-s'] + scorefiles + ['-o', str(out_path.resolve())] - with patch('sys.argv', args): - combine_scorefiles() + out_path = tmp_path / "combined.txt"
+ args: list[str] = ( + ["combine_scorefiles", "-t", "GRCh37", "-s"] + + [str(custom_score_path)] + + ["-o", str(out_path.resolve())] + ) + + with patch("sys.argv", args): + combine_scorefiles() + + # read combined file + n = -1 # skip header line + with open(out_path) as f: + for i, line in enumerate(f): + if i == 0: + cols = line.strip().split("\t") + assert not set(cols).difference(set(combine_output_header)) + n += 1 + + with open(out_path.parent / "log_combined.json") as f: + header = json.load(f)[0] + assert header["scorefile"]["pgs_name"] == "PGS001229_22" + assert header["scorefile"]["genome_build"] == "GRCh37" + assert header["scorefile"]["variants_number"] == n + assert not header["scorefile"]["use_harmonised"] + + +@pytest.fixture +def pgscatalog_path(scope="session"): + path = importlib.resources.files(combine) / "PGS001229_22.txt" + return path @pytest.fixture -def _n_variants(pgs_accessions): - result = CatalogQuery(CatalogCategory.SCORE, accession=pgs_accessions).get()[0] - json = result.response - n: list[int] = jq.compile("[.results][][].variants_number").input(json).all() - return sum(n) +def effect_type_path(scope="session"): + path = importlib.resources.files(combine) / "scorefile_dominant_and_recessive.txt" + return path + + +@pytest.fixture(scope="session") +def custom_score_path(tmp_path_factory): + path = importlib.resources.files(combine) / "scorefile.txt" + return path + + +@pytest.fixture(scope="session") +def combine_output_header(): + return [ + "chr_name", + "chr_position", + "effect_allele", + "other_allele", + "effect_weight", + "effect_type", + "is_duplicated", + "accession", + "row_nr", + ] diff --git a/tests/test_liftover.py b/tests/test_liftover.py index b2f03a0..396c8f8 100644 --- a/tests/test_liftover.py +++ b/tests/test_liftover.py @@ -1,9 +1,39 @@ -import pandas as pd +import copy -from pgscatalog_utils.scorefile.liftover import liftover +from pgscatalog_utils.scorefile.config import Config + +from 
pgscatalog_utils.download.GenomeBuild import GenomeBuild +from pgscatalog_utils.scorefile.liftover import liftover, create_liftover def test_liftover(hg38_coords, hg19_coords, chain_files): - lifted = liftover(hg38_coords, chain_files, min_lift=0.9, target_build='GRCh37') - coords: pd.DataFrame = hg19_coords[['lifted_pos', 'lifted_chr']] == lifted[['lifted_pos', 'lifted_chr']] - assert coords.all(axis=None) + Config.chain_dir = chain_files + Config.lo = create_liftover() + Config.min_lift = 0.95 + hg38 = list(hg38_coords) + hg19 = list(hg19_coords) + hg19_ = copy.deepcopy(hg19) + hg38_ = copy.deepcopy(hg38) + lifted = list( + liftover( + (x for x in hg38), + harmonised=False, + current_build=GenomeBuild.GRCh38, + target_build=GenomeBuild.GRCh37, + ) + ) + + assert [x.chr_position for x in lifted] == [x.chr_position for x in hg19_] + assert [x.chr_name for x in lifted] == [x.chr_name for x in hg19_] + + hg19 = copy.deepcopy(hg19) + lift_back = list( + liftover( + (x for x in hg19), + harmonised=False, + current_build=GenomeBuild.GRCh37, + target_build=GenomeBuild.GRCh38, + ) + ) + assert [x.chr_position for x in lift_back] == [x.chr_position for x in hg38_] + assert [x.chr_name for x in lift_back] == [x.chr_name for x in hg38_] diff --git a/tests/test_samplesheet.py b/tests/test_samplesheet.py deleted file mode 100644 index 80d77db..0000000 --- a/tests/test_samplesheet.py +++ /dev/null @@ -1,101 +0,0 @@ -import json -import os -from pathlib import Path -from unittest.mock import patch - -import pandas as pd -import pytest - -from pgscatalog_utils.samplesheet.check import check_samplesheet - - -@pytest.fixture -def existing_vcf_prefix(tmp_path): - vcf_path = tmp_path / "test.vcf.gz" - _touch(vcf_path) - return str(vcf_path.parent.joinpath(Path(vcf_path.stem).stem)) - - -@pytest.fixture -def samplesheet_df(existing_vcf_prefix): - return pd.DataFrame( - {"path_prefix": [existing_vcf_prefix], "format": ["vcf"], "sampleset": ["test"], "chrom": [None]}) - - 
-@pytest.fixture -def good_samplesheet(samplesheet_df, tmp_path): - path = tmp_path / "good_samplesheet.csv" - samplesheet_df.to_csv(path, index=False) - return str(path) - - -@pytest.fixture -def bad_samplesheet(samplesheet_df, tmp_path): - path = tmp_path / "bad_samplesheet.csv" - bad_df = samplesheet_df.copy() - bad_df['path_prefix'] = 'bad_path' # path doesn't exist - bad_df.to_csv(path, index=False) - return str(path) - - -@pytest.fixture -def multi_samplesets(samplesheet_df, tmp_path): - path = tmp_path / "multi_samplesets.csv" - multi_samplesets = pd.concat([samplesheet_df, samplesheet_df], ignore_index=True) - multi_samplesets.loc[multi_samplesets.index == 1, 'sampleset'] = 'a_different_name' - multi_samplesets.to_csv(path, index=False) - return str(path) - - -@pytest.fixture -def vcf_dosage(samplesheet_df, tmp_path): - path = tmp_path / "vcf_dosage.csv" - dosage_samplesheet = samplesheet_df.copy() - dosage_samplesheet["vcf_genotype_field"] = ["DS"] - dosage_samplesheet.to_csv(path, index=False) - return str(path) - - -def _touch(fname): - if os.path.exists(fname): - os.utime(fname, None) - else: - open(fname, 'a').close() - - -def test_good_samplesheet(good_samplesheet, tmp_path): - out_path = str(tmp_path / "out.json") - args = ['samplesheet_to_json', good_samplesheet, out_path] - with patch('sys.argv', args): - check_samplesheet() - - assert os.path.exists(out_path), "No file written" - - -def test_bad_samplesheet(bad_samplesheet, tmp_path): - out_path = str(tmp_path / "out.json") - args = ['samplesheet_to_json', bad_samplesheet, out_path] - with patch('sys.argv', args): - with pytest.raises(FileNotFoundError): - check_samplesheet() - - -def test_multi_samplesets(multi_samplesets, tmp_path): - out_path = str(tmp_path / "out.json") - args = ['samplesheet_to_json', multi_samplesets, out_path] - with patch('sys.argv', args): - with pytest.raises(Exception, match="Multiple samplesets"): - check_samplesheet() - - -def test_dosage_samplesheet(vcf_dosage, 
tmp_path): - out_path = str(tmp_path / "out.json") - args = ['samplesheet_to_json', vcf_dosage, out_path] - with patch('sys.argv', args): - check_samplesheet() - - assert os.path.exists(out_path), "Missing output file" - - with open(out_path, 'r') as f: - converted = json.loads(f.read()) - assert converted[0]['vcf_import_dosage'], "Not importing dosage correctly"