diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 0000000..e9e85c5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,33 @@ +--- +name: Bug Report +about: Report an error or unexpected behavior. +title: "" +labels: bug +assignees: "" +--- + +### Bug Description + + + +### Steps to Reproduce + + + +1. +2. +3. + + + +```shell +# insert reprex here +``` + +### Expected Behavior + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..e2c9c12 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: true +contact_links: + - name: 💬 MIPTools Google Group + url: https://groups.google.com/g/miptools + about: Please ask and answer questions here. diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 0000000..5d8a59e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,35 @@ +--- +name: Feature Request +about: Suggest an idea or feature. +title: "" +labels: "feature :sparkles:" +assignees: "" +--- + +### Related Problem + + + +### Solution Requested + + + + diff --git a/.github/ISSUE_TEMPLATE/maintenance.md b/.github/ISSUE_TEMPLATE/maintenance.md new file mode 100644 index 0000000..593eaa7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/maintenance.md @@ -0,0 +1,18 @@ +--- +name: Maintenance +about: Request code maintenance, clarity improvements, styling fixes, etc. 
+title: "" +labels: "maintenance :hammer_and_wrench:" +assignees: "" +--- + +### Maintenance Request + + + +#### Code Example + + diff --git a/.github/workflows/build-container.sh b/.github/workflows/build-container.sh new file mode 100755 index 0000000..37fd46f --- /dev/null +++ b/.github/workflows/build-container.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Build container +sudo singularity build --force miptools_dev.sif MIPTools.def + +# Upload to website +mv -f miptools_dev.sif /work/bailey_share/software/MIPTools/download/ diff --git a/.github/workflows/notify-maintainers.yaml b/.github/workflows/notify-maintainers.yaml index 1d62b96..ee6ac13 100644 --- a/.github/workflows/notify-maintainers.yaml +++ b/.github/workflows/notify-maintainers.yaml @@ -1,7 +1,7 @@ name: Notify Maintainers on: - schedule: - - cron: 0 12 1,15 * * + # schedule: + # - cron: 0 12 1 * * workflow_dispatch: diff --git a/.github/workflows/notify-message.md b/.github/workflows/notify-message.md index 002e5cd..697f425 100644 --- a/.github/workflows/notify-message.md +++ b/.github/workflows/notify-message.md @@ -1,15 +1,9 @@ -Please build the container and deploy it to the Sylabs Cloud. It has been two weeks since the last reminder. +Please build the container and upload the container to the website. It has been one month since the last reminder. -Steps to build: +Build and deploy: ```bash -sudo singularity build miptools.sif MIPTools.def -``` - -Steps to deploy: - -```bash -singularity remote login -singularity sign miptools.sif -singularity push miptools.sif library://apascha1/miptools/miptools:{tag} +gh repo clone bailey-lab/MIPTools +cd MIPTools +.github/workflows/build-container.sh ``` diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 3e5a5af..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,22 +0,0 @@ -# MIPTools Changelog - -## MIPTools (development version) - -- Automatically build and deploy container using Github Actions (@arisp99, #11). 
-- Fix build failure due to dependency changes (#7). - -### Maintenance - -- Remove duplicated files. -- Improve bash errors. -- Make strings human readable (@arisp99, #5). - -### Documentation Overhaul - -- Add doc-strings to python functions. -- Improve clarity of README and add additional instructions on downloading or - building the container. - -## MIPTools 1.0.0 - -- First major release. diff --git a/MIPTools-dev.def b/MIPTools-dev.def deleted file mode 100644 index f9b32b5..0000000 --- a/MIPTools-dev.def +++ /dev/null @@ -1,338 +0,0 @@ -Bootstrap: docker -From: amd64/ubuntu:20.04 - -%post - # set number of cpus to use in build - CPU_COUNT=20 - # set build environment - export DEBIAN_FRONTEND=noninteractive \ - CONDA_DIR=/opt/conda \ - SHELL=/bin/bash \ - LANG=en_US.UTF-8 \ - LANGUAGE=en_US.UTF-8 \ - LC_ALL=en_US.UTF-8 \ - MINICONDA_VERSION=4.8.3 - export PATH=$CONDA_DIR/bin:$PATH - - # install system packages - apt-get update \ - && apt-get -yq dist-upgrade \ - && apt-get install -yq --no-install-recommends \ - wget \ - bzip2 \ - ca-certificates \ - sudo \ - locales \ - fonts-liberation \ - fonts-dejavu \ - git \ - build-essential \ - gcc \ - openssh-client \ - nano \ - libtbb-dev \ - libz-dev \ - libxrender1 \ - cmake \ - automake \ - autoconf \ - rsync \ - pigz \ - perl-tk \ - less \ - software-properties-common \ - libxext6 \ - libxrender1 \ - ghostscript \ - openjdk-11-jdk \ - liblzma-dev \ - libbz2-dev \ - libssl-dev \ - libcurl4-gnutls-dev \ - alien \ - unzip \ - tree \ - pandoc - - # set environment locale - echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen - echo "LANG=en_US.UTF-8" > /etc/locale.conf - echo "LC_ALL=en_US.UTF-8" >> /etc/environment - echo "LANGUAGE=en_US.UTF-8" >> /etc/environment - locale-gen en_US.UTF-8 - update-locale LANG=en_US.UTF-8 - - # install bcl2fastq, if the file is there - cd /opt/programs - unzip bcl2fastq2*.zip || true - alien bcl2fastq2*.rpm || true - dpkg -i bcl2fastq2*.deb || true - - # install msa2vcf - cd /opt/programs - 
git clone https://github.com/lindenb/jvarkit.git - cd jvarkit - ./gradlew msa2vcf - - # install conda - cd /tmp && \ - wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - echo "d63adf39f2c220950a063e0529d4ff74 *Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \ - /bin/bash Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ - rm Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - $CONDA_DIR/bin/conda config --add channels defaults && \ - $CONDA_DIR/bin/conda config --add channels bioconda && \ - $CONDA_DIR/bin/conda config --add channels conda-forge && \ - $CONDA_DIR/bin/conda config --add channels r && \ - $CONDA_DIR/bin/conda config --system --set show_channel_urls true && \ - $CONDA_DIR/bin/conda install --quiet --yes conda="${MINICONDA_VERSION%.*}.*" && \ - conda clean -tipsy - - # install mamba - conda install mamba -c conda-forge - - # install conda packages using mamba - mamba install -qy\ - "gxx_linux-64" \ - "python" \ - "notebook" \ - "nbconvert" \ - "jupyter_contrib_nbextensions" \ - "xlrd" \ - "bcftools" \ - "samtools" \ - "vcftools" \ - "htslib" \ - "bwa" \ - "bowtie2" \ - "primer3" \ - "primer3-py" \ - "numpy" \ - "scipy" \ - "pysam" \ - "pandas" \ - "matplotlib" \ - "seaborn" \ - "scikit-learn" \ - "scandir" \ - "openpyxl" \ - "simplegeneric" \ - "matplotlib-venn" \ - "tblib" \ - "parallel" \ - "scikit-allel" \ - "bioconductor-dnacopy" \ - "basemap-data-hires" \ - "seqtk=1.3" \ - "freebayes" \ - "lastz" \ - "plotly" \ - "texlive-core" - - # install vt variant tool set - cd /opt/programs - git clone https://github.com/atks/vt.git - cd vt - git checkout 0.577 - make -j $CPU_COUNT - scp vt /opt/bin - - # install parasight - scp /opt/programs/parasight_v7.6/parasight.pl /opt/bin/parasight76.pl - - # add executable flag to executables - chmod -R +xr /usr/bin - chmod -R +xr /opt/bin - - - # create work and resources directories in /opt - mkdir 
/opt/work \ - /opt/project_resources \ - /opt/species_resources \ - /opt/data \ - /opt/analysis \ - /opt/host_species \ - /opt/extras - - -%files - programs /opt - bin /opt - src /opt - base_resources/ /opt/resources - -%environment - path=/opt/bin:/opt/conda/bin:/opt/programs/MIPWrangler/bin: - path=$path/opt/programs/elucidator/bin:/opt/programs/gatk: - path=$path$PATH - export PATH=$path - export XDG_RUNTIME_DIR="" - export DEBIAN_FRONTEND=noninteractive - export LANG=en_US.UTF-8 - export LANGUAduGE="en_US.UTF-8" - export LC_ALL="en_US.UTF-8" - -%apprun jupyter - set -e - set -u - nb_port=$(shuf -i 8000-9999 -n 1) - server_ip=$(hostname -i) - server_user=$(whoami)@$(hostname -f) - nb_dir=/opt - while getopts p:d: OPT; do - case "$OPT" in - p) - nb_port="$OPTARG";; - d) - nb_dir="$OPTARG";; - *) - echo "Invalid option. Use -p to specify notebook port \ - -d to specify notebook directory." - esac - done - rsync /opt/resources/*.ipynb /opt/analysis --ignore-existing \ - --ignore-missing-args - port_fw="Use the following command if you are running this notebook from " - port_fw=$port_fw"a remote server. Ignore if using a local computer." 
- echo $port_fw - port_fw="ssh -N -f -L localhost:$nb_port:$server_ip:$nb_port $server_user" - echo $port_fw - - jupyter nbextension enable plotlywidget/extension - jupyter nbextension enable toc2/main - jupyter nbextension enable codefolding/main - jupyter nbextension enable highlighter/highlighter - jupyter nbextension enable keyboard_shortcut_editor/main - jupyter nbextension enable spellchecker/main - - jupyter notebook --notebook-dir=$nb_dir --ip=$server_ip \ - --port=$nb_port --no-browser - -%apprun wrangler - set -e - set -u - # set defaults - cluster_script="runMIPWranglerCurrent.sh" - server_number=1 - cpu_count=1 - min_capture_length="none" - stitch_options="none" - keep_files="" - while getopts p:l:e:s:w:n:c:x:m:k OPT; do - case "$OPT" in - e) - experiment_id="$OPTARG";; - l) - sample_list="$OPTARG";; - p) - probe_sets="$OPTARG";; - s) - sample_sets="$OPTARG";; - w) - cluster_script="$OPTARG";; - n) - server_number="$OPTARG";; - c) - cpu_count="$OPTARG";; - x) - stitch_options="$OPTARG";; - k) - keep_files="-k";; - - m) - min_capture_length="$OPTARG";; - - *) - echo "Invalid option. Use 'wrangler \ - -e experiment_id -l sample_list.file -p probe_sets\ - -s sample_sets -w cluster_script -n server_number \ - -c cpu_count -x stitch_options' \ - -m min_capture_length [-k]" - esac - done - python /opt/src/generate_wrangler_scripts.py \ - -e $experiment_id -l /opt/analysis/$sample_list \ - -p $probe_sets -s $sample_sets -w $cluster_script -n $server_number \ - -c $cpu_count -x $stitch_options -m $min_capture_length $keep_files - . /opt/analysis/wrangle.sh - -%apprun download - set -e - set -u - while getopts r: opt; do - case $opt in - r) run_id=$OPTARG;; - ?) echo "Usage: singularity run --app download \\" - echo " -B /path_to_output_dir:/opt/analysis \\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " mycontainer.sif -r my_Illumina_run_ID" - echo "An 'access_token.txt' file with a valid access token is " - echo "required. 
It must be present in base_resources directory." - echo "A data directory where the data will be downloaded to" - echo "must be mounted to /opt/data." - exit 1;; - esac - done - echo "Downloading NextSeq run $run_id from BaseSpace." - echo "Depending on the data size, this can take very long (up to 10 h)" - echo "It is recommended to run this app in a screen (GNU screen)." - echo "A message indicating the end of download will be printed when done." - echo "Check nohup.out file in your output directory for the download log." - cd /opt/analysis - nohup python /opt/bin/BaseSpaceRunDownloader_v2.py \ - -r $run_id -a "$(cat /opt/resources/access_token.txt)" - echo "Download finished." - -%apprun demux - set -e - set -u - while getopts s:p: opt; do - case $opt in - s) sample_list=$OPTARG;; - p) platform=$OPTARG;; - ?) echo "Usage: singularity run --app demux \\" - echo " -B /path_to_run_dir:/opt/data \\" - echo " -B /path_to_output_dir:/opt/analysis \\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " mycontainer.sif -s sample_list_file \\" - echo " -p sequencing_platform (nextseq or miseq) \\" - echo "The sample list file must be present in the output" - echo "directory mounted to /opt/analysis." - exit 1;; - esac - done - # create a sample sheet for demultiplexing - cd /opt/src - template_dir="/opt/resources/templates/sample_sheet_templates/" - platform_template="$platform"_sample_sheet_template.csv - template="$template_dir$platform_template" - bc_dict="/opt/resources/barcode_dict.json" - output_dir="/opt/analysis" - sample_list="/opt/analysis/$sample_list" - python -c 'import mip_functions as mip; mip.generate_sample_sheet( - "'"$sample_list"'", "'"$bc_dict"'", "'"$template"'", "'"$platform"'", - "'"$output_dir"'")' - # cd to where bcl files are. - cd /opt/data - # create a fastq directory for saving fastqs - mkdir -p /opt/analysis/fastq - # increase limit of open number of files. 
- ulimit -Sn $(ulimit -Hn) - nohup bcl2fastq -o /opt/analysis/fastq \ - --sample-sheet /opt/analysis/SampleSheet.csv \ - --no-lane-splitting - -%apprun demux_qc - set -e - set -u - while getopts p: opt; do - case $opt in - p) platform=$OPTARG;; - ?) echo "Usage: singularity run --app demux_qc\\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " -B /path_to_fastq_dir:/opt/analysis " - echo " mycontainer.sif -p sequencing_platform" - exit 1;; - esac - done - python /opt/src/demux_qc.py -p $platform diff --git a/MIPTools.def b/MIPTools.def index d3810f3..1bac82e 100644 --- a/MIPTools.def +++ b/MIPTools.def @@ -6,7 +6,7 @@ From: amd64/ubuntu:20.04 ################################################################## %labels Author Bailey Lab - Version v1.0.0.9000 + Version v0.4.0.9000 ################################################################## ## Post Section ## @@ -17,13 +17,10 @@ From: amd64/ubuntu:20.04 # set build environment export DEBIAN_FRONTEND=noninteractive \ - CONDA_DIR=/opt/conda \ SHELL=/bin/bash \ LANG=en_US.UTF-8 \ LANGUAGE=en_US.UTF-8 \ - LC_ALL=en_US.UTF-8 \ - MINICONDA_VERSION=4.8.3 - export PATH=$CONDA_DIR/bin:$PATH + LC_ALL=en_US.UTF-8 # install system packages apt-get update \ @@ -38,7 +35,8 @@ From: amd64/ubuntu:20.04 fonts-dejavu \ git \ build-essential \ - gcc \ + gcc-10 \ + g++-10 \ openssh-client \ nano \ libtbb-dev \ @@ -63,7 +61,8 @@ From: amd64/ubuntu:20.04 alien \ unzip \ tree \ - pandoc + pandoc \ + pip # set environment locale echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen @@ -79,112 +78,96 @@ From: amd64/ubuntu:20.04 alien bcl2fastq2*.rpm || true dpkg -i bcl2fastq2*.deb || true - # install msa2vcf - cd /opt/programs - git clone https://github.com/lindenb/jvarkit.git - cd jvarkit - ./gradlew msa2vcf - - # install conda + # install conda and python via Miniconda3 + PYTHON_VERSION=3.8 + PYTHON_VERSION=$(echo ${PYTHON_VERSION} | sed 's/[^0-9]//g' | head -c2) + MINICONDA_VERSION=4.8.3 + 
MINICONDA_MD5=d63adf39f2c220950a063e0529d4ff74 + CONDA_DIR=/opt/conda + export PATH=${CONDA_DIR}/bin:${PATH} cd /tmp && \ - wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - echo "d63adf39f2c220950a063e0529d4ff74 *Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \ - /bin/bash Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh -f -b -p $CONDA_DIR && \ - rm Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \ - $CONDA_DIR/bin/conda config --add channels defaults && \ - $CONDA_DIR/bin/conda config --add channels bioconda && \ - $CONDA_DIR/bin/conda config --add channels conda-forge && \ - $CONDA_DIR/bin/conda config --add channels r && \ - $CONDA_DIR/bin/conda config --system --set show_channel_urls true && \ - $CONDA_DIR/bin/conda install --quiet --yes conda="${MINICONDA_VERSION%.*}.*" && \ - conda clean --all -f -y - - # install mamba - conda install mamba -c conda-forge - - # install conda packages using mamba - mamba install -qy\ - "r" \ - "r-epitools" \ - "rpy2" \ - "r-irkernel" \ - "r-plotly" \ - "r-knitr" \ - "r-shiny" \ - "r-ggplot2" \ - "r-devtools" \ - "r-dplyr" \ - "r-dt" \ - "r-pkgbuild" \ - "gxx_linux-64" \ - "python" \ - "notebook" \ - "nbconvert" \ - "jupyter_contrib_nbextensions" \ - "xlrd" \ - "bcftools" \ - "samtools" \ - "vcftools" \ - "htslib" \ - "bwa" \ - "bowtie2" \ - "primer3" \ - "primer3-py" \ - "numpy" \ - "scipy" \ - "biopython" \ - "pysam" \ - "pandas" \ - "matplotlib" \ - "seaborn" \ - "scikit-learn" \ - "scandir" \ - "openpyxl" \ - "simplegeneric" \ - "matplotlib-venn" \ - "tblib" \ - "parallel" \ - "scikit-allel" \ - "bioconductor-dnacopy" \ - "basemap-data-hires" \ - "seqtk=1.3" \ - "gatk4" \ - "freebayes" \ - "lastz" \ - "plotly" \ - "texlive-core" \ - "libgfortran4" + wget --quiet "https://repo.anaconda.com/miniconda/Miniconda3-py${PYTHON_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh" && \ + echo "${MINICONDA_MD5} 
*Miniconda3-py${PYTHON_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \ + /bin/bash Miniconda3-py${PYTHON_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh -bfp ${CONDA_DIR} && \ + rm Miniconda3-py${PYTHON_VERSION}_${MINICONDA_VERSION}-Linux-x86_64.sh && \ + ${CONDA_DIR}/bin/conda config --system --set show_channel_urls true && \ + conda clean --all --force-pkgs-dirs --yes + + # Install mamba + # Note that the mamba installation will update conda to the latest version + conda install mamba --channel conda-forge --quiet + + # Install mamba packages using an environment file. + # If the versioned file exists, use it as a template to ensure version + # numbers are fixed. Otherwise, install packages with the latest versions. + # Note that instead of creating a new environment, we update the base + # environment, which is activated by default. + if [ -f "/opt/environment_versioned.yml" ]; then + mamba env update --prefix ${CONDA_DIR} --file /opt/environment_versioned.yml --quiet + else + # Update environment and save information to a file + mamba env update --prefix ${CONDA_DIR} --file /opt/environment.yml + mamba env export --prefix ${CONDA_DIR} > /opt/environment_versioned.yml + fi + + # Clean mamba installs + mamba clean --all --yes # install vt variant tool set cd /opt/programs - git clone https://github.com/atks/vt.git + git clone --branch 0.577 https://github.com/atks/vt.git cd vt - git checkout 0.577 make -j $CPU_COUNT scp vt /opt/bin + # install magrittr + Rscript -e 'devtools::install_version( + package = "magrittr", + version = "2.0.3", + repos = "https://cloud.r-project.org" + )' + # install RealMcCoil - # Rscript -e 'devtools::install_github("OJWatson/McCOILR")' + Rscript -e 'devtools::install_github("OJWatson/McCOILR", ref = "v1.3.1")' # install rehh - Rscript -e 'install.packages("rehh", repos="https://cloud.r-project.org")' + Rscript -e 'devtools::install_version( + package = "rehh", + version = "3.2.2", + repos = 
"https://cloud.r-project.org" + )' # install MIPWrangler cd /opt/programs - git clone https://github.com/bailey-lab/MIPWrangler + git clone --branch develop https://github.com/bailey-lab/MIPWrangler.git cd MIPWrangler - git checkout v1.2.0 ./install.sh $CPU_COUNT # install elucidator cd /opt/programs - git clone https://github.com/nickjhathaway/elucidator + git clone --branch develop https://github.com/nickjhathaway/elucidator.git cd elucidator - git checkout develop ./install.sh $CPU_COUNT # install parasight - scp /opt/programs/parasight_v7.6/parasight.pl /opt/bin/parasight76.pl + cd /opt/programs + git clone --branch v7.6 https://github.com/bailey-lab/parasight.git + scp parasight/parasight.pl /opt/bin + + # install basespace cli + BS_VERSION=1.5.1 + BS_PATH="https://launch.basespace.illumina.com/CLI/${BS_VERSION}/amd64-linux/bs" + wget $BS_PATH -O /opt/bin/bs + + # install snakemake + pip install snakemake + # use an older version of pulp (newest pulp not supported on current pip + #snakemake, shouldn't be needed once pip supports snakemake 7.32) + pip install pulp==2.7 + # Remove programs to reduce image size. 
These programs have been installed + # and the binaries moved to /opt/bin + cd /opt/programs + rm -r parasight vt # add executable flag to executables chmod -R +xr /usr/bin @@ -203,17 +186,20 @@ From: amd64/ubuntu:20.04 ## Files Section ## ################################################################# %files + environment* /opt programs /opt bin /opt src /opt + snakemake /opt base_resources/ /opt/resources ################################################################# ## Environment Section ## ################################################################# %environment - path=/opt/bin:/opt/conda/bin:/opt/programs/MIPWrangler/bin: - path=$path/opt/programs/elucidator/bin:/opt/programs/gatk: + path=/opt/bin:/opt/conda/bin: + path=$path/opt/programs/MIPWrangler/bin: + path=$path/opt/programs/elucidator/bin: path=$path$PATH export PATH=$path export XDG_RUNTIME_DIR="" @@ -221,14 +207,18 @@ From: amd64/ubuntu:20.04 export LANG=en_US.UTF-8 export LANGUAGE="en_US.UTF-8" export LC_ALL="en_US.UTF-8" + export PYTHONNOUSERSITE=1 ################################################################# ## Jupyter App ## ################################################################# %apprun jupyter - # Modify shell behavior - set -e - set -u + # Exit if something fails or if have unset object + set -eu + + # increase allowed number of open files to the hard limit of the machine + ulimit -Sn $(ulimit -Hn) + # Port forwarding setup nb_port=$(shuf -i 8000-9999 -n 1) @@ -236,16 +226,49 @@ From: amd64/ubuntu:20.04 server_user=$(whoami)@$(hostname -f) nb_dir=/opt + help() { + echo "Open an interactive Jupyter Notebook. The notebook can be used" + echo "for post-wrangler mapping and variant calling." + echo "" + echo "Usage:" + echo " singularity run [options] --app jupyter "\ + "[app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -d The notebook directory." + echo " -h Print the help page." 
+ echo " -p The port to be used to load the Jupyter Notebook." + echo "" + echo "Examples:" + echo " # Set paths" + echo " $ resource_dir=/bin/MIPTools/base_resources" + echo " $ project_resources=/work/usr/DR1_project_resources" + echo " $ species_resources=/work/usr/pf_species_resources" + echo " $ wrangler_dir=/work/usr/wrangler" + echo " $ variant_dir=/work/usr/variant" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B \${resource_dir}:/opt/resources \\" + echo " -B \${project_resources}:/opt/project_resources \\" + echo " -B \${species_resources}:/opt/species_resources \\" + echo " -B \${wrangler_dir}:/opt/data \\" + echo " -B \${variant_dir}:/opt/analysis \\" + echo " --app jupyter " + } + # Parse options - while getopts p:d: OPT; do - case "$OPT" in - p) - nb_port="$OPTARG";; - d) - nb_dir="$OPTARG";; - *) - echo "Invalid option. Use -p to specify notebook port \ - -d to specify notebook directory." + while getopts "d:hp:" opt; do + case ${opt} in + d) nb_dir=${OPTARG} ;; + h) help + exit 1 ;; + p) nb_port=${OPTARG} ;; + *) help + exit 1 ;; esac done @@ -253,12 +276,11 @@ From: amd64/ubuntu:20.04 rsync /opt/resources/*.ipynb /opt/analysis --ignore-existing \ --ignore-missing-args - # Inform the user how to acess the notebook - port_fw="Use the following command if you are running this notebook from " - port_fw=$port_fw"a remote server. Ignore if using a local computer." - echo $port_fw - port_fw="ssh -N -f -L localhost:$nb_port:$server_ip:$nb_port $server_user" - echo $port_fw + # Inform the user how to access the notebook + echo "\nIf you are running this command from a remote server, you will need" + echo "to forward the port to your local machine. 
To do so, run:\n" + echo "ssh -fNL localhost:${nb_port}:${server_ip}:${nb_port}"\ + "${server_user}\n" # Setup juptyr notebook settings jupyter nbextension enable plotlywidget/extension @@ -269,17 +291,80 @@ From: amd64/ubuntu:20.04 jupyter nbextension enable spellchecker/main # Run notebook - jupyter notebook --notebook-dir=$nb_dir --ip=$server_ip \ - --port=$nb_port --no-browser + jupyter notebook --notebook-dir=${nb_dir} --ip=${server_ip} \ + --port=${nb_port} --no-browser ################################################################## ## Wrangler App ## ################################################################## %apprun wrangler - # Modify shell behavior - set -e - set -u - + # Exit if something fails or if have unset object + set -eu + + # Increase limit of open number of files. + ulimit -Sn $(ulimit -Hn) + + help() { + echo "Run MIPWrangler on demultiplexed data." + echo "" + echo "Usage:" + echo " singularity run [options] --app wrangler "\ + "[app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -c Number of available processors to use. Default: 1." + echo " -e Required. A unique ID given to each sequencing run by" + echo " the user." + echo " -f The population fraction cutoff used by MIPWrangler." + echo " Default: 0.005." + echo " -h Print the help page." + echo " -k Keep intermediate files generated by MIPWrangler." + echo " -l Required. File providing a list of samples with " + echo " associated information." + echo " -m Minimum capture length for stitching excluding probe" + echo " arms." + echo " -n Starting number for MIP server. Default: 1." + echo " -o Absolute path to MIPWrangler run script. " + echo " Default: '/opt/bin/runMIPWranglerCurrent.sh'." + echo " -p Required. Probe sets to be processed." + echo " -s Required. Sample sets to be processed." + echo " -t The threshold at which UMIs will be downsampled." 
+ echo " Default: 2000" + echo " -w Whether to apply a weight when randomly sampling UMIs." + echo " UMIs are weighted by their read counts." + echo " Default: false" + echo " -x Required. Additional arguments to pass to MIPWrangler" + echo " mipSetupAndExtractByArm. This command extracts sequences" + echo " and stitches paired end reads to single sequences." + echo "" + echo "Examples:" + echo " # Define variables" + echo " $ probe_sets='DR1,VAR4'" + echo " $ sample_sets='JJJ'" + echo " $ stitch_options='--stitchGapExtend=1,--overWriteDirs'" + echo "" + echo " $ singularity run \\" + echo " -B project_resources:/opt/project_resources \\" + echo " -B fastq_dir:/opt/data \\" + echo " -B wrangler_dir:/opt/analysis \\" + echo " --app wrangler \\" + echo " -e -l -p \${probe_sets} \\" + echo " -s \${sample_sets} -x \${stitch_options}" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B project_resources:/opt/project_resources \\" + echo " -B fastq_dir:/opt/data \\" + echo " -B wrangler_dir:/opt/analysis \\" + echo " --app wrangler \\" + echo " -c -e -l \\" + echo " -m -p \${probe_sets} -s \\" + echo " \${sample_sets} -x \${stitch_options} -k" + } + # Set defaults cluster_script="runMIPWranglerCurrent.sh" server_number=1 @@ -287,46 +372,54 @@ From: amd64/ubuntu:20.04 min_capture_length="none" stitch_options="none" keep_files="" + population_fraction_cutoff=0.005 + downsample_threshold=2000 + weighted="" # Parse options - while getopts p:l:e:s:w:n:c:x:m:k OPT; do - case "$OPT" in - e) - experiment_id="$OPTARG";; - l) - sample_list="$OPTARG";; - p) - probe_sets="$OPTARG";; - s) - sample_sets="$OPTARG";; - w) - cluster_script="$OPTARG";; - n) - server_number="$OPTARG";; - c) - cpu_count="$OPTARG";; - x) - stitch_options="$OPTARG";; - k) - keep_files="-k";; - - m) - min_capture_length="$OPTARG";; - - *) - echo "Invalid option. 
Use 'wrangler \ - -e experiment_id -l sample_list.file -p probe_sets\ - -s sample_sets -w cluster_script -n server_number \ - -c cpu_count -x stitch_options' \ - -m min_capture_length [-k]" + while getopts "c:e:f:hkl:m:n:o:p:s:t:wx:" opt; do + case ${opt} in + c) cpu_count=${OPTARG} ;; + e) experiment_id=${OPTARG} ;; + f) population_fraction_cutoff=${OPTARG} ;; + h) help + exit 1 ;; + k) keep_files=-k ;; + l) sample_list=${OPTARG} ;; + m) min_capture_length=${OPTARG} ;; + n) server_number=${OPTARG} ;; + o) cluster_script=${OPTARG} ;; + p) probe_sets=${OPTARG} ;; + s) sample_sets=${OPTARG} ;; + t) downsample_threshold=${OPTARG} ;; + w) weighted=-w ;; + x) stitch_options=${OPTARG} ;; + *) help + exit 1 ;; esac done + # Remove whitespace from arguments (quoted so values are never glob-expanded) + probe_sets=$(printf '%s\n' "${probe_sets}" | sed "s/[[:space:]]//g") + sample_sets=$(printf '%s\n' "${sample_sets}" | sed "s/[[:space:]]//g") + stitch_options=$(printf '%s\n' "${stitch_options}" | sed "s/[[:space:]]//g") + + # Ensure that stitch options begin with a comma if not the default value + # This is done as the arguments are fed in with leading dashes and the + # python script will crash if dashes are fed in. By adding a leading comma, + # the script will run. + stitch_first_char=$(printf '%s' "${stitch_options}" | head -c1) + if [ "${stitch_first_char}" != "," ] && [ "${stitch_options}" != "none" ]; then + stitch_options=",${stitch_options}" + fi + # Create wrangler bash scripts using python python /opt/src/generate_wrangler_scripts.py \ - -e $experiment_id -l /opt/analysis/$sample_list \ - -p $probe_sets -s $sample_sets -w $cluster_script -n $server_number \ - -c $cpu_count -x $stitch_options -m $min_capture_length $keep_files + -c "${cpu_count}" -e "${experiment_id}" ${keep_files} \ + -l "/opt/analysis/${sample_list}" -m "${min_capture_length}" \ + -n "${server_number}" -p "${probe_sets}" -s "${sample_sets}" \ + -o "${cluster_script}" -x "${stitch_options}" -f "${population_fraction_cutoff}" \ + -t "${downsample_threshold}" ${weighted} # Run wrangler scripts. 
# The dot space is used to let the sourced script modify the current @@ -334,115 +427,254 @@ From: amd64/ubuntu:20.04 . /opt/analysis/wrangle.sh ################################################################## -## Download App ## +## Basespace Download App ## ################################################################## %apprun download + # Exit if something fails + set -eu + + # Set default values for paths + output_path="/opt/analysis" + config_path="/opt/resources/basespace.cfg" + + help() { + echo "Download data from the Illumina BaseSpace Sequence Hub." + echo "" + echo "Usage:" + echo " singularity run [options] --app download "\ + "[app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -i Required. The run ID of the data to download." + echo " -o The path to the output directory." + echo " Default: '/opt/analysis'." + echo " -c The path to the authentication credentials file." + echo " This file is created by 'bs auth'. For additional" + echo " information see the help page for that command." + echo " Default: '/opt/resources/basespace.cfg'." + echo " -h Print the help page." + echo "" + echo "Examples:" + echo " # Set paths" + echo " $ resource_dir=/bin/MIPTools/base_resources" + echo " $ run_dir=/work/usr/example" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B \${resource_dir}:/opt/resources \\" + echo " -B \${run_dir}:/opt/analysis \\" + echo " --app download -i " + } + # Parse options - set -e - set -u - while getopts r: opt; do - case $opt in - r) run_id=$OPTARG;; - ?) echo "Usage: singularity run --app download \\" - echo " -B /path_to_output_dir:/opt/analysis \\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " mycontainer.sif -r my_Illumina_run_ID" - echo "An 'access_token.txt' file with a valid access token is " - echo "required. It must be present in base_resources directory." 
- echo "A data directory where the data will be downloaded to" - echo "must be mounted to /opt/data." - exit 1;; + while getopts "i:o:c:h" opt; do + case "${opt}" in + c) config_path=${OPTARG} ;; + h) help + exit 1 ;; + i) run_id=${OPTARG} ;; + o) output_path=${OPTARG} ;; + *) help + exit 1 ;; + esac + done + + # Ensure run_id is specified (":-" keeps "set -u" from aborting before the message) + if [ -z "${run_id:-}" ]; then + echo "Argument -i must be provided" >&2 + help >&2 + exit 1 + fi + + # Read data from config file + # Line 1 is exported as the API server and line 2 as the access token, keeping only the text after "=" and the one character following it + export BASESPACE_API_SERVER=$(sed "1q;d" "${config_path}" | sed "s/.*=.//g") + export BASESPACE_ACCESS_TOKEN=$(sed "2q;d" "${config_path}" | sed "s/.*=.//g") + + # Download data + bs download run --summary -i "${run_id}" -o "${output_path}/${run_id}" + +################################################################# +## Superseded Download App ## +################################################################# +%apprun download_superseded + # Exit if something fails or if have unset object + set -eu + + help() { + echo "Download data from the Illumina BaseSpace Sequence Hub." + echo "" + echo "Superseded Note:" + echo " Please note that this app has been superseded by the download" + echo " app, which uses the basespace command line interface for" + echo " downloading data." + echo "" + echo "Usage:" + echo " singularity run [options] --app download_superseded \\" + echo " [app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -h Print the help page." + echo " -r Required. The run ID of the data to download." + echo "" + echo "Additional Details:" + echo " An 'access_token.txt' file with a valid access token is" + echo " required. It must be present in the 'base_resources' directory." + echo " A data directory where the data will be downloaded to must be" + echo " mounted to '/opt/analysis'." 
+ echo "" + echo "Examples:" + echo " # Set paths" + echo " $ resource_dir=/bin/MIPTools/base_resources" + echo " $ output_dir=/work/usr/downloaded" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B \${resource_dir}:/opt/resources"\ + "-B \${output_dir}:/opt/analysis \\" + echo " --app download_superseded -r " + } + + while getopts "hr:" opt; do + case ${opt} in + h) help + exit 1 ;; + r) run_id=${OPTARG} ;; + *) help + exit 1 ;; esac done # Print to CLI - echo "Downloading NextSeq run $run_id from BaseSpace." - echo "Depending on the data size, this can take very long (up to 10 h)" + echo "Downloading NextSeq run ${run_id} from BaseSpace." + echo "Depending on the data size, this can take very long (up to 10 h)." echo "It is recommended to run this app in a screen (GNU screen)." echo "A message indicating the end of download will be printed when done." echo "Check nohup.out file in your output directory for the download log." - + # cd and run app # Use nohup to make command keep running even if get hangup signal cd /opt/analysis nohup python /opt/bin/BaseSpaceRunDownloader_v2.py \ - -r $run_id -a "$(cat /opt/resources/access_token.txt)" + -r ${run_id} -a "$(cat /opt/resources/access_token.txt)" - # Print to CLI + # Print to CLI echo "Download finished." ################################################################# ## Demux App ## ################################################################# %apprun demux - # Parse options - set -e - set -u - while getopts s:p: opt; do - case $opt in - s) sample_list=$OPTARG;; - p) platform=$OPTARG;; - ?) echo "Usage: singularity run --app demux \\" - echo " -B /path_to_run_dir:/opt/data \\" - echo " -B /path_to_output_dir:/opt/analysis \\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " mycontainer.sif -s sample_list_file \\" - echo " -p sequencing_platform (nextseq or miseq) \\" - echo "The sample list file must be present in the output" - echo "directory mounted to /opt/analysis." 
- exit 1;; + # Exit if something fails or if have unset object + set -eu + + help() { + echo "Demultiplex data. Generates per-sample fastq files from the raw" + echo "sequence data consisting of bcl files." + echo "" + echo "Usage:" + echo " singularity run [options] --app demux [app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -h Print the help page." + echo " -s Required. Sample sheet for demultiplexing. " + echo " This file must be present in the directory mounted to " + echo " '/opt/analysis'." + echo "" + echo "Examples:" + echo " # Set paths" + echo " $ resource_dir=/bin/MIPTools/base_resources" + echo " $ bcl_dir=/work/usr/downloaded" + echo " $ fastq_root_dir=/work/usr/" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B \${resource_dir}:/opt/resources \\" + echo " -B \${bcl_dir}:/opt/data \\" + echo " -B \${fastq_root_dir}:/opt/analysis \\" + echo " --app demux -s SampleSheet.csv" + } + + while getopts "hs:" opt; do + case ${opt} in + h) help + exit 1 ;; + s) sample_list=${OPTARG} ;; + *) help + exit 1 ;; esac done # Define variables - cd /opt/src - template_dir="/opt/resources/templates/sample_sheet_templates/" - platform_template="$platform"_sample_sheet_template.csv - template="$template_dir$platform_template" - bc_dict="/opt/resources/barcode_dict.json" - output_dir="/opt/analysis" - sample_list="/opt/analysis/$sample_list" - - # Create a sample sheet for demultiplexing - python -c 'import mip_functions as mip; mip.generate_sample_sheet( - "'"$sample_list"'", "'"$bc_dict"'", "'"$template"'", "'"$platform"'", - "'"$output_dir"'")' - - # cd to where bcl files are. + sample_sheet="/opt/analysis/${sample_list}" + # cd to where bcl files are cd /opt/data - + # Create a fastq directory for saving fastqs mkdir -p /opt/analysis/fastq - - # Copy sample list to fastq directory - scp $sample_list /opt/analysis/fastq/ - + # Increase limit of open number of files. 
ulimit -Sn $(ulimit -Hn) - + # Run bcl2fastq # Use nohup to make command keep running even if get hangup signal nohup bcl2fastq -o /opt/analysis/fastq \ - --sample-sheet /opt/analysis/SampleSheet.csv \ + --sample-sheet ${sample_sheet} \ --no-lane-splitting ################################################################## ## Demux QC App ## ################################################################## %apprun demux_qc - # Parse options - set -e - set -u - while getopts p: opt; do - case $opt in - p) platform=$OPTARG;; - ?) echo "Usage: singularity run --app demux_qc\\" - echo " -B /path_to_base_resources:/opt/resources \\" - echo " -B /path_to_fastq_dir:/opt/analysis " - echo " mycontainer.sif -p sequencing_platform" - exit 1;; + # Exit if something fails or if have unset object + set -eu + + help() { + echo "Run quality control on demultiplexed data." + echo "" + echo "Usage:" + echo " singularity run [options] --app demux_qc "\ + "[app_options]" + echo "" + echo "Options:" + echo " See 'singularity run'." + echo "" + echo "App Options:" + echo " -h Print the help page." + echo " -p Required. The sequencing platform used. Either 'miseq'" + echo " or 'nextseq'." 
+ echo "" + echo "Examples:" + echo " # Set paths" + echo " $ resource_dir=/bin/MIPTools/base_resources" + echo " $ fastq_dir=/work/usr/example" + echo "" + echo " # Run app" + echo " $ singularity run \\" + echo " -B \${resource_dir}:/opt/resources"\ + "-B \${fastq_dir}:/opt/analysis \\" + echo " --app demux_qc -p 'nextseq'" + } + + # Argument handling + while getopts "hp:" opt; do + case ${opt} in + h) help + exit 1 ;; + p) platform=${OPTARG} ;; + *) help + exit 1 ;; esac done - # Run app - python /opt/src/demux_qc.py -p $platform + # Run python script + python /opt/src/demux_qc.py -p ${platform} diff --git a/README.md b/README.md index 7c76d1d..debbded 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ # MIPTools -[![Build Singularity](https://github.com/bailey-lab/MIPTools/actions/workflows/build-container.yaml/badge.svg)](https://github.com/bailey-lab/MIPTools/actions/workflows/build-container.yaml) + + ![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/bailey-lab/MIPTools) -![GitHub](https://img.shields.io/github/license/bailey-lab/MIPTools) +[![Documentation Status](https://readthedocs.org/projects/miptools/badge/?version=latest)](https://miptools.readthedocs.io/en/latest/?badge=latest) +![License](https://img.shields.io/github/license/bailey-lab/MIPTools) MIPTools is a suite of computational tools that are used for molecular inversion probe design, data processing, and analysis. @@ -27,17 +29,17 @@ by Singularity (e.g. on Ubuntu/Debian: `sudo snap install go --classic`). #### Download prebuilt container -The MIPTools container, built and ready to use, can be -downloaded from the [Sylabs Cloud](https://cloud.sylabs.io/). You can download -either the development version or the most recent stable release: +The MIPTools container, built and ready to use, can be downloaded +[here](https://baileylab.brown.edu/MIPTools/download/). 
You can download the +development version or any previous release: ```bash +# Download the latest stable release +wget https://baileylab.brown.edu/MIPTools/download/miptools_v0.4.0.sif + # Download the development version # The development version is updated every two weeks -singularity pull library://apascha1/miptools/miptools:dev - -# Download the latest stable release -singularity pull library://apascha1/miptools/miptools:v1.0.0 +wget https://baileylab.brown.edu/MIPTools/download/miptools_dev.sif ``` Note that these prebuilt versions do not include the `bcl2fastq` software due @@ -74,8 +76,8 @@ but this may change in the future. You must download the file: **`bcl2fastq2 Con You can install the most recent release using the following: ```bash -# Install stable version v1.0.0 -git clone --b v1.0.0 https://github.com/bailey-lab/MIPTools.git +# Install stable version v0.4.0 +git clone --branch v0.4.0 https://github.com/bailey-lab/MIPTools.git ``` You can alternatively install the development version: @@ -247,6 +249,8 @@ long term disk usage. ## Further documentation Further documentation for MIPTools is available +[online](https://miptools.readthedocs.io/) and in a public Google Drive +folder [here](https://drive.google.com/drive/folders/1Tmu7hdRYrdw-jqAN35lZpIjG2lBebuCK?usp=sharing) for various use cases (MIP design, data analysis, etc.). diff --git a/base_resources/MIPWrangler_scripts/runMIPWranglerCurrent.sh b/base_resources/MIPWrangler_scripts/runMIPWranglerCurrent.sh index 9433e4d..8d2238c 100644 --- a/base_resources/MIPWrangler_scripts/runMIPWranglerCurrent.sh +++ b/base_resources/MIPWrangler_scripts/runMIPWranglerCurrent.sh @@ -1,11 +1,24 @@ #!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters, needs 2 argument, 1) name of mip server number, 2) num of threads to use" - exit + +if [[ $# -ne 5 ]]; then + msg="Illegal number of parameters. 
Needs five arguments:\n" + msg="${msg}1) The name of the MIP server number.\n" + msg="${msg}2) The number of threads to use.\n" + msg="${msg}3) The population clustering fraction cutoff.\n" + msg="${msg}4) The threshold for downsampling the UMI count.\n" + msg="${msg}5) A flag indicating if downsmapling should be weighted.\n" + msg="${msg} Either an empty string or the -w flag as a string." + echo ${msg} >&2 + exit 2 fi +# Correct barcodes MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + +# Downsample UMI counts +find . -type f -path '*mipBarcodeCorrection/*.fastq.gz' -exec python /opt/src/wrangler_downsample_umi.py --cpu-count $2 --downsample-threshold $4 $5 {} + + +# Cluster barcodes and MIPs MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps -MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & +MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps --fraccutoff $3 diff --git a/base_resources/MIPWrangler_scripts/runMIPWranglerNoCutoffCurrent.sh b/base_resources/MIPWrangler_scripts/runMIPWranglerNoCutoffCurrent.sh deleted file mode 100755 index f7a2ee6..0000000 --- a/base_resources/MIPWrangler_scripts/runMIPWranglerNoCutoffCurrent.sh +++ /dev/null @@ 
-1,11 +0,0 @@ -#!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters, needs 2 argument, 1) name of mip server number, 2) num of threads to use" - exit -fi - -MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 -MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 -MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps -MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps --fraccutoff 0 -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & diff --git a/base_resources/MIPWrangler_scripts/runMIPWranglerSwga.sh b/base_resources/MIPWrangler_scripts/runMIPWranglerSwga.sh index 54f2a78..52a15d9 100644 --- a/base_resources/MIPWrangler_scripts/runMIPWranglerSwga.sh +++ b/base_resources/MIPWrangler_scripts/runMIPWranglerSwga.sh @@ -1,11 +1,21 @@ #!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters, needs 2 argument, 1) name of mip server number, 2) num of threads to use" - exit + +if [[ $# -ne 5 ]]; then + msg="Illegal number of parameters. Needs five arguments:\n" + msg="${msg}1) The name of the MIP server number.\n" + msg="${msg}2) The number of threads to use.\n" + msg="${msg}3) The population clustering fraction cutoff.\n" + msg="${msg}4) The threshold for downsampling the UMI count.\n" + msg="${msg}5) A flag indicating if downsmapling should be weighted.\n" + msg="${msg} Either an empty string or the -w flag as a string." 
+ echo ${msg} >&2 + exit 2 fi +# Correct barcodes MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + +# Cluster barcodes and MIPs MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_swga.pars.txt --countEndGaps MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & diff --git a/base_resources/MIPWrangler_scripts/runMIPWranglerSwgaPop.sh b/base_resources/MIPWrangler_scripts/runMIPWranglerSwgaPop.sh index 628ad32..6ca9673 100644 --- a/base_resources/MIPWrangler_scripts/runMIPWranglerSwgaPop.sh +++ b/base_resources/MIPWrangler_scripts/runMIPWranglerSwgaPop.sh @@ -1,11 +1,21 @@ #!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters, needs 2 argument, 1) name of mip server number, 2) num of threads to use" - exit + +if [[ $# -ne 5 ]]; then + msg="Illegal number of parameters. Needs five arguments:\n" + msg="${msg}1) The name of the MIP server number.\n" + msg="${msg}2) The number of threads to use.\n" + msg="${msg}3) The population clustering fraction cutoff.\n" + msg="${msg}4) The threshold for downsampling the UMI count.\n" + msg="${msg}5) A flag indicating if downsmapling should be weighted.\n" + msg="${msg} Either an empty string or the -w flag as a string." 
+ echo ${msg} >&2 + exit 2 fi +# Correct barcodes MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + +# Cluster barcodes and MIPs MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_swga_pop.pars.txt --countEndGaps MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & diff --git a/base_resources/analysis-template-with-qual.ipynb b/base_resources/analysis-template-with-qual.ipynb index 46e1b04..475b39f 100644 --- a/base_resources/analysis-template-with-qual.ipynb +++ b/base_resources/analysis-template-with-qual.ipynb @@ -102,20 +102,17 @@ "source": [ "# USER INPUT\n", "\n", - "# provide the MIPWrangler output files\n", + "# provide the MIPWrangler output file\n", "# which must be located in the /opt/data directory within the container.\n", - "# if more than one run is to be merged, provide all files\n", - "info_files = [] \n", + "info_file = [] \n", "\n", - "# sample sheets associated with each wrangler file,\n", - "# in the same order as the wrangler files.\n", - "sample_sheets = []\n", + "# sample sheet associated with wrangler file,\n", + "#you should only have one sample sheet (in cases of multiple sample sheets, merge them first)\n", + "sample_sheet = []\n", "\n", "# No input below\n", - "info_files = [data_dir + i for i in info_files]\n", - "sample_sheets = [data_dir + s for s in sample_sheets]\n", - 
"pd.concat([pd.read_table(s) for s in sample_sheets],\n", - " ignore_index=True).groupby([\"sample_set\", \"probe_set\"]).first()" + "info_file = [data_dir + i]\n", + "sample_sheet = [data_dir + s]" ] }, { @@ -160,7 +157,7 @@ "\n", "For the species, the options are: \"pf\" for *Plasmodium falciparum*, \"pv\" for *Plasmodium vivax*, \"hg19\" for *Homo sapiens* genome assembly hg19/GRCh37 and \"hg38\" for *Homo sapiens* genome assembly hg38/GRCh38 \n", "___\n", - "Probe sets also must be specified. Check the output of the sample sheet summary above under **probe_set** field for a reminder of what the probe set of interest is. This is usually a three letter code or codes separated by a comma. " + "Probe sets also must be specified. Check the output of the sample sheet summary above under **probe_set** field for a reminder of what the probe set of interest is. This is usually a mip set of interest or mip sets separated by commas. " ] }, { @@ -170,15 +167,9 @@ "#### Example cell\n", "```python\n", "species = \"pf\"\n", - "probe_sets_used = \"DR1,VAR4\"\n", - "```\n", - "\n", - "It is also possible to analyse just a subset of probe sets that has been used. For example, if the data has both DR1 and VAR4 probe sets but I want to analyse only the DR1 set:\n", - "```python\n", - "species = \"pf\"\n", "probe_sets_used = \"DR1\"\n", "```\n", - "Note that I'd still need to specifiy \"DR1,VAR4\" in the sample_groups above." + "Note that probe_sets_used contains the probes you would like to analyze, while sample_groups includes all the probe sets that were in the probe set column of the original sample sheet (since this tool looks for exact matches to the probe set column of the sample sheet). E.g. if you only want to analyze DR1, but your sample sheet has a probe set column that is written as DR1,VAR4, you'd still need to specifiy \"DR1,VAR4\" in the sample_groups above." 
] }, { @@ -200,6 +191,7 @@ "```python\n", "# available cpu count\n", "processorNumber = 20\n", + "freebayes_threads = 8\n", "\n", "## extra bwa options for haplotype alignment\n", "# use \"-a\" for getting all alignments\n", @@ -218,6 +210,7 @@ "# OPTIONAL USER INPUT\n", "# available cpu count\n", "processorNumber = 20\n", + "freebayes_threads = 8\n", "\n", "## extra bwa options for haplotype alignment\n", "# use \"-a\" for getting all alignments\n", @@ -244,38 +237,28 @@ "source": [ "# RUN\n", "\n", - "# copy the template settings file\n", - "temp_settings_file = \"/opt/resources/templates/analysis_settings_templates/settings.txt\"\n", - "subprocess.call([\"scp\", temp_settings_file, \"/opt/analysis/template_settings.txt\"])\n", - "\n", "# extract the settings template\n", - "temp_settings = mip.get_analysis_settings(\"/opt/analysis/template_settings.txt\")\n", + "settings = mip.get_analysis_settings(\"/opt/resources/templates/analysis_settings_templates/settings.txt\")\n", "\n", "# update bwa settings with the options set above\n", - "bwaOptions = temp_settings[\"bwaOptions\"]\n", - "try:\n", - " bwaOptions.extend(bwaExtra)\n", - "except AttributeError:\n", - " bwaOptions = [bwaOptions]\n", - " bwaOptions.extend(bwaExtra)\n", - "\n", - "# Create a list from the probe_sets string\n", - "mipSetKey = probe_sets_used.split(\",\") + [\"\"]\n", + "bwaOptions = [settings[\"bwaOptions\"]]\n", + "bwaOptions.extend(bwaExtra)\n", "\n", "# create a dictionary for which settings should be updated\n", "# using the user specified parameters.\n", - "update_keys = {\"processorNumber\": processorNumber,\n", - " \"bwaOptions\": bwaOptions,\n", - " \"species\": species,\n", - " \"mipSetKey\" : mipSetKey}\n", - "# update the settings\n", - "for k, v in update_keys.items():\n", - " temp_settings[k] = v\n", + "\n", + "settings['processorNumber']=processorNumber\n", + "settings['freebayes_threads']=freebayes_threads\n", + "settings['bwaOptions']=bwaOptions\n", + 
"settings['species']=species\n", + "settings['mipSetKey']=probe_sets_used.split(',')+['']\n", "# create a settings file in the analysis directory.\n", - "settings_file = \"settings.txt\"\n", + "settings_file='settings.txt'\n", "settings_path = os.path.join(wdir, settings_file)\n", - "mip.write_analysis_settings(temp_settings, settings_path)\n", + "mip.write_analysis_settings(settings, settings_path)\n", + "#reparse settings from settings file\n", "settings = mip.get_analysis_settings(wdir + settings_file)\n", + "print(settings['mipSetKey'])\n", "# create probe sets dictionary\n", "try:\n", " mip.update_probe_sets(\"/opt/project_resources/mip_ids/mipsets.csv\",\n", @@ -289,7 +272,7 @@ "metadata": {}, "source": [ "# Process run data\n", - "First section of the data analysis involves processing the MIPWrangler output files, combining data from multiple runs (if necessary), mapping haplotypes and creating summary files and plots showing how the sequencing runs went." + "First section of the data analysis involves processing the MIPWrangler output file, mapping haplotypes, and creating summary files and plots showing how the sequencing runs went." ] }, { @@ -297,36 +280,26 @@ "metadata": {}, "source": [ "## MIPWrangler output file processing\n", - "Below operation combines output files from multiple runs, summing up count data belonging to the same libraries. \n", - "\n", - "Libraries are labeled by combining three fields in the sample sheet: sample_name-sample_set-replicate, which makes the Sample ID. If two different libraries has the same Sample ID (same three fields, but a different LibraryPrep identifier), the overlapping libraries will be assigned new replicate numbers such that there are no shared IDs any more. 
A warning will be printed in that case, and the original sample ID and the new one will be written to the samples.tsv file generated in the analysis directory.\n", + "Libraries are labeled by combining three fields in the sample sheet: sample_name-sample_set-replicate, which makes the Sample ID.\n", "\n", - "If only a single output file is used, then the below operation just filters and renames some columns from the original file." + "The below operation just filters and renames some columns from the original file." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# RUN\n", - "if len(info_files) > 1:\n", - " mip.combine_info_files(wdir,\n", - " settings_file, \n", - " info_files,\n", - " sample_sheets,\n", - " settings[\"mipsterFile\"],\n", - " sample_sets=sample_groups)\n", - "else:\n", - " mip.process_info_file(wdir,\n", - " settings_file, \n", - " info_files,\n", - " sample_sheets,\n", - " settings[\"mipsterFile\"],\n", - " sample_sets=sample_groups)" + "mip.process_info_file(wdir,\n", + " settings_file, \n", + " info_file,\n", + " sample_sheet,\n", + " settings[\"mipsterFile\"],\n", + " sample_sets=sample_groups)" ] }, { @@ -336,12 +309,12 @@ "## Filter and map haplotype sequences\n", "Align each haplotype sequence to the reference genome. Remove off target haplotypes. All haplotype mappings will be saved to the disk so off targets can be inspected if needed. \n", "\n", - "Some filters can be applied to remove noise:\n", + "Some filters can be applied to remove noise and speed up processing:\n", "* minHaplotypeBarcodes: minimum total UMI cut off across all samples.\n", "* minHaplotypeSamples: minimum number of samples a haplotype is observed in.\n", "* minHaplotypeSampleFraction: minimum fraction of samples a haplotype is observed in. \n", "\n", - "It is probably safe to apply minimal count filters like at least 10 UMIs across samples and at least two samples. 
However, most data sets will be easily handled without these filters. So it may be better to not filter at this step unless the downstream operations are taking too much resources. However, filters can and should be applied after variant calls are made." + "It is usually better to not filter at this step (using the default filtering levels below, which filter nothing) unless the downstream operations are difficult to compute. However, filters can and should be applied after variant calls are made." ] }, { @@ -390,7 +363,7 @@ "metadata": {}, "source": [ "### Preview the mapping results\n", - "Plotting the probe coverage by samples is a good way to see overall experiment perfomance. It shows if a probe has at least 1 barcode (or however many is specified below) for a given sample. \n", + "Plotting the probe coverage by samples is a good way to see overall experiment performance. It shows if a probe has at least 1 UMI (or however many is specified below) for a given sample. \n", "\n", "Dark columns point to poor performing probes whereas dark rows indicate poor samples. Note that this excludes samples with no reads at all.
Use \"all_barcode_counts.csv\" file if those are of interest as well.\n", "\n", @@ -406,7 +379,7 @@ "# OPTIONAL USER INPUT\n", "\n", "# coverage filter: anything below this number will be considered absent\n", - "barcode_threshold = 10\n", + "UMI_threshold = 10\n", "# font size for tick labels for x and y axis\n", "tick_label_size=5\n", "# font size for heat map color bar\n", @@ -434,10 +407,10 @@ "outputs": [], "source": [ "# OPTIONAL USER INPUT\n", - "barcode_counts = pd.read_csv(wdir + \"barcode_counts.csv\",\n", + "UMI_counts = pd.read_csv(wdir + \"barcode_counts.csv\",\n", " header = [0,1], index_col = 0)\n", - "mip.plot_performance(barcode_counts,\n", - " barcode_threshold=barcode_threshold,\n", + "mip.plot_performance(UMI_counts,\n", + " barcode_threshold=UMI_threshold,\n", " tick_label_size=tick_label_size,\n", " cbar_label_size=cbar_label_size,\n", " dpi=dpi,\n", @@ -472,8 +445,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Plot total barcode count vs probe coverage\n", - "A scatter plot of total barcode count vs number of probes covered at a certain barcode count is a good way to see how the relationship between total coverage and probe coverage, which is useful in determining how to proceed to the next experiments or analyses." + "### Plot total UMI count vs probe coverage\n", + "A scatter plot of total UMI count vs number of probes covered at a certain UMI count is a good way to see the relationship between total coverage and probe coverage, which is useful in determining how to proceed to the next experiments or analyses." 
] }, { @@ -486,8 +459,8 @@ "source": [ "# RUN\n", "f = sns.pairplot(data = sample_summary,\n", - " x_vars = \"Barcode Count\",\n", - " y_vars = \"targets_with_10_barcodes\",\n", + " x_vars = \"UMI Count\",\n", + " y_vars = \"targets_with_10_UMIs\",\n", " plot_kws={\"s\": 10})\n", "f.fig.set_size_inches(5,5)\n", "f.fig.set_dpi(150)\n", @@ -510,20 +483,20 @@ "target_coverage_fraction = 0.95\n", "target_coverage_key='targets_with_10_barcodes'\n", "```\n", - "Although we set our goal to 47 probes, it is likely that some sample will never reach that number regardless of how much we sequence, if there is a deletion in the region, for example. So it makes sense to set a total coverage threshold after which we don't expect more data. Looking at the plot above, it seems like after 1000 barcode counts, we would reach our goal for most samples. \n", + "Although we set our goal to 47 probes, it is likely that some sample will never reach that number regardless of how much we sequence, if there is a deletion in the region, for example. So it makes sense to set a total coverage threshold after which we don't expect more data. Looking at the plot above, it seems like after 1000 UMI counts, we would reach our goal for most samples. \n", "```python\n", "high_barcode_threshold = 10000\n", "```\n", - "Another metric to use for determining if we want to sequence a sample more is the average read count per barcode count. This value indicates we have sequenced each unique molecular index in our sample so many times, so when the value is high, it is unlikely that we'd get more UMIs by sequencing the same library more. It makes more sense for a fresh MIP capture from these samples if more data is needed.\n", + "Another metric to use for determining if we want to sequence a sample more is the average read count per UMI count. 
This value indicates how many times, on average, each unique molecular index in our sample has been sequenced, so when the value is high, it is unlikely that we'd get more UMIs by sequencing the same library more. It makes more sense to set up a fresh MIP capture from these samples if more data is needed.\n", "```python\n", "barcode_coverage_threshold=10\n", "```\n", - "Some samples perform very poorly for one reason or another. There are two options for these samples for repooling consideration: 1) Repool as much as we can for the next run, 2) Assuming there is a problem in the capture reaction, set up a new MIP capture reaction for these samples. It makes more sense to use option 1 if this is the first sequencing data using this library. Use option 2 if this library have been repooled at a higher volume already, but still producing poor data.\n", + "Some samples perform very poorly for one reason or another. There are two options for these samples for repooling consideration: 1) Repool as much as we can for the next run, 2) Assuming there is a problem in the capture reaction, set up a new MIP capture reaction for these samples. It makes more sense to use option 1 if this is the first sequencing data using this library. Use option 2 if this library has been repooled at a higher volume already, but is still producing poor data.\n", "```python\n", "barcode_count_threshold=100 # samples below total barcode count of this value is considered low coverage\n", "low_coverage_action='Repool' # what to do for low coverage samples (Repool or Recapture)\n", "```\n", - "Sometimes a handful of samples show uneven coverage of loci, i.e. they have very good coverage of a handful of loci but poor coverage in others, which may point to a problem with the sample or the experiment in general. These samples are determined by comparing the subset of samples that reached the goal we set (completed samples) and those that have not.
We look at the number of barcodes per probe for _completed_ samples and get 25th percentile (or other percentile as set) and assume that if a sample on average has this many barcodes per target, it should have reached the set goal. For example, if on average _completed_ samples, i.e. samples that cover 47 probes at 10 barcodes or more, have 10000 total barcodes, they would have ~200 (10000/47) barcodes per target covered. And if an _incomplete_ sample has 5000 total barcodes and only 10 targets covered, this value would be 500 for that sample and it would be flagged as **uneven coverage** in repooling document.\n", + "Sometimes a handful of samples show uneven coverage of loci, i.e. they have very good coverage of a handful of loci but poor coverage in others, which may point to a problem with the sample or the experiment in general. These samples are determined by comparing the subset of samples that reached the goal we set (completed samples) and those that have not. We look at the number of UMIs per probe for _completed_ samples and get 25th percentile (or other percentile as set) and assume that if a sample on average has this many UMIs per target, it should have reached the set goal. For example, if on average _completed_ samples, i.e. samples that cover 47 probes at 10 UMIs or more, have 10000 total UMIs, they would have ~200 (10000/47) UMIs per target covered. 
And if an _incomplete_ sample has 5000 total UMIs and only 10 targets covered, this value would be 500 for that sample and it would be flagged as **uneven coverage** in the repooling document.\n", "```python\n", "assesment_key='targets_with_1_barcodes' # coverage key to compare \"complete\" and \"incomplete\" samples\n", "good_coverage_quantile=0.25 # percentile to set the threshold\n", @@ -715,7 +688,7 @@ "# These will be directy passed to freebayes\n", "\n", "# example for plasmodium falciparum calls\n", - "options = [\"--pooled-continuous\",\n", + "original_options = [\"--pooled-continuous\",\n", " \"--min-alternate-fraction\", \"0.01\",\n", " \"--min-alternate-count\", \"2\",\n", " \"--haplotype-length\", \"3\",\n", @@ -725,13 +698,13 @@ " \"--gvcf-dont-use-chunk\", \"true\"]\n", "\n", "# example for human genome calls with gvcf output\n", - "options = [\"--haplotype-length\", \"-1\",\n", + "original_options = [\"--haplotype-length\", \"-1\",\n", " \"--use-best-n-alleles\", \"50\",\n", " \"--genotype-qualities\", \"--gvcf\",\n", " \"--gvcf-dont-use-chunk\", \"true\"]\n", "\n", "# example for human genome calls without gvcf output\n", - "options = [\"--haplotype-length\", \"-1\",\n", + "original_options = [\"--haplotype-length\", \"-1\",\n", " \"--use-best-n-alleles\", \"50\",\n", " \"--genotype-qualities\"]\n", "```" @@ -747,7 +720,7 @@ "\n", "# provide freebayes options.\n", "# These will be directy passed to freebayes\n", - "options = " + "original_options = " ] }, { @@ -797,9 +770,14 @@ "outputs": [], "source": [ "# RUN\n", - "r = mip.freebayes_call(\n", + "from multiprocessing import Pool\n", + "import multiprocessing\n", + "import multiprocessing.pool\n", + "import copy\n", + "\n", + "freebayes_command_dict, contig_vcf_gz_paths = mip.freebayes_call(\n", " settings=settings,\n", - " options=options,\n", + " options=copy.deepcopy(original_options),\n", " align=align,\n", " verbose=verbose,\n", " fastq_dir=fastq_dir,\n", @@ -809,7 +787,25 @@ " 
bam_files=None,\n", " errors_file=errors_file,\n", " warnings_file=warnings_file,\n", - " fastq_padding=fastq_padding)" + " fastq_padding=fastq_padding)\n", + "freebayes_commands=list(freebayes_command_dict.values())\n", + "pool = Pool(int(settings[\"freebayes_threads\"]))\n", + "# run the freebayes worker program in parallel\n", + "# create a results container for the return values from the worker function\n", + "\n", + "results = []\n", + "errors = []\n", + "pool.map_async(mip.freebayes_worker, freebayes_commands, callback=results.extend,\n", + " error_callback=errors.extend)\n", + "#print(results)\n", + "pool.close()\n", + "pool.join()\n", + "#comment in these print statements if you get any errors for more details on which contigs failed to run in freebayes\n", + "#print('\\n\\n\\n\\n\\n')\n", + "#print(results, '\\n\\n\\n')\n", + "#print(errors, '\\n\\n\\n')\n", + "\n", + "mip.concatenate_headers(settings=settings, wdir='/opt/analysis', freebayes_settings=original_options, vcf_paths=contig_vcf_gz_paths)" ] }, { @@ -986,15 +982,15 @@ "min_site_qual = 1\n", "# reset targeted variant counts to zero\n", "# when the site quality is below this value\n", - "min_target_site_qual = 0\n", + "min_target_site_qual = -1\n", "# reset genotypes in the vcf file to NA\n", "# and depth to 0 if FORMAT/GQ value for a variant/sample\n", "# is below this value:\n", - "min_genotype_qual = 1\n", + "min_genotype_qual = -1\n", "# reset alt allele count in the vcf file to 0\n", "# if FORMAT/QA value divided by FORMAT/AO for a variant/sample\n", "# is below this value:\n", - "min_mean_alt_qual = 15 # average quality cut off for variants\n", + "min_mean_alt_qual = -1 # average quality cut off for variants\n", "# There are also available, similar filters for:\n", "# min_mean_ref_qual : resetting low qual reference allele counts\n", "# min_alt_qual : similar to min_mean_alt_qual, but for total qual score\n", @@ -1235,6 +1231,8 @@ "source": [ "# RUN\n", "filtered_mutation_counts = 
gt_calls[\"filtered_mutation_counts\"]\n", + "filtered_mutation_counts.to_csv(os.path.join(\n", + " wdir, \"filtered_alternate_AA_table.csv\"))\n", "filtered_mutation_counts.head()" ] }, @@ -1253,6 +1251,8 @@ "source": [ "# RUN\n", "filtered_mutation_coverage = gt_calls[\"filtered_mutation_coverage\"]\n", + "filtered_mutation_coverage.to_csv(os.path.join(\n", + " wdir, \"filtered_coverage_AA_table.csv\"))\n", "filtered_mutation_coverage.head()" ] }, @@ -1271,6 +1271,8 @@ "source": [ "# RUN\n", "freq = gt_calls[\"wsaf\"]\n", + "freq.to_csv(os.path.join(\n", + " wdir, \"within_sample_allele_frequencies.csv\"))\n", "freq.head()" ] }, @@ -1289,6 +1291,8 @@ "source": [ "# RUN\n", "genotypes = gt_calls[\"genotypes\"]\n", + "genotypes.to_csv(os.path.join(\n", + " wdir, \"filtered_genotypes_table.csv\"))\n", "genotypes.head()" ] }, @@ -1307,6 +1311,7 @@ "source": [ "# RUN\n", "prevalences = gt_calls[\"prevalences\"]\n", + "prevalences.to_csv(os.path.join(wdir, \"prevalences_input_table.csv\"))\n", "prevalences.head()" ] }, @@ -1448,6 +1453,7 @@ "outputs": [], "source": [ "filtered_genotypes = genotypes.loc[:, variant_mask]\n", + "filtered_genotypes.to_csv(os.path.join(wdir, \"final_filtered_genotypes.csv\"))\n", "filtered_genotypes.head()" ] }, @@ -1458,6 +1464,7 @@ "outputs": [], "source": [ "filtered_prevalences = prevalences.loc[:, variant_mask]\n", + "filtered_prevalences.to_csv(os.path.join(wdir, \"final_filtered_prevalences_input_table.csv\"))\n", "filtered_prevalences.head()" ] }, @@ -1827,7 +1834,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1841,7 +1848,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.10.12" }, "toc": { "base_numbering": 1, @@ -1866,5 +1873,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/base_resources/basespace.cfg 
b/base_resources/basespace.cfg new file mode 100644 index 0000000..e64ad2d --- /dev/null +++ b/base_resources/basespace.cfg @@ -0,0 +1,2 @@ +apiServer = https://api.basespace.illumina.com +accessToken = \ No newline at end of file diff --git a/base_resources/check_run_stats.ipynb b/base_resources/check_run_stats.ipynb new file mode 100644 index 0000000..827ce54 --- /dev/null +++ b/base_resources/check_run_stats.ipynb @@ -0,0 +1,599 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to use code cells in this notebook\n", + "\n", + "If a code cell starts with \n", + "```python\n", + "# RUN\n", + "```\n", + "Run the cell by CTRL+Enter, or the Run button above. \n", + "\n", + "If a code cell starts with\n", + "```python\n", + "# USER INPUT\n", + "```\n", + "User input is needed before running the cell. Usually there will be a cell preceding this which gives an example for the values to be provided.\n", + "\n", + "If a code cell starts with\n", + "```python\n", + "# OPTIONAL USER INPUT\n", + "```\n", + "User input is needed before running the cell. However, some defaults are provided, so make sure that either the settings will work for your run, or change them appropriately.\n", + "\n", + "If a cell starts with\n", + "\n", + "**Example cell**\n", + "\n", + "These cells are not code cells but examples of user inputs from the test data analysis for the actual code cell that follows it, informing the user about the formatting etc.\n", + "\n", + "**Important note on entering input:** When entering user input, please make sure you follow the formatting provided in the example cells. For example, when the parameter is text, make sure you have quotation marks around the parameters but when it is a number, do not enclose in quotes. If it is a list, then provide a list in brackets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classes reloading.\n", + "functions reloading\n" + ] + } + ], + "source": [ + "# RUN\n", + "# these commands import necessary functions for the rest of the program\n", + "import sys\n", + "sys.path.append(\"/opt/src\")\n", + "import mip_functions as mip\n", + "import os\n", + "import subprocess\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.style.use('ggplot')\n", + "from matplotlib.lines import Line2D\n", + "plt.rcParams['svg.fonttype'] = 'none'\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "wdir = \"/opt/analysis/\"\n", + "data_dir = \"/opt/data/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example cell\n", + "\n", + "What is your wrangler output data called? This is usually a file name that ends in run_(experiment_id)\\_wrangled\\_(date).txt.gz and is in the folder that you bound as /opt/data. What sample sheet did you use when you wrangled the data? What species did you use? This will likely be the (species) portion of a folder called (species)\\_species\\_resources. 
For example, Plasmodium falciparum in the tutorial dataset is in a folder called pf_species_resources and has been assigned the species name 'pf'\n", + "\n", + "```python\n", + "\n", + "# provide the MIPWrangler output file\n", + "# which must be located in the /opt/data directory within the container.\n", + "info_file = \"run_test_run_wrangled_20221102.txt.gz\" \n", + "\n", + "# sample sheet used by the wrangler run (must be located in /opt/data)\n", + "sample_sheet = \"sample_list.tsv\"\n", + "\n", + "# species name associated with the species_resources folder\n", + "species = 'pf'\n", + "\n", + "# No input below\n", + "info_file = data_dir + info_file\n", + "sample_sheet = data_dir + sample_sheet\n", + "pd.read_table(sample_sheet).groupby([\"sample_set\", \"probe_set\"]).first()\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# USER INPUT\n", + "\n", + "# provide the MIPWrangler output file\n", + "# which must be located in the /opt/data directory within the container.\n", + "info_file = \"\" \n", + "\n", + "# sample sheet associated with the wrangler file\n", + "sample_sheet = \"\"\n", + "\n", + "# No input below\n", + "info_file = [data_dir + info_file]\n", + "sample_sheet = [data_dir + sample_sheet]\n", + "pd.read_table(sample_sheet[0]).groupby([\"sample_set\", \"probe_set\"]).first()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example cell\n", + "Which sample sets and probe sets would you like to analyze? These are listed in your sample sheet under the \"sample_set\" and \"probe_set\" columns, and given to Jupyter as a [sample_set, probe_set] pair. 
You can also see an example row by running the Jupyter cell above.\n", + "\n", + "```python\n", + "sample_groups = [[\"JJJ\", \"DR1,VAR4\"]]\n", + "```\n", + "\n", + "If more than one combination is to be used, the input will be a list of lists, for example:\n", + "```python\n", + "sample_groups = [[\"sample_set_1\", \"probe_set_1\"], [\"sample_set_2\", \"probe_set_2\"]]\n", + "```\n", + "\n", + "There are two closely related names in this section: probe_set and probe_sets_used. probe_set is a column from the input sample sheet. If a sample was captured/sequenced with multiple probe sets at the same time, there might optionally be multiple comma delimited probe sets in this column (e.g. DR1,VAR4 if sequencing was performed on DR1 and VAR4 probe sets). probe_sets_used is a string of the probe sets you'd like to analyze here in this Jupyter notebook (e.g. DR1 if you only want to analyze DR1 probes, or DR1,VAR4 if you'd like to analyze both DR1 and VAR4 probes).\n", + "\n", + "probe_sets_used = \"DR1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# USER INPUT\n", + "sample_groups = [[]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example cell\n", + "How many processors would you like to use? Enter an integer that is less than or equal to the number of available processors on the computer/compute node that you are using. 
More processors means faster run time but higher likelihood of CPU crashes if your machine doesn't have enough RAM to handle the job.\n", + "\n", + "```python\n", + "# available cpu count\n", + "processorNumber = 12\n", + "\n", + "## extra bwa options for haplotype alignment\n", + "# use \"-a\" for getting all alignments\n", + "# use \"-L 500\" to penalize soft clipping \n", + "# use \"-t\" to set number of available processors\n", + "bwaExtra = [\"-t\", str(processorNumber)]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# OPTIONAL USER INPUT\n", + "# available cpu count\n", + "processorNumber = 12\n", + "\n", + "## extra bwa options for haplotype alignment\n", + "# use \"-a\" for getting all alignments\n", + "# use \"-L 500\" to penalize soft clipping \n", + "# use \"-t\" to set number of available processors\n", + "bwaExtra = [\"-t\", str(processorNumber)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get/Set the analysis settings\n", + "The cell below will retrieve a template of default analysis settings to use. It will then modify these settings to match the variables you defined above, and save them to whatever folder you bound to the singularity container's /opt/analysis folder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# RUN\n", + "\n", + "# copy the template settings file\n", + "temp_settings_file = \"/opt/resources/templates/analysis_settings_templates/settings.txt\"\n", + "subprocess.call([\"scp\", temp_settings_file, \"/opt/analysis/template_settings.txt\"])\n", + "\n", + "# extract the settings template\n", + "temp_settings = mip.get_analysis_settings(\"/opt/analysis/template_settings.txt\")\n", + "\n", + "# update bwa settings with the options set above\n", + "bwaOptions = temp_settings[\"bwaOptions\"]\n", + "try:\n", + " bwaOptions.extend(bwaExtra)\n", + "except AttributeError:\n", + " bwaOptions = [bwaOptions]\n", + " bwaOptions.extend(bwaExtra)\n", + "\n", + "# Create a list from the probe_sets string\n", + "mipSetKey = probe_sets_used.split(\",\") + [\"\"]\n", + "\n", + "# create a dictionary for which settings should be updated\n", + "# using the user specified parameters.\n", + "update_keys = {\"processorNumber\": processorNumber,\n", + " \"bwaOptions\": bwaOptions,\n", + " \"species\": species,\n", + " \"mipSetKey\" : mipSetKey}\n", + "# update the settings\n", + "for k, v in update_keys.items():\n", + " temp_settings[k] = v\n", + "# create a settings file in the analysis directory.\n", + "settings_file = \"settings.txt\"\n", + "settings_path = os.path.join(wdir, settings_file)\n", + "mip.write_analysis_settings(temp_settings, settings_path)\n", + "settings = mip.get_analysis_settings(wdir + settings_file)\n", + "# create probe sets dictionary\n", + "try:\n", + " mip.update_probe_sets(\"/opt/project_resources/mip_ids/mipsets.csv\",\n", + " \"/opt/project_resources/mip_ids/probe_sets.json\")\n", + "except IOError:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MIPWrangler output file processing\n", + "The operation below filters and renames some of the columns from the original file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# RUN\n", + "mip.process_info_file(wdir,\n", + " settings_file, \n", + " info_files,\n", + " sample_sheets,\n", + " settings[\"mipsterFile\"],\n", + " sample_sets=sample_groups)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter and map haplotype sequences\n", + "Align each haplotype sequence to the reference genome. Remove off target haplotypes. All haplotype mappings will be saved to the disk so off targets can be inspected if needed. \n", + "\n", + "Some filters can be applied to remove noise:\n", + "* minHaplotypeBarcodes: minimum total UMI cut off across all samples.\n", + "* minHaplotypeSamples: minimum number of samples a haplotype is observed in.\n", + "* minHaplotypeSampleFraction: minimum fraction of samples a haplotype is observed in. \n", + "\n", + "It is probably safe to apply minimal count filters like at least 10 UMIs across samples and at least two samples. However, most data sets will be easily handled without these filters. So it may be better to not filter at this step unless the downstream operations are taking too much resources. However, filters can and should be applied after variant calls are made." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example cell\n", + "```python\n", + "# filter haplotype sequences based on the number of total supporting UMIs\n", + "settings[\"minHaplotypeBarcodes\"] = 1\n", + "# filter haplotype sequences based on the number of samples they were observed in\n", + "settings[\"minHaplotypeSamples\"] = 1\n", + "# filter haplotype sequences based on the fraction of samples they were observed in\n", + "settings[\"minHaplotypeSampleFraction\"] = 0.0001\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# OPTIONAL USER INPUT\n", + "# filter haplotype sequences based on the number of total supporting UMIs\n", + "settings[\"minHaplotypeBarcodes\"] = 1\n", + "# filter haplotype sequences based on the number of samples they were observed in\n", + "settings[\"minHaplotypeSamples\"] = 1\n", + "# filter haplotype sequences based on the fraction of samples they were observed in\n", + "settings[\"minHaplotypeSampleFraction\"] = 0.0001 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#RUN\n", + "mip.map_haplotypes(settings)\n", + "mip.get_haplotype_counts(settings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preview the mapping results\n", + "Plotting the probe coverage by samples is a good way to see overall experiment performance. It shows if a probe has at least 1 barcode (or however many is specified below) for a given sample. \n", + "\n", + "Dark columns point to poor performing probes whereas dark rows indicate poor samples. Note that this excludes samples with no reads at all. Use \"all_barcode_counts.csv\" file if those are of interest as well.\n", + "\n", + "Some parameters can be supplied to the plotting function as noted in the comments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OPTIONAL USER INPUT\n", + "\n", + "# coverage filter: anything below this number will be considered absent\n", + "barcode_threshold = 10\n", + "# font size for tick labels for x and y axis\n", + "tick_label_size=5\n", + "# font size for heat map color bar\n", + "cbar_label_size=5\n", + "# figure resolution\n", + "dpi=300\n", + "# present/absent colors\n", + "absent_color='black'\n", + "present_color='green'\n", + "# Save the plot in the analysis directory?\n", + "# If false, plots the graph here.\n", + "save=False\n", + "# How frequent the x and y-axis ticks should be\n", + "# every nth column will have a tick\n", + "ytick_freq=None\n", + "xtick_freq=None\n", + "# rotation of xtick labels\n", + "xtick_rotation=90" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OPTIONAL USER INPUT\n", + "barcode_counts = pd.read_csv(wdir + \"barcode_counts.csv\",\n", + " header = [0,1], index_col = 0)\n", + "mip.plot_performance(barcode_counts,\n", + " barcode_threshold=barcode_threshold,\n", + " tick_label_size=tick_label_size,\n", + " cbar_label_size=cbar_label_size,\n", + " dpi=dpi,\n", + " absent_color=absent_color,\n", + " present_color=present_color,\n", + " save=save,\n", + " ytick_freq=ytick_freq,\n", + " xtick_freq=xtick_freq,\n", + " xtick_rotation=xtick_rotation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Look at summary stats \n", + "There are summary statistics and meta data (if provided) we can use to determine if coverage is enough, whether further sequencing is necessary, and how to proceed if further sequencing will be needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# RUN\n", + "sample_summary = pd.read_csv(wdir + \"sample_summary.csv\")\n", + "sample_summary.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot total barcode count vs probe coverage\n", + "A scatter plot of total barcode count vs number of probes covered at a certain barcode count is a good way to see the relationship between total coverage and probe coverage, which is useful in determining how to proceed to the next experiments or analyses." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "# RUN\n", + "f = sns.pairplot(data = sample_summary,\n", + " x_vars = \"Barcode Count\",\n", + " y_vars = \"targets_with_10_barcodes\",\n", + " plot_kws={\"s\": 10})\n", + "f.fig.set_size_inches(5,5)\n", + "f.fig.set_dpi(150)\n", + "_ = plt.xticks(rotation=45)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Repooling capture reactions for further sequencing.\n", + "### Factors to consider:\n", + "1. What do we want to accomplish? In most cases, we would like to get enough coverage for a number of probes for each sample. For example, the test data contains **50 probes** in total. Let's say it is sufficient if we had a coverage of **10** or more for each probe for a sample. Then, we would not want to sequence any more of that sample. \n", + "```python\n", + "target_coverage_count = 50\n", + "target_coverage_key='targets_with_10_barcodes'\n", + "```\n", + "Alternatively, we can set a goal of a fraction of total probes to reach a certain coverage rather than an absolute number of probes. 
For 95% of the maximum number of probes observed (47 in this case): \n", + "```python\n", + "target_coverage_fraction = 0.95\n", + "target_coverage_key='targets_with_10_barcodes'\n", + "```\n", + "Although we set our goal to 47 probes, it is likely that some samples will never reach that number regardless of how much we sequence, if there is a deletion in the region, for example. So it makes sense to set a total coverage threshold after which we don't expect more data. Looking at the plot above, it seems like after 1000 barcode counts, we would reach our goal for most samples. \n", + "```python\n", + "high_barcode_threshold = 10000\n", + "```\n", + "Another metric to use for determining if we want to sequence a sample more is the average read count per barcode count. This value indicates how many times, on average, we have sequenced each unique molecular index in our sample, so when the value is high, it is unlikely that we'd get more UMIs by sequencing the same library more. It makes more sense to do a fresh MIP capture from these samples if more data is needed.\n", + "```python\n", + "barcode_coverage_threshold=10\n", + "```\n", + "Some samples perform very poorly for one reason or another. There are two options for these samples for repooling consideration: 1) Repool as much as we can for the next run, 2) Assuming there is a problem in the capture reaction, set up a new MIP capture reaction for these samples. It makes more sense to use option 1 if this is the first sequencing data using this library. 
Use option 2 if this library has been repooled at a higher volume already, but is still producing poor data.\n", + "```python\n", + "barcode_count_threshold=100 # samples below total barcode count of this value is considered low coverage\n", + "low_coverage_action='Repool' # what to do for low coverage samples (Repool or Recapture)\n", + "```\n", + "Sometimes a handful of samples show uneven coverage of loci, i.e. 
they have very good coverage of a handful of loci but poor coverage in others, which may point to a problem with the sample or the experiment in general. These samples are determined by comparing the subset of samples that reached the goal we set (completed samples) and those that have not. We look at the number of barcodes per probe for _completed_ samples and get the 25th percentile (or other percentile as set) and assume that if a sample on average has this many barcodes per target, it should have reached the set goal. For example, if on average _completed_ samples, i.e. samples that cover 47 probes at 10 barcodes or more, have 10000 total barcodes, they would have ~200 (10000/47) barcodes per target covered. And if an _incomplete_ sample has 5000 total barcodes and only 10 targets covered, this value would be 500 for that sample and it would be flagged as **uneven coverage** in the repooling document.\n", + "```python\n", + "assesment_key='targets_with_1_barcodes' # coverage key to compare \"complete\" and \"incomplete\" samples\n", + "good_coverage_quantile=0.25 # percentile to set the threshold\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example cell\n", + "```python\n", + "high_barcode_threshold = 10000\n", + "target_coverage_count = None\n", + "target_coverage_fraction = 0.95\n", + "target_coverage_key = 'targets_with_10_barcodes'\n", + "barcode_coverage_threshold = 10\n", + "barcode_count_threshold = 100\n", + "low_coverage_action = 'Recapture'\n", + "assesment_key = 'targets_with_1_barcodes'\n", + "good_coverage_quantile = 0.25\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# USER INPUT\n", + "high_barcode_threshold = \n", + "low_coverage_action = " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# OPTIONAL USER INPUT\n", + "target_coverage_count = None\n", + 
"target_coverage_fraction = 0.95\n", + "target_coverage_key = 'targets_with_10_barcodes'\n", + "barcode_coverage_threshold = 10\n", + "barcode_count_threshold = 100\n", + "assesment_key = 'targets_with_1_barcodes'\n", + "good_coverage_quantile = 0.25" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# RUN\n", + "meta = pd.read_csv(wdir + \"run_meta.csv\")\n", + "data_summary = pd.merge(sample_summary, meta)\n", + "mip.repool(wdir, \n", + " data_summary, \n", + " high_barcode_threshold, \n", + " target_coverage_count=target_coverage_count, \n", + " target_coverage_fraction=target_coverage_fraction, \n", + " target_coverage_key=target_coverage_key,\n", + " barcode_coverage_threshold=barcode_coverage_threshold,\n", + " barcode_count_threshold=barcode_count_threshold, \n", + " low_coverage_action=low_coverage_action,\n", + " assesment_key=assesment_key,\n", + " good_coverage_quantile=good_coverage_quantile,\n", + " output_file='repool.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect the repool document\n", + "Library to completion field in the repool document has the value (volume) of how much from a sample should be pooled for re-sequencing. These values are only rough estimates and care should be taken to make sure there will be enough material to sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# RUN\n", + "pd.read_csv(wdir + \"repool.csv\").head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "299px", + "width": "248px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "1114px", + "left": "977px", + "top": "163px", + "width": "256.797px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/base_resources/finish.wav b/base_resources/finish.wav deleted file mode 100644 index c4831c0..0000000 Binary files a/base_resources/finish.wav and /dev/null differ diff --git a/base_resources/processing-and-filtering-variant-calls.ipynb b/base_resources/processing-and-filtering-variant-calls.ipynb index a2d7d45..3ca08ae 100644 --- a/base_resources/processing-and-filtering-variant-calls.ipynb +++ b/base_resources/processing-and-filtering-variant-calls.ipynb @@ -137,17 +137,17 @@ "# provide a file that maps gene names to gene IDs\n", "# this is necessary when targeted variant annotations use\n", "# gene names instead of gene IDs. 
Otherwise provide None\n", - "geneid_to_genename = \n", + "geneid_to_genename = None\n", "# annotate targeted amino acid changes in the tables\n", "# using the file, or otherwise provide None\n", - "target_aa_annotation = \n", + "target_aa_annotation = None\n", "# decompose multi amino acid changes and combine counts of\n", "# resulting single amino acid changes\n", - "aggregate_aminoacids = \n", + "aggregate_aminoacids = None\n", "# decompose MNVs and combine counts for resulting SNVs\n", - "aggregate_nucleotides = \n", + "aggregate_nucleotides = None\n", "# annotate targeted nucleotide changes in the tables.\n", - "target_nt_annotation = " + "target_nt_annotation = None" ] }, { @@ -159,7 +159,7 @@ "# OPTIONAL USER INPUT\n", "\n", "# analysis settings dictionary\n", - "settings = settings\n", + "#settings = settings\n", "# provide the path to the settings file\n", "# if settings dictionary has not been loaded\n", "settings_file = None\n", diff --git a/base_resources/barcode_dict.json b/base_resources/sample_prep/barcode_dict.pickle similarity index 100% rename from base_resources/barcode_dict.json rename to base_resources/sample_prep/barcode_dict.pickle diff --git a/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt b/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt deleted file mode 100755 index 5f175a7..0000000 --- a/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt +++ /dev/null @@ -1,125 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/human/blood_170428/ -runID 170428 -mipsterFile 170428_allinfo 
-filteredMipsterFile 170428_allinfo_filtered -existingData na -rawDataFile 170428_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species hg19 -resourceDir resources/nonexistent/ -callInfoDictionary resources/blood/call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey BLO, -annotationScript table_annovar.pl -annotationBuildVersion hg19 -annotationProtocol refGene;avsnp147;1000g2015aug_all;1000g2015aug_afr;dbnsfp30a -annotationOperation g;f;f;f;f -annotationNaString . 
-annotationOutput myanno -annotationKeys avsnp147;AAChange.refGene;ExonicFunc.refGene;Gene.refGene;1000g2015aug_all;1000g2015aug_afr -annotationIdKey avsnp147 -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem,-j,-t 30 -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 5 -minSnpBarcodeFraction 0.05 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 2 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -#mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -#mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes YtaYtb,ABO,Duffy-null,FyaFyb-rs34599082,OKaPlusOKaMinus,InaInb,LWaLWb,Sc1Sc2,JsaJsb,KpaKpb,Kk,JkaJkb,HyPlusHyMinus-Ja-DoaDob,CoaCob,DiaDib -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp 
-tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt~ b/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt~ deleted file mode 100755 index 5f175a7..0000000 --- a/base_resources/templates/analysis_settings_templates/hg19_blood_settings.txt~ +++ /dev/null @@ -1,125 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/human/blood_170428/ -runID 170428 -mipsterFile 170428_allinfo -filteredMipsterFile 170428_allinfo_filtered -existingData na -rawDataFile 170428_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## 
-############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species hg19 -resourceDir resources/nonexistent/ -callInfoDictionary resources/blood/call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey BLO, -annotationScript table_annovar.pl -annotationBuildVersion hg19 -annotationProtocol refGene;avsnp147;1000g2015aug_all;1000g2015aug_afr;dbnsfp30a -annotationOperation g;f;f;f;f -annotationNaString . 
-annotationOutput myanno -annotationKeys avsnp147;AAChange.refGene;ExonicFunc.refGene;Gene.refGene;1000g2015aug_all;1000g2015aug_afr -annotationIdKey avsnp147 -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem,-j,-t 30 -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 5 -minSnpBarcodeFraction 0.05 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 2 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -#mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -#mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes YtaYtb,ABO,Duffy-null,FyaFyb-rs34599082,OKaPlusOKaMinus,InaInb,LWaLWb,Sc1Sc2,JsaJsb,KpaKpb,Kk,JkaJkb,HyPlusHyMinus-Ja-DoaDob,CoaCob,DiaDib -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp 
-tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/hg19_nextseq_peter_settings.txt b/base_resources/templates/analysis_settings_templates/hg19_nextseq_peter_settings.txt deleted file mode 100644 index d43f013..0000000 --- a/base_resources/templates/analysis_settings_templates/hg19_nextseq_peter_settings.txt +++ /dev/null @@ -1,123 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/human/IBC_180228/ -mipsterFile 180228_data.tsv -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## 
-############################################################################## -species hg19 -callInfoDictionary resources/mip_dictionaries/human/flat_call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey PETER-HG, -annotationBuildVersion hg19 -annotationProtocol refGene;avsnp147;1000g2015aug_all;1000g2015aug_afr;dbnsfp30a -annotationOperation g;f;f;f;f -annotationNaString . -annotationOutput myanno -annotationKeys avsnp147;AAChange.refGene;ExonicFunc.refGene;Gene.refGene;1000g2015aug_all;1000g2015aug_afr -annotationIdKey avsnp147 -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. -# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem, -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 0 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 0 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# filters for a variant to be considered valid, across all samples in -# an analysis similar to haplotype filters -minVariantBarcodes 0 -minVariantSamples 0 -minVariantSampleFraction 0 -# Phred quality filter for individual variants. This is not used across samples, -# but within each sample for each variant. -minVariantQuality 20 -# minimum (within sample) information of a variant to be considered valid -# for genotype calling. For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a mutation to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -# Use only for Plasmodium -mergeSNPs 1 -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## 
-############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/hg38_settings.txt b/base_resources/templates/analysis_settings_templates/hg38_settings.txt deleted file mode 100644 index 89f8947..0000000 --- a/base_resources/templates/analysis_settings_templates/hg38_settings.txt +++ /dev/null @@ -1,154 +0,0 @@ -### Settings are a tab separated file with two columns. -### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species hg38 -callInfoDictionary /opt/project_resources/mip_ids/call_info.json -mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -### Multiple sequence aligner to use for variant calling -multipleSequenceAligner decipher -############################################################################## -############################################################################## -### Annotation settings below are species specific -############################################################################## -annotationScript table_annovar.pl -annotationBuildVersion hg38 -annotationProtocol wgEncodeGencodeCompV24;avsnp147 -annotationOperation g;f -annotationNaString . 
-annotationOutput myanno -annotationKeys avsnp147;AAChange.wgEncodeGencodeCompV24;ExonicFunc.wgEncodeGencodeCompV24;Gene.wgEncodeGencodeCompV24 -annotationIdKey avsnp147 -############################################################################## -############################################################################## -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir /opt/analysis/ -dataDir /opt/data -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 1 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 1 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# loci filters, across samples -# minimum total depth (filters on INFO/AD) -minVariantBarcodes 1 -# minimum number of samples variant observed in (filters on INFO/AC) -minVariantSamples 1 -# minimum fraction of samples variant observed in (filters on INFO/AF) -minVariantSampleFraction 0 -# minimum number of samples with genotypes called (filters on INFO/AN) -minVariantSampleTotal 1 -# average sequence quality for variant across samples (filters on INFO/QS) -minVariantMeanQuality 20 -# average within sample allele frequency when observed (filters on INFO/WSAF) -minVariantMeanWsaf 0.001 -# average number of MIPs supporting the variant normalized to MAX (filters on INFO/MCF) -# this is a good filter to remove artifacts when multiple MIPs cover a locus -minMipCountFraction 0 -# Filters for individual variants. This is not used across samples, -# but within each sample for each variant. -# Phred quality filter -minVariantQuality 20 -# minimum coverage at locus to call any genotype -minVariantCoverage 1 -# minimum allele depth to call variant genotype -minVariantCount 1 -# minimum within sample allele frequency to call variant genotype -minVariantWsaf 0 -# minimum (within sample) information of an amino acid variant -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a (aminoacid) variant to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid -# change. Use only for Plasmodium falciparum. -mergeSNPs 0 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 2 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-a -allowAltContigs 0 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt b/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt deleted file mode 100755 index 20b1bca..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt +++ /dev/null @@ -1,137 +0,0 @@ -### Settings are a tab separated file with two columns. -### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary /opt/project_resources/mip_ids/call_info.json -mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -# If only one mip set was used, end with a trailing comma -mipSetKey DR1,VAR4 -############################################################################## -############################################################################## -### Annotation settings below are species specific, so do dot change -### unless working with something other than plasmodium falciparum -############################################################################## -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . 
-annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir /opt/analysis/ -dataDir /opt/data -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. -# minimum total barcodes -minHaplotypeBarcodes 0 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 0 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# filters for a variant to be considered valid, across all samples in -# an analysis similar to haplotype filters -minVariantBarcodes 0 -minVariantSamples 0 -minVariantSampleFraction 0 -# Phred quality filter for individual variants. This is not used across samples, -# but within each sample for each variant. -minVariantQuality 20 -# minimum (within sample) information of a variant to be considered valid -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a mutation to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -# Use only for Plasmodium -mergeSNPs 1 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-t 30 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt~ b/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt~ deleted file mode 100755 index 1edf760..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_miseq_DR_settings.txt~ +++ /dev/null @@ -1,124 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pf/AG3_170926_ipool/ -mipsterFile 170926_data.tsv -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary resources/mip_dictionaries/pf/pf_all_call_info_crt.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey DR1,VAR4 -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-t 30 -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. -# minimum total barcodes -minHaplotypeBarcodes 0 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 0 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# filters for a variant to be considered valid, across all samples in -# an analysis similar to haplotype filters -minVariantBarcodes 0 -minVariantSamples 0 -minVariantSampleFraction 0 -# Phred quality filter for individual variants. This is not used across samples, -# but within each sample for each variant. -minVariantQuality 20 -# minimum (within sample) information of a variant to be considered valid -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a mutation to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -# Use only for Plasmodium -mergeSNPs 1 -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data diff --git a/base_resources/templates/analysis_settings_templates/pf_miseq_settings.txt b/base_resources/templates/analysis_settings_templates/pf_miseq_settings.txt deleted file mode 100755 index 4ba21cb..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_miseq_settings.txt +++ /dev/null @@ -1,128 +0,0 @@ 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pf/DRC_170828/ -runID 170828 -mipsterFile 170828_allinfo -filteredMipsterFile 170828_allinfo_filtered -existingData na -rawDataFile 170828_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -resourceDir na -callInfoDictionary resources/mip_dictionaries/pf/pf_all_call_info_crt.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey DR1,CSP -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . 
-annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem,-t 30,-L 500,-T 80 -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 0 -minSnpBarcodeFraction 0 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -mergeSNPs 1 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp 
-tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -########################## -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt b/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt deleted file mode 100755 index ce43f8f..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt +++ /dev/null @@ -1,134 +0,0 @@ -### Settings are a tab separated file with two columns. -### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary resources/mip_ids/ideel_call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -# If only one mip set was used, end with a trailing comma -mipSetKey IBC, -############################################################################## -### Annotation settings below are species specific, so do dot change -### unless working with something other than plasmodium falciparum -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir ./ -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 0 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 0 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# filters for a variant to be considered valid, across all samples in -# an analysis similar to haplotype filters -minVariantBarcodes 0 -minVariantSamples 0 -minVariantSampleFraction 0 -# Phred quality filter for individual variants. This is not used across samples, -# but within each sample for each variant. -minVariantQuality 20 -# minimum (within sample) information of a variant to be considered valid -# for genotype calling. For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a mutation to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -# Use only for Plasmodium -mergeSNPs 1 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. 
-# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. -# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem, -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## 
-######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt~ b/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt~ deleted file mode 100755 index a39a4b2..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_nextseq_IBC_settings.txt~ +++ /dev/null @@ -1,121 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# 
-############################################################################## -############################################################################## -workingDir analysis/pf/IBC_180228/ -mipsterFile 180228_allinfo -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -resourceDir na -callInfoDictionary resources/ideel-barcode/call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey IBC, -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . 
-annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem, -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 0 -minSnpBarcodeFraction 0 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile 
mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -########################## -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/pf_nextseq_settings.txt b/base_resources/templates/analysis_settings_templates/pf_nextseq_settings.txt deleted file mode 100755 index 8c75713..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_nextseq_settings.txt +++ /dev/null @@ -1,124 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pf/AG3_170926_ipool/ -mipsterFile 170926_data.tsv -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic 
-sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary resources/mip_dictionaries/pf/pf_all_call_info_crt.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey DR1,CSP -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-t 30 -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. -# minimum total barcodes -minHaplotypeBarcodes 0 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 0 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# filters for a variant to be considered valid, across all samples in -# an analysis similar to haplotype filters -minVariantBarcodes 0 -minVariantSamples 0 -minVariantSampleFraction 0 -# Phred quality filter for individual variants. This is not used across samples, -# but within each sample for each variant. -minVariantQuality 20 -# minimum (within sample) information of a variant to be considered valid -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a mutation to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -# Use only for Plasmodium -mergeSNPs 1 -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. 
-normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data diff --git a/base_resources/templates/analysis_settings_templates/pf_settings.txt b/base_resources/templates/analysis_settings_templates/pf_settings.txt deleted file mode 100644 index 4a9550e..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_settings.txt +++ /dev/null @@ -1,140 +0,0 @@ -### Settings are a tab separated file with two columns. 
-### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary /opt/project_resources/mip_ids/call_info.json -mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -### Multiple sequence aligner to be used for variant calling. -multipleSequenceAligner muscle -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir /opt/analysis/ -dataDir /opt/data -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 1 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 1 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# loci filters, across samples -# minimum total depth (filters on INFO/AD) -minVariantBarcodes 1 -# minimum number of samples variant observed in (filters on INFO/AC) -minVariantSamples 1 -# minimum fraction of samples variant observed in (filters on INFO/AF) -minVariantSampleFraction 0 -# minimum number of samples with genotypes called (filters on INFO/AN) -minVariantSampleTotal 1 -# average sequence quality for variant across samples (filters on INFO/QS) -minVariantMeanQuality 20 -# average within sample allele frequency when observed (filters on INFO/WSAF) -minVariantMeanWsaf 0.001 -# average number of MIPs supporting the variant normalized to MAX (filters on INFO/MCF) -# this is a good filter to remove artifacts when multiple MIPs cover a locus -minMipCountFraction 0 -# Filters for individual variants. This is not used across samples, -# but within each sample for each variant. -# Phred quality filter -minVariantQuality 20 -# minimum coverage at locus to call any genotype -minVariantCoverage 1 -# minimum allele depth to call variant genotype -minVariantCount 1 -# minimum within sample allele frequency to call variant genotype -minVariantWsaf 0 -# minimum (within sample) information of an amino acid variant -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a (aminoacid) variant to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid -# change. Use only for Plasmodium falciparum. -mergeSNPs 0 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-a,-L 500 -allowAltContigs 0 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 20 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pf_settings_20191002.txt b/base_resources/templates/analysis_settings_templates/pf_settings_20191002.txt deleted file mode 100644 index eeb2dd1..0000000 --- a/base_resources/templates/analysis_settings_templates/pf_settings_20191002.txt +++ /dev/null @@ -1,156 +0,0 @@ -### Settings are a tab separated file with two columns. -### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -callInfoDictionary /opt/project_resources/mip_ids/call_info.json -mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -### Multiple sequence aligner to be used for variant calling. 
-multipleSequenceAligner decipher -############################################################################## -############################################################################## -### Annotation settings below are species specific, so do dot change -### unless working with something other than plasmodium falciparum -############################################################################## -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -############################################################################## -snpEffDb pf -############################################################################## -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir /opt/analysis/ -dataDir /opt/data -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 1 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 1 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# loci filters, across samples -# minimum total depth (filters on INFO/AD) -minVariantBarcodes 1 -# minimum number of samples variant observed in (filters on INFO/AC) -minVariantSamples 1 -# minimum fraction of samples variant observed in (filters on INFO/AF) -minVariantSampleFraction 0 -# minimum number of samples with genotypes called (filters on INFO/AN) -minVariantSampleTotal 1 -# average sequence quality for variant across samples (filters on INFO/QS) -minVariantMeanQuality 20 -# average within sample allele frequency when observed (filters on INFO/WSAF) -minVariantMeanWsaf 0.001 -# average number of MIPs supporting the variant normalized to MAX (filters on INFO/MCF) -# this is a good filter to remove artifacts when multiple MIPs cover a locus -minMipCountFraction 0 -# Filters for individual variants. This is not used across samples, -# but within each sample for each variant. -# Phred quality filter -minVariantQuality 20 -# minimum coverage at locus to call any genotype -minVariantCoverage 1 -# minimum allele depth to call variant genotype -minVariantCount 1 -# minimum within sample allele frequency to call variant genotype -minVariantWsaf 0 -# minimum (within sample) information of an amino acid variant -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a (aminoacid) variant to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid -# change. Use only for Plasmodium falciparum. -mergeSNPs 0 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-a,-L 500 -allowAltContigs 0 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 20 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pv_miseq_settings.txt b/base_resources/templates/analysis_settings_templates/pv_miseq_settings.txt deleted file mode 100755 index 23f9e89..0000000 --- a/base_resources/templates/analysis_settings_templates/pv_miseq_settings.txt +++ /dev/null @@ -1,128 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pv/pv_180404/ -runID 180404 -mipsterFile 180404_allinfo -filteredMipsterFile 180404_allinfo_filtered -existingData na -rawDataFile 180404_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic 
-variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pv -resourceDir na -callInfoDictionary resources/mip_dictionaries/pv/call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey PVS, -annotationScript table_annovar_pf.pl -annotationBuildVersion Sal1 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem,-t 30,-L 500,-T 80 -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 0 -minSnpBarcodeFraction 0 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in 
averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -mergeSNPs 1 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile 
case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -########################## -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/pv_settings.txt b/base_resources/templates/analysis_settings_templates/pv_settings.txt deleted file mode 100755 index adb4d67..0000000 --- a/base_resources/templates/analysis_settings_templates/pv_settings.txt +++ /dev/null @@ -1,155 +0,0 @@ -### Settings are a tab separated file with two columns. -### First column is the setting name, second setting value. -### Values should be comma separated if more than one value is needed. 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pv -callInfoDictionary /opt/project_resources/mip_ids/call_info.json -mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -### Multiple sequence aligner to be used for variant calling. -multipleSequenceAligner decipher -############################################################################## -############################################################################## -### Annotation settings below are species specific, so do dot change -### unless working with something other than plasmodium falciparum -############################################################################## -annotationScript table_annovar_pf.pl -annotationBuildVersion Sal1 -annotationProtocol refGene -annotationOperation g -annotationNaString . 
-annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -############################################################################## -snpEffDb none -############################################################################## -############## SETTINGS THAT CAN CHANGE PER RUN ############################# -############################################################################## -############################################################################## -workingDir /opt/analysis/ -dataDir /opt/data -mipsterFile data.tsv -############################################################################## -############################################################################## -############## SETTINGS FOR DATA FILTERS ############################# -############################################################################## -############################################################################## -# filters for a haplotype to be considered valid, -# across all samples in an analysis. 
-# minimum total barcodes -minHaplotypeBarcodes 1 -# minimum number of samples that the haplotype is observed in -minHaplotypeSamples 1 -# minimum fraction of samples that the haplotype is observed in -minHaplotypeSampleFraction 0 -############################################################################## -############################################################################## -# loci filters, across samples -# minimum total depth (filters on INFO/AD) -minVariantBarcodes 1 -# minimum number of samples variant observed in (filters on INFO/AC) -minVariantSamples 1 -# minimum fraction of samples variant observed in (filters on INFO/AF) -minVariantSampleFraction 0 -# minimum number of samples with genotypes called (filters on INFO/AN) -minVariantSampleTotal 1 -# average sequence quality for variant across samples (filters on INFO/QS) -minVariantMeanQuality 20 -# average within sample allele frequency when observed (filters on INFO/WSAF) -minVariantMeanWsaf 0.001 -# average number of MIPs supporting the variant normalized to MAX (filters on INFO/MCF) -# this is a good filter to remove artifacts when multiple MIPs cover a locus -minMipCountFraction 0 -# Filters for individual variants. This is not used across samples, -# but within each sample for each variant. -# Phred quality filter -minVariantQuality 20 -# minimum coverage at locus to call any genotype -minVariantCoverage 1 -# minimum allele depth to call variant genotype -minVariantCount 1 -# minimum within sample allele frequency to call variant genotype -minVariantWsaf 0 -# minimum (within sample) information of an amino acid variant -# for genotype calling. 
For example, if minMutationFraction is set to 0.01 -# and a mutation is at 0.005 frequency in a sample, it'll be called WT -# if frequency is between 0.01 and 0.99; it'll be called MIX -# if over 0.99, it'll be called MUT -minMutationFraction 0.005 -# minimum barcode count for a (aminoacid) variant to be considered valid -minMutationCount 2 -minCoverage 5 -# Merge nucleotide changes happening on the same codon to convert to amino acid -# change. Use only for Plasmodium falciparum. -mergeSNPs 0 -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -############################################################################## -############################################################################## -# psvPriority is used to determine which paralog a given sequence belong to -# for paralogus/multi mapping results. A sequence is aligned to the genome -# and an alignment score is generated by lastZ program. Number of Paralog -# specific variants are multiplied by psvMultiplier to generate PSV scores. 
-# There are 3 options to determine which Paralog a sequence is assigned to: -# 0: only consider alignment scores -# 1: supplement alignment scores with psv scores -# 2: use psv score, break ties with alignment scores -psvPriority 1 -psvMultiplier 50 -############################################################################## -# options for bwa program when aligning the haplotypes to genome -bwaOptions mem,-a,-L 500 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -alignmentDir alignments/ -processorNumber 20 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data 
-############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic diff --git a/base_resources/templates/analysis_settings_templates/pv_settings.txt~ b/base_resources/templates/analysis_settings_templates/pv_settings.txt~ deleted file mode 100755 index c1ffb7e..0000000 --- a/base_resources/templates/analysis_settings_templates/pv_settings.txt~ +++ /dev/null @@ -1,128 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pf/DRC_170828/ -runID 170828 -mipsterFile 170828_allinfo -filteredMipsterFile 170828_allinfo_filtered -existingData na -rawDataFile 170828_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## -haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary 
variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -resourceDir na -callInfoDictionary resources/mip_dictionaries/pf/pf_all_call_info_crt.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey DR1,CSP -annotationScript table_annovar_pf.pl -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem,-t 30,-L 500,-T 80 -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 3 -minSnpBarcodeFraction 0 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to represent the copy count specified in 
averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# Merge nucleotide changes happening on the same codon to convert to amino acid change -mergeSNPs 1 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile 
case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -########################## -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/base_resources/templates/analysis_settings_templates/settings.txt b/base_resources/templates/analysis_settings_templates/settings.txt index f9f7b7f..afca8fe 100644 --- a/base_resources/templates/analysis_settings_templates/settings.txt +++ b/base_resources/templates/analysis_settings_templates/settings.txt @@ -9,10 +9,6 @@ species pf callInfoDictionary /opt/project_resources/mip_ids/call_info.json mipSetsDictionary /opt/project_resources/mip_ids/probe_sets.json -### Multiple sequence aligner to be used for variant calling. 
-multipleSequenceAligner muscle -# set msaToVcf to miptools if miptools script is to be used (experimental) -msaToVcf msa2vcf # create snp only vcf when using msa for variant calling snpOnlyVcf 0 ############## SETTINGS THAT CAN CHANGE PER RUN ############################# @@ -124,6 +120,7 @@ givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,sequen colTypes str,str,str,str,str,str,str,int,int alignmentDir alignments/ processorNumber 20 +freebayes_threads 4 ######################################################################## ######################################################################## ############### FILE NAMES THAT DO NOT CHANGE ######################## diff --git a/base_resources/templates/analysis_settings_templates/settings_180228~ b/base_resources/templates/analysis_settings_templates/settings_180228~ deleted file mode 100755 index 71517cf..0000000 --- a/base_resources/templates/analysis_settings_templates/settings_180228~ +++ /dev/null @@ -1,125 +0,0 @@ -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER RUN ################################# -############################################################################## -############################################################################## -workingDir analysis/pf/IBC_180228/ -runID 180228 -mipsterFile 180228_allinfo -filteredMipsterFile 180228_allinfo_filtered -existingData na -rawDataFile 180228_raw_data.json -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE FOR FIRST RUN OF A PROJECT ############## -############################################################################## -############################################################################## 
-haplotypeDictionary unique_haplotype.dic -sequenceToHaplotypeDictionary sequence_to_haplotype.dic -variationDictionary variation.dic -variationKeyToUniqueKey variation_key_to_unique_key.dic -############################################################################## -############################################################################## -############## SETTINGS THAT CHANGE PER PROJECT -############################################################################## -############################################################################## -species pf -resourceDir na -callInfoDictionary resources/ideel-barcode/call_info.json -mipSetsDictionary resources/mip_ids/probe_sets.json -mipSetKey IBC, -annotationBuildVersion 3d7 -annotationProtocol refGene -annotationOperation g -annotationNaString . -annotationOutput myanno -annotationKeys AAChange.refGene;ExonicFunc.refGene;Gene.refGene -annotationIdKey na -caseControlStats False -############################################################################## -############################################################################## -############# SETTINGS THAT CHANGE LESS FREQUENTLY ########################### -############################################################################## -############################################################################## -psvPriority 1 -psvMultiplier 50 -bwaOptions mem, -# minimum barcode information for a haplotype to be considered valid, fraction to all barcodes of all haplotypes for that copy in that sample -minBarcodeCount 0 -minBarcodeFraction 0 -# minSnpBarcodeCount is for filtering data for SNP analysis -# haplotypes with less barcodes than this will not be considered valid -minSnpBarcodeCount 0 -minSnpBarcodeFraction 0 -minSnpQuality 20 -# Data normalization settings per probe -# Per probe data across samples will be normalized to normalizationPercentiles -# Mean of the values at lower and upper specified percentiles will be considered -# to 
represent the copy count specified in averageCopyCount parameter. -# This is similar to saying the median barcode count for a given probe -# likely represent the diploid copy state (2) for human genes, if values of -# normalizationPercentiles and averageCopyCount are set to 2 and 0.5,0.5, respectively. -normalizationPercentiles 0.4,0.6 -averageCopyCount 1 -# mutations with less frequency than minMutationFraction will be set to zero frequency -minMutationFraction 0.005 -# median normalized barcode number for filtering probes used -minimumProbeMedian 10 -# median raw barcode number for filtering samples used for a given gene -minimumSampleMedian 10 -############################################################################## -############################################################################## -############## SETTINGS THAT ARE UNLIKELY TO CHANGE ######################### -############################################################################## -############################################################################## -colNames p_targetName,p_geneName,s_Sample,h_popUID,h_seq,h_qual,c_qual,c_readCnt,c_barcodeCnt -givenNames mip_name,gene_name,sample_name,haplotype_ID,haplotype_sequence,haplotype_quality_scores,sequence_quality,read_count,barcode_count -colTypes str,str,str,str,str,str,str,int,int -mipInfoDictionary resources/mip_dictionaries/human/hsu_call_info.json -mipDesignDictionary resources/mip_dictionaries/human/human_mips_updated.dic -alignmentDir alignments/ -processorNumber 50 -copyStableGenes na -######################################################################## -######################################################################## -############### FILE NAMES THAT DO NOT CHANGE ######################## -######################################################################## -######################################################################### -caseFile case_file -haplotypesSamFile haplotypes_bwa.sam -haplotypesFastqFile 
haplotypes.fq -filteredDataFile filtered_data.json -normalizedDataFile normalized_data.json -tempHaplotypesFile haplotypes.tmp -tempOffTargetsFile off_targets.tmp -tempAlignmentStdOut alignment_out.tmp -tempAlignmentsFile alignments.tmp -tempMappedHaplotypesFile mapped_haplotypes.tmp -mipCountFile mip_counts -barcodeCountFile barcode_counts -rawVcfFile raw.vcf -normalizedVcfFile norm.vcf -variationTableFile variation_table -sampleVariationFile sample_diffs -sampleInfoFile sample_info_out.dic -rawProblemData problem_raw_data -normalizedProblemData problem_normalized_data -perSampleResults sample_results -perProbeResults probe_results -uniqueProbeFile unique_probes -tablesFile tables -filteredTablesFile filtered_tables -clusterOutputFile cluster_output -# plotting options -dpi 96 -ymax 6 -figsize 10,40 -snpResultsFile clinical_snp_results -########################## -# dbscan cluster parameters -tsneKey V -tsnePlotKey Y -minClusterSamples 5 -maxUnclusteredFrac 0.01 -maxClusterCount 9 -dbScanOutputFile dbscan_clusters diff --git a/bin/muscle b/bin/muscle deleted file mode 100755 index 569be49..0000000 Binary files a/bin/muscle and /dev/null differ diff --git a/bin/runMIPWranglerCurrent.sh b/bin/runMIPWranglerCurrent.sh index 04886df..8d2238c 100644 --- a/bin/runMIPWranglerCurrent.sh +++ b/bin/runMIPWranglerCurrent.sh @@ -1,11 +1,24 @@ #!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters. Needs 2 arguments: 1) name of mip server number, 2) num of threads to use." >&2 + +if [[ $# -ne 5 ]]; then + msg="Illegal number of parameters. Needs five arguments:\n" + msg="${msg}1) The name of the MIP server number.\n" + msg="${msg}2) The number of threads to use.\n" + msg="${msg}3) The population clustering fraction cutoff.\n" + msg="${msg}4) The threshold for downsampling the UMI count.\n" + msg="${msg}5) A flag indicating if downsmapling should be weighted.\n" + msg="${msg} Either an empty string or the -w flag as a string." 
+ echo ${msg} >&2 exit 2 fi +# Correct barcodes MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + +# Downsample UMI counts +find . -type f -path '*mipBarcodeCorrection/*.fastq.gz' -exec python /opt/src/wrangler_downsample_umi.py --cpu-count $2 --downsample-threshold $4 $5 {} + + +# Cluster barcodes and MIPs MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps -MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & +MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps --fraccutoff $3 diff --git a/bin/runMIPWranglerNoCutoffCurrent.sh b/bin/runMIPWranglerNoCutoffCurrent.sh deleted file mode 100755 index a28dd93..0000000 --- a/bin/runMIPWranglerNoCutoffCurrent.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -if [[ $# -ne 2 ]]; then - echo "Illegal number of parameters. Needs 2 arguments: 1) name of mip server number, 2) num of threads to use." 
>&2 - exit 2 -fi - -MIPWrangler mipBarcodeCorrectionMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipBarcodeCorrecting_run1 --allowableErrors 6 -MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 -MIPWrangler mipClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipClustering_run1 --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps -MIPWrangler mipPopulationClusteringMultiple --masterDir $(realpath ./) --numThreads $2 --overWriteDirs --overWriteLog --logFile mipPopClustering_run1 --cutoff 0 --countEndGaps --fraccutoff 0 -#nohup MIPWrangler mav --masterDir $(realpath ./) --numThreads $2 --port $((10000+$1)) --name mip$1 & diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 1d3e55f..0109e25 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -2,9 +2,72 @@ Changelog ========= +MIPTools (development version) +============================== + +New Features +------------ + +- When running the ``wrangler`` app, if the number of UMIs detected for a MIP + is above a certain threshold, we reduce the UMI count to a lower value. This + is done in order to increase the speed of our pipeline. Above a certain UMI + count, the information becomes redundant (:github:user:`arisp99`, + :github:pull:`40`). +- Add an additional argument to the ``wrangler`` app to control the population + clustering fraction cutoff defined by :github:repo:`MIPWrangler + ` (:github:user:`arisp99`, :github:pull:`39`). +- Add the capability to freeze software version numbers when building the + container. Additionally, the version number for key software tools has been + fixed (:github:user:`arisp99`, :github:pull:`32`). +- Install :github:repo:`mipscripts `, which contains + additional tools for analysis pipelines. 
+- Perform additional argument parsing to ensure arguments are formatted + correctly (:github:issue:`28`, :github:issue:`37`). +- New ``download`` app supersedes the previous ``download`` app, which has + been renamed to ``download_superseded``. The new app improves the method for + downloading data from the Illumina BaseSpace Sequence Hub by using the + official command line tool (:github:user:`arisp99`, :github:pull:`25`, + :github:pull:`13`). + +Bug Fixes +--------- + +- Upgrade C and C++ compiler versions (:github:issue:`43`). +- Don't install conda and mamba packages using defaults as this can cause the + install process to hang. +- Upgrade ``libgfortran4`` to ``libgfortran5`` (:github:issue:`38`). +- Let Freebayes run with only one CPU thread (:github:issue:`33`). +- Fix error when app arguments have whitespace characters (:github:issue:`26`, + :github:issue:`37`). +- Fix missing file error when MIP arms file is created from the MIP + info dictionary (:github:user:`aydemiro`, :github:pull:`23`). +- Improve sample sheet preparation. Avoid errors when sample file + columns are empty. Throw an error if there are invalid samples or + input fields (:github:user:`aydemiro`, :github:pull:`22`). +- Fix build failure due to dependency changes in the McCOILR R package + (:github:issue:`7`). + +Maintenance +----------- + +- Remove the ``msa2vcf`` program and other conversion tools + (:github:issue:`35`). +- Reduce size of image by deleting source code after installation of programs. +- Remove sequence aligners (:github:issue:`35`). +- Remove unused analysis settings files (:github:issue:`35`). +- Install programs from GitHub instead of storing source code + (:github:user:`arisp99`, :github:pull:`36`). +- Update LICENSE year. +- Store containers using an HTTP directory (:github:issue:`12`). +- Remove duplicated files. +- Improve bash errors. +- Make strings human readable (:github:user:`arisp99`, :github:pull:`5`). 
+ +======= MIPTools 0.4.0 ============================== + Documentation Overhaul ---------------------- @@ -14,7 +77,4 @@ Documentation Overhaul - Generate online documentation using `Sphinx `__ and `Github Pages `__. -- Improve app documentation. -- Improve clarity of README and add additional instructions on - :ref:`downloading ` or :ref:`building the - container `. +- Improve app documentation. \ No newline at end of file diff --git a/docs/app-reference/demux-app.rst b/docs/app-reference/demux-app.rst index 071f1e3..b83339d 100644 --- a/docs/app-reference/demux-app.rst +++ b/docs/app-reference/demux-app.rst @@ -29,8 +29,10 @@ Options # Required -s Path to the sample sheet for demultiplexing. + # Optional + -h Print the help page. -p The sequencing platform used. Can either be 'nextseq' or 'miseq'. - +======= Sample Sheet ------------ @@ -46,4 +48,4 @@ Examples -B base_resources:/opt/resources \ -B bcl_dir:/opt/data \ -B fastq_root_dir:/opt/analysis \ - --app demux miptools.sif -s SampleSheet.csv -p 'miseq' + --app demux miptools.sif -s SampleSheet.csv \ No newline at end of file diff --git a/docs/app-reference/demux-qc-app.rst b/docs/app-reference/demux-qc-app.rst index d2259a4..f924d53 100644 --- a/docs/app-reference/demux-qc-app.rst +++ b/docs/app-reference/demux-qc-app.rst @@ -23,6 +23,8 @@ Options # Required -p The sequencing platform used. Can either be 'nextseq' or 'miseq'. + # Optional + -h Print the help page. Examples ======== diff --git a/docs/app-reference/download-app.rst b/docs/app-reference/download-app.rst index 39f0afe..efa1d61 100644 --- a/docs/app-reference/download-app.rst +++ b/docs/app-reference/download-app.rst @@ -19,12 +19,22 @@ Options .. code-block:: none # Required - -r The run ID of the data to download. + -i The run ID of the data to download. + + # Optional + -o The path to the output directory. + -c The path to the authentication credentials file. + -h Print the help page. + +Defaults +-------- +.. 
code-block:: shell + + -o Default: '/opt/analysis' + -c Default: '/opt/resources/basespace.cfg' + +.. _authenticate-label: -Download Destination --------------------- -Data will be downloaded to :code:`/opt/analysis`. A directory may be mounted -to this path to customize the download destination. Authentication Credential File ------------------------------ @@ -34,6 +44,7 @@ Authentication Credential File Users must first authenticate their account in order to download data from the BaseSpace Sequence Hub. +======= An authentication token must be generated in order to download data from the BaseSpace Sequence Hub. The steps to do so are outlined below: @@ -54,4 +65,4 @@ Examples singularity run \ -B base_resources:/opt/resources \ -B downloaded:/opt/analysis \ - --app download miptools.sif -r 12345 + --app download miptools.sif -i 12345 \ No newline at end of file diff --git a/docs/app-reference/download-superseded-app.rst b/docs/app-reference/download-superseded-app.rst new file mode 100644 index 0000000..e6b6708 --- /dev/null +++ b/docs/app-reference/download-superseded-app.rst @@ -0,0 +1,66 @@ +=================== +download_superseded +=================== + +Synopsis +======== +.. code-block:: shell + + singularity run [run options...] --app download_superseded [app options...] + +Description +=========== +Download data from the Illumina BaseSpace Sequence Hub. + +.. warning:: + + This app has been superseded by the :ref:`download app `, which + uses the BaseSpace CLI for downloading data. + +Options +======= +.. code-block:: none + + # Required + -r The run ID of the data to download. + + # Optional + -h Print the help page. + +Download Destination +-------------------- +Data will be downloaded to :code:`/opt/analysis`. A directory may be mounted +to this path to customize the download destination. + +Authentication Credential File +------------------------------ + +.. 
note:: + + Users must first authenticate their account in order to download data from + the BaseSpace Sequence Hub. + +An authentication token must be generated in order to download data from the +BaseSpace Sequence Hub. To do so, consult the :ref:`authentication +credential file section of the download app <authenticate-label>`. Once +generated, the authentication token must be copied to +:code:`base_resources/access-token.txt`. + +.. note:: + + Following the :ref:`authentication credential file section of the download app + <authenticate-label>` will generate a configuration file that indicates an API + server to contact and an access token to authenticate against BaseSpace + Sequence Hub. Only the **access token value** must be copied to + :code:`base_resources/access-token.txt`. + + +Examples +======== + +.. code-block:: shell + + singularity run \ + -B base_resources:/opt/resources \ + -B downloaded:/opt/analysis \ + --app download_superseded miptools.sif -r 12345 diff --git a/docs/app-reference/jupyter-app.rst b/docs/app-reference/jupyter-app.rst index bae4a3f..122363a 100644 --- a/docs/app-reference/jupyter-app.rst +++ b/docs/app-reference/jupyter-app.rst @@ -21,6 +21,7 @@ Options # Optional -d The notebook directory. + -h Print the help page. -p The port to be used to load the Jupyter Notebook. Examples diff --git a/docs/app-reference/wrangler-app.rst b/docs/app-reference/wrangler-app.rst index 9f4d060..40dc1bb 100644 --- a/docs/app-reference/wrangler-app.rst +++ b/docs/app-reference/wrangler-app.rst @@ -29,10 +29,18 @@ Options # Optional -c Number of available processors to use. + + -f Population fraction cutoff used by MIPWrangler. + -h Print the help page. -k Keep intermediate files generated by MIPWrangler. -m Minimum capture length for stitching excluding probe arms. -n Starting number for MIP server. - -w Absolute path to MIPWrangler run script. + -o Absolute path to MIPWrangler run script. + -t The threshold at which UMIs will be downsampled. 
For any MIPs with more + UMIs than this threshold, the number of UMIs will be reduced to the + threshold. + -w Whether to apply a weight when randomly sampling UMIs. UMIs are + weighted by their read counts. Defaults -------- @@ -43,10 +51,14 @@ Defaults # Optional -c Default: 1 + -f Default: 0.005 -k Default: false -m Default: 100 -n Default: 1 - -w Default: '/opt/bin/runMIPWranglerCurrent.sh' + -o Default: '/opt/bin/runMIPWranglerCurrent.sh' + -t Default: 2000 + -w Default: false + Examples ======== diff --git a/docs/conf.py b/docs/conf.py index c5576a4..1e10369 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,8 +21,7 @@ project = "MIPTools" copyright = "2023, Bailey Lab" author = "Bailey Lab" -version = "v0.4.0" - +version = "v0.4.0.9000" # -- General configuration --------------------------------------------------- @@ -63,7 +62,7 @@ "display_github": True, "github_user": "bailey-lab", "github_repo": "MIPTools", - "github_version": "v0.4.0-prod-docs/docs/", + "github_version": "master/docs/", } diff --git a/docs/guides/analysis-pipeline.rst b/docs/guides/analysis-pipeline.rst index 0ee568c..a17a7b5 100644 --- a/docs/guides/analysis-pipeline.rst +++ b/docs/guides/analysis-pipeline.rst @@ -25,7 +25,7 @@ line: .. code-block:: shell # Download and untar directory - wget https://baileylab.brown.edu/MIPTools/download/test-data.tar.gz + wget -q https://baileylab.brown.edu/MIPTools/download/test-data.tar.gz tar -xvf test-data.tar.gz The test data set contains 5 directories that contain the test data, species @@ -68,7 +68,8 @@ We additionally define several parameters needed to wrangle data: Next, we can run the :ref:`wrangler app `. For additional instructions on what each flag represents, consult the :ref:`man page ` for the app or the built in documentation with -:code:`singularity run --app wrangler miptools_v0.4.0.sif -h`. + +:code:`singularity run --app wrangler miptools_dev.sif -h`. .. 
code-block:: shell @@ -76,7 +77,7 @@ instructions on what each flag represents, consult the :ref:`man page -B test-data/DR1_project_resources:/opt/project_resources \ -B test-data/test_data/fastq:/opt/data \ -B test-data/wrangler:/opt/analysis \ - --app wrangler miptools_v0.4.0.sif \ + --app wrangler miptools_dev.sif \ -e ${experiment_id} -l ${sample_list} -p ${probe_sets_used} \ -s ${sample_sets_used} -c ${cpu_number} -m ${min_capture_length} @@ -129,7 +130,7 @@ Then we can start our Jupyter notebook: -B test-data/pf_species_resources:/opt/species_resources \ -B test-data/wrangler:/opt/data \ -B test-data/variant:/opt/analysis \ - --app jupyter miptools_v0.4.0.sif + --app jupyter miptools_dev.sif A series of instructions will be printed to the terminal on how to access the notebook. Follow these instructions to run the Jupyter notebooks in a web diff --git a/docs/index.rst b/docs/index.rst index c3afbc1..22e2e6f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,9 +4,42 @@ MIPTools Documentation Welcome to the MIPTools User Guide! -MIPTools is a suite of computational tools that are used for molecular -inversion probe design, data processing, and analysis. - +MIPTools is a suite of computational tools that are used for molecular inversion +probe (MIP) design, data processing, and analysis. Throughout much of this +tutorial, we assume a user interested in using MIPs as a cost effective way to +amplify and sequence hundreds to thousands of targeted regions of the genomes +from hundreds to thousands of pooled barcoded samples. Our group primarily uses +these MIPs to assess relatedness and drug resistance status of Plasmodium +falciparum targets, but we have attempted to generalize this tool for other +questions and datasets. This toolset also assumes the use of unique molecular +identifiers (UMIs) to define unique MIP capture events. 
+ +A typical pipeline might look something like this: + * First, a user might design MIP probes (using the probe design tool of this + program) that have UMIs added to each MIP probe. + + * Second, a user might perform MIP capture reactions, PCR, sample barcoding, + and Illumina sequencing. The output data should be demultiplexed, resulting in + two fastq files per sample. Bench techniques for these experiments are + described elsewhere. + + * Third, the data is wrangled to generate an output file describing which + genotypes (or haplotypes) are found at which abundances in each sample for each + targeted region, using: + + * a sample sheet that describes the samples + * a fastq folder of samples + * a project resources folder that describes the probes + + * Finally, the haplotype data is analyzed using a variant caller (Freebayes is + currently our best-supported tool) to produce a VCF file and some output tables + with frequencies and prevalences of mutations of interest, using: + + * a sample sheet that describes the samples + * a folder containing the wrangled haplotype data + * a folder of indexed genomes for your species of interest + * a project resources folder that describes the probes + .. toctree:: :caption: Quick Start :maxdepth: 2 diff --git a/docs/installation.rst b/docs/installation.rst index f90b614..14a0e70 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -7,6 +7,7 @@ Dependencies A working copy of `Singularity `_ is required. Singularity is best installed with **sudo**. While it is said to be possible to + install as an unprivileged user with some features missing, MIPTools has yet to be tested on such an installation. @@ -39,8 +40,8 @@ You can download the development version or any previous release: .. note:: These prebuilt versions do not include the :code:`bcl2fastq` software due to - its license. You must build the container yourself if you plan to use - MIPTools to demultiplex bcl files. + its license. 
If you plan to use MIPTools to demultiplex bcl files, you must + build the container yourself. .. _install-source: @@ -63,7 +64,7 @@ You can alternatively install the development version: # Install dev version git clone https://github.com/bailey-lab/MIPTools.git -Next, build the container, and you should be all set to get started using +Next, simply build the container and you should be all set to get started using MIPTools! .. code-block:: shell @@ -84,6 +85,7 @@ Sudo Privileges If you want to run the container on an environment without ``sudo``, either download a prebuilt image (see above) or build the container on your own + machine where you *do* have ``sudo`` privileges and copy the image file to the computer without ``sudo``. Note that the Singularity program itself must have been installed with ``sudo``. diff --git a/environment_versioned.yml b/environment_versioned.yml new file mode 100644 index 0000000..5d0c5c9 --- /dev/null +++ b/environment_versioned.yml @@ -0,0 +1,532 @@ +name: base +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - _r-mutex=1.0.1=anacondar_1 + - alsa-lib=1.2.8=h166bdaf_0 + - anyio=3.7.1=pyhd8ed1ab_0 + - argon2-cffi=21.3.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=21.2.0=py39hb9d737c_3 + - asciitree=0.3.3=py_2 + - asttokens=2.2.1=pyhd8ed1ab_0 + - attr=2.5.1=h166bdaf_1 + - attrs=23.1.0=pyh71513ae_1 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=pyhd8ed1ab_3 + - backports.functools_lru_cache=1.6.5=pyhd8ed1ab_0 + - basemap-data=1.3.2=pyhd8ed1ab_3 + - basemap-data-hires=1.3.2=pyhd8ed1ab_3 + - bc=1.07.1=h7f98852_0 + - bcftools=1.17=h3cc50cf_1 + - beautifulsoup4=4.12.2=pyha770c72_0 + - binutils_impl_linux-64=2.40=hf600244_0 + - bioconductor-dnacopy=1.74.1=r43h9913872_0 + - biopython=1.81=py39h72bdee0_0 + - bleach=6.0.0=pyhd8ed1ab_0 + - bokeh=2.4.3=py39hf3d152e_0 + - boltons=23.0.0=pyhd8ed1ab_0 + - bowtie2=2.5.1=py39h6fed5c7_2 + - brotli=1.0.9=h166bdaf_9 + - 
brotli-bin=1.0.9=h166bdaf_9 + - brotli-python=1.0.9=py39h5a03fae_9 + - bwa=0.7.17=he4a0461_11 + - bwidget=1.9.14=ha770c72_1 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.19.1=hd590300_0 + - ca-certificates=2023.5.7=hbcca054_0 + - cached-property=1.5.2=hd8ed1ab_1 + - cached_property=1.5.2=pyha770c72_1 + - cairo=1.16.0=hbbf8b49_1016 + - certifi=2023.5.7=pyhd8ed1ab_0 + - cffi=1.15.1=py39he91dace_3 + - chardet=5.1.0=py39hf3d152e_0 + - charset-normalizer=3.2.0=pyhd8ed1ab_0 + - click=8.1.3=py39hf3d152e_1 + - cloudpickle=2.2.1=pyhd8ed1ab_0 + - conda=23.5.2=py39hf3d152e_0 + - conda-package-handling=2.0.2=pyh38be061_0 + - conda-package-streaming=0.8.0=pyhd8ed1ab_0 + - contourpy=1.1.0=py39h7633fee_0 + - cryptography=41.0.2=py39hd4f0224_0 + - curl=8.1.2=h409715c_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - cytoolz=0.12.0=py39hb9d737c_1 + - dask=2023.2.0=pyhd8ed1ab_0 + - dask-core=2023.2.0=pyhd8ed1ab_0 + - dbus=1.13.6=h5008d03_3 + - debugpy=1.6.7=py39h227be39_0 + - decorator=5.1.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - descartes=1.1.0=py_4 + - distributed=2023.2.0=pyhd8ed1ab_0 + - eigen=3.4.0=h4bd325d_0 + - entrypoints=0.4=pyhd8ed1ab_0 + - et_xmlfile=1.1.0=pyhd8ed1ab_0 + - exceptiongroup=1.1.2=pyhd8ed1ab_0 + - executing=1.2.0=pyhd8ed1ab_0 + - expat=2.5.0=hcb278e6_1 + - fasteners=0.17.3=pyhd8ed1ab_0 + - flit-core=3.9.0=pyhd8ed1ab_0 + - fmt=9.1.0=h924138e_0 + - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 + - font-ttf-ubuntu=0.83=hab24e00_0 + - fontconfig=2.14.2=h14ed4e7_0 + - fonts-conda-ecosystem=1=0 + - fonts-conda-forge=1=0 + - fonttools=4.41.0=py39hd1e30aa_0 + - freebayes=1.3.6=hb0f3ef8_6 + - freetype=2.12.1=hca18f0e_1 + - fribidi=1.0.10=h36c2ea0_0 + - fsspec=2023.6.0=pyh1a96a4e_0 + - gatk4=4.4.0.0=py36hdfd78af_0 + - gcc_impl_linux-64=13.1.0=hc4be1a9_0 + - gettext=0.21.1=h27087fc_0 + - gfortran_impl_linux-64=13.1.0=hd511a9b_0 + - giflib=5.2.1=h0b41bf4_3 + - glib=2.76.4=hfc55251_0 + - 
glib-tools=2.76.4=hfc55251_0 + - gmp=6.2.1=h58526e2_0 + - graphite2=1.3.13=h58526e2_1001 + - gsl=2.7=he838d99_0 + - gst-plugins-base=1.22.3=h938bd60_1 + - gstreamer=1.22.3=h977cf35_1 + - gxx_impl_linux-64=13.1.0=hc4be1a9_0 + - h5py=3.9.0=nompi_py39h680ca82_101 + - harfbuzz=7.3.0=hdb3a94d_0 + - hdf5=1.14.1=nompi_h4f84152_100 + - htslib=1.17=h81da01d_2 + - icu=72.1=hcb278e6_0 + - idna=2.9=py_1 + - importlib-metadata=6.8.0=pyha770c72_0 + - importlib-resources=6.0.0=pyhd8ed1ab_1 + - importlib_metadata=6.8.0=hd8ed1ab_0 + - importlib_resources=6.0.0=pyhd8ed1ab_1 + - ipykernel=6.14.0=py39hef51801_0 + - ipython=8.4.0=py39hf3d152e_0 + - ipython_genutils=0.2.0=py_1 + - jedi=0.18.2=pyhd8ed1ab_0 + - jinja2=3.1.2=pyhd8ed1ab_1 + - joblib=1.3.0=pyhd8ed1ab_1 + - jsoncpp=1.9.5=h4bd325d_1 + - jsonpatch=1.32=pyhd8ed1ab_0 + - jsonpointer=2.0=py_0 + - jsonschema=4.18.3=pyhd8ed1ab_0 + - jsonschema-specifications=2023.6.1=pyhd8ed1ab_0 + - jupyter_client=8.3.0=pyhd8ed1ab_0 + - jupyter_contrib_core=0.4.0=pyhd8ed1ab_0 + - jupyter_contrib_nbextensions=0.7.0=pyhd8ed1ab_0 + - jupyter_core=5.3.1=py39hf3d152e_0 + - jupyter_events=0.6.3=pyhd8ed1ab_0 + - jupyter_highlight_selected_word=0.2.0=py39hf3d152e_1005 + - jupyter_latex_envs=1.4.6=py39hf3d152e_1001 + - jupyter_nbextensions_configurator=0.6.1=pyhd8ed1ab_0 + - jupyter_server=2.7.0=pyhd8ed1ab_0 + - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 + - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 + - kernel-headers_linux-64=2.6.32=he073ed8_16 + - keyutils=1.6.1=h166bdaf_0 + - kiwisolver=1.4.4=py39hf939315_1 + - krb5=1.20.1=h81ceb04_0 + - lame=3.100=h166bdaf_1003 + - lastz=1.04.22=h031d066_0 + - lcms2=2.15=haa2dc70_1 + - ld_impl_linux-64=2.40=h41732ed_0 + - lerc=4.0.0=h27087fc_0 + - libaec=1.0.6=hcb278e6_1 + - libarchive=3.6.2=h039dbb9_1 + - libblas=3.9.0=17_linux64_openblas + - libbrotlicommon=1.0.9=h166bdaf_9 + - libbrotlidec=1.0.9=h166bdaf_9 + - libbrotlienc=1.0.9=h166bdaf_9 + - libcap=2.67=he9d0100_0 + - libcblas=3.9.0=17_linux64_openblas + - 
libclang=15.0.7=default_h7634d5b_2 + - libclang13=15.0.7=default_h9986a30_2 + - libcups=2.3.3=h36d4200_3 + - libcurl=8.1.2=h409715c_0 + - libdeflate=1.18=h0b41bf4_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libevent=2.1.12=hf998b51_1 + - libexpat=2.5.0=hcb278e6_1 + - libffi=3.4.2=h7f98852_5 + - libflac=1.4.3=h59595ed_0 + - libgcc-devel_linux-64=13.1.0=he3cc6c4_0 + - libgcc-ng=13.1.0=he5830b7_0 + - libgcrypt=1.10.1=h166bdaf_0 + - libgfortran-ng=13.1.0=h69a702a_0 + - libgfortran5=13.1.0=h15d22d2_0 + - libgit2=1.6.4=h747ad27_0 + - libglib=2.76.4=hebfc3b9_0 + - libgomp=13.1.0=he5830b7_0 + - libgpg-error=1.47=h71f35ed_0 + - libhwloc=2.9.1=nocuda_h7313eea_6 + - libiconv=1.17=h166bdaf_0 + - libjpeg-turbo=2.1.5.1=h0b41bf4_0 + - liblapack=3.9.0=17_linux64_openblas + - libllvm15=15.0.7=h5cf9203_2 + - libmamba=1.4.9=h658169a_0 + - libmambapy=1.4.9=py39h78efd80_0 + - libnghttp2=1.52.0=h61bc06f_0 + - libnsl=2.0.0=h7f98852_0 + - libogg=1.3.4=h7f98852_1 + - libopenblas=0.3.23=pthreads_h80387f5_0 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.39=h753d276_0 + - libpq=15.3=hbcd7760_1 + - libsanitizer=13.1.0=hfd8a6a1_0 + - libsndfile=1.2.0=hb75c966_0 + - libsodium=1.0.18=h36c2ea0_1 + - libsolv=0.7.24=hfc55251_1 + - libsqlite=3.42.0=h2797004_0 + - libssh2=1.11.0=h0841786_0 + - libstdcxx-devel_linux-64=13.1.0=he3cc6c4_0 + - libstdcxx-ng=13.1.0=hfd8a6a1_0 + - libsystemd0=253=h8c4010b_1 + - libtiff=4.5.1=h8b53f26_0 + - libuuid=2.38.1=h0b41bf4_0 + - libvorbis=1.3.7=h9c3ff4c_0 + - libwebp-base=1.3.1=hd590300_0 + - libxcb=1.15=h0b41bf4_0 + - libxkbcommon=1.5.0=h5d7e998_3 + - libxml2=2.11.4=h0d562d8_0 + - libxslt=1.1.37=h0054252_1 + - libzlib=1.2.13=hd590300_5 + - locket=1.0.0=pyhd8ed1ab_0 + - lxml=4.9.3=py39hed45dcc_0 + - lz4=4.3.2=py39h724f13c_0 + - lz4-c=1.9.4=hcb278e6_0 + - lzo=2.10=h516909a_1000 + - make=4.3=hd18ef5c_1 + - mamba=1.4.9=py39hc5d2bb1_0 + - markupsafe=2.1.3=py39hd1e30aa_0 + - matplotlib=3.7.2=py39hf3d152e_0 + - matplotlib-base=3.7.2=py39h0126182_0 + - 
matplotlib-inline=0.1.6=pyhd8ed1ab_0 + - matplotlib-venn=0.11.9=pyhd8ed1ab_0 + - mistune=3.0.0=pyhd8ed1ab_0 + - mizani=0.7.3=pyhd8ed1ab_0 + - mpfr=4.2.0=hb012696_0 + - mpg123=1.31.3=hcb278e6_0 + - msgpack-python=1.0.5=py39h4b4f3f3_0 + - munkres=1.1.4=pyh9f0ad1d_0 + - mysql-common=8.0.33=hf1915f5_1 + - mysql-libs=8.0.33=hca2cd23_1 + - nbclassic=1.0.0=pyhb4ecaf3_1 + - nbclient=0.8.0=pyhd8ed1ab_0 + - nbconvert=7.6.0=pyhd8ed1ab_0 + - nbconvert-core=7.6.0=pyhd8ed1ab_0 + - nbconvert-pandoc=7.6.0=pyhd8ed1ab_0 + - nbformat=5.9.1=pyhd8ed1ab_0 + - ncurses=6.4=hcb278e6_0 + - nest-asyncio=1.5.6=pyhd8ed1ab_0 + - networkx=3.1=pyhd8ed1ab_0 + - nomkl=1.0=h5ca1d4c_0 + - notebook=6.5.4=pyha770c72_0 + - notebook-shim=0.2.3=pyhd8ed1ab_0 + - nspr=4.35=h27087fc_0 + - nss=3.89=he45b914_0 + - numcodecs=0.11.0=py39h227be39_1 + - numexpr=2.8.4=py39h5ef5dce_100 + - numpy=1.25.1=py39h6183b62_0 + - openjdk=17.0.3=h19c1b89_7 + - openjpeg=2.5.0=hfec8fc6_2 + - openpyxl=3.1.2=py39hd1e30aa_0 + - openssl=3.1.1=hd590300_1 + - overrides=7.3.1=pyhd8ed1ab_0 + - packaging=23.1=pyhd8ed1ab_0 + - palettable=3.3.3=pyhd8ed1ab_0 + - pandas=1.2.3=py39hde0f152_0 + - pandoc=2.19.2=h32600fe_2 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - pango=1.50.14=heaa33ce_1 + - parallel=20230522=ha770c72_0 + - parso=0.8.3=pyhd8ed1ab_0 + - partd=1.4.0=pyhd8ed1ab_0 + - patsy=0.5.3=pyhd8ed1ab_0 + - pcre2=10.40=hc3806b6_0 + - perl=5.32.1=3_hd590300_perl5 + - pexpect=4.8.0=pyh1a96a4e_2 + - pickleshare=0.7.5=py39hde42818_1002 + - pillow=10.0.0=py39haaeba84_0 + - pip=23.1.2=pyhd8ed1ab_0 + - pixman=0.40.0=h36c2ea0_0 + - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 + - platformdirs=3.8.1=pyhd8ed1ab_0 + - plotly=5.15.0=pyhd8ed1ab_0 + - plotnine=0.8.0=pyhd8ed1ab_0 + - pluggy=1.2.0=pyhd8ed1ab_0 + - ply=3.11=py_1 + - pomegranate=0.14.8=py39h7c9e3ff_0 + - pooch=1.7.0=pyha770c72_3 + - primer3=2.6.1=pl5321hdbdd923_4 + - primer3-py=0.6.1=py39hbf8eff0_1 + - prometheus_client=0.17.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.39=pyha770c72_0 + - 
psutil=5.9.5=py39h72bdee0_0 + - pthread-stubs=0.4=h36c2ea0_1001 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pulseaudio-client=16.1=hb77b528_4 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - pybind11-abi=4=hd8ed1ab_3 + - pycosat=0.6.4=py39hb9d737c_1 + - pycparser=2.20=py_0 + - pygments=2.15.1=pyhd8ed1ab_0 + - pyopenssl=23.2.0=pyhd8ed1ab_1 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyqt=5.15.7=py39h5c7b992_3 + - pyqt5-sip=12.11.0=py39h227be39_3 + - pysam=0.21.0=py39hcada746_1 + - pysocks=1.7.1=py39hf3d152e_5 + - python=3.9.16=h2782a2a_0_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-fastjsonschema=2.17.1=pyhd8ed1ab_0 + - python-json-logger=2.0.7=pyhd8ed1ab_0 + - python_abi=3.9=3_cp39 + - pytz=2023.3=pyhd8ed1ab_0 + - pyyaml=6.0=py39hb9d737c_5 + - pyzmq=25.1.0=py39hb257651_0 + - qt-main=5.15.8=h01ceb2d_13 + - r-askpass=1.1=r43h57805ef_4 + - r-assertthat=0.2.1=r43hc72bb7e_4 + - r-base=4.3.1=hfabd6f2_1 + - r-base64enc=0.1_3=r43h57805ef_1006 + - r-brew=1.0_8=r43hc72bb7e_2 + - r-brio=1.1.3=r43h57805ef_2 + - r-bslib=0.5.0=r43hc72bb7e_1 + - r-cachem=1.0.8=r43h57805ef_1 + - r-callr=3.7.3=r43hc72bb7e_1 + - r-cli=3.6.1=r43ha503ecb_1 + - r-clipr=0.8.0=r43hc72bb7e_2 + - r-colorspace=2.1_0=r43h57805ef_1 + - r-commonmark=1.9.0=r43h57805ef_1 + - r-cpp11=0.4.4=r43hc72bb7e_0 + - r-crayon=1.5.2=r43hc72bb7e_2 + - r-credentials=1.3.2=r43hc72bb7e_2 + - r-crosstalk=1.2.0=r43hc72bb7e_2 + - r-curl=5.0.1=r43hf9611b0_0 + - r-data.table=1.14.8=r43h029312a_2 + - r-desc=1.4.2=r43hc72bb7e_2 + - r-devtools=2.4.5=r43hc72bb7e_2 + - r-diffobj=0.3.5=r43h57805ef_2 + - r-digest=0.6.33=r43ha503ecb_0 + - r-downlit=0.4.3=r43hc72bb7e_0 + - r-dplyr=1.1.2=r43ha503ecb_1 + - r-dt=0.28=r43hc72bb7e_1 + - r-ellipsis=0.3.2=r43h57805ef_2 + - r-epitools=0.5_10.1=r43hc72bb7e_1 + - r-evaluate=0.21=r43hc72bb7e_1 + - r-fansi=1.0.4=r43h57805ef_1 + - r-farver=2.1.1=r43ha503ecb_2 + - r-fastmap=1.1.1=r43ha503ecb_1 + - r-fontawesome=0.5.1=r43hc72bb7e_1 + - r-fs=1.6.2=r43ha503ecb_1 + - r-generics=0.1.3=r43hc72bb7e_2 + - 
r-gert=1.9.2=r43h1ab2d24_2 + - r-ggplot2=3.4.2=r43hc72bb7e_1 + - r-gh=1.4.0=r43hc72bb7e_1 + - r-gitcreds=0.1.2=r43hc72bb7e_2 + - r-glue=1.6.2=r43h57805ef_2 + - r-gtable=0.3.3=r43hc72bb7e_1 + - r-hexbin=1.28.3=r43h61816a4_1 + - r-highr=0.10=r43hc72bb7e_1 + - r-htmltools=0.5.5=r43ha503ecb_1 + - r-htmlwidgets=1.6.2=r43hc72bb7e_1 + - r-httpuv=1.6.11=r43ha503ecb_1 + - r-httr=1.4.6=r43hc72bb7e_1 + - r-httr2=0.2.3=r43hc72bb7e_1 + - r-ini=0.3.1=r43hc72bb7e_1005 + - r-irdisplay=1.1=r43hd8ed1ab_2 + - r-irkernel=1.3.2=r43h785f33e_1 + - r-isoband=0.2.7=r43ha503ecb_2 + - r-jquerylib=0.1.4=r43hc72bb7e_2 + - r-jsonlite=1.8.7=r43h57805ef_0 + - r-knitr=1.43=r43hc72bb7e_1 + - r-labeling=0.4.2=r43hc72bb7e_3 + - r-later=1.3.1=r43ha503ecb_1 + - r-lattice=0.21_8=r43h57805ef_1 + - r-lazyeval=0.2.2=r43h57805ef_4 + - r-lifecycle=1.0.3=r43hc72bb7e_2 + - r-magrittr=2.0.3=r43h57805ef_2 + - r-mass=7.3_60=r43h57805ef_1 + - r-matrix=1.6_0=r43h316c678_0 + - r-memoise=2.0.1=r43hc72bb7e_2 + - r-mgcv=1.9_0=r43h316c678_0 + - r-mime=0.12=r43h57805ef_2 + - r-miniui=0.1.1.1=r43hc72bb7e_1004 + - r-munsell=0.5.0=r43hc72bb7e_1006 + - r-nlme=3.1_162=r43h61816a4_1 + - r-openssl=2.0.6=r43hb353fa6_1 + - r-pbdzmq=0.3_9=r43ha81a24b_1 + - r-pillar=1.9.0=r43hc72bb7e_1 + - r-pkgbuild=1.4.2=r43hc72bb7e_0 + - r-pkgconfig=2.0.3=r43hc72bb7e_3 + - r-pkgdown=2.0.7=r43hc72bb7e_1 + - r-pkgload=1.3.2.1=r43hc72bb7e_0 + - r-plotly=4.10.2=r43hc72bb7e_1 + - r-praise=1.0.0=r43hc72bb7e_1007 + - r-prettyunits=1.1.1=r43hc72bb7e_3 + - r-processx=3.8.2=r43h57805ef_0 + - r-profvis=0.3.8=r43h57805ef_2 + - r-promises=1.2.0.1=r43ha503ecb_2 + - r-ps=1.7.5=r43h57805ef_1 + - r-purrr=1.0.1=r43h57805ef_1 + - r-r6=2.5.1=r43hc72bb7e_2 + - r-ragg=1.2.5=r43h85cdef0_2 + - r-rappdirs=0.3.3=r43h57805ef_2 + - r-rcmdcheck=1.4.0=r43h785f33e_2 + - r-rcolorbrewer=1.1_3=r43h785f33e_2 + - r-rcpp=1.0.11=r43h7df8631_0 + - r-rematch2=2.1.2=r43hc72bb7e_3 + - r-remotes=2.4.2=r43hc72bb7e_2 + - r-repr=1.1.6=r43h785f33e_1 + - r-rlang=1.1.1=r43ha503ecb_1 + - 
r-rmarkdown=2.23=r43hc72bb7e_0 + - r-roxygen2=7.2.3=r43ha503ecb_1 + - r-rprojroot=2.0.3=r43hc72bb7e_0 + - r-rstudioapi=0.15.0=r43hc72bb7e_0 + - r-rversions=2.1.2=r43hc72bb7e_2 + - r-sass=0.4.6=r43ha503ecb_1 + - r-scales=1.2.1=r43hc72bb7e_2 + - r-sessioninfo=1.2.2=r43hc72bb7e_2 + - r-shiny=1.7.4.1=r43h785f33e_0 + - r-sourcetools=0.1.7_1=r43ha503ecb_1 + - r-stringi=1.7.12=r43hc0c3e09_2 + - r-stringr=1.5.0=r43h785f33e_1 + - r-sys=3.4.2=r43h57805ef_1 + - r-systemfonts=1.0.4=r43haf97adc_2 + - r-testthat=3.1.10=r43ha503ecb_0 + - r-textshaping=0.3.6=r43h24cd192_6 + - r-tibble=3.2.1=r43h57805ef_2 + - r-tidyr=1.3.0=r43ha503ecb_1 + - r-tidyselect=1.2.0=r43hc72bb7e_1 + - r-tinytex=0.45=r43hc72bb7e_1 + - r-urlchecker=1.0.1=r43hc72bb7e_2 + - r-usethis=2.2.2=r43hc72bb7e_0 + - r-utf8=1.2.3=r43h57805ef_1 + - r-uuid=1.1_0=r43h57805ef_2 + - r-vctrs=0.6.3=r43ha503ecb_0 + - r-viridislite=0.4.2=r43hc72bb7e_1 + - r-waldo=0.5.1=r43hc72bb7e_1 + - r-whisker=0.4.1=r43hc72bb7e_1 + - r-withr=2.5.0=r43hc72bb7e_2 + - r-xfun=0.39=r43ha503ecb_1 + - r-xml2=1.3.5=r43h1ad5fc0_0 + - r-xopen=1.0.0=r43hc72bb7e_1005 + - r-xtable=1.8_4=r43hc72bb7e_5 + - r-yaml=2.3.7=r43h57805ef_1 + - r-zip=2.3.0=r43h57805ef_1 + - readline=8.2=h8228510_1 + - referencing=0.29.1=pyhd8ed1ab_0 + - reproc=14.2.4=h0b41bf4_0 + - reproc-cpp=14.2.4=hcb278e6_0 + - requests=2.31.0=pyhd8ed1ab_0 + - rfc3339-validator=0.1.4=pyhd8ed1ab_0 + - rfc3986-validator=0.1.1=pyh9f0ad1d_0 + - rpds-py=0.8.10=py39h9fdd4d6_0 + - rpy2=3.5.11=py39r43h0f8d45d_2 + - ruamel.yaml=0.17.32=py39hd1e30aa_0 + - ruamel.yaml.clib=0.2.7=py39h72bdee0_1 + - ruamel_yaml=0.15.80=py39hb9d737c_1008 + - samtools=1.17=hd87286a_1 + - scandir=1.10.0=py39hb9d737c_6 + - scikit-allel=1.3.6=py39h40cae4c_0 + - scikit-learn=1.3.0=py39hc236052_0 + - scipy=1.11.1=py39h6183b62_0 + - seaborn=0.12.2=hd8ed1ab_0 + - seaborn-base=0.12.2=pyhd8ed1ab_0 + - sed=4.8=he412f7d_0 + - send2trash=1.8.0=pyhd8ed1ab_0 + - seqtk=1.3=he4a0461_6 + - setuptools=59.8.0=py39hf3d152e_1 + - 
simplegeneric=0.8.1=py_1 + - sip=6.7.9=py39h3d6467e_0 + - six=1.16.0=pyh6c4a22f_0 + - sniffio=1.3.0=pyhd8ed1ab_0 + - sortedcontainers=2.4.0=pyhd8ed1ab_0 + - soupsieve=2.3.2.post1=pyhd8ed1ab_0 + - sqlite=3.31.1=h62c20be_1 + - stack_data=0.6.2=pyhd8ed1ab_0 + - statsmodels=0.14.0=py39h0f8d45d_1 + - sysroot_linux-64=2.12=he073ed8_16 + - tabixpp=1.1.0=hce60e53_13 + - tbb=2021.9.0=hf52228f_0 + - tblib=1.7.0=pyhd8ed1ab_0 + - tenacity=8.2.2=pyhd8ed1ab_0 + - terminado=0.15.0=py39hf3d152e_0 + - texlive-core=20230313=hc0e8fe8_4 + - threadpoolctl=3.2.0=pyha21a80b_0 + - tinycss2=1.2.1=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - tktable=2.10=hb7b940f_3 + - toml=0.10.2=pyhd8ed1ab_0 + - tomli=2.0.1=pyhd8ed1ab_0 + - toolz=0.12.0=pyhd8ed1ab_0 + - tornado=6.3.2=py39hd1e30aa_0 + - tqdm=4.46.0=py_0 + - traitlets=5.9.0=pyhd8ed1ab_0 + - typing-extensions=4.7.1=hd8ed1ab_0 + - typing_extensions=4.7.1=pyha770c72_0 + - typing_utils=0.1.0=pyhd8ed1ab_0 + - tzdata=2023c=h71feb2d_0 + - tzlocal=5.0.1=py39hf3d152e_0 + - unicodedata2=15.0.0=py39hb9d737c_0 + - urllib3=2.0.3=pyhd8ed1ab_1 + - vcflib=1.0.9=h146fbdb_1 + - vcftools=0.1.16=pl5321hdcf5f25_9 + - wcwidth=0.2.6=pyhd8ed1ab_0 + - webencodings=0.5.1=py_1 + - websocket-client=1.6.1=pyhd8ed1ab_0 + - wfa2-lib=2.3.3=h4ac6f70_0 + - wheel=0.40.0=pyhd8ed1ab_0 + - xcb-util=0.4.0=hd590300_1 + - xcb-util-image=0.4.0=h8ee46fc_1 + - xcb-util-keysyms=0.4.0=h8ee46fc_1 + - xcb-util-renderutil=0.3.9=hd590300_1 + - xcb-util-wm=0.4.1=h8ee46fc_1 + - xkeyboard-config=2.39=hd590300_0 + - xlrd=2.0.1=pyhd8ed1ab_3 + - xorg-fixesproto=5.0=h7f98852_1002 + - xorg-inputproto=2.3.2=h7f98852_1002 + - xorg-kbproto=1.0.7=h7f98852_1002 + - xorg-libice=1.0.10=h7f98852_0 + - xorg-libsm=1.2.3=hd9c2040_1000 + - xorg-libx11=1.8.6=h8ee46fc_0 + - xorg-libxau=1.0.11=hd590300_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xorg-libxext=1.3.4=h0b41bf4_2 + - xorg-libxfixes=5.0.3=h7f98852_1004 + - xorg-libxi=1.7.10=h7f98852_0 + - xorg-libxrender=0.9.11=hd590300_0 + - xorg-libxt=1.3.0=hd590300_0 + - 
xorg-libxtst=1.2.3=h7f98852_1002 + - xorg-recordproto=1.14.2=h7f98852_1002 + - xorg-renderproto=0.11.1=h7f98852_1002 + - xorg-xextproto=7.3.0=h0b41bf4_1003 + - xorg-xf86vidmodeproto=2.3.1=h7f98852_1002 + - xorg-xproto=7.0.31=h7f98852_1007 + - xz=5.2.6=h166bdaf_0 + - yaml=0.2.5=h7f98852_2 + - yaml-cpp=0.7.0=h27087fc_2 + - zarr=2.15.0=pyhd8ed1ab_0 + - zeromq=4.3.4=h9c3ff4c_1 + - zict=3.0.0=pyhd8ed1ab_0 + - zipp=3.16.0=pyhd8ed1ab_1 + - zlib=1.2.13=hd590300_5 + - zstandard=0.19.0=py39h6e5214e_2 + - zstd=1.5.2=hfc55251_7 + - pip: + - about-time==4.2.1 + - alive-progress==3.1.4 + - argparse==1.4.0 + - docutils==0.20.1 + - grapheme==0.6.0 + - mipscripts==0.3.4 + - statistics==1.0.3.5 +prefix: /opt/conda diff --git a/programs/README.md b/programs/README.md new file mode 100644 index 0000000..9927d89 --- /dev/null +++ b/programs/README.md @@ -0,0 +1,8 @@ +# Programs + +This folder can be used to install custom software into the MIPTools container. +By placing software into this folder, the software will be built into the +container. Currently only the +[`bcl2fastq`](https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html) +software, which is used for demultiplexing BCL files, can be installed. Consult +the documentation for additional instruction. diff --git a/programs/lastz/LICENSE b/programs/lastz/LICENSE deleted file mode 100644 index d6587b5..0000000 --- a/programs/lastz/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2007-2017 Robert S. 
Harris - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/programs/lastz/Makefile b/programs/lastz/Makefile deleted file mode 100644 index e2f389e..0000000 --- a/programs/lastz/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -include make-include.mak - -default: build_lastz - -lastz_32: build_lastz_32 - -#--------- -# builds/installation -#--------- - -build: build_lastz - -build_lastz: - cd src && ${MAKE} lastz lastz_D - -build_lastz_32: - cd src && ${MAKE} lastz_32 - -build_test_version: - cd src && ${MAKE} lastz-test lastz_D-test - -clean: - cd src && ${MAKE} clean - -install: install_lastz - -install_lastz: - cd src && ${MAKE} install - -install_32: - cd src && ${MAKE} install_32 - -install_test_version: - cd src && ${MAKE} install_test_version - -#--------- -# testing -# -# Small tests to give some comfort level that the program has built properly, -# or that changes you've made to the source code haven't broken it. 
The -# results should be of this form: -# SUCCESS: ../test_data/xxx and ../test_results/yyy are equivalent -#--------- - -test: - cd src && ${MAKE} test - diff --git a/programs/lastz/README.lastz.html b/programs/lastz/README.lastz.html deleted file mode 100644 index 415728e..0000000 --- a/programs/lastz/README.lastz.html +++ /dev/null @@ -1,7760 +0,0 @@ - - - -LASTZ - - - - - - -

-

LASTZ   Release 1.04.00, - built March 12, 2017

- -TABLE OF CONTENTS - -

-

-

- - - - - - -


-
-

Introduction

- -

-This document describes installation and usage of the LASTZ sequence alignment -program. LASTZ is a drop-in replacement for -BLASTZ, and is backward -compatible with BLASTZ’s command-line syntax. That is, it supports -all of BLASTZ’s options but also has additional ones, and may produce -slightly different alignment results. - -

- - - - - - - - - - - -
LASTZ:A tool for (1) aligning two DNA sequences, and -(2) inferring appropriate scoring parameters automatically. -
Platform:This package was developed on a Macintosh OS X system, but should work on -other Unix or Linux platforms with little change (if any). LASTZ is written in -C and compiled with gcc; other C compilers can probably be used by adjusting -the Makefile. Some ancillary tools are written in Python, but only use modules -available in typical python installations. -
Author:Bob Harris  <rsharris at bx dot psu dot edu> -
Date:March 2017 -
Mailing list: -http://lists.bx.psu.edu/listinfo/lastz-users -
- - - - - - -


-
-

Availability

- -

-LASTZ is available from github at -https://github.com/lastz/lastz. - -

-A packed archive containing source code for LASTZ is available from the -Miller Lab at Penn State. - - - - - - -


-
-

Installation

- -

-If you have received the distribution as a packed archive, unpack it -by whatever means are appropriate for your computer. The result should be -a directory <somepath>/lastz‑distrib‑X.XX.XX that contains -a src subdirectory (and some others). You may find it convenient -to remove the revision number (‑X.XX.XX) from the directory name. - -

-Before building or installing any of the programs, you will need to tell the -installer where to put the executable, either by setting the shell variable -$LASTZ_INSTALL, or by editing the make‑include.mak -file to set the definition of installDir. Also, be sure to add -the directory you choose to your $PATH. - -

-Then to build the LASTZ executable, enter the following commands from bash -or a similar command-line shell (Solaris users should substitute -gmake for make). This will build two executables -(lastz and lastz_D) and copy them into your -installDir. -

-    cd <somepath>/lastz-distrib-X.XX.XX/src
-    make
-    make install
-
-The two executables are basically the same program; the only difference is -that lastz uses integer scores, while lastz_D uses -floating-point scores. - -

-The build process should not report any warnings or errors. Because of this, -the Makefile is set up so that warnings are considered errors and will stop the -build. If you encounter this situation, you can modify the Makefile, removing -"-Werror" from the variable definedForAll. This should allow the build to -complete, while still reporting the warnings. You'll need to decide whether -the warnings indicate something is really wrong. Usually they don't, but please -report them to the author regardless. - -

-A simple self test is included so you can test whether the build succeeded. -To run it, enter the following command: -

-    make test
-
-If the test is successful, you will see no output from this command. -Otherwise, you will see the differences between the expected output and the -output of your build, plus a line that looks like this: -
-    make: *** [test] Error 1
-
- - - - - -
-
-

Build Options

- -

-An additional executable (lastz_32) can be built, to handle -genomes larger than 2 gigabases. For details, see the section on -aligning to whole genomes. - -

-Any executable can be built to allow adjacent indels (by default, these are -not allowed). For details, see the section on -adjacent indels. - - - - - -


-
-

Overview of Processing Stages and Terminology

- -

-LASTZ is designed to preprocess one sequence or set of sequences (which we -collectively call the target) and then align several -query sequences to it. The general flow of the program is like a -pipeline: the output of one stage is the input to the next. The user can -choose to skip most stages via command-line options; any stages that are -skipped pass their input along to the next stage unchanged. Two of the stages, -scoring inference and interpolation, are special in that they perform a -miniature version of the pipeline within them. - -

-Note that the following discussion is a generalization, intended to describe -the basic idea of LASTZ’s operation. There are many exceptions that -depend on the particular options specified. - -

-The stages are: -

- -

-The usual flow is as follows (though most of these steps are optional, -and some settings like ‑‑anyornone -may affect the processing order). -We first read the target sequence(s) into memory, and use that to build a seed -word position table that will allow us to quickly map any word in the target to -all of the positions where it appears. (For the purposes of this discussion -you can think of a word as a 12-mer of DNA.) Then we read each -query sequence in turn, processing them more or less independently. We examine -the word starting at each base in the query and use the position table to find -matches, called seeds, in the target. The seeds are extended to -longer matches called HSPs (high-scoring segment pairs) and filtered -based on score. The HSPs are chained into the highest-scoring set of syntenic -alignments, and then reduced to single locations called anchors. -The anchors are then extended to local alignments (which may contain -gaps) and again filtered by score, followed by back-end filtering to discard -alignment blocks that do not meet specified criteria for certain traits. We -then interpolate, repeating the entire process at a higher sensitivity in the -holes between the alignment blocks. And finally, we write out the alignment -information to a file. Then these steps are repeated with the reverse -complement of the query sequence, before moving on to the next sequence in the -query file. - -

-The scoring inference stage is not usually performed. Typically it is used -only when sequences for new species are acquired, to create scoring files for -subsequent alignments of those species. - - - - - - -


-
-

Examples

- -

-For those eager to try it out, here are some illustrative examples to get you -started. Detailed reference material begins with the -next section. - - - -

-

Comparing a Human Chromosome and a Chicken Chromosome

- -

-It is often adequate to use a lower sensitivity level than is achieved with -LASTZ’s defaults. For example, to compare two complete chromosomes, even -for species as distant as human and chicken, the alignment landscape is evident -even at very low sensitivity settings. This can speed up the alignment process -considerably. - -

-This example compares human chromosome 4 to chicken chromosome 4. These -sequences can be found in the downloads section of the -UCSC Genome Browser, and are 191 and 94 -megabases long, respectively. To run a quick low-sensitivity alignment of -these sequences, use a command like this: -

-    lastz hg18.chr4.fa galGal3.chr4.fa \
-      --notransition --step=20 --nogapped \
-      --format=maf > hg18_4_vs_galGal3_4.maf
-
- -

-This runs in about two and a half minutes on a 2-GHz workstation, requiring -only 400 Mb of RAM. Figure 1(a) shows the results, plotted using the -‑‑format=rdotplot output option and -the R statistical package. -(When in MAF format, LASTZ output can be browsed with -the GMAJ interactive viewer for multiple alignments, available from the -Miller Lab at Penn State.) - -

-Using ‑‑notransition lowers -seeding sensitivity and reduces runtime (by a factor of about 10 in this case). -‑‑step=20 also lowers seeding -sensitivity, reducing runtime and also reducing memory consumption (by a factor -of about 3.3 in this case). -‑‑nogapped eliminates the -computation of gapped alignments. The complete alignment process using default -settings (shown in Figure 1(b)) uses 1.3 Gb of RAM and takes 4.5 hours on a -machine running at 2.83 GHz. - -

- - -
-Figure 1(a) -

-human vs. chicken: low sensitivity -

-
-
-lastz \
-  hg18.chr4.fa galGal3.chr4.fa \
-  --notransition --step=20 \
-  --nogapped
-
-
-

-Figure 1(b) -

-human vs. chicken: defaults -

-
-
-lastz \
-  hg18.chr4.fa galGal3.chr4.fa
-
-
-

- - - -

-

Aligning Shotgun Reads to a Human Chromosome

- -

-Short read mapping for close species requires parameters very different from -LASTZ’s defaults. This example compares a simulated set of primate shotgun -reads to human chromosome 21. The chromosome can be found in the downloads -section of the UCSC Genome Browser -(it is about 47 megabases). Ten thousand simulated reads were generated by -extracting 60-bp intervals from chimp chr21, subjecting them to mild mutation -(including short gaps), and then truncating them to 50 bp (these are included -in the LASTZ distribution, in test_data/fake_chimp_reads.2bit). - -

-To see where these reads map onto the human chromosome, use this command: -

-    lastz hg18.chr21.fa[unmask] fake_chimp_reads.2bit \
-      --step=10 --seed=match12 --notransition --exact=20 --noytrim \
-      --match=1,5 --ambiguous=n \
-      --filter=coverage:90 --filter=identity:95 \
-      --format=general:name1,start1,length1,name2,strand2 \
-      > hg18_21_vs_reads.dat
-
- -

-Attaching [unmask] to the chromosome -filename instructs LASTZ to ignore masking information and treat repeats the -same as any other part of the chromosome, in order to accurately assess the -uniqueness of the read mappings. Since we know the two species are close, we -want to reduce sensitivity. Using -‑‑step=10, we will only be looking for -seeds at every 10th base. Instead of the default seed pattern, we use -‑‑seed=match12 and -‑‑notransition so our -seeds will be exact matches of 12 bases. Instead of the default -x-drop extension method we use -‑‑exact=20 so that a 20-base -exact match is required to qualify as an HSP. Because we are aligning short -reads, we specify -‑‑noytrim so the alignment ends will -not be trimmed back to the highest scoring locations during gapped extension. - -

-We replace the default score set, which is for more distant species, with the -stricter ‑‑match=1,5. This scores -matching bases as +1 and mismatches as −5. We also use -‑‑ambiguous=n so that Ns -will be scored appropriately. -We are only interested in alignments that involve nearly an entire read, and -since the species are close we don't want alignments with low identity; -therefore we use ‑‑filter=coverage:90 and -‑‑filter=identity:95. - -

-For output, we are only interested in where the reads align, so we use the -‑‑format=general option and specify -that we want the position on the chromosome (name1, -start1, length1) and the read name and orientation -(name2, strand2). This creates a tab-delimited -output file with one line per alignment block, a format that is well-suited for -downstream processing by other programs. For example, to count the number of -different reads we've mapped, we can run this Unix shell command: -

-    cat hg18_21_vs_reads.dat | grep -v "#" | awk '{print $4}' | sort -u | wc
-
- - - - -
-

Seeds, HSPs, Gapped Alignments, Chaining

- -

-This example demonstrates the primary -alignment processing stages, using the -α-globin regions of cow and human. This data is included in the LASTZ -distribution in test_data/aglobin.2bit, and consists of a 70K bp -segment of human DNA and a 66K bp segment of cow DNA. We will follow this -example through the major stages of seeding, gap-free extension, chaining, and -gapped extension. - -

-Figure 2(a) shows the result of default seeding on a small window (3K bp) in the -middle of these segments. Seeds are short near-matches; in this case each seed -is 19 bp and could have as many as 8 mismatches (12-of-19 with one transition). -There are 338 seeds in this window, but regions where there are many seeds are -indistinguishable from line segments. - -

-Figure 2(b) shows high-scoring segment pairs, the result of gap-free extension -of the seeds. There are 11 HSPs (only 10 are apparent in the figure, but one -of those is split by a 1-bp shift to the next diagonal). Note that many seeds -were discarded because their extensions were low scoring or overlapped. - -

-Figure 2(c) shows the local alignment blocks resulting from gapped extension of -the HSPs. There are four alignment blocks. - -

-Then we zoom out and show the results for the full sequences; the red box -indicates the small region shown in the earlier figures. Figure 2(d) shows -the HSPs, 2(e) shows the gapped alignment blocks, and 2(f) illustrates how -chaining reduces the alignment blocks to a single syntenic line (or two lines, -if there were matches on both strands). Note that one can already tell -quite a bit about how the sequences align just from looking at the HSPs. - -

- - - - -
-Figure 2(a) -

-alpha-globin: seeds (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --nogfextend --nochain --nogapped
-
-
-

-Figure 2(b) -

-alpha-globin: HSPs (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --gfextend --nochain --nogapped
-
-
-

-Figure 2(c) -

-alpha-globin: gapped blocks (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --gfextend --nochain --gapped
-
-
-

-Figure 2(d) -

-alpha-globin: HSPs -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --nochain --nogapped
-
-
-

-Figure 2(e) -

-alpha-globin: gapped blocks -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --nochain --gapped
-
-
-

-Figure 2(f) -

-alpha-globin: gapped blocks with chaining -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --chain --gapped
-
-
-

- - - -

-

Aligning a Sequence With Itself

- -

-When a sequence is aligned to itself, the full result will contain mirror-image -copies of each alignment block. It is computationally wasteful to process both -copies. LASTZ can handle this situation in four different ways. -

    -
  1. Simply give LASTZ the same sequence for both the -target and query. In this case, LASTZ does not know that -it is aligning a sequence to itself, and performs the full computation on both -copies (Figure 3(a)). -

    -

  2. Specify the ‑‑notrivial -option. This performs the full computation on both copies, but doesn't report -the trivial self-alignment block along the main diagonal (Figure 3(b)). -

    -

  3. Specify the ‑‑self option in place -of the query sequence. LASTZ will save work by computing with only one block -of each mirror-image pair, though it still reports both copies in the output by -reconstructing the second copy from the first. It also invokes -‑‑notrivial automatically to omit the trivial self-alignment block -along the main diagonal. This gives the same output as the previous method, -but runs faster (Figure 3(c)). -

    -

  4. Specify ‑‑self in place of the -query, and also add the ‑‑nomirror -option. In this case LASTZ reports only one copy of each mirror-image pair, -as well as omitting the trivial block (Figure 3(d)). -
- -

-In the following figure, we suppose we have a sequence with repeated motifs, -in the order -α1 β1 γ1 β2 δ1 α2 δ2′ γ2. -That is, α1 and α2 are ancient duplications, as are β1 and -β2, and γ1 and γ2.  δ2′ is an inversion, a -reverse-complement duplicate of δ1. - -

- - - -
-Figure 3(a) -

-rearranged sequence: vs. itself, default options -

-
-
-lastz target target
-
-
-

-Figure 3(b) -

-rearranged sequence: vs. itself, --notrivial -

-
-
-lastz target target --notrivial
-
-
-

-Figure 3(c) -

-rearranged sequence: --self -

-
-
-lastz target --self
-
-
-

-Figure 3(d) -

-rearranged sequence: --self --nomirror -

-
-
-lastz target --self --nomirror
-
-
-

- - - - - - -


-
-

Command-line Syntax

- -

-If you are familiar with BLASTZ, you can run LASTZ the same way you ran BLASTZ, -with the same options and input files. In addition to this BLASTZ compatibility, -LASTZ provides other options. - -

-The general format of the LASTZ command line is -

-    lastz <target> [<query>] [<options>]
-
- -

-The angle brackets <> indicate meta-syntactic variables that -should be replaced with your values, while the square ones [] -indicate elements that are optional. Spaces separate fields on the command -line; a field that needs to contain a space (e.g. within a file name) must be -enclosed in double quotes "". Elements can appear in -any order, the only constraint being that, if present, the -<query> must appear after the <target>. -Output is generally written to stdout, unless specified otherwise -for a particular option. - -

-

-The <target> and <query> are usually -just the names of files containing the sequences to be aligned, in either -FASTA, Nib, -or 2Bit format. However they can be -HSX index files that refer to the sequences indirectly, -and they also can specify pre-processing actions such as selecting a -subsequence from the file (see Sequence Specifiers for -details). With certain options such as -‑‑self the <query> -is not needed; otherwise if it is left unspecified the query sequences are read -from stdin -(though this does not work with random-access formats -like 2Bit). -As a special case, the <target> is -omitted when the ‑‑targetcapsule -option is used, since the target sequence is embedded within the capsule file. - -

-For options, the general format is ‑‑<keyword> or -‑‑<keyword>=<value>, but for BLASTZ compatibility -some options also have an alternative syntax -<letter>=<number>. -(Be careful when copying options from the tables below, as some of the hyphens -here are special characters to avoid awkward line wrapping in certain web -browsers. If you have trouble, replace the pasted hyphens with ordinary typed -ones on your command line.) - -

-Please understand that LASTZ is a complex program and its options are not all -independent, i.e., some options are not valid in combination with certain -others. It would be difficult and cumbersome to attempt to list every possible -conflict here; instead we just mention some of the major ones. If you are not -sure about a particular combination, go ahead and try it — LASTZ will -tell you if it’s not allowed. - -

-Running the command lastz without any arguments prints a help -message with the most commonly used options, while running -

-    lastz --help
-
-lists all of the options. - - - -
-

Where to Look

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--strand=bothB=2 -Search both strands. -
--strand=plusB=0 -Search the forward strand only (the one corresponding to the query specifier). -
--strand=minus -Search only the reverse complement of the query specifier. -
--self -Perform a self-alignment: the target sequence is also the query. -Computation is more efficient than it would be without this option, since only -one of each mirror-image pair of alignment blocks is processed (the other, -redundant one is skipped during processing, but re-created in the output). -Also, the trivial self-alignment block along the main diagonal is omitted from -the output. -This option cannot be used if the target is comprised of multiple sequences. -
--nomirror -Inhibit the re-creation of mirror-image alignments. Output consists of only -one copy of each meaningful alignment block in a self-alignment. This option -is only applicable when the ‑‑self -option is used. -
--queryhsplimit=<n> -Discard queries that have more than <n> HSPs. Any queries -that exceed this limit are reported as a warning (to stderr), and -no alignments are reported. -

-This is useful for mapping reads to a reference genome, when some reads align -to too many places in the reference. -

--queryhsplimit=nowarn:<n> -Same as ‑‑queryhsplimit=<n> but warnings for queries that -exceed the limit are withheld. -
--queryhsplimit=keep,nowarn:<n> -Same as ‑‑queryhsplimit=<n> but queries that exceed the -limit are not discarded and warnings are withheld. For such a query, the first -<n> HSPs found are passed along to downstream processing. -

-Note that the HSPs reported are not the best <n> HSPs. They -are simply the first <n> found; they very likely have a -positional bias. -

--queryhspbest=<n> -For queries that have more than <n> HSPs, discard any HSPs -that score below the nth best. -

-This is useful for mapping reads to a reference genome, when some reads align -to too many places in the reference. -

--querydepth=<n> -Stop processing gapped alignments for a query/strand if its ratio of aligned -bases to query length exceeds <n>. A warning is written to -stderr, all alignments for the query/strand are discarded, and processing -continues with the next query (or strand). -

-‑‑querydepth=keep:<n> can be used if the preference is to -keep some alignments for such query/strands. -

-<n> is a real number and corresponds to a depth of coverage -threshold. For example, a value of 5.0 would cause termination -once a query/strand has an average of five alignments for every base in the -query. The numerator is the number of matches or substitutions (but not gaps); -the denominator is the length of the query sequence. -

-The purpose of this option is one of saving time. It is useful for -automatically terminating the processing of queries with high repeat content, -for which other methods of dealing with repetitive content fail. -

-Moreover, back-end filtering options are -not considered. In other words, matches are counted for any alignment -that meets the scoring threshold, regardless of whether that alignment would be -reported. The justification is that we are trying to abort the processing of -queries that have too many bounding alignments in the DP matrix, and back-end -filtering occurs later in the process. -

--querydepth=keep:<n> -Same as ‑‑querydepth=<n> but any alignments discovered for -this query/strand, before it exceeds the threshold, are reported. -

-Note that the alignments reported are not guaranteed to be the highest scoring -alignments that would achieve the threshold. They are simply the first -alignments found. In other words, the purpose of this option is one of saving -time, not one of finding optimal alignments. -

--querydepth=nowarn:<n> -Same as ‑‑querydepth=<n> but warnings for queries that -exceed the limit are withheld. -
--querydepth=keep,nowarn:<n> -Same as ‑‑querydepth=<n> but any alignments discovered for -this query/strand, before it exceeds the threshold, are reported and warnings -are withheld. -
--anyornone -Stop processing after the first qualifying alignment has been found and written -to the output, and move on to the next query. "Qualifying" means an alignment -that meets all of the thresholds, etc. set by other options as usual. See -Any-or-None Alignment for more details. -This option is not compatible with chaining -or interpolation. -
Defaults: -By default both strands are searched, and the target is assumed to be different -from the query. -

-If ‑‑self is used, the default is to -re-create the redundant mirror-image alignment blocks in the output. -

- - - -

-

Scoring

-

-These are fundamental parameters for alignment scoring, used in several of the -stages. -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--scores=<scoring_file>Q=<file> -Read the substitution scores and gap penalties (and possibly other options) -from a scoring file. This option cannot be used in -conjunction with ‑‑match or -inference. -
--match=<reward>[,<penalty>] -Set the score values for a match (+<reward>) -and mismatch (−<penalty>). -These are both specified as positive values; the "+" and "−" are -implicitly assumed. When <penalty> is not specified, -it is the same as <reward>. - -

-Note that specifying ‑‑match changes the defaults for some of -the other options (e.g. the scoring penalties for gaps, and various extension -thresholds), as described in their respective sections. The regular defaults -are chosen for compatibility with BLASTZ, but since BLASTZ doesn't support -‑‑match, LASTZ infers that you are not expecting BLASTZ -compatibility for this run, so it is free to use improved defaults. -

-This option cannot be used in conjunction with -‑‑scores or -inference. -

--gap=[<open>,]<extend>O=<open>
- E=<extend>
-Set the score penalties for opening and extending a gap. These are specified -as positive values; subtraction is implicitly assumed. Note that the first -base in a gap incurs the sum of both penalties. -

-This option is only valid if gapped extension is -being performed, and cannot be used in conjunction with -inference. These values specified on -the command line override any corresponding values from a file provided with -‑‑scores. -

--ambiguous=n[,<reward>][,<penalty>] -Treat each N in the input sequences as an ambiguous nucleotide. -Substitutions with N are scored as zero, instead of using the -fill_score value from the scoring file -(which is -100 by default). -

-A <penalty> can be specified, which will apply to any -non-match substitution involving an N. If a -<reward> is also specified, it applies to an N versus N -match (otherwise, these matches are scored as zero). Note that the -<penalty> is negated in the scoring matrix, while the -<reward> is not. -

-See -Non-ACGT Characters for a more thorough discussion. -This option is not valid with quantum DNA. -

-Prior to version 1.02.20, this option was incorrectly implemented, and the fix -has caused a change in behavior, and reported alignments, when -penalty is not specified. See the -change history item for details on how to -maintain compatibility with the earlier version, if that is desired. -

--ambiguous=iupac[,<reward>][,<penalty>] -Treat each of the IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, -W, and Y, as well as N) in the input sequences -as a completely ambiguous nucleotide. Substitutions with these -characters are scored as zero, instead of using the fill_score -value from the scoring file (which is -100 by -default). -

-A <penalty> can be specified, which will apply to any -non-match substitution involving an ambiguous nucleotide. If a -<reward> is also specified, it applies to a match involving -ambiguous nucleotides (otherwise, these matches are scored as zero). Note that -the <penalty> is negated in the scoring matrix, while the -<reward> is not. -

-See Non-ACGT Characters for a more thorough -discussion. This option is not valid with quantum DNA. -

-Note that this does not mean that LASTZ considers the specific -ambiguity that is associated with each character (e.g. that R -would be considered a match to an A or G but not to -a C or T). Instead, they are all scored as if they -were an N. -

-Prior to version 1.02.20, this option was incorrectly implemented, and the fix -has caused a change in behavior, and reported alignments, when -penalty is not specified. See the -change history item for details on how to -maintain compatibility with the earlier version, if that is desired. -

--infer[=<control_file>] -Infer substitution scores and/or gap penalties from the sequences, then use -them to align the sequences. Parameters controlling the inference process are -read from the control file. -This feature is somewhat experimental, and special builds of LASTZ are required -to enable it. Please see Inferring Score Sets for -more information. Inference cannot be used in conjunction with -‑‑scores, -‑‑match, or -‑‑gap. -
--inferonly[=<control_file>] -Infer substitution scores and/or gap penalties, but don't perform the final -alignment (requires ‑‑infscores). -
--infscores[=<output_file>] -Save the inferred scoring parameters to the specified file (or to -stdout), in the same format expected -by ‑‑scores. -
Defaults: -By default the HOXD70 substitution scores are used -(see [Chiaromonte 2002] for an explanation of -how this scoring matrix was determined). -

-
- - - - - - -
 ACGT
A91‑114‑31‑123
C‑114100‑125‑31
G‑31‑125100‑114
T‑123‑31‑11491
-
-

-Default gap penalties are determined as follows. If -‑‑match is -specified, the open penalty is 3.25 times the mismatch penalty, and the extend -penalty is 0.24375 times the mismatch penalty. (These are the same ratios as -BLASTZ’s defaults.) Both penalties are rounded up to the nearest integer. -Otherwise, the gap penalties are 400 for open, 30 for extend. -

-By default, a run of Ns serves as an old-style separator between -shotgun reads or other spliced sequences, rather than indicating ambiguous -nucleotides. This is solely a consequence of the steep -fill_score handicap imposed for -substitutions with N — LASTZ doesn't normally search for runs -of Ns to treat specially (however, the -separator=N action can be -used to accomplish that, and is preferred if Ns are intended to be -separators). -

- - - -

-

Indexing

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--step=<offset>Z=<offset> -Offset between the starting positions of successive target words considered for -potential seeds. (But this does not apply to the query words, which always use -a step size of 1.) -
--maxwordcount=<limit> -Words occurring more often than <limit> in the target are -not eligible for seeds. Specifically, after the target seed word position table -is built, any words exceeding this count are removed from the table. -
--maxwordcount=<limit>% -Set maxwordcount to keep a specified percentage of seed word -positions. <limit> is a lower bound on the percentage of -words to be kept (0 < limit < 100). -

-Setting this as a percentage makes it easier to maintain consistency across -runs. The actual count is dependent on sequence length and composition as -well as the step offset and seed pattern. For example, Figure 4 -shows the variation among human chromosomes in hg18 for -‑‑seed=match13, ‑‑step=15, and -‑‑maxwordcount=90%. The gray bars show the percentage of -seed word positions kept (the red line shows the ideal 90%). The blue numbers -show the equivalent count, which varies greatly. -

-Figure 4 -

- -word count rate per chromosome -

--masking=<count>M=<count> -Dynamically mask the target sequence by excluding any positions that appear -in too many alignments from further consideration for seeds. -

-Specifically, a cumulative count is maintained of the number of times each -target location is aligned. After each query sequence -and strand is processed, any locations that have been output in at least -<count> alignment blocks are masked, so they will be -excluded from the seeding stage for subsequent query sequences. -Since repetition discovered while processing one sequence strand is only masked -for subsequent sequence strands, this option has no effect on the first strand -of the first sequence in the query file. -

-This option requires one, two, or four bytes of memory for each target location, -depending on <count>. If <count> is 254 -or less, one byte is used; if it is 65,534 or less, two bytes are used; otherwise, four bytes are used. -

-The resulting masked intervals can be written to a file with the -‑‑outputmasking=<file> -option. -

--targetcapsule=<capsule_file> -The target seed word position table and seed (as well as the target sequence) -are read from the specified file. When this option -is used, the normal target specifier is omitted from the command line, and the -following options are not allowed: -‑‑step, -‑‑maxwordcount, -‑‑masking, -‑‑seed, -‑‑word. -
--chores=<chores_file> -Restrict alignment to a list of subintervals. The file -describes a list of sequence interval pairs, indicating that the alignment -process is to be restricted to those intervals. -

-See Aligning Many Subintervals for advice -on when to use this option. - -

--segments=<segment_file> -Read anchor segments from a file, instead of discovering -them via seeding. -This replaces any other options related to indexing, seeding, gap-free -extension or chaining. Those stages are skipped, and processing begins with -the gapped extension stage. -

-See Aligning Many Subintervals for advice -on when to use this option. -

Defaults: -By default a step of 1 is used, no words are removed from the target seed word -position table, dynamic masking is not performed, and no target capsule or -segment file is used. -
- - - -

-

Seeding

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--seed=12of19T=1 or T=2 -Seeds require a 19-bp word with matches in 12 specific positions -(1110100110010101111). -
--seed=14of22T=3 or T=4 -Seeds require a 22-bp word with matches in 14 specific positions -(1110101100110010101111). -
--seed=match<length>W=<length> -Seeds require a <length>-bp word with matches in all -positions. -
--seed=half<length> -Seeds require a <length>-bp word with matches or transitions -in all positions. This option is not valid with -quantum DNA. -
--seed=<pattern> -Specifies an arbitrary pattern of 1s, 0s, and -Ts for seed discovery. (Note that Ts are not valid -with quantum DNA.) -
--transitionT=1 or T=3 -In each seed, allow any one match position to be a transition instead. -This option is not valid with quantum DNA. -
--transition=2 -In each seed, allow any two match positions to be transitions instead. -This option is not valid with quantum DNA. -
--notransitionT=2 or T=4 -Don't allow any match positions in seeds to be satisfied by transitions. -
--filter=[<transv>,]<matches> -Filter the resulting seeds, requiring at least <matches> -exact matches and allowing no more than <transv> -transversions. If <transv> is not specified, any number -of transversions is allowed (they are not limited). -This option is not valid with quantum DNA. -
--nofilter -Don't filter seeds. -
---ball=<score> -Set the quantum seeding threshold, the minimum -score required of a DNA word to be included in the seeding ball. -
---ball=<percentage>% -Set the quantum seeding threshold as a percentage of the maximum word score -possible. -
--twins=[<minsep>..]<maxsep> -Require two nearby seeds on the same diagonal, separated by a number of bases -in the given range. See the Seed Patterns section -for more information. This option cannot be used in conjunction with -‑‑recoverseeds. -
--notwins -Allow single, isolated seeds. -
--recoverseeds -Avoid losing seeds in hash collisions. This will slow the alignment process -considerably and cost more memory, and usually does not improve the results -significantly. See the Gap-free Extension stage -for more information. This option cannot be used in conjunction with -‑‑twins. -
--norecoverseeds -Ignore hash collisions, at the expense of missing some seeds. Note that -missing seeds usually does not mean missing alignments, since most alignable -regions have many seed hits. -
Defaults: -By default the 12-of-19 seed is used, one transition is allowed (except with -quantum DNA), the hits are not filtered, twins are not -required, and hash collisions are not recovered. -

-If the quantum action is used in the -query file’s sequence specifier, the default ball -scoring threshold is 75% of the maximum word score possible. -

- - - -

-

Finding HSPs (Gap-free Extension)

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--gfextend -Perform gap-free extension of seeds to HSPs (high scoring segment pairs), -according to the other options in this section. -
--nogfextend -Skip the gap-free extension stage, passing the seeds along to the next -specified stage. -

-It is not recommended to use --nogfextend without also using ---nogapped. -

--exact=<length> -Find HSPs using the exact match extension method with the given length -threshold, instead of using the x-drop method. -
--mismatch=<count>,<length> -Find HSPs using the mismatch extension method with the given length threshold -and allowing count mismatches, instead of using the x-drop method. -

-count is limited to the range 1≤count≤50. -

--xdrop=<dropoff>X=<dropoff> -Find HSPs using the x-drop extension method with the given termination -threshold, instead of using the exact match method. The dropoff setting -determines the endpoints of each gap-free segment: the extension of each seed -is stopped when its cumulative score drops off by more than the given -threshold from the maximum seen so far. See the -Gap-free Extension stage for more details. -
--hspthresh=<score>K=<score> -Set the score threshold for the x-drop extension method; HSPs scoring lower -are discarded. -
--hspthresh=top<basecount> -Set an adaptive score threshold for the x-drop -extension method; HSPs scoring lower are discarded. The score threshold is -chosen to limit the number of target sequence bases in HSPs to about -<basecount> -(or possibly a little higher in case of ties, etc.). -
--hspthresh=top<percentage>% -Set an adaptive score threshold for the x-drop -extension method; HSPs scoring lower are discarded. The score threshold is -chosen to limit the number of target sequence bases in HSPs to about -<percentage> percent of the target (or possibly a little -higher in case of ties, etc.). -
--entropyP=1 -Adjust for entropy when qualifying HSPs in the x-drop extension method. -Those that score just slightly above the HSP threshold are adjusted downward -according to the entropy of their nucleotides, and any that then fall below -the threshold are discarded. -
--entropy=reportP=2 -Adjust for entropy when qualifying HSPs in the x-drop extension method, -and report (to stderr) any HSPs that are discarded as a result. -
--noentropyP=0 -Don't adjust for entropy when qualifying HSPs. -
Defaults: -By default seeds are extended to HSPs using x-drop extension, with entropy -adjustment. -

-If ‑‑match scoring is used, the -default x-drop termination threshold is 10 times the square root of the -mismatch penalty, rounded up to the nearest integer. Otherwise the default -is 10 times the A-vs.-A substitution score. -

-If ‑‑match scoring is used, the -default HSP score threshold is 30 times the match reward (equivalent to the -score of a 30-bp exact match). Otherwise the default is 3000. -

-‑‑help=defaults can be used -to see what values are set. -

- - - -

-

Chaining

-

- - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--chainC=1 or C=2 -Perform chaining of HSPs -with no penalties. -
--chain=<diag>,<anti>C=1 or C=2
- G=<diag>
- R=<anti>
-Perform chaining with the given penalties for diagonal and anti-diagonal in the -DP matrix. These are specified as positive values; -subtraction from the score is implicitly assumed. -
--nochainC=0 or C=3 -Skip the chaining stage. -
Defaults: -By default the chaining stage is skipped. -
- - - -

-

Gapped Extension

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--gappedC=0 or C=2 -Perform gapped extension of HSPs (or seeds, if gap-free extension is not -performed), after first reducing them to anchor points. -
--nogappedC=1 or C=3 -Skip the gapped extension stage. (This means that -interpolation must also be skipped, since -it is not allowed without gapped extension.) -
--ydrop=<dropoff>Y=<dropoff> -Set the threshold for terminating gapped extension; this restricts the -endpoints of each local alignment by limiting the local region around each -anchor in which extension is performed. The boundary of this region in the -DP matrix is formed by the points where the cumulative -score has dropped off by more than the given threshold from the maximum seen -so far. See the Gapped Extension stage for more -details. -
--noytrim -If y-drop extension encounters the end of the sequence, extend the alignment -to the end of the sequence rather than trimming it back to the location giving -the maximum score. This is highly recommended when either the target or query -sequences are short reads (say, less than 100 bases), to prevent -y-drop mismatch shadow. -
--gappedthresh=<score>L=<score> -Set the threshold for gapped extension; alignments scoring lower than -<score> are discarded. -When used along with the x-drop method for gap-free extension, this value is -generally set at least as high as the HSP threshold. Setting it lower has no -effect, since at worst the HSP itself would always qualify (both extension -stages use the same scoring matrix). -
--allgappedbounds -Revert to handling bounding alignments the way they were handled in BLASTZ. -This is discussed in -Bounding Alignments in the DP Matrix. -
Defaults: -By default gapped extension is performed, and alignment ends are trimmed -to the locations giving the maximum score. -

-If ‑‑match scoring is used, the -default y-drop threshold is twice the x-drop threshold (or if x-drop extension -was not performed, twice what the default x-drop threshold would have been); -otherwise it is the score of a 300-bp gap. -

-The default for the gapped score threshold is to use the same value as the -HSP threshold (which is settable via -‑‑hspthresh). If the HSP -threshold was adaptive, then the lowest-scoring -HSP that was kept is used for this default. If x-drop extension was not -performed, the value used is whatever the default HSP threshold would have been. -

-‑‑help=defaults can be used -to see what values are set. -

- - - -

-

Back-end Filtering

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--filter=identity:<min>[..<max>] -Filter alignments by their percent identity, -0 ≤ min ≤ max ≤ 100 percent. -Identity is the percentage of aligned bases that -are matches. Alignment blocks outside the given range are discarded. -This option is not valid with quantum DNA. -

-For backwards compatibility, ‑‑identity=<min>[..<max>] has the -same meaning. -

--filter=continuity:<min>[..<max>] -Filter alignments by how much of the input sequence aligns as matches or -mismatches, rather than gaps, -0 ≤ min ≤ max ≤ 100 percent. -Continuity is the percentage of alignment -columns that are not gaps. Alignment blocks outside the given range -are discarded. -

-For backwards compatibility, ‑‑continuity=<min>[..<max>] has the -same meaning. -

--filter=coverage:<min>[..<max>] -Filter alignments by how much of the input sequence they cover, -0 ≤ min ≤ max ≤ 100 percent. -Coverage is the percentage of the entire target -or query sequence (whichever is shorter) that is included in the alignment -block. Blocks outside the given range are discarded. -

-For backwards compatibility, ‑‑coverage=<min>[..<max>] has the -same meaning. -

--filter=nmatch:<min> -Filter alignments by how many bases match, requiring at least min -matched bases, min > 0. -Match count, or nmatch, is the number -of matched bases in the alignment. This option is not valid with -quantum DNA. -

-For backwards compatibility, ‑‑matchcount=<min> has the -same meaning. -

--filter=nmatch:<min>% -Filter alignments by how many bases match, with the threshold specified as a -percentage of the query length. -
--filter=nmismatch:0..<max> -Filter alignments by the number of mismatches, allowing no more than -max mismatched bases, -max ≥ 0. -Mismatch count, or nmismatch, is -the number of aligned bases in the alignment that are mismatches -(substitutions). This option is not valid with -quantum DNA. -
--filter=ngap:0..<max> -Filter alignments by the number of gaps, allowing no more than -max gaps, max ≥ 0. -Gap count, or ngap, is the -number of runs of gapped columns in the alignment (each run is counted as one -gap). -
--filter=cgap:0..<max> -Filter alignments by the number of gap columns, allowing no more than -max gaps, max ≥ 0. -Gap column count, or cgap, is the -number of gapped columns in the alignment (each column is counted as one gap). -
--notrivial -Do not output a trivial self-alignment block if the target and query sequences -are identical. Note that using ‑‑self -automatically enables this option. -
Defaults: -By default no back-end filtering is performed, and the trivial block is -included if the sequences happen to be identical. -
- - - -

-

Interpolation

-

- - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--inner=<score>H=<score> -Perform additional alignment between the gapped alignment blocks, using -(presumably) more sensitive alignment parameters. <score> -is used as the threshold for both the gap-free and gapped extension sub-stages; -see the discussion of interpolation for more -details. -

-This option is only valid if gapped extension is -performed. -

Defaults: -By default interpolation is not performed. -
- - - -

-

Output

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--output=<output_file> -Write the alignments to the specified file name instead of stdout. -
--format=<type> -Specifies the output format: -lav, -lav+text, -axt, -axt+, -maf, -maf+, -maf-, -sam, -softsam, -sam-, -softsam-, -cigar, -BLASTN, -differences, -rdotplot, -text, -general[:<fields>], -or -general-[:<fields>]. -

-‑‑format=none can be used when no alignment output is desired. -

--rdotplot=<file> -Create an additional output file suitable for plotting the alignment blocks -with the R statistical package. The -output file is the same as would be produced by -‑‑format=rdotplot, but this option -allows you to create the dotplot file without having to run the alignment twice. -
--readgroup=<tags> -Used in conjunction with the SAM file format, allowing -the specification of tags for SAM's @RG header line. -<tags> is a tab-delimited list of -<tag>:<value> items. See the SAM specification for -details about which tags are required. LASTZ does not validate whether the -list is a valid SAM tag list. -

-Since the list is tab-delimited, you may need to surround this option with -quotes to satisfy the command line shell. Alternately, you can use ---readgroup more than once, and the lists are concatenated. -

--markend -Just before normal completion, write a marker line -
-    # lastz end-of-file
-
-to the output file. This option can be useful with pipelines or batch servers, -where there may be a question as to whether or not LASTZ completed successfully. -Note that in some output formats this marker is not a legal line, in which case -you must remove it before any downstream processing. -
--census[=<output_file>]c=1 -Count and report how many times each target base aligns, up to 255. -Ns are included in the count (both bases that are Ns -and bases aligning to Ns), and even bases aligning to gaps are -counted. Requires one byte of memory for each target location. -

-For any of the lav formats, if <output_file> -is omitted the census is included as a special stanza in the output. -For all other formats <output_file> is mandatory. -

-

--census16[=<output_file>] -Count and report how many times each target base aligns, up to ≈65 -thousand. Requires two bytes of memory for each target location. -
--census32[=<output_file>] -Count and report how many times each target base aligns, up to ≈4 -billion. Requires four bytes of memory for each target location. -
--nocensusc=0 -Do not report a census of aligning bases. -
--outputmasking=<file> -Used in conjunction with the -‑‑masking=<count> option. -The masked target intervals, resulting from alignment with all queries, are -written to a file in -sequence masking file format. The file is suitable -for later use with the -softmask, -xmask, and -nmask sequence specifier actions. -

In contrast with -‑‑outputmasking:soft=<file>, -only those intervals created by the -‑‑masking=<count> option -are reported. -

--outputmasking+=<file> -The same as -‑‑outputmasking=<file>, -except that masked intervals are written to a file in -three field sequence masking file format, which -includes sequence names. The file is not suitable for later use as -input to LASTZ. -

-This is useful when the target file contains more than one sequence. -

--outputmasking:soft=<file> -Soft-masked target intervals (lowercase bases) are written to a file in -sequence masking file format. The file is suitable -for later use with the -softmask, -xmask, and -nmask sequence specifier actions. -

In contrast with -‑‑outputmasking=<file>, -all masked intervals in the target sequence are reported, regardless of whether -they were created by the -‑‑masking=<count> option -or were in the sequence as it was originally input. -

--outputmasking+:soft=<file> -The same as -‑‑outputmasking:soft=<file>, -except that masked intervals are written to a file in -three field sequence masking file format, which -includes sequence names. The file is not suitable for later use as -input to LASTZ. -

-This is useful when the target file contains more than one sequence. -

--tableonly -Just write out the target seed word position table and quit; don't search for -seeds or perform any subsequent stages. -
--tableonly=count -Just write out the target word count table and quit; don't search for seeds or -perform any subsequent stages. -
--writecapsule=<capsule_file> -Just write out a target capsule file and quit; don't -search for seeds or perform any subsequent stages. The capsule file contains -the target sequence, -the seed, the target seed word position table, -and other related information. -
--writesegments=<segment_file> -Write out alignments as segments, in the same format -used for input by the ‑‑segments -option. These anchor segments can then be used to anchor alignments -in a subsequent run of LASTZ. This can be useful if you want to filter HSPs in -some way before performing gapped extension, for example filtering them by -length. Since anchor segments must be gap-free, this option cannot be used in -conjunction with gapped extension. - - -

- - -

--progress[=<N>] -Report the count and name of every Nth query to stderr, as -processing begins on that query. If N is omitted, every query is reported. -
--progress+masking[=<N>] -Report the count and name of every Nth query to stderr, with -statistics relating to dynamic masking, as -processing begins on that query. If N is omitted, every query is reported. -
--show=defaults -List the option values lastz is using. This can be helpful if you are unsure -what the default value is for most common settings. -

-This gives the same information as -‑‑help=defaults, but writes -them to the output file. For some formats, this renders the output file as -non-conformant. -

Defaults: -By default alignments are written to stdout in lav -format, no census is reported, and no target table or capsule is written out. -
- - - -

-

Housekeeping

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--include=<file> -Read arguments from a text file. The arguments are parsed the same as they -would be from the command-line, with the exception that they may appear on -multiple lines in the file. ‑‑include can be used in conjunction -with other command line arguments. -

-Note that any shell-performed substitutions that would be performed on the -command line are not performed on the contents of the text file. -

--allocate:traceback=<bytes>m=<bytes> -Set the amount of memory to allocate (in RAM) for trace-back information during -the gapped extension stage. <bytes> may contain an -M or K unit suffix if desired (indicating a -multiplier of 1,048,576 or 1,024, respectively). For example, -‑‑allocate:traceback=80.0M is the same as -‑‑allocate:traceback=83886080. -

-For backwards compatibility, ‑‑traceback=<bytes> is also -accepted. -

--allocate:target=<bytes> -Predict the amount of memory (in RAM) that will be needed for target sequence -data. Normally LASTZ incrementally predicts the amount of memory needed as it -parses the file. In some instances that incremental allocation can lead to -memory overuse (depending on details of how the operating system handles memory -allocation). Predicting the memory needed prevents that. -

-The memory needed for a sequence is L+1, where -L is the length of the sequence. When -multiple is used, the total memory -needed is the sum of that needed for each sequence. -

--allocate:query=<bytes> -Predict the amount of memory (in RAM) that will be needed for query sequence -data. See -‑‑allocate:target for further -details. -

-The memory needed for a sequence is L+1, where -L is the length of the sequence. When the query file contains -more than one sequence and -multiple is not used, the -memory needed is that needed for the longest sequence. -

--action:target=<action> -Set a sequence specifier action for the target. This -is an alternative to appending the action to the target filename, and is useful -for shells that make using square brackets problematic. -

-This can be used more than once in the command line; the actions are all -applied. -

--action:query=<action> -Set a sequence specifier action for the query. This -is an alternative to appending the action to the query filename, and is useful -for shells that make using square brackets problematic. -

-This can be used more than once in the command line; the actions are all -applied. -

--word=<bits> -Set the maximum number of bits for the word hash. Use this to spend less -memory (in exchange for more time) and thereby avoid thrashing for heavy seeds. -
Defaults: -The default traceback space is 80.0M, -target and query memory is allocated as needed, -and the default word hash is 28 bits. -
- - - -

-

Shortcuts for Yasra

-

-There are several shortcut options to support the -Yasra mapping assembler. These -provide canned sets of option settings that work well for aligning an assembled -reference sequence (as the target) with a set of shotgun reads (as the query). -They are selected based on the expected level of identity between the sequences. -For example, ‑‑yasra90 should be used when we expect 90% identity. -The ‑‑yasraXXshort options are appropriate when the reads are very -short (less than 50 bp). - -

- - - - - - - - - -
Option Equivalent
--yasra98 T=2 Z=20 ‑‑match=1,6 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:98 ‑‑ambiguous=n ‑‑noytrim
--yasra95 T=2 Z=20 ‑‑match=1,5 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:95 ‑‑ambiguous=n ‑‑noytrim
--yasra90 T=2 Z=20 ‑‑match=1,5 O=6 E=1 Y=20 K=22 L=30 ‑‑filter=identity:90 ‑‑ambiguous=n ‑‑noytrim
--yasra85 T=2      ‑‑match=1,2O=4 E=1 Y=20 K=22 L=30 ‑‑filter=identity:85 ‑‑ambiguous=n ‑‑noytrim
--yasra75 T=2      ‑‑match=1,1O=3 E=1 Y=20 K=22 L=30 ‑‑filter=identity:75 ‑‑ambiguous=n ‑‑noytrim
--yasra95shortT=2      ‑‑match=1,7O=6 E=1 Y=14 K=10 L=14 ‑‑filter=identity:95 ‑‑ambiguous=n ‑‑noytrim
--yasra85shortT=2      ‑‑match=1,3O=4 E=1 Y=14 K=11 L=14 ‑‑filter=identity:85 ‑‑ambiguous=n ‑‑noytrim
- -

-Occasionally, newer releases of LASTZ change the Yasra shortcut options. This -is done as an improvement, so most users will want to use the shortcuts shown -above. However, in order to support backward compatibility for users that want -to reproduce previous results, all previous versions of the shortcuts are -included. The syntax is ‑‑<shortcut>:<version>, where -<version> is the LASTZ version number that contained the -shortcut. - -

- - - - - - - - - -
Option LASTZ version Equivalent
--yasra98:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,6 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:98
--yasra95:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,5 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:95
--yasra90:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,5 O=6 E=1 Y=20 K=22 L=30 ‑‑filter=identity:90
--yasra85:<version> 1.02.45 or earlierT=2      ‑‑match=1,2O=4 E=1 Y=20 K=22 L=30 ‑‑filter=identity:85
--yasra75:<version> 1.02.45 or earlierT=2      ‑‑match=1,1O=3 E=1 Y=20 K=22 L=30 ‑‑filter=identity:75
--yasra95short:<version>1.02.45 or earlierT=2      ‑‑match=1,7O=6 E=1 Y=14 K=10 L=14 ‑‑filter=identity:95
--yasra85short:<version>1.02.45 or earlierT=2      ‑‑match=1,3O=4 E=1 Y=14 K=11 L=14 ‑‑filter=identity:85
- - - -

-

Help

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionMeaning
--version -Report the program version and quit. -
--help -List all options. -
--help=defaults -List the option values lastz would use given the rest of the command line. -This can be helpful if you are unsure what the default value is for most common -settings. -

-This gives the same information as -‑‑show=defaults. -

--help=files -Describe the syntax for sequence specifiers. -
--help=formats -Describe the available output formats. -
--help=shortcuts -List BLASTZ-compatible shortcuts. -
--help=yasra -List Yasra-specific shortcuts. -
- - - -

-

Sequence Specifiers

- -

-A target or query sequence specifier normally just indicates a file to be -used in the alignment; however various pre-processing actions can also be -specified. These are performed as the sequences are read from the file, -and may include selecting a particular sequence and/or subrange, masking, -adjusting sequence names, etc. - -

-The format of a sequence specifier is -

-    <file_name>[[<actions>]]*
-
- -

-The <file_name> field is required; the actions list is -optional. Note that the <actions> are enclosed in literal -square brackets (in addition to the meta ones that just indicate they are -optional), and consist of a comma-separated list (with no spaces), e.g. -[action1,action2,...]. The * indicates that -several action lists can be appended; they are treated the same as if they were -in a single list. - -

-Alternatively, actions can be specified with the commands -‑‑action:target=<action> -and -‑‑action:query=<action>. -This allows actions to be set without using square brackets (square brackets -are problematic in some command shells). - -

-Note that the actions apply to every sequence in the file. For example, if you -specify a subrange of, say, [100..], you will skip the first 99 bp -in every sequence. - -

-The following actions are supported: -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ActionMeaning
<subrange> -Only a subrange of the sequence is processed. The usual form of a subrange -is [<start>]..[<end>]. Either -<start> or <end> can be omitted, in which -case the start or end of the sequence is used. Subrange indices begin with 1 -and are inclusive -(i.e., they use the origin-one, closed position -numbering system). For example, 201..300 is a 100-bp subrange -that skips the first 200 bp in the sequence. -

-For BLASTZ compatibility, the alternative syntax -<start>,<end> is also recognized. In this case -both <start> and <end> are required. - -

-A “zoom out factor” can also be included, using the syntax -<start>..<end>+<zoom>%. The specified interval -is expanded on each end by <zoom> percent. This is useful -when you know, for example, the location of a gene, and would like to include -flanking regions in the alignment. - -

-Another useful syntax for this is <start>#<length>, -which is handy for specifying an interval of known length at a given position; -it is equivalent to <start>..<start+length−1>. -Similarly, <center>^<length> specifies an interval -of known length centered at the given position. Large lengths can be -specified using M or K units if desired, e.g. -10.2M. -

-Additionally, if a subrange has <start> larger than -<end>, the reverse complement of the extracted region is -used. However, this can lead to non-obvious interactions with other features -such as strand reporting, sequence masking, and segment files, so it should -be used with care. Usually it is simpler to use the -‑‑strand options instead. -

-Note that subrange positions are always measured from the start of the -sequence provided in the file (i.e., counting along the -forward strand), even if the sequence is being reverse complemented. -

multiple -The file’s sequences are internally treated as a single sequence. This -action is required when the target (not the query) is comprised of multiple -sequences. -

-There is rarely any reason to use the multiple action for the -query file. Doing so can negatively affect memory use and run time. -

separator=<character> -The file’s sequences are internally broken in pieces wherever the -specified <character> occurs, so that alignments will not -cross that separator. The separation action is performed after any masking -action -(such as xmask or -nmask), so it is possible to use the -masking operation to mark the sequence with separators. -

-The character can be any printable ASCII character. However, characters that -are important in the input format being used (for example a “>” -in fasta) should not be used for this purpose. Moreover, many input formats -have limited capability to represent characters other than nucleotides. There -is no error checking regarding the specified <character> -— if that character does not occur at all in the input, no separation is -performed. -

-See Non-ACGT Characters, Splicing, and Separation -for further details. -

subset=<names_file> -Process only a specified subset of the sequences in the file. -<names_file> is the name of a -file containing a list of desired -sequence names; only these sequences will be processed. The names can be -piped in by specifying /dev/stdin as the file. This action is -only valid for FASTA, 2Bit, -or HSX input files. -
subsample=<k>/<n> -Process only the kth sequence of every group of n -sequences. k ranges from 1 to n. This action is -only valid for FASTA, 2Bit, -or HSX input files. -
chores=<chores_file> -Restrict alignment to a list of subintervals. This is equivalent to the -‑‑chores=<chores_file> -option. -
unmask -Convert any lowercase bases to uppercase. Lowercase bases usually indicate -instances of biological repeats, and are excluded from the seeding stage -of the alignment process. -
softmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -lowercase equivalents. Lowercase bases usually represent instances of -biological repeats, and are excluded from the seeding stage of the alignment -process but not from later stages. -Note that soft masking is performed after any unmasking. -
softmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -lowercase equivalents. Any base not in one of the specified intervals -is replaced, and thereby excluded from the seeding stage (but not later stages) -of the alignment process. -
xmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -Xs. (Note that this always masks with actual Xs, -even if the scoring file specifies a different -character as "bad".) See -Non-ACGT Characters, Splicing, and Separation for -information on how Xs affect the alignment process. -
xmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -Xs. Any base not in one of the specified intervals is -replaced. -
nmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -Ns. See -Non-ACGT Characters, Splicing, and Separation for -information on how Ns affect the alignment process. -
nmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -Ns. Any base not in one of the specified intervals is -replaced. -
nameparse=full - -Report full sequence names in the output, instead of short names. As described -in Sequence Name Mangling, LASTZ normally shortens -FASTA and 2Bit sequence names -in an attempt to include only the distinguishing core of the name. This action -is provided in case LASTZ’s choice of names is not helpful. It is only -valid for FASTA or 2Bit -sequence files. -
nameparse=darkspace -Extract the first word from the sequence header line, keeping only a -non-whitespace string. If the first word is a filename, any directory/folder -information is discarded. See -Sequence Name Mangling for more information on how -the name used for output is derived. -This action is currently only valid for FASTA or -2Bit sequence files. -
nameparse=alphanum -Extract the first word from the sequence header line, keeping only an -alphanumeric string. If the first word is a filename, any directory/folder -information is discarded; then the name is truncated at the first character -that is not a letter, digit, or underscore. See -Sequence Name Mangling for more information on how -the name used for output is derived. -This action is currently only valid for FASTA -or 2Bit sequence files. -
nameparse=tag:<marker> -Use the specified marker to extract a short name from the sequence header line. -For example, nameparse=tag:foo will look for the string -foo in the header line, and copy the name from the text following -that, up to the next non-alphanumeric character. See -Sequence Name Mangling for more information on how -the name used for output is derived. This action is only valid for -FASTA or 2Bit sequence files. -
nickname=<name> -Ignore any sequence names in the input file, instead using -<name> in the output. See -Sequence Name Mangling for more information on -how the name used for output is derived. -
namejoin -Replace any spaces in the name with underscores. This is applied after the -effect of any nameparse action. It is most useful with -nameparse=full, and when the output format is such that having -spaces in names is problematic. -
quantum -The sequence contains quantum DNA. -Note that this changes the game significantly, and many of LASTZ’s other -actions and options are not valid with quantum sequences. Operations such as -reverse complement, masking, special treatment of Ns and -Xs, seeding options that need to recognize -matches / transitions / transversions, and computation of percent -identity do not apply because of the arbitrary quantum alphabet and the ability -of its symbols to encode ambiguity. -
quantum=<code_file> -The sequence contains quantum DNA corresponding to -the specified <code_file>, which -assigns nucleotide probabilities for the quantum alphabet. These are only used -to augment the display of alignment blocks in the -Human-Readable Text output format. -
- -

-In addition to the sequence specifier syntax shown above, LASTZ supports a -more complicated syntax. This is to maintain compatibility with BLASTZ and -early versions of LASTZ. All of the functionality described here can be -performed using the newer syntax above. - -

-The complete format of a sequence specifier is -

-    [<nickname>::]<file_name>[/<select_name>][{<mask_file>}][[<actions>]][-]
-
- -

-As with the simpler syntax, the <file_name> field is -required; all other fields are optional. The <file_name> -and <actions> fields have the same meaning as in the simpler -syntax. - -

-<nickname>:: is equivalent to the <name> -field in the nickname=<name> action. - -

-/<select_name> is only valid for the -2Bit file format, and only when the file name ends with -".2bit". It specifies a single sequence from the file to use, rather than all -sequences. This is similar to the subset=<names_file> -action, except that here a single sequence name is given instead of a file of -names. Note that the name must match the mangled -sequence name extracted from the file. - -

-{<mask_file>} is identical to the -xmask=<mask_file> action. - -

-A - (minus sign) is equivalent to swapping the endpoints in the -<subrange> action; it causes the reverse complement of the -sequence to be used instead of the sequence itself. Again, this should be -used with care, as it can lead to murky interactions with other features. -In BLASTZ it was needed for searching only the minus strand, but LASTZ provides -a ‑‑strand option for that. - - - - - - -


-
-

Processing Stages in Detail

- - - -
-

Target Sequence Input

- -

-The target sequence is read at the beginning and kept in memory throughout -the run of the program. Actions such as masking, unmasking, or reverse -complement are applied when the file is read. If there are multiple sequences -in the target file, they are treated internally as one long sequence (you must -use the multiple action in the -target file’s sequence specifier to enable this). - -

-In contrast, queries are processed individually and sequentially. Each query -sequence is read just before its seeding stage. The seeding through output -stages are performed, comparing the query to the target. Then by default, the -same stages are repeated to compare the reverse complement of the query to the -target, before moving on to the next sequence in the query file. - - - -

-

Scoring Inference

- -

-Scoring inference is not normally performed. As described in -Inferring Score Sets, LASTZ can iteratively -perform the complete alignment process on the target and query, to derive a -suitable scoring set. This is only available for special builds of LASTZ, and -will usually be too time-consuming to perform for all sequences being aligned. -The typical application is to use it once on some sample sequences from the -species of interest, save the scoring file, then use that scoring file for -subsequent alignments. - - - -

-

Indexing Target Seed Words

- -

-This pre-processing stage parses the target sequence(s) into overlapping -seed words of some constant length (you can think of these as -12-mers; the actual length is determined by the seed pattern). Each word is -converted to a number, called the packed seed word, according to the -specified seed pattern. These (word, position) -pairs are collected into the target seed word position table. -Conceptually, this table is a mapping from a packed seed word to a list of the -target sequence positions where that seed word occurs. - -

-This table is one of the major space requirements of the program. Both the -memory and time required for seeding can be decreased by using sparse spacing. -The ‑‑step option sets a -step size: instead of examining every position, seed words are -stored only for multiples of the step size. Large step sizes (say, -‑‑step=100) incur a loss of sensitivity, at least at the seeding -stage. However, to discover any gapped alignment block we only need to -discover one seed (of many) in that alignment, so the actual sensitivity loss -is small in most cases. Section 6.2 of [Harris 2007] -discusses some experimental results on the effect of step size on the end -result. - -

-The presence of biological repeats in the target and query can also be -addressed during the building of the position table. A large number of repeats -can adversely affect the speed of the program, by increasing the number of -irrelevant alignments the program considers in the early stages. LASTZ has -three techniques for dealing with repeats. -

-

    -
  1. Bases in the target and/or query sequences can be marked as repeats in -advance, by using lower case. Target and query words containing lower case -bases are left out of the seed word position table and skipped during seeding, -respectively, so they do not participate in the seeding stage. -
  2. If repeat locations are not known, the option -‑‑maxwordcount can be used to remove -frequently occurring target seed words from the position table before query -processing begins. -
  3. Dynamic masking (‑‑masking) can -be used to mask target positions that have occurred in too many alignments; -however this only affects subsequent query sequences. -
- - - -
-

Seeding

- -

-Seeds are short near-matches between target and query sequences. -They identify likely regions of homology that warrant further investigation, -and serve as starting points for bootstrapping the alignment process. "Short" -typically means less than 20 bp. Early alignment programs used exact matches -(e.g. of length 12) as seeds, but more recent programs have used spaced seeds -(these are described in more detail in the Seed -Patterns section). For the purposes of this section, a seed can be -thought of as a 12-mer exact match. - -

-To locate seeds, the query sequence is parsed into seed words the same -way the target is (except that -‑‑step does not apply to the query; -we look at every seed word). -Each packed seed word is used as an index into the target seed word position -table to find the target positions that have a seed match for this -query position. Query seed words containing lower case bases are skipped, so -that repeats will not participate in the seeding stage. - -

-

Quantum Seeding:

-For alignments with quantum DNA it is not possible to -do a direct lookup into the target seed word position table. The position -table is for DNA words (consisting of A, C, -G, and T), whereas the query consists of symbols from -an arbitrary alphabet. The quantum sequence is parsed into seed words as -before, but instead of a direct lookup, each word, called a q-word, -is first converted to a quantum seeding ball of those DNA words that -are most similar to it. Similarity is determined by the scoring matrix; all -words with a combined substitution score above the quantum seeding threshold -(set by the ‑‑ball option) are -considered to be in the ball. Then each word in the ball is looked up in the -target seed word position table as usual, with all such hits considered to be -seed matches for the q-word. -

-The quantum seeding threshold can also be set as a percentage of the maximum -word score possible. If an exact match seed is used, the maximum word score is -the highest value in the substitution matrix multiplied by the seed length. If -a spaced seed is used, the multiplier is the number of 1 positions -in the pattern. -

-Note that the seeding options that provide -special treatment for transitions (Ts in the seed pattern, -half-weight seeds, allowing one or two match positions to be transitions, etc.) -are not supported for quantum alignments. These options would make -the quantum seeding procedure more complex, and are not really necessary -because the quantum mechanism itself provides an alternative way to increase -the alignment sensitivity. Also note that q-words containing lower case bases -are not discarded, since the quantum alphabet is arbitrary and many -ASCII bytes do not even have upper/lowercase versions. - - - -

-

Gap-free Extension

- -

- - -

-In this stage, each seed is extended without allowing gaps to determine -whether it is part of a high-scoring segment pair (HSP). The seed is extended -along its DP matrix diagonal independently in both -directions according to an extension rule, currently either -exact match, M-mismatch, or x-drop. - -

-Exact match extension (‑‑exact) simply -extends the seed until a mismatch is found. If the resulting length is enough, -the extended seed is kept as an HSP for further processing. Exact match -extension is most useful when the target and query are expected to be very -similar, e.g. when aligning short reads to a similar reference genome. - -

-M-mismatch extension -(‑‑<M>mismatch) extends the -seed to find the longest interval that includes the entire seed and contains -no more than M mismatches. If the resulting length is enough, -the extended seed is kept as an HSP for further processing. M-mismatch -extension is most useful when the approximate divergence between the target -and query is known, and HSPs of a known length are desired. -It provides a way to specify both length and identity thresholds together, -with more flexibility than ‑‑exact. - -

-In x-drop extension (‑‑xdrop), as we -extend in each direction we track the cumulative score for the extended match -according to the substitution scoring matrix. The extension is stopped when -the score drops off by more than the given x-drop threshold; that is, when the -difference between the peak score seen so far and the current score is more -than <dropoff>. -(Another way to think of it is that the segment ends when a section scoring -worse than −<dropoff> is encountered.) -The extension is then trimmed back to the peak point. If the combined score -of the seed plus both extensions meets the threshold set by the -‑‑hspthresh option, it qualifies -as an HSP and is kept for further processing. Matches that do not meet the -score threshold are discarded. -The ‑‑entropy options control -whether or not the scores are adjusted for nucleotide entropy when they are -compared to the threshold. - -

-

Adaptive Score Threshold:

-Often it is not clear in advance what value to use for the x-drop method’s -HSP score threshold — set it too high and hardly anything will align, but -too low and the program will be swamped and not finish. LASTZ’s adaptive -scoring options -(‑‑hspthresh=top<basecount> -and -‑‑hspthresh=top<percentage>%) -allow you to set the threshold indirectly to align the desired amount of the -target (as an approximate number of bases or as a percentage, respectively). -This way you can set it for, say, 10% (which will run quickly regardless of the -data), then examine the scores in those results and make an informed choice for -your real threshold. - -

Diagonal Hashing:

-LASTZ includes a time and space optimization that deals with multiple seeds in -the same HSP. The number of seeds in an HSP is generally proportional to both -the length of the HSP and the similarity of the sequences being compared. For -long HSPs or very similar sequences, performing extension over and over for -many seeds in the same HSP would adversely affect the run time. To prevent -this, LASTZ maintains a diagonal extent table that tracks the latest -seed extension on each diagonal (only the latest is needed because of the way -the seeds are sorted). As new seeds "arrive", if they overlap an earlier -extension, they are simply ignored. While this saves time, a direct -implementation could require a lot of space. For two human chromosomes of size -250M bp, the DP matrix has 500 million diagonals, and -storing one position for each diagonal would require 2G bytes. To save memory, -LASTZ hashes diagonals to 16-bit values and tracks extensions only by the hash -value. While this saves space, it results in a minuscule loss of sensitivity -— LASTZ may miss some seeds due to hash collisions. Using -‑‑recoverseeds will prevent losing -these seeds, but will slow the program significantly. Moreover, since most -true alignments contain many HSPs, with many seeds in each HSP, the vast -majority of lost seeds have no effect on the final results. - - - -
-

HSP Chaining

- - - -

-The chaining stage aims to find a series of HSPs that forms a high-scoring path -through the DP matrix, aligning as much as possible while -avoiding backtracking in either sequence. Conceptually it does this by -examining all combinations of HSPs and scoring the chains according to the -relative positions of the HSPs (e.g. the distances between them along the -diagonal and anti-diagonal) as well as their individual scores. All HSPs not -in the highest-scoring chain are discarded. - -

-Ideally this process selects the "real" alignments, filtering out noise (such -as extra alignments due to repeats), and producing a set of HSPs where each -base is aligned at most once; however this is not guaranteed. LASTZ’s -implementation is primarily intended for the case where elements are -known to appear in the same relative order and orientation in the query as in -the target. (However, note that because the forward and reverse strands are -processed in separate pipelines, it will not necessarily cause inversions to be -discarded.) If LASTZ’s implementation of chaining is not suitable, it is -possible to substitute another chaining program by first running LASTZ with the -‑‑nogapped and -‑‑writesegments -options to get the HSPs, running a separate chaining program to filter them, -and then running the final stages of LASTZ on that output via the -‑‑segments option. - -

-Figure 5(a) shows an alignment without chaining, while 5(b) shows the same -alignment with chaining. - -

-

- - -
-Figure 5(a) -

-without chaining -

-
-
-lastz target query --nochain
-
-
-

-Figure 5(b) -

-with chaining -

-
-
-lastz target query --chain
-
-
-

- - - -

-

Gapped Extension

- -

-Before the HSPs are extended further by allowing gaps, each HSP is first -reduced to a single anchor point; -this allows for the possibility that the optimal alignment may include gaps -within the region occupied by the HSP. The gap-free HSP is only an indication -of likely homology in that vicinity; other paths through the same region that -allow gaps may have a higher score, so we don't want to just extend from the -ends of the HSP. Instead we run the gapped algorithm from a single point that -we think is most likely to lie on the optimal path, namely the middle of the -highest-scoring 31-bp interval in the HSP. A more general (and expensive) -approach would be to examine all paths through the square region defined by the -HSP, instead of starting from a single anchor point. - -

-Figure 6(a) illustrates the relationship of seeds, HSPs, and anchors. Heavy -lines are seeds, which were extended without gaps (see Overview) to create HSPs (thin lines). Blue dots are anchors. Seeds with -no HSP shown (gray lines) had low-scoring extensions and were discarded at the -gap-free extension stage. - -

- - -
-Figure 6(a) -

-seeds, HSPs, and anchors -

-Figure 6(b) -

-anchors and gapped extensions -

- -

-The anchors are then processed in the order of their HSP’s score (highest -first). Gapped extension is performed -independently in both directions from the anchor point, and the two resulting -alignments are joined at the anchor. If the total score meets the threshold -specified by the ‑‑gappedthresh -option, the joined alignment is kept and passed to the next stage; otherwise it -is discarded. If the extension from one anchor happens to go through one or -more other anchors, the redundant anchors are dropped from the list. - -

-Figure 6(b) shows the relationship of anchors and their gapped extensions. -The blue dots are the anchors from 6(a), which are extended in both directions -to form gapped alignments (squiggly lines; the gaps are too small to be visible -at this scale). One anchor had low-scoring extensions that did not meet the -threshold. Another had an extension that ran directly through a nearby anchor; -that anchor did not need to be processed separately. - -

-The gapped extensions are computed using a typical -dynamic programming recurrence for affine gap alignment -(e.g. [Myers 1989] or -[Gusfield 1997]), beginning at the anchor and -terminating at the point with the highest cumulative score. The portion of -the DP matrix examined is reduced by disallowing low-scoring regions (see -[Zhang 1998]): wherever the alignment score drops -below the peak score seen so far by more than the threshold specified in the -‑‑ydrop option, the DP matrix is -truncated and no further cells are computed along that row or column. -By default the extension is then trimmed back to the location of the peak -score; thus the extension normally ends when all remaining sub-alignment -possibilities (paths in the DP matrix) begin with sections that score worse -than −<dropoff>. However for alignments -where the extension reaches the end of the sequence, you can suppress this -trimming by specifying the ‑‑noytrim -option, which is recommended when aligning short reads. - -

-Figure 7 shows the effect of the y-drop threshold in more detail. Extension -is performed in two directions from the anchor (in this example, to the upper -right and lower left, because both sequences are on the positive strand). -The gray region is the portion of the DP matrix explored by the extension -algorithm; its boundary is formed by the points where the score dropped from -the maximum by more than the y-drop threshold. - -

- - -
-Figure 7 -

-effect of y-drop -

- - - -

-

Back-end Filtering

- -

-Whatever alignment blocks have made it through the above gauntlet are then -subjected to -identity, continuity, coverage and match count filtering (as specified by the -‑‑filter=identity, -‑‑filter=continuity, -‑‑filter=coverage, -‑‑filter=nmatch, -‑‑filter=nmismatch, -‑‑filter=ngap, and -‑‑filter=cgap options, -respectively). Blocks that do not meet the specified range for each feature are -discarded. - -

-

-Identity is the fraction of aligned bases (excluding columns -containing gaps or non-ACGT characters) that are -matches, expressed as a percentage. The numerator is the number of matches in -the alignment block, while the denominator is the number of matches plus the -number of mismatches. -Characters that differ only in upper vs. lower case are -counted as matches. Columns containing gaps or non-ACGT characters play no -part in this computation, and it is independent of the settings for -‑‑ambiguous=n and -bad_score. Identity cannot -be determined for alignments with quantum DNA, because -of the potential ambiguity of the symbols. - -

-

-Continuity is the fraction of alignment columns that do not contain -gaps, expressed as a percentage. The numerator is the number of matches plus -mismatches in the alignment block, while the denominator is the number of -columns. Unlike the computation of identity, here "matches plus mismatches" -includes all non-gap columns regardless of whether they contain non-ACGT -characters. - -

-

-Coverage is the fraction of bases in the entire input sequence -(target or query, whichever is shorter) that are included in the alignment -block, expressed as a percentage. Such bases are aligned in the block to -either bases or gaps in the other sequence. Note that if there are multiple -sequences in the target and/or query, only the current one is considered; -however if an input sequence is spliced with runs of Ns or -Xs, then the combination of all its subsequences (including the -splice characters between them) is considered as one input sequence, because -LASTZ does not explicitly recognize the splicing. -Further, if a separator character is used, -again the combination of all subsequences is considered as one input sequence -(including the separator characters). Also note that each block’s -coverage is computed independently of other blocks, and each must meet any -specified filter range by itself; blocks cannot be combined to meet coverage -requirements. - -

-

-Match Count, or nmatch, is the number of matched bases in -the alignment. Characters that differ only in upper vs. lower case are counted -as matches, columns containing gaps or non-ACGT characters are not. Match -count cannot be determined for alignments with quantum -DNA, because of the potential ambiguity of the symbols. - -

-

-Mismatch Count, or nmismatch, is the number of aligned -bases in the alignment that are not matches. This includes substitutions as -well as non-ACGT characters (even if they are identical), but not gaps. -Mismatch count cannot be determined for alignments with -quantum DNA, because of the potential ambiguity of the -symbols. - -

-

-Gap Count, or ngap, is the number of gaps in the block, -counting each run of gapped columns as a single gap. - -

-

-Gap Column Count, or cgap, is the number of gaps in the -block, counting each gapped column as a separate gap. - - - -

-

Interpolation

- -

-Once the above stages have been performed, it is not uncommon to have regions -left over in which no alignment has been found. In the interpolation stage -(activated by the ‑‑inner option) we -repeat the seeding through gapped extension stages in these leftover regions, -at a presumably higher sensitivity. Using such high sensitivity from the -outset would be computationally prohibitive (due to the excessive number of -false, low-scoring matches), but is feasible on the smaller, leftover regions. - -

-Another complete alignment round (seeding, gap-free extension, chaining, and -gapped extension, even if some of these were skipped in the main alignment; -but not back-end filtering) is performed in the small areas between the -alignment blocks found in the preceding main alignment stage. Only regions -within 20K bp from the endpoints of the passed-in alignment blocks are searched. -Seeding for this alignment requires a 7-bp exact match with no transitions, and -uses the specified scoring threshold for both its gap-free and gapped extension -sub-stages. (This threshold should generally be set lower than the -corresponding ones in the main alignment, in order to increase the sensitivity -of the interpolation.) All other parameters are the same as those used for the -main alignment stages. - -

-Figure 8 shows the operation in more detail. The alignment blocks resulting -from gapped extension are shown in 8(a) as squiggly lines. After interpolation, -in 8(b), additional alignment blocks have been discovered in the red areas. -Note that there are still some holes remaining, where these sequences just -don't align well. - -

- - -
-Figure 8(a) -

-before interpolation -

-
-
-lastz target query
-
-
-

-Figure 8(b) -

-after interpolation -

-
-
-lastz target query --inner=1000
-
-
-

- - - -

-

Alignment Output

- -

-The alignment blocks found by the preceding pipeline of stages are written to -stdout (or to a file specified with the -‑‑output option) in the requested -format. -These may be seeds, gap-free HSPs, or gapped local alignments, depending on -which stages were performed. There is no particular order to the alignment -blocks for an individual query sequence (e.g. they are not sorted by -score or position). However, since the query sequences are processed serially, -the blocks for each one will appear together in the output. - - - - - - -


-
-

File Formats

- -

-LASTZ typically receives two sequence files and possibly a scoring file as -inputs, and produces an alignment file as output. -

-DNA sequences can be provided in FASTA, -FASTQ, -Nib, or 2Bit format, or -indirectly via an HSX index. These -sequences contain a series of A, C, G, -T, and N characters in upper or lower case. -Lower case indicates repeat-masked bases, while Ns represent -unknown bases if the ‑‑ambiguous=n -option is specified. (By default, a run of Ns or Xs -is used to separate sequences that have been catenated together for processing, -but this is now deprecated; see -Non-ACGT Characters, Splicing, and Separation -for a discussion of the use of Ns and Xs.) As an -alternative to DNA sequence, quantum DNA using an -abstract alphabet can be used as the query -(but not as the target). -

-The FASTA, FASTQ, 2Bit and HSX formats support more than one sequence within -the same file. -Files containing multiple sequences are normally only used as the query; -however invoking the multiple -action in the file’s sequence specifier allows -them to be used for the target as well. Also, the -subset action allows one or more -sequences to be selected from such a file. -

-The FASTQ format carries base-calling quality values as well as DNA. - - - -

-

FASTA (sequence input)

- -

-FASTA format stores DNA sequences as plain text. The first line begins with -a > followed by the name of the sequence, and all subsequent -lines contain nucleotide characters. The lines can be of any length. -If the file contains multiple sequences, each should start with its own -> header line. - -NCBI FASTA specification -

-Note that although the official FASTA specification allows the character -X only in amino acid sequences, LASTZ accepts it in DNA sequences -as a splicing character. However, LASTZ does not currently support -IUPAC-IUB ambiguity codes other than N (such as R, -W, etc.), -beyond the treatment afforded by ‑‑ambiguous=iupac. -

-A special case, non-conforming to the official standard, is made to allow a -special user-specified separator character. -Usually this will be N or X, but any other printable -ASCII character that suits the user’s needs is acceptable. -

-It has become common for suppliers of FASTA files to pack a plethora of -additional information into a sequence’s header line. This extra -information -can create difficulties for many sequence processing tools. For example, -headers often contain spaces but file formats such as MAF -do not allow spaces in sequence names. To compensate for this, LASTZ provides -several options for extracting a concise name from sequence headers; see -Sequence Name Mangling for details. - - - -

-

FASTQ (sequence input)

- -

-FASTQ format stores DNA and base-calling quality sequences as plain text, and -is primarily used to describe the results of short-read sequencing runs. As -explained in [Cock 2009], this format has evolved -over time in the Bioinformatics community. LASTZ only supports a subset of this -format, prohibiting line-wrapping within DNA or quality sequences. -

-Each sequence consists of four lines. The first line begins with a -@ followed by the name of the sequence. The second line contains -nucleotide characters. The third line begins with a +, optionally -followed by the name of the sequence (which, if present, must match that of the -first line). The fourth line contains quality characters. -

-There are several conflicting standards for encoding quality values in FASTQ -files, but (as of this writing) the differences are not relevant to LASTZ. -LASTZ currently does not make any computational use of the qualities, and -simply copies them into the output file when appropriate. -

-LASTZ treats IUPAC-IUB ambiguity codes in FASTQ files the same as those in -FASTA files. - - - -

-

Nib (sequence input)

- -

-Nib format stores a single unnamed DNA sequence, packed as two bases per byte. - -UCSC Nib specification - - - -

-

2Bit (sequence input)

- -

-2Bit format stores multiple DNA sequences, encoded as four bases per byte with -some additional information describing runs of masked bases or Ns. - -UCSC 2Bit specification -

-Sequence names in 2Bit files have all the same problems as in FASTA files, -so Sequence Name Mangling applies to these files -as well. - - - -

-

Quantum DNA (sequence input)

- -

-A quantum DNA file describes a single sequence of "quantum" DNA, which uses -an abstract, user-defined alphabet. Each position in the sequence is a byte -with a value in the range 0x01..0xFF, which can -represent an ambiguity code, amino acid, or any other meaning you desire. -LASTZ does not try to interpret these in any way; it just aligns them as -abstract symbols corresponding to columns in the scoring matrix. Note that -the value 0x00 is prohibited. -

-The file itself is stored in a binary format described by the table below. -It can be written on either a big-endian or little-endian machine; LASTZ -determines the byte order of multi-byte fields by examining the magic number -at the start of the file. -Be sure to use the quantum action -in the file’s sequence specifier to notify LASTZ -that it contains quantum DNA. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataMeaning
0x00 -C4 B4 71 97 -
—or— -
97 71 B4 C4 -
-Magic number indicating big-endian byte order. -
-
Magic number indicating little-endian byte order. -
0x0400 00 02 00File conforms to version 2.0 of the Quantum DNA file format.
0x0800 00 00 14Header length in bytes, including this field through the all-zero -field.
0x0Cxx xx xx xxSOFF: -offset (from file start) to sequence data.
0x10xx xx xx xxNOFF: -offset (from file start) to name; 0 indicates no name.
0x14xx xx xx xxSLEN: -length of sequence data.
0x1800 00 00 00Must be zero.
NOFFName: -a zero-terminated ASCII string.
SOFF -Sequence data: -a series of SLEN bytes, each of which is one quantum symbol -in the sequence. -
- - - -

-

Quantum Code File

- -

-This file is used with the quantum -action in a sequence specifier. It defines a mapping -from quantum DNA symbols to vectors of values for the -four nucleotides A, C, G, and -T. Usually these indicate the nucleotide probability distribution -for each symbol in the quantum alphabet. However, LASTZ doesn't interpret the -values, and only uses them to augment the display of alignment blocks in the -Human-Readable Text output format. - -

-Each line in the file gives the mapping for one symbol. Lines beginning -with a # are considered to be comments and are ignored, as are -blank lines. Data lines have five columns, separated by whitespace. The first -field contains the symbol, as either a single character or two hexadecimal -digits, while the remaining four fields contain values for -A, C, G, and T, -respectively. Each value can be either a single floating-point number or a -fraction (two floating-point numbers with a / between them, -without spaces). Any symbols in the quantum alphabet that aren't listed in -this file receive zeroes for all four values. - -

-Here is an example. -

-    # sym p(A|sym) p(C|sym) p(G|sym) p(T|sym)
-      01  0.125041 0.080147 0.100723 0.694088
-      02  0.111162 0.053299 0.025790 0.809749
-      03  0.065313 0.007030 0.004978 0.922679
-       ... more rows here ...
-      FF  0.209476 0.014365 0.755682 0.020477
-
- - - -
-

Sequence Name File

- -

-This file is used with the subset -action in a sequence specifier to select particular -sequences for processing. It consists of one sequence name per line. Lines -beginning with a # are considered to be comments and are ignored, -as are blank lines. Only the first whitespace-delimited word in any line is -read as the name; the rest of the line is ignored. -

-Note that when used in conjunction with a -FASTA or -2Bit file, the names must appear in the same order as -they appear in the corresponding sequence file, and must match the -mangled name extracted from that file. When used -with an -HSX file, the names can be in any order but must -match names indexed in the HSX file. - - - -

-

Sequence Masking File

- -

- -This file is used with the xmask and -nmask actions in a -sequence specifier. -It can also be created by using the -‑‑outputmasking=<file> -or -‑‑outputmasking:soft=<file> -options. -It consists of one interval per -line, without sequence names. Lines beginning with a # are -considered to be comments and are ignored, as are blank lines. Only the first -two whitespace-delimited words in any line are interpreted as the interval; the -rest of the line is ignored. -

-Each interval describes a region to be masked, and consists of -

-    <start> <end>
-
-Locations are one-based and inclusive on both ends (i.e., they use the -origin-one, closed position numbering system). -Note that the masking intervals are -counted along the forward strand, even if we are only -aligning to the reverse complement of the query specifier (i.e. for -‑‑strand=minus). - -

-Here is an example. If the target sequence is hg18.chr1, this would mask the -5' UTRs from several genes. Note that the third column is neither required -nor interpreted by LASTZ, and acts as a comment. -

-     884484  884542  NM_015658
-     885830  885936  NM_198317
-     891740  891774  NM_032129
-     925217  925333  NM_021170
-     938742  938816  NM_005101
-     945366  945415  NM_198576
-    1016787 1016808  NM_001114103
-    1017234 1017346  NM_001114103
-    1041303 1041486  NM_001114103
-
- - - -
-

Sequence Masking File, Three Fields

- -This file format is output only. LASTZ does not recognize input files in this -format. -

-This file is created by using the -‑‑outputmasking+=<file> -or -‑‑outputmasking+:soft=<file> -options. -It consists of one interval per line, with sequence names. -

-Each interval describes a region that has been masked, and consists of -

-    <name> <start> <end>
-
-Locations are one-based and inclusive on both ends (i.e., they use the -origin-one, closed position numbering system). -Note that the masking intervals are -counted along the forward strand, even if we are only -aligning to the reverse complement of the query specifier (i.e. for -‑‑strand=minus). - - - -
-

Scoring File

- - -

-This file is used with the ‑‑scores -option to specify a set of (mostly) scoring-related parameters en masse. -The score set consists of a substitution matrix and other settings. The other -settings come first and are individually explained in the -table below. All settings are optional, -and most of them have exact correspondence to command-line options and the same -defaults (unless otherwise specified in the table). Command-line settings -always override settings in this file. Any line may end with a comment -(# is the comment character). - -

-

-In the matrix, rows correspond to characters in the target sequence, while -columns correspond to characters in the query. Matrix labels can be specified -either as single ASCII characters or as two-digit hexadecimal values in the -range 01..FF (do not add a leading 0x). -Note that the value 00 is not allowed. -The rows and columns of the matrix need not have the same set of labels, so -for example, a matrix might describe scoring between the 4-letter DNA alphabet -and the 15-letter ambiguity alphabet. Any labels other than A, -C, G, and T (or their hex equivalents) -are treated as quantum DNA. -

-Score values can be floating-point if the lastz_D version of the -executable is used instead of lastz. - -

-Here is an example: -

-    # This matches the default scoring set for BLASTZ
-    
-    bad_score          = X:-1000  # used for sub['X'][*] and sub[*]['X']
-    fill_score         = -100     # used when sub[*][*] is not defined
-    gap_open_penalty   =  400
-    gap_extend_penalty =   30
-
-         A     C     G     T
-    A   91  -114   -31  -123
-    C -114   100  -125   -31
-    G  -31  -125   100  -114
-    T -123   -31  -114    91
-
- - -

-BLASTZ scoring files are also accepted. These only contain a substitution -matrix, and row labels must be absent (they are assumed to be the same as the -column labels). No other settings are allowed. -

-       A     C     G     T
-      91  -114   -31  -123
-    -114   100  -125   -31
-     -31  -125   100  -114
-    -123   -31  -114    91
-
- -
-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeywordSettingMeaning
bad_score -<score>
-<row>:<col>:<score> -
-This score fills a single row and column of the substitution matrix, so that -any occurrences of the corresponding characters are severely penalized. By -default the "bad" character for both the target and query is X -for DNA sequences or the null byte (00) for -quantum DNA sequences, and the associated score is -−1000. -

-This option allows you to change these characters and/or the score they receive. -The <row> and <col> fields are character codes (as explained -above); if they are absent X (and/or -00) is assumed. Note that these characters are case sensitive. The -bad_score row and column cannot be removed entirely, but you can achieve the -same effect by setting them to invalid characters that will never occur in your -sequences. There is no corresponding command-line option. -

fill_score<score> -This is used as a default for all cells of the scoring matrix that are not -otherwise set (either by the user or by LASTZ’s defaults). This is the -score used for Ns (unless -‑‑ambiguous=n is specified on the -command line). -

-The default value is −100. There is no corresponding command-line option. -

gap_open_penalty<penalty> -This is identical to the <open> field of the -‑‑gap command line option. -
gap_extend_penalty<penalty> -This is identical to the <extend> field of the -‑‑gap command line option. -
step<offset> -This is identical to the -‑‑step command line option. -
seed<strategy> -This corresponds to the ‑‑seed and -‑‑transition command line options. -<strategy> must be one of the following, with no spaces: -
12of19,transition -
12of19,notransition -
14of22,transition -
14of22,notransition -
ball<score>
- <percentage>%
-This is identical to the -‑‑ball command line option. -
x_drop<dropoff> -This is identical to the -‑‑xdrop command line option. -
hsp_threshold<score> -This is identical to the -‑‑hspthresh command line option, -except that it does not currently support the -‑‑hspthresh=top<basecount> or -‑‑hspthresh=top<percentage>% variants. -
y_drop<dropoff> -This is identical to the -‑‑ydrop command line option. -
gapped_threshold<score> -This is identical to the -‑‑gappedthresh command line option. -
- - - -

-

Inference Control File

- -

-When LASTZ is asked to infer substitution scores and/or gap penalties from the -input sequences (e.g. via the ‑‑infer -option), this file is used to set parameters that control the inference -process. - -

-Here is an example (note that currently the parsing of this file is less -flexible than some of the others, and * is the only arithmetic -operator supported): - -

-

-    # base the inference on alignments in the middle half by identity
-    min_identity       = 25.0%    # 25th percentile
-    max_identity       = 75.0%    # 75th percentile
-
-    # scale scores so max substitution score will be 100, and only use
-    # alignments scoring at least as well as 20 ideal matches
-    inference_scale    = 100      # max substitution score
-    hsp_threshold      = 20*inference_scale
-    gapped_threshold   = hsp_threshold
-
-    # allow substitution score inference to iterate at most 20 times;
-    # don't perform gap penalty inference -- instead hardwire gap penalties
-    # relative to max substitution
-    max_sub_iterations = 20
-    max_gap_iterations = 0
-    gap_open_penalty   = 4*inference_scale
-    gap_extend_penalty = 0.3*inference_scale
-
-    # use all seedword positions (don't sample)
-    step               = 1
-
-    # adjust for entropy when qualifying HSPs
-    entropy            = on
-
- -

-min_identity and max_identity specify the range of -sequence identity upon which inference is based; -only alignment blocks within this range contribute to the inference. If the -value ends with a percent sign, it represents a percentile of the identity -distribution over all the blocks; otherwise it is a fixed percent identity -value. For example, min_identity=70 and -max_identity=90 indicates that blocks with identity ranging from -70 to 90 percent will be used, while min_identity=25% and -max_identity=75% indicates that half of the blocks will be used -(from the middle of the distribution). -The defaults are min_identity=0 and max_identity=100 -(i.e., no blocks are excluded from inference due to percent identity). - -

-inference_scale specifies a value for the largest substitution -score (i.e., the score for the best match). All other scores are scaled -proportionally. If this is set to none, the scores will be -log-odds using base 2 logarithms. -The default is inference_scale=100. - -

-hsp_threshold and gapped_threshold correspond to -the command line ‑‑hspthresh and -‑‑gappedthresh options. -The defaults are hsp_threshold=3000 and -gapped_threshold=hsp_threshold. - -

-max_sub_iterations and max_gap_iterations specify -limits on the number of inference iterations that will be performed. For -example, if you only want a substitution scoring matrix, you can set -max_gap_iterations=0. -The defaults are max_sub_iterations=30 and -max_gap_iterations=0. - -

-gap_open_penalty and gap_extend_penalty correspond to -the command line -‑‑gap=[<open>,]<extend> -option. These are used for the first iteration of gap-scoring inference. -The defaults are gap_open_penalty=3.25*worst_substitution and -gap_extend_penalty=0.24375*worst_substitution. - -

-step corresponds to the command line -‑‑step option. A large step, e.g. -step=100, could potentially speed up the inference process. -Ideally, this would base the inference on a sample of only one percent of the -whole. However, the sample actually ends up larger than that and is biased -toward HSPs that are either longer or have a lower substitution rate. This -happens because sampling occurs at the seed level, and such HSPs generally -have more seeds. Future versions of LASTZ may include a means to compensate -for this bias. -The default is step=1. - -

-entropy corresponds to the command line -‑‑entropy option. Legal values are -on or off. If on, sequence entropy is incorporated -when filtering HSPs. The default is entropy=on. - -

-The value of worst_substitution cannot be set directly. -Instead, it is computed from the initial scoring matrix. It is the minimum -score in the scoring matrix for any of the symbols A, C, G or T (equivalently, -the most negative score or the maximum penalty). - -

-Note that these parameters apply to the inference process only. If the -corresponding command line options are also set, those will apply for the -final, "real" alignment stages (and will also override the inferred values if -there is a conflict), but will not affect the inference itself. -Inference cannot be used in conjunction with a scores file. - - - -

-

HSX (Hashed Sequence Index)

- -

-An HSX file is an index of sequences in other files, allowing fast random -access to those sequences. The current implementation of LASTZ only supports -indexing FASTA files. Future versions may include -Nib and 2Bit sequences. -The following is a brief overview of the -file format. For more detailed information, see the - -HSX specification - -

-An HSX file can be created using the build_fasta_hsx.py utility -(included in the tools directory of the LASTZ distribution), using -a command like this: -

-    build_fasta_hsx sequences.fa [more_sequences.fa ...] > index.hsx
-
- -

-It is important that the HSX file has the extension .hsx and -resides in the same directory as the files being indexed. Further, the files -being indexed must have the extension .fa or .fasta. -These rules allow LASTZ to determine the sequence file type when it reads the -HSX file, and to locate the files containing the sequences. - -

-The index file includes names to be used for the sequences, which do not have -to match the original names or headers in the sequence files. This feature -obviates the need for LASTZ to perform sequence name -mangling, so most of those actions are not supported for HSX files. -Instead, it is the responsibility of the program that creates the index to -select suitable names. - - - -

-

Target Capsule File

- -

-A target capsule file is essentially a memory dump of several internal data -structures related to the target sequence and the target seed word position -table. At the present time the authors do not wish to create an official -specification for this format, but please see -Using Target Capsule Files for information on -how to create and utilize them. - - - -

-

Alignment Chores File

- -

- -

-A chores file describes a list of sequence interval pairs, indicating that the -alignment process is to be restricted to those intervals. - -

-The file contains two intervals per line, one from the target and one from the -query, with sequence names. Optionally, the query strand can be specified, as -well as an identifying tag. Lines beginning with a # are -considered to be comments and are ignored, as are blank lines. # -can also be used to put comments at the end of lines, but must be preceded by -whitespace. - -

-Each line looks like -

-    <name1> <start1> <end1> <name2> [<start2> <end2>] [<strand2>] [id=<tag>] [#<comment>]
-
-where <name1>, etc. correspond to the target sequence and <name2>, -etc. correspond to the query. Fields are delimited by whitespace. - -

-When the target name is irrelevant (i.e. when there is only one name in the -target sequence file), * can replace <name1>. Similarly, if -we don't have a target (or query) subrange, * * can be used in -place of start and end. Note that the query subrange and strand are optional, -as is the tag. When the strand is not specified, both strands are searched. - -

-Locations are one-based and inclusive on both ends, i.e. -origin-one, closed (thus the interval "154 228" has -length 75 and is preceded by 153 bases in its sequence). All target intervals -are on the positive strand. All query intervals are -counted along the forward strand, regardless of which -strand is specified. - -

-Target sequence names may appear in any order. Sequence names for the query -must appear in the same order as they do in the query file. Because alignment -output ordering is on a chore-by-chore basis, it is good practice to include -all positive strand intervals for a query before any negative strand intervals -for that query. Some downstream tools may depend on this ordering. - -

-The tag can be any short string the user wants to associate with the chore -(excluding whitespace). As of this writing, the only use of the tag field is -that it can be copied to the output file by use of the -chore field for -‑‑format=general. - -

-Here is an example. -

-    chr9  116517410 116518409  READ_00070 *   *   + id=DFZ
-    chr3  157707345 157708344  READ_00070 *   *   + id=EDZ
-    chr9  112944437 112945436  READ_00078 101 200 + id=FAC
-    chr1  3377578   3378577    READ_00078 *   *   + id=LLH
-    chr2  175604671 175605670  READ_00078 *   *   - id=DFZ
-    chr2  230613705 230614704  READ_00079           id=DFZ
-    chr9  20387422  20388421   READ_00355 *   *   + id=DFZ
-    chr8  16396215  16397214   READ_00355 *   *   + id=MNQ
-    chr14 *         *          READ_00355 *   *   - id=MNQ
-    chr4  50534096  50535095   READ_00355 *   *   - id=QOY
-    chr6  58308766  58309765   READ_00376 *   *   - id=EDZ
-    chr5  172249269 172250268  READ_00376 *   *   - id=FAC
-    chr9  123860065 123861064  READ_00376 *   *   - id=MNQ
-
- - - -
-

Segment File

- -

-A segment file describes a list of segments representing gap-free alignments. -This list is either produced internally by LASTZ as a result of the -gap-free extension stage (see Overview), or read from -a user-supplied file via the -‑‑segments option. The latter -causes LASTZ to skip the indexing, seeding, and gap-free extension stages and -begin with the chaining stage (or the next specified stage, if chaining is not -requested). - -

-The file contains two intervals per line, one from the -target and one from the query, with sequence names. Lines beginning with a -# are considered to be comments and are ignored, as are blank -lines. # can also be used to put comments at the end of lines. - -

-Each line looks like -

-    <name1> <start1> <end1> <name2> <start2> <end2> <strand2> [<score>] [#<comment>]
-
-where <name1>, etc. correspond to the target sequence and <name2>, -etc. correspond to the query. Fields are delimited by whitespace. - -

-Locations are one-based and inclusive on both ends, -i.e. origin-one, closed (thus the interval "154 228" -has length 75 and is preceded by 153 bases in its sequence). Negative strand -intervals are measured from the 5' end of the query’s negative -strand -(corresponding to the rightmost end of the given query sequence, -i.e. counted along the reverse strand). All target -intervals are on the positive strand. The two intervals must have the same -length (since these alignments are gap-free). - The score is used to determine the -processing order during gapped extension. -Segments without scores are given a score of zero. - -

-Query sequence names must appear in the same order as they do in the query file. -For each query sequence, normally all positive strand intervals must appear -before any negative strand intervals. -Sequence names for the target may appear in any -order, and are only meaningful if the -multiple action is used; otherwise -they are ignored. Intervals with names not found in the target or query are not -allowed. In cases where sequence names are either unknown or of no importance -(e.g. when all sequences in the file have the same name), a * can -be used as a generic sequence name. - - -

-Here is an example. -

-    R36QBXA37A3EQH 151 225  Q81JBBY19D81JM 14  88 +  6875
-    R36QBXA37D4L6V  26 100  Q81JBBY19D81JM 10  84 +  6808
-    R36QBXA37EVLNU  19  93  Q81JBBY19D81JM  7  81 +  6842
-    R36QBXA37CEBPD   8  81  Q81JBBY19D81JM  9  82 +  7108
-    R36QBXA37BLO6X 132 205  Q81JBBY19D81JM 11  84 -  7339
-    R36QBXA37A2W3P 162 214  Q81JBBY19D81JM  2  54 -  5024
-    R36QBXA37A9395  62 136  Q81JBBY19A323K 18  92 +  7231
-    R36QBXA37DNC74  18  82  Q81JBBY19A323K  2  66 +  6418
-    R36QBXA37CTR26  83 167  Q81JBBY19ASA7F 19 103 +  8034
-    R36QBXA37C2TAC  95 181  Q81JBBY19ASA7F 15 101 +  8272
-
- - - -
-

LAV (alignment output)

- -

-LAV is the format produced by BLASTZ, and is the default. It reports the -alignment blocks grouped by "contig" (chromosome, scaffold, read, etc.) and -strand, and describes them by listing the coordinates of gap-free segments. -This format is compact because it does not include the nucleotides, but -consequently interpretation usually requires access to the original sequence -files, and it is not easy for humans to read. - -LAV specification - -(same specification at PSU) - -

-The option ‑‑format=lav+text adds -textual output for each alignment block (in the same -format as the ‑‑format=text option), intermixed with the LAV -format. Such files are unlikely to be recognized by any LAV-reading program. - - - -

-

AXT (alignment output)

- -

-AXT is a pairwise alignment format popular at UCSC and PSU. - -UCSC AXT specification -

-The option ‑‑format=axt+ reports -additional statistics with each block, in the form of comments. The exact -content of these comment lines may change in future releases of LASTZ. - - - -

-

MAF (alignment output)

- -

-MAF is a multiple alignment format developed at UCSC. The MAF files produced -by LASTZ have exactly two sequences per block: the first row always comes from -the target sequence, and the second from the query. - -UCSC MAF specification -

-The option ‑‑format=maf+ reports -additional statistics with each block, in the form of comments. The exact -content of these comment lines may change in future releases of LASTZ. -

-The option ‑‑format=maf- suppresses -the MAF header and any comments. This makes it suitable for concatenating -output from multiple runs. -

-UCSC’s MAF should not be confused with other formats that have the same -name. For example, the MIRA sequence assembler project has a file format named -MAF, but it is a completely unrelated file format and is not supported by LASTZ. - - - -

-

SAM (alignment output)

- - -

-SAM is a pairwise alignment format used primarily for short-read mapping, and -supported by the SAMtools programming suite. This format is described in -[Li 2009], and as of May 2011 a specification for it -can be found at the SAMtools page -at SourceForge. - -

-For SAM files, LASTZ assumes that the target sequence is the reference and -that query sequence(s) are short reads. For alignments that don't reach the -end of a query, ‑‑format=sam uses -"hard clipping", while ‑‑format=softsam -uses "soft clipping". See the section on "clipped alignment" in the SAM -specification for an explanation of what this means. - -

-The options ‑‑format=sam- and -‑‑format=softsam- suppress the SAM -header lines. This makes them suitable for concatenating output from multiple -runs. - - - -

-

CIGAR (alignment output)

- -

-

-CIGAR is an acronym for Concise Idiosyncratic Gapped Alignment Report, a -pairwise alignment format defined originally by the -Exonerate alignment program. -The format has since been adapted in different forms, as -ensembl cigar format -and as an -extended cigar string -in SAMtools. For -‑‑format=cigar, LASTZ implements -Exonerate CIGAR. LASTZ implements other CIGAR variants for -‑‑format=sam -and as fields for ‑‑format=general. - -

-Exonerate CIGAR -format does not include nucleotides; instead it describes the locations of -indels (but not substitutions) using run-length encoding. An alignment is -characterized as runs of M (match and/or substitution), -I (query contains a base not in target), and D -(target contains a base not in query). Each run is encoded by the letter code, -whitespace, and the length; multiple runs are separated by whitespace. The -format also includes positional information for the start of the alignment. An -example is shown at the end of this -section. While there seems to be no complete, definitive specification for -CIGAR, the CIGAR files produced by LASTZ are believed to match the format -produced by Exonerate. - -

-In the other variants of CIGAR, whitespace is removed and the order of the -letter code and length are reversed (length appears before letter code). In -some variants the length is omitted if it is 1; in other variants -M runs are divided into = (match) and X -(substitution). SAMtools extended cigar strings allow S and -H runs to describe clipping operations for short sequences. -LASTZ implements combinations of these variants where appropriate; details -are described in -‑‑format=general:cigar, -‑‑format=general:cigarx -and ‑‑format=sam. - -

-

-To understand the differences between different types of CIGAR strings, -consider the following alignment of a short 61-bp query to a longer target. - -

-

-    target:  ...GATTAAGAGTCTGTCCGACCTTCTTCT---GGGTTTACCGAAGCCCACTTAGCTGATATTCGA...
-                   ||||||||||||||||X|||||||   |||||||  X||||||||||||||||||
-     query:     ACCTAAGAGTCTGTCCGACATTCTTCTACGGGGTTTA--TAAGCCCACTTAGCTGATAAGGTT
-                   ↑      1         2         3           4         5    ↑    6
-                0123456789012345678901234567890123456--789012345678901234567890
-
-
- -

-For ‑‑format=cigar, the alignment would be described by this line: -

-    cigar: query 3 56 + target <start> <end> <strand> <score> M 24 I 3 M 7 D 2 M 19
-
- -

-For ‑‑format=general:cigar, the -alignment path would be described by this field: -

-    24M3I7M2D19M
-
- -

-For ‑‑format=general:cigarx, the -alignment path would be described by this field: -

-    16=X7=3I7=2DX18=
-
- -

-For ‑‑format=sam, the alignment path would -be described by this field: -

-    3H24M3I7M2D19M5H
-
- - - -
-

BLASTN (alignment output)

- - -

-The BLASTN format reports pairwise alignments in a format similar to -NCBI’s BLASTN program. Output is modeled upon version 2.2.24+ of the -standalone version of BLASTN available from - -NCBI’s BLAST ftp site. Output should be similar to that produced by the -command -

-    blastn -task blastn -db <target> -query <query> -outfmt 7
-
-It is important to realize that a couple of the fields, specifically -evalue and bit score, are written as crude -approximations of the value that BLASTN would produce, as described below. - -

-The format is tab-delimited with one alignment reported per line, plus an -additional header. Here is some sample output: -

-    # lastz --format=blastn
-    # Query: orange
-    # Database: apple
-    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
-    orange apple 82.14  2072 142 67 2    1926 103  2093 0     1972
-    orange apple 100.00 14   0   0  1906 1919 2086 2073 0.043 26.5
-    orange apple 93.33  15   1   0  1763 1777 2004 1990 0.53  22.9
-
- -

-Most of the fields correspond directly to fields available in the -General output format. These are -query id=name2, -subject id=name1, -%identity=blastid%, -alignment length=ncolumn, -mismatches=nmismatch, -gap opens=ngap, -q.start=start2, -and q.end=end2. -The fields s.start and s.end are nearly equivalent -to start1 and end1, but when the alignment is to the -reverse strand, they appear in the other order (i.e. -s.start > s.end). - -

-The two remaining fields, evalue and bit score, are crudely estimated -from LASTZ’s score field, but are not strictly -correct. Further, these approximations assume that default LASTZ scores -are used. Otherwise they are unlikely to be good approximations. The -approximation formulas are -

-     evalue    = 3.0e9*exp(-0.01421*score)
-     bit score = 0.0205*score
-
- - - -
-

Differences (alignment output)

- -

-LASTZ’s Differences format reports each difference between target and query -on a separate line, where a difference is any indel or run of -mismatches. It is intended for comparisons between close sequences, such as -when comparing reads from a human individual to the human reference genome, or -reads from a bacterial strain to a reference sequence for the same bacterium. -The format is a tab-delimited table with one line per difference; it is -well-suited for use with spreadsheets and the -R statistical package. - -

-The columns reported in this format are the name, start & end of the -difference, strand, and overall size for the target; the name, start & end -of the difference, strand, and overall size for the query; the text of the -difference in the target, then in the query; and finally the text of the -complete alignment block containing the difference, first in the target, then -in the query. Intervals are origin-zero, half-open -and counted along the forward strand. - -

-The example below compares output in this format to similar results using the -General output format for the same input sequences. -For the Differences output, column numbers have been added for discussion (they -are not in the actual output file). Each line in the output represents -slight evidence that a mutation occurred changing the target sequence -(chr22 here) to the query sequence (column 6). Columns 11 and 12 indicate the -specific mutation that has putatively occurred. For example, the first line -suggests that either an A has been -inserted into -chr22 at position 14485783, or an A has been -deleted from -EAYGRGI02GQ0SL at position 167 (actually, between positions 166 and 167). -Note that there are three differences reported for -EAYGRGI02GQ0SL, so it appears on three lines. The fifth line shows a putative -SNP at chr22 position 15234401, with a C in the reference and a G in the read, -while the seventh line shows evidence for an inversion of neighboring bases -(AG vs. GA). -Note that there are no lines for EAYGRGI01BIQCW, indicating a -perfect match for that block (i.e., no differences). - -

-Sample output for ‑‑format=differences. -

-     (1)     (2)      (3)  (4)   (5)         (6)       (7) (8) (9) (10) (11)(12)  (13)     (14)
-    chr22 14485783 14485784 + 49691432  EAYGRGI02GQ0SL 167 167  +  303   A   -   TGAGA... TGAGA...
-    chr22 14485791 14485792 + 49691432  EAYGRGI02GQ0SL 174 174  +  303   A   -   TGAGA... TGAGA...
-    chr22 14485843 14485843 + 49691432  EAYGRGI02GQ0SL 225 226  +  303   -   A   TGAGA... TGAGA...
-    chr22 14731895 14731895 + 49691432  EAYGRGI01EAV19 228 229  -  298   -   A   CTTCT... CTTCT...
-    chr22 15234401 15234402 + 49691432  EAYGRGI02H5ZGS 99  100  -  180   C   G   CGAAT... CGAAT...
-    chr22 15255536 15255537 + 49691432  EAYGRGI01BTT7U 56  56   -  267   A   -   TTTGC... TTTGC...
-    chr22 15255552 15255554 + 49691432  EAYGRGI01BTT7U 71  73   -  267   AG  GA  TTTGC... TTTGC...
-    chr22 15255617 15255618 + 49691432  EAYGRGI01BTT7U 136 136  -  267   A   -   TTTGC... TTTGC...
-    chr22 15255624 15255625 + 49691432  EAYGRGI01BTT7U 142 142  -  267   A   -   TTTGC... TTTGC...
-
- -

-Sample output for -‑‑format=general:name1,zstart1,end1,strand1,size1,name2,zstart2+,end2+,strand2,size2,text1,text2. -

-    chr22 14485616 14485920 + 49691432  EAYGRGI02GQ0SL 0   303  +  303   TGAGA... TGAGA...
-    chr22 14731668 14731964 + 49691432  EAYGRGI01EAV19 0   297  -  298   CTTCT... CTTCT...
-    chr22 15234302 15234482 + 49691432  EAYGRGI02H5ZGS 0   180  -  180   CGAAT... CGAAT...
-    chr22 15238845 15239070 + 49691432  EAYGRGI01BIQCW 0   225  -  225   TGGAA... TGGAA...
-    chr22 15255480 15255750 + 49691432  EAYGRGI01BTT7U 0   267  -  267   TTTGC... TTTGC...
-
- -

-(This example aligns reads from the genome of James Watson (available from -NCBI’s trace archive -by querying for CENTER_NAME = 'CSHL' and CENTER_PROJECT = 'Project Jim') -vs. the human reference genome hg18.) - - - -

-

R Dotplot (alignment output)

- -

-This is a home-grown format designed to facilitate plotting the alignment -blocks with the R statistical package. -Alignments are reduced to a series of gap-free segments, each of which is -written in three lines as shown below. Endpoints are -origin-one, closed, and alignments on the reverse -strand have -<..._query_end> less than -<..._query_start> so that R will draw them in the reverse -(slope=−1) orientation. - -

-

-    <target_name>            <query_name_>
-    <segment1_target_start>  <segment1_query_start>
-    <segment1_target_end>    <segment1_query_end>
-    NA                       NA
-    <segment2_target_start>  <segment2_query_start>
-    <segment2_target_end>    <segment2_query_end>
-    NA                       NA
-     ...
-
- -

-The file can then be plotted in R with commands like these: -

-    dots = read.table("your_file",header=T)
-    plot(dots,type="l")
-
- -

-When the query file contains more than one sequence, alignments for each -query sequence are written as shown above. This includes a new header line -for each query. Unfortunately the simple R commands shown above will not -work to plot a file with more than one query. - -

-When the target file contains more than one sequence, alignments for target -sequences are intermixed in the output file. In this case the entire target -is treated as a single sequence, and the target positions reported are relative -to this concatenated sequence. This can still be plotted using the simple R -commands above, but the target sequences will appear as one concatenated -sequence in the plot. - - - - -

-

Human-Readable Text (alignment output)

- -

-This textual output is intended to be read by people rather than programs. -Each alignment block is displayed with gap characters and a row of -match/transition characters, and lines are wrapped at a reasonable width -to allow printing to paper. The exact format of this output may change in -future releases of LASTZ, so programs are better off reading more stable -formats like LAV, AXT, or -MAF. - - - -

-

General Output (alignment output)

- -

-LASTZ’s General format is a tab-delimited table with one line per -alignment block and configurable columns. This format is well-suited for use -with spreadsheets and the -R statistical package, -and for filtering with shell commands. - -

-The syntax for this option is: -

-    ‑‑format=general[:<fields>]
-
-where <fields> is a comma-separated list of field names in -any desired order, with no spaces. For example -
-    ‑‑format=general:nmismatch,name1,strand1,start1,end1,name2,strand2,start2,end2
-
-will report each aligned interval pair and the number of mismatches in the -alignment of that pair, like this: -
-    #nmismatch name1   strand1 start1 end1 name2    strand2 start2 end2
-    41         apple8  +       130    930  orange2  -       119    931
-    35         apple15 +       113    930  orange3  +       87     909
-    52         apple4  +       131    952  orange5  -       111    932
-    46         apple7  +       131    930  orange10 +       111    909
-    37         apple12 +       131    930  orange11 -       111    909
-    38         apple3  +       127    939  orange12 +       107    926
-
- -

-The recognized field names are shown in the table below. Positions (start and -end fields) are counted from the 5' end of the aligning strand, -unless otherwise indicated in the table. -Please see Interval Coordinates for more information -about the position numbering systems used in LASTZ. - -

-If the field list is absent, the following -fields are printed, in this order:  -score, name1, strand1, -size1, zstart1, end1, -name2, strand2, size2, -zstart2, end2, identity, -coverage.  - -

-The option ‑‑format=mapping is a shortcut for ‑‑format=general -with the following fields:  -name1, zstart1, end1, -name2, strand2, zstart2+, -end2+, identity, coverage, -cigarx-. - -

-Field names are normally included as column headers in the first row of the -output, preceded by a #. The options -‑‑format=general-[:<fields>] -and ‑‑format=mapping- suppress column headers. This makes -them suitable for concatenating output from multiple runs. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
FieldMeaning
scoreScore of the alignment block. The scale and meaning of this number will -vary, depending on the final stage performed and other command-line options. -
name1Name of the target sequence.
number1 - -Number of the target sequence within the target file. The first sequence -is numbered zero. -
strand1Target sequence strand, either "+" or "−".
size1Size of the entire target sequence.
start1Starting position of the alignment block in the target, origin-one.
zstart1Starting position of the alignment block in the target, origin-zero.
end1 -Ending position of the alignment block in the target, expressed either as -origin-one closed or origin-zero half-open (the ending value is the same in -both systems). -
length1Length of the alignment block in the target (excluding gaps).
text1Aligned characters in the target, including gap characters. - align1 can be used as a -synonym for text1. -
qalign1 - -The target quality sequence (if there is one) corresponding to aligned -characters. Gaps are indicated as a tilde (~). -
nucs1 - -The entire target sequence, after modifications due to specifier actions such -as subrange or softmask. -

-This is output in order along the target’s forward strand, regardless of -the strand of the alignment. -

quals1 - -The entire target quality sequence (if there is one), after modifications due -to specifier actions such as subrange. -

-This is output in order along the target’s forward strand, regardless of -the strand of the alignment. -

name2Name of the query sequence.
number2 - -Number of the query sequence within the query file. The first sequence -is numbered zero. -
strand2Query sequence strand, either "+" or "−".
size2Size of the entire query sequence.
start2Starting position of the alignment block in the query, origin-one.
zstart2Starting position of the alignment block in the query, origin-zero.
end2 -Ending position of the alignment block in the query, expressed either as -origin-one closed or origin-zero half-open (the ending value is the same in -both systems). -
start2+ -Starting position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -origin-one. -Note that if strand2 is "−", then this is the other end of -the block from start2. -
zstart2+ -Starting position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -origin-zero. -Note that if strand2 is "−", then this is the other end of -the block from zstart2. -
end2+ -Ending position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -expressed either as origin-one closed or origin-zero half-open (the ending -value is the same in both systems). -Note that if strand2 is "−", then this is the other end of -the block from end2. -
length2Length of the alignment block in the query (excluding gaps).
text2Aligned characters in the query, including gap characters. - align2 can be used as a -synonym for text2. -
qalign2 - -The query quality sequence (if there is one) corresponding to aligned -characters. Gaps are indicated as a tilde (~). -
nucs2 - -The entire query sequence, after modifications due to specifier actions such -as subrange or softmask. -

-This is output in order along the query’s forward strand, regardless of -the strand of the alignment. -

quals2 - -The entire query quality sequence (if there is one), after modifications due -to specifier actions such as subrange. -

-This is output in order along the query’s forward strand, regardless of -the strand of the alignment. -

nmatch -Match count, the number of aligned bases in the -block that are matches. -
nmismatch -Mismatch count, the number of aligned bases in -the block that are mismatches (substitutions). -
ncolumn -Number of columns in the block. This includes matches, mismatches -(substitutions), and gaps. -
npair -Number of aligned bases in the block that are matches or mismatches -(substitutions). -
ngap -Gap count, the number of gaps in the block, counting -each run of gapped columns as a single gap. -
cgap -Gap column count, the number of gaps in the block, -counting each gapped column as a separate gap. -
diff -Differences between what would be written for text1 and -text2. Matches are written as . (period), transitions -as : (colon), transversions as X, and gaps as -- (hyphen). -
cigar - -A CIGAR-like representation of the alignment’s -path through the -DP matrix. This is the short representation, -without spaces, described in the -Ensembl CIGAR specification. -

-For more information, see the section about CIGAR and its -example. -

cigarx - -Same as cigar, but uses a newer syntax that distinguishes matches -from substitutions and omits the run length when it is 1. -

-For more information, see the section about CIGAR and -its example. -

identity -Fraction of aligned bases in the block that are matches (see -Identity). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. -
idfrac -Fraction of aligned bases in the block that are matches (see -Identity), written as a fraction. -
id% -Fraction of aligned bases in the block that are matches (see -Identity), written as a percentage. -
blastid% -Fraction of the alignment block that is matches, as would be reported by NCBI -BLAST. The numerator is the number of matches, and the denominator is the -number of alignment columns. The value is written as a percentage but without -a percent sign. -

-This is not the same as LASTZ normally reports for identity, since -NCBI BLAST includes gaps in the computation. -

continuity -Rate of non-gaps (non-indels) in the alignment block (see -Continuity). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. - -
confrac -Rate of non-gaps (non-indels) in the alignment block (see -Continuity), written as a fraction. -
con% -Rate of non-gaps (non-indels) in the alignment block (see -Continuity), written as a percentage. -
coverage - -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. -
covfrac -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage), written as a fraction. -
cov% -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage), written as a percentage. -
diagonal -The diagonal of the start of the alignment block in the -DP matrix, expressed as an identifying number -start1-start2. -
shingle -A measurement of the shingle overlap between the -target and the query. This is intended for the case where both the target and -query are relatively short, and their ends are expected to overlap. -
number - -The alignment number, counted as alignments are written to output. The count -begins at one. -
znumber - -The alignment number, counted as alignments are written to output. The count -begins at zero. -
chore - -The identifying tag corresponding to the chore that produced the alignment. -The tag is defined in the alignment chores file. -
- - - -

-

Other Output

- -

-LASTZ includes support for other output formats which are intended mainly -for the convenience of the developers. If you have specific questions, -please contact us. - - - - - - -


-
-

Advanced Topics

- - - - -
-

Aligning to Whole Genomes

- -

-Aligning queries to a whole genome can be accomplished in a single run of -lastz by using the -multiple action in the -target file’s sequence specifier. This causes -lastz to load all of the target’s sequences into memory. -However, sequence indexing inside lastz is limited to 31-bit -positions, which limits the overall size of the target to 2 gigabases. - -

-To facilitate larger genomes, an additional executable (lastz_32) -can be built. The two executables are basically the same; the only difference -is that sequence indexing in lastz is limited to 31-bit positions, -while lastz_32 uses 32-bit positions. The use of smaller positions -in lastz reduces memory usage and improves performance, but limits -the size of the target sequence to 2 gigabases. - -

-To build the lastz_32 executable, enter the following commands -from bash or a similar command-line shell (Solaris users should substitute -gmake for make). This will build the executable and -copy it into your installDir. -

-    cd <somepath>/lastz-distrib-X.XX.XX/src
-    make lastz_32
-    make install_32
-
- -

-lastz_32 can then be used as a replacement for lastz -in any command line, e.g. -

-    lastz_32 hg18.fa[multiple] galGal3.fa \
-      --notransition --step=20 --nogapped \
-      --progress=1 \
-      --format=maf > hg18_vs_galGal3.maf
-
- - - -
-

Adjacent Indels

- -

-Occasionally the sequences being compared contain unrelated segments of DNA -flanked by segments that are related. If the unrelated segments are long enough -(and -different enough) that two gaps are cheaper than a series of substitutions, -the optimal-scoring alignment should contain adjacent indels, like this: -

-

-    ...ATAAATTATTATTATTAAATTTTA-------------------CCCCCCCCCCCCCCCCCCTTTTTA...
-    ...ATAAATTATTATTATTAAATTTTAGGGGGGGGGGGGGGGGGAG-------------------TTTTA...
-
- -

-However, by default, lastz does not allow an insertion to follow a deletion, or -vice versa. So it ends up reporting an alignment like this instead: -

-

-    ...ATAAATTATTATTATTAAATTTT------------------ACCCCCCCCCCCCCCCCCCTTTTTA...
-    ...ATAAATTATTATTATTAAATTTTAGGGGGGGGGGGGGGGGGA------------------GTTTTA...
-
- -

-The latter alignment doesn't make any sense biologically. However, to maintain -backward compatibility with previous versions of LASTZ (and BLASTZ), the default -version of LASTZ will produce the latter alignment. - -

-Users that want to allow alignments with adjacent indels can build any -LASTZ executable with allowBackToBackGaps enabled. This is -accomplished by adding allowBackToBackGaps=ON to the -make command line, like this: - -

-

-    make clean
-    make lastz_32 allowBackToBackGaps=ON
-    make install_32
-
- - - -
-

Interval Coordinates

- -

-The biological research community has established several competing standards -describing intervals on a strand of DNA. Different programs often use -different standards. Since LASTZ supports several input and output formats, it -is inevitable that it uses more than one way of describing an interval. We -describe the different conventions here. - -

-For this discussion, suppose we have a 50-nucleotide strand of DNA as follows: -

-

-        origin-one, closed: 12345678901234567890123456789012345678901234567890
-                                      ↓      ↓
-                     5' >>> CGACCTTACGATTACCTACTTAACACGTAAACTGAGGGATCAAAAGGAAA >>> 3'
-                                      ↑       ↑
-    origin-zero, half-open: 01234567890123456789012345678901234567890123456789
-
- -

-Note that since this is DNA it has 5' and 3' ends; -we assume that all input sequences follow the standard practice of listing the -bases with the 5' end on the left. -Here we've highlighted the subsequence ATTACCTA so we can -discuss how to describe the interval it occupies. There are two commonly used -ways to do this. Both count from 5' to 3' (left to right). One way, -origin-one, starts counting from one. The other way, -origin-zero, starts counting from zero. So in origin-one, -ATTACCTA begins at position 11, while in origin-zero it begins at -position 10. - -

-To describe the ending position, there are also two commonly used methods. -One way is closed, in which the position of the last nucleotide is -given. The other is half-open, in which the position following the -last nucleotide is given. These are theoretically independent of the -conventions for the origin, but in practice only two of the combinations are -commonly used: origin-one, closed and -origin-zero, half-open. In the former, ATTACCTA is -said to occupy the interval (11,18), while in the latter it is said to occupy -the interval (10,18). Notice that only the first number changes between these -two paradigms; the second number stays the same. - -

-Another factor to consider is that DNA is usually double stranded, which would -look like this: -

-

-        along forward:        12345678901234567890123456789012345678901234567890
-                                        ↓      ↓
-       forward strand: 5' >>> CGACCTTACGATTACCTACTTAACACGTAAACTGAGGGATCAAAAGGAAA >>> 3'
-    complement strand: 3' <<< GCTGGAATGCTAATGGATGAATTGTGCATTTGACTCCCTAGTTTTCCTTT <<< 5'
-                                        ↑      ↑
-        along reverse:        09876543210987654321098765432109876543210987654321
-
- -

-In some cases it makes sense to refer to the interval along the complement -strand. For example, if the above sequence was a query and the target -contained TAGGTAAT, how should the query position of an alignment -of those two be described? One way would be to still refer to the interval -along the forward strand (which we also call the plus or -positive strand), and just indicate that in fact it was the reverse -complement of that interval that aligned. We call this -counting along the forward strand. Another way is to count from the -other end, from the 5' end of the complement strand (which we also call the -reverse, minus or negative strand). We call -this counting along the reverse strand, and for clarity we might add -"from its 5' end". In this example, if we were using origin-one, closed -counting, we would say that TAGGTAAT occurs at (33,40) along the -reverse strand. -Unless noted otherwise (e.g. for the -R Dotplot output format), when counting along the -forward or reverse strand LASTZ swaps the interval’s endpoints if -necessary, so -the position called start is numerically ≤ the position called -end. This is a common convention, but there are other programs -that leave them unswapped. - -

-Note that when counting positions all characters in the sequence are counted, -including runs of Ns or Xs and even invalid -characters. This is important so that other programs can use the reported -positions to index directly into the original sequences. - - - -

-

Non-ACGT Characters, Splicing, and Separation

-

-The handling of characters other than A, C, -G, and T in sequences that are supposed to represent -DNA is problematic. -In ordinary (non-quantum) DNA sequences, LASTZ currently supports two of these, -N and X. They can either be present in the original -input file (except that the Nib and -2Bit formats are incapable of containing -Xs), or added by using an -xmask or -nmask action in the -sequence specifier. -LASTZ can also be configured to tolerate the other IUPAC-IUB ambiguity codes -(B, D, H, K, M, R, S, V, W, and Y), and to recognize -a special user-specified separator character. - -

-Many database sequences contain Ns to represent bases for which -the actual nucleotide is not known (at least, not known with any level of -confidence). Ns (or better, Xs) can also be used to -mask out regions that have previously been identified as being of no interest, -and therefore should not be aligned. And unfortunately, there is also a -tradition of using strings of Xs or Ns to splice -together multiple sequences to gain efficiency when dealing with programs that -were limited to operating on a single sequence. - -

-Although splicing was useful in BLASTZ, it is no longer needed for LASTZ. -Since LASTZ can handle multiple target sequences (via the -multiple action in the target -file’s sequence specifier), it is preferred that users not -resort to splicing. -If splicing is necessary, the preferred method is to specify a -separator character to tell LASTZ explicitly -where the splices have occurred. - -

-Replacing BLASTZ with LASTZ in an existing -pipeline may still involve spliced sequences, so LASTZ’s default -interpretation of non-ACGT characters is the same as BLASTZ’s:  -Xs are excluded from the alignment seeding stage, and are so -severely penalized by alignment scoring that they will not normally -appear in -any alignment. Ns are also excluded from seeding, and are -penalized about the same as a transversion mismatch. Specifically, any -substitution with X is scored as −1000, and any substitution -with anything else (other than A, C, G, -or T) is scored as −100. -Note that you have to put "enough" Xs or Ns between -the sequences so that no alignment block will cross the splice. This can be -tricky, since gap scoring is only dependent on the length of the gap and not on -the characters in the gap. So if a gap the same length as the splice is not -penalized more than the y-drop setting, the -alignment may hop the splice. As a rough guideline, a splice length of 50 is -usually enough with the default settings, but this is not guaranteed. - -

-This default treatment of non-ACGT characters also works well when -Xs or Ns are used to mask out regions that should not -be aligned. However, it is inappropriate when the sequences contain -Ns to represent ambiguous bases. To handle this case, LASTZ -provides the ‑‑ambiguous=n option, -which causes substitutions with N to be scored as zero. -Additionally, the ‑‑ambiguous=iupac -option causes the other IUPAC-IUB ambiguity codes -(B, D, H, K, M, R, S, V, W, and Y) to be treated the -same as an ambiguous N. The two ‑‑ambiguous options -also allow you to specify rewards and penalties for matches and mismatches -involving ambiguous characters. - -

-In either case, non-ACGT characters are ignored during the seeding stage. -Only seed words that consist entirely of A, C, -G, and/or T are involved in seeding, even if the -non-ACGT characters occur in "don't-care" positions in the seed pattern. - -

-The score values described above can be changed if a -scoring file is specified. The −1000 score -is called bad_score and the −100 score is called -fill_score. Further, which character is considered "bad" (by -default this is X) can also be specified in the scoring file, and -can actually be different between the target and query. Throughout this -document, when we refer to the character X appearing in a DNA -sequence, we generally mean the character specified as "bad", which defaults to -X. - -

- -Splicing, or more correctly separation, can also be accomplished by -placing a specific character between subsequences, then using the -separator=<character> -action. LASTZ will then break the sequence into the prescribed subsequences -and prevent any alignment from crossing the boundaries. - -

-Quantum DNA sequences are different: they use an -arbitrary, user-defined alphabet of symbols, so the above-mentioned special -treatments for N and X do not apply. The default -"bad" character for quantum sequences is the null byte (00 -hexadecimal), which is not even allowed in the sequence; however it can be -changed to one of the valid alphabet symbols via the scoring file. There is -no analog of ambiguous Ns for quantum sequences, as typically -every symbol has some level of ambiguity. - - - -

-

Sequence Name Mangling

- -

-Often the names in the input sequence files are inconvenient for downstream -processing, or create problems with certain output formats. This is further -complicated by the fact that some input formats (most notably -Nib) do not contain sequence names, so in those cases -a name must be derived from the filename. LASTZ provides several choices for -naming the input sequences. These alternatives are mutually exclusive; only -one can be used at a time for a particular input file. - -

-Internally, LASTZ handles the naming task in two phases. First, it creates a -full header for the sequence. If the input format provides a name -or header, that becomes the full header. Otherwise, the full header is -constructed from the file name. - -

-In the second phase, LASTZ shortens the full header to a nickname. If the full -header starts with a file name, any path prefix is removed, and commonly-used -file extension suffixes are also removed (.fa, .fasta, -.nib, .2bit). Then by default, LASTZ uses the first -word (composed of characters other than whitespace, vertical bar, or colon) of -the remaining string as the sequence name. Thus a -FASTA header like -"> ~someuser/human/hg18/chr1.fa Human Chromosome 1" -is shortened to simply chr1. - -

-The actions -nameparse=darkspace -and nameparse=alphanum in the -sequence specifier change how the first word is -determined. darkspace -(i.e., "non-whitespace") narrows the set of terminating characters -to allow vertical bars and colons to appear in the word, while -alphanum widens it so the word is restricted to only alphabetic, -numeric, and underscore characters. Path prefixes -and file extensions are still removed. - -

-The default shortening is often adequate. For example, consider the following -FASTA file. By default, the names will be 000007_3133_3729 and -000015_3231_1315. -

-

-    >000007_3133_3729 length=142 uaccno=FX9DQEU13H5YZN
-    ACCCGAAAGAGAAACAGCTTCCCCCCCTGTCCCGAGGGATATCAAGTAGTTTGTTGGCTA
-    GGCTGATATTGGGGCCTTCCGCTAGAGTCGGCGCCCGCGCCTACGAGTCCCCCCCACCCC
-    CCACCCCCACAGCGGGTTATCC
-    >000015_3231_1315 length=190 uaccno=FX9DQEU13HUTXE
-    TTGTTGAGTCGGATGAGAATAGCAAGTGCAGTCAACGGCAATGTGCTGGGTTAGTACAAC
-     ...
-
- -

-However, the user may find it more convenient to use the accession numbers. To -accomplish this, she can use the -nameparse=tag:uaccno= action. LASTZ -will look for the tag string uaccno= in each header and read the -name from the characters that follow it, up to the first character that is not -alphabetic, numeric, or an underscore. In this case the sequence names would be -FX9DQEU13H5YZN and FX9DQEU13HUTXE. If the tag string -is not found in the full header for a particular sequence, the default -shortening is used instead. - -

-Now consider this FASTA file: -

-

-    >gi|197102135|ref|NM_001133512.1| Pongo abelii ...
-    GCGCGCGTCTCCGTCAGTGTACCTTCTAGTCCCGCCATGGCCGCTCTCACCCGGGACCCC
-    CAGTTCCAGAAGCTGCAGCAATGGTACCGCGAGCACGGCTCCGAGCTGAACCTGCGCCGC
-     ...
-    >gi|169213872|ref|XM_001716177.1| PREDICTED: Homo sapiens ...
-    ATGTCTGAGGAGGTAGGATTTGATGCAGGAGGGAGGATCTGGTGCACTTATAAGGATCTG
-    GGTCTGTCAGTGTCAGAGAAGGTAGGATCTGGCCCTGGTATGAGGATCTGGATCTGTCAG
-     ...
-    >gi|34784771|gb|BC006342.2| Homo sapiens ...
-    GGGTGGGAGGACGCTACTCGCTGTGGTCGGCCATCGGACTCTCCATTGCCCTGCACGTGG
-    GTTTTGACAACTTCGAGCAGCTGCTCTCGGGGGCTCACTGGATGGACCAGCACTTCCGCA
-     ...
-
- -

-In this case the default action does not do what we want (all sequences would -be named gi). The action nameparse="tag:gi|" gives -us the names 197102135, 169213872, and -34784771. (Note the quotes; this is necessary to prevent the -command-line shell from interpreting | as a pipe character.) -Observe that a tag of ref| will not work, because the third -sequence would need gb| instead. - -

-Sometimes it is more convenient just to assign a specific name. This can be -done with the -nickname=<name> -action. For example, using the target and query file specifiers -~someuser/human/hg18/chr1.nib[nickname=human] and -~someuser/human/ponAbe2/chr1.nib[nickname=orang], the output -will show the sequences as human and orang rather -than calling them both chr1. -If <name> contains the substring {number}, -the nickname will contain the number of the sequence within the file. This is -particularly useful when there is more than one sequence in the file. - -

-If you want to do away with name mangling entirely, you can use the action -nameparse=full. This uses the full -header as the sequence name. But note that if it contains spaces, the -resulting alignment files may not be readable by downstream tools. - -

-The above discussion applies to ordinary DNA sequences in FASTA, Nib, or -2Bit format. HSX index files -are handled differently: by default LASTZ uses the name from the index as-is, -without shortening it, -and the various nameparse actions are not -allowed. The nickname action can be used, -but is generally not -necessary since you can store the names you want directly in the index. - -

-Note that if the -subset=<names_file> action is -used, the names in the <names_file> must match the mangled -(or indexed) names. - -

-For FASTA files, more complicated name mangling can be performed using standard -Unix command-line tools. In the second example above, we could pipe the input -through sed a couple times to shorten each name to the NCBI -accession numbers NM_001133512.1, XM_001716177.1, -and BC006342.2. -

-

-    cat query_file.fa \
-      | sed "s/>.*ref\|/>/g" \
-      | sed "s/>.*gb\|/>/g" \
-      | lastz target /dev/stdin
-
- - - - - -
-

Seed Patterns

- -

-Seeds are short near-matches between the target and query sequences, where -"short" typically means less than 20 bp. Early alignment programs used exact -matches (e.g. of length 12) as seeds, but spaced seeds can improve -sensitivity when the sequences are diverged. - -

-A spaced seed pattern is a list of positions, in a short word, where -a seed may contain mismatches. For example, consider the seed pattern -1100101111. A 1 indicates a match is -required in this position, and a 0 indicates a mismatch is allowed -(effectively it is a "don't care" position). As the example below shows, using -this seed pattern, the seed word GTAGCTTCAC hits twice in the -sequence ACGTGACATCACACATGGCGACGTCGCTTCACTGG. -

-

-        target:  ACGTGACATCACACATGGCGACGTCGCTTCACTGG
-    (mis)match:    ||XX|X||||          ||X|||||||
-         query:    GTAGCTTCAC          GTAGCTTCAC
-       pattern:    1100101111          1100101111
-
-

-Spaced seeds have been shown to be more sensitive than exact match seeds, with -little change in specificity. This is most advantageous when the sequences -have lower similarity, such as human vs. mouse or chicken. Which seed pattern -is best depends on the sequences being compared. See -[Buhler 2003] for a discussion of spaced seeds and -how to design them. - -

-LASTZ’s seeding options give the -“user” many choices. The intent is that these will be selected by -some program (hence the quote marks around “user”), but they are -available from the command line for anyone. - -

N-mer match:

-A space-free seed can be specified by the length of the N-mer match required. -
-    --seed=match<length>
-
- -

General seed patterns:

-Any spaced seed pattern can be specified. The pattern is a string of -1s, 0s, and Ts, where a 1 -indicates that a match is required in that position, a 0 indicates -that a mismatch is allowed, and a T indicates that a mismatch is -allowed only if it is a transition (A↔G or C↔T). -
-    --seed=<pattern>
-
-The default seed is ‑‑seed=1110100110010101111, which is the same -12-of-19 seed used as the default in BLASTZ. - -

Half-weight seed patterns:

-If a seed pattern consists of only 0s and Ts, it is -implemented internally as a half-weight seed, which uses much less memory -(the same amount as a normal seed pattern half as long). Additionally, -‑‑seed=half<length> can be used as shorthand to specify a -space-free half-weight seed (i.e., all Ts). - -

Single, double, or no transitions:

-By default, one match position (a 1 in a spaced seed, or any -position in an N-mer match) is allowed to be a transition instead of a true -match. ‑‑notransition disables this. Alternatively, -‑‑transition=2 allows any two match positions to be -transitions. - -

Filtering on transversions and matches:

-The ‑‑filter option imposes additional requirements on the number -of transversions and matches in a valid seed. This is especially useful in -conjunction with half-weight patterns. For example, -
-    --seed=TTT0T00TT00T0T0TTTT --filter=2,15
-
-specifies the same pattern as the default seed, but allows the twelve -T positions to be matches or transitions, requires at least -fifteen matches total (among the 19 positions), and allows at most two -transversions. Note that the transversions can only occur in the -0 positions, since the T positions allow only matches -or transitions. -And although there are seven 0 positions, five of -them must contain matches or transitions since only two transversions are -allowed. - -

Twin hit seeds:

-The sensitivity of the seed can be decreased by ignoring seeds that don't -have a second hit nearby, i.e. by requiring two seeds on the same diagonal. -
-    --twins=[<minsep>..]<maxsep>
-
-The distance between the hits (the number of bases between the end of the -first hit and the beginning of the second) must be at least -<minsep> but not more than <maxsep>. -If <minsep> is omitted, zero is used (which means the -twin seeds may be adjacent but not overlap). Negative values can -be used; for example ‑‑twins=‑5..10 -means the twins can overlap -by as much as 5 bases or can have as much as 10 bases between them. - - - - -
-

Any-or-None Alignment

- -

-Sometimes, the only answer you want from an aligner is whether a query has -any strong alignments to the target or not. For example, you may want to know -which reads in a sequencing run have no alignment with a reference -genome. In this case, if a read aligns to a thousand different places on a -particular chromosome, you aren't interested in learning where — all you -want to know is whether it aligned or not. - -

-The ‑‑anyornone option is designed -for such cases, and can significantly improve alignment speed. Once any -qualifying alignment has been found, processing for the current query is -halted. The alignment is reported to the output, and then we immediately begin -processing the next query. A qualifying alignment is one that would normally -be output given the other parameter settings; for example it satisfies the -scoring thresholds (‑‑hspthresh -and/or ‑‑gappedthresh) and any -back-end filters. - -

-To get a list of reads that have at least one "good" alignment with a reference -sequence, you could do something like this: -

-    lastz <reference> <reads> --anyornone  \
-      --step=10 --seed=match12 --notransition --exact=20 --noytrim \
-      --match=1,5 --ambiguous=n \
-      --filter=coverage:90 --filter=identity:95 \
-      --format=general:name2
-
- - -

-This option slightly changes the usual processing order described in the -Overview. Instead of performing gap-free extension -on all seeds, collecting them into a list of HSPs, and then performing gapped -extension, each HSP is gap-extended and back-end filtered immediately. This -avoids wasted work to perform -complete -early stage processing on hits that will -just be abandoned as soon as the first qualifying alignment is found. - - - -

-

Y-drop Mismatch Shadow

- -

-The default configuration of gapped extension in LASTZ is to end the alignment -where the score would be the highest. This means that any prefix or suffix of -the alignment will have a non-negative score. While this is appropriate for -alignments that lie somewhere in the middle of two long sequences, it is not -desirable when an alignment is near the end of one or both sequences, which -happens quite often when aligning short reads. - -

-Consider the following alignment of a 50-base query to a chromosome target, and -suppose we are using ‑‑match=1,5, -‑‑gap=6,1, -‑‑filter=identity:97, and -‑‑filter=coverage:95. The entire -alignment as shown has 97.9% identity (46/47) and 100% coverage. However, the -first five bases (AGAAC vs. AGAAG) have a negative -score: four matches at +1 each and one mismatch at −5 gives a score -of −1 for this prefix. The highest scoring alignment is from positions -6 through 50, for a score of 33 (the entire alignment scores only 32). If -we stop the alignment at the highest score, coverage drops to 90%, and the -alignment is discarded. The overall result is that we will discard reads that -we don't want to, and we will see a bias against mismatches near the ends of -reads. (Note that this anomaly arises because the alignment is terminated -abruptly by the end of the sequence rather than normally by a low-scoring -region; also the ‑‑filter=coverage option is more commonly used -with short reads than with longer sequences.) - -

-

-    target:  ... CTTAGAACGGTAGATACTTGTATAAT---CGAGGGGGTTATTTTGTACAAATGACT ...
-                    ||||X||||||||||||||||||   ||||||||||||||||||||||||
-     query:         AGAAGGGTAGATACTTGTATAATCAACGAGGGGGTTATTTTGTACAAATG
-                         ↑                                           ↑
-                    12345678901234567890123456789012345678901234567890
-
- -

-To avoid this behavior, use the -‑‑noytrim option when aligning short -reads. This causes LASTZ to refrain from trimming such alignments back to the -highest-scoring location. Specifically, if the -gapped extension process encounters the end of the -sequence, it will keep that as the end of the alignment. In this case a -negatively-scoring prefix or suffix will be kept as long as it does not score -worse than the ‑‑ydrop value. - - - -

-

Shingle Overlap

- -

-In some applications, e.g. when assembling reads into contigs, we want to -determine how sequence ends overlap each other. For example, in case 1 below, -the starting portion of the query overlaps the ending portion of the target by -30 bases, and both sequences extend beyond each other in opposite directions. -We call this situation "shingling" (like shingles on a rooftop), and the -shingle field of the General output -format provides a measurement of it. A positive value indicates that the -starting portion of the query overlaps the ending portion of the target (case -1), while a negative value indicates the roles are reversed (case 2). If -neither of these cases occurs (e.g. if either sequence fails to extend beyond -the other), an NA is reported. - -

-Case 1 (shingle = +30): -

-                                                    target_end
-                           3         2         1        ↓
-                           098765432109876543210987654321
-    target:  ... GACGGCGGCTAACACATTGTGTTGXACGTACCATAACCAA
-                           ||||||X|||||||||XX||X||||||
-     query:                AACACAGTGTGTTGCAACTATCATAACATTAAACTTTAGA ...
-                           123456789012345678901234567890
-                           ↑        1         2         3
-                      query_start
-
- -

-Case 2 (shingle = −30): -

-                     target_start
-                           ↓        1         2         3
-                           123456789012345678901234567890
-    target:                TCCCTAATAAATCTTAAGTGCGATCCGCAGCGAGGTGTAC ...
-                              ||||X|||||||||X||||||||X||
-     query:  ... TGGCGCCTGTAGTCTAAGAAATCTTAATTGCGATCCACAC
-                           098765432109876543210987654321
-                           3         2         1        ↑
-                                                    query_end
-
- -

-Note that the value reported has no relation to the number of bases that align -in that region, nor is it an indication that the alignment extends all the way -to the start or end of the sequences. The shingle value is just evidence that -the proper registration of the two reads is to overlap them by the given value -— information that an assembler might use in assembling those reads into -a contig. - - - -

-

Using Target Capsule Files

- -

-Target capsule files are provided to improve run-time memory utilization when -multiple CPU cores on the same computer are running LASTZ with the same target -sequence. They permit the lion’s share of the large internal data structures -to be shared between the processes. This allows more copies of LASTZ to be run -simultaneously with less physical memory, which can improve the throughput, for -example, when mapping a large set of reads to a single (large) reference -sequence. - -

-To create a capsule file, use a command like this: -

-    lastz <target> --writecapsule=<capsule_file> [<seeding_options>]
-
-Applicable seeding options are -‑‑seed, -‑‑step, -‑‑maxwordcount, -and ‑‑word. - -

-To use the capsule file, run LASTZ like this: -

-    lastz --targetcapsule=<capsule_file> <query> [<other_options>]
-
-No additional effort on the part of the user is required to handle sharing of -the capsule data between separate runs. Nearly all options are allowed; -however the seeding options -‑‑seed, -‑‑step, -‑‑maxwordcount, -and ‑‑word -are not allowed, since these (or their byproducts) are already stored in the -capsule file. Further, ‑‑masking -is not allowed, because it would require modifying both the target sequence and -the target seed word position table, which are contained in the capsule. - -

-Internally LASTZ asks the operating system to directly map the capsule file -into the running program’s memory space -in a read-only fashion. Multiple running instances can map -the same file; each instance will have its own virtual addresses for the -capsule data, but the physical memory is shared. There is no requirement for -more than one instance to actually use the capsule simultaneously. Running -a single copy of lastz with ‑‑targetcapsule will work -fine, and in fact there may be a small speed improvement compared to running -the same alignment without a capsule. - -

-The downside of this technique is that the capsule files are very large and are -also machine-dependent. For example, the file for human chromosome 1 is about -1.4 Gb. Note that attempts to run a capsule built on a mismatched computer are -detected and rejected. - - - -

-

Inferring Score Sets

- -

-Scoring inference is an automated method for determining appropriate -substitution scores and/or gap penalties directly from the sequences being -aligned. The resulting scoring parameters can be saved to a file and/or used -immediately to align the sequences. Generally these depend mostly on the -species rather than particular regions, so once a suitable scoring set has been -obtained for a pair of species, the inference does not need to be -performed for each alignment run. In this section we give a brief overview -of the inference process; see [Harris 2007] for a -more detailed description. - -

-Inference is achieved by computing the probability of each of the 18 different -alignment events (gap open, gap extend, and 16 substitutions). -These probabilities are estimated from alignments of the sequences. Of course, -at first we don't have alignments, so we start by using a generic scoring set -to create alignments, infer scores from those, then realign, and so on, until -the scores stabilize or "converge". Ungapped alignments are performed until -the substitution scores converge, then gapped alignments are performed (holding -the substitution scores constant) until the gap penalties converge. - -

-To have LASTZ infer scoring parameters, use -a suitably enabled build of LASTZ (see below), and specify -the ‑‑infer or -‑‑inferonly options. (The latter -will stop after inferring the parameters, without performing the final -alignment.) Settings for the inference process can be specified in a -control file included with these options. - -

-The ‑‑infscores option causes the -inferred scoring parameters to be written out to a separate file. If no -<output_file> is specified, it is written to the header -of the alignment output file, as a comment. As a last resort, if no alignment -is performed the scoring set is written to stdout. The parameters -are written in the same format used to input scoring -sets. - -

-Usually it is undesirable to use all alignment blocks for inference. Blocks -with a high substitution rate (low identity) are likely to be false positives. -On the other hand, blocks with few substitutions (high identity) will be found -regardless of what scoring parameters are used. Thus it is desirable to base -the inference only on statistics from a mid-range of identity. By default the -middle 50% is used (that is, the 25th through 75th percentile from the identity -distribution), but this can be changed in the control file. - -

-

Special Builds Required:

-Since the inference is an iterated process, greater accuracy can be achieved -by using the floating-point version of LASTZ (lastz_D). Moreover, -the technique used to infer gap penalties has not yet been shown to select good -values, so the author recommends that users only employ inference for -substitution scores. To encourage these recommendations, the scoring inference -code is blocked from operation in the integer scoring version of LASTZ -(lastz), and gap penalty inference is blocked in both versions. -Special build options are available to defeat the blocks; contact the author -if you are interested. - - - -
-

Dynamic Programming Matrix

- -

- -

    -
  • Dynamic programming in general is a time-saving algorithm for computing - values that can be expressed via a recurrence relation - [Bellman 1957]. -
  • It has long been used for affine gap alignments of DNA and protein - sequences; see e.g. [Gusfield 1997]. -
  • It uses a matrix of sequence positions to store partial results; early - cells are used to compute later ones to avoid redundant work. -
  • Even for stages that do not involve gaps and do not actually use a DP - algorithm, the matrix is helpful as a conceptual tool because of its - strong correspondence with the dot-plot paradigm for visualizing - sequence alignments. -
  • Here we use the convention of representing the target sequence from - left to right along the horizontal axis (columns of the matrix), and - the query sequence from bottom to top along the vertical axis (rows - of the matrix); see e.g. Figure 5(a) for an - example. -
  • Gap-free alignment segments lie along diagonals of the - matrix/dotplot, in the forward (slope=+1) or reverse (slope=−1) - orientation; the latter typically indicates an alignment on the reverse - strand. -
  • Gaps bump the alignment to a different diagonal, since there is a - progression in one sequence but not in the other; this has the effect - of making gapped alignments look like diagonal-trending squiggly lines - when drawn at low resolution. -
  • Diagonals are characterized by a constant difference between the target - and query positions of their cells (or for a reverse diagonal, a constant - sum). -
  • This matrix is not to be confused with the substitution scoring matrix, - whose rows and columns correspond to characters rather than to sequence - positions. -
- - - -
-

Filtering With Shell Commands

- -

-Though LASTZ provides several filtering options (e.g. -‑‑filter=identity, -‑‑filter=continuity, -‑‑filter=coverage, -‑‑filter=nmatch, -‑‑filter=nmismatch, -‑‑filter=ngap and -‑‑filter=cgap), - sometimes these -are not sufficient for the task at hand. But in many cases it is still possible -to perform the desired filtering by using the -‑‑format=general option in conjunction -with a simple -awk, -perl, or -python script. Here we show one such -example, using awk. -

-Suppose we want to filter alignments by length, discarding anything shorter -than 500 bp, and that we need AXT output for downstream processing. We can -have LASTZ output whatever columns are necessary to reproduce AXT and use awk -to perform the filtering and reconstruct an AXT file. -

-Looking at the - -UCSC AXT specification, the corresponding --format=general -fields are shown in the table below. Note that when determining which fields -are needed for a given format, care has to be taken to make sure to get the -correct start and end fields. Different formats count from zero instead of -one, and some count reverse-strand positions along the plus strand. The -interval coordinates section provides more detail -about possible numbering schemes. -

- - - - - - - - - - - - - - -
AXT field field for ‑‑format=general
Alignment number (none)
Chromosome (primary organism) name1
Alignment start (primary organism) start1
Alignment end (primary organism) end1
Chromosome (aligning organism) name2
Alignment start (aligning organism) start2
Alignment end (aligning organism) end2
Strand (aligning organism) strand2
Blastz score score
Sequence line (primary assembly) text1
Sequence line (aligning assembly) text2
-

- -Then we can perform our filtered alignment with a series of commands like this: - -

-  lastz target.fa query.fa \
-     --format=general:name1,start1,end1,name2,start2,end2,strand2,score,text1,text2 \
-   | grep -v "^#" \
-   | awk '{ if ($3-$2+1 >= 500) print $0 }' \
-   | awk 'BEGIN { n=-1;} { print ++n,$1,$2,$3,$4,$5,$6,$7,$8; print $9; print $10; print ""; }' \
-   > filtered.axt
-
- -The grep command discards the line containing column headers. -

-The first awk command computes the alignment length in the target, and if it is -at least 500, copies the line to the output. $3 is -end1 and $2 is start1. Since these -represent a closed interval, we have to add 1 to get the length. $0 -represents the entire input line. -

-The second awk command converts the alignment from a single line into four lines -required for AXT. We use an awk counter, n, to create the -alignment number field. The other fields are copied from the fields output by -LASTZ. - - - -

-

Self-Masking a Sequence

- -

-For many alignment problems it is desirable to ignore alignments that consist -solely of genomic repeats. For this reason, most finished genomic assemblies -are soft-masked — bases that are part of identified repeats are in -lowercase. LASTZ’s seeding stage avoids seed hits in lowercase, and -thereby avoids finding alignments that are solely repeats. -

-With the wider availability of less-expensive DNA sequencing and custom -assemblies, it is more common for users to have unannotated sequences. The -following describes how LASTZ can be used to crudely identify duplications -and soft-mask the original sequence. This process is called -self-masking. -

-The self-masking process works by looking for alignments of the sequence with -itself, and makes use of the dynamic masking feature to reduce computational -time. The sequence is split into overlapping fragments, on the fly, which are -then aligned against the entire sequence. As duplications are discovered they -are marked as such and removed from the seeding process. -

-A command to perform self-masking would look like this, where critter.fa -contains the sequence to be masked. - -

-  cat critter.fa \
-    | ../tools/fasta_fragments.py --fragment=200 --step=100 \
-    | lastz critter.fa[multiple,unmask,nameparse=darkspace] /dev/stdin --masking=3 \
-        --progress+masking=10K \
-        --format=none --outputmasking+:soft=critter.masking.dat \
-        --notransition
-
-

-In that command, lastz is given the whole “critter” as its -target sequence, and overlapping 200bp fragments of the critter as the -queries. -

-The multiple action tells lastz to -allow more than one sequence in the file, and the -unmask action tells lastz to -ignore any softmasking that may be present in the file. (If your sequence -already has had some masking performed, and you want to keep that, omit -unmask.) The -nameparse=darkspace action tells -lastz to extract the first non-whitespace string from the sequence header line. -This is necessary to ensure that the final step -(fasta_softmask_intervals) will see the same sequence names in the -masked intervals file as those in the sequence file. -

-The ‑‑masking=3 option enables -dynamic masking, which will mark any reference base appearing in 3 or more -alignments. Since the fragments overlap by a factor of two, we expect every -base will appear in two trivial alignments. Any more than that would be caused -by a duplication elsewhere. -

-The ‑‑progress+masking -option causes lastz to give you a progress report after every 10 thousand -fragments. These reports come to the console (stderr) and look like this: -

-    (16.933s) processing query 50,001: critter_21299501, masked 8,920,893/51,304,566 (17.4%)
-
-

-The ‑‑format=none option inhibits the -normal alignment output and -‑‑outputmasking+:soft -tells lastz to write the final masked intervals to a file. -

-The final line (‑‑notransition -in this example) is whatever alignment scoring parameters you want to use. -What is appropriate will depend on the level of divergence you want to allow in -the masked duplications. -

-The command to apply the masking intervals to the fasta file will look like -this: -

-  cat critter.fa \
-    | ../tools/fasta_softmask_intervals.py --origin=1 critter.masking.dat \
-    > critter.softmasked.fa
-
- - - -
-

Aligning Many Subintervals

- -

-There are many occasions when you have a general idea of where the alignments -you are interested in are, and it seems computationally wasteful to align -entire sequences just to find a relatively few alignments. For instance, you -may have identified some alignments using fast, high sensitivity settings and -now want to look for alignments with higher divergence in the remaining -regions. Or you may have previously found alignments but did not collect all -the fields you needed. Or perhaps you used some tool other than LASTZ to -identify regions where you want to focus your search. Or you may have gapped -alignments from some other tool, and want to compare them to LASTZ's alignments -in the same subsequences. - -

-There are many ways to solve such a problem, and LASTZ provides several options -to support these needs. Here we describe and critique several different -approaches. - -

-Sequence masking. -LASTZ can use masking to eliminate the possibility -of alignments in (or not in) a given list of intervals. So we can create two -files, containing the desired intervals in the target and query, then run one -LASTZ job. -

-The disadvantage of this solution is that you may get alignments between -unintended interval pairs (for example, an alignment between the first target -interval and the fifth query interval). Some post-processing would be -necessary to eliminate these. Moreover, LASTZ will be spending a lot of its -time looking for alignments in those unintended interval pairs. - -

-Separate files. -The simplest solution, conceptually, is to preprocess the sequences to extract -the intervals of interest into separate files, then run each pair of files as -a separate LASTZ job. -

-The disadvantages of this solution are numerous. There is extra I/O involved -in splitting the files, and extra overhead in repeatedly launching LASTZ. -Further, depending on your needs, post-processing may be necessary to map -alignment positions back to the original sequences. - -

-Subranges and subsets. -The separate files solution can be improved upon by letting LASTZ mimic the -file splitting, internally. This can be accomplished with the -subrange and subset -actions, while still running each as a separate lastz job. -

-This improves upon external file separation by eliminating some I/O, and -eliminating the need to post-process to map positions. However, it still -suffers the extra overhead of repeatedly launching LASTZ. - -

-Alignment chores. -Another solution is to use an alignment chores file -(specified with the -‑‑chores=<file> option). The -chores file corresponds directly to the interval pairs of interest. Complete -alignment is performed over the regions defined by each pair (subject to -whatever other options have been set), and is blocked from extending beyond the -region. -

-There is little downside to this solution. The reported results will be -the same as for the post-processed separate files solution, or subrange/subset -solution (but with minor variations such as shifting of equally-scoring gap -placements). There is some computational waste in the chores -solution, but this is much less costly than the repeated launching. -

-In most cases, a chores file will be preferred over an anchors file. - -

-Anchor segments. -Another solution is to use an anchor segments file -(specified with the -‑‑segments=<file> option). The -anchors file does not correspond directly to the interval pairs of -interest. Instead, you will need to have same-length intervals in target and -query. Typically this will be a single point somewhere in the region. -Alignment is anchored at this point. Gapped alignment is performed in both -directions from that point, and is not restricted to the region. -

-The main disadvantage of this solution is lack of versatility. It is most -appropriate when used in conjunction with an external anchor-identifying method. -

-It can also be useful in cases where you want to run ungapped alignment with a -different scoring scheme than for gapped alignment. An ungapped run of LASTZ -creates the anchors (segments) file, and a second run uses those as anchors for -gapped alignment with a different scoring scheme. - - - - - - -


-
-

Differences from BLASTZ

- -
    -
  • BLASTZ had a "short-by-two" error, which has been corrected in LASTZ. In -many cases, BLASTZ shortened alignments by two bases on either or both ends. -

    -

  • -BLASTZ had a problem with -premature alignment termination; -this has been corrected in LASTZ. -

    -

  • BLASTZ used the ydrop value from the main alignment as the xdrop value for -interpolation; this has been corrected in LASTZ. -

    -

  • -BLASTZ had a problem when -ydrop is less than the penalty for a one-base gap; -this has been corrected in LASTZ. -

    -

  • -BLASTZ chaining had a problem that caused it to -discard very high-scoring HSPs; -this has been corrected in LASTZ. -

    -

  • -The handling of ties in the DP matrix was unspecified in BLASTZ. This has -changed in LASTZ, which specifically prefers a longer alignment to a shorter -alignment with the same score. This change reflects the use of LASTZ to align -short reads, and desire to align as much of the read as possible. -

    -

  • -The handling of bounding alignments in the DP matrix is different in LASTZ than -in BLASTZ. This is discussed in -Bounding Alignments in the DP Matrix. The -‑‑allgappedbounds option can be -used to revert to the bounding criteria used in BLASTZ. - -

    -

  • -The handling of ambiguous nucleotides has been clarified in LASTZ, and in some -cases the default behavior is different than in BLASTZ. By default, BLASTZ -allowed IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, W, and -Y) in fasta sequences but was unclear about how these were scored. -Since we feel the user should be aware of how these bases are treated, LASTZ -rejects them by default. The -‑‑ambiguous=iupac option permits them -but treats them the same as an ambiguous N. This is discussed in -Non-ACGT Characters. - -

    -

  • LASTZ can produce a variety of alignment output formats such as AXT, MAF, -and human-readable text, as well as BLASTZ’s LAV format. -

    -

  • LASTZ can take the guesswork out of selecting alignment scoring parameters -by inferring them for you, based on its analysis of the input sequences. -

    -

  • LASTZ provides a large variety of seeding options. -
- - - -
-

Bounding Alignments in the DP Matrix

- -

-During the gapped extension stage, LASTZ processes the anchors in order of -score (highest scoring anchor is extended first, and so on). As anchors are -extended, a list of bounding alignments is constructed. These correspond to -paths in the DP matrix. Bounding alignments created for higher-scoring anchors -are used to bound the possible DP paths that lower-scoring anchors can take. -This prevents alignments from crossing each other. - -

-In BLASTZ, every gapped extension became a bound, and this was originally the -default behavior in LASTZ, through release 1.1.52. However, this caused LASTZ -to miss some alignments which it should have found. The failure case occurred -as follows. A high-scoring anchor is extended but fails to meet the score -threshold. But it gets added as a bound. Then the extension of a lower-scoring -anchor is prevented from crossing or intersecting with that path, and it too -gets discarded even though it might score highly enough. This could occur, -for example, when two extensions would (in the absence of each other) share the -same tail, and the higher-scoring of the two has a lower-scoring anchor. - -

-The correction for this is to only use alignments as bounds if they satisfy the -score threshold. This corrected behavior is now the default in LASTZ (as of -release 1.02.00). The -‑‑allgappedbounds option can be -used to revert to the bounding criteria used in BLASTZ. - - - - - - -


-
-

Change History

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ReleaseDateChanges
1.0.1Jul/28/2008 -Initial release. -
1.0.5Aug/2/2008 -Fixed a bug that in some cases caused a bus error when interpolated -alignments (e.g. ‑‑inner=…) were used with multiple -queries. -
-Added xmask=<file> and nmask=<file> -file masking actions. -
1.0.21Sep/9/2008 -Fixed a bug involving the default value for ‑‑gappedthresh -(a.k.a. L) when ‑‑exact is used. The bug caused the -gapped threshold to be inordinately low, allowing undesirable alignment blocks -to make it to the output file. -
-Fixed a bug whereby Xs and Ns were treated as desirable substitutions when -unit scores (e.g. ‑‑match=…) were used. -
-Re-implemented ‑‑twins=…. The previous implementation -improperly truncated the left-extension of HSPs. The new implementation is -slower and uses more memory. -
-Added ‑‑census=<file>. The census counts the number of -times each base in the target sequence is part of an alignment block. -Previously, ‑‑census produced a census only if the output format -was LAV (the census is a special stanza in a LAV file). Otherwise the option -was ignored. Now, if a file is specified a census is written to that file. -The format of lines in the census is -<name> <position> <count>. -The position is one-based, and the count is limited to 255. -

-In situations where 255 is too limiting, ‑‑census16=<file> -or ‑‑census32=<file> can be used, with limits of about -65 thousand and 4 billion, respectively. Note that these will respectively -double and quadruple the amount of memory used for the census. The default -census uses one byte per target sequence location. -

-Added ‑‑format=<differences>, to support Galaxy. All -differences (gaps and runs of mismatches) are reported, one per line. -
-Added ‑‑anchors=<file> (eventually this was renamed to -‑‑segments=<file>), giving the user the ability to bypass -the seeding and gap-free extension stages. -
-Changed default gap penalties for unit scores (e.g. -‑‑match=…) to be relative to mismatch score (instead of -match score). -
-Made the <start>#<length> file subrange action -better at checking errors, and also allowed <length> to use -units such as M and K. -
-Sped up program exit by no longer freeing dynamically allocated memory. -
1.1.0Dec/5/2008 -Improved x-drop extension to better handle suboptimal HSPs. Left-extension -now starts at the right end of the seed (rather than the left end). This -reduces the chance that the extended region (the combination of left and right -extensions) will score less than some subinterval. -
-Changed coverage filtering so that it is relative to whichever sequence is -shortest. Previously it was always relative to the query. -
-Changed defaults for xdrop and ydrop when ‑‑match scoring is -used. -
-Interpolation now uses the xdrop value from the main alignment. -Previously it used the ydrop value to match BLASTZ, but we have decided that -was a bug in BLASTZ. -
-Added general output format. -
-Added ‑‑maxwordcount. -
-Added ‑‑notrivial. -
-Corrected problem with ‑‑subset action, which wasn't using -mangled sequence names. -
-Fixed problem in writing LAV m- and x-stanzas. -
-Blocked the use of scoring inference in the integer build, and blocked gap -scoring inference in all builds. -
-Changed much of the syntax for options and actions. The newer syntax is -clearer and more consistent than the older. The older is still supported by -the program so that existing scripts will still work, but it is not -documented. -
-Changed reporting of duplicated options from -can't understand "<option>" to -duplicated or conflicting option "<option>". -
-Added ‑‑format=rdotplot option. -
1.1.25Feb/5/2009 - Fixed a bug that caused some gapped -extensions to be terminated prematurely. In some cases this also allowed a -nearby low-scoring alignment to "piggyback" onto the remainder of a terminated -alignment, gaining enough in score to pass the score threshold. -
-Added support for target capsule files. -
-Added support for ‑‑format=cigar. -
-Added the <center>^<length> sequence interval -specifier. -
-Corrected the behavior of ‑‑exact regarding lowercase and -non-ACGT characters. ‑‑exact now considers, e.g., a lowercase A -to be a match for an uppercase A. Further, any non-ACGT characters now stop -the match. -
-Improved detection and reporting of memory allocation overflow. Two -problems were fixed as part of this: (1) allocation of single blocks larger -than 2 Gb was being rejected even on platforms that could support larger -blocks, and (2) an allocation overflow problem which could cause a segfault for -target sequences longer than about 1 Gb (these require allocation of a block -larger than 4 Gb). -
-Changed the behavior when encountering an empty sequence in a file with -many sequences. Previously this was reported as an error, and the program -halted. Now it is reported as a warning (to stderr), and the -program continues. -
-Added the ‑‑output option. In some batch systems, it is -difficult to redirect stdout into a file, so this option allows -the user to do it directly. -
-Removed ‑‑quantum and ‑‑code options, replacing -them with the quantum and quantum=<code_file> -sequence specifier actions. This is in preparation for allowing a quantum -target sequence. -
1.1.50Mar/16/2009 -Fixed two problems with exact-match extension. First, when both target and -query used the multiple sequence specifier action, exact match -extension was able to skip the boundary between sequences (this problem was -introduced in 1.1.25). Second, when the exact match should have extended to -the end of the sequence, it was being cut short by 1 bp (on either end). The -latter problem was only evident for ‑‑nogapped; a gapped extension -recovered the additional bases. -
-Fixed several problems with ‑‑segment=<file>. First, if -the file contained more than 4,000 segments, on some platforms the program would -segfault. Second, if a sequence subrange was being used, the limit test -comparing the segment interval to the subrange was incorrect. Third (if the -user was lucky enough to avoid the first two problems), if a segment was on the -negative strand it was improperly mapped to the subrange. -
-Added ‑‑noytrim to prevent y-drop mismatch shadow, improving -LASTZ’s ability to align short reads. -
-Set the default gapped extension score threshold to inherit the lowest HSP score in the -case where ‑‑hspthresh=top<basecount> or -‑‑hspthresh=top<percentage>% is used but -‑‑gappedthresh=<score> is not (and gapped extension is -performed). Previously this case was trapped by a low level routine and the -alignment was halted. -
-Fixed a problem with the start2+ field of -‑‑format=general. The position was left blank for alignments on -the + strand. -
-Fixed a problem in which ‑‑writecapsule was rejected if -‑‑seed=match<length> was used. -
-Fixed a problem related to name mangling which caused an "internal error" to -be reported. -
-Fixed a problem whereby single-symbol identifiers were not recognized in -quantum code files. -
-End of sequence limit checking for <start>#<length> -and <center>^<length> sequence specifier actions is -now "soft". If the resulting interval is beyond the end of the sequence it is -truncated. -
-Changed how ‑‑format=cigar reports alignments on the negative -strand. Apparently there is no complete spec for CIGAR format. Matching what -I see output by exonerate for certain cases is the best I can do. -
-Quantum code files can now specify probabilities as fractions. This gives a -clearer representation for motif-like sequences derived from a multiple -alignment. -
-Added cigar field for ‑‑format=general. -
-Added shingle field for ‑‑format=general. -
-Added the ‑‑rdotplot=<file> option. -
-The ‑‑notrivial option now works with the multiple -sequence specifier action. -
-Added ‑‑markend. -
-Added nameparse=darkspace. -
-Modified the build process to accommodate the Solaris platform. -
1.1.52Mar/24/2009 - Fixed a bug that occurred when ydrop was less -than the penalty for a one-base gap (the sum of open and extend penalties). In -this case, a bug in the initialization of the DP matrix resulted in no -gapped alignments ever being found. -
-Fixed a problem with the combination of ‑‑recoverseeds and ‑‑exact. -Recovered seeds were cut short by one base on the left end. -
-Added ‑‑format=segments option. This was later replaced by -‑‑writesegments. -
-Added a workaround in the source code for what appears to be a bug in gcc -4.3.2 (see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37861). Without the -workaround, the build fails with this message: -
-    quantum.c: In function 'generate_dna_ball':
-    quantum.c:347: error: array subscript is above array bounds
-
-The workaround uses an ifdef that specifically targets gcc 4.3.2. -
-1.02.00Jan/12/2010 -Relaxed the rejection of some output formats, which was too aggressive. -Specifically, runs with ‑‑tableonly were rejected because of -output format, even though no output would be generated in that format. -
-Added the ability to set the ‑‑maxwordcount option as a -percentage. Also, ‑‑maxwordcount=<limit> now allows -<limit> to be 1. Previously it was not allowed to be less -than 2. -
-The scoring matrix used during x-drop extension now reflects the use -of ‑‑ambiguous=n. Previously, this matrix was not affected by -‑‑ambiguous=n, -and N-vs-N matches and N-vs-other matches were scored as -100 (more -specifically, as fill_score) during gap-free extension. This -caused LASTZ to miss some HSPs, usually those containing an N-vs-N match, since -the HSP was terminated at that match and didn't meet the score threshold. This -has been corrected. -
-Added support for HSX indexes, to support random access into FASTA files. This -improves the speed of aligning a single read (from a file of half a million) by -a factor of about 12. -
-Added ‑‑softmask=<mask_file> file action to permit -soft masking of specified intervals. Also added masking of the -interval complements — -‑‑xmask=keep:<mask_file>, -‑‑nmask=keep:<mask_file>, and -‑‑softmask=keep:<mask_file>. These make it easier to -restrict alignment to several specified intervals of a sequence. -
-Enabled the use of ‑‑filter=[<transv>,]<matches> -for non-halfweight seeds. Previously, ‑‑filter had only been -tested for half-weight seeds, but was erroneously prohibited for -all seeds (instead of just prohibiting non-halfweight seeds). Further, it -was not properly implemented for seed-only output (‑‑nogfextend -‑‑nogapped). These have all been corrected, and ‑‑filter -is now available for all seed types. -

-Also corrected the behavior of ‑‑filter regarding lowercase and -non-ACGT characters. ‑‑filter now considers, e.g., a lowercase -a to be a match for an uppercase A. Further, for the -purposes of ‑‑filter, any non-ACGT characters are considered to be -transversions. -

-Also changed the behavior when the <transv> field is absent. -This is now interpreted as unlimited transversions. Previously it meant that -no transversions were allowed. This should be a safe change in behavior since -it was (unintentionally) not possible for users to access this feature -previously. -

-Added a compile-time directive compileForWindows to make -appropriate behavioral adjustments for running on a Windows machine. -Currently this only affects the handling of file paths. To activate it, -the user must add -DcompileForWindows to the definition of -definedForAll in -.../lastz‑distrib‑X.XX.XX/src/Makefile. -
-Fixed chaining of seed hits. Previously, if ‑‑nogfextend and -‑‑chain were used together, nothing was output. This was due to -the fact that unextended seeds had no scores, and the chaining algorithm only -reports chains with positive score. This has been corrected by calculating -scores (as the sum of substitution scores) over anchor segments whenever (a) -the segments have not had scores computed for them, and (b) scores are required -for later processing. -

-This change may also affect (for the better) the results of gapped extension -when either ‑‑nogfextend or ‑‑exact is used. Gapped -extension processes the anchors highest score first. Since -‑‑nogfextend left all scores zero, the actual order in which gapped -extension was performed in that case was dependent on how the sort routine (the -C runtime routine qsort) deals with ties. For ‑‑exact, the score -was the length of the match. This has been changed to the segment’s -substitution score. -

-Changed ‑‑format=segments to -‑‑writesegments=<file>. -
-Added M-mismatch extension. -
-Added the replacement of {number} in sequence nicknames. -
-Added support for continuity reporting and -filtering. -
-Added support for match count -filtering. -
-Fixed a bug in handling subrange actions for nib files. The problem occurred -when the subrange action was of the form <start>.. and -<start> was even. That is, no <end> was -specified, and LASTZ is supposed to use the remainder of the sequence. LASTZ -miscalculated the length of the interval, making it one base longer. If the -actual full sequence length was odd, this resulted in an extra T -being appended to the sequence data. If the full sequence length was even, -LASTZ quit, reporting that it was unable to read the sequence. Note that this -only happened for .nib files, only when <start> was even, -and only when no <end> was specified. -
-Added the subsample action. -
-Added the ‑‑anyornone option. -
-Added ‑‑allgappedbounds. -
-Fixed a bug in exact and mismatch extension and queries using the -multiple action. It was possible for an HSP to cover parts of -two different queries. -
-Fixed an overflow bug in the chaining -algorithm. Due to numerical overflow, very high scoring HSPs were treated as -negatively scoring, and thus were not included in final chains. With default -scoring values, overflow was caused by the equivalent of an exact match of -about 22Mbases. This problem also existed in BLASTZ. -
-Added support for output in SAM format. -
-Corrected dotplot output. Previously, some of the coordinate -values were inconsistent and off by one. -
-Added ‑‑progress=[<N>]. -This existed as an unadvertized option in earlier versions of the program, as -‑‑debug=queryprogress=<N>. It has now been promoted to a -first class option. -
-Added ‑‑ambiguous=iupac and changed ‑‑ambiguousn to -‑‑ambiguous=n. The older form (‑‑ambiguousn) is still supported, but not advertized. -
-Column headers for ‑‑format=general now match the command-line -keywords. Previously, all related keywords shared the same column header. -For example, keywords start2, zstart2, -start2+ and zstart2+ all produced the same column -header, start2, in the output file. -

-Also added ‑‑format=general-. -

-Now using inttypes.h macros for sized-types. This is to satisfy some -additional type-checking pickiness that appears to have been added to gcc version -4.2.1. In the unlikely event that a compiler doesn't support inttypes.h, the -compile-time definition override_inttypes can be used. -
-Added nmatch, nmismatch, ngap, -cgap and cigarx fields for -‑‑format=general. -
-Added ‑‑format=mapping, a shortcut for typical fields for -‑‑format=general for mapping reads. -
-1.02.11Aug/21/2010 -Fixed the cigarx field for ‑‑format=general, so -that a run length of 1 is omitted for indels. -
-Fixed the behavior of ‑‑recoverseeds, which was failing to -recover many HSPs when seed density was high. This was due to left extension -being blocked by other seeds on that same hash-equivalent diagonal. Left -extension is now unblocked when ‑‑recoverseeds is enabled. -
-Changed/corrected how the ‑‑segment option handles wildcard names -when the multiple action is used. To support this, the -rewind command was added to the segments file format. -
-Sequence masking actions (softmask, xmask and -nmask) are now allowed for the multiple action. -
-Command-line arguments beginning with two unicode non-breaking hyphens are now -recognized. Since these are used in some places within this README file, it is -natural for a user to copy them to the command line. Previously these were not -recognized, which led to a somewhat confusing error message. -
-Fixed detection and reporting of improper gap penalties. Because the first -base in a gap is penalized as open+extend, open can be zero or negative as long -as that sum is strictly positive. Previously, a sum of zero was permitted, and -a negative sum was misleadingly reported as a problem with the open penalty. -Now, the sum must be strictly positive, and when it isn't the message more -accurately describes the problem. -
-Fixed the implementation of ‑‑self with regard to mirror-image -pairs. Previously, alignments were internally restricted to be above the main -diagonal in the ungapped stage only. The mirrored twins were created prior to -the gapped stage, and the gapped stage operated on the full set of anchors. -This had two undesirable effects -- there was little computational savings, and -the resulting set of alignments could be asymmetrical (due to small variations -in gap positioning). This behavior has been changed so that the above-diagonal -restriction occurs throughout the alignment process and mirrored twins are -created just prior to output. -
-1.02.16Nov/2/2010 -Fixed a problem with ‑‑self, introduced in 1.02.11. The problem -manifested itself on 64-bit CPUs, with an error message indicating it was -attempting to allocate 17 billion bytes for edit_script_copy. This has been -corrected. -
-Corrected a problem in LAV output, in which the d stanza -reported an incorrect value for K or L (the ungapped and gapped scoring -thresholds) when they were not equal to each other. Which value was reported -incorrectly depends on nuances of the compiler and could differ by platform. -

-Alignments were not affected. -

-Changed the error message when a fasta file contains bad characters. The -previous message caused confusion when the bad character happened to be -punctuation. Now the error message explicitly describes the offending -character (comma, ampersand, etc.). -
-Added ‑‑format=blastn. -
-Added idfrac, id%, blastid%, -covfrac, cov%, confrac, -con%, ncolumn, and npair fields for -‑‑format=general. -
-Added start..end+zoom% subrange specifier. -
-1.02.23Jan/10/2011 -Fixed a problem that occurred if the gap extension penalty was set to zero. -This caused a divide by zero (which is reported in different ways on different -platforms) and the program crashed. This has been corrected by trapping the -offending division. However, the fix increases memory usage. Moreover, it is -highly likely to cause truncated alignments. It’s not clear that there -is any useful reason to set gap extension to zero. - -
-Added ‑‑format=rdotplot+score and -‑‑rdotplot+score=<file>. -
-Improved ‑‑masking=<count> so that it can allow a count -threshold greater than 254. -
-Fixed a problem with ‑‑scores=<scoring_file>. When the -<scoring_file> defined score values for N, -those scores were not honored during the ungapped seed extension stage. -
- -Fixed problems with ‑‑ambiguous=n and -‑‑ambiguous=iupac. These were -incorrectly penalizing substitutions between non-ambiguous nucleotides -(A, C, G, or T) and ambiguous ones (N, B, D, H, K, M, R, S, -V, W, or Y). This has been corrected to honor the original -intent, which was clearly to score these as zero. -

-However, for users who desire the previous behavior, a substitution penalty can -now be specified with each of these options. To match the previous behavior, a -penalty of twice the gap extension should be used. -

-A later change history item is also -relevant. -

-Added ‑‑queryhsplimit=<n>. -
-1.02.27Jan/31/2011 -Added ‑‑outputmasking=<file>. -
-1.02.37Mar/31/2011 -Added ‑‑outputmasking:soft=<file>. -
-Added example of filtering with shell commands. -
-Changed the interpretation of comments in -sequence name files. Previously, the first # was -considered a comment. The implementation predated the author’s -familiarity with Illumina read names (which contain a #). In order to still -allow lines that contain a read name and a comment, a # is not considered a -comment unless it is preceded by whitespace or the start of the line. -
-Changed the behavior of -‑‑queryhsplimit=<n> to -better match user expectations. Previously the limit was applied separately -for each strand of the query. Moreover, HSPs discovered before the limit was -reached were still passed downstream for further processing. -

-This has all been changed so that the limit applies to the combined total of -HSPs for query, and if the limit is reached (exceeded), all HSPs for the read -are discarded and no downstream processing is performed. -

-Fixed a bug involving the ngap and cgap fields for -‑‑format=general. These fields were only reported correctly if -the continuity or ncolumn fields were also requested. -Otherwise, the value reported represented the contents of uninitialized memory. -
-Added filtering options -‑‑filter=nmismatch:0..<max>, -‑‑filter=ngap:0..<max>, -and ‑‑filter=cgap:0..<max>. -

-Also changed the option name for match count filtering to -‑‑filter=nmatch:<min>. -The older option, ‑‑matchcount=<min> is of course still -recognized. -

-1.02.40Apr/7/2011 -Added ‑‑outputmasking+=<file> -and ‑‑outputmasking+:soft=<file>. -
-Added -‑‑progress+masking=[<N>]. -This existed as an unadvertized option in earlier versions of the program, as -‑‑debug=queryprogress+masking=<N>. It has now been promoted -to a first class option. -
-Added an example of how to create a soft-masked sequence by -self-masking. -
-1.03.00Jul/14/2011 -When a subrange was used, the wrong denominator -was used to compute coverage. The denominator -used was the length of the subrange instead of the entire sequence. This -adversely affected both the -‑‑filter=coverage filter and the -coverage output field. This has been corrected -to use the length of the entire sequence. -
-Added the -separator=<character> -action, allowing the user to specify a character which alignments will not -cross. See also -Non-ACGT Characters, Splicing, and Separation. - -
-Added support for reading FASTQ files. Quality values -do not participate in alignment, but are copied to alignment output when -appropriate. -
-Added ‑‑format=general fields -nucs1, -nucs2 -(the entire target or query nucleotides sequence), -quals1 and -quals2 -(the target or query base-call quality sequence). -
-Fixed a minor problem with the ‑‑format=general fields -cov% and con%. Those fields were being written with -an extra tab character preceding them. This had a detrimental effect on -downstream parsers that required tabs as separators (parsers that interpreted -whitespace as separators were not affected). -
-Added ‑‑readgroup=<tags>, -allowing the specification of tags for SAM's ‑RG header line. -
-Added -‑‑allocate:target=<bytes> -and -‑‑allocate:query=<bytes>. -These allow the user to predict the amount of memory needed to store target -or query sequence data, which in some instances can resolve memory overuse -(it saves LASTZ from incrementally predicting the amount of memory needed). -

-For consistency, -‑‑allocate:traceback=<bytes> -is now renamed (from ‑‑traceback=<bytes>). -

-Added ‑‑include=<file>, -allowing command-line arguments to be read from a text file. -
-Updated the Yasra shortcuts. Some options that -improved alignment read mapping had not previously been included in the -Yasra definitions, because these options did not exist when the Yasra -shortcuts were originally defined. -

-To allow backward compatibility, the shortcuts now permit specification of -a particular version of LASTZ. See the description of the shortcuts for -details. -

-1.03.02Jul/19/2011 -Fixed a bug in ‑‑format=axt and -‑‑format=axt+, which caused every -alignment to be reported twice. The bug had been introduced in version -1.02.28 (not present in 1.02.27, present in 1.02.37). -
-1.03.34Apr/12/2013 -Fixed a problem with ‑‑self and ‑‑format=lav, -introduced in 1.02.11, which caused lastz to segfault. -
-Fixed a bug in ‑‑writecapsule. When the target was larger than -≈1 billion bp, an internal sanity check triggered incorrectly, stopping -the program and reporting "internal error writing to" the capsule file. - -
-Fixed a bug related to ‑‑nogfextend. If no -‑‑gappedthresh was set, the gapped threshold incorrectly was set -to 0 instead of the correct default of 3000. This has been corrected. - -
-The match count filter now allows the count to be specified as a percentage of -the query length -(‑‑filter=nmatch:<min>%). - -
-Added ‑‑format=general fields -number1, -number2, -number and -znumber -(sequence and alignment numbers). -
-Added ‑‑format=general fields -qalign1 and -qalign2 -(quality sequences in alignment order). -
-Corrected rdotplot output when the -query files contains more than one sequence. Previously, the header line -containing sequence names was only written once, at the beginning of the file. -Now it is written once for each query sequence. -
-Added a warning when a scores file is used -(‑‑scores), with the scale of the -scoring matrix substantially different from the default scoring matrix, and the -user hasn't set the hsp threshold or gapped threshold. This is a common -mistake and often results in no alignments being found. -

-The warning looks something like this: -

-  WARNING. Scores file may warrant setting of thresholds absent from scores.txt.
-  Minimum match score is 10, for matrix entry (A,A).
-  This may not work well with default --gappedthresh=3000.
-
-
-Added ‑‑queryhspbest=<n>. -
-Added ‑‑querydepth=<n>. -
-Added alignment chores files, -‑‑chores=<file> option, -chores=<file> action, and -chore field for -‑‑format=general. -See Aligning Many Subintervals. -
-Fixed a bug related to the -nickname=<name> action. If -the corresponding sequence file was in -2Bit format, the nickname wasn't used and sequence -names were copied from the sequence file. This has been corrected. - -
-Added ‑‑help=defaults and -‑‑show=defaults. -
-Fixed a problem which caused runaway memory allocation of the traceback row -buffer. The problem was discovered when an alignment of a 72-bp read to a -reference genome needed to allocate 170 million rows (about 700M bytes). This -has been corrected. -

-It is not clear whether this had any effect on the alignments produced. In the -examples used for testing and debugging, alignments were not affected. The -negative effect was memory requirements and possibly runtime. -

-However, the cause of the problem was incorrect determination of bounding -alignments when performing gapped alignment backwards from the anchor. So it -is possible that this could have caused a desirable alignment to have been -missed, truncated, or to contain suboptimal gap placement. -

-Corrected the behavior when -‑‑anyornone was used with -‑‑nogapped. Previously this failed -with a message indicating an internal error ("gapped_extend was given a NULL -traceback pointer."). -
-Score thresholds can now make use of units (e.g. -‑‑hspthresh=5K instead of -‑‑hspthresh=5000). -
-Detection of trivial self-alignments has been improved for cases where the -multiple action is used. -
-The implementation of ‑‑self has been -reworked so that it no longer reads the input file twice. As a result, -‑‑self now supports a file piped into stdin. -
-Added an additional build (lastz_32) to address aligning to whole -genomes larger than 2 gigabases. -
-Changed the option names for identity, continuity and coverage filtering to -‑‑filter=identity:<min>[..<max>], -‑‑filter=continuity:<min>[..<max>], -and -‑‑filter=coverage:<min>[..<max>]. -This change was made to achieve consistency with the other back-end filtering -options. -

-The older options, ‑‑identity, ‑‑continuity -and ‑‑coverage are of course still recognized. -

-1.03.46Oct/2/2013 -Added the namejoin action, to allow -better handling of input files that have spaces in sequence names (e.g. Illumina -casava version 1.8 fastq files). -
-Greatly improved speed for the use case where the target contains a large number -of sequences (e.g. 100 thousand exons). This was essentially a bug which had -no effect on accuracy. A data structure was being repeatedly allocated and -erased, and was much larger than was needed (e.g. 12 Mbytes per query), and -since all writes were cache misses, this ended up being very significant. -
-Added the -allowBackToBackGaps build option. Previous -versions of LASTZ (and BLASTZ) did not consider alignments in which an insert -was immediately adjacent to a delete (or vice versa). -
-Fixed a problem in ‑‑format=differences -that was inadvertently introduced in 1.03.34. A failsafe check for an unhandled -case was added in that version, and ‑‑format=differences wasn't -being handled. Unfortunately this prevented this format from being usable. -This has been corrected. -
-1.03.52Jan/14/2014 -Corrected a bug that occurred when the -‑‑inner option is used with the -multiple action. With this -combination lastz could report alignments straddling two sequences. This is -now prevented. -

-This bug has existed in all previous versions of lastz. It was not in blastz -since blastz did not provide the ability to have multiple sequences in memory. -

-1.03.54Jan/28/2014 - -Modified ‑‑ambiguous=n and -‑‑ambiguous=iupac to allow a reward -for matches to be specified in addition to the penalty for mismatches. -

-An earlier change history item is also -relevant. -

-1.03.66Jan/19/2015 -Fixed an error that would cause a segfault. The causative conditions at the -user level are not well characterized. In the discovered example both target -and query sequences contained 1K bp bursts of similar but highly diverged low -complexity sequence, but the specific relationship (if there is one) between -this feature and the failure is not known. Internally the problem resulted -from an unsigned index variable attempting to become negative. -
-Fixed an error that would cause a segfault if fasta queries were piped from -stdin and the first query was empty. -
-Added a sanity check to scoring inference. It is possible that, for a -candidate scoring set, the enforcement of identity filter settings -(min_identity and max_identity) leaves the inference with no alignments from -which to infer scores. This condition should now result in a failure message. -
-Changed how back-to-back gaps are represented in -lav format, to match the way -BLASTZ represented them. A zero-length segment is now written -to the file, separating the two gaps. It has been discovered that at least one -of the lav-processing tools in the Miller Lab suite expects to have such a -segment. -
-Fixed an error which prevented -‑‑progress[=<N>] -being reported for empty sequences. -
-1.03.73Jul/8/2015 -Eliminated alignments that begin or end with gaps. Such alignments do not make -sense biologically. -

-Earlier versions could report a gap at either end of an alignment if the -alignment was very close to an alignment found earlier in the process. This is -a failure in the logic that prevents alignments from crossing (or overlapping) -and/or assumptions made in extending HSPs when the anchor point is very close -to a previously-found alignment. The current solution truncates the alignment -by trimming away any end gaps and rescoring. -

-1.04.00Mar/12/2017 -Corrected a bug involving chaining. Previously, if -‑‑chain -and -[multiple] -were used together, the chaining algorithm incorrectly considered all sequences -together as a single entity, and found a single chain across all sequences. -

-This has been corrected, and the chaining algorithm is now performed -independently for each sequence. -

-Implemented a workaround for parsing conflicts between -sequence specifier actions and shells that use square -brackets for filename expansion. -

-In such shells appending any action to a filename, such as -[multiple], caused the shell to -report "lastz: no match". To provide a means for specifying actions without -having to surround them in square brackets, the commands -‑‑action:target=<action> -and -‑‑action:query=<action> -have been added. -

-Fixed an error that would cause a segfault when there are more than about 32 -million HSPs. The program now detects this error case and suggests ways to -avoid the situation. -
- - - - - -


-
-

References

- - -

-

-

-Bellman R (1957). -Dynamic Programming. -Princeton University Press, Princeton, NJ. - -

-

-Buhler J, Keich U, Sun Y (2003). -Designing seeds for similarity search in genomic DNA. -Proc. 7th Annual International Conference on Research in Computational -Molecular Biology (RECOMB '03), pp. 67-75. - -

-

-Chiaromonte F, Yap VB, Miller W (2002). -Scoring pairwise genomic sequence alignments. -Pacific Symposium on Biocomputing 7:115-126. - -

-

-Cock PJA, Fields CJ, Goto N, Heuer ML, Rice PM (2009). -The Sanger FASTQ file format for sequences with quality scores, and the -Solexa/Illumina FASTQ variants. -Nucleic Acids Research 38:1767-1771. - -

-

-Gusfield D (1997). -Algorithms on strings, trees and sequences. -Cambridge University Press, Cambridge, pp. 244. - -

-

-Harris RS (2007). -Improved pairwise alignment of genomic DNA. -Ph.D. thesis, Pennsylvania State University. - -

-

-Li H et al. (2009). -The Sequence Alignment/Map (SAM) format and SAMtools. -Bioinformatics 25:2078-2079. - -

-

-Myers EW, Miller W (1989). -Approximate matching of regular expressions. -Bull. Math. Biol. 51:5-37. - -

-

-Zhang Z, Berman P, Miller W (1998). -Alignments without low-scoring regions. -J. Comput. Biol. 5:197-210. - - - - - - -


-
-

Acknowledgments

- - -

-Thanks to Haibao Tang for contributing an example implementation for BLASTN -output. - -

-


-

-

Bob Harris and Cathy Riemer
- -

- - diff --git a/programs/lastz/README.md b/programs/lastz/README.md deleted file mode 100644 index d2bd917..0000000 --- a/programs/lastz/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# LASTZ - pairwise DNA sequence aligner - -This repository contains the latest official release of LASTZ, version 1.04.00. -Additional LASTZ releases, including all releases prior to March 2017, can be -found at http://www.bx.psu.edu/~rsharris/lastz (in the form of tarballs). - -For information about installation and use, see https://lastz.github.io/lastz -(equivalent to README.lastz.html in this repository). - -The UCSC genome browser group has included lastz and lastz_D in their prebuilt -binaries: - - - - -Updated Apr/6/2018 by Bob Harris (the LASTZ guy) diff --git a/programs/lastz/docs/_config.yml b/programs/lastz/docs/_config.yml deleted file mode 100644 index 277f1f2..0000000 --- a/programs/lastz/docs/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-cayman diff --git a/programs/lastz/docs/hsx_format.html b/programs/lastz/docs/hsx_format.html deleted file mode 100644 index 7d2a7a4..0000000 --- a/programs/lastz/docs/hsx_format.html +++ /dev/null @@ -1,1184 +0,0 @@ - - - -HSX Format - - - - - - -

-

HSX Format

-Format Specification version 1.0.0, -January 12, 2010 - -

-TABLE OF CONTENTS - -

-

- -

-

Introduction

-

- -HSX is a binary file format for indexing (or listing) DNA sequences in other -files, allowing fast random access to those sequences. The format was created -as part of the LASTZ project, -providing a means to input selected sequences from several short read files -into a single run of LASTZ. - -

-This document is provided for users interested in creating HSX files with -programs of their own design. - -

-The HSX file contains a sequence index array and an associated hash table. -Each sequence index entry includes the sequence's name, length, and a reference -to the location of the sequence's data in some other file. This array can be -accessed either sequentially or via the hash table. Note that the names in -the index file do not have to match the original names or headers in the -sequence files. - -

-Sequence entries are ordered by the hashes of their names. Conceptually, the -hash table groups the sequences into buckets of sequences with the -same hash. For each bucket, the table gives the location in the sequence index -of the first sequence in that bucket. Hash collisions are resolved by scanning -subsequent index entries for the remaining sequences in the bucket. - -

-There is also a file table, which allows a single index file to cover sequences -from multiple sequence files of varying formats. However, currently LASTZ only -supports indexing of files in FASTA format. - -

-

File Specification

- -

-The file is stored in a binary format described by the table below. It can be -written on either a big-endian or little-endian machine; programs reading the -file determine the byte order of multi-byte fields by examining the magic -number at the start of the file. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataMeaning
0x00D2 52 70 95 -
—or— -
95 70 52 D2
Magic number indicating big-endian byte order. -
-
Magic number indicating little-endian byte order.
0x0400 00 01 00File conforms to version 1.0 of the HSX file format.
0x0800 00 00 1CHeader length in bytes, including this field through the SOFF field.
0x0C00 00 00 xxFLEN: -number of entries in the file table (limited to 255).
0x10xx xx xx xxFOFF: -offset (from file start) to the file table.
0x14xx xx xx xxHLEN: -number of buckets in the hash table. This also serves as the modulus for -the hash function. -

-Typically the number of buckets is set so that the average number of -sequences per bucket (SLEN/HLEN) is reasonably small (e.g. 10). -

-The hash table actually includes HLEN+1 buckets. An extra -sentinel bucket is appended at the end of the table, containing -the offset to just past the end of the sequence index table.

0x18xx xx xx xxHOFF: -offset (from file start) to the hash table.
0x1Cxx xx xx xxSLEN: -number of entries in the sequence index table.
0x20xx xx xx xxSOFF: -offset (from file start) to the sequence index table. -

-Entries in the sequence index table are necessarily stored in hash order. -Entries with the same hash are stored in alphabetical order (actually, in -lexicographic order over the bytes of their names.) -

-See the hashing description below this table for -more information.

FOFFxx xx xx xxFINFO0: -offset (from file start) to the info record for the first sequence file -(file 0).
Offsets to info records for the remaining FLEN-1 files.
FINFO0xx xx FTYPE0: -file type for file 0, stored as a length byte -FTYPELEN0 followed by FTYPELEN0 -bytes of ASCII text. -

-This is equivalent to a file extension (without a leading .) and -will be used as such. In the current implementation, it must be -fa or fasta. -

-Together, this field and the next comprise a single info record in the -file table.

FINFO0+1+FTYPELEN0xx xx  -FNAME0: -file name for file 0, stored as a length byte -FNAMELEN0 followed by FNAMELEN0 -bytes of ASCII text. -

-This is used as the base file name for the corresponding sequence file, -including path. -However, it is usually an empty string, in which case the -base name and path are copied from the name and path of the HSX file itself. -This allows files to be renamed without rebuilding the index.

Info records for the remaining FLEN-1 files.
HOFFxx xx xx xx xxSOFFH(0): -offset (from file start) into the sequence index table, pointing to the first -sequence in the first hash bucket (bucket 0). -

-SOFFn is the file offset for the -n-th entry in the sequence index table. -H(k) is the number of sequences that have a hash code less than -that of bucket k (i.e. the number of sequences assigned to buckets -before bucket k). -Therefore SOFFH(k) points to the first -sequence in the kth hash bucket. -

-The most significant bit in a bucket's SOFFH(k) value -is used to indicate whether the bucket is empty or not. If a bucket is empty, -this bit is set (1), otherwise it is clear (0). The end of the sequences for -bucket k can be determined from SOFFH(k+1) -(the entry for the start of the next bucket). -

Offsets for the first sequences in the remaining HLEN-1 -buckets.
HOFF+5*HLENxx xx xx xx xxSentinel hash bucket. This contains an offset to the end of the -sequence index table (i.e., to the byte just beyond the last entry). -
SOFFxx xx xx xx xxIXLEN0: -length (in nucleotides) of the first sequence. -

-A sequence may be empty, so zero is a legitimate value for the sequence length. -

-Together, this field and the next three comprise a single entry in the -sequence index table.

SOFF+5xxIXFILE0: -index into the file table for the file containing the first sequence.
SOFF+6xx xx xx xx xx xxIXOFF0: -offset (from the start of the appropriate sequence file) pointing to the first -sequence.
SOFF+12xx xx IXNAME0: -name of the first sequence, stored as a length byte -IXNAMELEN0 followed by -IXNAMELEN0 bytes of ASCII text.
Sequence index entries for the remaining -SLEN-1 sequences.
- -

-

Hash Function

-

- -The code for the underlying hash function is shown below, written in C. -The hash bucket for sequence name NAME is computed by -

-    bucket = hassock_hash(NAME,strlen(NAME)) % HLEN;
-
- -

-This hash function is a variant of Austin Appleby's -MurmurHash2. -The primary differences are that it has the seed hardwired and scans the input -data in the reverse order (this is not strictly true, since the -non-multiple-of-four leftover bytes are handled slightly differently). It is -also endian-neutral. - -

-

-    #include <stdint.h>
-
-    uint32_t hassock_hash (const void* key, uint32_t len)
-        {
-        const uint32_t seed = 0x5C3FC4D3;
-        const uint32_t m    = 0x87C10417;
-        const uint8_t* data = ((const uint8_t*) key) + len;
-        const uint8_t* stop = ((const uint8_t*) key) + 4;
-        uint32_t       h, k;
-
-        h = seed ^ len;
-        while (data >= stop)
-            {
-            k  = *(--data);
-            k |= *(--data) << 8;
-            k |= *(--data) << 16;
-            k |= *(--data) << 24;
-            k *= m;
-            k ^= k >> 24;
-            k *= m;
-            h *= m;
-            h ^= k;
-            len -= 4;
-            }
-        switch (len)
-            {
-            case 3: h ^= *(--data) << 16;
-            case 2: h ^= *(--data) << 8;
-            case 1: h ^= *(--data);
-                    h *= m;
-            }
-        h ^= h >> 13;
-        h *= m;
-        h ^= h >> 15;
-        return h;
-        }
-
- -

-

Example

-

- - -In this example, we have 12 sequences from 3 fasta files, indexed by a single -HSX file. We first show the fasta files, then show a field-by-field hex dump -of the corresponding HSX file. For demonstration purposes, the HSX file was -created with only 5 buckets. Typical HSX files will deal with more sequences, -more files, and have more buckets. - -

-hsxexA.fa contains five sequences: -

-

-    >HSXEXA_785
-    TAACGGCAATCTTTGGTAGACCTATTGGTCATATCATGAAATTGAAGGAT
-    AATTATTGCCATAAAGTTTTTCACGTTACTATCTTTGCCTCGCAATGAAT
-    AAAATATTCTTAGGGCTACTTTGTAACCTTGCAGAC
-    >HSXEXA_88K
-    TTAATTACTCGCATGATCTTTCAAGATCTTTACCGTTCACACAATTTCTC
-    GAACACTCAGTA
-    >HSXEXA_DNQ
-    CAGTGTACAAAATAAACTATTAACTATATGTAGATAGATACATAGAGACA
-    AAACGGGTAGCATCTAGTATCCTGACTGCGCATTGTGGGGTGTCGCTTCT
-    AAGTACCCGAAATGAGCGT
-    >HSXEXA_LRW
-    TTAAGTACATTCAGATCCATCATGGTTTCGGAAGCTAATGGGAAAAGGGG
-    TACAGAATACAACACCTAGTTGATACGATAGTTAGTTTTTTA
-    >HSXEXA_R9V
-    TATAGTGCGTGTATGACCAATATTACGATGATCGTGACGCCATAGGGTCA
-    TATTCCTTAATATGTAAATATGAAGGTA
-
- -

-hsxexB.fa contains four sequences: -

-

-    >HSXEXB_6YF
-    AAGAGTTCTTACGGCAATAACAAAATGATGCTGTATCCTAGTAACAGGAA
-    CGAACCATTCGCTTCTGTGTTCTATACAGAAGAAACCAGACTCGCTAAAC
-    A
-    >HSXEXB_WCV
-    AATTAGTCTATTAAGGACTATATGTTTACAAGGATGGTAGTCCTAACGGA
-    ATTGATACCAATAGGTGGCACTTACCGTAGCTAGGTAGATCGCCCTACTA
-    CACCAGCTCAGCCATCTTGCCCCGCCAACT
-    >HSXEXB_YKU
-    GTCAACAGGTTTTCGGACTGGTGGCTTTCCTGATTTGATATTCAAAGGAA
-    ATTAGGGTAAGGACTTTGAGTTGTCATAGAATTCAATTTCGGGCTCCGTC
-    CATCACCTCGT
-    >HSXEXB_YV1
-    GGTGATGTTGTGAATATCACTGTCATGAAGGTCTCCTTCGGCCGCCTTAA
-    TCATCATCATAAGTTTCACCATGGTAAAATGAATTAGCCCCAAGCT
-
- -

-hsxexC.fa contains three sequences: -

-

-    >HSXEXC_4ZL
-    CACATCACATTGGTTGTTCATCCATATAATTATTTCCCATAAACTTTAAG
-    AGCTCGGCTGGCCATACGTACTGACTAGCTTAGCCCCTAACTAATCGGCC
-    ACAGCGATAGTACA
-    >HSXEXC_936
-    TGGTTTTTAGAGTCCGTGGAGCCTCTCAGCCACACTGGGTTCGGGAAGTT
-    TCAGGCAAGTCCTACCTGTAA
-    >HSXEXC_GWD
-    CTAATCTGGGCTTGGGTCTGAACTCGCCCATGAGGAGGTAAGCAAACCAA
-    TAAATTCGGGTATGGCGGTCTTTATTATGCTTAAGGAACGGAACAA
-
- -

-The twelve sequence names are hashed into separate buckets, and sorted within -buckets, like this: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hash Code/BucketSequence NameFasta fileOffset to sequence, in fasta file
0HSXEXB_6YFhsxexB.fa0x0000 00000000
1HSXEXA_785 -
HSXEXA_DNQ
hsxexA.fa -
hsxexA.fa
0x0000 00000000 -
0x0000 000000E3
2HSXEXA_88K -
HSXEXA_LRW -
HSXEXB_YV1 -
HSXEXC_4ZL
hsxexA.fa -
hsxexA.fa -
hsxexB.fa -
hsxexC.fa
0x0000 00000097 -
0x0000 00000169 -
0x0000 00000183 -
0x0000 00000000
3HSXEXB_YKUhsxexB.fa0x0000 00000105
4HSXEXA_R9V -
HSXEXB_WCV -
HSXEXC_936 -
HSXEXC_GWD
hsxexA.fa -
hsxexB.fa -
hsxexC.fa -
hsxexC.fa
0x0000 000001D3 -
0x0000 00000074 -
0x0000 00000081 -
0x0000 000000D6
- -

-Here is the complete HSX file, byte-by-byte: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataFieldMeaning
0x00000000D2 52 70 95 -Magic numberBig-endian.
0x0000000400 00 01 00HSX version1.0.
0x0000000800 00 00 1CHeader length28 bytes.
0x0000000C00 00 00 03FLEN=33 entries in file table.
0x0000001000 00 00 30FOFF=30File table is at 0x00000030.
0x0000001400 00 00 05HLEN=55 buckets in the hash table. -
0x0000001800 00 00 60HOFF=60Hash table is at 0x00000060.
0x0000001C00 00 00 0CSLEN=0C12 entries in the sequence index table.
0x0000002000 00 00 80SOFF=80Sequence index table is at 0x00000080.
0x0000002400 00 00 00 -
00 00 00 00 -
00 00 00 00
PaddingThe creating program can insert padding here, at its discretion.
0x0000003000 00 00 40FINFO0=40Info record for file 0 is at 0x00000040.
0x0000003400 00 00 4AFINFO1=4AInfo record for file 1 is at 0x0000004A.
0x0000003800 00 00 54FINFO2=54Info record for file 2 is at 0x00000054.
0x0000003C00 00 00 00 -Padding
0x0000004002 66 61FTYPE0File type for file 0 is "fa".
0x0000004306 68 73 78 -
65 78 41
FNAME0Base name for file 0 is "hsxexA". File name is hsxexA.fa.
0x0000004A02 66 61FTYPE1File type for file 1 is "fa".
0x0000004D06 68 73 78 -
65 78 42
FNAME1Base name for file 1 is "hsxexB". File name is hsxexB.fa.
0x0000005402 66 61FTYPE2File type for file 2 is "fa".
0x0000005706 68 73 78 -
65 78 43
FNAME2Base name for file 2 is "hsxexC". File name is hsxexC.fa.
0x0000005E00 00 -Padding
0x0000006000 00 00 00 80SOFFH(0)=80Sequence entries for hash bucket 0 start at 0x00000080.
0x0000006500 00 00 00 97SOFFH(1)=97Sequence entries for hash bucket 1 start at 0x00000097.
0x0000006A00 00 00 00 C5SOFFH(2)=C5Sequence entries for hash bucket 2 start at 0x000000C5.
0x0000006F00 00 00 01 21SOFFH(3)=121Sequence entries for hash bucket 3 start at 0x00000121.
0x0000007400 00 00 01 38SOFFH(4)=138Sequence entries for hash bucket 4 start at 0x00000138.
0x0000007980 00 00 01 94SOFFH(5)=194Sentinel bucket indicates end of sequence entries is at 0x00000194. -

The most significant bit of SOFFH(5) -is a 1, indicating the bucket is empty.

0x0000007E00 00 -Padding
0x0000008000 00 00 00 65IXLEN0=65(Start of hash bucket 0) -
Sequence 0 is 101 bp.
0x0000008501IXFILE0=1Sequence 0 is in file 1 (hsxexB.fa).
0x0000008600 00 00 00 00 00IXOFF0=00Sequence 0 is at file offset 0x0000 00000000.
0x0000008C0A 48 53 58 -
45 58 42 5F -
36 59 46
IXNAME0Sequence 0 is named "HSXEXB_6YF".
0000009700 00 00 00 88IXLEN1=88(Start of hash bucket 1) -
Sequence 1 is 136 bp.
0000009C00IXFILE1=0Sequence 1 is in file 0 (hsxexA.fa).
0000009D00 00 00 00 00 00IXOFF1=00Sequence 1 is at file offset 0x0000 00000000.
000000A30A 48 53 58 -
45 58 41 5F -
37 38 35
IXNAME1Sequence 1 is named "HSXEXA_785".
000000AE00 00 00 00 77IXLEN2=77Sequence 2 is 119 bp.
000000B300IXFILE2=0Sequence 2 is in file 0 (hsxexA.fa).
000000B400 00 00 00 00 E3IXOFF2=E3Sequence 2 is at file offset 0x0000 000000E3.
000000BA0A 48 53 58 -
45 58 41 5F -
44 4E 51
IXNAME2Sequence 2 is named "HSXEXA_DNQ".
000000C500 00 00 00 3EIXLEN3=3E(Start of hash bucket 2) -
Sequence 3 is 62 bp.
000000CA00IXFILE3=0Sequence 3 is in file 0 (hsxexA.fa).
000000CB00 00 00 00 00 97IXOFF3=97Sequence 3 is at file offset 0x0000 00000097.
000000D10A 48 53 58 -
45 58 41 5F -
38 38 4B
IXNAME3Sequence 3 is named "HSXEXA_88K".
000000DC00 00 00 00 5CIXLEN4=5CSequence 4 is 92 bp.
000000E100IXFILE4=0Sequence 4 is in file 0 (hsxexA.fa).
000000E200 00 00 00 01 69IXOFF4=169Sequence 4 is at file offset 0x0000 00000169.
000000E80A 48 53 58 -
45 58 41 5F -
4C 52 57
IXNAME4Sequence 4 is named "HSXEXA_LRW".
000000F300 00 00 00 60IXLEN5=60Sequence 5 is 96 bp.
000000F801IXFILE5=1Sequence 5 is in file 1 (hsxexB.fa).
000000F900 00 00 00 01 83IXOFF5=183Sequence 5 is at file offset 0x0000 00000183.
000000FF0A 48 53 58 -
45 58 42 5F -
59 56 31
IXNAME5Sequence 5 is named "HSXEXB_YV1".
0000010A00 00 00 00 72IXLEN6=72Sequence 6 is 114 bp.
0000010F02IXFILE6=2Sequence 6 is in file 2 (hsxexC.fa).
0000011000 00 00 00 00 00IXOFF6=0Sequence 6 is at file offset 0x0000 00000000.
000001160A 48 53 58 -
45 58 43 5F -
34 5A 4C
IXNAME6Sequence 6 is named "HSXEXC_4ZL".
0000012100 00 00 00 6FIXLEN7=6F(Start of hash bucket 3) -
Sequence 7 is 111 bp.
0000012601IXFILE7=1Sequence 7 is in file 1 (hsxexB.fa).
0000012700 00 00 00 01 05IXOFF7=105Sequence 7 is at file offset 0x0000 00000105.
0000012D0A 48 53 58 -
45 58 42 5F -
59 4B 55
IXNAME7Sequence 7 is named "HSXEXB_YKU".
0000013800 00 00 00 4EIXLEN8=4E(Start of hash bucket 4) -
Sequence 8 is 78 bp.
0000013D00IXFILE8=0Sequence 8 is in file 0 (hsxexA.fa).
0000013E00 00 00 00 01 D3IXOFF8=1D3Sequence 8 is at file offset 0x0000 000001D3.
000001440A 48 53 58 -
45 58 41 5F -
52 39 56
IXNAME8Sequence 8 is named "HSXEXA_R9V".
0000014F00 00 00 00 82IXLEN9=82Sequence 9 is 130 bp.
0000015401IXFILE9=1Sequence 9 is in file 1 (hsxexB.fa).
0000015500 00 00 00 00 74IXOFF9=74Sequence 9 is at file offset 0x0000 00000074.
0000015B0A 48 53 58 -
45 58 42 5F -
57 43 56
IXNAME9Sequence 9 is named "HSXEXB_WCV".
0000016600 00 00 00 47IXLEN10=47Sequence 10 is 71 bp.
0000016B02IXFILE10=2Sequence 10 is in file 2 (hsxexC.fa).
0000016C00 00 00 00 00 81IXOFF10=81Sequence 10 is at file offset 0x0000 00000081.
000001720A 48 53 58 -
45 58 43 5F -
39 33 36
IXNAME10Sequence 10 is named "HSXEXC_936".
0000017D00 00 00 00 60IXLEN11=60Sequence 11 is 96 bp.
0000018202IXFILE11=2Sequence 11 is in file 2 (hsxexC.fa).
0000018300 00 00 00 00 D6IXOFF11=D6Sequence 11 is at file offset 0x0000 000000D6.
000001890A 48 53 58 -
45 58 43 5F -
47 57 44
IXNAME11Sequence 11 is named "HSXEXC_GWD".
00000194(file ends here)
- - -

-


-Bob Harris and Cathy Riemer, January 2010 - -

- - diff --git a/programs/lastz/docs/images/after_chaining.png b/programs/lastz/docs/images/after_chaining.png deleted file mode 100644 index 6660551..0000000 Binary files a/programs/lastz/docs/images/after_chaining.png and /dev/null differ diff --git a/programs/lastz/docs/images/after_interpolation.png b/programs/lastz/docs/images/after_interpolation.png deleted file mode 100644 index 309f53c..0000000 Binary files a/programs/lastz/docs/images/after_interpolation.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_chained.png b/programs/lastz/docs/images/aglobin_chained.png deleted file mode 100644 index 84ed244..0000000 Binary files a/programs/lastz/docs/images/aglobin_chained.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_closeup_gapped.png b/programs/lastz/docs/images/aglobin_closeup_gapped.png deleted file mode 100644 index b5f0dc2..0000000 Binary files a/programs/lastz/docs/images/aglobin_closeup_gapped.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_closeup_hsps.png b/programs/lastz/docs/images/aglobin_closeup_hsps.png deleted file mode 100644 index a1284ca..0000000 Binary files a/programs/lastz/docs/images/aglobin_closeup_hsps.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_closeup_seeds.png b/programs/lastz/docs/images/aglobin_closeup_seeds.png deleted file mode 100644 index 7d1bdb5..0000000 Binary files a/programs/lastz/docs/images/aglobin_closeup_seeds.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_hsps.png b/programs/lastz/docs/images/aglobin_hsps.png deleted file mode 100644 index 066b053..0000000 Binary files a/programs/lastz/docs/images/aglobin_hsps.png and /dev/null differ diff --git a/programs/lastz/docs/images/aglobin_unchained.png b/programs/lastz/docs/images/aglobin_unchained.png deleted file mode 100644 index c917e80..0000000 Binary files a/programs/lastz/docs/images/aglobin_unchained.png and /dev/null differ diff --git 
a/programs/lastz/docs/images/anchors_and_alignment.png b/programs/lastz/docs/images/anchors_and_alignment.png deleted file mode 100644 index 012cfe7..0000000 Binary files a/programs/lastz/docs/images/anchors_and_alignment.png and /dev/null differ diff --git a/programs/lastz/docs/images/before_chaining.png b/programs/lastz/docs/images/before_chaining.png deleted file mode 100644 index 9e6ca2c..0000000 Binary files a/programs/lastz/docs/images/before_chaining.png and /dev/null differ diff --git a/programs/lastz/docs/images/before_interpolation.png b/programs/lastz/docs/images/before_interpolation.png deleted file mode 100644 index c78092f..0000000 Binary files a/programs/lastz/docs/images/before_interpolation.png and /dev/null differ diff --git a/programs/lastz/docs/images/human_vs_chicken.png b/programs/lastz/docs/images/human_vs_chicken.png deleted file mode 100644 index 0e74f6f..0000000 Binary files a/programs/lastz/docs/images/human_vs_chicken.png and /dev/null differ diff --git a/programs/lastz/docs/images/human_vs_chicken_full.png b/programs/lastz/docs/images/human_vs_chicken_full.png deleted file mode 100644 index c1c191e..0000000 Binary files a/programs/lastz/docs/images/human_vs_chicken_full.png and /dev/null differ diff --git a/programs/lastz/docs/images/seeds_and_hsps.png b/programs/lastz/docs/images/seeds_and_hsps.png deleted file mode 100644 index a3d98de..0000000 Binary files a/programs/lastz/docs/images/seeds_and_hsps.png and /dev/null differ diff --git a/programs/lastz/docs/images/seq_vs_same.png b/programs/lastz/docs/images/seq_vs_same.png deleted file mode 100644 index 02ea092..0000000 Binary files a/programs/lastz/docs/images/seq_vs_same.png and /dev/null differ diff --git a/programs/lastz/docs/images/seq_vs_same_notrivial.png b/programs/lastz/docs/images/seq_vs_same_notrivial.png deleted file mode 100644 index 1ff4cbc..0000000 Binary files a/programs/lastz/docs/images/seq_vs_same_notrivial.png and /dev/null differ diff --git 
a/programs/lastz/docs/images/seq_vs_self.png b/programs/lastz/docs/images/seq_vs_self.png deleted file mode 100644 index b16f9bb..0000000 Binary files a/programs/lastz/docs/images/seq_vs_self.png and /dev/null differ diff --git a/programs/lastz/docs/images/seq_vs_self_no_mirror.png b/programs/lastz/docs/images/seq_vs_self_no_mirror.png deleted file mode 100644 index a7f66ef..0000000 Binary files a/programs/lastz/docs/images/seq_vs_self_no_mirror.png and /dev/null differ diff --git a/programs/lastz/docs/images/word_count_table.png b/programs/lastz/docs/images/word_count_table.png deleted file mode 100644 index ff1bef2..0000000 Binary files a/programs/lastz/docs/images/word_count_table.png and /dev/null differ diff --git a/programs/lastz/docs/images/ydrop.png b/programs/lastz/docs/images/ydrop.png deleted file mode 100644 index a2455f5..0000000 Binary files a/programs/lastz/docs/images/ydrop.png and /dev/null differ diff --git a/programs/lastz/docs/index.html b/programs/lastz/docs/index.html deleted file mode 100644 index 415728e..0000000 --- a/programs/lastz/docs/index.html +++ /dev/null @@ -1,7760 +0,0 @@ - - - -LASTZ - - - - - - -

-

LASTZ   Release 1.04.00, - built March 12, 2017

- -TABLE OF CONTENTS - -

-

-

- - - - - - -


-
-

Introduction

- -

-This document describes installation and usage of the LASTZ sequence alignment -program. LASTZ is a drop-in replacement for -BLASTZ, and is backward -compatible with BLASTZ’s command-line syntax. That is, it supports -all of BLASTZ’s options but also has additional ones, and may produce -slightly different alignment results. - -

- - - - - - - - - - - -
LASTZ:A tool for (1) aligning two DNA sequences, and -(2) inferring appropriate scoring parameters automatically. -
Platform:This package was developed on a Macintosh OS X system, but should work on -other Unix or Linux platforms with little change (if any). LASTZ is written in -C and compiled with gcc; other C compilers can probably be used by adjusting -the Makefile. Some ancillary tools are written in Python, but only use modules -available in typical python installations. -
Author:Bob Harris  <rsharris at bx dot psu dot edu> -
Date:March 2017 -
Mailing list: -http://lists.bx.psu.edu/listinfo/lastz-users -
- - - - - - -


-
-

Availability

- -

-LASTZ is available from github at -https://github.com/lastz/lastz. - -

-A packed archive containing source code for LASTZ is available from the -Miller Lab at Penn State. - - - - - - -


-
-

Installation

- -

-If you have received the distribution as a packed archive, unpack it -by whatever means are appropriate for your computer. The result should be -a directory <somepath>/lastz‑distrib‑X.XX.XX that contains -a src subdirectory (and some others). You may find it convenient -to remove the revision number (‑X.XX.XX) from the directory name. - -

-Before building or installing any of the programs, you will need to tell the -installer where to put the executable, either by setting the shell variable -$LASTZ_INSTALL, or by editing the make‑include.mak -file to set the definition of installDir. Also, be sure to add -the directory you choose to your $PATH. - -

-Then to build the LASTZ executable, enter the following commands from bash -or a similar command-line shell (Solaris users should substitute -gmake for make). This will build two executables -(lastz and lastz_D) and copy them into your -installDir. -

-    cd <somepath>/lastz-distrib-X.XX.XX/src
-    make
-    make install
-
-The two executables are basically the same program; the only difference is -that lastz uses integer scores, while lastz_D uses -floating-point scores. - -

-The build process should not report any warnings or errors. Because of this, -the Makefile is set up so that warnings are considered errors and will stop the -build. If you encounter this situation, you can modify the Makefile, removing -"-Werror" from the variable definedForAll. This should allow the build to -complete, while still reporting the warnings. You'll need to decide whether -the warnings indicate something is really wrong. Usually they don't, but please -report them to the author regardless. - -

-A simple self test is included so you can test whether the build succeeded. -To run it, enter the following command: -

-    make test
-
-If the test is successful, you will see no output from this command. -Otherwise, you will see the differences between the expected output and the -output of your build, plus a line that looks like this: -
-    make: *** [test] Error 1
-
- - - - - -
-
-

Build Options

- -

-An additional executable (lastz_32) can be built, to handle -genomes larger than 2 gigabases. For details, see the section on -aligning to whole genomes. - -

-Any executable can be built to allow adjacent indels (by default, these are -not allowed). For details, see the section on -adjacent indels. - - - - - -


-
-

Overview of Processing Stages and Terminology

- -

-LASTZ is designed to preprocess one sequence or set of sequences (which we -collectively call the target) and then align several -query sequences to it. The general flow of the program is like a -pipeline: the output of one stage is the input to the next. The user can -choose to skip most stages via command-line options; any stages that are -skipped pass their input along to the next stage unchanged. Two of the stages, -scoring inference and interpolation, are special in that they perform a -miniature version of the pipeline within them. - -

-Note that the following discussion is a generalization, intended to describe -the basic idea of LASTZ’s operation. There are many exceptions that -depend on the particular options specified. - -

-The stages are: -

- -

-The usual flow is as follows (though most of these steps are optional, -and some settings like ‑‑anyornone -may affect the processing order). -We first read the target sequence(s) into memory, and use that to build a seed -word position table that will allow us to quickly map any word in the target to -all of the positions where it appears. (For the purposes of this discussion -you can think of a word as a 12-mer of DNA.) Then we read each -query sequence in turn, processing them more or less independently. We examine -the word starting at each base in the query and use the position table to find -matches, called seeds, in the target. The seeds are extended to -longer matches called HSPs (high-scoring segment pairs) and filtered -based on score. The HSPs are chained into the highest-scoring set of syntenic -alignments, and then reduced to single locations called anchors. -The anchors are then extended to local alignments (which may contain -gaps) and again filtered by score, followed by back-end filtering to discard -alignment blocks that do not meet specified criteria for certain traits. We -then interpolate, repeating the entire process at a higher sensitivity in the -holes between the alignment blocks. And finally, we write out the alignment -information to a file. Then these steps are repeated with the reverse -complement of the query sequence, before moving on to the next sequence in the -query file. - -

-The scoring inference stage is not usually performed. Typically it is used -only when sequences for new species are acquired, to create scoring files for -subsequent alignments of those species. - - - - - - -


-
-

Examples

- -

-For those eager to try it out, here are some illustrative examples to get you -started. Detailed reference material begins with the -next section. - - - -

-

Comparing a Human Chromosome and a Chicken Chromosome

- -

-It is often adequate to use a lower sensitivity level than is achieved with -LASTZ’s defaults. For example, to compare two complete chromosomes, even -for species as distant as human and chicken, the alignment landscape is evident -even at very low sensitivity settings. This can speed up the alignment process -considerably. - -

-This example compares human chromosome 4 to chicken chromosome 4. These -sequences can be found in the downloads section of the -UCSC Genome Browser, and are 191 and 94 -megabases long, respectively. To run a quick low-sensitivity alignment of -these sequences, use a command like this: -

-    lastz hg18.chr4.fa galGal3.chr4.fa \
-      --notransition --step=20 --nogapped \
-      --format=maf > hg18_4_vs_galGal3_4.maf
-
- -

-This runs in about two and a half minutes on a 2-GHz workstation, requiring -only 400 Mb of RAM. Figure 1(a) shows the results, plotted using the -‑‑format=rdotplot output option and -the R statistical package. -(When in MAF format, LASTZ output can be browsed with -the GMAJ interactive viewer for multiple alignments, available from the -Miller Lab at Penn State.) - -

-Using ‑‑notransition lowers -seeding sensitivity and reduces runtime (by a factor of about 10 in this case). -‑‑step=20 also lowers seeding -sensitivity, reducing runtime and also reducing memory consumption (by a factor -of about 3.3 in this case). -‑‑nogapped eliminates the -computation of gapped alignments. The complete alignment process using default -settings (shown in Figure 1(b)) uses 1.3 Gb of RAM and takes 4.5 hours on a -machine running at 2.83 GHz. - -

- - -
-Figure 1(a) -

-human vs. chicken: low sensitivity -

-
-
-lastz \
-  hg18.chr4.fa galGal3.chr4.fa \
-  --notransition --step=20 \
-  --nogapped
-
-
-

-Figure 1(b) -

-human vs. chicken: defaults -

-
-
-lastz \
-  hg18.chr4.fa galGal3.chr4.fa
-
-
-

- - - -

-

Aligning Shotgun Reads to a Human Chromosome

- -

-Short read mapping for close species requires parameters very different from -LASTZ’s defaults. This example compares a simulated set of primate shotgun -reads to human chromosome 21. The chromosome can be found in the downloads -section of the UCSC Genome Browser -(it is about 47 megabases). Ten thousand simulated reads were generated by -extracting 60-bp intervals from chimp chr21, subjecting them to mild mutation -(including short gaps), and then truncating them to 50 bp (these are included -in the LASTZ distribution, in test_data/fake_chimp_reads.2bit). - -

-To see where these reads map onto the human chromosome, use this command: -

-    lastz hg18.chr21.fa[unmask] fake_chimp_reads.2bit \
-      --step=10 --seed=match12 --notransition --exact=20 --noytrim \
-      --match=1,5 --ambiguous=n \
-      --filter=coverage:90 --filter=identity:95 \
-      --format=general:name1,start1,length1,name2,strand2 \
-      > hg18_21_vs_reads.dat
-
- -

-Attaching [unmask] to the chromosome -filename instructs LASTZ to ignore masking information and treat repeats the -same as any other part of the chromosome, in order to accurately assess the -uniqueness of the read mappings. Since we know the two species are close, we -want to reduce sensitivity. Using -‑‑step=10, we will only be looking for -seeds at every 10th base. Instead of the default seed pattern, we use -‑‑seed=match12 and -‑‑notransition so our -seeds will be exact matches of 12 bases. Instead of the default -x-drop extension method we use -‑‑exact=20 so that a 20-base -exact match is required to qualify as an HSP. Because we are aligning short -reads, we specify -‑‑noytrim so the alignment ends will -not be trimmed back to the highest scoring locations during gapped extension. - -

-We replace the default score set, which is for more distant species, with the -stricter ‑‑match=1,5. This scores -matching bases as +1 and mismatches as −5. We also use -‑‑ambiguous=n so that Ns -will be scored appropriately. -We are only interested in alignments that involve nearly an entire read, and -since the species are close we don't want alignments with low identity; -therefore we use ‑‑filter=coverage:90 and -‑‑filter=identity:95. - -

-For output, we are only interested in where the reads align, so we use the -‑‑format=general option and specify -that we want the position on the chromosome (name1, -start1, length1) and the read name and orientation -(name2, strand2). This creates a tab-delimited -output file with one line per alignment block, a format that is well-suited for -downstream processing by other programs. For example, to count the number of -different reads we've mapped, we can run this Unix shell command: -

-    cat hg18_21_vs_reads.dat | grep -v "#" | awk '{print $4}' | sort -u | wc
-
- - - - -
-

Seeds, HSPs, Gapped Alignments, Chaining

- -

-This example demonstrates the primary -alignment processing stages, using the -α-globin regions of cow and human. This data is included in the LASTZ -distribution in test_data/aglobin.2bit, and consists of a 70K bp -segment of human DNA and a 66K bp segment of cow DNA. We will follow this -example through the major stages of seeding, gap-free extension, chaining, and -gapped extension. - -

-Figure 2(a) shows the result of default seeding on a small window (3K bp) in the -middle of these segments. Seeds are short near-matches; in this case each seed -is 19 bp and could have as many as 8 mismatches (12-of-19 with one transition). -There are 338 seeds in this window, but regions where there are many seeds are -indistinguishable from line segments. - -

-Figure 2(b) shows high-scoring segment pairs, the result of gap-free extension -of the seeds. There are 11 HSPs (only 10 are apparent in the figure, but one -of those is split by a 1-bp shift to the next diagonal). Note that many seeds -were discarded because their extensions were low scoring or overlapped. - -

-Figure 2(c) shows the local alignment blocks resulting from gapped extension of -the HSPs. There are four alignment blocks. - -

-Then we zoom out and show the results for the full sequences; the red box -indicates the small region shown in the earlier figures. Figure 2(d) shows -the HSPs, 2(e) shows the gapped alignment blocks, and 2(f) illustrates how -chaining reduces the alignment blocks to a single syntenic line (or two lines, -if there were matches on both strands). Note that one can already tell -quite a bit about how the sequences align just from looking at the HSPs. - -

- - - - -
-Figure 2(a) -

-alpha-globin: seeds (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --nogfextend --nochain --nogapped
-
-
-

-Figure 2(b) -

-alpha-globin: HSPs (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --gfextend --nochain --nogapped
-
-
-

-Figure 2(c) -

-alpha-globin: gapped blocks (closeup) -

-
-
-lastz \
-  aglobin.2bit/human[34000..37000] \
-  aglobin.2bit/cow[35000..38000] \
-  --gfextend --nochain --gapped
-
-
-

-Figure 2(d) -

-alpha-globin: HSPs -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --nochain --nogapped
-
-
-

-Figure 2(e) -

-alpha-globin: gapped blocks -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --nochain --gapped
-
-
-

-Figure 2(f) -

-alpha-globin: gapped blocks with chaining -

-
-
-lastz \
-  aglobin.2bit/human \
-  aglobin.2bit/cow \
-  --gfextend --chain --gapped
-
-
-

- - - -

-

Aligning a Sequence With Itself

- -

-When a sequence is aligned to itself, the full result will contain mirror-image -copies of each alignment block. It is computationally wasteful to process both -copies. LASTZ can handle this situation in four different ways. -

    -
  1. Simply give LASTZ the same sequence for both the -target and query. In this case, LASTZ does not know that -it is aligning a sequence to itself, and performs the full computation on both -copies (Figure 3(a)). -

    -

  2. Specify the ‑‑notrivial -option. This performs the full computation on both copies, but doesn't report -the trivial self-alignment block along the main diagonal (Figure 3(b)). -

    -

  3. Specify the ‑‑self option in place -of the query sequence. LASTZ will save work by computing with only one block -of each mirror-image pair, though it still reports both copies in the output by -reconstructing the second copy from the first. It also invokes -‑‑notrivial automatically to omit the trivial self-alignment block -along the main diagonal. This gives the same output as the previous method, -but runs faster (Figure 3(c)). -

    -

  4. Specify ‑‑self in place of the -query, and also add the ‑‑nomirror -option. In this case LASTZ reports only one copy of each mirror-image pair, -as well as omitting the trivial block (Figure 3(d)). -
- -

-In the following figure, we suppose we have a sequence with repeated motifs, -in the order -α1 β1 γ1 β2 δ1 α2 δ2′ γ2. -That is, α1 and α2 are ancient duplications, as are β1 and -β2, and γ1 and γ2.  δ2′ is an inversion, a -reverse-complement duplicate of δ1. - -

- - - -
-Figure 3(a) -

-rearranged sequence: vs. itself, default options -

-
-
-lastz target target
-
-
-

-Figure 3(b) -

-rearranged sequence: vs. itself, --notrivial -

-
-
-lastz target target --notrivial
-
-
-

-Figure 3(c) -

-rearranged sequence: --self -

-
-
-lastz target --self
-
-
-

-Figure 3(d) -

-rearranged sequence: --self --nomirror -

-
-
-lastz target --self --nomirror
-
-
-

- - - - - - -


-
-

Command-line Syntax

- -

-If you are familiar with BLASTZ, you can run LASTZ the same way you ran BLASTZ, -with the same options and input files. In addition to this BLASTZ compatibility, -LASTZ provides other options. - -

-The general format of the LASTZ command line is -

-    lastz <target> [<query>] [<options>]
-
- -

-The angle brackets <> indicate meta-syntactic variables that -should be replaced with your values, while the square ones [] -indicate elements that are optional. Spaces separate fields on the command -line; a field that needs to contain a space (e.g. within a file name) must be -enclosed in double quotes "". Elements can appear in -any order, the only constraint being that, if present, the -<query> must appear after the <target>. -Output is generally written to stdout, unless specified otherwise -for a particular option. - -

-

-The <target> and <query> are usually -just the names of files containing the sequences to be aligned, in either -FASTA, Nib, -or 2Bit format. However they can be -HSX index files that refer to the sequences indirectly, -and they also can specify pre-processing actions such as selecting a -subsequence from the file (see Sequence Specifiers for -details). With certain options such as -‑‑self the <query> -is not needed; otherwise if it is left unspecified the query sequences are read -from stdin -(though this does not work with random-access formats -like 2Bit). -As a special case, the <target> is -omitted when the ‑‑targetcapsule -option is used, since the target sequence is embedded within the capsule file. - -

-For options, the general format is ‑‑<keyword> or -‑‑<keyword>=<value>, but for BLASTZ compatibility -some options also have an alternative syntax -<letter>=<number>. -(Be careful when copying options from the tables below, as some of the hyphens -here are special characters to avoid awkward line wrapping in certain web -browsers. If you have trouble, replace the pasted hyphens with ordinary typed -ones on your command line.) - -

-Please understand that LASTZ is a complex program and its options are not all -independent, i.e., some options are not valid in combination with certain -others. It would be difficult and cumbersome to attempt to list every possible -conflict here; instead we just mention some of the major ones. If you are not -sure about a particular combination, go ahead and try it — LASTZ will -tell you if it’s not allowed. - -

-Running the command lastz without any arguments prints a help -message with the most commonly used options, while running -

-    lastz --help
-
-lists all of the options. - - - -
-

Where to Look

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--strand=bothB=2 -Search both strands. -
--strand=plusB=0 -Search the forward strand only (the one corresponding to the query specifier). -
--strand=minus -Search only the reverse complement of the query specifier. -
--self -Perform a self-alignment: the target sequence is also the query. -Computation is more efficient than it would be without this option, since only -one of each mirror-image pair of alignment blocks is processed (the other, -redundant one is skipped during processing, but re-created in the output). -Also, the trivial self-alignment block along the main diagonal is omitted from -the output. -This option cannot be used if the target is comprised of multiple sequences. -
--nomirror -Inhibit the re-creation of mirror-image alignments. Output consists of only -one copy of each meaningful alignment block in a self-alignment. This option -is only applicable when the ‑‑self -option is used. -
--queryhsplimit=<n> -Discard queries that have more than <n> HSPs. Any queries -that exceed this limit are reported as a warning (to stderr), and -no alignments are reported. -

-This is useful for mapping reads to a reference genome, when some reads align -to too many places in the reference. -

--queryhsplimit=nowarn:<n> -Same as ‑‑queryhsplimit=<n> but warnings for queries that -exceed the limit are withheld. -
--queryhsplimit=keep,nowarn:<n> -Same as ‑‑queryhsplimit=<n> but queries that exceed the -limit are not discarded and warnings are withheld. For such a query, the first -<n> HSPs found are passed along to downstream processing. -

-Note that the HSPs reported are not the best <n> HSPs. They -are simply the first <n> found; they very likely have a -positional bias. -

--queryhspbest=<n> -For queries that have more than <n> HSPs, discard any HSPs -that score below the nth best. -

-This is useful for mapping reads to a reference genome, when some reads align -to too many places in the reference. -

--querydepth=<n> -Stop processing gapped alignments for a query/strand if its ratio of aligned -bases to query length exceeds <n>. A warning is written to -stderr, all alignments for the query/strand are discarded, and processing -continues with the next query (or strand). -

-‑‑querydepth=keep:<n> can be used if the preference is to -keep some alignments for such query/strands. -

-<n> is a real number and corresponds to a depth of coverage -threshold. For example, a value of 5.0 would cause termination -once a query/strand has an average of five alignments for every base in the -query. The numerator is the number of matches or substitutions (but not gaps); -the denominator is the length of the query sequence. -

-The purpose of this option is one of saving time. It is useful for -automatically terminating the processing of queries with high repeat content, -for which other methods of dealing with repetitive content fail. -

-Moreover, back-end filtering options are -not considered. In other words, matches are counted for any alignment -that meets the scoring threshold, regardless of whether that alignment would be -reported. The justification is that we are trying to abort the processing of -queries that have too many bounding alignments in the DP matrix, and back-end -filtering occurs later in the process. -

--querydepth=keep:<n> -Same as ‑‑querydepth=<n> but any alignments discovered for -this query/strand, before it exceeds the threshold, are reported. -

-Note that the alignments reported are not guaranteed to be the highest scoring -alignments that would achieve the threshold. They are simply the first -alignments found. In other words, the purpose of this option is one of saving -time, not one of finding optimal alignments. -

--querydepth=nowarn:<n> -Same as ‑‑querydepth=<n> but warnings for queries that -exceed the limit are withheld. -
--querydepth=keep,nowarn:<n> -Same as ‑‑querydepth=<n> but any alignments discovered for -this query/strand, before it exceeds the threshold, are reported and warnings -are withheld. -
--anyornone -Stop processing after the first qualifying alignment has been found and written -to the output, and move on to the next query. "Qualifying" means an alignment -that meets all of the thresholds, etc. set by other options as usual. See -Any-or-None Alignment for more details. -This option is not compatible with chaining -or interpolation. -
Defaults: -By default both strands are searched, and the target is assumed to be different -from the query. -

-If ‑‑self is used, the default is to -re-create the redundant mirror-image alignment blocks in the output. -

- - - -

-

Scoring

-

-These are fundamental parameters for alignment scoring, used in several of the -stages. -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--scores=<scoring_file>Q=<file> -Read the substitution scores and gap penalties (and possibly other options) -from a scoring file. This option cannot be used in -conjunction with ‑‑match or -inference. -
--match=<reward>[,<penalty>] -Set the score values for a match (+<reward>) -and mismatch (−<penalty>). -These are both specified as positive values; the "+" and "−" are -implicitly assumed. When <penalty> is not specified, -it is the same as <reward>. - -

-Note that specifying ‑‑match changes the defaults for some of -the other options (e.g. the scoring penalties for gaps, and various extension -thresholds), as described in their respective sections. The regular defaults -are chosen for compatibility with BLASTZ, but since BLASTZ doesn't support -‑‑match, LASTZ infers that you are not expecting BLASTZ -compatibility for this run, so it is free to use improved defaults. -

-This option cannot be used in conjunction with -‑‑scores or -inference. -

--gap=[<open>,]<extend>O=<open>
- E=<extend>
-Set the score penalties for opening and extending a gap. These are specified -as positive values; subtraction is implicitly assumed. Note that the first -base in a gap incurs the sum of both penalties. -

-This option is only valid if gapped extension is -being performed, and cannot be used in conjunction with -inference. These values specified on -the command line override any corresponding values from a file provided with -‑‑scores. -

--ambiguous=n[,<reward>][,<penalty>] -Treat each N in the input sequences as an ambiguous nucleotide. -Substitutions with N are scored as zero, instead of using the -fill_score value from the scoring file -(which is -100 by default). -

-A <penalty> can be specified, which will apply to any -non-match substitution involving an N. If a -<reward> is also specified, it applies to an N versus N -match (otherwise, these matches are scored as zero). Note that the -<penalty> is negated in the scoring matrix, while the -<reward> is not. -

-See -Non-ACGT Characters for a more thorough discussion. -This option is not valid with quantum DNA. -

-Prior to version 1.02.20, this option was incorrectly implemented, and the fix -has caused a change in behavior, and reported alignments, when -penalty is not specified. See the -change history item for details on how to -maintain compatibility with the earlier version, if that is desired. -

--ambiguous=iupac[,<reward>][,<penalty>] -Treat each of the IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, -W, and Y, as well as N) in the input sequences -as a completely ambiguous nucleotide. Substitutions with these -characters are scored as zero, instead of using the fill_score -value from the scoring file (which is -100 by -default). -

-A <penalty> can be specified, which will apply to any -non-match substitution involving an ambiguous nucleotide. If a -<reward> is also specified, it applies to a match involving -ambiguous nucleotides (otherwise, these matches are scored as zero). Note that -the <penalty> is negated in the scoring matrix, while the -<reward> is not. -

-See Non-ACGT Characters for a more thorough -discussion. This option is not valid with quantum DNA. -

-Note that this does not mean that LASTZ considers the specific -ambiguity that is associated with each character (e.g. that R -would be considered a match to an A or G but not to -a C or T). Instead, they are all scored as if they -were an N. -

-Prior to version 1.02.20, this option was incorrectly implemented, and the fix -has caused a change in behavior, and reported alignments, when -penalty is not specified. See the -change history item for details on how to -maintain compatibility with the earlier version, if that is desired. -

--infer[=<control_file>] -Infer substitution scores and/or gap penalties from the sequences, then use -them to align the sequences. Parameters controlling the inference process are -read from the control file. -This feature is somewhat experimental, and special builds of LASTZ are required -to enable it. Please see Inferring Score Sets for -more information. Inference cannot be used in conjunction with -‑‑scores, -‑‑match, or -‑‑gap. -
--inferonly[=<control_file>] -Infer substitution scores and/or gap penalties, but don't perform the final -alignment (requires ‑‑infscores). -
--infscores[=<output_file>] -Save the inferred scoring parameters to the specified file (or to -stdout), in the same format expected -by ‑‑scores. -
Defaults: -By default the HOXD70 substitution scores are used -(see [Chiaromonte 2002] for an explanation of -how this scoring matrix was determined). -

-
- - - - - - -
 ACGT
A91‑114‑31‑123
C‑114100‑125‑31
G‑31‑125100‑114
T‑123‑31‑11491
-
-

-Default gap penalties are determined as follows. If -‑‑match is -specified, the open penalty is 3.25 times the mismatch penalty, and the extend -penalty is 0.24375 times the mismatch penalty. (These are the same ratios as -BLASTZ’s defaults.) Both penalties are rounded up to the nearest integer. -Otherwise, the gap penalties are 400 for open, 30 for extend. -

-By default, a run of Ns serves as an old-style separator between -shotgun reads or other spliced sequences, rather than indicating ambiguous -nucleotides. This is solely a consequence of the steep -fill_score handicap imposed for -substitutions with N — LASTZ doesn't normally search for runs -of Ns to treat specially (however, the -separator=N action can be -used to accomplish that, and is preferred if Ns are intended to be -separators). -

- - - -

-

Indexing

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--step=<offset>Z=<offset> -Offset between the starting positions of successive target words considered for -potential seeds. (But this does not apply to the query words, which always use -a step size of 1.) -
--maxwordcount=<limit> -Words occurring more often than <limit> in the target are -not eligible for seeds. Specifically, after the target seed word position table -is built, any words exceeding this count are removed from the table. -
--maxwordcount=<limit>% -Set maxwordcount to keep a specified percentage of seed word -positions. <limit> is a lower bound on the percentage of -words to be kept (0 < limit < 100). -

-Setting this as a percentage makes it easier to maintain consistency across -runs. The actual count is dependent on sequence length and composition as -well as the step offset and seed pattern. For example, Figure 4 -shows the variation among human chromosomes in hg18 for -‑‑seed=match13, ‑‑step=15, and -‑‑maxwordcount=90%. The gray bars show the percentage of -seed word positions kept (the red line shows the ideal 90%). The blue numbers -show the equivalent count, which varies greatly. -

-Figure 4 -

- -word count rate per chromosome -

--masking=<count>M=<count> -Dynamically mask the target sequence by excluding any positions that appear -in too many alignments from further consideration for seeds. -

-Specifically, a cumulative count is maintained of the number of times each -target location is aligned. After each query sequence -and strand is processed, any locations that have been output in at least -<count> alignment blocks are masked, so they will be -excluded from the seeding stage for subsequent query sequences. -Since repetition discovered while processing one sequence strand is only masked -for subsequent sequence strands, this option has no effect on the first strand -of the first sequence in the query file. -

-This option requires one, two, or four bytes of memory for each target location, -depending on <count>. If <count> is 254 -or less, one byte is used; if it is 65,534 or less, two bytes are used. -

-The resulting masked intervals can be written to a file with the -‑‑outputmasking=<file> -option. -

--targetcapsule=<capsule_file> -The target seed word position table and seed (as well as the target sequence) -are read from the specified file. When this option -is used, the normal target specifier is omitted from the command line, and the -following options are not allowed: -‑‑step, -‑‑maxwordcount, -‑‑masking, -‑‑seed, -‑‑word. -
--chores=<chores_file> -Restrict alignment to a list of subintervals. The file -describes a list of sequence interval pairs, indicating that the alignment -process is to be restricted to those intervals. -

-See Aligning Many Subintervals for advice -on when to use this option. - -

--segments=<segment_file> -Read anchor segments from a file, instead of discovering -them via seeding. -This replaces any other options related to indexing, seeding, gap-free -extension or chaining. Those stages are skipped, and processing begins with -the gapped extension stage. -

-See Aligning Many Subintervals for advice -on when to use this option. -

Defaults: -By default a step of 1 is used, no words are removed from the target seed word -position table, dynamic masking is not performed, and no target capsule or -segment file is used. -
- - - -

-

Seeding

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--seed=12of19T=1 or T=2 -Seeds require a 19-bp word with matches in 12 specific positions -(1110100110010101111). -
--seed=14of22T=3 or T=4 -Seeds require a 22-bp word with matches in 14 specific positions -(1110101100110010101111). -
--seed=match<length>W=<length> -Seeds require a <length>-bp word with matches in all -positions. -
--seed=half<length> -Seeds require a <length>-bp word with matches or transitions -in all positions. This option is not valid with -quantum DNA. -
--seed=<pattern> -Specifies an arbitrary pattern of 1s, 0s, and -Ts for seed discovery. (Note that Ts are not valid -with quantum DNA.) -
--transitionT=1 or T=3 -In each seed, allow any one match position to be a transition instead. -This option is not valid with quantum DNA. -
--transition=2 -In each seed, allow any two match positions to be transitions instead. -This option is not valid with quantum DNA. -
--notransitionT=2 or T=4 -Don't allow any match positions in seeds to be satisfied by transitions. -
--filter=[<transv>,]<matches> -Filter the resulting seeds, requiring at least <matches> -exact matches and allowing no more than <transv> -transversions. If <transv> is not specified, any number -of transversions is allowed (they are not limited). -This option is not valid with quantum DNA. -
--nofilter -Don't filter seeds. -
---ball=<score> -Set the quantum seeding threshold, the minimum -score required of a DNA word to be included in the seeding ball. -
---ball=<percentage>% -Set the quantum seeding threshold as a percentage of the maximum word score -possible. -
--twins=[<minsep>..]<maxsep> -Require two nearby seeds on the same diagonal, separated by a number of bases -in the given range. See the Seed Patterns section -for more information. This option cannot be used in conjunction with -‑‑recoverseeds. -
--notwins -Allow single, isolated seeds. -
--recoverseeds -Avoid losing seeds in hash collisions. This will slow the alignment process -considerably and cost more memory, and usually does not improve the results -significantly. See the Gap-free Extension stage -for more information. This option cannot be used in conjunction with -‑‑twins. -
--norecoverseeds -Ignore hash collisions, at the expense of missing some seeds. Note that -missing seeds usually does not mean missing alignments, since most alignable -regions have many seed hits. -
Defaults: -By default the 12-of-19 seed is used, one transition is allowed (except with -quantum DNA), the hits are not filtered, twins are not -required, and hash collisions are not recovered. -

-If the quantum action is used in the -query file’s sequence specifier, the default ball -scoring threshold is 75% of the maximum word score possible. -

- - - -

-

Finding HSPs (Gap-free Extension)

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--gfextend -Perform gap-free extension of seeds to HSPs (high scoring segment pairs), -according to the other options in this section. -
--nogfextend -Skip the gap-free extension stage, passing the seeds along to the next -specified stage. -

-It is not recommended to use --nogfextend without also using ---nogapped. -

--exact=<length> -Find HSPs using the exact match extension method with the given length -threshold, instead of using the x-drop method. -
--mismatch=<count>,<length> -Find HSPs using the mismatch extension method with the given length threshold -and allowing count mismatches, instead of using the x-drop method. -

-count is limited to the range 1≤count≤50. -

--xdrop=<dropoff>X=<dropoff> -Find HSPs using the x-drop extension method with the given termination -threshold, instead of using the exact match method. The dropoff setting -determines the endpoints of each gap-free segment: the extension of each seed -is stopped when its cumulative score drops off by more than the given -threshold from the maximum seen so far. See the -Gap-free Extension stage for more details. -
--hspthresh=<score>K=<score> -Set the score threshold for the x-drop extension method; HSPs scoring lower -are discarded. -
--hspthresh=top<basecount> -Set an adaptive score threshold for the x-drop -extension method; HSPs scoring lower are discarded. The score threshold is -chosen to limit the number of target sequence bases in HSPs to about -<basecount> -(or possibly a little higher in case of ties, etc.). -
--hspthresh=top<percentage>% -Set an adaptive score threshold for the x-drop -extension method; HSPs scoring lower are discarded. The score threshold is -chosen to limit the number of target sequence bases in HSPs to about -<percentage> percent of the target (or possibly a little -higher in case of ties, etc.). -
--entropyP=1 -Adjust for entropy when qualifying HSPs in the x-drop extension method. -Those that score just slightly above the HSP threshold are adjusted downward -according to the entropy of their nucleotides, and any that then fall below -the threshold are discarded. -
--entropy=reportP=2 -Adjust for entropy when qualifying HSPs in the x-drop extension method, -and report (to stderr) any HSPs that are discarded as a result. -
--noentropyP=0 -Don't adjust for entropy when qualifying HSPs. -
Defaults: -By default seeds are extended to HSPs using x-drop extension, with entropy -adjustment. -

-If ‑‑match scoring is used, the -default x-drop termination threshold is 10 times the square root of the -mismatch penalty, rounded up to the nearest integer. Otherwise the default -is 10 times the A-vs.-A substitution score. -

-If ‑‑match scoring is used, the -default HSP score threshold is 30 times the match reward (equivalent to the -score of a 30-bp exact match). Otherwise the default is 3000. -

-‑‑help=defaults can be used -to see what values are set. -

- - - -

-

Chaining

-

- - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--chainC=1 or C=2 -Perform chaining of HSPs -with no penalties. -
--chain=<diag>,<anti>C=1 or C=2
- G=<diag>
- R=<anti>
-Perform chaining with the given penalties for diagonal and anti-diagonal in the -DP matrix. These are specified as positive values; -subtraction from the score is implicitly assumed. -
--nochainC=0 or C=3 -Skip the chaining stage. -
Defaults: -By default the chaining stage is skipped. -
- - - -

-

Gapped Extension

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--gappedC=0 or C=2 -Perform gapped extension of HSPs (or seeds, if gap-free extension is not -performed), after first reducing them to anchor points. -
--nogappedC=1 or C=3 -Skip the gapped extension stage. (This means that -interpolation must also be skipped, since -it is not allowed without gapped extension.) -
--ydrop=<dropoff>Y=<dropoff> -Set the threshold for terminating gapped extension; this restricts the -endpoints of each local alignment by limiting the local region around each -anchor in which extension is performed. The boundary of this region in the -DP matrix is formed by the points where the cumulative -score has dropped off by more than the given threshold from the maximum seen -so far. See the Gapped Extension stage for more -details. -
--noytrim -If y-drop extension encounters the end of the sequence, extend the alignment -to the end of the sequence rather than trimming it back to the location giving -the maximum score. This is highly recommended when either the target or query -sequences are short reads (say, less than 100 bases), to prevent -y-drop mismatch shadow. -
--gappedthresh=<score>L=<score> -Set the threshold for gapped extension; alignments scoring lower than -<score> are discarded. -When used along with the x-drop method for gap-free extension, this value is -generally set at least as high as the HSP threshold. Setting it lower has no -effect, since at worst the HSP itself would always qualify (both extension -stages use the same scoring matrix). -
--allgappedbounds -Revert to handling bounding alignments the way they were handled in BLASTZ. -This is discussed in -Bounding Alignments in the DP Matrix. -
Defaults: -By default gapped extension is performed, and alignment ends are trimmed -to the locations giving the maximum score. -

-If ‑‑match scoring is used, the -default y-drop threshold is twice the x-drop threshold (or if x-drop extension -was not performed, twice what the default x-drop threshold would have been); -otherwise it is the score of a 300-bp gap. -

-The default for the gapped score threshold is to use the same value as the -HSP threshold (which is settable via -‑‑hspthresh). If the HSP -threshold was adaptive, then the lowest-scoring -HSP that was kept is used for this default. If x-drop extension was not -performed, the value used is whatever the default HSP threshold would have been. -

-‑‑help=defaults can be used -to see what values are set. -

- - - -

-

Back-end Filtering

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--filter=identity:<min>[..<max>] -Filter alignments by their percent identity, -0 ≤ min ≤ max ≤ 100 percent. -Identity is the percentage of aligned bases that -are matches. Alignment blocks outside the given range are discarded. -This option is not valid with quantum DNA. -

-For backwards compatibility, ‑‑identity=<min>[..<max>] has the -same meaning. -

--filter=continuity:<min>[..<max>] -Filter alignments by how much of the input sequence aligns as matches or -mismatches, rather than gaps, -0 ≤ min ≤ max ≤ 100 percent. -Continuity is the percentage of alignment -columns that are not gaps. Alignment blocks outside the given range -are discarded. -

-For backwards compatibility, ‑‑continuity=<min>[..<max>] has the -same meaning. -

--filter=coverage:<min>[..<max>] -Filter alignments by how much of the input sequence they cover, -0 ≤ min ≤ max ≤ 100 percent. -Coverage is the percentage of the entire target -or query sequence (whichever is shorter) that is included in the alignment -block. Blocks outside the given range are discarded. -

-For backwards compatibility, ‑‑coverage=<min>[..<max>] has the -same meaning. -

--filter=nmatch:<min> -Filter alignments by how many bases match, requiring at least min -matched bases, min > 0. -Match count, or nmatch, is the number -of matched bases in the alignment. This option is not valid with -quantum DNA. -

-For backwards compatibility, ‑‑matchcount=<min> has the -same meaning. -

--filter=nmatch:<min>% -Filter alignments by how many bases match, with the threshold specified as a -percentage of the query length. -
--filter=nmismatch:0..<max> -Filter alignments by the number of mismatches, allowing no more than -max mismatched bases, -max ≥ 0. -Mismatch count, or nmismatch, is -the number of aligned bases in the alignment that are mismatches -(substitutions). This option is not valid with -quantum DNA. -
--filter=ngap:0..<max> -Filter alignments by the number of gaps, allowing no more than -max gaps, max ≥ 0. -Gap count, or ngap, is the -number of runs of gapped columns in the alignment (each run is counted as one -gap). -
--filter=cgap:0..<max> -Filter alignments by the number of gap columns, allowing no more than -max gaps, max ≥ 0. -Gap column count, or cgap, is the -number of gapped columns in the alignment (each column is counted as one gap). -
--notrivial -Do not output a trivial self-alignment block if the target and query sequences -are identical. Note that using ‑‑self -automatically enables this option. -
Defaults: -By default no back-end filtering is performed, and the trivial block is -included if the sequences happen to be identical. -
- - - -

-

Interpolation

-

- - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--inner=<score>H=<score> -Perform additional alignment between the gapped alignment blocks, using -(presumably) more sensitive alignment parameters. <score> -is used as the threshold for both the gap-free and gapped extension sub-stages; -see the discussion of interpolation for more -details. -

-This option is only valid if gapped extension is -performed. -

Defaults: -By default interpolation is not performed. -
- - - -

-

Output

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--output=<output_file> -Write the alignments to the specified file name instead of stdout. -
--format=<type> -Specifies the output format: -lav, -lav+text, -axt, -axt+, -maf, -maf+, -maf-, -sam, -softsam, -sam-, -softsam-, -cigar, -BLASTN, -differences, -rdotplot, -text, -general[:<fields>], -or -general-[:<fields>]. -

-‑‑format=none can be used when no alignment output is desired. -

--rdotplot=<file> -Create an additional output file suitable for plotting the alignment blocks -with the R statistical package. The -output file is the same as would be produced by -‑‑format=rdotplot, but this option -allows you to create the dotplot file without having to run the alignment twice. -
--readgroup=<tags> -Used in conjunction with the SAM file format, allowing -the specification of tags for SAM's ‑RG header line. -<tags> is a tab-delimited list of -<tag>:<value> items. See the SAM specification for -details about which tags are required. LASTZ does not validate whether the -list is a valid SAM tag list. -

-Since the list is tab-delimited, you may need to surround this option with -quotes to satisfy the command line shell. Alternately, you can use ---readgroup more than once, and the lists are concatenated. -

--markend -Just before normal completion, write a marker line -
-    # lastz end-of-file
-
-to the output file. This option can be useful with pipelines or batch servers, -where there may be a question as to whether or not LASTZ completed successfully. -Note that in some output formats this marker is not a legal line, in which case -you must remove it before any downstream processing. -
--census[=<output_file>]c=1 -Count and report how many times each target base aligns, up to 255. -Ns are included in the count (both bases that are Ns -and bases aligning to Ns), and even bases aligning to gaps are -counted. Requires one byte of memory for each target location. -

-For any of the lav formats, if <output_file> -is omitted the census is included as a special stanza in the output. -For all other formats <output_file> is mandatory. -

-

--census16[=<output_file>] -Count and report how many times each target base aligns, up to ≈65 -thousand. Requires two bytes of memory for each target location. -
--census32[=<output_file>] -Count and report how many times each target base aligns, up to ≈4 -billion. Requires four bytes of memory for each target location. -
--nocensusc=0 -Do not report a census of aligning bases. -
--outputmasking=<file> -Used in conjunction with the -‑‑masking=<count> option. -The masked target intervals, resulting from alignment with all queries, are -written to a file in -sequence masking file format. The file is suitable -for later use with the -softmask, -xmask, and -nmask sequence specifier actions. -

In contrast with -‑‑outputmasking:soft=<file>, -only those intervals created by the -‑‑masking=<count> option -are reported. -

--outputmasking+=<file> -The same as -‑‑outputmasking=<file>, -except that masked intervals are written to a file in -three field sequence masking file format, which -includes sequence names. The file is not suitable for later use as -input to LASTZ. -

-This is useful when the target file contains more than one sequence. -

--outputmasking:soft=<file> -Soft-masked target intervals (lowercase bases) are written to a file in -sequence masking file format. The file is suitable -for later use with the -softmask, -xmask, and -nmask sequence specifier actions. -

In contrast with -‑‑outputmasking=<file>, -all masked intervals in the target sequence are reported, regardless of whether -they were created by the -‑‑masking=<count> option -or were in the sequence as it was originally input. -

--outputmasking+:soft=<file> -The same as -‑‑outputmasking:soft=<file>, -except that masked intervals are written to a file in -three field sequence masking file format, which -includes sequence names. The file is not suitable for later use as -input to LASTZ. -

-This is useful when the target file contains more than one sequence. -

--tableonly -Just write out the target seed word position table and quit; don't search for -seeds or perform any subsequent stages. -
--tableonly=count -Just write out the target word count table and quit; don't search for seeds or -perform any subsequent stages. -
--writecapsule=<capsule_file> -Just write out a target capsule file and quit; don't -search for seeds or perform any subsequent stages. The capsule file contains -the target sequence, -the seed, the target seed word position table, -and other related information. -
--writesegments=<segment_file> -Write out alignments as segments, in the same format -used for input by the ‑‑segments -option. These anchor segments can then be used to anchor alignments -in a subsequent run of LASTZ. This can be useful if you want to filter HSPs in -some way before performing gapped extension, for example filtering them by -length. Since anchor segments must be gap-free, this option cannot be used in -conjunction with gapped extension. - - -

- - -

--progress[=<N>] -Report the count and name of every Nth query to stderr, as -processing begins on that query. If N is omitted, every query is reported. -
--progress+masking[=<N>] -Report the count and name of every Nth query to stderr, with -statistics relating to dynamic masking, as -processing begins on that query. If N is omitted, every query is reported. -
--show=defaults -List the option values lastz is using. This can be helpful if you are unsure -what the default value is for most common settings. -

-This gives the same information as -‑‑help=defaults, but writes -them to the output file. For some formats, this renders the output file as -non-conformant. -

Defaults: -By default alignments are written to stdout in lav -format, no census is reported, and no target table or capsule is written out. -
- - - -

-

Housekeeping

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionBLASTZ equivalentMeaning
--include=<file> -Read arguments from a text file. The arguments are parsed the same as they -would be from the command-line, with the exception that they may appear on -multiple lines in the file. ‑‑include can be used in conjunction -with other command line arguments. -

-Note that any shell-performed substitutions that would be performed on the -command line are not performed on the contents of the text file. -

--allocate:traceback=<bytes>m=<bytes> -Set the amount of memory to allocate (in RAM) for trace-back information during -the gapped extension stage. <bytes> may contain an -M or K unit suffix if desired (indicating a -multiplier of 1,048,576 or 1,024, respectively). For example, -‑‑allocate:traceback=80.0M is the same as -‑‑allocate:traceback=83886080. -

-For backwards compatibility, ‑‑traceback=<bytes> is also -accepted. -

--allocate:target=<bytes> -Predict the amount of memory (in RAM) that will be needed for target sequence -data. Normally LASTZ incrementally predicts the amount of memory needed as it -parses the file. In some instances that incremental allocation can lead to -memory overuse (depending on details of how the operating system handles memory -allocation). Predicting the memory needed prevents that. -

-The memory needed for a sequence is L+1, where -L is the length of the sequence. When -multiple is used, the total memory -needed is the sum of that needed for each sequence. -

--allocate:query=<bytes> -Predict the amount of memory (in RAM) that will be needed for query sequence -data. See -‑‑allocate:target for further -details. -

-The memory needed for a sequence is L+1, where -L is the length of the sequence. When the query file contains -more than one sequence and -multiple is not used, the -memory needed is that needed for the longest sequence. -

--action:target=<action> -Set a sequence specifier action for the target. This -is an alternative to appending the action to the target filename, and is useful -for shells that make using square brackets problematic. -

-This can be used more than once in the command line; the actions are all -applied. -

--action:query=<action> -Set a sequence specifier action for the query. This -is an alternative to appending the action to the query filename, and is useful -for shells that make using square brackets problematic. -

-This can be used more than once in the command line; the actions are all -applied. -

--word=<bits> -Set the maximum number of bits for the word hash. Use this to spend less -memory (in exchange for more time) and thereby avoid thrashing for heavy seeds. -
Defaults: -The default traceback space is 80.0M, -target and query memory is allocated as needed, -and the default word hash is 28 bits. -
- - - -

-

Shortcuts for Yasra

-

-There are several shortcut options to support the -Yasra mapping assembler. These -provide canned sets of option settings that work well for aligning an assembled -reference sequence (as the target) with a set of shotgun reads (as the query). -They are selected based on the expected level of identity between the sequences. -For example, ‑‑yasra90 should be used when we expect 90% identity. -The ‑‑yasraXXshort options are appropriate when the reads are very -short (less than 50 bp). - -

- - - - - - - - - -
Option Equivalent
--yasra98 T=2 Z=20 ‑‑match=1,6 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:98 ‑‑ambiguous=n ‑‑noytrim
--yasra95 T=2 Z=20 ‑‑match=1,5 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:95 ‑‑ambiguous=n ‑‑noytrim
--yasra90 T=2 Z=20 ‑‑match=1,5 O=6 E=1 Y=20 K=22 L=30 ‑‑filter=identity:90 ‑‑ambiguous=n ‑‑noytrim
--yasra85 T=2      ‑‑match=1,2O=4 E=1 Y=20 K=22 L=30 ‑‑filter=identity:85 ‑‑ambiguous=n ‑‑noytrim
--yasra75 T=2      ‑‑match=1,1O=3 E=1 Y=20 K=22 L=30 ‑‑filter=identity:75 ‑‑ambiguous=n ‑‑noytrim
--yasra95shortT=2      ‑‑match=1,7O=6 E=1 Y=14 K=10 L=14 ‑‑filter=identity:95 ‑‑ambiguous=n ‑‑noytrim
--yasra85shortT=2      ‑‑match=1,3O=4 E=1 Y=14 K=11 L=14 ‑‑filter=identity:85 ‑‑ambiguous=n ‑‑noytrim
- -

-Occasionally, newer releases of LASTZ change the Yasra shortcut options. This -is done as an improvement, so most users will want to use the shortcuts shown -above. However, in order to support backward compatibility for users that want -to reproduce previous results, all previous versions of the shortcuts are -included. The syntax is ‑‑<shortcut>:<version>, where -<version> is the LASTZ version number that contained the -shortcut. - -

- - - - - - - - - -
Option LASTZ version Equivalent
--yasra98:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,6 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:98
--yasra95:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,5 O=8 E=1 Y=20 K=22 L=30 ‑‑filter=identity:95
--yasra90:<version> 1.02.45 or earlierT=2 Z=20 ‑‑match=1,5 O=6 E=1 Y=20 K=22 L=30 ‑‑filter=identity:90
--yasra85:<version> 1.02.45 or earlierT=2      ‑‑match=1,2O=4 E=1 Y=20 K=22 L=30 ‑‑filter=identity:85
--yasra75:<version> 1.02.45 or earlierT=2      ‑‑match=1,1O=3 E=1 Y=20 K=22 L=30 ‑‑filter=identity:75
--yasra95short:<version>1.02.45 or earlierT=2      ‑‑match=1,7O=6 E=1 Y=14 K=10 L=14 ‑‑filter=identity:95
--yasra85short:<version>1.02.45 or earlierT=2      ‑‑match=1,3O=4 E=1 Y=14 K=11 L=14 ‑‑filter=identity:85
- - - -

-

Help

-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionMeaning
--version -Report the program version and quit. -
--help -List all options. -
--help=defaults -List the option values lastz would use given the rest of the command line. -This can be helpful if you are unsure what the default value is for most common -settings. -

-This gives the same information as -‑‑show=defaults. -

--help=files -Describe the syntax for sequence specifiers. -
--help=formats -Describe the available output formats. -
--help=shortcuts -List BLASTZ-compatible shortcuts. -
--help=yasra -List Yasra-specific shortcuts. -
- - - -

-

Sequence Specifiers

- -

-A target or query sequence specifier normally just indicates a file to be -used in the alignment; however various pre-processing actions can also be -specified. These are performed as the sequences are read from the file, -and may include selecting a particular sequence and/or subrange, masking, -adjusting sequence names, etc. - -

-The format of a sequence specifier is -

-    <file_name>[[<actions>]]*
-
- -

-The <file_name> field is required; the actions list is -optional. Note that the <actions> are enclosed in literal -square brackets (in addition to the meta ones that just indicate they are -optional), and consist of a comma-separated list (with no spaces), e.g. -[action1,action2,...]. The * indicates that -several action lists can be appended; they are treated the same as if they were -in a single list. - -

-Alternatively, actions can be specified with the commands -‑‑action:target=<action> -and -‑‑action:query=<action>. -This allows actions to be set without using square brackets (square brackets -are problematic in some command shells). - -

-Note that the actions apply to every sequence in the file. For example, if you -specify a subrange of, say, [100..], you will skip the first 99 bp -in every sequence. - -

-The following actions are supported: -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ActionMeaning
<subrange> -Only a subrange of the sequence is processed. The usual form of a subrange -is [<start>]..[<end>]. Either -<start> or <end> can be omitted, in which -case the start or end of the sequence is used. Subrange indices begin with 1 -and are inclusive -(i.e., they use the origin-one, closed position -numbering system). For example, 201..300 is a 100-bp subrange -that skips the first 200 bp in the sequence. -

-For BLASTZ compatibility, the alternative syntax -<start>,<end> is also recognized. In this case -both <start> and <end> are required. - -

-A “zoom out factor” can also be included, using the syntax -<start>..<end>+<zoom>%. The specified interval -is expanded on each end by <zoom> percent. This is useful -when you know, for example, the location of a gene, and would like to include -flanking regions in the alignment. - -

-Another useful syntax for this is <start>#<length>, -which is handy for specifying an interval of known length at a given position; -it is equivalent to <start>..<start+length−1>. -Similarly, <center>^<length> specifies an interval -of known length centered at the given position. Large lengths can be -specified using M or K units if desired, e.g. -10.2M. -

-Additionally, if a subrange has <start> larger than -<end>, the reverse complement of the extracted region is -used. However, this can lead to non-obvious interactions with other features -such as strand reporting, sequence masking, and segment files, so it should -be used with care. Usually it is simpler to use the -‑‑strand options instead. -

-Note that subrange positions are always measured from the start of the -sequence provided in the file (i.e., counting along the -forward strand), even if the sequence is being reverse complemented. -

multiple -The file’s sequences are internally treated as a single sequence. This -action is required when the target (not the query) is comprised of multiple -sequences. -

-There is rarely any reason to use the multiple action for the -query file. Doing so can negatively affect memory use and run time. -

separator=<character> -The file’s sequences are internally broken in pieces wherever the -specified <character> occurs, so that alignments will not -cross that separator. The separation action is performed after any masking -action -(such as xmask or -nmask), so it is possible to use the -masking operation to mark the sequence with separators. -

-The character can be any printable ASCII character. However, characters that -are important in the input format being used (for example a “>” -in fasta) should not be used for this purpose. Moreover, many input formats -have limited capability to represent characters other than nucleotides. There -is no error checking regarding the specified <character> -— if that character does not occur at all in the input, no separation is -performed. -

-See Non-ACGT Characters, Splicing, and Separation -for further details. -

subset=<names_file> -Process only a specified subset of the sequences in the file. -<names_file> is the name of a -file containing a list of desired -sequence names; only these sequences will be processed. The names can be -piped in by specifying /dev/stdin as the file. This action is -only valid for FASTA, 2Bit, -or HSX input files. -
subsample=<k>/<n> -Process only the kth sequence of every group of n -sequences. k ranges from 1 to n. This action is -only valid for FASTA, 2Bit, -or HSX input files. -
chores=<chores_file> -Restrict alignment to a list of subintervals. This is equivalent to the -‑‑chores=<chores_file> -option. -
unmask -Convert any lowercase bases to uppercase. Lowercase bases usually indicate -instances of biological repeats, and are excluded from the seeding stage -of the alignment process. -
softmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -lowercase equivalents. Lowercase bases usually represent instances of -biological repeats, and are excluded from the seeding stage of the alignment -process but not from later stages. -Note that soft masking is performed after any unmasking. -
softmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -lowercase equivalents. Any base not in one of the specified intervals -is replaced, and thereby excluded from the seeding stage (but not later stages) -of the alignment process. -
xmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -Xs. (Note that this always masks with actual Xs, -even if the scoring file specifies a different -character as "bad".) See -Non-ACGT Characters, Splicing, and Separation for -information on how Xs affect the alignment process. -
xmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -Xs. Any base not in one of the specified intervals is -replaced. -
nmask=<mask_file> -Mask the segments specified in -<mask_file> by replacing them with -Ns. See -Non-ACGT Characters, Splicing, and Separation for -information on how Ns affect the alignment process. -
nmask=keep:<mask_file> -Mask the segments not specified in -<mask_file> by replacing them with -Ns. Any base not in one of the specified intervals is -replaced. -
nameparse=full - -Report full sequence names in the output, instead of short names. As described -in Sequence Name Mangling, LASTZ normally shortens -FASTA and 2Bit sequence names -in an attempt to include only the distinguishing core of the name. This action -is provided in case LASTZ’s choice of names is not helpful. It is only -valid for FASTA or 2Bit -sequence files. -
nameparse=darkspace -Extract the first word from the sequence header line, keeping only a -non-whitespace string. If the first word is a filename, any directory/folder -information is discarded. See -Sequence Name Mangling for more information on how -the name used for output is derived. -This action is currently only valid for FASTA or -2Bit sequence files. -
nameparse=alphanum -Extract the first word from the sequence header line, keeping only an -alphanumeric string. If the first word is a filename, any directory/folder -information is discarded; then the name is truncated at the first character -that is not a letter, digit, or underscore. See -Sequence Name Mangling for more information on how -the name used for output is derived. -This action is currently only valid for FASTA -or 2Bit sequence files. -
nameparse=tag:<marker> -Use the specified marker to extract a short name from the sequence header line. -For example, nameparse=tag:foo will look for the string -foo in the header line, and copy the name from the text following -that, up to the next non-alphanumeric character. See -Sequence Name Mangling for more information on how -the name used for output is derived. This action is only valid for -FASTA or 2Bit sequence files. -
nickname=<name> -Ignore any sequence names in the input file, instead using -<name> in the output. See -Sequence Name Mangling for more information on -how the name used for output is derived. -
namejoin -Replace any spaces in the name with underscores. This is applied after the -effect of any nameparse action. It is most useful with -nameparse=full, and when the output format is such that having -spaces in names is problematic. -
quantum -The sequence contains quantum DNA. -Note that this changes the game significantly, and many of LASTZ’s other -actions and options are not valid with quantum sequences. Operations such as -reverse complement, masking, special treatment of Ns and -Xs, seeding options that need to recognize -matches / transitions / transversions, and computation of percent -identity do not apply because of the arbitrary quantum alphabet and the ability -of its symbols to encode ambiguity. -
quantum=<code_file> -The sequence contains quantum DNA corresponding to -the specified <code_file>, which -assigns nucleotide probabilities for the quantum alphabet. These are only used -to augment the display of alignment blocks in the -Human-Readable Text output format. -
- -

-In addition to the sequence specifier syntax shown above, LASTZ supports a -more complicated syntax. This is to maintain compatibility with BLASTZ and -early versions of LASTZ. All of the functionality described here can be -performed using the newer syntax above. - -

-The complete format of a sequence specifier is -

-    [<nickname>::]<file_name>[/<select_name>][{<mask_file>}][[<actions>]][-]
-
- -

-As with the simpler syntax, the <file_name> field is -required; all other fields are optional. The <file_name> -and <actions> fields have the same meaning as in the simpler -syntax. - -

-<nickname>:: is equivalent to the <name> -field in the nickname=<name> action. - -

-/<select_name> is only valid for the -2Bit file format, and only when the file name ends with -".2bit". It specifies a single sequence from the file to use, rather than all -sequences. This is similar to the subset=<names_file> -action, except that here a single sequence name is given instead of a file of -names. Note that the name must match the mangled -sequence name extracted from the file. - -

-{<mask_file>} is identical to the -xmask=<mask_file> action. - -

-A - (minus sign) is equivalent to swapping the endpoints in the -<subrange> action; it causes the reverse complement of the -sequence to be used instead of the sequence itself. Again, this should be -used with care, as it can lead to murky interactions with other features. -In BLASTZ it was needed for searching only the minus strand, but LASTZ provides -a ‑‑strand option for that. - - - - - - -


-
-

Processing Stages in Detail

- - - -
-

Target Sequence Input

- -

-The target sequence is read at the beginning and kept in memory throughout -the run of the program. Actions such as masking, unmasking, or reverse -complement are applied when the file is read. If there are multiple sequences -in the target file, they are treated internally as one long sequence (you must -use the multiple action in the -target file’s sequence specifier to enable this). - -

-In contrast, queries are processed individually and sequentially. Each query -sequence is read just before its seeding stage. The seeding through output -stages are performed, comparing the query to the target. Then by default, the -same stages are repeated to compare the reverse complement of the query to the -target, before moving on to the next sequence in the query file. - - - -

-

Scoring Inference

- -

-Scoring inference is not normally performed. As described in -Inferring Score Sets, LASTZ can iteratively -perform the complete alignment process on the target and query, to derive a -suitable scoring set. This is only available for special builds of LASTZ, and -will usually be too time-consuming to perform for all sequences being aligned. -The typical application is to use it once on some sample sequences from the -species of interest, save the scoring file, then use that scoring file for -subsequent alignments. - - - -

-

Indexing Target Seed Words

- -

-This pre-processing stage parses the target sequence(s) into overlapping -seed words of some constant length (you can think of these as -12-mers; the actual length is determined by the seed pattern). Each word is -converted to a number, called the packed seed word, according to the -specified seed pattern. These (word, position) -pairs are collected into the target seed word position table. -Conceptually, this table is a mapping from a packed seed word to a list of the -target sequence positions where that seed word occurs. - -

-This table is one of the major space requirements of the program. Both the -memory and time required for seeding can be decreased by using sparse spacing. -The ‑‑step option sets a -step size: instead of examining every position, seed words are -stored only for multiples of the step size. Large step sizes (say, -‑‑step=100) incur a loss of sensitivity, at least at the seeding -stage. However, to discover any gapped alignment block we only need to -discover one seed (of many) in that alignment, so the actual sensitivity loss -is small in most cases. Section 6.2 of [Harris 2007] -discusses some experimental results on the effect of step size on the end -result. - -

-The presence of biological repeats in the target and query can also be -addressed during the building of the position table. A large number of repeats -can adversely affect the speed of the program, by increasing the number of -irrelevant alignments the program considers in the early stages. LASTZ has -three techniques for dealing with repeats. -

-

    -
  1. Bases in the target and/or query sequences can be marked as repeats in -advance, by using lower case. Target and query words containing lower case -bases are left out of the seed word position table and skipped during seeding, -respectively, so they do not participate in the seeding stage. -
  2. If repeat locations are not known, the option -‑‑maxwordcount can be used to remove -frequently occurring target seed words from the position table before query -processing begins. -
  3. Dynamic masking (‑‑masking) can -be used to mask target positions that have occurred in too many alignments; -however this only affects subsequent query sequences. -
- - - -
-

Seeding

- -

-Seeds are short near-matches between target and query sequences. -They identify likely regions of homology that warrant further investigation, -and serve as starting points for bootstrapping the alignment process. "Short" -typically means less than 20 bp. Early alignment programs used exact matches -(e.g. of length 12) as seeds, but more recent programs have used spaced seeds -(these are described in more detail in the Seed -Patterns section). For the purposes of this section, a seed can be -thought of as a 12-mer exact match. - -

-To locate seeds, the query sequence is parsed into seed words the same -way the target is (except that -‑‑step does not apply to the query; -we look at every seed word). -Each packed seed word is used as an index into the target seed word position -table to find the target positions that have a seed match for this -query position. Query seed words containing lower case bases are skipped, so -that repeats will not participate in the seeding stage. - -

-

Quantum Seeding:

-For alignments with quantum DNA it is not possible to -do a direct lookup into the target seed word position table. The position -table is for DNA words (consisting of A, C, -G, and T), whereas the query consists of symbols from -an arbitrary alphabet. The quantum sequence is parsed into seed words as -before, but instead of a direct lookup, each word, called a q-word, -is first converted to a quantum seeding ball of those DNA words that -are most similar to it. Similarity is determined by the scoring matrix; all -words with a combined substitution score above the quantum seeding threshold -(set by the ‑‑ball option) are -considered to be in the ball. Then each word in the ball is looked up in the -target seed word position table as usual, with all such hits considered to be -seed matches for the q-word. -

-The quantum seeding threshold can also be set as a percentage of the maximum -word score possible. If an exact match seed is used, the maximum word score is -the highest value in the substitution matrix multiplied by the seed length. If -a spaced seed is used, the multiplier is the number of 1 positions -in the pattern. -

-Note that the seeding options that provide -special treatment for transitions (Ts in the seed pattern, -half-weight seeds, allowing one or two match positions to be transitions, etc.) -are not supported for quantum alignments. These options would make -the quantum seeding procedure more complex, and are not really necessary -because the quantum mechanism itself provides an alternative way to increase -the alignment sensitivity. Also note that q-words containing lower case bases -are not discarded, since the quantum alphabet is arbitrary and many -ASCII bytes do not even have upper/lowercase versions. - - - -

-

Gap-free Extension

- -

- - -

-In this stage, each seed is extended without allowing gaps to determine -whether it is part of a high-scoring segment pair (HSP). The seed is extended -along its DP matrix diagonal independently in both -directions according to an extension rule, currently either -exact match, M-mismatch, or x-drop. - -

-Exact match extension (‑‑exact) simply -extends the seed until a mismatch is found. If the resulting length is enough, -the extended seed is kept as an HSP for further processing. Exact match -extension is most useful when the target and query are expected to be very -similar, e.g. when aligning short reads to a similar reference genome. - -

-M-mismatch extension -(‑‑<M>mismatch) extends the -seed to find the longest interval that includes the entire seed and contains -no more than M mismatches. If the resulting length is enough, -the extended seed is kept as an HSP for further processing. M-mismatch -extension is most useful when the approximate divergence between the target -and query is known, and HSPs of a known length are desired. -It provides a way to specify both length and identity thresholds together, -with more flexibility than ‑‑exact. - -

-In x-drop extension (‑‑xdrop), as we -extend in each direction we track the cumulative score for the extended match -according to the substitution scoring matrix. The extension is stopped when -the score drops off by more than the given x-drop threshold; that is, when the -difference between the peak score seen so far and the current score is more -than <dropoff>. -(Another way to think of it is that the segment ends when a section scoring -worse than −<dropoff> is encountered.) -The extension is then trimmed back to the peak point. If the combined score -of the seed plus both extensions meets the threshold set by the -‑‑hspthresh option, it qualifies -as an HSP and is kept for further processing. Matches that do not meet the -score threshold are discarded. -The ‑‑entropy options control -whether or not the scores are adjusted for nucleotide entropy when they are -compared to the threshold. - -

-

Adaptive Score Threshold:

-Often it is not clear in advance what value to use for the x-drop method’s -HSP score threshold — set it too high and hardly anything will align, but -too low and the program will be swamped and not finish. LASTZ’s adaptive -scoring options -(‑‑hspthresh=top<basecount> -and -‑‑hspthresh=top<percentage>%) -allow you to set the threshold indirectly to align the desired amount of the -target (as an approximate number of bases or as a percentage, respectively). -This way you can set it for, say, 10% (which will run quickly regardless of the -data), then examine the scores in those results and make an informed choice for -your real threshold. - -

Diagonal Hashing:

-LASTZ includes a time and space optimization that deals with multiple seeds in -the same HSP. The number of seeds in an HSP is generally proportional to both -the length of the HSP and the similarity of the sequences being compared. For -long HSPs or very similar sequences, performing extension over and over for -many seeds in the same HSP would adversely affect the run time. To prevent -this, LASTZ maintains a diagonal extent table that tracks the latest -seed extension on each diagonal (only the latest is needed because of the way -the seeds are sorted). As new seeds "arrive", if they overlap an earlier -extension, they are simply ignored. While this saves time, a direct -implementation could require a lot of space. For two human chromosomes of size -250M bp, the DP matrix has 500 million diagonals, and -storing one position for each diagonal would require 2G bytes. To save memory, -LASTZ hashes diagonals to 16-bit values and tracks extensions only by the hash -value. While this saves space, it results in a minuscule loss of sensitivity -— LASTZ may miss some seeds due to hash collisions. Using -‑‑recoverseeds will prevent losing -these seeds, but will slow the program significantly. Moreover, since most -true alignments contain many HSPs, with many seeds in each HSP, the vast -majority of lost seeds have no effect on the final results. - - - -
-

HSP Chaining

- - - -

-The chaining stage aims to find a series of HSPs that forms a high-scoring path -through the DP matrix, aligning as much as possible while -avoiding backtracking in either sequence. Conceptually it does this by -examining all combinations of HSPs and scoring the chains according to the -relative positions of the HSPs (e.g. the distances between them along the -diagonal and anti-diagonal) as well as their individual scores. All HSPs not -in the highest-scoring chain are discarded. - -

-Ideally this process selects the "real" alignments, filtering out noise (such -as extra alignments due to repeats), and producing a set of HSPs where each -base is aligned at most once; however this is not guaranteed. LASTZ’s -implementation is primarily intended for the case where elements are -known to appear in the same relative order and orientation in the query as in -the target. (However, note that because the forward and reverse strands are -processed in separate pipelines, it will not necessarily cause inversions to be -discarded.) If LASTZ’s implementation of chaining is not suitable, it is -possible to substitute another chaining program by first running LASTZ with the -‑‑nogapped and -‑‑writesegments -options to get the HSPs, running a separate chaining program to filter them, -and then running the final stages of LASTZ on that output via the -‑‑segments option. - -

-Figure 5(a) shows an alignment without chaining, while 5(b) shows the same -alignment with chaining. - -

-

- - -
-Figure 5(a) -

-without chaining -

-
-
-lastz target query --nochain
-
-
-

-Figure 5(b) -

-with chaining -

-
-
-lastz target query --chain
-
-
-

- - - -

-

Gapped Extension

- -

-Before the HSPs are extended further by allowing gaps, each HSP is first -reduced to a single anchor point; -this allows for the possibility that the optimal alignment may include gaps -within the region occupied by the HSP. The gap-free HSP is only an indication -of likely homology in that vicinity; other paths through the same region that -allow gaps may have a higher score, so we don't want to just extend from the -ends of the HSP. Instead we run the gapped algorithm from a single point that -we think is most likely to lie on the optimal path, namely the middle of the -highest-scoring 31-bp interval in the HSP. A more general (and expensive) -approach would be to examine all paths through the square region defined by the -HSP, instead of starting from a single anchor point. - -

-Figure 6(a) illustrates the relationship of seeds, HSPs, and anchors. Heavy -lines are seeds, which were extended without gaps (see Overview) to create HSPs (thin lines). Blue dots are anchors. Seeds with -no HSP shown (gray lines) had low-scoring extensions and were discarded at the -gap-free extension stage. - -

- - -
-Figure 6(a) -

-seeds, HSPs, and anchors -

-Figure 6(b) -

-anchors and gapped extensions -

- -

-The anchors are then processed in the order of their HSP’s score (highest -first). Gapped extension is performed -independently in both directions from the anchor point, and the two resulting -alignments are joined at the anchor. If the total score meets the threshold -specified by the ‑‑gappedthresh -option, the joined alignment is kept and passed to the next stage; otherwise it -is discarded. If the extension from one anchor happens to go through one or -more other anchors, the redundant anchors are dropped from the list. - -

-Figure 6(b) shows the relationship of anchors and their gapped extensions. -The blue dots are the anchors from 6(a), which are extended in both directions -to form gapped alignments (squiggly lines; the gaps are too small to be visible -at this scale). One anchor had low-scoring extensions that did not meet the -threshold. Another had an extension that ran directly through a nearby anchor; -that anchor did not need to be processed separately. - -

-The gapped extensions are computed using a typical -dynamic programming recurrence for affine gap alignment -(e.g. [Myers 1989] or -[Gusfield 1997]), beginning at the anchor and -terminating at the point with the highest cumulative score. The portion of -the DP matrix examined is reduced by disallowing low-scoring regions (see -[Zhang 1998]): wherever the alignment score drops -below the peak score seen so far by more than the threshold specified in the -‑‑ydrop option, the DP matrix is -truncated and no further cells are computed along that row or column. -By default the extension is then trimmed back to the location of the peak -score; thus the extension normally ends when all remaining sub-alignment -possibilities (paths in the DP matrix) begin with sections that score worse -than −<dropoff>. However for alignments -where the extension reaches the end of the sequence, you can suppress this -trimming by specifying the ‑‑noytrim -option, which is recommended when aligning short reads. - -

-Figure 7 shows the effect of the y-drop threshold in more detail. Extension -is performed in two directions from the anchor (in this example, to the upper -right and lower left, because both sequences are on the positive strand). -The gray region is the portion of the DP matrix explored by the extension -algorithm; its boundary is formed by the points where the score dropped from -the maximum by more than the y-drop threshold. - -

- - -
-Figure 7 -

-effect of y-drop -

- - - -

-

Back-end Filtering

- -

-Whatever alignment blocks have made it through the above gauntlet are then -subjected to -identity, continuity, coverage, match count, mismatch count, gap count and -gap column count filtering (as specified by the -‑‑filter=identity, -‑‑filter=continuity, -‑‑filter=coverage, -‑‑filter=nmatch, -‑‑filter=nmismatch, -‑‑filter=ngap and -‑‑filter=cgap options, -respectively). Blocks that do not meet the specified range for each feature are -discarded. - -

-

-Identity is the fraction of aligned bases (excluding columns -containing gaps or non-ACGT characters) that are -matches, expressed as a percentage. The numerator is the number of matches in -the alignment block, while the denominator is the number of matches plus the -number of mismatches. -Characters that differ only in upper vs. lower case are -counted as matches. Columns containing gaps or non-ACGT characters play no -part in this computation, and it is independent of the settings for -‑‑ambiguous=n and -bad_score. Identity cannot -be determined for alignments with quantum DNA, because -of the potential ambiguity of the symbols. - -

-

-Continuity is the fraction of alignment columns that do not contain -gaps, expressed as a percentage. The numerator is the number of matches plus -mismatches in the alignment block, while the denominator is the number of -columns. Unlike the computation of identity, here "matches plus mismatches" -includes all non-gap columns regardless of whether they contain non-ACGT -characters. - -

-

-Coverage is the fraction of bases in the entire input sequence -(target or query, whichever is shorter) that are included in the alignment -block, expressed as a percentage. Such bases are aligned in the block to -either bases or gaps in the other sequence. Note that if there are multiple -sequences in the target and/or query, only the current one is considered; -however if an input sequence is spliced with runs of Ns or -Xs, then the combination of all its subsequences (including the -splice characters between them) is considered as one input sequence, because -LASTZ does not explicitly recognize the splicing. -Further, if a separator character is used, -again the combination of all subsequences is considered as one input sequence -(including the separator characters). Also note that each block’s -coverage is computed independently of other blocks, and each must meet any -specified filter range by itself; blocks cannot be combined to meet coverage -requirements. - -

-

-Match Count, or nmatch, is the number of matched bases in -the alignment. Characters that differ only in upper vs. lower case are counted -as matches; columns containing gaps or non-ACGT characters are not. Match -count cannot be determined for alignments with quantum -DNA, because of the potential ambiguity of the symbols. - -

-

-Mismatch Count, or nmismatch, is the number of aligned -bases in the alignment that are not matches. This includes substitutions as -well as non-ACGT characters (even if they are identical), but not gaps. -Mismatch count cannot be determined for alignments with -quantum DNA, because of the potential ambiguity of the -symbols. - -

-

-Gap Count, or ngap, is the number of gaps in the block, -counting each run of gapped columns as a single gap. - -

-

-Gap Column Count, or cgap, is the number of gaps in the -block, counting each gapped column as a separate gap. - - - -

-

Interpolation

- -

-Once the above stages have been performed, it is not uncommon to have regions -left over in which no alignment has been found. In the interpolation stage -(activated by the ‑‑inner option) we -repeat the seeding through gapped extension stages in these leftover regions, -at a presumably higher sensitivity. Using such high sensitivity from the -outset would be computationally prohibitive (due to the excessive number of -false, low-scoring matches), but is feasible on the smaller, leftover regions. - -

-Another complete alignment round (seeding, gap-free extension, chaining, and -gapped extension, even if some of these were skipped in the main alignment; -but not back-end filtering) is performed in the small areas between the -alignment blocks found in the preceding main alignment stage. Only regions -within 20K bp from the endpoints of the passed-in alignment blocks are searched. -Seeding for this alignment requires a 7-bp exact match with no transitions, and -uses the specified scoring threshold for both its gap-free and gapped extension -sub-stages. (This threshold should generally be set lower than the -corresponding ones in the main alignment, in order to increase the sensitivity -of the interpolation.) All other parameters are the same as those used for the -main alignment stages. - -

-Figure 8 shows the operation in more detail. The alignment blocks resulting -from gapped extension are shown in 8(a) as squiggly lines. After interpolation, -in 8(b), additional alignment blocks have been discovered in the red areas. -Note that there are still some holes remaining, where these sequences just -don't align well. - -

- - -
-Figure 8(a) -

-before interpolation -

-
-
-lastz target query
-
-
-

-Figure 8(b) -

-after interpolation -

-
-
-lastz target query --inner=1000
-
-
-

- - - -

-

Alignment Output

- -

-The alignment blocks found by the preceding pipeline of stages are written to -stdout (or to a file specified with the -‑‑output option) in the requested -format. -These may be seeds, gap-free HSPs, or gapped local alignments, depending on -which stages were performed. There is no particular order to the alignment -blocks for an individual query sequence (e.g. they are not sorted by -score or position). However, since the query sequences are processed serially, -the blocks for each one will appear together in the output. - - - - - - -


-
-

File Formats

- -

-LASTZ typically receives two sequence files and possibly a scoring file as -inputs, and produces an alignment file as output. -

-DNA sequences can be provided in FASTA, -FASTQ, -Nib, or 2Bit format, or -indirectly via an HSX index. These -sequences contain a series of A, C, G, -T, and N characters in upper or lower case. -Lower case indicates repeat-masked bases, while Ns represent -unknown bases if the ‑‑ambiguous=n -option is specified. (By default, a run of Ns or Xs -is used to separate sequences that have been catenated together for processing, -but this is now deprecated; see -Non-ACGT Characters, Splicing, and Separation -for a discussion of the use of Ns and Xs.) As an -alternative to DNA sequence, quantum DNA using an -abstract alphabet can be used as the query -(but not as the target). -

-The FASTA, FASTQ, 2Bit and HSX formats support more than one sequence within -the same file. -Files containing multiple sequences are normally only used as the query; -however invoking the multiple -action in the file’s sequence specifier allows -them to be used for the target as well. Also, the -subset action allows one or more -sequences to be selected from such a file. -

-The FASTQ format carries base-calling quality values as well as DNA. - - - -

-

FASTA (sequence input)

- -

-FASTA format stores DNA sequences as plain text. The first line begins with -a > followed by the name of the sequence, and all subsequent -lines contain nucleotide characters. The lines can be of any length. -If the file contains multiple sequences, each should start with its own -> header line. - -NCBI FASTA specification -

-Note that although the official FASTA specification allows the character -X only in amino acid sequences, LASTZ accepts it in DNA sequences -as a splicing character. However, LASTZ does not currently support -IUPAC-IUB ambiguity codes other than N (such as R, -W, etc.), -beyond the treatment afforded by ‑‑ambiguous=iupac. -

-A special case, non-conforming to the official standard, is made to allow a -special user-specified separator character. -Usually this will be N or X, but any other printable -ASCII character that suits the user’s needs is acceptable. -

-It has become common for suppliers of FASTA files to pack a plethora of -additional information into a sequence’s header line. This extra -information -can create difficulties for many sequence processing tools. For example, -headers often contain spaces but file formats such as MAF -do not allow spaces in sequence names. To compensate for this, LASTZ provides -several options for extracting a concise name from sequence headers; see -Sequence Name Mangling for details. - - - -

-

FASTQ (sequence input)

- -

-FASTQ format stores DNA and base-calling quality sequences as plain text, and -is primarily used to describe the results of short-read sequencing runs. As -explained in [Cock 2009], this format has evolved -over time in the Bioinformatics community. LASTZ only supports a subset of this -format, prohibiting line-wrapping within DNA or quality sequences. -

-Each sequence consists of four lines. The first line begins with a -@ followed by the name of the sequence. The second line contains -nucleotide characters. The third line begins with a +, optionally -followed by the name of the sequence (which, if present, must match that of the -first line). The fourth line contains quality characters. -

-There are several conflicting standards for encoding quality values in FASTQ -files, but (as of this writing) the differences are not relevant to LASTZ. -LASTZ currently does not make any computational use of the qualities, and -simply copies them into the output file when appropriate. -

-LASTZ treats IUPAC-IUB ambiguity codes in FASTQ files the same as those in -FASTA files. - - - -

-

Nib (sequence input)

- -

-Nib format stores a single unnamed DNA sequence, packed as two bases per byte. - -UCSC Nib specification - - - -

-

2Bit (sequence input)

- -

-2Bit format stores multiple DNA sequences, encoded as four bases per byte with -some additional information describing runs of masked bases or Ns. - -UCSC 2Bit specification -

-Sequence names in 2Bit files have all the same problems as in FASTA files, -so Sequence Name Mangling applies to these files -as well. - - - -

-

Quantum DNA (sequence input)

- -

-A quantum DNA file describes a single sequence of "quantum" DNA, which uses -an abstract, user-defined alphabet. Each position in the sequence is a byte -with a value in the range 0x01..0xFF, which can -represent an ambiguity code, amino acid, or any other meaning you desire. -LASTZ does not try to interpret these in any way; it just aligns them as -abstract symbols corresponding to columns in the scoring matrix. Note that -the value 0x00 is prohibited. -

-The file itself is stored in a binary format described by the table below. -It can be written on either a big-endian or little-endian machine; LASTZ -determines the byte order of multi-byte fields by examining the magic number -at the start of the file. -Be sure to use the quantum action -in the file’s sequence specifier to notify LASTZ -that it contains quantum DNA. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataMeaning
0x00 -C4 B4 71 97 -
—or— -
97 71 B4 C4 -
-Magic number indicating big-endian byte order. -
-
Magic number indicating little-endian byte order. -
0x0400 00 02 00File conforms to version 2.0 of the Quantum DNA file format.
0x0800 00 00 14Header length in bytes, including this field through the all-zero -field.
0x0Cxx xx xx xxSOFF: -offset (from file start) to sequence data.
0x10xx xx xx xxNOFF: -offset (from file start) to name; 0 indicates no name.
0x14xx xx xx xxSLEN: -length of sequence data.
0x1800 00 00 00Must be zero.
NOFFName: -a zero-terminated ASCII string.
SOFF -Sequence data: -a series of SLEN bytes, each of which is one quantum symbol -in the sequence. -
- - - -

-

Quantum Code File

- -

-This file is used with the quantum -action in a sequence specifier. It defines a mapping -from quantum DNA symbols to vectors of values for the -four nucleotides A, C, G, and -T. Usually these indicate the nucleotide probability distribution -for each symbol in the quantum alphabet. However, LASTZ doesn't interpret the -values, and only uses them to augment the display of alignment blocks in the -Human-Readable Text output format. - -

-Each line in the file gives the mapping for one symbol. Lines beginning -with a # are considered to be comments and are ignored, as are -blank lines. Data lines have five columns, separated by whitespace. The first -field contains the symbol, as either a single character or two hexadecimal -digits, while the remaining four fields contain values for -A, C, G, and T, -respectively. Each value can be either a single floating-point number or a -fraction (two floating-point numbers with a / between them, -without spaces). Any symbols in the quantum alphabet that aren't listed in -this file receive zeroes for all four values. - -

-Here is an example. -

-    # sym p(A|sym) p(C|sym) p(G|sym) p(T|sym)
-      01  0.125041 0.080147 0.100723 0.694088
-      02  0.111162 0.053299 0.025790 0.809749
-      03  0.065313 0.007030 0.004978 0.922679
-       ... more rows here ...
-      FF  0.209476 0.014365 0.755682 0.020477
-
- - - -
-

Sequence Name File

- -

-This file is used with the subset -action in a sequence specifier to select particular -sequences for processing. It consists of one sequence name per line. Lines -beginning with a # are considered to be comments and are ignored, -as are blank lines. Only the first whitespace-delimited word in any line is -read as the name; the rest of the line is ignored. -

-Note that when used in conjunction with a -FASTA or -2Bit file, the names must appear in the same order as -they appear in the corresponding sequence file, and must match the -mangled name extracted from that file. When used -with an -HSX file, the names can be in any order but must -match names indexed in the HSX file. - - - -

-

Sequence Masking File

- -

- -This file is used with the xmask and -nmask actions in a -sequence specifier. -It can also be created by using the -‑‑outputmasking=<file> -or -‑‑outputmasking:soft=<file> -options. -It consists of one interval per -line, without sequence names. Lines beginning with a # are -considered to be comments and are ignored, as are blank lines. Only the first -two whitespace-delimited words in any line are interpreted as the interval; the -rest of the line is ignored. -

-Each interval describes a region to be masked, and consists of -

-    <start> <end>
-
-Locations are one-based and inclusive on both ends (i.e., they use the -origin-one, closed position numbering system). -Note that the masking intervals are -counted along the forward strand, even if we are only -aligning to the reverse complement of the query specifier (i.e. for -‑‑strand=minus). - -

-Here is an example. If the target sequence is hg18.chr1, this would mask the -5' UTRs from several genes. Note that the third column is neither required -nor interpreted by LASTZ, and acts as a comment. -

-     884484  884542  NM_015658
-     885830  885936  NM_198317
-     891740  891774  NM_032129
-     925217  925333  NM_021170
-     938742  938816  NM_005101
-     945366  945415  NM_198576
-    1016787 1016808  NM_001114103
-    1017234 1017346  NM_001114103
-    1041303 1041486  NM_001114103
-
- - - -
-

Sequence Masking File, Three Fields

- -This file format is output only. LASTZ does not recognize input files in this -format. -

-This file is created by using the -‑‑outputmasking+=<file> -or -‑‑outputmasking+:soft=<file> -options. -It consists of one interval per line, with sequence names. -

-Each interval describes a region that has been masked, and consists of -

-    <name> <start> <end>
-
-Locations are one-based and inclusive on both ends (i.e., they use the -origin-one, closed position numbering system). -Note that the masking intervals are -counted along the forward strand, even if we are only -aligning to the reverse complement of the query specifier (i.e. for -‑‑strand=minus). - - - -
-

Scoring File

- - -

-This file is used with the ‑‑scores -option to specify a set of (mostly) scoring-related parameters en masse. -The score set consists of a substitution matrix and other settings. The other -settings come first and are individually explained in the -table below. All settings are optional, -and most of them have exact correspondence to command-line options and the same -defaults (unless otherwise specified in the table). Command-line settings -always override settings in this file. Any line may end with a comment -(# is the comment character). - -

-

-In the matrix, rows correspond to characters in the target sequence, while -columns correspond to characters in the query. Matrix labels can be specified -either as single ASCII characters or as two-digit hexadecimal values in the -range 01..FF (do not add a leading 0x). -Note that the value 00 is not allowed. -The rows and columns of the matrix need not have the same set of labels, so -for example, a matrix might describe scoring between the 4-letter DNA alphabet -and the 15-letter ambiguity alphabet. Any labels other than A, -C, G, and T (or their hex equivalents) -are treated as quantum DNA. -

-Score values can be floating-point if the lastz_D version of the -executable is used instead of lastz. - -

-Here is an example: -

-    # This matches the default scoring set for BLASTZ
-    
-    bad_score          = X:-1000  # used for sub['X'][*] and sub[*]['X']
-    fill_score         = -100     # used when sub[*][*] is not defined
-    gap_open_penalty   =  400
-    gap_extend_penalty =   30
-
-         A     C     G     T
-    A   91  -114   -31  -123
-    C -114   100  -125   -31
-    G  -31  -125   100  -114
-    T -123   -31  -114    91
-
- - -

-BLASTZ scoring files are also accepted. These only contain a substitution -matrix, and row labels must be absent (they are assumed to be the same as the -column labels). No other settings are allowed. -

-       A     C     G     T
-      91  -114   -31  -123
-    -114   100  -125   -31
-     -31  -125   100  -114
-    -123   -31  -114    91
-
- -
-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeywordSettingMeaning
bad_score -<score>
-<row>:<col>:<score> -
-This score fills a single row and column of the substitution matrix, so that -any occurrences of the corresponding characters are severely penalized. By -default the "bad" character for both the target and query is X -for DNA sequences or the null byte (00) for -quantum DNA sequences, and the associated score is -−1000. -

-This option allows you to change these characters and/or the score they receive. -The <row> and <col> fields are character codes (as explained -above); if they are absent X (and/or -00) is assumed. Note that these characters are case sensitive. The -bad_score row and column cannot be removed entirely, but you can achieve the -same effect by setting them to invalid characters that will never occur in your -sequences. There is no corresponding command-line option. -

fill_score<score> -This is used as a default for all cells of the scoring matrix that are not -otherwise set (either by the user or by LASTZ’s defaults). This is the -score used for Ns (unless -‑‑ambiguous=n is specified on the -command line). -

-The default value is −100. There is no corresponding command-line option. -

gap_open_penalty<penalty> -This is identical to the <open> field of the -‑‑gap command line option. -
gap_extend_penalty<penalty> -This is identical to the <extend> field of the -‑‑gap command line option. -
step<offset> -This is identical to the -‑‑step command line option. -
seed<strategy> -This corresponds to the ‑‑seed and -‑‑transition command line options. -<strategy> must be one of the following, with no spaces: -
12of19,transition -
12of19,notransition -
14of22,transition -
14of22,notransition -
ball<score>
- <percentage>%
-This is identical to the -‑‑ball command line option. -
x_drop<dropoff> -This is identical to the -‑‑xdrop command line option. -
hsp_threshold<score> -This is identical to the -‑‑hspthresh command line option, -except that it does not currently support the -‑‑hspthresh=top<basecount> or -‑‑hspthresh=top<percentage>% variants. -
y_drop<dropoff> -This is identical to the -‑‑ydrop command line option. -
gapped_threshold<score> -This is identical to the -‑‑gappedthresh command line option. -
- - - -

-

Inference Control File

- -

-When LASTZ is asked to infer substitution scores and/or gap penalties from the -input sequences (e.g. via the ‑‑infer -option), this file is used to set parameters that control the inference -process. - -

-Here is an example (note that currently the parsing of this file is less -flexible than some of the others, and * is the only arithmetic -operator supported): - -

-

-    # base the inference on alignments in the middle half by identity
-    min_identity       = 25.0%    # 25th percentile
-    max_identity       = 75.0%    # 75th percentile
-
-    # scale scores so max substitution score will be 100, and only use
-    # alignments scoring at least as well as 20 ideal matches
-    inference_scale    = 100      # max substitution score
-    hsp_threshold      = 20*inference_scale
-    gapped_threshold   = hsp_threshold
-
-    # allow substitution score inference to iterate at most 20 times;
-    # don't perform gap penalty inference -- instead hardwire gap penalties
-    # relative to max substitution
-    max_sub_iterations = 20
-    max_gap_iterations = 0
-    gap_open_penalty   = 4*inference_scale
-    gap_extend_penalty = 0.3*inference_scale
-
-    # use all seedword positions (don't sample)
-    step               = 1
-
-    # adjust for entropy when qualifying HSPs
-    entropy            = on
-
- -

-min_identity and max_identity specify the range of -sequence identity upon which inference is based; -only alignment blocks within this range contribute to the inference. If the -value ends with a percent sign, it represents a percentile of the identity -distribution over all the blocks; otherwise it is a fixed percent identity -value. For example, min_identity=70 and -max_identity=90 indicates that blocks with identity ranging from -70 to 90 percent will be used, while min_identity=25% and -max_identity=75% indicates that half of the blocks will be used -(from the middle of the distribution). -The defaults are min_identity=0 and max_identity=100 -(i.e., no blocks are excluded from inference due to percent identity). - -

-inference_scale specifies a value for the largest substitution -score (i.e., the score for the best match). All other scores are scaled -proportionally. If this is set to none, the scores will be -log-odds using base 2 logarithms. -The default is inference_scale=100. - -

-hsp_threshold and gapped_threshold correspond to -the command line ‑‑hspthresh and -‑‑gappedthresh options. -The defaults are hsp_threshold=3000 and -gapped_threshold=hsp_threshold. - -

-max_sub_iterations and max_gap_iterations specify -limits on the number of inference iterations that will be performed. For -example, if you only want a substitution scoring matrix, you can set -max_gap_iterations=0. -The defaults are max_sub_iterations=30 and -max_gap_iterations=0. - -

-gap_open_penalty and gap_extend_penalty correspond to -the command line -‑‑gap=[<open>,]<extend> -option. These are used for the first iteration of gap-scoring inference. -The defaults are gap_open_penalty=3.25*worst_substitution and -gap_extend_penalty=0.24375*worst_substitution. - -

-step corresponds to the command line -‑‑step option. A large step, e.g. -step=100, could potentially speed up the inference process. -Ideally, this would base the inference on a sample of only one percent of the -whole. However, the sample actually ends up larger than that and is biased -toward HSPs that are either longer or have a lower substitution rate. This -happens because sampling occurs at the seed level, and such HSPs generally -have more seeds. Future versions of LASTZ may include a means to compensate -for this bias. -The default is step=1. - -

-entropy corresponds to the command line -‑‑entropy option. Legal values are -on or off. If on, sequence entropy is incorporated -when filtering HSPs. The default is entropy=on. - -

-The value of worst_substitution cannot be set directly. -Instead, it is computed from the initial scoring matrix. It is the minimum -score in the scoring matrix for any of the symbols A, C, G or T (equivalently, -the most negative score or the maximum penalty). - -

-Note that these parameters apply to the inference process only. If the -corresponding command line options are also set, those will apply for the -final, "real" alignment stages (and will also override the inferred values if -there is a conflict), but will not affect the inference itself. -Inference cannot be used in conjunction with a scores file. - - - -

-

HSX (Hashed Sequence Index)

- -

-An HSX file is an index of sequences in other files, allowing fast random -access to those sequences. The current implementation of LASTZ only supports -indexing FASTA files. Future versions may include -Nib and 2Bit sequences. -The following is a brief overview of the -file format. For more detailed information, see the - -HSX specification - -

-An HSX file can be created using the build_fasta_hsx.py utility -(included in the tools directory of the LASTZ distribution), using -a command like this: -

-    build_fasta_hsx sequences.fa [more_sequences.fa ...] > index.hsx
-
- -

-It is important that the HSX file has the extension .hsx and -resides in the same directory as the files being indexed. Further, the files -being indexed must have the extension .fa or .fasta. -These rules allow LASTZ to determine the sequence file type when it reads the -HSX file, and to locate the files containing the sequences. - -

-The index file includes names to be used for the sequences, which do not have -to match the original names or headers in the sequence files. This feature -obviates the need for LASTZ to perform sequence name -mangling, so most of those actions are not supported for HSX files. -Instead, it is the responsibility of the program that creates the index to -select suitable names. - - - -

-

Target Capsule File

- -

-A target capsule file is essentially a memory dump of several internal data -structures related to the target sequence and the target seed word position -table. At the present time the authors do not wish to create an official -specification for this format, but please see -Using Target Capsule Files for information on -how to create and utilize them. - - - -

-

Alignment Chores File

- -

- -

-A chores file describes a list of sequence interval pairs, indicating that the -alignment process is to be restricted to those intervals. - -

-The file contains two intervals per line, one from the target and one from the -query, with sequence names. Optionally, the query strand can be specified, as -well as an identifying tag. Lines beginning with a # are -considered to be comments and are ignored, as are blank lines. # -can also be used to put comments at the end of lines, but must be preceded by -whitespace. - -

-Each line looks like -

-    <name1> <start1> <end1> <name2> [<start2> <end2>] [<strand2>] [id=<tag>] [#<comment>]
-
-where <name1>, etc. correspond to the target sequence and <name2>, -etc. correspond to the query. Fields are delimited by whitespace. - -

-When the target name is irrelevant (i.e. when there is only one name in the -target sequence file), * can replace <name1>. Similarly, if -we don't have a target (or query) subrange, * * can be used in -place of start and end. Note that the query subrange and strand are optional, -as is the tag. When the strand is not specified, both strands are searched. - -

-Locations are one-based and inclusive on both ends, i.e. -origin-one, closed (thus the interval "154 228" has -length 75 and is preceded by 153 bases in its sequence). All target intervals -are on the positive strand. All query intervals are -counted along the forward strand, regardless of which -strand is specified. - -

-Target sequence names may appear in any order. Sequence names for the query -must appear in the same order as they do in the query file. Because alignment -output ordering is on a chore-by-chore basis, it is good practice to include -all positive strand intervals for a query before any negative strand intervals -for that query. Some downstream tools may depend on this ordering. - -

-The tag can be any short string the user wants to associate with the chore -(excluding whitespace). As of this writing, the only use of the tag field is -that it can be copied to the output file by use of the -chore field for -‑‑format=general. - -

-Here is an example. -

-    chr9  116517410 116518409  READ_00070 *   *   + id=DFZ
-    chr3  157707345 157708344  READ_00070 *   *   + id=EDZ
-    chr9  112944437 112945436  READ_00078 101 200 + id=FAC
-    chr1  3377578   3378577    READ_00078 *   *   + id=LLH
-    chr2  175604671 175605670  READ_00078 *   *   - id=DFZ
-    chr2  230613705 230614704  READ_00079           id=DFZ
-    chr9  20387422  20388421   READ_00355 *   *   + id=DFZ
-    chr8  16396215  16397214   READ_00355 *   *   + id=MNQ
-    chr14 *         *          READ_00355 *   *   - id=MNQ
-    chr4  50534096  50535095   READ_00355 *   *   - id=QOY
-    chr6  58308766  58309765   READ_00376 *   *   - id=EDZ
-    chr5  172249269 172250268  READ_00376 *   *   - id=FAC
-    chr9  123860065 123861064  READ_00376 *   *   - id=MNQ
-
- - - -
-

Segment File

- -

-A segment file describes a list of segments representing gap-free alignments. -This list is either produced internally by LASTZ as a result of the -gap-free extension stage (see Overview), or read from -a user-supplied file via the -‑‑segments option. The latter -causes LASTZ to skip the indexing, seeding, and gap-free extension stages and -begin with the chaining stage (or the next specified stage, if chaining is not -requested). - -

-The file contains two intervals per line, one from the -target and one from the query, with sequence names. Lines beginning with a -# are considered to be comments and are ignored, as are blank -lines. # can also be used to put comments at the end of lines. - -

-Each line looks like -

-    <name1> <start1> <end1> <name2> <start2> <end2> <strand2> [<score>] [#<comment>]
-
-where <name1>, etc. correspond to the target sequence and <name2>, -etc. correspond to the query. Fields are delimited by whitespace. - -

-Locations are one-based and inclusive on both ends, -i.e. origin-one, closed (thus the interval "154 228" -has length 75 and is preceded by 153 bases in its sequence). Negative strand -intervals are measured from the 5' end of the query’s negative -strand -(corresponding to the rightmost end of the given query sequence, -i.e. counted along the reverse strand). All target -intervals are on the positive strand. The two intervals must have the same -length (since these alignments are gap-free). - The score is used to determine the -processing order during gapped extension. -Segments without scores are given a score of zero. - -

-Query sequence names must appear in the same order as they do in the query file. -For each query sequence, normally all positive strand intervals must appear -before any negative strand intervals. -Sequence names for the target may appear in any -order, and are only meaningful if the -multiple action is used; otherwise -they are ignored. Intervals with names not found in the target or query are not -allowed. In cases where sequence names are either unknown or of no importance -(e.g. when all sequences in the file have the same name), a * can -be used as a generic sequence name. - - -

-Here is an example. -

-    R36QBXA37A3EQH 151 225  Q81JBBY19D81JM 14  88 +  6875
-    R36QBXA37D4L6V  26 100  Q81JBBY19D81JM 10  84 +  6808
-    R36QBXA37EVLNU  19  93  Q81JBBY19D81JM  7  81 +  6842
-    R36QBXA37CEBPD   8  81  Q81JBBY19D81JM  9  82 +  7108
-    R36QBXA37BLO6X 132 205  Q81JBBY19D81JM 11  84 -  7339
-    R36QBXA37A2W3P 162 214  Q81JBBY19D81JM  2  54 -  5024
-    R36QBXA37A9395  62 136  Q81JBBY19A323K 18  92 +  7231
-    R36QBXA37DNC74  18  82  Q81JBBY19A323K  2  66 +  6418
-    R36QBXA37CTR26  83 167  Q81JBBY19ASA7F 19 103 +  8034
-    R36QBXA37C2TAC  95 181  Q81JBBY19ASA7F 15 101 +  8272
-
- - - -
-

LAV (alignment output)

- -

-LAV is the format produced by BLASTZ, and is the default. It reports the -alignment blocks grouped by "contig" (chromosome, scaffold, read, etc.) and -strand, and describes them by listing the coordinates of gap-free segments. -This format is compact because it does not include the nucleotides, but -consequently interpretation usually requires access to the original sequence -files, and it is not easy for humans to read. - -LAV specification - -(same specification at PSU) - -

-The option ‑‑format=lav+text adds -textual output for each alignment block (in the same -format as the ‑‑format=text option), intermixed with the LAV -format. Such files are unlikely to be recognized by any LAV-reading program. - - - -

-

AXT (alignment output)

- -

-AXT is a pairwise alignment format popular at UCSC and PSU. - -UCSC AXT specification -

-The option ‑‑format=axt+ reports -additional statistics with each block, in the form of comments. The exact -content of these comment lines may change in future releases of LASTZ. - - - -

-

MAF (alignment output)

- -

-MAF is a multiple alignment format developed at UCSC. The MAF files produced -by LASTZ have exactly two sequences per block: the first row always comes from -the target sequence, and the second from the query. - -UCSC MAF specification -

-The option ‑‑format=maf+ reports -additional statistics with each block, in the form of comments. The exact -content of these comment lines may change in future releases of LASTZ. -

-The option ‑‑format=maf- suppresses -the MAF header and any comments. This makes it suitable for concatenating -output from multiple runs. -

-UCSC’s MAF should not be confused with other formats that have the same -name. For example, the MIRA sequence assembler project has a file format named -MAF, but it is a completely unrelated file format and is not supported by LASTZ. - - - -

-

SAM (alignment output)

- - -

-SAM is a pairwise alignment format used primarily for short-read mapping, and -supported by the SAMtools programming suite. This format is described in -[Li 2009], and as of May 2011 a specification for it -can be found at the SAMtools page -at SourceForge. - -

-For SAM files, LASTZ assumes that the target sequence is the reference and -that query sequence(s) are short reads. For alignments that don't reach the -end of a query, ‑‑format=sam uses -"hard clipping", while ‑‑format=softsam -uses "soft clipping". See the section on "clipped alignment" in the SAM -specification for an explanation of what this means. - -

-The options ‑‑format=sam- and -‑‑format=softsam- suppress the SAM -header lines. This makes them suitable for concatenating output from multiple -runs. - - - -

-

CIGAR (alignment output)

- -

-

-CIGAR is an acronym for Concise Idiosyncratic Gapped Alignment Report, a -pairwise alignment format defined originally by the -Exonerate alignment program. -The format has since been adapted in different forms, as -ensembl cigar format -and as an -extended cigar string -in SAMtools. For -‑‑format=cigar, LASTZ implements -Exonerate CIGAR. LASTZ implements other CIGAR variants for -‑‑format=sam -and as fields for ‑‑format=general. - -

-Exonerate CIGAR -format does not include nucleotides; instead it describes the locations of -indels (but not substitutions) using run-length encoding. An alignment is -characterized as runs of M (match and/or substitution), -I (query contains a base not in target), and D -(target contains a base not in query). Each run is encoded by the letter code, -whitespace, and the length; multiple runs are separated by whitespace. The -format also includes positional information for the start of the alignment. An -example is shown at the end of this -section. While there seems to be no complete, definitive specification for -CIGAR, the CIGAR files produced by LASTZ are believed to match the format -produced by Exonerate. - -

-In the other variants of CIGAR, whitespace is removed and the order of the -letter code and length are reversed (length appears before letter code). In -some variants the length is omitted if it is 1; in other variants -M runs are divided into = (match) and X -(substitution). SAMtools extended cigar strings allow S and -H runs to describe clipping operations for short sequences. -LASTZ implements combinations of these variants where appropriate; details -are described in -‑‑format=general:cigar, -‑‑format=general:cigarx -and ‑‑format=sam. - -

-

-To understand the differences between different types of CIGAR strings, -consider the following alignment of a short 61-bp query to a longer target. - -

-

-    target:  ...GATTAAGAGTCTGTCCGACCTTCTTCT---GGGTTTACCGAAGCCCACTTAGCTGATATTCGA...
-                   ||||||||||||||||X|||||||   |||||||  X||||||||||||||||||
-     query:     ACCTAAGAGTCTGTCCGACATTCTTCTACGGGGTTTA--TAAGCCCACTTAGCTGATAAGGTT
-                   ↑      1         2         3           4         5    ↑    6
-                0123456789012345678901234567890123456--789012345678901234567890
-
-
- -

-For ‑‑format=cigar, the alignment would be described by this line: -

-    cigar: query 3 56 + target <start> <end> <strand> <score> M 24 I 3 M 7 D 2 M 19
-
- -

-For ‑‑format=general:cigar, the -alignment path would be described by this field: -

-    24M3I7M2D19M
-
- -

-For ‑‑format=general:cigarx, the -alignment path would be described by this field: -

-    16=X7=3I7=2DX18=
-
- -

-For ‑‑format=sam, the alignment path would -be described by this field: -

-    3H24M3I7M2D19M5H
-
- - - -
-

BLASTN (alignment output)

- - -

-The BLASTN format reports pairwise alignments in a format similar to -NCBI’s BLASTN program. Output is modeled upon version 2.2.24+ of the -standalone version of BLASTN available from - -NCBI’s BLAST ftp site. Output should be similar to that produced by the -command -

-    blastn -task blastn -db <target> -query <query> -outfmt 7
-
-It is important to realize that a couple of the fields, specifically -evalue and bit score, are written as crude -approximations of the value that BLASTN would produce, as described below. - -

-The format is tab-delimited with one alignment reported per line, plus an -additional header. Here is some sample output: -

-    # lastz --format=blastn
-    # Query: orange
-    # Database: apple
-    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
-    orange apple 82.14  2072 142 67 2    1926 103  2093 0     1972
-    orange apple 100.00 14   0   0  1906 1919 2086 2073 0.043 26.5
-    orange apple 93.33  15   1   0  1763 1777 2004 1990 0.53  22.9
-
- -

-Most of the fields correspond directly to fields available in the -General output format. These are -query id=name2, -subject id=name1, -%identity=blastid%, -alignment length=ncolumn, -mismatches=nmismatch, -gap opens=ngap, -q.start=start2, -and q.end=end2. -The fields s.start and s.end are nearly equivalent -to start1 and end1, but when the alignment is to the -reverse strand, they appear in the other order (i.e. -s.start > s.end). - -

-The two remaining fields, evalue and bit score, are crudely estimated -from LASTZ’s score field, but are not strictly -correct. Further, these approximations assume that default LASTZ scores -are used. Otherwise they are unlikely to be good approximations. The -approximation formulas are -

-     evalue    = 3.0e9*exp(-0.01421*score)
-     bit score = 0.0205*score
-
- - - -
-

Differences (alignment output)

- -

-LASTZ’s Differences format reports each difference between target and query -on a separate line, where a difference is any indel or run of -mismatches. It is intended for comparisons between close sequences, such as -when comparing reads from a human individual to the human reference genome, or -reads from a bacterial strain to a reference sequence for the same bacterium. -The format is a tab-delimited table with one line per difference; it is -well-suited for use with spreadsheets and the -R statistical package. - -

-The columns reported in this format are the name, start & end of the -difference, strand, and overall size for the target; the name, start & end -of the difference, strand, and overall size for the query; the text of the -difference in the target, then in the query; and finally the text of the -complete alignment block containing the difference, first in the target, then -in the query. Intervals are origin-zero, half-open -and counted along the forward strand. - -

-The example below compares output in this format to similar results using the -General output format for the same input sequences. -For the Differences output, column numbers have been added for discussion (they -are not in the actual output file). Each line in the output represents -slight evidence that a mutation occurred changing the target sequence -(chr22 here) to the query sequence (column 6). Columns 11 and 12 indicate the -specific mutation that has putatively occurred. For example, the first line -suggests that either an A has been -inserted into -chr22 at position 14485783, or an A has been -deleted from -EAYGRGI02GQ0SL at position 167 (actually, between positions 166 and 167). -Note that there are three differences reported for -EAYGRGI02GQ0SL, so it appears on three lines. The fifth line shows a putative -SNP at chr22 position 15234401, with a C in the reference and a G in the read, -while the seventh line shows evidence for an inversion of neighboring bases -(AG vs. GA). -Note that there are no lines for EAYGRGI01BIQCW, indicating a -perfect match for that block (i.e., no differences). - -

-Sample output for ‑‑format=differences. -

-     (1)     (2)      (3)  (4)   (5)         (6)       (7) (8) (9) (10) (11)(12)  (13)     (14)
-    chr22 14485783 14485784 + 49691432  EAYGRGI02GQ0SL 167 167  +  303   A   -   TGAGA... TGAGA...
-    chr22 14485791 14485792 + 49691432  EAYGRGI02GQ0SL 174 174  +  303   A   -   TGAGA... TGAGA...
-    chr22 14485843 14485843 + 49691432  EAYGRGI02GQ0SL 225 226  +  303   -   A   TGAGA... TGAGA...
-    chr22 14731895 14731895 + 49691432  EAYGRGI01EAV19 228 229  -  298   -   A   CTTCT... CTTCT...
-    chr22 15234401 15234402 + 49691432  EAYGRGI02H5ZGS 99  100  -  180   C   G   CGAAT... CGAAT...
-    chr22 15255536 15255537 + 49691432  EAYGRGI01BTT7U 56  56   -  267   A   -   TTTGC... TTTGC...
-    chr22 15255552 15255554 + 49691432  EAYGRGI01BTT7U 71  73   -  267   AG  GA  TTTGC... TTTGC...
-    chr22 15255617 15255618 + 49691432  EAYGRGI01BTT7U 136 136  -  267   A   -   TTTGC... TTTGC...
-    chr22 15255624 15255625 + 49691432  EAYGRGI01BTT7U 142 142  -  267   A   -   TTTGC... TTTGC...
-
- -

-Sample output for -‑‑format=general:name1,zstart1,end1,strand1,size1,name2,zstart2+,end2+,strand2,size2,text1,text2. -

-    chr22 14485616 14485920 + 49691432  EAYGRGI02GQ0SL 0   303  +  303   TGAGA... TGAGA...
-    chr22 14731668 14731964 + 49691432  EAYGRGI01EAV19 0   297  -  298   CTTCT... CTTCT...
-    chr22 15234302 15234482 + 49691432  EAYGRGI02H5ZGS 0   180  -  180   CGAAT... CGAAT...
-    chr22 15238845 15239070 + 49691432  EAYGRGI01BIQCW 0   225  -  225   TGGAA... TGGAA...
-    chr22 15255480 15255750 + 49691432  EAYGRGI01BTT7U 0   267  -  267   TTTGC... TTTGC...
-
- -

-(This example aligns reads from the genome of James Watson (available from -NCBI’s trace archive -by querying for CENTER_NAME = 'CSHL' and CENTER_PROJECT = 'Project Jim') -vs. the human reference genome hg18.) - - - -

-

R Dotplot (alignment output)

- -

-This is a home-grown format designed to facilitate plotting the alignment -blocks with the R statistical package. -Alignments are reduced to a series of gap-free segments, each of which is -written in three lines as shown below. Endpoints are -origin-one, closed, and alignments on the reverse -strand have -<..._query_end> less than -<..._query_start> so that R will draw them in the reverse -(slope=−1) orientation. - -

-

-    <target_name>            <query_name_>
-    <segment1_target_start>  <segment1_query_start>
-    <segment1_target_end>    <segment1_query_end>
-    NA                       NA
-    <segment2_target_start>  <segment2_query_start>
-    <segment2_target_end>    <segment2_query_end>
-    NA                       NA
-     ...
-
- -

-The file can then be plotted in R with commands like these: -

-    dots = read.table("your_file",header=T)
-    plot(dots,type="l")
-
- -

-When the query file contains more than one sequence, alignments for each -query sequence are written as shown above. This includes a new header line -for each query. Unfortunately the simple R commands shown above will not -work to plot a file with more than one query. - -

-When the target file contains more than one sequence, alignments for target -sequences are intermixed in the output file. In this case the entire target -is treated as a single sequence, and the target positions reported are relative -to this concatenated sequence. This can still be plotted using the simple R -commands above, but the target sequences will appear as one concatenated -sequence in the plot. - - - - -

-

Human-Readable Text (alignment output)

- -

-This textual output is intended to be read by people rather than programs. -Each alignment block is displayed with gap characters and a row of -match/transition characters, and lines are wrapped at a reasonable width -to allow printing to paper. The exact format of this output may change in -future releases of LASTZ, so programs are better off reading more stable -formats like LAV, AXT, or -MAF. - - - -

-

General Output (alignment output)

- -

-LASTZ’s General format is a tab-delimited table with one line per -alignment block and configurable columns. This format is well-suited for use -with spreadsheets and the -R statistical package, -and for filtering with shell commands. - -

-The syntax for this option is: -

-    ‑‑format=general[:<fields>]
-
-where <fields> is a comma-separated list of field names in -any desired order, with no spaces. For example -
-    ‑‑format=general:nmismatch,name1,strand1,start1,end1,name2,strand2,start2,end2
-
-will report each aligned interval pair and the number of mismatches in the -alignment of that pair, like this: -
-    #nmismatch name1   strand1 start1 end1 name2    strand2 start2 end2
-    41         apple8  +       130    930  orange2  -       119    931
-    35         apple15 +       113    930  orange3  +       87     909
-    52         apple4  +       131    952  orange5  -       111    932
-    46         apple7  +       131    930  orange10 +       111    909
-    37         apple12 +       131    930  orange11 -       111    909
-    38         apple3  +       127    939  orange12 +       107    926
-
- -

-The recognized field names are shown in the table below. Positions (start and -end fields) are counted from the 5' end of the aligning strand, -unless otherwise indicated in the table. -Please see Interval Coordinates for more information -about the position numbering systems used in LASTZ. - -

-If the field list is absent, the following -fields are printed, in this order:  -score, name1, strand1, -size1, zstart1, end1, -name2, strand2, size2, -zstart2, end2, identity, -coverage.  - -

-The option ‑‑format=mapping is a shortcut for ‑‑format=general -with the following fields:  -name1, zstart1, end1, -name2, strand2, zstart2+, -end2+, identity, coverage, -cigarx-. - -

-Field names are normally included as column headers in the first row of the -output, preceded by a #. The options -‑‑format=general-[:<fields>] -and ‑‑format=mapping- suppress column headers. This makes -them suitable for concatenating output from multiple runs. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
FieldMeaning
scoreScore of the alignment block. The scale and meaning of this number will -vary, depending on the final stage performed and other command-line options. -
name1Name of the target sequence.
number1 - -Number of the target sequence within the target file. The first sequence -is numbered zero. -
strand1Target sequence strand, either "+" or "−".
size1Size of the entire target sequence.
start1Starting position of the alignment block in the target, origin-one.
zstart1Starting position of the alignment block in the target, origin-zero.
end1 -Ending position of the alignment block in the target, expressed either as -origin-one closed or origin-zero half-open (the ending value is the same in -both systems). -
length1Length of the alignment block in the target (excluding gaps).
text1Aligned characters in the target, including gap characters. - align1 can be used as a -synonym for text1. -
qalign1 - -The target quality sequence (if there is one) corresponding to aligned -characters. Gaps are indicated as a tilde (~). -
nucs1 - -The entire target sequence, after modifications due to specifier actions such -as subrange or softmask. -

-This is output in order along the target’s forward strand, regardless of -the strand of the alignment. -

quals1 - -The entire target quality sequence (if there is one), after modifications due -to specifier actions such as subrange. -

-This is output in order along the target’s forward strand, regardless of -the strand of the alignment. -

name2Name of the query sequence.
number2 - -Number of the query sequence within the query file. The first sequence -is numbered zero. -
strand2Query sequence strand, either "+" or "−".
size2Size of the entire query sequence.
start2Starting position of the alignment block in the query, origin-one.
zstart2Starting position of the alignment block in the query, origin-zero.
end2 -Ending position of the alignment block in the query, expressed either as -origin-one closed or origin-zero half-open (the ending value is the same in -both systems). -
start2+ -Starting position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -origin-one. -Note that if strand2 is "−", then this is the other end of -the block from start2. -
zstart2+ -Starting position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -origin-zero. -Note that if strand2 is "−", then this is the other end of -the block from zstart2. -
end2+ -Ending position of the alignment block in the query, counting along the query -sequence’s positive strand (regardless of which query strand was aligned), -expressed either as origin-one closed or origin-zero half-open (the ending -value is the same in both systems). -Note that if strand2 is "−", then this is the other end of -the block from end2. -
length2Length of the alignment block in the query (excluding gaps).
text2Aligned characters in the query, including gap characters. - align2 can be used as a -synonym for text2. -
qalign2 - -The query quality sequence (if there is one) corresponding to aligned -characters. Gaps are indicated as a tilde (~). -
nucs2 - -The entire query sequence, after modifications due to specifier actions such -as subrange or softmask. -

-This is output in order along the query’s forward strand, regardless of -the strand of the alignment. -

quals2 - -The entire query quality sequence (if there is one), after modifications due -to specifier actions such as subrange. -

-This is output in order along the query’s forward strand, regardless of -the strand of the alignment. -

nmatch -Match count, the number of aligned bases in the -block that are matches. -
nmismatch -Mismatch count, the number of aligned bases in -the block that are mismatches (substitutions). -
ncolumn -Number of columns in the block. This includes matches, mismatches -(substitutions), and gaps. -
npair -Number of aligned bases in the block that are matches or mismatches -(substitutions). -
ngap -Gap count, the number of gaps in the block, counting -each run of gapped columns as a single gap. -
cgap -Gap column count, the number of gaps in the block, -counting each gapped column as a separate gap. -
diff -Differences between what would be written for text1 and -text2. Matches are written as . (period), transitions -as : (colon), transversions as X, and gaps as -- (hyphen). -
cigar - -A CIGAR-like representation of the alignment’s -path through the -DP matrix. This is the short representation, -without spaces, described in the -Ensembl CIGAR specification. -

-For more information, see the section about CIGAR and its -example. -

cigarx - -Same as cigar, but uses a newer syntax that distinguishes matches -from substitutions and omits the run length when it is 1. -

-For more information, see the section about CIGAR and -its example. -

identity -Fraction of aligned bases in the block that are matches (see -Identity). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. -
idfrac -Fraction of aligned bases in the block that are matches (see -Identity), written as a fraction. -
id% -Fraction of aligned bases in the block that are matches (see -Identity), written as a percentage. -
blastid% -Fraction of the alignment block that is matches, as would be reported by NCBI -BLAST. The numerator is the number of matches, and the denominator is the -number of alignment columns. The value is written as a percentage but without -a percent sign. -

-This is not the same as LASTZ normally reports for identity, since -NCBI BLAST includes gaps in the computation. -

continuity -Rate of non-gaps (non-indels) in the alignment block (see -Continuity). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. - -
confrac -Rate of non-gaps (non-indels) in the alignment block (see -Continuity), written as a fraction. -
con% -Rate of non-gaps (non-indels) in the alignment block (see -Continuity), written as a percentage. -
coverage - -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage). This is written as two fields. -The first field is a fraction, written as <n>/<d>. -The second field contains the same value, computed as a percentage. -
covfrac -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage), written as a fraction. -
cov% -Fraction of the entire input sequence (target or query, whichever is shorter) -that is covered by the alignment block (see -Coverage), written as a percentage. -
diagonal -The diagonal of the start of the alignment block in the -DP matrix, expressed as an identifying number -start1-start2. -
shingle -A measurement of the shingle overlap between the -target and the query. This is intended for the case where both the target and -query are relatively short, and their ends are expected to overlap. -
number - -The alignment number, counted as alignments are written to output. The count -begins at one. -
znumber - -The alignment number, counted as alignments are written to output. The count -begins at zero. -
chore - -The identifying tag corresponding to the chore that produced the alignment. -The tag is defined in the alignment chores file. -
- - - -

-

Other Output

- -

-LASTZ includes support for other output formats which are intended mainly -for the convenience of the developers. If you have specific questions, -please contact us. - - - - - - -


-
-

Advanced Topics

- - - - -
-

Aligning to Whole Genomes

- -

-Aligning queries to a whole genome can be accomplished in a single run of -lastz by using the -multiple action in the -target file’s sequence specifier. This causes -lastz to load all of the target’s sequences into memory. -However, sequence indexing inside lastz is limited to 31-bit -positions, which limits the overall size of the target to 2 gigabases. - -

-To facilitate larger genomes, an additional executable (lastz_32) -can be built. The two executables are basically the same; the only difference -is that sequence indexing in lastz is limited to 31-bit positions, -while lastz_32 uses 32-bit positions. The use of smaller positions -in lastz reduces memory usage and improves performance, but limits -the size of the target sequence to 2 gigabases. - -

-To build the lastz_32 executable, enter the following commands -from bash or a similar command-line shell (Solaris users should substitute -gmake for make). This will build the executable and -copy it into your installDir. -

-    cd <somepath>/lastz-distrib-X.XX.XX/src
-    make lastz_32
-    make install_32
-
- -

-lastz_32 can then be used as a replacement for lastz -in any command line, e.g. -

-    lastz_32 hg18.fa[multiple] galGal3.fa \
-      --notransition --step=20 --nogapped \
-      --progress=1 \
-      --format=maf > hg18_vs_galGal3.maf
-
- - - -
-

Adjacent Indels

- -

-Occasionally the sequences being compared contain unrelated segments of DNA -flanked by segments that are related. If the unrelated segments are long enough -(and -different enough) that two gaps are cheaper than a series of substitutions, -the optimal-scoring alignment should contain adjacent indels, like this: -

-

-    ...ATAAATTATTATTATTAAATTTTA-------------------CCCCCCCCCCCCCCCCCCTTTTTA...
-    ...ATAAATTATTATTATTAAATTTTAGGGGGGGGGGGGGGGGGAG-------------------TTTTA...
-
- -

-However, by default, lastz does not allow an insertion to follow a deletion, or -vice versa. So it ends up reporting an alignment like this instead: -

-

-    ...ATAAATTATTATTATTAAATTTT------------------ACCCCCCCCCCCCCCCCCCTTTTTA...
-    ...ATAAATTATTATTATTAAATTTTAGGGGGGGGGGGGGGGGGA------------------GTTTTA...
-
- -

-The latter alignment doesn't make any sense biologically. However, to maintain -backward compatibility with previous versions of LASTZ (and BLASTZ), the default -version of LASTZ will produce the latter alignment. - -

-Users that want to allow alignments with adjacent indels can build any -LASTZ executable with allowBackToBackGaps enabled. This is -accomplished by adding allowBackToBackGaps=ON to the -make command line, like this: - -

-

-    make clean
-    make lastz_32 allowBackToBackGaps=ON
-    make install_32
-
- - - -
-

Interval Coordinates

- -

-The biological research community has established several competing standards -describing intervals on a strand of DNA. Different programs often use -different standards. Since LASTZ supports several input and output formats, it -is inevitable that it uses more than one way of describing an interval. We -describe the different conventions here. - -

-For this discussion, suppose we have a 50-nucleotide strand of DNA as follows: -

-

-        origin-one, closed: 12345678901234567890123456789012345678901234567890
-                                      ↓      ↓
-                     5' >>> CGACCTTACGATTACCTACTTAACACGTAAACTGAGGGATCAAAAGGAAA >>> 3'
-                                      ↑       ↑
-    origin-zero, half-open: 01234567890123456789012345678901234567890123456789
-
- -

-Note that since this is DNA it has 5' and 3' ends; -we assume that all input sequences follow the standard practice of listing the -bases with the 5' end on the left. -Here we've highlighted the subsequence ATTACCTA so we can -discuss how to describe the interval it occupies. There are two commonly used -ways to do this. Both count from 5' to 3' (left to right). One way, -origin-one, starts counting from one. The other way, -origin-zero, starts counting from zero. So in origin-one, -ATTACCTA begins at position 11, while in origin-zero it begins at -position 10. - -

-To describe the ending position, there are also two commonly used methods. -One way is closed, in which the position of the last nucleotide is -given. The other is half-open, in which the position following the -last nucleotide is given. These are theoretically independent of the -conventions for the origin, but in practice only two of the combinations are -commonly used: origin-one, closed and -origin-zero, half-open. In the former, ATTACCTA is -said to occupy the interval (11,18), while in the latter it is said to occupy -the interval (10,18). Notice that only the first number changes between these -two paradigms; the second number stays the same. - -

-Another factor to consider is that DNA is usually double stranded, which would -look like this: -

-

-        along forward:        12345678901234567890123456789012345678901234567890
-                                        ↓      ↓
-       forward strand: 5' >>> CGACCTTACGATTACCTACTTAACACGTAAACTGAGGGATCAAAAGGAAA >>> 3'
-    complement strand: 3' <<< GCTGGAATGCTAATGGATGAATTGTGCATTTGACTCCCTAGTTTTCCTTT <<< 5'
-                                        ↑      ↑
-        along reverse:        09876543210987654321098765432109876543210987654321
-
- -

-In some cases it makes sense to refer to the interval along the complement -strand. For example, if the above sequence was a query and the target -contained TAGGTAAT, how should the query position of an alignment -of those two be described? One way would be to still refer to the interval -along the forward strand (which we also call the plus or -positive strand), and just indicate that in fact it was the reverse -complement of that interval that aligned. We call this -counting along the forward strand. Another way is to count from the -other end, from the 5' end of the complement strand (which we also call the -reverse, minus or negative strand). We call -this counting along the reverse strand, and for clarity we might add -"from its 5' end". In this example, if we were using origin-one, closed -counting, we would say that TAGGTAAT occurs at (33,40) along the -reverse strand. -Unless noted otherwise (e.g. for the -R Dotplot output format), when counting along the -forward or reverse strand LASTZ swaps the interval’s endpoints if -necessary, so -the position called start is numerically ≤ the position called -end. This is a common convention, but there are other programs -that leave them unswapped. - -

-Note that when counting positions all characters in the sequence are counted, -including runs of Ns or Xs and even invalid -characters. This is important so that other programs can use the reported -positions to index directly into the original sequences. - - - -

-

Non-ACGT Characters, Splicing, and Separation

-

-The handling of characters other than A, C, -G, and T in sequences that are supposed to represent -DNA is problematic. -In ordinary (non-quantum) DNA sequences, LASTZ currently supports two of these, -N and X. They can either be present in the original -input file (except that the Nib and -2Bit formats are incapable of containing -Xs), or added by using an -xmask or -nmask action in the -sequence specifier. -LASTZ can also be configured to tolerate the other IUPAC-IUB ambiguity codes -(B, D, H, K, M, R, S, V, W, and Y), and to recognize -a special user-specified separator character. - -

-Many database sequences contain Ns to represent bases for which -the actual nucleotide is not known (at least, not known with any level of -confidence). Ns (or better, Xs) can also be used to -mask out regions that have previously been identified as being of no interest, -and therefore should not be aligned. And unfortunately, there is also a -tradition of using strings of Xs or Ns to splice -together multiple sequences to gain efficiency when dealing with programs that -were limited to operating on a single sequence. - -

-Although splicing was useful in BLASTZ, it is no longer needed for LASTZ. -Since LASTZ can handle multiple target sequences (via the -multiple action in the target -file’s sequence specifier), it is preferred that users not -resort to splicing. -If splicing is necessary, the preferred method is to specify a -separator character to tell LASTZ explicitly -where the splices have occurred. - -

-Replacing BLASTZ with LASTZ in an existing -pipeline may still involve spliced sequences, so LASTZ’s default -interpretation of non-ACGT characters is the same as BLASTZ’s:  -Xs are excluded from the alignment seeding stage, and are so -severely penalized by alignment scoring that they will not normally -appear in -any alignment. Ns are also excluded from seeding, and are -penalized about the same as a transversion mismatch. Specifically, any -substitution with X is scored as −1000, and any substitution -with anything else (other than A, C, G, -or T) is scored as −100. -Note that you have to put "enough" Xs or Ns between -the sequences so that no alignment block will cross the splice. This can be -tricky, since gap scoring is only dependent on the length of the gap and not on -the characters in the gap. So if a gap the same length as the splice is not -penalized more than the y-drop setting, the -alignment may hop the splice. As a rough guideline, a splice length of 50 is -usually enough with the default settings, but this is not guaranteed. - -

-This default treatment of non-ACGT characters also works well when -Xs or Ns are used to mask out regions that should not -be aligned. However, it is inappropriate when the sequences contain -Ns to represent ambiguous bases. To handle this case, LASTZ -provides the ‑‑ambiguous=n option, -which causes substitutions with N to be scored as zero. -Additionally, the ‑‑ambiguous=iupac -option causes the other IUPAC-IUB ambiguity codes -(B, D, H, K, M, R, S, V, W, and Y) to be treated the -same as an ambiguous N. The two ‑‑ambiguous options -also allow you to specify rewards and penalties for matches and mismatches -involving ambiguous characters. - -

-In either case, non-ACGT characters are ignored during the seeding stage. -Only seed words that consist entirely of A, C, -G, and/or T are involved in seeding, even if the -non-ACGT characters occur in "don't-care" positions in the seed pattern. - -

-The score values described above can be changed if a -scoring file is specified. The −1000 score -is called bad_score and the −100 score is called -fill_score. Further, which character is considered "bad" (by -default this is X) can also be specified in the scoring file, and -can actually be different between the target and query. Throughout this -document, when we refer to the character X appearing in a DNA -sequence, we generally mean the character specified as "bad", which defaults to -X. - -

- -Splicing, or more correctly separation, can also be accomplished by -placing a specific character between subsequences, then using the -separator=<character> -action. LASTZ will then break the sequence into the prescribed subsequences -and prevent any alignment from crossing the boundaries. - -

-Quantum DNA sequences are different: they use an -arbitrary, user-defined alphabet of symbols, so the above-mentioned special -treatments for N and X do not apply. The default -"bad" character for quantum sequences is the null byte (00 -hexadecimal), which is not even allowed in the sequence; however it can be -changed to one of the valid alphabet symbols via the scoring file. There is -no analog of ambiguous Ns for quantum sequences, as typically -every symbol has some level of ambiguity. - - - -

-

Sequence Name Mangling

- -

-Often the names in the input sequence files are inconvenient for downstream -processing, or create problems with certain output formats. This is further -complicated by the fact that some input formats (most notably -Nib) do not contain sequence names, so in those cases -a name must be derived from the filename. LASTZ provides several choices for -naming the input sequences. These alternatives are mutually exclusive; only -one can be used at a time for a particular input file. - -

-Internally, LASTZ handles the naming task in two phases. First, it creates a -full header for the sequence. If the input format provides a name -or header, that becomes the full header. Otherwise, the full header is -constructed from the file name. - -

-In the second phase, LASTZ shortens the full header to a nickname. If the full -header starts with a file name, any path prefix is removed, and commonly-used -file extension suffixes are also removed (.fa, .fasta, -.nib, .2bit). Then by default, LASTZ uses the first -word (composed of characters other than whitespace, vertical bar, or colon) of -the remaining string as the sequence name. Thus a -FASTA header like -"> ~someuser/human/hg18/chr1.fa Human Chromosome 1" -is shortened to simply chr1. - -

-The actions -nameparse=darkspace -and nameparse=alphanum in the -sequence specifier change how the first word is -determined. darkspace -(i.e., "non-whitespace") narrows the set of terminating characters -to allow vertical bars and colons to appear in the word, while -alphanum widens it so the word is restricted to only alphabetic, -numeric, and underscore characters. Path prefixes -and file extensions are still removed. - -

-The default shortening is often adequate. For example, consider the following -FASTA file. By default, the names will be 000007_3133_3729 and -000015_3231_1315. -

-

-    >000007_3133_3729 length=142 uaccno=FX9DQEU13H5YZN
-    ACCCGAAAGAGAAACAGCTTCCCCCCCTGTCCCGAGGGATATCAAGTAGTTTGTTGGCTA
-    GGCTGATATTGGGGCCTTCCGCTAGAGTCGGCGCCCGCGCCTACGAGTCCCCCCCACCCC
-    CCACCCCCACAGCGGGTTATCC
-    >000015_3231_1315 length=190 uaccno=FX9DQEU13HUTXE
-    TTGTTGAGTCGGATGAGAATAGCAAGTGCAGTCAACGGCAATGTGCTGGGTTAGTACAAC
-     ...
-
- -

-However, the user may find it more convenient to use the accession numbers. To -accomplish this, she can use the -nameparse=tag:uaccno= action. LASTZ -will look for the tag string uaccno= in each header and read the -name from the characters that follow it, up to the first character that is not -alphabetic, numeric, or an underscore. In this case the sequence names would be -FX9DQEU13H5YZN and FX9DQEU13HUTXE. If the tag string -is not found in the full header for a particular sequence, the default -shortening is used instead. - -

-Now consider this FASTA file: -

-

-    >gi|197102135|ref|NM_001133512.1| Pongo abelii ...
-    GCGCGCGTCTCCGTCAGTGTACCTTCTAGTCCCGCCATGGCCGCTCTCACCCGGGACCCC
-    CAGTTCCAGAAGCTGCAGCAATGGTACCGCGAGCACGGCTCCGAGCTGAACCTGCGCCGC
-     ...
-    >gi|169213872|ref|XM_001716177.1| PREDICTED: Homo sapiens ...
-    ATGTCTGAGGAGGTAGGATTTGATGCAGGAGGGAGGATCTGGTGCACTTATAAGGATCTG
-    GGTCTGTCAGTGTCAGAGAAGGTAGGATCTGGCCCTGGTATGAGGATCTGGATCTGTCAG
-     ...
-    >gi|34784771|gb|BC006342.2| Homo sapiens ...
-    GGGTGGGAGGACGCTACTCGCTGTGGTCGGCCATCGGACTCTCCATTGCCCTGCACGTGG
-    GTTTTGACAACTTCGAGCAGCTGCTCTCGGGGGCTCACTGGATGGACCAGCACTTCCGCA
-     ...
-
- -

-In this case the default action does not do what we want (all sequences would -be named gi). The action nameparse="tag:gi|" gives -us the names 197102135, 169213872, and -34784771. (Note the quotes; this is necessary to prevent the -command-line shell from interpreting | as a pipe character.) -Observe that a tag of ref| will not work, because the third -sequence would need gb| instead. - -

-Sometimes it is more convenient just to assign a specific name. This can be -done with the -nickname=<name> -action. For example, using the target and query file specifiers -~someuser/human/hg18/chr1.nib[nickname=human] and -~someuser/human/ponAbe2/chr1.nib[nickname=orang], the output -will show the sequences as human and orang rather -than calling them both chr1. -If <name> contains the substring {number}, -the nickname will contain the number of the sequence within the file. This is -particularly useful when there is more than one sequence in the file. - -

-If you want to do away with name mangling entirely, you can use the action -nameparse=full. This uses the full -header as the sequence name. But note that if it contains spaces, the -resulting alignment files may not be readable by downstream tools. - -

-The above discussion applies to ordinary DNA sequences in FASTA, Nib, or -2Bit format. HSX index files -are handled differently: by default LASTZ uses the name from the index as-is, -without shortening it, -and the various nameparse actions are not -allowed. The nickname action can be used, -but is generally not -necessary since you can store the names you want directly in the index. - -

-Note that if the -subset=<names_file> action is -used, the names in the <names_file> must match the mangled -(or indexed) names. - -

-For FASTA files, more complicated name mangling can be performed using standard -Unix command-line tools. In the second example above, we could pipe the input -through sed a couple times to shorten each name to the NCBI -accession numbers NM_001133512.1, XM_001716177.1, -and BC006342.2. -

-

-    cat query_file.fa \
-      | sed "s/>.*ref\|/>/g" \
-      | sed "s/>.*gb\|/>/g" \
-      | lastz target /dev/stdin
-
- - - - - -
-

Seed Patterns

- -

-Seeds are short near-matches between the target and query sequences, where -"short" typically means less than 20 bp. Early alignment programs used exact -matches (e.g. of length 12) as seeds, but spaced seeds can improve -sensitivity when the sequences are diverged. - -

-A spaced seed pattern is a list of positions, in a short word, where -a seed may contain mismatches. For example, consider the seed pattern -1100101111. A 1 indicates a match is -required in this position, and a 0 indicates a mismatch is allowed -(effectively it is a "don't care" position). As the example below shows, using -this seed pattern, the seed word GTAGCTTCAC hits twice in the -sequence ACGTGACATCACACATGGCGACGTCGCTTCACTGG. -

-

-        target:  ACGTGACATCACACATGGCGACGTCGCTTCACTGG
-    (mis)match:    ||XX|X||||          ||X|||||||
-         query:    GTAGCTTCAC          GTAGCTTCAC
-       pattern:    1100101111          1100101111
-
-

-Spaced seeds have been shown to be more sensitive than exact match seeds, with -little change in specificity. This is most advantageous when the sequences -have lower similarity, such as human vs. mouse or chicken. Which seed pattern -is best depends on the sequences being compared. See -[Buhler 2003] for a discussion of spaced seeds and -how to design them. - -

-LASTZ’s seeding options give the -“user” many choices. The intent is that these will be selected by -some program (hence the quote marks around “user”), but they are -available from the command line for anyone. - -

N-mer match:

-A space-free seed can be specified by the length of the N-mer match required. -
-    --seed=match<length>
-
- -

General seed patterns:

-Any spaced seed pattern can be specified. The pattern is a string of -1s, 0s, and Ts, where a 1 -indicates that a match is required in that position, a 0 indicates -that a mismatch is allowed, and a T indicates that a mismatch is -allowed only if it is a transition (A↔G or C↔T). -
-    --seed=<pattern>
-
-The default seed is ‑‑seed=1110100110010101111, which is the same -12-of-19 seed used as the default in BLASTZ. - -

Half-weight seed patterns:

-If a seed pattern consists of only 0s and Ts, it is -implemented internally as a half-weight seed, which uses much less memory -(the same amount as a normal seed pattern half as long). Additionally, -‑‑seed=half<length> can be used as shorthand to specify a -space-free half-weight seed (i.e., all Ts). - -

Single, double, or no transitions:

-By default, one match position (a 1 in a spaced seed, or any -position in an N-mer match) is allowed to be a transition instead of a true -match. ‑‑notransition disables this. Alternatively, -‑‑transition=2 allows any two match positions to be -transitions. - -

Filtering on transversions and matches:

-The ‑‑filter option imposes additional requirements on the number -of transversions and matches in a valid seed. This is especially useful in -conjunction with half-weight patterns. For example, -
-    --seed=TTT0T00TT00T0T0TTTT --filter=2,15
-
-specifies the same pattern as the default seed, but allows the twelve -T positions to be matches or transitions, requires at least -fifteen matches total (among the 19 positions), and allows at most two -transversions. Note that the transversions can only occur in the -0 positions, since the T positions allow only matches -or transitions. -And although there are seven 0 positions, five of -them must contain matches or transitions since only two transversions are -allowed. - -

Twin hit seeds:

-The sensitivity of the seed can be decreased by ignoring seeds that don't -have a second hit nearby, i.e. by requiring two seeds on the same diagonal. -
-    --twins=[<minsep>..]<maxsep>
-
-The distance between the hits (the number of bases between the end of the -first hit and the beginning of the second) must be at least -<minsep> but not more than <maxsep>. -If <minsep> is omitted, zero is used (which means the -twin seeds may be adjacent but not overlap). Negative values can -be used; for example ‑‑twins=‑5..10 -means the twins can overlap -by as much as 5 bases or can have as much as 10 bases between them. - - - - -
-

Any-or-None Alignment

- -

-Sometimes, the only answer you want from an aligner is whether a query has -any strong alignments to the target or not. For example, you may want to know -which reads in a sequencing run have no alignment with a reference -genome. In this case, if a read aligns to a thousand different places on a -particular chromosome, you aren't interested in learning where — all you -want to know is whether it aligned or not. - -

-The ‑‑anyornone option is designed -for such cases, and can significantly improve alignment speed. Once any -qualifying alignment has been found, processing for the current query is -halted. The alignment is reported to the output, and then we immediately begin -processing the next query. A qualifying alignment is one that would normally -be output given the other parameter settings; for example it satisfies the -scoring thresholds (‑‑hspthresh -and/or ‑‑gappedthresh) and any -back-end filters. - -

-To get a list of reads that have at least one "good" alignment with a reference -sequence, you could do something like this: -

-    lastz <reference> <reads> --anyornone  \
-      --step=10 --seed=match12 --notransition --exact=20 --noytrim \
-      --match=1,5 --ambiguous=n \
-      --filter=coverage:90 --filter=identity:95 \
-      --format=general:name2
-
- - -

-This option slightly changes the usual processing order described in the -Overview. Instead of performing gap-free extension -on all seeds, collecting them into a list of HSPs, and then performing gapped -extension, each HSP is gap-extended and back-end filtered immediately. This -avoids wasted work to perform -complete -early stage processing on hits that will -just be abandoned as soon as the first qualifying alignment is found. - - - -

-

Y-drop Mismatch Shadow

- -

-The default configuration of gapped extension in LASTZ is to end the alignment -where the score would be the highest. This means that any prefix or suffix of -the alignment will have a non-negative score. While this is appropriate for -alignments that lie somewhere in the middle of two long sequences, it is not -desirable when an alignment is near the end of one or both sequences, which -happens quite often when aligning short reads. - -

-Consider the following alignment of a 50-base query to a chromosome target, and -suppose we are using ‑‑match=1,5, -‑‑gap=6,1, -‑‑filter=identity:97, and -‑‑filter=coverage:95. The entire -alignment as shown has 97.9% identity (46/47) and 100% coverage. However, the -first five bases (AGAAC vs. AGAAG) have a negative -score: four matches at +1 each and one mismatch at −5 gives a score -of −1 for this prefix. The highest scoring alignment is from positions -6 through 50, for a score of 33 (the entire alignment scores only 32). If -we stop the alignment at the highest score, coverage drops to 90%, and the -alignment is discarded. The overall result is that we will discard reads that -we don't want to, and we will see a bias against mismatches near the ends of -reads. (Note that this anomaly arises because the alignment is terminated -abruptly by the end of the sequence rather than normally by a low-scoring -region; also the ‑‑filter=coverage option is more commonly used -with short reads than with longer sequences.) - -

-

-    target:  ... CTTAGAACGGTAGATACTTGTATAAT---CGAGGGGGTTATTTTGTACAAATGACT ...
-                    ||||X||||||||||||||||||   ||||||||||||||||||||||||
-     query:         AGAAGGGTAGATACTTGTATAATCAACGAGGGGGTTATTTTGTACAAATG
-                         ↑                                           ↑
-                    12345678901234567890123456789012345678901234567890
-
- -

-To avoid this behavior, use the -‑‑noytrim option when aligning short -reads. This causes LASTZ to refrain from trimming such alignments back to the -highest-scoring location. Specifically, if the -gapped extension process encounters the end of the -sequence, it will keep that as the end of the alignment. In this case a -negatively-scoring prefix or suffix will be kept as long as it does not score -worse than the ‑‑ydrop value. - - - -

-

Shingle Overlap

- -

-In some applications, e.g. when assembling reads into contigs, we want to -determine how sequence ends overlap each other. For example, in case 1 below, -the starting portion of the query overlaps the ending portion of the target by -30 bases, and both sequences extend beyond each other in opposite directions. -We call this situation "shingling" (like shingles on a rooftop), and the -shingle field of the General output -format provides a measurement of it. A positive value indicates that the -starting portion of the query overlaps the ending portion of the target (case -1), while a negative value indicates the roles are reversed (case 2). If -neither of these cases occurs (e.g. if either sequence fails to extend beyond -the other), an NA is reported. - -

-Case 1 (shingle = +30): -

-                                                    target_end
-                           3         2         1        ↓
-                           098765432109876543210987654321
-    target:  ... GACGGCGGCTAACACATTGTGTTGXACGTACCATAACCAA
-                           ||||||X|||||||||XX||X||||||
-     query:                AACACAGTGTGTTGCAACTATCATAACATTAAACTTTAGA ...
-                           123456789012345678901234567890
-                           ↑        1         2         3
-                      query_start
-
- -

-Case 2 (shingle = −30): -

-                     target_start
-                           ↓        1         2         3
-                           123456789012345678901234567890
-    target:                TCCCTAATAAATCTTAAGTGCGATCCGCAGCGAGGTGTAC ...
-                              ||||X|||||||||X||||||||X||
-     query:  ... TGGCGCCTGTAGTCTAAGAAATCTTAATTGCGATCCACAC
-                           098765432109876543210987654321
-                           3         2         1        ↑
-                                                    query_end
-
- -

-Note that the value reported has no relation to the number of bases that align -in that region, nor is it an indication that the alignment extends all the way -to the start or end of the sequences. The shingle value is just evidence that -the proper registration of the two reads is to overlap them by the given value -— information that an assembler might use in assembling those reads into -a contig. - - - -

-

Using Target Capsule Files

- -

-Target capsule files are provided to improve run-time memory utilization when -multiple CPU cores on the same computer are running LASTZ with the same target -sequence. They permit the lion’s share of the large internal data structures -to be shared between the processes. This allows more copies of LASTZ to be run -simultaneously with less physical memory, which can improve the throughput, for -example, when mapping a large set of reads to a single (large) reference -sequence. - -

-To create a capsule file, use a command like this: -

-    lastz <target> --writecapsule=<capsule_file> [<seeding_options>]
-
-Applicable seeding options are -‑‑seed, -‑‑step, -‑‑maxwordcount, -and ‑‑word. - -

-To use the capsule file, run LASTZ like this: -

-    lastz --targetcapsule=<capsule_file> <query> [<other_options>]
-
-No additional effort on the part of the user is required to handle sharing of -the capsule data between separate runs. Nearly all options are allowed; -however the seeding options -‑‑seed, -‑‑step, -‑‑maxwordcount, -and ‑‑word -are not allowed, since these (or their byproducts) are already stored in the -capsule file. Further, ‑‑masking -is not allowed, because it would require modifying both the target sequence and -the target seed word position table, which are contained in the capsule. - -

-Internally LASTZ asks the operating system to directly map the capsule file -into the running program’s memory space -in a read-only fashion. Multiple running instances can map -the same file; each instance will have its own virtual addresses for the -capsule data, but the physical memory is shared. There is no requirement for -more than one instance to actually use the capsule simultaneously. Running -a single copy of lastz with ‑‑targetcapsule will work -fine, and in fact there may be a small speed improvement compared to running -the same alignment without a capsule. - -

-The downside of this technique is that the capsule files are very large and are -also machine-dependent. For example, the file for human chromosome 1 is about -1.4 Gb. Note that attempts to run a capsule built on a mismatched computer are -detected and rejected. - - - -

-

Inferring Score Sets

- -

-Scoring inference is an automated method for determining appropriate -substitution scores and/or gap penalties directly from the sequences being -aligned. The resulting scoring parameters can be saved to a file and/or used -immediately to align the sequences. Generally these depend mostly on the -species rather than particular regions, so once a suitable scoring set has been -obtained for a pair of species, the inference does not need to be -performed for each alignment run. In this section we give a brief overview -of the inference process; see [Harris 2007] for a -more detailed description. - -

-Inference is achieved by computing the probability of each of the 18 different -alignment events (gap open, gap extend, and 16 substitutions). -These probabilities are estimated from alignments of the sequences. Of course, -at first we don't have alignments, so we start by using a generic scoring set -to create alignments, infer scores from those, then realign, and so on, until -the scores stabilize or "converge". Ungapped alignments are performed until -the substitution scores converge, then gapped alignments are performed (holding -the substitution scores constant) until the gap penalties converge. - -

-To have LASTZ infer scoring parameters, use -a suitably enabled build of LASTZ (see below), and specify -the ‑‑infer or -‑‑inferonly options. (The latter -will stop after inferring the parameters, without performing the final -alignment.) Settings for the inference process can be specified in a -control file included with these options. - -

-The ‑‑infscores option causes the -inferred scoring parameters to be written out to a separate file. If no -<output_file> is specified, it is written to the header -of the alignment output file, as a comment. As a last resort, if no alignment -is performed the scoring set is written to stdout. The parameters -are written in the same format used to input scoring -sets. - -

-Usually it is undesirable to use all alignment blocks for inference. Blocks -with a high substitution rate (low identity) are likely to be false positives. -On the other hand, blocks with few substitutions (high identity) will be found -regardless of what scoring parameters are used. Thus it is desirable to base -the inference only on statistics from a mid-range of identity. By default the -middle 50% is used (that is, the 25th through 75th percentile from the identity -distribution), but this can be changed in the control file. - -

-

Special Builds Required:

-Since the inference is an iterated process, greater accuracy can be achieved -by using the floating-point version of LASTZ (lastz_D). Moreover, -the technique used to infer gap penalties has not yet been shown to select good -values, so the author recommends that users only employ inference for -substitution scores. To encourage these recommendations, the scoring inference -code is blocked from operation in the integer scoring version of LASTZ -(lastz), and gap penalty inference is blocked in both versions. -Special build options are available to defeat the blocks; contact the author -if you are interested. - - - -
-

Dynamic Programming Matrix

- -

- -

    -
  • Dynamic programming in general is a time-saving algorithm for computing - values that can be expressed via a recurrence relation - [Bellman 1957]. -
  • It has long been used for affine gap alignments of DNA and protein - sequences; see e.g. [Gusfield 1997]. -
  • It uses a matrix of sequence positions to store partial results; early - cells are used to compute later ones to avoid redundant work. -
  • Even for stages that do not involve gaps and do not actually use a DP - algorithm, the matrix is helpful as a conceptual tool because of its - strong correspondence with the dot-plot paradigm for visualizing - sequence alignments. -
  • Here we use the convention of representing the target sequence from - left to right along the horizontal axis (columns of the matrix), and - the query sequence from bottom to top along the vertical axis (rows - of the matrix); see e.g. Figure 5(a) for an - example. -
  • Gap-free alignment segments lie along diagonals of the - matrix/dotplot, in the forward (slope=+1) or reverse (slope=−1) - orientation; the latter typically indicates an alignment on the reverse - strand. -
  • Gaps bump the alignment to a different diagonal, since there is a - progression in one sequence but not in the other; this has the effect - of making gapped alignments look like diagonal-trending squiggly lines - when drawn at low resolution. -
  • Diagonals are characterized by a constant difference between the target - and query positions of their cells (or for a reverse diagonal, a constant - sum). -
  • This matrix is not to be confused with the substitution scoring matrix, - whose rows and columns correspond to characters rather than to sequence - positions. -
- - - -
-

Filtering With Shell Commands

- -

-Though LASTZ provides several filtering options (e.g. -‑‑filter=identity, -‑‑filter=continuity, -‑‑filter=coverage, -‑‑filter=nmatch, -‑‑filter=nmismatch, -‑‑filter=ngap and -‑‑filter=cgap), - sometimes these -are not sufficient for the task at hand. But in many cases it is still possible -to perform the desired filtering by using the -‑‑format=general option in conjunction -with a simple -awk, -perl, or -python script. Here we show one such -example, using awk. -

-Suppose we want to filter alignments by length, discarding anything shorter -than 500 bp, and that we need AXT output for downstream processing. We can -have LASTZ output whatever columns are necessary to reproduce AXT and use awk -to perform the filtering and reconstruct an AXT file. -

-Looking at the - -UCSC AXT specification, the corresponding --format=general -fields are shown in the table below. Note that when determining which fields -are needed for a given format, care has to be taken to make sure to get the -correct start and end fields. Different formats count from zero instead of -one, and some count reverse-strand positions along the plus strand. The -interval coordinates section provides more detail -about possible numbering schemes. -

- - - - - - - - - - - - - - -
AXT field field for ‑‑format=general
Alignment number (none)
Chromosome (primary organism) name1
Alignment start (primary organism) start1
Alignment end (primary organism) end1
Chromosome (aligning organism) name2
Alignment start (aligning organism) start2
Alignment end (aligning organism) end2
Strand (aligning organism) strand2
Blastz score score
Sequence line (primary assembly) text1
Sequence line (aligning assembly) text2
-

- -Then we can perform our filtered alignment with a series of commands like this: - -

-  lastz target.fa query.fa \
-     --format=general:name1,start1,end1,name2,start2,end2,strand2,score,text1,text2 \
-   | grep -v "^#" \
-   | awk '{ if ($3-$2+1 >= 500) print $0 }' \
-   | awk 'BEGIN { n=-1;} { print ++n,$1,$2,$3,$4,$5,$6,$7,$8; print $9; print $10; print ""; }' \
-   > filtered.axt
-
- -The grep command discards the line containing column headers. -

-The first awk command computes the alignment length in the target, and if it is -at least 500, copies the line to the output. $3 is -end1 and $2 is start1. Since these -represent a closed interval, we have to add 1 to get the length. $0 -represents the entire input line. -

-The second awk command converts the alignment from a single line into four lines -required for AXT. We use an awk counter, n, to create the -alignment number field. The other fields are copied from the fields output by -LASTZ. - - - -

-

Self-Masking a Sequence

- -

-For many alignment problems it is desirable to ignore alignments that consist -solely of genomic repeats. For this reason, most finished genomic assemblies -are soft-masked — bases that are part of identified repeats are in -lowercase. LASTZ’s seeding stage avoids seed hits in lowercase, and -thereby avoids finding alignments that are solely repeats. -

-With the wider availability of less-expensive DNA sequencing and custom -assemblies, it is more common for users to have unannotated sequences. The -following describes how LASTZ can be used to crudely identify duplications -and soft-mask the original sequence. This process is called -self-masking. -

-The self-masking process works by looking for alignments of the sequence with -itself, and makes use of the dynamic masking feature to reduce computational -time. The sequence is split into overlapping fragments, on the fly, which are -then aligned against the entire sequence. As duplications are discovered they -are marked as such and removed from the seeding process. -

-A command to perform self-masking would look like this, where critter.fa -contains the sequence to be masked. - -

-  cat critter.fa \
-    | ../tools/fasta_fragments.py --fragment=200 --step=100 \
-    | lastz critter.fa[multiple,unmask,nameparse=darkspace] /dev/stdin --masking=3 \
-        --progress+masking=10K \
-        --format=none --outputmasking+:soft=critter.masking.dat \
-        --notransition
-
-

-In that command, lastz is given the whole “critter” as its -target sequence, and overlapping 200bp fragments of the critter as the -queries. -

-The multiple action tells lastz to -allow more than one sequence in the file, and the -unmask action tells lastz to -ignore any softmasking that may be present in the file. (If your sequence -already has had some masking performed, and you want to keep that, omit -unmask.) The -nameparse=darkspace action tells -lastz to extract the first non-whitespace string from the sequence header line. -This is necessary to ensure that the final step -(fasta_softmask_intervals) will see the same sequence names in the -masked intervals file as those in the sequence file. -

-The ‑‑masking=3 option enables -dynamic masking, which will mark any reference base appearing in 3 or more -alignments. Since the fragments overlap by a factor of two, we expect every -base will appear in two trivial alignments. Any more than that would be caused -by a duplication elsewhere. -

-The ‑‑progress+masking -option causes lastz to give you a progress report after every 10 thousand -fragments. These reports come to the console (stderr) and look like this: -

-    (16.933s) processing query 50,001: critter_21299501, masked 8,920,893/51,304,566 (17.4%)
-
-

-The ‑‑format=none option inhibits the -normal alignment output and -‑‑outputmasking+:soft -tells lastz to write the final masked intervals to a file. -

-The final line (‑‑notransition -in this example) is whatever alignment scoring parameters you want to use. -What is appropriate will depend on the level of divergence you want to allow in -the masked duplications. -

-The command to apply the masking intervals to the fasta file will look like -this: -

-  cat critter.fa \
-    | ../tools/fasta_softmask_intervals.py --origin=1 critter.masking.dat \
-    > critter.softmasked.fa
-
- - - -
-

Aligning Many Subintervals

- -

-There are many occasions when you have a general idea of where the alignments -you are interested in are, and it seems computationally wasteful to align -entire sequences just to find a relatively few alignments. For instance, you -may have identified some alignments using fast, high sensitivity settings and -now want to look for alignments with higher divergence in the remaining -regions. Or you may have previously found alignments but did not collect all -the fields you needed. Or perhaps you used some tool other than LASTZ to -identify regions where you want to focus your search. Or you may have gapped -alignments from some other tool, and want to compare them to LASTZ's alignments -in the same subsequences. - -

-There are many ways to solve such a problem, and LASTZ provides several options -to support these needs. Here we describe and critique several different -approaches. - -

-Sequence masking. -LASTZ can use masking to eliminate the possibility -of alignments in (or not in) a given list of intervals. So we can create two -files, containing the desired intervals in the target and query, then run one -LASTZ job. -

-The disadvantage of this solution is that you may get alignments between -unintended interval pairs (for example, an alignment between the first target -interval and the fifth query interval). Some post-processing would be -necessary to eliminate these. Moreover, LASTZ will be spending a lot of its -time looking for alignments in those unintended interval pairs. - -

-Separate files. -The simplest solution, conceptually, is to preprocess the sequences to extract -the intervals of interest into separate files, then run each pair of files as -a separate LASTZ job. -

-The disadvantages of this solution are numerous. There is extra I/O involved -in splitting the files, and extra overhead in repeatedly launching LASTZ. -Further, depending on your needs, post-processing may be necessary to map -alignment positions back to the original sequences. - -

-Subranges and subsets. -The separate files solution can be improved upon by letting LASTZ mimic the -file splitting, internally. This can be accomplished with the -subrange and subset -actions, while still running each as a separate lastz job. -

-This improves upon external file separation by eliminating some I/O, and -eliminating the need to post-process to map positions. However, it still -suffers the extra overhead of repeatedly launching LASTZ. - -

-Alignment chores. -Another solution is to use an alignment chores file -(specified with the -‑‑chores=<file> option). The -chores file corresponds directly to the interval pairs of interest. Complete -alignment is performed over the regions defined by each pair (subject to -whatever other options have been set), and is blocked from extending beyond the -region. -

-There is little downside to this solution. The reported results will be -the same as for the post-processed separate files solution, or subrange/subset -solution (but with minor variations such as shifting of equally-scoring gap -placements). There is some computational waste in the chores -solution, but this is much less costly than the repeated launching. -

-In most cases, a chores file will be preferred over an anchors file. - -

-Anchor segments. -Another solution is to use an anchor segments file -(specified with the -‑‑segments=<file> option). The -anchors file does not correspond directly to the interval pairs of -interest. Instead, you will need to have same-length intervals in target and -query. Typically this will be a single point somewhere in the region. -Alignment is anchored at this point. Gapped alignment is performed in both -directions from that point, and is not restricted to the region. -

-The main disadvantage of this solution is lack of versatility. It is most -appropriate when used in conjunction with an external anchor-identifying method. -

-It can also be useful in cases where you want to run ungapped alignment with a -different scoring scheme than for gapped alignment. An ungapped run of LASTZ -creates the anchors (segments) file, and a second run uses those as anchors for -gapped alignment with a different scoring scheme. - - - - - - -


-
-

Differences from BLASTZ

- -
    -
  • BLASTZ had a "short-by-two" error, which has been corrected in LASTZ. In -many cases, BLASTZ shortened alignments by two bases on either or both ends. -

    -

  • -BLASTZ had a problem with -premature alignment termination; -this has been corrected in LASTZ. -

    -

  • BLASTZ used the ydrop value from the main alignment as the xdrop value for -interpolation; this has been corrected in LASTZ. -

    -

  • -BLASTZ had a problem when -ydrop is less than the penalty for a one-base gap; -this has been corrected in LASTZ. -

    -

  • -BLASTZ chaining had a problem that caused it to -discard very high-scoring HSPs; -this has been corrected in LASTZ. -

    -

  • -The handling of ties in the DP matrix was unspecified in BLASTZ. This has -changed in LASTZ, which specifically prefers a longer alignment to a shorter -alignment with the same score. This change reflects the use of LASTZ to align -short reads, and desire to align as much of the read as possible. -

    -

  • -The handling of bounding alignments in the DP matrix is different in LASTZ than -in BLASTZ. This is discussed in -Bounding Alignments in the DP Matrix. The -‑‑allgappedbounds option can be -used to revert to the bounding criteria used in BLASTZ. - -

    -

  • -The handling of ambiguous nucleotides has been clarified in LASTZ, and in some -cases the default behavior is different than in BLASTZ. By default, BLASTZ -allowed IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, W, and -Y) in fasta sequences but was unclear about how these were scored. -Since we feel the user should be aware of how these bases are treated, LASTZ -rejects them by default. The -‑‑ambiguous=iupac option permits them -but treats them the same as an ambiguous N. This is discussed in -Non-ACGT Characters. - -

    -

  • LASTZ can produce a variety of alignment output formats such as AXT, MAF, -and human-readable text, as well as BLASTZ’s LAV format. -

    -

  • LASTZ can take the guesswork out of selecting alignment scoring parameters -by inferring them for you, based on its analysis of the input sequences. -

    -

  • LASTZ provides a large variety of seeding options. -
- - - -
-

Bounding Alignments in the DP Matrix

- -

-During the gapped extension stage, LASTZ processes the anchors in order of -score (highest scoring anchor is extended first, and so on). As anchors are -extended, a list of bounding alignments is constructed. These correspond to -paths in the DP matrix. Bounding alignments created for higher-scoring anchors -are used to bound the possible DP paths that lower-scoring anchors can take. -This prevents alignments from crossing each other. - -

-In BLASTZ, every gapped extension became a bound, and this was originally the -default behavior in LASTZ, through release 1.1.52. However, this caused LASTZ -to miss some alignments which it should have found. The failure case occurred -as follows. A high-scoring anchor is extended but fails to meet the score -threshold. But it gets added as a bound. Then the extension of a lower-scoring -anchor is prevented from crossing or intersecting with that path, and it too -gets discarded even though it might score highly enough. This could occur, -for example, when two extensions would (in the absence of each other) share the -same tail, and the higher-scoring of the two has a lower-scoring anchor. - -

-The correction for this is to only use alignments as bounds if they satisfy the -score threshold. This corrected behavior is now the default in LASTZ (as of -release 1.02.00). The -‑‑allgappedbounds option can be -used to revert to the bounding criteria used in BLASTZ. - - - - - - -


-
-

Change History

- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ReleaseDateChanges
1.0.1Jul/28/2008 -Initial release. -
1.0.5Aug/2/2008 -Fixed a bug that in some cases caused a bus error when interpolated -alignments (e.g. ‑‑inner=…) were used with multiple -queries. -
-Added xmask=<file> and nmask=<file> -file masking actions. -
1.0.21Sep/9/2008 -Fixed a bug involving the default value for ‑‑gappedthresh -(a.k.a. L) when ‑‑exact is used. The bug caused the -gapped threshold to be inordinately low, allowing undesirable alignment blocks -to make it to the output file. -
-Fixed a bug whereby Xs and Ns were treated as desirable substitutions when -unit scores (e.g. ‑‑match=…) were used. -
-Re-implemented ‑‑twins=…. The previous implementation -improperly truncated the left-extension of HSPs. The new implementation is -slower and uses more memory. -
-Added ‑‑census=<file>. The census counts the number of -times each base in the target sequence is part of an alignment block. -Previously, ‑‑census produced a census only if the output format -was LAV (the census is a special stanza in a LAV file). Otherwise the option -was ignored. Now, if a file is specified a census is written to that file. -The format of lines in the census is -<name> <position> <count>. -The position is one-based, and the count is limited to 255. -

-In situations where 255 is too limiting, ‑‑census16=<file> -or ‑‑census32=<file> can be used, with limits of about -65 thousand and 4 billion, respectively. Note that these will respectively -double and quadruple the amount of memory used for the census. The default -census uses one byte per target sequence location. -

-Added ‑‑format=<differences>, to support Galaxy. All -differences (gaps and runs of mismatches) are reported, one per line. -
-Added ‑‑anchors=<file> (eventually this was renamed to -‑‑segments=<file>), giving the user the ability to bypass -the seeding and gap-free extension stages. -
-Changed default gap penalties for unit scores (e.g. -‑‑match=…) to be relative to mismatch score (instead of -match score). -
-Made the <start>#<length> file subrange action -better at checking errors, and also allowed <length> to use -units such as M and K. -
-Sped up program exit by no longer freeing dynamically allocated memory. -
1.1.0Dec/5/2008 -Improved x-drop extension to better handle suboptimal HSPs. Left-extension -now starts at the right end of the seed (rather than the left end). This -reduces the chance that the extended region (the combination of left and right -extensions) will score less than some subinterval. -
-Changed coverage filtering so that it is relative to whichever sequence is -shortest. Previously it was always relative to the query. -
-Changed defaults for xdrop and ydrop when ‑‑match scoring is -used. -
-Interpolation now uses the xdrop value from the main alignment. -Previously it used the ydrop value to match BLASTZ, but we have decided that -was a bug in BLASTZ. -
-Added general output format. -
-Added ‑‑maxwordcount. -
-Added ‑‑notrivial. -
-Corrected problem with ‑‑subset action, which wasn't using -mangled sequence names. -
-Fixed problem in writing LAV m- and x-stanzas. -
-Blocked the use of scoring inference in the integer build, and blocked gap -scoring inference in all builds. -
-Changed much of the syntax for options and actions. The newer syntax is -clearer and more consistent than the older. The older is still supported by -the program so that existing scripts will still work, but it is not -documented. -
-Changed reporting of duplicated options from -can't understand "<option>" to -duplicated or conflicting option "<option>". -
-Added ‑‑format=rdotplot option. -
1.1.25Feb/5/2009 - Fixed a bug that caused some gapped -extensions to be terminated prematurely. In some cases this also allowed a -nearby low-scoring alignment to "piggyback" onto the remainder of a terminated -alignment, gaining enough in score to pass the score threshold. -
-Added support for target capsule files. -
-Added support for ‑‑format=cigar. -
-Added the <center>^<length> sequence interval -specifier. -
-Corrected the behavior of ‑‑exact regarding lowercase and -non-ACGT characters. ‑‑exact now considers, e.g., a lowercase A -to be a match for an uppercase A. Further, any non-ACGT characters now stop -the match. -
-Improved detection and reporting of memory allocation overflow. Two -problems were fixed as part of this: (1) allocation of single blocks larger -than 2 Gb was being rejected even on platforms that could support larger -blocks, and (2) an allocation overflow problem which could cause a segfault for -target sequences longer than about 1 Gb (these require allocation of a block -larger than 4 Gb). -
-Changed the behavior when encountering an empty sequence in a file with -many sequences. Previously this was reported as an error, and the program -halted. Now it is reported as a warning (to stderr), and the -program continues. -
-Added the ‑‑output option. In some batch systems, it is -difficult to redirect stdout into a file, so this option allows -the user to do it directly. -
-Removed ‑‑quantum and ‑‑code options, replacing -them with the quantum and quantum=<code_file> -sequence specifier actions. This is in preparation for allowing a quantum -target sequence. -
1.1.50Mar/16/2009 -Fixed two problems with exact-match extension. First, when both target and -query used the multiple sequence specifier action, exact match -extension was able to skip the boundary between sequences (this problem was -introduced in 1.1.25). Second, when the exact match should have extended to -the end of the sequence, it was being cut short by 1 bp (on either end). The -latter problem was only evident for ‑‑nogapped; a gapped extension -recovered the additional bases. -
-Fixed several problems with ‑‑segment=<file>. First, if -the file contained more than 4,000 segments, on some platforms the program would -segfault. Second, if a sequence subrange was being used, the limit test -comparing the segment interval to the subrange was incorrect. Third (if the -user was lucky enough to avoid the first two problems), if a segment was on the -negative strand it was improperly mapped to the subrange. -
-Added ‑‑noytrim to prevent y-drop mismatch shadow, improving -LASTZ’s ability to align short reads. -
-Set the default gapped extension score threshold to inherit the lowest HSP score in the -case where ‑‑hspthresh=top<basecount> or -‑‑hspthresh=top<percentage>% is used but -‑‑gappedthresh=<score> is not (and gapped extension is -performed). Previously this case was trapped by a low level routine and the -alignment was halted. -
-Fixed a problem with the start2+ field of -‑‑format=general. The position was left blank for alignments on -the + strand. -
-Fixed a problem in which ‑‑writecapsule was rejected if -‑‑seed=match<length> was used. -
-Fixed a problem related to name mangling which caused an "internal error" to -be reported. -
-Fixed a problem whereby single-symbol identifiers were not recognized in -quantum code files. -
-End of sequence limit checking for <start>#<length> -and <center>^<length> sequence specifier actions is -now "soft". If the resulting interval is beyond the end of the sequence it is -truncated. -
-Changed how ‑‑format=cigar reports alignments on the negative -strand. Apparently there is no complete spec for CIGAR format. Matching what -I see output by exonerate for certain cases is the best I can do. -
-Quantum code files can now specify probabilities as fractions. This gives a -clearer representation for motif-like sequences derived from a multiple -alignment. -
-Added cigar field for ‑‑format=general. -
-Added shingle field for ‑‑format=general. -
-Added the ‑‑rdotplot=<file> option. -
-The ‑‑notrivial option now works with the multiple -sequence specifier action. -
-Added ‑‑markend. -
-Added nameparse=darkspace. -
-Modified the build process to accommodate the Solaris platform. -
1.1.52Mar/24/2009 - Fixed a bug that occurred when ydrop was less -than the penalty for a one-base gap (the sum of open and extend penalties). In -this case, a bug in the initialization of the DP matrix resulted in no -gapped alignments ever being found. -
-Fixed a problem with the combination of ‑‑recoverseeds and ‑‑exact. -Recovered seeds were cut short by one base on the left end. -
-Added ‑‑format=segments option. This was later replaced by -‑‑writesegments. -
-Added a workaround in the source code for what appears to be a bug in gcc -4.3.2 (see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37861). Without the -workaround, the build fails with this message: -
-    quantum.c: In function 'generate_dna_ball':
-    quantum.c:347: error: array subscript is above array bounds
-
-The workaround uses an ifdef that specifically targets gcc 4.3.2. -
-1.02.00Jan/12/2010 -Relaxed the rejection of some output formats, which was too aggressive. -Specifically, runs with ‑‑tableonly were rejected because of -output format, even though no output would be generated in that format. -
-Added the ability to set the ‑‑maxwordcount option as a -percentage. Also, ‑‑maxwordcount=<limit> now allows -<limit> to be 1. Previously it was not allowed to be less -than 2. -
-The scoring matrix used during x-drop extension now reflects the use -of ‑‑ambiguous=n. Previously, this matrix was not affected by -‑‑ambiguous=n, -and N-vs-N matches and N-vs-other matches were scored as -100 (more -specifically, as fill_score) during gap-free extension. This -caused LASTZ to miss some HSPs, usually those containing an N-vs-N match, since -the HSP was terminated at that match and didn't meet the score threshold. This -has been corrected. -
-Added support for HSX indexes, to support random access into FASTA files. This -improves the speed of aligning a single read (from a file of half a million) by -a factor of about 12. -
-Added ‑‑softmask=<mask_file> file action to permit -soft masking of specified intervals. Also added masking of the -interval complements — -‑‑xmask=keep:<mask_file>, -‑‑nmask=keep:<mask_file>, and -‑‑softmask=keep:<mask_file>. These make it easier to -restrict alignment to several specified intervals of a sequence. -
-Enabled the use of ‑‑filter=[<transv>,]<matches> -for non-halfweight seeds. Previously, ‑‑filter had only been -tested for half-weight seeds, but was erroneously prohibited for -all seeds (instead of just prohibiting non-halfweight seeds). Further, it -was not properly implemented for seed-only output (‑‑nogfextend -‑‑nogapped). These have all been corrected, and ‑‑filter -is now available for all seed types. -

-Also corrected the behavior of ‑‑filter regarding lowercase and -non-ACGT characters. ‑‑filter now considers, e.g., a lowercase -a to be a match for an uppercase A. Further, for the -purposes of ‑‑filter, any non-ACGT characters are considered to be -transversions. -

-Also changed the behavior when the <transv> field is absent. -This is now interpreted as unlimited transversions. Previously it meant that -no transversions were allowed. This should be a safe change in behavior since -it was (unintentionally) not possible for users to access this feature -previously. -

-Added a compile-time directive compileForWindows to make -appropriate behavioral adjustments for running on a Windows machine. -Currently this only affects the handling of file paths. To activate it, -the user must add -DcompileForWindows to the definition of -definedForAll in -.../lastz‑distrib‑X.XX.XX/src/Makefile. -
-Fixed chaining of seed hits. Previously, if ‑‑nogfextend and -‑‑chain were used together, nothing was output. This was due to -the fact that unextended seeds had no scores, and the chaining algorithm only -reports chains with positive score. This has been corrected by calculating -scores (as the sum of substitution scores) over anchor segments whenever (a) -the segments have not had scores computed for them, and (b) scores are required -for later processing. -

-This change may also affect (for the better) the results of gapped extension -when either ‑‑nogfextend or ‑‑exact is used. Gapped -extension processes the anchors highest score first. Since -‑‑nogfextend left all scores zero, the actual order in which gapped -extension was performed in that case was dependent on how the sort routine (the -C runtime routine qsort) deals with ties. For ‑‑exact, the score -was the length of the match. This has been changed to the segment’s -substitution score. -

-Changed ‑‑format=segments to -‑‑writesegments=<file>. -
-Added M-mismatch extension. -
-Added the replacement of {number} in sequence nicknames. -
-Added support for continuity reporting and -filtering. -
-Added support for match count -filtering. -
-Fixed a bug in handling subrange actions for nib files. The problem occurred -when the subrange action was of the form <start>.. and -<start> was even. That is, no <end> was -specified, and LASTZ is supposed to use the remainder of the sequence. LASTZ -miscalculated the length of the interval, making it one base longer. If the -actual full sequence length was odd, this resulted in an extra T -being appended to the sequence data. If the full sequence length was even, -LASTZ quit, reporting that it was unable to read the sequence. Note that this -only happened for .nib files, only when <start> was even, -and only when no <end> was specified. -
-Added the subsample action. -
-Added the ‑‑anyornone option. -
-Added ‑‑allgappedbounds. -
-Fixed a bug in exact and mismatch extension and queries using the -multiple action. It was possible for an HSP to cover parts of -two different queries. -
-Fixed an overflow bug in the chaining -algorithm. Due to numerical overflow, very high scoring HSPs were treated as -negatively scoring, and thus were not included in final chains. With default -scoring values, overflow was caused by the equivalent of an exact match of -about 22Mbases. This problem also existed in BLASTZ. -
-Added support for output in SAM format. -
-Corrected dotplot output. Previously, some of the coordinate -values were inconsistent and off by one. -
-Added ‑‑progress=[<N>]. -This existed as an unadvertized option in earlier versions of the program, as -‑‑debug=queryprogress=<N>. It has now been promoted to a -first class option. -
-Added ‑‑ambiguous=iupac and changed ‑‑ambiguousn to -‑‑ambiguous=n. The former is still supported, but not advertized. -
-Column headers for ‑‑format=general now match the command-line -keywords. Previously, all related keywords shared the same column header. -For example, keywords start2, zstart2, -start2+ and zstart2+ all produced the same column -header, start2, in the output file. -

-Also added ‑‑format=general-. -

-Now using inttypes.h macros for sized-types. This is to satisfy some -additional type-checking pickiness that appears to have been added to gcc version -4.2.1. In the unlikely event that a compiler doesn't support inttypes.h, the -compile-time definition override_inttypes can be used. -
-Added nmatch, nmismatch, ngap, -cgap and cigarx fields for -‑‑format=general. -
-Added ‑‑format=mapping, a shortcut for typical fields for -‑‑format=general for mapping reads. -
-1.02.11Aug/21/2010 -Fixed the cigarx field for ‑‑format=general, so -that a run length of 1 is omitted for indels. -
-Fixed the behavior of ‑‑recoverseeds, which was failing to -recover many HSPs when seed density was high. This was due to left extension -being blocked by other seeds on that same hash-equivalent diagonal. Left -extension is now unblocked when ‑‑recoverseeds is enabled. -
-Changed/corrected how the ‑‑segment option handles wildcard names -when the multiple action is used. To support this, the -rewind command was added to the segments file format. -
-Sequence masking actions (softmask, xmask and -nmask) are now allowed for the multiple action. -
-Command-line arguments beginning with two unicode non-breaking hyphens are now -recognized. Since these are used in some places within this README file, it is -natural for a user to copy them to the command line. Previously these were not -recognized, which led to a somewhat confusing error message. -
-Fixed detection and reporting of improper gap penalties. Because the first -base in a gap is penalized as open+extend, open can be zero or negative as long -as that sum is strictly positive. Previously, a sum of zero was permitted, and -a negative sum was misleadingly reported as a problem with the open penalty. -Now, the sum must be strictly positive, and when it isn't the message more -accurately describes the problem. -
-Fixed the implementation of ‑‑self with regard to mirror-image -pairs. Previously, alignments were internally restricted to be above the main -diagonal in the ungapped stage only. The mirrored twins were created prior to -the gapped stage, and the gapped stage operated on the full set of anchors. -This had two undesirable effects -- there was little computational savings, and -the resulting set of alignments could be asymmetrical (due to small variations -in gap positioning). This behavior has been changed so that the above-diagonal -restriction occurs throughout the alignment process and mirrored twins are -created just prior to output. -
-1.02.16Nov/2/2010 -Fixed a problem with ‑‑self, introduced in 1.02.11. The problem -manifested itself on 64-bit CPUs, with an error message indicating it was -attempting to allocate 17 billion bytes for edit_script_copy. This has been -corrected. -
-Corrected a problem in LAV output, in which the d stanza -reported an incorrect value for K or L (the ungapped and gapped scoring -thresholds) when they were not equal to each other. Which value was reported -incorrectly depends on nuances of the compiler and could differ by platform. -

-Alignments were not affected. -

-Changed the error message when a fasta file contains bad characters. The -previous message caused confusion when the bad character happened to be -punctuation. Now the error message explicitly describes the offending -character (comma, ampersand, etc.). -
-Added ‑‑format=blastn. -
-Added idfrac, id%, blastid%, -covfrac, cov%, confrac, -con%, ncolumn, and npair fields for -‑‑format=general. -
-Added start..end+zoom% subrange specifier. -
-1.02.23Jan/10/2011 -Fixed a problem that occurred if the gap extension penalty was set to zero. -This caused a divide by zero (which is reported in different ways on different -platforms) and the program crashed. This has been corrected by trapping the -offending division. However, the fix increases memory usage. Moreover, it is -highly likely to cause truncated alignments. It’s not clear that there -is any useful reason to set gap extension to zero. - -
-Added ‑‑format=rdotplot+score and -‑‑rdotplot+score=<file>. -
-Improved ‑‑masking=<count> so that it can allow a count -threshold greater than 254. -
-Fixed a problem with ‑‑scores=<scoring_file>. When the -<scoring_file> defined score values for N, -those scores were not honored during the ungapped seed extension stage. -
- -Fixed problems with ‑‑ambiguous=n and -‑‑ambiguous=iupac. These were -incorrectly penalizing substitutions between non-ambiguous nucleotides -(A, C, G, or T) and ambiguous ones (N, B, D, H, K, M, R, S, -V, W, or Y). This has been corrected to honor the original -intent, which was clearly to score these as zero. -

-However, for users who desire the previous behavior, a substitution penalty can -now be specified with each of these options. To match the previous behavior, a -penalty of twice the gap extension should be used. -

-A later change history item is also -relevant. -

-Added ‑‑queryhsplimit=<n>. -
-1.02.27Jan/31/2011 -Added ‑‑outputmasking=<file>. -
-1.02.37Mar/31/2011 -Added ‑‑outputmasking:soft=<file>. -
-Added example of filtering with shell commands. -
-Changed the interpretation of comments in -sequence name files. Previously, the first # was -considered a comment. The implementation predated the author’s -familiarity with Illumina read names (which contain a #). In order to still -allow lines that contain a read name and a comment, a # is not considered a -comment unless it is preceded by whitespace or the start of the line. -
-Changed the behavior of -‑‑queryhsplimit=<n> to -better match user expectations. Previously the limit was applied separately -for each strand of the query. Moreover, HSPs discovered before the limit was -reached were still passed downstream for further processing. -

-This has all been changed so that the limit applies to the combined total of -HSPs for query, and if the limit is reached (exceeded), all HSPs for the read -are discarded and no downstream processing is performed. -

-Fixed a bug involving the ngap and cgap fields for -‑‑format=general. These fields were only reported correctly if -the continuity or ncolumn fields were also requested. -Otherwise, the value reported represented the contents of uninitialized memory. -
-Added filtering options -‑‑filter=nmismatch:0..<max>, -‑‑filter=ngap:0..<max>, -and ‑‑filter=cgap:0..<max>. -

-Also changed the option name for match count filtering to -‑‑filter=nmatch:<min>. -The older option, ‑‑matchcount=<min> is of course still -recognized. -

-1.02.40Apr/7/2011 -Added ‑‑outputmasking+=<file> -and ‑‑outputmasking+:soft=<file>. -
-Added -‑‑progress+masking=[<N>]. -This existed as an unadvertized option in earlier versions of the program, as -‑‑debug=queryprogress+masking=<N>. It has now been promoted -to a first class option. -
-Added an example of how to create a soft-masked sequence by -self-masking. -
-1.03.00Jul/14/2011 -When a subrange was used, the wrong denominator -was used to compute coverage. The denominator -used was the length of the subrange instead of the entire sequence. This -adversely affected both the -‑‑filter=coverage filter and the -coverage output field. This has been corrected -to use the length of the entire sequence. -
-Added the -separator=<character> -action, allowing the user to specify a character which alignments will not -cross. See also -Non-ACGT Characters, Splicing, and Separation. - -
-Added support for reading FASTQ files. Quality values -do not participate in alignment, but are copied to alignment output when -appropriate. -
-Added ‑‑format=general fields -nucs1, -nucs2 -(the entire target or query nucleotides sequence), -quals1 and -quals2 -(the target or query base-call quality sequence). -
-Fixed a minor problem with the ‑‑format=general fields -cov% and con%. Those fields were being written with -an extra tab character preceding them. This had a detrimental effect on -downstream parsers that required tabs as separators (parsers that interpreted -whitespace as separators were not affected). -
-Added ‑‑readgroup=<tags>, -allowing the specification of tags for SAM's ‑RG header line. -
-Added -‑‑allocate:target=<bytes> -and -‑‑allocate:query=<bytes>. -These allow the user to predict the amount of memory needed to store target -or query sequence data, which in some instances can resolve memory overuse -(it saves LASTZ from incrementally predicting the amount of memory needed). -

-For consistency, -‑‑allocate:traceback=<bytes> -is now renamed (from ‑‑traceback=<bytes>). -

-Added ‑‑include=<file>, -allowing command-line arguments to be read from a text file. -
-Updated the Yasra shortcuts. Some options that -improved alignment read mapping had not previously been included in the -Yasra definitions, because these options did not exist when the Yasra -shortcuts were originally defined. -

-To allow backward compatibility, the shortcuts now permit specification of -a particular version of LASTZ. See the description of the shortcuts for -details. -

-1.03.02Jul/19/2011 -Fixed a bug in ‑‑format=axt and -‑‑format=axt+, which caused every -alignment to be reported twice. The bug had been introduced in version -1.02.28 (not present in 1.02.27, present in 1.02.37). -
-1.03.34Apr/12/2013 -Fixed a problem with ‑‑self and ‑‑format=lav, -introduced in 1.02.11, which caused lastz to segfault. -
-Fixed a bug in ‑‑writecapsule. When the target was larger than -≈1 billion bp, an internal sanity check triggered incorrectly, stopping -the program and reporting "internal error writing to" the capsule file. - -
-Fixed a bug related to ‑‑nogfextend. If no -‑‑gappedthresh was set, the gapped threshold incorrectly was set -to 0 instead of the correct default of 3000. This has been corrected. - -
-The match count filter now allows the count to be specified as a percentage of -the query length -(‑‑filter=nmatch:<min>%). - -
-Added ‑‑format=general fields -number1, -number2, -number and -znumber -(sequence and alignment numbers). -
-Added ‑‑format=general fields -qalign1 and -qalign2 -(quality sequences in alignment order). -
-Corrected rdotplot output when the -query file contains more than one sequence. Previously, the header line -containing sequence names was only written once, at the beginning of the file. -Now it is written once for each query sequence. -
-Added a warning when a scores file is used -(‑‑scores), with the scale of the -scoring matrix substantially different from the default scoring matrix, and the -user hasn't set the hsp threshold or gapped threshold. This is a common -mistake and often results in no alignments being found. -

-The warning looks something like this: -

-  WARNING. Scores file may warrant setting of thresholds absent from scores.txt.
-  Minimum match score is 10, for matrix entry (A,A).
-  This may not work well with default --gappedthresh=3000.
-
-
-Added ‑‑queryhspbest=<n>. -
-Added ‑‑querydepth=<n>. -
-Added alignment chores files, -‑‑chores=<file> option, -chores=<file> action, and -chore field for -‑‑format=general. -See Aligning Many Subintervals. -
-Fixed a bug related to the -nickname=<name> action. If -the corresponding sequence file was in -2Bit format, the nickname wasn't used and sequence -names were copied from the sequence file. This has been corrected. - -
-Added ‑‑help=defaults and -‑‑show=defaults. -
-Fixed a problem which caused runaway memory allocation of the traceback row -buffer. The problem was discovered when an alignment of a 72-bp read to a -reference genome needed to allocate 170 million rows (about 700M bytes). This -has been corrected. -

-It is not clear whether this had any effect on the alignments produced. In the -examples used for testing and debugging, alignments were not affected. The -negative effect was memory requirements and possibly runtime. -

-However, the cause of the problem was incorrect determination of bounding -alignments when performing gapped alignment backwards from the anchor. So it -is possible that this could have caused a desirable alignment to have been -missed, truncated, or to contain suboptimal gap placement. -

-Corrected the behavior when -‑‑anyornone was used with -‑‑nogapped. Previously this failed -with a message indicating an internal error ("gapped_extend was given a NULL -traceback pointer."). -
-Score thresholds can now make use of units (e.g. -‑‑hspthresh=5K instead of -‑‑hspthresh=5000). -
-Detection of trivial self-alignments has been improved for cases where the -multiple action is used. -
-The implementation of ‑‑self has been -reworked so that it no longer reads the input file twice. As a result, -‑‑self now supports a file piped into stdin. -
-Added an additional build (lastz_32) to address aligning to whole -genomes larger than 2 gigabases. -
-Changed the option names for identity, continuity and coverage filtering to -‑‑filter=identity:<min>[..<max>], -‑‑filter=continuity:<min>[..<max>], -and -‑‑filter=coverage:<min>[..<max>]. -This change was made to achieve consistency with the other back-end filtering -options. -

-The older options, ‑‑identity, ‑‑continuity -and ‑‑coverage are of course still recognized. -

-1.03.46Oct/2/2013 -Added the namejoin action, to allow -better handling of input files that have spaces in sequence names (e.g. Illumina -casava version 1.8 fastq files). -
-Greatly improved speed for the use case where the target contains a large number -of sequences (e.g. 100 thousand exons). This was essentially a bug which had -no effect on accuracy. A data structure was being repeatedly allocated and -erased, and was much larger than was needed (e.g. 12 Mbytes per query), and -since all writes were cache misses, this ended up being very significant. -
-Added the -allowBackToBackGaps build option. Previous -versions of LASTZ (and BLASTZ) did not consider alignments in which an insert -was immediately adjacent to a delete (or vice versa). -
-Fixed a problem in ‑‑format=differences -that was inadvertently introduced in 1.03.34. A failsafe check for an unhandled -case was added in that version, and ‑‑format=differences wasn't -being handled. Unfortunately this prevented this format from being usable. -This has been corrected. -
-1.03.52Jan/14/2014 -Corrected a bug that occurred when the -‑‑inner option is used with the -multiple action. With this -combination lastz could report alignments straddling two sequences. This is -now prevented. -

-This bug has existed in all previous versions of lastz. It was not in blastz -since blastz did not provide the ability to have multiple sequences in memory. -

-1.03.54Jan/28/2014 - -Modified ‑‑ambiguous=n and -‑‑ambiguous=iupac to allow a reward -for matches to be specified in addition to the penalty for mismatches. -

-An earlier change history item is also -relevant. -

-1.03.66Jan/19/2015 -Fixed an error that would cause a segfault. The causative conditions at the -user level are not well characterized. In the discovered example both target -and query sequences contained 1K bp bursts of similar but highly diverged low -complexity sequence, but the specific relationship (if there is one) between -this feature and the failure is not known. Internally the problem resulted -from an unsigned index variable attempting to become negative. -
-Fixed an error that would cause a segfault if fasta queries were piped from -stdin and the first query was empty. -
-Added a sanity check to scoring inference. It is possible that, for a -candidate scoring set, the enforcement of identity filter settings -(min_identity and max_identity) leaves the inference with no alignments from -which to infer scores. This condition should now result in a failure message. -
-Changed how back-to-back gaps are represented in -lav format, to match the way -BLASTZ represented them. A zero-length segment is now written -to the file, separating the two gaps. It has been discovered that at least one -of the lav-processing tools in the Miller Lab suite expects to have such a -segment. -
-Fixed an error which prevented -‑‑progress[=<N>] -being reported for empty sequences. -
-1.03.73Jul/8/2015 -Eliminated alignments that begin or end with gaps. Such alignments do not make -sense biologically. -

-Earlier versions could report a gap at either end of an alignment if the -alignment was very close to an alignment found earlier in the process. This is -a failure in the logic that prevents alignments from crossing (or overlapping) -and/or assumptions made in extending HSPs when the anchor point is very close -to a previously-found alignment. The current solution truncates the alignment -by trimming away any end gaps and rescoring. -

-1.04.00Mar/12/2017 -Corrected a bug involving chaining. Previously, if -‑‑chain -and -[multiple] -were used together, the chaining algorithm incorrectly considered all sequences -together as a single entity, and found a single chain across all sequences. -

-This has been corrected, and the chaining algorithm is now performed -independently for each sequence. -

-Implemented a workaround for parsing conflicts between -sequence specifier actions and shells that use square -brackets for filename expansion. -

-In such shells appending any action to a filename, such as -[multiple], caused the shell to -report "lastz: no match". To provide a means for specifying actions without -having to surround them in square brackets, the commands -‑‑action:target=<action> -and -‑‑action:query=<action> -have been added. -

-Fixed an error that would cause a segfault when there are more than about 32 -million HSPs. The program now detects this error case and suggests ways to -avoid the situation. -
- - - - - -


-
-

References

- - -

-

-

-Bellman R (1957). -Dynamic Programming. -Princeton University Press, Princeton, NJ. - -

-

-Buhler J, Keich U, Sun Y (2003). -Designing seeds for similarity search in genomic DNA. -Proc. 7th Annual International Conference on Research in Computational -Molecular Biology (RECOMB '03), pp. 67-75. - -

-

-Chiaromonte F, Yap VB, Miller W (2002). -Scoring pairwise genomic sequence alignments. -Pacific Symposium on Biocomputing 7:115-126. - -

-

-Cock PJA, Fields CJ, Goto N, Heuer ML, Rice PM (2009). -The Sanger FASTQ file format for sequences with quality scores, and the -Solexa/Illumina FASTQ variants. -Nucleic Acids Research 38:1767-1771. - -

-

-Gusfield D (1997). -Algorithms on strings, trees and sequences. -Cambridge University Press, Cambridge, pp. 244. - -

-

-Harris RS (2007). -Improved pairwise alignment of genomic DNA. -Ph.D. thesis, Pennsylvania State University. - -

-

-Li H et al. (2009). -The Sequence Alignment/Map (SAM) format and SAMtools. -Bioinformatics 25:2078-2079. - -

-

-Myers EW, Miller W (1989). -Approximate matching of regular expressions. -Bull. Math. Biol. 51:5-37. - -

-

-Zhang Z, Berman P, Miller W (1998). -Alignments without low-scoring regions. -J. Comput. Biol. 5:197-210. - - - - - - -


-
-

Acknowledgments

- - -

-Thanks to Haibao Tang for contributing an example implementation for BLASTN -output. - -

-


-

-

Bob Harris and Cathy Riemer
- -

- - diff --git a/programs/lastz/docs/lav_format.html b/programs/lastz/docs/lav_format.html deleted file mode 100644 index adffcf7..0000000 --- a/programs/lastz/docs/lav_format.html +++ /dev/null @@ -1,358 +0,0 @@ - - - -LAV Format - - - - - - -

-

LAV Format

- -

-TABLE OF CONTENTS - -

-

- -

-

Introduction

-

- -LAV is a plain-text file format for alignments of two DNA sequences. It is -the only output format produced by the -BLASTZ alignment program -(though often converted to -AXT format -by post-processing programs), and is the default output format for BLASTZ's -successor, LASTZ. -

-The alignment blocks are grouped by sequence (e.g. chromosome, scaffold, -contig, cDNA read, shotgun sequencing read, etc.) and strand, and described -by listing the coordinates of the gap-free aligning segments in each block. -This format is compact because it does not include the nucleotides, but the -tradeoff is that interpretation usually requires access to the original -sequence files, and it is not easy for humans to read. - -

-

Example

-

- -Here's a typical LAV file: -

-

-    #:lav
-    d {
-      "lastz.v0.3 malus.fa aurantium.fa C=2 W=8 T=0 
-         A    C    G    T
-        91 -114  -31 -123
-      -114  100 -125  -31
-       -31 -125  100 -114
-      -123  -31 -114   91
-      O = 400, E = 30, K = 3000, L = 3000, M = 0"
-    }
-    #:lav
-    s {
-      "malus.fa" 1 191411218 0 1
-      "aurantium.fa" 1 90634903 0 1
-    }
-    h {
-      "> apple"
-      "> orange"
-    }
-    a {
-      s 20643
-      b 46566766 2083211
-      e 46567353 2083795
-      l 46566766 2083211 46566796 2083241 61
-      l 46566797 2083245 46566814 2083262 78
-      l 46566821 2083263 46567353 2083795 65
-    }
-    a {
-      s 4233
-      b 47246530 10635696
-      e 47246660 10635826
-      l 47246530 10635696 47246660 10635826 63
-    }
-    ... many more a-stanzas ...
-    #:lav
-    s {
-      "malus.fa" 1 191411218 0 1
-      "aurantium.fa-" 1 90634903 1 1
-    }
-    h {
-      "> apple"
-      "> orange (reverse complement)"
-    }
-    a {
-      s 13897
-      b 1005819 5352698
-      e 1006099 5352978
-      l 1005819 5352698 1006099 5352978 74
-    }
-    ... many more a-stanzas ...
-    #:eof
-
- -

-

Stanza Types

-

- -An LAV file primarily consists of a series of "stanzas", each of which -is a single letter code followed by a brace-enclosed block. There are -also #:lav lines which break the file into sections, and -one #:eof line indicating the end of the file. Programs -that read LAV format should consider the file bad if the -#:eof is missing (or if anything appears after it). - -

-D Stanza -

-The d-stanza is intended to document the program and parameters used -to create the file. Programs reading the file normally treat this as -a comment, but it is possible to extract the scoring parameters for -further processing. - -

-S Stanza -

-An s-stanza describes the sequences used for the subsequent alignment -records (a-stanzas). It contains exactly two lines in the following -format. -

-

-    "<filename>[-]" <start> <stop> [<rev_comp_flag> <sequence_number>]
-
-

-Here <start> and <stop> are -origin 1 (i.e. the first base in the original given sequence is called -"1") and inclusive (both endpoints are included in the interval). -Usually <start> is 1 and <stop> -is the full length of the given sequence, however they can specify any -subsequence (e.g. if the alignment program was instructed to use only -part of the original sequence). -

-<rev_comp_flag> is 1 if the sequence was -reverse-complemented before aligning, or 0 otherwise. Usually the -first sequence will have a 0 here, since most alignment programs only -ever reverse-complement the second one. If this flag is 1, the -<filename> will also have a - appended -to it; programs that read LAV format should report an error if these -two indicators are contradictory. Note that even when this flag -indicates reverse-complement, the <start> and -<stop> endpoints are still relative to the original -orientation, and <start> is less than -<stop>. That is, conceptually the alignment program -extracts the requested sequence fragment first, then reverse-complements -it (if applicable), and finally tries to align it. -

-<sequence_number> is useful when the second file -contains multiple sequences. The first sequence is 1, the second is 2, and so -on. Most programs that write and read LAV format do not allow the first file to -contain multiple sequences, so in these cases the sequence number for the first -file is always 1 (though the format itself does not require this). Note that -<start> and <stop> are relative to each -sequence, not to the entire file. -

-The <rev_comp_flag> and -<sequence_number> are shown here as optional because -early versions of this format did not include them. - -

-H Stanza -

-Usually an s-stanza is followed immediately by an h-stanza, which -provides a name for each of the two sequences, typically obtained -from the FASTA header line. (Before the s-stanza's -<sequence_number> field was introduced, this was -the only way to identify which sequence from a multi-sequence file was -aligned.) A (reverse complement) suffix is appended when -applicable; again, programs should report an error if this contradicts -the other indicators. - -

-A Stanza -

-An a-stanza describes a single alignment block, sometimes called a -"local alignment", which typically includes gaps due to small insertions -and deletions in the aligned sequences. In the example below, the -s, b, and -e lines indicate that the block has a score -of 13916 and an overall range of 4886..5171 in sequence 1 and 21292..21537 -in sequence 2. -

-The l lines describe the block's gap-free segments, with the -final field representing the percentage of matching bases in each segment. -In this example the alignment starts with a segment from 4886..4899 in -sequence 1 and from 21292..21305 in sequence 2, having a percent identity -of 79%. Note that the segment length must be the same in both sequences -(14 basepairs for this segment). The next segment starts at 4900 and 21308 -in sequences 1 and 2, respectively, indicating a two-base gap in sequence 1 -(corresponding to positions 21306 and 21307 in sequence 2). -

-

-    a {
-      s 13916
-      b 4886 21292
-      e 5171 21537
-      l 4886 21292 4899 21305 79
-      l 4900 21308 4924 21332 92
-      l 4925 21334 5024 21433 88
-      l 5027 21434 5040 21447 100
-      l 5086 21448 5117 21479 84
-      l 5118 21484 5171 21537 87
-    }
-
-

-Coordinates in an a-stanza are origin 1 and inclusive, and are relative -to the subsequences indicated in the most recent s-stanza. In the -example below the alignment is of apple 1333..1444 to orange 2777..2888. -

-

-    s {
-      "malus.fa" 1001 2000 0 1
-      "aurantium.fa" 2001 5000 0 1
-    }
-    ...
-    a {
-      s 7321
-      b 333 777
-      e 444 888
-      l 333 777 444 888 62
-    }
-
-

-If a sequence is reverse-complemented, then the coordinates are relative -to the reverse complement, so they are counted back from the end of -the subsequence. Thus the example below represents an alignment of apple -1333..1444 to the reverse complement of orange 4113..4224. In detail: -the s-stanza indicates that the first sequence from aurantium.fa should be -used, and its subsequence from 2001..5000 should be extracted and then -reverse complemented before aligning with apple. In this 3000 bp -reverse-complemented subsequence, the first base corresponds to position -5000 in the original sequence, the second to position 4999, and so on to -the last (3000th) base, which corresponds to position 2001. Thus the -conversion formula is p = 5000 - (r - 1), where p -is the position in the original sequence, and r is the position in the -reverse-complemented subsequence. Within the reverse-complemented -subsequence, the alignment is at 777..888. The starting point, 777, is the -nucleotide 776 bp back from 5000, or 4224, while the ending point, 888, is -887 bp back from 5000, or 4113. -

-

-    s {
-      "malus.fa" 1001 2000 0 1
-      "aurantium.fa-" 2001 5000 1 1
-    }
-    ...
-    a {
-      s 7321
-      b 333 777
-      e 444 888
-      l 333 777 444 888 62
-    }
-
-

-The fifth numeric field in an a-stanza's l line is the -percentage of bases in the aligned segment that match (often called the -"percent identity" or "percent id"). This is used by viewer tools such as -Laj and -PipMaker. - -

-X and M Stanzas -

-An LAV file may also contain x- and m-stanzas describing dynamic masking. -Each section will contain an x-stanza that looks like the one below. The -count is the number of bases newly masked as a result of processing the -latest query sequence; it does not include bases previously masked. -

-

-    x {
-      n <count>
-    }
-
-

-A single m-stanza listing the masked regions is then included in the final -section, and looks like the one below. <start> and -<stop> are origin 1 and inclusive, and are relative -to the first subsequence indicated in the most recent s-stanza. -

-

-    m {
-      x <start> <end>
-      x <start> <end>
-      ...
-      n <count>
-    }
-
-

-Dynamic masking is invoked by the -‑‑masking=<count> option in LASTZ, or -the M=<count> option in BLASTZ. For more information -about these options, please see the -LASTZ documentation. - -

-Census Stanza -

-In LASTZ, the ‑‑census option will produce a -Census-stanza. The first field in each line (1, -2, 3, …) is a -position in the target (sequence 1). The count indicates the number of -times the corresponding base appears in an alignment. -

-

-    Census {
-      1 <count>
-      2 <count>
-      ...
-    }
-
- -

-


-Bob Harris and Cathy Riemer, October 2008 - -

- - diff --git a/programs/lastz/hsx_format.html b/programs/lastz/hsx_format.html deleted file mode 100644 index 7d2a7a4..0000000 --- a/programs/lastz/hsx_format.html +++ /dev/null @@ -1,1184 +0,0 @@ - - - -HSX Format - - - - - - -

-

HSX Format

-Format Specification version 1.0.0, -January 12, 2010 - -

-TABLE OF CONTENTS - -

-

- -

-

Introduction

-

- -HSX is a binary file format for indexing (or listing) DNA sequences in other -files, allowing fast random access to those sequences. The format was created -as part of the LASTZ project, -providing a means to input selected sequences from several short read files -into a single run of LASTZ. - -

-This document is provided for users interested in creating HSX files with -programs of their own design. - -

-The HSX file contains a sequence index array and an associated hash table. -Each sequence index entry includes the sequence's name, length, and a reference -to the location of the sequence's data in some other file. This array can be -accessed either sequentially or via the hash table. Note that the names in -the index file do not have to match the original names or headers in the -sequence files. - -

-Sequence entries are ordered by the hashes of their names. Conceptually, the -hash table groups the sequences into buckets of sequences with the -same hash. For each bucket, the table gives the location in the sequence index -of the first sequence in that bucket. Hash collisions are resolved by scanning -subsequent index entries for the remaining sequences in the bucket. - -

-There is also a file table, which allows a single index file to cover sequences -from multiple sequence files of varying formats. However, currently LASTZ only -supports indexing of files in FASTA format. - -

-

File Specification

- -

-The file is stored in a binary format described by the table below. It can be -written on either a big-endian or little-endian machine; programs reading the -file determine the byte order of multi-byte fields by examining the magic -number at the start of the file. - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataMeaning
0x00D2 52 70 95 -
—or— -
95 70 52 D2
Magic number indicating big-endian byte order. -
-
Magic number indicating little-endian byte order.
0x0400 00 01 00File conforms to version 1.0 of the HSX file format.
0x0800 00 00 1CHeader length in bytes, including this field through the SOFF field.
0x0C00 00 00 xxFLEN: -number of entries in the file table (limited to 255).
0x10xx xx xx xxFOFF: -offset (from file start) to the file table.
0x14xx xx xx xxHLEN: -number of buckets in the hash table. This also serves as the modulus for -the hash function. -

-Typically the number of buckets is set so that the average number of -sequences per bucket (SLEN/HLEN) is reasonably small (e.g. 10). -

-The hash table actually includes HLEN+1 buckets. An extra -sentinel bucket is appended at the end of the table, containing -the offset to just past the end of the sequence index table.

0x18xx xx xx xxHOFF: -offset (from file start) to the hash table.
0x1Cxx xx xx xxSLEN: -number of entries in the sequence index table.
0x20xx xx xx xxSOFF: -offset (from file start) to the sequence index table. -

-Entries in the sequence index table are necessarily stored in hash order. -Entries with the same hash are stored in alphabetical order (actually, in -lexicographic order over the bytes of their names.) -

-See the hashing description below this table for -more information.

FOFFxx xx xx xxFINFO0: -offset (from file start) to the info record for the first sequence file -(file 0).
Offsets to info records for the remaining FLEN-1 files.
FINFO0xx xx FTYPE0: -file type for file 0, stored as a length byte -FTYPELEN0 followed by FTYPELEN0 -bytes of ASCII text. -

-This is equivalent to a file extension (without a leading .) and -will be used as such. In the current implementation, it must be -fa or fasta. -

-Together, this field and the next comprise a single info record in the -file table.

FINFO0+1+FTYPELEN0xx xx  -FNAME0: -file name for file 0, stored as a length byte -FNAMELEN0 followed by FNAMELEN0 -bytes of ASCII text. -

-This is used as the base file name for the corresponding sequence file, -including path. -However, it is usually an empty string, in which case the -base name and path are copied from the name and path of the HSX file itself. -This allows files to be renamed without rebuilding the index.

Info records for the remaining FLEN-1 files.
HOFFxx xx xx xx xxSOFFH(0): -offset (from file start) into the sequence index table, pointing to the first -sequence in the first hash bucket (bucket 0). -

-SOFFn is the file offset for the -n-th entry in the sequence index table. -H(k) is the number of sequences that have a hash code less than -that of bucket k (i.e. the number of sequences assigned to buckets -before bucket k). -Therefore SOFFH(k) points to the first -sequence in the kth hash bucket. -

-The most significant bit in a bucket's SOFFH(k) value -is used to indicate whether the bucket is empty or not. If a bucket is empty, -this bit is set (1), otherwise it is clear (0). The end of the sequences for -bucket k can be determined from SOFFH(k+1) -(the entry for the start of the next bucket). -

Offsets for the first sequences in the remaining HLEN-1 -buckets.
HOFF+5*HLENxx xx xx xx xxSentinel hash bucket. This contains an offset to the end of the -sequence index table (i.e., to the byte just beyond the last entry). -
SOFFxx xx xx xx xxIXLEN0: -length (in nucleotides) of the first sequence. -

-A sequence may be empty, so zero is a legitimate value for the sequence length. -

-Together, this field and the next three comprise a single entry in the -sequence index table.

SOFF+5xxIXFILE0: -index into the file table for the file containing the first sequence.
SOFF+6xx xx xx xx xx xxIXOFF0: -offset (from the start of the appropriate sequence file) pointing to the first -sequence.
SOFF+12xx xx IXNAME0: -name of the first sequence, stored as a length byte -IXNAMELEN0 followed by -IXNAMELEN0 bytes of ASCII text.
Sequence index entries for the remaining -SLEN-1 sequences.
- -

-

Hash Function

-

- -The code for the underlying hash function is shown below, written in C. -The hash bucket for sequence name NAME is computed by -

-    bucket = hassock_hash(NAME,strlen(NAME)) % HLEN;
-
- -

-This hash function is a variant of Austin Appleby's -MurmurHash2. -The primary differences are that it has the seed hardwired and scans the input -data in the reverse order (this is not strictly true, since the -non-multiple-of-four leftover bytes are handled slightly differently). It is -also endian-neutral. - -

-

-    #include <stdint.h>
-
-    uint32_t hassock_hash (const void* key, uint32_t len)
-        {
-        const uint32_t seed = 0x5C3FC4D3;
-        const uint32_t m    = 0x87C10417;
-        const uint8_t* data = ((const uint8_t*) key) + len;
-        const uint8_t* stop = ((const uint8_t*) key) + 4;
-        uint32_t       h, k;
-
-        h = seed ^ len;
-        while (data >= stop)
-            {
-            k  = *(--data);
-            k |= *(--data) << 8;
-            k |= *(--data) << 16;
-            k |= *(--data) << 24;
-            k *= m;
-            k ^= k >> 24;
-            k *= m;
-            h *= m;
-            h ^= k;
-            len -= 4;
-            }
-        switch (len)
-            {
-            case 3: h ^= *(--data) << 16;
-            case 2: h ^= *(--data) << 8;
-            case 1: h ^= *(--data);
-                    h *= m;
-            }
-        h ^= h >> 13;
-        h *= m;
-        h ^= h >> 15;
-        return h;
-        }
-
- -

-

Example

-

- - -In this example, we have 12 sequences from 3 fasta files, indexed by a single -HSX file. We first show the fasta files, then show a field-by-field hex dump -of the corresponding HSX file. For demonstration purposes, the HSX file was -created with only 5 buckets. Typical HSX files will deal with more sequences, -more files, and have more buckets. - -

-hsxexA.fa contains five sequences: -

-

-    >HSXEXA_785
-    TAACGGCAATCTTTGGTAGACCTATTGGTCATATCATGAAATTGAAGGAT
-    AATTATTGCCATAAAGTTTTTCACGTTACTATCTTTGCCTCGCAATGAAT
-    AAAATATTCTTAGGGCTACTTTGTAACCTTGCAGAC
-    >HSXEXA_88K
-    TTAATTACTCGCATGATCTTTCAAGATCTTTACCGTTCACACAATTTCTC
-    GAACACTCAGTA
-    >HSXEXA_DNQ
-    CAGTGTACAAAATAAACTATTAACTATATGTAGATAGATACATAGAGACA
-    AAACGGGTAGCATCTAGTATCCTGACTGCGCATTGTGGGGTGTCGCTTCT
-    AAGTACCCGAAATGAGCGT
-    >HSXEXA_LRW
-    TTAAGTACATTCAGATCCATCATGGTTTCGGAAGCTAATGGGAAAAGGGG
-    TACAGAATACAACACCTAGTTGATACGATAGTTAGTTTTTTA
-    >HSXEXA_R9V
-    TATAGTGCGTGTATGACCAATATTACGATGATCGTGACGCCATAGGGTCA
-    TATTCCTTAATATGTAAATATGAAGGTA
-
- -

-hsxexB.fa contains four sequences: -

-

-    >HSXEXB_6YF
-    AAGAGTTCTTACGGCAATAACAAAATGATGCTGTATCCTAGTAACAGGAA
-    CGAACCATTCGCTTCTGTGTTCTATACAGAAGAAACCAGACTCGCTAAAC
-    A
-    >HSXEXB_WCV
-    AATTAGTCTATTAAGGACTATATGTTTACAAGGATGGTAGTCCTAACGGA
-    ATTGATACCAATAGGTGGCACTTACCGTAGCTAGGTAGATCGCCCTACTA
-    CACCAGCTCAGCCATCTTGCCCCGCCAACT
-    >HSXEXB_YKU
-    GTCAACAGGTTTTCGGACTGGTGGCTTTCCTGATTTGATATTCAAAGGAA
-    ATTAGGGTAAGGACTTTGAGTTGTCATAGAATTCAATTTCGGGCTCCGTC
-    CATCACCTCGT
-    >HSXEXB_YV1
-    GGTGATGTTGTGAATATCACTGTCATGAAGGTCTCCTTCGGCCGCCTTAA
-    TCATCATCATAAGTTTCACCATGGTAAAATGAATTAGCCCCAAGCT
-
- -

-hsxexC.fa contains three sequences: -

-

-    >HSXEXC_4ZL
-    CACATCACATTGGTTGTTCATCCATATAATTATTTCCCATAAACTTTAAG
-    AGCTCGGCTGGCCATACGTACTGACTAGCTTAGCCCCTAACTAATCGGCC
-    ACAGCGATAGTACA
-    >HSXEXC_936
-    TGGTTTTTAGAGTCCGTGGAGCCTCTCAGCCACACTGGGTTCGGGAAGTT
-    TCAGGCAAGTCCTACCTGTAA
-    >HSXEXC_GWD
-    CTAATCTGGGCTTGGGTCTGAACTCGCCCATGAGGAGGTAAGCAAACCAA
-    TAAATTCGGGTATGGCGGTCTTTATTATGCTTAAGGAACGGAACAA
-
- -

-The twelve sequence names are hashed into separate buckets, and sorted within -buckets, like this: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hash Code/BucketSequence NameFasta fileOffset to sequence, in fasta file
0HSXEXB_6YFhsxexB.fa0x0000 00000000
1HSXEXA_785 -
HSXEXA_DNQ
hsxexA.fa -
hsxexA.fa
0x0000 00000000 -
0x0000 000000E3
2HSXEXA_88K -
HSXEXA_LRW -
HSXEXB_YV1 -
HSXEXC_4ZL
hsxexA.fa -
hsxexA.fa -
hsxexB.fa -
hsxexC.fa
0x0000 00000097 -
0x0000 00000169 -
0x0000 00000183 -
0x0000 00000000
3HSXEXB_YKUhsxexB.fa0x0000 00000105
4HSXEXA_R9V -
HSXEXB_WCV -
HSXEXC_936 -
HSXEXC_GWD
hsxexA.fa -
hsxexB.fa -
hsxexC.fa -
hsxexC.fa
0x0000 000001D3 -
0x0000 00000074 -
0x0000 00000081 -
0x0000 000000D6
- -

-Here is the complete HSX file, byte-by-byte: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
File OffsetDataFieldMeaning
0x00000000D2 52 70 95 -Magic numberBig-endian.
0x0000000400 00 01 00HSX version1.0.
0x0000000800 00 00 1CHeader length28 bytes.
0x0000000C00 00 00 03FLEN=33 entries in file table.
0x0000001000 00 00 30FOFF=30File table is at 0x00000030.
0x0000001400 00 00 05HLEN=55 buckets in the hash table. -
0x0000001800 00 00 60HOFF=60Hash table is at 0x00000060.
0x0000001C00 00 00 0CSLEN=0C12 entries in the sequence index table.
0x0000002000 00 00 80SOFF=80Sequence index table is at 0x00000080.
0x0000002400 00 00 00 -
00 00 00 00 -
00 00 00 00
PaddingThe creating program can insert padding here, at its discretion.
0x0000003000 00 00 40FINFO0=40Info record for file 0 is at 0x00000040.
0x0000003400 00 00 4AFINFO1=4AInfo record for file 1 is at 0x0000004A.
0x0000003800 00 00 54FINFO2=54Info record for file 2 is at 0x00000054.
0x0000003C00 00 00 00 -Padding
0x0000004002 66 61FTYPE0File type for file 0 is "fa".
0x0000004306 68 73 78 -
65 78 41
FNAME0Base name for file 0 is "hsxexA". File name is hsxexA.fa.
0x0000004A02 66 61FTYPE1File type for file 1 is "fa".
0x0000004D06 68 73 78 -
65 78 42
FNAME1Base name for file 1 is "hsxexB". File name is hsxexB.fa.
0x0000005402 66 61FTYPE2File type for file 2 is "fa".
0x0000005706 68 73 78 -
65 78 43
FNAME2Base name for file 2 is "hsxexC". File name is hsxexC.fa.
0x0000005E00 00 -Padding
0x0000006000 00 00 00 80SOFFH(0)=80Sequence entries for hash bucket 0 start at 0x00000080.
0x0000006500 00 00 00 97SOFFH(1)=97Sequence entries for hash bucket 1 start at 0x00000097.
0x0000006A00 00 00 00 C5SOFFH(2)=C5Sequence entries for hash bucket 2 start at 0x000000C5.
0x0000006F00 00 00 01 21SOFFH(3)=121Sequence entries for hash bucket 3 start at 0x00000121.
0x0000007400 00 00 01 38SOFFH(4)=138Sequence entries for hash bucket 4 start at 0x00000138.
0x0000007980 00 00 01 94SOFFH(5)=194Sentinel bucket indicates end of sequence entries is at 0x00000194. -

The most significant bit of SOFFH(5) -is a 1, indicating the bucket is empty.

0x0000007E00 00 -Padding
0x0000008000 00 00 00 65IXLEN0=65(Start of hash bucket 0) -
Sequence 0 is 101 bp.
0x0000008501IXFILE0=1Sequence 0 is in file 1 (hsxexB.fa).
0x0000008600 00 00 00 00 00IXOFF0=00Sequence 0 is at file offset 0x0000 00000000.
0x0000008C0A 48 53 58 -
45 58 42 5F -
36 59 46
IXNAME0Sequence 0 is named "HSXEXB_6YF".
0000009700 00 00 00 88IXLEN1=88(Start of hash bucket 1) -
Sequence 1 is 136 bp.
0000009C00IXFILE1=0Sequence 1 is in file 0 (hsxexA.fa).
0000009D00 00 00 00 00 00IXOFF1=00Sequence 1 is at file offset 0x0000 00000000.
000000A30A 48 53 58 -
45 58 41 5F -
37 38 35
IXNAME1Sequence 1 is named "HSXEXA_785".
000000AE00 00 00 00 77IXLEN2=77Sequence 2 is 119 bp.
000000B300IXFILE2=0Sequence 2 is in file 0 (hsxexA.fa).
000000B400 00 00 00 00 E3IXOFF2=E3Sequence 2 is at file offset 0x0000 000000E3.
000000BA0A 48 53 58 -
45 58 41 5F -
44 4E 51
IXNAME2Sequence 2 is named "HSXEXA_DNQ".
000000C500 00 00 00 3EIXLEN3=3E(Start of hash bucket 2) -
Sequence 3 is 62 bp.
000000CA00IXFILE3=0Sequence 3 is in file 0 (hsxexA.fa).
000000CB00 00 00 00 00 97IXOFF3=97Sequence 3 is at file offset 0x0000 00000097.
000000D10A 48 53 58 -
45 58 41 5F -
38 38 4B
IXNAME3Sequence 3 is named "HSXEXA_88K".
000000DC00 00 00 00 5CIXLEN4=5CSequence 4 is 92 bp.
000000E100IXFILE4=0Sequence 4 is in file 0 (hsxexA.fa).
000000E200 00 00 00 01 69IXOFF4=169Sequence 4 is at file offset 0x0000 00000169.
000000E80A 48 53 58 -
45 58 41 5F -
4C 52 57
IXNAME4Sequence 4 is named "HSXEXA_LRW".
000000F300 00 00 00 60IXLEN5=60Sequence 5 is 96 bp.
000000F801IXFILE5=1Sequence 5 is in file 1 (hsxexB.fa).
000000F900 00 00 00 01 83IXOFF5=183Sequence 5 is at file offset 0x0000 00000183.
000000FF0A 48 53 58 -
45 58 42 5F -
59 56 31
IXNAME5Sequence 5 is named "HSXEXB_YV1".
0000010A00 00 00 00 72IXLEN6=72Sequence 6 is 114 bp.
0000010F02IXFILE6=2Sequence 6 is in file 2 (hsxexC.fa).
0000011000 00 00 00 00 00IXOFF6=0Sequence 6 is at file offset 0x0000 00000000.
000001160A 48 53 58 -
45 58 43 5F -
34 5A 4C
IXNAME6Sequence 6 is named "HSXEXC_4ZL".
0000012100 00 00 00 6FIXLEN7=6F(Start of hash bucket 3) -
Sequence 7 is 111 bp.
0000012601IXFILE7=1Sequence 7 is in file 1 (hsxexB.fa).
0000012700 00 00 00 01 05IXOFF7=105Sequence 7 is at file offset 0x0000 00000105.
0000012D0A 48 53 58 -
45 58 42 5F -
59 4B 55
IXNAME7Sequence 7 is named "HSXEXB_YKU".
0000013800 00 00 00 4EIXLEN8=4E(Start of hash bucket 4) -
Sequence 8 is 78 bp.
0000013D00IXFILE8=0Sequence 8 is in file 0 (hsxexA.fa).
0000013E00 00 00 00 01 D3IXOFF8=1D3Sequence 8 is at file offset 0x0000 000001D3.
000001440A 48 53 58 -
45 58 41 5F -
52 39 56
IXNAME8Sequence 8 is named "HSXEXA_R9V".
0000014F00 00 00 00 82IXLEN9=82Sequence 9 is 130 bp.
0000015401IXFILE9=1Sequence 9 is in file 1 (hsxexB.fa).
0000015500 00 00 00 00 74IXOFF9=74Sequence 9 is at file offset 0x0000 00000074.
0000015B0A 48 53 58 -
45 58 42 5F -
57 43 56
IXNAME9Sequence 9 is named "HSXEXB_WCV".
0000016600 00 00 00 47IXLEN10=47Sequence 10 is 71 bp.
0000016B02IXFILE10=2Sequence 10 is in file 2 (hsxexC.fa).
0000016C00 00 00 00 00 81IXOFF10=81Sequence 10 is at file offset 0x0000 00000081.
000001720A 48 53 58 -
45 58 43 5F -
39 33 36
IXNAME10Sequence 10 is named "HSXEXC_936".
0000017D00 00 00 00 60IXLEN11=60Sequence 11 is 96 bp.
0000018202IXFILE11=2Sequence 11 is in file 2 (hsxexC.fa).
0000018300 00 00 00 00 D6IXOFF11=D6Sequence 11 is at file offset 0x0000 000000D6.
000001890A 48 53 58 -
45 58 43 5F -
47 57 44
IXNAME11Sequence 11 is named "HSXEXC_GWD".
00000194(file ends here)
- - -

-


-Bob Harris and Cathy Riemer, January 2010 - -

- - diff --git a/programs/lastz/images/after_chaining.png b/programs/lastz/images/after_chaining.png deleted file mode 100644 index 6660551..0000000 Binary files a/programs/lastz/images/after_chaining.png and /dev/null differ diff --git a/programs/lastz/images/after_interpolation.png b/programs/lastz/images/after_interpolation.png deleted file mode 100644 index 309f53c..0000000 Binary files a/programs/lastz/images/after_interpolation.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_chained.png b/programs/lastz/images/aglobin_chained.png deleted file mode 100644 index 84ed244..0000000 Binary files a/programs/lastz/images/aglobin_chained.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_closeup_gapped.png b/programs/lastz/images/aglobin_closeup_gapped.png deleted file mode 100644 index b5f0dc2..0000000 Binary files a/programs/lastz/images/aglobin_closeup_gapped.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_closeup_hsps.png b/programs/lastz/images/aglobin_closeup_hsps.png deleted file mode 100644 index a1284ca..0000000 Binary files a/programs/lastz/images/aglobin_closeup_hsps.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_closeup_seeds.png b/programs/lastz/images/aglobin_closeup_seeds.png deleted file mode 100644 index 7d1bdb5..0000000 Binary files a/programs/lastz/images/aglobin_closeup_seeds.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_hsps.png b/programs/lastz/images/aglobin_hsps.png deleted file mode 100644 index 066b053..0000000 Binary files a/programs/lastz/images/aglobin_hsps.png and /dev/null differ diff --git a/programs/lastz/images/aglobin_unchained.png b/programs/lastz/images/aglobin_unchained.png deleted file mode 100644 index c917e80..0000000 Binary files a/programs/lastz/images/aglobin_unchained.png and /dev/null differ diff --git a/programs/lastz/images/anchors_and_alignment.png b/programs/lastz/images/anchors_and_alignment.png deleted file mode 100644 
index 012cfe7..0000000 Binary files a/programs/lastz/images/anchors_and_alignment.png and /dev/null differ diff --git a/programs/lastz/images/before_chaining.png b/programs/lastz/images/before_chaining.png deleted file mode 100644 index 9e6ca2c..0000000 Binary files a/programs/lastz/images/before_chaining.png and /dev/null differ diff --git a/programs/lastz/images/before_interpolation.png b/programs/lastz/images/before_interpolation.png deleted file mode 100644 index c78092f..0000000 Binary files a/programs/lastz/images/before_interpolation.png and /dev/null differ diff --git a/programs/lastz/images/human_vs_chicken.png b/programs/lastz/images/human_vs_chicken.png deleted file mode 100644 index 0e74f6f..0000000 Binary files a/programs/lastz/images/human_vs_chicken.png and /dev/null differ diff --git a/programs/lastz/images/human_vs_chicken_full.png b/programs/lastz/images/human_vs_chicken_full.png deleted file mode 100644 index c1c191e..0000000 Binary files a/programs/lastz/images/human_vs_chicken_full.png and /dev/null differ diff --git a/programs/lastz/images/seeds_and_hsps.png b/programs/lastz/images/seeds_and_hsps.png deleted file mode 100644 index a3d98de..0000000 Binary files a/programs/lastz/images/seeds_and_hsps.png and /dev/null differ diff --git a/programs/lastz/images/seq_vs_same.png b/programs/lastz/images/seq_vs_same.png deleted file mode 100644 index 02ea092..0000000 Binary files a/programs/lastz/images/seq_vs_same.png and /dev/null differ diff --git a/programs/lastz/images/seq_vs_same_notrivial.png b/programs/lastz/images/seq_vs_same_notrivial.png deleted file mode 100644 index 1ff4cbc..0000000 Binary files a/programs/lastz/images/seq_vs_same_notrivial.png and /dev/null differ diff --git a/programs/lastz/images/seq_vs_self.png b/programs/lastz/images/seq_vs_self.png deleted file mode 100644 index b16f9bb..0000000 Binary files a/programs/lastz/images/seq_vs_self.png and /dev/null differ diff --git a/programs/lastz/images/seq_vs_self_no_mirror.png 
b/programs/lastz/images/seq_vs_self_no_mirror.png deleted file mode 100644 index a7f66ef..0000000 Binary files a/programs/lastz/images/seq_vs_self_no_mirror.png and /dev/null differ diff --git a/programs/lastz/images/word_count_table.png b/programs/lastz/images/word_count_table.png deleted file mode 100644 index ff1bef2..0000000 Binary files a/programs/lastz/images/word_count_table.png and /dev/null differ diff --git a/programs/lastz/images/ydrop.png b/programs/lastz/images/ydrop.png deleted file mode 100644 index a2455f5..0000000 Binary files a/programs/lastz/images/ydrop.png and /dev/null differ diff --git a/programs/lastz/lav_format.html b/programs/lastz/lav_format.html deleted file mode 100644 index adffcf7..0000000 --- a/programs/lastz/lav_format.html +++ /dev/null @@ -1,358 +0,0 @@ - - - -LAV Format - - - - - - -

-

LAV Format

- -

-TABLE OF CONTENTS - -

-

- -

-

Introduction

-

- -LAV is a plain-text file format for alignments of two DNA sequences. It is -the only output format produced by the -BLASTZ alignment program -(though often converted to -AXT format -by post-processing programs), and is the default output format for BLASTZ's -successor, LASTZ. -

-The alignment blocks are grouped by sequence (e.g. chromosome, scaffold, -contig, cDNA read, shotgun sequencing read, etc.) and strand, and described -by listing the coordinates of the gap-free aligning segments in each block. -This format is compact because it does not include the nucleotides, but the -tradeoff is that interpretation usually requires access to the original -sequence files, and it is not easy for humans to read. - -

-

Example

-

- -Here's a typical LAV file: -

-

-    #:lav
-    d {
-      "lastz.v0.3 malus.fa aurantium.fa C=2 W=8 T=0 
-         A    C    G    T
-        91 -114  -31 -123
-      -114  100 -125  -31
-       -31 -125  100 -114
-      -123  -31 -114   91
-      O = 400, E = 30, K = 3000, L = 3000, M = 0"
-    }
-    #:lav
-    s {
-      "malus.fa" 1 191411218 0 1
-      "aurantium.fa" 1 90634903 0 1
-    }
-    h {
-      "> apple"
-      "> orange"
-    }
-    a {
-      s 20643
-      b 46566766 2083211
-      e 46567353 2083795
-      l 46566766 2083211 46566796 2083241 61
-      l 46566797 2083245 46566814 2083262 78
-      l 46566821 2083263 46567353 2083795 65
-    }
-    a {
-      s 4233
-      b 47246530 10635696
-      e 47246660 10635826
-      l 47246530 10635696 47246660 10635826 63
-    }
-    ... many more a-stanzas ...
-    #:lav
-    s {
-      "malus.fa" 1 191411218 0 1
-      "aurantium.fa-" 1 90634903 1 1
-    }
-    h {
-      "> apple"
-      "> orange (reverse complement)"
-    }
-    a {
-      s 13897
-      b 1005819 5352698
-      e 1006099 5352978
-      l 1005819 5352698 1006099 5352978 74
-    }
-    ... many more a-stanzas ...
-    #:eof
-
- -

-

Stanza Types

-

- -An LAV file primarily consists of a series of "stanzas", each of which -is a single letter code followed by a brace-enclosed block. There are -also #:lav lines which break the file into sections, and -one #:eof line indicating the end of the file. Programs -that read LAV format should consider the file bad if the -#:eof is missing (or if anything appears after it). - -

-D Stanza -

-The d-stanza is intended to document the program and parameters used -to create the file. Programs reading the file normally treat this as -a comment, but it is possible to extract the scoring parameters for -further processing. - -

-S Stanza -

-An s-stanza describes the sequences used for the subsequent alignment -records (a-stanzas). It contains exactly two lines in the following -format. -

-

-    "<filename>[-]" <start> <stop> [<rev_comp_flag> <sequence_number>]
-
-

-Here <start> and <stop> are -origin 1 (i.e. the first base in the original given sequence is called -"1") and inclusive (both endpoints are included in the interval). -Usually <start> is 1 and <stop> -is the full length of the given sequence, however they can specify any -subsequence (e.g. if the alignment program was instructed to use only -part of the original sequence). -

-<rev_comp_flag> is 1 if the sequence was -reverse-complemented before aligning, or 0 otherwise. Usually the -first sequence will have a 0 here, since most alignment programs only -ever reverse-complement the second one. If this flag is 1, the -<filename> will also have a - appended -to it; programs that read LAV format should report an error if these -two indicators are contradictory. Note that even when this flag -indicates reverse-complement, the <start> and -<stop> endpoints are still relative to the original -orientation, and <start> is less than -<stop>. That is, conceptually the alignment program -extracts the requested sequence fragment first, then reverse-complements -it (if applicable), and finally tries to align it. -

-<sequence_number> is useful when the second file -contains multiple sequences. The first sequence is 1, the second is 2, and so -on. Most programs that write and read LAV format do not allow the first file to -contain multiple sequences, so in these cases the sequence number for the first -file is always 1 (though the format itself does not require this). Note that -<start> and <stop> are relative to each -sequence, not to the entire file. -

-The <rev_comp_flag> and -<sequence_number> are shown here as optional because -early versions of this format did not include them. - -

-H Stanza -

-Usually an s-stanza is followed immediately by an h-stanza, which -provides a name for each of the two sequences, typically obtained -from the FASTA header line. (Before the s-stanza's -<sequence_number> field was introduced, this was -the only way to identify which sequence from a multi-sequence file was -aligned.) A (reverse complement) suffix is appended when -applicable; again, programs should report an error if this contradicts -the other indicators. - -

-A Stanza -

-An a-stanza describes a single alignment block, sometimes called a -"local alignment", which typically includes gaps due to small insertions -and deletions in the aligned sequences. In the example below, the -s, b, and -e lines indicate that the block has a score -of 13916 and an overall range of 4886..5171 in sequence 1 and 21292..21537 -in sequence 2. -

-The l lines describe the block's gap-free segments, with the -final field representing the percentage of matching bases in each segment. -In this example the alignment starts with a segment from 4886..4899 in -sequence 1 and from 21292..21305 in sequence 2, having a percent identity -of 79%. Note that the segment length must be the same in both sequences -(14 basepairs for this segment). The next segment starts at 4900 and 21308 -in sequences 1 and 2, respectively, indicating a two-base gap in sequence 1 -(corresponding to positions 21306 and 21307 in sequence 2). -

-

-    a {
-      s 13916
-      b 4886 21292
-      e 5171 21537
-      l 4886 21292 4899 21305 79
-      l 4900 21308 4924 21332 92
-      l 4925 21334 5024 21433 88
-      l 5027 21434 5040 21447 100
-      l 5086 21448 5117 21479 84
-      l 5118 21484 5171 21537 87
-    }
-
-

-Coordinates in an a-stanza are origin 1 and inclusive, and are relative -to the subsequences indicated in the most recent s-stanza. In the -example below the alignment is of apple 1333..1444 to orange 2777..2888. -

-

-    s {
-      "malus.fa" 1001 2000 0 1
-      "aurantium.fa" 2001 5000 0 1
-    }
-    ...
-    a {
-      s 7321
-      b 333 777
-      e 444 888
-      l 333 777 444 888 62
-    }
-
-

-If a sequence is reverse-complemented, then the coordinates are relative -to the reverse complement, so they are counted back from the end of -the subsequence. Thus the example below represents an alignment of apple -1333..1444 to the reverse complement of orange 4113..4224. In detail: -the s-stanza indicates that the first sequence from aurantium.fa should be -used, and its subsequence from 2001..5000 should be extracted and then -reverse complemented before aligning with apple. In this 3000 bp -reverse-complemented subsequence, the first base corresponds to position -5000 in the original sequence, the second to position 4999, and so on to -the last (3000th) base, which corresponds to position 2001. Thus the -conversion formula is p = 5000 - (r - 1), where p -is the position in the original sequence, and r is the position in the -reverse-complemented subsequence. Within the reverse-complemented -subsequence, the alignment is at 777..888. The starting point, 777, is the -nucleotide 776 bp back from 5000, or 4224, while the ending point, 888, is -887 bp back from 5000, or 4113. -

-

-    s {
-      "malus.fa" 1001 2000 0 1
-      "aurantium.fa-" 2001 5000 1 1
-    }
-    ...
-    a {
-      s 7321
-      b 333 777
-      e 444 888
-      l 333 777 444 888 62
-    }
-
-

-The fifth numeric field in an a-stanza's l line is the -percentage of bases in the aligned segment that match (often called the -"percent identity" or "percent id"). This is used by viewer tools such as -Laj and -PipMaker. - -

-X and M Stanzas -

-An LAV file may also contain x- and m-stanzas describing dynamic masking. -Each section will contain an x-stanza that looks like the one below. The -count is the number of bases newly masked as a result of processing the -latest query sequence; it does not include bases previously masked. -

-

-    x {
-      n <count>
-    }
-
-

-A single m-stanza listing the masked regions is then included in the final -section, and looks like the one below. <start> and -<stop> are origin 1 and inclusive, and are relative -to the first subsequence indicated in the most recent s-stanza. -

-

-    m {
-      x <start> <end>
-      x <start> <end>
-      ...
-      n <count>
-    }
-
-

-Dynamic masking is invoked by the -‑‑masking=<count> option in LASTZ, or -the M=<count> option in BLASTZ. For more information -about these options, please see the -LASTZ documentation. - -

-Census Stanza -

-In LASTZ, the ‑‑census option will produce a -Census-stanza. The first field in each line (1, -2, 3, …) is a -position in the target (sequence 1). The count indicates the number of -times the corresponding base appears in an alignment. -

-

-    Census {
-      1 <count>
-      2 <count>
-      ...
-    }
-
- -

-


-Bob Harris and Cathy Riemer, October 2008 - -

- - diff --git a/programs/lastz/make-include.mak b/programs/lastz/make-include.mak deleted file mode 100644 index 7300047..0000000 --- a/programs/lastz/make-include.mak +++ /dev/null @@ -1,14 +0,0 @@ -#----------- -# make-include.mak-- -# Defines variables used by all LASTZ Makefiles -#----------- - -INSTALL = install -ARCH ?= $(shell uname -m) - -ifdef LASTZ_INSTALL -installDir = ${LASTZ_INSTALL} -else -installDir = ${HOME}/lastz-distrib/bin -endif - diff --git a/programs/lastz/src/Makefile b/programs/lastz/src/Makefile deleted file mode 100644 index 11510c5..0000000 --- a/programs/lastz/src/Makefile +++ /dev/null @@ -1,164 +0,0 @@ -include ../make-include.mak -include version.mak - -CC=gcc - -# default targets - -default: lastz lastz_D - -special: lastz_32 - -builds = lastz lastz_D lastz_32 - -#--------- -# program build -# -# normally creates two versions: -# lastz: standard lastz (integer scoring) -# lastz_D: lastz with double-float scoring -# -# and can also create this special version: -# lastz_32: standard lastz with 32-bit sequence indexing -# -#--------- -# build-time options (presently only one) -# -# This will enable certain options within the compiled program. To build with -# one of these option, you would do "make lastz =ON". The result -# will be an executable named lastz, i.e. mostly indistinguishable from the -# executable without this option. The only place the executable will report -# the option is in response to "lastz --version". -# -# So, for example, -# make lastz_32 allowBackToBackGaps=ON -# will build with allowBackToBackGaps set to ON. -# -# Note that you should do a "make clean" before doing a build with one of these -# options; otherwise there is no guarantee that all modules will get built with -# the same option settings. -# -# allowBackToBackGaps gapped_extend.c is modified to allow the opening of -# .. 
a delete right after an insert, or vice versa -# -#--------- -# Notes re optimization flags: -# -# On a 2GHz intel core duo iMac: -# -# O3 is a definite improvement over no optimization, improving many of -# the most-used routines down to as low as 60 to 70% of unoptimized run -# time. -# -# However, using -funroll-loops actually slowed things down a little. -# -#--------- - -definedForAll = -Wall -Wextra -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -flagsFor32 = -Dmax_sequence_index=32 -Dmax_malloc_index=40 -Ddiag_hash_size=4194304 - -allowBackToBackGaps ?= 0 # by default allowBackToBackGaps -ifeq ($(allowBackToBackGaps), ON) # .. is off; see note above for - definedForAll += -DallowBackToBackGaps # .. how to turn it on -endif - - -VERSION_FLAGS= \ - -DVERSION_MAJOR="\"${VERSION_MAJOR}"\" \ - -DVERSION_MINOR="\"${VERSION_MINOR}"\" \ - -DVERSION_SUBMINOR="\"${VERSION_SUBMINOR}"\" \ - -DREVISION_DATE="\"${REVISION_DATE}"\" \ - -DSUBVERSION_REV="\"${SUBVERSION_REV}"\" - - -CFLAGS = -O3 ${definedForAll} ${VERSION_FLAGS} - - -srcFiles = lastz infer_scores \ - seeds pos_table quantum seed_search diag_hash \ - chain gapped_extend tweener masking \ - segment edit_script \ - identity_dist coverage_dist continuity_dist \ - output gfa lav axt maf cigar sam genpaf text_align align_diffs \ - utilities dna_utilities sequences capsule - -incFiles = lastz.h infer_scores.h \ - seeds.h pos_table.h quantum.h seed_search.h diag_hash.h \ - chain.h gapped_extend.h tweener.h masking.h \ - segment.h edit_script.h \ - identity_dist.h coverage_dist.h continuity_dist.h \ - output.h gfa.h lav.h axt.h maf.h sam.h cigar.h genpaf.h text_align.h align_diffs.h \ - utilities.h dna_utilities.h sequences.h capsule.h - -%.o: %.c version.mak ${incFiles} - ${CC} -c ${CFLAGS} -Dscore_type=\'I\' $< -o $@ - -%_D.o: %.c version.mak ${incFiles} - ${CC} -c ${CFLAGS} -Dscore_type=\'D\' $< -o $@ - -%_32.o: %.c version.mak ${incFiles} - ${CC} -c ${CFLAGS} ${flagsFor32} $< -o $@ - - -lastz: $(foreach 
part,${srcFiles},${part}.o) - ${CC} $(foreach part,${srcFiles},${part}.o) -lm -o $@ - -lastz_D: $(foreach part,${srcFiles},${part}_D.o) - ${CC} $(foreach part,${srcFiles},${part}_D.o) -lm -o $@ - -lastz_32: $(foreach part,${srcFiles},${part}_32.o) - ${CC} $(foreach part,${srcFiles},${part}_32.o) -lm -o $@ - -# cleanup - -clean: cleano clean_builds clean_test - -cleano: - rm -f *.o - -clean_builds: - rm -f lastz - rm -f lastz_D - rm -f lastz_32 - -# installation; change installDir to suit your needs (in ../make-include.mak) - -install: lastz lastz_D - ${INSTALL} -d ${installDir} - ${INSTALL} lastz ${installDir} - ${INSTALL} lastz_D ${installDir} - -install_lastz: lastz - ${INSTALL} -d ${installDir} - ${INSTALL} lastz ${installDir} - -install_D: lastz_D - ${INSTALL} -d ${installDir} - ${INSTALL} lastz_D ${installDir} - -install_32: lastz_32 - ${INSTALL} -d ${installDir} - ${INSTALL} lastz_32 ${installDir} - -#--------- -# testing -# -# A small test to give some comfort level that the program has built properly, -# or that changes you've made to the source code haven't broken it. If the -# test succeeds, there will be no output from the diff. -#--------- - -clean_test: - rm -f ../test_results/base_test*.* - -test: lastz - @rm -f ../test_results/base_test.default.lav - @./lastz \ - ../test_data/pseudocat.fa \ - ../test_data/pseudopig.fa \ - | sed "s/\"lastz\.[^ ]* //g" \ - > ../test_results/base_test.default.lav - @diff \ - ../test_data/base_test.default.lav \ - ../test_results/base_test.default.lav - diff --git a/programs/lastz/src/align_diffs.c b/programs/lastz/src/align_diffs.c deleted file mode 100755 index 4b52f8b..0000000 --- a/programs/lastz/src/align_diffs.c +++ /dev/null @@ -1,729 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: align_diffs.c -// -//---------- -// -// align_diffs-- -// Support for printing alignments in a textual differences format. 
-// -// For an alignment like this: -// -// s phiX 4294 35 + 5386 CCCCCAACTTGATATTAATAACACTATAGACCACC -// s HWI-EAS91_1_306UPAAXX 1 35 - 36 CCCCCATCTTGATATTAATAACACTATAGACCACC -// -// The output will be something like this (but all on one line): -// -// phiX 4300 4301 + 5386 -// HWI-EAS91_1_306UPAAXX 7 8 - 36 -// A T -// CCCCCAACTTGATATTAATAACACTATAGACCACC CCCCCATCTTGATATTAATAACACTATAGACCACC -// -// Note that we use the same position conventions are per MAF format, zero-based, -// half-open, and negative strand positions are counted from the 5' end of the -// *negative* strand. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -#define align_diffs_owner // (make this the owner of its globals) -#include "align_diffs.h" // interface to this module - -//---------- -// -// prototypes for private functions -// -//---------- - -static void print_align_difference (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, - unspos diffPos1, u8* diffText1, - unspos diffPos2, u8* diffText2, - unspos diffLength, - int withBlocks); - -static void print_match_difference (FILE* f, - seq* seq1, unspos pos1, unspos diffPos1, - seq* seq2, unspos pos2, unspos diffPos2, - unspos length, unspos diffLength, - int withBlocks); - -//---------- -// -// print_align_diffs_job_header-- -// Print a alignment differences job header. 
-// -//---------- - -void print_align_diffs_job_header - (arg_dont_complain(FILE* f), - arg_dont_complain(char* programName), - arg_dont_complain(char* name1), - arg_dont_complain(char* name2)) - { - // (do nothing) - } - -//---------- -// -// print_align_diffs_job_footer-- -// Print a alignment differences job footer. -// -//---------- - -void print_align_diffs_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_align_diffs_header-- -// Print a alignment differences query header. -// -//---------- - -void print_align_diffs_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - // (do nothing) - } - -//---------- -// -// print_align_diffs_align_list-- -// Print a list of gapped alignments, textually. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// int withBlocks: true => include full alignment block in output. -// int inhibitN: true => don't report mismatch-with-N differences. -// -// Returns: -// (nothing) -// -//---------- - -void print_align_diffs_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int withBlocks, - int inhibitN) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - print_align_diffs_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, - withBlocks, inhibitN); - } - -//---------- -// -// print_align_diffs_align-- -// Print a single gapped alignment, textually. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment takes -// .. in the DP matrix. 
-// int withBlocks: true => include full alignment block in output. -// int inhibitN: true => don't report mismatch-with-N differences. -// -// Returns: -// (nothing) -// -//---------- - -void print_align_diffs_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - int withBlocks, - int inhibitN) - { - unspos height, width, i, j, run; - u32 opIx; - u8* p, *q; - s8 b1, b2; - unspos ix; - int isMatch, mismatchRun, gapLen; - - height = end1 - beg1; - width = end2 - beg2; - - // find and report each difference - - opIx = 0; - for (i=j=0 ; (iv+beg1+i; - q = seq2->v+beg2+j; - mismatchRun = 0; - for (ix=0 ; ixv+beg1+i; - startJ = j; q = seq2->v+beg2+j; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - gapLen = i - startI; - print_align_difference (f, - seq1, beg1, end1, - seq2, beg2, end2, - script, - i-gapLen, p, - j, NULL, - gapLen, - withBlocks); - p += gapLen; - } - - if (j != startJ) - { - gapLen = j - startJ; - print_align_difference (f, - seq1, beg1, end1, - seq2, beg2, end2, - script, - i, NULL, - j-gapLen, q, - gapLen, - withBlocks); - q += gapLen; - } - } - } - - } - - -static void print_align_difference - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - unspos diffPos1, - u8* diffText1, - unspos diffPos2, - u8* diffText2, - unspos diffLength, - int withBlocks) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos height, width, i, j, run; - u32 opIx; - u8* p, *q; - unspos ix; - char* name1, *name2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len, seq1True, seq2True; - char strand1, strand2; - unspos startI, startJ; - unspos diffLength1, diffLength2; - - height = end1 - beg1; - width = end2 - beg2; - - ////////// - // figure out the alignment's length - ////////// - - opIx = 0; - for 
(i=j=0 ; (ip == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = beg1 + diffPos1 - offset1 + startLoc1; - strand1 = '+'; - } - else - { - start1 = beg1 + diffPos1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2 + diffPos2 - offset2 + startLoc2; - strand2 = '+'; - } - else - { - start2 = beg2 + diffPos2 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - strand2 = '-'; - } - - diffLength1 = (diffText1 != NULL)? diffLength : 0; - diffLength2 = (diffText2 != NULL)? 
diffLength : 0; - - ////////// - // print positional information - ////////// - - fprintf (f, "%s\t" unsposFmt "\t" unsposFmt "\t%c\t" unsposFmt "\t", - name1, start1-1, start1-1+diffLength1, strand1, seq1True); - - fprintf (f, "%s\t" unsposFmt "\t" unsposFmt "\t%c\t" unsposFmt "\t", - name2, start2-1, start2-1+diffLength2, strand2, seq2True); - - // print the aligned difference - - if (diffText1 != NULL) - { - for (ix=0 ; ixv+beg1+i; - q = seq2->v+beg2+j; - for (ix=0 ; ixv+beg1+i; - startJ = j; q = seq2->v+beg2+j; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIv+beg1+i; - q = seq2->v+beg2+j; - for (ix=0 ; ixv+beg1+i; - startJ = j; q = seq2->v+beg2+j; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startI include full alignment block in output. -// int inhibitN: true => don't report mismatch-with-N differences. -// -// Returns: -// (nothing) -// -//---------- - -void print_align_diffs_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - int withBlocks, - int inhibitN) - { - u8* s1, *s2; - s8 b1, b2; - unspos ix; - int isMatch, mismatchRun; - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_align_diffs_match"); - - s1 = seq1->v + pos1; - s2 = seq2->v + pos2; - - // find and report each difference - - mismatchRun = 0; - for (ix=0 ; ixpartition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - char* name1, *name2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len, seq1True, seq2True; - char strand1, strand2; - unspos ix; - - // figure out position offsets and names - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = diffPos1 - offset1 + startLoc1; - strand1 = '+'; - } - else - { - start1 = diffPos1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = diffPos2 - offset2 + startLoc2; - strand2 = '+'; - } - else - { - start2 = diffPos2 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - strand2 = '-'; - } - - // print positional information - - fprintf (f, "%s\t" unsposFmt "\t" unsposFmt "\t%c\t" unsposFmt "\t", - name1, start1-1, start1-1+diffLength, strand1, seq1True); - - fprintf (f, "%s\t" unsposFmt "\t" unsposFmt "\t%c\t" unsposFmt "\t", - name2, start2-1, start2-1+diffLength, strand2, seq2True); - - // print the aligned difference - - for (ix=0 ; ix // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// 
prototypes for routines in align_diffs.c -// -//---------- - -void print_align_diffs_job_header (FILE* f, - char* programName, char* name1, char* name2); -void print_align_diffs_job_footer (FILE* f); -void print_align_diffs_header (FILE* f, seq* seq1, seq* seq2); -void print_align_diffs_align_list (FILE* f, - alignel* alignList, seq* seq1, seq* seq2, - int withBlocks, int inhibitN); -void print_align_diffs_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, - int withBlocks, int inhibitN); -void print_align_diffs_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - int withBlocks, int inhibitN); - -#endif // align_diffs_H diff --git a/programs/lastz/src/axt.c b/programs/lastz/src/axt.c deleted file mode 100755 index 40c7d8e..0000000 --- a/programs/lastz/src/axt.c +++ /dev/null @@ -1,583 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: axt.c -// -//---------- -// -// axt-- -// Support for printing alignments in AXT format. -// -// AXT format is a well-established pairwise alignment format. 
As of Jan/2009, -// a spec for AXT files can be found at -// http://genome.ucsc.edu/goldenPath/help/axt.html -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "identity_dist.h" // identity distribution "format" stuff -#include "coverage_dist.h" // query coverage distribution stuff -#include "output.h" // alignment outout format stuff -#include "genpaf.h" // genpaf alignment format stuff - -#define axt_owner // (make this the owner of its globals) -#include "axt.h" // interface to this module - -// alignment counter - -static u64 axtAlignmentNumber; - -//---------- -// -// print_axt_job_header-- -// Print axt format job header. -// -//---------- - -void print_axt_job_header - (FILE* f, - char* _programName, - char* _args, - scoreset* scoring, - sthresh* hspThreshold, - sthresh* gappedThreshold, - score xDrop, - score yDrop) - { - char* programName = _programName; - char* args = _args; - - if (programName == NULL) programName = "(no name)"; - if (args == NULL) args = ""; - - fprintf (f, "# %s %s\n", programName, args); - fprintf (f, "#\n"); - fprintf (f, "# hsp_threshold = %s\n", score_thresh_to_string (hspThreshold)); - fprintf (f, "# gapped_threshold = %s\n", score_thresh_to_string (gappedThreshold)); - fprintf (f, "# x_drop = " scoreFmtSimple "\n", xDrop); - fprintf (f, "# y_drop = " scoreFmtSimple "\n", yDrop); - print_score_matrix_prefix (f, scoring, true, "# "); - - axtAlignmentNumber = (u64) -1; // caveat: this only works properly if - // .. we only write one axt file at a - // .. 
time, and write it completely - } - -//---------- -// -// print_axt_job_footer-- -// Print axt format job footer. -// -//---------- - -void print_axt_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_axt_header-- -// Print axt format query header. -// -//---------- - -void print_axt_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - // (do nothing) - } - -//---------- -// -// print_axt_align_list-- -// Print a list of gapped alignments in axt format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// int withComments: true => print comments as well -// char* extras: Extra fields to print on the summary line. -// .. These are genpaf keys, defined in genpaf.h -// .. (genpafXXX values). Currently only -// .. genpafSize2 is supported. This may be NULL. -// -// Returns: -// (nothing) -// -//---------- - -void print_axt_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int withComments, - char* extras) - { - alignel* a; - unspos numer, denom; - - for (a=alignList ; a!=NULL ; a=a->next) - { - if (withComments) - { - // report identity - alignment_identity (seq1, seq2, a, &numer, &denom); - fprintf (f, "# identity=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report coverage - alignment_coverage (seq1, seq2, a, &numer, &denom); - fprintf (f, "# coverage=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - } - - print_axt_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s, extras); - } - - } - -//---------- -// -// print_axt_align-- -// Print a single gapped alignment in axt format. 
-// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment -// .. takes in the DP matrix. -// score s: The alignment's score. -// char* extras: Extra fields to print on the summary line. -// .. These are genpaf keys, defined in genpaf.h -// .. (genpafXXX values). Currently only -// .. genpafSize2 is supported. This may be NULL. -// -// Returns: -// (nothing) -// -//---------- - -void print_axt_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - score s, - char* extras) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos height, width, i, j, startI, startJ, run; - u32 opIx; - u8* p, *q; - unspos ix, len1, len2; - char* name1, *name2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq2Len, seq2True; - char strand2; - - if ((extras != NULL) && (strlen(extras) != 1) && (extras[0] != genpafSize2)) - suicide ("internal error: print_axt_align doesn't support extras"); - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_axt_align"); - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - len1 = height = end1 - beg1 + 1; - len2 = width = end2 - beg2 + 1; - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1-1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2-1); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - ////////// - // print summary line - ////////// - - axtAlignmentNumber++; - - start1 = beg1-1 - offset1 + startLoc1; - - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2-1 - offset2 + startLoc2; - strand2 = '+'; - } - else - { - start2 = beg2-1 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - strand2 = '-'; - } - - fprintf (f, u64Fmt " %s " unsposFmt " " unsposFmt - " %s " unsposFmt " " unsposFmt " %c " scoreFmt, - axtAlignmentNumber, - name1, start1, start1+len1-1, - name2, start2, start2+len2-1, strand2, - s); - if ((extras != NULL) && (strlen(extras) == 1) && (extras[0] == genpafSize2)) - fprintf (f, " " unsposFmt, seq2Len); - fprintf (f, "\n"); - - ////////// - // print aligning path in sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - ////////// - - opIx = 0; - for (i=j=0 ; (iv+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIv+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - 
startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startI print comments as well -// char* extras: Extra fields to print on the summary line. These -// .. are genpaf keys, defined in genpaf.h (genpafXXX -// .. values). Currently only genpafSize2 is -// .. supported. This may be NULL. -// -// Returns: -// (nothing) -// -//---------- - -void print_axt_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - int withComments, - char* extras) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - char* name1, *name2; - unspos offset1, offset2, start1, end1, start2, end2; - unspos startLoc1, startLoc2; - unspos seq2Len, seq2True; - char strand2; - unspos ix; - segment seg; - unspos numer, denom; - - if ((extras != NULL) && (strlen(extras) != 1) && (extras[0] != genpafSize2)) - suicide ("internal error: print_axt_match doesn't support extras"); - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_axt_match"); - - if (withComments) - { - // report identity - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, "# identity=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report coverage - seg.pos1 = pos1; - seg.pos2 = pos2; - seg.length = length; - segment_coverage (seq1, seq2, &seg, &numer, &denom); - fprintf (f, "# coverage=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - } - - // figure out position offsets and names - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - // print summary line - - axtAlignmentNumber++; - - start1 = pos1 - offset1 + startLoc1; - end1 = start1 + length; - - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = pos2 - offset2 + startLoc2; - end2 = start2 + length; - strand2 = '+'; - } - else - { - start2 = pos2 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - end2 = start2 + length; - strand2 = '-'; - } - - fprintf (f, u64Fmt " %s " unsposFmt " " unsposFmt - " %s " unsposFmt " " unsposFmt " %c " scoreFmt, - axtAlignmentNumber, - name1, start1, end1-1, - name2, start2, end2-1, strand2, - s); - if ((extras != NULL) && (strlen(extras) == 1) && (extras[0] == genpafSize2)) - fprintf (f, " " unsposFmt, seq2Len); - fprintf (f, "\n"); - - // print aligning segment of sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - - for (ix=0 ; ix // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// prototypes for routines 
in axt.c -// -//---------- - -void print_axt_job_header (FILE* f, - char* programName, char* args, scoreset* scoring, - sthresh* hspThreshold, sthresh* gappedThreshold, - score xDrop, score yDrop); -void print_axt_job_footer (FILE* f); -void print_axt_header (FILE* f, seq* seq1, seq* seq2); -void print_axt_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - int withComments, char* extras); -void print_axt_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s, char* extras); -void print_axt_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, int withComments, char* extras); -void print_axt_comment (FILE* f, const char* format, ...); -void vprint_axt_comment (FILE* f, const char* format, va_list args); - -#endif // axt_H diff --git a/programs/lastz/src/build_options.h b/programs/lastz/src/build_options.h deleted file mode 100644 index a156264..0000000 --- a/programs/lastz/src/build_options.h +++ /dev/null @@ -1,28 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: build_options.h -// -//---------- -// -// This file contains a list of compile-time optional builds. Normally these -// build options should be left commented out here, and enabled instead by -// use of the "make" command line, as described in the Makefile. -// -// We show them here for two reasons. First, it provides a single place to -// describe the options, rather than scatter them over the .c files. Second, -// it provides a common point to make such definitions and to "ensure" (to the -// extent that we can do so) that all modules will be built with the same -// settings. -// -//---------- - -#ifndef build_options_H // (prevent multiple inclusion) -#define build_options_H - -//#define allowBackToBackGaps // if this is defined, gapped_extend.c is - // .. modified to allow the opening of a delete - // .. 
right after an insert, or vice versa - - -#undef global -#endif // build_options_H diff --git a/programs/lastz/src/capsule.c b/programs/lastz/src/capsule.c deleted file mode 100755 index 5a2ef64..0000000 --- a/programs/lastz/src/capsule.c +++ /dev/null @@ -1,994 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: capsule.c -// -//---------- -// -// capsule-- -// Support for "capsule" files and sharing of target data structures between -// multiple processes. -// -// Sharing is achieved through the unix mmap function. We mmap to a single -// file containing (a) the target sequence and (b) the seed word position -// table. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard UNIX stuff (non ANSI) -#include // standard C i/o stuff -#include // some mysterious UNIX voodoo -#include // UNIX memory manager stuff (non ANSI) -#include // UNIX file control stuff (non ANSI) -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed matching stuff -#include "pos_table.h" // position table stuff - -#define capsule_owner // (make this the owner of its globals) -#include "capsule.h" // interface to this module - -// debugging defines - -//#define snoopBytesWritten // if this is defined, extra code is added to - // .. track the number of bytes written to the - // .. capsule file - -//---------- -// -// write_capsule_file-- -// Write a Target Sequence Capsule File corresponding to the current target -// sequence and seeding state. -// -//---------- -// -// Arguments: -// FILE* f: The file to write. The caller should have already -// .. opened this, with "wb" access. -// char* filename: The name of the file being written to. This is -// .. only used for error reporting, and may be NULL. 
-// seq* seq: The target sequence. -// u8* revNucs: The reverse of the target sequence (NOT reverse -// .. complement); this may be NULL, in which case it -// .. is left out of the file. -// postable* pt: A table of positions of words in target. -// seed* seed: The seed used to build the position table. -// -// Returns: -// The number of bytes written to the file. -// -//---------- - -//=== stuff for snoopBytesWritten === - -#ifndef snoopBytesWritten -#define debugSnoopBytesWritten_1 ; -#define debugSnoopBytesWritten_2 ; -#define debugSnoopBytesWritten_3(length,bytes) ; -#define debugSnoopBytesWritten_4 ; -#define debugSnoopBytesWritten_5 ; -#define debugSnoopBytesWritten_6 ; -#endif // not snoopBytesWritten - -#ifdef snoopBytesWritten - -#define debugSnoopBytesWritten_1 \ - fprintf (stderr,"write_field(0x%016" PRIX64 ",%s)\n", \ - bytesToWrite, reason); - -#define debugSnoopBytesWritten_2 \ - fprintf (stderr,"write_sized_field(0x%016" PRIX64 ",%s)\n", \ - bytesToWrite, reason); - -#define debugSnoopBytesWritten_3(length,bytes) \ - fprintf (stderr,"write_padding(0x%016" PRIX64 ",0x%016" PRIX64 ",%s)\n", \ - (u64) length, (u64) bytes, reason); - -#define debugSnoopBytesWritten_4 \ - fprintf (stderr," bytesWritten = 0x%016" PRIX64 "\n", bytesWritten); \ - fprintf (stderr," totalBytesWritten = 0x%016" PRIX64 "\n", totalBytesWritten); - -#define debugSnoopBytesWritten_5 \ - fprintf (stderr," sizeof(size_t) = %d\n", (int) sizeof(size_t)); \ - fprintf (stderr," sizeof(last[0]) = %d\n", (int) sizeof(pt->last[0])); \ - fprintf (stderr," allocLast = 0x%016" PRIX64 "\n", (u64) pt->allocLast); \ - fprintf (stderr," lastLength = 0x%016" PRIX64 "\n", lastLength); \ - fprintf (stderr," lastBytes = 0x%016" PRIX64 "\n", lastBytes); \ - fprintf (stderr," sizeof(prev[0]) = %d\n", (int) sizeof(pt->prev[0])); \ - fprintf (stderr," allocPrev = 0x%016" PRIX64 "\n", (u64) pt->allocPrev); \ - fprintf (stderr," prevLength = 0x%016" PRIX64 "\n", prevLength); \ - fprintf (stderr," prevBytes 
= 0x%016" PRIX64 "\n", prevBytes); - -#define debugSnoopBytesWritten_6 \ - fprintf (stderr," header length: 0x%016" PRIX64 "\n", (u64) headerLength);\ - fprintf (stderr," header bytes: 0x%016" PRIX64 "\n", (u64) headerBytes); \ - fprintf (stderr," name offset: 0x%016" PRIX64 "\n", (u64) nameOffset); \ - fprintf (stderr," name length: 0x%016" PRIX64 "\n", (u64) nameLength); \ - fprintf (stderr," name bytes: 0x%016" PRIX64 "\n", (u64) nameBytes); \ - fprintf (stderr," nucs offset: 0x%016" PRIX64 "\n", (u64) nucsOffset); \ - fprintf (stderr," nucs length: 0x%016" PRIX64 "\n", (u64) nucsLength); \ - fprintf (stderr," nucs bytes: 0x%016" PRIX64 "\n", (u64) nucsBytes); \ - fprintf (stderr," rvrs offset: 0x%016" PRIX64 "\n", (u64) rvrsOffset); \ - fprintf (stderr," rvrs length: 0x%016" PRIX64 "\n", (u64) rvrsLength); \ - fprintf (stderr," rvrs bytes: 0x%016" PRIX64 "\n", (u64) rvrsBytes); \ - fprintf (stderr," bits offset: 0x%016" PRIX64 "\n", (u64) bitsOffset); \ - fprintf (stderr," bits length: 0x%016" PRIX64 "\n", (u64) bitsLength); \ - fprintf (stderr," bits bytes: 0x%016" PRIX64 "\n", (u64) bitsBytes); \ - fprintf (stderr," last offset: 0x%016" PRIX64 "\n", (u64) lastOffset); \ - fprintf (stderr," last length: 0x%016" PRIX64 "\n", (u64) lastLength); \ - fprintf (stderr," last bytes: 0x%016" PRIX64 "\n", (u64) lastBytes); \ - fprintf (stderr," prev offset: 0x%016" PRIX64 "\n", (u64) prevOffset); \ - fprintf (stderr," prev length: 0x%016" PRIX64 "\n", (u64) prevLength); \ - fprintf (stderr," prev bytes: 0x%016" PRIX64 "\n", (u64) prevBytes); \ - fprintf (stderr," info offset: 0x%016" PRIX64 "\n", (u64) infoOffset); \ - fprintf (stderr," info length: 0x%016" PRIX64 "\n", (u64) infoLength); \ - fprintf (stderr," info bytes: 0x%016" PRIX64 "\n", (u64) infoBytes); \ - fprintf (stderr," part offset: 0x%016" PRIX64 "\n", (u64) partOffset); \ - fprintf (stderr," part length: 0x%016" PRIX64 "\n", (u64) partLength); \ - fprintf (stderr," part bytes: 0x%016" PRIX64 "\n", (u64) 
partBytes); \ - fprintf (stderr," pool offset: 0x%016" PRIX64 "\n", (u64) poolOffset); \ - fprintf (stderr," pool length: 0x%016" PRIX64 "\n", (u64) poolLength); \ - fprintf (stderr," pool bytes: 0x%016" PRIX64 "\n", (u64) poolBytes); \ - fprintf (stderr," seed offset: 0x%016" PRIX64 "\n", (u64) seedOffset); \ - fprintf (stderr," seed length: 0x%016" PRIX64 "\n", (u64) seedLength); \ - fprintf (stderr," seed bytes: 0x%016" PRIX64 "\n", (u64) seedBytes); \ - fprintf (stderr," end offset: 0x%016" PRIX64 "\n", (u64) endOffset); - -#endif // snoopBytesWritten - - -//=== macros to write fields === - -#define write_field(fieldName) \ - bytesToWrite = sizeof(fieldName); \ - debugSnoopBytesWritten_1; \ - bytesWritten = fwrite (&fieldName, 1, bytesToWrite, f); \ - if (bytesWritten != bytesToWrite) goto write_failure; \ - totalBytesWritten += bytesWritten; \ - debugSnoopBytesWritten_4; - -#define write_sized_field(fieldName,bytes) \ - bytesToWrite = bytes; \ - debugSnoopBytesWritten_2; \ - bytesWritten = fwrite (fieldName, 1, bytesToWrite, f); \ - if (bytesWritten != bytesToWrite) goto write_failure; \ - totalBytesWritten += bytesWritten; \ - debugSnoopBytesWritten_4; - -#define write_padding(length,bytes) \ - debugSnoopBytesWritten_3(length,bytes); \ - if (bytes > length) \ - { \ - bytesToWrite = bytes - length; \ - bytesWritten = fwrite (zeroes, 1, bytesToWrite, f); \ - if (bytesWritten != bytesToWrite) goto write_failure; \ - totalBytesWritten += bytesWritten; \ - debugSnoopBytesWritten_4; \ - } - - -//=== write_capsule_file === - -u64 write_capsule_file - (FILE* f, - char* filename, - seq* seq, - u8* revNucs, - postable* pt, - seed* seed) - { - seqpartition* sp = &seq->partition; - u8 zeroes[32]; - u64 totalBytesWritten = 0; - u64 bytesToWrite, bytesWritten; - u32 headerLength, headerBytes; // (yes, u32 not u64) - u64 nameLength, nameBytes, nameOffset; - u64 nucsLength, nucsBytes, nucsOffset; - u64 rvrsLength, rvrsBytes, rvrsOffset; - u64 bitsLength, bitsBytes, 
bitsOffset; - u64 lastLength, lastBytes, lastOffset; - u64 prevLength, prevBytes, prevOffset; - u64 infoLength, infoBytes, infoOffset; - u64 partLength, partBytes, partOffset; - u64 poolLength, poolBytes, poolOffset; - u64 seedLength, seedBytes, seedOffset; - u64 endOffset, badOffset; - u64 magic; - u32 version, headerEntries; - u32 dataTypeCode, dataItem, extraInfo; - char* seqName, *reason; - u32* flipScan; - int numFlips, flipIx, partIx; - - if (filename == NULL) filename = "unnamed capsule file"; - - memset (zeroes, 0, sizeof(zeroes)); - - if (sizeof(unspos) != sizeof(u32)) - suicide ("internal error, capsule expects positions to be 32 bits"); - - if (sizeof(partition) != sizeof(cappartition)) - suicidef ("internal error, capsule expects partition records to be %d bytes", - sizeof(cappartition)); - - ////////// - // write magic number - ////////// - - reason = "magic"; - magic = (((u64) refcapMagicABig) << 32) + refcapMagicBBig; - write_field (magic); - - ////////// - // figure out block sizes and offsets - ////////// - - // figure out the header size (not including the 12 bytes for magic number - // and version) - - headerEntries = 6; - if (revNucs != NULL) headerEntries++; - if (pt->asBits != NULL) headerEntries++; - if (sp->p != NULL) headerEntries += 2; - - headerLength = sizeof(u32) // length field - + (headerEntries * capsuleHeaderEntrySize) // entries - + sizeof(u32); // terminator - - headerBytes = round_up_32 (headerLength + capsulePreHeaderSize) - - capsulePreHeaderSize; - - // figure out the name block's size - - seqName = (seq->useFullNames)? seq->header : seq->shortHeader; - if ((seqName == NULL) || (seqName[0] == 0)) seqName = "(unnamed)"; - nameLength = strlen(seqName) + 1; - nameBytes = round_up_32 (nameLength); - - // figure out the nucleotide blocks' size - - nucsLength = seq->len + 1; - nucsBytes = round_up_32 (nucsLength); - - rvrsLength = (revNucs == NULL)? 
0 : nucsLength; - rvrsBytes = round_up_32 (rvrsLength); - - bitsLength = (pt->asBits == NULL)? 0 : round_up_16((nucsLength+3) / 4); - bitsBytes = round_up_32 (bitsLength); - - infoLength = sizeof(capseqinfo); - infoBytes = round_up_32 (infoLength); - - // figure out the position table blocks' size - - lastLength = ((size_t) pt->allocLast) * sizeof(pt->last[0]); - lastBytes = round_up_32 (lastLength); - - prevLength = ((size_t) pt->allocPrev) * sizeof(pt->prev[0]); - prevBytes = round_up_32 (prevLength); - - debugSnoopBytesWritten_5; - - // figure out the partition blocks' sizes - - partBytes = poolBytes = 0; - if (sp->p != NULL) - { - partLength = (sp->len + 1) * sizeof(partition); - partBytes = round_up_32 (partLength); - - poolLength = sp->poolLen; - poolBytes = round_up_32 (poolLength); - } - - // figure out the seed block's size - - numFlips = 0; - for (flipScan=seed->transFlips ; *flipScan!=0 ; flipScan++) - numFlips++; - - seedLength = (sizeof(capseed) - sizeof(u32)) // standard fields - + (seed->numParts * sizeof(u32)) // shift[] array - + (seed->numParts * sizeof(u32)) // mask[] array - + ((numFlips+1) * sizeof(u32)); // transFlips[] array - seedBytes = round_up_32 (seedLength); - - // figure out all the offsets - - nameOffset = capsulePreHeaderSize + headerBytes; - nucsOffset = nameOffset + nameBytes; - rvrsOffset = nucsOffset + nucsBytes; - bitsOffset = rvrsOffset + rvrsBytes; - lastOffset = bitsOffset + bitsBytes; - prevOffset = lastOffset + lastBytes; - infoOffset = prevOffset + prevBytes; - partOffset = infoOffset + infoBytes; - poolOffset = partOffset + partBytes; - seedOffset = poolOffset + poolBytes; - endOffset = seedOffset + seedBytes; - - capsule_set_stat (display, true); - capsule_copy_stat (headerLength); - capsule_copy_stat (headerBytes); - capsule_copy_stat (nameOffset); - capsule_copy_stat (nameLength); - capsule_copy_stat (nameBytes); - capsule_copy_stat (nucsOffset); - capsule_copy_stat (nucsLength); - capsule_copy_stat (nucsBytes); - 
capsule_copy_stat (rvrsOffset); - capsule_copy_stat (rvrsLength); - capsule_copy_stat (rvrsBytes); - capsule_copy_stat (bitsOffset); - capsule_copy_stat (bitsLength); - capsule_copy_stat (bitsBytes); - capsule_copy_stat (lastOffset); - capsule_copy_stat (lastLength); - capsule_copy_stat (lastBytes); - capsule_copy_stat (prevOffset); - capsule_copy_stat (prevLength); - capsule_copy_stat (prevBytes); - capsule_copy_stat (infoOffset); - capsule_copy_stat (infoLength); - capsule_copy_stat (infoBytes); - capsule_copy_stat (partOffset); - capsule_copy_stat (partLength); - capsule_copy_stat (partBytes); - capsule_copy_stat (poolOffset); - capsule_copy_stat (poolLength); - capsule_copy_stat (poolBytes); - capsule_copy_stat (seedOffset); - capsule_copy_stat (seedLength); - capsule_copy_stat (seedBytes); - capsule_copy_stat (endOffset); - - debugSnoopBytesWritten_6; - - ////////// - // finish writing the pre-header - ////////// - - // write the file size - - reason = "file size"; - write_field (endOffset); - - // write the version - - reason = "version"; - version = refcapVersion; - write_field (version); - - // write the header length - - reason = "header"; - write_field (headerLength); - - ////////// - // write the header - ////////// - - // write the sequence name - - reason = "name entry"; - dataTypeCode = cap_seqName; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (nameOffset); - write_field (nameLength); - - // write the nucleotides - - reason = "nucs entry"; - dataTypeCode = cap_seqForward; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (nucsOffset); - write_field (nucsLength); - - // write the reversed nucleotides - - if (rvrsBytes > 0) - { - reason = "rvrs entry"; - dataTypeCode = cap_seqReverse; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (rvrsOffset); - write_field (rvrsLength); - } - - // write the nucleotide bits - - if (bitsBytes > 0) - { - reason = 
"nuc bits entry"; - dataTypeCode = cap_seqBits; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (bitsOffset); - write_field (bitsLength); - } - - // write the last[] array - - reason = "last entry"; - dataTypeCode = cap_lastPosTable; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (lastOffset); - write_field (lastLength); - - // write the prev[] array - - reason = "prev entry"; - dataTypeCode = cap_prevPosTable; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (prevOffset); - write_field (prevLength); - - // write the sequence info - - reason = "info entry"; - dataTypeCode = cap_seqInfo; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (infoOffset); - write_field (infoLength); - - // write the partition[] array - - if (partBytes > 0) - { - reason = "parititon entry"; - dataTypeCode = cap_partitions; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (partOffset); - write_field (partLength); - } - - // write the partition names[] array - - if (poolBytes > 0) - { - reason = "parititon entry"; - dataTypeCode = cap_partitionNames; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (poolOffset); - write_field (poolLength); - } - - // write the seed - - reason = "seed entry"; - dataTypeCode = cap_seed; write_field (dataTypeCode); - extraInfo = 0; write_field (extraInfo); - write_field (seedOffset); - write_field (seedLength); - - // write the terminator and padding - - reason = "terminator"; - dataTypeCode = cap_terminator; write_field (dataTypeCode); - - reason = "header padding"; - write_padding (headerLength, headerBytes); - - ////////// - // write the data blocks - ////////// - - badOffset = 0; // (placate complier) - - // write the sequence name - - reason = "name"; - if (totalBytesWritten != nameOffset) - { badOffset = nameOffset; goto wrong_offset; } - - 
write_sized_field (seqName, nameLength); - write_padding (nameLength, nameBytes); - - // write the nucleotides - - reason = "nucs"; - if (totalBytesWritten != nucsOffset) - { badOffset = nucsOffset; goto wrong_offset; } - - write_sized_field (seq->v, nucsLength); - write_padding (nucsLength, nucsBytes); - - // write the reversed nucleotides - - if (rvrsBytes > 0) - { - reason = "rvrs"; - if (totalBytesWritten != rvrsOffset) - { badOffset = rvrsOffset; goto wrong_offset; } - - write_sized_field (revNucs, rvrsLength); - write_padding (rvrsLength, rvrsBytes); - } - - // write the nucleotide bits - - if (bitsBytes > 0) - { - reason = "bits"; - if (totalBytesWritten != bitsOffset) - { badOffset = bitsOffset; goto wrong_offset; } - - write_sized_field (pt->asBits, bitsLength); - write_padding (bitsLength, bitsBytes); - } - - // write the last[] array - - reason = "last"; - if (totalBytesWritten != lastOffset) - { badOffset = lastOffset; goto wrong_offset; } - - write_sized_field (pt->last, lastLength); - write_padding (lastLength, lastBytes); - - // write the prev[] array - - reason = "prev"; - if (totalBytesWritten != prevOffset) - { badOffset = prevOffset; goto wrong_offset; } - - write_sized_field (pt->prev, prevLength); - write_padding (prevLength, prevBytes); - - // write the sequence info - - reason = "info"; - if (totalBytesWritten != infoOffset) - { badOffset = infoOffset; goto wrong_offset; } - - dataItem = seq->startLoc; write_field (dataItem); - dataItem = seq->trueLen; write_field (dataItem); - dataItem = seq->revCompFlags; write_field (dataItem); - dataItem = seq->contig; write_field (dataItem); - - dataItem = (seq->partition.p == NULL)? 
0 : seq->partition.len; - write_field (dataItem); - - write_padding (infoLength, infoBytes); - - // write the partitions and names - - if (partBytes > 0) - { - reason = "part"; - if (totalBytesWritten != partOffset) - { badOffset = partOffset; goto wrong_offset; } - - write_sized_field (sp->p, partLength); - write_padding (partLength, partBytes); - } - - if (poolBytes > 0) - { - reason = "pool"; - if (totalBytesWritten != poolOffset) - { badOffset = poolOffset; goto wrong_offset; } - - write_sized_field (sp->pool, poolLength); - write_padding (poolLength, poolBytes); - } - - // write the seed - - reason = "seed"; - if (totalBytesWritten != seedOffset) - { badOffset = seedOffset; goto wrong_offset; } - - dataItem = pt->step; write_field (dataItem); - dataItem = seed->type; write_field (dataItem); - dataItem = seed->length; write_field (dataItem); - dataItem = seed->weight; write_field (dataItem); - dataItem = seed->resolvingMask; write_field (dataItem); - dataItem = seed->revComp; write_field (dataItem); - dataItem = seed->isHalfweight; write_field (dataItem); - dataItem = seed->numParts; write_field (dataItem); - - for (partIx=0 ; partIxnumParts ; partIx++) - { dataItem = seed->shift[partIx]; write_field (dataItem); } - - for (partIx=0 ; partIxnumParts ; partIx++) - { dataItem = seed->mask[partIx]; write_field (dataItem); } - - for (flipIx=0 ; flipIxtransFlips[flipIx]; write_field (dataItem); } - - dataItem = 0; write_field (dataItem); - - write_padding (seedLength, seedBytes); - - // sanity check on file length - - if (totalBytesWritten != endOffset) - goto wrong_file_length; - - // success! 
- - return endOffset; - - ////////// - // failure exits - ////////// - -wrong_offset: - suicidef ("internal error writing to %s (offset for %s = 0x%s, actual is 0x%s)", - filename, reason, hex_64_string(badOffset), hex_64_string(totalBytesWritten)); - -wrong_file_length: - suicidef ("internal error writing to %s (file length = 0x%s, actual is 0x%s)", - filename, hex_64_string(endOffset), hex_64_string(totalBytesWritten)); - -write_failure: - suicidef_with_perror ("unable to write to %s (attempted %d bytes, wrote %d, for %s)", - filename, bytesToWrite, bytesWritten, reason); - return 0; // (never gets here) - } - -//---------- -// -// open_capsule_file-- -// Open a Target Sequence Capsule File and map it for sharing. -// -//---------- -// -// Arguments: -// char* filename: The name of the capsule file. -// -// Returns: -// A pointer to a capsule info record, allocated from the heap. (see note 1) -// -//---------- -// -// notes: -// -// (1) The caller must eventually call close_capsule_file() to unmap and -// .. reliquish the mapped memory, as well as disposing of the capsule -// .. info record. 
-// -// (2) Our use of open/mmap follows an example from chapter four of "Linux -// System Programming: Talking Directly to the Kernel and C Library", by -// Robert Love, as presented at -// www.devshed.com/c/a/BrainDump/Using-mmap-for-Advanced-File-IO -// -//---------- - -capinfo* open_capsule_file - (char* filename) - { - int fdes; - struct stat sb; - void* mappedData; - size_t dataSize; - u64 magic, fileSize; - u32 magicA, magicB; - int swap64halves, littleEndian; - capinfo* cap; - - // open the file - - fdes = open (filename, O_RDONLY); - if (fdes < 0) - goto open_failed; - - if (fstat (fdes, &sb) == -1) - goto fstat_failed; - - if (!S_ISREG (sb.st_mode)) - goto non_regular_file; - - // map the pre-header and read the file size; note that we have to check - // the magic number to figure out how to descramble the file size - - mappedData = mmap (0, capsulePreHeaderSize, PROT_READ, MAP_SHARED, fdes, 0); - if ((mappedData == NULL) || (mappedData == MAP_FAILED)) - { fileSize = capsulePreHeaderSize; goto mmap_failed; } - - magic = ((u64*) mappedData)[0]; - fileSize = ((u64*) mappedData)[1]; - munmap (mappedData, capsulePreHeaderSize); - - swap64halves = littleEndian = false; - magicA = (u32) (magic >> 32); - magicB = (u32) magic; - - if (((magicA == refcapMagicABig) && (magicB == refcapMagicBBig)) - || ((magicA == refcapMagicALittle) && (magicB == refcapMagicBLittle))) - ; // ok, and no half swapping needed - else if (((magicA == refcapMagicBBig) && (magicB == refcapMagicABig)) - || ((magicA == refcapMagicBLittle) && (magicB == refcapMagicALittle))) - { // ok, but half swapping needed - magic = swap_64_halves (magic); - fileSize = swap_64_halves (fileSize); - magicA = (u32) (magic >> 32); - magicB = (u32) magic; - swap64halves = true; - } - else - goto bad_magic; - - if (magicA == refcapMagicALittle) - { - fileSize = swap_two32_endian (fileSize); - littleEndian = true; - } - - dataSize = fileSize; - if (dataSize != fileSize) - goto file_size_overlow; - - // now that 
we know the size, map the whole thing; note that there is - // no point in memory-mapping a file on an architecture with different - // endianness that the one the file was created on - - if ((littleEndian) || (swap64halves)) - goto architecture_mismatch; - - mappedData = mmap (0, dataSize, PROT_READ, MAP_SHARED, fdes, 0); - if ((mappedData == NULL) || (mappedData == MAP_FAILED)) - goto mmap_failed; - - capsule_set_stat (sharedAddress, mappedData); - - // close the file; suprisingly, we can close it even though it is mapped - - if (close (fdes) == -1) - goto close_failed; - - // success! - - cap = (capinfo*) zalloc_or_die ("open_capsule_file", sizeof(capinfo)); - - cap->dataSize = dataSize; - cap->mappedData = mappedData; - cap->swap64halves = swap64halves; - cap->littleEndian = littleEndian; - - return cap; - - ////////// - // failure exits - ////////// - -open_failed: - suicidef_with_perror ("open(%s) failed (returned file descriptor = %d)", - filename, fdes); - return NULL; // (never gets here) - -close_failed: - suicidef_with_perror ("close() for %s failed", filename); - return NULL; // (never gets here) - -fstat_failed: - suicidef_with_perror ("fstat() for %s failed", filename); - return NULL; // (never gets here) - -non_regular_file: - suicidef ("%s is not a regular file, so is not mappable as a capsule file", - filename); - return NULL; // (never gets here) - -mmap_failed: - suicidef_with_perror ("mmap() for %s failed (attempted 0x%s bytes, return value = %p)", - filename, hex_64_string(fileSize), mappedData); - return NULL; // (never gets here) - -bad_magic: - suicidef ("%s is not a capsule file (magic = 0x%s)", - filename, hex_64_string(magic)); - return NULL; // (never gets here) - -file_size_overlow: - suicidef ("file size overflow in %s (fileSize = 0x%s, dataSize = 0x%s)", - filename, hex_64_string(fileSize), hex_64_string(dataSize)); - return NULL; // (never gets here) - -#define suggestions " rebuild it using --writecapsule" - -architecture_mismatch: 
- if ((littleEndian) && (!swap64halves)) - suicidef ("architecture mismatch for %s (8-byte words have halves swapped);" - suggestions, - filename); - else if ((!littleEndian) && (swap64halves)) - suicidef ("architecture mismatch for %s (4-byte words are wrong endian);" - suggestions, - filename); - else // if ((littleEndian) && (swap64halves)) - suicidef ("architecture mismatch for %s (8-byte words are wrong endian);" - suggestions, - filename); - return NULL; // (never gets here) - } - -//---------- -// -// close_capsule_file-- -// Close (== unmap) a Target Sequence Capsule File. -// -//---------- -// -// Arguments: -// capinfo* cap: The capsule info record, as returned by -// .. open_capsule_file. This may be NULL. -// -// Returns: -// (nothing) -// -//---------- - -void close_capsule_file - (capinfo* cap) - { - if (cap == NULL) return; - if (cap->mappedData != NULL) munmap (cap->mappedData, cap->dataSize); - free_if_valid ("close_capsule_file", cap); - } - -//---------- -// -// locate_capsule_data-- -// Loacte a data block in a mapped Target Sequence Capsule. -// -//---------- -// -// Arguments: -// capinfo* cap: The capsule info record. -// u32 dataType: The desired block's data type code (one of cap_xxx). -// u32* blockInfo: Place to return the block info word. This can be -// .. NULL. -// u32* blockSize: Place to return the block's size. This can be NULL. -// -// Returns: -// A pointer to the mapped block (NULL if the block is not found). 
-// -//---------- - -void* locate_capsule_data - (capinfo* cap, - u32 blockType, - u32* blockInfo, - u64* blockSize) - { - char* scan; - u32 headerLength, numEntries, ix; - u32 dataTypeCode; - u64 blockOffset; - - scan = ((char*) cap->mappedData) + capsulePreHeaderSize; - - headerLength = *((u32*) scan); scan += sizeof(u32); - if ((headerLength % capsuleHeaderEntrySize) != 8) - suicidef ("bad capsule header (length = %08X)", headerLength); - numEntries = (headerLength - 8) / capsuleHeaderEntrySize; - - for (ix=0 ; ixmappedData) + blockOffset); - } - - // block not found - - return NULL; - } - -//---------- -// -// capsule_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void capsule_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&refSharingStats, 0, sizeof(refSharingStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// capsule_show_stats: -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. 
-// -// Returns: -// (nothing) -// -//---------- - -void capsule_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - - if (f == NULL) return; - if (!refSharingStats.display) return; - - fprintf (f, " header length: %s\n", hex_64_string(refSharingStats.headerLength)); - fprintf (f, " header bytes: %s\n", hex_64_string(refSharingStats.headerBytes)); - fprintf (f, " name offset: %s\n", hex_64_string(refSharingStats.nameOffset)); - fprintf (f, " name length: %s\n", hex_64_string(refSharingStats.nameLength)); - fprintf (f, " name bytes: %s\n", hex_64_string(refSharingStats.nameBytes)); - fprintf (f, " nucs offset: %s\n", hex_64_string(refSharingStats.nucsOffset)); - fprintf (f, " nucs length: %s\n", hex_64_string(refSharingStats.nucsLength)); - fprintf (f, " nucs bytes: %s\n", hex_64_string(refSharingStats.nucsBytes)); - fprintf (f, " rvrs offset: %s\n", hex_64_string(refSharingStats.rvrsOffset)); - fprintf (f, " rvrs length: %s\n", hex_64_string(refSharingStats.rvrsLength)); - fprintf (f, " rvrs bytes: %s\n", hex_64_string(refSharingStats.rvrsBytes)); - fprintf (f, " bits offset: %s\n", hex_64_string(refSharingStats.bitsOffset)); - fprintf (f, " bits length: %s\n", hex_64_string(refSharingStats.bitsLength)); - fprintf (f, " bits bytes: %s\n", hex_64_string(refSharingStats.bitsBytes)); - fprintf (f, " last offset: %s\n", hex_64_string(refSharingStats.lastOffset)); - fprintf (f, " last length: %s\n", hex_64_string(refSharingStats.lastLength)); - fprintf (f, " last bytes: %s\n", hex_64_string(refSharingStats.lastBytes)); - fprintf (f, " prev offset: %s\n", hex_64_string(refSharingStats.prevOffset)); - fprintf (f, " prev length: %s\n", hex_64_string(refSharingStats.prevLength)); - fprintf (f, " prev bytes: %s\n", hex_64_string(refSharingStats.prevBytes)); - fprintf (f, " info offset: %s\n", hex_64_string(refSharingStats.infoOffset)); - fprintf (f, " info length: %s\n", hex_64_string(refSharingStats.infoLength)); - fprintf (f, " info bytes: %s\n", 
hex_64_string(refSharingStats.infoBytes)); - fprintf (f, " part offset: %s\n", hex_64_string(refSharingStats.partOffset)); - fprintf (f, " part length: %s\n", hex_64_string(refSharingStats.partLength)); - fprintf (f, " part bytes: %s\n", hex_64_string(refSharingStats.partBytes)); - fprintf (f, " pool offset: %s\n", hex_64_string(refSharingStats.poolOffset)); - fprintf (f, " pool length: %s\n", hex_64_string(refSharingStats.poolLength)); - fprintf (f, " pool bytes: %s\n", hex_64_string(refSharingStats.poolBytes)); - fprintf (f, " seed offset: %s\n", hex_64_string(refSharingStats.seedOffset)); - fprintf (f, " seed length: %s\n", hex_64_string(refSharingStats.seedLength)); - fprintf (f, " seed bytes: %s\n", hex_64_string(refSharingStats.seedBytes)); - fprintf (f, " end offset: %s\n", hex_64_string(refSharingStats.endOffset)); - fprintf (f, " shared address: %p\n", refSharingStats.sharedAddress); - fprintf (f, "-------------------\n"); - -#endif // collect_stats - } diff --git a/programs/lastz/src/capsule.h b/programs/lastz/src/capsule.h deleted file mode 100644 index 41cf5f6..0000000 --- a/programs/lastz/src/capsule.h +++ /dev/null @@ -1,323 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: capsule.h -// -//---------- - -#ifndef capsule_H // (prevent multiple inclusion) -#define capsule_H - -// other files - -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff - -// establish ownership of global variables - -#ifdef capsule_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// Reference Sequence Capsule File: -// -// A capsule file encapsulates the information needed for a reference sequence, -// including its seed word index table. 
-// -//---------- -// -// Reference capsule file format: -// -// Fields can be in big- or little-endian format; they must match the -// endianess of the magic number. Similarly, 64-bit fields must match the -// order indicated by the 64-bit magic number. If the data blocks containing -// multi-byte numeric fields are directly mapped by the program (e.g. with -// mmap), the program will reject a file fi the magic number doesn't match -// what is expected on the running-on platfrom. -// -// Version 1 -// -// offset 0x00: DA C8 9D 8E big endian magic number (8E 9D C8 DA => little endian) -// offset 0x04: 60 11 EF 1B big endian magic number (1B EF 11 60 => little endian) -// offset 0x08: xx xx xx xx file size (one half) (in bytes) -// offset 0x0C: xx xx xx xx file size (other half) -// offset 0x10: 00 00 01 00 version 1.0 (fourth byte is sub version) -// offset 0x14: xx xx xx xx header length H (in bytes, including this field) -// offset 0x18: ... header -// ... -// offset H+0x14: 68 45 6E 64 ('hEnd') header terminator -// offset ...: ... other data, pointed to by header entries -// -// Each header entry (at offset E) consists of the following fields: -// -// offset E+0x00: xx xx xx xx data type code (one of cap_xxx below) -// offset E+0x04: xx xx xx xx extra info, per type code -// offset E+0x08: xx xx xx xx data offset (one half) -// offset E+0x0C: xx xx xx xx data offset (other half) -// offset E+0x10: xx xx xx xx data length (one half) -// offset E+0x14: xx xx xx xx data length (other half) -// -// Type codes indicate the type of data block. Valid types are shown here. -// and are usually four ascii characters. -// -// 'name' The name of the reference sequence. -// -// offset 0x00: ... zero-terminated ascii string -// -// 'nucs' The reference sequence nucleotides, one byte per nucleotide. This -// .. corresponds to seq.v. -// -// offset 0x00: ... zero-terminated ascii string of length L+1, -// .. 
where L is the length of the sequence -// -// 'rvrs' The reference sequence nucleotides in reverse order (*not* -// complemented). This corresponds to targetRev in lastz's main(). -// -// offset 0x00: ... zero-terminated ascii string of length L+1. -// -// 'bits' The reference sequence nucleotides in forward order, encoded as -// bits. This corresponds to postable.asBits -// -// offset 0x00: ... bit data of length L/4 (rounded up to a -// .. multiple of 16 bytes). -// -// 'last' The last position, in the reference, of each seed word. This -// .. corresponds to postable.last. It has has 4^W entries with each -// .. entry being an index into the reference nucleotides. -// -// offset 0x00: xx xx xx xx postable.last[0] -// offset 0x04: xx xx xx xx postable.last[1] -// offset 0x08: ... -// -// 'prev' The previous position, in the reference, of each seed word. This -// .. corresponds to postable.prev. It has has ceil(L/Z) entries with -// .. entry being an 'adjusted' index into the reference nucleotides. -// -// offset 0x00: xx xx xx xx postable.prev[0] -// offset 0x04: xx xx xx xx postable.prev[1] -// offset 0x08: ... -// -// 'info' additional information about the reference sequence. 
-// -// offset 0x00: xx xx xx xx seq.start -// offset 0x04: xx xx xx xx seq.trueLen -// offset 0x08: xx xx xx xx seq.revCompFlags -// offset 0x0C: xx xx xx xx seq.contig -// offset 0x10: xx xx xx xx seqpartition.len -// -// 'part' sequence partitions -// -// offset 0x00: xx xx xx xx seqpartition.p[0].sepBefore -// offset 0x04: xx xx xx xx seqpartition.p[0].sepAfter -// offset 0x08: xx xx xx xx seqpartition.p[0].contig -// offset 0x0C: xx xx xx xx seqpartition.p[0].startLoc -// offset 0x10: xx xx xx xx seqpartition.p[0].trueLen -// offset 0x14: xx xx xx xx seqpartition.p[0].header -// offset 0x18: xx xx xx xx seqpartition.p[1].sepBefore -// offset 0x1C: xx xx xx xx seqpartition.p[1].sepAfter -// offset 0x20: xx xx xx xx seqpartition.p[1].contig -// offset 0x24: xx xx xx xx seqpartition.p[1].startLoc -// offset 0x28: xx xx xx xx seqpartition.p[1].trueLen -// offset 0x2C: xx xx xx xx seqpartition.p[1].header -// offset 0x30: xx xx xx xx seqpartition.p[2].sepBefore -// offset 0x34: ... -// -// 'pNam' names for sequence partitions -// -// offset 0x00: xx xx xx xx seqpartition.pool[0] .. seqpartition.pool[3] -// offset 0x04: xx xx xx xx seqpartition.pool[4] .. seqpartition.pool[7] -// offset 0x08: ... -// -// 'seed' The seed pattern used to build the seed word position table. This -// .. (mostly) corresponds to struct seed, but is not directly mapped. -// .. Instead an instance of a struct seed is built from these values. -// .. (seed capseed struct) -// -// offset 0x00: xx xx xx xx postable.step -// offset 0x04: xx xx xx xx seed.type -// offset 0x08: xx xx xx xx seed.length -// offset 0x0C: xx xx xx xx seed.weight -// offset 0x10: xx xx xx xx seed.resolvingMask -// offset 0x14: xx xx xx xx seed.revComp -// offset 0x18: xx xx xx xx seed.isHalfweight -// offset 0x1C: xx xx xx xx (P) seed.numParts -// offset 0x20: xx xx xx xx seed.shift[0] -// offset 0x24: .. -// offset 0x1C+4P: xx xx xx xx seed.shift[P-1] -// offset 0x20+4P: xx xx xx xx seed.mask[0] -// offset 0x24+4P: .. 
-// offset 0x1C+8P: xx xx xx xx seed.mask[P-1] -// offset 0x20+8P: xx xx xx xx seed.transFlips[0] -// offset 0x24+8P: ... -// ... -// offset ...: 00 00 00 00 -// -//---------- - -// capsule file magic number(s) - -static const u32 refcapMagicABig = 0xDAC89D8E; // in big endian format -static const u32 refcapMagicALittle = 0x8E9DC8DA; // in little endian format - -static const u32 refcapMagicBBig = 0x6011EF1B; // in big endian format -static const u32 refcapMagicBLittle = 0x1BEF1160; // in little endian format - -static const u32 refcapVersion = 0x00000100; // version 1.0 - -// miscellaneous capsule file block sizes - -#define capsulePreHeaderSize 0x14 -#define capsuleHeaderEntrySize 0x18 - -// capsule file data type codes -// $$$ add a creator field - -#define cap_seqName 0x6E616D65 // 'name' -#define cap_seqForward 0x6E756373 // 'nucs' -#define cap_seqReverse 0x72767273 // 'rvrs' -#define cap_seqBits 0x62697473 // 'bits' -#define cap_lastPosTable 0x6C617374 // 'last' -#define cap_prevPosTable 0x70726576 // 'prev' -#define cap_seqInfo 0x696E666F // 'info' -#define cap_seed 0x73656564 // 'seed' -#define cap_partitions 0x70617274 // 'part' -#define cap_partitionNames 0x704E616D // 'pNam' -#define cap_terminator 0x68456E64 // 'hEnd' - -typedef struct capinfo - { - void* mappedData; // Pointer to the file's mapped data. - size_t dataSize; // Size of the mapped data. - int swap64halves; // true => perform swap_64_halves on 64-bit - // .. values - int littleEndian; // true => perform swap_32_endian on 32-bit - // .. values - } capinfo; - -typedef struct capseqinfo // (corresponds to cap_info block) - { - u32 startLoc; // seq.startLoc - u32 trueLen; // seq.trueLen - u32 revCompFlags; // seq.revCompFlags - u32 contig; // seq.contig - u32 numPartitions; // seqpartition.len - } capseqinfo; - -typedef struct cappartition // (corresponds to cap_partitions block) - { // .. 
layout must match struct partition (sequences.h) - u32 sepBefore; // partition.sepBefore - u32 sepAfter; // partition.sepAfter - u32 contig; // partition.contig - u32 startLoc; // partition.startLoc - u32 trueLen; // partition.trueLen - u32 header; // partition.header - } cappartition; - -typedef struct cappartitionold // (corresponds to old cap_partitions block, - { // .. lacking sepAfter and startLoc fields) - u32 sepBefore; // partition.sepBefore - u32 contig; // partition.contig - u32 trueLen; // partition.trueLen - u32 header; // partition.header - } cappartitionold; - -typedef struct capseed // (corresponds to cap_seed block) - { - u32 step; // postable.step - u32 type; // seed.type - u32 length; // seed.length - u32 weight; // seed.weight - u32 resolvingMask; // seed.resolvingMask - u32 revComp; // seed.revComp - u32 isHalfweight; // seed.isHalfweight - u32 numParts; // seed.numParts - u32 shift0; // first entry of seed.shift[] - } capseed; - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int display; - u64 headerLength; - u64 headerBytes; - u64 nameOffset; - u64 nameLength; - u64 nameBytes; - u64 nucsOffset; - u64 nucsLength; - u64 nucsBytes; - u64 rvrsOffset; - u64 rvrsLength; - u64 rvrsBytes; - u64 bitsOffset; - u64 bitsLength; - u64 bitsBytes; - u64 lastOffset; - u64 lastLength; - u64 lastBytes; - u64 prevOffset; - u64 prevLength; - u64 prevBytes; - u64 infoOffset; - u64 infoLength; - u64 infoBytes; - u64 partOffset; - u64 partLength; - u64 partBytes; - u64 poolOffset; - u64 poolLength; - u64 poolBytes; - u64 seedOffset; - u64 seedLength; - u64 seedBytes; - u64 endOffset; - void* sharedAddress; - } refSharingStats; - -// stats macros - -#define capsule_count_stat(field) ++refSharingStats.field -#define capsule_uncount_stat(field) --refSharingStats.field -#define capsule_set_stat(field,val) (refSharingStats.field = val) -#define capsule_copy_stat(field) (refSharingStats.field = 
field) -#define capsule_add_stat(field,val) (refSharingStats.field += val) -#else -#define capsule_count_stat(field) -#define capsule_uncount_stat(field) -#define capsule_set_stat(field,val) -#define capsule_copy_stat(field) -#define capsule_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void capsule_zero_stats (void); -void capsule_show_stats (FILE* f); - -//---------- -// -// prototypes for routines in capsule.c -// -//---------- - -u64 write_capsule_file (FILE* f, char* filename, seq* seq, u8* revNucs, - postable* pt, seed* seed); -capinfo* open_capsule_file (char* filename); -void close_capsule_file (capinfo* cap); -void* locate_capsule_data (capinfo* cap, u32 blockType, - u32* blockInfo, u64* blockSize); - -#undef global -#endif // capsule_H diff --git a/programs/lastz/src/chain.c b/programs/lastz/src/chain.c deleted file mode 100755 index 87d7997..0000000 --- a/programs/lastz/src/chain.c +++ /dev/null @@ -1,1100 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: chain.c -// -//---------- -// -// chain-- -// Find the highest scoring chain in a set of gap-free alignments. Each -// segment in the chain will begin strictly before the start of the next -// segment. This is (expected to be) the most parsimonious subset of the -// gap-free alignments, assuming there actual orthology contains no -// inversions. -// -// The algorithm finds, for each segment, the highest scoring chain that ends -// with that segment. Segments are scanned in an order (by increasing start -// in sequence 1) that guarantees all possible predecessor chains have been -// found and scored before that segment is considered. Upon completion, the -// chain is recovered by backtracking from its end segment. -// -// A chain's score is the sum of its segment scores minus the sum of penalties -// for the gaps between segments. The caller must provide a function to compute -// those penalties. 
See note (1) of reduce_to_chain() for more details. -// -// To facilitate the search for valid predecessors, a K-d tree is used. See -// the header of build_kd_tree() for more details on the tree implementation. -// -// References: -// -// [1] Multidimensional Binary Search Trees Used for Associative Searching. -// Jon Louis Bentley, Commun. ACM 18(9): 509-517 (1975). -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C i/o stuff -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff - -#define chain_owner // (make this the owner of its globals) -#include "chain.h" // interface to this module - -// debugging defines - -//#define snoopBatches // if this is defined, extra code is added to - // .. track the chaining of HSPs in per- - // .. partition batches - -//---------- -// -// private global data -// -//---------- - -typedef double bigscore; - -typedef struct kdinfo - { - score diagPen, antiPen; // chain gap penalties - int scale; // chain score scale factor - bigscore* chainScore; // array of the best score for any - // .. chain ending with a given segment - u32* perm, *invPerm; // permuation of segments (and inverse) - segment* seg; // array of segments - segment* query; // query segment - unspos x, y; // query's cartesian point - sgnpos diag; // .. x = pos1; y = pos2; diag = x-y - chainer connect; // chain connection penalty function - } kdinfo; - - -typedef struct bestpred - { - u32 num; // index of a predecessor segment; the - // .. value noPred indicates there is no - // .. predecessor - bigscore contrib; // score of the chain ending at that - // .. segment, inluding penalty to - // .. 
connect it to the query segment - } bestpred; - -#define noPred ((u32) -1) - - -typedef struct kdnode - { - int isBucket; // true => this node is a bucket/leaf - u32 loIx, hiIx; // isBucket is true - // .. index range of the segments in - // .. this leaf - // isBucket is false - // .. hiIx is index corresponding to - // .. cutVal - - // the following fields are onlyl valid if isBucket is false - - sgnpos cutVal; // value (along appropriate axis) which - // .. separates lower and upper children - bigscore maxChainScore; // the highest score for any chain - // .. ending at a segment in this - // .. subtree - struct kdnode* loSon, *hiSon; // pointers to child nodes - } kdnode; - -#define bucketSize 3 // max number of entries we'll place in a bucket node - -#define valid_kdnode(p) \ - (((p)->isBucket) || (((p)->loSon != NULL) && ((p)->hiSon != NULL))) - -#define perm_swap(kdi,p,q) \ - { u32 t; t = kdi->perm[p]; kdi->perm[p] = kdi->perm[q]; kdi->perm[q] = t; } - -// projection-- figure out spatial position of segment i along the current axis - -#define projection(i,axis,kdi) \ - ((axis == 0) ? (((sgnpos)kdi->seg[kdi->perm[i]].pos1) - ((sgnpos)kdi->seg[kdi->perm[i]].pos2)) \ - : ((sgnpos)kdi->seg[kdi->perm[i]].pos2)) - - -// segment batches-- partitioning of a segment table - -typedef struct segbatch - { - u32 start; // index (into a segment table) of the first - // .. entry in a batch - u32 end; // index (into a segment table) of the first - // .. entry NOT in a batch (i.e. the one after - // .. the last entry). - partition* part1; // sequence partitions that "contain" this - partition* part2; // .. batch; either of these can be NULL if we - // .. aren't dealing with partitions in the - // .. corresponding sequence - } segbatch; - -typedef struct sbtable - { - u32 size; // the number of entries allocated for batch[] - u32 len; // the number of batches (the number of entries - // .. 
actually used) - segbatch batch[1]; // the batch table (variable-length array) - } sbtable; - -#define sbtable_bytes(size) (sizeof(sbtable) + (((size)-1)*sizeof(segbatch))) - -//---------- -// -// prototypes for private functions -// -//---------- - -static kdnode* build_kd_tree (u32 lo, u32 hi, int axis, - const kdinfo* const kdi); -static void free_kd_tree (kdnode* subtree); -static void dump_kd_tree (FILE* f, int points, kdnode* root, - const kdinfo* const kdi); -static u32 partition_segments (u32 lo, u32 hi, int axis, - const kdinfo* const kdi); -static bestpred best_predecessor (kdnode* subtree, int axis, - bigscore lowerBound, - bestpred bp, const kdinfo* const kdi); -static void propagate_max_score (kdnode* subtree, bigscore s, u32 ix); - -//---------- -// -// try_reduce_to_chain-- -// This routine is a wrapper for reduce_to_chain(), to handle cases that arise -// when either of sequence 1 or sequence 2 is partitioned. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence(s). -// seq* seq2: The second sequence(s). -// (the rest are the same as for reduce_to_chain) -// -// Returns: -// The score of the best chain, unscaled; zero if there's some problem. -// -// -//---------- - -//=== stuff for snoopBatches === - -#ifndef snoopBatches -#define debugSnoopBatches_1 ; -#endif // not snoopBatches - -#ifdef snoopBatches - -#define debugSnoopBatches_1 \ - { \ - partition* batPart1 = chainBatches->batch[batIx].part1; \ - partition* batPart2 = chainBatches->batch[batIx].part2; \ - fprintf (stderr, "batch[%u] %u..%u", \ - batIx, startSegIx, endSegIx-1); \ - if (batPart1 != NULL) \ - fprintf (stderr, " seq1: " unsposFmt ".." unsposFmt, \ - batPart1->sepBefore+1, batPart1->sepAfter); \ - if (batPart2 != NULL) \ - fprintf (stderr, " seq2: " unsposFmt ".." 
unsposFmt, \ - batPart2->sepBefore+1, batPart2->sepAfter); \ - fprintf (stderr, "\n"); \ - } -#endif // snoopBatches - - -// private data for try_reduce_to_chain - -static sbtable* chainBatches = NULL; - - -// try_reduce_to_chain-- - -score try_reduce_to_chain - (seq* seq1, - seq* seq2, - segtable* st, - score diagPen, - score antiPen, - int scale, - chainer connect) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part1, *part2; - segment* seg, *seg2; - segtable stSubset; - u32 entriesNeeded; - size_t bytesNeeded; - u32 partIx1, partIx2, segIx, segIx2; - u32 batIx, startSegIx, endSegIx, segsKept; - unspos pEnd1, pEnd2; - bigscore chainScore, best; - - // if neither sequence is partitioned, just pass the segments along to - // reduce_to_chain() - - if ((sp1->p == NULL) // sequence 1 is not partitioned - && (sp2->p == NULL)) // sequence 2 is not partitioned - return reduce_to_chain (st,diagPen,antiPen,scale,connect); - - // allocate the batch table, or resize it if it's not big enough - - entriesNeeded = 1; - if (sp1->p != NULL) entriesNeeded *= sp1->len; // sequence 1 is partitioned - if (sp2->p != NULL) entriesNeeded *= sp2->len; // sequence 2 is partitioned - bytesNeeded = sbtable_bytes (entriesNeeded); - - if (chainBatches == NULL) - { - chainBatches = (sbtable*) malloc_or_die ("try_reduce_to_chain", bytesNeeded); - chainBatches->size = entriesNeeded; - } - else if (entriesNeeded > chainBatches->size) - { - chainBatches = (sbtable*) realloc_or_die ("try_reduce_to_chain", chainBatches, bytesNeeded); - chainBatches->size = entriesNeeded; - } - - // loop over all partitions in both target and query, collecting batches of - // segments that are confined to a single partition; unfortunately each of - // the three cases (regarding target and/or query partitioned) has to be - // handled separately - - if ((sp1->p != NULL) // sequence 1 is partitioned - && (sp2->p == NULL)) // sequence 2 is not partitioned - { - 
sort_segments (st, qSegmentsByPos1); - - batIx = 0; - segIx = 0; seg = &st->seg[segIx]; - for (partIx1=0 ; partIx1len ; partIx1++) - { - if (segIx >= st->len) break; - - part1 = &sp1->p[partIx1]; - pEnd1 = part1->sepAfter; - if (pEnd1 < seg->pos1 + seg->length) continue; - - chainBatches->batch[batIx].part1 = part1; - chainBatches->batch[batIx].part2 = NULL; - chainBatches->batch[batIx].start = segIx++; seg++; - while ((segIx < st->len) && (pEnd1 >= seg->pos1 + seg->length)) - { segIx++; seg++; } - - chainBatches->batch[batIx].end = segIx; - batIx++; - } - chainBatches->len = batIx; - } - - else if ((sp1->p == NULL) // sequence 1 is not partitioned - && (sp2->p != NULL)) // sequence 2 is partitioned - { - sort_segments (st, qSegmentsByPos2); - - batIx = 0; - segIx = 0; seg = &st->seg[segIx]; - for (partIx2=0 ; partIx2len ; partIx2++) - { - if (segIx >= st->len) break; - - part2 = &sp2->p[partIx2]; - pEnd2 = part2->sepAfter; - if (pEnd2 < seg->pos2 + seg->length) continue; - - chainBatches->batch[batIx].part1 = NULL; - chainBatches->batch[batIx].part2 = part2; - chainBatches->batch[batIx].start = segIx++; seg++; - while ((segIx < st->len) && (pEnd2 >= seg->pos2 + seg->length)) - { segIx++; seg++; } - - chainBatches->batch[batIx].end = segIx; - batIx++; - } - chainBatches->len = batIx; - } - - else if ((sp1->p != NULL) // sequence 1 is partitioned - && (sp2->p != NULL)) // sequence 2 is partitioned - { - sort_segments (st, qSegmentsByPos1); - - batIx = 0; - segIx = 0; seg = &st->seg[segIx]; - for (partIx1=0 ; partIx1len ; partIx1++) - { - if (segIx >= st->len) break; - - part1 = &sp1->p[partIx1]; - pEnd1 = part1->sepAfter; - if (pEnd1 < seg->pos1 + seg->length) continue; - - startSegIx = segIx++; seg++; - while ((segIx < st->len) && (pEnd1 >= seg->pos1 + seg->length)) - { segIx++; seg++; } - endSegIx = segIx; - - sort_some_segments (st, startSegIx, endSegIx, qSegmentsByPos2); - - segIx2 = startSegIx; seg2 = &st->seg[segIx2]; - for (partIx2=0 ; partIx2len ; partIx2++) 
- { - if (segIx2 >= endSegIx) break; - - part2 = &sp2->p[partIx2]; - pEnd2 = part2->sepAfter; - if (pEnd2 < seg2->pos2 + seg2->length) continue; - - chainBatches->batch[batIx].part1 = part1; - chainBatches->batch[batIx].part2 = part2; - chainBatches->batch[batIx].start = segIx2++; seg2++; - while ((segIx2 < endSegIx) && (pEnd2 >= seg2->pos2 + seg2->length)) - { segIx2++; seg2++; } - - chainBatches->batch[batIx].end = segIx2; - batIx++; - } - } - chainBatches->len = batIx; - } - - // perform chaining over each batch of segments, treating each as a - // separate chaining problem - - best = 0; - - for (batIx=0 ; batIxlen ; batIx++) - { - // create a subset of the segment table, corresponding to this batch - - startSegIx = chainBatches->batch[batIx].start; - endSegIx = chainBatches->batch[batIx].end; - debugSnoopBatches_1 - - subset_segment_table (st, startSegIx, endSegIx, &stSubset); - - // chain the subset - - chainScore = reduce_to_chain (&stSubset, diagPen, antiPen, scale, connect); - if (chainScore > best) best = chainScore; - - // mark the segments that are part of the chain as to-be-kept, and the - // others as to-be-filtered; note that reduce_to_chain has brought the - // chained segments to the front of the subset -- the rest are - // essentially garbage (and don't necessarily represent the excluded - // segments of the subset) - - segsKept = stSubset.len; - for (segIx=startSegIx ; segIxseg[segIx].filter = false; - for ( ; segIxseg[segIx].filter = true; - } - - // perform the final filtering step, unaware of batching - - filter_marked_segments (st); - - return best; - } - -//---------- -// -// reduce_to_chain-- -// Find the highest scoring chain, in which each segment in the chain begins -// strictly before the start of the next segment. -// -// WARNING: External modules should usually NOT call this function directly, -// but should instead call try_reduce_to_chain. 
The exception is that -// if the caller can guarantee that all segments are within the same -// partition (in both target and query), they can call this directly. -// -// A chain is a series of segments, where each segment in the chain (other than -// the last), begins strictly before the start of the next. A chain's score is -// scale times the sum of segment scores minus the sum of penalties for the gaps -// between segments: -// connect (segment_i, segment_(i+1), scale) -// the last sum is taken over all segments in the chain except the last). -// -//---------- -// -// Arguments: -// segtable* st: The segments on which to operate. -// score diagPen: Chaining penalty; see notes (1) and (3). -// score antiPen: Chaining penalty; see notes (1) and (3). -// int scale: Scaling constant; see note (2). -// chainer connect: Chain connection penalty function; see note above, -// .. and description of arguments in chain.h -// -// Returns: -// The score of the best chain, unscaled; zero if there's some problem. Note -// that the segment table is modified in place, with segments belonging to the -// chain brought to the front, and the table shortened by modifying st->len. -// -//---------- -// -// Notes: -// (1) The parameters diagPen and antiPen permit us to deduce useful -// inequalities about chain scores. Namely, let segment_i and segment_j -// be segments on diagonals diag_i and diag_j, and set -// diff = diag_j - diag_i -// Then diagPen and antiPen are required to satisfy: -// if diff >= 0, then connect(segment_i,segment_j) >= diff*diagPen -// and -// if diff < 0, then connect(segment_i,segment_j) >= -diff*antiPen -// -// (2) In effect, scale permits integer arithmetic to be used with very small -// gap penalties, since the computed chain also maximizes the sum of the -// segment scores minus the sum of -// connect(segment_i, segment_(i+1), scale)/scale. -// -// (3) diagPen and antiPen are considered to have already been scaled. 
We -// only apply scale to the segment substitution scores. -// -//---------- - -#define debugChaining_1 \ - if (chain_dbgChaining) \ - fprintf (stderr, \ - "chaining [%d] " unsposSlashFmt \ - "\tdiag=" sgnposFmt \ - "\tsegscore=" scoreFmtSimple "\n", \ - i, kdi.x, kdi.y, kdi.diag, kdi.query->s); - -#define debugChaining_2 \ - if (chain_dbgChaining) \ - { \ - if (bp.num == noPred) \ - fprintf (stderr, " pred=(none)\n"); \ - else \ - fprintf (stderr, " pred=%u query=%.2f contrib=%.2f score=%.2f\n", \ - bp.num, queryContrib, bp.contrib, \ - kdi.chainScore[i]); \ - } - -#define debugChaining_3 \ - if (chain_dbgChaining) \ - { \ - if (bestEnd == noPred) \ - fprintf (stderr, "best=(none)\n"); \ - else \ - fprintf (stderr, "best=%u score=%.2f\n", bestEnd, best); \ - } - - -score reduce_to_chain - (segtable* st, - score diagPen, - score antiPen, - int scale, - chainer connect) - { - kdinfo kdi; - kdnode* root; - u32* chain; - bigscore best, queryContrib; - u32 bestEnd; - u32 i, n; - bestpred bp; - segment* p; - - if (st == NULL) return 0; - - n = st->len; - if (n == 0) return 0; - - chain_add_stat (numAnchors, n); - - // sort segments by pos1, so that the predecessor search loop is guaranteed - // to score all possible predecessors of any segment before it considers - // that segment - - sort_segments (st, qSegmentsByPos1); - - // initialize 'global' data - - kdi.connect = connect; - kdi.seg = st->seg; - kdi.perm = malloc_or_die ("reduce_to_chain perm", n*sizeof(u32)); - kdi.invPerm = malloc_or_die ("reduce_to_chain invPerm", n*sizeof(u32)); - kdi.chainScore = zalloc_or_die ("reduce_to_chain chainScore", n*sizeof(bigscore)); - kdi.diagPen = diagPen; - kdi.antiPen = antiPen; - kdi.scale = scale; - - // build the K-d tree; as part of this process the segments are permuted - // (by use of the perm[] array), and we compute the inverse of that - // permutation to aid later access to the segments - - for (i=0 ; ipos1; - kdi.y = kdi.query->pos2; - kdi.diag = ((sgnpos) kdi.x) - 
((sgnpos) kdi.y); - debugChaining_1; - - bp.num = noPred; - bp.contrib = 0; - bp = best_predecessor (root, 1, 0, bp, &kdi); - queryContrib = ((bigscore) kdi.query->s) * ((bigscore) kdi.scale); - kdi.chainScore[i] = queryContrib + bp.contrib; - debugChaining_2; - - if (kdi.chainScore[i] > best) - { best = kdi.chainScore[i]; bestEnd = i; } - chain[i] = bp.num; - propagate_max_score (root, kdi.chainScore[i], kdi.invPerm[i]); - } - - debugChaining_3; - - // get rid of non-chain segments - - for (p=st->seg ; ((u32)(p-st->seg))len ; p++) - p->filter = true; - - for (i=bestEnd ; i!=noPred ; i = chain[i]) - (kdi.seg+i)->filter = false; - - filter_marked_segments (st); - chain_add_stat (numSegments, st->len); - - // scale back best score - - if (dna_utilities_scoreType == 'I') - { - best = (best / scale) + 0.5; // best /= scale, rounded off - if (best > bestPossibleScore) // .. and clipped - best = bestPossibleScore; - } - else - best /= scale; - - free_if_valid ("reduce_to_chain perm", kdi.perm); - free_if_valid ("reduce_to_chain invPerm", kdi.invPerm); - free_if_valid ("reduce_to_chain chainScore", kdi.chainScore); - free_if_valid ("reduce_to_chain chain", chain); - free_kd_tree (root); - - return best; - } - -//---------- -// -// build_kd_tree-- -// Build segments into a K-d tree. -// -// Standard K-d tree implimentation (for K=2), such as might be found in -// reference [1]. The points are partitioned into two sets, split by the -// a value along one axis. Each of those sets is in turn split again, along -// the other axis, and so on, until all sets are small enough. "Small enough" -// is defined by bucketSize. The two dimensional axes are y (sequence pos2) -// and diagonal (sequence pos1-pos2). -// -//---------- -// -// Arguments: -// u32 lo,hi: range of entries (of kdi->seg[], indexed by kdi->perm[]) to -// .. build a tree of; these are inclusive (i.e. there are -// .. 
hi+1-lo entries) -// int axis: which dimension/axis to partition (at the top level) -// 0 => diagonal (pos1 - pos2) -// 1 => pos2 -// kdinfo* kdi: 'Global' control variables. -// -// Returns: -// The root of the tree. kdi->perm[] is modified so that entries in -// kdi->seg[kdi->perm[]] agree with the tree. -// -//---------- - -static kdnode* build_kd_tree - (u32 lo, - u32 hi, - int axis, - const kdinfo* const kdi) - { - kdnode* p; - u32 m; - - p = zalloc_or_die ("build_kd_tree", sizeof(kdnode)); - p->maxChainScore = 0; - - if (hi+1-lo <= bucketSize) // the range is small enough to fit in one - { // .. node - p->isBucket = true; - p->loIx = lo; - p->hiIx = hi; - } - else // the range is two big for one node, split - { // .. it into two subtrees - p->isBucket = false; - m = partition_segments (lo, hi, axis, kdi); - p->cutVal = projection (m, axis, kdi); - p->hiIx = m; - p->loSon = build_kd_tree (lo, m, 1-axis, kdi); - p->hiSon = build_kd_tree (m+1, hi, 1-axis, kdi); - } - - if (p == NULL) - suicide ("(in build_kd_tree, p == NULL)"); - if (!valid_kdnode(p)) - suicide ("(in build_kd_tree, p is not a valid kdnode)"); - - return p; - } - -//---------- -// -// free_kd_tree-- -// Dispose of the memory allocated for a K-d tree. -// -//---------- -// -// Arguments: -// kdnode* subtree: The K-d (sub)tree to dispose of. -// -// Returns: -// (nothing) -// -//---------- - -// $$$ we could eliminate tail recursion here - -static void free_kd_tree - (kdnode* subtree) - { - if (subtree->isBucket) - free_if_valid ("free_kd_tree leaf", subtree); - else - { - free_kd_tree (subtree->loSon); - free_kd_tree (subtree->hiSon); - free_if_valid ("free_kd_tree node", subtree); - } - } - -//---------- -// -// dump_kd_tree-- -// Dump a K-d tree to a file (for debugging). -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// int points: The number of points in the tree. -// kdnode* root: The K-d tree to print. -// kdinfo* kdi: 'Global' control variables. 
-// -// Returns: -// (nothing) -// -//---------- - -static void dump_kd_subtree (int indent, kdnode* subtree, int axis); - -static FILE* dksFile; -static const kdinfo* dksKdi; -static int dksIndexWidth; - -static void dump_kd_tree - (FILE* f, - int points, - kdnode* root, - const kdinfo* const kdi) - { - int i; - - for (dksIndexWidth=1,i=points-1 ; i>9 ; i/=10) dksIndexWidth++; - - dksFile = f; - dksKdi = kdi; - dump_kd_subtree (0, root, 1); - } - -static void dump_kd_subtree - (int indent, - kdnode* subtree, - int axis) - { - u32 i, j; - segment* s; - - if (!subtree->isBucket) - { - dump_kd_subtree (indent+2, subtree->loSon, 1-axis); - fprintf (dksFile, " %*s %*s%s<=" sgnposFmt "\n", indent, "", - dksIndexWidth, "", - (axis==0)? "x-y" : "x", subtree->cutVal); - dump_kd_subtree (indent+2, subtree->hiSon, 1-axis); - return; - } - - for (i=subtree->loIx ; i<=subtree->hiIx ; i++) - { - j = dksKdi->perm[i]; - s = &dksKdi->seg[j]; - fprintf (dksFile, "[%*d]%*s" unsposSlashFmt "\n", - dksIndexWidth, i, indent, "", s->pos1, s->pos2); - } - - } - -//---------- -// -// partition_segments-- -// Partition a list of segments into two sets, split by a value along one axis. -// -// A 'pivot' value is desginated as the median (along the specified axis) of -// the first and last segment, and the one at the middle of the list. Segments -// below the pivot are moved to the first part of the list, and segments above -// it are moved to the last part, with the pivot between them. -// -//---------- -// -// Arguments: -// u32 lo,hi: range of entries (of kdi->seg[], indexed by kdi->perm[]) to -// .. build a tree of; these are inclusive (i.e. there are -// .. hi+1-lo entries) -// int axis: dimension/axis to partition on -// 0 => diagonal (pos1 - pos2) -// 1 => pos2 -// kdinfo* kdi: 'Global' control variables. -// -// Returns: -// The index of the pivot (m). kdi->perm[] is modified so that entries in -// kdi->seg[kdi->perm[]] satsify lo..m-1 <= m <= m+1..hi. 
-// -//---------- - -static u32 partition_segments - (u32 lo, - u32 hi, - int axis, - const kdinfo* const kdi) - { - u32 m, i, j; - sgnpos a, b, c, pivot; - - if (hi - lo < 2) - suicidef ("partition: cannot happen (" unsposCommaFmt ")", lo, hi); - - while (true) - { - // find the pivot and move it to the front; we use the median of the - // lower, middle, and upper values as the pivot - - m = (lo+hi)/2; - a = projection (lo, axis, kdi); - b = projection (m, axis, kdi); - c = projection (hi, axis, kdi); - - if (((a <= b) && (b <= c)) || ((c <= b) && (b <= a))) - { perm_swap (kdi,lo,m); pivot = b; } - else if (((a <= c) && (c <= b)) || ((b <= c) && (c <= a))) - { perm_swap (kdi,lo,hi); pivot = c; } - else - pivot = a; - - // move smaller entries to front, larger to back - - i = lo; - j = hi+1; - while (i < j) - { - // search forward for a large entry - for (i++ ; (i<=hi)&&(projection(i,axis,kdi)<=pivot) ; i++) - ; - // search backward for a small entry - for (j-- ; (j>=lo)&&(projection(j,axis,kdi)>pivot) ; j--) - ; - perm_swap (kdi,i,j); - } - - perm_swap (kdi,i,j); // undo the last swap - perm_swap (kdi,lo,j); // move the pivot value to the proper location - - // warning: we must avoid returning j==hi (because build_kd_tree() would - // recurse forever); if j diagonal (pos1 - pos2) -// 1 => pos2 -// int lowerBound: Lower bound of chain score that must be achieved. -// bestpred bp: The best predecessor found so far. -// kdinfo* kdi: 'Global' data. -// -// Returns: -// The ending segment index (bestpred.num) and score (bestpred.contrib) of the -// best predecessor chain. 
-// -//---------- - -#define debugChaining_4 \ - if (chain_dbgChaining) \ - { \ - if (bp.num == noPred) \ - fprintf (stderr, " bestwas=(none) scorewas=%.2f\n", \ - bp.contrib); \ - else \ - fprintf (stderr, " bestwas=%u scorewas=%.2f\n", \ - bp.num, bp.contrib); \ - } - -#define debugChaining_5 \ - if (chain_dbgChaining) \ - { \ - fprintf (stderr, " cand=%u score=%.2f (from %.2f)\n", \ - j, predScore, kdi->chainScore[j]); \ - } - - -static bestpred best_predecessor - (kdnode* subtree, - int axis, - bigscore lowerBound, - bestpred bp, - const kdinfo* const kdi) - { - bigscore predScore; - u32 i, j; - segment* s; - - if (subtree == NULL) - suicide ("(in best_predecessor, NULL subtree)"); - - if (bp.contrib >= subtree->maxChainScore - lowerBound) - return bp; - - if (!valid_kdnode(subtree)) - suicide ("(in best_predecessor, invalid subtree)"); - - // if we're at a leaf, search over all segments in the leaf - - if (subtree->isBucket) - { - for (i=subtree->loIx ; i<=subtree->hiIx ; i++) - { - j = kdi->perm[i]; // kdi is the segment we want to add to the chain - s = &kdi->seg[j]; // s is the candidate to be a predecessor - if ((s->pos1 >= kdi->x) || (s->pos2 >= kdi->y)) - continue; - predScore = kdi->chainScore[j] - kdi->connect(s, kdi->query, kdi->scale); - debugChaining_4; - if (predScore > bp.contrib) { bp.contrib = predScore; bp.num = j; } - debugChaining_5; - } - } - - // if we're at a node cut by y, search over both subtrees, pruning the high - // subtree if all its segments have y greater than our query - - else if (axis == 1) - { - if (((sgnpos) kdi->y) >= subtree->cutVal) - bp = best_predecessor (subtree->hiSon, lowerBound, 1-axis, bp, kdi); - bp = best_predecessor (subtree->loSon, lowerBound, 1-axis, bp, kdi); - } - - // if we're at a node cut by the diagonal, search over search both subtrees, - // adjusting the lower bound accordingly - // nota bene: diff>0 => query diagonal is below cut - - else // if (axis == 0) - { - bigscore diff = kdi->diag - 
subtree->cutVal; - if (diff >= 0) // query diagonal is southeast of (or same as) cut - { - bp = best_predecessor (subtree->hiSon, 1-axis, lowerBound, bp, kdi); - bp = best_predecessor (subtree->loSon, 1-axis, diff*kdi->diagPen, bp, kdi); - } - else // query diagonal is northwest of cut - { - bp = best_predecessor (subtree->loSon, 1-axis, lowerBound, bp, kdi); - bp = best_predecessor (subtree->hiSon, 1-axis, -diff*kdi->antiPen, bp, kdi); - } - } - - return bp; - } - -//---------- -// -// propagate_max_score-- -// Propagate the best score for any chain ending at a particular segment to all -// the (sub)trees that contain that segment. -// -//---------- -// -// Arguments: -// kdnode* subtree: The K-d (sub)tree to operate on. -// bigscore s: The score to propagate. -// u32 ix: The index of the segment that has that score. -// -// Returns: -// (nothing) -// -//---------- -// -// Older, recursive version looked like this: -// -// if (subtree != NULL) -// { -// if (s > subtree->maxChainScore) subtree->maxChainScore = s; -// if (subtree->hiIx >= ix) propagate_max_score (subtree->loSon, s, ix); -// else propagate_max_score (subtree->hiSon, s, ix); -// } -// -//---------- - -static void propagate_max_score - (kdnode* subtree, - bigscore s, - u32 ix) - { - while (subtree != NULL) - { - if (s > subtree->maxChainScore) - subtree->maxChainScore = s; - if (ix <= subtree->hiIx) subtree = subtree->loSon; - else subtree = subtree->hiSon; - } - } - -//---------- -// -// chain_zero_stats-- -// Clear the statistics for this module. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void chain_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&chainStats, 0, sizeof(chainStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// chain_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void chain_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, "# anchors to chain: %s\n", commatize(chainStats.numAnchors)); - fprintf (f, " segments in chain: %s\n", commatize(chainStats.numSegments)); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void chain_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "num_anchors=%d\n", chainStats.numAnchors); - (*func) (f, "num_segments=%d\n", chainStats.numSegments); -#endif // collect_stats - } - diff --git a/programs/lastz/src/chain.h b/programs/lastz/src/chain.h deleted file mode 100644 index 9295aec..0000000 --- a/programs/lastz/src/chain.h +++ /dev/null @@ -1,100 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: chain.h -// -//---------- - -#ifndef chain_H // (prevent multiple inclusion) -#define chain_H - -// other files - -#include "utilities.h" // utility stuff -#include "segment.h" // segment table management stuff - -// establish ownership of global variables - -#ifdef chain_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef chain_owner -int chain_dbgChaining = 
false; -int chain_dbgDumpTree = false; -#else -global int chain_dbgChaining; -global int chain_dbgDumpTree; -#endif - -//---------- -// -// data structures and types -// -//---------- - -// chain connection penalty functions-- -// Compute a chaining penalty between two segments. -// -// Arguments: -// segment* seq1: The first segment. -// segment* seq2: The other segment. -// int scale: Scaling factor (see note). -// -// Note: -// The effect of the scale parameter is to permit integer arithmetic to be -// used with very small gap penalties. This is only useful if the scoring -// type is an integer. - -typedef score (*chainer) (segment* , segment* , int); - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int numAnchors; - int numSegments; - } chainStats; - -// stats macros - -#define chain_count_stat(field) ++chainStats.field -#define chain_uncount_stat(field) --chainStats.field -#define chain_set_stat(field,val) (chainStats.field = val) -#define chain_add_stat(field,val) (chainStats.field += val) -#else -#define chain_count_stat(field) -#define chain_uncount_stat(field) -#define chain_set_stat(field,val) -#define chain_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void chain_zero_stats (void); -void chain_show_stats (FILE* f); -void chain_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in chain.c -// -//---------- - -score try_reduce_to_chain (seq* seq1, seq* seq2, - segtable* st, score diagPen, score antiPen, - int scale, chainer connect); -score reduce_to_chain (segtable* st, score diagPen, score antiPen, - int scale, chainer connect); - -#undef global -#endif // chain_H diff --git a/programs/lastz/src/cigar.c b/programs/lastz/src/cigar.c deleted file mode 100755 index 6b647c7..0000000 --- a/programs/lastz/src/cigar.c +++ /dev/null @@ -1,599 +0,0 @@ 
-//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: cigar.c -// -//---------- -// -// cigar-- -// Support for printing alignments in CIGAR format. -// -// CIGAR format is a pairwise alignment format that describes alignment blocks -// in a run-length format. As of Jan/2009, a spec for CIGAR files could be -// found at -// http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html -// However, as of Jan/2012, that page no longer exists, and it is not known -// where anything like a "spec" can be found. -// -// The treatment of intervals on the - strand is not addressed at the above -// link. CIGAR is also produced by exonerate (which may be where it origin- -// ated). The following alignment output from exonerate shows how - strand is -// treated. It is counted along the + strand, and then listed in reverse order. -// -// cigar: CAT 11407 11062 - PIG 13828 14153 + 892 M 11 I 1 M 176 I 2 ... -// -// 11407 : TGAGTGTTGAAGTAAACTTGCCAAGTTATCTTTATAGGTATCAGTCCATCGTTAGATTTG : 11348 -// | | | ||| | || ||||||||||||| ||||||| |||||| |||||| || ||| || -// 13829 : TTAATTTTGTA-TAGACTTGCCAAGTTAACTTTATATGTATCATTCCATCATTGGATGTG : 13887 -// -// 11347 : TTATAGCACACATGCACATTGCTTAGCTAACTGAAACATATCAGAAGAATTTATTATAAT : 11288 -// |||| ||||| ||| |||| | |||| ||| ||| |||||| |||| ||||||||||| -// 13888 : GTATAACACACCTGCTCATTCCCTAGCAAACCCAAAAATATCAAAAGATTTTATTATAAT : 13947 -// -// 11287 : GCTTAGATAACTTAGGGAATTGCCTACCAGGAAGTAGTGAATATCCGGAACAAGCTCCTC : 11228 -// ||||||||||||| | | | |||||| | | ||||||||| || || || || | | -// 13948 : GCTTAGATAACTTGGAGCAGTGCCTATCCTGTAGTAGTGAAGATGTGGGACGAGAGTCAC : 14007 -// -// 11227 : TATGGACTCCCATAGAGTCAAGGTCAGCTTGATGCTATTATGCTTGCTATGGGCCATCAG : 11168 -// ||||| | || ||| ||||||||||||||||| |||| | | | || |||| -// 14008 : AATGGATT--CA-----TCAGGGTCAGCTTGATGCTATAATGCCTACCCTAAGCAATCAA : 14060 -// -// 11167 : AGAGGGGCCTTTGTAACTTTTAGAGTTTGTGAAGGGACGCCTAAATTTGACCATTATCCT : 11108 -// | || ||||||||||||||||||| || | |||| 
|||||||| | -// 14061 : ACAG------------CTTTTAGAGTTTGTGAAGGTACACTAGAATTCTACCATTATGCA : 14108 -// -// 11107 : GCTTTTCAATAATTTCACATGTAAATCAAAAATAGGGCATATTTA : 11063 -// || |||||||||| ||| |||| | ||||| || ||||||||||| -// 14109 : GCATTTCAATAATATCATATGTGACTCAAAGATTGGGCATATTTA : 14153 -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -#define cigar_owner // (make this the owner of its globals) -#include "cigar.h" // interface to this module - -//---------- -// -// prototypes for private functions -// -//---------- - -static void print_cigar_mismatchy_run (FILE* f, u8* s1, u8* s2, unspos length, - int letterAfter, int hideSingles, int lowercase); - -//---------- -// -// print_cigar_job_header-- -// Print cigar format job header. -// -//---------- - -void print_cigar_job_header - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_cigar_job_footer-- -// Print cigar format job footer. -// -//---------- - -void print_cigar_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_cigar_header-- -// Print cigar format query header. -// -//---------- - -void print_cigar_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - // (do nothing) - } - -//---------- -// -// print_cigar_align_list-- -// Print a list of gapped alignments in cigar format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// int withInfo: true => include info before cigar path string -// int markMismatches: true => use =/X syntax for non-indel runs -// false => use M syntax instead -// int letterAfter: true => letters after count (defying spec) -// int hideSingles: true => don't both to print count if count==1 -// int lowercase: true => use lower case letters (defying spec) -// int withNewLine: true => write new-lines after each alignment -// -// Returns: -// (nothing) -// -//---------- - -void print_cigar_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int withInfo, - int markMismatches, - int letterAfter, - int hideSingles, - int lowercase, - int withNewLine) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - print_cigar_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s, - withInfo, markMismatches, letterAfter, - hideSingles, lowercase, withNewLine); - } - -//---------- -// -// print_cigar_align-- -// Print a single gapped alignment in cigar format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment -// .. takes in the DP matrix. -// score s: The alignment's score. 
-// int withInfo: true => include info before cigar path string -// int markMismatches: true => use =/X syntax for non-indel runs -// false => use M syntax instead -// int letterAfter: true => letters after count (defying spec) -// int hideSingles: true => don't both to print count if count==1 -// int lowercase: true => use lower case letters (defying spec) -// int withNewLine: true => write new-line after the alignment -// -// Returns: -// (nothing) -// -//---------- - -static char* rcfSuffix[4] = { "", "~", "~", "" }; - -void print_cigar_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos _end1, - seq* seq2, - unspos beg2, - unspos _end2, - editscript* script, - score s, - int withInfo, - int markMismatches, - int letterAfter, - int hideSingles, - int lowercase, - int withNewLine) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + beg1; - u8* s2 = seq2->v + beg2; - unspos height, width, i, j, prevI, prevJ, run; - u32 opIx; - char* name1, *name2, *suff1, *suff2; - unspos offset1, offset2, start1, start2, end1, end2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len; - char strand1, strand2; - char chM = (lowercase)? 'm' : 'M'; - char chD = (lowercase)? 'd' : 'D'; - char chI = (lowercase)? 'i' : 'I'; - - height = _end1 - beg1; - width = _end2 - beg2; - - // figure out position offsets and names - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - } - - // figure out strandedness - - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = beg1-1 - offset1 + startLoc1; - end1 = start1 + height; - strand1 = '+'; - } - else - { - start1 = startLoc1 + seq1Len + offset1 - (beg1+1); - end1 = start1 - height; - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2-1 - offset2 + startLoc2; - end2 = start2 + width; - strand2 = '+'; - } - else - { - start2 = startLoc2 + seq2Len + offset2 - (beg2+1); - end2 = start2 - width; - strand2 = '-'; - } - - // print the alignment - - if (withInfo) - fprintf (f, "cigar:" - " %s%s " unsposFmt " " unsposFmt " %c" - " %s%s " unsposFmt " " unsposFmt " %c" - " " scoreFmt, - name2, suff2, start2, end2, strand2, - name1, suff1, start1, end1, strand1, - s); - - opIx = 0; - for (i=j=0 ; (i< height)||(j 0) - { - if (markMismatches) - print_cigar_mismatchy_run (f, s1+i, s2+j, run, - letterAfter, hideSingles, lowercase); - else - { - if (letterAfter) fprintf (f, unsposFmt "%c", run, chM); - else fprintf (f, " %c " unsposFmt, chM, run); - } - i += run; j += run; - } - - if ((i < height) || (j < width)) - { - prevI = i; prevJ = j; - edit_script_indel_len (script, &opIx, &i, &j); - if (i > prevI) - { - if (!letterAfter) - fprintf (f, " %c " unsposFmt, chD, i - prevI); - else if ((hideSingles) && (i - prevI == 1)) - fprintf (f, "%c", chD); - else - fprintf (f, unsposFmt "%c", i - prevI, chD); - } - if (j > prevJ) - { - if (!letterAfter) - fprintf (f, " %c " unsposFmt, chI, j - prevJ); - else if 
((hideSingles) && (j - prevJ == 1)) - fprintf (f, "%c", chI); - else - fprintf (f, unsposFmt "%c", j - prevJ, chI); - } - } - } - - if (withNewLine) - fprintf (f, "\n"); - } - -//---------- -// -// print_cigar_match-- -// Print an hsp in cigar format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the -// .. match (origin-0). -// seq* seq2: Another sequence. -// unspos pos1: The position, in seq2, of first character in the -// .. match (origin-0). -// unspos length: The number of nucleotides in the HSP. -// score s: The HSP's score. -// int withInfo: true => include info before cigar path string -// int markMismatches: true => use =/X syntax for non-indel runs -// false => use M syntax instead -// int letterAfter: true => letters after count (defying spec) -// int hideSingles: true => don't both to print count if count==1 -// int lowercase: true => use lower case letters (defying spec) -// int withNewLine: true => write new-line after the alignment -// -// Returns: -// (nothing) -// -//---------- - -void print_cigar_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - int withInfo, - int markMismatches, - int letterAfter, - int hideSingles, - int lowercase, - int withNewLine) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - char* name1, *name2, *suff1, *suff2; - unspos offset1, offset2, start1, start2, end1, end2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len; - char strand1, strand2; - char chM = (lowercase)? 'm' : 'M'; - - // figure out position offsets and names - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - } - - // figure out strandedness - - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = pos1-1 - offset1 + startLoc1; - end1 = start1 + length; - strand1 = '+'; - } - else - { - start1 = startLoc1 + seq1Len + offset1 - (pos1+1); - end1 = start1 - length; - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = pos2-1 - offset2 + startLoc2; - end2 = start2 + length; - strand2 = '+'; - } - else - { - start2 = startLoc2 + seq2Len + offset2 - (pos2+1); - end2 = start2 - length; - strand2 = '-'; - } - - // print the alignment - - if (withInfo) - fprintf (f, "cigar:" - " %s%s " unsposFmt " " unsposFmt " %c" - " %s%s " unsposFmt " " unsposFmt " %c" - " " scoreFmt, - name2, suff2, start2, end2, strand2, - name1, suff1, start1, end1, strand1, - s); - - if (markMismatches) - print_cigar_mismatchy_run (f, s1, s2, length, - letterAfter, hideSingles, lowercase); - else - { - if (!letterAfter) - fprintf (f, " %c " unsposFmt, chM, length); - else if ((hideSingles) && (length == 1)) - fprintf (f, "%c", chM); - else - fprintf 
(f, unsposFmt "%c", length, chM); - } - - if (withNewLine) - fprintf (f, "\n"); - } - -//---------- -// -// print_cigar_mismatchy_run-- -// Print a run of match/mismatch in (new) cigar format, using =/X syntax. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// u8* s1: Start of the run in one sequence. -// u8* s2: Start of the run in the other sequence. -// unspos length: The number of nucleotides in the run. -// int letterAfter: true => letters after count (defying spec) -// int hideSingles: true => don't both to print count if count==1 -// int lowercase: true => use lower case letters (defying spec) -// -// Returns: -// (nothing) -// -//---------- - -static void print_cigar_mismatchy_run - (FILE* f, - u8* s1, - u8* s2, - unspos length, - int letterAfter, - int hideSingles, - int lowercase) - { - char chX = (lowercase)? 'x' : 'X'; - - unspos ix; - int runIsMm, runLen; - s8 b1, b2; - char ch; - - //for (ix=0 ; ix= 0)) // match - { - if (!runIsMm) { runLen++; continue; } - if (runLen > 0) - { - if (!letterAfter) - fprintf (f, " %c " unsposFmt, chX, runLen); - else if ((hideSingles) && (runLen == 1)) - fprintf (f, "%c", chX); - else - fprintf (f, unsposFmt "%c", runLen, chX); - } - runIsMm = false; - runLen = 1; - } - else // mismatch - { - if (runIsMm) { runLen++; continue; } - if (runLen > 0) - { - if (!letterAfter) - fprintf (f, " = " unsposFmt, runLen); - else if ((hideSingles) && (runLen == 1)) - fprintf (f, "="); - else - fprintf (f, unsposFmt "=", runLen); - } - runIsMm = true; - runLen = 1; - } - } - - if (runLen > 0) - { - ch = (runIsMm)? 
chX : '='; - if (!letterAfter) - fprintf (f, " %c " unsposFmt, ch, runLen); - else if ((hideSingles) && (runLen == 1)) - fprintf (f, "%c", ch); - else - fprintf (f, unsposFmt "%c", runLen, ch); - } - - } - diff --git a/programs/lastz/src/cigar.h b/programs/lastz/src/cigar.h deleted file mode 100644 index 302a5bf..0000000 --- a/programs/lastz/src/cigar.h +++ /dev/null @@ -1,52 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: maf.h -// -//---------- - -#ifndef cigar_H // (prevent multiple inclusion) -#define cigar_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef cigar_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// prototypes for routines in maf.c -// -//---------- - -void print_cigar_job_header (FILE* f); -void print_cigar_job_footer (FILE* f); -void print_cigar_header (FILE* f, seq* seq1, seq* seq2); -void print_cigar_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - int withInfo, int markMismatches, int letterAfter, - int hideSingles, int lowercase, int withNewLine); -void print_cigar_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s, - int withInfo, int markMismatches, int letterAfter, - int hideSingles, int lowercase, int withNewLine); -void print_cigar_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, - int withInfo, int markMismatches, int letterAfter, - int hideSingles, int lowercase, int withNewLine); - -#undef global -#endif // maf_H diff --git a/programs/lastz/src/continuity_dist.c b/programs/lastz/src/continuity_dist.c deleted file mode 100755 index 2754a22..0000000 --- 
a/programs/lastz/src/continuity_dist.c +++ /dev/null @@ -1,350 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: continuity_dist.c -// -//---------- -// -// continuity_dist-- -// Support for collecting the query continuity (or gap rate) distribution from -// alignments. -// -// Notes: -// continuity = 1/(1+gaprate) -// gaprate = (1/continuity)-1 -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -#define continuity_dist_owner // (make this the owner of its globals) -#include "continuity_dist.h" // interface to this module - -//---------- -// -// filter_aligns_by_continuity-- -// Filter a list of alignments, removing any alignment that has continuity -// percentage outside of a specified range. -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to operate upon. -// float minContinuity, The range of query continuity in alignments that -// maxContinuity: .. we will *keep*. These are values between 0 -// .. and 1. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// -// Notes: -// (1) The numerator is the number of alignment columns *not* containing gaps -// and the denominator is the number of total alignment columns. -// (2) Memory for alignments that don't make the cut is deallocated here. -// (3) The returned list of alignments is in the same order as the incoming -// list. -// -//---------- - -// $$$ There's an inherent inefficiency here if the user asks us to filter -// $$$ .. 
alignments by several gap-based stats. We'd like to compute the -// $$$ .. gap stats only once, then either carry that in the alignment record, -// $$$ .. or apply all the filters in one pass through the loop. - -alignel* filter_aligns_by_continuity - (alignel* alignList, - float minContinuity, - float maxContinuity) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - float minCont, maxCont; - - // process each alignment, collecting a list of those that are long enough - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - // if the alignment is too gappy, skip it - - alignment_continuity (a, &numer, &denom); - - minCont = denom * minContinuity; - maxCont = denom * maxContinuity; - - if ((numer < minCont) || (numer > maxCont)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_continuity a->script", a->script); - free_if_valid ("filter_aligns_by_continuity a", a); - continue; - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// filter_aligns_by_num_gaps-- -// Filter a list of alignments, removing any alignment that has more gaps than -// a specified maximum (counting each run of gapped columns as a single gap). -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to operate upon. -// s32 maxSeparateGapsCount: The maximum number of gaps in alignments -// .. that we will *keep*. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// -// Notes: -// (1) Memory for alignments that don't make the cut is deallocated here. -// (2) The returned list of alignments is in the same order as the incoming -// list. 
-// -//---------- - -alignel* filter_aligns_by_num_gaps - (alignel* alignList, - s32 maxSeparateGapsCount) - { - alignel* a, *next; - alignel* head, *prev; - unspos height, width, i, j, run; - u32 opIx; - s32 numGaps; - - // process each alignment, collecting a list of those that are long enough - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - // if the alignment is too gappy, skip it - - numGaps = 0; - - height = a->end1 - a->beg1 + 1; - width = a->end2 - a->beg2 + 1; - opIx = 0; - for (i=j=0 ; (iscript, &opIx); - i += run; j += run; - - // handle the next indel - - if ((i < height) || (j < width)) - { - edit_script_indel_len (a->script, &opIx, &i, &j); - if (++numGaps > maxSeparateGapsCount) - break; - } - } - - if (numGaps > maxSeparateGapsCount) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_continuity a->script", a->script); - free_if_valid ("filter_aligns_by_continuity a", a); - continue; - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// filter_aligns_by_num_gap_columns-- -// Filter a list of alignments, removing any alignment that has more gaps than -// a specified maximum (counting each gapped column as a single gap). -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to operate upon. -// s32 maxGapColumnsCount: The maximum number of gaps in alignments -// .. that we will *keep*. Note that this -// .. must not be negative. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// -// Notes: -// (1) Memory for alignments that don't make the cut is deallocated here. -// (2) The returned list of alignments is in the same order as the incoming -// list. 
-// -//---------- - -alignel* filter_aligns_by_num_gap_columns - (alignel* alignList, - s32 maxGapColumnsCount) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - - // process each alignment, collecting a list of those that are long enough - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - // if the alignment is too gappy, skip it - - alignment_continuity (a, &numer, &denom); - - if ((denom == 0) || (denom-numer > (u32) maxGapColumnsCount)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_continuity a->script", a->script); - free_if_valid ("filter_aligns_by_continuity a", a); - continue; - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// alignment_continuity-- -// Compute the continuity of an gapped alignment block. This is the number of -// bases *not* aligned to gaps divided by the number of alignment columns. -// -//---------- -// -// Arguments: -// alignel* a: The alignment of interest. -// unspos* numer, denom: Place to return the continuity fraction. Note -// .. that the returned denominator might be zero, -// .. and that the fraction may be greater than 1. -// -// Returns: -// (nothing) -// -//---------- - -void alignment_continuity - (alignel* a, - unspos* _numer, - unspos* _denom) - { - unspos gapColumns, nonGapColumns; - - alignment_gap_rate (a, &gapColumns, &nonGapColumns); - - *_numer = nonGapColumns; - *_denom = nonGapColumns + gapColumns; - } - -//---------- -// -// alignment_gap_rate-- -// Compute the gap rate of an gapped alignment block. This is the number of -// bases aligned to gaps divided by the number of aligned bases. -// -//---------- -// -// Arguments: -// alignel* a: The alignment of interest. -// unspos* numer, denom: Place to return the gap rate fraction. Note -// .. 
that the returned denominator might be zero, -// .. and that the fraction may be greater than 1. -// -// Returns: -// (nothing) -// -//---------- - -void alignment_gap_rate - (alignel* a, - unspos* _numer, - unspos* _denom) - { - unspos beg1 = a->beg1; - unspos beg2 = a->beg2; - unspos height, width, i, j; - u32 opIx; - unspos run; - unspos denom, gappedBases; - - height = a->end1 - beg1 + 1; - width = a->end2 - beg2 + 1; - - denom = 0; - opIx = 0; - for (i=j=0 ; (i< height)||(jscript, &opIx); - i += run; j += run; - - denom += run; - - if ((i < height) || (j < width)) - edit_script_indel_len (a->script, &opIx, &i, &j); - } - - if (denom == 0) - { *_numer = *_denom = 0; return; } - - gappedBases = (height - denom) + (width - denom); - - *_numer = gappedBases; - *_denom = denom; - } - diff --git a/programs/lastz/src/continuity_dist.h b/programs/lastz/src/continuity_dist.h deleted file mode 100644 index 6d0076a..0000000 --- a/programs/lastz/src/continuity_dist.h +++ /dev/null @@ -1,42 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: continuity_dist.h -// -//---------- - -#ifndef continuity_dist_H // (prevent multiple inclusion) -#define continuity_dist_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "sequences.h" // sequence stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef continuity_dist_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// prototypes for routines in continuity_dist.c -// -//---------- - -alignel* filter_aligns_by_continuity (alignel* alignList, - float minContinuity, float maxContinuity); -alignel* filter_aligns_by_num_gaps (alignel* alignList, s32 maxSeparateGapsCount); -alignel* filter_aligns_by_num_gap_columns (alignel* alignList, s32 maxGapColumnsCount); -void 
alignment_continuity (alignel* a, - unspos* numer, unspos* denom); -void alignment_gap_rate (alignel* a, - unspos* numer, unspos* denom); - -#undef global -#endif // continuity_dist_H diff --git a/programs/lastz/src/coverage_dist.c b/programs/lastz/src/coverage_dist.c deleted file mode 100755 index 100a474..0000000 --- a/programs/lastz/src/coverage_dist.c +++ /dev/null @@ -1,339 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: coverage_dist.c -// -//---------- -// -// coverage_dist-- -// Support for collecting the query coverage distribution from alignments. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -#define coverage_dist_owner // (make this the owner of its globals) -#include "coverage_dist.h" // interface to this module - -//---------- -// -// filter_aligns_by_coverage-- -// Filter a list of alignments, removing any alignment that has coverage -// percentage outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// alignel* alignList: The list of alignments to operate upon. -// float minCoverage, The range of query coverage that we will *keep*. -// maxCoverage .. These are values between 0 and 1. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// -// Notes: -// (1) The denominator is the length of whichever sequence is shorter, and the -// numerator is the length of the alignment in that sequence. 
-// (2) Memory for alignments that don't make the cut is deallocated here. -// (3) The returned list of alignments is in the same order as the incoming -// .. list. -// (4) Coverage is counted over all aligned bases, included those aligned to -// .. gaps. -// -//---------- - -alignel* filter_aligns_by_coverage - (seq* seq1, - seq* seq2, - alignel* alignList, - float minCoverage, - float maxCoverage) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - float minCov, maxCov; - - // process each alignment, collecting a list of those that are long enough - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - // if the alignment isn't long enough, skip it - - alignment_coverage (seq1, seq2, a, &numer, &denom); - - minCov = denom * minCoverage; - maxCov = denom * maxCoverage; - - if ((numer < minCov) || (numer > maxCov)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_coverage a->script", a->script); - free_if_valid ("filter_aligns_by_coverage a", a); - continue; - } - - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// alignment_coverage-- -// Measure an alignment's coverage percentage. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// alignel* a: The alignment to measure. -// unspos* numer, denom: Place to return the coverage -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// (1) The denominator is the length of whichever sequence is shorter, and the -// numerator is the length of the segment. 
-// -//---------- - -void alignment_coverage - (seq* seq1, - seq* seq2, - alignel* a, - unspos* numer, - unspos* denom) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos seq1Len, seq2Len; - - if (!seq1->needTrueLen) - suicidef ("internal error, in alignment_coverage, target does not have true lengths"); - - if (!seq2->needTrueLen) - suicidef ("internal error, in alignment_coverage, query does not have true lengths"); - - // determine sequence lengths for this alignment - - if (sp1->p == NULL) - seq1Len = seq1->trueLen; - else - { - part = lookup_partition (seq1, a->beg1-1); - seq1Len = part->trueLen; - } - - if (sp2->p == NULL) - seq2Len = seq2->trueLen; - else - { - part = lookup_partition (seq2, a->beg2-1); - seq2Len = part->trueLen; - } - - // use shorter sequence as denominator - - if (seq1Len < seq2Len) { *numer = a->end1+1 - a->beg1; *denom = seq1Len; } - else { *numer = a->end2+1 - a->beg2; *denom = seq2Len; } - } - -//---------- -// -// filter_segments_by_coverage-- -// Filter a table of segments, removing any segment that has coverage -// percentage outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segtable* st: The segment table to operate upon. -// float minCoverage, The range of query coverage that we will *keep*. -// maxCoverage .. These are values between 0 and 1. 
-// -// Returns: -// (nothing) -// -//---------- - -void filter_segments_by_coverage - (seq* seq1, - seq* seq2, - segtable* st, - float minCoverage, - float maxCoverage) - { - segment* srcSeg, *dstSeg; - unspos numer, denom; - float minCov, maxCov; - - if (st == NULL) return; -// if (st->seg == NULL) return; test not necessary st->seg is never NULL - - // process each segment, moving those that are long enough to the first - // part of the list - - for (dstSeg=srcSeg=st->seg ; ((u32)(srcSeg-st->seg))len ; srcSeg++) - { - // if the segment isn't long enough, skip it - - segment_coverage (seq1, seq2, srcSeg, &numer, &denom); - - minCov = denom * minCoverage; - maxCov = denom * maxCoverage; - - if ((numer < minCov) || (numer > maxCov)) - continue; // (unwanted segment, skip it) - - // copy the segment - - if (srcSeg != dstSeg) *dstSeg = *srcSeg; - dstSeg++; - } - - // set the new length of the list - - st->len = dstSeg - st->seg; - } - -//---------- -// -// filter_segment_by_coverage-- -// Filter a segment, reporting whether that segment has coverage percentage -// outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segment* seg: The segment to consider. Only pos1, pos2, and -// .. length are required. -// float minCoverage, The range of query coverage that we will *keep*. -// maxCoverage .. These are values between 0 and 1. -// -// Returns: -// true if the segment if outside the specified range (i.e. 
that it fails to -// pass the filter, and should be discarded) -// -//---------- - -int filter_segment_by_coverage - (seq* seq1, - seq* seq2, - segment* seg, - float minCoverage, - float maxCoverage) - { - unspos numer, denom; - float minCov, maxCov; - - segment_coverage (seq1, seq2, seg, &numer, &denom); - minCov = denom * minCoverage; - maxCov = denom * maxCoverage; - - if ((numer < minCov) || (numer > maxCov)) - return true; // (unwanted segment, skip it) - - return false; - } - -//---------- -// -// segment_coverage-- -// Measure a segment's coverage percentage. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segment* seg: The segment to measure. -// unspos* numer, denom: Place to return the coverage -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// (1) The denominator is the length of whichever sequence is shorter, and the -// numerator is the length of the segment. -// -//---------- - -void segment_coverage - (seq* seq1, - seq* seq2, - segment* seg, - unspos* numer, - unspos* denom) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos seq1Len, seq2Len; - - if (!seq1->needTrueLen) - suicidef ("internal error, in segment_coverage, target does not have true lengths"); - - if (!seq2->needTrueLen) - suicidef ("internal error, in segment_coverage, query does not have true lengths"); - - // determine sequence lengths for this segment - - if (sp1->p == NULL) - seq1Len = seq1->trueLen; - else - { - part = lookup_partition (seq1, seg->pos1); - seq1Len = part->trueLen; - } - - if (sp2->p == NULL) - seq2Len = seq2->trueLen; - else - { - part = lookup_partition (seq2, seg->pos2); - seq2Len = part->trueLen; - } - - // use shorter sequence as denominator - - *numer = seg->length; - *denom = (seq1Len < seq2Len)? 
seq1Len - : seq2Len; - } - diff --git a/programs/lastz/src/coverage_dist.h b/programs/lastz/src/coverage_dist.h deleted file mode 100644 index c67980d..0000000 --- a/programs/lastz/src/coverage_dist.h +++ /dev/null @@ -1,44 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: coverage_dist.h -// -//---------- - -#ifndef coverage_dist_H // (prevent multiple inclusion) -#define coverage_dist_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "sequences.h" // sequence stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef coverage_dist_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// prototypes for routines in coverage_dist.c -// -//---------- - -alignel* filter_aligns_by_coverage (seq* seq1, seq* seq2, alignel* alignList, - float minCoverage, float maxCoverage); -void alignment_coverage (seq* seq1, seq* seq2, alignel* a, - unspos* numer, unspos* denom); -void filter_segments_by_coverage (seq* seq1, seq* seq2, segtable* st, - float minCoverage, float maxCoverage); -int filter_segment_by_coverage (seq* seq1, seq* seq2, segment* seg, - float minCoverage, float maxCoverage); -void segment_coverage (seq* seq1, seq* seq2, segment* seg, - unspos* numer, unspos* denom); - -#undef global -#endif // coverage_dist_H diff --git a/programs/lastz/src/diag_hash.c b/programs/lastz/src/diag_hash.c deleted file mode 100755 index 6ffb1bf..0000000 --- a/programs/lastz/src/diag_hash.c +++ /dev/null @@ -1,373 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: diag_hash.c -// -//---------- -// -// diag_hash-- -// Support for managing a hash table of values associated with alignment -// diagonals. 
-// -// Our search algorithms need to track hits on each diagonal to detect -// overlapping hits and nearby multiple hits. For each diagonal, we store the -// position in the second sequence of the end of most recent hit (pos2), and in -// some cases we also store the start position of a run of overlapping or -// nearly-overlapping hits. -// -// Ideally we would have an entry for each possible diagonal. But with -// sequence lengths approaching a quarter billion, such an array would require -// too much memory (memory usage is nominally 16*(L1+L2) bytes, so two .25Gbase -// sequences would require 8Gbytes). Instead, we use a hash array much smaller -// than the number of hits likely to be "active" at any one time. It is -// critical that we store pos2 for this purpose rather than pos1, because pos2 -// always increases during the search while pos1 hops around the sequence. By -// guaranteeing the order that we (effectively) scan each diagonal, we limit -// our exposure to collisions in the hash. We also store the number of the -// actual diagonal associated with each hash location, so that we can detect -// collisions. -// -// The effect of a collision is that a hit that should be combined with a -// nearby one to its left won't be. Because the second sequence is being -// scanned from left to right, a hit stays "dangerous" (i.e., it can cover up -// a subsequent hit on a different hash-equivalent diagonal) only until the -// sweep along sequence 2 reaches pos2 for the end of that hit. If we were -// storing pos1, a hit could stay dangerous for the entire sweep. -// -// The hash funtion used simply uses some number of the least significant bits -// of the diagonal. For this reason the hash size H (diagHashSize) must be a -// power of 2. Hash-equivalent diagonals are not random but are separated by H -// intervening diagonals. 
-// -// Helpful formulae: -// diag = pos1 - pos2 -// pos1 = diag + pos2 -// pos2 = pos1 - diag -// -// We could reduce memory use by using two (or more) hashes with different -// moduli (relatively prime). The tradeoff is that we'd be spending more time -// computing the hash values and reading/writing them from/to the hash arrays. -// -// Allocation size... By default we allocate 65K entries for these tables. In -// a 31-bit sequence index enviroment, this works out to 1M bytes (4 tables -// with 32-bit entries => 4*4*65K = 1M). This can be overridden at compile -// time by defining diag_hash_size (to some power of 2). For example, setting -// diag_hash_size to 4M in a 32-bit sequence index enviroment works out to 84M -// (3 tables with-32 bit entries and one with 64-bit entries => 5*4*4M = 84M). -// -//---------- -// -// Seed hit queue -// ============== -// -// The seed hit queue keeps track of the N most recent seed hits, searchable by -// hashed diagonal. It is assumed that N is large enough that even in the worst -// case we have all the hits for the most recent W words in seqeunce 2, where W -// is the span allowed for twin seed hits. -// -// The queue is implemented as a simple cyclic array with a write head that -// wraps around. This saves us the effort of disposing of stale hit records; -// they are just overwritten. Within the queue, we keep a linked list of the -// hits belonging to each hashed diagonal. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C value limit stuff -#include "build_options.h" // build options - -#define diag_hash_owner // (make this the owner of its globals) -#include "diag_hash.h" // interface to this module - -// debugging defines - -//#define debugDiag 23420-14467 // if defined, breakdown what happens on this - // .. 
pos1-pos2 diagonal - -//---------- -// -// prototypes for private functions -// -//---------- - -//static void dump_seed_hit_queue (void); - -//---------- -// -// empty_diag_hash-- -// Create or re-create an empty diagonals hash and seed hit queue. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// -// (1) This routine allocates and reuses memory via global pointers. The -// caller should make a call to free_diag_hash() to de-allocate this -// memory when it is no longer needed. -// -//---------- - -void empty_diag_hash (void) - { - u32 hDiag; - -#ifndef noSeedHitQueue - //printf ("\n(erasing seed hit queue)\n"); -#endif // not noSeedHitQueue - - // allocate (or re-use) memory; note that only diagEnd requires that we - // fill it with zeros - - if (diagEnd == NULL) - { - // allocate new array - // caveat: sometimes we don't need all of this (e.g. process_for_simple_hit) - // .. so allocating all these together is wasteful - - diagEnd = malloc_or_die ("empty_diag_hash (diagEnd)", - diagHashSize * sizeof(unspos)); - diagStart = malloc_or_die ("empty_diag_hash (diagStart)", - diagHashSize * sizeof(unspos)); - diagActual = malloc_or_die ("empty_diag_hash (diagActual)", - diagHashSize * sizeof(sgnpos)); - diagActive = malloc_or_die ("empty_diag_hash (diagActive)", - diagHashSize * sizeof(u32)); - - for (hDiag=0 ; hDiag 0) - { - seedHitQueue = zalloc_or_die ("empty_diag_hash (seedHitQueue)", - seedHitQueueSize * sizeof(shqhit)); - lastSeedHit = zalloc_or_die ("empty_diag_hash (lastSeedHit)", - diagHashSize * sizeof(u64)); - seedHitNum = seedHitQueueSize; - } - else - { - seedHitQueue = NULL; - lastSeedHit = NULL; - } -#endif // not noSeedHitQueue - } - else - { - // clear the previously-allocated data - -#ifndef noSeedHitQueue - if (seedHitQueueSize > 0) - { - while (numDiagActive > 0) - { - hDiag = diagActive[--numDiagActive]; - diagEnd[hDiag] = hashInactiveEnd; - lastSeedHit[hDiag] = 0; -#ifdef debugDiag 
- if (hDiag == hashedDiag (debugDiag,0)) - printf ("insq: (diag %9s|%9s|%04X) end is %d\n", - "","", hDiag, hashInactiveEnd); -#endif - } - seedHitNum = seedHitQueueSize; - } - else - { -#endif // not noSeedHitQueue - while (numDiagActive > 0) - { - hDiag = diagActive[--numDiagActive]; - diagEnd[hDiag] = hashInactiveEnd; -#ifdef debugDiag - if (hDiag == hashedDiag (debugDiag,0)) - printf ("isq: (diag %9s|%9s|%04X) end is %d\n", - "","", hDiag, hashInactiveEnd); -#endif - } -#ifndef noSeedHitQueue - } -#endif // not noSeedHitQueue - } - - } - -//---------- -// -// free_diag_hash-- -// Dispose of the diagonals hash and seed hit queue. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void free_diag_hash - (void) - { - free_if_valid ("free_diag_hash (diagEnd)", diagEnd); diagEnd = NULL; - free_if_valid ("free_diag_hash (diagStart)", diagStart); diagStart = NULL; - free_if_valid ("free_diag_hash (diagActual)", diagActual); diagActual = NULL; - free_if_valid ("free_diag_hash (diagActive)", diagActive); diagActive = NULL; - -#ifndef noSeedHitQueue - free_if_valid ("free_diag_hash (seedHitQueue)", seedHitQueue); seedHitQueue = NULL; - free_if_valid ("free_diag_hash (lastSeedHit)", lastSeedHit); lastSeedHit = NULL; -#endif // not noSeedHitQueue - } - -//---------- -// -// enqueue_seed_hit-- -// Record a seed hit in the seed hit queue. -// -// Nota Bene: we name this _enqueue_seed_hit here so that that a macro in -// .. diaghash.h can make this a conditional call -// -//---------- -// -// Arguments: -// unspos pos1: The hit position in sequence 1. -// unspos pos2: The hit position in sequence 2. 
-// int isBlock: true => this is an end-of-extension -// false => this is a seed hit -// -// Returns: -// (nothing) -// -//---------- -// -// Assumptions: -// seedHitQueueSize > 0 -// lastSeedHit != NULL -// seedHitQueue != NULL -// -//---------- - -#ifndef noSeedHitQueue - -void _enqueue_seed_hit - (unspos pos1, - unspos pos2, - int isBlock) - { - static int haveShortfall = false; - sgnpos diag; - u32 hDiag; - shqhit* q; - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - - //printf ("\n[%04X] adding " sgnposFmt "/" unsposFmt "\n", - // hDiag, diag, pos2); - - seedHitNum++; - q = &seedHitQueue[seedHitNum % seedHitQueueSize]; - - if (seedHitNum > (u32) (2*seedHitQueueSize)) - { - //printf ("[%04X] removing " sgnposFmt "/" unsposFmt " (" unsposFmt " columns)\n", - // hashedDiag (q->diag,0), q->diag, q->pos2, pos2-q->pos2); - if ((!haveShortfall) - && (!q->isBlock) - && (pos2 - q->pos2 <= seedHitQueueColumns)) - { - // $$$ with some work we could realloc the queue here; we'd need - // .. to rotate the entries so that the current pointer is at - // .. the old end (so new entries go into the virgin area and - // .. don't overwrite old entries); this would require updating - // .. every prevHit link - haveShortfall = true; - fprintf (stderr, - "seed hit queue shortfall at " unsposSlashFmt "\n", - (unspos) (diag+pos2), pos2); - } - } - - if (lastSeedHit[hDiag] <= seedHitNum - seedHitQueueSize) - q->prevHit = 0; // (last seed hit is stale, no longer in queue) - else - q->prevHit = lastSeedHit[hDiag]; - lastSeedHit[hDiag] = seedHitNum; - - q->isBlock = isBlock; - q->pos2 = pos2; - q->diag = diag; - - //dump_seed_hit_queue (); - } - -#endif // not noSeedHitQueue - -//---------- -// -// dump_seed_hit_queue-- -// Dump the contents of the seed hit queue. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -#ifndef noSeedHitQueue - -//static void dump_seed_hit_queue -// (void) -// { -// u32 hDiag; -// u64 ix; -// shqhit* q; -// -// for (hDiag=0 ; hDiag seedHitNum - seedHitQueueSize) -// { -// q = &seedHitQueue[ix % seedHitQueueSize]; -// printf (" " sgnposFmt "/" unsposFmt, q->diag, q->pos2); -// ix = q->prevHit; -// } -// printf ("\n"); -// } -// } - -#endif // not noSeedHitQueue - diff --git a/programs/lastz/src/diag_hash.h b/programs/lastz/src/diag_hash.h deleted file mode 100644 index be6851e..0000000 --- a/programs/lastz/src/diag_hash.h +++ /dev/null @@ -1,166 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: diag_hash.h -// -//---------- - -#ifndef diag_hash_H // (prevent multiple inclusion) -#define diag_hash_H - -// other files - -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff - -// establish ownership of global variables - -#ifdef diag_hash_owner -#define global -#else -#define global extern -#endif - -// debugging defines - -//#define noSeedHitQueue // if this is defined, the seed hit queue is - // .. NOT used for finding twin seed hits (they - // .. 
are found using other techniques) - -//---------- -// -// diagonal hash-- -// (see diag_hash.c for details) -// -// Helpful formulas: -// diag = pos1 - pos2 diagNumber(pos1,pos2) -// pos1 = diag + pos2 diagToPos1(diag,pos2) -// pos2 = pos1 - diag diagToPos2(diag,pos1) -// -//---------- - -// note that diagHashSize must be a power of two - -#ifdef puny_diag_hash // test case; very small hash forces collisions -#define diagHashSize ((u32) 16) -#else -#ifdef tiny_diag_hash // test case; small hash forces collisions -#define diagHashSize ((u32) 1024) -#else -#ifdef huge_diag_hash // test case; no collisions on 1M vs 1M -#define diagHashSize ((u32) 4194304) -#else -#ifdef diag_hash_size // override; beware: diag_hash_size must be a power of 2 -#define diagHashSize ((u32) diag_hash_size) -#else // normal case -#define diagHashSize ((u32) 65536) -#endif -#endif -#endif -#endif - -#define diagNumber(pos1,pos2) (((sgnpos)(pos1))-((sgnpos)(pos2))) -#define hashedDiag(pos1,pos2) ((diagNumber(pos1,pos2)) & (diagHashSize-1)) -#define diagToPos1(diag,pos2) (((unspos)(diag))+((unspos)(pos2))) -#define diagToPos2(diag,pos1) (((unspos)(pos1))-((unspos)(diag))) - -//--- initialized variables for this module --- - -#ifdef diag_hash_owner -unspos* diagEnd = NULL; // arrays (indexed by 0..diagHashSize-1) to -unspos* diagStart = NULL; // .. track the extent of discovered hits on a -sgnpos* diagActual = NULL; // .. given diagonal (modulo diagHashSize); - // .. no values in any of these arrays are valid - // .. if the corresponding value in diagEnd is - // .. hashInactiveEnd; diagEnd (in addition to - // .. indicating validity) records the end of - // .. of the most recent hit or extension, as a - // .. position in sequence 2; diagStart records - // .. the start of the most recent hit (or - // .. series of overlapping hits in some cases); - // .. diagActual records the actual diagonal - // .. represented (this is used only to - // .. 
detect/resolve collisions) - -u32* diagActive = NULL; // array (indexed by 0..numdiagActive-1) of -int numDiagActive; // .. positions in diagEnd and diagStart that - // .. have a non-zero value - -//--- external access to the variables, for other modules --- - -#else -extern unspos* diagEnd; -extern unspos* diagStart; -extern sgnpos* diagActual; -extern u32* diagActive; -extern int numDiagActive; -#endif - -// macros for accessing the hash - -#define activate_hashed_diag(h) diagActive[numDiagActive++] = h; -#define hashInactiveEnd ((unspos) -1) - -//---------- -// -// seed hit queue-- -// (see diag_hash.c for details) -// -//---------- - -#ifndef noSeedHitQueue - -#define defaultSeedHitQueueSize 256*1024 - -typedef struct shqhit // seed hit queue element - { - u64 prevHit; // the number of the most recent seed hit on - // .. this hashed diagonal; 0 indicates no hit - int isBlock; // true => this is an end-of-extension - // false => this is a seed hit - unspos pos2; // the position of the seed hit in sequence 2; - // .. this is the position following the end of - // .. the hit (or extension) - sgnpos diag; // the diagonal containing the seed hit - } shqhit; - -#ifdef diag_hash_owner -int seedHitQueueSize = 0; // (N) number of entries in seedHitQueue[] -shqhit* seedHitQueue = NULL; // the most recent N hits, as a cyclic queue -u64* lastSeedHit = NULL; // the number of the most recent seed hit on - // .. each hashed diagonal; lastSeedHit mod N - // .. is the index into seedHitQueue[] of that - // .. seed hit; 0 indicates no hit -u32 seedHitQueueColumns = 0;// the desired number of sequence 2 positions - // .. covered by the queue; this is only used - // .. to generate a warning if we drop below it -u64 seedHitNum; // identifying number of the most recent seed - // .. 
hit (the first seed hit is number N+1) -#else -extern int seedHitQueueSize; -extern shqhit* seedHitQueue; -extern u64* lastSeedHit; -extern u32 seedHitQueueColumns; -extern u64 seedHitNum; -#endif - -#endif // not noSeedHitQueue - -//---------- -// -// prototypes for routines in diag_hash.c -// -//---------- - -void empty_diag_hash (void); -void free_diag_hash (void); - -#ifndef noSeedHitQueue -void _enqueue_seed_hit (unspos pos1, unspos pos2, int isBlock); -#ifndef diag_hash_owner -#define enqueue_seed_hit(pos1,pos2,isBlock) \ - if (seedHitQueueSize > 0) _enqueue_seed_hit(pos1,pos2,isBlock) -#endif -#endif // not noSeedHitQueue - -#undef global -#endif // diag_hash_H diff --git a/programs/lastz/src/dna_utilities.c b/programs/lastz/src/dna_utilities.c deleted file mode 100755 index 3a3c82d..0000000 --- a/programs/lastz/src/dna_utilities.c +++ /dev/null @@ -1,3128 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: dna_utilities.c -// -//---------- -// -// dna_utilities-- -// Utility functions relating to DNA. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C i/o stuff -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C math stuff -#include "build_options.h" // build options - -#define dna_utilities_owner // (make this the owner of its globals) -#include "dna_utilities.h" // interface to this module - -// debugging defines - -//#define bottleneckBiasOK // if this is defined, the mapping from quantum - // .. symbols to best scoring bottleneck char - // .. will be biased toward earlier chars in the - // .. 
alphabet - -//---------- -// -// globally available data -// -//---------- - -// nucleotide encoding-- -// nuc_to_bits maps an ascii character to a 2 bit nucleotide code; the 2-bit -// coding is designed so that the following are true: -// nuc_to_bits['A'] xor nuc_to_bits['G'] is 2 -// nuc_to_bits['C'] xor nuc_to_bits['T'] is 2 -// Do not change this code without maintaining that relationship. - -#define __ -1 -#define A_ 0 -#define C_ 1 -#define G_ 2 -#define T_ 3 - -const s8 nuc_to_bits[256] = - { - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 0x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 1x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 2x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 3x (numbers) - __,A_,__,C_,__,__,__,G_,__,__,__,__,__,__,__,__, // 4x (upper case) - __,__,__,__,T_,__,__,__,__,__,__,__,__,__,__,__, // 5x (upper case) - __,A_,__,C_,__,__,__,G_,__,__,__,__,__,__,__,__, // 6x (lower case) - __,__,__,__,T_,__,__,__,__,__,__,__,__,__,__,__, // 7x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 8x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 9x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ax - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Bx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Cx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Dx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ex - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__ // Fx - }; - -const s8 upper_nuc_to_bits[256] = - { - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 0x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 1x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 2x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 3x (numbers) - __,A_,__,C_,__,__,__,G_,__,__,__,__,__,__,__,__, // 4x (upper case) - __,__,__,__,T_,__,__,__,__,__,__,__,__,__,__,__, // 5x (upper case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 6x (lower case) - 
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 7x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 8x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 9x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ax - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Bx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Cx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Dx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ex - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__ // Fx - }; - -const u8* bits_to_nuc = (u8*) "ACGT"; -const u8* bit_to_pur_pyr = (u8*) "RY"; // purine (AG) or pyramidine (CT) -const u8* bits_to_pur_pyr = (u8*) "RYRY"; // purine (AG) or pyramidine (CT) - - -const u8 nuc_to_complement[256] = // assumes upper/lower iupac code - { - 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, // 0x - 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, // 1x - 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, // 2x - 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, // 3x (numbers) - 0x40,'T', 'V', 'G', 'H', 0x45,0x46,'C', 'D', 0x49,0x4A,'M', 0x4C,'K', 'N' ,0x4F, // 4x (upper case) - 0x50,0x51,'Y', 'S', 'A', 0x55,'B', 'W', 0x58,'R', 0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, // 5x (upper case) - 0x60,'t', 'v', 'g', 'h', 0x65,0x66,'c', 'd', 0x69,0x6a,'m', 0x6c,'k', 'n' ,0x6f, // 6x (lower case) - 0x70,0x71,'y', 's', 'a', 0x75,'b', 'w', 0x78,'r', 0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, // 7x (lower case) - 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, // 8x - 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, // 9x - 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, // Ax - 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, // Bx - 
0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, // Cx - 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, // Dx - 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, // Ex - 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF // Fx - }; - - // A C G T -const u8 bits_to_complement[4] = { T_, G_, C_, A_ }; -#define A_ 0 -#define C_ 1 -#define G_ 2 -#define T_ 3 - -#undef __ -#undef A_ -#undef C_ -#undef G_ -#undef T_ -#undef ___ - -// default substitution scores - -score HOXD70[4][4] = - {// A C G T - /* A */ { 91, -114, -31, -123 }, - /* C */ {-114, 100, -125, -31 }, - /* G */ { -31, -125, 100, -114 }, - /* T */ {-123, -31, -114, 91 }, - }; - -const score HOXD70_open = 400; -const score HOXD70_extend = 30; -const score HOXD70_X = -1000; -const score HOXD70_fill = -100; - -score unitScores[4][4] = - {// A C G T - /* A */ { 1, -1, -1, -1 }, - /* C */ {-1, 1, -1, -1 }, - /* G */ {-1, -1, 1, -1 }, - /* T */ {-1, -1, -1, 1 }, - }; - -const double unitScores_open = 3.25; // 400/123 -const double unitScores_extend = 0.24375; // 30/123 -const double unitScores_X = -10.0; -const double unitScores_fill = -1.0; -const double unitScores_thresh = 30.0; - -//---------- -// -// prototypes for private functions -// -//---------- - -// macro to test membership in a string - -#define in_string(ch,str) (((ch)!=0) && (strchr(((char*)(str)),(ch))!=NULL)) -#define not_in_string(ch,str) (((ch)==0) || (strchr(((char*)(str)),(ch))==NULL)) - -// real functions - -static exscoreset* create_extended_score_set (void); -static scoreset* create_score_set (void); -static char* quantum_visual (int ch); - -//---------- -// -// new_dna_score_set-- -// Create a new score set. -// -//---------- -// -// Arguments: -// score template[4][4]: The template containing the scores, with rows -// .. and columns corresponding to bits_to_nuc[]. -// .. 
A row corresponds to a character in sequence -// .. 1 and a column corresponds to a character in -// .. sequence 2. This can be NULL if the caller -// .. doesn't care about initial scores. -// score badScore: The score to use for row and column 'X'. -// score fillScore: The score to use for all other rows and -// .. columns. -// score gapOpen, gapExtend: Gap scoring parameters. -// -//---------- -// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. The routine free() should be used for this purpose. -// -//---------- -// -// Notes: -// (1) In the resulting scoring matrix, upper and lower case characters are -// are considered identical, so entries for lower case are copied from -// upper case. -// -//---------- - -scoreset* new_dna_score_set - (score template[4][4], - score badScore, - score fillScore, - score gapOpen, - score gapExtend) - { - scoreset* ss; - u8* s, *d; - int r, c, rowCh, colCh, rowLower, colLower; - int len; - - ////////// - // allocate the score set - ////////// - - // allocate - - ss = create_score_set (); - - // set character set - - ustrcpy (ss->rowChars, bits_to_nuc); - s = ss->rowChars; len = ustrlen(s); d = s + len; - for ( ; len>0 ; s++,len--) - { *(d++) = dna_tolower(*s); *d = 0; } - - ustrcpy (ss->colChars, bits_to_nuc); - s = ss->colChars; len = ustrlen(s); d = s + len; - for ( ; len>0 ; s++,len--) - { *(d++) = dna_tolower(*s); *d = 0; } - - ss->badRow = ss->badCol = 'X'; - - ss->rowsAreDna = true; - ss->colsAreDna = true; - - // set gap scoring parameters - - ss->gapOpen = gapOpen; - ss->gapExtend = gapExtend; - - ////////// - // fill the array with a filler score and make sure scores for row and - // column zero are very very bad - ////////// - - // fill row 0 - - for (c=0 ; c<256 ; c++) - ss->sub[0][c] = veryBadScore; - - // fill row 1 - - ss->sub[1][0] = veryBadScore; - for (c=1 ; c<256 ; c++) - ss->sub[1][c] = fillScore; - - // copy row 1 to the remaining rows - - 
for (r=2 ; r<256 ; r++) - memcpy (/*to*/ ss->sub[r], /*from*/ ss->sub[1], - /*how much*/ sizeof(scorerow)); - - ////////// - // set up the remaining rows - ////////// - - // fill in X scores - - for (c=0 ; c<256 ; c++) - { - ss->sub['X'][ c ] - = ss->sub['x'][ c ] - = ss->sub[ c ]['X'] - = ss->sub[ c ]['x'] = badScore; - } - - // copy scores from the template - - if (template != NULL) - { - for (r=0 ; r<4 ; r++) - for (c=0 ; c<4 ; c++) - { - rowCh = bits_to_nuc[r]; - colCh = bits_to_nuc[c]; - rowLower = dna_tolower(rowCh); - colLower = dna_tolower(colCh); - - ss->sub[rowCh ][colCh ] = template[r][c]; - ss->sub[rowCh ][colLower] = template[r][c]; - ss->sub[rowLower][colCh ] = template[r][c]; - ss->sub[rowLower][colLower] = template[r][c]; - } - } - - return ss; - } - -//---------- -// -// create_score_set, create_extended_score_set-- -// Create a new score set but don't initialize it. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. The routine free_score_set() should be used for -// this purpose. -// -// Notes: -// - The internal rows of the scoring matrix are *not* initialized. 
-// -//---------- - -static scoreset* create_either_score_set (int isExtended); - - -static exscoreset* create_extended_score_set () - { return (exscoreset*) create_either_score_set (true); } -static scoreset* create_score_set () - { return create_either_score_set (false); } - - -static scoreset* create_either_score_set - (int isExtended) - { - scoreset* ss; - u32 bytesNeeded; - int r; - - // allocate - - if (isExtended) bytesNeeded = round_up_8 (sizeof(exscoreset)); - else bytesNeeded = round_up_8 (sizeof(scoreset)); - ss = (scoreset*) zalloc_or_die ("create_score_set", bytesNeeded); - - // initialize - - ss->rowChars[0] = 0; - ss->colChars[0] = 0; - ss->badRow = ss->badCol = -1; - - ss->gapOpenSet = false; - ss->gapExtendSet = false; - - ss->bottleneck[0] = 0; - for (r=0 ; r<256 ; r++) ss->qToBest[r].len = -1; - - ss->qToComplement = NULL; - - // return the score set - - return ss; - } - -//---------- -// -// free_score_set-- -// Deallocate a score set, along with any associated memory. -// -//---------- -// -// Arguments: -// char* id: an identifying string to be used when trackMemoryUsage is -// .. turned on; this can be NULL. -// scoreset* ss: The score set to dispose of. -// -// Returns: -// (nothing) -// -//---------- - -void free_score_set - (char* id, - scoreset* ss) - { - char* message = " (qToComplement)"; - char temp[200]; - - if (ss == NULL) return; - if (id == NULL) id = "free_score_set"; - - if (strlen(id) + strlen(message) + 1 > sizeof(temp)) - free_if_valid (id, ss->qToComplement); - else - { - strcpy (temp, id); - strcpy (temp + strlen(id), message); - free_if_valid (temp, ss->qToComplement); - } - - free_if_valid (id, ss); - } - -//---------- -// -// copy_score_set-- -// Create a copy of a score set. -// -//---------- -// -// Arguments: -// scoreset* ss: The score set to copy. -// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. 
The routine free() should be used for this purpose. -// -//---------- - -scoreset* copy_score_set - (scoreset* ss) - { - scoreset* ssNew; - int r, c; - - // alloacte new score set - - ssNew = create_score_set (); - - if (ss->qToComplement != NULL) - ssNew->qToComplement = (u8*) malloc_or_die ("copy_score_set (qToComplement)", 256); - - // copy gap scoring parameters - - ssNew->gapOpen = ss->gapOpen; - ssNew->gapOpenSet = ss->gapOpenSet; - - ssNew->gapExtend = ss->gapExtend; - ssNew->gapExtendSet = ss->gapExtendSet; - - // copy character set - - ssNew->badRow = ss->badRow; - ssNew->badCol = ss->badCol; - - ustrcpy (ssNew->rowChars, ss->rowChars); - ustrcpy (ssNew->colChars, ss->colChars); - - ssNew->rowsAreDna = ss->rowsAreDna; - ssNew->colsAreDna = ss->colsAreDna; - - ustrcpy (ssNew->bottleneck, ss->bottleneck); - for (r=0 ; r<256 ; r++) ssNew->qToBest[r] = ss->qToBest[r]; - - if (ss->qToComplement != NULL) - { - for (c=0 ; c<256 ; c++) - ssNew->qToComplement[c] = ss->qToComplement[c]; - } - - // copy substitution scores - - memcpy (/*to*/ ssNew->sub, /*from*/ ss->sub, /*how much*/ sizeof(ss->sub)); - - return ssNew; - } - -//---------- -// -// masked_score_set-- -// Create a copy of a score set, with all lower case entries given 'bad' -// scores. -// -//---------- -// -// Arguments: -// scoreset* ss: The score set to copy. Columns can be either DNA or -// .. quantum DNA, but rows must be DNA. -// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. The routine free() should be used for this purpose. -// -//---------- - -scoreset* masked_score_set - (scoreset* ss) - { - score badScore; - scoreset* ssNew; - int r, c, goodRow; - u8* rr, *cc, *d; - int nIsARow, nIsACol; - - // $$$ the tests for rowsAreDna and colsAreDna should be replaced by - // $$$ .. a new field indicating whether rows/cols are maskable, and - // $$$ .. 
which characters survive masking - - // copy the score set and reduce copy's good characters to upper case - - ssNew = copy_score_set (ss); - - if (ss->rowsAreDna) - { - d = ssNew->rowChars; - for (rr=ss->rowChars ; *rr!=0 ; rr++) - { if (dna_isupper (*rr)) { *(d++) = *rr; *d = 0; } } - } - - if (ss->colsAreDna) - { - d = ssNew->colChars; - for (cc=ss->colChars ; *cc!=0 ; cc++) - { if (dna_isupper (*cc)) { *(d++) = *cc; *d = 0; } } - } - - // mask the copy; fill each lower case row or column with a bad score - - goodRow = ss->rowChars[0]; - badScore = ss->sub[goodRow][ss->badCol]; - - if (ss->rowsAreDna) - { - nIsARow = (ustrchr (ssNew->rowChars, 'N') != NULL); - - for (rr=ss->rowChars ; *rr!=0 ; rr++) - if (!dna_isupper (*rr)) - { for (c=1 ; c<256 ; c++) ssNew->sub[*rr][c] = badScore; } - if (!nIsARow) for (c=1 ; c<256 ; c++) ssNew->sub['N'][c] = badScore; - for (c=1 ; c<256 ; c++) ssNew->sub['n'][c] = badScore; - for (c=1 ; c<256 ; c++) ssNew->sub['X'][c] = badScore; - } - - if (ss->colsAreDna) - { - nIsACol = (ustrchr (ssNew->colChars, 'N') != NULL); - - for (cc=ss->colChars ; *cc!=0 ; cc++) - if (!dna_isupper (*cc)) - { for (r=1 ; r<256 ; r++) ssNew->sub[r][*cc] = badScore; } - if (!nIsACol) for (r=1 ; r<256 ; r++) ssNew->sub[r]['N'] = badScore; - for (r=1 ; r<256 ; r++) ssNew->sub[r]['n'] = badScore; - for (r=1 ; r<256 ; r++) ssNew->sub[r]['X'] = badScore; - } - - return ssNew; - } - -//---------- -// -// read_score_set_by_name, read_score_set-- -// Read a new score set from a file (see format description below). -// -//---------- -// -// Arguments: -// FILE* f: (read_score_set only) The file that scoring data is -// .. to be read from. This should already be open for -// .. text read. -// char* name: The name of the file that scoring data is to be read -// .. from. For read_score_set this is only used for -// .. reporting problems to the user (and may be NULL). 
-// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. The routine free() should be used for this purpose. -// -//---------- -// -// Score Set File Format -// ===================== -// -// Here's an example: -// -// # This matches the default scoring set for blastz -// -// bad_score = X:-1000 # used for sub['X'][*] and sub[*]['X'] -// fill_score = -100 # used when sub[*][*] not defined -// gap_open_penalty = 30 -// gap_extend_penalty = 400 -// -// A C G T -// A 91 -114 -31 -123 -// C -114 100 -125 -31 -// G -31 -125 100 -114 -// T -123 -31 -114 91 -// -// The score set consists of a substitution matrix and other settings. The -// other settings come first. Any line may contain a comment, # is the comment -// character. -// -// Labels can either be single characters, or two-digit hexadecimal character -// values (the value 00 is not allowed). Rows and columns of the matrix need -// not have the same labels or range, so, for example, a matrix might describe -// scoring between the 15-letter ambiguity code and the 4-letter DNA code. -// -// Row labels are optional, and if absent it is presumed that they are the same -// as the column labels (and in the same order). This allows us to read blastz -// score files. -// -// For quantum alphabets, column labels may indicate reverse complement pairing. -// This is done by specifying each column as, e.g. A~T. If any labels indicate -// complement they all must. -// -// The bad_score setting is optional, and the X shown above is the character -// for which all scores will be marked as bad. A separate character can be -// specified for rows and columns by using ::. Both -// and are optional. -// -// The other settings, fill_score, gap_open_penalty and gap_extend_penalty, are -// also optional, and defaults compatible with blastz are used. -// -// Rows correspond to characters in sequence 1 and columns correspond to -// characters in sequence 2. 
-// -// Score values can be floating-point if the library is built with an -// appropriate scoreType. -// -//---------- - -static int parse_char_code (char** s, int* comp, char terminator); -static int parse_char_code_zero_ok (char** s, int* comp, char terminator); -static int parse_char_code_common (char** s, int* comp, char terminator, int zeroOk); -static int is_dna_alphabet (u8* alphabet); -static u8 two_char_as_hex (u8 ch1, u8 ch2); -static int parse_bottleneck (char* s, u8 bottleneck[5]); - - -exscoreset* read_score_set_by_name - (char* name) - { - FILE* f; - exscoreset* xss; - - if (name == NULL) - suicide ("can't open NULL file in read_score_set_by_name()"); - - f = fopen_or_die (name, "rt"); - xss = read_score_set (f, name); - fclose_if_valid (f); - - return xss; - } - - -exscoreset* read_score_set - (FILE* f, - char* _name) - { - // $$$ why is the line buffer declared as static? - static char line[256*25+1]; // (must hold 256 fields, up to 25 chars each) - char* name = _name; - u8 rowChars[256], colChars[256], colComps[256]; - u8* scanRowChars, *scanColChars; - int lineNum, len, missingEol; - int badRow, badCol, r, c, compC; - u8 bottleneck[5]; - score badScore, fillScore, gapOpen, gapExtend, - hspThreshold, gappedThreshold, xDrop, yDrop, ballScore; - float ballScoreFactor; - u32 step; - char* seed; - scorerow fillRowData; - exscoreset* xss; - char* s, *prevS, *waffle, *end; - int numRows, numCols, numFields, fieldCount, ix, iy; - int haveBottleneck, haveFillScore, haveGapOpen, haveGapExtend, - haveHspThreshold, haveGappedThreshold, - haveXDrop, haveYDrop, haveStep, haveBallScore, haveSeed; - char* valString, *scan, *colon; - int valLength, finalField, haveComps; - u8* src, *dst, ch; - - if (name == NULL) - name = "(unnamed file)"; - - ////////// - // read assignments - ////////// - - badScore = -1000; // (deafults from blastz) - fillScore = -100; - gapOpen = HOXD70_open; - gapExtend = HOXD70_extend; - - badCol = badRow = -1; - haveBottleneck = 
haveFillScore = haveGapOpen = haveGapExtend = false; - haveHspThreshold = haveGappedThreshold = false; - haveXDrop = haveYDrop = haveStep = haveBallScore = haveSeed = false; - hspThreshold = gappedThreshold = xDrop = yDrop = ballScore = 0; - ballScoreFactor = -1; - step = 0; - seed = NULL; - - lineNum = 0; - missingEol = false; - - while (fgets (line, sizeof(line), f) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way) - - if (missingEol) - suicidef ("line is too long (%s: line %d)", name, lineNum-1); - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - // trim blanks, end of line, and comments, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // if it doesn't contain an assignment, more on to the next phase - - valString = strchr (line, '='); - if (valString == NULL) break; - - // parse the assignment - - *(valString++) = 0; - trim_string (line); - trim_string (valString); - - if ((!haveBottleneck) - && (strcmp (line, "bottleneck") == 0)) - { - if (!parse_bottleneck (valString, bottleneck)) - suicidef ("invalid bottleneck alphabet (%s: line %d) %s=%s", - name, lineNum, line, valString); - haveBottleneck = true; - } - else if ((badCol == -1) - && ((strcmp (line, "bad") == 0) - || (strcmp (line, "bad_score") == 0))) - { - // parse [:[:]], no whitespace allowed - - scan = valString; - - colon = strchr (scan, ':'); - if (colon != NULL) - { - badCol = badRow = parse_char_code_zero_ok (&scan, NULL, ':'); - scan = colon+1; - if (badCol < 0) - suicidef ("invalid bad_score character code (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - - colon = strchr (scan, ':'); - if (colon != NULL) - { - badRow = parse_char_code_zero_ok (&scan, NULL, ':'); - scan = colon+1; 
- if (badRow < 0) - suicidef ("invalid bad_score character code (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - - badScore = string_to_score (scan); - } - else if ((!haveFillScore) - && ((strcmp (line, "fill") == 0) - || (strcmp (line, "fill_score") == 0))) - { - fillScore = string_to_score (valString); - haveFillScore = true; - } - else if ((!haveGapOpen) - && ((strcmp (line, "O") == 0) - || (strcmp (line, "open") == 0) - || (strcmp (line, "gap_open") == 0) - || (strcmp (line, "gap_open_penalty") == 0))) - { - gapOpen = string_to_score (valString); - haveGapOpen = true; - } - else if ((!haveGapExtend) - && ((strcmp (line, "E") == 0) - || (strcmp (line, "extend") == 0) - || (strcmp (line, "gap_extend") == 0) - || (strcmp (line, "gap_extend_penalty") == 0))) - { - gapExtend = string_to_score (valString); - haveGapExtend = true; - } - else if ((!haveHspThreshold) - && ((strcmp (line, "K") == 0) - || (strcmp (line, "hsp_thresh") == 0) - || (strcmp (line, "hsp_threshold") == 0))) - { - hspThreshold = string_to_score (valString); - haveHspThreshold = true; - } - else if ((!haveGappedThreshold) - && ((strcmp (line, "L") == 0) - || (strcmp (line, "gapped_thresh") == 0) - || (strcmp (line, "gapped_threshold") == 0))) - { - gappedThreshold = string_to_score (valString); - haveGappedThreshold = true; - } - else if ((!haveXDrop) - && ((strcmp (line, "X") == 0) - || (strcmp (line, "x_drop") == 0))) - { - xDrop = string_to_score (valString); - haveXDrop = true; - if (xDrop <= 0) - suicidef ("invalid x-drop threshold (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - else if ((!haveYDrop) - && ((strcmp (line, "Y") == 0) - || (strcmp (line, "y_drop") == 0))) - { - yDrop = string_to_score (valString); - haveYDrop = true; - if (yDrop <= 0) - suicidef ("invalid y-drop threshold (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - else if ((!haveStep) - && ((strcmp (line, "Z") == 0) - || (strcmp (line, "step") == 0))) - { - step = string_to_int 
(valString); - haveStep = true; - if (step <= 0) - suicidef ("invalid step (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - else if ((!haveBallScore) - && (strcmp (line, "ball") == 0)) - { - valLength = strlen(valString); - if ((valLength > 0) && (valString[valLength-1] == '%')) - { - ballScoreFactor = pct_string_to_double (valString); - haveBallScore = true; - if ((ballScoreFactor <= 0) || (ballScoreFactor > 1)) - suicidef ("invalid quantum ball score (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - else - { - ballScore = string_to_score (valString); - haveBallScore = true; - if (ballScore <= 0) - suicidef ("invalid quantum ball score (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - } - else if ((!haveSeed) - && (strcmp (line, "T") == 0)) - { - if (strcmp (valString, "1") == 0) - seed = copy_string ("T=1"); - else if (strcmp (valString, "2") == 0) - seed = copy_string ("T=2"); - else if (strcmp (valString, "3") == 0) - seed = copy_string ("T=3"); - else if (strcmp (valString, "4") == 0) - seed = copy_string ("T=4"); - else - suicidef ("invalid seed (%s: line %d) %s=%s", - name, lineNum, line, valString); - haveSeed = true; - } - else if ((!haveSeed) - && (strcmp (line, "seed") == 0)) - { - if ((strcmp (valString, "12of19,transition") == 0) - || (strcmp (valString, "12_of_19,transition") == 0)) - seed = copy_string ("T=1"); - else if ((strcmp (valString, "12of19,notransition") == 0) - || (strcmp (valString, "12_of_19,no_transition") == 0)) - seed = copy_string ("T=2"); - else if ((strcmp (valString, "14of22,transition") == 0) - || (strcmp (valString, "14_of_22,transition") == 0)) - seed = copy_string ("T=3"); - else if ((strcmp (valString, "14of22,notransition") == 0) - || (strcmp (valString, "14_of_22,no_transition") == 0)) - seed = copy_string ("T=4"); - else - suicidef ("invalid seed (%s: line %d) %s=%s", - name, lineNum, line, valString); - haveSeed = true; - } - else - suicidef ("invalid name in assignment (%s: line 
%d) %s=%s", - name, lineNum, line, valString); - } - - ////////// - // read column characters - ////////// - - // current line caused us to exit assignment stage, so it must contain - // the column headers - - for (c=0 ; c<256 ; c++) - colComps[c] = 0; - - haveComps = -1; - - colChars[0] = 0; - for (s=line,scanColChars=colChars ; *s!=0 ; ) - { - prevS = s; - c = parse_char_code (&s, &compC, ' '); - - if (c <= 0) - suicidef ("invalid character code in %s:line %d at \"%s\"", - name, lineNum, s); - - if (compC < 0) - suicidef ("invalid complement in %s:line %d at \"%s\"", - name, lineNum, s); - - if (in_string (c, colChars)) - suicidef ("duplicate character code in %s:line %d at \"%s\"", - name, lineNum, s); - - if (haveComps == -1) - haveComps = (compC != 0); - else if (haveComps) - { - if (compC == 0) - suicidef ("missing complement in %s:line %d at \"%s\"", - name, lineNum, prevS); - } - else // if (!haveComps) - { - if (compC != 0) - suicidef ("missing complement(s) in %s:line %d before \"%s\"", - name, lineNum, prevS); - } - - *(scanColChars++) = c; *scanColChars = 0; - colComps[c] = compC; - } - - numCols = scanColChars - colChars; - - if ((badCol >= 0) && (in_string (badCol, colChars))) - suicidef ("character code for bad_score can't also be a matrix column\n" - "(%s: line %d)", - name, lineNum); - - if (numCols == 0) - suicidef ("matrix has no column headers (%s: line %d)", - name, lineNum); - - // validate complements - - if (haveComps) - { - for (ix=0 ; ixss.sub[r], /*from*/ fillRowData, - /*how much*/ sizeof(scorerow)); - - // disable the extra parameters - - xss->hspThresholdSet = false; - xss->gappedThresholdSet = false; - xss->xDropSet = false; - xss->yDropSet = false; - xss->stepSet = false; - xss->ballScoreSet = false; - xss->ballScoreFactor = -1; - xss->seedSet = false; - - xss->seed = NULL; - - ////////// - // read the scoring matrix, filling in over the scores we've already filled - ////////// - - scanRowChars = rowChars; - *scanRowChars = 0; - - 
numFields = -1; - iy = 0; - - while (fgets (line, sizeof(line), f) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way) - - if (missingEol) - suicidef ("line is too long (%s: line %d)", name, lineNum-1); - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - // trim blanks, end of line, and comments, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // count the number of fields - - fieldCount = 0; - for (s=line ; *s!=0 ; ) - { - s = skip_darkspace (s); - s = skip_whitespace (s); - fieldCount++; - } - - if (numFields < 0) - { - numFields = fieldCount; - if ((numFields != numCols) - && (numFields != numCols+1)) - suicidef ("wrong number of score columns (%s: line %d)", - name, lineNum); - } - else if (fieldCount != numFields) - suicidef ("inconsistent number of score columns (%s: line %d)", - name, lineNum); - - // first field is character code for the row; for blastz compatibility - // we just assign the next DNA character - - s = line; - - if (numFields == numCols) - { - if (iy >= numCols) - suicidef ("too many score rows (%s: line %d): \"%s\"", - name, lineNum, line); - r = colChars[iy++]; - *(scanRowChars++) = r; *scanRowChars = 0; - } - else - { - r = parse_char_code (&s, NULL, ' '); - if (r <= 0) - suicidef ("invalid row character code (%s: line %d) %s=%s", - name, lineNum, line, s); - if (in_string (r, rowChars)) - suicidef ("duplicate row character code (%s: line %d): \"%s\"", - name, lineNum, line); - - *(scanRowChars++) = r; *scanRowChars = 0; - } - - // remaining fields are the rows in this column - - for (ix=0 ; ixss.sub[r][c] = string_to_score (s); - if (finalField) s = end; - else s = skip_whitespace (end+1); - } - } - - numRows = scanRowChars - rowChars; - - 
if (numFields < 0) - suicidef ("scores file %s contains no score rows", name); - if ((numFields == numCols) && (numRows != numCols)) - suicidef ("not enough score rows, line (%s: line %d): \"%s\"", - name, lineNum, line); - - if ((badRow >= 0) && (in_string (badRow, rowChars))) - suicide ("character code for bad_score can't also be a matrix row"); - - ////////// - // finish off the scoring matrix - ////////// - - ustrcpy (xss->ss.colChars, colChars); - ustrcpy (xss->ss.rowChars, rowChars); - - xss->ss.gapOpen = gapOpen; - xss->ss.gapOpenSet = haveGapOpen; - xss->ss.gapExtend = gapExtend; - xss->ss.gapExtendSet = haveGapExtend; - - if ((xss->ss.gapOpenSet) && (xss->ss.gapOpen + xss->ss.gapExtend <= 0)) - suicidef (" (in %s) "scoreFmt " is not a valid gap open penalty with extension penalty " scoreFmt "\n" - "(open can be negative but the sum has to be postive)\n", - name, xss->ss.gapOpen, xss->ss.gapExtend); - if ((xss->ss.gapExtendSet) && (xss->ss.gapExtend < 0)) - suicidef (scoreFmt " is not a valid gap extension penalty (in %s)\n", - xss->ss.gapExtend, name); - - // set any extra parameters we have - - if (haveHspThreshold) - { xss->hspThresholdSet = true; xss->hspThreshold = hspThreshold; } - if (haveGappedThreshold) - { xss->gappedThresholdSet = true; xss->gappedThreshold = gappedThreshold; } - if (haveXDrop) - { xss->xDropSet = true; xss->xDrop = xDrop; } - if (haveYDrop) - { xss->yDropSet = true; xss->yDrop = yDrop; } - if (haveStep) - { xss->stepSet = true; xss->step = step; } - if (haveBallScore) - { xss->ballScoreSet = true; xss->ballScore = ballScore; - xss->ballScoreFactor = ballScoreFactor; } - if (haveSeed) - { xss->seedSet = true; xss->seed = seed; } - - // if the columns are DNA, make lower case columns equivalent to upper case - - xss->ss.colsAreDna = is_dna_alphabet (colChars); - - if (xss->ss.colsAreDna) - { - if (badCol < 0) badCol = 'X'; - for (ix=0 ; ixss.sub[r][c+'a'-'A'] = xss->ss.sub[r][c]; - } - } - - src = xss->ss.colChars; - dst = src + 
ustrlen(src); - for ( ; *src!=0 ; src++) - { - ch = dna_tolower (*src); - if (ustrchr (xss->ss.colChars, ch) == NULL) - { *(dst++) = ch; *dst = 0; } - } - } - - // if the rows are DNA, make lower case rows equivalent to upper case - - xss->ss.rowsAreDna = is_dna_alphabet (rowChars); - - if (xss->ss.rowsAreDna) - { - if (badRow < 0) badRow = 'X'; - for (ix=0 ; ixss.sub[r+'a'-'A'], /*from*/ xss->ss.sub[r], - /*how much*/ sizeof(scorerow)); - } - - src = xss->ss.rowChars; - dst = src + ustrlen(src); - for ( ; *src!=0 ; src++) - { - ch = dna_tolower (*src); - if (ustrchr (xss->ss.rowChars, ch) == NULL) - { *(dst++) = ch; *dst = 0; } - } - } - - // fill the bad row and column - - if (badCol == -1) badCol = 0; // (if rows and/or cols were DNA, these - if (badRow == -1) badRow = 0; // .. would already be set to 'X') - - xss->ss.badRow = badRow; - xss->ss.badCol = badCol; - - for (c=0 ; c<256 ; c++) xss->ss.sub[badRow][c] = badScore; - for (r=0 ; r<256 ; r++) xss->ss.sub[r][badCol] = badScore; - - // make sure scores for row and column zero are very very bad - - for (c=0 ; c<256 ; c++) - xss->ss.sub[0][c] = xss->ss.sub[c][0] = veryBadScore; - - ////////// - // create complement-mapping table - ////////// - - if (haveComps) - { - xss->ss.qToComplement = (u8*) malloc_or_die ("read_score_set (qToComplement)", 256); - for (c=0 ; c<256 ; c++) - xss->ss.qToComplement[c] = colComps[c]; - } - - ////////// - // create bottleneck-related fields - ////////// - - // set neutered defaults - - xss->ss.bottleneck[0] = 0; - for (r=0 ; r<256 ; r++) xss->ss.qToBest[r].len = -1; - - // if we don't have a quantum row alphabet, we can't have a bottleneck - - if ((haveBottleneck) && (xss->ss.rowsAreDna)) - suicidef ("invalid bottleneck alphabet (%s in %s), rows are DNA", - bottleneck, name); - - // if we don't have a quantum column alphabet, the bottleneck has to be DNA - - if ((haveBottleneck) && (xss->ss.colsAreDna) - && (ustrcmp (bottleneck, (u8*) "ACGT") != 0)) - suicidef ("invalid 
bottleneck alphabet (%s in %s), columns are DNA", - bottleneck, name); - - // if we have quantum rows and DNA columns but no bottleneck, assign - // a DNA bottleneck - - if ((!haveBottleneck) && (!xss->ss.rowsAreDna) && (xss->ss.colsAreDna)) - { ustrcpy (bottleneck, (u8*) "ACGT"); haveBottleneck = true; } - - // if we have quantum rows and quantum columns, we gotta have a bottleneck - - if ((!haveBottleneck) && (!xss->ss.rowsAreDna) && (!xss->ss.colsAreDna)) - suicidef ("missing bottleneck alphabet (in %s)", name); - - // ok, let's fill in the fields - - if (haveBottleneck) - { - u8* rr; - u8 bits; - charvec bestBits; - score bestScore, thisScore; - -#ifdef bottleneckBiasOK - bestBits.v[0] // (placate compiler) - = bestBits.v[1] - = bestBits.v[2] - = bestBits.v[3] - = 0; -#endif // no bottleneckBiasOK - - // all bottleneck chars must be in column alphabet - - if ((ustrchr (xss->ss.colChars, bottleneck[0]) == NULL) - || (ustrchr (xss->ss.colChars, bottleneck[1]) == NULL) - || (ustrchr (xss->ss.colChars, bottleneck[2]) == NULL) - || (ustrchr (xss->ss.colChars, bottleneck[3]) == NULL)) - suicidef ("invalid bottleneck alphabet (%s in %s)" - ", not contained in column alphabet", - bottleneck, name); - - ustrcpy (xss->ss.bottleneck, bottleneck); - - // find 'closest' match for each row character - - for (rr=xss->ss.rowChars ; *rr!=0 ; rr++) - { - r = *rr; - c = bottleneck[0]; - bestBits.len = 0; - bestScore = worstPossibleScore; - for (bits=0 ; bits<4 ; bits++) - { - c = bottleneck[bits]; - thisScore = xss->ss.sub[r][c]; - if (thisScore > bestScore) - { // (this character is 'closest' so far) - bestBits.len = 1; - bestBits.v[0] = bits; - bestScore = thisScore; - } -#ifndef bottleneckBiasOK - else if (thisScore == bestScore) - { // (this character is tied for 'closest' so far) - bestBits.v[(u8)(bestBits.len++)] = bits; - } -#endif // not bottleneckBiasOK - } - if (bestBits.len == 0) // (can't happen, but compiler frets) - bestBits.len = -1; - xss->ss.qToBest[r] = 
bestBits; - } - - if (dna_utilities_dbgShowQToBest) - { - for (rr=xss->ss.rowChars ; *rr!=0 ; rr++) - { - r = *rr; - bestBits = xss->ss.qToBest[r]; - fprintf (stderr, "qToBest[%02X]:", r); - if (bestBits.len == -1) - fprintf (stderr, " (none)"); - for (ix=0 ; ix= 0) && (cc2 >= 0)) *_s = s; - if (comp != NULL) *comp = cc2; - return cc; - } - - -static int is_dna_alphabet - (u8* alphabet) - { - int match = 0; - - if (ustrchr (alphabet,'A') != NULL) match++; - if (ustrchr (alphabet,'C') != NULL) match++; - if (ustrchr (alphabet,'G') != NULL) match++; - if (ustrchr (alphabet,'T') != NULL) match++; - - if (ustrlen (alphabet) == 4) - return (match == 4); - - if (ustrlen (alphabet) == 5) - return ((match == 4) && (ustrchr (alphabet,'N') != NULL)); - - if (ustrchr (alphabet,'a') != NULL) match++; - if (ustrchr (alphabet,'c') != NULL) match++; - if (ustrchr (alphabet,'g') != NULL) match++; - if (ustrchr (alphabet,'t') != NULL) match++; - - if (ustrlen (alphabet) == 8) - return (match == 8); - - if (ustrlen (alphabet) == 9) - return ((match == 8) && (ustrchr (alphabet,'N') != NULL)); - - return false; - } - -static u8 two_char_as_hex // assumes both characters are valid hex digits - (u8 ch1, - u8 ch2) - { - return 16 * ((ch1<='9')? (ch1-'0') : (ch1<='F')? (10+ch1-'A') : (10+ch1-'a')) - + ((ch2<='9')? (ch2-'0') : (ch2<='F')? 
(10+ch2-'A') : (10+ch2-'a')); - } - -static int parse_bottleneck // (returns true => success, false => failure) - (char* _s, - u8 bottleneck[5]) - { - char* s = _s; - int cc; - char follower; - int i; - - // parse the four symbols, separated by spaces - - for (i=0 ; i<4 ; i++) - { - cc = *(s++); - if (cc == 0) return false; - follower = *s; - - if ((follower != 0) && (!isspace(follower))) - { - s++; - if (isxdigit(cc)) cc = two_char_as_hex(cc,follower); - else return false; - if (cc == 0) return false; - } - - bottleneck[i] = cc; - - // eat up trailing whitespace - - if (*s != 0) s = skip_whitespace (s); - } - - if (*s != 0) return false; - - return true; - } - -//---------- -// -// ambiguate_n, ambiguate_iupac-- -// Change the substitution scores for N so that alignments to N treat N as an -// ambiguous base, rather than as a sequence-splicing character. -// -// For ambiguate_iupac, we consider any of the IUPAC ambiguous-but-not-ACGT -// characters to be the same as an N. -// -// We expect that nVsN is zero and nVsNonN is no worse than -2*E (i.e. twice -// the gap extend penalty). This value is chosen to give preference to this -// alignment instead of the second, gapped one (regardless of the number of -// consecutive Ns): -// -// TTCTCttcttacttcttcttcttcttcttcttcttcttctTC -// TTCTCTTCTNNNNNNNNNNNNNNNNNNNNNNNNNTCTTNTTC -// -// TTCTCttct-------------------------tacttcttcttcttcttcttcttcttcttctTC -// TTCTCTTCTNNNNNNNNNNNNNNNNNNNNNNNNNT-------------------------CTTNTTC -// -//---------- -// -// Arguments: -// scoreset* ss: The score set to modify. -// score nVsN: Score for N-to-N substitutions. -// score nVsNonN: Score for N-to-non-N substitutions. 
-// -// Returns: -// (nothing) -// -//---------- - -void ambiguate_n - (scoreset* ss, - score nVsN, - score nVsNonN) - { - u8* rr, *cc; - int ch, chLow; - - ss->sub['N']['N'] = nVsN; - ss->sub['N']['n'] = nVsN; - ss->sub['n']['N'] = nVsN; - ss->sub['n']['n'] = nVsN; - - if (ss->colsAreDna) - { - for (rr=ss->rowChars ; *rr!=0 ; rr++) - { - ch = *rr; - if (ch == 'N') continue; - chLow = dna_tolower(ch); - ss->sub[ch] ['N'] = nVsNonN; - ss->sub[ch] ['n'] = nVsNonN; - ss->sub[chLow]['N'] = nVsNonN; - ss->sub[chLow]['n'] = nVsNonN; - } - } - - if (ss->rowsAreDna) - { - for (cc=ss->colChars ; *cc!=0 ; cc++) - { - ch = *cc; - if (ch == 'N') continue; - chLow = dna_tolower(ch); - ss->sub['N'][ch] = nVsNonN; - ss->sub['n'][ch] = nVsNonN; - ss->sub['N'][chLow] = nVsNonN; - ss->sub['n'][chLow] = nVsNonN; - } - } - - } - - -void ambiguate_iupac - (scoreset* ss, - score nVsN, - score nVsNonN) - { - static u8* ambiggies = (u8*) "NnBDHKMRSVWYbdhkmrsvwy"; - u8* rr, *cc; - int ch, chLow, rrLow, ccLow; - - // set all ambi-vs-ambi scores as N-vs-N or N-vs-nonN, as appropriate - - for (rr=ambiggies ; *rr!=0 ; rr++) - for (cc=ambiggies ; *cc!=0 ; cc++) - { - rrLow = dna_tolower(*rr); - ccLow = dna_tolower(*cc); - if (rrLow == ccLow) ss->sub[*rr][*cc] = nVsN; - else ss->sub[*rr][*cc] = nVsNonN; - } - - // set all non-ambi rows as N-vs-nonN - - if (ss->rowsAreDna) - { - for (rr=ss->rowChars ; *rr!=0 ; rr++) - { - ch = *rr; - chLow = dna_tolower(ch); - for (cc=ambiggies ; *cc!=0 ; cc++) - { - if ((ch == 'N') && ((*cc == 'N') || (*cc == 'n'))) continue; - ss->sub[ch] [*cc] = nVsNonN; - ss->sub[chLow][*cc] = nVsNonN; - } - } - } - - // set all non-ambi columns as N-vs-nonN - - if (ss->colsAreDna) - { - for (cc=ss->colChars ; *cc!=0 ; cc++) - { - ch = *cc; - chLow = dna_tolower(ch); - for (rr=ambiggies ; *rr!=0 ; rr++) - { - if ((ch == 'N') && ((*rr == 'N') || (*rr == 'n'))) continue; - ss->sub[*rr][ch] = nVsNonN; - ss->sub[*rr][chLow] = nVsNonN; - } - } - } - - } - -//---------- -// -// 
write_score_set_by_name, write_score_set, write_score_set_as_ints-- -// Write a new score set to a file (see format description above, in the header -// for read_score_set_by_name). The write_score_set_as_ints() version will -// write the scores as though (scoreType == 'I'). This allows score sets -// written by a floating point version of the program to be read by an integer -// version. -// -//---------- -// -// Arguments: -// FILE* f: (write_score_set only) The file that scoring data is to -// .. be written to. This should already be open for text -// .. write. -// char* name: The name of the file that scoring data is to be written -// .. to. For write_score_set this is only used for -// .. reporting problems to the user (and may be NULL). -// scoreset* ss: The scoring set to write. -// int withGapScores: true => write gap scores too -// -// Returns: -// (nothing) -// -//---------- - -static void private_write_score_set (FILE* f, scoreset* ss, - int withGapScores, int asInts); - - -void write_score_set_by_name - (char* name, - scoreset* ss, - int withGapScores) - { - FILE* f; - - if (name == NULL) - suicide ("can't open NULL file in write_score_set_by_name()"); - - f = fopen_or_die (name, "wt"); - write_score_set (f, name, ss, withGapScores); - fclose_if_valid (f); - } - - -void write_score_set - (FILE* f, arg_dont_complain(char* name), scoreset* ss, int withGapScores) - { private_write_score_set (f, ss, withGapScores, false); } - - -void write_score_set_as_ints - (FILE* f, arg_dont_complain(char* name), scoreset* ss, int withGapScores) - { private_write_score_set (f, ss, withGapScores, true); } - - -static void private_write_score_set - (FILE* f, - scoreset* ss, - int withGapScores, - int asInts) - { - char s[101]; - u8* rr, *cc; - score minSub; - int vWidth, w; - char* wssScoreFmt, *wssScoreFmtStar; - score v; - - if (asInts) - { - wssScoreFmt = "%d"; - wssScoreFmtStar = "%*d"; - } - else - { -#if (scoreType == 'I') - wssScoreFmt = "%d"; - wssScoreFmtStar = 
"%*d"; -#elif (scoreType == 'F') - wssScoreFmt = "%.6f"; - wssScoreFmtStar = "%*.6f"; -#elif (scoreType == 'D') - wssScoreFmt = "%.6f"; - wssScoreFmtStar = "%*.6f"; -#endif - } - - if ((!ss->rowsAreDna) || (!ss->colsAreDna)) - suicide ("write_score_set only handles DNA scoring matrices"); - - ////////// - // write non-matrix fields - ////////// - - // determine minimum substitution score - - minSub = 0; - for (rr=ss->rowChars ; *rr!=0 ; rr++) - for (cc=ss->colChars ; *cc!=0 ; cc++) - { if (ss->sub[*rr][*cc] < minSub) minSub = ss->sub[*rr][*cc]; } - - // write the fields - - if (withGapScores) vWidth = 18; - else vWidth = 10; - - fprintf (f, "# (a LASTZ scoring set, created by \"LASTZ --infer\")\n"); - fprintf (f, "\n"); - - v = 10 * minSub; - fprintf (f, "%-*s = %c:", vWidth, "bad_score", ss->badRow); - if (asInts) fprintf (f, wssScoreFmt, round_score (v)); - else fprintf (f, wssScoreFmt, v); - fprintf (f, " # used for sub[%c][*] and sub[*][%c]\n", ss->badRow, ss->badRow); - - v = minSub; - fprintf (f, "%-*s = ", vWidth, "fill_score"); - if (asInts) fprintf (f, wssScoreFmt, round_score (v)); - else fprintf (f, wssScoreFmt, v); - fprintf (f, " # used when sub[*][*] not otherwise defined\n"); - - if (withGapScores) - { - v = ss->gapOpen; - fprintf (f, "%-*s = ", vWidth, "gap_open_penalty"); - if (asInts) fprintf (f, wssScoreFmt, round_score (v)); - else fprintf (f, wssScoreFmt, v); - fprintf (f, "\n"); - - v = ss->gapExtend; - fprintf (f, "%-*s = ", vWidth, "gap_extend_penalty"); - if (asInts) fprintf (f, wssScoreFmt, round_score (v)); - else fprintf (f, wssScoreFmt, v); - fprintf (f, "\n"); - } - - fprintf (f, "\n"); - - ////////// - // write subsitution scores - ////////// - - // determine field width - - w = 3; - for (rr=ss->rowChars ; *rr!=0 ; rr++) - if ((!ss->rowsAreDna) || (dna_isupper (*rr))) - for (cc=ss->colChars ; *cc!=0 ; cc++) - if ((!ss->colsAreDna) || (dna_isupper (*cc))) - { - v = ss->sub[*rr][*cc]; - if (asInts) sprintf (s, wssScoreFmt, round_score 
(v)); - else sprintf (s, wssScoreFmt, v); - if (strleni(s)+1 > w) w = strlen(s)+1; - } - - // write them - - fprintf (f, " "); - for (cc=ss->colChars ; *cc!=0 ; cc++) - { - if ((ss->colsAreDna) && (!dna_isupper (*cc))) continue; - fprintf (f, " %*c", w, *cc); - } - fprintf (f, "\n"); - - for (rr=ss->rowChars ; *rr!=0 ; rr++) - { - if ((ss->rowsAreDna) && (!dna_isupper (*rr))) continue; - fprintf (f, "%c", *rr); - for (cc=ss->colChars ; *cc!=0 ; cc++) - { - if ((ss->colsAreDna) && (!dna_isupper (*cc))) continue; - v = ss->sub[*rr][*cc]; - fprintf (f, " "); - if (asInts) fprintf (f, wssScoreFmtStar, w, round_score (v)); - else fprintf (f, wssScoreFmtStar, w, v); - } - fprintf (f, "\n"); - } - - } - -//---------- -// -// dump_dna_score_set-- -// Dump the substitution matrix of a score set that is expected to contain -// scores for any DNA characters. (Intended for debugging) -// -//---------- -// -// Arguments: -// FILE* f: The file to dumpt to. -// scoreset* ss: The score set. -// -// Returns: -// (nothing) -// -//---------- - -void dump_dna_score_set - (FILE* f, - scoreset* ss) - { - u8* alphabet = (u8*) "ACGTacgt"; - u8* ch1, *ch2; - - fprintf (stderr, "bad: " scoreFmtSimple "\n", - ss->sub[ss->badRow][ss->badCol]); - fprintf (stderr, "fill: " scoreFmtSimple "\n", - ss->sub['B']['B']); - - fprintf (f, " "); - for (ch2=alphabet ; *ch2!=0 ; ch2++) - fprintf (f, " %5c", *ch2); - fprintf (f, "\n"); - - for (ch1=alphabet ; *ch1!=0 ; ch1++) - { - fprintf (f, "%c: ", *ch1); - for (ch2=alphabet ; *ch2!=0 ; ch2++) -#if (scoreType == 'I') - fprintf (f, " %5d", ss->sub[*ch1][*ch2]); -#else - fprintf (f, " %12.6f", ss->sub[*ch1][*ch2]); -#endif - fprintf (f, "\n"); - } - - } - -//---------- -// -// string_to_score-- -// Parse a string for the score value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The score value of the string. 
Note that the string *must not* contain -// anything other than a valid score-- failures result in fatality. -// -//---------- - -score string_to_score - (const char* s) - { -#if (scoreType == 'I') - return string_to_unitized_int (s, /* byThousands */ true); -#elif (scoreType == 'F') - return (float) string_to_double (s); -#elif (scoreType == 'D') - return string_to_double (s); -#endif - } - -//---------- -// -// scale_score_set-- -// Multiply each match/substitution score in a set by a constant. -// -//---------- -// -// Arguments: -// scoreset* ss: The score set to modify. Note that *only* the match/ -// .. substitution scores are modified, and *only* for -// .. legitimate characters (those in ss->rowChars and -// .. ss->colChars). -// double scale: How much to scale the scores (e.g. 1.0 means scores -// .. are not changed). -// -// Returns: -// (nothing) -// -//---------- - -void scale_score_set - (scoreset* ss, - double scale) - { - int r, c; - - for (r=0 ; r<256 ; r++) - for (c=0 ; c<256 ; c++) - ss->sub[r][c] *= scale; - } - -//---------- -// -// round_score-- -// Round a score value to the nearest integer. -// -//---------- -// -// Arguments: -// double v: The score to round. We declare this as 'double' rather -// .. than 'score', because the routine is most useful when -// .. the caller has computed a score as a real, and needs to -// .. convert it to an int. -// -// Returns: -// The integer nearest to v. -// -//---------- - -int round_score - (double v) - { - if (v >= 0) return (int) (v + .5); - else return (int) (v - .5); - } - -//---------- -// -// print_score_matrix, print_score_matrix_lf,print_score_matrix_prefix-- -// Print the meaningful contents of a score set's matrix. -// dump_score_set-- -// Dump the contents of a score set. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// scoreset* ss: The score set to print. 
-// int withExtras: true => show extra stuff, such as gap_open_penalty -// char lineFeedCh: The character to use to separate rows of the -// .. matrix. -// char* prefix: A string to print before each row of the matrix. -// -// Returns: -// (nothing) -// -//---------- - -static void private_print_score_matrix (FILE* f, scoreset* ss, int withExtras, - char lineFeedCh, char* prefix); - - -void print_score_matrix (FILE* f, scoreset* ss, int withExtras) - { private_print_score_matrix (f, ss, withExtras, '\n', NULL); } - -void print_score_matrix_lf (FILE* f, scoreset* ss, int withExtras, char lineFeedCh) - { private_print_score_matrix (f, ss, withExtras, lineFeedCh, NULL); } - -void print_score_matrix_prefix (FILE* f, scoreset* ss, int withExtras, char* prefix) - { private_print_score_matrix (f, ss, withExtras, '\n', prefix); } - - -static void private_print_score_matrix - (FILE* f, - scoreset* ss, - int withExtras, - char lineFeedCh, - char* prefix) - { - int width; - int rowsHidden, rowsAsHex, colsAsHex; - u8* r, *c; - char s[3]; - - if (prefix == NULL) prefix = ""; - - // determine how character codes should be printed; rowsHidden is for - // old-style blastz compatibility, so we only set it false if any post- - // blastz features are active - - rowsAsHex = false; - for (r=ss->rowChars ; *r!=0 ; r++) - { - if ((!isprint (*r)) || (isspace (*r))) - { rowsAsHex = true; break; } - } - - colsAsHex = false; - for (c=ss->colChars ; *c!=0 ; c++) - { - if ((!isprint (*c)) || (isspace (*c))) - { colsAsHex = true; break; } - } - - rowsHidden = ((!rowsAsHex) && (!colsAsHex) && (!withExtras)); - - // print assignments - - if (withExtras) - { - fprintf (f, "%sgap_open_penalty = " scoreFmt "%c", - prefix, ss->gapOpen, lineFeedCh); - fprintf (f, "%sgap_extend_penalty = " scoreFmt "%c", - prefix, ss->gapExtend, lineFeedCh); - } - - // print the matrix - - if (lineFeedCh != '\n') - width = 1; - else if ((dna_utilities_scoreType == 'F') || (dna_utilities_scoreType == 'D')) - width = 13; 
- else - width = 4; - - fprintf (f, "%s", prefix); - if (lineFeedCh == '\n') - { - fprintf (f, (rowsHidden)? " " - : (rowsAsHex)? " " - : " "); - } - for (c=ss->colChars ; *c!=0 ; c++) - { - if ((ss->colsAreDna) && (!dna_isupper (*c))) continue; - if (colsAsHex) sprintf (s, "%02X", *c); - else sprintf (s, "%c", *c); - fprintf (f, " %*s", width, s); - } - fprintf (f, "%c", lineFeedCh); - - for (r=ss->rowChars ; *r!=0 ; r++) - { - if ((ss->rowsAreDna) && (!dna_isupper (*r))) continue; - fprintf (f, "%s", prefix); - if (lineFeedCh == '\n') - fprintf (f, (rowsAsHex)? " " : " "); - if (!rowsHidden) - { - if (rowsAsHex) sprintf (s, "%02X", *r); - else sprintf (s, "%c", *r); - fprintf (f, "%2s", s); - } - for (c=ss->colChars ; *c!=0 ; c++) - { - if ((ss->colsAreDna) && (!dna_isupper (*c))) continue; - fprintf (f, " " scoreFmtStar, width, ss->sub[*r][*c]); - } - fprintf (f, "%c", lineFeedCh); - } - - } - - -void dump_score_set - (FILE* f, - scoreset* ss, - u8* rowChars, - u8* colChars) - { - int width; - u8* r, *c; - - if (rowChars == NULL) rowChars = ss->rowChars; - if (colChars == NULL) rowChars = ss->colChars; - - if ((dna_utilities_scoreType == 'F') || (dna_utilities_scoreType == 'D')) - width = 13; - else - width = 5; - - fprintf (f, "rowChars = %s\n", ss->rowChars); - fprintf (f, "colChars = %s\n", ss->colChars); - - fprintf (f, "%2s %8s ", "", ""); - for (c=colChars ; *c!=0 ; c++) - fprintf (f, " %*s", width, quantum_visual(*c)); - fprintf (f, "\n"); - - for (r=rowChars ; *r!=0 ; r++) - { - fprintf (f, "%2s %8p:", quantum_visual(*r), ss->sub[*r]); - for (c=colChars ; *c!=0 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[*r][*c]); - fprintf (f, "\n"); - } - } - -void dump_lower_score_set - (FILE* f, - scoreset* ss) - { - int width; - u8* r, *c; - u8 rr; - - if ((dna_utilities_scoreType == 'F') || (dna_utilities_scoreType == 'D')) - width = 13; - else - width = 5; - - fprintf (f, "%2s %8s ", "", ""); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " %*s", 
width, quantum_visual(*c)); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " %*s", width, quantum_visual(*c+'a'-'A')); - fprintf (f, "\n"); - - for (r=ss->rowChars ; *r!=0 ; r++) - { - fprintf (f, "%2s %8p:", quantum_visual(*r), ss->sub[*r]); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[*r][*c]); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[*r][*c+'a'-'A']); - fprintf (f, "\n"); - } - - for (r=ss->rowChars ; *r!=0 ; r++) - { - rr = *r+'a'-'A'; - fprintf (f, "%2s %8p:", quantum_visual(rr), ss->sub[rr]); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[rr][*c]); - for (c=ss->colChars ; *c!=0 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[rr][*c+'a'-'A']); - fprintf (f, "\n"); - } - } - -void dump_full_score_set - (FILE* f, - scoreset* ss) - { - int width; - int r, c; - - if ((dna_utilities_scoreType == 'F') || (dna_utilities_scoreType == 'D')) - width = 13; - else - width = 4; - - fprintf (f, "%2s %8s ", "", ""); - for (c=0 ; c<256 ; c++) - fprintf (f, " %*s", width, quantum_visual(c)); - fprintf (f, "\n"); - - for (r=0 ; r<256 ; r++) - { - fprintf (f, "%2s %8p:", quantum_visual(r), ss->sub[r]); - for (c=0 ; c<256 ; c++) - fprintf (f, " " scoreFmtStar, width, ss->sub[r][c]); - fprintf (f, "\n"); - } - } - -static char* quantum_visual (int ch) - { - static char s1[10], s2[10], s3[10]; - static char* s = s2; - - s = (s == s1)? s2 : (s == s2)? s3 : s1; // (ping pong pung) - - if ((isprint (ch)) && (!isspace (ch))) sprintf (s, "%c", ch); - else sprintf (s, "%02X", ch); - - return s; - } - -//---------- -// -// resolve_score_thresh-- -// If an adaptive score threshold is a percentage, convert it to a count. -// -//---------- -// -// Arguments: -// sthresh* threshold: The threshold to resolve. -// u32 denom: The denominator that the threshold's percentage -// .. would be relative to. 
-// -// Returns: -// (nothing) -// -//---------- - -void resolve_score_thresh - (sthresh* threshold, - u32 denom) - { - if (threshold->t != 'P') return; - - threshold->c = (u32) (threshold->p * denom + 0.5); - threshold->t = 'C'; - } - -//---------- -// -// string_to_score_thresh-- -// Parse a string for the adaptive score threshold value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The score value of the string. Note that the string *must not* contain -// anything other than a valid adaptive score threshold-- failures result in -// fatality. -// -//---------- - -sthresh string_to_score_thresh - (const char* s) - { - sthresh threshold; - int len; - - memset (&threshold, 0, sizeof(threshold)); // placate compilter - - if (strcmp_prefix (s, "top") != 0) - { - threshold.t = 'S'; - threshold.s = string_to_score (s); - return threshold; - } - - len = strlen(s); - if ((len > 3) && (s[len-1] == '%')) - { - threshold.t = 'P'; - threshold.p = pct_string_to_double (s+3); - return threshold; - } - - threshold.t = 'C'; - threshold.c = string_to_unitized_int (s+3, /* byThousands */ true); - return threshold; - } - -//---------- -// -// score_thresh_to_string-- -// Convert an adaptive score threshold to a string. -// -//---------- -// -// Arguments: -// sthresh* threshold: The threshold to convert. -// -// Returns: -// A string containing the threshold, as text. This string is actually static -// data belonging to this routine, so the caller must copy it if more than one -// such string is to be used simultaneously. -// -//---------- - -char* score_thresh_to_string - (const sthresh* threshold) - { - static char s1[41]; - static char s2[41]; - static char* s = s2; - - s = (s == s1)? 
s2 : s1; // (ping pong) - - if (threshold->t == 'S') sprintf (s, scoreFmtSimple, threshold->s); - else if (threshold->t == 'P') sprintf (s, "top%.1f%%", 100*threshold->p); - else if (threshold->t == 'C') sprintf (s, "top%d", threshold->c); - else sprintf (s, "(unrecognized)"); - - return s; - } - -//---------- -// -// blastz_score_to_ncbi_bits-- -// Convert (b)lastz score to bit score in NCBI sense. -// blastz_score_to_ncbi_expectation-- -// Convert (b)lastz score to expectation in NCBI sense. -// -// Convert (b)lastz-style scores to NCBI BLAST scores. The conversion is not -// exact, and only provides a quick-and-dirty estimate of the corresponding -// score that would be reported by NCBI BLAST. -// -// Note: These are borrowed with permission from the UCSC genome browser's -// source code tree, from src/lib/blastOut.c . Per communication with -// Jim Kent, these are from a part of that code which is considered to -// be in the public domain. -// -// The routines have been syntactically modified here, to fit the style -// of lastz's source code. At UCSC they are called blastzScoreToNcbiBits -// and blastzScoreToNcbiExpectation. -// -//---------- -// -// Arguments: -// score bzScore: The (b)lastz score to convert. -// (none) -// -// Returns: -// A converted score. -// -//---------- - -double blastz_score_to_ncbi_bits - (score bzScore) - { - return bzScore * 0.0205; - } - -double blastz_score_to_ncbi_expectation - (score bzScore) - { - double bits = bzScore * 0.0205; - double logProb = -bits * log(2); - return 3.0e9 * exp(logProb); - } - -//---------- -// -// new_quantum_code-- -// Create a new quantum dna code. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// A pointer to the newly allocated quantum dna code, which the caller will -// have to dispose of eventually. The routine free() should be used for this -// purpose. 
-// -//---------- - -qcode* new_quantum_code - (void) - { - return (qcode*) zalloc_or_die ("new_q_code", sizeof(qcode)); - } - -//---------- -// -// read_quantum_code_by_name, read_quantum_code-- -// Read a new quantum dna code from a file (see format description below). -// -//---------- -// -// Arguments: -// FILE* f: (read_quantum_code only) The file that code is to be read -// .. from. This should already be open for text read. -// char* name: The name of the file that code is to be read from. For -// .. For read_quantum_code this is only used for reporting -// .. problems to the user (and may be NULL). -// -// Returns: -// A pointer to the newly allocated quantum dna code, which the caller will -// have to dispose of eventually. The routine free() should be used for this -// purpose. -// -//---------- -// -// Quantum Code File Format -// ======================== -// -// Here's an example. Note that blanks lines and # comments are ignored. Rows -// begin the quantum symbol (as either a single character or a 2-digit -// hexadecimal representation). Columns represent probabilities of A, C, G, -// and T, in that order. -// -// 01 0.125041 0.080147 0.100723 0.694088 -// 02 0.111162 0.053299 0.025790 0.809749 -// 03 0.065313 0.007030 0.004978 0.922679 -// ... more rows here ... 
-// FF 0.209476 0.014365 0.755682 0.020477 -// -//---------- - -static int parse_quantum_profile (char* s, int* sym, - double* pa, double* pc, double* pg, double* pt); - -qcode* read_quantum_code_by_name - (char* name) - { - FILE* f; - qcode* cc; - - if (name == NULL) - suicide ("can't open NULL file in read_quantum_code_by_name()"); - - f = fopen_or_die (name, "rt"); - cc = read_quantum_code (f, name); - fclose_if_valid (f); - - return cc; - } - - -qcode* read_quantum_code - (FILE* f, - char* _name) - { - static char line[5*25+1]; // (must hold 5 fields, up to 25 chars each) - char* name = _name; - qcode* qc; - int seen[256]; - int lineNum, len, missingEol; - char* waffle; - int sym; - double pa, pc, pg, pt; - - pa = pc = pg = pt = 0; // (placate compiler) - - if (name == NULL) - name = "(unnamed file)"; - - // allocate - - qc = new_quantum_code (); - - for (sym=0 ; sym<256 ; sym++) - { - seen[sym] = false; - qc->p[sym][0] = qc->p[sym][1] = qc->p[sym][2] = qc->p[sym][3] = 0.0; - } - - qc->dna[0] = bits_to_nuc[0]; - qc->dna[1] = bits_to_nuc[1]; - qc->dna[2] = bits_to_nuc[2]; - qc->dna[3] = bits_to_nuc[3]; - - ////////// - // read it - ////////// - - lineNum = 0; - missingEol = false; - - while (fgets (line, sizeof(line), f) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way) - - if (missingEol) - suicidef ("line is too long (%s: line %d)", name, lineNum-1); - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - // trim blanks, end of line, and comments, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // parse it - - if (!parse_quantum_profile (line, &sym, &pa, &pc, &pg, &pt)) - suicidef ("invalid quantum code (%s: line %d) %s", - name, lineNum, line); - - if 
(seen[sym]) - suicidef ("quantum code %02X occurs more than once in %s", - sym, name); - - seen[sym] = true; - qc->p[sym][0] = pa; - qc->p[sym][1] = pc; - qc->p[sym][2] = pg; - qc->p[sym][3] = pt; - } - - return qc; - } - - -static int parse_quantum_profile // (returns true => success, false => failure) - (char* _s, - int* sym, - double* pa, - double* pc, - double* pg, - double* pt) - { - char* s = _s; - u8 ch; - double prob, numer, denom; - int items, charsUsed; - int i; - - // parse the symbol - - charsUsed = -1; - items = sscanf (s, "%c%n", &ch, &charsUsed); - if ((items == 1) && (ch != 0)) - { *sym = ch; s += charsUsed; } - else - { - charsUsed = -1; - items = sscanf (s, "%x%n", sym, &charsUsed); - if ((items != 1) || (*sym < 1) || (*sym > 255)) return false; - s += charsUsed; - } - - // parse the four probabilities - - for (i=0 ; i<4 ; i++) - { - charsUsed = -1; - items = sscanf (s, " %lf/%lf%n", &numer, &denom, &charsUsed); - if (items == 2) - { prob = numer / denom; s += charsUsed; } - else - { - items = sscanf (s, " %lf%n", &prob, &charsUsed); - if (items != 1) return false; - s += charsUsed; - } - - switch (i) - { - case 0: *pa = prob; break; - case 1: *pc = prob; break; - case 2: *pg = prob; break; - case 3: *pt = prob; break; - } - } - - if (*s != 0) return false; - - return true; - } - -//---------- -// -// print_quantum_word-- -// Print a quantum word, as something like a position weight matrix. -// print_quantum_dna_match-- -// Print a quantum word with a dna word lined up with it. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// qcode* coding: The code mapping quantum character to probaility -// .. vector. This may be NULL, in which case only -// .. a hexadecimal representation of the word is -// .. printed. -// u8* q: The quantum word to print. This is a string of -// characters, but is *not* zero-terminated. -// u8* d: (print_quantum_dna_match only) The dna word to -// .. print. Like q, *not* zero-terminated. 
-// u32 wordLen: The word length (number of quantum symbols in the word). -// -// Returns: -// (nothing). -// -//---------- - -void print_quantum_word (FILE* f, qcode* coding, u8* q, u32 wordLen) - { print_quantum_dna_match (f, coding, q, NULL, wordLen); } - -void print_quantum_dna_match - (FILE* f, - qcode* coding, - u8* q, - u8* d, - u32 wordLen) - { - char3 field; - int width = 4; // (must be at least 4) - u32 ix; - u32 nuc; - - // print dna sequence - - if (d != NULL) - { - fprintf (f, " "); - for (ix=0 ; ixdna) ; nuc++) - { - fprintf (f, "%c:", coding->dna[nuc]); - for (ix=0 ; ixp[q[ix]][nuc]); - fprintf (f, " %*s", width-1, field.s); - } - fprintf (f, "\n"); - } - - } - -//---------- -// -// quantum_word_string-- -// Convert a quantum word to a string, showing each symbol in hex. -// -//---------- -// -// Arguments: -// qcode* coding: The code mapping quantum character to probaility -// .. vector. This may be NULL, in which case only -// .. a hexadecimal representation of the word is -// .. printed. -// u8* q: The quantum word to print. This is a string of -// characters, but is *not* zero-terminated. -// u32 wordLen: The word length (number of quantum symbols in the word). -// int symWidth: Number of characters to devote to each quantum symbol. -// .. This must be at least 2. -// -// Returns: -// A string representing that quantum word. (see note 1) -// -//---------- -// -// notes: -// -// (1) The memory containing the returned string belongs to this routine, as -// static memory. There are only two such memory blocks, and they are -// used on alternate calls. So when you make more than two calls, the -// results of previous calls are clobbered. -// -//---------- - -char* quantum_word_string - (u8* q, - u32 wordLen, - int symWidth) - { - static char s1[200]; - static char s2[200]; - static char* s = s2; - char* ss; - u32 ix; - - s = (s == s1)? 
s2 : s1; // (ping pong) - - // sanity check - - if (symWidth < 2) symWidth = 2; - if (wordLen * symWidth + 1 > sizeof(s1)) - suicide ("internal error in quantum_word_string()"); - - // print quantum sequence to the string - - ss = s; - for (ix=0 ; ixsub[r][c], where r is in ss->rowChars and c is in -// ss->colChars. -// -//---------- - -score max_in_score_matrix - (scoreset* ss) - { - u8* r, *c; - score maxScore; - - maxScore = worstPossibleScore; - - for (r=ss->rowChars ; *r!=0 ; r++) - for (c=ss->colChars ; *c!=0 ; c++) - { if (ss->sub[*r][*c] > maxScore) maxScore = ss->sub[*r][*c]; } - - return maxScore; - } - - -score min_in_score_matrix - (scoreset* ss) - { - u8* r, *c; - score minScore; - - minScore = bestPossibleScore; - - for (r=ss->rowChars ; *r!=0 ; r++) - for (c=ss->colChars ; *c!=0 ; c++) - { if (ss->sub[*r][*c] < minScore) minScore = ss->sub[*r][*c]; } - - return minScore; - } - -//---------- -// -// print_dna_similarities-- -// Print the similarities of two nucleotide strings (or their prefixes). -// This prints a | when the strings contain the same nucleotide, and a ':' -// when they contain a transition. Spaces are printed otherwise. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// const char* s1: One string. -// const char* s2: The other string. -// int n: The number of characters to compare (and print). -// -// Returns: -// The number of characters printed, similar to print_prefix, except that -// the printing is terminated at the shorter of the two strings. 
-// -//---------- - -int print_dna_similarities - (FILE* f, - const char* s1, - const char* s2, - int n) - { - int ix; - u8 ch1, ch2; - - if (n < 1) return 0; - - for (ix=0 ; ix (int) sizeof(s)-1) numChars = sizeof(s)-1; - - // convert each bit pair to a character - - ss = s; - - while (numChars-- > 0) - { - twoBits = (word >> (2*numChars)) & 3; - *(ss++) = bits_to_nuc[twoBits]; - } - - *ss = 0; - return s; - } - -//---------- -// -// entropy-- -// Compute the entropy of a sequence pair. -// entropy_lower_ok-- -// Compute the entropy of a sequence pair, with upper/lower case seen as -// equivalent. -// -// WARNING: These functions are only valid for DNA sequences. -// -//---------- -// -// Arguments: -// u8* s: One sequence. -// u8* t: The other sequence. -// int len: The length of the sequences. -// -// Returns: -// The entropy of the sequence pair. -// -//---------- - -static double compute_entropy (u8* s, u8* t, int len, int lowerOk); - -double entropy (u8* s, u8* t, int len) - { return compute_entropy (s, t, len, false); } - -double entropy_lower_ok (u8* s, u8* t, int len) - { return compute_entropy (s, t, len, true); } - -static double compute_entropy - (u8* s, - u8* t, - int len, - int lowerOk) - { -#ifndef disallowEntropy - int count[256]; - double pA, pC, pG, pT, qA, qC, qG, qT; - int ix, cA, cC, cG, cT; - - count['A'] = count['C'] = count['G'] = count['T'] = 0; - if (lowerOk) - count['a'] = count['c'] = count['g'] = count['t'] = 0; - - if (lowerOk) - { - for (ix=0; ix 0) - { - revComp = (revComp << 8) + revCompByteByPairs[word & 0xFF]; - word >>= 8; - } - - return revComp >> adjust; - } - -u64 rev_comp_by_bits (u64 word, int length) - { - int bytes = (length+7) / 8; - int adjust = 8*bytes - length; - u64 revComp = 0; - - while (bytes-- > 0) - { - revComp = (revComp << 8) + revCompByteByBits[word & 0xFF]; - word >>= 8; - } - - return revComp >> adjust; - } - -//---------- -// -// char_to_description-- -// Convert a character to an english description. 
Only "oddball" characters -// are described-- those characters which would not be understandable if -// printed in an error message. -// -//---------- -// -// Arguments: -// char ch: The character to describe. -// -// Returns: -// Pointer to a string describing the character. -// -//---------- - -typedef struct c2d_el - { - u8 ch; - char* description; - } c2d_el; - -static c2d_el c2dLookup[] = - {{'!', "exclamation point \"!\""}, - {'"', "double quote"}, - {'#', "waffle/number sign/pound \"#\""}, - {'$', "dollar sign \"$\""}, - {'%', "percent sign \"%\""}, - {'&', "ampersand \"&\""}, - {'\'', "single quote/apostrophe \"'\""}, - {'(', "open parenthesis \"(\""}, - {')', "closing parenthesis \")\""}, - {'*', "asterisk \"*\""}, - {'+', "plus sign \"+\""}, - {',', "comma \",\""}, - {'-', "minus sign \"-\""}, - {'.', "period/dot/stop \".\""}, - {'/', "slash \"/\""}, - {':', "colon \":\""}, - {';', "semicolon \";\""}, - {'<', "less than sign \"<\""}, - {'=', "equals sign \"=\""}, - {'>', "greater than sign \">\""}, - {'?', "question mark \"?\""}, - {'@', "at sign \"@\""}, - {'[', "opening bracket \"[\""}, - {'\\', "backslash \"\\\""}, - {']', "closing bracket \"]\""}, - {'^', "caret/circumflex \"^\""}, - {'_', "underscore \"_\""}, - {'{', "opening brace \"{\""}, - {'|', "vertical bar \"|\""}, - {'}', "closing brace \"}\""}, - {'~', "tilde/squiggle sign \"~\""}, - {0, NULL}}; - - -char* char_to_description - (char ch) - { - char* desc; - static char _desc[50]; - c2d_el* scan; - - // describe punctuation as per table - - desc = NULL; - for (scan=c2dLookup ; scan->description!=NULL ; scan++) - { if (scan->ch == ch) { desc=scan->description; break; } } - - if (desc != NULL) - return desc; - - // describe digits as "the digit X" - - if (('0' <= ch) && (ch <= '9')) - { - sprintf (_desc, "the digit %c", ch); - return _desc; - } - - // describe uppercase as "uppercase X" - - if (('A' <= ch) && (ch <= 'Z')) - { - sprintf (_desc, "uppercase %c", ch); - return _desc; - } - - // 
describe lowercase as "lowercase x" - - if (('a' <= ch) && (ch <= 'z')) - { - sprintf (_desc, "lowercase %c", ch); - return _desc; - } - - // describe anything else as "ascii XX" - - sprintf (_desc, "ascii %02X", ch); - return _desc; - } - diff --git a/programs/lastz/src/dna_utilities.h b/programs/lastz/src/dna_utilities.h deleted file mode 100644 index f3a0ce2..0000000 --- a/programs/lastz/src/dna_utilities.h +++ /dev/null @@ -1,352 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: dna_utilities.h -// -//---------- - -#ifndef dna_utilities_H // (prevent multiple inclusion) -#define dna_utilities_H - -// other files - -#include // standard C i/o stuff -#include // (for FLT_MIN) -#include "utilities.h" // utility stuff - -// establish ownership of global variables - -#ifdef dna_utilities_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef dna_utilities_owner -int dna_utilities_dbgShowQToBest = false; // true => report quantum scores - // .. qToBest to stderr -#else -global int dna_utilities_dbgShowQToBest; -#endif - -//---------- -// -// score values-- -// Scores used for sequence comparisons are normally signed 32-bit integers, -// but the programmer can override this at compile time by defining score_type -// as one of 'F', 'D', or 'I'. Note that some effort must be taken to get -// the apostrophes into the definition from the compiler command line, such as -// -Dscore_type=\'F\' . -// -//---------- -// -// Notes: -// -// (1) For 32-bit integers, W (the worst possible score) is about minus 2.1 -// billion (-2,100,000,000). -// -// (2) We set negInfinity to a value higher than the worst possible (i.e. less -// negative. This permits values *less* than negative infinity, so that -// we can subtract values from negative infinity without running the risk -// that the result underflows (and becomes positive). 
By setting this to -// 90% of W, we are safe as long as valid scores for pieces of alignments -// do not get worse than -W/10. For 32-bit integers, negInfinity is about -// -1,900,000,000 and -W/10 is about -200,000,000. We expect gap and -// substitution scores to be much smaller than this. As long as xdrop and -// ydrop are smaller than this, we should be OK. Typically we expect -// substitution scores in the range of hundreds. -// -// (3) We set veryBadScore to a negative value such that, when added to -// negative infinity, we will not get underflow. We also want veryBadScore -// to be so bad that no alignment can possibly include a substition with -// that score. We set this to half the difference between W and negative -// infinity. On 32 bit machines this is about -100,000,000 -// -//---------- - -#if defined(score_type) -#define scoreType score_type -#else -#define scoreType 'I' -#endif - -#if (scoreType == 'I') -typedef s32 score; -#elif (scoreType == 'F') -typedef float score; -#elif (scoreType == 'D') -typedef double score; -#else -#error ***** undecipherable score type definition ***** -#endif - -#ifdef dna_utilities_owner -char dna_utilities_scoreType = scoreType; -#else -extern char dna_utilities_scoreType; -#endif - -#if (scoreType == 'I') -#ifdef override_inttypes -#define scoreFmt "%d" -#define scoreFmtStar "%*d" -#define scoreFmtSimple "%d" -#define scoreFmtScanf "%d" -#else -#include -#define scoreFmt "%" PRId32 -#define scoreFmtStar "%*" PRId32 -#define scoreFmtSimple "%" PRId32 -#define scoreFmtScanf "%" SCNd32 -#endif // override_inttypes -#elif (scoreType == 'F') -#define scoreFmt "%f" -#define scoreFmtStar "%*f" -#define scoreFmtSimple "%f" -#define scoreFmtScanf "%f" -#elif (scoreType == 'D') -#define scoreFmt "%le" -#define scoreFmtStar "%*.6le" -#define scoreFmtSimple "%lf" -#define scoreFmtScanf "%le" -#endif - - -#if (scoreType == 'I') -#define worstPossibleScore (-0x7FFFFFFF-1) -#define bestPossibleScore 0x7FFFFFFF -#elif (scoreType == 
'F') || (scoreType == 'D') -#define worstPossibleScore -FLT_MAX // nota bene: FLT_MIN wasn't what I -#define bestPossibleScore FLT_MAX // .. thought it was -#endif - -#define noScore worstPossibleScore // see note (1) -#define negInfinity ((score) (0.9*worstPossibleScore)) // see note (2) -#define veryBadScore (-((negInfinity-worstPossibleScore)/2)) // see note (3) - -// $$$ The following is problematic. Some modules need to know the size of -// the score type, at *compile* time. Standard C doesn't seem to provide -// this capability, as sizeof seems to not work in the preprocessor - -#if (scoreType == 'I') -#define score_sz 4 -#elif (scoreType == 'F') -#define score_sz 4 -#elif (scoreType == 'D') -#define score_sz 8 -#endif - -//---------- -// -// data structures and types -// -//---------- - -// scoring sets-- -// A score set defines how alignments are scored. It includes a scoring -// matrix for substitutions and a linear gap penalty. -// -// The matrix is implemented as a fixed 256x256 array. Row and column zero of -// are reserved for scores so bad that nothing will match to a NUL character. - -typedef score scorerow[256]; - -typedef struct charvec - { - s8 len; // number of characters in the vector; -1; - // .. indicates an empty vector - u8 v[4]; // two-bit codes of some of the characters in - // .. the bottleneck alphabet - } charvec; - -typedef struct scoreset - { - u8 rowChars[256]; // the characters we expect to see used as - u8 colChars[256]; // .. row (sequence1) and column (sequence2) - // .. indexes; these are zero-terminated - // .. strings; all other indexes lead to - // .. 'background' scores - int rowsAreDna; // true => row indexes are A,C,G,T - int colsAreDna; // true => column indexes are A,C,G,T - u8 badRow, badCol; // the characters used for the bad scoring - // .. row and column - int gapOpenSet; // true => gapOpen was set explicitly (as - // .. opposed to default value) - score gapOpen; // (non-negative) penalty for opening an - // .. 
alignment gap; note that we also apply - // .. gapExtend on open - int gapExtendSet; // true => gapExtend was set explicitly - score gapExtend; // (non-negative) penalty for extending an - // .. alignment gap - u8 bottleneck[5]; // bottleneck alphabet for quantum DNA (includes - // a zero terminator); only valid if rowsAreDna - // .. is false - charvec qToBest[256]; // array to map a quantum base in the row - // .. alphabet to the two-bit codes for the - // .. 'closest' character(s) in the bottleneck - // .. alphabet; indexes that are not a valid - // .. row character have qToBest[].len == -1; - // .. only valid if the row alphabet is not - // .. ACGT - u8* qToComplement; // (similar to nuc_to_complement) array to - // .. map a quantum base in the column alphabet - // .. to its complement; this may be NULL - scorerow sub[256]; // maps a nucleotide pair to a score; indexed - // .. by [c1][c2], where c1 and c2 are - // characters (in the range 0..255) from - // sequence 1 and 2, respectively - } scoreset; - -// extended score set, extended to include related command-line parameters; -// note that ss field must be first, so that we can safely typecast this -// structure to a scoreset - -typedef struct exscoreset - { // .. 
command-line parameters) - scoreset ss; - int xDropSet; - score xDrop; - int yDropSet; - score yDrop; - int stepSet; - u32 step; - int hspThresholdSet; - score hspThreshold; - int gappedThresholdSet; - score gappedThreshold; - int ballScoreSet; - float ballScoreFactor; - score ballScore; - int seedSet; - char* seed; - } exscoreset; - -// quantum dna codes-- -// A qcode describes a mapping from a quantum sequence character to its -// probability vector in {A, C, G, T} - -typedef struct qcode - { - char dna[4]; // (usually "ACGT") - double p[256][4]; // p[sym][*] = dna probability vector for sym - // sum of p[sym][*] is 1.0 if sym is meaningful - // p[0][*] is usually meaningless - } qcode; - -// adapative score thresholds - -typedef struct sthresh // (could do as a C 'union', but why bother?) - { - char t; // type of threshold ('S', 'P', 'C') - score s; // threshold as a score - double p; // threshold as a percentage of bases in target - u32 c; // threshold as a count of bases in target - } sthresh; - -//---------- -// -// globally available data in dna_utilities.c -// -//---------- - -#ifndef dna_utilities_owner -extern const s8 nuc_to_bits[256]; -extern const s8 upper_nuc_to_bits[256]; -extern const u8* bits_to_nuc; -extern const u8* bit_to_pur_pyr; -extern const u8* bits_to_pur_pyr; -extern const u8 nuc_to_complement[256]; -extern const u8 bits_to_complement[4]; -extern score HOXD70[4][4]; -extern const score HOXD70_open; -extern const score HOXD70_extend; -extern const score HOXD70_X; -extern const score HOXD70_fill; -extern score unitScores[4][4]; -extern const double unitScores_open; -extern const double unitScores_extend; -extern const double unitScores_X; -extern const double unitScores_fill; -extern const double unitScores_thresh; -#endif - -//---------- -// -// prototypes for routines in dna_utilities.c -// -//---------- - -// macros to replace ctype.h (the standard C routines isupper, toupper, etc. 
-// behave strangely on 0x80..0xFF in some implementations) -// BEWARE: these all will fail if ch has side effects (such as ch = *s++) - -#define dna_isupper(ch) (((ch)>='A')&&((ch)<='Z')) -#define dna_islower(ch) (((ch)>='a')&&((ch)<='z')) -#define dna_isalpha(ch) ((dna_isupper(c))||(dna_islower(c))) -#define dna_isprint(ch) (((ch)>=0x20)&&((ch)<=0x7E)) -#define dna_isxdigit(ch) ((((ch)>='0')&&((ch)<='9'))||(((ch)>='A')&&((ch)<='F'))||(((ch)>='a')&&((ch)<='f'))) - -#define dna_toupper(ch) (dna_islower(ch)?((ch)-'a'+'A'):(ch)) -#define dna_tolower(ch) (dna_isupper(ch)?((ch)-'A'+'a'):(ch)) -#define dna_toprint(ch) (dna_isprint(ch)?(ch):'*') - -// prototypes for real routines - -scoreset* new_dna_score_set (score template[4][4], - score xScore, score fillScore, - score gapOpen, score gapExtend); -void free_score_set (char* id, scoreset* ss); -scoreset* copy_score_set (scoreset* ss); -scoreset* masked_score_set (scoreset* ss); -exscoreset* read_score_set_by_name (char* name); -exscoreset* read_score_set (FILE* f, char* name); -void ambiguate_n (scoreset* ss, - score nVsN, score nVsNonN); -void ambiguate_iupac (scoreset* ss, - score nVsN, score nVsNonN); -void write_score_set_by_name (char* name, scoreset* ss, - int withGapScores); -void write_score_set (FILE* f, char* name, scoreset* ss, - int withGapScores); -void write_score_set_as_ints (FILE* f, char* name, scoreset* ss, - int withGapScores); -void dump_dna_score_set (FILE* f, scoreset* ss); -score string_to_score (const char* s); -void scale_score_set (scoreset* ss, double scale); -int round_score (double v); -void print_score_matrix (FILE* f, scoreset* ss, int withExtras); -void print_score_matrix_lf (FILE* f, scoreset* ss, int withExtras, - char lineFeedCh); -void print_score_matrix_prefix (FILE* f, scoreset* ss, int withExtras, - char* prefix); -void dump_score_set (FILE* f, scoreset* ss, - u8* rowChars, u8* colChars); -void dump_lower_score_set (FILE* f, scoreset* ss); -void dump_full_score_set (FILE* f, 
scoreset* ss); -void print_quantum_word (FILE* f, qcode* coding, u8* q, - u32 wordLen); -void print_quantum_dna_match (FILE* f, qcode* coding, u8* q, u8* d, - u32 wordLen); -char* quantum_word_string (u8* q, u32 wordLen, int symWidth); -score max_in_score_matrix (scoreset* ss); -score min_in_score_matrix (scoreset* ss); -void resolve_score_thresh (sthresh* threshold, u32 denom); -sthresh string_to_score_thresh (const char* s); -char* score_thresh_to_string (const sthresh* threshold); -double blastz_score_to_ncbi_bits (score bzScore); -double blastz_score_to_ncbi_expectation (score bzScore); -qcode* new_quantum_code (void); -qcode* read_quantum_code_by_name (char* name); -qcode* read_quantum_code (FILE* f, char* name); -int print_dna_similarities (FILE* f, const char* s1, const char* s2, - int n); -char* bits_to_nuc_string (u64 word, int numChars); -double entropy (u8* s, u8* t, int len); -double entropy_lower_ok (u8* s, u8* t, int len); -u64 rev_comp_by_pairs (u64 word, int length); -u64 rev_comp_by_bits (u64 word, int length); -char* char_to_description (char ch); - -#undef global -#endif // dna_utilities_H diff --git a/programs/lastz/src/edit_script.c b/programs/lastz/src/edit_script.c deleted file mode 100755 index 208f0a7..0000000 --- a/programs/lastz/src/edit_script.c +++ /dev/null @@ -1,879 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: edit_script.c -// -//---------- -// -// edit_script-- -// Support for representing alignments as a series of substitute, insert, -// and delete. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff - -#define edit_script_owner // (make this the owner of its globals) -#include "edit_script.h" // interface to this module - -//---------- -// -// prototypes for private functions -// -//---------- - -static void edit_script_make_room (editscript** s, u32 entries); -static void edit_script_put (editscript** s, u32 op, u32 rpt); - -//---------- -// -// free_align_list-- -// Dispose of a list of alignments. -// -//---------- -// -// Arguments: -// alignel* a: The list of alignments to dispose of. -// -// Returns: -// (nothing) -// -//---------- - -void free_align_list - (alignel* a) - { - alignel* b; - - while (a != NULL) - { - b = a->next; - free_if_valid ("free_align_list a->script", a->script); - free_if_valid ("free_align_list a", a); - a = b; - } - } - -//---------- -// -// alignment_hash-- -// Compute the hash value of an alignment. -// -//---------- -// -// Arguments: -// unspos beg1, end1: Range of positions in sequence 1. -// int rcFlags1: Flags describing reverse and complement for -// .. sequence 1; one of the rcf_xxx values defined -// .. in sequences.h. -// unspos beg2, end2: Range of positions in sequence 2. -// int rcFlags2: Flags describing reverse and complement for -// .. sequence 2. -// editscript* script: The script describing the path the alignment -// .. takes in the DP matrix. This is NULL if the -// .. caller doesn't want the path to contribute to -// .. the hash. 
-// -// Returns: -// (nothing) -// -//---------- - -struct ahtemp { unspos beg1, beg2, rcFlags1, end1, end2, rcFlags2; } ahtemp; - -u32 alignment_hash - (unspos beg1, - unspos end1, - int rcFlags1, - unspos beg2, - unspos end2, - int rcFlags2, - editscript* script) - { - struct ahtemp temp; - u32 h; - - memset (&temp, 0, sizeof(temp)); - temp.beg1 = beg1; - temp.end1 = end1; - temp.rcFlags1 = rcFlags1; - temp.beg2 = beg2; - temp.end2 = end2; - temp.rcFlags2 = rcFlags2; - h = hassock_hash (&temp, sizeof(temp)); - - if (script != NULL) - h ^= hassock_hash (script->op, script->len*sizeof(editop)); - - return h; - } - -//---------- -// -// edit_script_new-- -// Allocate an alignment edit script. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// A pointer to an empty edit script. The caller is responsible for disposing -// of this memory, for which purpose free() can be used. -// -//---------- -// -// Notes: NULL is never returned-- failure to allocate is a fatal error. -// -//---------- - -editscript* edit_script_new - (void) - { - u32 entries = 12; - editscript* s; - - // allocate - - s = zalloc_or_die ("edit_script_new", edit_script_bytes(entries)); - - // initialize; note that by use of zalloc we already have - // s->len = 0; - // s->tailOp = 0; - - s->size = entries; - - return s; - } - -//---------- -// -// edit_script_make_room-- -// Make sure an alignment edit script has enough unused entries available, and -// enlarge if it doesn't. -// -//---------- -// -// Arguments: -// editscript** s: (pointer to) The script to check/enlarge. If -// .. reallocation is required we may alter this. -// u32 entries: The number of unused entries required. -// (none) -// -// Returns: -// Nothing. Note that a reallocation failure is a fatal error. -// -//---------- - -static void edit_script_make_room - (editscript** _s, - u32 entries) - { - editscript* s = *_s; - - // do we have enough space already? 
- - entries += s->len; - if (s->size >= entries) - return; // (yes) - - // reallocate - - entries += entries/2; // (anticipate 50% future growth) - - *_s = s = realloc_or_die ("edit_script_has_room", - s, edit_script_bytes(entries)); - - s->size = entries; - } - -//---------- -// -// edit_script_copy-- -// Allocate a copy of an alignment edit script. -// -//---------- -// -// Arguments: -// editscript* s: The script to copy. -// -// Returns: -// A pointer to a copy of the edit script. The caller is responsible for -// disposing of this memory, for which purpose free() can be used. -// -//---------- -// -// Notes: NULL is never returned-- failure to allocate is a fatal error. -// -//---------- - -editscript* edit_script_copy - (editscript* s) - { - u32 entries; - size_t bytesNeeded; - editscript* newS; - - // allocate - - entries = s->len; - bytesNeeded = edit_script_bytes(entries); - newS = zalloc_or_die ("edit_script_copy", bytesNeeded); - - // initialize - - memcpy (/*to*/ newS, /*from*/ s, /*how much*/ bytesNeeded); - newS->size = entries; - - return newS; - } - -//---------- -// -// edit_script_add-- -// Add an repeated operation to an alignment edit script, merging it with the -// tail of the script if possible. -// -//---------- -// -// Arguments: -// editscript** s: The script to add to. If the script has to be -// .. enlarged, this value may change upon return. -// u32 op: The operation to add. -// unspos rpt: The repeat count (i.e. how many copies of op to add). -// -// Returns: -// (nothing). 
-// -//---------- - -void edit_script_add - (editscript** _s, - u32 op, - unspos rpt) - { - editscript* s = *_s; - editop* tail; - u32 tailRpt; - - // if this operation matches the one currently in the tail, increase the - // repeat count on the tail; if the repeat count doesn't have enough count - // left, fall through to the loop - - if (edit_op_operation(s->tailOp) == op) - { - tail = s->op + s->len-1; - tailRpt = edit_op_repeat (*tail); - - if (tailRpt + rpt <= maxEditopRepeat) - { - *tail = edit_op_add_repeat (*tail, rpt); - return; - } - else - { - *tail = edit_op (op, maxEditopRepeat); - rpt = tailRpt + rpt - maxEditopRepeat; - } - } - - // loop, adding new operation(s) to the end of the script - - while (rpt > maxEditopRepeat) - { - edit_script_put (_s, op, maxEditopRepeat); - rpt -= maxEditopRepeat; - } - - edit_script_put (_s, op, rpt); - } - -//---------- -// -// edit_script_put-- -// Add an repeated operation to an alignment edit script, tacking a new entry -// onto the script. -// -//---------- -// -// Arguments: -// editscript** s: The script to add to. If the script has to be -// .. enlarged, this value may change upon return. -// u32 op, rpt: The operation to add, including a repeat count. -// -// Returns: -// (nothing). -// -//---------- - -static void edit_script_put - (editscript** _s, - u32 op, - u32 rpt) - { - editscript* s; - - edit_script_make_room (_s, 1); // (make sure we have room for one operation) - - s = *_s; - s->op[s->len++] = edit_op (op, rpt); - s->tailOp = op; - } - -//---------- -// -// edit_script_append-- -// Copy one alignment edit script to the end of another. -// -//---------- -// -// Arguments: -// editscript** dst: (pointer to) The script to copy to. If -// .. reallocation is required we may alter this. -// editscript* src: (pointer to) The script to copy from. -// -// Returns: -// A pointer to an empty edit script. The caller is responsible for disposing -// of this memory, for which purpose free() can be used. 
-// -//---------- - -void edit_script_append - (editscript** _dst, - editscript* src) - { - editscript* dst; - editop* s, *d; - u32 toCopy; - u32 sOp; - u32 sRpt, dRpt; - - if (src->len == 0) return; - - // make sure we have enough room - - edit_script_make_room (_dst, src->len); - dst = *_dst; - - // copy dst to src - - s = src->op; - d = dst->op + dst->len-1; - toCopy = src->len; - - sOp = edit_op_operation (*s); - if (sOp == dst->tailOp) - { - dRpt = edit_op_repeat (*d); - sRpt = edit_op_repeat (*s); - if (dRpt + sRpt <= maxEditopRepeat) - *d = edit_op_add_repeat (*d, sRpt); - else - { - *(d++) = edit_op (sOp, maxEditopRepeat); - *d = edit_op (sOp, dRpt + sRpt - maxEditopRepeat); - dst->len++; - } - s++; toCopy--; - } - d++; - - memcpy (d, s, toCopy*sizeof(editop)); - - dst->len += toCopy; - dst->tailOp = src->tailOp; - } - -//---------- -// -// edit_script_reverse-- -// Reverse the items in an alignment edit script, in place. -// -//---------- -// -// Arguments: -// editscript* s: The script to modify. -// -// Returns: -// (nothing) -// -//---------- - -void edit_script_reverse - (editscript* s) - { - u32 i, j; - editop t; - - if (s->len < 2) return; - - for (i=0,j=s->len-1 ; iop[i]; s->op[i] = s->op[j]; s->op[j] = t; } - } - -//---------- -// -// edit_script_mirror-- -// Flip an alignment edit script across the main diagonal, in place. -// -// This amounts to changing deletions to insertions, and vice-versa. -// -//---------- -// -// Arguments: -// editscript* s: The script to modify. 
-// -// Returns: -// (nothing) -// -//---------- - -void edit_script_mirror - (editscript* s) - { - u32 i; - editop op; - u32 rpt; - - for (i=0 ; ilen ; i++) - { - op = s->op[i]; - switch (edit_op_operation(op)) - { - case editopIns: - rpt = edit_op_repeat(op); - s->op[i] = edit_op (editopDel,rpt); - break; - case editopDel: - rpt = edit_op_repeat(op); - s->op[i] = edit_op (editopIns,rpt); - break; - default: - // do nothing - break; - } - } - } - -//---------- -// -// edit_script_trim_head-- -// Trim some number of steps off the head of an alignment edit script, in -// place. -// -//---------- -// -// Arguments: -// editscript* s: The script to modify. -// unspos len: The number of steps to trim. A step is a one-base -// .. step, in any direction. -// -// Returns: -// (nothing) -// -//---------- - -void edit_script_trim_head - (editscript* s, - unspos len) - { - u32 i, j; - editop op; - u32 rpt; - int shortScript; - - if (s->len == 0) return; // the alignment is empty - if (len == 0) return; // nothing to trim - - // scan to find the first segment that we won't completely skip - - shortScript = true; - for (i=0 ; ilen ; i++) - { - op = s->op[i]; - rpt = edit_op_repeat(op); - if (rpt > len) - { shortScript = false; break; } - len -= rpt; - } - - if (shortScript) // the alignment didn't have enough steps - { s->len = 0; return; } - - // if we skipped whole segments, shift the remaining segments forward - - if (i > 0) - { - for (j=i ; jlen ; j++) - s->op[j-i] = s->op[j]; - s->len -= i; - } - - // if we have anything else to trim (other than those whole segments), - // trim the first segment - - if (len > 0) - { - op = edit_op_operation(op); - s->op[0] = edit_op (op,rpt-len); - } - } - -//---------- -// -// edit_script_upper_truncate-- -// Truncate an alignment edit script at the main diagonal, as it crosses from -// upper triangle to lower triangle. 
-// -// We assume that the first sequence is along the positive strand, and the -// second is along the negative strand. -// -//---------- -// -// Arguments: -// editscript* s: The script to modify. -// unspos* pos1, pos2: (pointer to) The starting position of the script, -// .. in the DP matrix, both relative to the -// .. corresponding positive strand. If we return -// .. true, the position of the truncated ending is -// .. written to these values. If the truncated -// .. alignment is empty, seqposInfinity is written to -// .. both values. -// -// Returns: -// true if truncation has occured, in which case the new ending values for -// pos1 and pos2 have been written; false otherwise. -// -//---------- -// -// notes: -// (1) Positions with respect to the diagonal are as follows: -// pos1 < pos2 => above the diagonal -// pos1 == pos2 => on the diagonal -// pos1 > pos2 => below the diagonal -// However, we will truncate at a point with either of these cases: -// pos1 == pos2 => on the diagonal -// pos1 == pos2+1 => adjacent to the diagonal (below it) -// The latter case is necessary because an alignment can cross the diagonal -// without, technically, containing any positions on the diagonal. 
-// -//---------- - -int edit_script_upper_truncate - (editscript* s, - unspos* _pos1, - unspos* _pos2) - { - u32 i; - editop op; - u32 rpt; - unspos pos1, pos2, prevPos1, prevPos2; - int reachesDiagonal; - unspos limit = 0; // (to placate compiler) - - //fprintf (stderr, "s=%08X l=%08X\n", s->size, s->len); - - // handle special cases - - if (s->len == 0) - { - // the alignment is empty - return false; - } - - pos1 = (*_pos1); - pos2 = (*_pos2); - - //fprintf (stderr, unsposSlashFmt "\n", pos1, pos2); - - if (pos1 > pos2) - { - // the alignment starts below the diagonal, discard all of it - s->len = 0; - (*_pos1) = seqposInfinity; - (*_pos2) = seqposInfinity; - return true; - } - - // scan for the first segment that touches or crosses the main diagonal - - reachesDiagonal = false; - for (i=0 ; ilen ; i++) - { - prevPos1 = pos1; - prevPos2 = pos2; - - op = s->op[i]; - rpt = edit_op_repeat(op); - op = edit_op_operation(op); - switch (op) - { - case editopSub: pos1 += rpt; pos2 -= rpt; limit = pos2+1; break; - case editopIns: pos2 -= rpt; limit = pos2; break; - case editopDel: pos1 += rpt; limit = pos2; break; - } - - if (pos1 >= limit) - { reachesDiagonal = true; break; } - } - - if (!reachesDiagonal) return false; - - //fprintf (stderr, "diagonal reached\n"); - //fprintf (stderr, unsposCommaFmt " -> " unsposCommaFmt "\n", - // prevPos1, prevPos2, pos1, pos2); - - // truncate the list at the crossing segment - - s->len = i+1; - - // split the crossing segment (unless we're lucky enough that it ended - // at the diagonal) - - if (pos1 > pos2) - { - switch (op) - { - case editopSub: - // prevPos pos new pos rpt - // (90,110) (105,95) (100,100) 10 - // (90,111) (104,95) (101,100) 11 - rpt = (prevPos2+1 - prevPos1) / 2; - s->op[i] = edit_op (editopSub,rpt); - pos1 = prevPos1 + rpt; - pos2 = prevPos2 - rpt; - break; - case editopIns: - // prevPos pos new pos rpt - // (100,110) (100,95) (100,100) 10 - rpt = prevPos2 - prevPos1; - s->op[i] = edit_op (editopIns,rpt); - pos1 
= prevPos1; - pos2 = prevPos2 - rpt; - break; - case editopDel: - // prevPos pos new pos rpt - // (90,100) (105,100) (100,100) 10 - rpt = prevPos2 - prevPos1; - s->op[i] = edit_op (editopDel,rpt); - pos1 = prevPos1 + rpt; - pos2 = prevPos2; - break; - } - - //fprintf (stderr, "truncated to %u -> " unsposCommaFmt "\n", - // rpt, pos1, pos2); - } - - (*_pos1) = pos1; - (*_pos2) = pos2; - return true; - } - -//---------- -// -// edit_script_run_of_subs, edit_script_run_of_subs_match-- -// Find the length of the current run of substitutions in an alignment edit -// script. -// -//---------- -// -// Arguments: -// editscript* s: The script being parsed. -// u32* opIx: Current parse location in the script. Upon return this -// .. is updated to point to the next operation beyond the -// .. run. -// const u8* p, q: Pointer to the sequences' nucleotides (corresponding to -// .. the parse location). These are used only to provide -// .. a match count, and can be NULL if match is NULL. -// unspos* match: Place to return the number of nucleotide matches in the -// .. run (including upper/lower mismatches). -// -// Returns: -// The length of the run. Note that this could be zero. 
-// -//---------- - -u32 edit_script_run_of_subs - (editscript* s, - u32* _opIx) - { - u32 opIx = (u32) *_opIx; - u32 rpt, run; - - run = 0; - while ((opIx < s->len) && (edit_op_operation(s->op[opIx]) == editopSub)) - { - rpt = edit_op_repeat(s->op[opIx]); opIx++; - run += rpt; - } - - *_opIx = opIx; - return run; - } - - -u32 edit_script_run_of_subs_match - (editscript* s, - u32* _opIx, - const u8* p, - const u8* q, - unspos* _match) - { - u32 opIx = *_opIx; - unspos match = *_match; - u32 rpt, run; - u8 pCh, qCh; - - run = 0; - match = 0; - while ((opIx < s->len) - && (edit_op_operation(s->op[opIx]) == editopSub)) - { - rpt = edit_op_repeat(s->op[opIx]); opIx++; - run += rpt; - while (rpt-- > 0) - { - pCh = *(p++); qCh = *(q++); - if (dna_toupper(pCh) == dna_toupper(qCh)) match++; - } - } - - *_opIx = opIx; - *_match = match; - return run; - } - -//---------- -// -// edit_script_indel_len-- -// Find the length of the current "run" of indels in an alignment edit script. -// -//---------- -// -// Arguments: -// editscript* s: The script being parsed. -// u32* opIx: Current parse location in the script. Upon return this -// .. is updated to point to the next operation beyond the -// .. indel. -// unspos* i, j: Current parse location in the sequences. Upon return -// .. these are updated. -// -// Returns: -// The length of the run. -// -//---------- - -u32 edit_script_indel_len - (editscript* s, - u32* opIx, - unspos* i, - unspos* j) - { - editop op; - u32 rpt; - - if (s->len <= (u32) *opIx) - return 0; - - op = s->op[*opIx]; - rpt = edit_op_repeat(op); - - switch (edit_op_operation(op)) - { - case editopIns: *j += rpt; break; - case editopDel: *i += rpt; break; - } - - (*opIx)++; - return rpt; - } - -//---------- -// -// edit_script_overall_len-- -// Find the length of an alignment edit script, along both sequences. -// -//---------- -// -// Arguments: -// editscript* s: The script to modify. -// unspos* i, j: Place to return the lengths. i is along the first -// .. 
sequence; j is along the second. -// -// Returns: -// (nothing) -// -//---------- - -void edit_script_overall_len - (editscript* s, - unspos* _i, - unspos* _j) - { - u32 opIx; - editop op; - u32 rpt; - unspos i, j; - - i = j = 0; - - for (opIx=0 ; opIxlen ; opIx++) - { - op = s->op[opIx]; - rpt = edit_op_repeat(op); - switch (edit_op_operation(op)) - { - case editopSub: i += rpt; j += rpt; break; - case editopIns: j += rpt; break; - case editopDel: i += rpt; break; - } - } - - (*_i) = i; - (*_j) = j; - } - -//---------- -// -// dump_edit_script-- -// Print the raw contents of an alignment edit script. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// editscript* s: The script to print. -// -// Returns: -// (nothing). -// -//---------- - -void dump_edit_script - (FILE* f, - editscript* s) - { - char* opName[4] = { "???", "INS", "DEL", "SUB" }; - u32 op, rpt; - u32 ix; - - for (ix=0 ; ixlen ; ix++) - { - op = edit_op_operation (s->op[ix]); - rpt = edit_op_repeat (s->op[ix]); - fprintf (f, "%dx%s\n", rpt, opName[op]); - } - } - diff --git a/programs/lastz/src/edit_script.h b/programs/lastz/src/edit_script.h deleted file mode 100644 index e7748f3..0000000 --- a/programs/lastz/src/edit_script.h +++ /dev/null @@ -1,106 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: edit_script.h -// -//---------- - -#ifndef edit_script_H // (prevent multiple inclusion) -#define edit_script_H - -// other files - -#include "sequences.h" // sequence stuff - -// establish ownership of global variables - -#ifdef edit_script_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// data structures and types -// -//---------- - -// linked list of alignments - -typedef struct alignel - { - struct alignel* next; - int isTrivial; - unspos beg1, beg2, // (origin-1) - end1, end2; // (inclusive) - score s; - u8* seq1, *seq2; - struct editscript* script; - u64 hspId; // (for debugging) uniquely 
identifies the - // .. hsp that led to this alignment - } alignel; - -// edit scripts; a list of insert, delete and substitute operations - -typedef u32 editop; // each consists of a 2-bit code (for insert, delete or - // .. substitute) and a 30-bit repeat count - -enum - { - editopIns = 0x1, // (second sequence has extra bases) - editopDel = 0x2, // (first sequence has extra bases) - editopSub = 0x3 - }; - -typedef struct editscript - { - u32 size; // the number of entries allocated for op[] - u32 len; // the number of entries used - editop tailOp; // most recent operation added - editop op[1]; // variable-length array of edit operations - } editscript; - -#define edit_script_ins(s,rpt) edit_script_add(s,editopIns,rpt) -#define edit_script_del(s,rpt) edit_script_add(s,editopDel,rpt) -#define edit_script_sub(s,rpt) edit_script_add(s,editopSub,rpt) - -#define edit_script_bytes(entries) (((entries)==0)?(sizeof(editscript)):((sizeof(editscript)+(((entries)-1)*sizeof(editop))))) - - -#define edit_op_operation(op) ((op) & 0x3) -#define edit_op_repeat(op) ((op) >> 2) -#define edit_op(op,rpt) (((op) & 0x3) | ((rpt) << 2)) -#define edit_op_add_repeat(eop,n) ((eop) + ((n) << 2)) - -#define maxEditopRepeat ((((u32)1)<<30)-1) - -//---------- -// -// prototypes for routines in edit_script.c -// -//---------- - -void free_align_list (alignel* a); -u32 alignment_hash (unspos beg1, unspos end1, int rcFlags1, - unspos beg2, unspos end2, int rcFlags2, - editscript* script); -editscript* edit_script_new (void); -editscript* edit_script_copy (editscript* s); -void edit_script_add (editscript** s, u32 op, unspos rpt); -void edit_script_append (editscript** dst, editscript* src); -void edit_script_reverse (editscript* s); -void edit_script_mirror (editscript* s); -void edit_script_trim_head (editscript* s, unspos len); -int edit_script_upper_truncate (editscript* s, - unspos* pos1, unspos* pos2); -u32 edit_script_run_of_subs (editscript* s, u32* opIx); -u32 
edit_script_run_of_subs_match (editscript* s, u32* opIx, - const u8* p, const u8* q, unspos* match); -u32 edit_script_indel_len (editscript* s, - u32* opIx, unspos* i, unspos* j); -void edit_script_overall_len (editscript* s, - unspos* i, unspos* j); -void dump_edit_script (FILE* f, editscript* s); - -#undef global -#endif // edit_script_H diff --git a/programs/lastz/src/gapped_extend.c b/programs/lastz/src/gapped_extend.c deleted file mode 100755 index 5b55b1d..0000000 --- a/programs/lastz/src/gapped_extend.c +++ /dev/null @@ -1,5758 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: gapped_extend.c -// -//---------- -// -// gapped_extend-- -// Support for extending anchors to alignments (gaps allowed). -// -// The Y-drop variant of dynamic programming is applied to create a gapped -// alignment by extending in both directions from each 'anchor point". We use -// the term Y-drop to distinguish this from the similar X-drop technique for -// ungapped alignments. -// -// The underlying DP algorithm here is the one shown in figure 4 of reference -// [1]. However, where the algorithm in [1] computes the entire DP matrix, we -// only compute horizontal slices of the DP matrix, with the bounds of each row -// determined by (1) any neighboring alignment segments, and (2) cells scoring -// less than Y from the max score. -// -// Throughout this module, we consider sequence 1 ("target") on the vertical -// edge of the dynamic programming matrix, and sequence 2 ("query") horizontal. -// Deletions are vertical, insertions horizontal. -// -// More detailed information can be found in the headers of the functions. -// Specifically, the function ydrop_one_sided_align() is the heart of the DP -// algorithm, and contains more details. -// -// References: -// [1] Approximate Matching of Regular Expressions. Myers and Miller, Bull. -// Math. Biol. 51 (1989), pp. 5-37. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C value limit stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "identity_dist.h" // identity distribution stuff -#include "coverage_dist.h" // query coverage distribution stuff -#include "continuity_dist.h" // query continuity distribution stuff -#include "output.h" // alignment outout format stuff - -#define gapped_extend_owner // (make this the owner of its globals) -#include "gapped_extend.h" // interface to this module - -// debugging defines - -//#define snoopAnchors // if this is defined, extra code is added to - // .. track anchors through the alignment - // .. process -//#define snoopAnchorToGapped // if this is defined, extra code is added to - // .. track anchors through the alignment - // .. process (different than snoopAnchors) -//#define snoopBlocks // if this is defined, extra code is added to - // .. track alignment blocks through the process -//#define snoopSubprobs // if this is defined, extra code is added to - // .. report alignment sub-problems -//#define snoopAlgorithm // if this is defined, extra code is added to - // .. track the dynamic programming algorithm -//#define snoopAlgorithmTrap // if this is defined, extra code is added to - // .. allow trapping the dynamic programming - // .. algorithm in a debugger, at a specific - // .. cell - -//#define debugPosA1 15384592 // if defined (and snoopAlgorithm is also), -//#define debugPosA2 11569819 // .. only information about extension of this -//#define debugPosB1 130 // .. 
particular seed is output; note that -//#define debugPosB2 136 // .. this position is origin-zero - -//#define snoopTraceback // if this is defined, extra code is added to - // .. track the dynamic programming algorithm's - // .. traceback process - -//#define debugHspImmediate // if this is defined, extra code is added to - // .. aid debugging of gappily_extend_hsps() - -//#define snoopBatches // if this is defined, extra code is added to - // .. track the processing of HSPs in batches -//#define snoopEditScripts // if this is defined, extra code is added to - // .. examine the edit scripts created by - // .. format_alignment() -//#define snoopAlignioInput // if this is defined, extra code is added to - // .. examine the alignio data structure, which - // .. is used to pass data into ydrop_align() -//#define snoopAlignioOutput // if this is defined, extra code is added to - // .. examine the results in the alignio data - // .. structure, which is used to pass data out - // .. of ydrop_align() - -//#define snoopSpecialHsp // if this is defined, extra code is added to -// // .. so that only one particular HSP is -// // .. processed -//#define specialPosA 439 // if defined (and snoopSpecialHsp is also), -//#define specialPosB 16 // .. only this HSP is processed; note that -// // .. this position is origin-zero - -//#define snoopBounds // if this is defined, extra code is added to - // .. 
examine bounding alignments - -//---------- -// -// stats to augment crude profiling -// -//---------- - -//--- this module's contribution to profiling for the whole program -- - -#ifndef dbgTiming -#define dbg_timing_set_stat(field,val) ; -#define dbg_timing_count_stat(field) ; -#define dbg_timing_report_stat(field,name) ; -#endif // not dbgTiming - -#ifdef dbgTiming -struct - { - int numExtensions; - } gappedExtendTimingStats; - -#define dbg_timing_set_stat(field,val) (gappedExtendTimingStats.field = val) -#define dbg_timing_count_stat(field) ++gappedExtendTimingStats.field -#define dbg_timing_report_stat(field,name) fprintf(stderr,"%-26s %d\n",name":",gappedExtendTimingStats.field) -#endif // dbgTiming - -//--- profiling for this module only -- - -#ifndef dbgTimingGappedExtend -#define dbg_timing_gapped_extend_sub(v) ; -#define dbg_timing_gapped_extend_add(v) ; -#define dbg_timing_gapped_extend_copy(dst,src) ; -#define dbg_timing_gapped_extend_report(f,v,s) ; -#endif // not dbgTimingGappedExtend - -#ifdef dbgTimingGappedExtend -#define read_clock() microsec_clock() -#define clocksPerSec 1000000 -u64 microsec_clock (void); // (from lastz.c) - -s64 debugClockAboveBelow = 0, - debugClockLeftRight = 0, - debugClockYdropAlign = 0, - debugClockYdropOneSidedAlign = 0, - debugClockUpdateLrBounds = 0, - debugClockNextSweepSeg = 0, - debugClockPrevSweepSeg = 0, - debugClockUpdateActiveSegs = 0, - debugClockFilterActiveSegs = 0; - -#define dbg_timing_gapped_extend_sub(v) { v -= (s64) read_clock(); } -#define dbg_timing_gapped_extend_add(v) { v += (s64) read_clock(); } -#define dbg_timing_gapped_extend_copy(dst,src) { dst = src; } - -#define dbg_timing_gapped_extend_report(f,v,s) { fprintf(f,"%-26s %.3f\n",s":",((float)(v))/clocksPerSec); } -#endif // dbgTimingGappedExtend - -//---------- -// -// private data -// -//---------- - -// miscellany - -#define negInf negInfinity - -#define anchorPeakLen 31 - -#undef min -#define min(x,y) ((x)<(y)?(x):(y)) -#undef max -#define 
max(x,y) ((x)>(y)?(x):(y)) - -#define signed_difference(a,b) (((sgnpos)(a))-((sgnpos)(b))) - -// straight alignment segments - -typedef struct aliseg - { - char type; // one of diagSeg, horzSeg, vertSeg - unspos b1, b2, e1, e2; - struct aliseg* nextSeg; - struct aliseg* prevSeg; - } aliseg; - -#define diagSeg 0 -#define horzSeg 1 // parallel to sequence 2 -#define vertSeg 2 // parallel to sequence 1 - -// gapped alignment; this starts as a single point anchor, and is then extended -// into an alignment - -typedef struct galign - { - unspos pos1, pos2; // anchor location or start of alignment (these - // .. are origin-zero) - unspos end1, end2; // end of alignment (these are inclusive) - u64 hspId; // (for debugging) uniquely identifies the - // .. hsp that led to this alignment attempt - - aliseg* firstSeg; // the alignment, in diagonal, vertical and - aliseg* lastSeg; // .. horizontal segments - - alignel* align; // second form of the alignment, for external - // .. consumption only - - struct galign *leftAlign1; // at the alignment's beginning and ending - struct galign *rightAlign1; // .. points, we keep pointers to the alignments - struct galign *leftAlign2; // .. immediately left and right of the terminus - struct galign *rightAlign2; - - aliseg* leftSeg1; // these correspond to leftAlign1, etc., and - aliseg* rightSeg1; // .. are the closest segments to the alignment - aliseg* leftSeg2; // .. (e.g. leftSeg1 is a segment in leftAlign1) - aliseg* rightSeg2; - - struct galign* next; // alignments are linked both by increasing - struct galign* prev; // .. start-point and by decreasing end-point - } galign; - -// input/output for ydrop_align() - -typedef struct alignio - { - // the following must be supplied by caller - - u8* seq1, *seq2; // target and query sequences - u8* rev1, *rev2; // reverse of target and query sequences (NOT - // .. 
reverse complement) - unspos len1, len2; // total length of sequences - unspos low1, low2; // limits of sub-interval in each sequence; low - unspos high1, high2; // .. is the leftmost position allowed; high is - // .. one past the rightmost position allowed; - // .. both are origin-0 indexes into seq->v[] - unspos anchor1, anchor2; // position to start the alignment - u64 hspId; // (for debugging) uniquely identifies the - // .. hsp that led to this alignment attempt - - scoreset* scoring; // substitution and gap scores to use - score yDrop; // value of Y-dropoff parameter - int trimToPeak; // whether y-drop should be trimmed (see - // description in ydrop_one_sided_align) - tback* tb; // block of memory for alignment trace-back - - galign* leftAlign; // closest alignment to left of anchor pt - galign* rightAlign; // closest alignment to right of anchor pt - aliseg* leftSeg; // specific segment of leftAlign - aliseg* rightSeg; // specific segment of rightAlign - - galign* aboveList; // alignments starting above the anchor point - galign* belowList; // alignments ending below the anchor point - - // the following are returned by ydrop_align() - - score s; // alignment's score - unspos start1, start2; // alignment's start on target and query - unspos stop1, stop2; // alignment's end on target and query - editscript* script; // alignment's edit script - } alignio; - -// dynamic programming structure -// -// We (try to) declare the basic cell with a power-of-two size; doing so MAY -// improve speed on some platforms, presumably because random indexing into the -// array of dpCells would involve a shift rather than a multiply; however, if -// we can't figure out what size the padding should be (or if we don't need any, -// as would be the case for dpCell_padding_sz==16), we don't add any - -#define unpadded_dpCell_sz (score_sz+score_sz+unspos_sz) - -#if (unpadded_dpCell_sz == 12) -#define dpCell_padding_sz 4 -#elif (unpadded_dpCell_sz == 20) -#define dpCell_padding_sz 
12 -#elif (unpadded_dpCell_sz == 24) -#define dpCell_padding_sz 8 -#endif - - -typedef struct dpCell - { - score DD, CC; - unspos mask; // mask out grid points that are part of previous - // .. alignments -#ifdef dpCell_padding_sz - char padding[dpCell_padding_sz]; -#endif - } dpCell; - -typedef struct dpMatrix - { - dpCell* p; - u32 len; - } dpMatrix; - -// linked list of active segments - -typedef struct activeseg - { - aliseg* seg; - unspos x; // column position where segment intersects sweep row; - // .. this is relative to the current slice of the DP - // .. matrix, e.g. ranging from LY to RY in the context - // .. of the alignment sweep - unspos lastRow; // (also relative to the alignment sweep) - char type; // one of diagSeg, horzSeg, vertSeg - char filter; - struct activeseg* next; - } activeseg; - - -#ifdef snoopSubprobs -seq* snoopSubprobsSeq1; -seq* snoopSubprobsSeq2; -#endif // snoopSubprobs - - -// segment batches-- partitioning of a segment table - -typedef struct segbatch - { - u32 start; // index (into a segment table) of the first - // .. entry in a batch - u32 end; // index (into a segment table) of the first - // .. entry NOT in a batch (i.e. the one after - // .. the last entry). - partition* part; // sequence partition that "contains" this - // .. batch; this can be NULL if we aren't - // .. dealing with partitions - } segbatch; - -typedef struct sbtable - { - u32 size; // the number of entries allocated for batch[] - u32 len; // the number of batches (the number of entries - // .. 
actually used) - segbatch batch[1]; // the batch table (variable-length array) - } sbtable; - -#define sbtable_bytes(size) (sizeof(sbtable) + (((size)-1)*sizeof(segbatch))) - -// flag related to reprting of "truncated alignments" - -static int haveReportedTruncation = false; - -//---------- -// -// prototypes for private functions -// -//---------- - -static unspos segment_peak (u8* s1, u8* s2, unspos segLength, - scoreset* scoring); -static sbtable* batched_segments (segtable* anchors, seqpartition* sp); -static galign** init_from_anchors (segtable* anchors, u32 numExtraSlots); -static int identical_sequences (seq* seq1, seq* seq2, - scoreset* scoring, score* s); -static int identical_partitioned_sequences - (seq* seq1, seq* seq2); -static int identical_partition_of_sequence - (seq* seq1, seq* seq2); -static score score_identical_partition - (seq* seq1, seq* seq2, - partition* p1, partition* p2, - scoreset* scoring); -static score score_identical_partition_of - (seq* seq1, seq* seq2, - partition* p1, - scoreset* scoring); -static void ydrop_align (alignio* io); -static score ydrop_one_sided_align (alignio* io, int reversed, - u8* A, u8* B, unspos M, unspos N, - int trimToPeak, - editscript** script, - unspos* end1, unspos* end2); -static void dp_ready (dpMatrix* dynProg, unspos needed); -static int msp_left_right (galign* obi, galign* m); -static void get_above_below (alignio* io, galign* obi, galign* oed); -static void align_left_right (galign* obi, galign* m); -static void insert_align (galign* m, galign** obi, galign** oed); -static void update_LR_bounds (int reversed, - aliseg** rightSeg, aliseg** leftSeg, - galign** rightAlign, galign** leftAlign, - unspos row, unspos anchor1, unspos anchor2, - sgnpos* L, sgnpos* R, - unspos* LY, unspos* RY); -static sgnpos next_sweep_seg (int lookRight, aliseg** bp, galign** mp, - unspos row, unspos anchor1, unspos anchor2); -static sgnpos prev_sweep_seg (int lookRight, aliseg** bp, galign** mp, - unspos row, unspos anchor1, 
unspos anchor2); -static void update_active_segs (int reversed, activeseg** active, - galign** alignList, dpCell* dp, - unspos row, unspos anchor1, unspos anchor2, - unspos LY, unspos RY); -static void build_active_seg (int reversed, activeseg* act, dpCell* dp, - unspos row, unspos anchor1, unspos anchor2, - unspos LY, unspos RY); -static activeseg* add_new_active (int reversed, activeseg* active, - galign* alignList, dpCell* dp, - unspos row, unspos anchor1, unspos anchor2, - unspos LY, unspos RY); -static void filter_active_segs (activeseg** active, int filter); -static alignel* format_alignment (alignio* io, galign* m); -static void save_seg (galign* m, - unspos b1, unspos b2, unspos e1, unspos e2); - -#ifdef snoopAlignioInput -static void dump_alignio_input (FILE* f, alignio* io); -#endif // snoopAlignioInput -#if ((defined(snoopAlignioOutput)) || (defined(snoopEditScripts))) -static void dump_alignio_output(FILE* f, alignio* io); -#endif // snoopAlignioOutput OR snoopEditScripts - -static u64 count_paired_bases (galign* mp); -static void warn_for_paired_bases_limit (seq* seq2, u64 maxPairedBases, - int overlyPairedKeep); - -//---------- -// -// reduce_to_points-- -// Convert each segment in a table to it's "peak". The definition of peak is -// the midpoint of the highest scoring subsegment of a given length. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// seq* seq2: The second sequence. -// scoreset* scoring: The scoring scheme to use. -// segtable* anchors: The segment table to modify. 
-// -// Returns: -// (nothing) -// -//---------- - -void reduce_to_points - (seq* seq1, - seq* seq2, - scoreset* scoring, - segtable* anchors) - { - u32 ix; - segment* seg; - unspos peak; - int reportAnchor = false; - - if (gapped_extend_dbgShowAnchors) - { - if (anchors->len == 0) - fprintf (stderr, "reduce_to_points: no anchors\n"); - else - fprintf (stderr, "reduce_to_points: %s anchors\n", - ucommatize(anchors->len)); - } - - for (ix=0,seg=anchors->seg ; ixlen ; ix++,seg++) - { - if (gapped_extend_dbgShowAnchors) - { - reportAnchor = ((gapped_extend_dbgShowAnchorsHowOften == 0) - || (ix % gapped_extend_dbgShowAnchorsHowOften == 0)); - if (reportAnchor) - fprintf (stderr, "reduce_to_points: reducing [%s] (" - unsposSlashFmt " " unsposFmt - " diag=" sgnposFmt - " score=" scoreFmtSimple ")", - commatize(ix), - seg->pos1, seg->pos2, seg->length, - diagNumber(seg->pos1,seg->pos2), - seg->s); - } - - peak = segment_peak (seq1->v+seg->pos1, seq2->v+seg->pos2, seg->length, - scoring); - seg->pos1 += peak; - seg->pos2 += peak; - seg->length = 0; - - if (reportAnchor) - fprintf (stderr, " to (" unsposSlashFmt " " unsposFmt ")\n", - seg->pos1, seg->pos2, seg->length); - } - - gapped_extend_add_stat (numAnchors, anchors->len); - } - - -static unspos segment_peak - (u8* s1, - u8* s2, - unspos segLength, - scoreset* scoring) - { - u8* t1 = s1; - u8* t2 = s2; - score similarity, best; - unspos ix, peak; - - if (segLength <= anchorPeakLen) - peak = segLength / 2; - else - { - similarity = 0; - for (ix=0 ; ixsub[*t1++][*t2++]; - //fprintf (stderr, "%c %c " scoreFmtSimple " " scoreFmtSimple "\n", - // t1[-1], t2[-1], scoring->sub[t1[-1]][t2[-1]], similarity); - } - best = similarity; - peak = anchorPeakLen / 2; - - for ( ; ixsub[*s1++][*s2++]; - similarity += scoring->sub[*t1++][*t2++]; - //fprintf (stderr, "%c %c " scoreFmtSimple " " scoreFmtSimple "\n", - // t1[-1], t2[-1], scoring->sub[t1[-1]][t2[-1]], similarity); - if (similarity > best) - { - best = similarity; - peak = ix 
- (anchorPeakLen / 2); - } - } - - gapped_extend_count_stat (numPeaks); - gapped_extend_add_stat (totalPeakScore, best); - } - - return peak; - } - -//---------- -// -// gapped_extend-- -// performed gapped extension given a set of anchor segments. -// -//---------- -// -// Arguments: -// seq* seq1: The sequence being searched. -// u8* rev1: The reverse (NOT reverse complement) of seq1, -// .. as a zero-terminated string. -// seq* seq2: The sequence being searched for. -// u8* rev2: The reverse of the seq2 (analagous to rev1). -// int inhibitTrivial: true => don't output the trivial self-alignment. -// scoreset* scoring: The scoring scheme to use. -// segtable* anchors: The anchor segments. -// void* tb: Memory in which to track gapped alignment -// .. traceback. -// int allBounds: true => bound gapped alignments by *all* gapped -// .. extensions of higher-scoring HSPs (a -// .. la blastz) -// false => bound gapped alignments only by gapped -// .. extensions that meet the score -// .. threshold -// score yDrop: Threshold to stop gapped extensions; if the -// .. score drops off by more than yDrop, extension -// .. stops -// int trimToPeak: Whether y-drop should be trimmed (see -// .. description in ydrop_one_sided_align). -// sthresh scoreThresh: Minimum score required; gapped alignments are -// .. discarded if they score less than this. -// u64 maxPairedBases: Maximum number of "paired bases" we'll allow. -// .. A paired base is a match or substitution in -// .. the DP matrix. If we exceed this limit, we -// .. abort processing. However, any gapped -// .. alignments we find up to that point are part -// .. of what we return. A value of zero inidcates -// .. there is no limit. -// int overlyPairedWarn: true => write a warning message when a query -// .. .. exceeds the maxPairedBases -// .. .. threshold. -// int overlyPairedKeep: -// How we should treat alignments for queries that -// .. exceed the maxPairedBases threshold. -// .. 
false => we discard all the alignments -// .. true => we output whatever alignments we -// .. .. happened to find prior to -// .. .. exceeding the limit -// -// Returns: -// A linked list of alignments. The caller is responsible for disposing of -// these. The caller must also deallocate other memory; see note (1) below. -// -//---------- -// -// Notes: -// -// (1) This routine calls other routines that allocate long-term memory. The -// Calling program is repsonsible for making sure that this memory is -// disposed of at the appropriate time (usually at program exit). This -// should be done by calling free_segment_batches(). -// -//---------- - -//=== stuff for gapped_extend_verbosity === - -#define debugGappedExtendVerbosity_1 \ - if (gapped_extend_verbosity >= 2) \ - { \ - pos1 = mp->pos1; \ - pos2 = mp->pos2; \ - \ - if (sp1->p != NULL) \ - { \ - part = lookup_partition (seq1, pos1); \ - pos1 += part->sepBefore + 1; \ - } \ - if (sp2->p != NULL) \ - { \ - part = lookup_partition (seq2, pos2); \ - pos2 += part->sepBefore + 1; \ - } \ - \ - pos1 += seq1->startLoc; \ - pos2 += seq2->startLoc; \ - \ - fprintf (stderr, "processing anchor #%u (of %u)" \ - " hspId=" u64Fmt \ - " (" unsposSlashFmt ")" \ - " " unsposSlashFmt "\n", \ - i+1, anchors->len, \ - mp->hspId, \ - mp->pos1, mp->pos2, \ - pos1, pos2); \ - } - -#define debugGappedExtendVerbosity_2 \ - if (gapped_extend_verbosity >= 2) \ - { \ - pos1 = mp->pos1; len1 = mp->end1 - pos1; \ - pos2 = mp->pos2; len2 = mp->end2 - pos2; \ - \ - if (sp1->p != NULL) \ - { \ - part = lookup_partition (seq1, pos1); \ - pos1 += part->sepBefore + 1; \ - } \ - if (sp2->p != NULL) \ - { \ - part = lookup_partition (seq2, pos2); \ - pos2 += part->sepBefore + 1; \ - } \ - \ - pos1 += seq1->startLoc; \ - pos2 += seq2->startLoc; \ - \ - fprintf (stderr, "alignment block" \ - " score=" scoreFmtSimple \ - " at (" unsposSlashFmt ") " unsposSlashFmt \ - " length " unsposSlashFmt "\n", \ - mp->align->s, \ - mp->pos1, mp->pos2, pos1, 
pos2, \ - len1, len2); \ - } - - -//=== stuff for gapped_extend_dbgShowIdentity === -// nota bene: positions reported below are 1-based (not 0-based) - -#define debugGappedExtendDbgShowIdentity_1 \ - if (gapped_extend_dbgShowIdentity) \ - printf ("discarding " unsposSlashSFmt " score=" scoreFmtSimple "\n", \ - mp->pos1+1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", \ - mp->pos2+1, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", \ - mp->align->s); - -#define debugGappedExtendDbgShowIdentity_2 \ - if (gapped_extend_dbgShowIdentity) \ - printf ("discarding " unsposSlashSFmt " score=" scoreFmtSimple "\n", \ - mp->pos1+1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", \ - mp->pos2+1, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", \ - mp->align->s); - -#define debugGappedExtendDbgShowIdentity_3 \ - if (gapped_extend_dbgShowIdentity) \ - printf ("keeping " unsposSlashSFmt " score=" scoreFmtSimple "\n", \ - mp->pos1+1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", \ - mp->pos2+1, ((seq2->revCompFlags & rcf_rev) != 0)? 
"-" : "+", \ - mp->align->s); - - -//=== stuff for snoopAnchors === - -#ifndef snoopAnchors -#define debugSnoopAnchors_1 ; -#endif // not snoopAnchors - -#ifdef snoopAnchors - -#define debugSnoopAnchors_1 \ - { \ - u32 ix; \ - fprintf (stderr,"===== msp =====\n"); \ - for (ix=0 ; ixlen ; ix++) \ - { \ - fprintf (stderr,"anchors[%3d,hspid=" u64Fmt "] " unsposSlashFmt " " unsposFmt " " scoreFmt, \ - ix, anchors->seg[ix].hspId, \ - anchors->seg[ix].pos1+1, \ - anchors->seg[ix].pos2+1, \ - anchors->seg[ix].length, \ - anchors->seg[ix].s); \ - fprintf (stderr," msp[%3d] " unsposSlashFmt " " unsposSlashFmt "\n", \ - ix, msp[ix]->pos1+1, msp[ix]->pos2+1, \ - msp[ix]->end1, msp[ix]->end2); \ - } \ - } - -#endif // snoopAnchors - - -//=== stuff for snoopAnchorToGapped === - -#ifndef snoopAnchorToGapped -#define debugSnoopAnchorToGapped_1 ; -#define debugSnoopAnchorToGapped_2 ; -#define debugSnoopAnchorToGapped_3 ; -#define debugSnoopAnchorToGapped_4 ; -#define debugSnoopAnchorToGapped_5 ; -#endif // not snoopAnchorToGapped - -#ifdef snoopAnchorToGapped - -#define debugSnoopAnchorToGapped_1 \ - fprintf (stderr, "processing anchor #%u (of %u)\n", \ - i+1, anchors->len); - -#define debugSnoopAnchorToGapped_2 \ - fprintf (stderr, "anchor: " unsposSlashFmt " (diag " sgnposFmt ")\n", \ - mp->pos1, mp->pos2, diagNumber(mp->pos1,mp->pos2)); - -#define debugSnoopAnchorToGapped_3 \ - fprintf (stderr, " gapped: " unsposSlashFmt " " unsposSlashFmt " " scoreFmt "\n", \ - io.start1, io.start2, io.stop1, io.stop2, io.s); - -#define debugSnoopAnchorToGapped_4 \ - fprintf (stderr, "finished processing %u anchors\n", anchors->len); \ - fprintf (stderr, " head=%p\n", head); \ - for (a=head,i=0 ; a!=NULL ; a=a->next,i++) \ - fprintf (stderr, " [%u] %p" \ - " %s:" unsposDotsFmt " %s:" unsposDotsFmt "\n", \ - i, a, \ - seq1->header, a->beg1, a->end1, \ - seq2->header, a->beg2, a->end2); - -#define debugSnoopAnchorToGapped_5 \ - fprintf (stderr, "finished processing %u anchors\n", anchors->len); 
\ - fprintf (stderr, " paired threshold was exceeded\n"); - -#endif // snoopAnchorToGapped - -//=== stuff for snoopBlocks === - -#ifndef snoopBlocks -#define debugSnoopBlocks_1 ; -#define debugSnoopBlocks_2 ; -#define debugSnoopBlocks_3 ; -#define debugSnoopBlocks_3b ; -#define debugSnoopBlocks_4 ; -#define debugSnoopBlocks_5 ; -#endif // not snoopBlocks - -#ifdef snoopBlocks - -#define debugSnoopBlocks_1 \ - fprintf (stderr, "===== searching for alignment blocks =====\n"); - -#define debugSnoopBlocks_2 \ - fprintf (stderr, "discarding alignment block [%8p]" \ - " b " unsposFmt " " unsposFmt \ - " e " unsposFmt " " unsposFmt \ - " s " scoreFmtSimple "\n", \ - mp, mp->pos1, mp->pos2, mp->end1, mp->end2, mp->align->s); - -#define debugSnoopBlocks_3 \ - fprintf (stderr, "===== collecting alignment blocks =====\n"); - -#define debugSnoopBlocks_3b \ - fprintf (stderr, "===== discarding alignment blocks =====\n"); - -#define debugSnoopBlocks_4 \ - fprintf (stderr, "keeping alignment block [%8p -> %8p]" \ - " b " unsposFmt " " unsposFmt \ - " e " unsposFmt " " unsposFmt \ - " s " scoreFmtSimple "\n", \ - mp, mp->align, \ - mp->pos1, mp->pos2, mp->end1, mp->end2, mp->align->s); - -#define debugSnoopBlocks_5 \ - fprintf (stderr, "discarding alignment block [%8p]" \ - " b " unsposFmt " " unsposFmt \ - " e " unsposFmt " " unsposFmt \ - " s " scoreFmtSimple "\n", \ - mp, mp->pos1, mp->pos2, mp->end1, mp->end2, mp->align->s); - -#endif // snoopBlocks - - -//=== stuff for snoopSubprobs === - -#ifndef snoopSubprobs -#define debugSnoopSubprobs_1 ; -#endif // not snoopSubprobs - -#ifdef snoopSubprobs - -#define debugSnoopSubprobs_1 \ - snoopSubprobsSeq1 = seq1; \ - snoopSubprobsSeq2 = seq2; - -#endif // snoopSubprobs - - -//=== stuff for snoopBatches === - -#ifndef snoopBatches -#define debugSnoopBatches_1 ; -#endif // not snoopBatches - -#ifdef snoopBatches - -#define debugSnoopBatches_1 \ - if (doHspsInBatches) \ - { \ - partition* batPart = segBatches->batch[batIx].part; \ - 
fprintf (stderr, "batch[%u] %u..%u", \ - batIx, startSegIx, endSegIx-1); \ - if (batPart != NULL) \ - fprintf (stderr, " " unsposFmt ".." unsposFmt, \ - batPart->sepBefore+1, batPart->sepAfter); \ - fprintf (stderr, "\n"); \ - } - -#endif // snoopBatches - - -//=== stuff for tryout === - -#ifndef tryout -#define debugTriviality_1 ; -#define debugTriviality_2 ; -#define debugTriviality_3 ; -#define debugTriviality_4 ; -#define debugTriviality_5 ; -#define debugTriviality_6 ; -#endif // not tryout - -#ifdef tryout - -#define debugTriviality_1 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, "trivial?: \"%s\" " unsposFmt " \"%s\" " unsposFmt "\n", \ - name1, len1, name2, len2); \ - } - -#define debugTriviality_2 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, " sequence lengths differ\n"); \ - } - -#define debugTriviality_3 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, " alignment lengths differ\n"); \ - } - -#define debugTriviality_4 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, " sequence names differ\n"); \ - } - -#define debugTriviality_5 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, " alignment content differs\n"); \ - } - -#define debugTriviality_6 \ - if (gapped_extend_dbgTriviality) \ - { \ - fprintf (stderr, " it's trivial!\n"); \ - } - -#endif // tryout - - -//=== stuff for dbgTimingGappedExtend === - -#ifdef dbgTimingGappedExtend -void gapped_extend_timing_report - (arg_dont_complain(FILE* f)) - { - dbg_timing_gapped_extend_report (f, debugClockAboveBelow, "total time in above_below()"); - dbg_timing_gapped_extend_report (f, debugClockLeftRight, "total time in left_right()"); - dbg_timing_gapped_extend_report (f, debugClockYdropAlign, "total time in ydrop_align()"); - dbg_timing_gapped_extend_report (f, debugClockYdropOneSidedAlign, " ydrop_one_sided_align()"); - dbg_timing_gapped_extend_report (f, debugClockUpdateLrBounds, " update_lr_bounds()"); - 
dbg_timing_gapped_extend_report (f, debugClockNextSweepSeg, " next_sweep_seg()"); - dbg_timing_gapped_extend_report (f, debugClockPrevSweepSeg, " prev_sweep_seg()"); - dbg_timing_gapped_extend_report (f, debugClockUpdateActiveSegs, " update_active_segs()"); - dbg_timing_gapped_extend_report (f, debugClockFilterActiveSegs, " filter_active_segs()"); - } -#endif // dbgTimingGappedExtend - - -//=== stuff for snoopEditScripts === - -#ifndef snoopEditScripts -#define debugSnoopEditScripts_1 ; -#define debugSnoopEditScripts_2 ; -#define debugSnoopEditScripts_3 ; -#define debugSnoopEditScripts_4 ; -#define debugSnoopEditScripts_5 ; -#endif // not snoopEditScripts - -#ifdef snoopEditScripts - -#define debugSnoopEditScripts_1 \ - fprintf (stderr, "(adding full length trivial alignment)\n"); - -#define debugSnoopEditScripts_2 \ - fprintf (stderr, "(adding full length trivial alignment vs. partition %u)\n", \ - trivialPartIx); - -#define debugSnoopEditScripts_3 \ - fprintf (stderr, "(adding trivial alignment for partition %u)\n", \ - partIx); - -#define debugSnoopEditScripts_4 \ - { \ - char* opName[4] = { "?", "I", "D", "S" }; \ - alignel* aa = mp->align; \ - u32 opIx, op, rpt; \ - fprintf (stderr, " " unsposDotsFmt " vs " unsposDotsFmt \ - " (diag " sgnposFmt ")" \ - " score=" scoreFmt, \ - aa->beg1, aa->end1, aa->beg2, aa->end2, \ - diagNumber(aa->beg1,aa->beg2), \ - aa->s); \ - if (aa->isTrivial) fprintf (stderr, " (trivial)"); \ - fprintf (stderr, "\n "); \ - for (opIx=0 ; opIxscript->len ; opIx++) \ - { \ - op = edit_op_operation (aa->script->op[opIx]); \ - rpt = edit_op_repeat (aa->script->op[opIx]); \ - fprintf (stderr, " %dx%s", rpt, opName[op]); \ - } \ - fprintf (stderr, "\n"); \ - } - -#define debugSnoopEditScripts_5 \ - fprintf (stderr, " (alignment is empty)\n"); - -#endif // snoopEditScripts - - -//=== stuff for snoopSpecialHsp === - -#ifndef snoopSpecialHsp -#define debugsnoopSpecialHsp_1 ; -#endif // not snoopSpecialHsp - -#ifdef snoopSpecialHsp - -#define 
debugsnoopSpecialHsp_1 \ - { \ - if ((mp->pos1 != specialPosA) || (mp->pos2 != specialPosB)) \ - { \ - fprintf (stderr," Ignoring msp[%3d] " unsposSlashFmt " " unsposSlashFmt "\n", \ - i, mp->pos1+1, mp->pos2+1, mp->end1, mp->end2); \ - continue; \ - } \ - } - -#endif // snoopSpecialHsp - - -//=== finally, the actual function gapped_extend() === - -alignel* gapped_extend - (seq* seq1, - u8* rev1, - seq* seq2, - u8* rev2, - int inhibitTrivial, - scoreset* scoring, - segtable* anchors, - tback* tb, - int allBounds, - score yDrop, - int trimToPeak, - sthresh scoreThresh, - u64 maxPairedBases, - int overlyPairedWarn, - int overlyPairedKeep) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* p1, *p2; - galign** msp; - alignel* a; - score s; - alignio io; - galign* mp, *mpNext; - galign* orderBegInc, *orderEndDec; - alignel* head, *last; - aliseg* bp, *bq; - u32 i; - unspos pos1, pos2, len1, len2; - partition* part; - int partitionedTriviality, delayedCheckForTrivial; - int doHspsInBatches; - sbtable* segBatches; - u32 batIx, startSegIx, endSegIx, partIx; - int trivialPartIx = -1; - u32 freeSlot; - u64 pairedBases = 0, newPairedBases; - - orderBegInc = orderEndDec = NULL; // (compiler appeasement) - - if (scoreThresh.t != 'S') - suicidef ("gapped_extend can't handle score threshold %s", - score_thresh_to_string (&scoreThresh)); - - // create a gapped alignment table containing one entry for each HSP (plus - // an additional slot for each possible trivial self-alignment-- see - // note (1) in init_from_anchors); note that batched_segments sorts the - // HSPs in descreasing score order (per batch) - - doHspsInBatches = false; - if (gapped_extend_dbgAllowBatches) - doHspsInBatches = (sp1->p != NULL); // (seq1 is partitioned) - - if (!doHspsInBatches) segBatches = batched_segments (anchors, NULL); - else segBatches = batched_segments (anchors, sp1); - - if ((sp1->p == NULL) || (sp2->p == NULL)) - msp = init_from_anchors (anchors, 
1); - else - { - u32 numExtraSlots = sp1->len; - if (sp2->len < numExtraSlots) numExtraSlots = sp2->len; - msp = init_from_anchors (anchors, numExtraSlots); - } - - debugSnoopAnchors_1; - debugSnoopBlocks_1; - debugSnoopSubprobs_1; - - // set up the "io" block, which is used for communication with lower-level - // routines. - - io.seq1 = seq1->v; - io.seq2 = seq2->v; - io.rev1 = rev1; - io.rev2 = rev2; - io.low1 = 0; io.len1 = io.high1 = seq1->len; - io.low2 = 0; io.len2 = io.high2 = seq2->len; - - io.scoring = scoring; - io.yDrop = yDrop; - io.trimToPeak = trimToPeak; - - if (tb == NULL) - suicide ("gapped_extend was given a NULL traceback pointer."); - io.tb = tb; - - // special case check for trivial alignments of identical sequences; if the - // two sequences are identical, we add the trivial self alignment to the - // table immediately (so it can prevent some other alignment near the main - // diagonal from merging with major portions of the trivial alignment); - // note that if trivial alignments aren't desired we will discard them - // later, after we've extended all the other anchors - // - // see note (1) in init_from_anchors-- the gapped alignment table has - // extra slots to allow for the addition of these trivial self-alignments; - // there is one extra slot for min(nTarget,nQuery), where e.g. nTarget is - // the number of partitions in the target (or 1, if it is not partitioned). 
- // - // the check for trivialty becomes more complicated if we are dealing with - // partitioned sequences - // - // also note that the more complicated test provides *no guarantee* that - // it will prevent other other alignments from merging into the diagonal; - // it is possible that two identical sequences will end up with an aligment - // that only has part of the main diagonal, and no alignment covering the - // complete main diagonal; the author does not consider this a significant - // failure because it can only happen when the user does *not* tell the - // program she is doing a self-alignment - - partitionedTriviality = false; - delayedCheckForTrivial = ((inhibitTrivial) - && ((sp1->p != NULL) || (sp2->p != NULL))); - - if ((sp1->p != NULL) // (seq1 is partitioned) - && (sp2->p == NULL)) // (seq2 is not partitioned) - { - // $$$ this could be modified to check seq2 vs *every* partition in - // $$$ .. seq1, but would require some restructuring of the code; we - // $$$ .. don't expect there are several identical sequences in seq1 - trivialPartIx = identical_partition_of_sequence (seq1, seq2); - partitionedTriviality = (trivialPartIx != -1); - delayedCheckForTrivial = (inhibitTrivial) && (!partitionedTriviality); - } - else if ((sp1->p != NULL) // (seq1 is partitioned) - && (sp2->p != NULL)) // (seq2 is partitioned) - { - // $$$ this could be modified to check every partition in seq1 vs every - // $$$ .. partition in seq2, but would require some restructuring of the - // $$$ .. code; we don't expect seq2 to be partitioned in normal use; - // $$$ .. moreover, this would require *many* more slots in the gapped - // $$$ .. alignment table - partitionedTriviality = identical_partitioned_sequences (seq1, seq2); - delayedCheckForTrivial = (inhibitTrivial) && (!partitionedTriviality); - } - - if (!doHspsInBatches) // empty the list of bounding - orderBegInc = orderEndDec = NULL; // .. 
alignments - - if ((sp1->p == NULL) // (seq1 is not partitioned) - && (sp2->p == NULL) // (seq2 is not partitioned) - && (identical_sequences (seq1, seq2, scoring, &s))) - { - if (doHspsInBatches) - suicidef ("internal error, attempt to add trivial self-alignment with batches in play"); - - freeSlot = anchors->len; - mp = msp[freeSlot]; - mp->pos1 = mp->pos2 = 0; - mp->end1 = mp->end2 = seq1->len-1; - mp->leftAlign1 = - mp->leftAlign2 = - mp->rightAlign1 = - mp->rightAlign2 = NULL; - mp->leftSeg1 = - mp->leftSeg2 = - mp->rightSeg1 = - mp->rightSeg2 = NULL; - mp->firstSeg = NULL; - save_seg (mp, mp->pos1, mp->pos2, mp->end1, mp->end2); - insert_align(mp, &orderBegInc, &orderEndDec); - mp->lastSeg = mp->firstSeg; - mp->firstSeg->prevSeg = mp->lastSeg->nextSeg = NULL; - a = mp->align = malloc_or_die ("gapped_extend", sizeof(alignel)); - a->script = edit_script_new(); - edit_script_sub (&a->script, seq1->len); - a->beg1 = a->beg2 = 1; - a->end1 = a->end2 = seq1->len; - a->seq1 = seq1->v; - a->seq2 = seq2->v; - if (s < scoreThresh.s) a->s = scoreThresh.s; // so it won't be discarded - else a->s = s; - a->next = NULL; - a->isTrivial = true; - debugSnoopEditScripts_1; - debugSnoopEditScripts_4; - } - - else if ((partitionedTriviality) - && (sp2->p == NULL)) // (seq2 is not partitioned) - { - if (doHspsInBatches) - suicidef ("internal error, attempt to add trivial self-alignment with batches in play"); - - freeSlot = anchors->len; - p1 = &sp1->p[trivialPartIx]; - s = score_identical_partition_of (seq1, seq2, p1, scoring); - mp = msp[freeSlot]; - mp->pos1 = p1->sepBefore + 1; - mp->pos2 = 0; - mp->end1 = p1->sepAfter - 1; - mp->end2 = seq2->len - 1; - mp->leftAlign1 = - mp->leftAlign2 = - mp->rightAlign1 = - mp->rightAlign2 = NULL; - mp->leftSeg1 = - mp->leftSeg2 = - mp->rightSeg1 = - mp->rightSeg2 = NULL; - mp->firstSeg = NULL; - save_seg (mp, mp->pos1, mp->pos2, mp->end1, mp->end2); - insert_align(mp, &orderBegInc, &orderEndDec); - mp->lastSeg = mp->firstSeg; - 
mp->firstSeg->prevSeg = mp->lastSeg->nextSeg = NULL; - a = mp->align = malloc_or_die ("gapped_extend", sizeof(alignel)); - a->script = edit_script_new(); - edit_script_sub (&a->script, seq2->len); - a->beg1 = p1->sepBefore + 2; - a->beg2 = 1; - a->end1 = p1->sepAfter; - a->end2 = seq2->len; - a->seq1 = seq1->v; - a->seq2 = seq2->v; - if (s < scoreThresh.s) a->s = scoreThresh.s; // so it won't be discarded - else a->s = s; - a->next = NULL; - a->isTrivial = true; - debugSnoopEditScripts_2; - debugSnoopEditScripts_4; - } - - else if ((partitionedTriviality) - && (sp2->p != NULL)) // (seq2 is partitioned) - { - if (doHspsInBatches) - suicidef ("internal error, attempt to add trivial self-alignment with batches in play"); - - freeSlot = anchors->len; - for (partIx=0 ; partIxlen ; partIx++) - { - p1 = &sp1->p[partIx]; - p2 = &sp2->p[partIx]; - s = score_identical_partition (seq1, seq2, p1, p2, scoring); - mp = msp[freeSlot++]; - mp->pos1 = p1->sepBefore + 1; - mp->pos2 = p2->sepBefore + 1; - mp->end1 = p1->sepAfter - 1; - mp->end2 = p2->sepAfter - 1; - mp->leftAlign1 = - mp->leftAlign2 = - mp->rightAlign1 = - mp->rightAlign2 = NULL; - mp->leftSeg1 = - mp->leftSeg2 = - mp->rightSeg1 = - mp->rightSeg2 = NULL; - mp->firstSeg = NULL; - save_seg (mp, mp->pos1, mp->pos2, mp->end1, mp->end2); - insert_align(mp, &orderBegInc, &orderEndDec); - mp->lastSeg = mp->firstSeg; - mp->firstSeg->prevSeg = mp->lastSeg->nextSeg = NULL; - a = mp->align = malloc_or_die ("gapped_extend", sizeof(alignel)); - a->script = edit_script_new(); - edit_script_sub (&a->script, p1->sepAfter - (p1->sepBefore+1)); - a->beg1 = p1->sepBefore + 2; - a->beg2 = p2->sepBefore + 2; - a->end1 = p1->sepAfter; - a->end2 = p2->sepAfter; - a->seq1 = seq1->v; - a->seq2 = seq2->v; - if (s < scoreThresh.s) a->s = scoreThresh.s; // so it won't be discarded - else a->s = s; - a->next = NULL; - a->isTrivial = true; - debugSnoopEditScripts_3; - debugSnoopEditScripts_4; - } - } - - // process each batch of anchors - - head 
= last = NULL; // (initialize the list of qualifying gapped alignments) - - for (batIx=0 ; batIxlen ; batIx++) - { - startSegIx = segBatches->batch[batIx].start; - endSegIx = segBatches->batch[batIx].end; - debugSnoopBatches_1; - - if (doHspsInBatches) // empty the list of bounding - orderBegInc = orderEndDec = NULL; // .. alignments, for this batch - - // convert each anchor in this batch to a gapped extension, processing - // the anchors from high score to low (they've previously been sorted - // into that order) - - for (i=startSegIx ; ileftAlign1; - io.rightAlign = mp->rightAlign1; - io.leftSeg = mp->leftSeg1; - io.rightSeg = mp->rightSeg1; - - // find the closest alignments ending before or starting after this - // anchor - - debugSnoopAnchorToGapped_2; - io.anchor1 = mp->pos1; - io.anchor2 = mp->pos2; - io.hspId = mp->hspId; - dbg_timing_gapped_extend_sub (debugClockAboveBelow); - get_above_below (&io, orderBegInc, orderEndDec); - dbg_timing_gapped_extend_add (debugClockAboveBelow); - - // if either sequence is partitioned, figure out the limits of the - // partition containing this anchor - - if (segBatches->batch[batIx].part != NULL) - { p1 = segBatches->batch[batIx].part; goto set_limits1; } - else if (sp1->p != NULL) - { - p1 = lookup_partition (seq1, io.anchor1); - set_limits1: - io.low1 = p1->sepBefore + 1; - io.high1 = p1->sepAfter; - } - - if (sp2->p != NULL) - { - p2 = lookup_partition (seq2, io.anchor2); - io.low2 = p2->sepBefore + 1; - io.high2 = p2->sepAfter; - } - - // if we have a chore, further restrict the limits to the chore - - if (seq2->choresFile != NULL) - { - interval tInt = seq2->chore.targetInterval; - interval qInt = seq2->chore.queryInterval; - if (tInt.s > io.low1) io.low1 = tInt.s; - if (tInt.e < io.high1) io.high1 = tInt.e; - if (qInt.s > io.low2) io.low2 = qInt.s; - if (qInt.e < io.high2) io.high2 = qInt.e; - } - - // extend this anchor into a gapped alignment, in both directions - - dbg_timing_gapped_extend_sub 
(debugClockYdropAlign); - ydrop_align (&io); - dbg_timing_gapped_extend_add (debugClockYdropAlign); - - debugSnoopAnchorToGapped_3; - mp->align = format_alignment (&io, mp); - mp->pos1 = io.start1; - mp->pos2 = io.start2; - mp->end1 = io.stop1; - mp->end2 = io.stop2; - - if (mp->firstSeg == NULL) // (the gapped alignment is empty, - { // .. so skip it) - debugSnoopEditScripts_5; - continue; - } - - debugSnoopEditScripts_4; - - // record the alignment's tail and detach the circular pointer - - mp->lastSeg = mp->firstSeg->prevSeg; - mp->firstSeg->prevSeg = mp->lastSeg->nextSeg = NULL; - - // if this alignment doesn't meet the score threshold, discard it - // now; otherwise, save it and use it to bound subsequent gapped - // extensions; note that in blastz, the alignment was *always* - // used as a bound (regardless of whether it met score threshold), - // and we provide that functionality if allBounds is true (in which - // case the low-scoring alignments are discarded later) - - if ((!allBounds) && (mp->align->s < scoreThresh.s)) - { - debugGappedExtendDbgShowIdentity_1; - debugSnoopBlocks_2; - free_align_list (mp->align); - - for (bp=mp->firstSeg ; bp!=NULL ; bp=bq) - { bq = bp->nextSeg; free_if_valid ("gapped_extend seg", bp); } - - continue; - } - - // record the horizontal bounding alignments/segments of this - // anchor's gapped extension, and insert it into the vertically - // ordered alignment lists - - dbg_timing_gapped_extend_sub (debugClockLeftRight); - align_left_right (orderBegInc, mp); - insert_align (mp, &orderBegInc, &orderEndDec); - dbg_timing_gapped_extend_add (debugClockLeftRight); - - // if we have a limit on the number of paired bases we'll - // accept, check for that now; if we've exceeded the limit, - // we just stop processing HSPs - - if (maxPairedBases > 0) - { - newPairedBases = count_paired_bases (mp); - pairedBases += newPairedBases; - //fprintf (stderr, "paired bases: %s of %s\n", - // commatize(newPairedBases), - // 
commatize(pairedBases)); - if (pairedBases > maxPairedBases) - { - if (overlyPairedWarn) - warn_for_paired_bases_limit (seq2, maxPairedBases, - overlyPairedKeep); - if (!overlyPairedKeep) goto discard_alignments; - break; // exit the HSP loop - } - } - - debugGappedExtendVerbosity_2; - } - - // link the high scoring alignments together into a list; discard the - // trivial self-alignment if it isn't wanted, and discard any alignments - // that don't meet the score threshold - // note that unless allBounds is true (blastz compatibility), there will - // be no low-scoring alignments to discard here - // also note that if we have more than one batch, the self-alignment - // won't be here, so we don't have to worry that we'll discard it too - // early (but since we like to worry, we do check for that case) - - debugSnoopBlocks_3 - - for (mp=orderBegInc; mp!=NULL ; mp=mpNext) - { - mpNext = mp->next; - if (mp->align->s < scoreThresh.s) - { - debugGappedExtendDbgShowIdentity_2; - goto free_up_all; - } - if ((inhibitTrivial) && (mp->align->isTrivial)) - goto free_up_all; - else if (delayedCheckForTrivial) - { - aliseg* seg = mp->firstSeg; - char* name1, *name2; - - if (mp->lastSeg != seg) goto not_trivial; - if (seg->type != diagSeg) goto not_trivial; - - if (doHspsInBatches) - suicidef ("internal error, attempt to discard trivial self-alignment with batches in play"); - - if (sp1->p == NULL) - { - name1 = seq1->header; - len1 = seq1->trueLen; - if ((name1 != NULL) && (name1[0] == '>')) - name1 = skip_whitespace(name1+1); - } - else if (segBatches->batch[batIx].part != NULL) - { p1 = segBatches->batch[batIx].part; goto set_name1; } - else - { - p1 = lookup_partition (seq1, mp->pos1); - set_name1: - name1 = &sp1->pool[p1->header]; - len1 = p1->trueLen; - } - - if (sp2->p == NULL) - { - name2 = seq2->header; - len2 = seq2->trueLen; - if ((name2 != NULL) && (name2[0] == '>')) - name2 = skip_whitespace(name2+1); - } - else - { - p2 = lookup_partition (seq2, mp->pos2); - name2 
= &sp2->pool[p2->header]; - len2 = p2->trueLen; - } - - debugTriviality_1; - - if (len1 != len2) - { debugTriviality_2; goto not_trivial; } - if (mp->end1+1 - mp->pos1 != len1) - { debugTriviality_3; goto not_trivial; } - if (strcmp (name1, name2) != 0) - { debugTriviality_4; goto not_trivial; } - - for (pos1=mp->pos1,pos2=mp->pos2 ; pos1<=mp->end1 ; pos1++,pos2++) - { - if (seq1->v[pos1] != seq2->v[pos2]) - { debugTriviality_5; goto not_trivial; } - } - - debugTriviality_6; - goto free_up_all; - } - not_trivial: - - if (head == NULL) head = last = mp->align; - else last = last->next = mp->align; - debugGappedExtendDbgShowIdentity_3; - debugSnoopBlocks_4; - goto free_up_segments; - - // discard an alignment block - - free_up_all: - - debugSnoopBlocks_5; - free_align_list (mp->align); - // (fall thru to free_up_segments) - - // discard an alignment block's segments - - free_up_segments: - for (bp=mp->firstSeg ; bp!=NULL ; bp=bq) - { bq = bp->nextSeg; free_if_valid ("gapped_extend seg", bp); } - } - - } - - debugSnoopAnchorToGapped_4; - - free_if_valid ("gapped_extend msp[]", msp); - - return head; - - ////////// - // failure exit - ////////// - - // discard all alignments we found - -discard_alignments: - - debugSnoopBlocks_3b - - for (mp=orderBegInc; mp!=NULL ; mp=mpNext) - { - mpNext = mp->next; - - // discard an alignment block and its segments - - debugSnoopBlocks_5; - free_align_list (mp->align); - - for (bp=mp->firstSeg ; bp!=NULL ; bp=bq) - { bq = bp->nextSeg; free_if_valid ("gapped_extend seg", bp); } - } - - debugSnoopAnchorToGapped_5; - - free_if_valid ("gapped_extend msp[]", msp); - - return NULL; - } - -//---------- -// -// batched_segments-- -// Separate a list of segments in batches relative to a partitioned sequence. -// Conceptually we produce one batch per partition. But if a partition -// contains no segment, it is just left out of the batch table. -// -//---------- -// -// Arguments: -// segtable* anchors: The anchors. 
Note that these well be reordered -// .. by this routine, so that every batch is -// .. sorted by descreasing score. -// seqpartition* sp: The partitioning to separate by. This should -// .. relate to sequence 1. A special value of -// .. NULL indicates that partitioning is not -// .. wanted, so we should treat the whole segment -// .. table as a single batch. -// -// Returns: -// A pointer to the batch table. This is allocated data, which the caller -// must eventually dispose of by calling free_segment_batches(). -// -//---------- - -static sbtable* _segBatches = NULL; - -static sbtable* batched_segments - (segtable* anchors, - seqpartition* sp) - { - u32 entriesNeeded; - size_t bytesNeeded; - partition* part; - segment* anc; - unspos pEnd; - u32 batIx, partIx, ancIx; - u32 startIx, endIx; - - // allocate the batch table, or resize it if it's not big enough - // nota bene: (as of this writing), we don't expect realloc to ever occur, - // because the sequence that we're partitioning with is the same - // one through the entire program; that may change in the future - - if (sp == NULL) entriesNeeded = 1; - else entriesNeeded = sp->len; - bytesNeeded = sbtable_bytes (entriesNeeded); - - if (_segBatches == NULL) - { - _segBatches = (sbtable*) malloc_or_die ("batched_segments", bytesNeeded); - _segBatches->size = entriesNeeded; - } - else if (entriesNeeded > _segBatches->size) - { - _segBatches = (sbtable*) realloc_or_die ("batched_segments", _segBatches, bytesNeeded); - _segBatches->size = entriesNeeded; - } - - // if we don't have a partitioned sequence, create one batch convering the - // entire segment list - - if (sp == NULL) - { - _segBatches->batch[0].part = NULL; - _segBatches->batch[0].start = 0; - _segBatches->batch[0].end = anchors->len; - _segBatches->len = 1; - - sort_segments (anchors, qSegmentsByDecreasingScore); - } - - // otherwise, create batches, dividing the segments by partition; note - // that we assume each segment is contained within a single 
partition - - else - { - sort_segments (anchors, qSegmentsByPos1); - - batIx = 0; - ancIx = 0; anc = &anchors->seg[ancIx]; - for (partIx=0 ; partIxlen ; partIx++) - { - if (ancIx >= anchors->len) break; - - part = &sp->p[partIx]; - pEnd = part->sepAfter; - if (pEnd < anc->pos1 + anc->length) continue; - - _segBatches->batch[batIx].part = part; - _segBatches->batch[batIx].start = startIx = ancIx++; anc++; - while ((ancIx < anchors->len) - && (pEnd >= anc->pos1 + anc->length)) - { ancIx++; anc++; } - - _segBatches->batch[batIx].end = endIx = ancIx; - batIx++; - - sort_some_segments (anchors, startIx, endIx, - qSegmentsByDecreasingScore); - } - - _segBatches->len = batIx; - } - - return _segBatches; - } - -//---------- -// -// free_segment_batches-- -// Dispose of memory alloceted by batched_segments(). -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void free_segment_batches (void) - { - free_if_valid ("free_segment_batches", _segBatches); - _segBatches = NULL; - } - -//---------- -// -// init_from_anchors-- -// Initialize a gapped alignments set from a set of anchors. -// -//---------- -// -// Arguments: -// segtable* anchors: The anchors, which we will expand into gapped -// .. alignment records, intitally as single -// .. points. -// u32 numExtraSlots: The number of extra slots to add (see note 1 -// .. below) -// -// Returns: -// A pointer to newly allocated data (see description below); failures result -// in program fatality. The caller must eventually dispose of the table, with -// a call to free(). -// -// Data Description: -// The allocated data is an array of pointers to galign blocks. It is -// allocated as a single block from the heap (which is why it can be freed wih -// a simple call to free). The first N locations form the array, and the data -// following that contains the galign blocks. 
-// -//---------- -// -// Notes: -// -// (1) We allocate extra slots in the gapped alignment table, to allow a -// trivial self-alignments to be added later. If the target sequence is -// *not* partitioned, there will be at most one trivial alignment and we -// add only one extra slot. The same is true if the query is *not* -// partitioned. If both target and query are partitioned, there can be -// a trivial alignment for each partition in whichever has fewer -// partitions, so we add that many extra slots. We push this determination -// back to the caller, who must provide the value for numExtraSlots. -// -// (2) We only copy pos1/pos2 from the anchors. We ignore the length and -// don't set end1/end2. The expectation is that the anchors are single -// points without length-- upstream code should have reduced them to -// single points, and initial downstream code only uses pos1/pos2. -// -//---------- - -static galign** init_from_anchors - (segtable* anchors, - u32 numExtraSlots) - { - int numAnchors, numSlots, ix; - u32 bytesNeeded, bytesPointers, bytesBlocks; - galign** snakes, *snake; - segment* anchor; - - numAnchors = anchors->len; - numSlots = numAnchors + numExtraSlots; - - // allocate; note that we allocate extra slots as per note (1) above - // $$$ Why is zalloc used here instead of malloc? If the table is large, - // $$$ .. zalloc will spec significant time zeroing it, but why is that - // $$$ .. necessary? This sets the anchor's length to zero, but would it - // $$$ .. be more efficient to set that explicitely, so that we don't waste - // $$$ .. time clearing out the extra slots which are often not used? 
- - bytesPointers = round_up_16 (numSlots * sizeof(galign*)); - if ((u32max-16) / sizeof(galign) < (u32) numSlots) goto overflow1; - bytesBlocks = round_up_16 (numSlots * sizeof(galign)); - bytesNeeded = bytesPointers; - bytesNeeded += bytesBlocks; if (bytesNeeded < bytesBlocks) goto overflow2; - - snakes = (galign**) zalloc_or_die ("init_from_anchors", bytesNeeded); - gapped_extend_count_stat (zallocCallsA); - gapped_extend_add_stat (zallocTotalA, bytesNeeded); - - // hook up pointers; this links the array of N pointers to the actual - // blocks - - snakes[0] = (galign*) (((char*) snakes) + bytesPointers); - - for (ix=1 ; ixseg[ix]; - snake->pos1 = anchor->pos1; - snake->pos2 = anchor->pos2; - snake->hspId = anchor->hspId; - } - - return snakes; - -// failure exits - -#define suggestions " consider raising scoring threshold (--hspthresh or --exact)" \ - " or breaking your target sequence into smaller pieces" - -overflow1: - suicidef ("in init_from_anchors(), structure size would exceed 2^32" - " (%u + %u*%u)\n" - suggestions, - bytesPointers, sizeof(galign), numSlots); - return NULL; // (doesn't get here) - -overflow2: - suicidef ("in init_from_anchors(), structure size exceeds 2^32" - " (%u + %u)\n" - suggestions, - bytesPointers, bytesBlocks); - - return NULL; // (doesn't get here) - } - -//---------- -// -// identical_sequences-- -// Determine if two sequences are identical for practical purposes. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence -// seq* seq2: The other sequence. -// scoreset* scoring: The scoring scheme to use. This may be NULL if the -// .. caller isn't interested in the score. -// score* s: Place to return the score. This may be NULL. -// -// Returns: -// true if the sequences are identical, but allowing differences in upper/lower -// case; false otherwise. 
-// -//---------- -// -// Notes: -// -// (1) (this note also applies to score_identical_partition and -// score_identical_partition_of) -// When computing the score below we have to consider the possibility -// of overflow. For example, if matches are worth 100, then a sequence -// of length 21.5M will exceed 2^31 and overflow (assuming scores are -// 32-bit ints). We prevent this by checking whether adding a positve -// score will overflow. However, for negative match scores (e.g. N vs N), -// we simply subtract. Thus we are assuming that that we won't have so -// many negative scores that we underflow. -// -//---------- - -static int identical_sequences - (seq* seq1, - seq* seq2, - scoreset* scoring, - score* _s) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - u8* a, *b; - u8 aNuc, bNuc; - score s, sub; - unspos ix; - - if ((sp1->p != NULL) || (sp2->p != NULL)) // one or the - return false; // .. other is partitioned - - if ((seq1->fileType == seq_type_qdna) != (seq2->fileType == seq_type_qdna)) - return false; - - if (seq1->len != seq2->len) - return false; - - if (seq1->revCompFlags != seq2->revCompFlags) - return false; - - a = seq1->v; - b = seq2->v; - s = 0; - - for (ix=0 ; ixlen ; ix++) - { - aNuc = (u8) dna_toupper(a[ix]); - bNuc = (u8) dna_toupper(b[ix]); - if (aNuc != bNuc) return false; - if (scoring == NULL) continue; - - sub = scoring->sub[aNuc][bNuc]; // (see note above about overflow) - if (s == bestPossibleScore) - ; - else if ((sub <= 0) || (s < bestPossibleScore - sub)) - s += sub; - else - s = bestPossibleScore; - } - - if (_s != NULL) *(_s) = s; - return true; - } - -//---------- -// -// identical_partitioned_sequences-- -// Determine if two partitioned sequences are identical for practical purposes. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// seq* seq2: The other sequence. 
-// -// Returns: -// true if the sequences are identical across their partitions, but allowing -// differences in upper/lower case; false otherwise. -// -//---------- - -static int identical_partitioned_sequences - (seq* seq1, - seq* seq2) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* p1, *p2; - u32 partIx; - u8* a, *b; - u8 aNuc, bNuc; - unspos len1, len2, ix; - - if ((sp1->p == NULL) || (sp2->p == NULL)) // one or the other - return false; // .. is *not* partitioned - - if ((seq1->fileType == seq_type_qdna) != (seq2->fileType == seq_type_qdna)) - return false; - - if (sp1->len != sp2->len) - return false; - - if (seq1->revCompFlags != seq2->revCompFlags) - return false; - - for (partIx=0 ; partIxlen ; partIx++) - { - p1 = &sp1->p[partIx]; - p2 = &sp2->p[partIx]; - len1 = p1->sepAfter - (p1->sepBefore+1); - len2 = p2->sepAfter - (p2->sepBefore+1); - if (len1 != len2) return false; - - a = seq1->v + p1->sepBefore+1; - b = seq2->v + p2->sepBefore+1; - for (ix=0 ; ixpartition; - seqpartition* sp2 = &seq2->partition; - partition* p1; - u32 partIx; - u8* a, *b; - u8 aNuc, bNuc; - unspos len1, len2, ix; - int isMatch; - - if (sp1->p == NULL) return -1; // seq1 is *not* partitioned - if (sp2->p != NULL) return -1; // seq2 is partitioned - - if ((seq1->fileType == seq_type_qdna) != (seq2->fileType == seq_type_qdna)) - return -1; - - if (seq1->revCompFlags != seq2->revCompFlags) - return -1; - - // if this is a previously unseen partitioning, prescan to find the min and - // max lengths - -#ifdef cache_partition_lengths - if (sp1 != cachedSp1) - { - cachedSp1 = sp1; - - partIx = 0; - p1 = &sp1->p[partIx]; - len1 = p1->sepAfter - (p1->sepBefore+1); - cachedMinLen = cachedMaxLen = len1; - - for (partIx=1 ; partIxlen ; partIx++) - { - p1 = &sp1->p[partIx]; - len1 = p1->sepAfter - (p1->sepBefore+1); - if (len1 < cachedMinLen) cachedMinLen = len1; - else if (len1 > cachedMaxLen) cachedMaxLen = len1; - } - - //fprintf (stderr, 
"partition lengths: " unsposFmt ".." unsposFmt "\n", - // cachedMinLen, cachedMaxLen); - } -#endif // cache_partition_lengths - - // if the second sequence is shorter or longer than all partitions, there - // can be no match - - len2 = seq2->len; - -#ifdef cache_partition_lengths - if (sp1 == cachedSp1) - { - if ((len2 < cachedMinLen) || (len2 > cachedMaxLen)) - { - // fprintf (stderr, "length out of range: " unsposFmt "\n", len2); - return -1; - } - } -#endif // cache_partition_lengths - - // otherwise, scan all partitions for a match - - for (partIx=0 ; partIxlen ; partIx++) - { - p1 = &sp1->p[partIx]; - len1 = p1->sepAfter - (p1->sepBefore+1); - if (len1 != len2) continue; - - a = seq1->v + p1->sepBefore+1; - b = seq2->v; - isMatch = true; - for (ix=0 ; ixsepAfter - (p1->sepBefore+1); - - a = seq1->v + p1->sepBefore+1; - b = seq2->v + p2->sepBefore+1; - s = 0; - for (ix=0 ; ixsub[aNuc][bNuc]; // (see note above about overflow) - if (s == bestPossibleScore) - ; - else if ((sub <= 0) || (s < bestPossibleScore - sub)) - s += sub; - else - s = bestPossibleScore; - } - - return s; - } - -//---------- -// -// score_identical_partition_of-- -// Compute the score of a partition versus a sequence, which are assumed to -// have indentical length. -// -//---------- -// -// Arguments: -// seq* seq1: The sequence containing p1. -// seq* seq2: The other sequence (not partitioned). -// partition* p1: The partition. -// scoreset* scoring: The scoring scheme to use. -// -// Returns: -// the score. -// -//---------- -// -// Notes: -// -// (1) We assume, WITHOUT CHECKING, that the partition and sequence have the -// same length. 
-// -// (2) (see note 1 in identical_sequences, regarding overflow) -// -//---------- - -static score score_identical_partition_of - (seq* seq1, - seq* seq2, - partition* p1, - scoreset* scoring) - { - u8* a, *b; - u8 aNuc, bNuc; - score s, sub; - unspos len, ix; - - len = p1->sepAfter - (p1->sepBefore+1); - - a = seq1->v + p1->sepBefore+1; - b = seq2->v; - s = 0; - for (ix=0 ; ixsub[aNuc][bNuc]; // (see note above about overflow) - if (s == bestPossibleScore) - ; - else if ((sub <= 0) || (s < bestPossibleScore - sub)) - s += sub; - else - s = bestPossibleScore; - } - - return s; - } - -//---------- -// -// new_traceback-- -// Allocate some traceback data. -// -//---------- -// -// Arguments: -// u32 size: The maximum number of bytes to allocate. -// -// Returns: -// A pointer to a newly allocated traceback data; failures result in program -// fatality. The caller must eventually dispose of the table, with a call to -// free_traceback(). -// -//---------- - -tback* new_traceback - (u32 size) - { - tback* tb; - int cells; - - // sanity check - - if (size < sizeof(tback)) - suicidef ("in new_traceback(), size can't be %u", size); - - // allocate - - cells = 1 + (size - sizeof(tback)) / sizeof(tb->space[0]); - tb = (tback*) malloc_or_die ("new_traceback", size); - - // initialize - - tb->size = cells; - - return tb; - } - -//---------- -// -// free_traceback-- -// Deallocate traceback data. -// -//---------- -// -// Arguments: -// tback* tb: The traceback data to dispose of. -// -// Returns: -// (nothing) -// -//---------- - -void free_traceback - (tback* tb) - { - free_if_valid ("free_traceback", tb); - } - -//---------- -// -// ydrop_align-- -// The Y-drop variant of dynamic programming is applied to create a gapped -// alignment by extending in both directions from an "anchor point". -// -//---------- -// -// Arguments: -// alignio* io: The collection of input arguments, and a place to store -// .. results. The input arguments describe the alignment -// .. 
problem to be solved. See the definition of the type -// .. alignio for more details. -// -// Returns: -// nothing; actual result values are in io: s, start1, start2, stop1, stop2 -// and script. -// -//---------- - -static void lop_initial_indels (alignio* io); -static void lop_final_indels (alignio* io); - - -#ifndef snoopSubprobs -#define snoopSubprobsA_V ; -#define snoopSubprobsA_1 ; -#define snoopSubprobsA_2 ; -#define snoopSubprobsA_3 ; -#endif // not snoopSubprobs - -#ifdef snoopSubprobs - -#define snoopSubprobsA_V \ - partition* part; \ - char* name1, *name2 - -#define snoopSubprobsA_1 \ - if (snoopSubprobsSeq1->partition.p == NULL) \ - { \ - name1 = (snoopSubprobsSeq1->useFullNames)? snoopSubprobsSeq1->header \ - : snoopSubprobsSeq1->shortHeader; \ - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; \ - } \ - else \ - { \ - part = lookup_partition (snoopSubprobsSeq1, anchor1); \ - name1 = &snoopSubprobsSeq1->partition.pool[part->header]; \ - } \ - \ - if (snoopSubprobsSeq2->partition.p == NULL) \ - { \ - name2 = (snoopSubprobsSeq2->useFullNames)? snoopSubprobsSeq2->header \ - : snoopSubprobsSeq2->shortHeader; \ - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; \ - } \ - else \ - { \ - part = lookup_partition (snoopSubprobsSeq2, anchor2); \ - name2 = &snoopSubprobsSeq2->partition.pool[part->header]; \ - } \ - \ - fprintf (stderr, "align:\tbck\t%s\t" unsposFmt "\t" unsposFmt \ - "\t%s\t" unsposFmt "\t" unsposFmt, \ - name1, io->len1-anchor1-2, (anchor1+1)-io->low1, \ - name2, io->len2-anchor2-2, (anchor2+1)-io->low2); - -#define snoopSubprobsA_2 \ - if (snoopSubprobsSeq1->partition.p == NULL) \ - { \ - name1 = (snoopSubprobsSeq1->useFullNames)? 
snoopSubprobsSeq1->header \ - : snoopSubprobsSeq1->shortHeader; \ - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; \ - } \ - else \ - { \ - part = lookup_partition (snoopSubprobsSeq1, anchor1); \ - name1 = &snoopSubprobsSeq1->partition.pool[part->header]; \ - } \ - \ - if (snoopSubprobsSeq2->partition.p == NULL) \ - { \ - name2 = (snoopSubprobsSeq2->useFullNames)? snoopSubprobsSeq2->header \ - : snoopSubprobsSeq2->shortHeader; \ - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; \ - } \ - else \ - { \ - part = lookup_partition (snoopSubprobsSeq2, anchor2); \ - name2 = &snoopSubprobsSeq2->partition.pool[part->header]; \ - } \ - \ - fprintf (stderr, "\nalign:\tfwd\t%s\t" unsposFmt "\t" unsposFmt \ - "\t%s\t" unsposFmt "\t" unsposFmt, \ - name1, anchor1, io->high1-(anchor1+1), \ - name2, anchor2, io->high2-(anchor2+1)); - -#define snoopSubprobsA_3 \ - fprintf (stderr, "\n"); - -#endif // snoopSubprobs - - -static void ydrop_align - (alignio* io) - { - unspos anchor1, anchor2; - unspos end1, end2; - score scoreLeft, scoreRight; - editscript* script, *scriptRight; - u32 op; -#ifdef snoopAlgorithm - int snoop = true; -#endif // snoopAlgorithm - snoopSubprobsA_V; - - if (io == NULL) - suicide ("ydrop_align() called with NULL pointer."); - -#ifdef snoopAlignioInput - dump_alignio_input (stderr, io); -#endif // snoopAlignioInput - - gapped_extend_count_stat (numAnchorsExtended); - - anchor1 = io->anchor1; - anchor2 = io->anchor2; - -#ifdef snoopAlgorithm -#if ((defined debugPosA1) && (defined debugPosB1)) - snoop = ((anchor1 == debugPosA1) && (anchor2 == debugPosA2)) - || ((anchor1 == debugPosB1) && (anchor2 == debugPosB2)); -#elif (defined debugPosA1) - snoop = (anchor1 == debugPosA1) && (anchor2 == debugPosA2); -#elif (defined debugPosB1) - snoop = (anchor1 == debugPosB1) && (anchor2 == debugPosB2); -#endif // debugPosA1,debugPosB1 -#endif // snoopAlgorithm - -#ifdef snoopAlgorithm - if (snoop) - { - fprintf (stderr, "gapE = " scoreFmtSimple "\n", 
io->scoring->gapExtend); - fprintf (stderr, "gapOE = " scoreFmtSimple "\n", io->scoring->gapOpen + io->scoring->gapExtend); - fprintf (stderr, "yDrop = " scoreFmtSimple "\n", io->yDrop); - dump_score_set (stderr, io->scoring, (u8*)"ACGTacgtNn", (u8*)"ACGTacgtNn"); - } -#endif // snoopAlgorithm - - snoopSubprobsA_1; - - script = edit_script_new(); - dbg_timing_gapped_extend_sub (debugClockYdropOneSidedAlign); - scoreLeft = ydrop_one_sided_align (io, /*reverse*/ true, - io->rev1 + io->len1 - anchor1 - 2, - io->rev2 + io->len2 - anchor2 - 2, - (anchor1+1) - io->low1, - (anchor2+1) - io->low2, - io->trimToPeak, - &script, &end1, &end2); - dbg_timing_gapped_extend_add (debugClockYdropOneSidedAlign); - io->start1 = anchor1 + 1 - end1; - io->start2 = anchor2 + 1 - end2; - - snoopSubprobsA_2; - - scriptRight = edit_script_new(); - dbg_timing_gapped_extend_sub (debugClockYdropOneSidedAlign); - scoreRight = ydrop_one_sided_align (io, /*reverse*/ false, - io->seq1 + anchor1, - io->seq2 + anchor2, - io->high1 - (anchor1+1), - io->high2 - (anchor2+1), - io->trimToPeak, - &scriptRight, &end1, &end2); - dbg_timing_gapped_extend_add (debugClockYdropOneSidedAlign); - io->stop1 = anchor1 + end1; - io->stop2 = anchor2 + end2; - - snoopSubprobsA_3; - -#ifdef snoopAlgorithm - if (snoop) - { - fprintf (stderr, "==== left edit script:\n"); - dump_edit_script (stderr, script); - fprintf (stderr, "==== right edit script:\n"); - dump_edit_script (stderr, scriptRight); - } -#endif // snoopAlgorithm - - edit_script_reverse (scriptRight); - edit_script_append (&script, scriptRight); - free_if_valid ("ydrop_align, edit script", scriptRight); - - io->s = scoreRight + scoreLeft; - io->script = script; - -#ifdef snoopAlgorithm - if (snoop) - { - fprintf (stderr, "==== combined edit script:\n"); - dump_edit_script (stderr, script); - fprintf (stderr, "\n"); - } -#endif // snoopAlgorithm - - // in rare cases, the DP algorithm can produce an alignment that starts or - // ends with a gap; remove those 
here - - if (io->script->len != 0) - { - op = edit_op_operation (io->script->op[0]); - if (op != editopSub) - lop_initial_indels (io); - // nota bene: lop_initial_indels will not set len to zero - - op = edit_op_operation (io->script->op[io->script->len-1]); - if (op != editopSub) - lop_final_indels (io); - } - -#if ((defined(snoopAlignioOutput)) || (defined(snoopEditScripts))) - dump_alignio_output (stderr, io); -#endif // snoopAlignioOutput OR snoopEditScripts - } - - -// lop_initial_indels-- - -static void lop_initial_indels - (alignio* io) - { - unspos pos1, pos2; - u32 opIx, op, rpt, numIndelOps; - - // scan past all indel ops at the start - - pos1 = io->start1; - pos2 = io->start2; - - for (opIx=0 ; opIxscript->len ; opIx++) - { - op = edit_op_operation (io->script->op[opIx]); - rpt = edit_op_repeat (io->script->op[opIx]); - if (op == editopSub) break; - else if (op == editopIns) pos2 += rpt; - else if (op == editopDel) pos1 += rpt; - } - - numIndelOps = opIx; - - // special case if the alignment was nothing but indels; assign it a - // phenominally bad score so that something downstream from us will discard - // it - - if (numIndelOps == io->script->len) - { - io->s = worstPossibleScore; - return; - } - - // modify the starting position, copy the remainder of the edit script - // forward, and rescore the alignment - - io->start1 = pos1; - io->start2 = pos2; - - io->script->len -= numIndelOps; - for (opIx=0 ; opIxscript->len ; opIx++) - io->script->op[opIx] = io->script->op[opIx+numIndelOps]; - - io->s = score_alignment (io->scoring, - io->seq1, io->start1, - io->seq2, io->start2, - io->script); - } - - -// lop_final_indels-- - -static void lop_final_indels - (alignio* io) - { - unspos pos1, pos2; - u32 opIx, op, rpt; - - // scan past all indel ops at the end - - pos1 = io->stop1; - pos2 = io->stop2; - - for (opIx=io->script->len ; opIx>0 ; ) - { - opIx--; - op = edit_op_operation (io->script->op[opIx]); - rpt = edit_op_repeat (io->script->op[opIx]); - if (op 
== editopSub) { opIx++; break; } - else if (op == editopIns) pos2 -= rpt; - else if (op == editopDel) pos1 -= rpt; - } - - // special case if the alignment was nothing but indels; assign it a - // phenominally bad score so that something downstream from us will discard - // it - - if (opIx == 0) - { - io->s = worstPossibleScore; - return; - } - - // modify the ending position, truncate the edit script, and rescore the - // alignment - - io->stop1 = pos1; - io->stop2 = pos2; - - io->script->len = opIx; - - io->s = score_alignment (io->scoring, - io->seq1, io->start1, - io->seq2, io->start2, - io->script); - } - -//---------- -// -// ydrop_one_sided_align-- -// Find an gapped alignment in one direction from a starting point. -// -//---------- -// -// Arguments: -// alignio* io: Other input arguments for the alignment problem -// .. being solved. Of interest to this routine are -// .. the scoring parameters (scoring and yDrop) and -// .. the rightSeg/leftSeg stuff, which is used to -// .. track sweep row bounds caused by previous -// .. alignments. -// int reversed: true => search to the lower-left -// false => search to the upper-right -// u8* A: Query sequence (conceptually vertical) -// .. (indexed by 1..M, see note below) -// u8* B: Subject sequence (conceptually horizontal) -// .. (indexed by 1..N, see note below) -// unspos M: Length of query sequence. -// unspos N: Length of subject sequence. -// int trimToPeak: true => always trim end of alignment to score peak. -// false => don't trim if extension runs into end of -// .. sequence (see note 12 below). -// editscript** script: Place to return the resulting edit script. Note -// .. that this is in traceback order, with the -// .. alignment's starting point corresponding to the -// .. final operation in the edt script. -// unspos* end1: Place to return the alignment end, in A. -// unspos* end2: Place to return the alignment end, in B. -// -// Returns: -// The score of the alignment. 
-// -//---------- -// -// Notes: -// - A and B are indexed starting from 1, so the pointers must point to one -// character before the start of the string. That character is ignored. -// -// - Whether reversed is true or false, the DP matrix is conceptually scanned -// from bottom-up, and from left-to-right (see diagram below). What changes -// is the equivalence between DP row,col and sequence row,col. -// -// ------ right -----> -// +----+------------------+ -// row M |A[M]| | ^ -// ... |... | | | -// row 1 |A[1]| DP Matrix | up -// row 0 | | | | -// | +------------------+ | -// | B[1] ... B[N]| -// +-----------------------+ -// col col ... col -// 0 1 N -// -//---------- -// -// Implementation Notes: -// -// (1) We manage the external bounds of the current sweep row by keeping track -// of the alignment and the specific segments that constrain the feasible -// region on the left (leftAlign,leftSeg) and right (rightAlign,rightSeg), -// if they exist. These are initially set to the constraints of the anchor -// point, and information is maintained to allow efficient updating as the -// sweep row advances. -// -// For a given row, the values L and R bracket the legal DP columns to -// compute. Neither column L nor R is a valid column in that row, being -// part of a previous alignment or off the left or right edge of the -// matrix. Similarly, LY and RY bracket the intersection of the y-drop -// feasible region and the left/right bounds. -// -// (2) The new alignment should not be allowed to intersect any previously -// found alignments, so we have to keep track of "active segments"-- gap- -// free segments of earlier alignments that intersect the sweep row within -// the feasible region. These are kept in a list that supports insertion -// and deletion when the sweep row reaches either end of an earlier -// alignment. -// -// As we advance the sweep row, update_active_segs identifies all columns -// (within the feasible range) that intersect an existing alignment. 
It -// marks the corresponding DP cells as 'masked' (the mask indicator value -// is the current row number, so that we don't have to erase them for the -// next sweep row). When the sweep encounters a masked cell, it refuses -// to step to it, and sets all three scores to -infinity so that no step -// will be taken from it. -// -// (3) Memory for traceback cells is provided by the caller (in io->tb). That -// memory is carved into pieces matching the computed rows/slices of the DP -// matrix. This routine allocates an array of indexes to the start of -// each row (with the index offset to the left so that columns index the -// correct cells). -// -// The memory for this index array is allocated here, and lives across -// calls as a static variable. We allocate it in multiples of 512K in the -// hope that this improves performance on some platforms (specifically, to -// avoid expensive mmap/munmap activity in linux with gnu malloc); it is -// not clear whether this helps. -// -// We use an array of indexes, rather than an array of pointers, to reduce -// the memory requirement on 64-bit machines. Pointers on such machines -// would require 8 bytes per entry, whereas indexes require only 4 bytes. -// -// The BLASTZ version of this function allocated a (potentially) very large -// pointer array. It allocated at least M+1 entries, one for each possible -// row. For long sequences such as human chromosome 1 (~ 250 MB), this -// required 1GB of memory if the anchor was close to either end of the -// sequence. Typical alignments are very unlikely to use that many rows, -// especially if B is much shorter than A. So in this implementation, the -// pointer array is initially allocated as 1% of the the sequence length, -// but not more than the length of B, and rounded up to a multiple of 512K -// bytes. It is "sort of" expanded at a rate of 6% whenever the DP scan -// requires more rows (in practice, it just allocates the next chunk of -// 512K bytes). 
We don't want to have to reallocate very often, because -// realloc may have to copy the entire contents to the new buffer (even if -// we don't need it to, as when we are starting a new alignment; too bad -// malloc/realloc doesn't provide a routine that lets us say "I need more, -// but I don't need you to preserve the contents). -// -// (4) We only use DP memory corresponding to one row/slice of the matrix. At -// any time, we need information about cells in one row and the row above -// it, but we only need both rows at the scan point. We use three local -// variables to represent the extra cell there. -// -// One pointer (dp) scans the cells for the lower row (the row being read), -// another (dq) scans the upper row. These pointers are the same when -// the two rows have the same left bound. When the upper row has a -// different bound (which is necessarily further to the right), dq will -// lag behind dp. -// -// (5) The inner loop scans the current row from left to the right, starting -// with column LY and ending before column RY. Note that column LY is an -// invalid column (it is column zero, or it is on a segment, or it is -// beyond ydrop); this corresponds to the traditional full DP algorithm -// computing column zero even though column one is the first character of -// B. 
-// -// The following are true for each pass through the inner loop (see note -// (4) for information about dp and dq): -// -// We enter the loop each time with -// dp points to cell to read DP[*][col] -// dq points to cell to write DP[*][col] -// C[row-1][col] in dp->CC -// C[row ][col] in c (proposed, not final) -// D[row ][col] in dp->DD -// I[row ][col] in i -// A[row ][* ] reflected in sub[*] -// B[col+1] in *b -// And during the loop, we will write -// C[row ][col ] to dq->CC, and check/update bestScore -// C[row ][col+1] to c (proposed) -// I[row ][col+1] to i -// D[row+1][col ] to dq->DD -// link into C[row][col] to *tbp -// -// It is important to realize that the stored values in one cell are for -// the C node in one row and the D node for the row above it. -// -// (6) At the beginning of each row, we grab a pointer (sub) to the row of the -// substitution scores matrix for the character A[row]. At the end of each -// pass through the inner loop, we look up the score for the character -// B[col+1]. Thus we are computing the value of a substitution into -// node C[row,col+1]. -// -// (7) The trace-back information, i.e., to trace backwards along an optimal -// path, is packed into one byte per dynamic-programming grid point. At -// each grid point there are three nodes: C which is entered by one -// diagonal edge, D which is entered by two vertical edges (from a D node -// and a C node), and I which is entered by two horizontal edges (from an -// I node and a C node). See ref [1] for further details. -// -// The right-most two bits hold cFromC (0), cFromI (1) or cFromD (2), -// telling how this cell's C node gets the maximum score from an entering -// edge. -// -// iExtend (4) and dExtend (8) relate to edges *exiting* this cell to the I -// or D node in the next column or row, respectively. iExtend is set iff -// it is better to take a horizontal edge from this cell's I node (a gap -// extend) than a horizontal edge from the C node (a gap open). 
dExtend -// has a similar meaning for vertical edges. -// -// (8) The loop to compute the first row of the DP matrix is limited by col<=N, -// but instead this ought to be col=-yDrop, will keep us from -// going very far (with typical values, this condition will stop us after -// around 300 columns). -// -// (9) When we stop computing one row at its right bound, we may still need -// to prolong computation in the row to support an overhang on the row -// above. We do so by allowing a series of insertions. -// -// The following are true for each pass through the row prolongation loop: -// dq points to cell to write DP[*][col] -// I[row][col] in i -// -// (10) We use the loop body below left even though the one below right seems -// more mathematically correct. -// -// dq->CC = c; dq->CC = c; -// (dq++)->DD = c - gapOE; c -= gapE; -// c -= gapE; (dq++)->DD = c; -// *(tbp++) = cFromI; *(tbp++) = cFromI; -// -// However, using the one on the right caused problems aligning -// -// A=GTTTTTTTTTTTTTTTCTT -// B=TTTTTTTTTTTTTTTTTTCTT -// -// It preferred the alignment on the left to the one on the right, because -// it failed to charge a second gap open penalty. -// -// ---GTTTTTTTTTTTTTTTCTT --GTTTTTTTTTTTTTTTCTT -// TTT-TTTTTTTTTTTTTTTCTT TTTTTTTTTTTTTTTTTTCTT -// -// (11) Blastz had a potential problem with the feasibility bounds LY and RY. -// update_LR_bounds occasionally will create the condition RY < LY. This -// means that no columns of this dp matrix row will be computed, because -// the col=RY will halt the routine. However, the calculation -// of tbNeeded is suspect, since it could conceivably be negative. I don't -// believe it ever goes negative in practice (because I don't think RY-LY -// is ever less than -2), but for safety sake we clip it at zero. -// -// (12) Correction for short read "mismatch shadow". The original intent of -// this routine is that the two sequences would be long (e.g. chromosomes). 
-// In that context, it made sense to trim any negatively-scoring suffix -// from the end of an alignment. However, if one of the sequences is a -// short read (e.g. 50 bases), we are usually interested in aligning the -// entire read if we can. If we don't, we will be creating a bias against -// reporting mismatches near the end of the reads. -// -// Suppose scores are +1 for a match and -7 for a mismatch. Then any -// mismatch within 7 bases of the end will create a negatively-scoring -// suffix, even if all the bases between it and the end are matches. If -// this is trimmed, that mismatch is never reported as part of an -// alignment. -// -// The argument trimToPeak is added to control this behavior. If -// trimToPeak is true, we *always* trim the alignment end back to the -// maximum scoring location. If trimToPeak is false, we don't trim if we -// happen to encounter the end of the sequence. -// -// (13) The BLASTZ loop to compute the first row of dp matrix was equivalent to -// this: -// -// dq = dynProg->p; -// dq->CC = 0; -// c = (dq++)->DD = -gapOE; -// -// for (col=1 ; (col<=N)&&(c>=-yDrop) ; col++) -// { -// dq->CC = c; -// (dq++)->DD = c - gapOE; -// c -= gapE; -// } -// -// The (c>=-yDrop) part of this test is not correct, because it is testing -// the value of the D entry in the cell, instead of the C entry. -// -// (14) Bounding segment calculations are initially done w.r.t. the full DP -// matrix with forward orientation of the sequences. When we are doing a -// reversed alignment, L and R (the left and right bounds) must be -// swapped. This swap appeared to be performed wrong in BLASTZ, and has -// been changed, as shown in this table: -// -// left right | BLASTZ | LASTZ | -// bound? bound? 
| L' R' | L' R' | -// ---------------+---------+-----------+ -// no no | L R | L R | -// no yes | -R R | -R+1 N+1 | -// yes no | L -L | 0 -L-1 | -// yes yes | -R -L | -R+1 -L-1 | -// ---------------+---------+-----------+ -// -//---------- - -//=== macros for ydrop_one_sided_align === - -#define cFromC 0 // (c bit is no bits) see note (7) -#define cFromI 1 -#define cFromD 2 -#define iExtend 4 -#define dExtend 8 -#define cidBits (cFromC | cFromI | cFromD) - -#define op_string(op) ((op==cFromC)? "SUB" : (op==cFromI)? "INS" : (op==cFromD)? "DEL" : "???") - - -#define prune \ - c = dp->CC + sub[*(b++)]; /* propose C[row][col+1] */ \ - if (col == LY) /* (we're at first col in sweep) */ \ - LY++; \ - else \ - { /* 'set' I[row][col+1] */ \ - i = dq->DD = dq->CC = negInf; /* .. and set D[row+1][col] */ \ - dq++; /* .. and set C[row+1][col] */ \ - } \ - dp++; \ - *(tbp++) = 0; - - -//=== stuff for link_to_string === - -#define include_link_to_string \ -static char* link_to_string (int link) \ - { \ - static char str[100]; \ - char* s = str; \ - *(s++) = '{'; \ - if ((link & cidBits) == 0) { *(s++) = 'c'; *(s++) = ','; } \ - if ((link & cFromI) != 0) { *(s++) = 'i'; *(s++) = ','; } \ - if ((link & cFromD) != 0) { *(s++) = 'd'; *(s++) = ','; } \ - if ((link & iExtend) != 0) { *(s++) = 'i'; *(s++) = 'i'; *(s++) = ','; } \ - if ((link & dExtend) != 0) { *(s++) = 'd'; *(s++) = 'd'; *(s++) = ','; } \ - if (s > str+1) s--; \ - *(s++) = '}'; \ - *s = 0; \ - return str; \ - } - - -//=== stuff for snoopAlgorithm === - -#ifndef snoopAlgorithm -#define snoopAlgorithm_1 ; -#define snoopAlgorithm_2 ; -#define snoopAlgorithm_3 ; -#define snoopAlgorithm_4A ; -#define snoopAlgorithm_4B ; -#define snoopAlgorithm_4C ; -#define snoopAlgorithm_5 ; -#define snoopAlgorithm_5A ; -#define snoopAlgorithm_5B ; -#define snoopAlgorithm_6 ; -#define snoopAlgorithm_7A ; -#define snoopAlgorithm_7B ; -#define snoopAlgorithm_7C ; -#define snoopAlgorithm_7D ; -#define snoopAlgorithm_7E ; -#define 
snoopAlgorithm_8 ; -#endif // not snoopAlgorithm - -#ifdef snoopAlgorithm - -static char ccSnoop[100]; -static char ddSnoop[100]; -static char iiSnoop[100]; - -static char* relative_to_infinity (score s) - { - static char str[100]; - if (s < negInf) sprintf (str, "-inf" scoreFmtSimple, s-negInf); - else if (s == negInf) sprintf (str, "-inf"); - else if (s <= negInf+2000) sprintf (str, "-inf+" scoreFmtSimple, s-negInf); - else sprintf (str, scoreFmtSimple, s); - return str; - } - -include_link_to_string - -#define snoopAlgorithm_1 \ - if (snoop) \ - { \ - char A25[26], B25[26]; \ - \ - strncpy (A25, (char*) A+1, sizeof(A25)); A25[sizeof(A25)-1] = 0; \ - strncpy (B25, (char*) B+1, sizeof(B25)); B25[sizeof(B25)-1] = 0; \ - \ - fprintf (stderr, "\nydrop_one_sided_align(" unsposSlashFmt ") %s" \ - " A=%s (" unsposFmt ")" \ - " B=%s (" unsposFmt ")\n", \ - io->anchor1, io->anchor2, \ - (reversed)?" reversed":"forward", \ - A25, M, B25, N); \ - } - -#define snoopAlgorithm_2 \ - if (snoop) \ - { \ - strcpy (ccSnoop, relative_to_infinity ((dq-1)->CC)); \ - strcpy (ddSnoop, relative_to_infinity ((dq-1)->DD)); \ - fprintf (stderr,"init edge, dynProg=%08lX dpLen=%08X\n", \ - (long) dynProg, dynProg->len); \ - fprintf (stderr, "\n[%3u,%3u]" \ - " [%d].cc=%s [%d].DD=%s" \ - " dq=%d\n", \ - 0, 0, 0, ccSnoop, 0, ddSnoop, 0); \ - } - -#define snoopAlgorithm_3 \ - if (snoop) \ - { \ - strcpy (ccSnoop, relative_to_infinity ((dq-1)->CC)); \ - strcpy (ddSnoop, relative_to_infinity ((dq-1)->DD)); \ - fprintf (stderr, "[%3u,%3u]" \ - " [" unsposFmt "].cc=%s" \ - " [" unsposFmt "].DD=%s" \ - " dq=%d\n", \ - 0, col, col, ccSnoop, col, ddSnoop, col); \ - } - -#define snoopAlgorithm_4A \ - if (snoop) \ - fprintf (stderr, "\nrow " unsposFmt \ - " LY=" unsposFmt \ - " RY=" unsposFmt, \ - row, LY, RY); - -#define snoopAlgorithm_4B \ - if (snoop) \ - fprintf (stderr, " -> L=" sgnposFmt \ - " R=" sgnposFmt \ - " LY=" unsposFmt \ - " RY=" unsposFmt, \ - L, R, LY, RY); - -#define snoopAlgorithm_4C 
\ - if (snoop) \ - fprintf (stderr, " tbNeeded=%d dp=%ld dq=%ld\n", \ - tbNeeded, (long) (dp-dynProg->p), (long) (dq-dynProg->p)); - -#define snoopAlgorithm_5 \ - if (snoop) \ - { \ - strcpy (ccSnoop, relative_to_infinity (c)); \ - strcpy (ddSnoop, relative_to_infinity (dp->DD)); \ - strcpy (iiSnoop, relative_to_infinity (i)); \ - fprintf (stderr, "[%3u,%3u] %c%c B[%ld]=%c" \ - " c=%s d=%s i=%s" \ - " dp=%ld dq=%ld", \ - row, col, A[row], B[col], (long) (b-B), *b, \ - ccSnoop, ddSnoop, iiSnoop, \ - (long) (dp-dynProg->p), (long) (dq-dynProg->p)); \ - } - -#define snoopAlgorithm_5A \ - if (snoop) \ - { \ - strcpy (ccSnoop, relative_to_infinity (bestScore)); \ - fprintf (stderr, " **new best score %s**", ccSnoop); \ - } - -#define snoopAlgorithm_5B \ - if (snoop) \ - fprintf (stderr, "\n"); - -#define snoopAlgorithm_6 \ - if (snoop) \ - { \ - strcpy (ccSnoop, relative_to_infinity ((dq-1)->CC)); \ - strcpy (ddSnoop, relative_to_infinity ((dq-1)->DD)); \ - fprintf (stderr, " [%lu].CC=%s [%lu].DD=%s link=%s\n", \ - (long) ((dq-1)-dynProg->p), ccSnoop, \ - (long) ((dq-1)-dynProg->p), ddSnoop, \ - link_to_string (link)); \ - } - -#define snoopAlgorithm_7A \ - if (snoop) \ - { \ - if ((rightSeg != NULL) && (R > 0)) \ - fprintf (stderr, "NN <- " sgnposFmt " (from R)\n", R-1); \ - else \ - fprintf (stderr, "NN <- " sgnposFmt " (from N)\n", (sgnpos)N); \ - } - -#define snoopAlgorithm_7B \ - if (snoop) \ - fprintf (stderr, "RY <- " unsposFmt \ - " (hit ydrop prior to RY)\n", \ - RY); - -#define snoopAlgorithm_7C \ - if (snoop) \ - prevRY = RY; - -#define snoopAlgorithm_7D \ - if ((snoop) && (RY != prevRY)) \ - fprintf (stderr, "RY <- " unsposFmt \ - " (feasible overhang)\n", \ - RY); - -#define snoopAlgorithm_7E \ - if (snoop) \ - fprintf (stderr, "RY <- " unsposFmt \ - " (room at right boundary)\n", \ - RY); - -#define snoopAlgorithm_8 \ - if (snoop) \ - fprintf (stderr, "alignment ends at [%3u,%3u]\n", *_end1, *_end2); - -#endif // snoopAlgorithm - -#ifdef snoopAlgorithmTrap 
-static unspos trapRow = 497; -static unspos trapCol = (unspos) -1; -#endif // snoopAlgorithmTrap - - -//=== stuff for snoopTraceback === - -#ifndef snoopTraceback -#define snoopTraceback_1 ; -#define snoopTraceback_2 ; -#define snoopTraceback_3 ; -#endif // not snoopTraceback - -#ifdef snoopTraceback - -#define snoopTraceback_1 \ - fprintf (stderr, "(tbp=%08X) tb[%3d,%3d] <- %-2d %-9s\n", \ - (u32) tbp-1, \ - row, col, link, link_to_string(link)); - -#define snoopTraceback_2 \ - fprintf (stderr, "(tbp=%08X) tb[%3d,%3d] is %-2d %-9s" \ - " prevOp=%s --> op=%d", \ - (u32) &tb->space[tbRow[row] + col], \ - row, col, link, link_to_string(link), \ - op_string(prevOp), op); - -#define snoopTraceback_3 \ - fprintf (stderr, " -> %s -> row %d, col %d\n", \ - op_string(op), row, col); - -#ifndef snoopAlgorithm -include_link_to_string -#endif //snoopAlgorithm - -#endif // snoopTraceback - - -//=== stuff for snoopSubprobs === - -#ifndef snoopSubprobs -#define snoopSubprobsB_1 ; -#define snoopSubprobsB_2 ; -#define snoopSubprobsB_3 ; -#endif // not snoopSubprobs - -#ifdef snoopSubprobs - -#define snoopSubprobsB_1 \ - if (leftSeg == NULL) \ - fprintf (stderr, "\n leftSeg=(none)"); \ - else \ - fprintf (stderr, "\n leftSeg=%s" \ - " " unsposSlashFmt \ - " " unsposSlashFmt, \ - (leftSeg->type==diagSeg)? "diag" \ - : (leftSeg->type==horzSeg)? "horz" \ - : (leftSeg->type==vertSeg)? "vert" \ - : "????", \ - leftSeg->b1, leftSeg->b2, \ - leftSeg->e1, leftSeg->e2); \ - if (rightSeg == NULL) \ - fprintf (stderr, "\n rightSeg=(none)"); \ - else \ - fprintf (stderr, "\n rightSeg=%s" \ - " " unsposSlashFmt \ - " " unsposSlashFmt, \ - (rightSeg->type==diagSeg)? "diag" \ - : (rightSeg->type==horzSeg)? "horz" \ - : (rightSeg->type==vertSeg)? 
"vert" \ - : "????", \ - rightSeg->b1, rightSeg->b2, \ - rightSeg->e1, rightSeg->e2); \ - if (!reversed) \ - fprintf (stderr, "\n L=" sgnposFmt " R=" sgnposFmt, \ - L, R); \ - else \ - { \ - sgnpos tempL = L; \ - sgnpos tempR = R; \ - sgnpos tempT = 0; \ - if ((leftSeg == NULL) && (rightSeg != NULL)) { tempL = -R+1; tempR = N+1; } \ - else if ((leftSeg != NULL) && (rightSeg == NULL)) { tempR = -L-1; tempL = 0; } \ - else if ((leftSeg != NULL) && (rightSeg != NULL)) { tempT = -L-1; tempL = -R+1; tempR = tempT; } \ - fprintf (stderr, "\n L=" sgnposFmt " R=" sgnposFmt \ - " -> L=" sgnposFmt " R=" sgnposFmt, \ - L, R, tempL, tempR); \ - } \ - fprintf (stderr, "\n ----"); - -#define snoopSubprobsB_2 \ - fprintf (stderr, "\n row " unsposFmt \ - " -> L=" sgnposFmt \ - " R=" sgnposFmt \ - " LY=" unsposFmt \ - " RY=" unsposFmt, \ - row, L, R, LY, RY); - -#ifndef collect_stats -#define snoopSubprobsB_3 \ - fprintf (stderr, "\n"); \ - fprintf (stderr, "\trows=" unsposFmt, row); -#endif // not collect_stats - -#ifdef collect_stats -#define snoopSubprobsB_3 \ - fprintf (stderr, "\n"); \ - fprintf (stderr, "\trows=" unsposFmt, row); \ - fprintf (stderr, "\tmaxDpRows=%s", commatize(gappedExtendStats.maxDpRows)); -#endif // collect_stats - -#endif // snoopSubprobs - - -//=== stuff for snoopBounds === - -#ifndef snoopBounds -#define debugSnoopBounds_1 ; -#define debugSnoopBounds_2 ; -#define debugSnoopBounds_3 ; -#endif // not snoopBounds - -#ifdef snoopBounds - -static char* segmentTypeName[3] = {"diag", "horz", "vert"}; - -#define debugSnoopBounds_1 \ - { \ - fprintf (stderr, "bounds: leftSeg(%s)=(" unsposFmt ")/" unsposFmt \ - " anchor=(" unsposFmt ")/" unsposFmt "\n", \ - segmentTypeName[(u8)leftSeg->type], \ - leftSeg->b1, leftSeg->b2, \ - anchor1, anchor2); \ - if (leftSeg->type == diagSeg) \ - fprintf (stderr, "bounds: L=(" unsposFmt "-" unsposFmt ")" \ - "-(" unsposFmt "-" unsposFmt ")" \ - "=" sgnposFmt "\n", \ - leftSeg->b2, anchor2, leftSeg->b1, anchor1, L); \ - else \ - 
fprintf (stderr, "bounds: L=(" unsposFmt "-" unsposFmt ")" \ - "=" sgnposFmt "\n", \ - leftSeg->b2, anchor2, L); \ - } - -#define debugSnoopBounds_2 \ - { \ - fprintf (stderr, "bounds: rightSeg(%s)=(" unsposFmt ")/" unsposFmt \ - " anchor=(" unsposFmt ")/" unsposFmt "\n", \ - segmentTypeName[(u8)rightSeg->type], \ - rightSeg->b1, rightSeg->b2, \ - anchor1, anchor2); \ - if (rightSeg->type == diagSeg) \ - fprintf (stderr, "bounds: R=(" unsposFmt "-" unsposFmt ")" \ - "-(" unsposFmt "-" unsposFmt ")" \ - "=" sgnposFmt "\n", \ - rightSeg->b2, anchor2, rightSeg->b1, anchor1, R); \ - else \ - fprintf (stderr, "bounds: R=(" unsposFmt "-" unsposFmt ")" \ - "=" sgnposFmt "\n", \ - rightSeg->b2, anchor2, R); \ - } - -#define debugSnoopBounds_3 \ - fprintf (stderr, "bounds: swapped to L=" sgnposFmt " R=" sgnposFmt "\n", \ - L, R); - -#endif // snoopBounds - - -//=== traceback row memory (see note (3)) === -// -// minTbRowsNeeded is the inverse of the round_up result in tbrow_needed - -#define minTbRowsNeeded (((((512*1024)/sizeof(u32))-1)*16)/17) - -static u32* tbRow = NULL; // memory to track row positions in -static u32 tbRowLen = 0; // .. 
traceback array - -static void tbrow_needed (u32 rowsNeeded); -static void tbrow_needed (u32 rowsNeeded) - { - size_t needed; - - if (rowsNeeded <= tbRowLen) return; - - needed = round_up(sizeof(u32)*(rowsNeeded+1+rowsNeeded/16), 512*1024); - tbRow = realloc_or_die ("ydrop_one_sided_align tbRow", tbRow, needed); - tbRowLen = needed / sizeof(u32); - } - -void free_traceback_rows (void) - { - free_if_valid ("free_traceback_rows", tbRow); - tbRow = NULL; - tbRowLen = 0; - } - - -//=== ydrop_one_sided_align === - -static score ydrop_one_sided_align - (alignio* io, - int reversed, // true => search to the lower-left - u8* A, // vertical sequence text - u8* B, // horizontal sequence text - unspos M, // vertical sequence length - unspos N, // horizontal sequence length - int trimToPeak, - editscript** script, - unspos* _end1, - unspos* _end2) - { - tback* tb; // tb is space provided to record traceback; - u8* tbp; // .. tbp is the current traceback cell - int tbLen; // .. and steps linearly from tbp->space; - int tbNeeded; // .. tbRow[r] is the conceptual start of - // .. the traceback cells for row r; it is - // .. indexed as tbRow[r][c] for c=LY..R', - // .. where R' is the last cell used in - // .. row r - unspos anchor1, anchor2; // anchor positions in A and B - scorerow* allSub; // substitution scores matrix - score* sub; // substitution scores vector, current row - score gapOE, gapE; // gap penalties - score yDrop; // score drop threshold - int yDropTail; // length of shortest score fall >= yDrop - - // nota bene: row, col, leftCol, L, R, - // .. LY, RY, prevLY, NN, npCol, end1 - // .. and end2 are all relative to the DP - // .. matrix - - unspos row, col = 0; // current DP grid point - unspos leftCol; // (copy of left column, for stats) - sgnpos L, R; // external column limits for current row - unspos LY, RY; // actual column limits for current row; - // .. 
("Y" is for y-drop, not cartesian Y) - unspos prevLY; // left column limit for previous row - sgnpos NN; // truncated right side bound, current row - unspos npCol; // last non-pruned cell in current row - unspos end1, end2; // end of optimal alignment - int endIsBoundary; // true => report boundaryScore instead of - // .. bestScore - score bestScore; // score of best alignment seen so far - score boundaryScore; // score of best alignment seen so far that - // .. ends at the end of either sequence - dpMatrix* dynProg; // DP cells - dpCell* dp; // scans previous row of dp matrix - dpCell* dq; // scans current row of dp matrix - u8* b; // scans horizontal sequence - score c, d, i; // running scores for DP cells - score cOpen, cNext, cTemp;// scratch values for cell scores - activeseg* active; // list of segments that intersect the - // .. sweep row within the feasible region - galign* alignList; - galign* rightAlign, *leftAlign; - aliseg* rightSeg, *leftSeg; - u8 link = 0; // traceback link - u8 op, prevOp; // edit operations -#if ((defined snoopAlgorithm) || (defined snoopAlgorithmTrap)) - int snoop = true; -#ifdef snoopAlgorithm - unspos prevRY = 0; -#endif // snoopAlgorithm -#endif // snoopAlgorithm or snoopAlgorithmTrap - - // sanity check; if either sequence is empty there's no alignment - - if ((N <= 0) || (M <= 0)) - { *(_end1) = *(_end2) = 0; return 0; } - -#if ((defined snoopAnchors) || ((defined snoopAlgorithm) && (defined debugPosA1))) - fprintf (stderr,"[hspId=" u64Fmt "] ydrop_one_sided_align(" unsposSlashFmt ") %s\n", - io->hspId, io->anchor1, io->anchor2, - (reversed)?" 
reversed":""); -#endif // snoopAnchors - -#ifdef snoopAlgorithm -#if ((defined debugPosA1) && (defined debugPosB1)) - snoop = ((io->anchor1 == debugPosA1) && (io->anchor2 == debugPosA2)) - || ((io->anchor1 == debugPosB1) && (io->anchor2 == debugPosB2)); -#elif (defined debugPosA1) - snoop = (io->anchor1 == debugPosA1) && (io->anchor2 == debugPosA2); -#elif (defined debugPosB1) - snoop = (io->anchor1 == debugPosB1) && (io->anchor2 == debugPosB2); -#endif // debugPosA1,debugPosB1 -#endif // snoopAlgorithm - - gapped_extend_count_stat (numExtensions); - dbg_timing_count_stat (numExtensions); - - snoopAlgorithm_1; - - // extract scoring constants - - allSub = io->scoring->sub; - gapE = io->scoring->gapExtend; - gapOE = io->scoring->gapOpen + gapE; // (any new gap gets both penalties) - yDrop = io->yDrop; - - tb = io->tb; - tbLen = tb->size; - - if (gapE != 0) - yDropTail = (yDrop/gapE) + 6; - else - { - // when gapE is zero, the above results would be infinite; but we can - // limit yDropTail to the distance from the length of the sequence; - // this can increase the amount of memory needed; the solution here is - // not completely sufficient; "truncating alignment" reports are still - // likely - - int maxYDropTail = 500*1000; - if (N < (unsigned int) maxYDropTail) yDropTail = N+1; - else yDropTail = maxYDropTail; - } - - // determine initial left and right constraints - - L = 0; - R = N+1; // (in blastz this was R=N) - anchor1 = io->anchor1; - anchor2 = io->anchor2; - - leftSeg = io->leftSeg; - if (leftSeg != NULL) - { - L = signed_difference (leftSeg->b2, anchor2); - if (leftSeg->type == diagSeg) - L -= signed_difference (leftSeg->b1, anchor1); - debugSnoopBounds_1; - } - - rightSeg = io->rightSeg; - if (rightSeg != NULL) - { - R = signed_difference (rightSeg->b2, anchor2); - if (rightSeg->type == diagSeg) - R -= signed_difference (rightSeg->b1, anchor1); - debugSnoopBounds_2; - } - - snoopSubprobsB_1; - - // if we're doing a reversed alignment we need to swap the 
L-R bounds (see - // note (14)) - - if (reversed) - { - sgnpos temp = 0; // (placate compiler) - if ((leftSeg == NULL) && (rightSeg != NULL)) { L = -R+1; R = N+1; } - else if ((leftSeg != NULL) && (rightSeg == NULL)) { R = -L-1; L = 0; } - else if ((leftSeg != NULL) && (rightSeg != NULL)) { temp = -L-1; L = -R+1; R = temp; } - debugSnoopBounds_3; - } - - active = NULL; - rightAlign = io->rightAlign; - leftAlign = io->leftAlign; - alignList = (!reversed)? io->aboveList - : io->belowList; - - // make sure we have a reasonable number of traceback rows to start with - // (see note (3)) - - tbrow_needed (minTbRowsNeeded); - - tbRow[0] = 0; - tbp = tb->space; - - ////////// - // compute first row of dp matrix - ////////// - - // make sure we have enough traceback and dp space for the first row - - tbNeeded = yDropTail; - if (tbNeeded > tbLen) - suicide ("not enough space in trace_back array"); - - dynProg = zalloc_or_die ("ydrop_one_sided_align dynProg", sizeof(dpMatrix)); - dp_ready (dynProg, tbNeeded); - gapped_extend_count_stat (zallocCallsB); - gapped_extend_add_stat (zallocTotalB, sizeof(dpMatrix)); - - // compute first row of dp matrix (see notes (8), (10) and (13)) - - dq = dynProg->p; - dq->CC = cTemp = 0; // set C[0][0] - c = (dq++)->DD = -gapOE; // set D[1][0] - *(tbp++) = 0; - - snoopAlgorithm_2; - - for (col=1 ; (col<=N)&&(cTemp>=-yDrop) ; col++) - { - dq->CC = cTemp = c; // set C[0][col] - (dq++)->DD = c - gapOE; // set D[1][col] - c -= gapE; - *(tbp++) = cFromI; - - snoopAlgorithm_3; - } - - gapped_extend_add_stat (dpCellsVisited, col); - - LY = 0; - RY = col; // (1 column beyond the feasible region) - - ////////// - // compute additional rows of DP matrix - ////////// - - end1 = end2 = 0; - bestScore = 0; - boundaryScore = negInf; - endIsBoundary = false; - - for (row=1; row<=M ; row++) - { -#ifdef snoopAlgorithm - if (snoop) - cTemp = 0; // (place to set a breakpoint) -#endif // snoopAlgorithm - - snoopAlgorithm_4A; - -#ifdef snoopAlgorithmTrap - if 
((snoop) && (row == trapRow) && (trapCol == (unspos) -1)) - cTemp = 0; -#endif - - // update sweep row bounds, active segments, masking - - prevLY = LY; - update_LR_bounds (reversed, - &rightSeg, &leftSeg, &rightAlign, &leftAlign, - row, anchor1, anchor2, &L, &R, &LY, &RY); - update_active_segs (reversed, &active, &alignList, dynProg->p-prevLY, - row, anchor1, anchor2, LY, RY); - - snoopAlgorithm_4B; - snoopSubprobsB_2; - - // make sure we have enough traceback and dp space for this row (see - // note (3)) - - tbrow_needed (row+1); - gapped_extend_max_stat (maxDpRows, row+1); - - if (RY < LY) RY = LY; // (see note 11) - tbNeeded = RY - LY + yDropTail; - if ((tbp - tb->space) + tbNeeded >= tbLen) - { - if (gapped_extend_inhibitTruncationReport) - goto dp_finished; - - if (!reversed) - fprintf (stderr, "truncating alignment ending at (" unsposCommaFmt ");", - end1 + anchor1 + 1, end2 + anchor2 + 1); - else - fprintf (stderr, "truncating alignment starting at (" unsposCommaFmt ");", - anchor1 + 2 - end1, anchor2 + 2 - end2); - fprintf(stderr, " anchor at (" unsposCommaFmt ")\n", anchor1, anchor2); - - if (!haveReportedTruncation) - { - haveReportedTruncation = true; - fprintf(stderr, "truncation can be reduced by using --allocate:traceback to increase traceback memory\n"); - } - goto dp_finished; - } - tbRow[row] = (tbp - tb->space) - LY; - - // set up DP pointers for this sweep row (see note (5)) - - dp_ready (dynProg, tbNeeded); // make sure we have enough DP space - dq = dynProg->p; // dq cells start at col == LY - dp = dq + LY - prevLY; // dp cells start at col == prevLY - - snoopAlgorithm_4C; - - // compute DP values for all bounded columns in this row (see note (4)) - - sub = allSub[A[row]]; - - col = leftCol = LY; - b = B + col + 1; // (b scans horizontal sequence, one column ahead) - npCol = col; // npCol records the last non-pruned position - - i = negInf; // 'set' I[row][col] - c = negInf; // propose C[row][col] - - for ( ; (colDD; // get D[row][col] - - // 
at this point d, i and c contain the DP values for the cell at - // [row][col]; the latter is the value for reaching C from previous - // grid points, but since C also has edges entering it from the - // current D and I nodes, we might improve it - // nota bene: when we *can* improve c, we make an arbitrary choice - // to prefer deletion to insertion (when i and d are equal) - // nota bene 2: all paths through this series of ifs assign a value - // to link - - if ((active != NULL) && (dp->mask == row)) - { snoopAlgorithm_5; prune; snoopAlgorithm_5B; continue; } - - if ((d > c) || (i > c)) // === we CAN improve C === - { - // nota bene: both iExtend and dExtend are set here because - // the value of the C and I (or C and D) are equal, so traceback - // may as well take a gap extend into this cell - if (d >= i) { c = d; link = cFromD | iExtend | dExtend; } - else { c = i; link = cFromI | iExtend | dExtend; } - snoopAlgorithm_5; - if (c < bestScore - yDrop) - { prune; snoopAlgorithm_5B; continue; } - -#ifndef allowBackToBackGaps - // not allowing back-to-back gaps, so we don't need to consider - // opening a gap here - i -= gapE; // 'set' I[row][col+1] - dq->DD = d - gapE; // set D[row+1][col] -#else - // back-to-back gaps are allowed, so we must consider gap opens - cOpen = c - gapOE; - d -= gapE; // set D[row+1][col] - if (cOpen > d) { dq->DD = cOpen; link &= ~dExtend; } - else dq->DD = d; - - i -= gapE; // 'set' I[row][col+1] - if (cOpen > i) { i = cOpen; link &= ~iExtend; } -#endif // allowBackToBackGaps - } - else // === we CANNOT improve C === - { - snoopAlgorithm_5; - if (c < bestScore - yDrop) - { prune; snoopAlgorithm_5B; continue; } - - if (c >= bestScore) - { - bestScore = c; end1 = row; end2 = col; endIsBoundary = false; - snoopAlgorithm_5A; - } - if ((!trimToPeak) - && (c >= boundaryScore) - && ((row == M) || (col == N))) - { boundaryScore = c; end1 = row; end2 = col; endIsBoundary = true; } - - cOpen = c - gapOE; - d -= gapE; // set D[row+1][col] - if 
(cOpen > d) { dq->DD = cOpen; link = cFromC; } - else { dq->DD = d; link = cFromC | dExtend; } - - i -= gapE; // 'set' I[row][col+1] - if (cOpen > i) i = cOpen; - else link |= iExtend; - } - - npCol = col; // save as last non-pruned position - - // save C for this column, and compute proposed C for the next - // column (see note (6)) - - cNext = (dp++)->CC+sub[*(b++)]; // propose C[row][col+1] - (dq++)->CC = c; // set C[row][col] - c = cNext; - *(tbp++) = link; // set link into C[row][col] - - snoopAlgorithm_6; - //snoopTraceback_1; - } - - gapped_extend_add_stat (dpCellsVisited, col-leftCol); - - // if the feasible region is empty, we're done - - if (LY >= RY) - goto dp_finished; - - // finish up this row, by either moving the right bound left or - // prolonging the row to support an overhang on the row above - - snoopAlgorithm_7A; - NN = ((rightSeg != NULL) && (R > 0))? (R-1) : ((sgnpos) N); - - if (RY > npCol+1) // we hit ydrop prior to RY - { - RY = npCol+1; - snoopAlgorithm_7B; - } - else - { - // the current row reached its right bound, but the row above may - // still have a feasible overhang so we prolong this row with - // insertions (see note (9)) - - snoopAlgorithm_7C; - while ((i >= bestScore - yDrop) && (((sgnpos)RY) <= NN)) - { - if (((u32)(dq - dynProg->p)) >= dynProg->len) - suicidef("(in ydrop_one_sided_align:%d, dq-dynProg->p==%d, dynProg->len=" unsposFmt ")", - __LINE__, dq - dynProg->p, dynProg->len); - dq->CC = i; // set C[row][col] - (dq++)->DD = i - gapOE; // set D[row+1][col] - i -= gapE; // 'set' I[row][col+1] - *(tbp++) = cFromI; - RY++; - } - snoopAlgorithm_7D; - } - - // terminate the cell at the right boundary, so that nothing will - // step from it (termination occurs if the if is false and we thus - // *fail* to increment RY) - - if (((sgnpos)RY) <= NN) - { - if (((u32)(dq - dynProg->p)) >= dynProg->len) - suicidef("(in ydrop_one_sided_align:%d, dq-dynProg->p==%d, dynProg->len=" unsposFmt ")", - __LINE__, dq - dynProg->p, 
dynProg->len); - dq->DD = dq->CC = negInf; // set D[row+1][col] - RY++; // .. and set C[row][col] - snoopAlgorithm_7E; - } - } - -dp_finished: - snoopSubprobsB_3; - *(_end1) = row = end1; - *(_end2) = col = end2; - - snoopAlgorithm_8; - - ////////// - // traceback the alignment to create the edit script - ////////// - -#ifdef snoopAlgorithm - if (snoop) - cTemp = 0; // (place to set a breakpoint) -#endif // snoopAlgorithm - - for (prevOp=0 ; (row>=1) || (col>0) ; prevOp=op) - { - link = tb->space[tbRow[row] + col]; - op = link & cidBits; - if ((prevOp == cFromI) && ((link & iExtend) != 0)) op = cFromI; - if ((prevOp == cFromD) && ((link & dExtend) != 0)) op = cFromD; - snoopTraceback_2 - - if (op == cFromI) { col--; edit_script_ins (script, 1); } - else if (op == cFromD) { row--; edit_script_del (script, 1); } - else { row--; col--; edit_script_sub (script, 1); } - snoopTraceback_3 - } - - filter_active_segs (&active, 2); // (disposes of everything in the list) - - free_if_valid ("ydrop_one_sided_align dynProg->p", dynProg->p); - free_if_valid ("ydrop_one_sided_align dynProg", dynProg); - - if (endIsBoundary) return boundaryScore; - else return bestScore; - } - -//---------- -// -// dp_ready-- -// Allocate the dynamic-programming "sweep row" array, and ensure that the -// first n elements exist and have been initialized. -// -//---------- -// -// Arguments: -// dpMatrix* dynProg: The current sweep array. -// unspos needed: The number of elements required. 
-// -// Returns: -// nothing; the contents of dynProg are (potentially) modified by adding -// enough empty cells to satisfy the number needed -// -//---------- - -static void dp_ready - (dpMatrix* dynProg, - unspos needed) - { - u32 oldLen; - int added; - - if (dynProg == NULL) - suicide ("dp_ready called with NULL pointer"); - - if (needed > 0xFFFFFFFF) - suicidef ("in dp_ready, number of DP cells needed exceeds limit\n" - " " unsposFmt " cells requested", needed); - - // make sure that there's room for n cells - - if (dynProg->p == NULL) - { - oldLen = 0; - dynProg->len = needed + 1000; - dynProg->p = malloc_or_die ("dp_ready", dynProg->len * sizeof(dpCell)); - } - else - { - oldLen = dynProg->len; - if (needed > oldLen) - { - dynProg->len = needed + dynProg->len/16 + 1000; - dynProg->p = realloc_or_die ("dp_ready", dynProg->p, - dynProg->len * sizeof(dpCell)); - } - } - - // 0..oldLen-1 are already in use, but zero the rest to clear their mask - // fields - - added = dynProg->len - oldLen; - if (added > 0) - memset (dynProg->p + oldLen, 0, sizeof(dynProg->p[0])*added); - - gapped_extend_max_stat (maxDpColumns, needed); - } - -//---------- -// -// msp_left_right-- -// Given MSP m, we search obi, the list of alignments ordered by beginning -// point (in vertical/seq1), to find the alignments and their contained -// segments (gap-free pieces) that are the closest to the left and right of -// the MSP's anchor-point. -// -//---------- -// -// Arguments: -// galign* obi: The list of (previously computed) alignments, ordered by -// .. increasing beginning point. This may be NULL. -// galign* m: The MSP to check against the list. -// -// Returns: -// false if the anchor-point is in an already-computed alignment; true -// otherwise; actual result values (the segment pointers) are deposited in -// the MSP. 
-// -//---------- - -static int msp_left_right - (galign* obi, - galign* m) - { - unspos pos1 = m->pos1; - unspos pos2 = m->pos2; - unspos right, left; - galign* mRight, *mLeft; - aliseg* bRight, *bLeft; - aliseg* bp; - sgnpos x; - - right = left = seqposInfinity; - mRight = mLeft = NULL; - bRight = bLeft = NULL; - - // process all alignments in the obi that overlap (along vertical axis) - // m's anchor point - - for ( ; (obi!=NULL)&&(obi->pos1<=pos1) ; obi=obi->next) - { - if (obi->end1 < pos1) - continue; - - // invariant: obi->pos1 <= pos1 <= obi->end1 - - for (bp=obi->firstSeg ; bp!=NULL ; bp=bp->nextSeg) - { if (bp->e1 >= pos1) break; } - if (bp == NULL) - continue; - - // invariant: bp is the first segment (in this alignment) that - // intersects the line y=pos1 - - if (bp->type == horzSeg) - suicide ("msp_left_right: cannot be horizontal"); - - if (bp->type == diagSeg) - x = signed_difference (bp->b2, pos2) // x is how far to the - + signed_difference (pos1, bp->b1); // .. right of m the - else // (bp->type == vertSeg) // .. segment is, along - x = signed_difference (bp->b2, pos2); // .. the line y=pos1 - - if (x == 0) - return false; - - if ((x > 0) && ((unspos) x < right)) - { // a new closest segment to the right - right = (unspos) x; - mRight = obi; - bRight = bp; - } - else if ((x < 0) && ((unspos) -x < left)) - { // a new closest segment to the left - left = (unspos) -x; - mLeft = obi; - bLeft = bp; - } - } - - m->rightAlign1 = m->rightAlign2 = mRight; - m->rightSeg1 = m->rightSeg2 = bRight; - m->leftAlign1 = m->leftAlign2 = mLeft; - m->leftSeg1 = m->leftSeg2 = bLeft; - - return true; - } - -//---------- -// -// get_above_below-- -// In preparation for extending the anchor-point of an MSP, find the closest -// alignment ending below the anchor-point and the closest alignment starting -// above the anchor-point. 
-// -//---------- -// -// Arguments: -// alignio* io: io->anchor1 is the anchor's vertical/seq1 position; -// this routine fills in io->belowList and io->aboveList -// galign* obi: The list of (previously computed) alignments, ordered by -// .. increasing beginning point. This may be NULL. -// galign* oed: The list of (previously computed) alignments, ordered by -// .. decreasing ending point. This may be NULL. -// -// Returns: -// nothing; values are written to io->belowList and io->aboveList. -// -//---------- - -static void get_above_below - (alignio* io, - galign* obi, - galign* oed) - { - unspos pos1 = io->anchor1; - galign* mp; - - for (mp=oed ; mp!=NULL ; mp=mp->prev) - { if (mp->end1 < pos1) break; } - io->belowList = mp; - - for (mp=obi ; mp!=NULL ; mp=mp->next) - { if (mp->pos1 > pos1) break; } - io->aboveList = mp; - } - -//---------- -// -// align_left_right-- -// Given a new alignment, determine the alignments and their segments that are -// the closest in either horizontal direction at both ends of the alignment. -// -//---------- -// -// Arguments: -// galign* obi: The list of (previously computed) alignments, ordered by -// .. increasing beginning point. This may be NULL. -// galign* m: The alignment to check. 
-// -// Returns: -// (nothing) -// -//---------- - -static void align_left_right - (galign* obi, - galign* m) - { - unspos pos1 = m->pos1, pos2 = m->pos2; - unspos end1 = m->end1, end2 = m->end2; - unspos rightOfBottom, rightOfTop, leftOfBottom, leftOfTop; - galign* mRightOfBottom, *mRightOfTop, *mLeftOfBottom, *mLeftOfTop; - aliseg* bRightOfBottom, *bRightOfTop, *bLeftOfBottom, *bLeftOfTop; - aliseg* bp; - sgnpos x; - - rightOfBottom = rightOfTop = leftOfBottom = leftOfTop = seqposInfinity; - mRightOfBottom = mLeftOfBottom = mRightOfTop = mLeftOfTop = NULL; - bRightOfBottom = bRightOfTop = bLeftOfBottom = bLeftOfTop = NULL; - - for ( ; obi!=NULL ; obi=obi->next) - { - if ((obi->pos1 > end1) || (obi->end1 < pos1)) - continue; - - // invariant: obi->pos1 <= end1 and obi->end1 >= pos1 - // meaning: obi overlaps m along vertical/seq1 - - for (bp=obi->firstSeg ; bp!=NULL ; bp=bp->nextSeg) - { if ((bp->type != horzSeg) && (bp->e1 >= pos1)) break; } - - // invariant: bp is either NULL or the first non-horizontal segment (in - // this alignment) that intersects the line y=pos1 - - if ((bp != NULL) && (bp->b1 <= pos1)) - { - if (bp->type == diagSeg) - x = signed_difference (bp->b2, pos2) // x is how far to the - + signed_difference (pos1, bp->b1); // .. right of m the - else // (bp->type == vertSeg) // .. segment is, along - x = signed_difference (bp->b2, pos2); // .. the line y=pos1 - - if ((x > 0) && ((unspos) x < rightOfBottom)) - { - rightOfBottom = (unspos) x; - mRightOfBottom = obi; - bRightOfBottom = bp; - } - else if ((x < 0) && ((unspos) -x < leftOfBottom)) - { - leftOfBottom = (unspos) -x; - mLeftOfBottom = obi; - bLeftOfBottom = bp; - } - } - - for ( ; bp!=NULL ; bp=bp->nextSeg) - { if (bp->type != horzSeg && bp->e1 >= end1) break; } - - if ((bp != NULL) && (bp->type != horzSeg) && (bp->e1 >= end1)) - { - if (bp->type == diagSeg) - x = signed_difference (bp->b2, end2) // x is how far to the - + signed_difference (end1, bp->b1); // .. 
right of m the - else // (bp->type == vertSeg) // .. segment is, along - x = signed_difference (bp->b2, end2); // .. the line y=end1 - - if ((x > 0) && ((unspos) x < rightOfTop)) - { - rightOfTop = (unspos) x; - mRightOfTop = obi; - bRightOfTop = bp; - } - else if ((x < 0) && ((unspos) -x < leftOfTop)) - { - leftOfTop = (unspos) -x; - mLeftOfTop = obi; - bLeftOfTop = bp; - } - } - } - - m->rightAlign1 = mRightOfBottom; - m->rightSeg1 = bRightOfBottom; - m->rightAlign2 = mRightOfTop; - m->rightSeg2 = bRightOfTop; - m->leftAlign1 = mLeftOfBottom; - m->leftSeg1 = bLeftOfBottom; - m->leftAlign2 = mLeftOfTop; - m->leftSeg2 = bLeftOfTop; - } - -//---------- -// -// insert_align-- -// Insert a new alignment into the two lists of alignments, one ordered by -// increasing beginning point (in vertical/seq1) and the other ordered by -// decreasing end point. -// -//---------- -// -// Arguments: -// galign* m: The MSP that was used to anchor the alignment. -// galign** obi: Pointer to the list of (previously computed) alignments, -// .. ordered by increasing beginning point. The list may -// .. be empty (*obi == NULL). This is updated by this -// .. function. -// galign** oed: Pointer to the list of (previously computed) alignments, -// .. ordered by decreasing ending point. The list may -// .. be empty (*oed == NULL). This is updated by this -// .. function. -// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopBlocks === - -#ifndef snoopBlocks -#define debugSnoopBlocks_6 ; -#endif // not snoopBlocks - -#ifdef snoopBlocks - -#define debugSnoopBlocks_6 \ - fprintf (stderr, "adding alignment block [%8p]" \ - " b " unsposFmt " " unsposFmt \ - " e " unsposFmt " " unsposFmt \ - " s " scoreFmtSimple "\n", \ - m, m->pos1, m->pos2, m->end1, m->end2, \ - (m->align == NULL)? ((score) 0) : m->align->s); - -#endif // snoopBlocks - - -static void insert_align - (galign* m, - galign** _obi, - galign** _oed) - { - galign* mp; // (mp scans list, - galign* mq; // .. 
mq is predecessor in scan direction) - galign* obi = *_obi; - galign* oed = *_oed; - - if (m->firstSeg == NULL) - suicide ("insert_align: null first segment"); - - debugSnoopBlocks_6; - - for (mq=NULL,mp=obi ; mp!=NULL ; mq=mp,mp=mp->next) - { if (mp->pos1 >= m->pos1) break; } - - if (mq != NULL) { mq->next = m; m->next = mp; } - else { m->next = obi; obi = m; } - - for (mq=NULL,mp=oed ; mp!=NULL ; mq=mp,mp=mp->prev) - { if (mp->end1 <= m->end1) break; } - - if (mq != NULL) { mq->prev = m; m->prev = mp; } - else { m->prev = oed; oed = m; } - - *(_obi) = obi; *(_oed) = oed; - } - -//---------- -// -// update_LR_bounds-- -// As we move to a new DP row, update LY and RY, the actual column limits -// for dynamic programming. -// -//---------- -// -// Arguments: -// int reversed: true => the DP row advances downward -// false => it advances upward -// aliseg** rightSeg: Current constraining segments; these may be -// aliseg** leftSeg: .. updated by this function. -// galign** rightAlign: Alignments containing those segments; these may be -// galign** leftAlign: .. updated by this function. -// unspos row: The sweep row. -// unspos anchor1: The position at which the alignment -// unspos anchor2: .. began. -// sgnpos* L, (pointer to) most recent bounding constraints; -// sgnpos* R: .. these may be updated by this function. -// unspos* LY: (pointer to) most recent Y-drop constraints; these -// unspos* RY: .. may lie strictly inside L and R; these may be -// .. updated by this function. 
-// -// Returns: -// (nothing) -// -//---------- - -#if ((!defined snoopAlgorithm)&&(!defined snoopSubprobs)) -#define snoopLR_forward_left_1 ; -#define snoopLR_forward_left_2 ; -#define snoopLR_forward_left_3 ; -#define snoopLR_forward_right_1 ; -#define snoopLR_forward_right_2 ; -#define snoopLR_forward_right_3 ; -#define snoopLR_reverse_left_1 ; -#define snoopLR_reverse_left_2 ; -#define snoopLR_reverse_left_3 ; -#define snoopLR_reverse_right_1 ; -#define snoopLR_reverse_right_2 ; -#define snoopLR_reverse_right_3 ; -#endif // not snoopAlgorithm and not snoopSubprobs - - -#if (defined snoopAlgorithm) - -#define snoopLR_forward_left_1 \ - if (snoop) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - if ((*leftSeg)->type == diagSeg) \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on same diag seg)\n", \ - L); \ - else \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on same vert seg)\n", \ - L); \ - } - -#define snoopLR_forward_left_2 \ - if ((snoop) && (*leftSeg != NULL)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")\n", \ - L, \ - (*leftSeg)->b1, (*leftSeg)->b2, \ - (*leftSeg)->e1, (*leftSeg)->e2); \ - } - -#define snoopLR_forward_left_3 \ - if ((snoop) && (L > (sgnpos) LY)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: LY <- " unsposFmt \ - "\n", \ - (unspos) max ((sgnpos) LY, L)); \ - } - - -#define snoopLR_forward_right_1 \ - if (snoop) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - if ((*rightSeg)->type == diagSeg) \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on same diag seg)\n", \ - R); \ - else \ - fprintf (stderr, "update_LR: R <- " sgnposFmt \ - " (on same vert seg)\n", \ - R); \ - } - -#define snoopLR_forward_right_2 \ - if ((snoop) && (*rightSeg 
!= NULL)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: R <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")\n", \ - R, \ - (*rightSeg)->b1, (*rightSeg)->b2, \ - (*rightSeg)->e1, (*rightSeg)->e2); \ - } - -#define snoopLR_forward_right_3 \ - if ((snoop) && (R < (sgnpos) RY)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: RY <- " unsposFmt \ - "\n", \ - special_min (RY, R)); \ - } - -#define snoopLR_reverse_left_1 \ - if (snoop) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - if ((*rightSeg)->type == diagSeg) \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on same diag seg)\n", \ - L); \ - else \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on same vert seg)\n", \ - L); \ - } - -#define snoopLR_reverse_left_2 \ - if ((snoop) && (*rightSeg != NULL)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: L <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")\n", \ - L, \ - (*rightSeg)->b1, (*rightSeg)->b2, \ - (*rightSeg)->e1, (*rightSeg)->e2); \ - } - -#define snoopLR_reverse_left_3 \ - if ((snoop) && (L > (sgnpos) LY)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: LY <- " unsposFmt \ - "\n", \ - (unspos) max ((sgnpos) LY, L)); \ - } - -#define snoopLR_reverse_right_1 \ - if (snoop) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - if ((*leftSeg)->type == diagSeg) \ - fprintf (stderr, "update_LR: R <- " sgnposFmt \ - " (on same diag seg)\n", \ - R); \ - else \ - fprintf (stderr, "update_LR: R <- " sgnposFmt \ - " (on same vert seg)\n", \ - R); \ - } - -#define snoopLR_reverse_right_2 \ - if ((snoop) && (*leftSeg != NULL)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); 
havePrinted = true; } \ - fprintf (stderr, "update_LR: R <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")\n", \ - R, \ - (*leftSeg)->b1, (*leftSeg)->b2, \ - (*leftSeg)->e1, (*leftSeg)->e2); \ - } - -#define snoopLR_reverse_right_3 \ - if ((snoop) && (R < (sgnpos) RY)) \ - { \ - if (!havePrinted) \ - { fprintf (stderr, "\n"); havePrinted = true; } \ - fprintf (stderr, "update_LR: RY <- " unsposFmt \ - "\n", \ - special_min (RY, R)); \ - } - -#endif // snoopAlgorithm - - -#if ((!defined snoopAlgorithm)&&(defined snoopSubprobs)) - -#define snoopLR_forward_left_1 \ - if (true) \ - { \ - if ((*leftSeg)->type == diagSeg) \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on same diag seg)", \ - L); \ - else \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on same vert seg)", \ - L); \ - } - -#define snoopLR_forward_left_2 \ - if (*leftSeg != NULL) \ - { \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")", \ - L, \ - (*leftSeg)->b1, (*leftSeg)->b2, \ - (*leftSeg)->e1, (*leftSeg)->e2); \ - } - -#define snoopLR_forward_left_3 \ - if (L > (sgnpos) LY) \ - { \ - fprintf (stderr, "\n update_LR: LY <- " unsposFmt, \ - (unspos) max ((sgnpos) LY, L)); \ - } - - -#define snoopLR_forward_right_1 \ - if (true) \ - { \ - if ((*rightSeg)->type == diagSeg) \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on same diag seg)", \ - R); \ - else \ - fprintf (stderr, "\n update_LR: R <- " sgnposFmt \ - " (on same vert seg)", \ - R); \ - } - -#define snoopLR_forward_right_2 \ - if (*rightSeg != NULL) \ - { \ - fprintf (stderr, "\n update_LR: R <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")", \ - R, \ - (*rightSeg)->b1, (*rightSeg)->b2, \ - (*rightSeg)->e1, (*rightSeg)->e2); \ - } - -#define snoopLR_forward_right_3 \ - if (R < (sgnpos) RY) \ - { \ - fprintf (stderr, "\n update_LR: RY <- " unsposFmt, \ - special_min (RY, R)); \ - } - -#define 
snoopLR_reverse_left_1 \ - if (true) \ - { \ - if ((*rightSeg)->type == diagSeg) \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on same diag seg)", \ - L); \ - else \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on same vert seg)", \ - L); \ - } - -#define snoopLR_reverse_left_2 \ - if (*rightSeg != NULL) \ - { \ - fprintf (stderr, "\n update_LR: L <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")", \ - L, \ - (*rightSeg)->b1, (*rightSeg)->b2, \ - (*rightSeg)->e1, (*rightSeg)->e2); \ - } - -#define snoopLR_reverse_left_3 \ - if (L > (sgnpos) LY) \ - { \ - fprintf (stderr, "\n update_LR: LY <- " unsposFmt, \ - (unspos) max ((sgnpos) LY, L)); \ - } - -#define snoopLR_reverse_right_1 \ - if (true) \ - { \ - if ((*leftSeg)->type == diagSeg) \ - fprintf (stderr, "\n update_LR: R <- " sgnposFmt \ - " (on same diag seg)", \ - R); \ - else \ - fprintf (stderr, "\n update_LR: R <- " sgnposFmt \ - " (on same vert seg)", \ - R); \ - } - -#define snoopLR_reverse_right_2 \ - if (*leftSeg != NULL) \ - { \ - fprintf (stderr, "\n update_LR: R <- " sgnposFmt \ - " (on new seg " unsposSlashFmt " " unsposSlashFmt \ - ")", \ - R, \ - (*leftSeg)->b1, (*leftSeg)->b2, \ - (*leftSeg)->e1, (*leftSeg)->e2); \ - } - -#define snoopLR_reverse_right_3 \ - if (R < (sgnpos) RY) \ - { \ - fprintf (stderr, "\n update_LR: RY <- " unsposFmt, \ - special_min (RY, R)); \ - } - -#endif // not snoopAlgorithm and snoopSubprobs - - -static unspos special_min(unspos RY, sgnpos R); -static unspos special_min(unspos RY, sgnpos R) - { - // ideally we would return min(RY,R); but R is signed, so if R is negative - // we need to return zero (since we are to return an unsigned); if R is - // positive then we can return min(RY,R), but we may as well compute it - // ourselves (instead of calling min); note that the cast of R to unsigned - // only occurs when R is positive, so we needn't worry about it causing - // overflaw (and yes, I mean overflaw not overflow) - 
- if (R <= 0) return 0; - else if ((unspos) R < RY) return R; - else return RY; - } - -static void update_LR_bounds - (int reversed, - aliseg** rightSeg, - aliseg** leftSeg, - galign** rightAlign, - galign** leftAlign, - unspos row, - unspos anchor1, - unspos anchor2, - sgnpos* _L, - sgnpos* _R, - unspos* _LY, - unspos* _RY) - { - sgnpos L = *_L; - sgnpos R = *_R; - unspos LY = *_LY; - unspos RY = *_RY; -#ifdef snoopAlgorithm - int snoop = true; - int havePrinted = false; -#endif // snoopAlgorithm - - dbg_timing_gapped_extend_sub (debugClockUpdateLrBounds); - -#ifdef snoopAlgorithm -#if ((defined debugPosA1) && (defined debugPosB1)) - snoop = ((anchor1 == debugPosA1) && (anchor2 == debugPosA2)) - || ((anchor1 == debugPosB1) && (anchor2 == debugPosB2)); -#elif (defined debugPosA1) - snoop = (anchor1 == debugPosA1) && (anchor2 == debugPosA2); -#elif (defined debugPosB1) - snoop = (anchor1 == debugPosB1) && (anchor2 == debugPosB2); -#endif // debugPosA1,debugPosB1 -#endif // snoopAlgorithm - - // handle forward search (DP row advances upward) - - if (!reversed) - { - if (*leftSeg != NULL) - { - if ((*leftSeg)->e1 >= row + anchor1) // stay on same segment - { - if ((*leftSeg)->type == diagSeg) L++; - snoopLR_forward_left_1; - } - else // move to next segment - { - L = next_sweep_seg (/*right*/ false, leftSeg, leftAlign, - row, anchor1, anchor2) + 1; - snoopLR_forward_left_2; - } - } - - if (*leftSeg != NULL) // (next_sweep_seg may have changed *leftSeg) - { - snoopLR_forward_left_3; - LY = (unspos) max ((sgnpos) LY, L); - } - - if (*rightSeg != NULL) - { - if ((*rightSeg)->e1 >= row + anchor1) // stay on same segment - { - if ((*rightSeg)->type == diagSeg) R++; - snoopLR_forward_right_1; - } - else // move to next segment - { - R = next_sweep_seg (/*right*/ true, rightSeg, rightAlign, - row, anchor1, anchor2) - 1; - snoopLR_forward_right_2; - } - } - - if (*rightSeg != NULL) // (next_sweep_seg may have changed *rightSeg) - { - snoopLR_forward_right_3; - RY = 
special_min (RY, R); - } - } - - // handle reversed search (DP row advances downward) - - else - { - if (*rightSeg != NULL) - { - if ((*rightSeg)->b1 <= anchor1 - row) // stay on same segment - { - if ((*rightSeg)->type == diagSeg) L++; - snoopLR_reverse_left_1; - } - else // move to next segment - { - L = prev_sweep_seg (/*right*/ true, rightSeg, rightAlign, - row, anchor1, anchor2) + 1; - snoopLR_reverse_left_2; - } - } - - if (*rightSeg != NULL) // (prev_sweep_seg may have changed *rightSeg) - { - snoopLR_reverse_left_3; - LY = (unspos) max ((sgnpos) LY, L); - } - - if (*leftSeg != NULL) - { - if ((*leftSeg)->b1 <= anchor1 - row) // stay on same segment - { - if ((*leftSeg)->type == diagSeg) R++; - snoopLR_reverse_right_1; - } - else // move to next segment - { - R = prev_sweep_seg (/*right*/ false, leftSeg, leftAlign, - row, anchor1, anchor2) - 1; - snoopLR_reverse_right_2; - } - } - - if (*leftSeg != NULL) // (prev_sweep_seg may have changed *leftSeg) - { - snoopLR_reverse_right_3; - RY = special_min (RY, R); - } - } - - *_L = L; - *_R = R; - *_LY = LY; - *_RY = RY; - - dbg_timing_gapped_extend_add (debugClockUpdateLrBounds); - } - -//---------- -// -// next_sweep_seg-- -// Move to the next leftSeg or rightSeg in a forward (upward) sweep. -// prev_sweep_seg-- -// Move to the previous leftSeg or rightSeg in a reverse (downward) sweep. -// -//---------- -// -// Arguments: -// int lookRight: true => move to the next alignment to the right -// .. when we run off the end of an alignment -// false => move to next alignment to left instead -// aliseg** bp: Current constraining segment; may be updated by -// .. this function. -// galign** mp: Alignment containing that segment; may be updated -// .. by this function. -// unspos row: The sweep row. -// unspos anchor1: The position at which the alignment -// unspos anchor2: .. began. 
-// -// Returns: -// The column position of the next (or previous) leftSeg or rightSeg if there -// is one; otherwise, the column position of the next (or previous) leftAlign -// or rightAlign. If there is also no such alignment, zero is returned. -// -//---------- - -static sgnpos next_sweep_seg - (int lookRight, - aliseg** bp, - galign** mp, - unspos row, - unspos anchor1, - unspos anchor2) - { - sgnpos col; - - dbg_timing_gapped_extend_sub (debugClockNextSweepSeg); - - // move to the next segment, if there is one - - *bp = (*bp)->nextSeg; - if (*bp != NULL) - { - if (((*bp)->type == horzSeg) && ((*bp = (*bp)->nextSeg) == NULL)) - suicide ("Last alignment segment was horizontal"); - - col = signed_difference ((*bp)->b2, anchor2); - dbg_timing_gapped_extend_add (debugClockNextSweepSeg); - return col; - } - - // we've run off the end (top) of an alignment; move to the next one - - if (lookRight) { *bp = (*mp)->rightSeg2; *mp = (*mp)->rightAlign2; } - else { *bp = (*mp)->leftSeg2; *mp = (*mp)->leftAlign2; } - - if (*bp == NULL) - { - dbg_timing_gapped_extend_add (debugClockNextSweepSeg); - return 0; // no constraint; there was no "next alignment" - } - - // figure out the column where the start of the new segment intersects the - // line y=row - - if ((*bp)->type == diagSeg) - col = (sgnpos) row - + signed_difference ((*bp)->b2, anchor2) - - signed_difference ((*bp)->b1, anchor1); - else // we jumped to a vertical segment - col = signed_difference ((*bp)->b2, anchor2); - - dbg_timing_gapped_extend_add (debugClockNextSweepSeg); - return col; - } - - -static sgnpos prev_sweep_seg - (int lookRight, - aliseg** bp, - galign** mp, - unspos row, - unspos anchor1, - unspos anchor2) - { - sgnpos col; - - dbg_timing_gapped_extend_sub (debugClockPrevSweepSeg); - - // move to the previous segment, if there is one - - *bp = (*bp)->prevSeg; - if (*bp != NULL) - { - if (((*bp)->type == horzSeg) && ((*bp = (*bp)->prevSeg) == NULL)) - suicide ("First alignment segment was 
horizontal"); - col = signed_difference (anchor2, (*bp)->e2); - dbg_timing_gapped_extend_add (debugClockPrevSweepSeg); - return col; - } - - // we've run off the front (bottom) of an alignment; move to the previous - // one - - if (lookRight) { *bp = (*mp)->rightSeg1; *mp = (*mp)->rightAlign1; } - else { *bp = (*mp)->leftSeg1; *mp = (*mp)->leftAlign1; } - - if (*bp == NULL) - { - dbg_timing_gapped_extend_add (debugClockPrevSweepSeg); - return 0; // no constraint; there was no "previous alignment" - } - - // figure out the column where the end of the new segment intersects the - // line y=row - - if ((*bp)->type == diagSeg) - col = (sgnpos) row - + signed_difference (anchor2, (*bp)->e2) - - signed_difference (anchor1, (*bp)->e1); - else // we jumped to a vertical segment - col = signed_difference (anchor2, (*bp)->e2); - - dbg_timing_gapped_extend_add (debugClockPrevSweepSeg); - return col; - } - -//---------- -// -// update_active_segs-- -// As we move to a new sweep row, update the list of segments that intersect -// the sweep row within the feasible region. -// -//---------- -// -// Arguments: -// int reversed: true => the DP row advances downward -// false => it advances upward -// activeseg** active: The list of active segments. This may be updated -// .. by this function. -// galign** alignList: Alignments in advance of the sweep row. This may -// .. be updated by this function. -// dpCell* dp: First DP cell in (conceptual) sweep row. This is -// .. indexed from LY to RY, inclusive. -// unspos row: The sweep row. -// unspos anchor1: The position at which the alignment -// unspos anchor2: .. began. -// unspos LY, RY: Current Y-drop constraints. -// -// Returns: -// (nothing) -// -//---------- - -static aliseg* next_seg (aliseg* bp, int reversed) - { return (reversed)? 
bp->prevSeg : bp->nextSeg; } - -static void update_active_segs - (int reversed, - activeseg** _active, - galign** _alignList, - dpCell* dp, - unspos row, - unspos anchor1, - unspos anchor2, - unspos LY, - unspos RY) - { - activeseg* active = *_active; - galign* alignList = *_alignList; - activeseg* act; - - dbg_timing_gapped_extend_sub (debugClockUpdateActiveSegs); - - // process currently active segments (those that intersect the sweep row) - - for (act=active ; act!=NULL ; act=act->next) - { - if (act->type == horzSeg) - suicide ("Impossible horizontal segment."); - - if (act->lastRow >= row) - { // sweep row still intersects this segment - if (act->type == diagSeg) - act->x++; - if ((act->x >= LY) && (act->x <= RY)) - dp[act->x].mask = row; - } - else if ((act->seg = next_seg(act->seg, reversed)) != NULL) - { // sweep row intersects the next segment of this alignment; - // move to the next segment and mask its intial DP cells - build_active_seg (reversed, act, dp, row, anchor1, anchor2, LY, RY); - if (act->type == horzSeg) - { - act->seg = next_seg (act->seg, reversed); - build_active_seg (reversed, act, dp, - row, anchor1, anchor2, LY, RY); - } - } - else - { // sweep row has passed the end of this alignment - act->filter = 1; // (mark it for deletion) - } - } - - // add any other alignments the sweep row now intersects, adding the - // first segment (from the appropriate end) to the active list and - // masking its intial DP cells - - if (!reversed) - { - while ((alignList!=NULL) && (alignList->pos1 - anchor1 == row)) - { - active = add_new_active (reversed, active, alignList, - dp, row, anchor1, anchor2, LY, RY); - alignList = alignList->next; - } - } - else - { - while ((alignList != NULL) && (anchor1 - alignList->end1 == row)) - { - active = add_new_active (reversed, active, alignList, - dp, row, anchor1, anchor2, LY, RY); - alignList = alignList->prev; - } - } - - filter_active_segs (&active, 0); // delete blocks whose filter is not 0 - - *_active = 
active; - *_alignList = alignList; - - dbg_timing_gapped_extend_add (debugClockUpdateActiveSegs); - } - -//---------- -// -// build_active_seg-- -// Create an active segment record for a given segment, and mask any DP -// cells that it intersects. -// -//---------- -// -// Arguments: -// int reversed: true => the DP row advances downward -// false => it advances upward -// activeseg* act: The active segment record, which already contains -// .. the proper segment, but nothing else. -// dpCell* dp: First DP cell in (conceptual) sweep row. This is -// .. indexed from LY to RY, inclusive. -// unspos row: The sweep row. -// unspos anchor1: The position at which the alignment -// unspos anchor2: .. began. -// unspos LY, RY: Current Y-drop constraints. -// -// Returns: -// (nothing) -// -//---------- - -static void build_active_seg - (int reversed, - activeseg* act, - dpCell* dp, - unspos row, - unspos anchor1, - unspos anchor2, - unspos LY, - unspos RY) - { - unspos horzEnd, iMin, iMax, i; - - act->type = act->seg->type; - - // nota bene: the following assigns to act->x and act->lastRow always - // result in non-neagtive values - - if (!reversed) - { - act->x = act->seg->b2 - anchor2; - act->lastRow = act->seg->e1 - anchor1; - } - else - { - act->x = anchor2 - act->seg->e2; - act->lastRow = anchor1 - act->seg->b1; - } - - if (act->type != horzSeg) - { - if ((act->x >= LY) && (act->x <= RY)) - dp[act->x].mask = row; - } - else - { - horzEnd = (!reversed)? act->seg->e2 - anchor2 - : anchor2 - act->seg->b2; - iMin = max (LY, act->x); - iMax = min (RY, horzEnd); - for (i=iMin ; i<=iMax ; i++) - dp[i].mask = row; - } - } - -//---------- -// -// add_new_active-- -// Create a new active segment record and add it to the list, containing -// the given alignment's terminal segment. -// -//---------- -// -// Arguments: -// int reversed: true => the DP row advances downward -// false => it advances upward -// activeseg* active: The list of active segments. Upon return, the -// .. 
caller should assign this function's return -// .. value to this. -// galign* alignList: Alignments in advance of the sweep row. This may -// .. be updated by this function. -// dpCell* dp: First DP cell in (conceptual) sweep row. This is -// .. indexed from LY to RY, inclusive. -// unspos row: The sweep row. -// unspos anchor1: The position at which the alignment -// unspos anchor2: .. began. -// unspos LY, RY: Current Y-drop constraints. -// -// Returns: -// Pointer to the head of the active segment list, which may be the newly -// created node. -// -//---------- - -static activeseg* add_new_active - (int reversed, - activeseg* active, - galign* alignList, - dpCell* dp, - unspos row, - unspos anchor1, - unspos anchor2, - unspos LY, - unspos RY) - { - activeseg* act = malloc_or_die ("add_new_active", sizeof(activeseg)); - - act->filter = 0; - if (!reversed) act->seg = alignList->firstSeg; - else act->seg = alignList->lastSeg; - act->next = active; - build_active_seg (reversed, act, dp, row, anchor1, anchor2, LY, RY); - - return act; - } - -//---------- -// -// filter_active_segs-- -// Remove (dispose of) active segments with filter values NOT equal to some -// specified value. -// -//---------- -// -// Arguments: -// activeseg** active: List of segments. -// int filter: The filter value of segments to KEEP. The active -// .. records for all other segments are removed from the -// .. list and disposed of. 
-// -// Returns: -// (nothing) -// -//---------- - -static void filter_active_segs - (activeseg** active, - int filter) - { - activeseg* prevAct, *act; - - dbg_timing_gapped_extend_sub (debugClockFilterActiveSegs); - - for (prevAct=NULL,act=(*active); act!=NULL ; ) - { - if (act->filter == filter) - { - prevAct = act; - act = act->next; - } - else if (prevAct != NULL) - { - prevAct->next = act->next; - free_if_valid ("filter_active_segs", act); - act = prevAct->next; - } - else - { - *active = act->next; - free_if_valid ("filter_active_segs", act); - act = *active; - } - } - - dbg_timing_gapped_extend_add (debugClockFilterActiveSegs); - } - -//---------- -// -// format_alignment-- -// Process the edit script for a newly computed alignment by (1) storing a -// linked-list form with the seeding MSP and (2) augmenting the edit script -// into a form suitable for returning to the calling program. -// -//---------- -// -// Arguments: -// alignio* io: The alignment to format. -// galign* m: The MSP that was used to anchor the alignment. -// -// Returns: -// Pointer to the newly allocated alignment description. -// -//---------- - -static alignel* format_alignment - (alignio* io, - galign* m) - { - unspos beg1, end1, beg2, end2; - unspos height, width, i, j, startI, startJ, run; - u32 opIx; - editscript* script; - u8* seq1, *seq2; - alignel* a; - - beg1 = io->start1 + 1; - end1 = io->stop1 + 1; - beg2 = io->start2 + 1; - end2 = io->stop2 + 1; - script = io->script; - seq1 = io->seq1; - seq2 = io->seq2; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - opIx = 0; - for (i=j=0 ; (iscript = script; - a->beg1 = beg1; a->beg2 = beg2; - a->end1 = end1; a->end2 = end2; - a->seq1 = seq1; a->seq2 = seq2; - a->s = io->s; - a->next = NULL; - a->isTrivial = false; - a->hspId = m->hspId; - - return a; - } - -//---------- -// -// save_seg-- -// Add a gap-free segment to a seeding MSP, inserting a vertical or horizontal -// piece before it if appropriate. 
-// -//---------- -// -// Arguments: -// galign* m: The MSP to add the segment to. -// unspos b1, b2: The segment's starting position. -// unspos e1, e2: The segment's ending position. -// -// Returns: -// (nothing) -// -//---------- - -static void insert_seg_to_tail (galign* mp, aliseg* bp); - -static void save_seg - (galign* m, - unspos b1, - unspos b2, - unspos e1, - unspos e2) - { - aliseg* bp = malloc_or_die ("save_seg bp", sizeof(aliseg)); - aliseg* bq; - - bp->b1 = b1; - bp->b2 = b2; - bp->e1 = e1; - bp->e2 = e2; - bp->type = diagSeg; - - // if the alignment is empty, create it with this as the first segment - - if (m->firstSeg == NULL) - { - m->firstSeg = bp->prevSeg = bp->nextSeg = bp; - return; - } - - // otherwise, we insert it at the tail, with a preceding vertical or - // horizontal segment; we assume the previous tail was a diagonal segment - // (since they are the only type ever added to the tail); further, we - // assume that the previous tail ends on either the y=b1-1 or x=b2-1 line - // (but not both) - - bq = malloc_or_die ("save_seg bq", sizeof(aliseg)); - bq->type = ((b1 == m->firstSeg->prevSeg->e1+1)? horzSeg : vertSeg); - bq->b1 = m->firstSeg->prevSeg->e1 + 1; - bq->b2 = m->firstSeg->prevSeg->e2 + 1; - bq->e1 = b1 - 1; - bq->e2 = b2 - 1; - - insert_seg_to_tail (m, bq); - insert_seg_to_tail (m, bp); - } - -static void insert_seg_to_tail (galign* mp, aliseg* bp) - { - bp->prevSeg = mp->firstSeg->prevSeg; - bp->nextSeg = mp->firstSeg; - mp->firstSeg->prevSeg->nextSeg = bp; - mp->firstSeg->prevSeg = bp; - } - -//---------- -// [[-- a seed hit reporter function --]] -// -// gappily_extend_hsps-- -// Perform a gapped extension of a seed hit or HSP. 
-// -// Arguments and Return value: (see seed_search.h) -// -//---------- - -u32 gappily_extend_hsps - (void* _info, - unspos pos1, - unspos pos2, - unspos length, - score s) - { - hitrepgappily* info = (hitrepgappily*) _info; - seq* seq1 = info->seq1; - seq* seq2 = info->seq2; - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* p1, *p2; - unspos peak; - alignio io; - galign mp; - aliseg* bp, *bq; - u32 hIx, h; - u32 returnVal; - - //fprintf (stderr, "working on segment " unsposSlashFmt " " unsposFmt "\n", - // pos1-length, pos2-length, length); - - if (gapped_extend_dbgShowHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides - (stderr, seq1, pos1-length, seq2, pos2-length, length); - } - - // (move pos1/pos2 from end of segment to start) - - pos1 -= length; - pos2 -= length; - - // $$$ move this test to set_up_hit_processor() - - if (info->scoreThresh.t != 'S') - suicidef ("gappily_extend_hsps can't handle score threshold %s", - score_thresh_to_string (&info->scoreThresh)); - - // reduce the HSP to a single point - - peak = segment_peak (seq1->v+pos1, seq2->v+pos2, length, info->scoring); - pos1 += peak; - pos2 += peak; - //length = 0; // (unnecessary) - -#ifdef debugHspImmediate - fprintf (stderr, "hsp: " unsposSlashFmt " -> " unsposSlashFmt "\n", - pos1-peak, pos2-peak, pos1, pos2); -#endif - - if (gapped_extend_dbgShowAnchors) - { - segtable st; - segment* seg = &st.seg[0]; - - st.size = 1; - st.len = 1; - st.haveScores = true; - st.coverageLimit = 0; - st.coverage = length; - st.lowScore = s; - seg->pos1 = pos1; - seg->pos2 = pos2; - seg->length = 0; - seg->s = s; - seg->id = seq2->revCompFlags; - seg->scoreCov = length; - seg->filter = false; - - write_segments (stderr, &st, seq1, seq2, false, gapped_extend_dbgShowAnchorsHowOften); - } - - // build the alignio record for ydrop_align() - - io.seq1 = seq1->v; - io.seq2 = seq2->v; - io.rev1 = info->rev1; - io.rev2 = info->rev2; - io.low1 = 0; io.len1 = io.high1 
= seq1->len; - io.low2 = 0; io.len2 = io.high2 = seq2->len; - - io.scoring = info->scoring; - io.yDrop = info->yDrop; - io.trimToPeak = info->trimToPeak; - - io.anchor1 = pos1; - io.anchor2 = pos2; - - if (sp1->p != NULL) - { - p1 = lookup_partition (seq1, io.anchor1); - io.low1 = p1->sepBefore + 1; - io.high1 = p1->sepAfter; - } - - if (sp2->p != NULL) - { - p2 = lookup_partition (seq2, io.anchor2); - io.low2 = p2->sepBefore + 1; - io.high2 = p2->sepAfter; - } - - if (info->traceback == NULL) - suicide ("gappily_extend_hsps was given a NULL traceback pointer."); - io.tb = info->traceback; - - // perform alignment - - io.leftAlign = NULL; // (convince ydrop_align() that there are no - io.rightAlign = NULL; // .. neighboring/bounding alignments - io.leftSeg = NULL; // .. to worry about) - io.rightSeg = NULL; - io.aboveList = NULL; - io.belowList = NULL; - - ydrop_align (&io); // (find the gapped alignment) - -#ifdef debugHspImmediate - fprintf (stderr, " gappily: " unsposSlashFmt " " unsposSlashFmt " " scoreFmt "\n", - io.start1, io.start2, io.stop1, io.stop2, io.s); -#endif - - if (io.s < info->scoreThresh.s) - { -#ifdef debugHspImmediate - fprintf (stderr, " gappily: (fails score thresh, " scoreFmt "<" scoreFmt ")\n", - io.s, info->scoreThresh.s); -#endif - free_if_valid ("gappily_extend_hsps io.script", io.script); - mp.firstSeg = NULL; - mp.align = NULL; - goto return_zero; // (the alignment score is too low) - } - - mp.firstSeg = NULL; // (convert the alignment to a linked list) - mp.align = format_alignment (&io, &mp); - mp.pos1 = io.start1; - mp.pos2 = io.start2; - mp.end1 = io.stop1; - mp.end2 = io.stop2; - - if (mp.firstSeg == NULL) - goto return_zero; // (the alignment is empty) - - mp.lastSeg = mp.firstSeg->prevSeg; // (record the alignment's tail and - mp.firstSeg->prevSeg // .. 
detach the circular pointer) - = mp.lastSeg->nextSeg = NULL; - - // subject the alignment to the user's guantlet of filtering settings - - if ((info->minIdentity > 0) || (info->maxIdentity < 1)) - { - mp.align = filter_aligns_by_identity - (seq1, seq2, mp.align, info->minIdentity, info->maxIdentity); - if (mp.align == NULL) goto return_zero; - } - - if ((info->minCoverage > 0) || (info->maxCoverage < 1)) - { - mp.align = filter_aligns_by_coverage - (seq1, seq2, mp.align, info->minCoverage, info->maxCoverage); - if (mp.align == NULL) goto return_zero; - } - - if ((info->minContinuity > 0) || (info->maxContinuity < 1)) - { - mp.align = filter_aligns_by_continuity - (mp.align, info->minContinuity, info->maxContinuity); - if (mp.align == NULL) goto return_zero; - } - - if (info->minMatchCount > 0) - { - mp.align = filter_aligns_by_match_count - (seq1, seq2, mp.align, info->minMatchCount); - if (mp.align == NULL) goto return_zero; - } - - if (info->maxMismatchCount >= 0) - { - mp.align = filter_aligns_by_mismatch_count - (seq1, seq2, mp.align, info->maxMismatchCount); - if (mp.align == NULL) goto return_zero; - } - - if (info->maxSeparateGapsCount >= 0) - { - mp.align = filter_aligns_by_num_gaps - (mp.align, info->maxSeparateGapsCount); - if (mp.align == NULL) goto return_zero; - } - - if (info->maxGapColumnsCount >= 0) - { - mp.align = filter_aligns_by_num_gap_columns - (mp.align, info->maxGapColumnsCount); - if (mp.align == NULL) goto return_zero; - } - - // if we're to prevent duplicates, compute the alignment's hash and make - // sure we haven't already seen it - - if (info->alignmentHashes != NULL) - { - // if the list is already *past* full, we don't need to check whether - // this alignment is a duplicate, we can just reject it - - if (info->alignmentHashesSeen > info->alignmentHashesSize) - goto return_zero; - - // compute the hash value and check if we've seen it before; note that - // if the list is *just* full, we still need to check, and only reject - // if 
it is truly a duplicate; this allows the calling routine to - // properly detect when the number of alignments exceeds the limit - - h = alignment_hash (mp.align->beg1, mp.align->end1, seq1->revCompFlags, - mp.align->beg2, mp.align->end2, seq2->revCompFlags, - /*script*/ NULL); - - for (hIx=0 ; hIxalignmentHashesSeen ; hIx++) - { - if (hIx >= info->alignmentHashesSize) break; - if (info->alignmentHashes[hIx] == h) - goto return_zero; - } - - hIx = info->alignmentHashesSeen++; - if (hIx >= info->alignmentHashesSize) - goto return_one; - - info->alignmentHashes[hIx] = h; - } - - // the alignment has satisfied all the user's criteria-- print it - - if (info->deGapifyOutput) print_align_list_segments (mp.align); - else print_align_list (mp.align); - -return_one: - free_align_list (mp.align); - mp.align = NULL; - - returnVal = 1; - goto cleanup; - -return_zero: - if (mp.align != NULL) - { - free_align_list (mp.align); - mp.align = NULL; - } - returnVal = 0; - goto cleanup; - -cleanup: - if ((mp.align != NULL) && (mp.align->script != NULL)) - free_if_valid ("gappily_extend_hsps mp.script", mp.align->script); - - for (bp=mp.firstSeg ; bp!=NULL ; bp=bq) - { bq = bp->nextSeg; free_if_valid ("gappily_extend_hsps seg", bp); } - - return returnVal; - } - -//---------- -// -// dump_alignio_input, dump_alignio_output-- -// Dump the contents of ydrop_align's io record. 
-// -//---------- - -#ifdef snoopAlignioInput - -static void dump_alignio_input - (FILE* f, - alignio* io) - { - fprintf (f, "=====\n"); - - fprintf (f, "seq1 = %8p\n", io->seq1); - fprintf (f, "rev1 = %8p\n", io->rev1); - fprintf (f, "len1 = " unsposFmt "\n", io->len1); - fprintf (f, "low1 = " unsposFmt "\n", io->low1); - fprintf (f, "high1 = " unsposFmt "\n", io->high1); - fprintf (f, "anchor1 = " unsposFmt "\n", io->anchor1); - - fprintf (f, "seq2 = %8p\n", io->seq2); - fprintf (f, "rev2 = %8p\n", io->rev2); - fprintf (f, "len2 = " unsposFmt "\n", io->len2); - fprintf (f, "low2 = " unsposFmt "\n", io->low2); - fprintf (f, "high2 = " unsposFmt "\n", io->high2); - fprintf (f, "anchor2 = " unsposFmt "\n", io->anchor2); - - fprintf (f, "scoring = %8p\n", io->scoring); - fprintf (f, "yDrop = " scoreFmt "\n", io->yDrop); - fprintf (f, "trim = %s\n", (io->trimToPeak)? "yes" : "no"); - fprintf (f, "tb = %8p\n", io->tb); - - fprintf (f, "leftAlign = %8p\n", io->leftAlign); - fprintf (f, "rightAlign = %8p\n", io->rightAlign); - fprintf (f, "leftSeg = %8p\n", io->leftSeg); - fprintf (f, "rightSeg = %8p\n", io->rightSeg); - fprintf (f, "aboveList = %8p\n", io->aboveList); - fprintf (f, "belowList = %8p\n", io->belowList); - } - -#endif // snoopAlignioInput - - -#if ((defined(snoopAlignioOutput)) || (defined(snoopEditScripts))) - -static void dump_alignio_output - (FILE* f, - alignio* io) - { - fprintf (f, "s(core) = " scoreFmt "\n", io->s); - fprintf (f, " start1 = " unsposFmt "\n", io->start1); - fprintf (f, " stop1 = " unsposFmt "\n", io->stop1); - fprintf (f, " start2 = " unsposFmt "\n", io->start2); - fprintf (f, " stop2 = " unsposFmt "\n", io->stop2); -// fprintf (f, " script = %p\n", io->script); - } - -#endif // snoopAlignioOutput OR snoopEditScripts - -//---------- -// -// score_alignment-- -// Determine the score of gapped alignment in two subsequences. -// -// Note that this is generally used only after an alignment has been modified. 
-// The gapped_extend() produces the same alignment score as the alignment is -// found. -// -//---------- -// -// Arguments: -// scoreset* scoring: The scoring scheme to use. -// u8* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// u8* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// editscript* s: The script describing the alignment. -// -// Returns: -// The alignment's score. -// -//---------- - -score score_alignment - (scoreset* scoring, - u8* seq1, - unspos pos1, - u8* seq2, - unspos pos2, - editscript* script) - { - u8* s1 = seq1 + pos1; - u8* s2 = seq2 + pos2; - u8* stop; - u32 opIx; - editop op; - u32 rpt; - score similarity = 0; - - for (opIx=0 ; opIxlen ; opIx++) - { - // score s = similarity; - - op = script->op[opIx]; - rpt = edit_op_repeat(op); - if (rpt == 0) continue; - op = edit_op_operation(op); - switch (op) - { - case editopSub: - stop = s1 + rpt; - while (s1 < stop) - similarity += scoring->sub[*(s1++)][*(s2++)]; - //fprintf (stderr, "match " unsposFmt " -> " scoreFmt "\n", rpt, similarity - s); - break; - case editopIns: - similarity -= scoring->gapOpen + (rpt * scoring->gapExtend); - s2 += rpt; - //fprintf (stderr, "insert " unsposFmt " -> " scoreFmt "\n", rpt, similarity - s); - break; - case editopDel: - similarity -= scoring->gapOpen + (rpt * scoring->gapExtend); - s1 += rpt; - //fprintf (stderr, "delete " unsposFmt " -> " scoreFmt "\n", rpt, similarity - s); - break; - } - - } - - return similarity; - } - -//---------- -// -// count_paired_bases-- -// Count the number of paired bases in an alignment. -// -//---------- -// -// Arguments: -// galign* mp: The MSP that was used to anchor the alignment. -// -// Returns: -// The number of paired bases. 
-// -//---------- - -static u64 count_paired_bases - (galign* mp) - { - aliseg* bp; - u64 pairedBases; - - pairedBases = 0; - for (bp=mp->firstSeg ; bp!=NULL ; bp=bp->nextSeg) - { if (bp->type == diagSeg) pairedBases += bp->e1+1 - bp->b1; } - - return pairedBases; - } - -//---------- -// -// warn_for_paired_bases_limit-- -// Tell the user that this query exceeded the limit for paired bases. -// -//---------- -// -// Arguments: -// seq* seq2: The query sequence we were aligning. -// u64 maxPairedBases: (same meaning as for gapped_extend) -// int overlyPairedKeep: (same meaning as for gapped_extend) -// -// Returns: -// (nothing) -// -//---------- - -static void warn_for_paired_bases_limit - (seq* seq2, - u64 maxPairedBases, - int overlyPairedKeep) - { - static int firstReport = true; - seqpartition* sp2 = &seq2->partition; - char* name2; - char strand; - - if (sp2->p != NULL) name2 = "seq2"; // (seq2 is partitioned) - else if (seq2->useFullNames) name2 = seq2->header; - else name2 = seq2->shortHeader; - - strand = ((seq2->revCompFlags & rcf_rev) == 0)? '+' : '-'; - - fprintf (stderr, "WARNING. Query %s (%c strand) contains more than %s paired bases.\n", - name2, strand, commatize(maxPairedBases)); - - if (firstReport) - { - if (overlyPairedKeep) - fprintf (stderr, "Any gapped alignments already found for this query/strand are reported but the\n" - "query/strand is not processed further.\n"); - else - fprintf (stderr, "All gapped alignments for this query/strand are discarded and the query/strand\n" - "is not processed further.\n"); - firstReport = false; - } - } - -//---------- -// -// gapped_extend_zero_stats-- -// Clear the statistics for this module. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void gapped_extend_zero_stats - (void) - { - dbg_timing_set_stat (numExtensions, 0); - -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&gappedExtendStats, 0, sizeof(gappedExtendStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - gapped_extend_set_stat (totalPeakScore, 0); - -#endif // collect_stats - } - -//---------- -// -// gapped_extend_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void gapped_extend_show_stats - (arg_dont_complain(FILE* f)) - { - dbg_timing_report_stat (numExtensions, "gapped extensions"); - -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, " number of anchors: %s\n", commatize(gappedExtendStats.numAnchors)); - fprintf (f, " anchors >= %2d bp: %s\n", anchorPeakLen, commatize(gappedExtendStats.numPeaks)); - if (gappedExtendStats.numPeaks > 0) - fprintf (f, "average peak score: %.1f\n", ((float)gappedExtendStats.totalPeakScore) / gappedExtendStats.numPeaks); - fprintf (f, " anchors extended: %s\n", commatize(gappedExtendStats.numAnchorsExtended)); - fprintf (f, " gapped extensions: %s\n", commatize(gappedExtendStats.numExtensions)); - fprintf (f, " DP cells visited: %s\n", commatize(gappedExtendStats.dpCellsVisited)); - if (gappedExtendStats.numExtensions > 0) - fprintf (f, "DP cells/extension: %s\n", commatize((2*gappedExtendStats.dpCellsVisited+gappedExtendStats.numExtensions)/(2*gappedExtendStats.numExtensions))); - fprintf (f, " max DP rows: %s\n", commatize(gappedExtendStats.maxDpRows)); - fprintf (f, " max DP columns: %s\n", commatize(gappedExtendStats.maxDpColumns)); - - if (gappedExtendStats.zallocCallsA != 0) - fprintf (f, " zalloc calls A: %s (%s bytes per)\n", - 
commatize(gappedExtendStats.zallocCallsA), - commatize((gappedExtendStats.zallocTotalA + gappedExtendStats.zallocCallsA/2) / gappedExtendStats.zallocCallsA)); - fprintf (f, " zalloc total A: %s\n", commatize(gappedExtendStats.zallocTotalA)); - - if (gappedExtendStats.zallocCallsB != 0) - fprintf (f, " zalloc calls B: %s (%s bytes per)\n", - commatize(gappedExtendStats.zallocCallsB), - commatize((gappedExtendStats.zallocTotalB + gappedExtendStats.zallocCallsB/2) / gappedExtendStats.zallocCallsB)); - fprintf (f, " zalloc total B: %s\n", commatize(gappedExtendStats.zallocTotalB)); - - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void gapped_extend_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "num_anchors=%" PRId64 "\n", gappedExtendStats.numAnchors); -#endif // collect_stats - } - diff --git a/programs/lastz/src/gapped_extend.h b/programs/lastz/src/gapped_extend.h deleted file mode 100644 index 36b66a7..0000000 --- a/programs/lastz/src/gapped_extend.h +++ /dev/null @@ -1,176 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: gapped_extend.h -// -//---------- - -#ifndef gapped_extend_H // (prevent multiple inclusion) -#define gapped_extend_H - -// other files - -#include "utilities.h" // utility stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef gapped_extend_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef gapped_extend_owner -int gapped_extend_verbosity = 0; // ranges from 0 (no info) to 10 (everything) -int gapped_extend_inhibitTruncationReport = false; - // true => don't report alignment truncations -int gapped_extend_dbgShowIdentity = false; -int gapped_extend_dbgShowHsps = false; 
-int gapped_extend_dbgShowAnchors = false; -int gapped_extend_dbgShowAnchorsHowOften = 0; -int gapped_extend_dbgAllowBatches = false; -#ifdef tryout -int gapped_extend_dbgTriviality = false; -#endif // tryout -#else -global int gapped_extend_verbosity; -global int gapped_extend_inhibitTruncationReport; -global int gapped_extend_dbgShowIdentity; -global int gapped_extend_dbgShowHsps; -global int gapped_extend_dbgShowAnchors; -global int gapped_extend_dbgShowAnchorsHowOften; -global int gapped_extend_dbgAllowBatches; -#ifdef tryout -global int gapped_extend_dbgTriviality; -#endif // tryout -#endif - -//---------- -// -// data structures and types -// -//---------- - -// traceback data structure - -typedef struct tback - { - u32 size; // the number of entries allocated for cell[] - u8 space[1]; // the traceback cells - } tback; - -// special data structure for gappily_extend_hsps(info,...) - -typedef struct hitrepgappily - { - seq* seq1; - seq* seq2; - u8* rev1; - u8* rev2; - scoreset* scoring; - score yDrop; - int trimToPeak; - sthresh scoreThresh; - tback* traceback; - float minIdentity; - float maxIdentity; - float minCoverage; - float maxCoverage; - float minContinuity; - float maxContinuity; - u32 minMatchCount; - s32 maxMismatchCount; - s32 maxSeparateGapsCount; - s32 maxGapColumnsCount; - int deGapifyOutput; - u32 alignmentHashesSize; // number of entries in alignmentHashes[] - u32 alignmentHashesSeen; // number of distinct hash values seen, - u32* alignmentHashes; // .. 
alignmentHashesSeen <= alignmentHashesSize+1 - } hitrepgappily; - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int64 numAnchors; - int64 numAnchorsExtended; - int64 numPeaks; -#if (scoreType == 'I') - s64 totalPeakScore; -#else - score totalPeakScore; -#endif // scoreType - int64 numExtensions; - int64 dpCellsVisited; - int64 maxDpRows; - int64 maxDpColumns; - int64 zallocCallsA; - int64 zallocTotalA; - int64 zallocCallsB; - int64 zallocTotalB; - } gappedExtendStats; - -// stats macros - -#define gapped_extend_count_stat(field) ++gappedExtendStats.field -#define gapped_extend_uncount_stat(field) --gappedExtendStats.field -#define gapped_extend_set_stat(field,val) (gappedExtendStats.field = val) -#define gapped_extend_add_stat(field,val) (gappedExtendStats.field += val) -#define gapped_extend_max_stat(field,val) if (val > gappedExtendStats.field) gappedExtendStats.field = val -#else -#define gapped_extend_count_stat(field) -#define gapped_extend_uncount_stat(field) -#define gapped_extend_set_stat(field,val) -#define gapped_extend_add_stat(field,val) -#define gapped_extend_max_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void gapped_extend_zero_stats (void); -void gapped_extend_show_stats (FILE* f); -void gapped_extend_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in gapped_extend.c -// -//---------- - -void reduce_to_points (seq* seq1, seq* seq2, scoreset* scoring, - segtable* anchors); -alignel* gapped_extend (seq* seq1, u8* rev1, seq* seq2, u8* rev2, - int inhibitTrivial, - scoreset* scoring, segtable* anchors, tback* tb, - int allBounds, score yDrop, int trimToPeak, - sthresh scoreThresh, - u64 maxPairedBases, - int overlyPairedWarn, int overlyPairedKeep); -void free_segment_batches (void); -tback* new_traceback (u32 size); -void free_traceback (tback* tb); -void free_traceback_rows 
(void); - -u32 gappily_extend_hsps (void* info, - unspos pos1, unspos pos2, unspos length, - score s); -score score_alignment (scoreset* scoring, - u8* seq1, unspos pos1, - u8* seq2, unspos pos2, - editscript* script); - -#ifdef dbgTimingGappedExtend -void gapped_extend_timing_report (FILE* f); -#endif // dbgTimingGappedExtend - -#undef global -#endif // gapped_extend_H diff --git a/programs/lastz/src/genpaf.c b/programs/lastz/src/genpaf.c deleted file mode 100755 index e54f236..0000000 --- a/programs/lastz/src/genpaf.c +++ /dev/null @@ -1,1960 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: genpaf.c -// -//---------- -// -// genpaf-- -// Support for printing alignments in "GENeral Pairwise Alignment Format". -// -// genpaf format is non-standard. It prints each alignment block on a single -// line, and the calling program can specify which fields are printed, and in -// what order. It is best suited for situations in which the alignment file -// will be processed by some other program. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "identity_dist.h" // identity distribution stuff -#include "coverage_dist.h" // query coverage distribution stuff -#include "continuity_dist.h" // query continuity distribution stuff -#include "cigar.h" // cigar alignment format stuff - -#define genpaf_owner // (make this the owner of its globals) -#include "genpaf.h" // interface to this module - -// alignment counter - -static u64 genpafAlignmentNumber; - -// debugging defines - -//#define snoopGenpaf // if this is defined, extra code is added to - // .. track calls to print_genpaf_align() and - // .. print_genpaf_align(); note that another - // .. instance of this define is in output.c - -//---------- -// -// prototypes for private functions -// -//---------- - -static char* extract_key_info (char* keys, char desiredKey); - -//---------- -// -// print_genpaf_job_header-- -// Print genpaf format job header. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. Note that this can be NULL if we -// .. wish to initialize a print job, without actually printing -// .. the header. -// char* keys: A list of the fields we'll be printing (genpafXXX values). -// -// Returns: -// (nothing) -// -//---------- - -void print_genpaf_job_header - (FILE* f, - char* keys) - { - char* k; - int tabCh; - - genpafAlignmentNumber = (u64) -1; // caveat: this only works properly if - // .. we only write one file at a time, - // .. 
and write it completely - - if ((f == NULL) || (keys == NULL)) return; - - // print headers for the desired fields - // $$$ most of the case statement below should be replaced by a loop that - // $$$ .. makes use of genpafName[] - - tabCh = '#'; - for (k=keys ; (*k!=0)&&(*k!=genpafInfoSeparator); k++) - { - if (tabCh == '#') { fprintf (f, "#"); tabCh = '\t'; } - else if (tabCh == 0) { fprintf (f, "#"); tabCh = '\t'; } - else if (*k == genpafCR) { tabCh = '\t'; } - else if (*k == genpafMarker) { tabCh = '\t'; } - else { fprintf (f, "\t"); } - switch (*k) - { - case genpafCR: fprintf (f, "\n"); tabCh = '#'; break; - case genpafMarker: fprintf (f, "~"); tabCh = 0; break; - case genpafNA: break; - case genpafName1: fprintf (f, "name1"); break; - case genpafNumber1: fprintf (f, "number1"); break; - case genpafStrand1: fprintf (f, "strand1"); break; - case genpafSize1: fprintf (f, "size1"); break; - case genpafStart1: fprintf (f, "start1"); break; - case genpafStart1Zero: fprintf (f, "zstart1"); break; - case genpafStart1DotPlot: fprintf (f, "start1"); break; - case genpafStart1Blast: fprintf (f, "bstart1"); break; // (we don't expect the user to ever see this) - case genpafEnd1: fprintf (f, "end1"); break; - case genpafEnd1DotPlot: fprintf (f, "end1"); break; - case genpafEnd1Blast: fprintf (f, "bend1"); break; // (we don't expect the user to ever see this) - case genpafLength1: fprintf (f, "length1"); break; - case genpafAlign1: fprintf (f, "align1"); break; - case genpafText1: fprintf (f, "text1"); break; - case genpafQualsAlign1: fprintf (f, "qalign1"); break; - case genpafName2: fprintf (f, "name2"); break; - case genpafNumber2: fprintf (f, "number2"); break; - case genpafStrand2: fprintf (f, "strand2"); break; - case genpafSize2: fprintf (f, "size2"); break; - case genpafStart2: fprintf (f, "start2"); break; - case genpafStart2Zero: fprintf (f, "zstart2"); break; - case genpafStart2OnPlus: fprintf (f, "start2+"); break; - case genpafStart2ZeroOnPlus: fprintf (f, 
"zstart2+"); break; - case genpafStart2DotPlot: fprintf (f, "start2"); break; - case genpafEnd2: fprintf (f, "end2"); break; - case genpafEnd2OnPlus: fprintf (f, "end2+"); break; - case genpafEnd2DotPlot: fprintf (f, "end2"); break; - case genpafLength2: fprintf (f, "length2"); break; - case genpafAlign2: fprintf (f, "align2"); break; - case genpafText2: fprintf (f, "text2"); break; - case genpafQualsAlign2: fprintf (f, "qalign2"); break; - case genpafMatch: fprintf (f, "nmatch"); break; - case genpafMismatch: fprintf (f, "nmismatch"); break; - case genpafAlignedPairs: fprintf (f, "npair"); break; - case genpafAlignmentColumns: fprintf (f, "ncolumn"); break; - case genpafSeparateGaps: fprintf (f, "ngap"); break; - case genpafGapColumns: fprintf (f, "cgap"); break; - case genpafTextDiff: fprintf (f, "diff"); break; - case genpafCigar: fprintf (f, "cigar"); break; - case genpafCigarLower: fprintf (f, "cigar-"); break; - case genpafCigarX: fprintf (f, "cigarx"); break; - case genpafCigarXLower: fprintf (f, "cigarx-"); break; - case genpafDiagonal: fprintf (f, "diagonal"); break; - case genpafShingle: fprintf (f, "shingle"); break; - case genpafScore: fprintf (f, "score"); break; - case genpafAlignmentNumZero: fprintf (f, "znumber"); break; - case genpafAlignmentNum: fprintf (f, "number"); break; - case genpafBlastBitScore: fprintf (f, "bitscore"); break; - case genpafBlastEValue: fprintf (f, "evalue"); break; - case genpafIdentity: fprintf (f, "identity\tidPct"); break; - case genpafIdentityFrac: fprintf (f, "idfrac"); break; - case genpafIdentityPct: fprintf (f, "id%%"); break; - case genpafBlastIdentityPct: fprintf (f, "blastid%%"); break; - case genpafCoverage: fprintf (f, "coverage\tcovPct"); break; - case genpafCoverageFrac: fprintf (f, "covfrac"); break; - case genpafCoveragePct: fprintf (f, "cov%%"); break; - case genpafContinuity: fprintf (f, "continuity\tconPct"); break; - case genpafContinuityFrac: fprintf (f, "confrac"); break; - case genpafContinuityPct: 
fprintf (f, "con%%"); break; - case genpafGapRate: fprintf (f, "gaprate\tgapPct"); break; - case genpafChoreId: fprintf (f, "chore"); break; - case genpafTargetNucs: fprintf (f, "nucs1"); break; - case genpafTargetQuals: fprintf (f, "quals1"); break; - case genpafQueryNucs: fprintf (f, "nucs2"); break; - case genpafQueryQuals: fprintf (f, "quals2"); break; - case genpafHspId: fprintf (f, "hspid"); break; - case genpafPositionHash: fprintf (f, "phash"); break; - case genpafAlignmentHash: fprintf (f, "ahash"); break; - default: break; - } - } - fprintf (f, "\n"); - } - -//---------- -// -// print_genpaf_job_footer-- -// Print genpaf format job footer. -// -//---------- - -void print_genpaf_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_genpaf_header-- -// Print genpaf format query header. -// -//---------- - -void print_genpaf_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - // (do nothing) - } - -//---------- -// -// print_blast_job_header-- -// Print blast format job header. -// -//---------- - -void print_blast_job_header - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_blast_job_footer-- -// Print blast format job footer. -// -//---------- - -void print_blast_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_blast_header-- -// Print blast format query header. -// -//---------- - -void print_blast_header - (arg_dont_complain(FILE* f), - char* _programName, - char* _args, - seq* database, - seq* query) - { - char* programName = _programName; - char* args = _args; - - if (programName == NULL) programName = "(no name)"; - if (args == NULL) args = ""; - - char* name = (query->useFullNames)? 
query->header : query->shortHeader; - if ((name == NULL) || (name[0] == 0)) name = "query"; - - fprintf (f, "# %s %s\n", programName, args); - fprintf (f, "# Query: %s\n", name); - fprintf (f, "# Database: %s\n", database->filename); - fprintf (f, "# Fields: query id, subject id, %% identity, alignment length" - ", mismatches, gap opens, q. start, q. end, s. start, s. end" - ", evalue, bit score\n"); - } - -//---------- -// -// print_genpaf_align_list-- -// Print a list of gapped alignments in genpaf format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// char* keys: A list of the fields to print (genpafXXX values). -// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopGenpaf === - -#ifndef snoopGenpaf -#define snoopGenpaf_1 ; -#define snoopGenpaf_2 ; -#endif // not snoopGenpaf - -#ifdef snoopGenpaf - -#define snoopGenpaf_1 \ - fprintf (stderr, "entering print_genpaf_align_list, alignList=%p\n", \ - alignList); - -#define snoopGenpaf_2 \ - fprintf (stderr, "in print_genpaf_align_list, a=%p\n", a); - -#endif // snoopGenpaf - - -//=== print_genpaf_align_list === - -void print_genpaf_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - char* keys) - { - alignel* a; - int computeId, computeCov, computeCon, computeGap; - unspos idNumer, idDenom; - unspos covNumer, covDenom; - unspos conNumer, conDenom; - unspos gapNumer, gapDenom; - - idNumer = idDenom = covNumer = covDenom = 0; - - computeId = (strchr (keys, genpafIdentity) != NULL) - || (strchr (keys, genpafIdentityFrac) != NULL) - || (strchr (keys, genpafIdentityPct) != NULL) - || (strchr (keys, genpafBlastIdentityPct) != NULL) - || (strchr (keys, genpafMatch) != NULL) - || (strchr (keys, genpafMismatch) != NULL) - || (strchr (keys, genpafAlignedPairs) != NULL); - - computeCov = (strchr (keys, genpafCoverage) != NULL) - || (strchr 
(keys, genpafCoverageFrac) != NULL) - || (strchr (keys, genpafCoveragePct) != NULL); - - computeCon = (strchr (keys, genpafContinuity) != NULL) - || (strchr (keys, genpafContinuityFrac) != NULL) - || (strchr (keys, genpafContinuityPct) != NULL) - || (strchr (keys, genpafAlignmentColumns) != NULL) - || (strchr (keys, genpafSeparateGaps) != NULL) - || (strchr (keys, genpafGapColumns) != NULL) - || (strchr (keys, genpafBlastIdentityPct) != NULL); - - computeGap = (strchr (keys, genpafGapRate) != NULL); - - //snoopGenpaf_1; - - for (a=alignList ; a!=NULL ; a=a->next) - { - if (computeId) alignment_identity (seq1, seq2, a, &idNumer, &idDenom); - if (computeCov) alignment_coverage (seq1, seq2, a, &covNumer, &covDenom); - if (computeCon) alignment_continuity ( a, &conNumer, &conDenom); - if (computeGap) alignment_gap_rate ( a, &gapNumer, &gapDenom); - - //snoopGenpaf_2; - print_genpaf_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s, a->hspId, - keys, - idNumer, idDenom, - covNumer, covDenom, - conNumer, conDenom, - gapNumer, gapDenom); - } - } - -//---------- -// -// print_genpaf_align_list_segments-- -// Print a list of gapped alignments in genpaf format, splitting them into -// ungapped segments. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// char* keys: A list of the fields to print (genpafXXX values). -// scoreset* scoring: The scoring scheme to use for rescoring. This is -// .. only used if the keys include a key that -// .. requires the score (e.g. genpafScore). In that -// .. case each segment is rescored separately. 
-// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopGenpaf === - -#ifndef snoopGenpaf -#define snoopGenpaf_3 ; -#define snoopGenpaf_4 ; -#endif // not snoopGenpaf - -#ifdef snoopGenpaf - -#define snoopGenpaf_3 \ - fprintf (stderr, "segmenting(" \ - "%s:" unsposDotsFmt " %s:" unsposDotsFmt ") (%s)\n", \ - seq1->header, beg1, end1, seq2->header, beg2, end2, \ - (seq1->revCompFlags==seq2->revCompFlags)? "same strand" : "opposite strands"); - -#define snoopGenpaf_4 \ - { \ - unspos snoopIx; \ - fprintf (stderr, " " unsposStarFmt " ", 10, b1); \ - for (snoopIx=0 ; snoopIxv[b1+snoopIx]); \ - fprintf (stderr, "\n"); \ - fprintf (stderr, " " unsposStarFmt " ", 10, b2); \ - for (snoopIx=0 ; snoopIxv[b2+snoopIx]); \ - fprintf (stderr, "\n"); \ - } - -#endif // snoopGenpaf - - -//=== print_genpaf_align_list_segments === - -void print_genpaf_align_list_segments - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - char* keys, - scoreset* scoring) - { - alignel* a; - unspos beg1, end1, beg2, end2; - unspos height, width, i, j, prevI, prevJ, run, b1, b2; - u32 opIx; - score s; - - for (a=alignList ; a!=NULL ; a=a->next) - { - beg1 = a->beg1; - end1 = a->end1; - beg2 = a->beg2; - end2 = a->end2; - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - snoopGenpaf_3; - - // print the alignment's segments - - opIx = 0; - for (i=j=0 ; (i< height)||(jscript, &opIx); - i += run; j += run; - if ((i < height) || (j < width)) - edit_script_indel_len (a->script, &opIx, &i, &j); - - b1 = beg1-1 + prevI; - b2 = beg2-1 + prevJ; - snoopGenpaf_4; - - s = 0; - if (strchr (keys, genpafScore) != NULL) - s = score_match (scoring, seq1, b1, seq2, b2, run); - print_genpaf_match (f, seq1, b1, seq2, b2, run, s, a->hspId, keys); - } - } - - } - -//---------- -// -// print_genpaf_align-- -// Print a single gapped alignment in genpaf format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. 
-// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment -// .. takes in the DP matrix. -// score s: The alignment's score. -// char* keys: A list of the fields to print (genpafXXX values). -// unspos idNumer: Identity -// unspos idDenom: -// unspos covNumer: Coverage -// unspos covDenom: -// unspos gapNumer: Gap Rate -// unspos gapDenom: -// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopGenpaf === - -#ifndef snoopGenpaf -#define snoopGenpaf_5 ; -#endif // not snoopGenpaf - -#ifdef snoopGenpaf - -#define snoopGenpaf_5 \ - fprintf (stderr, " print_genpaf_align(" \ - "%s:" unsposDotsFmt " %s:" unsposDotsFmt ")\n", \ - seq1->header, beg1, end1, seq2->header, beg2, end2); - -#endif // snoopGenpaf - - -//=== miscellany for print_genpaf_align === - -static char* rcfSuffix[4] = { "", "~", "~", "" }; - - -static char diff_char (u8 p, u8 q, char* textDiffInfo); -static char diff_char (u8 p, u8 q, char* textDiffInfo) - { - s8 b1, b2; - char c; - - b1 = nuc_to_bits[p]; - b2 = nuc_to_bits[q]; - - if ((b1 < 0) || (b2 < 0)) c = textDiffInfo[genpafTDInfoOther]; - else if (b1 == b2) c = textDiffInfo[genpafTDInfoMatch]; - else if (bits_to_pur_pyr[(u8)b1] == bits_to_pur_pyr[(u8)b2]) c = textDiffInfo[genpafTDInfoTransition]; - else c = textDiffInfo[genpafTDInfoTransversion]; - - return c; - } - - -//=== print_genpaf_align === - -void print_genpaf_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - score s, - u64 hspId, - char* keys, - unspos idNumer, - unspos idDenom, - unspos covNumer, - unspos covDenom, - unspos conNumer, - unspos conDenom, - unspos gapNumer, - unspos gapDenom) - { - char* k; - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos height, 
width, i, j, prevI, prevJ, run; - u32 opIx; - u8* p, *q; - unspos ix, len1, len2; - char* name1, *name2, *suff1, *suff2; - unspos offset1, offset2, start1, start2, endOffset1, endOffset2; - unspos startLoc1, startLoc2; - unspos dotStart1, dotStart2, dotEnd1, dotEnd2; - unspos seq1Len, seq2Len, seq1True, seq2True; - unspos seq1Invert, seq2Invert; - u32 seq1Contig, seq2Contig; - char strand1, strand2; - unspos startI, startJ; - int tabCh; - char* textDiffInfo; - sgnpos diag, diagSE, diagNW; - unspos numGaps; - u32 h; - - snoopGenpaf_5; - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - len1 = height = end1 - beg1 + 1; - len2 = width = end2 - beg2 + 1; - - genpafAlignmentNumber++; - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - seq1Contig = seq1->contig; - seq1Invert = seq1True; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1-1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - seq1Contig = part->contig; - seq1Invert = part->sepBefore + part->sepAfter + 1; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - seq2Contig = seq2->contig; - seq2Invert = seq2True; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2-1); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - seq2Contig = part->contig; - seq2Invert = part->sepBefore + part->sepAfter + 1; - } - - ////////// - // figure out strandedness - ////////// - - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = beg1-1 - offset1 + startLoc1; - if (sp1->p == NULL) dotStart1 = start1 + 1; - else dotStart1 = beg1 + 1; - dotEnd1 = dotStart1 + height - 1; - strand1 = '+'; - } - else - { - start1 = beg1-1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - if (sp1->p == NULL) dotStart1 = (startLoc1 + seq1Len + offset1 - beg1) - 1; - else dotStart1 = seq1Invert - beg1; - dotEnd1 = (dotStart1 - height) + 1; - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2-1 - offset2 + startLoc2; - if (sp2->p == NULL) dotStart2 = start2 + 1; - else dotStart2 = beg2 + 1; - dotEnd2 = dotStart2 + width - 1; - strand2 = '+'; - } - else - { - start2 = beg2-1 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - if (sp1->p == NULL) dotStart2 = (startLoc2 + seq2Len + offset2 - beg2) - 1; - else dotStart2 = seq2Invert - beg2; - dotEnd2 = (dotStart2 - width) + 1; - strand2 = '-'; - } - - // print the desired fields - - textDiffInfo = extract_key_info (keys, genpafTextDiff); - if (textDiffInfo == NULL) textDiffInfo = genpafTDInfoDefault; - - tabCh = '#'; - for (k=keys ; (*k!=0)&&(*k!=genpafInfoSeparator); k++) - { - if ((tabCh == '#') - || (tabCh == 0) - || (*k == genpafCR) - || (*k == genpafMarker)) - tabCh = 
'\t'; - else - fprintf (f, "\t"); - - switch (*k) - { - case genpafCR: - fprintf (f, "\n"); - tabCh = '#'; - break; - case genpafMarker: - fprintf (f, "~"); - tabCh = 0; - break; - case genpafNA: - fprintf (f, "NA"); - break; - case genpafName1: - fprintf (f, "%s%s", name1, suff1); - break; - case genpafNumber1: - fprintf (f, "%u", seq1Contig-1); - break; - case genpafStrand1: - fprintf (f, "%c", strand1); - break; - case genpafSize1: - fprintf (f, unsposFmt, seq1True); - break; - case genpafStart1: - fprintf (f, unsposFmt, start1); - break; - case genpafStart1Zero: - fprintf (f, unsposFmt, start1-1); - break; - case genpafStart1DotPlot: - fprintf (f, unsposFmt, dotStart1); - break; - case genpafStart1Blast: - if (strand2 == strand1) fprintf (f, unsposFmt, start1); - else fprintf (f, unsposFmt, start1+len1-1); - break; - case genpafEnd1: - fprintf (f, unsposFmt, start1+len1-1); - break; - case genpafEnd1DotPlot: - fprintf (f, unsposFmt, dotEnd1); - break; - case genpafEnd1Blast: - if (strand2 == strand1) fprintf (f, unsposFmt, start1+len1-1); - else fprintf (f, unsposFmt, start1); - break; - case genpafLength1: - fprintf (f, unsposFmt, height); - break; - case genpafAlign1: - case genpafText1: - // print aligning path in sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - - opIx = 0; - for (i=j=0 ; (iv+beg1+i-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIvq == NULL) - fprintf (f, "*"); - else - { - opIx = 0; - for (i=j=0 ; (ivq+beg1+i-1; - for (ix=0 ; ixvq+beg1+i-1; - startJ = j; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIv+beg2+j-1; - for (ix=0 ; ixv+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIvq == NULL) - fprintf (f, "*"); - else - { - opIx = 0; - for (i=j=0 ; (ivq+beg2+j-1; - for (ix=0 ; 
ixvq+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIv+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startI 0) - { - if (*k == genpafCigar) - fprintf (f, unsposFmt "M", run); - else // if (*k == genpafCigarLower) - fprintf (f, unsposFmt "m", run); - i += run; j += run; - } - - if ((i < height) || (j < width)) - { - prevI = i; prevJ = j; - edit_script_indel_len (script, &opIx, &i, &j); - if (i > prevI) - { - if (*k == genpafCigar) - fprintf (f, unsposFmt "D", i - prevI); - else - fprintf (f, unsposFmt "d", i - prevI); - } - if (j > prevJ) - { - if (*k == genpafCigar) - fprintf (f, unsposFmt "I", j - prevJ); - else - fprintf (f, unsposFmt "i", j - prevJ); - } - } - } - break; - case genpafCigarX: - case genpafCigarXLower: - print_cigar_align (f, seq1, beg1-1, end1, seq2, beg2-1, end2, - script, s, - /* withInfo */ false, - /* markMismatches */ true, - /* letterAfter */ true, - /* hideSingles */ true, - /* lowerCase */ (*k == genpafCigarXLower), - /* withNewLine */ false); - break; - case genpafDiagonal: - fprintf (f, sgnposFmt, diagNumber(start1,start2)); - break; - case genpafShingle: - diag = diagNumber (start1, start2); - diagSE = seq1Len - diag; - diagNW = seq2Len + diag; - if (diag < 0) - { - if ((diagNW < 0) - || ((u32) diagNW < seq1Len)) diag = -diagNW; - else diag = 0; - } - else if (diag > 0) - { - if ((diagSE < 0) - || ((u32) diagSE < seq2Len)) diag = diagSE; - else diag = 0; - } - if (diag == 0) fprintf (f, "NA"); - else fprintf (f, sgnposFmt, diag); - break; - case genpafScore: - fprintf (f, scoreFmt, s); - break; - case genpafAlignmentNumZero: - fprintf (f, u64Fmt, genpafAlignmentNumber); - break; - case genpafAlignmentNum: - fprintf (f, u64Fmt, 1+genpafAlignmentNumber); - break; - case genpafBlastBitScore: - fprintf (f, "%.1f", blastz_score_to_ncbi_bits(s)); - 
break; - case genpafBlastEValue: - fprintf (f, "%.2g", blastz_score_to_ncbi_expectation(s)); - break; - case genpafIdentity: - fprintf (f, unsposSlashFmt, idNumer, idDenom); - if (idDenom != 0) fprintf (f, "\t%.1f%%", (100.0*idNumer) / idDenom); - else fprintf (f, "\tNA"); - break; - case genpafIdentityFrac: - fprintf (f, unsposSlashFmt, idNumer, idDenom); - break; - case genpafIdentityPct: - if (idDenom != 0) fprintf (f, "%.1f%%", (100.0*idNumer) / idDenom); - else fprintf (f, "NA"); - break; - case genpafBlastIdentityPct: - if (conDenom != 0) fprintf (f, "%.2f", (100.0*idNumer) / conDenom); - else fprintf (f, "NA"); - break; - case genpafCoverage: - fprintf (f, unsposSlashFmt, covNumer, covDenom); - if (covDenom != 0) fprintf (f, "\t%.1f%%", (100.0*covNumer) / covDenom); - else fprintf (f, "\tNA"); - break; - case genpafCoverageFrac: - fprintf (f, unsposSlashFmt, covNumer, covDenom); - break; - case genpafCoveragePct: - if (covDenom != 0) fprintf (f, "%.1f%%", (100.0*covNumer) / covDenom); - else fprintf (f, "NA"); - break; - case genpafContinuity: - fprintf (f, unsposSlashFmt, conNumer, conDenom); - if (conDenom != 0) fprintf (f, "\t%.1f%%", (100.0*conNumer) / conDenom); - else fprintf (f, "\tNA"); - break; - case genpafContinuityFrac: - fprintf (f, unsposSlashFmt, conNumer, conDenom); - break; - case genpafContinuityPct: - if (conDenom != 0) fprintf (f, "%.1f%%", (100.0*conNumer) / conDenom); - else fprintf (f, "NA"); - break; - case genpafGapRate: - fprintf (f, unsposSlashFmt, gapNumer, gapDenom); - if (gapDenom != 0) fprintf (f, "\t%.1f%%", (100.0*gapNumer) / gapDenom); - else fprintf (f, "\tNA"); - break; - case genpafChoreId: - if ((seq2->choresFile == NULL) - || (seq2->chore.idTag[0] == 0)) - fprintf (f, "NA"); - else - fprintf (f, "%s", seq2->chore.idTag); - break; - case genpafTargetNucs: - if (strand1 == '+') - { - for (ix=0 ; ixv[offset1+ix])); - } - else - { - endOffset1 = offset1 + seq1Len-1; - for (ix=0 ; ixv[endOffset1-ix]])); - } - break; - case 
genpafTargetQuals: - if (seq1->vq == NULL) - fprintf (f, "*"); - else if (strand1 == '+') - { - for (ix=0 ; ixvq[offset1+ix]); - } - else - { - endOffset1 = offset1 + seq1Len-1; - for (ix=0 ; ixvq[endOffset1-ix]); - } - break; - case genpafQueryNucs: - if (strand2 == '+') - { - for (ix=0 ; ixv[offset2+ix])); - } - else - { - endOffset2 = offset2 + seq2Len-1; - for (ix=0 ; ixv[endOffset2-ix]])); - } - break; - case genpafQueryQuals: - if (seq2->vq == NULL) - fprintf (f, "*"); - else if (strand2 == '+') - { - for (ix=0 ; ixvq[offset2+ix]); - } - else - { - endOffset2 = offset2 + seq2Len-1; - for (ix=0 ; ixvq[endOffset2-ix]); - } - break; - case genpafHspId: - fprintf (f, u64Fmt, hspId); - break; - case genpafPositionHash: - h = alignment_hash (beg1, end1, seq1->revCompFlags, - beg2, end2, seq2->revCompFlags, - /*script*/ NULL); - fprintf (f, "%08lX", (long) h); - break; - case genpafAlignmentHash: - h = alignment_hash (beg1, end1, seq1->revCompFlags, - beg2, end2, seq2->revCompFlags, - script); - fprintf (f, "%08lX", (long) h); - break; - default: - break; - } - } - fprintf (f, "\n"); - } - -//---------- -// -// print_genpaf_match-- -// Print an hsp in genpaf format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. -// unspos pos1: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the HSP. -// score s: The HSP's score. -// char* keys: A list of the fields to print (genpafXXX values). 
-// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopGenpaf === - -#ifndef snoopGenpaf -#define snoopGenpaf_6 ; -#endif // not snoopGenpaf - -#ifdef snoopGenpaf - -#define snoopGenpaf_6 \ - fprintf (stderr, " print_genpaf_match(" \ - "%s:" unsposFmt " %s:" unsposFmt ")" \ - " length=" unsposFmt "\n", \ - seq1->header, pos1, seq2->header, pos2, length); - -#endif // snoopGenpaf - - -//=== print_genpaf_match === - -void print_genpaf_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - u64 hspId, - char* keys) - { - char* k; - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - char* name1, *name2, *suff1, *suff2; - unspos offset1, offset2, start1, start2, endOffset1, endOffset2; - unspos startLoc1, startLoc2; - unspos dotStart1, dotStart2, dotEnd1, dotEnd2; - unspos seq1Len, seq2Len, seq1True, seq2True; - unspos seq1Invert, seq2Invert; - u32 seq1Contig, seq2Contig; - char strand1, strand2; - unspos ix; - segment seg; - unspos numer, denom; - int tabCh; - char* textDiffInfo; - sgnpos diag, diagSE, diagNW; - u32 h; - - snoopGenpaf_6; - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_genpaf_match"); - - genpafAlignmentNumber++; - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - seq1Contig = seq1->contig; - seq1Invert = seq1True; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - seq1Contig = part->contig; - seq1Invert = part->sepBefore + part->sepAfter + 1; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - seq2Contig = seq2->contig; - seq2Invert = seq2True; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - seq2Contig = part->contig; - seq2Invert = part->sepBefore + part->sepAfter + 1; - } - - ////////// - // figure out strandedness - ////////// - - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = pos1 - offset1 + startLoc1; - if (sp1->p == NULL) dotStart1 = start1; - else dotStart1 = pos1 + 1; - dotEnd1 = dotStart1 + length - 1; - strand1 = '+'; - } - else - { - start1 = pos1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - if (sp1->p == NULL) dotStart1 = (startLoc1 + seq1Len + offset1 - pos1) - 1; - else dotStart1 = seq1Invert - pos1; - dotEnd1 = (dotStart1 - length) + 1; - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = pos2 - offset2 + startLoc2; - if (sp2->p == NULL) dotStart2 = start2; - else dotStart2 
= pos2 + 1; - dotEnd2 = (dotStart2 + length) - 1; - strand2 = '+'; - } - else - { - start2 = pos2 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - if (sp2->p == NULL) dotStart2 = (startLoc2 + seq2Len + offset2 - pos2) - 1; - else dotStart2 = seq2Invert - pos2; - dotEnd2 = (dotStart2 - length) + 1; - strand2 = '-'; - } - - // print the desired fields - - textDiffInfo = extract_key_info (keys,genpafTextDiff); - if (textDiffInfo == NULL) textDiffInfo = genpafTDInfoDefault; - - tabCh = '#'; - for (k=keys ; (*k!=0)&&(*k!=genpafInfoSeparator); k++) - { - if ((tabCh == '#') - || (tabCh == 0) - || (*k == genpafCR) - || (*k == genpafMarker)) - tabCh = '\t'; - else - fprintf (f, "\t"); - - switch (*k) - { - case genpafCR: - fprintf (f, "\n"); - tabCh = '#'; - break; - case genpafMarker: - fprintf (f, "~"); - tabCh = 0; - break; - case genpafNA: - fprintf (f, "NA"); - break; - case genpafName1: - fprintf (f, "%s%s", name1, suff1); - break; - case genpafNumber1: - fprintf (f, "%u", seq1Contig-1); - break; - case genpafStrand1: - fprintf (f, "%c", strand1); - break; - case genpafSize1: - fprintf (f, unsposFmt, seq1True); - break; - case genpafStart1: - fprintf (f, unsposFmt, start1); - break; - case genpafStart1Zero: - fprintf (f, unsposFmt, start1-1); - break; - case genpafStart1DotPlot: - fprintf (f, unsposFmt, dotStart1); - break; - case genpafStart1Blast: - if (strand2 == strand1) fprintf (f, unsposFmt, start1); - else fprintf (f, unsposFmt, start1-1 + length); - break; - case genpafEnd1: - fprintf (f, unsposFmt, start1-1 + length); - break; - case genpafEnd1DotPlot: - fprintf (f, unsposFmt, dotEnd1); - break; - case genpafEnd1Blast: - if (strand2 == strand1) fprintf (f, unsposFmt, start1-1 + length); - else fprintf (f, unsposFmt, start1); - case genpafLength1: - fprintf (f, unsposFmt, length); - break; - case genpafAlign1: - case genpafText1: - for (ix=0 ; ixvq == NULL) - fprintf (f, "*"); - else - { - for (ix=0 ; ixvq[start1+ix]); - } - break; - case genpafName2: - 
fprintf (f, "%s%s", name2, suff2); - break; - case genpafNumber2: - fprintf (f, "%u", seq2Contig-1); - break; - case genpafStrand2: - fprintf (f, "%c", strand2); - break; - case genpafSize2: - fprintf (f, unsposFmt, seq2True); - break; - case genpafStart2OnPlus: - if (strand2 == '-') - { - fprintf (f, unsposFmt, seq2True + 2 - (start2+length)); - break; - } - // (fall thru) - case genpafStart2: - fprintf (f, unsposFmt, start2); - break; - case genpafStart2ZeroOnPlus: - if (strand2 == '-') - { - fprintf (f, unsposFmt, seq2True + 1 - (start2+length)); - break; - } - // (fall thru) - case genpafStart2Zero: - fprintf (f, unsposFmt, start2-1); - break; - case genpafStart2DotPlot: - fprintf (f, unsposFmt, dotStart2); - break; - case genpafEnd2OnPlus: - if (strand2 == '-') - { - fprintf (f, unsposFmt, seq2True + 1 - start2); - break; - } - // (fall thru) - case genpafEnd2: - fprintf (f, unsposFmt, start2-1 + length); - break; - case genpafEnd2DotPlot: - fprintf (f, unsposFmt, dotEnd2); - break; - case genpafLength2: - fprintf (f, unsposFmt, length); - break; - case genpafAlign2: - case genpafText2: - for (ix=0 ; ixvq == NULL) - fprintf (f, "*"); - else - { - for (ix=0 ; ixvq[start2+ix]); - } - break; - case genpafMatch: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, unsposFmt, numer); - break; - case genpafMismatch: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, unsposFmt, denom - numer); - break; - case genpafAlignedPairs: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, unsposFmt, denom); - break; - case genpafAlignmentColumns: - fprintf (f, unsposFmt, length); - break; - case genpafSeparateGaps: - case genpafGapColumns: - fprintf (f, "0"); - break; - case genpafTextDiff: - for (ix=0 ; ix 0) - { - if ((diagSE < 0) - || ((u32) diagSE < seq2Len)) diag = diagSE; - else diag = 0; - } - if (diag == 0) fprintf (f, "NA"); - else fprintf (f, sgnposFmt, diag); - break; - 
case genpafScore: - fprintf (f, scoreFmt, s); - break; - case genpafAlignmentNumZero: - fprintf (f, u64Fmt, genpafAlignmentNumber); - break; - case genpafAlignmentNum: - fprintf (f, u64Fmt, 1+genpafAlignmentNumber); - break; - case genpafBlastBitScore: - fprintf (f, "%.1f", blastz_score_to_ncbi_bits(s)); - break; - case genpafBlastEValue: - fprintf (f, "%.2g", blastz_score_to_ncbi_expectation(s)); - break; - case genpafIdentity: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, "\t%.1f%%", (100.0*numer) / denom); - else fprintf (f, "\tNA"); - break; - case genpafIdentityFrac: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - fprintf (f, unsposSlashFmt, numer, denom); - break; - case genpafIdentityPct: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - if (denom != 0) fprintf (f, "%.1f%%", (100.0*numer) / denom); - else fprintf (f, "NA"); - break; - case genpafBlastIdentityPct: - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - if (denom != 0) fprintf (f, "%.2f", (100.0*numer) / denom); - else fprintf (f, "NA"); - break; - case genpafCoverage: - seg.pos1 = pos1; - seg.pos2 = pos2; - seg.length = length; - segment_coverage (seq1, seq2, &seg, &numer, &denom); - fprintf (f, unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, "\t%.1f%%", (100.0*numer) / denom); - else fprintf (f, "\tNA"); - break; - case genpafCoverageFrac: - seg.pos1 = pos1; - seg.pos2 = pos2; - seg.length = length; - segment_coverage (seq1, seq2, &seg, &numer, &denom); - fprintf (f, unsposSlashFmt, numer, denom); - break; - case genpafCoveragePct: - seg.pos1 = pos1; - seg.pos2 = pos2; - seg.length = length; - segment_coverage (seq1, seq2, &seg, &numer, &denom); - if (denom != 0) fprintf (f, "%.1f%%", (100.0*numer) / denom); - else fprintf (f, "NA"); - break; - case genpafContinuity: - numer = denom = length; - fprintf (f, unsposSlashFmt, 
numer, denom); - if (denom != 0) fprintf (f, "\t%.1f%%", (100.0*numer) / denom); - else fprintf (f, "\tNA"); - break; - case genpafContinuityFrac: - numer = denom = length; - fprintf (f, unsposSlashFmt, numer, denom); - break; - case genpafContinuityPct: - numer = denom = length; - if (denom != 0) fprintf (f, "%.1f%%", (100.0*numer) / denom); - else fprintf (f, "NA"); - break; - case genpafGapRate: - numer = 0; - denom = length; - fprintf (f, unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, "\t%.1f%%", (100.0*numer) / denom); - else fprintf (f, "\tNA"); - break; - case genpafChoreId: - if ((seq2->choresFile == NULL) - || (seq2->chore.idTag[0] == 0)) - fprintf (f, "NA"); - else - fprintf (f, "%s", seq2->chore.idTag); - break; - case genpafTargetNucs: - if (strand1 == '+') - { - for (ix=0 ; ixv[offset1+ix])); - } - else - { - endOffset1 = offset1 + seq1Len-1; - for (ix=0 ; ixv[endOffset1-ix]])); - } - break; - case genpafTargetQuals: - if (seq1->vq == NULL) - fprintf (f, "*"); - else if (strand1 == '+') - { - for (ix=0 ; ixvq[offset1+ix]); - } - else - { - endOffset1 = offset1 + seq1Len-1; - for (ix=0 ; ixvq[endOffset1-ix]); - } - break; - case genpafQueryNucs: - if (strand2 == '+') - { - for (ix=0 ; ixv[offset2+ix])); - } - else - { - endOffset2 = offset2 + seq2Len-1; - for (ix=0 ; ixv[endOffset2-ix]])); - } - break; - case genpafQueryQuals: - if (seq2->vq == NULL) - fprintf (f, "*"); - else if (strand2 == '+') - { - for (ix=0 ; ixvq[offset2+ix]); - } - else - { - endOffset2 = offset2 + seq2Len-1; - for (ix=0 ; ixvq[endOffset2-ix]); - } - break; - case genpafHspId: - fprintf (f, u64Fmt, hspId); - break; - case genpafPositionHash: - h = alignment_hash (pos1, pos1+length, seq1->revCompFlags, - pos2, pos2+length, seq2->revCompFlags, - /*script*/ NULL); - fprintf (f, "%08lX", (long) h); - break; - case genpafAlignmentHash: - fprintf (f, "(notimp)"); // $$$ not implemented yet - break; - default: - break; - } - } - fprintf (f, "\n"); - } - -//---------- -// 
-// parse_genpaf_keys-- -// Convert a list of genpaf names to field keys. -// -//---------- -// -// Arguments: -// char* s: A comma-separated list of the NAMES of the fields to print -// (genpafName or genpafAliases strings). -// -// Returns: -// A list of the fields to print (genpafXXX values). This is dynamically -// allocated from the heap. -// -//---------- - -char* parse_genpaf_keys - (char* s) - { - char* ss, *keys, *kScan, *field, *diffChars; - int numFields, ix, key; - char terminator; - int haveDiff; - - // figure out how many fields there will be - - numFields = 1; - for (ss=s ; *ss!=0 ; ss++) - if (*ss == ',') numFields++; - - // allocate key string (the extra characters are for a potential set of - // characters for genpafTextDiff) - - keys = malloc_or_die ("parse_genpaf_keys", - numFields+genpafTDInfoSize+1); - diffChars = NULL; - haveDiff = false; - - // parse fields - - kScan = keys; - field = s; - for (ss=s ; ; ss++) - { - if ((*ss != ',') && (*ss != 0)) continue; - - terminator = *ss; - *ss = 0; - - if (field[0] == 0) // (empty string) - key = genpafCR; - else - { - key = -1; - for (ix=0 ; genpafName[ix].name!=NULL ; ix++) - { - if (strcmp (field, genpafName[ix].name) != 0) continue; - key = genpafName[ix].key; - break; - } - if (key < 0) - { - for (ix=0 ; genpafAliases[ix].name!=NULL ; ix++) - { - if (strcmp (field, genpafAliases[ix].name) != 0) continue; - key = genpafAliases[ix].key; - break; - } - } - if ((key < 0) - && (strcmp_prefix (field, genpafTDName) == 0) - && (strlen (field) == strlen(genpafTDName) + genpafTDInfoSize)) - { - key = genpafTextDiff; - diffChars = field + strlen(genpafTDName); - if (strchr (diffChars, genpafInfoSeparator) != NULL) - suicidef ("%s field cannot contain %c", - genpafTDName, genpafInfoSeparator); - } - if (key < 0) - suicidef ("unrecognized field name (for --format=general): \"%s\"", field); - if (key == genpafTextDiff) - { - if (haveDiff) - suicidef ("duplicate field name: %s", genpafTDName); - haveDiff = 
true; - } - } - - *(kScan++) = key; - field = ss+1; - - if (terminator == 0) break; - } - - if (diffChars != NULL) - { - *(kScan++) = genpafInfoSeparator; - *(kScan++) = genpafTextDiff; - for (ix=0 ; ix // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef genpaf_owner -#define global -#else -#define global extern -#endif - -// type codes for printable fields; note that these single character codes -// are not something the user ever sees, so the only real requirements are that -// they be unique, and that they agree with definitions of genpafStandardKeys -// and other genpafXXXKeys; they needn't be printable characters - -enum - { - genpafAlign1 = 'A', - genpafStart1 = 'B', - genpafCoverage = 'C', - genpafStrand1 = 'D', - genpafEnd1 = 'E', - genpafQualsAlign1 = 'F', - genpafGapRate = 'G', // deprecated; users should use continuity - genpafIdentity = 'I', - genpafTargetNucs = 'J', - genpafTargetQuals = 'K', - genpafLength1 = 'L', - genpafName1 = 'N', - genpafNumber1 = 'O', - genpafStart1Blast = 'Q', // (we don't expect the user to set this directly) - genpafEnd1Blast = 'R', // (we don't expect the user to set this directly) - genpafSize1 = 'S', - genpafText1 = 'T', - genpafAlignmentColumns = 'W', - genpafNA = 'X', - genpafStart1Zero = 'Z', - genpafAlign2 = 'a', - genpafStart2 = 'b', - genpafContinuity = 'c', - genpafStrand2 = 'd', - genpafEnd2 = 'e', - genpafQualsAlign2 = 'f', - genpafIdentityPct = 'i', - genpafIdentityFrac = 'k', - genpafLength2 = 'l', - genpafBlastIdentityPct = 'm', - genpafName2 = 'n', - genpafNumber2 = 'o', - genpafQueryNucs = 'p', - genpafQueryQuals = 'q', - genpafSize2 = 's', - genpafText2 = 't', - genpafMatch = 'u', - genpafMismatch = 'v', - genpafAlignedPairs = 'w', - genpafGapColumns = 'x', - genpafSeparateGaps = 'y', - 
genpafStart2Zero = 'z', - genpafStart1DotPlot = '0', - genpafEnd1DotPlot = '1', - genpafStart2DotPlot = '2', - genpafEnd2DotPlot = '3', - genpafCoveragePct = '6', - genpafCoverageFrac = '7', - genpafContinuityPct = '8', - genpafContinuityFrac = '9', - genpafCigarLower = '@', - genpafCR = '!', - genpafScore = '#', - genpafBlastBitScore = '$', // (we don't expect the user to set this directly) - genpafBlastEValue = '%', // (we don't expect the user to set this directly) - genpafCigar = '&', - genpafChoreId = '*', - genpafEnd2OnPlus = ',', - genpafDiagonal = '/', - genpafInfoSeparator = ';', - genpafStart2OnPlus = '<', - genpafTextDiff = '=', - genpafStart2ZeroOnPlus = '>', - genpafAlignmentNum = '[', - genpafShingle = '\\', - genpafAlignmentNumZero = ']', - genpafCigarX = '_', - genpafCigarXLower = '^', - genpafHspId = '(', - genpafPositionHash = '4', - genpafAlignmentHash = '5', - genpafMarker = '~' - }; - - -#define genpafStandardKeys "#NDSZEndszeIC" -#define genpafMappingKeys "NZEnd>,IC^" -#define genpafSegmentKeys "NBEnbed#" -#define genpafBlastKeys "nNmWvybeQR%$" -#define genpafRDotplotKeys "02!13!XX" -#define genpafRDotplotScoreKeys "02#!13#!XXX" - - -#define genpafTDName "diff" -#define genpafTDInfoDefault ".:x--X" // (indexed by these next definitons) -#define genpafTDInfoMatch 0 -#define genpafTDInfoTransition 1 -#define genpafTDInfoTransversion 2 -#define genpafTDInfoInsert1 3 -#define genpafTDInfoInsert2 4 -#define genpafTDInfoOther 5 -#define genpafTDInfoSize 6 - -#define genpafTNucsName "nucs1" -#define genpafTQualsName "quals1" -#define genpafQNucsName "nucs2" -#define genpafQQualsName "quals2" - -typedef struct stringtokey - { - char* name; - char key; - } stringtokey; - -#ifdef genpaf_owner -global stringtokey genpafName[] = - { - { "name1", genpafName1 }, - { "number1", genpafNumber1 }, - { "strand1", genpafStrand1 }, - { "size1", genpafSize1 }, - { "start1", genpafStart1 }, - { "zstart1", genpafStart1Zero }, - { "end1", genpafEnd1 }, - { "length1", 
genpafLength1 }, - { "align1", genpafAlign1 }, - { "text1", genpafText1 }, - { "qalign1", genpafQualsAlign1 }, - { "name2", genpafName2 }, - { "number2", genpafNumber2 }, - { "strand2", genpafStrand2 }, - { "size2", genpafSize2 }, - { "start2", genpafStart2 }, - { "zstart2", genpafStart2Zero }, - { "start2+", genpafStart2OnPlus }, - { "zstart2+", genpafStart2ZeroOnPlus }, - { "end2", genpafEnd2 }, - { "end2+", genpafEnd2OnPlus }, - { "length2", genpafLength2 }, - { "align2", genpafAlign2 }, - { "text2", genpafText2 }, - { "qalign2", genpafQualsAlign2 }, - { "nmatch", genpafMatch }, - { "nmismatch", genpafMismatch }, - { "npair", genpafAlignedPairs }, - { "ncolumn", genpafAlignmentColumns }, - { "ngap", genpafSeparateGaps }, - { "cgap", genpafGapColumns }, - { genpafTDName, genpafTextDiff }, - { "cigar", genpafCigar }, - { "cigar-", genpafCigarLower }, - { "cigarx", genpafCigarX }, - { "cigarx-", genpafCigarXLower }, - { "diagonal", genpafDiagonal }, - { "shingle", genpafShingle }, - { "score", genpafScore }, - { "identity", genpafIdentity }, - { "idfrac", genpafIdentityFrac }, - { "id%", genpafIdentityPct }, - { "blastid%", genpafBlastIdentityPct }, - { "coverage", genpafCoverage }, - { "covfrac", genpafCoverageFrac }, - { "cov%", genpafCoveragePct }, - { "continuity", genpafContinuity }, - { "confrac", genpafContinuityFrac }, - { "con%", genpafContinuityPct }, - { "gaprate", genpafGapRate }, - { genpafTNucsName, genpafTargetNucs }, - { genpafTQualsName, genpafTargetQuals }, - { genpafQNucsName, genpafQueryNucs }, - { genpafQQualsName, genpafQueryQuals }, - { "number", genpafAlignmentNum }, - { "znumber", genpafAlignmentNumZero }, - { "chore", genpafChoreId }, - { "hspid", genpafHspId }, // (unadvertised) - { "phash", genpafPositionHash }, // (unadvertised) - { "ahash", genpafAlignmentHash }, // (unadvertised) - { "NA", genpafNA }, - { "~", genpafMarker }, - { NULL, 0 } // (list terminator) - }; - -global stringtokey genpafAliases[] = - { - { "n1", genpafName1 }, - 
{ "s1", genpafStart1 }, - { "z1", genpafStart1Zero }, - { "e1", genpafEnd1 }, - { "l1", genpafLength1 }, - { "a1", genpafAlign1 }, - { "t1", genpafText1 }, - { "n2", genpafName2 }, - { "s2", genpafStart2 }, - { "z2", genpafStart2Zero }, - { "s2+", genpafStart2OnPlus }, - { "z2+", genpafStart2ZeroOnPlus }, - { "e2", genpafEnd2 }, - { "e2+", genpafEnd2OnPlus }, - { "l2", genpafLength2 }, - { "a2", genpafAlign2 }, - { "t2", genpafText2 }, - { "d", genpafDiagonal }, - { "diag", genpafDiagonal }, - { "s", genpafScore }, - { "id", genpafIdentity }, - { "id%", genpafIdentityPct }, - { "ident", genpafIdentity }, - { "cov", genpafCoverage }, - { "cov%", genpafCoveragePct }, - { "con", genpafContinuity }, - { "con%", genpafContinuityPct }, - { "gap", genpafGapRate }, - { NULL, 0 } // (list terminator) - }; -#else -global stringtokey genpafName[]; -global stringtokey genpafAliases[]; -#endif - -//---------- -// -// prototypes for routines in genpaf.c -// -//---------- - -void print_genpaf_job_header (FILE* f, char* keys); -void print_genpaf_job_footer (FILE* f); -void print_genpaf_header (FILE* f, seq* seq1, seq* seq2); -void print_blast_job_header (FILE* f); -void print_blast_job_footer (FILE* f); -void print_blast_header (FILE* f, - char* programName, char* args, - seq* seq1, seq* seq2); -void print_genpaf_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - char* keys); -void print_genpaf_align_list_segments - (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - char* keys, scoreset* scoring); -void print_genpaf_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s, u64 hspId, - char* keys, - unspos idNumer, unspos idDenom, - unspos covNumer, unspos covDenom, - unspos conNumer, unspos conDenom, - unspos gapNumer, unspos gapDenom); -void print_genpaf_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, u64 hspId, - char* keys); -char* parse_genpaf_keys 
(char* s); - -#undef global -#endif // genpaf_H diff --git a/programs/lastz/src/gfa.c b/programs/lastz/src/gfa.c deleted file mode 100755 index e29b316..0000000 --- a/programs/lastz/src/gfa.c +++ /dev/null @@ -1,597 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: gfa.c -// -//---------- -// -// gfa-- -// Support for printing alignments in GFA format. -// -// GFA format is for gap-free alignments, with one per line, like the one -// below. This implemention does not provide enough information that it can be -// converted to a LAV file (full GFA is intended to do so). This line -// corresponds to an "a" stanza for an alignment starting at 10825 of sequence -// one's + strand and at 8530 on sequence two's - strand (positions are origin -// one), of length 74, score 4137. GFA's optional percent-identity field is -// not reported. -// -// a 10825+/8530- 74 4137 -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff - -#define gfa_owner // (make this the owner of its globals) -#include "gfa.h" // interface to this module - -//---------- -// -// prototypes for private functions -// -//---------- - -static int is_comment_string (const char* s); - -//---------- -// -// print_gfa_job_header-- -// Print gfa format job header. 
-// -//---------- - -void print_gfa_job_header - (FILE* f, - char* _programName, - char* _name1, - char* _name2) - { - char* programName = _programName; - char* name1 = _name1; - char* name2 = _name2; - - if (programName == NULL) programName = "(no name)"; - if (name1 == NULL) name1 = "(no name)"; - if (name2 == NULL) name2 = "(no name)"; - - fprintf (f, "d"); - fprintf (f, " %s %s %s", programName, name1, name2); - fprintf (f, "\n"); - } - -//---------- -// -// print_gfa_job_footer-- -// Print gfa format job footer. -// -//---------- - -void print_gfa_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_gfa_header-- -// Print gfa format header. -// -//---------- - -void print_gfa_header - (FILE* f, - seq* seq1, - seq* seq2) - { - char* rcfShortSuffix[4] = { "", "~", "~-", "-" }; - char* rcfLongSuffix [4] = { "", // forward - "~", // complement - "~ (reverse complement)", // reverse - " (reverse complement)" }; // rev-comp - char* name1 = seq1->filename; - char* name2 = seq2->filename; - char* header1 = seq1->header; - char* header2 = seq2->header; - u32 contig1 = seq1->contig; - u32 contig2 = seq2->contig; - - if (name1 == NULL) name1 = "(no name)"; - if (name2 == NULL) name2 = "(no name)"; - if (header1 == NULL) header1 = "(no header)"; - if (header2 == NULL) header2 = "(no header)"; - - fprintf (f, "s "); - fprintf (f, "\"%s%s\" " unsposFmt " " unsposFmt " %d %u ", - name1, rcfShortSuffix[seq1->revCompFlags], - seq1->startLoc, seq1->startLoc+seq1->len-1, - ((seq1->revCompFlags & rcf_rev) != 0)?1:0, contig1); - fprintf (f, "\"%s%s\" " unsposFmt " " unsposFmt " %d %u", - name2, rcfShortSuffix[seq2->revCompFlags], - seq2->startLoc, seq2->startLoc+seq2->len-1, - ((seq2->revCompFlags & rcf_rev) != 0)?1:0, contig2); - fprintf (f, "\n"); - - fprintf (f, "h "); - fprintf (f, "\"%s%s\"", header1, rcfLongSuffix[seq1->revCompFlags]); - fprintf (f, " \"%s%s\"", header2, rcfLongSuffix[seq2->revCompFlags]); - fprintf (f, "\n"); - } 
- -//---------- -// -// print_gfa_align_list-- -// Print a list of gapped alignments in gfa format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// scoreset* scoring: The scoring scheme to use. This may be NULL. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// -// Returns: -// (nothing) -// -//---------- - -void print_gfa_align_list - (FILE* f, - scoreset* scoring, - alignel* alignList, - seq* seq1, - seq* seq2) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - print_gfa_align (f, scoring, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script); - } - -//---------- -// -// print_gfa_align-- -// Print a single gapped alignment in gfa format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// scoreset* scoring: The scoring scheme to use. This may be NULL. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment takes -// .. in the DP matrix. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_gfa_align - (FILE* f, - scoreset* scoring, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - unspos height, width, i, j, prevI, prevJ, run; - u32 opIx; - score s; - - if ((sp1->p != NULL) || (sp2->p != NULL)) - suicide ("gfa format can't handle multi-sequences"); - // the issue is that we'd have to check if the partition changed - // since the previous alignment, and generate an s/h-stanza pair - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - // print the overall alignment info - - s = 0; - if (scoring != NULL) - { - opIx = 0; - for (i=j=0 ; (i< height)||(j 0) - s -= scoring->gapOpen + run*scoring->gapExtend; - } - } - } - - fprintf (f, "A " unsposSlashSFmt " " unsposSlashFmt " " scoreFmtSimple "\n", - beg1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", - beg2, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", - height, width, s); - - // print the alignment's segments - - opIx = 0; - for (i=j=0 ; (i< height)||(jpartition; - seqpartition* sp2 = &seq2->partition; - int pctId; - sgnpos diag = diagNumber (pos1, pos2); - - if ((sp1->p != NULL) || (sp2->p != NULL)) - suicide ("gfa format can't handle multi-sequences"); - - // compute percent identity - - pctId = percent_identical (seq1, pos1, seq2, pos2, length); - - // print it - - fprintf (f, "a " unsposSlashSFmt " " unsposFmt " " scoreFmtSimple " %d ; diag " sgnposFmt "\n", - pos1+1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", - pos2+1, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", - length, s, pctId, diag); - } - -//---------- -// -// print_gfa_generic-- -// Print a generic record in gfa format. 
-// -// Generic records allow the caller to play fast and loose with the gfa file, -// adding records with stanza-types that most gfa readers will (hopefully) -// ignore. It can be a good way to pass additional data along to a downstream -// program. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// char stanza: The single character identifying some stanza type. -// const char* format: A format string, as per printf. -// ...: (same as for printf) -// -// Returns: -// (nothing) -// -//---------- - -void print_gfa_generic - (FILE* f, - char stanza, - const char* format, - ...) - { - va_list args; - - va_start (args, format); - vprint_gfa_generic (f, stanza, format, args); - va_end (args); - } - -void vprint_gfa_generic - (FILE* f, - char stanza, - const char* format, - va_list args) - { - fprintf (f, "%c", stanza); - if (format != NULL) - { - fprintf (f, " "); - vfprintf (f, format, args); - } - fprintf (f, "\n"); - } - -//---------- -// -// parse_gfa_s_record-- -// Parse an s-record from a GFA file. -// -// A-records look something like this. -// -// s "td/human.fa" 1 1877426 0 1 "td/mouse.fa" 1 1736124 0 1 -// -//---------- -// -// Arguments: -// char* rec: The record to parse (a zero-terminated string). -// (remaining arguments are self-explanatory; each can be NULL if the caller -// has no interest in that field) -// -// Returns: -// true if successful, false if not. 
-// -//---------- - -int parse_gfa_s_record - (char* _rec, - char** _name1, - char** _name2) - { - int scanned; - char* rec = copy_string (_rec); - char* s, *n1, *n2; - unspos start1, stop1, start2, stop2; - int rc1, contig1, rc2, contig2; - char* name1, *name2; - int len; - - // skip 's' - - s = rec; - if (*s != 's') goto abort; - s = skip_whitespace(s+1); - - // parse the first filename - - if (*s != '"') goto abort; - s++; - n1 = s; - s = skip_til (s, "\""); - if (*s != '"') goto abort; - *(s++) = 0; // terminate n1 - if (s[-2] == '-') s[-2] = 0; - s = skip_whitespace(s); - - // parse the four int fields for sequence 1 - - scanned = -1; - sscanf (s, unsposFmtScanf " " unsposFmtScanf " %d %d%n", - &start1, &stop1, &rc1, &contig1, &scanned); - - if (scanned == -1) - goto abort; - - s = skip_whitespace(s+scanned); - - // parse the second filename - - if (*s != '"') goto abort; - s++; - n2 = s; - s = skip_til (s, "\""); - if (*s != '"') goto abort; - *(s++) = 0; // terminate n2 - if (s[-2] == '-') s[-2] = 0; - s = skip_whitespace(s); - - // parse the four int fields for sequence 2 - - scanned = -1; - sscanf (s, unsposFmtScanf " " unsposFmtScanf " %d %d%n", - &start2, &stop2, &rc2, &contig2, &scanned); - - if ((scanned == -1) || (!is_blank_string (s+scanned))) - goto abort; - - // build the file names; the format is name[s..e][-] where s..e defines a - // subrange of the file and - indicates reverse-complement - - if (_name1 != NULL) - { - len = snprintf (NULL, 0, "%s[" unsposDotsFmt "]%s", n1, start1, stop1, (rc1==1)?"-":""); - name1 = malloc_or_die ("parse_s_record (name1)", len+1); - sprintf (name1, "%s[" unsposDotsFmt "]%s", n1, start1, stop1, (rc1==1)?"-":""); - *_name1 = name1; - } - - if (_name2 != NULL) - { - len = snprintf (NULL, 0, "%s[" unsposDotsFmt "]%s", n2, start2, stop2, (rc2==1)?"-":""); - name2 = malloc_or_die ("parse_s_record (name2)", len+1); - sprintf (name2, "%s[" unsposDotsFmt "]%s", n2, start2, stop2, (rc2==1)?"-":""); - *_name2 = name2; - } - 
- // success - - free (rec); - return true; - - // failure - -abort: - free (rec); - return false; - } - -//---------- -// -// parse_gfa_a_record-- -// Parse an a-record from a GFA file. -// -// A-records look something like this. The score and pctid fields are optional -// (though of course if you have pctid you have to also have score). -// -// a start1+/start2+ length score pctid ; comment -// -// - Start1 and start2 are origin-1, relative to the 5's end of their strand. -// However, the *values* returned by this routine are origin-zero. -// -// - When score is not present, it is assigned noScore. -// -// - When pctid is not present, it is assigned -1. -// -//---------- -// -// Arguments: -// char* rec: The record to parse (a zero-terminated string). -// (remaining arguments are self-explanatory; each can be NULL if the caller -// has no interest in that field) -// -// Returns: -// true if successful, false if not. -// -//---------- - -int parse_gfa_a_record - (char* rec, - unspos* _start1, // (origin-zero) - char* _strand1, - unspos* _start2, // (origin-zero) - char* _strand2, - unspos* _length, - score* _s, - int* _pctId) - { - int scanned; - unspos start1, start2, length; - char strand1, strand2; - int pctId; - score s; - - scanned = -1; - sscanf (rec, "a " unsposSlashCFmtScanf " " unsposFmtScanf " " scoreFmtScanf " %d%n", - &start1, &strand1, &start2, &strand2, - &length, &s, &pctId, &scanned); - - if ((scanned != -1) && (!is_comment_string (rec+scanned))) - return false; - - if (scanned == -1) - { - pctId = -1; - sscanf (rec, "a " unsposSlashCFmtScanf " " unsposFmtScanf " " scoreFmtScanf "%n", - &start1, &strand1, &start2, &strand2, - &length, &s, &scanned); - if ((scanned != -1) && (!is_comment_string (rec+scanned))) - return false; - } - - if (scanned == -1) - { - s = noScore; - sscanf (rec, "a " unsposSlashCFmtScanf " " unsposFmtScanf "%n", - &start1, &strand1, &start2, &strand2, - &length, &scanned); - if ((scanned != -1) && (!is_comment_string 
(rec+scanned))) - return false; - } - - if (scanned == -1) - return false; - - if ((length <= 0) || (start1 <= 0) || (start2 <= 0) - || ((strand1 != '+') && (strand1 != '-')) - || ((strand2 != '+') && (strand2 != '-'))) - return false; - - if (_start1 != NULL) *_start1 = start1-1; - if (_strand1 != NULL) *_strand1 = strand1; - if (_start2 != NULL) *_start2 = start2-1; - if (_strand2 != NULL) *_strand2 = strand2; - if (_length != NULL) *_length = length; - if (_s != NULL) *_s = s; - if (_pctId != NULL) *_pctId = pctId; - - return true; - } - -//---------- -// -// is_comment_string-- -// Determine if a string contains a comment, or only blank characters. -// -//---------- - -static int is_comment_string - (const char* s) - { - char* ss = (char*) s; - - for ( ; *ss!=0 ; ss++) - { - if (isspace(*ss)) continue; - return (*ss == ';'); - } - - return true; - } - diff --git a/programs/lastz/src/gfa.h b/programs/lastz/src/gfa.h deleted file mode 100644 index afa2567..0000000 --- a/programs/lastz/src/gfa.h +++ /dev/null @@ -1,46 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: gfa.h -// -//---------- - -#ifndef gfa_H // (prevent multiple inclusion) -#define gfa_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// prototypes for routines in gfa.c -// -//---------- - -void print_gfa_job_header (FILE* f, - char* programName, char* name1, char* name2); -void print_gfa_job_footer (FILE* f); -void print_gfa_header (FILE* f, seq* seq1, seq* seq2); -void print_gfa_align_list (FILE* f, scoreset* scoring, alignel* alignList, - seq* seq1, seq* seq2); -void print_gfa_align (FILE* f, scoreset* scoring, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script); -void print_gfa_match (FILE* f, - seq* seq1, unspos pos1, 
- seq* seq2, unspos pos2, unspos length, - score s); -void print_gfa_generic (FILE* f, char stanza, const char* format, ...); -void vprint_gfa_generic (FILE* f, char stanza, const char* format, - va_list args); -int parse_gfa_s_record (char* rec, char** name1, char** name2); -int parse_gfa_a_record (char* rec, - unspos* start1, char* strand1, - unspos* start2, char* strand2, unspos* length, - score* s, int* pctid); - -#endif // gfa_H diff --git a/programs/lastz/src/identity_dist.c b/programs/lastz/src/identity_dist.c deleted file mode 100755 index 5b0a558..0000000 --- a/programs/lastz/src/identity_dist.c +++ /dev/null @@ -1,899 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: identity_dist.c -// -//---------- -// -// identity_dist-- -// Support for collecting the percent identity distribution from alignments. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -#define identity_dist_owner // (make this the owner of its globals) -#include "identity_dist.h" // interface to this module - -// debugging defines - -//#define snoopCountSubs // if this is defined, extra code is added to - // .. count_substitutions to track its operation - -//#define snoopIdentityCalc // if this is defined, extra code is added to -// // .. 
to aid debugging of alignment_identity - -//---------- -// -// private global data -// -//---------- -static int statsActive = false; - -static unspos identityCount [numIdentityBins+1]; -static possum identityCoverage[numIdentityBins+1]; - -//---------- -// -// filter_aligns_by_identity-- -// Filter a list of alignments, removing any alignment that has percent -// identity outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// alignel* alignList: The list of alignments to operate upon. -// float minIdentity, The range of percent identity in alignments that -// maxIdentity .. we will *keep*. These are values between 0 -// .. and 1. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// Notes: -// (1) Memory for alignments that don't make the cut is deallocated here. -// (2) The returned list of alignments is in the same order as the incoming -// .. list. -// (3) Identity is counted over all aligned bases; gaps are not counted. -// $$$ consider whether gaps should be included (perhaps make this a -// $$$ .. user option so inferz can experiment with it). -//---------- - -// $$$ There's an inherent inefficiency here if the user asks us to filter -// $$$ .. alignments by several idenitity-based stats. We'd like to compute -// $$$ .. the identity stats only once, then either carry that in the alignment -// $$$ .. record, or apply all the filters in one pass through the loop. 
- -alignel* filter_aligns_by_identity - (seq* seq1, - seq* seq2, - alignel* alignList, - float minIdentity, - float maxIdentity) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - alignment_identity (seq1, seq2, a, &numer, &denom); - - if ((denom == 0) - || (numer < denom * minIdentity) - || (numer > denom * maxIdentity)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_identity a->script", a->script); - free_if_valid ("filter_aligns_by_identity a", a); - if (identity_dist_dbgShowIdentity) - { - // nota bene: positions written as 1-based - printf ("discarding " unsposSlashSFmt " identity=" unsposSlashFmt "\n", - a->beg1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", - a->beg2, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", - numer, denom); - } - continue; - } - - if (identity_dist_dbgShowIdentity) - { - // nota bene: positions written as 1-based - printf ("keeping " unsposSlashSFmt " identity=" unsposSlashFmt "\n", - a->beg1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", - a->beg2, ((seq2->revCompFlags & rcf_rev) != 0)? "-" : "+", - numer, denom); - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// alignment_identity-- -// Compute the identity fraction of an gapped alignment block. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// seq* seq2: The second sequence. -// alignel* a: The alignment of interest. -// unspos* numer, denom: Place to return the identity fraction. Note -// .. that the returned denominator might be zero. 
-// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopIdentityCalc === - -#ifndef snoopIdentityCalc -#define debugSnoopIdentityCalc_1 ; -#define debugSnoopIdentityCalc_2 ; -#endif // not snoopIdentityCalc - -#ifdef snoopIdentityCalc - -#define debugSnoopIdentityCalc_1 \ - fprintf (stderr, "alignment_identity(" \ - unsposSlashFmt ".." unsposSlashFmt " " \ - unsposFmt "x" unsposFmt ")\n", \ - a->beg1, a->beg2, a->end1, a->end2, width, height); - -#define debugSnoopIdentityCalc_2 \ - fprintf (stderr, " " unsposFmt " bp run from " \ - unsposSlashFmt ".." unsposSlashFmt "\n", \ - run, prevI, prevJ, i, j); - -#endif // snoopIdentityCalc - - -//=== stuff for alignment_identity === - -void alignment_identity - (seq* seq1, - seq* seq2, - alignel* a, - unspos* _numer, - unspos* _denom) - { - unspos beg1 = a->beg1; - unspos beg2 = a->beg2; - unspos height, width, i, j, prevI, prevJ; - u32 opIx; - unspos run; - u8 c1, c2; - unspos denom, matches; - unspos pairCount[4][4]; - - height = a->end1 - beg1 + 1; - width = a->end2 - beg2 + 1; - debugSnoopIdentityCalc_1 - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - pairCount[c1][c2] = 0; - - denom = 0; - opIx = 0; - for (i=j=0 ; (i< height)||(jscript, &opIx); - i += run; j += run; - debugSnoopIdentityCalc_2 - - denom += count_substitutions (seq1, beg1-1+prevI, - seq2, beg2-1+prevJ, - run, pairCount); - - if ((i < height) || (j < width)) - edit_script_indel_len (a->script, &opIx, &i, &j); - } - - if (denom == 0) - { *_numer = *_denom = 0; return; } - - matches = 0; - for (c1=0 ; c1<4 ; c1++) - matches += pairCount[c1][c1]; - - *_numer = matches; - *_denom = denom; - } - -//---------- -// -// filter_segments_by_identity-- -// Filter a table of segments, removing any segment that has percent identity -// outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segtable* st: The segment table to operate upon. 
-// float minIdentity, The range of percent identity in segments that -// maxIdentity .. we will *keep*. These are values between 0 -// .. and 1. -// -// Returns: -// (nothing) -// -//---------- - -void filter_segments_by_identity - (seq* seq1, - seq* seq2, - segtable* st, - float minIdentity, - float maxIdentity) - { - segment* srcSeg, *dstSeg; - unspos numer, denom; - - if (st == NULL) return; -// if (st->seg == NULL) return; test not necessary st->seg is never NULL - - for (dstSeg=srcSeg=st->seg ; ((u32)(srcSeg-st->seg))len ; srcSeg++) - { - segment_identity (seq1, srcSeg->pos1, - seq2, srcSeg->pos2, srcSeg->length, - &numer, &denom); - if ((denom == 0) - || (numer < denom * minIdentity) - || (numer > denom * maxIdentity)) - continue; // (unwanted segment, skip it) - if (srcSeg != dstSeg) *dstSeg = *srcSeg; - dstSeg++; - } - - st->len = dstSeg - st->seg; - } - -//---------- -// -// filter_segment_by_identity-- -// Filter a segment, reporting whether that segment has percent identity -// outside of a specified range. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// float minIdentity, The range of percent identity in segments that -// maxIdentity .. we will *keep*. These are values between 0 -// .. and 1. -// -// Returns: -// true if the segment if outside the specified range (i.e. 
that it fails to -// pass the filter, and should be discarded) -// -//---------- - -int filter_segment_by_identity - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - float minIdentity, - float maxIdentity) - { - unspos numer, denom; - - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - if ((denom == 0) - || (numer < denom * minIdentity) - || (numer > denom * maxIdentity)) - return true; // (unwanted segment, skip it) - - return false; - } - -//---------- -// -// segment_identity-- -// Compute the identity fraction of an ungapped alignment segment. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// unspos* numer, denom: Place to return the identity fraction. Note that -// .. the returned denominator might be zero. -// -// Returns: -// (nothing) -// -//---------- - -void segment_identity - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - unspos* _numer, - unspos* _denom) - { - u8 c1, c2; - unspos denom, matches; - unspos pairCount[4][4]; - - // count substitutions and see if we pass our percent identity filter - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - pairCount[c1][c2] = 0; - - denom = count_substitutions (seq1, pos1, seq2, pos2, length, pairCount); - if (denom == 0) - { *_numer = *_denom = 0; return; } - - matches = 0; - for (c1=0 ; c1<4 ; c1++) - matches += pairCount[c1][c1]; - - *_numer = matches; - *_denom = denom; - } - -//---------- -// -// count_substitutions-- -// Count the number of each type of base substitution in two subsequences. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. 
-// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. Note that this may -// .. be zero. -// unspos count[4][4]: Array in which to accumulate the substitutions. -// .. This is indexed by [c1][c2], where c1 and c2 are -// .. fromsequence 1 and 2, respectively, and code for -// .. nucleotides as per bits_to_nuc[]. Note that -// .. wew *accumulate* into this array; we don't clear -// .. it first. -// -// Returns: -// The number of new substitutions or matches we've counted. -// -//---------- -// -// Note: Masked (lowercase) bp are counted the same as unmasked (uppercase), -// but illegal values like 'N' or '-' are completely ignored. -// -//---------- - -//=== stuff for snoopCountSubs === - -#ifndef snoopCountSubs -#define debugSnoopCountSubs_1 ; -#endif // not snoopCountSubs - -#ifdef snoopCountSubs - -#define debugSnoopCountSubs_1 \ - { \ - fprintf (stderr, "count_substitutions(" unsposSlashFmt " #" unsposFmt "): ", \ - pos1, pos2, length); \ - for (ix=0 ; ixv + pos1; - u8* s2 = seq2->v + pos2; - s8 c1, c2; - unspos ix; - unspos denom = 0; - - if (length == 0) - return 0; - - debugSnoopCountSubs_1 - - for (ix=0 ; ix= 0) && (c2 >= 0)) - { count[(u8)c1][(u8)c2]++; denom++; } - } - - return denom; - } - -//---------- -// -// filter_aligns_by_match_count-- -// Filter a list of alignments, removing any alignment that has fewer matched -// bases than a specified minimum. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// alignel* alignList: The list of alignments to operate upon. -// u32 minMatchCount: The minimum number of matched bases in -// .. alignments that we will *keep*. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// Notes: -// (1) Memory for alignments that don't make the cut is deallocated here. -// (2) The returned list of alignments is in the same order as the incoming -// .. list. 
-// (3) Match-count is counted over all aligned bases, and counts only matches -// .. (not substitutions or gaps). -//---------- - -alignel* filter_aligns_by_match_count - (seq* seq1, - seq* seq2, - alignel* alignList, - u32 minMatchCount) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - alignment_identity (seq1, seq2, a, &numer, &denom); - - if ((denom == 0) || (numer < minMatchCount)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_match_count a->script", a->script); - free_if_valid ("filter_aligns_by_match_count a", a); - continue; - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// filter_segments_by_match_count-- -// Filter a table of segments, removing any segment that has fewer matched -// bases than a specified minimum. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segtable* st: The segment table to operate upon. -// u32 minMatchCount: The minimum number of matched bases in segments -// .. that we will *keep*. 
-// -// Returns: -// (nothing) -// -//---------- - -void filter_segments_by_match_count - (seq* seq1, - seq* seq2, - segtable* st, - u32 minMatchCount) - { - segment* srcSeg, *dstSeg; - unspos numer, denom; - - if (st == NULL) return; -// if (st->seg == NULL) return; test not necessary st->seg is never NULL - - for (dstSeg=srcSeg=st->seg ; ((u32)(srcSeg-st->seg))len ; srcSeg++) - { - segment_identity (seq1, srcSeg->pos1, - seq2, srcSeg->pos2, srcSeg->length, - &numer, &denom); - if ((denom == 0) || (numer < minMatchCount)) - continue; // (unwanted segment, skip it) - if (srcSeg != dstSeg) *dstSeg = *srcSeg; - dstSeg++; - } - - st->len = dstSeg - st->seg; - } - -//---------- -// -// filter_segment_by_match_count-- -// Filter a segment, reporting whether that segment has fewer matched bases -// than a specified minimum. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// u32 minMatchCount: The minimum number of matched bases in segments -// .. that we will *keep*. -// -// Returns: -// true if the segment if outside the specified range (i.e. that it fails to -// pass the filter, and should be discarded) -// -//---------- - -int filter_segment_by_match_count - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - u32 minMatchCount) - { - unspos numer, denom; - - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - if ((denom == 0) || (numer < minMatchCount)) - return true; // (unwanted segment, skip it) - - return false; - } - -//---------- -// -// filter_aligns_by_mismatch_count-- -// Filter a list of alignments, removing any alignment that has more mismatched -// bases than a specified maximum. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. 
-// alignel* alignList: The list of alignments to operate upon. -// s32 maxMismatchCount: The maximum number of mismatched bases in -// .. alignments that we will *keep*. Note -// .. that this must not be negative. -// -// Returns: -// A pointer to the list of remaining alignments. -// -//---------- -// Notes: -// (1) Memory for alignments that don't make the cut is deallocated here. -// (2) The returned list of alignments is in the same order as the incoming -// .. list. -// (3) Mismatch-count is counted over all aligned bases, and counts only -// .. mismatches (not matches or gaps). -//---------- - -alignel* filter_aligns_by_mismatch_count - (seq* seq1, - seq* seq2, - alignel* alignList, - s32 maxMismatchCount) - { - alignel* a, *next; - alignel* head, *prev; - unspos numer, denom; - - head = prev = NULL; - for (a=alignList ; a!=NULL ; a=next) - { - next = a->next; - - alignment_identity (seq1, seq2, a, &numer, &denom); - - if ((denom == 0) || (denom-numer > (u32) maxMismatchCount)) - { // (unwanted alignment, discard it) - free_if_valid ("filter_aligns_by_mismatch_count a->script", a->script); - free_if_valid ("filter_aligns_by_mismatch_count a", a); - continue; - } - - // this alignment is ok, add it to the end of the new list we're - // building - - if (head == NULL) head = prev = a; - else { prev->next = a; prev = a; } - - a->next = NULL; - } - - return head; - } - -//---------- -// -// filter_segments_by_mismatch_count-- -// Filter a table of segments, removing any segment that has more mismatched -// bases than a specified maximum. -// -//---------- -// -// Arguments: -// seq* seq1, seq2: The sequences. -// segtable* st: The segment table to operate upon. -// s32 maxMismatchCount: The maximum number of matched bases in -// .. segments that we will *keep*. Note that -// .. this must not be negative. 
-// -// Returns: -// (nothing) -// -//---------- - -void filter_segments_by_mismatch_count - (seq* seq1, - seq* seq2, - segtable* st, - s32 maxMismatchCount) - { - segment* srcSeg, *dstSeg; - unspos numer, denom; - - if (st == NULL) return; -// if (st->seg == NULL) return; test not necessary st->seg is never NULL - - for (dstSeg=srcSeg=st->seg ; ((u32)(srcSeg-st->seg))len ; srcSeg++) - { - segment_identity (seq1, srcSeg->pos1, - seq2, srcSeg->pos2, srcSeg->length, - &numer, &denom); - if ((denom == 0) || (denom-numer > (u32) maxMismatchCount)) - continue; // (unwanted segment, skip it) - if (srcSeg != dstSeg) *dstSeg = *srcSeg; - dstSeg++; - } - - st->len = dstSeg - st->seg; - } - -//---------- -// -// filter_segment_by_mismatch_count-- -// Filter a segment, reporting whether that segment has more mismatched bases -// than a specified maximum. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// s32 maxMismatchCount: The maximum number of matched bases in -// .. segments that we will *keep*. Note that -// .. this must not be negative. -// -// Returns: -// true if the segment if outside the specified range (i.e. that it fails to -// pass the filter, and should be discarded) -// -//---------- - -int filter_segment_by_mismatch_count - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - s32 maxMismatchCount) - { - unspos numer, denom; - - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - if ((denom == 0) || (denom-numer > (u32) maxMismatchCount)) - return true; // (unwanted segment, skip it) - - return false; - } - -//---------- -// -// init_identity_dist_job-- -// Initialize percent identity distribution. 
-// -//---------- - -void init_identity_dist_job - (arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - u32 bin; - - if (statsActive) - suicide ("attempt to open a second identity distribution job"); - statsActive = true; - - for (bin=0 ; bin<=numIdentityBins ; bin++) - { - identityCount [bin] = 0; - identityCoverage[bin] = 0; - } - } - -//---------- -// -// print_identity_dist_job-- -// Print the percent identity distribution. -// -//---------- - -void print_identity_dist_job - (FILE* f) - { - static const u32 noBin = (u32) -1; - u32 bin, minBin, maxBin; - float binCenter; - - if (!statsActive) - suicide ("attempt to close a non-existent identity distribution job"); - - minBin = maxBin = noBin; - for (bin=0 ; bin<=numIdentityBins ; bin++) - { - if (identityCount[bin] == 0) continue; - maxBin = bin; - if (minBin == noBin) minBin = bin; - } - if (minBin == noBin) minBin = maxBin = numIdentityBins; - - if (minBin > 0) minBin--; // inferz likes to have an empty - if (maxBin < numIdentityBins) maxBin++; // .. bin before and after the - // .. table - - for (bin=minBin ; bin<=maxBin ; bin++) - { - binCenter = bin / ((float) numIdentityBins); - fprintf (f, identityBinFormat "\t" unsposFmt "\t" possumFmt "\n", - binCenter, identityCount[bin], identityCoverage[bin]); - } - - statsActive = false; - } - -//---------- -// -// identity_dist_from_align_list-- -// Collect percent identity distribution from a list of gapped alignments. -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// -// Returns: -// (nothing) -// -//---------- - -void identity_dist_from_align_list - (alignel* alignList, - seq* seq1, - seq* seq2) - { - alignel* a; - unspos numer, denom; - u32 bin; - - for (a=alignList ; a!=NULL ; a=a->next) - { - alignment_identity (seq1, seq2, a, &numer, &denom); - bin = identity_bin (numer, denom); - - identityCount [bin]++; - identityCoverage[bin] += denom; - } - } - -//---------- -// -// identity_dist_from_match-- -// Collect percent identity distribution from a single ungapped alignment. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. -// unspos pos2: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the match. -// -// Returns: -// (nothing) -// -//---------- - -void identity_dist_from_match - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - unspos numer, denom; - u32 bin; - - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - bin = identity_bin (numer, denom); - - identityCount [bin]++; - identityCoverage[bin] += denom; - } - diff --git a/programs/lastz/src/identity_dist.h b/programs/lastz/src/identity_dist.h deleted file mode 100644 index 5537df2..0000000 --- a/programs/lastz/src/identity_dist.h +++ /dev/null @@ -1,106 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: identity_dist.h -// -//---------- - -#ifndef identity_dist_H // (prevent multiple inclusion) -#define identity_dist_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "sequences.h" // sequence stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef identity_dist_owner -#define 
global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef identity_dist_owner -int identity_dist_dbgShowIdentity = false; -#else -global int identity_dist_dbgShowIdentity; -#endif - -//---------- -// -// data structures and types -// -//---------- - -#define numIdentityBins 1000 -#define identityBinFormat "%.3f" -#define identityBinLongFormat "%.4f" - -// We map identity fraction f to bin b = floor (numBins*f + .5). So the bins -// run from -// -// b - 1/2 b + 1/2 -// ------- <= f < ------- -// numBins numBins -// -// -// identity_bin(n,d) = numBins * (n/d), rounded to integer - -#define identity_bin(numer,denom) \ - ((u32)((2*((u64)numer)*(numIdentityBins)+((u64)denom))/(2*(denom)))) -#define bin_to_identity(bin) ( ((float) bin) / numIdentityBins) -#define bin_bottom_to_identity(bin) ((((float) bin) - 0.5) / numIdentityBins) -#define bin_top_to_identity(bin) ((((float) bin) + 0.5) / numIdentityBins) - -//---------- -// -// prototypes for routines in identity_dist.c -// -//---------- - -alignel* filter_aligns_by_identity (seq* seq1, seq* seq2, alignel* alignList, - float minIdentity, float maxIdentity); -void alignment_identity (seq* seq1, seq* seq2, alignel* a, - unspos* numer, unspos* denom); -void filter_segments_by_identity (seq* seq1, seq* seq2, segtable* st, - float minIdentity, float maxIdentity); -int filter_segment_by_identity (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - float minIdentity, float maxIdentity); -void segment_identity (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - unspos* numer, unspos *denom); -unspos count_substitutions (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - unspos count[4][4]); - -alignel* filter_aligns_by_match_count (seq* seq1, seq* seq2, alignel* alignList, - u32 minMatchCount); -void filter_segments_by_match_count(seq* seq1, seq* seq2, segtable* st, - u32 minMatchCount); -int filter_segment_by_match_count (seq* seq1, unspos 
pos1, - seq* seq2, unspos pos2, unspos length, - u32 minMatchCount); - -alignel* filter_aligns_by_mismatch_count (seq* seq1, seq* seq2, alignel* alignList, - s32 maxMismatchCount); -void filter_segments_by_mismatch_count(seq* seq1, seq* seq2, segtable* st, - s32 maxMismatchCount); -int filter_segment_by_mismatch_count(seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - s32 maxMismatchCount); - -void init_identity_dist_job (seq* seq1, seq* seq2); -void print_identity_dist_job (FILE* f); -void identity_dist_from_align_list (alignel* alignList, - seq* seq1, seq* seq2); -void identity_dist_from_match (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); - -#undef global -#endif // identity_dist_H diff --git a/programs/lastz/src/infer_scores.c b/programs/lastz/src/infer_scores.c deleted file mode 100755 index caab787..0000000 --- a/programs/lastz/src/infer_scores.c +++ /dev/null @@ -1,2604 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: infer_scores.c -// -//---------- -// -// infer_scores-- -// Support for collecting alignment scoring inference stats from allignments. -// "Alignment scoring stats" are background nucleotide counts, substitituion -// counts, and gap length distribution. -// -// References: -// -// [1] "Scoring Pairwise Genomic Sequence Alignments" F Chiaromonte, VB -// Yap, W Miller. Pacific Symposium on Biocomputing (2002), vol. 7, pp. -// 115-126 -// -// [2] "Biological sequence analysis" Durbin, Eddy, Krogh and Mitchison. -// Cambridge University Press, 1998. Pages 29-31. -// -// [3] "Improved Pairwise Alignment of Genomic DNA". RS Harris, PhD Thesis, -// Pennsylvania State University, 2007. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C math stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "identity_dist.h" // identity distribution "format" stuff -#include "output.h" // alignment outout format stuff -#include "lastz.h" // lastz program-wide stuff - -#define infer_scores_owner // (make this the owner of its globals) -#include "infer_scores.h" // interface to this module - -//---------- -// -// private global data -// -//---------- - -#define maxSubIterations 30 // max number of iterations for inferring - // .. substitution scores -#define maxGapIterations 30 // max number of iterations for inferring gap - // .. scores - -#if (scoreType == 'I') -#define subCloseEnough 0 -#define gapCloseEnough 0 -#else -#define subCloseEnough .000001 -#define gapCloseEnough .0001 -#endif - -// distribution-- -// A set of (length,count) pairs. -// -// All memory is self-contained, so a direct call to free() can be used for -// disposal. -// -// $$$ Eventually we should improve this by using something like a balanced -// binary tree or a hash-table. For now we just use the simple but slow -// method. -// -// $$$ Moreover, we are collecting more stats than we need. Currently we infer -// scores only from refGaps, secGaps, and segments. And the two gap -// distributions can be combined. -// -// $$$ Since we only need averages from the distributions, we could just store -// count and sum. However, we will eventually try to make use of the -// actual shape of the distribution. 
In particular, we need to make a -// correction for the inherent lack of short gaps. - -typedef struct dpair - { - unspos length; - u64 count; - } dpair; - -typedef struct distn - { - u32 size; // the number of entries allocated for items[] - u32 len; // the number of items used - dpair* items; // the (length,count) pairs - } distn; - -// inference stats - -typedef struct infstats - { - u64 count; - possum coverage; - - unspos refBases; - unspos secBases; - unspos refBkgd[4]; - unspos secBkgd[4]; - unspos subs[4][4]; - - distn* refBlocks; // alignment block length distributions - distn* secBlocks; - distn* refGaps; // gap length distributions - distn* secGaps; - distn* refRuns; // non-gap length distributions - distn* secRuns; - distn* segments; // ungapped pair length distribution - } infstats; - -// short score arrays (for tracking score convergence) - -typedef struct score2 { score s1,s2; } score2; -typedef struct score6 { score s1,s2,s3,s4,s5,s6; } score6; - -// private globals shared by all the routines herein - -control* params; -seq* target; -u8* targetRev; -postable* targPositions; -seq* query; -tback* traceback; -float minIdentity; -float maxIdentity; -scoreset* inferredScoring; - -static int statsActive = false; -static infstats infStats; - -static infstats infStatsByPctId[numIdentityBins+1]; - -//---------- -// -// prototypes for private functions -// -//---------- - -static double align_for_sub_scores (score scaleTo); -static double align_for_gap_scores (score scaleTo); -static void align_for_stats (hitprocessor hitProc, - void* hitProcInfo); - -static double infer_substitution_scores (double pOpen, score scaleTo); -static double infer_gap_scores (score scaleTo); - -static void copy_scores (scoreset* dst, scoreset* src); -static void repair_scores (scoreset* scoring, scoreset* masked); -static void write_scores (char* fileId, scoreset* ss, - int withGapScores, int withExtras, - int asInts); - -static void init_stats_for_inference (seq* seq1, seq* seq2); 
-static void erase_stats_for_inference (void); -static void free_stats_for_inference (void); -static void filter_stats_by_percentile (void); -static void combine_binned_stats (int mergeSequences); - -static void init_stats (infstats* inf); -static void erase_stats (infstats* inf); -static void free_stats (infstats* inf); -static void accumulate_stats_from_align (seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, - infstats* inf); -static void accumulate_stats_from_match (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, - unspos length, - infstats* inf); - -static void print_bkgd_stats (FILE* f, char* s, unspos bkgd[4]); -static void print_subs_stats (FILE* f, unspos subs[4][4]); -static void print_blocks_stats (FILE* f, char* s, distn* blocks); -static void print_gaps_stats (FILE* f, char* s, distn* gaps); -static void print_runs_stats (FILE* f, char* s, distn* runs); -static void print_segments_stats (FILE* f, distn* segments); - -static distn* init_length_distribution (u32 numEntries); -static void erase_length_distribution (distn* d); -static void free_length_distribution (distn* d); -static void add_lengths_to_distribution (distn* src, distn** dst); -static void add_length_to_distribution (unspos length, distn** d); -static u64 number_of_instances (distn* d); -static double average_length (distn* d); -static void print_length_distribution (FILE* f, char* prefix, distn* d); -static int qCompareByLength (const void* pairA, const void* pairB); - -//---------- -// -// drive_scoring_inference-- -// Infer log odds alignment scores from the sequences. -// -// Scores for substitution and gaps are inferred by -// (a) assuming some scoring set -// (b) performing alignment -// (c) filtering out alignment blocks that are too good or too bad -// (d) inferring scores from the statistics of the resulting blocks -// -// Here we iterate that process until the scores converge. 
In phase I we -// restrict ourselves to gap-free alignments and infer only substitution scores. -// Then in phase II we allow gapped alignments and infer gap scores (without -// changing substitution scores). The reason for performing substitution score -// convergence in the absense of gaps is that gap-free alignment is much faster -// than gapped alignment. However, there is no real justification for holding -// substitution scores constant during phase II; this just matches the way the -// author of [3] had done some of his earlier experiments. -// -// $$$ need to add a lot more here about convergence and the potential for it -// .. to get into an orbit. -// -// $$$ describe percent-id filtering, percentiles, etc. -// -// $$$ we could allow substitution scores to change during phase II. -// -//---------- -// -// Arguments: -// control* params: Parameter set controlling the inference -// .. searches. This is actually declared as -// .. void* to avoid a cyclic dependency in -// .. the include files. -// seq* target: The sequence being searched. -// postable* targPositions: A table of positions of words in target. -// u8* targetRev: The reverse (NOT reverse complement) of the -// .. target sequence, as a zero-terminated -// .. string; this may be NULL if the caller -// .. doesn't need/want to supply it. It is -// .. only needed if we will be inferring gap -// .. scores. -// seq* query: The sequence(s) to compare to the target. -// .. Upon completion, this will be rewound -// .. back to the starting position. -// tback* traceback: Memory in which to track gapped alignment -// .. traceback. -// -// Returns: -// A pointer to the inferred scoring set. The caller is responsible for -// deallocating this. 
-// -//---------- - -static int close_enough_scores_2 (score2* u, score2* v); -static int close_enough_scores_6 (score6* u, score6* v); - - -scoreset* drive_scoring_inference - (void* _params, - seq* _target, - u8* _targetRev, - postable* _targPositions, - seq* _query, - tback* _traceback) - { - score6 subScores, pastSubScores[maxSubIterations+1]; - score2 gapScores, pastGapScores[maxGapIterations+1]; - char scoreFileId[20]; - score maxSubScore, minSubScore, scaleTo; - double oneOverMaxSubScore, minOverMaxSubScore; - double origHspThresholdRatio, origGappedThresholdRatio, - origGapOpenRatio, origGapExtendRatio; - double hspThresholdRatio, gappedThresholdRatio, - gapOpenRatio, gapExtendRatio; - int showAllScores, inOrbit; - int trial, oldTrial; - scoreset* temp; - - if (((control*) _params)->gappedThreshold.t != 'S') - suicidef ("drive_scoring_inference can't handle score threshold %s", - score_thresh_to_string (&((control*) _params)->gappedThreshold)); - - if ((((control*) _params)->minCoverage > 0) - || (((control*) _params)->maxCoverage < 1)) - suicidef ("drive_scoring_inference can't handle query coverage filtering"); - -#if (!defined infer_anything) - if (((control*) _params)->ic.gapIterations > 0) - suicide ("Gap scoring inference has not been shown to produce useful results and\n" - "is currently blocked. 
To unblock gap scoring inference, contact the author."); -#endif - - // pass globals to the rest of this module - // note: this makes this module non-threadsafe - - params = (control*) _params; - target = _target; - targetRev = _targetRev; - targPositions = _targPositions; - query = _query; - traceback = _traceback; - - if (params->ic.subIterations > maxSubIterations) - params->ic.subIterations = maxSubIterations; - - if (params->ic.gapIterations > maxGapIterations) - params->ic.gapIterations = maxGapIterations; - - if (params->ic.idIsPercentile) - { - minIdentity = params->minIdentity; params->minIdentity = 0.0; - maxIdentity = params->maxIdentity; params->maxIdentity = 1.0; - } - - origHspThresholdRatio = params->hspThreshold.s; - origGappedThresholdRatio = params->gappedThreshold.s; - origGapOpenRatio = params->scoring->gapOpen; - origGapExtendRatio = params->scoring->gapExtend; - - // determine limiting parameters, relative to the maximum substitution - // score - - scaleTo = params->ic.inferScale; - - maxSubScore = max_in_score_matrix (params->scoring); - minSubScore = min_in_score_matrix (params->scoring); - oneOverMaxSubScore = 1.0 / maxSubScore; - minOverMaxSubScore = (-minSubScore) / (double) maxSubScore; - - hspThresholdRatio = origHspThresholdRatio; - - switch (params->ic.hspThresholdIsRatio) - { - case ratioNone: hspThresholdRatio *= oneOverMaxSubScore; break; - case ratioMinSubScore: hspThresholdRatio *= minOverMaxSubScore; break; - } - - // allocate a scoring set to receive inferred scores; we will ping-pong - // between this one and the params->scoring set; and we will maintain the - // params->maskedScoring set as the masked version of the inferred set - - inferredScoring = new_dna_score_set (NULL, 0, 0, 0, 0); - - // decide whether we are going to ouput every score set we try - - showAllScores = false; - if (params->ic.inferFilename != NULL) - showAllScores = (strstr (params->ic.inferFilename, "%s") != NULL); - - // report inference settings to the 
user - - if (infer_scores_showParams) - { - fprintf (stderr, "inference parameters\n"); - if (params->ic.inferFilename == NULL) - fprintf (stderr, " inferFilename = (null)\n"); - else - fprintf (stderr, " inferFilename = %s\n", params->ic.inferFilename); - - if (params->ic.idIsPercentile) - { - fprintf (stderr, " min_identity = %.2f%% (as percentile)\n", 100.0*params->minIdentity); - fprintf (stderr, " max_identity = %.2f%% (as percentile)\n", 100.0*params->maxIdentity); - } - else - { - fprintf (stderr, " min_identity = %.2f (as identity)\n", 100.0*params->minIdentity); - fprintf (stderr, " max_identity = %.2f (as identity)\n", 100.0*params->maxIdentity); - } - - if ((params->ic.inferScale == 0) && (params->ic.writeAsInt)) - fprintf (stderr, " inference_scale = (no scaling) forced to int\n"); - else if ((params->ic.inferScale == 0) && (!params->ic.writeAsInt)) - fprintf (stderr, " inference_scale = (no scaling)\n"); - else if ((params->ic.inferScale != 0) && (params->ic.writeAsInt)) - fprintf (stderr, " inference_scale = " scoreFmtSimple " (forced to int)\n", params->ic.inferScale); - else - fprintf (stderr, " inference_scale = " scoreFmtSimple "\n", params->ic.inferScale); - - if (params->ic.hspThresholdIsRatio == ratioMaxSubScore) - fprintf (stderr, " hsp_threshold = " scoreFmtSimple "*inference_scale\n", params->hspThreshold.s); - else if (params->ic.hspThresholdIsRatio == ratioMinSubScore) - fprintf (stderr, " hsp_threshold = " scoreFmtSimple "*worst_substitution\n", params->hspThreshold.s); - else - fprintf (stderr, " hsp_threshold = " scoreFmtSimple "\n", params->hspThreshold.s); - - if (params->ic.gappedThresholdIsRatio == ratioMaxSubScore) - fprintf (stderr, " gapped_threshold = " scoreFmtSimple "*inference_scale\n", params->gappedThreshold.s); - else if (params->ic.gappedThresholdIsRatio == ratioMinSubScore) - fprintf (stderr, " gapped_threshold = " scoreFmtSimple "*worst_substitution\n", params->gappedThreshold.s); - else - fprintf (stderr, " 
gapped_threshold = " scoreFmtSimple "\n", params->gappedThreshold.s); - - fprintf (stderr, " max_sub_iterations = %d\n", params->ic.subIterations); - fprintf (stderr, " max_gap_iterations = %d\n", params->ic.gapIterations); - - if (params->ic.gapOpenIsRatio == ratioMaxSubScore) - fprintf (stderr, " gap_open_penalty = " scoreFmtSimple "*inference_scale\n", params->scoring->gapOpen); - else if (params->ic.gapOpenIsRatio == ratioMinSubScore) - fprintf (stderr, " gap_open_penalty = " scoreFmtSimple "*worst_substitution\n", params->scoring->gapOpen); - else - fprintf (stderr, " gap_open_penalty = " scoreFmtSimple "\n", params->scoring->gapOpen); - - if (params->ic.gapExtendIsRatio == ratioMaxSubScore) - fprintf (stderr, " gap_extend_penalty = " scoreFmtSimple "*inference_scale\n", params->scoring->gapExtend); - else if (params->ic.gapExtendIsRatio == ratioMinSubScore) - fprintf (stderr, " gap_extend_penalty = " scoreFmtSimple "*worst_substitution\n", params->scoring->gapExtend); - else - fprintf (stderr, " gap_extend_penalty = " scoreFmtSimple "\n", params->scoring->gapExtend); - - fprintf (stderr, " step = %d\n", params->step); - fprintf (stderr, " entropy = %s\n", (params->entropicHsp)? 
"on" : "off"); - } - - ////////// - // Phase I-- iterate substitution scores inference - ////////// - - init_stats_for_inference (params->seq1, params->seq2); - - if (infer_scores_outputLav) params->outputFormat = fmtLavInfScores; - else params->outputFormat = fmtInfScores; - - subScores.s1 = params->scoring->sub['A']['A']; - subScores.s2 = params->scoring->sub['T']['T']; - subScores.s3 = params->scoring->sub['A']['C']; - subScores.s4 = params->scoring->sub['A']['G']; - subScores.s5 = params->scoring->sub['A']['T']; - subScores.s6 = params->scoring->sub['C']['G']; - pastSubScores[0] = subScores; - inOrbit = false; - - for (trial=1 ; (!inOrbit)&&(trial<=params->ic.subIterations) ; trial++) - { - // determine limiting parameters, relative to the current maximum - // substitution score - - maxSubScore = max_in_score_matrix (params->scoring); - params->hspThreshold.s = hspThresholdRatio * maxSubScore; - params->xDrop = 10 * maxSubScore; - - // output the scores we are trying - - print_job_header (); - - if (showAllScores) - { - sprintf (scoreFileId, "s%03d", trial-1); - write_scores (scoreFileId, params->scoring, - /*withGapScores*/ false, - /*withExtras*/ true, - /*asInts*/ false); - } - - if (infer_scores_dbgShowIdentity) - printf ("===== starting iteration s%03d =====\n", trial-1); - - // perform alignment and infer scores from resulting alignments - - align_for_sub_scores (scaleTo); - rewind_sequence_file (query); - - // if resulting score matrix is close enough to one we've seen before, - // exit the main loop (by setting inOrbit = true) - - subScores.s1 = inferredScoring->sub['A']['A']; - subScores.s2 = inferredScoring->sub['C']['C']; - subScores.s3 = inferredScoring->sub['A']['C']; - subScores.s4 = inferredScoring->sub['A']['G']; - subScores.s5 = inferredScoring->sub['A']['T']; - subScores.s6 = inferredScoring->sub['C']['G']; - - if (infer_scores_snoopConverge) - { - fprintf (stderr, "=== stats iteration %d ===\n", trial); - fprintf (stderr, "alignments:%" 
PRIu64 " alignedBases:" possumFmt - " refBases:" unsposFmt " secBases:" unsposFmt "\n", - infStats.count, infStats.coverage, - infStats.refBases, infStats.secBases); - } - - if (infer_scores_watchConverge) - { - fprintf (stderr, "=== subs iteration %d ===\n", trial); - fprintf (stderr, "AA:" scoreFmtSimple " CC:" scoreFmtSimple - " AC:" scoreFmtSimple " AG:" scoreFmtSimple - " AT:" scoreFmtSimple " CG:" scoreFmtSimple "\n", - subScores.s1, subScores.s2, subScores.s3, - subScores.s4, subScores.s5, subScores.s6); - } - - for (oldTrial=trial-1 ; oldTrial>=0 ; oldTrial--) - { - if (close_enough_scores_6 (&subScores, &pastSubScores[oldTrial])) - { inOrbit = true; break; } - } - - pastSubScores[trial] = subScores; - - // ping-pong scoring sets - - temp = inferredScoring; - inferredScoring = params->scoring; - params->scoring = temp; - - repair_scores (params->scoring, params->maskedScoring); - - if (params->showStats) - infer_scores_show_stats_subs (params->statsFile, trial); - } - - ////////// - // Phase II-- iterate gap scores inference - ////////// - - // copy the final substitution scores into both scoring sets - - copy_scores (/*to*/ inferredScoring, /*from*/ params->scoring); - - // determine limiting parameters, relative to the current maximum - // substitution score - - maxSubScore = max_in_score_matrix (params->scoring); - minSubScore = min_in_score_matrix (params->scoring); - oneOverMaxSubScore = 1.0 / maxSubScore; - minOverMaxSubScore = (-minSubScore) / (double) maxSubScore; - - hspThresholdRatio = origHspThresholdRatio; - gappedThresholdRatio = origGappedThresholdRatio; - gapOpenRatio = origGapOpenRatio; - gapExtendRatio = origGapExtendRatio; - - switch (params->ic.hspThresholdIsRatio) - { - case ratioNone: hspThresholdRatio *= oneOverMaxSubScore; break; - case ratioMinSubScore: hspThresholdRatio *= minOverMaxSubScore; break; - } - - switch (params->ic.gappedThresholdIsRatio) - { - case ratioNone: gappedThresholdRatio *= oneOverMaxSubScore; break; - case 
ratioMinSubScore: gappedThresholdRatio *= minOverMaxSubScore; break; - } - - switch (params->ic.gapOpenIsRatio) - { - case ratioNone: gapOpenRatio *= oneOverMaxSubScore; break; - case ratioMinSubScore: gapOpenRatio *= minOverMaxSubScore; break; - } - - switch (params->ic.gapExtendIsRatio) - { - case ratioNone: gapExtendRatio *= oneOverMaxSubScore; break; - case ratioMinSubScore: gapExtendRatio *= minOverMaxSubScore; break; - } - - params->hspThreshold.s = hspThresholdRatio * maxSubScore; - params->gappedThreshold.s = gappedThresholdRatio * maxSubScore; - params->xDrop = 10 * maxSubScore; - params->scoring->gapOpen = gapOpenRatio * maxSubScore; - params->scoring->gapExtend = gapExtendRatio * maxSubScore; - - gapScores.s1 = params->scoring->gapOpen; - gapScores.s2 = params->scoring->gapExtend; - pastGapScores[trial] = gapScores; - inOrbit = false; - - for (trial=1 ; (!inOrbit)&&(trial<=params->ic.gapIterations) ; trial++) - { - // determine limiting parameters, relative to the current gap scores - // (this setting of yDrop = O + 300E comes from BLASTZ defaults) - - params->yDrop = params->scoring->gapOpen - + 300 * params->scoring->gapExtend; - - // output the scores we are trying - - print_job_header (); - - if (showAllScores) - { - sprintf (scoreFileId, "g%03d", trial-1); - write_scores (scoreFileId, params->scoring, - /*withGapScores*/ true, - /*withExtras*/ true, - /*asInts*/ false); - } - - if (infer_scores_dbgShowIdentity) - printf ("===== starting iteration g%03d =====\n", trial-1); - - // perform alignment and infer scores from resulting alignments - - align_for_gap_scores (scaleTo); - rewind_sequence_file (query); - - // if resulting gap score pair is close enough to one we've seen before, - // exit the main loop (by setting inOrbit = true) - - gapScores.s1 = inferredScoring->gapOpen; - gapScores.s2 = inferredScoring->gapExtend; - - if (infer_scores_watchConverge) - { - fprintf (stderr, "=== gaps iteration %d ===\n", trial); - fprintf (stderr, "O:" 
scoreFmtSimple " E:" scoreFmtSimple "\n", - gapScores.s1, gapScores.s2); - } - - for (oldTrial=trial-1 ; oldTrial>=0 ; oldTrial--) - { - if (close_enough_scores_2 (&gapScores, &pastGapScores[oldTrial])) - { inOrbit = true; break; } - } - - pastGapScores[trial] = gapScores; - - // ping-pong scoring sets - - temp = inferredScoring; - inferredScoring = params->scoring; - params->scoring = temp; - - repair_scores (params->scoring, params->maskedScoring); - - if (params->showStats) - infer_scores_show_stats_gaps (params->statsFile, trial); - } - - // ping-pong scoring sets (to compensate for the last ping-pong in the loop) - - temp = inferredScoring; - inferredScoring = params->scoring; - params->scoring = temp; - - // output the resulting scores - - write_scores ("", inferredScoring, - /*withGapScores*/ (maxGapIterations>0), - /*withExtras*/ false, - /*asInts*/ params->ic.writeAsInt); - - // cleanup - - free_stats_for_inference (); - - if (params->ic.idIsPercentile) - { - params->minIdentity = minIdentity; - params->maxIdentity = maxIdentity; - } - - return inferredScoring; - } - - -static int close_enough_scores_2 - (score2* u, - score2* v) - { - score diff; - - diff = u->s1 - v->s1; - if (diff < -gapCloseEnough) return false; - if (diff > gapCloseEnough) return false; - - diff = u->s2 - v->s2; - if (diff < -gapCloseEnough) return false; - if (diff > gapCloseEnough) return false; - - return true; - } - - -static int close_enough_scores_6 - (score6* u, - score6* v) - { - score diff; - - diff = u->s1 - v->s1; - if (diff < -subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - diff = u->s2 - v->s2; - if (diff < -subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - diff = u->s3 - v->s3; - if (diff < -subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - diff = u->s4 - v->s4; - if (diff < -subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - diff = u->s5 - v->s5; - if (diff < 
-subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - diff = u->s6 - v->s6; - if (diff < -subCloseEnough) return false; - if (diff > subCloseEnough) return false; - - return true; - } - -//---------- -// -// align_for_sub_scores-- -// Perform ungapped alignment of the target sequence to every query, then -// infer substitution scores from stats collected from those alignments. -// -//---------- -// -// Arguments: -// score scaleTo: The desired value for the maximum subsitution score. -// .. If this is zero, no scaling is performed. -// -// Returns: -// The scaling factor used to accomplish scaleTo. -// -//---------- - -static double align_for_sub_scores - (score scaleTo) - { - hitprocessor hitProc; - void* hitProcInfo; - double scaleBy; - - params->chain = false; params->gappedExtend = false; // equiv. to C=3 - set_up_hit_processor (params, false, &hitProc, &hitProcInfo); - - // align target vs all queries, collecting stats - - align_for_stats (hitProc, hitProcInfo); - - // combine stats into a single bin - - if (params->ic.idIsPercentile) - filter_stats_by_percentile (); - - combine_binned_stats (/*merge sequences*/ true); - - // infer substitution scores (results are placed into inferredScoring) - - scaleBy = infer_substitution_scores (/*pOpen*/ 0.0, scaleTo); - infer_scores_set_stat (scaleBy, scaleBy); - - return scaleBy; - } - -//---------- -// -// align_for_gap_scores-- -// Perform gapped alignment of the target sequence to every query, then infer -// infer gap scores from stats collected from those alignments. -// -//---------- -// -// Arguments: -// score scaleTo: The desired value for the maximum subsitution score. -// .. If this is zero, no scaling is performed. -// (additional input is implicit in pInf, sInf, q1Inf and q2Inf) -// -// Returns: -// The scaling factor used to accomplish scaleTo. 
-// -//---------- - -static double align_for_gap_scores - (score scaleTo) - { - hitprocessor hitProc; - void* hitProcInfo; - double scaleBy; - - params->chain = true; params->gappedExtend = true; // equiv. to C=2 - set_up_hit_processor (params, false, &hitProc, &hitProcInfo); - - // align target vs all queries, collecting stats - - align_for_stats (hitProc, hitProcInfo); - - // combine stats into a single bin - - if (params->ic.idIsPercentile) - filter_stats_by_percentile (); - - combine_binned_stats (/*merge sequences*/ true); - - // infer gap scores (results are placed into inferredScoring) - - scaleBy = infer_gap_scores (scaleTo); - infer_scores_set_stat (scaleBy, scaleBy); - - return scaleBy; - } - -//---------- -// -// align_for_stats-- -// Perform alignment (gapped or ungapped) of the target sequence to every -// query, collecting stats from those alignments. -// -//---------- -// -// Arguments: -// hitprocessor processor: Function to call for each hit to determine if -// .. it is 'good enough'. -// void* processorInfo: A value to pass thru with each call to -// .. processor. -// -// Returns: -// (nothing) -// -//---------- - -static void align_for_stats - (hitprocessor hitProc, - void* hitProcInfo) - { - int collectHspsFromBoth; // collect HSPs from both strands - // .. before gapped stage - int hspsAreAdaptive; // adaptive HSP scoring threshold - // .. is being used - int abortQuery; - - - hspsAreAdaptive = (params->hspThreshold.t != 'S'); - collectHspsFromBoth = hspsAreAdaptive; // $$$ consider other conditions - // $$$ .. 
used in mainline lastz - - // align target vs all queries, collecting stats - - erase_stats_for_inference (); - - while (load_sequence (query)) - { - if (query->len == 0) continue; - if (params->minMatchCountRatio != 0) - params->minMatchCount = (u32) ceil (query->trueLen * params->minMatchCountRatio); - if (params->whichStrand < 0) - rev_comp_sequence (query, params->scoring->qToComplement); - abortQuery = !start_one_strand (target, targPositions, query, - /* empty anchors */ true, - /* prev anchor count */ 0, - hitProc, hitProcInfo); - if (abortQuery) continue; - if (!collectHspsFromBoth) - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, NULL); - - if (params->whichStrand > 0) - { - rev_comp_sequence (query, params->scoring->qToComplement); - abortQuery = !start_one_strand (target, targPositions, query, - /* empty anchors */ !collectHspsFromBoth, - /* prev anchor count */ 0, - hitProc, hitProcInfo); - if (!abortQuery) - { - rev_comp_sequence (query, params->scoring->qToComplement); - continue; - } - if (collectHspsFromBoth) split_anchors (query->revCompFlags); - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, NULL); - if (collectHspsFromBoth) - { - swap_anchor_sets (); - // we have to reverse query for subsequent call to finish_one_strand() - rev_comp_sequence (query, params->scoring->qToComplement); - } - } - - if (collectHspsFromBoth) - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, NULL); - } - - } - -//---------- -// -// infer_substitution_scores-- -// Infer log odds substitution scoring matrix as per [1]; one addition we -// make to [1] is that we involve pOpen in each score to properly account for -// gaps in the three state pair FSA from [2]; for more information see the -// description in infer_gap_scores(). -// -//---------- -// -// Arguments: -// double pOpen: The gap-opening probability. -// score scaleTo: The desired value for the maximum subsitution score. 
-// .. All scores in the scoring matrix will be scaled by -// .. the same amount so that the maximum is this value. -// .. If this is zero, no scaling is performed. -// (additional input is implicit in infStats.subs) -// -// Returns: -// The scaling factor used to accomplish scaleTo. Additional output is -// implicit in inferredScoring, and internal state is saved in pInf, sInf, -// q1Inf and q2Inf (so that subsequent calls to infer_gap_scores can make -// use of it). -// -//---------- - -static double pInf[4][4], sInf[4][4]; -static double qInf1[4], qInf2[4]; - -static void sub_probs_to_log_scores (double pOpen); -static double log_scores_to_scoring_set (score scaleTo); - -//--- infer_substitution_scores - -static double infer_substitution_scores - (double pOpen, - score scaleTo) - { - unspos m[4][4]; - unspos n1[4], n2[4]; - unspos n; - double npairs; - int x, y, xx, yy; - double scaleBy; - - // collect column counts from the alignment stats - - for (x=0 ; x<4 ; x++) - { - n1[x] = 0; - n2[x] = 0; - for (y=0 ; y<4 ; y++) - m[x][y] = 0; - } - - for (x=0 ; x<4 ; x++) - for (y=0 ; y<4 ; y++) - { - // "observe(n,x,y)" - - n = infStats.subs[x][y]; - - xx = x; yy = y; - m[xx][yy] += n; - n1[xx] += n; - n2[yy] += n; - - xx = bits_to_complement[x]; // (for strand symmetry) - yy = bits_to_complement[y]; - m[xx][yy] += n; - n1[xx] += n; - n2[yy] += n; - - xx = y; yy = x; // (for species symmetry) - m[xx][yy] += n; - n1[xx] += n; - n2[yy] += n; - - xx = bits_to_complement[y]; // (for both strand and - yy = bits_to_complement[x]; // .. 
species symmetry) - m[xx][yy] += n; - n1[xx] += n; - n2[yy] += n; - } - -#ifdef collect_stats - for (x=0 ; x<4 ; x++) for (y=0 ; y<4 ; y++) infer_scores_set_stat (subs[x][y], infStats.subs[x][y]); - for (x=0 ; x<4 ; x++) infer_scores_set_stat (n1[x], n1[x]); - for (y=0 ; y<4 ; y++) infer_scores_set_stat (n2[y], n2[y]); - for (x=0 ; x<4 ; x++) for (y=0 ; y<4 ; y++) infer_scores_set_stat (m[x][y], m[x][y]); -#endif // collect_stats - - // validate the expected symmetry - - if ((n1[3] != n1[0]) - || (n1[2] != n1[1]) - || (n2[3] != n2[0]) - || (n2[2] != n2[1]) - || (m[3][3] != m[0][0]) - || (m[2][2] != m[1][1]) - || (m[1][0] != m[0][1]) - || (m[2][3] != m[0][1]) - || (m[3][2] != m[0][1]) - || (m[2][0] != m[0][2]) - || (m[1][3] != m[0][2]) - || (m[3][1] != m[0][2]) - || (m[3][0] != m[0][3]) - || (m[2][1] != m[1][2])) - suicidef ("internal error: non-symmetry in infer_substitution_scores\n" - " n1: %7d %7d %7d %7d\n" - " n2: %7d %7d %7d %7d\n" - " m[0]: %7d %7d %7d %7d\n" - " m[1]: %7d %7d %7d %7d\n" - " m[2]: %7d %7d %7d %7d\n" - " m[3]: %7d %7d %7d %7d\n", - n1[0], n1[1], n1[2], n1[3], - n2[0], n2[1], n2[2], n2[3], - m[0][0], m[0][1], m[0][2], m[0][3], - m[1][0], m[1][1], m[1][2], m[1][3], - m[2][0], m[2][1], m[2][2], m[2][3], - m[3][0], m[3][1], m[3][2], m[3][3]); - - // infer log odds scores from column counts - // - // nota bene: because of the symmetry folding performed above, we could - // simplify some of these counts, etc. (e.g. 
p(G) == p(T)); but - // the computational gain from doing so is inconsequential, and - // not doing so makes the correllation between the code and [1] - // more obvious - - npairs = (double) (n1[0] + n1[1] + n1[2] + n1[3]); - - for (x=0 ; x<4 ; x++) - { - if ((n1[x] == 0) || (n2[x] == 0)) - suicidef ("internal error in infer_substitution_scores:" - " n1[%c] or n2[%c] is zero", - bits_to_nuc[x], bits_to_nuc[x]); - qInf1[x] = n1[x] / npairs; - qInf2[x] = n2[x] / npairs; - for (y=0 ; y<4 ; y++) - pInf[x][y] = m[x][y] / npairs; - } - - sub_probs_to_log_scores (pOpen); - - // copy scores into inferredScoring, scaling if desired - - scaleBy = log_scores_to_scoring_set (scaleTo); - - inferredScoring->gapOpen = 0; - inferredScoring->gapExtend = 0; - - return scaleBy; - } - -//--- sub_probs_to_log_scores -// (called by both infer_substitution_scores and infer_gap_scores) - -static void sub_probs_to_log_scores (double pOpen) - { - double overLog2 = 1 / log(2.0); - int x, y; - - for (x=0 ; x<4 ; x++) - for (y=0 ; y<4 ; y++) - { - if (pInf[x][y] == 0) - suicidef ("internal error in infer_substitution_scores:" - " s[%c][%c] = -infinity", - bits_to_nuc[x], bits_to_nuc[y]); - - sInf[x][y] = log(pInf[x][y]/(qInf1[x]*qInf2[y])) * overLog2; - if (pOpen != 0) - sInf[x][y] += log(1-2*pOpen) * overLog2; - } - } - -//--- log_scores_to_scoring_set -// (called by both infer_substitution_scores and infer_gap_scores) - -static double log_scores_to_scoring_set (score scaleTo) - { - int x, y; - u8 nuc1, nuc2; - double scaleBy; - - if (scaleTo <= 0) - scaleBy = 1.0; - else - { - double maxS = sInf[0][0]; - for (x=0 ; x<4 ; x++) - for (y=0 ; y<4 ; y++) - if (sInf[x][y] > maxS) - { maxS = sInf[x][y]; } - - scaleBy = ((double) scaleTo) / maxS; - } - - for (x=0 ; x<4 ; x++) - { - nuc1 = (u8) bits_to_nuc[x]; - for (y=0 ; y<4 ; y++) - { - nuc2 = (u8) bits_to_nuc[y]; -#if (scoreType == 'I') - inferredScoring->sub[nuc1][nuc2] = round_score (scaleBy * sInf[x][y]); -#else - 
inferredScoring->sub[nuc1][nuc2] = scaleBy * sInf[x][y]; -#endif - } - } - - return scaleBy; - } - -//---------- -// -// infer_gap_scores-- -// Infer log odds scores for gap open and extend. -// -// For the underlying gap open probability, we have: -// avgSeg = 1/(2*p(stop)) -// => p(stop) = 1/(2*avgSeg) -// => p(gap open) = 1 - 1/(2*avgSeg) -// The reason for 2 in the denominator is because a gap could occur in either -// sequence, so the real p(stopping a segment) = 2*p(stop). -// -// For the underlying gap extend probability, we have: -// avgGap = 1/p(stop) -// = 1/(1-p(gap extend)) -// => 1-p(gap extend) = 1/avgGap -// => p(gap extend) = 1-(1/avgGap) -// -// For the pair FSA in [2], the log odds scores are then -// s'_xy = log p_x,y + log(1-2p_open) -// s'_open = log p_open -// s'_extend = log p_extend -// -// However there are two modifications that must be made. First, the initial -// pair coming out of a gap whould be scored as log p_x,y + log(1-p_extend), so -// using s'_xy we have underscored it by log(1-p_extend) - log(1-2p_open). We -// compensate for this by increasing the gap open score. Second, an extra gap -// extend penalty is charged as the gap is being opened, so we subtract this -// from our gap open penalty. Making these adjustments, the inferred scores -// are -// s_xy = log p_x,y + log(1-2p_open) -// s_open = log p_open + log(1-p_extend) - log(1-2p_open) - log p_extend -// s_extend = log p_extend -// -//---------- -// -// Arguments: -// score scaleTo: The desired value for the maximum subsitution score. -// .. See the description in infer_substitution_scores() -// .. for further info. Scaling is performed again as -// .. part of this routine, since a non-zero pOpen changes -// .. the substitution scores. -// (other input is implicitly infStats.refGaps and infStats.segments) -// -// Returns: -// The scaling factor used to accomplish scaleTo. Additional output is -// implicit in inferredScoring. 
-// -//---------- - -static double infer_gap_scores - (score scaleTo) - { - double overLog2 = 1 / log(2.0); - double avgGap, avgSeg; - double pOpen, pExtend; - double sOpen, sExtend; - double scaleBy; - - pOpen = pExtend = sOpen = sExtend = 0; // (placate compiler) - - if (number_of_instances (infStats.refGaps) == 0) - suicide ("internal error in infer_gap_scores: no gaps"); - - avgGap = average_length (infStats.refGaps); - avgSeg = average_length (infStats.segments); - - infer_scores_set_stat (averageGapLength, avgGap); - infer_scores_set_stat (averageSegmentLength, avgSeg); - - // infer gap extend - - if (avgGap < 0) - suicide ("internal error in infer_gap_scores: average gap doesn't exist"); - else if (avgGap == 1) - suicide ("internal error in infer_gap_scores: average gap is 1"); - else - { - pExtend = 1 - (1/avgGap); - sExtend = log(pExtend) * overLog2; - infer_scores_set_stat (pExtend, pExtend); - infer_scores_set_stat (sExtend, sExtend); - } - - // infer gap open - - if (avgSeg < 0) - suicide ("internal error in infer_gap_scores: average segment doesn't exist"); - else - pOpen = 1 / (2*avgSeg); - sOpen = (log(pOpen) - log(1-2*pOpen) + log(1-pExtend) - log(pExtend)) - * overLog2; - - if (sOpen + sExtend >= 0) - suicidef ("internal inconsistency, gap open \"reward\" in infer_gap_scores\n" - "(avgGap=%f pExtend=%f sExtend=%f avgSeg=%f pOpen=%f)\n" - "(+log(pOpen)=%f -log(1-2*pOpen)=%f +log(1-pExtend)=%f -log(pExtend)=%f)", - avgGap,pExtend,sExtend,avgSeg,pOpen, - log(pOpen)/log(2),-log(1-2*pOpen)/log(2), - log(1-pExtend)/log(2),-log(pExtend)/log(2)); - - infer_scores_set_stat (pOpen, pOpen); - infer_scores_set_stat (sOpen, sOpen); - - // recompute log odds substitution scores now that we have pOpen, and - // rescale them - - sub_probs_to_log_scores (pOpen); - scaleBy = log_scores_to_scoring_set (scaleTo); - - // scale and copy gap scores into inferredScoring - -#if (scoreType == 'I') - inferredScoring->gapOpen = round_score (scaleBy * (-sOpen)); - 
inferredScoring->gapExtend = round_score (scaleBy * (-sExtend)); -#else - inferredScoring->gapOpen = scaleBy * (-sOpen); - inferredScoring->gapExtend = scaleBy * (-sExtend); -#endif - - return scaleBy; - } - -//---------- -// -// copy_scores-- -// Copy one set of scores into the other (only uppercase substitution scores -// are copied). -// -//---------- -// -// Arguments: -// scoreset* dst: The score set to copy into. -// scoreset* src: The score set to copy from. -// -//---------- -// -// Returns: -// (nothing) -// -//---------- - -static void copy_scores - (scoreset* dst, - scoreset* src) - { - int x, y; - u8 nuc1, nuc2; - - for (x=0 ; x<4 ; x++) - { - nuc1 = (u8) bits_to_nuc[x]; - for (y=0 ; y<4 ; y++) - { - nuc2 = (u8) bits_to_nuc[y]; - dst->sub[nuc1][nuc2] = src->sub[nuc1][nuc2]; - } - } - - } - -//---------- -// -// repair_scores-- -// Fix a set of inferred scores. -// -// The functions infer_substitution_scores() and infer_gap_scores() set values -// in a scores set, but do not propagate those scores to lower case. Nor do -// they update the repeat-masked version of the score set. This routine -// corrects for those shortcomings. -// -//---------- -// -// Arguments: -// scoreset* scoring: The score set, as created by either of the -// .. infer_xxx_scores() functions. This will be -// .. modified by this routine. -// scoreset* maskedScoring: The score set to contain a repeat-masked -// .. version of the scoring matrix. -// -//---------- -// -// Returns: -// A pointer to the newly allocated score set, which the caller will have to -// dispose of eventually. The routine free() should be used for this purpose. -// -//---------- -// -// Notes: -// (1) In the resulting scoring matrix, upper and lower case characters are -// are considered identical, so entries for lower case are copied from -// upper case. 
-// -//---------- - -static void repair_scores - (scoreset* scoring, - scoreset* masked) - { - int x, y; - u8 nuc1, nuc2, nuc1low, nuc2low; - int c; - score sub, worstSub; - - worstSub = 0; - - for (x=0 ; x<4 ; x++) - { - nuc1 = (u8) bits_to_nuc[x]; - nuc1low = dna_tolower (nuc1); - for (y=0 ; y<4 ; y++) - { - nuc2 = (u8) bits_to_nuc[y]; - nuc2low = dna_tolower (nuc2); - sub = scoring->sub[nuc1][nuc2]; - scoring->sub[nuc1low][nuc2 ] = sub; - scoring->sub[nuc1 ][nuc2low] = sub; - scoring->sub[nuc1low][nuc2low] = sub; - masked ->sub[nuc1 ][nuc2 ] = sub; - if (sub < worstSub) worstSub = sub; - } - } - - for (x=0 ; x<4 ; x++) - { - nuc1 = (u8) bits_to_nuc[x]; - nuc1low = dna_tolower (nuc1); - scoring->sub[nuc1 ]['N' ] = worstSub; - scoring->sub[nuc1low]['N' ] = worstSub; - scoring->sub[nuc1 ]['n' ] = worstSub; - scoring->sub[nuc1low]['n' ] = worstSub; - scoring->sub['N' ][nuc1 ] = worstSub; - scoring->sub['N' ][nuc1low] = worstSub; - scoring->sub['n' ][nuc1 ] = worstSub; - scoring->sub['n' ][nuc1low] = worstSub; - } - - scoring->sub['N']['N'] = worstSub; - scoring->sub['N']['n'] = worstSub; - scoring->sub['n']['N'] = worstSub; - scoring->sub['n']['n'] = worstSub; - - // make sure scores for row and column zero are very very bad - - for (c=0 ; c<256 ; c++) - scoring->sub[0][c] = scoring->sub[c][0] = veryBadScore; - } - -//---------- -// -// write_scores-- -// Write the current scoring set to a file. -// -//---------- -// -// Arguments: -// char* fileId: A string to include in the file name, to -// .. identify this scoring set. -// scoreset* ss: The scoring set to write. -// int withGapScores: true => write gap scores too -// int withExtras: true => write extra values that are related to -// .. 
the scoring set, as comments -// int asInts: write scores as integers regardless of scoreType -// -// Returns: -// (nothing) -// -//---------- - -static void write_scores - (char* fileId, - scoreset* ss, - int withGapScores, - int withExtras, - int asInts) - { - char name[201]; - int replaced; - FILE* f; - - if (params->ic.inferFilename == NULL) - f = stdout; - else - { - strcpy (name, params->ic.inferFilename); - - replaced = false; - if ((fileId == NULL) || (fileId[0] == 0)) - { - if (!replaced) replaced = string_replace (name, sizeof(name), "_%s", fileId); - if (!replaced) replaced = string_replace (name, sizeof(name), ".%s", fileId); - } - if (!replaced) replaced = string_replace (name, sizeof(name), "%s", fileId); - - if ((!replaced) && (strstr (name, "%s") != NULL)) - suicidef ("unable to perform name substitution, try a shorter name than" - " %s", name); - - f = fopen_or_die (name, "wt"); - } - - // write the scores to the file - - if (asInts) - write_score_set_as_ints (f, name, ss, withGapScores); - else - write_score_set (f, name, ss, withGapScores); - - // write other useful info (as comments) - - if (withExtras) - { - fprintf (f, "\n"); - fprintf (f, "# hsp_threshold = %s\n", score_thresh_to_string (¶ms->hspThreshold)); - if (withGapScores) fprintf (f, "# gapped_threshold = %s\n", score_thresh_to_string (¶ms->gappedThreshold)); - fprintf (f, "# x_drop = " scoreFmtSimple "\n", params->xDrop); - if (withGapScores) fprintf (f, "# y_drop = " scoreFmtSimple "\n", params->yDrop); - } - - fclose_if_valid (f); - } - -//---------- -// -// init_stats_for_inference, erase_stats_for_inference, free_stats_for_inference-- -// Manage the by-percent-identity inference stats. 
-// -//---------- - -static void init_stats_for_inference - (arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - u32 bin; - - init_stats (&infStats); - for (bin=0 ; bin<=numIdentityBins ; bin++) - init_stats (&infStatsByPctId[bin]); - } - - -static void erase_stats_for_inference - (void) - { - u32 bin; - - erase_stats (&infStats); // (probably not necessary) - for (bin=0 ; bin<=numIdentityBins ; bin++) - erase_stats (&infStatsByPctId[bin]); - } - - -static void free_stats_for_inference - (void) - { - u32 bin; - - free_stats (&infStats); - - for (bin=0 ; bin<=numIdentityBins ; bin++) - free_stats (&infStatsByPctId[bin]); - } - -//---------- -// -// gather_stats_from_align_list-- -// Collect inference stats from a list of gapped alignments. -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// -// Returns: -// (nothing) -// -//---------- - -void gather_stats_from_align_list - (alignel* alignList, - seq* seq1, - seq* seq2) - { - alignel* a; - unspos numer, denom; - u32 bin; - - for (a=alignList ; a!=NULL ; a=a->next) - { - alignment_identity (seq1, seq2, a, &numer, &denom); - bin = identity_bin (numer, denom); - infStatsByPctId[bin].count++; - infStatsByPctId[bin].coverage += denom; - accumulate_stats_from_align (seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, &infStatsByPctId[bin]); - - if (infer_scores_dbgShowIdentity) - { - // nota bene: positions written as 1-based - printf (unsposSlashFmt " identity=" unsposSlashFmt - " (bin as " identityBinFormat ")\n", - a->beg1, a->beg2, numer, denom, bin_to_identity (bin)); - } - } - - } - -//---------- -// -// gather_stats_from_match-- -// Collect inference stats from a single ungapped alignment. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. 
-// unspos pos2: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the match. -// -// Returns: -// (nothing) -// -//---------- - -void gather_stats_from_match - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - unspos numer, denom; - u32 bin; - - segment_identity (seq1, pos1, seq2, pos2, length, &numer, &denom); - bin = identity_bin (numer, denom); - infStatsByPctId[bin].count++; - infStatsByPctId[bin].coverage += denom; - accumulate_stats_from_match (seq1, pos1, seq2, pos2, length, - &infStatsByPctId[bin]); - } - -//---------- -// -// filter_stats_by_percentile-- -// Discard inference stats outside the desired percentile of percent-identity. -// -//---------- -// -// Arguments: -// (none; input is implicitly infStatsByPctId[*]) -// -// Returns: -// (nothing; output is implicitly infStatsByPctId[*]) -// -//---------- - -static void filter_stats_by_percentile - (void) - { - static const u32 noBin = (u32) -1; - u32 bin, minBin; - possum cov, covTotal, covLo, covHi; - - // convert the percentiles to a range of coverage counts - - covTotal = 0; - minBin = noBin; - for (bin=0 ; bin<=numIdentityBins ; bin++) - { - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - covTotal += cov; - if (minBin == noBin) minBin = bin; - } - if (minBin == noBin) minBin = numIdentityBins; - - covLo = (covTotal * minIdentity) + 0.5; - covHi = (covTotal * maxIdentity) + 0.5; - - infer_scores_set_stat (coverageTotal, covTotal); - infer_scores_set_stat (coverageLow, covLo); - infer_scores_set_stat (coverageHigh, covHi); - - if (infer_scores_dbgShowIdentity) - { - possum covTotalTemp = covTotal; - u32 binHi, binLo; - - binHi = numIdentityBins; - for (bin=numIdentityBins+1 ; bin>0 ; ) - { - bin--; - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - covTotalTemp -= cov; - binHi = bin; - if (covTotalTemp <= covHi) break; - } - - covTotalTemp = 0; - binLo = minBin; - for 
(bin=minBin ; bin<=numIdentityBins ; bin++) - { - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - covTotalTemp += cov; - binLo = bin; - if (covTotalTemp >= covLo) break; - } - - for (bin=minBin ; bin<=numIdentityBins ; bin++) - { - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - printf ("bin: " identityBinFormat " cov=" possumFmt, - bin_to_identity (bin), cov); - if ((bin <= binLo) || (bin >= binHi)) printf (" (discarded)"); - printf ("\n"); - } - } - - // discard any bins outside the range - - for (bin=numIdentityBins+1 ; bin>0 ; ) - { - bin--; - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - erase_stats (&infStatsByPctId[bin]); - covTotal -= cov; - infer_scores_set_stat (highIdentityBin, bin); - if (covTotal <= covHi) break; - } - - covTotal = 0; - for (bin=minBin ; bin<=numIdentityBins ; bin++) - { - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - erase_stats (&infStatsByPctId[bin]); - covTotal += cov; - infer_scores_set_stat (lowIdentityBin, bin); - if (covTotal >= covLo) break; - } - - // sanity check-- are any bins left? - - covTotal = 0; - for (bin=minBin ; bin<=numIdentityBins ; bin++) - { - cov = infStatsByPctId[bin].coverage; - if (cov == 0) continue; - covTotal += cov; - } - - if (covTotal == 0) - suicidef ("internal error in filter_stats_by_percentile:" - " no alignments remain after filtering"); - } - -//---------- -// -// combine_binned_stats-- -// Combine inference stats into a single bin. -// -// Stats from all bins in infStatsByPctId are combined into infStats. -// -//---------- -// -// Arguments: -// int mergeSequences: true => merge stats for reference and secondary. 
-// (other input is implicitly infStatsByPctId[*]) -// -// Returns: -// (nothing; output is implicitly infStats) -// -//---------- - -static void combine_binned_stats - (int mergeSequences) - { - u32 bin; - infstats* inf; - u8 c1, c2; - - erase_stats (&infStats); - - for (bin=0 ; bin<=numIdentityBins ; bin++) - { - inf = &infStatsByPctId[bin]; - if ((inf == NULL) || (inf->count == 0)) continue; - - infStats.count += inf->count; - infStats.coverage += inf->coverage; - infStats.refBases += inf->refBases; - infStats.secBases += inf->secBases; - - for (c1=0 ; c1<4 ; c1++) - { - infStats.refBkgd[c1] += inf->refBkgd[c1]; - infStats.secBkgd[c1] += inf->secBkgd[c1]; - for (c2=0 ; c2<4 ; c2++) - infStats.subs[c1][c2] += inf->subs[c1][c2]; - } - - add_lengths_to_distribution (inf->refBlocks, &infStats.refBlocks); - add_lengths_to_distribution (inf->refGaps, &infStats.refGaps); - add_lengths_to_distribution (inf->refRuns, &infStats.refRuns); - add_lengths_to_distribution (inf->segments, &infStats.segments); - - if (mergeSequences) - { - add_lengths_to_distribution (inf->secBlocks, &infStats.refBlocks); - add_lengths_to_distribution (inf->secGaps, &infStats.refGaps); - add_lengths_to_distribution (inf->secRuns, &infStats.refRuns); - } - else - { - add_lengths_to_distribution (inf->secBlocks, &infStats.secBlocks); - add_lengths_to_distribution (inf->secGaps, &infStats.secGaps); - add_lengths_to_distribution (inf->secRuns, &infStats.secRuns); - } - } - - } - -//---------- -// -// infererence statistics sets routines-- -// -//---------- - -static void init_stats - (infstats* inf) - { - inf->refBlocks = init_length_distribution (0); - inf->secBlocks = init_length_distribution (0); - inf->refGaps = init_length_distribution (0); - inf->secGaps = init_length_distribution (0); - inf->refRuns = init_length_distribution (0); - inf->secRuns = init_length_distribution (0); - inf->segments = init_length_distribution (0); - - erase_stats (inf); - } - -static void erase_stats - (infstats* 
inf) - { - u8 c1, c2; - - inf->count = 0; - inf->coverage = 0; - inf->refBases = 0; - inf->secBases = 0; - - for (c1=0 ; c1<4 ; c1++) - { - inf->refBkgd[c1] = 0; - inf->secBkgd[c1] = 0; - for (c2=0 ; c2<4 ; c2++) - inf->subs[c1][c2] = 0; - } - - erase_length_distribution (inf->refBlocks); - erase_length_distribution (inf->secBlocks); - erase_length_distribution (inf->refGaps); - erase_length_distribution (inf->secGaps); - erase_length_distribution (inf->refRuns); - erase_length_distribution (inf->secRuns); - erase_length_distribution (inf->segments); - } - - -static void free_stats - (infstats* inf) - { - free_length_distribution (inf->refBlocks); - free_length_distribution (inf->secBlocks); - free_length_distribution (inf->refGaps); - free_length_distribution (inf->secGaps); - free_length_distribution (inf->refRuns); - free_length_distribution (inf->secRuns); - free_length_distribution (inf->segments); - } - - -static void accumulate_stats_from_align - (seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - infstats* inf) - { - unspos height, width, i, j, prevI, prevJ; - u32 opIx; - unspos run, refRun, secRun, indelLen, indelBases; - u8* s1, *s2; - u8 c1, c2; - s8 cc1, cc2; - unspos denom; - unspos count, pairCount[4][4]; - unspos ix; - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - add_length_to_distribution (height, &inf->refBlocks); - add_length_to_distribution (width, &inf->secBlocks); - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - pairCount[c1][c2] = 0; - - refRun = secRun = 0; - opIx = 0; - for (i=j=0 ; (i< height)||(j 0) - { - denom = count_substitutions (seq1, beg1-1+prevI, - seq2, beg2-1+prevJ, - run, pairCount); - if (denom != 0) - { - inf->refBases += denom; - inf->secBases += denom; - add_length_to_distribution (denom, &inf->segments); - } - } - - if ((i < height) || (j < width)) - { - prevI = i; prevJ = j; - 
edit_script_indel_len (script, &opIx, &i, &j); - if (j != prevJ) // (deletion from reference sequence) - { - indelLen = j - prevJ; - add_length_to_distribution (indelLen, &inf->refGaps); - if (refRun > 0) - { - add_length_to_distribution (refRun, &inf->refRuns); - refRun = 0; - } - indelBases = 0; - s2 = seq2->v + beg2-1+prevJ; - for (ix=0 ; ix= 0) { inf->secBkgd[(u8)cc2]++; indelBases++; } - } - secRun += indelBases; - inf->secBases += indelBases; - } - if (i != prevI) // (deletion from second sequence) - { - indelLen = i - prevI; - add_length_to_distribution (indelLen, &inf->secGaps); - if (secRun > 0) - { - add_length_to_distribution (secRun, &inf->secRuns); - secRun = 0; - } - indelBases = 0; - s1 = seq1->v + beg1-1+prevI; - for (ix=0 ; ix= 0) { inf->refBkgd[(u8)cc1]++; indelBases++; } - } - refRun += indelBases; - inf->refBases += indelBases; - } - } - } - - if (refRun > 0) add_length_to_distribution (refRun, &inf->refRuns); - if (secRun > 0) add_length_to_distribution (secRun, &inf->secRuns); - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - { - count = pairCount[c1][c2]; - inf->refBkgd[c1] += count; - inf->secBkgd[c2] += count; - inf->subs[c1][c2] += count; - } - } - - -static void accumulate_stats_from_match - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - infstats* inf) - { - u8 c1, c2; - unspos denom; - unspos count, pairCount[4][4]; - - // count substitutions - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - pairCount[c1][c2] = 0; - - denom = count_substitutions (seq1, pos1, seq2, pos2, length, pairCount); - - // collect stats - - inf->refBases += denom; - inf->secBases += denom; - add_length_to_distribution (denom, &inf->refBlocks); - add_length_to_distribution (denom, &inf->secBlocks); - add_length_to_distribution (denom, &inf->segments); - - for (c1=0 ; c1<4 ; c1++) - for (c2=0 ; c2<4 ; c2++) - { - count = pairCount[c1][c2]; - inf->refBkgd[c1] += count; - inf->secBkgd[c2] += count; - inf->subs[c1][c2] += 
count; - } - - } - -//---------- -// -// miscellaneous printing routines-- -// -//---------- - -static void print_bkgd_stats - (FILE* f, - char* s, - unspos bkgd[4]) - { - u8 c; - u8 nuc; - - fprintf (f, " %-7s", s); - - for (c=0 ; c<4 ; c++) - { - nuc = (u8) bits_to_nuc[c]; - fprintf (f, " %c:" unsposFmt, nuc, bkgd[c]); - } - fprintf (f, "\n"); - } - - -static void print_subs_stats - (FILE* f, - unspos subs[4][4]) - { - u8 c1, c2; - u8 nuc1, nuc2; - - for (c1=0 ; c1<4 ; c1++) - { - nuc1 = bits_to_nuc[c1]; - fprintf (f, " "); - for (c2=0 ; c2<4 ; c2++) - { - nuc2 = bits_to_nuc[c2]; - if (c2 != 0) fprintf (f, " "); - fprintf (f, "%c%c:" unsposFmt, nuc1, nuc2, subs[c1][c2]); - } - fprintf (f, "\n"); - } - } - - -static void print_blocks_stats - (FILE* f, - char* s, - distn* blocks) - { - fprintf (f, " blocks in %s\n", s); - print_length_distribution (f, " ", blocks); - } - - -static void print_gaps_stats - (FILE* f, - char* s, - distn* gaps) - { - fprintf (f, " gaps in %s\n", s); - print_length_distribution (f, " ", gaps); - } - - -static void print_runs_stats - (FILE* f, - char* s, - distn* runs) - { - fprintf (f, " runs in %s\n", s); - print_length_distribution (f, " ", runs); - } - - -static void print_segments_stats - (FILE* f, - distn* segments) - { - fprintf (f, " segments\n"); - print_length_distribution (f, " ", segments); - } - -//---------- -// -// length distribution routines-- -// -//---------- - -static distn* init_length_distribution - (u32 numEntries) - { - u32 bytesMain, bytesHeap; - distn* d; - - // if there are no entries desired, we won't allocate any memory until - // later (if and when anything is added to the distribution) - - if (numEntries == 0) return NULL; - - // allocate - - bytesMain = round_up_16 (sizeof(distn)); - bytesHeap = round_up_16 (numEntries * sizeof(dpair)); - d = malloc_or_die ("infer_stats distribution", bytesMain + bytesHeap); - - // hook up the internal array - - d->items = (dpair*) (((char*) d) + bytesMain); - - // 
initialize - - d->size = bytesHeap / sizeof(dpair); - d->len = 0; - - return d; - } - - -static void erase_length_distribution - (distn* d) - { - if (d == NULL) return; - d->len = 0; - } - - -static void free_length_distribution - (distn* d) - { - free_if_valid ("infer_stats distribution", d); - } - - -static void add_lengths_to_distribution - (distn* src, - distn** _dst) - { - distn* dst = *_dst; - u32 ix, iy; - unspos length; - u64 count; - u32 newEntries; - u32 bytesMain, bytesHeap; - - if (src == NULL) return; - - // if the distribution hasn't been allocated yet, this amounts to a copy - // operation - - if (dst == NULL) - { - newEntries = src->len; - bytesMain = round_up_16 (sizeof(distn)); - bytesHeap = round_up_16 (newEntries * sizeof(dpair)); - (*_dst) = dst = malloc_or_die ("add_lengths_to_distribution", - bytesMain + bytesHeap); - - dst->items = (dpair*) (((char*) dst) + bytesMain); - dst->size = bytesHeap / sizeof(dpair); - dst->len = newEntries; - memcpy (/*to*/ dst->items, /*from*/ src->items, - /*how much*/ newEntries * sizeof(dpair)); - return; - } - - // otherwise, consider lengths one at a time - - for (iy=0 ; iylen ; iy++) - { - length = src->items[iy].length; - count = src->items[iy].count; - - // locate this length; if we've seen it before, just update the count - - for (ix=0 ; ixlen ; ix++) - { if (dst->items[ix].length == length) break; } - - if (ix < dst->len) - { dst->items[ix].count += count; continue; } - - // length wasn't found; make sure there's enough room, then add an entry - - if (dst->len >= dst->size) - { - newEntries = 4*dst->size/3; - if (src->size > newEntries) newEntries = src->size; - - bytesMain = round_up_16 (sizeof(distn)); - bytesHeap = round_up_16 (newEntries * sizeof(dpair)); - (*_dst) = dst = realloc_or_die ("add_lengths_to_distribution", - dst, bytesMain + bytesHeap); - - dst->items = (dpair*) (((char*) dst) + bytesMain); - dst->size = bytesHeap / sizeof(dpair); - } - - ix = dst->len++; - dst->items[ix].length = 
length; - dst->items[ix].count = count; - } - - } - - -static void add_length_to_distribution - (unspos length, - distn** _d) - { - distn* d = *_d; - u32 ix; - u32 newEntries, len; - u32 bytesMain, bytesHeap; - - // if the distribution hasn't been allocated yet, go do so - - if (d == NULL) - { newEntries = 1000; len = 0; goto alloc_distn; } - - // locate length - - for (ix=0 ; ixlen ; ix++) - { if (d->items[ix].length == length) break; } - - if (ix < d->len) - { d->items[ix].count++; return; } - - // length wasn't found; make sure there's enough room, then add an entry - - if (d->len >= d->size) - { - newEntries = 4*d->size/3; - len = d->len; - - alloc_distn: - bytesMain = round_up_16 (sizeof(distn)); - bytesHeap = round_up_16 (newEntries * sizeof(dpair)); - (*_d) = d = realloc_or_die ("add_length_to_distribution", - d, bytesMain + bytesHeap); - - d->items = (dpair*) (((char*) d) + bytesMain); - d->size = bytesHeap / sizeof(dpair); - d->len = len; - } - - ix = d->len++; - d->items[ix].length = length; - d->items[ix].count = 1; - } - - -static u64 number_of_instances - (distn* d) - { - u32 ix; - u64 count = 0; - - if (d == NULL) return 0; - - for (ix=0 ; ixlen ; ix++) - count += d->items[ix].count; - - return count; - } - - -static double average_length - (distn* d) - { - u32 ix; - possum sum = 0; - u64 count = 0; - - if (d == NULL) return -1; // (since lengths are always strictly positive - // .. 
a negative average is impossible) - - for (ix=0 ; ixlen ; ix++) - { - count += d->items[ix].count; - sum += d->items[ix].count * d->items[ix].length; - } - - if (count == 0) return -1; - else return ((double) sum) / count; - } - - -static void print_length_distribution - (FILE* f, - char* prefix, - distn* d) - { - u32 ix; - - if ((d == NULL) || (d->len == 0)) - { fprintf (f, "%s (none)\n", prefix); return; } - - qsort (d->items, d->len, sizeof(dpair), qCompareByLength); - - for (ix=0 ; ixlen ; ix++) - fprintf (f, "%s " unsposFmt ":" u64Fmt "\n", - prefix, d->items[ix].length, d->items[ix].count); - } - - -static int qCompareByLength - (const void* _pairA, - const void* _pairB) - { - dpair* pairA = (dpair*) _pairA; - dpair* pairB = (dpair*) _pairB; - - if (pairA->length < pairB->length) return -1; - else if (pairA->length > pairB->length) return 1; - - return 0; - } - -//========== -// -// The next four routines exist soley to support LASTZ's fmtInfStats output -// format. This format allows collecting/reporting of stats upon which scoring -// inference can be performed by an external program. Rather than binning the -// stats by pctid, they collect stats in a single bin (infStats instead of -// infStatsByPctId). -// -// This interface is supported only so that the early python version of INFERZ, -// which was used for the experiments discussed in [3], is still functional. -// -// The routines are -// init_inference_stats_job -// print_inference_stats_job -// infer_stats_from_align_list -// infer_stats_from_match -// -//========== - -//---------- -// -// init_inference_stats_job-- -// Initialize inference stats. -// -//---------- - -void init_inference_stats_job - (arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - if (statsActive) - suicide ("attempt to open a second inference stats job"); - statsActive = true; - - init_stats (&infStats); - } - -//---------- -// -// print_inference_stats_job-- -// Print inference stats. 
-// -//---------- - -static void private_print_inference_stats_job (FILE* f, infstats* inf); - -void print_inference_stats_job - (FILE* f) - { - if (!statsActive) - suicide ("attempt to close a non-existent inference stats job"); - - private_print_inference_stats_job (f, &infStats); - free_stats (&infStats); - statsActive = false; - } - - -static void private_print_inference_stats_job - (FILE* f, - infstats* inf) - { - char* refSpecies = "seq1"; - char* secSpecies = "seq2"; - - if (!statsActive) - suicide ("attempt to close a non-existent inference stats job"); - - fprintf (f, "%s vs %s\n", refSpecies, secSpecies); - fprintf (f, " 0%% < GC <= 100%%\n"); - - fprintf (f, " %-7s " unsposFmt " bases, " u64Fmt " gaps, " u64Fmt " runs\n", - refSpecies, inf->refBases, - number_of_instances (inf->refGaps), - number_of_instances (inf->refRuns)); - fprintf (f, " %-7s " unsposFmt " bases, " u64Fmt " gaps, " u64Fmt " runs\n", - secSpecies, inf->secBases, - number_of_instances (inf->secGaps), - number_of_instances (inf->secRuns)); - - print_bkgd_stats (f, refSpecies, inf->refBkgd); - print_bkgd_stats (f, secSpecies, inf->secBkgd); - print_subs_stats (f, inf->subs); - print_blocks_stats (f, refSpecies, inf->refBlocks); - print_blocks_stats (f, secSpecies, inf->secBlocks); - print_gaps_stats (f, refSpecies, inf->refGaps); - print_gaps_stats (f, secSpecies, inf->secGaps); - print_runs_stats (f, refSpecies, inf->refRuns); - print_runs_stats (f, secSpecies, inf->secRuns); - print_segments_stats (f, inf->segments); - fprintf (f, "\n"); - } - -//---------- -// -// infer_stats_from_align_list-- -// Collect inference stats from a list of gapped alignments. -// -//---------- -// -// Arguments: -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// -// Returns: -// (nothing) -// -//---------- - -void infer_stats_from_align_list - (alignel* alignList, - seq* seq1, - seq* seq2) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - accumulate_stats_from_align (seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, &infStats); - } - -//---------- -// -// infer_stats_from_match-- -// Collect inference stats from a single ungapped alignment. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. -// unspos pos2: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the match. -// -// Returns: -// (nothing) -// -//---------- - -void infer_stats_from_match - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - accumulate_stats_from_match (seq1, pos1, seq2, pos2, length, &infStats); - } - -//---------- -// -// infer_scores_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void infer_scores_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&inferScoresStats, 0, sizeof(inferScoresStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - inferScoresStats.averageGapLength = 0.0; - inferScoresStats.averageSegmentLength = 0.0; - inferScoresStats.pExtend = 0.0; - inferScoresStats.sExtend = 0.0; - inferScoresStats.pOpen = 0.0; - inferScoresStats.sOpen = 0.0; - inferScoresStats.scaleBy = 0.0; - -#endif // collect_stats - } - -//---------- -// -// infer_scores_show_stats_subs, infer_scores_show_stats_gaps, -// infer_scores_show_stats-- -// Show the statistics that have been collected for this module. 
-// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void infer_scores_show_stats_subs - (arg_dont_complain(FILE* f), - arg_dont_complain(int trial)) - { -#ifdef collect_stats - int x, y; - u32 totalSubs; - - if (f == NULL) return; - - fprintf (f, "(trial s%03d)\n", trial); - infer_scores_show_stats_common (f); - - // inference counts - - totalSubs = 0; - for (x=0 ; x<4 ; x++) - for (y=0 ; y<4 ; y++) - totalSubs += inferScoresStats.subs[x][y]; - - fprintf (f, "%20s %6s ", "", ""); - for (y=0 ; y<4 ; y++) - fprintf (f, " %6c", bits_to_nuc[y]); - fprintf (f, "\n"); - - for (x=0 ; x<4 ; x++) - { - if (x == 1) fprintf (f, " inference counts: "); - else fprintf (f, "%20s", ""); - fprintf (f, " %c", bits_to_nuc[x]); - for (y=0 ; y<4 ; y++) - fprintf (f, " %6u", inferScoresStats.subs[x][y]); - fprintf (f, "\n"); - } - - fprintf (f, " (total): %s\n", commatize(totalSubs)); - - // inference observations - - fprintf (f, "%20s %6s ", "", ""); - for (y=0 ; y<4 ; y++) - fprintf (f, " %6c", bits_to_nuc[y]); - fprintf (f, "\n"); - - fprintf (f, "%20s %6s ", "", ""); - for (y=0 ; y<4 ; y++) - fprintf (f, " %6u", inferScoresStats.n2[y]); - fprintf (f, "\n"); - - fprintf (f, "%20s %6s ", "", ""); - for (y=0 ; y<4 ; y++) - fprintf (f, " ------"); - fprintf (f, "\n"); - - for (x=0 ; x<4 ; x++) - { - if (x == 0) fprintf (f, " observations: "); - else fprintf (f, "%20s", ""); - fprintf (f, "%c", bits_to_nuc[x]); - fprintf (f, " %6u |", inferScoresStats.n1[x]); - for (y=0 ; y<4 ; y++) - fprintf (f, " %6u", inferScoresStats.m[x][y]); - fprintf (f, "\n"); - } - - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void infer_scores_show_stats_gaps - (arg_dont_complain(FILE* f), - arg_dont_complain(int trial)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, "(trial g%03d)\n", trial); - infer_scores_show_stats_common (f); - - fprintf (f, "average gap length: %.13f\n", 
inferScoresStats.averageGapLength); - fprintf (f, "average seg length: %.13f\n", inferScoresStats.averageSegmentLength); - fprintf (f, " p(extend): %.13f\n", inferScoresStats.pExtend); - fprintf (f, " s(extend): %.13f\n", inferScoresStats.sExtend); - fprintf (f, " p(open): %.13f\n", inferScoresStats.pOpen); - fprintf (f, " s(open): %.13f\n", inferScoresStats.sOpen); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void infer_scores_show_stats_common - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, " total coverage: %s\n", commatize(inferScoresStats.coverageTotal)); - fprintf (f, " low coverage: %s\n", commatize(inferScoresStats.coverageLow)); - fprintf (f, " high coverage: %s\n", commatize(inferScoresStats.coverageHigh)); - fprintf (f, " low identity: " identityBinLongFormat "\n", bin_top_to_identity (inferScoresStats.lowIdentityBin)); - fprintf (f, " high identity: " identityBinLongFormat "\n", bin_bottom_to_identity(inferScoresStats.highIdentityBin)); - fprintf (f, " scale by: %.13f\n", inferScoresStats.scaleBy); -#endif // collect_stats - } - -void infer_scores_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, "(no infer_scores stats)\n"); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void infer_scores_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "total_coverage: %" PRId64 "\n", inferScoresStats.coverageTotal); - (*func) (f, "low_coverage: %" PRId64 "\n", inferScoresStats.coverageLow); - (*func) (f, "high_coverage: %" PRId64 "\n", inferScoresStats.coverageHigh); - (*func) (f, "low_identity: " identityBinLongFormat "\n", bin_top_to_identity (inferScoresStats.lowIdentityBin)); - (*func) (f, "high_identity: " identityBinLongFormat "\n", 
bin_bottom_to_identity(inferScoresStats.highIdentityBin)); -#endif // collect_stats - } - diff --git a/programs/lastz/src/infer_scores.h b/programs/lastz/src/infer_scores.h deleted file mode 100644 index c9805ab..0000000 --- a/programs/lastz/src/infer_scores.h +++ /dev/null @@ -1,119 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: infer_scores.h -// -//---------- - -#ifndef infer_scores_H // (prevent multiple inclusion) -#define infer_scores_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "sequences.h" // sequence stuff -#include "pos_table.h" // position table stuff -#include "seed_search.h" // seed hit search stuff -#include "gapped_extend.h" // gapped alignment stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef infer_scores_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef infer_scores_owner -int infer_scores_watchConverge = false; // true => report scores so we can watch convergence -int infer_scores_snoopConverge = false; // true => report stats so we can watch convergence -int infer_scores_showParams = false; // true => report the inference parameters -int infer_scores_outputLav = false; // true => output LAV for inference alignments -int infer_scores_dbgShowIdentity = false; -#else -global int infer_scores_watchConverge; -global int infer_scores_snoopConverge; -global int infer_scores_showParams; -global int infer_scores_outputLav; -global int infer_scores_dbgShowIdentity; -#endif - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int64 coverageTotal; - int64 coverageLow; - int64 coverageHigh; - int lowIdentityBin; - int highIdentityBin; - u32 subs[4][4]; - u32 n1[4]; - u32 n2[4]; - u32 m[4][4]; - double averageGapLength; - double 
averageSegmentLength; - double pExtend; - double sExtend; - double pOpen; - double sOpen; - double scaleBy; - } inferScoresStats; - -// stats macros - -#define infer_scores_count_stat(field) ++inferScoresStats.field -#define infer_scores_uncount_stat(field) --inferScoresStats.field -#define infer_scores_set_stat(field,val) (inferScoresStats.field = val) -#define infer_scores_add_stat(field,val) (inferScoresStats.field += val) -#define infer_scores_max_stat(field,val) if (val > inferScoresStats.field) inferScoresStats.field = val -#else -#define infer_scores_count_stat(field) -#define infer_scores_uncount_stat(field) -#define infer_scores_set_stat(field,val) -#define infer_scores_add_stat(field,val) -#define infer_scores_max_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void infer_scores_zero_stats (void); -void infer_scores_show_stats_common (FILE* f); -void infer_scores_show_stats_subs (FILE* f, int trial); -void infer_scores_show_stats_gaps (FILE* f, int trial); -void infer_scores_show_stats (FILE* f); -void infer_scores_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in infer_scores.c -// -//---------- - -scoreset* drive_scoring_inference (void* params, - seq* target, u8* targetRev, - postable* targPositions, - seq* query, - tback* traceback); - -void gather_stats_from_align_list (alignel* alignList, seq* seq1, seq* seq2); -void gather_stats_from_match (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); - -void init_inference_stats_job (seq* seq1, seq* seq2); -void print_inference_stats_job (FILE* f); -void infer_stats_from_align_list (alignel* alignList, seq* seq1, seq* seq2); -void infer_stats_from_match (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); - -#undef global -#endif // infer_scores_H diff --git a/programs/lastz/src/lastz.c b/programs/lastz/src/lastz.c deleted file mode 100755 index ff5e99a..0000000 --- 
a/programs/lastz/src/lastz.c +++ /dev/null @@ -1,10142 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: lastz.c -// -//---------- - -char* programName = "lastz"; -char* programVersionMajor = VERSION_MAJOR; -char* programVersionMinor = VERSION_MINOR; -char* programVersionSubMinor = VERSION_SUBMINOR; -char* programRevisionDate = REVISION_DATE; - -char* svnRevisionNumber = SUBVERSION_REV; - -//---------- -// -// lastz-- Local Alignment Search Tool, blastZ-like -// Find pairwise local alignments between a target DNA sequence and a series -// of query sequences. Query sequences can be DNA or quantum DNA. -// -// GOAL 1: Align sequences as large as 250M bases (about the size of human -// chromsome 1) on a workstation with 32-bit addressing and 2G byte of -// memory, with a seed up to weight 24 (equivalent to a 12-mer). -// -// GOAL 2: It should be relatively easy to try out different alignment -// strategies. -// -//---------- -// -// Algorithmic Overview -// -// Caveat: many details are left out or glossed over in this description. In -// some cases accuracy has been sacrificed for clarity. -// -// The algorithm consists of the following stages: -// 1 (SEED) discovery of short near-matches ("seed hits") -// 2 (HSP) extension to ungapped High-scoring Segment Pairs (HSPs) -// 3 (CHAIN) reduction of HSPs to the best-scoring "chain" -// 4 (ANCHOR) reduction of (remaining) HSPs to single positions ("anchors") -// 5 (ALIGN) extension of anchors to gapped alignments -// 6 (INTERP) "interpolation" between alignments at a higher sensitivity -// -// Most stages are optional. For example, you can leave out chaining, or you -// can stop as soon as HSPs are found. However, there are some dependencies -// between stages. SEED and HSP are performed together and are required for -// anything else, ANCHOR and ALIGN are performed together, and INTERP requires -// ALIGN. 
INTERP can be thought of as repeating the first five stages on -// whatever failed to align. -// -// The process is repeated, (mostly) independently, for every query sequence. -// The exceptions are a seed position table is built once, and dynamic masking -// has interactions between queries. (These are described below). -// -// The SEED stage makes use of a large table containing the positions (in -// the target sequence) of every possible seed bit pattern. For this discussion -// seed can be thought of as 12-mers, but we support more general seeds (see -// seeds.c). The table is indexed by the seed bit pattern, which for an 12-mer -// is a 24 bit value. For each seed value the table gives a list of all -// positions where that seed value can be found. The table requires 4*(L+4^W) -// bytes, where W is the seed length and L is the sequence length. For W=12 -// and L = 250MB, this is a little larger than 1GB. For details on how the -// table is stored, see pos_table.c. -// -// Having created the position table, the SEED stage scans a query sequence and -// looks up matches ("seed hits") for every query seed. As each seed hit is -// found, the HSP stage is performed, extending it along the diagonal in both -// directions until the score drops off. HSPs that do not meet the score -// threshold are discarded (strictly speaking, they are not HSPs). To improve -// performance, we keep track of how far we have progressed along each diagonal, -// and quickly discard any seed hits that fall into previously identified HSPs. -// Since a complete diagonal tracking array could be huge (for two 250M base -// sequences it would be 4 * 500M = 2G), we use a much smaller array and hash -// diagonals into its index space. This results in a small loss of sensitivity, -// as hash collisions between diagonals can cause seed hits on other diagonals -// to be hidden by HSPs on other diagonals. See seed_search.c and diag_hash.c. 
-// -// The CHAIN stage finds the highest scoring series of HSPs in which each HSP -// begins strictly before the start of the next. All HSPs not on this chain -// are discarded. This is useful when the query and target are known to be -// syntenic. The algorithm processes the HSPs in order along the target, build -// chains by adding the next HSP to the best previous viable chain. See -// chain.c. -// -// The ANCHOR stage reduces each HSP to a single point. A constant-width -// window is slid across the HSP and the midpoint of the highest-scoring window -// is chosen as the anchor. -// -// The ALIGN stage extends anchors into gapped alignments. One-sided extension -// is performed in two directions from the anchor point, the two resulting -// alignments are joined at the anchor, and if the score is high enough, this -// becomes an alignment in the output file. Extension is computed per the -// standard (but optimized) 3-state affine gap dynamic programming recurrence. -// The DP matrix is evaluated only in a small region straddling the high-scoring -// path, with only enough memory to store the widest row of that region. See -// gapped_extend.c. -// -// Dynamic masking is (optionally) performed during the ALIGN stage. We keep a -// count of how many times each target base has been in an alignment. When a -// base reaches a threshold, it is assumed to be a repeat and is masked from the -// sequence (by chaing it to an 'x'). This only affects subsequent queries in -// the same run. See masking.c. -// -// The INTERP stage repeats all of the previous steps in the regions between -// those gapped alignments. As of this writing the interpolation seed is a -// 7-mer exact match, for compatibility with BLASTZ. -// -// Input formats can be fasta, fastq, nib, 2bit, hsx, or qdna files, but other -// formats can be added without too much pain. See sequences.c. -// -// A variety of pairwise output formats are supported. 
The primary format is -// LAV for compatibility with BLASTZ. See lav.c, gfa.c, axt.c, maf.c, cigar.c, -// genpaf.c, text_align.c and align_diffs.c. It is relatively easy to add other -// formats (although it would be even easier if this were written in an object- -// oriented language). -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C i/o stuff -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable argument list stuff -#include // standard C math stuff -#include // standard C time stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff -#include "capsule.h" // multi-process sharing stuff -#include "seed_search.h" // seed hit search stuff -#include "quantum.h" // quantum DNA search stuff -#include "segment.h" // segment table management stuff -#include "chain.h" // segment chaining stuff -#include "gapped_extend.h" // gapped alignment stuff -#include "tweener.h" // interpolated alignment stuff -#include "masking.h" // dynamic masking stuff -#include "infer_scores.h" // scoring inference stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "identity_dist.h" // identity distribution stuff -#include "coverage_dist.h" // query coverage distribution stuff -#include "continuity_dist.h" // query continuity distribution stuff -#include "output.h" // alignment outout format stuff -#include "maf.h" // maf alignment format stuff -#include "sam.h" // sam alignment format stuff -#include "genpaf.h" // genpaf alignment format stuff -#include "text_align.h" // textual alignment format stuff - -#define lastz_owner // (make this 
owner of program-wide globals) -#include "lastz.h" // lastz program-wide stuff - -#define helpout stdout // stream to write help messages to - -#ifdef allowSeveralTargets -#error ***** allowSeveralTargets is not debugged, and has known problems-- DO NOT use it! ***** -#endif // allowSeveralTargets - -// debugging defines - -//#define snoopAlignList // if this is defined, extra code is added to - // .. track alignment lists -//#define snoopMirroring // if this is defined, extra code is added to - // .. track mirroring of alignment lists -//#define snoopHitProc // if this is defined, extra code is added to - // .. track hit processor assignments - -//---------- -// -// debug set ups -// -//---------- - -//--- debug set up for validating sequences received for chores --- - -//#define debugChoreChecksum - - -//--- debug set up for chores' positional filtering --- - -//#define debugChoreFilter - -#ifndef debugChoreFilter -#define debugChoreFilter_1 ; -#define debugChoreFilter_2 ; -#endif // not debugChoreFilter - -#ifdef debugChoreFilter - -#define debugChoreFilter_1 \ - if (reportProgressNow) \ - fprintf (stderr, " reverse query=[" unsposFmt "," unsposFmt "]\n", \ - query->chore.queryInterval.s, \ - query->chore.queryInterval.e); - -#define debugChoreFilter_2 \ - fprintf (stderr, " target=[" unsposFmt "," unsposFmt "]", \ - query->chore.targetInterval.s, \ - query->chore.targetInterval.e); \ - fprintf (stderr, " query=[" unsposFmt "," unsposFmt "]", \ - query->chore.queryInterval.s, \ - query->chore.queryInterval.e); - -#endif // debugChoreFilter - - -//--- debug set up for tracking sequence states --- - -//#define debugSequenceState - -#ifndef debugSequenceState -#define debugSequenceState_1 ; -#define debugSequenceState_2 ; -#endif // not debugSequenceState - -#ifdef debugSequenceState - -#define debugSequenceState_1 \ - fprintf (stderr, "target state:\n-------------\n"); \ - dump_sequence_state (stderr, target); \ - fprintf (stderr, "\n"); - -#define 
debugSequenceState_2 \ - fprintf (stderr, "query %d state:\n-----------------\n", \ - numQueries+1); \ - dump_sequence_state (stderr, query); \ - fprintf (stderr, "\n"); - -#endif // debugSequenceState - -//---------- -// -// stuff for crude profiling -// -//---------- - -//#define useStandardClock (define at build time) - -#ifdef useStandardClock -#define read_clock() clock() -#define clocksPerSec CLOCKS_PER_SEC -#endif // useStandardClock - -#ifndef useStandardClock -#include -#define read_clock() microsec_clock() -#define clocksPerSec 1000000 -u64 microsec_clock (void); -u64 microsec_clock (void) - { - static int failed = false; - struct timeval time1; - int err; - - if (failed) return 0; // (previous call to gettimeofday has failed) - err = gettimeofday (&time1, NULL); - if (err != 0) { failed = true; return 0; } - - return (((u64) time1.tv_sec) * 1000000) + time1.tv_usec; - } -#endif // not useStandardClock - -// clock for dbgQueryProgress and dbgTargetProgress - -s64 dbgQueryProgressClock = 0; - -#ifdef allowSeveralTargets -s64 dbgTargetProgressClock = 0; -#endif // allowSeveralTargets - -// build-specific profiling stuff - -#ifndef dbgTiming -#define dbg_timing_sub(v) ; -#define dbg_timing_add(v) ; -#define dbg_timing_copy(dst,src) ; -#define dbg_timing_report(v,s) ; -#endif // not dbgTiming - -#ifdef dbgTiming - -s64 debugClockTotal = 0, - debugClockSeq1 = 0, - debugClockSeq2 = 0, - debugClockPosTable = 0, - debugClockQueryTotal = 0, - debugClockSegTable = 0, - debugClockChaining = 0, - debugClockGappedExtend = 0, - debugClockInterpolation = 0, - debugClockOutput = 0; - -#define dbg_timing_sub(v) { v -= (s64) read_clock(); } -#define dbg_timing_add(v) { v += (s64) read_clock(); } -#define dbg_timing_copy(dst,src) { dst = src; } - -#define dbg_timing_report(v,s) { fprintf(stderr,"%-26s %.3f\n",s":",((float)(v))/clocksPerSec); } -#endif // dbgTiming - -//---------- -// -// private global data -// -//---------- - -static int showDefaults = false; -static int 
showDefaultsStderr = false; -static int showDefaultsExit = false; -static int showProgress = false; - -// we keep two copies of the control data, one for the primary alignment, and -// another for alignments used to infer a scoring set; for historical reasons -// we call these "lz" (short for lastz) and "iz" (short for inferz); we keep a -// pointer to whichever of these is 'active' at any given time - -static control lzParams; -static control izParams; - -control* currParams; // (nota bene: currParams is accessed by output.c) - -// command line options - -// defaults - -static const control defaultParams = - { - NULL,NULL, // seq1, seq1Filename - NULL,NULL, // seq2, seq2Filename - NULL,NULL, // rev1, rev2 - - false,false, // inferScores, inferOnly - { - NULL, // ic.inferFilename - 100,true, // ic.inferScale, ic.writeAsInt - ratioNone, // ic.hspThresholdIsRatio, - ratioNone, // ic.gappedThresholdIsRatio - ratioNone, // ic.gapOpenIsRatio - ratioNone, // ic.gapExtendIsRatio - 30,0, // ic.subIterations, ic.gapIterations - false, // ic.idIsPercentile - }, - false,false, // selfCompare, clonedQuery, - - true, // doSeedSearch - (s8*) nuc_to_bits, // charToBits - (s8*) upper_nuc_to_bits, // upperCharToBits - 1, // whichStrand - 1, // step - - NULL, // hitSeed (default is defaultSeedString) - 28, // maxIndexBits - 1, // withTrans - false, // noHitFiltering - 0,0, // twinMinSpan, twinMaxSpan (trumped - // .. by defaultTwinsYes, etc. 
below) - hitSimple, // basicHitType - -1,-1, // minMatches, maxTransversions - false, // filterCaresOnly -#ifndef noSeedHitQueue - defaultSeedHitQueueSize, // seedHitQueueSize -#endif // not noSeedHitQueue - - false,false, // readCapsule, writeCapsule, - NULL,NULL,NULL, // capsuleFile, capsuleFilename, capsule - 0,0, // targetMem, queryMem - - NULL,NULL, // anchorsFile, anchorsFilename - NULL, // choresFilename - - gfexXDrop, // gfExtend - false, // mergeAnchors - false,0,0, // chain, chainDiag, chainAnti - true, // gappedExtend - - NULL,NULL, // scoring, maskedScoring - 0, // xDrop - 0, // yDrop - false,false, // xDropUntrimmed, yDropUntrimmed - {'S',3000,0,0}, // hspThreshold - {'S',0, 0,0}, // gappedThreshold - true,false, // entropicHsp, reportEntropy - false, // gappedAllBounds - -1,-1, // mirrorHSP, mirrorGapped (-1 - // .. indicates "not set on command - // .. line, default is hardcoded) - false, // inhibitTrivial - 80*1024*1024, // tracebackMem - NULL, // traceback - false,false,0,0, // nIsAmbiguous,allowAmbiDNA,ambiMatch,ambiMismatch - false,0,true,false, // hspImmediate,searchLimit,searchLimitWarn,searchLimitKeep - 0, // numBestHsps - 0.0,0,false,false, // maxPairedDepth,maxPairedBases,overlyPairedWarn,overlyPairedKeep - - 0.0,0, // wordCountKeep, wordCountLimit - 0, // maxWordCountChasm - 0, // dynamicMasking - NULL,NULL,false, // maskingFile, maskingFilename, masking3Fields - NULL,NULL,false, // softMaskedFile, softMaskedFilename, softMasked3Fields - false, // reportCensus - NULL,NULL,0, // censusFile, censusFilename, censusKind - - 0.0,1.0, // minIdentity, maxIdentity - 0.0,1.0, // minCoverage, maxCoverage - 0.0,1.0, // minContinuity, maxContinuity - 0.0,0, // minMatchCountRatio, minMatchCount - -1, // maxMismatchCount - -1,-1, // maxSeparateGapsCount, maxGapColumnsCount -#ifdef densityFiltering - 0.0, // maxDensity -#endif // densityFiltering - NULL,NULL, // outputFilename, outputFile - fmtLav,NULL,NULL,NULL, // outputFormat, outputInfo, readGroup, 
samRGTags - false, // endComment - false, // needTrueLengths - false, // deGapifyOutput - NULL,NULL,NULL, // dotplotFilename, dotplotFile, dotplotKeys - - 0, // innerThreshold - NULL, // innerSeed - 20000, // innerWindow - - false,false, // targetIsQuantum, queryIsQuantum - -1, // ballScore - - true, // lajCompatible - 0, // textContext - NULL, // args - 0, // verbosity - false, // reportTiming - false, // reportStats - false, // showStats - NULL, // statsFile - NULL, // statsFilename - spt_dont // showPosTable - }; - -static const char* defaultSeedString = seed_12of19; - -static const int defaultTwinsYes = false; -static const int defaultTwinMinGap = 0; -static const int defaultTwinMaxGap = 10; - -static int dbgShowMatrix = false; -static int dbgDumpTargetSequence = false; -static int dbgDumpQuerySequence = false; -static int dbgDumpTargetSequence2 = false; -static int dbgDumpQuerySequence2 = false; -static int dbgShowParams = false; -static int dbgShowHsps = false; -static u32 dbgShowHspCountsMin = (u32) -1; -static int dbgAnchorParsing = false; -static int dbgAnchorContent = false; -static int dbgShowAnchors = false; -static int dbgShowAnchorsHowOften = 0; -static int dbgSortAnchorsByDiag = false; -static int dbgInhibitSegmentReduction = false; -static int dbgMasking = false; -static int dbgQueryProgress = 0; -static int dbgQueryProgressWithMasking = false; -static char* dbgQueryProgressPrefix = ""; -#ifdef allowSeveralTargets -static int dbgTargetProgress = 0; -static char* dbgTargetProgressPrefix = ""; -#endif // allowSeveralTargets -static int dbgReportFinish = false; - -#define innerWordSize 7 // word size for inner alignment seed - // .. hits - -static const float defaultBallScoreFactor = 0.75; - -// anchors-- -// Whenever the process will go beyond just finding gap-free extensions, the -// segments that will become anchors (e.g. HSPs) are collected in this table -// instead of being written to the console. 
- -#define numDefaultAnchors 4000 - -static segtable* anchors = NULL; -static segtable* secondaryAnchors = NULL; - -// miscellany - -u32* alignmentHashes = NULL; // array of hashes of alignments for - // .. the current query; used for - // .. gappily_extend_hsps under certain - // .. conditions - -#define chainScale 100 - -sthresh scratchThreshold = {'S',0,0,0}; - -#define dbg_show_hsp_counts_1 \ - if ((query->shortHeader != NULL) && (!query->useFullNames)) \ - fprintf (stderr, " for query %s", query->shortHeader); \ - else if (query->header != NULL) \ - fprintf (stderr, " for query %s", query->header); \ - if (query->revCompFlags == rcf_comp) \ - fprintf (stderr, " (complement)"); \ - else if (query->revCompFlags == rcf_rev) \ - fprintf (stderr, " (reverse)"); \ - else if (query->revCompFlags == rcf_revcomp) \ - fprintf (stderr, " (reverse complement)"); \ - fprintf (stderr, "\n"); - -#define dbg_show_hsp_counts_2 \ - if ((dbgShowHspCountsMin != (u32)-1) \ - && (anchors->len != originalNumAnchors)) \ - { \ - if (dbgQueryProgress != 0) fprintf (stderr, " "); \ - fprintf (stderr, "reduced %s HSPs to %s (K=" scoreFmtSimple ")\n", \ - ucommatize(originalNumAnchors), \ - ucommatize(anchors->len), \ - anchors->seg[anchors->len-1].s); \ - } - -//---------- -// -// pre-canned expansion arguments -// -//---------- - -typedef struct exparg - { - char* argName; // shortcut option name (e.g. "--yasra98") - char* version; // NULL means version can't be used; - // ..
"" means version is applicable - char* expansion; // option string listed for argName - } exparg; - - -// expander list; note that for expansions that have versions, it is imperative -// that the versions are listed here in oldest-first order; this is because the -// command line parser will use the first version it encounters that is newer -// than (or the same as) the version the user specifies - -exparg expanders[] = - { -// old expansions: - { "--yasra98", "1.02.45", "T=2 Z=20 --match=1,6 O=8 E=1 Y=20 K=22 L=30 --identity=98..100" }, - { "--yasra95", "1.02.45", "T=2 Z=20 --match=1,5 O=8 E=1 Y=20 K=22 L=30 --identity=95..100" }, - { "--yasra90", "1.02.45", "T=2 Z=20 --match=1,5 O=6 E=1 Y=20 K=22 L=30 --identity=90..100" }, - { "--yasra85", "1.02.45", "T=2 --match=1,2 O=4 E=1 Y=20 K=22 L=30 --identity=85..100" }, - { "--yasra75", "1.02.45", "T=2 --match=1,1 O=3 E=1 Y=20 K=22 L=30 --identity=75..100" }, - { "--yasra95short", "1.02.45", "T=2 --match=1,7 O=6 E=1 Y=14 K=10 L=14 --identity=95..100" }, - { "--yasra85short", "1.02.45", "T=2 --match=1,3 O=4 E=1 Y=14 K=11 L=14 --identity=85..100" }, -// current expansions: - { "--yasra98", "", "T=2 Z=20 --match=1,6 O=8 E=1 Y=20 K=22 L=30 --identity=98..100 --ambiguous=n --noytrim" }, - { "--yasra95", "", "T=2 Z=20 --match=1,5 O=8 E=1 Y=20 K=22 L=30 --identity=95..100 --ambiguous=n --noytrim" }, - { "--yasra90", "", "T=2 Z=20 --match=1,5 O=6 E=1 Y=20 K=22 L=30 --identity=90..100 --ambiguous=n --noytrim" }, - { "--yasra85", "", "T=2 --match=1,2 O=4 E=1 Y=20 K=22 L=30 --identity=85..100 --ambiguous=n --noytrim" }, - { "--yasra75", "", "T=2 --match=1,1 O=3 E=1 Y=20 K=22 L=30 --identity=75..100 --ambiguous=n --noytrim" }, - { "--yasra95short", "", "T=2 --match=1,7 O=6 E=1 Y=14 K=10 L=14 --identity=95..100 --ambiguous=n --noytrim" }, - { "--yasra85short", "", "T=2 --match=1,3 O=4 E=1 Y=14 K=11 L=14 --identity=85..100 --ambiguous=n --noytrim" } - }; -#define numExpanders ((int)(sizeof(expanders)/sizeof(exparg))) - -//---------- -// -// prototypes for private
functions -// -//---------- - -int main (int argc, char** argv); - -static int report_progress (seq* target, seq* query, - int applyChore, int numQueries, int numChores, - hitprocinfo* hitProcInfo); -static seq* capsule_target (capinfo* cap, u8** targetRev); -static postable* capsule_position_table (capinfo* cap, seq* seq, - seed* hitSeed, u32 step); -static interval resolve_chore_target (chore* chore, seq* target); -static interval resolve_chore_query (seq* query, char strand); -static void choose_best_anchors (u32 numAnchors); -static score chain_connect_penalty (segment* seg1, segment* seg2, int scale); -static void remove_interval_seeds (unspos b, unspos e, void* info); -static u32 report_hsps (void* info, - unspos pos1, unspos pos2, unspos length, - score s); -static u32 collect_filtered_hsps (void* info, - unspos pos1, unspos pos2, unspos length, - score s); -static u32 collect_hsps (void* info, - unspos pos1, unspos pos2, unspos length, - score s); -static alignel* mirror_alignments (alignel* alignList); -static void usage (void); -static void all_options (void); -static void file_options (void); -static void format_options (void); -static void shortcuts (void); -static void show_scoring_defaults (FILE* f, int andExit); -static void expander_options (char* header, char* prefix); -static void chastise (const char* format, ...); -static void parse_options (int argc, char** argv, - control* lzParams, control* izParams); -static void create_seed_structure (control* lzParams, char** seedString, - int haveWithTrans, int twinsYes, - int minGap, int maxGap); - -static void print_params (FILE* f, control* lzParams); - -static void read_control_file_by_name (char* name, control* params); -static void read_control_file (FILE* f, char* name, control* params); - -static void print_options (void); - -static int name_spec_is_quantum (char* spec); - -static void lastz_zero_stats (void); -static void lastz_show_stats_before (FILE* f); -static void lastz_show_stats (FILE* f); 
- -//---------- -// -// lastz-- -// Main program -// -//---------- - -//=== "nuisance" defines to prevent certain versions of gcc from complaining === - -#if ((defined allowSeveralTargets) || (defined trackMemoryUsage) || (defined valgrindMemoryCheck)) -#define trackTargetRev -#endif // allowSeveralTargets or trackMemoryUsage or valgrindMemoryCheck - - -//=== the actual function main() === - -int main - (int argc, - char** argv) - { - FILE* statsF = NULL; - seq* target = NULL; - seq* query = NULL; - postable* targPositions = NULL; -#ifdef trackTargetRev - int freeTargetRev = false; -#endif // trackTargetRev - u8* targetRev = NULL; - tback* traceback = NULL; - census* targCensus = NULL; - hitprocessor hitProc; - void* voidHitProcInfo; - hitprocinfo* hitProcInfo = NULL; - hitprocsimple* simpleInfo = NULL; - int applyChore = false; - time_t startClock, endClock; - unspos coverageLimit; - int reverseNeeded, tableWillBeUsed, queryExists; - int collectHspsFromBoth; // collect HSPs from both strands - // .. before gapped stage - int collectHspsSeparately; // collect HSPs in separate tables - int hspsAreAdaptive; // adaptive HSP scoring threshold - // .. 
is being used - u32 prevAnchorCount; - int numQueries = 0; - int numChores = 0; -#ifdef debugChoreFilter - int reportProgressNow; -#endif // debugChoreFilter - u8 rCh, cCh; - int needRewindableQuery = false; - int abortQuery; - char* badAction; - u32 originalNumAnchors; - int emptyAnchors; -#ifdef allowSeveralTargets - int checkForEmpty; - int numTargets = 0; - int haveSeveralTargets = false; - int havePrintedJobHeader = false; -#endif // allowSeveralTargets - - dbg_timing_sub (debugClockTotal); - dbgQueryProgressClock = -((s64) read_clock()); -#ifdef allowSeveralTargets - dbgTargetProgressClock = dbgQueryProgressClock; -#endif // allowSeveralTargets - - debug = 0; - - lastz_zero_stats (); - sequence_zero_stats (); - pos_table_zero_stats (); - capsule_zero_stats (); - seed_search_zero_stats (); - quantum_zero_stats (); - chain_zero_stats (); - gapped_extend_zero_stats (); - tweener_zero_stats (); - masking_zero_stats (); - infer_scores_zero_stats (); - - ////////// - // fetch arguments - ////////// - - currParams = NULL; - parse_options (argc, argv, &lzParams, &izParams); - currParams = &lzParams; - - if (showDefaults) - { - if (showDefaultsExit) - show_scoring_defaults(helpout, /*andExit*/ true); // (does not return) - else if (showDefaultsStderr) - show_scoring_defaults(stderr, /*andExit*/ false); - else - show_scoring_defaults(stdout, /*andExit*/ false); - } - - // open stats file - - if (currParams->statsFilename != NULL) - { - currParams->statsFile = fopen_or_die (currParams->statsFilename, "wt"); - free_if_valid ("stats file name", currParams->statsFilename); - } - - if (currParams->showStats) - statsF = (currParams->statsFile != NULL)? 
currParams->statsFile : stderr; - - if ((currParams->inferScores) && (statsF != NULL)) - izParams.statsFile = statsF; - - // open anchors file; note that we don't dispose of the file name here, so - // that we can use it in error messages later - - if (currParams->anchorsFilename != NULL) - { - currParams->anchorsFile = fopen_or_die (currParams->anchorsFilename, "rt"); - currParams->mergeAnchors = true; - } - - ////////// - // open the sequence files - // - // We also load the first sequence from the input files here. This allows - // us to check and report problems (such as an extra target or an empty - // query) immediately, rather than wasting what might turn out to be a lot - // of processing time. - ////////// - - startClock = clock(); - - // open and load target - - memory_checkpoint ("[[* Opening Files ]]\n"); - - dbg_timing_sub (debugClockSeq1); - - reverseNeeded = ((currParams->gappedExtend) - || ((currParams->inferScores) && (izParams.gappedExtend))); - - if (lzParams.capsule != NULL) - { - if (!reverseNeeded) - target = capsule_target (lzParams.capsule, NULL); - else - { - target = capsule_target (lzParams.capsule, &targetRev); - currParams->rev1 = targetRev; -#ifdef trackTargetRev - freeTargetRev = false; -#endif // trackTargetRev - } - currParams->seq1 = target; - } - else - { - if (currParams->verbosity >= 5) - fprintf (stderr, "opening \"%s\"\n", currParams->seq1Filename); - - target = open_sequence_file (currParams->seq1Filename, seq_type_unknown, - /* choresAllowed, choresFilename */ false, NULL, - currParams->targetMem, - currParams->needTrueLengths, - currParams->allowAmbiDNA, NULL); - - if ((currParams->targetIsQuantum) && (target->fileType != seq_type_qdna)) - suicidef ("%s does not contain quantum DNA", target->filename); - - currParams->seq1 = target; - -#ifndef allowSeveralTargets - if ((!load_sequence (target)) || (target->len == 0)) - suicidef ("target file %s contains no sequence", target->filename); - if (another_sequence (target)) - 
suicidef ("target file %s contains more than one sequence\n" - "consider using the \"multiple\" action (see \"lastz --help=files\")", - target->filename); - debugSequenceState_1; -#else // if (allowSeveralTargets) - if (!load_sequence (target, NULL)) - suicidef ("target file %s contains no sequence", target->filename); - - haveSeveralTargets = another_sequence (target); - if ((target->len == 0) && (!haveSeveralTargets)) - suicidef ("target file %s contains no sequence", target->filename); - - if (haveSeveralTargets) - { - char* errorMessage; - - numTargets = 1; - - if (currParams->selfCompare) - { - errorMessage = "can't use --self"; - goto cant_do_that_with_several_targets; - // $$$ --self would require us to read the file separately as - // $$$ .. both target and query, and loop over both files to get - // $$$ .. all-vs-all (instead of each-vs-each); also, we would - // $$$ .. have to handle seqA vs seqA as we do now for --self, - // $$$ .. but would have to handle seqA vs seqB as a normal - // $$$ .. alignment but with mirrored output, when seqA < seqB, - // $$$ .. and completely ignore it when seqA > seqB; all this - // $$$ .. is doable, it's just extra work - } - - if (currParams->anchorsFile != NULL) - { - errorMessage = "can't use --segments"; - goto cant_do_that_with_several_targets; - // $$$ --segments might currently work, I have not had time to - // $$$ .. test it; the issue is whether the target name(s) - // $$$ .. are correctly handled when there is more than one - // $$$ .. target (I am skeptical) - } - - if (currParams->inferScores) - { - errorMessage = "can't perform scoring inference"; - goto cant_do_that_with_several_targets; - // $$$ not sure what all would be required to make scoring - // $$$ .. inference work; at the very least it would have to - // $$$ .. 
be able to rewind the target - } - - if ((currParams->outputFormat == fmtGenpafBlast) - || (currParams->outputFormat == fmtGenpafBlastNoHeader)) - { - errorMessage = "can't use --format=BLASTN"; - goto cant_do_that_with_several_targets; - // $$$ blastn output needs to have all alignments for a query - // $$$ .. together; since my outer loop is targets, not - // $$$ .. queries, this is impossible; the user can accomplish - // $$$ .. this using [multi] as long as the combined targets - // $$$ .. fit in memory; otherwise, she will have to do - // $$$ .. each target in a separate run; the only internal - // $$$ .. solution I can think of is for the blastn output - // $$$ .. module to save all alignments rather than writing - // $$$ .. them out, then sort them by query name at the end - } - - if ((currParams->outputFormat == fmtSoftSam) - || (currParams->outputFormat == fmtSoftSamNoHeader) - || (currParams->outputFormat == fmtHardSam) - || (currParams->outputFormat == fmtHardSamNoHeader)) - { - errorMessage = "can't use --format=SAM"; - goto cant_do_that_with_several_targets; - // $$$ sam output needs to have a list of all targets appear - // $$$ .. as SN tags in the header; the only way I can think - // $$$ .. to do this internally is to (a) open the target file - // $$$ .. as rewindable and (b) prescan it to identify all - // $$$ .. target names; doable but not worth the trouble at - // $$$ .. present; or, I could only allow this if the input - // $$$ .. format allows me to fetch the sequence names (i.e. - // $$$ .. 2bit or hsx; anyway, the user can accomplish this - // $$$ .. using [multi], but with the same caveats as for - // $$$ .. 
blastn - } - - if (false) - { - cant_do_that_with_several_targets: - suicidef ("%s when target file contains several sequences (%s)\n" - "consider using the \"multiple\" action (see \"lastz --help=files\")", - errorMessage, target->filename); - } - } -#endif // allowSeveralTargets - } - - -#ifdef allowSeveralTargets -next_target: -#endif // allowSeveralTargets - - if (dbgDumpTargetSequence) - print_sequence (stderr, target, "target", 100); - else if (dbgDumpTargetSequence2) - dump_sequence (stderr, target); - - if (lzParams.capsule == NULL) - { -#ifdef allowSeveralTargets - if (haveSeveralTargets) - if ((dbgTargetProgress != 0) - && ((dbgTargetProgress == 1) || (numTargets % dbgTargetProgress == 1))) - { - float secs; - int hours, mins; - dbgTargetProgressClock += (s64) read_clock(); - secs = ((float)(dbgTargetProgressClock)) / clocksPerSec; - dbgTargetProgressClock = -((s64) read_clock()); - - fprintf (stderr, "%s", dbgTargetProgressPrefix); - - if (secs < 60) - fprintf (stderr, "(%.3fs) ", secs); - else if (secs < 3600) - { - mins = secs / 60; - secs -= 60 * mins; - fprintf (stderr, "(%dm%06.3fs) ", mins, secs); - } - else - { - mins = secs / 60; - secs -= 60 * mins; - hours = mins / 60; - mins -= 60 * hours; - fprintf (stderr, "(%dh%02dm%06.3fs) ", hours, mins, secs); - } - - fprintf (stderr, "processing target %s", commatize(numTargets)); - if ((target->shortHeader != NULL) && (!target->useFullNames)) - fprintf (stderr, ": %s", target->shortHeader); - else if (target->header != NULL) - fprintf (stderr, ": %s", target->header); - fprintf (stderr, "\n"); - } -#endif // allowSeveralTargets - - if (reverseNeeded) - { - targetRev = (u8*) copy_reverse_of_string ((char*) target->v, target->len); - currParams->rev1 = targetRev; -#ifdef trackTargetRev - freeTargetRev = true; -#endif // trackTargetRev - } - } - - dbg_timing_add (debugClockSeq1); - - if ((currParams->anchorsFile != NULL) && (target->revCompFlags != rcf_forward)) - suicidef ("can't use --segments with 
reverse-complement of target (%s)", - target->filename); - - if ((currParams->dynamicMasking > 0) || (currParams->reportCensus)) - targCensus = new_census (target->len, currParams->censusKind, currParams->dynamicMasking); - - // now that we know the target length, set the hsp and gapped thresholds - // if they are a percentage of the target length - - resolve_score_thresh (&currParams->hspThreshold, target->len); - resolve_score_thresh (&currParams->gappedThreshold, target->len); - - if (currParams->inferScores) - { - resolve_score_thresh (&izParams.hspThreshold, target->len); - resolve_score_thresh (&izParams.gappedThreshold, target->len); - } - - // open and load query - - if ((currParams->doSeedSearch) || (currParams->inferScores)) - { - if (currParams->verbosity >= 5) - { - if (currParams->seq2Filename != NULL) - fprintf (stderr, "opening \"%s\"\n", currParams->seq2Filename); - else - fprintf (stderr, "opening unnamed query file\n"); - } - - dbg_timing_sub (debugClockSeq2); - - if (currParams->inferScores) - needRewindableQuery = true; -#ifdef allowSeveralTargets - if (haveSeveralTargets) - needRewindableQuery = true; -#endif // allowSeveralTargets - -#ifdef allowSeveralTargets - checkForEmpty = true; - if (query != NULL) - { - rewind_sequence_file (query); - checkForEmpty = false; - } - else -#endif // allowSeveralTargets - - if (currParams->clonedQuery) - { - // clone query from target sequence already in memory; note that a - // query cloned from the target is inherently rewindable-in-memory - - query = clone_sequence (target); - currParams->seq2 = query; - } - else if (needRewindableQuery) - { - // read query from file and require we be able to rewind it later - - query = open_rewindable_sequence_file - (currParams->seq2Filename, seq_type_unknown, - /* choresAllowed */ true, currParams->choresFilename, - currParams->queryMem, - currParams->needTrueLengths, currParams->allowAmbiDNA, - NULL); - currParams->seq2 = query; - } - else - { - // read query from 
file without regard for ability to rewind it - - query = open_sequence_file - (currParams->seq2Filename, seq_type_unknown, - /* choresAllowed */ true, currParams->choresFilename, - currParams->queryMem, - currParams->needTrueLengths, currParams->allowAmbiDNA, - currParams->maskedScoring->qToComplement); - currParams->seq2 = query; - } - - if ((currParams->queryIsQuantum) && (query->fileType != seq_type_qdna)) - suicidef ("%s does not contain quantum DNA", query->filename); - - applyChore = (query->choresFile != NULL); - if ((applyChore) && (currParams->inferScores)) - suicidef ("can't use [chores] with --infer[only]\n"); - if ((applyChore) && (currParams->anchorsFile != NULL)) - suicidef ("can't use [chores] with --segments\n"); - - if (currParams->choresFilename != NULL) - { // (choresFilename has been copied into query->choresFilename by open_sequence_file) - free_if_valid ("lz.choresFilename", currParams->choresFilename); - currParams->choresFilename = NULL; - } - -#ifndef allowSeveralTargets - if (!another_sequence (query)) - suicidef ("query file %s contains no sequence", query->filename); -#else - if ((checkForEmpty) && (!another_sequence (query))) - suicidef ("query file %s contains no sequence", query->filename); -#endif // allowSeveralTargets - - dbg_timing_add (debugClockSeq2); - } - - if (dbgShowParams) - print_params (stderr, &lzParams); - - // check for bad combination of partitioned sequence vs other options - // $$$ we'd like to also reject [multiple] with --chain (lzParams->chain), - // $$$ .. but we'd need to distinguish between a partitioned sequence that - // $$$ .. contains more than one actual sequence, versus one that just - // $$$ .. has in-sequence separators; this distinction would have to be - // $$$ .. made during sequence parsing (search for separatorCh in - // $$$ .. sequences.c); or possibly we could infer it from partition.header, - // $$$ .. e.g. if all the partition.header entries are the same then --chain - // $$$ .. 
is OK, otherwise reject it - - if ((target->partition.p != NULL) - || ((query != NULL) && (query->partition.p != NULL))) - { - badAction = (target->separatorCh == 0) || (query->separatorCh == 0) - ? "multiple action" - : "multiple action (forced by separator action)"; - - if ((currParams->doSeedSearch) - && (!currParams->inferOnly) - && ((currParams->outputFormat == fmtGfa) - || (currParams->outputFormat == fmtGfaNoScore))) - suicidef ("%s cannot be used with --gfa", badAction); - - if ((currParams->doSeedSearch) - && (!currParams->inferOnly) - && ((currParams->outputFormat == fmtLav) - || (currParams->outputFormat == fmtLavComment) - || (currParams->outputFormat == fmtLavScore) - || (currParams->outputFormat == fmtLavText))) - suicidef ("%s cannot be used with --lav\n" - "(lav has requirements on the order of alignments that would require additional\n" - " computation; use \"--help=formats\" to see other options for output)", - badAction); - } - - if (target->partition.p != NULL) - { - badAction = (target->separatorCh == 0) - ? 
"multiple action" - : "multiple action (forced by separator action)"; - - // the [multi] --self combination is no longer prohibited - //if (currParams->selfCompare) - // suicidef ("%s cannot be used with --self", badAction); - if (currParams->maskingFilename != NULL) - suicidef ("%s cannot be used with --outputmasking", badAction); - if ((currParams->softMaskedFilename != NULL) - && (!currParams->softMasked3Fields)) - suicidef ("%s cannot be used with --outputmasking:soft\n" - "consider using --outputmasking+:soft instead", - badAction); - } - - // check for bad combination of the target attributes vs genpaf options; - // note that we were not able to check for these during command-line - // parsing (for example, we can't know whether we have base-call qualities - // until we know the file type - - if ((currParams->outputFormat == fmtGenpaf) - || (currParams->outputFormat == fmtGenpafNoHeader)) - { - if ((target->vq == NULL) - && (strchr (currParams->outputInfo, genpafTargetQuals) != NULL)) - suicidef ("%s has no base-call qualities (required for --format=general:%s)", - target->filename, genpafTQualsName); - } - - // allocate traceback memory - - if ((currParams->gappedExtend) - || ((currParams->inferScores) && (izParams.gappedExtend))) - { - if (traceback == NULL) - { - traceback = new_traceback (currParams->tracebackMem); - currParams->traceback = traceback; - } - } - - ////////// - // build a position table for the target sequence - // - // If we are inferring scores, we'd like to share the same position table - // for both inference and for the final alignment. However, it is natural - // to use a less sensitive seed or step during inference. So in case the - // seed or step is different, we use the inference control values here and - // will rebuild the table after inference is finished, if needed. - // - // If we are to get the position table from a capsule file, we do so here. 
- // Note that scoring inference and reading a capsule file are mutually - // exclusive options. - ////////// - - if (currParams->anchorsFile == NULL) - { - dbg_timing_sub (debugClockPosTable); - - if (currParams->verbosity >= 1) - { - if (lzParams.capsule != NULL) - fprintf (stderr, "linking to position table in %s\n", - lzParams.capsuleFilename); - else - fprintf (stderr, "building position table for %s\n", - target->filename); - } - - if (lzParams.capsule != NULL) - targPositions = capsule_position_table - (lzParams.capsule, target, - currParams->hitSeed, currParams->step); - else if (currParams->inferScores) - targPositions = build_seed_position_table - (target, 0, target->len, - currParams->upperCharToBits, izParams.hitSeed, - izParams.step); - else if (!currParams->targetIsQuantum) - { - targPositions = build_seed_position_table - (target, 0, target->len, - currParams->upperCharToBits, currParams->hitSeed, - currParams->step); - if (currParams->wordCountKeep > 0) - currParams->wordCountLimit = find_position_table_limit - (targPositions, - currParams->wordCountKeep); - if (currParams->wordCountLimit > 0) - limit_position_table (targPositions, currParams->wordCountLimit, - currParams->maxWordCountChasm); - } - else // if (currParams->targetIsQuantum) - { - targPositions = build_quantum_seed_position_table - (target, 0, target->len, - currParams->maskedScoring->bottleneck, - currParams->maskedScoring->qToBest, - currParams->hitSeed, currParams->step); - if (currParams->wordCountKeep > 0) - currParams->wordCountLimit = find_position_table_limit - (targPositions, - currParams->wordCountKeep); - if (currParams->wordCountLimit > 0) - limit_position_table (targPositions, currParams->wordCountLimit, - currParams->maxWordCountChasm); - } - - dbg_timing_add (debugClockPosTable); - - lastz_show_stats_before (statsF); - pos_table_show_stats (statsF, targPositions); - } - - ////////// - // perform scoring inference, if requested - ////////// - - if 
(currParams->inferScores) - { - // switch control to the inference parameters - - currParams = &izParams; - izParams.seq1Filename = lzParams.seq1Filename; - izParams.seq1 = lzParams.seq1; - izParams.seq2Filename = lzParams.seq2Filename; - izParams.seq2 = lzParams.seq2; - - // perform the scoring inference (note that this will rewind the query - // file) - - if (currParams->hspThreshold.t == 'S') coverageLimit = 0; - else coverageLimit = currParams->hspThreshold.c; - - anchors = new_segment_table (numDefaultAnchors, coverageLimit); - - lzParams.scoring = drive_scoring_inference - (currParams, - target, targetRev, targPositions, query, - traceback); - - // switch control back to the main parameters - - currParams = &lzParams; - - // fill in score-based parameters - - currParams->maskedScoring = masked_score_set (currParams->scoring); - - if (currParams->xDrop < 0) - { - rCh = currParams->scoring->rowChars[0]; - cCh = currParams->scoring->colChars[0]; - currParams->xDrop = 10 * currParams->scoring->sub[rCh][cCh]; - } - - if (currParams->yDrop < 0) - currParams->yDrop = currParams->scoring->gapOpen + 300 * currParams->scoring->gapExtend; - - // rebuild the target sequence position table if it would be different - - tableWillBeUsed = ((currParams->doSeedSearch) - || (currParams->showPosTable != spt_dont) - || (currParams->writeCapsule)); - - if ((tableWillBeUsed) - && (!is_same_seed (izParams.hitSeed, currParams->hitSeed)) - && (izParams.step != currParams->step)) - { - free_position_table (targPositions); - if (!currParams->targetIsQuantum) - targPositions = build_seed_position_table - (target, 0, target->len, - currParams->upperCharToBits, currParams->hitSeed, - currParams->step); - else // if (currParams->targetIsQuantum) - targPositions = build_quantum_seed_position_table - (target, 0, target->len, - currParams->maskedScoring->bottleneck, - currParams->maskedScoring->qToBest, - currParams->hitSeed, currParams->step); - if (currParams->wordCountKeep > 0) - 
currParams->wordCountLimit = find_position_table_limit - (targPositions, - currParams->wordCountKeep); - if (currParams->wordCountLimit > 0) - limit_position_table (targPositions, currParams->wordCountLimit, - currParams->maxWordCountChasm); - } - } - - if (currParams->showPosTable == spt_distribution) - { - poscount* posDist = position_table_count_distribution (targPositions); - poscount* pd; - fprintf (currParams->outputFile, - "seed-word counts distribution table for %s:\n", - currParams->seq1->filename); - for (pd=posDist ; pd->occurrences!=0 ; pd++) - fprintf (currParams->outputFile, unsposFmt " " unsposFmt "\n", - pd->count, pd->occurrences); - free_if_valid ("seed word position counts distribution", posDist); - } - else if (currParams->showPosTable != spt_dont) - { - if (currParams->showPosTable == spt_table) - fprintf (currParams->outputFile, - "seed-word positions table for %s:\n", - currParams->seq1->filename); - else if (currParams->showPosTable == spt_countsonly) - fprintf (currParams->outputFile, - "seed-word counts table for %s:\n", - currParams->seq1->filename); - else // if (currParams->showPosTable == spt_withcounts) - fprintf (currParams->outputFile, - "seed-word counts and positions table for %s:\n", - currParams->seq1->filename); - dump_position_table (currParams->outputFile, - targPositions, currParams->hitSeed, - (currParams->showPosTable == spt_table) - || (currParams->showPosTable == spt_withcounts), - (currParams->showPosTable == spt_countsonly) - || (currParams->showPosTable == spt_withcounts)); - printf ("\n"); - } - - ////////// - // if we are only writing a capsule file, do so and quit - ////////// - - if (currParams->writeCapsule) - { - u64 capSize; - - currParams->capsuleFile = fopen_or_die (currParams->capsuleFilename, "wb"); - capSize = write_capsule_file (currParams->capsuleFile, - currParams->capsuleFilename, - target, targetRev, targPositions, - currParams->hitSeed); - fclose_if_valid (currParams->capsuleFile); - 
currParams->capsuleFile = NULL; - endClock = clock(); - printf ("%s byte target sequence capsule written to %s\n", - unitize(capSize,/*byThousands*/ true), - currParams->capsuleFilename); - goto show_stats_and_clean_up; - } - - ////////// - // perform the alignment of the query (or queries) to the target - ////////// - - dbg_timing_copy (debugClockQueryTotal, debugClockSeq2); - dbg_timing_sub (debugClockQueryTotal); - numQueries = 0; - numChores = 0; - - // if the user doesn't want the alignment we can quit now - - if (((!currParams->inferScores) && (!currParams->doSeedSearch)) - || (( currParams->inferScores) && (currParams->inferOnly))) - { - endClock = clock(); - goto show_stats_and_clean_up; - } - - // initialize (or empty) the anchors list - - if (currParams->hspThreshold.t == 'S') coverageLimit = 0; - else coverageLimit = currParams->hspThreshold.c; - - if (anchors == NULL) - anchors = new_segment_table (numDefaultAnchors, coverageLimit); - else - { - empty_segment_table (anchors); - limit_segment_table (anchors, coverageLimit); - } - - if (currParams->anchorsFile == NULL) - { - set_up_hit_processor (currParams, (targCensus!=NULL), - &hitProc, &voidHitProcInfo); - hitProcInfo = (hitprocinfo*) voidHitProcInfo; - simpleInfo = (hitprocsimple*) voidHitProcInfo; - } - else - { - hitProc = NULL; - voidHitProcInfo = NULL; - } - - // search for hits in each query sequence - - hspsAreAdaptive = (currParams->hspThreshold.t != 'S'); - collectHspsFromBoth = hspsAreAdaptive - || (currParams->searchLimit > 0) - || (currParams->numBestHsps > 0); - - collectHspsSeparately = false; - if (collectHspsFromBoth) - { - collectHspsSeparately = true; - if (hspsAreAdaptive) collectHspsSeparately = false; - if (currParams->numBestHsps > 0) collectHspsSeparately = false; - } - - if ((collectHspsFromBoth) && (secondaryAnchors == NULL)) - secondaryAnchors = new_segment_table (numDefaultAnchors, 0); - -#ifndef allowSeveralTargets - print_job_header (); - print_options (); -#else - if 
(!havePrintedJobHeader) - { - havePrintedJobHeader = true; - print_job_header (); - print_options (); - } -#endif // allowSeveralTargets - - while (true) - { - memory_checkpoint_1 ("[[* Query #%d (loading)]]\n", numQueries); - - dbg_timing_sub (debugClockSeq2); - queryExists = load_sequence (query); - dbg_timing_add (debugClockSeq2); - if (!queryExists) break; - debugSequenceState_2; - - if (!applyChore) numQueries++; - else if (query->chore.num == 1) numQueries++; - numChores++; - - if (query->len == 0) - { - report_progress (target, query, - applyChore, numQueries, numChores, - hitProcInfo); - continue; - } - - memory_checkpoint_2 ("[[* Query #%d, %s (loaded) ]]\n", - numQueries, - (query->useFullNames)? query->header - : query->shortHeader); - - if (dbgDumpQuerySequence) - print_sequence (stderr, query, "query", 100); - else if (dbgDumpQuerySequence2) - { fprintf (stderr, "query, forward:\n"); dump_sequence (stderr, query); } - - // if we have a chores file, set up the hit-processor's position - // filtering; at the same time, we validate that the current chore has - // valid target and query intervals - // nota bene: if the chore is - strand only, it would seem that we - // needn't call resolve_chore_query; however, we do so to - // validate the chore interval - - if (!applyChore) - { if (hitProcInfo != NULL) hitProcInfo->posFilter = false; } - else - { - query->chore.targetInterval = resolve_chore_target (&query->chore, target); - query->chore.queryInterval = resolve_chore_query (query, '+'); - hitProcInfo->posFilter = true; - hitProcInfo->targetInterval = query->chore.targetInterval; - hitProcInfo->queryInterval = query->chore.queryInterval; - } - - // if we have a match count filter, expressed as a ratio, compute the - // filter threshold - - if (currParams->minMatchCountRatio != 0) - currParams->minMatchCount = (u32) ceil (query->trueLen * currParams->minMatchCountRatio); - - // check for bad combination of the query attributes vs genpaf options; - // note 
that we were not able to check for these during command-line - // parsing (because, e.g., we didn't know then whether the query file - // contained quality values) - - if (numChores == 1) - { - if ((currParams->outputFormat == fmtGenpaf) - || (currParams->outputFormat == fmtGenpafNoHeader)) - { - if ((query->vq == NULL) - && (strchr (currParams->outputInfo, genpafQueryQuals) != NULL)) - suicidef ("%s has no base-call qualities (required for --format=general:%s)", - query->filename, genpafQQualsName); - } - } - - if (currParams->whichStrand < 0) - { - // $$$ [Note S] if we have a chores file, we would probably do - // $$$ .. better by coordinating query reversal between - // $$$ .. load_sequence() and this loop; as it stands, if we have - // $$$ .. a series of minus-strand chores on the same query, - // $$$ .. load_sequence() and this loop are continually reversing - // $$$ .. the query without merit - - rev_comp_sequence (query, currParams->scoring->qToComplement); - } - - if (currParams->verbosity >= 1) - fprintf (stderr, "searching for matches in %s%s\n", - query->filename, - (currParams->whichStrand>=0)? "" - : ", (reverse strand)"); - -#ifdef debugChoreFilter - reportProgressNow = -#endif // debugChoreFilter - report_progress (target, query, - applyChore, numQueries, numChores, - hitProcInfo); - - if ((!applyChore) || (query->chore.num == 1)) - init_output_for_query (); - - // search for "forward" strand hits; note that if we are working on a - // chore that is restricted to the - strand, we skip this stage - - emptyAnchors = true; - - if ((applyChore) && (query->chore.qStrand < 0)) - goto plus_strand_finished; - - if ((currParams->hspImmediate) - && (currParams->gappedExtend)) - { - // $$$ note that if we have a chores file, we might do better if we - // $$$ .. had saved the previous rev2 in case this chore was for the - // $$$ .. 
same query (in which case we might need to un-reverse it) - - hitrepgappily* gappilyInfo = (hitrepgappily*) simpleInfo->hp.reporterInfo; - - if (currParams->rev2 != NULL) - suicidef ("internal error, currParams->rev2 is not NULL"); - - currParams->rev2 = (u8*) copy_reverse_of_string ((char*) query->v, query->len); - - gappilyInfo->seq2 = currParams->seq2; - gappilyInfo->rev2 = currParams->rev2; - gappilyInfo->alignmentHashesSeen = 0; - } - - abortQuery = !start_one_strand (target, targPositions, query, - /* empty anchors */ emptyAnchors, - /* prev anchor count */ 0, - hitProc, voidHitProcInfo); - emptyAnchors = false; - if (abortQuery) goto cleanup_query; - - // finish alignment for "forward" strand hits, unless we are collecting - // HSPs from both strands (such as when HSPs are adaptive); note that - // if we are only interested in the best so-many HSPs, we choose them - // here - - if (!collectHspsFromBoth) - { - if (currParams->numBestHsps > 0) - { - originalNumAnchors = anchors->len; - choose_best_anchors (currParams->numBestHsps); - dbg_show_hsp_counts_2; - } - - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, targCensus); - } - - plus_strand_finished: - - // search for "reverse" strand hits; note that if we are working on a - // chore that is restricted to the + strand, we skip this stage - - if ((applyChore) && (query->chore.qStrand == 0)) - goto minus_strand_finished; - - if (applyChore) - { - query->chore.queryInterval = resolve_chore_query (query, '-'); - hitProcInfo->queryInterval = query->chore.queryInterval; - debugChoreFilter_1; - } - - if (currParams->whichStrand > 0) - { - // $$$ see [Note S] above - - if (currParams->verbosity >= 1) - fprintf (stderr, "searching for matches in %s, (reverse strand)\n", - query->filename); - - rev_comp_sequence (query, currParams->scoring->qToComplement); - if ((currParams->hspImmediate) - && (currParams->gappedExtend)) - strncpy_reverse (/* to */ (char*) currParams->rev2, - /* from */ 
(char*) query->v, - /* size */ query->len); - - if (dbgDumpQuerySequence2) - { fprintf (stderr, "query, reverse:\n"); dump_sequence (stderr, query); } - - prevAnchorCount = 0; - if (collectHspsSeparately) - { - prevAnchorCount = anchors->len; - swap_anchor_sets (); // + strand moved to secondary anchors - empty_segment_table (anchors); // - strand will be collected in anchors - limit_segment_table (anchors, 0); - } - - abortQuery = !start_one_strand (target, targPositions, query, - /* empty anchors */ emptyAnchors || (!collectHspsFromBoth), - prevAnchorCount, - hitProc, voidHitProcInfo); - if (abortQuery) goto cleanup_query; - - // finish alignment for "reverse" strand hits; note that if we are - // only interested in the best so-many HSPs, we choose them here - // (the choice is made on the combined HSPs from both strands, - // because collectHspsFromBoth and collectHspsSeparately are true) - - if (currParams->numBestHsps > 0) - { - originalNumAnchors = anchors->len; - choose_best_anchors (currParams->numBestHsps); - dbg_show_hsp_counts_2; - } - - if ((collectHspsFromBoth) && (!collectHspsSeparately)) - split_anchors (query->revCompFlags); // - strand kept in anchors; - // .. + strand moved to secondary anchors - - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, targCensus); - - if (collectHspsFromBoth) - { - swap_anchor_sets (); // - strand moved to anchors - // .. 
+ strand discarded (moved to secondary anchors) - // we have to reverse query for subsequent call to finish_one_strand() - rev_comp_sequence (query, currParams->scoring->qToComplement); - } - } - - minus_strand_finished: - - // if we were collecting HSPs from both strands, finish alignment for - // "forward" strand hits; note that if we are only interested in the - // best so-many HSPs, they have been chosen before we reach this point - - if (collectHspsFromBoth) - finish_one_strand (target, targetRev, targPositions, query, NULL, - traceback, targCensus); - - cleanup_query: - - if ((currParams->hspImmediate) - && (currParams->gappedExtend) - && (currParams->rev2 != NULL)) - { - // $$$ see [Note S] above; we might do better by keeping this until - // $$$ .. we see whether the next chore is for the same query - - free_if_valid ("reverse query (currParams->rev2)", currParams->rev2); - currParams->rev2 = NULL; - } - } - - if (currParams->anchorsFile != NULL) - { - // make sure all segments were read; this will generate an error - // message if there are any remaining segments in the file - - read_segment_table (currParams->anchorsFile, currParams->anchorsFilename, - NULL, NULL, NULL); - } - - endClock = clock(); - - if (currParams->maskingFilename != NULL) - { - pmiInfo pmi1; - - if (currParams->maskingFile == NULL) - currParams->maskingFile = fopen_or_die (currParams->maskingFilename, "wt"); - - pmi1.f = currParams->maskingFile; - pmi1.seq = target; - if (currParams->masking3Fields) - report_census_intervals (targCensus, print_masking_interval_3, &pmi1); - else - report_census_intervals (targCensus, print_masking_interval, &pmi1); - } - - if (currParams->softMaskedFilename != NULL) - { - pmiInfo pmi2; - - if (currParams->softMaskedFile == NULL) - currParams->softMaskedFile = fopen_or_die (currParams->softMaskedFilename, "wt"); - - pmi2.f = currParams->softMaskedFile; - pmi2.seq = target; - if (currParams->softMasked3Fields) - report_masked_intervals (target, -1, 
print_masking_interval_3, &pmi2); - else - report_masked_intervals (target, -1, print_masking_interval, &pmi2); - } - - print_m_stanza (targCensus); - if (currParams->reportCensus) - { - u32 savedThresh = targCensus->maskThresh; - targCensus->maskThresh = 0; - if (currParams->censusFilename == NULL) - print_census_stanza (targCensus); - else - { - if (currParams->censusFile == NULL) - currParams->censusFile = fopen_or_die (currParams->censusFilename, "wt"); - print_census (currParams->censusFile, target, targCensus, '\t'); - } - targCensus->maskThresh = savedThresh; - } - - if ((seed_search_dbgSearchLimitExceeded > 0) - && ((currParams->searchLimitWarn) || (hitProcInfo->reporter != gappily_extend_hsps))) - { - // nota bene: this report is diaabled for gappily_extend_hsps because - // the reported count would be incorrect - if (seed_search_dbgSearchLimitExceeded == 1) - fprintf (stderr, "1 query exceeded the"); - else - fprintf (stderr, "%d queries exceeded the", - seed_search_dbgSearchLimitExceeded); - if (hitProcInfo->reporter == gappily_extend_hsps) - fprintf (stderr, " limit of qualifying alignments\n"); - else - fprintf (stderr, " HSP limit\n"); - } - -#ifdef collect_stats - if (currParams->gfExtend != gfexNoExtend) - { - print_generic (currParams->outputFile, - "gap_free_extensions=%" PRId64, - seed_search_hsps() - + seed_search_low_scoring_hsps()); - if (!currParams->reportStats) - print_generic (currParams->outputFile, - "bp_extended=%" PRId64, - seed_search_bp_extended()); - } -#endif // collect_stats - - if (currParams->reportStats) - { - seed_search_generic_stats (currParams->outputFile, print_generic); - if (query->fileType == seq_type_qdna) - quantum_generic_stats (currParams->outputFile, print_generic); - chain_generic_stats (currParams->outputFile, print_generic); - gapped_extend_generic_stats (currParams->outputFile, print_generic); - if (currParams->innerSeed != NULL) - tweener_generic_stats (currParams->outputFile, print_generic); - if 
(currParams->dynamicMasking > 0) - masking_generic_stats (currParams->outputFile, print_generic); - if (currParams->inferScores) - infer_scores_generic_stats (currParams->outputFile, print_generic); - } - -// nota bene: printing a job footer is problematic if there are several -// sequences in the target file, because we won't know until a -// little later if this was the last target; moreover, we might -// no longer have all the info required to print the footer, -// because some info gets discarded as each target is processed - -#ifndef allowSeveralTargets - print_job_footer (); -#else - if (!haveSeveralTargets) - print_job_footer (); -#endif // allowSeveralTargets - -show_stats_and_clean_up: - { - float runTime = ((float)(endClock-startClock))/CLOCKS_PER_SEC; - if (currParams->reportTiming) - print_generic (currParams->outputFile, "runtime=%.3f", runTime); - lastz_set_stat (runTime, runTime); - } - - sequence_show_stats (statsF); - capsule_show_stats (statsF); - seed_search_show_stats (statsF); - if ((query != NULL) && (query->fileType == seq_type_qdna)) - quantum_show_stats (statsF); - chain_show_stats (statsF); - gapped_extend_show_stats (statsF); - if (currParams->innerSeed != NULL) - tweener_show_stats (statsF); - if (currParams->dynamicMasking > 0) - { - pos_table_show_stats_after (statsF); - masking_show_stats (statsF); - } - if (currParams->inferScores) - infer_scores_show_stats (statsF); - lastz_show_stats (statsF); - -#ifndef allowSeveralTargets - if (currParams->endComment) -#else - if ((!haveSeveralTargets) && (currParams->endComment)) -#endif // allowSeveralTargets - { - print_eof_comment (); - - if (currParams->maskingFilename != NULL) - { - if (currParams->maskingFile == NULL) - currParams->maskingFile = fopen_or_die (currParams->maskingFilename, "wt"); - fprintf (currParams->maskingFile, "# lastz end-of-file\n"); - } - - if (currParams->softMaskedFilename != NULL) - { - if (currParams->softMaskedFile == NULL) - currParams->softMaskedFile = 
fopen_or_die (currParams->softMaskedFilename, "wt"); - fprintf (currParams->softMaskedFile, "# lastz end-of-file\n"); - } - } - - ////////// - // go back and handle the next target, if there are any - ////////// - -#ifdef allowSeveralTargets - if (haveSeveralTargets) - { - int haveAnotherTarget = false; - - while (true) - { - if (!load_sequence (target)) break; - numTargets++; - if (target->len != 0) { haveAnotherTarget = true; break; } - } - - if (haveAnotherTarget) - { - if (freeTargetRev) - { free_if_valid ("targetRev", targetRev); targetRev = NULL; } - free_if_valid ("lzParams.rev2", lzParams.rev2); lzParams.rev2 = NULL; - free_position_table (targPositions); targPositions = NULL; - free_segment_table (anchors); anchors = NULL; - free_segment_table (secondaryAnchors); secondaryAnchors = NULL; - free_if_valid ("targCensus", targCensus); targCensus = NULL; - free_seed_hit_search (); - free_quantum_search (); - goto next_target; - } - - print_job_footer (); - if (currParams->endComment) - { - print_eof_comment (); - - if (currParams->maskingFilename != NULL) - { - if (currParams->maskingFile == NULL) - currParams->maskingFile = fopen_or_die (currParams->maskingFilename, "wt"); - fprintf (currParams->maskingFile, "# lastz end-of-file\n"); - } - - if (currParams->softMaskedFilename != NULL) - { - if (currParams->softMaskedFile == NULL) - currParams->softMaskedFile = fopen_or_die (currParams->softMaskedFilename, "wt"); - fprintf (currParams->softMaskedFile, "# lastz end-of-file\n"); - } - } - } -#endif // allowSeveralTargets - - ////////// - // clean up - // - // note that we don't bother to dispose of allocated memory unless we are - // going to be running the valgrind memory checker - ////////// - - memory_checkpoint ("[[* Cleanup ]]\n"); - -#if ((defined trackMemoryUsage) || (defined valgrindMemoryCheck)) - - free_if_valid ("lz.outputFilename", lzParams.outputFilename); lzParams.outputFilename = NULL; - fclose_if_valid (lzParams.outputFile); lzParams.outputFile = 
NULL; - free_if_valid ("lz.dotplotFilename", lzParams.dotplotFilename); lzParams.dotplotFilename = NULL; - fclose_if_valid (lzParams.dotplotFile); lzParams.dotplotFile = NULL; - free_if_valid ("lz.dotplotKeys", lzParams.dotplotKeys); lzParams.dotplotKeys = NULL; - free_if_valid ("lz.seq1Filename", lzParams.seq1Filename); lzParams.seq1Filename = NULL; - free_sequence (lzParams.seq1); lzParams.seq1 = NULL; - if (freeTargetRev) - { free_if_valid ("targetRev", targetRev); targetRev = NULL; } - free_if_valid ("lz.seq2Filename", lzParams.seq2Filename); lzParams.seq2Filename = NULL; - free_sequence (lzParams.seq2); lzParams.seq2 = NULL; - free_if_valid ("lzParams.rev2", lzParams.rev2); lzParams.rev2 = NULL; - free_if_valid ("lz.args", lzParams.args); lzParams.args = NULL; - free_score_set ("lz.scoring", lzParams.scoring); lzParams.scoring = NULL; - free_score_set ("lz.maskedScoring", lzParams.maskedScoring); lzParams.maskedScoring = NULL; - free_position_table (targPositions); targPositions = NULL; - free_segment_table (anchors); anchors = NULL; - free_segment_table (secondaryAnchors); secondaryAnchors = NULL; - free_traceback (traceback); traceback = NULL; - free_traceback_rows (); - free_segment_batches (); - free_if_valid ("alignmentHashes", alignmentHashes); alignmentHashes = NULL; - - free_if_valid ("targCensus", targCensus); targCensus = NULL; - free_seeds (lzParams.hitSeed); lzParams.hitSeed = NULL; - fclose_if_valid (lzParams.capsuleFile); lzParams.capsuleFile = NULL; - free_if_valid ("lz.capsuleFilename", lzParams.capsuleFilename); lzParams.capsuleFilename = NULL; - close_capsule_file (lzParams.capsule); lzParams.capsule = NULL; - fclose_if_valid (lzParams.anchorsFile); lzParams.anchorsFile = NULL; - free_if_valid ("lz.anchorsFilename", lzParams.anchorsFilename); lzParams.anchorsFilename = NULL; - free_if_valid ("lz.choresFilename", lzParams.choresFilename); lzParams.choresFilename = NULL; - free_seeds (lzParams.innerSeed); lzParams.innerSeed = NULL; - 
free_if_valid ("lz.outputInfo", lzParams.outputInfo); lzParams.outputInfo = NULL; - free_if_valid ("lz.readGroup", lzParams.readGroup); lzParams.readGroup = NULL; - free_if_valid ("lz.samRGTags", lzParams.samRGTags); lzParams.samRGTags = NULL; - fclose_if_valid (lzParams.maskingFile); lzParams.maskingFile = NULL; - free_if_valid ("lz.maskingFilename", lzParams.maskingFilename); lzParams.maskingFilename = NULL; - fclose_if_valid (lzParams.softMaskedFile); lzParams.softMaskedFile = NULL; - free_if_valid ("lz.softMaskedFilename", lzParams.softMaskedFilename); lzParams.softMaskedFilename = NULL; - fclose_if_valid (lzParams.censusFile); lzParams.censusFile = NULL; - free_if_valid ("lz.censusFilename", lzParams.censusFilename); lzParams.censusFilename = NULL; - fclose_if_valid (lzParams.statsFile); lzParams.statsFile = NULL; - free_seed_hit_search (); - free_quantum_search (); - - free_if_valid ("iz.inferFilename", izParams.ic.inferFilename); izParams.ic.inferFilename = NULL; - free_seeds (izParams.hitSeed); izParams.hitSeed = NULL; - free_score_set ("iz.scoring", izParams.scoring); izParams.scoring = NULL; - free_score_set ("iz.maskedScoring", izParams.maskedScoring); izParams.maskedScoring = NULL; - -#endif // trackMemoryUsage or valgrindMemoryCheck - - // report timing stats - - dbg_timing_add (debugClockTotal); - dbg_timing_add (debugClockQueryTotal); - - dbg_timing_report (debugClockTotal, "total run time"); - dbg_timing_report (debugClockSeq1, "sequence 1 I/O"); - dbg_timing_report (debugClockPosTable, "seed position table"); - dbg_timing_report (debugClockSeq2, "sequence 2 I/O"); - dbg_timing_report (debugClockSegTable, "seed hit search"); - dbg_timing_report (debugClockChaining, "chaining"); - dbg_timing_report (debugClockGappedExtend, "gapped extension"); - dbg_timing_report (debugClockInterpolation, "interpolation"); - dbg_timing_report (debugClockOutput, "output"); - dbg_timing_report (debugClockQueryTotal, "total query time"); - -#ifdef dbgTiming - { - float 
perQuery; - - perQuery = ((float) debugClockQueryTotal) / numChores; - perQuery /= clocksPerSec; - - fprintf (stderr, "%-26s %d\n", - "queries:", numChores); - fprintf (stderr, "%-26s %.3f (%.1f per second)\n", - "per query (with I/O):", perQuery, 1/perQuery); - - debugClockQueryTotal -= debugClockSeq2; - perQuery = ((float) debugClockQueryTotal) / numChores; - perQuery /= clocksPerSec; - - fprintf (stderr, "%-26s %.3f (%.1f per second)\n", - "per query (w/o input):", perQuery, 1/perQuery); - } -#endif // dbgTiming - -#ifdef dbgTimingGappedExtend - gapped_extend_timing_report (stderr); -#endif // dbgTimingGappedExtend - - if (dbgReportFinish) - fprintf (stderr, "lastz has finished successfully\n"); - - return EXIT_SUCCESS; - } - -//---------- -// -// report_progress-- -// Report job progress to the user (via stderr). -// -//---------- -// -// Arguments: -// seq* target, query: The sequences being aligned. -// int applyChore: true if the job is controlled by chores. -// int numQueries: A count of the number of queries that have been -// .. done. -// int numChores: A count of the number of chores that have been -// .. done, if any. -// hitprocinfo* hitProcInfo: (not used in this version of the program) -// -// Returns: -// true if a progress report was made; false if it wasn't time to make a -// report. 
-// -//---------- - -static int report_progress - (seq* target, - seq* query, - int applyChore, - int numQueries, - int numChores, - arg_dont_complain (hitprocinfo* hitProcInfo)) - { - int reportProgressNow; - float secs; - int hours, mins; - - // decide whether it's time to report - - reportProgressNow = false; - if (dbgQueryProgress != 0) - { - if (dbgQueryProgress == 1) - reportProgressNow = true; - else if (applyChore) - reportProgressNow = (numChores % dbgQueryProgress == 1); - else - reportProgressNow = (numQueries % dbgQueryProgress == 1); - } - - if (!reportProgressNow) - return false; - - // make the report - - dbgQueryProgressClock += (s64) read_clock(); - secs = ((float)(dbgQueryProgressClock)) / clocksPerSec; - dbgQueryProgressClock = -((s64) read_clock()); - - fprintf (stderr, "%s", dbgQueryProgressPrefix); - - if (secs < 60) - fprintf (stderr, "(%.3fs) ", secs); - else if (secs < 3600) - { - mins = secs / 60; - secs -= 60 * mins; - fprintf (stderr, "(%dm%06.3fs) ", mins, secs); - } - else - { - mins = secs / 60; - secs -= 60 * mins; - hours = mins / 60; - mins -= 60 * hours; - fprintf (stderr, "(%dh%02dm%06.3fs) ", hours, mins, secs); - } - - if (applyChore) - { - fprintf (stderr, "processing chore %s (query %d.%d)", - commatize(numChores), - numQueries, query->chore.num); - if ((query->shortHeader != NULL) && (!query->useFullNames)) - fprintf (stderr, ": %s", query->shortHeader); - else if (query->header != NULL) - fprintf (stderr, ": %s", query->header); - -#ifdef debugChoreChecksum - fprintf (stderr, " checksum=%08X", hassock_hash (query->v, query->len)); -#endif // debugChoreChecksum - - if (query->chore.tSubrange) - fprintf (stderr, " %s " unsposFmt " " unsposFmt, - query->chore.tName, - query->chore.tStart, query->chore.tEnd); - else - fprintf (stderr, " %s * *", - query->chore.tName); - - if (query->chore.qSubrange) - fprintf (stderr, " %s " unsposFmt " " unsposFmt, - query->nextContigName, - query->chore.qStart, query->chore.qEnd); - else - 
fprintf (stderr, " %s * *", - query->nextContigName); - - if (query->chore.qStrand == 0) fprintf (stderr, " +"); - else if (query->chore.qStrand < 0) fprintf (stderr, " -"); - - debugChoreFilter_2; - - if (query->chore.idTag[0] != 0) - fprintf (stderr, " id=%s", query->chore.idTag); - } - else - { - fprintf (stderr, "processing query %s", commatize(numQueries)); - if ((query->shortHeader != NULL) && (!query->useFullNames)) - fprintf (stderr, ": %s", query->shortHeader); - else if (query->header != NULL) - fprintf (stderr, ": %s", query->header); - - if (dbgQueryProgressWithMasking) - { - unspos targLen = target->len; - if (target->partition.p != NULL) // (sequence 1 is partitioned) - targLen -= target->partition.len; - unspos maskedBases = count_masked_bases (target, -1); - fprintf(stderr, ", masked %s/%s (%.1f%%)", - commatize(maskedBases), commatize(targLen), - (100.0*maskedBases) / targLen); - } - - } - fprintf (stderr, "\n"); - - return true; - } - -//---------- -// -// capsule_target-- -// Hook up the target sequence from a capsule. -// -//---------- -// -// Arguments: -// capinfo* cap: The capsule info record. -// u8** targetRev: Place to return a pointer to the reverse sequence. -// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). 
-// -//---------- - -static seq* capsule_target - (capinfo* cap, - u8** _targetRev) - { - seq* target; - u8* fwd, *rev; - char* name; - capseqinfo* info; - cappartition* partitions; - char* namePool; - u64 fwdSize, revSize, nameSize, infoSize, poolSize; - u64 partExpected, partExpectedOld, partSize; - u32 ix; - - // locate the mapped forward sequence - - fwd = locate_capsule_data (cap, cap_seqForward, NULL, &fwdSize); - - if (fwd == NULL) - suicide ("bad capsule file (missing sequence)"); - - if (fwdSize == 0) - suicide ("bad capsule file, sequence length is zero"); - if (fwdSize != (unspos) fwdSize) - suicidef ("bad capsule file, sequence length too large (0x%s)", - hex_64_string(fwdSize)); - - if (fwd[fwdSize-1] != 0) - suicidef ("bad capsule file, sequence not properly terminated (0x2X)", - fwd[fwdSize-1]); - - // locate the mapped reverse sequence - - rev = NULL; // (placate compiler) - if (_targetRev != NULL) - { - rev = locate_capsule_data (cap, cap_seqReverse, NULL, &revSize); - - if (rev == NULL) - suicide ("bad capsule file (missing reverse sequence)"); - if (revSize != fwdSize) - suicidef ("bad capsule file, sequence lengths disagree (forward 0x%s, reverse 0x%s)", - hex_64_string(fwdSize), hex_64_string(revSize)); - - if (rev[fwdSize-1] != 0) - suicidef ("bad capsule file, reverse sequence not properly terminated (0x2X)", - rev[fwdSize-1]); - } - - // locate the mapped name - - name = locate_capsule_data (cap, cap_seqName, NULL, &nameSize); - - if (name != NULL) - { - if (name[nameSize-1] != 0) - suicidef ("bad capsule file, sequence name not properly terminated (0x2X)", - name[nameSize-1]); - } - - // locate the mapped sequence info - - info = locate_capsule_data (cap, cap_seqInfo, NULL, &infoSize); - - if (info == NULL) - suicide ("bad capsule file (missing sequence info)"); - - if (infoSize != sizeof(capseqinfo)) - suicidef ("bad capsule file sequence info (expected size 0x%s, actual 0x%s)", - hex_64_string(sizeof(capseqinfo)), 
hex_64_string(infoSize)); - if (info->startLoc == 0) - suicidef ("bad capsule file sequence info (start = 0)"); - if (info->contig == 0) - suicidef ("bad capsule file sequence info (contig number = 0)"); - if ((info->revCompFlags & (~rcf_revcomp)) != 0) - suicidef ("bad capsule file sequence info (rev comp flags = %s)", - hex_64_string(sizeof(info->revCompFlags))); - - // locate the partition info, if needed - - partitions = NULL; - namePool = NULL; - if (info->numPartitions != 0) - { - partExpected = ((u64) (info->numPartitions+1)) * sizeof(cappartition); - partExpectedOld = ((u64) (info->numPartitions+1)) * sizeof(cappartitionold); - - partitions = locate_capsule_data (cap, cap_partitions, NULL, &partSize); - - if (partitions == NULL) - suicide ("bad capsule file (missing sequence partitions)"); - - if (partSize == partExpectedOld) - // $$$ this could be handled by - // $$$ .. (1) copying the old-format partition list to a new one - // $$$ .. (2) adding an 'owner' field to determine whether the - // $$$ .. partition list should be dealloc'd or not - // $$$ .. However, I doubt if anyone is in posession of an old- - // $$$ .. format capsule file, so why bother? 
- suicidef ("outdated capsule file, paritions[] length mismatch (expected 0x%s, actual 0x%s)\n" - "recreate capsule file using lastz 1.02.43 or newer", - hex_64_string(partExpected), hex_64_string(partSize)); - else if (partSize != partExpected) - suicidef ("bad capsule file, paritions[] length mismatch (expected 0x%s, actual 0x%s)", - hex_64_string(partExpected), hex_64_string(partSize)); - - // locate partition names - - namePool = locate_capsule_data (cap, cap_partitionNames, NULL, &poolSize); - - if (namePool == NULL) - suicide ("bad capsule file (missing sequence partition names)"); - - for (ix=0 ; ixnumPartitions ; ix++) - { - if (partitions[ix].header >= poolSize) - suicidef ("bad capsule file, paritionName[%d] beyond array (0x%s >= 0x%s)", - ix, - hex_64_string(partitions[ix].header), - hex_64_string(poolSize)); - } - } - - // create a new sequence record and hook up the mapped data - - target = new_sequence (seqposInfinity); - - target->v = fwd; - target->vOwner = false; - target->size = fwdSize; - target->len = fwdSize-1; - target->header = name; - target->shortHeader = name; - - target->startLoc = info->startLoc; - target->trueLen = info->trueLen; - target->needTrueLen = true; - if (currParams->needTrueLengths != true) - suicidef ("internal error, in capsule_target, needTrueLengths is not == true"); - target->revCompFlags = info->revCompFlags; - target->contig = info->contig; - - // hook up the partition info, if needed - - if (info->numPartitions != 0) - { - seqpartition* sp = &target->partition; - - sp->p = (partition*) partitions; - sp->size = info->numPartitions + 1; - sp->len = info->numPartitions; - sp->pool = namePool; - sp->poolSize = poolSize; - sp->poolLen = poolSize; - sp->poolOwner = false; - sp->state = seqpart_ready; - } - - // success! - - if (_targetRev != NULL) *_targetRev = rev; - return target; - } - -//---------- -// -// capsule_position_table-- -// Hook up the target seed word position table from a capsule. 
-// -//---------- -// -// Arguments: -// capinfo* cap: The capsule info record. -// seq* seq: The sequence the position table in built for. -// seed* hitSeed: The seed-word the table is based on. -// u32 step: The step size the table is based on. -// -// Returns: -// A pointer to the position table; failures result in fatality. The caller -// must eventually de-allocate this by calling free_position_table(). -// -//---------- - -static postable* capsule_position_table - (capinfo* cap, - seq* seq, - seed* hitSeed, - u32 step) - { - postable* pt; - unspos prevEntries; - unspos* last, *prev; - u32* asBits; - u64 lastExpected, prevExpected, bitsExpected; - u64 lastSize, prevSize, bitsSize; - - if (sizeof(unspos) != sizeof(u32)) - suicide ("internal error, capsule expects positions to be 32 bits"); - - // figure out how many bytes to expect - - lastExpected = (((u64) 1) << hitSeed->weight) * sizeof(unspos); - prevEntries = 1 + (seq->len / step); - prevExpected = ((u64) prevEntries) * sizeof(unspos); - - // locate the mapped last[] array - - last = locate_capsule_data (cap, cap_lastPosTable, NULL, &lastSize); - - if (last == NULL) - suicide ("bad capsule file (missing last[] array)"); - - if (lastSize != lastExpected) - suicidef ("bad capsule file, last[] length mismatch (expected 0x%s, actual 0x%s)", - hex_64_string(lastExpected), hex_64_string(lastSize)); - - // locate the mapped prev[] array - - prev = locate_capsule_data (cap, cap_prevPosTable, NULL, &prevSize); - - if (prev == NULL) - suicide ("bad capsule file (missing prev[] array)"); - - if (prevSize != prevExpected) - suicidef ("bad capsule file, prev[] length mismatch (expected 0x%s, actual 0x%s)", - hex_64_string(prevExpected), hex_64_string(prevSize)); - - // locate the mapped sequence bits - - asBits = NULL; - if (hitSeed->type == 'R') - { - asBits = locate_capsule_data (cap, cap_seqBits, NULL, &bitsSize); - - if (asBits == NULL) - suicide ("bad capsule file (missing sequence bits[] array)"); - - 
bitsExpected = round_up_16((seq->len+3) / 4); - if (bitsSize != bitsExpected) - suicidef ("bad capsule file, sequence bits[] length mismatch (expected 0x%s, actual 0x%s)", - hex_64_string(bitsExpected), hex_64_string(bitsSize)); - } - - // create a new position table record and hook up the mapped data - - pt = new_position_table (hitSeed->weight, 0, seq->len, step, - false, false, false); - - pt->last = last; - pt->prev = prev; - pt->asBits = asBits; - - // success! - - return pt; - } - -//---------- -// -// resolve_chore_target, resolve_chore_query-- -// Convert a chore to the corresponding position on the target (or query) -// sequence. As a side effect, we validate the chore's target (or query). -// -//---------- -// -// Arguments: -// (for resolve_chore_target) -// chore* chore: The unadulterated chore, as defined within the chores file. -// seq* target: The sequence being searched. -// -// (for resolve_chore_query) -// seq* query: The sequence(s) being searched for. -// char strand: '+' => map interval for forward strand -// '-' => map interval for reverse strand -// -// Returns: -// The interval corresponding to the chore. This is origin-zero half-open, -// and relative to target->v[] (or query->v[]). -// -//---------- -// -// Implementation notes: -// -// tSeqStart is in-file position of start of the resident piece-of-sequence -// (origin zero). -// -// tOffset is index into target->v[] of the start of the resident piece-of- -// sequence. -// -// qSeqStart and qOffset have similar meaning. -// -// We ignore qStrand information and use the strand provided by the caller. 
-// -//---------- - -static interval resolve_chore_target - (chore* _chore, - seq* target) - { - seqpartition* tSp = &target->partition; - partition* tNamePart, *tPart; - int nameIsWildcard; - char* tHeader; - unspos tStart, tEnd; - unspos tSeqStart, tSeqEnd, tOffset, tLen; - interval tChore = {0,0}; - - nameIsWildcard = (_chore->tName[0] == 0); - - tStart = _chore->tStart - 1 ; - tEnd = _chore->tEnd; - - if (tSp->p == NULL) // target is not partitioned - { - tHeader = (target->useFullNames)? target->header : target->shortHeader; - if ((!nameIsWildcard) - && (strcmp (_chore->tName, tHeader) != 0)) - goto target_mismatch; - - if (!_chore->tSubrange) // no interval specified - { - tChore.s = 0; // return full sequence - tChore.e = target->len; - } - else // sub-interval specified - { - tSeqStart = target->startLoc - 1; - tOffset = 0; - tLen = target->len; - tSeqEnd = tSeqStart + tLen; - if (tStart < tSeqStart) goto interval_before_start; - if (tEnd > tSeqEnd) goto interval_after_end; - tChore.s = tOffset + tStart - tSeqStart; - tChore.e = tOffset + tEnd - tSeqStart; - } - } - - else if (nameIsWildcard) // target is partitioned and - goto cant_use_wildcard; // .. name is wildcard - - else // target is partitioned and - { // .. 
specific name is given - tNamePart = lookup_named_partition (target, _chore->tName); - if (tNamePart == NULL) goto bad_target_name; - - if (!_chore->tSubrange) // no interval specified - { - tStart = tNamePart->startLoc - 1; - tPart = lookup_partition_seq_pos (target, tNamePart, tStart+1); - if (tPart == NULL) goto bad_position; // (this would be an internal error) - tChore.s = tPart->sepBefore + 1; // return full partition - tChore.e = tPart->sepAfter; - } - else // sub-interval specified - { - tPart = lookup_partition_seq_pos (target, tNamePart, tStart+1); - if (tPart == NULL) goto bad_position; - tSeqStart = tPart->startLoc - 1; - tOffset = tPart->sepBefore + 1; - tLen = tPart->sepAfter - tOffset; - tSeqEnd = tSeqStart + tLen; - if (tStart < tSeqStart) goto interval_before_start; - if (tEnd > tSeqEnd) goto interval_after_end; - tChore.s = tOffset + tStart - tSeqStart; - tChore.e = tOffset + tEnd - tSeqStart; - } - } - - return tChore; - - ////////// - // failure exits - ////////// - -cant_use_wildcard: - suicidef ("wildcard target in chore can't be used with a multiple sequence target file (%s)", - target->filename); - return tChore; // (never gets here) - -target_mismatch: - suicidef ("chore target %s is mismatch for %s in target file (%s)", - _chore->tName, target->header, target->filename); - return tChore; // (never gets here) - -bad_target_name: - suicidef ("chore target %s does not exist in target file (%s)", - _chore->tName, target->filename); - return tChore; // (never gets here) - -bad_position: - suicidef ("bad chore target name/position: %s " unsposFmt, - _chore->tName, tStart+1); - return tChore; // (never gets here) - -interval_before_start: - suicidef ("chore target interval out of range on %s (" unsposFmt "<" unsposFmt ")", - _chore->tName, tStart+1, tSeqStart+1); - return tChore; // (never gets here) - -interval_after_end: - suicidef ("chore target interval out of range on %s (" unsposFmt ">" unsposFmt ")", - _chore->tName, tEnd, tSeqEnd); - 
return tChore; // (never gets here) - } - - -// resolve_chore_query-- - -static interval resolve_chore_query - (seq* query, - char strand) - { - seqpartition* qSp = &query->partition; - partition* qNamePart, *qPart; - chore* _chore = &query->chore; - unspos qStart, qEnd; - unspos qSeqStart, qSeqEnd, qOffset, qLen; - interval qChore = {0,0}; - - qStart = _chore->qStart - 1; - qEnd = _chore->qEnd; - - if (qSp->p == NULL) // query is not partitioned - { - if (!_chore->qSubrange) // no interval specified - { - qChore.s = 0; // return full sequence - qChore.e = query->len; - } - else // sub-interval specified - { - qSeqStart = query->startLoc - 1; - qOffset = 0; - qLen = query->len; - qSeqEnd = qSeqStart + qLen; - if (qStart < qSeqStart) goto interval_before_start; - if (qEnd > qSeqEnd) goto interval_after_end; - if (strand != '-') - { // positive strand - qChore.s = qOffset + qStart - qSeqStart; - qChore.e = qOffset + qEnd - qSeqStart; - } - else - { // negative strand - qChore.s = qOffset + qSeqEnd - qEnd; - qChore.e = qOffset + qSeqEnd - qStart; - } - } - } - - else if (!_chore->qSubrange) // query is partitioned and - { // .. 
no interval specified - qNamePart = lookup_named_partition (query, query->nextContigName); - if (qNamePart == NULL) goto bad_query_name; - qPart = last_partition_with_name (query, qNamePart); - qOffset = qNamePart->sepBefore + 1; - qLen = qPart->sepAfter - qOffset; - qChore.s = qOffset; // return full sequence - qChore.e = qOffset + qLen; - } - - else // query is partitioned - { - qNamePart = lookup_named_partition (query, query->nextContigName); - if (qNamePart == NULL) goto bad_query_name; - - qPart = lookup_partition_seq_pos (query, qNamePart, qStart+1); - if (qPart == NULL) goto bad_position; - qSeqStart = qPart->startLoc - 1; - qOffset = qPart->sepBefore + 1; - qLen = qPart->sepAfter - qOffset; - qSeqEnd = qSeqStart + qLen; - if (qStart < qSeqStart) goto interval_before_parition_start; - if (qEnd > qSeqEnd) goto interval_after_parition_end; - - if (strand != '-') - { // positive strand - qChore.s = qOffset + qStart - qSeqStart; - qChore.e = qOffset + qEnd - qSeqStart; - } - else - { // negative strand - qChore.s = qOffset + qSeqEnd - qEnd; - qChore.e = qOffset + qSeqEnd - qStart; - } - } - - return qChore; - - ////////// - // failure exits - ////////// - -bad_query_name: - suicidef ("INTERNAL ERROR. 
chore query %s does not exist in query file (%s)", - query->nextContigName, query->filename); - return qChore; // (never gets here) - -bad_position: - suicidef ("bad chore query name/position: %s " unsposFmt, - query->nextContigName, qStart+1); - return qChore; // (never gets here) - -interval_before_start: - suicidef ("chore query interval out of range on %s (" unsposFmt "<" unsposFmt ")", - query->nextContigName, qStart+1, qSeqStart+1); - return qChore; // (never gets here) - -interval_after_end: - suicidef ("chore query interval out of range on %s (" unsposFmt ">" unsposFmt ")", - query->nextContigName, qEnd, qSeqEnd); - return qChore; // (never gets here) - -interval_before_parition_start: - suicidef ("chore query interval beyond partition range on %s" - " (" unsposFmt "<" unsposFmt ".." unsposFmt ")", - query->nextContigName, qStart+1, qSeqStart+1, qSeqEnd); - return qChore; // (never gets here) - -interval_after_parition_end: - suicidef ("chore query interval beyond partition range on %s" - " (" unsposFmt ">" unsposFmt ".." unsposFmt ")", - query->nextContigName, qEnd, qSeqStart+1, qSeqEnd); - return qChore; // (never gets here) - - } - -//---------- -// -// set_up_hit_processor-- -// Set up variables that select and control the appropriate seed hit processor -// function. -// -//---------- -// -// Arguments: -// control* params: Parameter set controlling the desired -// .. search. -// int collectingCensus: true => caller will be collecting a -// .. census. -// hitprocessor* hitProc: Place to return the function to call for -// .. each hit to determine if it is 'good -// .. enough'. -// void** hitProcInfo: Place to return a value (usually a pointer -// .. to some data) to pass thru with each -// .. call to hitProc. -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// (1) This routine is NOT reentrant, since some of the control data returned -// in (*hitProcInfo) is stored in static variables that are private to this -// routine. 
-// -// (2) As of this writing, there are four "seed hit processor functions", -// namely process_for_plain_hit, process_for_recoverable_hit, -// process_for_simple_hit, and process_for_twin_hit. The code for these is -// in seed_search.c. Each is annotated with this line: -// [[-- a seed hit processor function --]] -// -//---------- - -void set_up_hit_processor - (control* params, - int collectingCensus, - hitprocessor* _hitProc, - void** _hitProcInfo) - { - static hitprocsimple simpleInfo; - static hitproctwin twinInfo; - static hitrepgappily gappilyInfo; - hitprocinfo* hpInfo; - int filtering; - - // decide which hit processor to use when we discover a seed hit - - if (params->twinMinSpan <= 0) - { - if ((params->gfExtend == gfexNoExtend) && (!params->gappedExtend)) - (*_hitProc) = process_for_plain_hit; - else if (params->basicHitType == hitRecover) - (*_hitProc) = process_for_recoverable_hit; - else - (*_hitProc) = process_for_simple_hit; - (*_hitProcInfo) = (void*) &simpleInfo; - hpInfo = &simpleInfo.hp; - } - else - { - (*_hitProc) = process_for_twin_hit; - (*_hitProcInfo) = (void*) &twinInfo; - hpInfo = &twinInfo.hp; - twinInfo.minSpan = params->twinMinSpan; - twinInfo.maxSpan = params->twinMaxSpan; - } - - hpInfo->posFilter = false; - hpInfo->targetInterval.s = hpInfo->targetInterval.e = 0; - hpInfo->queryInterval.s = hpInfo->queryInterval.e = 0; - - params->mergeAnchors = (((*_hitProc) != process_for_plain_hit) - && ((*_hitProc) != process_for_simple_hit)); - - filtering = ((params->minIdentity > 0) - || (params->maxIdentity < 1) - || (params->minCoverage > 0) - || (params->maxCoverage < 1) - || (params->minContinuity > 0) - || (params->maxContinuity < 1) - || (params->minMatchCount > 0) - || (params->maxMismatchCount >= 0) - || (params->maxSeparateGapsCount >= 0) - || (params->maxGapColumnsCount >= 0)); - - // decide how to control that hit processor - - hpInfo->reporter = collect_hsps; - hpInfo->reporterInfo = NULL; - - if ((anchors == NULL) - || 
((params->hspThreshold.t =='S') // (non-adaptive HSP score threshold) - && (params->searchLimit == 0) // (no limit on number of HSPs) - && (!params->chain) // (not chaining) - && (!params->gappedExtend) // (not doing gapped extension) - && (!params->mergeAnchors) -#ifdef densityFiltering - && (params->maxDensity == 0) -#endif // densityFiltering - && (!collectingCensus) - && (!filtering) - && (!dbgSortAnchorsByDiag) - && (dbgShowHspCountsMin == (u32)-1))) - hpInfo->reporter = report_hsps; - - if ((params->hspImmediate) && (!params->gappedExtend)) - { - hpInfo->reporter = collect_filtered_hsps; - hpInfo->reporterInfo = NULL; - } - else if ((params->hspImmediate) && (params->gappedExtend)) - { - hpInfo->reporter = gappily_extend_hsps; - hpInfo->reporterInfo = &gappilyInfo; - - gappilyInfo.seq1 = params->seq1; - gappilyInfo.rev1 = params->rev1; - gappilyInfo.seq2 = NULL; // (can't set this yet) - gappilyInfo.rev2 = NULL; - gappilyInfo.scoring = params->scoring; - gappilyInfo.yDrop = params->yDrop; - gappilyInfo.trimToPeak = (params->yDropUntrimmed == false); - gappilyInfo.scoreThresh = params->gappedThreshold; - gappilyInfo.traceback = params->traceback; - gappilyInfo.minIdentity = params->minIdentity; - gappilyInfo.maxIdentity = params->maxIdentity; - gappilyInfo.minCoverage = params->minCoverage; - gappilyInfo.maxCoverage = params->maxCoverage; - gappilyInfo.minContinuity = params->minContinuity; - gappilyInfo.maxContinuity = params->maxContinuity; - gappilyInfo.minMatchCount = params->minMatchCount; - gappilyInfo.maxMismatchCount = params->maxMismatchCount; - gappilyInfo.maxSeparateGapsCount = params->maxSeparateGapsCount; - gappilyInfo.maxGapColumnsCount = params->maxGapColumnsCount; - gappilyInfo.deGapifyOutput = params->deGapifyOutput; - - if (params->searchLimit <= 1) - { - gappilyInfo.alignmentHashesSeen = 0; - gappilyInfo.alignmentHashes = NULL; - } - else - { - if (alignmentHashes == NULL) - alignmentHashes = (u32*) malloc_or_die ("set_up_hit_processor 
(alignmentHashes)", params->searchLimit * sizeof(u32)); - gappilyInfo.alignmentHashesSize = params->searchLimit; - gappilyInfo.alignmentHashesSeen = 0; - gappilyInfo.alignmentHashes = alignmentHashes; - } - } - - if (params->minMatches >= 0) - { - hpInfo->minMatches = params->minMatches; - hpInfo->maxTransversions = params->maxTransversions; - hpInfo->filterPattern = NULL; - hpInfo->charToBits = params->charToBits; - if (params->filterCaresOnly) - hpInfo->filterPattern = params->hitSeed->pattern; - } - else - { - hpInfo->minMatches = -1; // (no filtering) - hpInfo->charToBits = params->charToBits; - } - - if (params->gfExtend == gfexNoExtend) - { - hpInfo->gfExtend = gfexNoExtend; - hpInfo->seq1 = params->seq1; // (sequences may be - hpInfo->seq2 = params->seq2; // .. needed for filtering) - } - else if ((params->gfExtend == gfexExact) - || ((params->gfExtend >= gfexMismatch_min) - && (params->gfExtend <= gfexMismatch_max))) - { - hpInfo->gfExtend = params->gfExtend; - hpInfo->seq1 = params->seq1; - hpInfo->seq2 = params->seq2; - hpInfo->hspThreshold = params->hspThreshold; - hpInfo->anchors = &anchors; - seed_search_set_stat(isHspSearch,true); - } - else // if (params->gfExtend == gfexXDrop) - { - hpInfo->gfExtend = gfexXDrop; - hpInfo->seq1 = params->seq1; - hpInfo->seq2 = params->seq2; - hpInfo->scoring = params->maskedScoring; - hpInfo->xDrop = params->xDrop; - hpInfo->hspThreshold = params->hspThreshold; - hpInfo->hspZeroThreshold = (params->hspThreshold.t !='S')? 0 - : (params->hspThreshold.s > 0 )? 
params->hspThreshold.s - : 0; - hpInfo->anchors = &anchors; - hpInfo->entropicHsp = params->entropicHsp; - hpInfo->reportEntropy = params->reportEntropy; - seed_search_set_stat(isHspSearch,true); - - if (infer_scores_dbgShowIdentity) - { - printf ("hit_processor xDrop = " scoreFmtSimple "\n", - hpInfo->xDrop); - printf ("hit_processor hspThreshold = %s\n", - score_thresh_to_string (&hpInfo->hspThreshold)); - } - } - -#ifdef snoopHitProc - if (*_hitProc == process_for_plain_hit) fprintf (stderr, "hitProc == process_for_plain_hit (%p)\n", *_hitProc); - else if (*_hitProc == process_for_recoverable_hit) fprintf (stderr, "hitProc == process_for_recoverable_hit (%p)\n", *_hitProc); - else if (*_hitProc == process_for_simple_hit) fprintf (stderr, "hitProc == process_for_simple_hit (%p)\n", *_hitProc); - else if (*_hitProc == process_for_twin_hit) fprintf (stderr, "hitProc == process_for_twin_hit (%p)\n", *_hitProc); - else fprintf (stderr, "hitProc == ??? (%p)\n", *_hitProc); - - if (*_hitProcInfo == &simpleInfo) fprintf (stderr, "hitProcInfo == simpleInfo (%p)\n", *_hitProcInfo); - else if (*_hitProcInfo == &twinInfo) fprintf (stderr, "hitProcInfo == twinInfo (%p)\n", *_hitProcInfo); - else fprintf (stderr, "hitProcInfo == ??? (%p)\n", *_hitProcInfo); - - if (hpInfo->reporter == collect_hsps) fprintf (stderr, "hpInfo->reporter == collect_hsps (%p)\n", hpInfo->reporter); - else if (hpInfo->reporter == report_hsps) fprintf (stderr, "hpInfo->reporter == report_hsps (%p)\n", hpInfo->reporter); - else if (hpInfo->reporter == collect_filtered_hsps) fprintf (stderr, "hpInfo->reporter == collect_filtered_hsps (%p)\n", hpInfo->reporter); - else if (hpInfo->reporter == gappily_extend_hsps) fprintf (stderr, "hpInfo->reporter == gappily_extend_hsps (%p)\n", hpInfo->reporter); - else fprintf (stderr, "hpInfo->reporter == ??? 
(%p)\n", hpInfo->reporter); - - if (hpInfo->reporterInfo == &gappilyInfo) fprintf (stderr, "hpInfo->reporterInfo == gappily_extend_hsps (%p)\n", hpInfo->reporterInfo); - else if (hpInfo->reporterInfo == NULL) fprintf (stderr, "hpInfo->reporterInfo == NULL (%p)\n", hpInfo->reporterInfo); - else fprintf (stderr, "hpInfo->reporterInfo == ??? (%p)\n", hpInfo->reporterInfo); -#endif // snoopHitProc - } - -//---------- -// -// start_one_strand-- -// Start alignment upon one query strand. -// -//---------- -// -// Arguments: -// seq* target: The sequence being searched. -// postable* targPositions: A table of positions of words in target. -// seq* query: The sequence(s) being searched for. -// int emptyAnchors: true => clear the anchors table before -// .. starting. -// u32 prevAnchorCount:Number of anchors previously found. This -// .. is applicable if there is a search limit -// .. and we've found a saved anchors from the -// .. other strand. -// hitprocessor hitProc: Function to call for each hit to determine -// .. if it is 'good enough'. -// void* hitProcInfo: A value (usually a pointer to some data) to -// .. pass thru with each call to hitProc. -// -// Returns: -// false if the caller should abort processing of this query; true if the -// caller should continue. An example of a reason to abort is if the number -// of HSP's for the query exceeds currParams->searchLimit. 
-// -//---------- - -int start_one_strand - (seq* target, - postable* targPositions, - seq* query, - int emptyAnchors, - u32 prevAnchorCount, - hitprocessor hitProc, - void* hitProcInfo) - { - unspos coverageLimit; - u32 searchLimit; -#ifdef densityFiltering - u64 basesHit; -#endif // densityFiltering - int success; - - init_output_for_strand (); - - // if we have a chore, place 'fences' in the target and query; the fences - // will prevent the ungapped extension stage from searching beyond the - // chore interval - - if (query->choresFile != NULL) - { - fence_sequence_interval (target, query->chore.targetInterval, 0); - fence_sequence_interval (query, query->chore.queryInterval, 0); - } - - // if we're to read anchors from a file, do so - - if (currParams->anchorsFile != NULL) - { - if (emptyAnchors) - { - if (currParams->hspThreshold.t == 'S') coverageLimit = 0; - else coverageLimit = currParams->hspThreshold.c; - - empty_segment_table (anchors); - limit_segment_table (anchors, coverageLimit); - } - anchors = read_segment_table - (currParams->anchorsFile, currParams->anchorsFilename, - anchors, target, query); - goto compare_to_anchor_limit; - } - - // find seed hits; depending on hitProc, we may extend these to HSPs; and - // depending on format parameters and hitReporter, we will either report - // these directly to the output, or we will collect them in anchors[] - - dbg_timing_sub (debugClockSegTable); - - if ((emptyAnchors) && (anchors != NULL)) - { - if (currParams->hspThreshold.t == 'S') coverageLimit = 0; - else coverageLimit = currParams->hspThreshold.c; - - empty_segment_table (anchors); - limit_segment_table (anchors, coverageLimit); - } - - searchLimit = currParams->searchLimit; - if ((searchLimit > 0) && (prevAnchorCount > 0)) - { - if (prevAnchorCount < searchLimit) searchLimit -= prevAnchorCount; - else searchLimit = 1; - } - - //fprintf (stderr, "start_one_strand(.,%s%c), searchLimit=%u\n", - // (query->useFullNames)? 
query->header : query->shortHeader, - // ((query->revCompFlags & rcf_rev) == 0)? '+' : '-', - // searchLimit); - - if (query->fileType == seq_type_qdna) - quantum_seed_hit_search (target, targPositions, - query, 0, query->len, - currParams->upperCharToBits, currParams->hitSeed, - currParams->maskedScoring, currParams->ballScore, - hitProc, hitProcInfo); - else - { -#ifndef densityFiltering // === density filtering DISabled - seed_hit_search (target, targPositions, - query, 0, query->len, currParams->selfCompare, - currParams->upperCharToBits, currParams->hitSeed, - searchLimit, - (currParams->searchLimitWarn)? currParams->searchLimit : 0, - hitProc, hitProcInfo); -#else // === density filtering ENabled - basesHit = seed_hit_search (target, targPositions, - query, 0, query->len, currParams->selfCompare, - currParams->upperCharToBits, currParams->hitSeed, - searchLimit, - (currParams->searchLimitWarn)? currParams->searchLimit : 0, - currParams->maxDensity, - hitProc, hitProcInfo); - if (basesHit == u64max) // maxDensity has been exceeded (u64max is used - goto abort; // .. as a special value indicating this) -#endif // densityFiltering - } - - // see if we got too many HSPs/anchors/segments - -compare_to_anchor_limit: - - if ((currParams->searchLimit > 0) // (if we have a search limit - && (!currParams->searchLimitKeep) // .. and we're not reporting queries that exceed the limit - && (anchors->len + prevAnchorCount > currParams->searchLimit)) // .. 
and this query exceeded the limit) - { - if (dbgShowHspCountsMin != (u32)-1) - { - if (dbgQueryProgress != 0) fprintf (stderr, " "); - fprintf (stderr, "too many HSPs"); - dbg_show_hsp_counts_1; - } - - goto abort; - } - - if (dbgAnchorContent) - write_segments (stderr, anchors, target, query, true, 0); - - success = true; - goto exit; - - // the caller shouldn't process the HSPs - -abort: - success = false; - goto exit; - - // cleanup and exit - -exit: - if (query->choresFile != NULL) - { - unfence_sequence_interval (target); - unfence_sequence_interval (query); - } - - dbg_timing_add (debugClockSegTable); - return success; - } - -//---------- -// -// finish_one_strand-- -// Finish alignment upon one query strand. -// -//---------- -// -// Arguments: -// seq* target: The sequence being searched. -// postable* targPositions: A table of positions of words in target. -// u8* targetRev: The reverse (NOT reverse complement) of the -// .. target sequence, as a zero-terminated -// .. string; this may be NULL if the caller -// .. doesn't need/want to supply it. It is -// .. only needed if we will be doing a gapped -// .. extension. -// seq* query: The sequence(s) being searched for. -// u8* queryRev: The reverse (NOT reverse complement) of the -// .. query sequence (analagous to targetRev) -// tback* traceback: Memory in which to track gapped alignment -// .. traceback. -// census* targCensus: Census array for target sequence. If this -// .. is non-NULL, we count how many times each -// .. target base is aligned. This information -// .. is used to mask positions in the target -// .. sequence if currParams->dynamicMasking > 0. 
-// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopAlignList === - -#ifndef snoopAlignList -#define snoopAlignList_1 ; -#define snoopAlignList_2 ; -#define snoopAlignList_3 ; -#endif // not snoopAlignList - -#ifdef snoopAlignList - -#define snoopAlignList_1 \ - { \ - alignel* a; \ - int i; \ - for (a=alignList,i=0 ; a!=NULL ; a=a->next,i++) \ - fprintf (stderr, "finish_one_strand.1 [%d] a=%08lX" \ - " " unsposDotsFmt " " unsposDotsFmt "\n", \ - i, (long) a, \ - a->beg1, a->end1, a->beg2, a->end2); \ - fprintf (stderr, "\n"); \ - } - -#define snoopAlignList_2 \ - { \ - alignel* a; \ - int i; \ - for (a=alignList,i=0 ; a!=NULL ; a=a->next,i++) \ - fprintf (stderr, "finish_one_strand.2 [%d] a=%08lX" \ - " " unsposDotsFmt " " unsposDotsFmt "\n", \ - i, (long) a, \ - a->beg1, a->end1, a->beg2, a->end2); \ - fprintf (stderr, "\n"); \ - } - -#define snoopAlignList_3 \ - { \ - alignel* a; \ - int i; \ - for (a=alignList,i=0 ; a!=NULL ; a=a->next,i++) \ - fprintf (stderr, "finish_one_strand.3 [%d] a=%08lX" \ - " " unsposDotsFmt " " unsposDotsFmt "\n", \ - i, (long) a, \ - a->beg1, a->end1, a->beg2, a->end2); \ - fprintf (stderr, "\n"); \ - } - -#endif // snoopAlignList - - -//--- finish_one_strand-- - -void finish_one_strand - (seq* target, - u8* _targetRev, - postable* targPositions, - seq* query, - u8* _queryRev, - tback* traceback, - census* targCensus) - { - u8* targetRev = _targetRev; - u8* queryRev = _queryRev; - alignel* alignList = NULL; - int hspsAreAdaptive; - score lowAnchorScore = 0; - u64 maxPairedBases; - - hspsAreAdaptive = (currParams->hspThreshold.t != 'S'); - if ((anchors != NULL) && (hspsAreAdaptive)) - { - lowAnchorScore = anchors->lowScore; - if ((secondaryAnchors != NULL) - && (secondaryAnchors->lowScore < lowAnchorScore)) - lowAnchorScore = secondaryAnchors->lowScore; - } - - if ((anchors != NULL) - && (dbgShowHspCountsMin != (u32)-1) - && (anchors->len >= dbgShowHspCountsMin)) - { - if (dbgQueryProgress != 0) fprintf (stderr, " 
"); - fprintf (stderr, "%s HSPs", commatize(anchors->len)); - dbg_show_hsp_counts_1; - } - - if ((anchors != NULL) // merging may be necessary because - && (currParams->mergeAnchors)) // .. the diag hash technique used in - { // .. the seed search may result in - merge_segments (anchors); // .. duplicate or overlapping HSPs - //fprintf (stderr, "segments for %s %c\n", - // (query->partition.p != NULL)? "(partitioned query)" - // : (query->useFullNames) ? query->header - // : query->shortHeader, - // ((query->revCompFlags & rcf_rev) != 0)? '-' : '+'); - //write_segments (stderr, anchors, target, query, false, 0); - } - - if ((anchors != NULL) && (dbgSortAnchorsByDiag)) - sort_segments (anchors, qSegmentsByDiag); - - // filter HSPs by identity and/or coverage - - if ((anchors != NULL) - && (!currParams->gappedExtend)) - { - if ((currParams->minIdentity > 0) || (currParams->maxIdentity < 1)) - filter_segments_by_identity (target, query, anchors, - currParams->minIdentity, - currParams->maxIdentity); - - if ((currParams->minCoverage > 0) || (currParams->maxCoverage < 1)) - filter_segments_by_coverage (target, query, anchors, - currParams->minCoverage, - currParams->maxCoverage); - - if (currParams->minMatchCount > 0) - filter_segments_by_match_count (target, query, anchors, - currParams->minMatchCount); - - if (currParams->maxMismatchCount >= 0) - filter_segments_by_mismatch_count (target, query, anchors, - currParams->maxMismatchCount); - } - - // if we have scoreless anchors, and we need scores, score 'em - - if ((anchors != NULL) - && (!anchors->haveScores) - && ((currParams->chain) || (currParams->gappedExtend))) - score_segments (anchors, target, query, currParams->maskedScoring); - - // reduce the set of HSPs to the best syntenic subset - - if ((anchors != NULL) && (currParams->chain)) - { - u32 originalNumAnchors = anchors->len; - - dbg_timing_sub (debugClockChaining); - try_reduce_to_chain (target, query, - anchors, currParams->chainDiag, currParams->chainAnti, 
- chainScale, chain_connect_penalty); - sort_segments (anchors, qSegmentsByPos1); - dbg_timing_add (debugClockChaining); - - if (dbgShowAnchors) - fprintf (stderr, "(chaining reduced %u anchors to %u)\n", - originalNumAnchors, anchors->len); - } - - // report the set of HSPs if we don't plan to do gapped extension - - if ((anchors != NULL) && (!currParams->gappedExtend)) - { - u32 ix; - segment* seg; - - for (ix=0,seg=anchors->seg ; ixlen ; ix++,seg++) - print_match (seg->pos1, seg->pos2, seg->length, seg->s, seg->hspId); - } - - // if we don't plan to do gapped extension, perform dynamic masking; note - // that if currParams->dynamicMasking == 0, we don't actually mask, we just - // count for the census; further note that we don't mask the reverse - // sequence unless the caller provided it - // $$$ it appears that in this case, masking occurs *before* filtering, - // $$$ .. which is not what the user should expect; unfortunately I need - // $$$ .. need to keep it that way for backward compatibility - - if ((targCensus != NULL) && (anchors != NULL) && (!currParams->gappedExtend)) - { - unspos numMasked; - numMasked = census_mask_segments - (anchors, target->v, _targetRev, targCensus, - remove_interval_seeds, targPositions); - print_x_stanza (numMasked); - if (dbgMasking) print_m_stanza (targCensus); - } - - // extend the HSPs to gapped alignments - - if (currParams->gappedExtend) - { - sthresh gappedThreshold; - - dbg_timing_sub (debugClockGappedExtend); - if (targetRev == NULL) - targetRev = (u8*) copy_reverse_of_string ((char*) target->v, target->len); - if (queryRev == NULL) - queryRev = (u8*) copy_reverse_of_string ((char*) query->v, query->len); - - reduce_to_points (target, query, currParams->scoring, anchors); - if ((anchors != NULL) && (dbgShowAnchors)) - write_segments (stderr, anchors, target, query, false, dbgShowAnchorsHowOften); - - gappedThreshold = currParams->gappedThreshold; - if ((gappedThreshold.t != 'S') && (hspsAreAdaptive)) - { - 
gappedThreshold.t = 'S'; - gappedThreshold.s = lowAnchorScore; - //fprintf (stderr, "gapped threshold <- " scoreFmtSimple "\n", lowAnchorScore); - } - - maxPairedBases = 0; - if (currParams->maxPairedBases > 0) - maxPairedBases = currParams->maxPairedBases; - else if (currParams->maxPairedDepth > 0.0) - maxPairedBases = (u64) ceil (currParams->maxPairedDepth * query->len); - - alignList = gapped_extend (target, targetRev, query, queryRev, - currParams->inhibitTrivial, - currParams->scoring, - anchors, traceback, - currParams->gappedAllBounds, - currParams->yDrop, - (currParams->yDropUntrimmed == false), - gappedThreshold, - maxPairedBases, - currParams->overlyPairedWarn, - currParams->overlyPairedKeep); - snoopAlignList_1; - dbg_timing_add (debugClockGappedExtend); - } - - // filter gapped alignments by identity, coverage, and/or continuity - - if (alignList != NULL) - { - if ((currParams->minIdentity > 0) || (currParams->maxIdentity < 1)) - alignList = filter_aligns_by_identity - (target, query, alignList, - currParams->minIdentity, currParams->maxIdentity); - - if ((currParams->minCoverage > 0) || (currParams->maxCoverage < 1)) - alignList = filter_aligns_by_coverage - (target, query, alignList, - currParams->minCoverage, currParams->maxCoverage); - - if ((currParams->minContinuity > 0) || (currParams->maxContinuity < 1)) - alignList = filter_aligns_by_continuity - (alignList, - currParams->minContinuity, currParams->maxContinuity); - - if (currParams->minMatchCount > 0) - alignList = filter_aligns_by_match_count - (target, query, alignList, currParams->minMatchCount); - - if (currParams->maxMismatchCount >= 0) - alignList = filter_aligns_by_mismatch_count - (target, query, alignList, currParams->maxMismatchCount); - - if (currParams->maxSeparateGapsCount >= 0) - alignList = filter_aligns_by_num_gaps - (alignList, currParams->maxSeparateGapsCount); - - if (currParams->maxGapColumnsCount >= 0) - alignList = filter_aligns_by_num_gap_columns - (alignList, 
currParams->maxGapColumnsCount); - } - - // interpolate between the gapped alignments - - if ((alignList != NULL) && (currParams->innerThreshold > 0)) - { - dbg_timing_sub (debugClockInterpolation); - alignList = tweener_interpolate - (alignList, target, query, - currParams->selfCompare, currParams->inhibitTrivial, - currParams->upperCharToBits, currParams->innerSeed, - currParams->scoring, currParams->maskedScoring, traceback, - currParams->xDrop, currParams->gappedAllBounds, - currParams->yDrop, (currParams->yDropUntrimmed == false), - currParams->innerThreshold, - currParams->chainDiag, currParams->chainAnti, - chainScale, chain_connect_penalty, currParams->innerWindow); - dbg_timing_add (debugClockInterpolation); - } - - // print the gapped alignments - - if (alignList != NULL) - { - dbg_timing_sub (debugClockOutput); - snoopAlignList_2; - if (currParams->mirrorGapped) - { - alignList = mirror_alignments (alignList); - snoopAlignList_3; - } - if (currParams->deGapifyOutput) print_align_list_segments (alignList); - else print_align_list (alignList); - fflush (currParams->outputFile); - dbg_timing_add (debugClockOutput); - } - - // perform dynamic masking; note that if currParams->dynamicMasking == 0, we - // don't actually mask, we just count for the census; further note that - // we don't mask the reverse sequence unless the caller provided it - - if ((targCensus != NULL) && (alignList != NULL)) - { - unspos numMasked; - numMasked = census_mask_aligns - (alignList, target->v, _targetRev, targCensus, - remove_interval_seeds, targPositions); - print_x_stanza (numMasked); - if (dbgMasking) print_m_stanza (targCensus); - } - - // cleanup - - if (alignList != NULL) - free_align_list (alignList); - - if ((_targetRev == NULL) && (targetRev != NULL)) // (it was allocated locally) - free_if_valid ("finish_one_strand (targetRev)", targetRev); - if ((_queryRev == NULL) && (queryRev != NULL)) // (it was allocated locally) - free_if_valid ("finish_one_strand (queryRev)", 
queryRev); - } - -//---------- -// -// choose_best_anchors-- -// Select the N highest-scoring anchors. -// -// The list of anchors may include anchors from both strands (we don't make any -// use of that strandedness). In this case, upon return, anchors will be -// intermixed with disregard to strand. -// -// The list is sorted by decreasing score, then truncated to contain *at least* -// N anchors. In the case that additional anchors are tied with the -// Nth best, we keep those too. -// -//---------- -// -// Arguments: -// u32 numAnchors: The number of anchors to keep. We will keep at least this -// .. many anchors (if there are that many to begin with). If -// .. additional anchors are as good as the last that would be -// .. kept, we keept those too. If this is zero, all anchors -// .. are kept. -// -// Returns: -// (nothing) -// -//---------- - -static void choose_best_anchors - (u32 numAnchors) - { - segment* seg; - score cutoff; - u32 cutoffIx, ix; - - // check for special case where no limit is to be performed - - if (numAnchors == 0) return; - - // if we don't have more than N anchors, there's nothing to do - - if (anchors->len <= numAnchors) return; - - // sort from high score to low score with disregard to strand - - sort_segments (anchors, qSegmentsByDecreasingScore); - - // extend the cutoff point to include all anchors that score as well as - // the Nth best - - seg = &anchors->seg[numAnchors-1]; // (Nth best score) - cutoff = seg->s; - - cutoffIx = 0; - for (ix=numAnchors ; ixlen ; ix++) - { - seg = &anchors->seg[ix]; - if (seg->s < cutoff) - { cutoffIx = ix; break; } - } - - // truncate the list - - if (cutoffIx > 0) - anchors->len = cutoffIx; - } - -//---------- -// -// split_anchors, swap_anchor_sets-- -// Select single-strand anchors from a list containing anchors for both -// strands. -// -// Under certain configurations (e.g. for adaptive-K), collect_hsps collects -// HSPs from both strands into a single table (anchors). 
Split_anchors removes -// the forward stand HSPs and moves them to a second table (secondaryAnchors). -// swap_anchor_sets switches the tables so that the forward strand HSPs are in the -// main table (anchors). -// -//---------- -// -// Arguments: -// int id: The id of segments on the reverse strand. -// -// Returns: -// (nothing) -// -//---------- - -void split_anchors (int id) - { - if (secondaryAnchors == NULL) - secondaryAnchors = new_segment_table (numDefaultAnchors, - /* coverage limit */ 0); - else - { - empty_segment_table (secondaryAnchors); - limit_segment_table (secondaryAnchors, /* coverage limit */ 0); - } - - split_segment_table (anchors, id, &secondaryAnchors); - - //printf ("\nanchors:\n"); - //dump_segments (stdout, anchors, NULL, NULL); - //printf ("\nleftovers:\n"); - //dump_segments (stdout, secondaryAnchors, NULL, NULL); - //printf ("\n"); - } - - -void swap_anchor_sets (void) - { segtable* a = secondaryAnchors; secondaryAnchors = anchors; anchors = a; } - -//---------- -// [[-- a chain connection penalty function --]] -// -// chain_connect_penalty-- -// Compute penalty for connecting two segments in the chain. 
-// -// Arguments and Return value: (see chain.h) -// -// Note bene: -// x = pos1, y = pos2, diag = x-y -// diag increases toward southeast -// -//---------- - -#define debugChaining_1 \ - if (chain_dbgChaining) \ - fprintf (stderr, \ - " diagDiff=" sgnposFmt " numSubs=" sgnposFmt "\n", \ - diagDiff, numSubs); - -#define debugChaining_2 \ - if (chain_dbgChaining) \ - fprintf (stderr, " base_penalty=%.2f\n", penalty); - -#define debugChaining_3 \ - if (chain_dbgChaining) \ - fprintf (stderr, " penalty=%.2f\n", penalty); - -#define debugChaining_4 \ - if (chain_dbgChaining) \ - fprintf (stderr, " penalty=%.2f (" scoreFmtSimple ")\n", \ - penalty, currParams->scoring->sub[rCh][cCh]); - -#define debugChaining_5 \ - if (chain_dbgChaining) \ - { \ - if (penalty > bestPossibleScore) \ - fprintf (stderr, " returning " scoreFmtSimple "\n", \ - bestPossibleScore); \ - else \ - fprintf (stderr, " returning " scoreFmtSimple "\n", \ - (score) penalty); \ - } - - -static score chain_connect_penalty - (segment* seg1, - segment* seg2, - int scale) - { - unspos xEnd, yEnd; - sgnpos diag1, diag2, diagDiff; - sgnpos numSubs; // number of substitutions needed to get from end - double penalty; // .. 
of segment 1 to beginning of segment 2 - u8 rCh, cCh; - - if ((seg2->pos1 <= seg1->pos1) || (seg2->pos2 <= seg1->pos2)) - suicide ("HSPs improperly ordered for chaining"); - - xEnd = seg1->pos1 + seg1->length - 1; - yEnd = seg1->pos2 + seg1->length - 1; - - diag1 = diagNumber (seg1->pos1, seg1->pos2); - diag2 = diagNumber (seg2->pos1, seg2->pos2); - - diagDiff = diag2 - diag1; - if (diagDiff >= 0) - { // segment 1's diagonal is above segment 2's - numSubs = ((sgnpos) seg2->pos2) - ((sgnpos) yEnd) - 1; - } - else - { // segment 1's diagonal is below segment 2's - numSubs = ((sgnpos) seg2->pos1) - ((sgnpos) xEnd) - 1; - diagDiff = -diagDiff; - } - - debugChaining_1; - - // nota bene: penalty is declared as double to allow it to overflow the - // regular score type; after we compute the penalty we clip it - // to worst penalty as we return it - - penalty = diagDiff * currParams->chainDiag; - debugChaining_2; - if (numSubs >= 0) - { - penalty += numSubs * currParams->chainAnti; - debugChaining_3; - } - else - { - rCh = currParams->scoring->rowChars[0]; - cCh = currParams->scoring->colChars[0]; - penalty += (-numSubs) * scale * currParams->scoring->sub[rCh][cCh]; - debugChaining_4; - } - - debugChaining_5; - if (penalty > bestPossibleScore) return bestPossibleScore; - else return (score) penalty; - } - -//---------- -// [[-- a census_mask_segments or census_mask_aligns callback function --]] -// -// remove_interval_seeds-- -// Remove seeds from the target sequence position table that are about to be -// rendered meaningless by dynamic masking. -// -//---------- -// -// Arguments: -// unspos b, e: The interval, in the target sequence, that is about to be -// .. masked. Origin-1, inclusive. -// void* info: (really postable*) The position table (targPositions). 
-// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// (1) Intervals given to this routine are origin-1 inclusive, while the -// intervals it passes along to mask_seed_position_table are origin-0 -// end-exclusive. Further, we have to expand the interval on both ends -// by the length of the seed (minus 1). -// -// For example, suppose the input interval is 20..40 (11 bp) and the seed -// length is 10. In the diagram below, * is a base in the input interval, -// x is a base in the expanded interval, and o is any other base. Numbers -// on the top are origin-1; numbers on the bottom are origin-0. The output -// interval is 10..49. -// -// 1 11 20 40 49 -// v v v v v -// ooooooooooxxxxxxxxx*********************xxxxxxxxxoooooooo ... -// ^ ^ ^ ^ ^ -// 0 10 19 39 48 -// -// The reason for expansion is that we want to eliminate any seed that -// contains any * in the diagram. The * can occur at any position in the -// 10 bp seed, so we must expand by 9 bp on each end. -// -//---------- - -static void remove_interval_seeds (unspos b, unspos e, void* info) - { - postable* pt = (postable*) info; - seq* target = currParams->seq1; - seed* hitSeed = currParams->hitSeed; - u32 seedLen = (unsigned) hitSeed->length; - const s8* upperCharToBits = currParams->upperCharToBits; - - // adjust the interval endpoints to account for the seed length - - if (b < seedLen) b = 1; - else b -= seedLen - 1; - - if (e >= target->len - (seedLen-1)) e = target->len; - else e += seedLen - 1; - - // remove masked seeds from the table - - mask_seed_position_table (pt, target, b-1, e, upperCharToBits, hitSeed); - } - -//---------- -// [[-- a seed hit reporter function --]] -// -// report_hsps-- -// Report a seed hit or HSP (i.e. just write it to output). 
-// -// Arguments and Return value: (see seed_search.h) -// -//---------- - -static u32 report_hsps - (arg_dont_complain(void* info), - unspos pos1, - unspos pos2, - unspos length, - score s) - { - static u64 hspIdCounter; - unspos s1, s2; - - // report this hit/HSP - - print_match (pos1-length, pos2-length, length, s, ++hspIdCounter); - - if (dbgShowHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides (stderr, - currParams->seq1, pos1-length, - currParams->seq2, pos2-length, - length); - } - - if (!currParams->mirrorHSP) return length; - - // report the mirror of this hit/HSP - // $$$ we should validate that the ends are symmetric about the diagonal - - if (currParams->seq1->revCompFlags == currParams->seq2->revCompFlags) - { - s1 = pos1; - s2 = pos2; - } - else - { - s1 = (currParams->seq1->len) - pos1 + length; - s2 = (currParams->seq2->len) - pos2 + length; - if ((s2 == pos1) && (s1 == pos2)) return length; - } - - print_match (s2-length, s1-length, length, s, ++hspIdCounter); - - if (dbgShowHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides (stderr, - currParams->seq1, s2-length, - currParams->seq2, s1-length, - length); - } - - return length; - } - -//---------- -// [[-- a seed hit reporter function --]] -// -// collect_hsps-- -// Collect a seed hit or HSP. 
-// -// Arguments and Return value: (see seed_search.h) -// -//---------- - -static u32 collect_hsps - (arg_dont_complain(void* info), - unspos pos1, - unspos pos2, - unspos length, - score s) - { - unspos s1, s2; - int reportAnchor = false; - - // add this hit/HSP to the list of anchors; note that we use the strand - // (actually the rcf value) as the id field, so that if we happen to be - // collecting segments from both strands, we can separate them later - - if (dbgShowAnchors) - { - reportAnchor = ((dbgShowAnchorsHowOften == 0) - || (anchors == NULL) - || (anchors->len == 0) - || ((anchors->len+1) % dbgShowAnchorsHowOften == 0)); - if (reportAnchor) - fprintf (stderr, "adding segment " unsposSlashFmt " " unsposFmt - " diag=" sgnposFmt "\n", - pos1-length, pos2-length, length, - diagNumber(pos1-length,pos2-length)); - } - - anchors = add_segment (anchors, pos1-length, pos2-length, length, s, - /*id*/ currParams->seq2->revCompFlags, - /*hspId*/ 0); - - if (reportAnchor) - fprintf (stderr, "(now have %s anchors)\n", ucommatize(anchors->len)); - - if (dbgShowHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides - (stderr, currParams->seq1, pos1-length, currParams->seq2, pos2-length, length); - } - - if (!currParams->mirrorHSP) - return length; - - // add the mirror of this hit/HSP to the list of anchors - - if (currParams->seq1->revCompFlags == currParams->seq2->revCompFlags) - { - s1 = pos1; - s2 = pos2; - } - else - { - s1 = (currParams->seq1->len) - pos1 + length; - s2 = (currParams->seq2->len) - pos2 + length; - if ((s2 == pos1) && (s1 == pos2)) return length; - } - - if (reportAnchor) - fprintf (stderr, "adding segment " unsposSlashFmt " " unsposFmt - " diag=" sgnposFmt "\n", - s2-length, s1-length, length, - diagNumber(s2-length,s1-length)); - - anchors = add_segment (anchors, s2-length, s1-length, length, s, - /*id*/ currParams->seq2->revCompFlags, - /*hspId*/ 0); - - if (reportAnchor) - fprintf (stderr, "(now have %s anchors)\n", 
ucommatize(anchors->len)); - - if (dbgShowHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides - (stderr, currParams->seq1, s2-length, currParams->seq2, s1-length, length); - } - - return 2*length; - } - -//---------- -// [[-- a seed hit reporter function --]] -// -// collect_filtered_hsps-- -// Collect a seed hit or HSP, so long as it satisfies the current filtering -// criteria. -// -// Arguments and Return value: (see seed_search.h) -// -//---------- - -static u32 collect_filtered_hsps - (arg_dont_complain(void* info), - unspos pos1, - unspos pos2, - unspos length, - score s) - { - unspos startPos1 = pos1 - length; - unspos startPos2 = pos2 - length; - segment seg; - - // filter HSP by identity and/or coverage - - if ((currParams->minIdentity > 0) || (currParams->maxIdentity < 1)) - { - if (filter_segment_by_identity (currParams->seq1, startPos1, - currParams->seq2, startPos2, length, - currParams->minIdentity, - currParams->maxIdentity)) - goto filtered; - } - - if ((currParams->minCoverage > 0) || (currParams->maxCoverage < 1)) - { - seg.pos1 = startPos1; - seg.pos2 = startPos2; - seg.length = length; - if (filter_segment_by_coverage (currParams->seq1, currParams->seq2, &seg, - currParams->minCoverage, - currParams->maxCoverage)) - goto filtered; - } - - if (currParams->minMatchCount > 0) - { - if (filter_segment_by_match_count (currParams->seq1, startPos1, - currParams->seq2, startPos2, length, - currParams->minMatchCount)) - goto filtered; - } - - if (currParams->maxMismatchCount >= 0) - { - if (filter_segment_by_mismatch_count (currParams->seq1, startPos1, - currParams->seq2, startPos2, length, - currParams->minMatchCount)) - goto filtered; - } - - return report_hsps (info, pos1, pos2, length, s); - -filtered: - return 0; - } - -//---------- -// -// mirror_alignments-- -// Reflect gapped alignments across the main diagonal (of DP space). See -// description below regarding how we view DP space here. 
-// -//---------- -// -// Arguments: -// alignel* alignList: The list of 'upper' alignments. -// -// Returns: -// The same list of alignments, usually with new alignments appended to the -// tail. It is possible that some alignments are deleted from the list (and -// disposed of). Because of this, the caller needs to replace alignList with -// the return value. -// -//---------- -// -// Notes: [ similar notes appear in seed_hit_below_diagonal() ] -// -// (1) We assume, without checking, that seq1 and seq2 are essentially the -// same. I.e. that they have the same length, and if one is partitioned, -// the other has the same partitions. -// -// (2) The DP matrix is viewed as having sequence 1 along the x axis and -// sequence 2 along the y axis, as in this diagram: -// -// +-------------+ -// ^ | . . . . . / | -// | | . . . . / | -// | | . . . / | -// seq 2 | . . / | -// | | . / | -// | | / | -// +-------------+ -// --- seq 1 --> -// -// (3) The diagonal runs from lower-left to upper-right, shown as the slashed -// line in the diagrom. -// -// (4) Alignments "above the diagonal" have pos1 < pos2. In the diagram, this -// is the region filled with dots. Alignments "below the diagonal" have -// pos1 > pos2; the region is empty in the diagram. Points "on the -// diagonal" have pos1 = pos2. But also see notes 6 and 7. -// -// (5) Alignments on the same strand will run generally parallel to the -// diagonal, from lower-left to upper-right. Alignments on opposite -// strands will run generally perpendicular to the diagonal. -// -// (6) When sequence 2 is on the minus strand, pos2 and end2 are counted in -// reverse (see 'actual' below). We flip this (to 'conceptual'), but -// keep the reverse-counted values in inPos2 and inEnd2. But also see -// note 7. -// -// (conceptual) (actual) -// +-------------+ +-------------+ -// ^ | . . . . . / | ^ | \ . . . . . | -// | | . . . . / | | | \ . . . . | -// | | . . . / | | | \ . . . | -// seq 2 | . . / | seq 2 | \ . . | -// | | . 
/ | | | \ . | -// | | / | | | \ | -// +-------------+ +-------------+ -// --- seq 1 --> --- seq 1 --> -// -// (7) When sequence 2 is partitioned, and on the minus strand, the situation -// with positions in complicated by the fact that the partitions have been -// reversed individually, not the sequence as a whole. We flip these to -// forward strand equivalents, and keep the reverse-counted values as in -// note 6. -// -// (conceptual) (actual) -// +-------------+-------+ +---------------------+ -// ^ | . . . . . . | . . / | ^ | . . . . . . | \ . . | -// | | . . . . . . | . / | | | . . . . . . | \ . | -// | | . . . . . . | / | | | . . . . . . | \ | -// | +-------------+-------+ | +-------------+-------+ -// seq 2 | . . . . . / | | seq 2 | \ . . . . . | | -// | | . . . . / | | | | \ . . . . | | -// | | . . . / | | | | \ . . . | | -// | | . . / | | | | \ . . | | -// | | . / | | | | \ . | | -// | | / | | | | \ | | -// +---------------------+ +---------------------+ -// --- seq 1 --> --- seq 1 --> -// -//---------- - -//=== stuff for snoopMirroring === - -#ifndef snoopMirroring -#define snoopMirroring_1 ; -#define snoopMirroring_2a ; -#define snoopMirroring_2b ; -#define snoopMirroring_3 ; -#define snoopMirroring_4 ; -#define snoopMirroring_5 ; -#define snoopMirroring_6 ; -#endif // not snoopMirroring - -#ifdef snoopMirroring - -#define snoopMirroring_1 \ - fprintf (stderr, "mirror_alignments\n %s\n", \ - (sameStrand)?"same strand":"opposite strands"); - -#define snoopMirroring_2a \ - fprintf (stderr, " in: " unsposDotsFmt " " unsposDotsFmt "\n", \ - a->beg1-1, a->end1, a->beg2-1, a->end2); \ - -#define snoopMirroring_2b \ - fprintf (stderr, " mirror: " unsposDotsFmt " " unsposDotsFmt "\n", \ - b->beg1-1, b->end1, b->beg2-1, b->end2); \ - -#define snoopMirroring_3 \ - fprintf (stderr, " flip: " unsposDotsFmt " " unsposDotsFmt \ - " (" unsposFmt " " unsposFmt ")\n", \ - pos1, end1, pos2, end2, invert1, invert2); - -#define snoopMirroring_4 \ - fprintf (stderr, 
"checking alignment from " \ - unsposCommaFmt " to " unsposCommaFmt ")\n", \ - pos1, end1, pos2, end2); - -#define snoopMirroring_5 \ - fprintf (stderr, "rescoring alignment from " \ - unsposCommaFmt " to " unsposCommaFmt ")\n" \ - " new score is " scoreFmt "\n", \ - pos1, end1, pos2, end2, a->s); - -#define snoopMirroring_6 \ - fprintf (stderr, " (exit mirror_alignments)\n"); - -#endif // snoopMirroring - - -//--- mirror_alignments-- - -static alignel* mirror_alignments - (alignel* alignList) - { - seq* _seq1 = currParams->seq1; - seq* _seq2 = currParams->seq2; - seqpartition* sp2 = &_seq2->partition; - partition* part1, *part2; - unspos seqLen = _seq1->len; - int sameStrand; - alignel* newAlignList, *a, *aPrev, *aNext, *aTail, *b, *bTail; - unspos pos1, end1, pos2, end2, inPos2, inEnd2, invert1, invert2; - unspos x, y; - int isTruncated, haveOverlap, dontMirror; - editscript* tempScript; - - if (_seq2->len != seqLen) - suicidef ("internal error (for mirroring), sequence lengths differ " - unsposFmt " vs " unsposFmt, - seqLen, _seq2->len); - - sameStrand = (currParams->seq1->revCompFlags == currParams->seq2->revCompFlags); - snoopMirroring_1; - - // scan the alignments, creating a mirrored alignment for each (in a - // separate list) - - newAlignList = NULL; - bTail = NULL; - - aPrev = aTail = NULL; - for (a=alignList ; a!=NULL ; a=aNext) - { - aPrev = aTail; - aNext = a->next; - aTail = a; - - pos1 = a->beg1-1; end1 = a->end1; - pos2 = a->beg2-1; end2 = a->end2; - - snoopMirroring_2a; - - if (sameStrand) - { - // alignment is on same strand; we just create a mirror image of - // it and add it to the new list - - b = malloc_or_die ("mirror_alignments", sizeof(alignel)); - - b->isTrivial = false; - b->beg1 = pos2 + 1; b->end1 = end2; - b->beg2 = pos1 + 1; b->end2 = end1; - b->s = a->s; - b->seq1 = a->seq1; - b->seq2 = a->seq2; - b->next = NULL; - - b->script = edit_script_copy (a->script); - edit_script_mirror (b->script); - snoopMirroring_2b; - } - else - { - // 
alignment is on opposite strands; we need to check whether or - // not it crosses the diagonal; if it is completely below the - // diagonal we discard it; if it crosses the diagonal we trucate - // it before duplication - - inPos2 = pos2; - inEnd2 = end2; - - if (sp2->p == NULL) // (seq2 is not partitioned) - { // flip positions as per note 6 - invert1 = invert2 = seqLen; - } - else // (seq2 is partitioned) - { // flip positions as per note 7 - part1 = lookup_partition (_seq1, pos1); - part2 = lookup_partition (_seq2, pos2); - invert1 = part1->sepBefore + part1->sepAfter + 1; - invert2 = part2->sepBefore + part2->sepAfter + 1; - } - - pos2 = invert2 - inPos2; - end2 = invert2 - inEnd2; // nota bene: end2 < pos2 - snoopMirroring_3; - - if (pos1 == pos2) - { // alignment starts on the diagonal - discard_alignment: - free_if_valid ("mirror_alignments a->script", a->script); - free_if_valid ("mirror_alignments a", a); - - // detach it from the list - - if (aPrev == NULL) { alignList = aNext; aTail = NULL; } - else { aPrev->next = aNext; aTail = aPrev; } - - continue; // don't bother to create or save a mirror image - } - - // check to see if we cross the diagonal - - if (end1 >= end2) - { // alignment touches diagonal or crosses it - snoopMirroring_4; - - x = pos1; y = pos2; - isTruncated = edit_script_upper_truncate (a->script, &x, &y); - - if ((isTruncated) && (x == seqposInfinity)) - goto discard_alignment; - - haveOverlap = false; - if (isTruncated) - { - dontMirror = false; - if ((x < y) || (x > y+1)) - { - fprintf (stderr, "WARNING. 
Internal error in mirror_alignments().\n" - " An alignment crosses the main diagonal in an unexpected way.\n" - " (alignment from " unsposCommaFmt " to " unsposCommaFmt - " crosses at " unsposCommaFmt ")\n" - " The alignment is kept, but truncated at that point.\n", - pos1, end1, pos2, end2, x, y); - dontMirror = true; - } - - a->end1 = end1 = x; - a->end2 = inEnd2 = invert2 - y; - end2 = y; - - if (dontMirror) continue; - if (x == y+1) haveOverlap = true; - } - - tempScript = edit_script_copy (a->script); - edit_script_reverse (tempScript); - edit_script_mirror (tempScript); - if (haveOverlap) edit_script_trim_head (tempScript, 1); - edit_script_append (&a->script, tempScript); - free_if_valid ("mirror_alignments tempScript", tempScript); - edit_script_overall_len (a->script, &x, &y); - a->end1 = end1 = pos1 + x; - a->end2 = inEnd2 = inPos2 + y; - - a->s = score_alignment (currParams->scoring, - _seq1->v, pos1, _seq2->v, inPos2, a->script); - snoopMirroring_5; - continue; // don't bother to create or save a mirror image, - // since the mirror image has been appended to the - // alignment's edit script - } - - // otherwise, alignment doesn't touch diagonal nor cross it; we - // just create a mirror image of it and add it to the new list - - b = malloc_or_die ("mirror_alignments", sizeof(alignel)); - b->isTrivial = false; - b->beg1 = (invert2-inEnd2) + 1; b->end1 = (invert2-inPos2); - b->beg2 = (invert1-end1) + 1; b->end2 = (invert1-pos1); - b->s = a->s; - b->seq1 = a->seq1; - b->seq2 = a->seq2; - b->next = NULL; - - b->script = edit_script_copy (a->script); - edit_script_reverse (b->script); - edit_script_mirror (b->script); - snoopMirroring_2b; - } - - // append the new alignment to the tail of the new list - - if (bTail == NULL) newAlignList = b; - else bTail->next = b; - bTail = b; - } - - // attach the new alignments to the tail of the alignment list; note that - // the NULL case here can only happen if all the alignments were below the - // diagonal, which is 
probably impossible (and in that case, newAlignList - // will be NULL too) - - if (aTail == NULL) alignList = newAlignList; - else aTail->next = newAlignList; - - snoopMirroring_6; - return alignList; - } - -//---------- -// -// parse_options-- -// Parse command line options. -// -//---------- -// -// Arguments: -// argc, argv: (as per main) -// control* lzParams: Control data to fill in for the primary alignment. -// control* izParams: Control data to fill in for inference alignments. -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void usage (void) - { - fprintf (helpout, "%s-- Local Alignment Search Tool, blastZ-like\n", - programName); - fprintf (helpout, " (version %s.%s.%s released %s", - programVersionMajor, programVersionMinor, programVersionSubMinor, programRevisionDate); - if (scoreType == 'F') fprintf (helpout, ", floating point scores"); - else if (scoreType == 'D') fprintf (helpout, ", double floating point scores"); - fprintf (helpout, ")\n"); - fprintf (helpout, "usage: %s target [query] [options]\n", programName); - - fprintf (helpout, " (common options; use --help for a more extensive list)\n"); - fprintf (helpout, " target, query specifiers or files, containing sequences to align\n"); - fprintf (helpout, " (use --help=files for more details)\n"); - - fprintf (helpout, " --seed= set seed pattern (12of19, 14of22, or general pattern)\n"); - fprintf (helpout, " (default is %s)\n", - defaultSeedString); - fprintf (helpout, " --[no]transition allow (or don't) one transition in a seed hit\n"); - fprintf (helpout, " (by default %s)\n", - (defaultParams.withTrans == 0)? "the seed must match as is" : - (defaultParams.withTrans == 1)? "a transition is allowed" - : "two transitions are allowed"); - - fprintf (helpout, " --[no]chain perform chaining\n"); - fprintf (helpout, " (by default %s)\n", - (!defaultParams.chain)? 
"no chaining is performed" - : "chaining is performed"); - - fprintf (helpout, " --[no]gapped perform gapped alignment (instead of gap-free)\n"); - fprintf (helpout, " (by default %s)\n", - (!defaultParams.gappedExtend)? "gapped alignment is not performed" - : "gapped alignment is performed"); - - fprintf (helpout, " --step= set step length (default is %u)\n", - defaultParams.step); - - fprintf (helpout, " --strand=both search both strands\n"); - fprintf (helpout, " --strand=plus search + strand only (matching strand of query spec)\n"); - fprintf (helpout, " (by default %s)\n", - (defaultParams.whichStrand == 0)? "only + strand is searched" : - (defaultParams.whichStrand < 0)? "only - strand is searched" - : "both strands are searched"); - - fprintf (helpout, " --scores= read substitution and gap scores from a file\n"); - fprintf (helpout, " --xdrop= set x-drop threshold (default is 10sub[A][A])\n"); - fprintf (helpout, " --ydrop= set y-drop threshold (default is open+300extend)\n"); - - fprintf (helpout, " --infer[=] infer scores from the sequences, then use them\n"); - fprintf (helpout, " all inference options are read from the control file\n"); - - fprintf (helpout, " --hspthresh= set threshold for high scoring pairs (default is %s)\n", - score_thresh_to_string (&defaultParams.hspThreshold)); - fprintf (helpout, " ungapped extensions scoring lower are discarded\n"); - fprintf (helpout, " can also be a percentage or base count\n"); - - fprintf (helpout, " --gappedthresh= set threshold for gapped alignments\n"); - fprintf (helpout, " gapped extensions scoring lower are discarded\n"); - fprintf (helpout, " can also be a percentage or base count\n"); - fprintf (helpout, " (default is to use same value as --hspthresh)\n"); - - fprintf (helpout, " --include= read command line arguments from a text file\n"); - - fprintf (helpout, " --help list \"all\" options (but the online documentation is\n"); - fprintf (helpout, " more complete)\n"); - fprintf (helpout, " 
--help=files list information about file specifiers\n"); - fprintf (helpout, " --help=shortcuts list blastz-compatible shortcuts\n"); - fprintf (helpout, " --help=defaults list scoring defaults for your current settings\n"); - fprintf (helpout, " --help=yasra list yasra-specific shortcuts\n"); - - fprintf (helpout, "\n"); - fprintf (helpout, " See the online documentation at http://www.bx.psu.edu/~rsharris/lastz for\n"); - fprintf (helpout, " the most up-to-date information.\n"); - - exit (EXIT_FAILURE); - } - - -static void all_options (void) - { - int ix; - - // nota bene: I've commented out most of these. At one time, long ago, - // this list was complete. But it became too difficult to - // describe these with sufficient detail, and a burden to - // attempt to do so when the information is readily available - // in the readme file. - - fprintf (helpout, "NOTE: the following list is not comprehensive. The most up-to-date list is\n"); - fprintf (helpout, " available at http://www.bx.psu.edu/~rsharris/lastz\n"); - fprintf (helpout, "\n"); - - fprintf (helpout, " target[[start..end]] spec/file containing target sequence (fasta, fastq,\n"); - fprintf (helpout, " nib, 2bit or hsx); [start..end] defines a subrange of\n"); - fprintf (helpout, " the file\n"); - fprintf (helpout, " (use --help=files for more details)\n"); - fprintf (helpout, " query[[start..end]] spec/file containing query sequences; if absent,\n"); - fprintf (helpout, " queries come from stdin (if needed)\n"); - fprintf (helpout, " --self the target sequence is also the query\n"); - fprintf (helpout, " (this replaces the query file)\n"); - - fprintf (helpout, " --seed=match use a word with no gaps instead of a seed pattern\n"); -// fprintf (helpout, " --seed=half use space-free half-weight word instead of seed pattern\n"); - fprintf (helpout, " --[no]transition[=2] allow one or two transitions in a seed hit\n"); - fprintf (helpout, " (by default %s)\n", - (defaultParams.withTrans == 0)? 
"the seed must match as is" : - (defaultParams.withTrans == 1)? "a transition is allowed" - : "two transitions are allowed"); - -// fprintf (helpout, " --word= set max bits for word hash; use this to trade time for\n"); -// fprintf (helpout, " memory, eliminating thrashing for heavy seeds\n"); -// fprintf (helpout, " (default is %d bits)\n", -// defaultParams.maxIndexBits); - -// fprintf (helpout, " --filter=, filter seed hits, requiring at least M matches and\n"); -// fprintf (helpout, " allowing no more than T transversions\n"); -// if (defaultParams.minMatches < 0) -// fprintf (helpout, " (default is no filtering)\n"); -// else if (defaultParams.maxTransversions < 0) -// fprintf (helpout, " (default is to require %d matches)\n", -// defaultParams.minMatches); -// else if (defaultParams.maxTransversions == 0) -// fprintf (helpout, " (default is to require %d matches, no transversions)\n", -// defaultParams.minMatches); -// else -// fprintf (helpout, " (default is %d matches/%d transversions)\n", -// defaultParams.minMatches,defaultParams.maxTransversions); - -// fprintf (helpout, " --notwins require just one seed hit\n"); -// fprintf (helpout, " --twins=.. 
require two nearby seed hits on the same diagonal\n"); -// if (defaultTwinsYes) -// fprintf (helpout, " (default is twins with %d:%d bp gap)\n", -// defaultTwinMinGap,defaultTwinMaxGap); -// else -// fprintf (helpout, " (default is twins aren't required)\n"); - -//#ifndef noSeedHitQueue -// fprintf (helpout, " --seedqueue= set number of entries in seed hit queue\n"); -// fprintf (helpout, " (default is %d)\n", -// defaultParams.seedHitQueueSize); -//#endif // not noSeedHitQueue - -// fprintf (helpout, " --segments= read anchor segments from a file, instead of\n"); -// fprintf (helpout, " discovering anchors via seeding\n"); - -// fprintf (helpout, " --norecoverseeds don't recover hash-collision seed hits\n"); -// fprintf (helpout, " --recoverseeds recover hash-collision seed hits\n"); -// if (defaultParams.basicHitType == hitRecover) -// fprintf (helpout, " (default is to recover seed hits)\n"); -// else -// fprintf (helpout, " (default is not to recover seed hits)\n"); - - fprintf (helpout, " --step= set step length (default is %u)\n", - defaultParams.step); - - fprintf (helpout, " --strand=both search both strands\n"); - fprintf (helpout, " --strand=plus search + strand only (matching strand of query spec)\n"); - fprintf (helpout, " --strand=minus search - strand only (opposite strand of query spec)\n"); - fprintf (helpout, " (by default %s)\n", - (defaultParams.whichStrand == 0)? "only + strand is searched" : - (defaultParams.whichStrand < 0)? 
"only - strand is searched" - : "both strands are searched"); - - fprintf (helpout, " --ambiguous=n[,] treat N as an ambiguous nucleotide\n"); - fprintf (helpout, " (by default N is treated as a sequence splicing\n"); - fprintf (helpout, " character)\n"); - - fprintf (helpout, " --ambiguous=iupac[,] treat any ambiguous IUPAC-IUB character as a\n"); - fprintf (helpout, " completely ambiguous nucleotide\n"); - fprintf (helpout, " (by default any sequence file with B,D,H,K,M,R,S,V,W,Y\n"); - fprintf (helpout, " is rejected)\n"); - - fprintf (helpout, " --[no]gfextend perform gap-free extension of seed hits to HSPs\n"); - fprintf (helpout, " (by default %s)\n", - (defaultParams.gfExtend == gfexNoExtend)? "no extension is performed" - : (defaultParams.gfExtend == gfexExact)? "exact match extension is performed" - : "extension is performed"); - - fprintf (helpout, " --[no]chain perform chaining\n"); - fprintf (helpout, " --chain= perform chaining with given penalties for diagonal and\n"); - fprintf (helpout, " anti-diagonal\n"); - fprintf (helpout, " (by default %s)\n", - (!defaultParams.chain)? "no chaining is performed" - : "chaining is performed"); - - fprintf (helpout, " --[no]gapped perform gapped alignment (instead of gap-free)\n"); - fprintf (helpout, " (by default %s)\n", - (!defaultParams.gappedExtend)? "gapped alignment is not performed" - : "gapped alignment is performed"); - fprintf (helpout, " --notrivial do not output a trivial self-alignment block if the\n"); - fprintf (helpout, " target and query happen to be identical\n"); - - fprintf (helpout, " --scores= read substitution scores from a file\n"); - fprintf (helpout, " (default is HOXD70)\n"); - fprintf (helpout, " --match=,

scores are +R/-P for match/mismatch\n"); - fprintf (helpout, " --gap= set gap open and extend penalties (default is " scoreFmtSimple "," scoreFmtSimple ")\n", - HOXD70_open, HOXD70_extend); - fprintf (helpout, " --xdrop= set x-drop threshold (default is 10*sub[A][A])\n"); - fprintf (helpout, " --ydrop= set y-drop threshold (default is open+300extend)\n"); - fprintf (helpout, " --noxtrim if x-drop extension encounters end of sequence, don't\n"); - fprintf (helpout, " trim back to peak score (use this for short reads)\n"); - fprintf (helpout, " --noytrim if y-drop extension encounters end of sequence, don't\n"); - fprintf (helpout, " trim back to peak score (use this for short reads)\n"); - -// fprintf (helpout, " --infer= infer scores from the sequences, then use them\n"); -// fprintf (helpout, " --inferonly= infer scores but don't use them (requires --infscores)\n"); -// fprintf (helpout, " all inference options are read from the control file\n"); -// fprintf (helpout, " --infscores[=] write inferred scores to a file\n"); - - fprintf (helpout, " --hspthresh= set threshold for high scoring pairs (default is %s)\n", - score_thresh_to_string (&defaultParams.hspThreshold)); - fprintf (helpout, " ungapped extensions scoring lower are discarded\n"); - fprintf (helpout, " can also be a percentage or base count\n"); - - fprintf (helpout, " --exact= set threshold for exact matches\n"); - fprintf (helpout, " if specified, exact matches are found rather than high\n"); - fprintf (helpout, " scoring pairs (replaces --hspthresh)\n"); -// fprintf (helpout, " --mismatch=, set threshold for mismatches\n"); -// fprintf (helpout, " if specified, N-mismatch segments are found rather\n"); -// fprintf (helpout, " than high scoring pairs (replaces --hspthresh)\n"); - - fprintf (helpout, " --inner= set threshold for HSPs during interpolation\n"); - if (defaultParams.innerThreshold <= 0) - fprintf (helpout, " (default is no interpolation)\n"); - else - fprintf (helpout, " (default is " 
scoreFmtSimple ")\n", - defaultParams.innerThreshold); - - fprintf (helpout, " --gappedthresh= set threshold for gapped alignments\n"); - fprintf (helpout, " gapped extensions scoring lower are discarded\n"); - fprintf (helpout, " can also be a percentage or base count\n"); - fprintf (helpout, " (default is to use same value as --hspthresh)\n"); - -// fprintf (helpout, " --ball=[%%] set minimum score required of words 'in' a quantum ball\n"); - - fprintf (helpout, " --[no]entropy involve entropy in filtering high scoring pairs\n"); - fprintf (helpout, " (default is \"%s\")\n", - (defaultParams.entropicHsp)? "entropy" - : "noentropy"); - - fprintf (helpout, " --nomirror don't report mirror-image alignments when using --self\n"); - fprintf (helpout, " (default is to skip processing them, but recreate them\n"); - fprintf (helpout, " in the output)\n"); - - fprintf (helpout, " --allocate:traceback= space for trace-back information\n"); - fprintf (helpout, " (default is %s)\n", - unitize(defaultParams.tracebackMem,/*byThousands*/ false)); - -// fprintf (helpout, " --maxwordcount=[%%] limit seed word-repeats in target\n"); -// fprintf (helpout, " words occurring too often are not used in seed hits\n"); -// fprintf (helpout, " (default is no word-repeat limit)\n"); - - fprintf (helpout, " --masking= mask any position in target hit this many times\n"); - fprintf (helpout, " zero indicates no masking\n"); - if (defaultParams.dynamicMasking <= 0) - fprintf (helpout, " (default is no masking)\n"); - else - fprintf (helpout, " (default is %d)\n", - defaultParams.dynamicMasking); - -// fprintf (helpout, " --outputmasking= report masked intervals (from --masking) to a file\n"); -// fprintf (helpout, " (default is to not report masked intervals)\n"); -// fprintf (helpout, " --outputmasking+= report masked intervals (from --masking), including\n"); -// fprintf (helpout, " sequence name, to a file\n"); -// fprintf (helpout, " --outputmasking:soft= report masked intervals in the 
target to a file\n"); -// fprintf (helpout, " (default is to not report masked intervals)\n"); -// fprintf (helpout, " --outputmasking+:soft= report masked intervals in the target, including\n"); -// fprintf (helpout, " sequence name, to a file\n"); - -// fprintf (helpout, " --[no]census[=] count/report how many times each target base aligns\n"); -// fprintf (helpout, " (default is %s)\n", -// (defaultParams.reportCensus)? "to report census" -// : "to not report census"); - - fprintf (helpout, " --identity=[..] filter alignments by percent identity\n"); - fprintf (helpout, " 0<=min<=max<=100; blocks (or HSPs) outside min..max\n"); - fprintf (helpout, " are discarded\n"); - fprintf (helpout, " (default is no identity filtering)\n"); - - fprintf (helpout, " --coverage=[..] filter alignments by percentage of query covered\n"); - fprintf (helpout, " 0<=min<=max<=100; blocks (or HSPs) outside min..max\n"); - fprintf (helpout, " are discarded\n"); - fprintf (helpout, " (default is no query coverage filtering)\n"); - -// fprintf (helpout, " --continuity=[..] 
filter alignments by percent continuity\n"); -// fprintf (helpout, " 0<=min<=max<=100; blocks (or HSPs) outside min..max\n"); -// fprintf (helpout, " are discarded\n"); -// fprintf (helpout, " (default is no continuity filtering)\n"); - -// fprintf (helpout, " --filter=nmatch: filter alignments by match-count\n"); -// fprintf (helpout, " 0 filter alignments by mismatch-count\n"); -// fprintf (helpout, " 0 filter alignments by gap-count\n"); -// fprintf (helpout, " 0 filter alignments by gap-count\n"); -// fprintf (helpout, " 0 filter query sequences by alignment density\n"); -// fprintf (helpout, " sequences with bases_aligned/bases > max\n"); -// fprintf (helpout, " are discarded\n"); -// fprintf (helpout, " (default is no query density filtering)\n"); -//#endif // densityFiltering - -// fprintf (helpout, " --[no]laj backward compatibility for laj\n"); -// fprintf (helpout, " (default is %s)\n", -// (defaultParams.lajCompatible)? "to be backward compatible" -// : "not to bother with backward compatibility"); - - fprintf (helpout, " --output= specify output alignment file; otherwise alignments\n"); - fprintf (helpout, " are written to stdout\n"); - fprintf (helpout, " --format= specify output format; one of lav, axt, maf, cigar,\n"); - fprintf (helpout, " rdotplot, text or general\n"); - fprintf (helpout, " (use --help=formats for more details)\n"); - fprintf (helpout, " (by default output is %s)\n", - formatNames[defaultParams.outputFormat]); -// fprintf (helpout, " --readgroup= specify readgroup tags for SAM format\n"); -// fprintf (helpout, " (use --help=formats for more details)\n"); -// fprintf (helpout, " --markend Write a comment at the end of the output file\n"); - - fprintf (helpout, " --rdotplot= create an output file suitable for plotting in R.\n"); - -// fprintf (helpout, " --verbosity= set info level (0 is minimum, 10 is everything)\n"); -// fprintf (helpout, " (default is %d)\n", -// defaultParams.verbosity); - -// fprintf (helpout, " --[no]runtime 
report runtime in the output file\n"); -// fprintf (helpout, " (default is %s)\n", -// (defaultParams.reportTiming)? "to report runtime" -// : "to not report runtime"); - -// fprintf (helpout, " --tableonly[=count] just produce the target position table, don't\n"); -// fprintf (helpout, " search for seeds\n"); - -// fprintf (helpout, " --writesegments= just produce the anchor segments table, don't\n"); -// fprintf (helpout, " perform gapped alignment\n"); - -// fprintf (helpout, " --writecapsule= write the target and seed word table to a file\n"); -// fprintf (helpout, " --targetcapsule= read the target seed word table from a file\n"); -// fprintf (helpout, " (this replaces the target specifier)\n"); - -#ifdef collect_stats - fprintf (helpout, " --[no]stats[=] show search statistics (or don't)\n"); - fprintf (helpout, " (default is %s)\n", - (defaultParams.showStats)? "to show shats" - : "to not show stats"); -#endif - - fprintf (helpout, " --progress= report processing of every nth query\n"); -// fprintf (helpout, " --progress+masking= report processing of every nth query, and include\n"); -// fprintf (helpout, " masking stats (useful with --masking)\n"); - - fprintf (helpout, " --version report the program version and quit\n"); - fprintf (helpout, " --help list all options\n"); - fprintf (helpout, " --help=files list information about file specifiers\n"); - fprintf (helpout, " --help=formats list information about output file formats\n"); - fprintf (helpout, " --help=shortcuts list blastz-compatible shortcuts\n"); - fprintf (helpout, " --help=defaults list scoring defaults for your current settings\n"); - fprintf (helpout, " --help=yasra list yasra-specific shortcuts\n"); - - // non-yasra expanders - - for (ix=0 ; ix allows the specification of tags for SAM's\n"); - fprintf (helpout, " @RG header line. is a tab-delimited list of : items.\n"); - fprintf (helpout, " See the SAM spec for more information about these tags. 
If --readgroup is\n"); - fprintf (helpout, " used more than once the lists are concatenated.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "CIGAR\n"); - fprintf (helpout, " CIGAR format is a pairwise alignment format that describes alignment blocks\n"); - fprintf (helpout, " in a run-length format. As of Jan/2009, a spec for CIGAR files can be\n"); - fprintf (helpout, " found at\n"); - fprintf (helpout, " may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "BLASTN\n"); - fprintf (helpout, " BLASTN format is similar to the output from the blastn program of the NCBI\n"); - fprintf (helpout, " standalone blast package.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "segments\n"); - fprintf (helpout, " Output anchor segments, for reprocessing with --segments=.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "rdotplot\n"); - fprintf (helpout, " R output creates a file that can be plotted in the statistical package R.\n"); - fprintf (helpout, " After creating the file like this:\n"); - fprintf (helpout, " lastz ... --format=rdotplot > rdots.dat\n"); - fprintf (helpout, " ask R to plot it using an R command like this:\n"); - fprintf (helpout, " plot(read.table(\"rdots.dat\",header=T),type=\"l\")\n"); - fprintf (helpout, " The separate option --rdotplot= can be used to create a dot plot file\n"); - fprintf (helpout, " at the same time as creating alignment output in another format.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "text\n"); - fprintf (helpout, " Textual output is intended to be human readable. 
Each alignment block is\n"); - fprintf (helpout, " displayed with gap characters and a row of match/transition characters.\n"); - fprintf (helpout, " Lines are wrapped at some reasonable width to allow printing to paper.\n"); - fprintf (helpout, " The exact format of textual output may change in future releases of lastz.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, "general\n"); - fprintf (helpout, " General output creates a tab-delimited table with one line per alignment\n"); - fprintf (helpout, " block. The user can specify which fields are written (and in what order).\n"); - fprintf (helpout, " This format is well-suited for use with spreadsheets and the R statistical\n"); - fprintf (helpout, " package, and for downstream processing with command-line tools such as awk\n"); - fprintf (helpout, " and sort.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, " The format of the general output option is one of these:\n"); - fprintf (helpout, " --format=general\n"); - fprintf (helpout, " --format=general:\n"); - fprintf (helpout, " --format=general-\n"); - fprintf (helpout, " --format=general-:\n"); - fprintf (helpout, " where is a comma-separated list of field names. If this list is\n"); - fprintf (helpout, " absent a default set of fields is printed. The option --format=general-\n"); - fprintf (helpout, " (with or without fields) inhibits the header lines. This makes it suitable\n"); - fprintf (helpout, " for catenating output from multiple runs. The recognized field names are\n"); - fprintf (helpout, " shown below. 
See the lastz readme file for more details.\n"); - fprintf (helpout, "\n"); - fprintf (helpout, " Recognized field names:\n"); - - lineWidth = 0; - for (ix=0 ; genpafName[ix].name!=NULL ; ix++) - { - name = genpafName[ix].name; - nameLen = strlen(name); - - if (strcmp (name, "NA") == 0) continue; // (unadvertised fields) - if (strcmp (name, "~") == 0) continue; - if (strcmp (name, "hspid") == 0) continue; - if (strcmp (name, "phash") == 0) continue; - if (strcmp (name, "ahash") == 0) continue; - - needComma = true; - if (lineWidth == 0) - { - fprintf (helpout, " "); - lineWidth = 8; - needComma = false; - } - else if (lineWidth + 2 + nameLen >= 79) - { - fprintf (helpout, ",\n "); - lineWidth = 8; - needComma = false; - } - - if (needComma) fprintf (helpout, ", "); - fprintf (helpout, "%s", name); - if (needComma) lineWidth += 2; - lineWidth += nameLen; - } - fprintf (helpout, "\n"); - - fprintf (helpout, "\n"); - fprintf (helpout, "The option --markend can be useful in cases (such as batch servers) in which\n"); - fprintf (helpout, "there may be a question as to whether or not lastz completed successfully. The\n"); - fprintf (helpout, "line \"# lastz end-of-file\" is written to output as the last line. Note that\n"); - fprintf (helpout, "in some formats this is *not* a legal line; the user must remove it before any\n"); - fprintf (helpout, "downstream processsing.\n"); - - exit (EXIT_FAILURE); - } - - -static void shortcuts (void) - { - scoreset* hoxScoring = NULL; - - fprintf (helpout, "%54s%s\n", "", "[defaults]"); - fprintf (helpout, " B=0 same as --strand=plus%s\n", - (defaultParams.whichStrand == 0)? " [B=0]" : ""); - fprintf (helpout, " B=2 same as --strand=both\%s\n", - (defaultParams.whichStrand > 0)? " [B=2]" : ""); - fprintf (helpout, " B=-1 same as --strand=minus%s\n", - (defaultParams.whichStrand < -1)? " [B=-1]" : ""); - - fprintf (helpout, " C=0 same as --nochain --gapped%s\n", - ((!defaultParams.chain) && (defaultParams.gappedExtend))? 
" [C=0]" : ""); - fprintf (helpout, " C=1 same as --chain --nogapped%s\n", - ((defaultParams.chain) && (!defaultParams.gappedExtend))? " [C=1]" : ""); - fprintf (helpout, " C=2 same as --chain --gapped%s\n", - ((defaultParams.chain) && (defaultParams.gappedExtend))? " [C=2]" : ""); - fprintf (helpout, " C=3 same as --nochain --nogapped%s\n", - ((!defaultParams.chain) && (!defaultParams.gappedExtend))? " [C=3]" : ""); - - fprintf (helpout, " c=1 same as --census%23s[c=%d]\n", - "", (defaultParams.reportCensus)? 1 : 0); - - fprintf (helpout, " E= same as --gap=<..,penalty>%13s[E=" scoreFmtSimple "]\n", - "", HOXD70_extend); - - fprintf (helpout, " G= same as --chain=%13s[G=" scoreFmtSimple "]\n", - "", defaultParams.chainDiag); - - fprintf (helpout, " H= same as --inner=%16s[H=" scoreFmtSimple "]\n", - "", defaultParams.innerThreshold); - - fprintf (helpout, " K= same as --hspthresh=%12s[K=%s]\n", - "", score_thresh_to_string (&defaultParams.hspThreshold)); - fprintf (helpout, " L= same as --gappedthresh=%9s[L=K]\n", ""); - - fprintf (helpout, " M= same as --masking=%14s[M=%d]\n", - "", defaultParams.dynamicMasking); - - fprintf (helpout, " m= same as --allocate:traceback=%3s[m=%s]\n", - "", unitize(defaultParams.tracebackMem,/*byThousands*/ false)); - - fprintf (helpout, " O= same as --gap=%13s[O=" scoreFmtSimple "]\n", - "", HOXD70_open); - - fprintf (helpout, " P=0 same as --noentropy%s\n", - (!defaultParams.entropicHsp)? " [P=0]" : ""); - fprintf (helpout, " P=1 same as --entropy%s\n", - ((defaultParams.entropicHsp) && (!defaultParams.reportEntropy))? " [P=1]" : ""); - fprintf (helpout, " P>1 same as --entropy=report%s\n", - ((defaultParams.entropicHsp) && (defaultParams.reportEntropy))? 
" [P>1]" : ""); - - fprintf (helpout, " Q= same as --scores=%16s[Q=]\n",""); - - fprintf (helpout, " R= same as --chain=<..,score>%13s[R=" scoreFmtSimple "]\n", - "", defaultParams.chainAnti); - - fprintf (helpout, " T=1 same as --seed=12of19 --transition%s\n", - ((defaultParams.withTrans == 1) && (strcmp(defaultSeedString,seed_12of19) == 0))? " [T=1]" : ""); - fprintf (helpout, " T=2 same as --seed=12of19 --notransition%s\n", - ((defaultParams.withTrans == 0) && (strcmp(defaultSeedString,seed_12of19) == 0))? " [T=2]" : ""); - fprintf (helpout, " T=3 same as --seed=14of22 --transition%s\n", - ((defaultParams.withTrans == 1) && (strcmp(defaultSeedString,seed_14of22) == 0))? " [T=3]" : ""); - fprintf (helpout, " T=4 same as --seed=14of22 --notransition%s\n", - ((defaultParams.withTrans == 0) && (strcmp(defaultSeedString,seed_14of22) == 0))? " [T=4]" : ""); - - fprintf (helpout, " U=1 same as --match=1,1\n"); - - fprintf (helpout, " W= same as --seed=match\n"); - - fprintf (helpout, " X= same as --xdrop=%16s[X=10sub[A][A]]\n", ""); - fprintf (helpout, " Y= same as --ydrop=%16s[Y=O+300E]\n", ""); - - fprintf (helpout, " Z= same as --step=%16s[Z=%u]\n", - "", defaultParams.step); - - fprintf (helpout, " v=0 same as --verbosity=0%s\n", - (defaultParams.verbosity == 0)? " [v=0]" : ""); - fprintf (helpout, " v=1 same as --verbosity=10%s\n", - (defaultParams.verbosity == 10)? 
" [v=1]" : ""); - - fprintf (helpout, "\n"); - hoxScoring = new_dna_score_set (HOXD70, 0, 0, 0, 0); - print_score_matrix_lf (helpout, hoxScoring, false, '\n'); - free_score_set ("", hoxScoring); - - exit (EXIT_FAILURE); - } - - -static void show_scoring_defaults (FILE* f, int andExit) - { - // nota bene: an older, similar routine is print_params() - char* name1 = currParams->seq1Filename; - char* name2 = currParams->seq2Filename; - char* args = currParams->args; - scoreset* scoring = currParams->scoring; - seed* hitSeed = currParams->hitSeed; - char* seedPattern, *seedNickname; - int w = 12; // width of shortcut field - char* commentPrefix; - char _commentPrefix[2]; - char buffer[501]; - - if (name1 == NULL) name1 = "(no name)"; - if (name2 == NULL) name2 = "(no name)"; - if (args == NULL) args = "(none)"; - - if (andExit) - { _commentPrefix[0] = 0; commentPrefix = _commentPrefix; } - else - { - commentPrefix = print_comment_open (); - if (commentPrefix == NULL) - { _commentPrefix[0] = 0; commentPrefix = _commentPrefix; } - } - - fprintf (f, "%s target file spec = %s\n", commentPrefix, name1); - fprintf (f, "%s query file spec = %s\n", commentPrefix, name2); - fprintf (f, "%s arguments = %s\n", commentPrefix, args); - fprintf (f, "%s\n", commentPrefix); - - if (currParams->selfCompare) - fprintf (f, "%s %-*s --self\n", commentPrefix, w, ""); - - if (currParams->whichStrand > 0) - fprintf (f, "%s %-*s --strand=both\n", commentPrefix, w, "B=2"); - else if (currParams->whichStrand < 0) - fprintf (f, "%s %-*s --strand=minus\n", commentPrefix, w, "B=-1"); - else - fprintf (f, "%s %-*s --strand=plus\n", commentPrefix, w, "B=0"); - - sprintf (buffer, "Z=%d", currParams->step); - fprintf (f, "%s %-*s --step=%d\n", commentPrefix, w, buffer, currParams->step); - - seedPattern = seed_pattern (hitSeed); - if (strcmp (seedPattern, seed_12of19) == 0) seedNickname = " (12of19)"; - else if (strcmp (seedPattern, seed_14of22) == 0) seedNickname = " (14of22)"; - else seedNickname = 
""; - if (hitSeed->weight == 2*hitSeed->length) - sprintf (buffer, "W=%d", hitSeed->length); - else - strcpy (buffer, ""); - fprintf (f, "%s %-*s --seed=%s%s\n", commentPrefix, w, buffer, seedPattern, seedNickname); - - if (currParams->withTrans == 0) - fprintf (f, "%s %-*s --notransition\n", commentPrefix, w, ""); - else if (currParams->withTrans == 1) - fprintf (f, "%s %-*s --transition\n", commentPrefix, w, ""); - else if (currParams->withTrans == 2) - fprintf (f, "%s %-*s --transition=2\n", commentPrefix, w, ""); - - sprintf (buffer, "O=" scoreFmtSimple " E=" scoreFmtSimple, scoring->gapOpen, scoring->gapExtend); - fprintf (f, "%s %-*s --gap=" scoreFmtSimple "," scoreFmtSimple "\n", commentPrefix, w, buffer, scoring->gapOpen, scoring->gapExtend); - - if (currParams->gfExtend == gfexXDrop) - { - sprintf (buffer, "K=%s", score_thresh_to_string (&currParams->hspThreshold)); - fprintf (f, "%s %-*s --hspthresh=%s\n", commentPrefix, w, buffer, score_thresh_to_string (&currParams->hspThreshold)); - } - - sprintf (buffer, "L=%s", score_thresh_to_string (&currParams->gappedThreshold)); - fprintf (f, "%s %-*s --gappedthresh=%s\n", commentPrefix, w, buffer, score_thresh_to_string (&currParams->gappedThreshold)); - - if (currParams->entropicHsp) - fprintf (f, "%s %-*s --entropy\n", commentPrefix, w, "P=1"); - else - fprintf (f, "%s %-*s --noentropy\n", commentPrefix, w, "P=0"); - - if (currParams->gfExtend == gfexXDrop) - { - sprintf (buffer, "X=" scoreFmtSimple, currParams->xDrop); - fprintf (f, "%s %-*s --xdrop=" scoreFmtSimple "\n", commentPrefix, w, buffer, currParams->xDrop); - } - else if (currParams->gfExtend == gfexExact) - fprintf (f, "%s %-*s --exact=%s\n", commentPrefix, w, "", score_thresh_to_string (&currParams->hspThreshold)); - else if ((currParams->gfExtend >= gfexMismatch_min) && (currParams->gfExtend <= gfexMismatch_max)) - fprintf (f, "%s %-*s --mismatch=%d,%s\n", commentPrefix, w, "", currParams->gfExtend,score_thresh_to_string 
(&currParams->hspThreshold)); - - sprintf (buffer, "Y=" scoreFmtSimple, currParams->yDrop); - fprintf (f, "%s %-*s --ydrop=" scoreFmtSimple "\n", commentPrefix, w, buffer, currParams->yDrop); - - sprintf (buffer, "H=" scoreFmtSimple, currParams->innerThreshold); - fprintf (f, "%s %-*s --inner=" scoreFmtSimple "\n", commentPrefix, w, buffer, currParams->innerThreshold); - - sprintf (buffer, "M=%d", currParams->dynamicMasking); - fprintf (f, "%s %-*s --masking=%d\n", commentPrefix, w, buffer, currParams->dynamicMasking); - - sprintf (buffer, "m=%u", currParams->tracebackMem); - fprintf (f, "%s %-*s --allocate:traceback=%u\n", commentPrefix, w, buffer, currParams->tracebackMem); - - fprintf (f, "%s\n", commentPrefix); - - fprintf (f, "%s (substitution scores)\n", commentPrefix); - if (andExit) - print_score_matrix_lf (f, scoring, false, '\n'); - else - print_score_matrix_prefix (f, scoring, false, commentPrefix); - - if (andExit) - exit (EXIT_FAILURE); - else - print_comment_close (); - } - - -static void expander_options (char* header, char* prefix) - { - int ix, width, len; - - width = 0; - for (ix=0 ; ix width) width = len; - } - - fprintf (helpout, "%s\n", header); - if (width == 0) - { - fprintf (helpout, " (none)\n"); - exit (EXIT_FAILURE); - } - - for (ix=0 ; ix 0) - { - // prepare the next argument for parsing; if the argument starts with - // unicode non-breaking hyphens (UTF-8 string 0xE2 0x80 0x91), make a - // copy of the argument and replace those with an ascii dash - - arg = argv[0]; - - if ((arg[0] == (char) 0xE2) && (arg[1] == (char) 0x80) && (arg[2] == (char) 0x91) - && (arg[3] == (char) 0xE2) && (arg[4] == (char) 0x80) && (arg[5] == (char) 0x91)) - { - argTempNeeded = 2 + strlen(arg+6) + 1; - if (argTempNeeded > argTempSize) - { - if (argTemp == NULL) - argTemp = (char*) malloc_or_die ("temporary argument string", argTempNeeded); - else - argTemp = (char*) realloc_or_die ("temporary argument string", argTemp, argTempNeeded); - } - argTemp[0] = 
argTemp[1] = '-'; - strcpy (argTemp+2, arg+6); - arg = argTemp; - } - - // copy argument (if it turns out to be a file name, we'll erase it - // later) - - if (isTopLevel) - { - argsLen = strlen(lzParams->args); - strcpy (lzParams->args+argsLen, arg); - strcpy (lzParams->args+argsLen+strlen(arg)," "); - } - - // locate arg value string, if there is one - - argStr = strchr(arg,'='); - if (argStr != NULL) argStr++; - - //if (argStr == NULL) fprintf (stderr, "arg=\"%s\"\n", arg); - // else fprintf (stderr, "arg=\"%s\" argStr=\"%s\"\n", arg, argStr); - - // --svn (unadvertised) - // $$$ This needs to be improved so that it shows the *latest* revision - // $$$ .. number of any module in the build path - - if (strcmp (arg, "--svn") == 0) - { - printf ("SVN revision: %s\n", svnRevisionNumber); - exit (EXIT_FAILURE); - } - - // --self and (unadvertised) --debug=clonedquery; the latter uses the - // same sequence file (as a cloned structure) but otherwise should - // behave the same as if the file were copied - - if (strcmp (arg, "--self") == 0) - { - lzParams->selfCompare - = lzParams->clonedQuery - = lzParams->inhibitTrivial = true; - goto next_arg; - } - - if (strcmp (arg, "--debug=clonedquery") == 0) - { lzParams->clonedQuery = true; goto next_arg; } - - // --notrivial - - if (strcmp (arg, "--notrivial") == 0) - { lzParams->inhibitTrivial = true; goto next_arg; } - - // --seed=, --seed=match and variants - - if (strcmp (arg, "T=0") == 0) // in blastz, T=0 was accompanied - { // .. by W=, which did not - lzParams->withTrans = 0; // .. 
support transitions - goto next_arg; - } - - if (strcmp (arg, "T=1") == 0) - { - if (seedString != NULL) goto duplicated_option; - seedString = copy_string (seed_12of19); - seedArg = copy_string (arg); - lzParams->withTrans = 1; - goto next_arg; - } - - if (strcmp (arg, "T=2") == 0) - { - if (seedString != NULL) goto duplicated_option; - seedString = copy_string (seed_12of19); - seedArg = copy_string (arg); - lzParams->withTrans = 0; - goto next_arg; - } - - if (strcmp (arg, "T=3") == 0) - { - if (seedString != NULL) goto duplicated_option; - seedString = copy_string (seed_14of22); - seedArg = copy_string (arg); - lzParams->withTrans = 1; - goto next_arg; - } - - if (strcmp (arg, "T=4") == 0) - { - if (seedString != NULL) goto duplicated_option; - seedString = copy_string (seed_14of22); - seedArg = copy_string (arg); - lzParams->withTrans = 0; - goto next_arg; - } - - if (strcmp_prefix (arg, "W=") == 0) - { - if (seedString != NULL) - chastise ("can't specify W= with --seed\n"); - items = sscanf (argStr, "%d", &wordLen); - if (items != 1) goto cant_understand; - goto build_match_seed; - } - - if (strcmp_prefix (arg, "--seed=") == 0) - { - if (seedString != NULL) goto duplicated_option; - if (strcmp (argStr, "12of19") == 0) - { - seedString = copy_string (seed_12of19); - seedArg = copy_string (arg); - goto next_arg; - } - - if (strcmp (argStr, "14of22") == 0) - { - seedString = copy_string (seed_14of22); - seedArg = copy_string (arg); - goto next_arg; - } - - if ((strcmp_prefix (argStr, "match(") == 0) - && (argStr[strlen(argStr)-1] == ')')) - { - scan = strchr(argStr,'(') + 1; - items = sscanf (scan, "%d)", &wordLen); - if (items != 1) goto cant_understand; - goto build_match_seed; - } - - if (strcmp_prefix (argStr, "match") == 0) - { - scan = argStr + strlen("match"); - items = sscanf (scan, "%d%c", &wordLen, &extra); - if (items != 1) goto cant_understand; - build_match_seed: - // note: we allow wordLen=1 only to support the --tableonly - // .. 
option, to allow tabulating single-base sequence - // .. content; if the user attempts to use wordLen=1 for - // .. actual alignment, the seed hit search routine(s) - // .. will report failure to the user - if ((wordLen < 1) || (wordLen > 15)) - chastise ("%d is not a valid word length\n", wordLen); - - seedString = malloc_or_die ("parse_options_loop (wordLen)", wordLen + 1); - for (ix=0 ; ixwithTrans = 0; - haveWithTrans = true; - haveWithTransForMatch = true; - } - seedArg = copy_string (arg); - goto next_arg; - } - - if ((strcmp_prefix (argStr, "half(") == 0) - && (argStr[strlen(argStr)-1] == ')')) - { - scan = strchr(argStr,'(') + 1; - items = sscanf (scan, "%d)", &wordLen); - if (items != 1) goto cant_understand; - goto build_half_seed; - } - - if (strcmp_prefix (argStr, "half") == 0) - { - scan = argStr + strlen("half"); - items = sscanf (scan, "%d%c", &wordLen, &extra); - if (items != 1) goto cant_understand; - build_half_seed: - if ((wordLen < 2) || (wordLen > 31)) - chastise ("%d is not a valid word length\n", wordLen); - - seedString = malloc_or_die ("parse_options_loop (wordLen)", wordLen + 1); - for (ix=0 ; ixwithTrans = 0; goto next_arg; } - - if ((strcmp (arg, "--trans") == 0) - || (strcmp (arg, "--transition") == 0) - || (strcmp (arg, "--trans=1") == 0) - || (strcmp (arg, "--transition=1") == 0)) - { lzParams->withTrans = 1; haveWithTrans = true; goto next_arg; } - - if ((strcmp (arg, "--trans=2") == 0) - || (strcmp (arg, "--transition=2") == 0) - || (strcmp (arg, "--transitions=2") == 0)) - { lzParams->withTrans = 2; haveWithTrans = true; goto next_arg; } - - // --[no]filter=<[T,]M> and --filter=cares:<[T,]M> - // (--filter=<[T:]M> supported for historical reasons) - - if (strcmp (arg, "--nofilter") == 0) - { lzParams->minMatches = -1; goto next_arg; } - - if (strcmp_prefix (arg, "--filter=cares:") == 0) - { - scan = strchr(argStr,','); - if (scan != NULL) - { - *(scan++) = 0; - lzParams->maxTransversions = string_to_int (argStr + strlen("cares:")); 
- lzParams->minMatches = string_to_int (scan); - } - else - { - lzParams->maxTransversions = -1; - lzParams->minMatches = string_to_int (argStr + strlen("cares:")); - } - - lzParams->filterCaresOnly = true; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--filter=") == 0) - && (isdigit(argStr[0]))) // prevents collision with --filter=:nmatch, etc. - { - scan = strchr(argStr,','); - if (scan != NULL) - { - *(scan++) = 0; - lzParams->maxTransversions = string_to_int (argStr); - lzParams->minMatches = string_to_int (scan); - } - else if ((scan = strchr(argStr,':')) != NULL) - { - *(scan++) = 0; - lzParams->maxTransversions = string_to_int (argStr); - lzParams->minMatches = string_to_int (scan); - } - else - { - lzParams->maxTransversions = -1; - lzParams->minMatches = string_to_int (argStr); - } - - lzParams->filterCaresOnly = false; - goto next_arg; - } - - // --word= - - if (strcmp_prefix (arg, "--word=") == 0) - { - lzParams->maxIndexBits = string_to_int (argStr); - goto next_arg; - } - - // --notwins and --twins= - // (--twins= supported for historical reasons) - - if (strcmp (arg, "--notwins") == 0) - { twinsYes = false; goto next_arg; } - - if (strcmp_prefix (arg, "--twins=") == 0) - { - twinsYes = true; - scan = strstr(argStr,".."); - if (scan != NULL) - { - *scan = 0; scan += 2; - minGap = string_to_int (argStr); - maxGap = string_to_int (scan); - } - else if ((scan = strchr(argStr,':')) != NULL) - { - *(scan++) = 0; - minGap = string_to_int (argStr); - maxGap = string_to_int (scan); - } - else - { - minGap = 0; - maxGap = string_to_int (argStr); - } - - goto next_arg; - } - - // --seedqueue= - -#ifndef noSeedHitQueue - if (strcmp_prefix (arg, "--seedqueue=") == 0) - { - lzParams->seedHitQueueSize = string_to_int (argStr); - goto next_arg; - } -#endif // not noSeedHitQueue - - // --recoverseeds (used to be --recoverhits) - - if ((strcmp (arg, "--norecoverseeds") == 0) - || (strcmp (arg, "--norecoverhits") == 0)) - { lzParams->basicHitType = hitSimple; goto 
next_arg; } - - if ((strcmp (arg, "--recoverseeds") == 0) - || (strcmp (arg, "--recoverhits") == 0)) - { lzParams->basicHitType = hitRecover; goto next_arg; } - - // --rawhits - - if (strcmp (arg, "--rawhits") == 0) - { lzParams->noHitFiltering = true; goto next_arg; } - - // --step= or Z= - - if ((strcmp_prefix (arg, "--step=") == 0) - || (strcmp_prefix (arg, "Z=") == 0)) - { - tempInt = string_to_int (argStr); - if (tempInt <= 0) - suicidef ("--step must be positive"); - lzParams->step = tempInt; - haveStep = true; - goto next_arg; - } - - // --strand=both, etc. - - if ((strcmp (arg, "--both") == 0) - || (strcmp (arg, "--bothstrands") == 0) - || (strcmp (arg, "--strand=both") == 0)) - { lzParams->whichStrand = 2; goto next_arg; } - - if ((strcmp (arg, "--plus") == 0) - || (strcmp (arg, "--plusstrand") == 0) - || (strcmp (arg, "--strand=plus") == 0) - || (strcmp (arg, "--strand=+") == 0) - || (strcmp (arg, "--strand=forward") == 0)) - { lzParams->whichStrand = 0; goto next_arg; } - - if ((strcmp (arg, "--minus") == 0) - || (strcmp (arg, "--minusstrand") == 0) - || (strcmp (arg, "--strand=minus") == 0) - || (strcmp (arg, "--strand=-") == 0) - || (strcmp (arg, "--strand=reverse") == 0)) - { lzParams->whichStrand = -1; goto next_arg; } - - if (strcmp_prefix (arg, "B=") == 0) - { - lzParams->whichStrand = string_to_int (argStr); - goto next_arg; - } - - // --ambiguous=n[,] and --ambiguous=iupac[,] - - if ((strcmp_prefix (arg, "--ambiguous=n,") == 0) - || (strcmp_prefix (arg, "--ambig=n,") == 0) - || (strcmp_prefix (arg, "--ambiguous=N,") == 0) - || (strcmp_prefix (arg, "--ambig=N,") == 0)) - { - argStr2 = strchr(arg,',') + 1; - argStr = strchr(argStr2,','); - if (argStr != NULL) - { - argStr = strchr(arg,',') + 1; - argStr2 = strchr(argStr,',') + 1; - } - - tempScore2 = string_to_score (argStr2); - if (tempScore2 < 0) - suicidef ("penalty for --ambiguous=n must be non-negative"); - - tempScore = 0; - if (argStr != NULL) - { - argTempSubNeeded = argStr2 - argStr; // 
',' holds space for the 0 - if (argTempSubNeeded > argTempSubSize) - { - if (argTempSub == NULL) - argTempSub = (char*) malloc_or_die ("temporary argument substring", argTempSubNeeded); - else - argTempSub = (char*) realloc_or_die ("temporary argument substring", argTempSub, argTempSubNeeded); - } - strncpy (argTempSub, argStr, argTempSubNeeded-1); - argTempSub[argTempSubNeeded-1] = 0; - tempScore = string_to_score (argTempSub); - } - - lzParams->nIsAmbiguous = true; - lzParams->ambiMatch = tempScore; - lzParams->ambiMismatch = tempScore2; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--ambiguous=iupac,") == 0) - || (strcmp_prefix (arg, "--ambig=iupac,") == 0) - || (strcmp_prefix (arg, "--ambiguous=IUPAC,") == 0) - || (strcmp_prefix (arg, "--ambig=IUPAC,") == 0)) - { - argStr2 = strchr(arg,',') + 1; - argStr = strchr(argStr2,','); - if (argStr != NULL) - { - argStr = strchr(arg,',') + 1; - argStr2 = strchr(argStr,',') + 1; - } - - tempScore2 = string_to_score (argStr2); - if (tempScore2 < 0) - suicidef ("penalty for --ambiguous=iupac must be non-negative"); - - tempScore = 0; - if (argStr != NULL) - { - argTempSubNeeded = argStr2 - argStr; // ',' holds space for the 0 - if (argTempSubNeeded > argTempSubSize) - { - if (argTempSub == NULL) - argTempSub = (char*) malloc_or_die ("temporary argument substring", argTempSubNeeded); - else - argTempSub = (char*) realloc_or_die ("temporary argument substring", argTempSub, argTempSubNeeded); - } - strncpy (argTempSub, argStr, argTempSubNeeded-1); - argTempSub[argTempSubNeeded-1] = 0; - tempScore = string_to_score (argTempSub); - } - - lzParams->allowAmbiDNA = lzParams->nIsAmbiguous = true; - lzParams->ambiMatch = tempScore; - lzParams->ambiMismatch = tempScore2; - goto next_arg; - } - - if ((strcmp (arg, "--ambiguousn") == 0) - || (strcmp (arg, "--ambiguous=n") == 0) - || (strcmp (arg, "--ambig=n") == 0) - || (strcmp (arg, "--ambiguous=N") == 0) - || (strcmp (arg, "--ambig=N") == 0)) - { lzParams->nIsAmbiguous = true; 
goto next_arg; } - - if ((strcmp (arg, "--ambiguous=iupac") == 0) - || (strcmp (arg, "--ambig=iupac") == 0) - || (strcmp (arg, "--ambiguous=IUPAC") == 0) - || (strcmp (arg, "--ambig=IUPAC") == 0)) - { - lzParams->allowAmbiDNA = lzParams->nIsAmbiguous = true; - goto next_arg; - } - - // --[no]gfextend or (unadvertised) --[no]gfx - - if ((strcmp (arg, "--gfextend") == 0) - || (strcmp (arg, "--gfx" ) == 0)) - { lzParams->gfExtend = gfexXDrop; goto next_arg; } - - if ((strcmp (arg, "--nogfextend") == 0) - || (strcmp (arg, "--nogfx" ) == 0)) - { lzParams->gfExtend = gfexNoExtend; goto next_arg; } - - // --justhits or --hitsonly (unadvertised) - - if ((strcmp (arg, "--justhits") == 0) - || (strcmp (arg, "--hitsonly") == 0)) - { - lzParams->gfExtend = gfexNoExtend; - lzParams->gappedExtend = false; - goto next_arg; - } - - // --[no]chain, --chain=, G=, or R= - - if (strcmp (arg, "--chain") == 0) - { lzParams->chain = true; goto next_arg; } - - if (strcmp (arg, "--nochain") == 0) - { lzParams->chain = false; goto next_arg; } - - if (strcmp_prefix (arg, "--chain=") == 0) - { - lzParams->chain = true; - scan = strchr(argStr,','); - if (scan == NULL) - chastise ("%s is not a valid pair of chain penalties\n", argStr); - *scan = 0; - lzParams->chainDiag = string_to_score (argStr); - *(scan++) = ','; - lzParams->chainAnti = string_to_score (scan); - goto next_arg; - } - - if (strcmp_prefix (arg, "G=") == 0) - { - lzParams->chainDiag = string_to_score (argStr); - goto next_arg; - } - - if (strcmp_prefix (arg, "R=") == 0) - { - lzParams->chainAnti = string_to_score (argStr); - goto next_arg; - } - - // --[no]gapped, C=, or (unadvertised) --[no]gx - - if ((strcmp (arg, "--gapped") == 0) - || (strcmp (arg, "--gx" ) == 0)) - { - lzParams->gappedExtend = true; - haveGappedOption = true; - goto next_arg; - } - - if ((strcmp (arg, "--nogapped") == 0) - || (strcmp (arg, "--ungapped") == 0) - || (strcmp (arg, "--nogx" ) == 0)) - { lzParams->gappedExtend = false; goto next_arg; } - - if 
(strcmp (arg, "C=0") == 0) - { - lzParams->chain = false; - lzParams->gappedExtend = true; - haveGappedOption = true; - goto next_arg; - } - - if (strcmp (arg, "C=1") == 0) - { - lzParams->chain = true; - lzParams->gappedExtend = false; - goto next_arg; - } - - if (strcmp (arg, "C=2") == 0) - { - lzParams->chain = true; - lzParams->gappedExtend = true; - haveGappedOption = true; - goto next_arg; - } - - if (strcmp (arg, "C=3") == 0) - { - lzParams->chain = false; - lzParams->gappedExtend = false; - goto next_arg; - } - - // --anyornone - - if ((strcmp (arg, "--anyornone") == 0) - || (strcmp (arg, "--stopafterone") == 0)) - { - lzParams->hspImmediate = true; - lzParams->searchLimit = 1; - lzParams->searchLimitWarn = false; - lzParams->searchLimitKeep = false; - goto next_arg; - } - - // --limitperquery= (unadvertised) - - if ((strcmp_prefix (arg, "--limitperquery=") == 0) - || (strcmp_prefix (arg, "--stopafter=") == 0)) - { - tempInt = string_to_int (strchr(arg,'=')+1); - if (tempInt <= 0) - suicidef ("limit for --limitperquery must be positive"); - lzParams->hspImmediate = true; - lzParams->searchLimit = tempInt; - lzParams->searchLimitWarn = false; - lzParams->searchLimitKeep = false; - goto next_arg; - } - - // --queryhsplimit[+]=[[no]warn:] - // this differs from --limitperquery by not setting hspImmediate - // queryhsplimit+ is not documented; it allows alignments to be - // .. reported up to the limit, whereas the other options inhibit - // .. 
reporting of alignments for queries that exceed the limit - - if ((strcmp_prefix (arg, "--queryhsplimit=keep,nowarn:") == 0) - || (strcmp_prefix (arg, "--queryhsplimit+=nowarn:" ) == 0)) - { - tempInt = string_to_unitized_int (strchr(arg,':')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = false; - lzParams->searchLimitKeep = true; - goto check_search_limit; - } - - if (strcmp_prefix (arg, "--queryhsplimit+=warn:") == 0) - { - tempInt = string_to_unitized_int (strchr(arg,':')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = true; - lzParams->searchLimitKeep = true; - goto check_search_limit; - } - - if ((strcmp_prefix (arg, "--queryhsplimit=keep:") == 0) - || (strcmp_prefix (arg, "--queryhsplimit+=" ) == 0)) - { - tempInt = string_to_unitized_int (strchr(arg,'=')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = true; - lzParams->searchLimitKeep = true; - goto check_search_limit; - } - - if (strcmp_prefix (arg, "--queryhsplimit=nowarn:") == 0) - { - tempInt = string_to_unitized_int (strchr(arg,':')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = false; - lzParams->searchLimitKeep = false; - goto check_search_limit; - } - - if (strcmp_prefix (arg, "--queryhsplimit=warn:") == 0) - { - tempInt = string_to_unitized_int (strchr(arg,':')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = true; - lzParams->searchLimitKeep = false; - goto check_search_limit; - } - - if (strcmp_prefix (arg, "--queryhsplimit=") == 0) - { - tempInt = string_to_unitized_int (strchr(arg,'=')+1, true /*units of 1,000*/); - lzParams->searchLimitWarn = true; - lzParams->searchLimitKeep = false; - check_search_limit: - if (tempInt <= 0) - suicidef ("--queryhsplimit must be positive"); - lzParams->searchLimit = tempInt; - if (lzParams->numBestHsps != 0) - chastise ("can't use %s with --queryhspbest\n", arg); - goto next_arg; - } - - // --queryhspbest= - - if (strcmp_prefix (arg, "--queryhspbest=") == 0) - { - tempInt = string_to_unitized_int 
(strchr(arg,'=')+1, true /*units of 1,000*/); - if (tempInt <= 0) - suicidef ("--queryhspbest must be positive"); - lzParams->numBestHsps = tempInt; - if (lzParams->searchLimit != 0) - chastise ("can't use %s with --queryhsplimit\n", arg); - goto next_arg; - } - - // --querydepth= and (unadvertised) --querydepth=keep: - // for backward compatibility, we also have --querydepth=discard: - - if (strcmp_prefix (arg, "--querydepth=nowarn:") == 0) - { - argStr = strchr(argStr,':') + 1; - lzParams->overlyPairedWarn = false; - lzParams->overlyPairedKeep = false; - goto parse_query_depth; - } - - if (strcmp_prefix (arg, "--querydepth=keep:") == 0) - { - argStr = strchr(argStr,':') + 1; - lzParams->overlyPairedWarn = true; - lzParams->overlyPairedKeep = true; - goto parse_query_depth; - } - - if (strcmp_prefix (arg, "--querydepth=keep,nowarn:") == 0) - { - argStr = strchr(argStr,':') + 1; - lzParams->overlyPairedWarn = false; - lzParams->overlyPairedKeep = true; - goto parse_query_depth; - } - - if (strcmp_prefix (arg, "--querydepth=discard:") == 0) - { - argStr = strchr(argStr,':') + 1; - lzParams->overlyPairedWarn = true; - lzParams->overlyPairedKeep = false; - goto parse_query_depth; - } - - if (strcmp_prefix (arg, "--querydepth=") == 0) - { - lzParams->overlyPairedWarn = true; - lzParams->overlyPairedKeep = false; - parse_query_depth: - lzParams->maxPairedDepth = string_to_unitized_double (argStr, true /*units of 1,000*/); - if (lzParams->maxPairedDepth < 0.0) lzParams->maxPairedDepth = 0.0; - goto next_arg; - } - - // --allgappedbounds - - if (strcmp (arg, "--allgappedbounds") == 0) - { lzParams->gappedAllBounds = true; goto next_arg; } - - // --score[s]= or Q= - - if ((strcmp_prefix (arg, "--scores=") == 0) - || (strcmp_prefix (arg, "--score=") == 0) - || (strcmp_prefix (arg, "Q=") == 0)) - { - if (scoreFilename != NULL) goto duplicated_option; - scoreFilename = copy_string (argStr); - goto next_arg; - } - - // --match=[,] - // or (unadvertised) --unitscore[s] or U=1 - 
// or (unadvertised) U=[,] - - if ((strcmp (arg, "--unitscore") == 0) - || (strcmp (arg, "--unitscores") == 0) - || (strcmp (arg, "U=1") == 0)) - { - useUnitScores = true; - unitMatch = 1; - unitMismatch = -1; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--match=") == 0) - || (strcmp_prefix (arg, "U=") == 0)) - { - useUnitScores = true; - scan = strchr(argStr,','); - if (scan != NULL) - *(scan++) = 0; - unitMatch = string_to_score (argStr); - if (unitMatch <= 0) - chastise ("%s is not a valid match score\n", argStr); - if (scan == NULL) - unitMismatch = -unitMatch; - else - { - unitMismatch = -string_to_score (scan); - if (unitMismatch >= 0) - chastise ("%s is not a valid mismatch penalty\n", scan); - } - goto next_arg; - } - - // (unadvertised) : - - if ((nuc_to_bits[((u8*)arg)[0]] >= 0) - && (nuc_to_bits[((u8*)arg)[1]] >= 0) - && (arg[2] == ':')) - { - r = nuc_to_bits[(u8)arg[0]]; - c = nuc_to_bits[(u8)arg[1]]; - specialSubScores[r][c] = string_to_score (&arg[3]); - if (firstSpecialSub == NULL) - firstSpecialSub = copy_string (arg); - goto next_arg; - } - - // --infer[only][=] - - if (strcmp (arg, "--infer") == 0) - { - if (infControlFilename != NULL) goto duplicated_option; - lzParams->inferScores = true; - lzParams->inferOnly = false; - goto next_arg; - } - - if (strcmp_prefix (arg, "--infer=") == 0) - { - if (infControlFilename != NULL) goto duplicated_option; - lzParams->inferScores = true; - lzParams->inferOnly = false; - infControlFilename = copy_string (argStr); - goto next_arg; - } - - if (strcmp (arg, "--inferonly") == 0) - { - if (infControlFilename != NULL) goto duplicated_option; - lzParams->inferScores = true; - lzParams->inferOnly = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--inferonly=") == 0) - { - if (infControlFilename != NULL) goto duplicated_option; - lzParams->inferScores = true; - lzParams->inferOnly = true; - infControlFilename = copy_string (argStr); - goto next_arg; - } - - // --infscores[=] - - if (strcmp (arg, 
"--infscores") == 0) - { - lzParams->inferScores = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--infscores=") == 0) - { - if (izParams->ic.inferFilename != NULL) goto duplicated_option; - lzParams->inferScores = true; - izParams->ic.inferFilename = copy_string (argStr); - goto next_arg; - } - - // --gap=<[open,]extend> or O= or E= - - if (strcmp_prefix (arg, "--gap=") == 0) - { - scan = strchr(argStr,','); - if (scan == NULL) - { - gapExtend = string_to_score (argStr); - if (gapExtend < 0) - chastise ("%s is not a valid gap extension penalty\n", argStr); - haveGapExtend = true; - gapExtendStr = argStr; - } - else - { - *(scan++) = 0; - gapOpen = string_to_score (argStr); - gapExtend = string_to_score (scan); - haveGapOpen = haveGapExtend = true; - gapOpenStr = argStr; - gapExtendStr = scan; - } - goto next_arg; - } - - if (strcmp_prefix (arg, "O=") == 0) - { - gapOpen = string_to_score (argStr); - haveGapOpen = true; - gapOpenStr = argStr; - goto next_arg; - } - - if (strcmp_prefix (arg, "E=") == 0) - { - gapExtend = string_to_score (argStr); - haveGapExtend = true; - gapExtendStr = argStr; - goto next_arg; - } - - // --xdrop= or X= - - if ((strcmp_prefix (arg, "--xdrop=") == 0) - || (strcmp_prefix (arg, "X=") == 0)) - { - if ((haveHspThreshold) - && (lzParams->gfExtend == gfexExact)) - chastise ("can't use %s with --exact\n", arg); - if ((haveHspThreshold) - && (lzParams->gfExtend >= gfexMismatch_min) - && (lzParams->gfExtend <= gfexMismatch_max)) - chastise ("can't use %s with --%dmismatch\n", arg, lzParams->gfExtend); - lzParams->gfExtend = gfexXDrop; - lzParams->xDrop = string_to_score (argStr); - haveXDrop = true; - goto next_arg; - } - - // --ydrop= or Y= - - if ((strcmp_prefix (arg, "--ydrop=") == 0) - || (strcmp_prefix (arg, "Y=") == 0)) - { - lzParams->yDrop = string_to_score (argStr); - haveYDrop = true; - goto next_arg; - } - - // --noxtrim - - if ((strcmp (arg, "--noxtrim") == 0) - || (strcmp (arg, "--noxdroptrim") == 0)) - { chastise 
("sorry, --noxtrim not implemented yet\n", arg); - lzParams->xDropUntrimmed = true; goto next_arg; } - - // --noytrim - - if ((strcmp (arg, "--noytrim") == 0) - || (strcmp (arg, "--noydroptrim") == 0)) - { lzParams->yDropUntrimmed = true; goto next_arg; } - - // --hspthresh= or K= - - if ((strcmp_prefix (arg, "--hspthresh=") == 0) - || (strcmp_prefix (arg, "--hspthreshold=") == 0) - || (strcmp_prefix (arg, "--mspthresh=") == 0) - || (strcmp_prefix (arg, "--mspthreshold=") == 0) - || (strcmp_prefix (arg, "K=" ) == 0)) - { - if ((haveHspThreshold) - && (lzParams->gfExtend == gfexExact)) - chastise ("can't use %s with --exact\n", arg); - if ((haveHspThreshold) - && (lzParams->gfExtend >= gfexMismatch_min) - && (lzParams->gfExtend <= gfexMismatch_max)) - chastise ("can't use %s with --%dmismatch\n", arg, lzParams->gfExtend); - lzParams->hspThreshold = string_to_score_thresh (argStr); - haveHspThreshold = true; - goto next_arg; - } - - // --exact= - - if (strcmp_prefix (arg, "--exact=") == 0) - { - scan = argStr; - parse_gfex_exact: - if ((haveHspThreshold) - && (lzParams->gfExtend == gfexXDrop)) - chastise ("can't use %s with --hspthreshold\n", arg); - if ((haveXDrop) - && (lzParams->gfExtend == gfexXDrop)) - chastise ("can't use %s with --xdrop\n", arg); - if ((haveHspThreshold) - && (lzParams->gfExtend >= gfexMismatch_min) - && (lzParams->gfExtend <= gfexMismatch_max)) - chastise ("can't use %s with --%dmismatch\n", arg, lzParams->gfExtend); - lzParams->gfExtend = gfexExact; - lzParams->hspThreshold.t = 'S'; - lzParams->hspThreshold.s = string_to_score (scan); - if (lzParams->hspThreshold.s <= 0) - chastise ("%s is not a valid exact match threshold\n", scan); - haveHspThreshold = true; - goto next_arg; - } - - // --mismatch= or --mismatch=, - - argIsAMatch = false; - if (sscanf (arg, "--%d%n", &tempInt, &charsUsed) == 1) - argIsAMatch = (strcmp_prefix (arg+charsUsed, "mismatch=") == 0); - - if (argIsAMatch) - { scan = argStr; goto parse_gfex_mismatch; } - - if 
(strcmp_prefix (arg, "--mismatch=") == 0) - { - scan = strchr(argStr,','); - if (scan == NULL) - chastise ("--mismatch requires two values (count and length)\n"); - *(scan++) = 0; - tempInt = string_to_score (argStr); - parse_gfex_mismatch: - if (tempInt == 0) goto parse_gfex_exact; - if ((tempInt < gfexMismatch_min) || (tempInt > gfexMismatch_max)) - chastise ("%d is out of range for N-mismatch (valid range is %d..%d)\n", - tempInt, gfexMismatch_min, gfexMismatch_max); - if ((haveHspThreshold) - && (lzParams->gfExtend == gfexXDrop)) - chastise ("can't use %s with --hspthreshold\n", arg); - if ((haveXDrop) - && (lzParams->gfExtend == gfexXDrop)) - chastise ("can't use %s with --xdrop\n", arg); - if ((haveHspThreshold) - && (lzParams->gfExtend == gfexExact)) - chastise ("can't use %s with --exact\n", arg); - lzParams->gfExtend = tempInt; - lzParams->hspThreshold.t = 'S'; - lzParams->hspThreshold.s = string_to_score (scan); - if (lzParams->hspThreshold.s < tempInt) - chastise ("%s is not a valid exact %dmismatch threshold\n", scan, tempInt); - haveHspThreshold = true; - goto next_arg; - } - - // --inner= or H= - - if ((strcmp_prefix (arg, "--inner=") == 0) - || (strcmp_prefix (arg, "H=" ) == 0)) - { - lzParams->innerThreshold = string_to_score (argStr); - haveInterpThreshold = true; - goto next_arg; - } - - // --gappedthresh= or L= - - if ((strcmp_prefix (arg, "--gappedthresh=") == 0) - || (strcmp_prefix (arg, "--gappedthreshold=") == 0) - || (strcmp_prefix (arg, "L=" ) == 0)) - { - lzParams->gappedThreshold = string_to_score_thresh (argStr); - haveGappedThreshold = true; - goto next_arg; - } - - // --ball= - - if (strcmp_prefix (arg, "--ball=") == 0) - { - argLen = strlen(argStr); - if ((argLen > 0) && (argStr[argLen-1] == '%')) - { - lzParams->ballScore = 0; // (just signals that --ball used) - ballScoreFactor = pct_string_to_double (argStr); - } - else - { - lzParams->ballScore = string_to_score (argStr); - haveBallScore = true; - } - goto next_arg; - } - - // 
--[no]entropy or P= - - if ((strcmp (arg, "--entropy") == 0) - || (strcmp (arg, "P=1" ) == 0)) - { lzParams->entropicHsp = haveEntropicHsp = true; goto next_arg; } - - if ((strcmp (arg, "--noentropy") == 0) - || (strcmp (arg, "P=0" ) == 0)) - { lzParams->entropicHsp = haveEntropicHsp = false; goto next_arg; } - - if (strcmp_prefix (arg, "P=") == 0) - { - if (string_to_int (argStr) <= 0) - chastise ("illegal value for P"); - goto report_entropy; - } - - if (strcmp (arg, "--entropy=report") == 0) - { - report_entropy: - lzParams->entropicHsp = lzParams->reportEntropy = haveEntropicHsp = true; - goto next_arg; - } - - // --allocate:traceback= or --traceback= or m= - - if ((strcmp_prefix (arg, "--allocate:traceback=") == 0) - || (strcmp_prefix (arg, "--alloc:traceback=" ) == 0) - || (strcmp_prefix (arg, "--memory:traceback=" ) == 0) - || (strcmp_prefix (arg, "--mem:traceback=" ) == 0) - || (strcmp_prefix (arg, "--traceback=" ) == 0) - || (strcmp_prefix (arg, "m=" ) == 0)) - { - lzParams->tracebackMem = string_to_unitized_int (argStr, false /*units of 1,024*/); - goto next_arg; - } - - // --allocate:target= (intentionally not in --help) - - if ((strcmp_prefix (arg, "--allocate:target=") == 0) - || (strcmp_prefix (arg, "--alloc:target=" ) == 0) - || (strcmp_prefix (arg, "--memory:target=" ) == 0) - || (strcmp_prefix (arg, "--mem:target=" ) == 0)) - { - lzParams->targetMem = string_to_unitized_int64 (argStr, false /*units of 1,024*/); - if (lzParams->targetMem > maxSequenceLen) - lzParams->targetMem = maxSequenceLen; - goto next_arg; - } - - // --allocate:query= (intentionally not in --help) - - if ((strcmp_prefix (arg, "--allocate:query=") == 0) - || (strcmp_prefix (arg, "--alloc:query=" ) == 0) - || (strcmp_prefix (arg, "--memory:query=" ) == 0) - || (strcmp_prefix (arg, "--mem:query=" ) == 0)) - { - lzParams->queryMem = string_to_unitized_int64 (argStr, false /*units of 1,024*/); - if (lzParams->queryMem > maxSequenceLen) - lzParams->queryMem = maxSequenceLen; - goto 
next_arg; - } - - // --maxwordcount=[%][,] - - if (strcmp_prefix (arg, "--maxwordcount=") == 0) - { - argLen = strlen(argStr); - scan = strchr(argStr,','); - if (scan != NULL) - { - argLen = scan - argStr; - *(scan++) = 0; - tempInt = string_to_int (scan); - if (tempInt < 1) - suicidef ("--maxwordcount's max interval must be at least 1"); - lzParams->maxWordCountChasm = tempInt; - } - if ((argLen > 0) && (argStr[argLen-1] == '%')) - { - lzParams->wordCountKeep = pct_string_to_double (argStr); - lzParams->wordCountLimit = 0; - if (lzParams->wordCountKeep < 0) - suicidef ("--maxwordcount cannot be zero"); - else if (lzParams->wordCountKeep < 0) - suicidef ("--maxwordcount cannot be negative"); - else if (lzParams->wordCountKeep == 1) - suicidef ("--maxwordcount cannot be 100%"); - else if (lzParams->wordCountKeep >= 1) - suicidef ("--maxwordcount cannot be more than 100%"); - } - else - { - tempInt = string_to_int (argStr); - if (tempInt < 1) - suicidef ("--maxwordcount must be at least 1"); - lzParams->wordCountLimit = tempInt; - lzParams->wordCountKeep = 0.0; - } - goto next_arg; - } - - // --masking= or M= - // - // the premise for combinations of --masking and --census[size] is that - // .. if the user specifies no census size then the size will be set - // .. just large enough to support the masking threshold value; but if - // .. a size *is* specified then that is the size we'll use, and the - // .. 
threshold must be range-checked against the size - - if ((strcmp_prefix (arg, "--masking=") == 0) - || (strcmp_prefix (arg, "M=" ) == 0)) - { - tempInt = string_to_int (argStr); - if (tempInt < 0) - suicidef ("--masking cannot be negative"); - if ((lzParams->censusKind == 'B') - && (tempInt >= 255)) // (255 itself is not allowed for censusKind = 'B') - lzParams->censusKind = 0; // (we'll just reset it below) - else if ((lzParams->censusKind == 'W') - && (tempInt >= 65535)) // (65535 itself is not allowed for censusKind = 'B') - suicidef ("--census16 can't support --masking > %d", - " (--masking=%d is too big)\n", - 65535-1, tempInt); - - lzParams->dynamicMasking = tempInt; - - if (tempInt < 255) // (255 itself is not allowed for censusKind = 'B') - lzParams->censusKind = 'B'; - else if (tempInt < 65535) // (65535 itself is not allowed for censusKind = 'W') - lzParams->censusKind = 'W'; - else - lzParams->censusKind = 'L'; - - goto next_arg; - } - - // --outputmasking[+]= and --outputmasking[+]:soft= - - if ((strcmp_prefix (arg, "--outputmasking=") == 0) - || (strcmp_prefix (arg, "--outputmasking:dynamic=") == 0)) - { - if (lzParams->maskingFilename != NULL) goto duplicated_option; - lzParams->maskingFilename = copy_string (argStr); - lzParams->masking3Fields = false; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--outputmasking+=") == 0) - || (strcmp_prefix (arg, "--outputmasking+:dynamic=") == 0)) - { - if (lzParams->maskingFilename != NULL) goto duplicated_option; - lzParams->maskingFilename = copy_string (argStr); - lzParams->masking3Fields = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--outputmasking:soft=") == 0) - { - if (lzParams->softMaskedFilename != NULL) goto duplicated_option; - lzParams->softMaskedFilename = copy_string (argStr); - lzParams->softMasked3Fields = false; - goto next_arg; - } - - if (strcmp_prefix (arg, "--outputmasking+:soft=") == 0) - { - if (lzParams->softMaskedFilename != NULL) goto duplicated_option; - 
lzParams->softMaskedFilename = copy_string (argStr); - lzParams->softMasked3Fields = true; - goto next_arg; - } - - // --[no]census[=] or c= - - if (strcmp_prefix (arg, "c=") == 0) - { - if (string_to_int (argStr) == 0) goto census_off; - else goto census_on; - } - - if (strcmp (arg, "--census") == 0) - { - census_on: - lzParams->reportCensus = true; - if (lzParams->censusKind == 0) lzParams->censusKind = 'B'; - goto next_arg; - } - - if (strcmp (arg, "--nocensus") == 0) - { - census_off: - lzParams->reportCensus = false; - goto next_arg; - } - - if (strcmp_prefix (arg, "--census=") == 0) - { - if (lzParams->censusFilename != NULL) goto duplicated_option; - if (lzParams->censusKind == 0) lzParams->censusKind = 'B'; - goto census_to_file; - } - - if (strcmp_prefix (arg, "--census16=") == 0) - { - if (lzParams->censusFilename != NULL) goto duplicated_option; - if (lzParams->dynamicMasking > 65534) - suicidef ("--census16 can't support --masking > %d\n" - " (--masking=%d is too big)\n", - 65535-1, lzParams->dynamicMasking); - lzParams->censusKind = 'W'; - goto census_to_file; - } - - if (strcmp_prefix (arg, "--census32=") == 0) - { - if (lzParams->censusFilename != NULL) goto duplicated_option; - lzParams->censusKind = 'L'; - census_to_file: - lzParams->censusFilename = copy_string (argStr); - lzParams->reportCensus = true; - goto next_arg; - } - - // --filter=identity:min[..max] (and historical --identity=min[..max]) - - if (strcmp_prefix (arg, "--identity=") == 0) - { // backward compatability for --identity=min[..max] - goto set_identity_filter; - } - - if (strcmp_prefix (arg, "--filter=identity:") == 0) - { - argStr = strchr(arg,':')+1; - set_identity_filter: - if (strcmp (argStr,"..") == 0) goto cant_understand; - scan = argStr; - scan2 = strstr (scan, ".."); - if (scan2 == NULL) { ; } // min - else if (scan2 == scan) { scan2 += 2; scan = NULL; } // ..max - else if (scan2[2] == 0) { scan2 = NULL; } // min.. 
- else { scan2 += 2; } // min..max - - minPctId = 0.0; - if (scan != NULL) - { - scanned = -1; - sscanf (scan, "%f%n", &minPctId, &scanned); - if (scanned == -1) goto cant_understand; - scan += scanned; - if (*scan == '%') scan++; - if (scan2 == NULL) - { - if ((*scan != 0) && (strcmp (scan, ".") != 0) && (strcmp (scan, "..") != 0)) - goto cant_understand; - } - else - { - if (*scan != '.') goto cant_understand; scan++; - if (scan != scan2) { if (*scan != '.') goto cant_understand; scan++; } - if (scan != scan2) goto cant_understand; - } - } - - maxPctId = 100.0; - if (scan2 != NULL) - { - scanned = -1; - sscanf (scan2, "%f%n", &maxPctId, &scanned); - if (scanned == -1) goto cant_understand; - scan2 += scanned; - if (*scan2 == '%') scan2++; - if (*scan2 != 0) goto cant_understand; - } - - if ((minPctId < 0) || (maxPctId > 100) || (minPctId > maxPctId)) - goto cant_understand; - - lzParams->minIdentity = minPctId / 100.0; - lzParams->maxIdentity = maxPctId / 100.0; - haveMaxIdentity = true; - goto next_arg; - } - - // --filter=coverage:min[..max] (and historical --coverage=min[..max]) - - if (strcmp_prefix (arg, "--coverage=") == 0) - { // backward compatability for --coverage=min[..max] - goto set_coverage_filter; - } - - if (strcmp_prefix (arg, "--filter=coverage:") == 0) - { - argStr = strchr(arg,':')+1; - set_coverage_filter: - if (strcmp (argStr,"..") == 0) goto cant_understand; - scan = argStr; - scan2 = strstr (scan, ".."); - if (scan2 == NULL) { ; } // min - else if (scan2 == scan) { scan2 += 2; scan = NULL; } // ..max - else if (scan2[2] == 0) { scan2 = NULL; } // min.. 
- else { scan2 += 2; } // min..max - - minCov = 0.0; - if (scan != NULL) - { - scanned = -1; - sscanf (scan, "%f%n", &minCov, &scanned); - if (scanned == -1) goto cant_understand; - scan += scanned; - if (*scan == '%') scan++; - if (scan2 == NULL) - { - if ((*scan != 0) && (strcmp (scan, ".") != 0) && (strcmp (scan, "..") != 0)) - goto cant_understand; - } - else - { - if (*scan != '.') goto cant_understand; scan++; - if (scan != scan2) { if (*scan != '.') goto cant_understand; scan++; } - if (scan != scan2) goto cant_understand; - } - } - - maxCov = 100.0; - if (scan2 != NULL) - { - scanned = -1; - sscanf (scan2, "%f%n", &maxCov, &scanned); - if (scanned == -1) goto cant_understand; - scan2 += scanned; - if (*scan2 == '%') scan2++; - if (*scan2 != 0) goto cant_understand; - } - - if ((minCov < 0) || (maxCov > 100) || (minCov > maxCov)) - goto cant_understand; - - lzParams->minCoverage = minCov / 100.0; - lzParams->maxCoverage = maxCov / 100.0; - goto next_arg; - } - - // --filter=continuity:min[..max] (and historical --continuity=min[..max]) - - if (strcmp_prefix (arg, "--continuity=") == 0) - { // backward compatability for --continuity=min[..max] - goto set_continuity_filter; - } - - if (strcmp_prefix (arg, "--filter=continuity:") == 0) - { - argStr = strchr(arg,':')+1; - set_continuity_filter: - if (strcmp (argStr,"..") == 0) goto cant_understand; - scan = argStr; - scan2 = strstr (scan, ".."); - if (scan2 == NULL) { ; } // min - else if (scan2 == scan) { scan2 += 2; scan = NULL; } // ..max - else if (scan2[2] == 0) { scan2 = NULL; } // min.. 
- else { scan2 += 2; } // min..max - - minContinuity = 0.0; - if (scan != NULL) - { - scanned = -1; - sscanf (scan, "%f%n", &minContinuity, &scanned); - if (scanned == -1) goto cant_understand; - scan += scanned; - if (*scan == '%') scan++; - if (scan2 == NULL) - { - if ((*scan != 0) && (strcmp (scan, ".") != 0) && (strcmp (scan, "..") != 0)) - goto cant_understand; - } - else - { - if (*scan != '.') goto cant_understand; scan++; - if (scan != scan2) { if (*scan != '.') goto cant_understand; scan++; } - if (scan != scan2) goto cant_understand; - } - } - - maxContinuity = 100.0; - if (scan2 != NULL) - { - scanned = -1; - sscanf (scan2, "%f%n", &maxContinuity, &scanned); - if (scanned == -1) goto cant_understand; - scan2 += scanned; - if (*scan2 == '%') scan2++; - if (*scan2 != 0) goto cant_understand; - } - - if ((minContinuity < 0) || (maxContinuity > 100) || (minContinuity > maxContinuity)) - goto cant_understand; - - lzParams->minContinuity = minContinuity / 100.0; - lzParams->maxContinuity = maxContinuity / 100.0; - goto next_arg; - } - - // --filter=nmatch: - - if (strcmp_prefix (arg, "--matchcount=") == 0) - { // backward compatability for --matchcount= - goto set_min_match_count; - } - - if (strcmp_prefix (arg, "--filter=nmatch:") == 0) - { - argStr = strchr(arg,':')+1; - set_min_match_count: - argLen = strlen(argStr); - if ((argLen > 0) && (argStr[argLen-1] == '%')) - lzParams->minMatchCountRatio = pct_string_to_double (argStr); - else - { - tempInt = string_to_int (argStr); - if (tempInt <= 0) - suicidef ("--filter=nmatch must be positive"); - lzParams->minMatchCount = tempInt; - } - goto next_arg; - } - - // --filter=nmismatch:[0].. 
- // $$$ allow/support trailing % - - if (strcmp_prefix (arg, "--filter=nmismatch:..") == 0) - { - tempInt = string_to_int (strstr(arg,":..")+3); - goto set_max_mismatch_count; - } - - if (strcmp_prefix (arg, "--filter=nmismatch:0..") == 0) - { - tempInt = string_to_int (strstr(arg,":0..")+4); - set_max_mismatch_count: - if (tempInt < 0) - suicidef ("--filter=nmismatch can't be negative"); - lzParams->maxMismatchCount = tempInt; - goto next_arg; - } - - if (strcmp_prefix (arg, "--filter=nmismatch:") == 0) - { - suggestion = "--filter=nmismatch:0.."; - goto make_suggestion; - } - - // --filter=ngap:[0].. - // $$$ allow/support trailing % - - if (strcmp_prefix (arg, "--filter=ngap:..") == 0) - { - tempInt = string_to_int (strstr(arg,":..")+3); - goto set_max_separate_gaps_count; - } - - if (strcmp_prefix (arg, "--filter=ngap:0..") == 0) - { - tempInt = string_to_int (strstr(arg,":0..")+4); - set_max_separate_gaps_count: - if (tempInt < 0) - suicidef ("--filter=ngap can't be negative"); - lzParams->maxSeparateGapsCount = tempInt; - goto next_arg; - } - - if (strcmp_prefix (arg, "--filter=ngap:") == 0) - { - suggestion = "--filter=ngap:0.."; - goto make_suggestion; - } - - // --filter=cgap:[0].. - // $$$ allow/support trailing % - - if (strcmp_prefix (arg, "--filter=cgap:..") == 0) - { - tempInt = string_to_int (strstr(arg,":..")+3); - goto set_max_gap_columns_count; - } - - if (strcmp_prefix (arg, "--filter=cgap:0..") == 0) - { - tempInt = string_to_int (strstr(arg,":0..")+4); - set_max_gap_columns_count: - if (tempInt < 0) - suicidef ("--filter=cgap can't be negative"); - lzParams->maxGapColumnsCount = tempInt; - goto next_arg; - } - - if (strcmp_prefix (arg, "--filter=cgap:") == 0) - { - suggestion = "--filter=cgap:0.."; - goto make_suggestion; - } - - // --density=max - // $$$ change to recognize --filter=density:[0].. 
- -#ifdef densityFiltering - if (strcmp_prefix (arg, "--density=") == 0) - { - lzParams->maxDensity = string_to_double (argStr); - goto next_arg; - } -#endif // densityFiltering - - // --[no]mirror - - if (strcmp (arg, "--mirror") == 0) - { lzParams->mirrorHSP = true; goto next_arg; } - - if (strcmp (arg, "--nomirror") == 0) - { lzParams->mirrorHSP = false; goto next_arg; } - - // --out[put]= - - if ((strcmp_prefix (arg, "--out=") == 0) - || (strcmp_prefix (arg, "--output=") == 0)) - { - if (lzParams->outputFilename != NULL) - free_if_valid ("output file name", lzParams->outputFilename); - lzParams->outputFilename = copy_string (argStr); - goto next_arg; - } - - // --format= (many variations) - - if ((strcmp (arg, "--format=gfa") == 0) - || (strcmp (arg, "--format=GFA") == 0) - || (strcmp (arg, "--gfa") == 0) - || (strcmp (arg, "--GFA") == 0)) - { lzParams->outputFormat = fmtGfa; goto next_arg; } - - if ((strcmp (arg, "--format=gfanoscore") == 0) // (unadvertised) - || (strcmp (arg, "--format=GFANOSCORE") == 0) - || (strcmp (arg, "--gfanoscore") == 0) - || (strcmp (arg, "--GFANOSCORE") == 0)) - { lzParams->outputFormat = fmtGfaNoScore; goto next_arg; } - - if ((strcmp (arg, "--format=lav") == 0) - || (strcmp (arg, "--format=LAV") == 0) - || (strcmp (arg, "--lav") == 0) - || (strcmp (arg, "--LAV") == 0)) - { lzParams->outputFormat = fmtLav; goto next_arg; } - - if ((strcmp (arg, "--format=lav+") == 0) - || (strcmp (arg, "--format=LAV+") == 0) - || (strcmp (arg, "--lav+") == 0) - || (strcmp (arg, "--LAV+") == 0)) - { lzParams->outputFormat = fmtLavComment; goto next_arg; } - - if ((strcmp (arg, "--format=lav+text") == 0) - || (strcmp (arg, "--format=LAV+text") == 0) - || (strcmp (arg, "--lav+text") == 0) - || (strcmp (arg, "--LAV+text") == 0) - || (strcmp (arg, "--format=text+lav") == 0) - || (strcmp (arg, "--format=text+LAV") == 0) - || (strcmp (arg, "--text+lav") == 0) - || (strcmp (arg, "--text+LAV") == 0)) - { lzParams->outputFormat = fmtLavText; goto next_arg; } 
- - if ((strcmp (arg, "--format=lavscore") == 0) // (unadvertised) - || (strcmp (arg, "--format=LAVSCORE") == 0) - || (strcmp (arg, "--lavscore") == 0) - || (strcmp (arg, "--LAVSCORE") == 0)) - { lzParams->outputFormat = fmtLavScore; goto next_arg; } - - if ((strcmp (arg, "--format=axt") == 0) - || (strcmp (arg, "--format=AXT") == 0) - || (strcmp (arg, "--axt") == 0) - || (strcmp (arg, "--AXT") == 0)) - { lzParams->outputFormat = fmtAxt; goto next_arg; } - - if ((strcmp (arg, "--format=axt+") == 0) - || (strcmp (arg, "--format=AXT+") == 0) - || (strcmp (arg, "--axt+") == 0) - || (strcmp (arg, "--AXT+") == 0)) - { lzParams->outputFormat = fmtAxtComment; goto next_arg; } - - if ((strcmp (arg, "--format=axt:size2") == 0) - || (strcmp (arg, "--format=AXT:size2") == 0) - || (strcmp (arg, "--axt:size2") == 0) - || (strcmp (arg, "--AXT:size2") == 0) - || (strcmp (arg, "--format=waxt") == 0) - || (strcmp (arg, "--format=WAXT") == 0) - || (strcmp (arg, "--waxt") == 0) - || (strcmp (arg, "--WAXT") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtAxtGeneral; - lzParams->outputInfo = copy_string (" "); - ((char*)lzParams->outputInfo)[0] = genpafSize2; - goto next_arg; - } - - if ((strcmp (arg, "--format=maf") == 0) - || (strcmp (arg, "--format=MAF") == 0) - || (strcmp (arg, "--maf") == 0) - || (strcmp (arg, "--MAF") == 0)) - { - lzParams->outputFormat = fmtMaf; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=~maf") == 0) - || (strcmp (arg, "--format=~MAF") == 0)) - { - lzParams->outputFormat = fmtMaf; - maf_distinguishNames = true; - goto next_arg; - } - - if ((strcmp (arg, "--format=maf+") == 0) - || (strcmp (arg, "--format=MAF+") == 0) - || (strcmp (arg, "--maf+") == 0) - || (strcmp (arg, "--MAF+") == 0)) - { - lzParams->outputFormat = fmtMafComment; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=~maf+") == 0) - || (strcmp (arg, "--format=~MAF+") 
== 0)) - { - lzParams->outputFormat = fmtMafComment; - maf_distinguishNames = true; - goto next_arg; - } - - if ((strcmp (arg, "--format=maf-") == 0) - || (strcmp (arg, "--format=MAF-") == 0) - || (strcmp (arg, "--maf-") == 0) - || (strcmp (arg, "--MAF-") == 0)) - { - lzParams->outputFormat = fmtMafNoComment; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=mafsegments") == 0) - || (strcmp (arg, "--format=MAFSEGMENTS") == 0) - || (strcmp (arg, "--mafsegments") == 0) - || (strcmp (arg, "--MAFSEGMENTS") == 0)) - { - lzParams->outputFormat = fmtMaf; - lzParams->deGapifyOutput = true; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=mafsegments+") == 0) - || (strcmp (arg, "--format=MAFSEGMENTS+") == 0) - || (strcmp (arg, "--mafsegments+") == 0) - || (strcmp (arg, "--MAFSEGMENTS+") == 0)) - { - lzParams->outputFormat = fmtMafComment; - lzParams->deGapifyOutput = true; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=mafsegments-") == 0) - || (strcmp (arg, "--format=MAFSEGMENTS-") == 0) - || (strcmp (arg, "--mafsegments-") == 0) - || (strcmp (arg, "--MAFSEGMENTS-") == 0)) - { - lzParams->outputFormat = fmtMafNoComment; - lzParams->deGapifyOutput = true; - maf_distinguishNames = false; - goto next_arg; - } - - if ((strcmp (arg, "--format=softsam") == 0) - || (strcmp (arg, "--format=SOFTSAM") == 0) - || (strcmp (arg, "--softsam") == 0) - || (strcmp (arg, "--SOFTSAM") == 0)) - { - lzParams->outputFormat = fmtSoftSam; - goto next_arg; - } - - if ((strcmp (arg, "--format=softsam-") == 0) - || (strcmp (arg, "--format=SOFTSAM-") == 0) - || (strcmp (arg, "--softsam-") == 0) - || (strcmp (arg, "--SOFTSAM-") == 0)) - { - lzParams->outputFormat = fmtSoftSamNoHeader; - goto next_arg; - } - - if ((strcmp (arg, "--format=sam") == 0) - || (strcmp (arg, "--format=SAM") == 0) - || (strcmp (arg, "--sam") == 0) - || (strcmp (arg, "--SAM") == 0)) - { - lzParams->outputFormat = 
fmtHardSam; - goto next_arg; - } - - if ((strcmp (arg, "--format=sam-") == 0) - || (strcmp (arg, "--format=SAM-") == 0) - || (strcmp (arg, "--sam-") == 0) - || (strcmp (arg, "--SAM-") == 0)) - { - lzParams->outputFormat = fmtHardSamNoHeader; - goto next_arg; - } - - if ((strcmp (arg, "--format=cigar") == 0) - || (strcmp (arg, "--format=CIGAR") == 0) - || (strcmp (arg, "--cigar") == 0) - || (strcmp (arg, "--CIGAR") == 0)) - { - lzParams->outputFormat = fmtCigar; - goto next_arg; - } - - if (strcmp_prefix (arg, "--writesegments=") == 0) - { - if (lzParams->outputFilename != NULL) - free_if_valid ("output file name", lzParams->outputFilename); - lzParams->outputFilename = copy_string (argStr); - goto format_segments; - } - - if (strcmp (arg, "--format=segments") == 0) // (now unadvertised!) - { - format_segments: - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpaf; - lzParams->outputInfo = copy_string (genpafSegmentKeys); - formatIsSegments = true; - goto next_arg; - } - - if ((strcmp (arg, "--format=genseg") == 0) - || (strcmp (arg, "--format=generalseg") == 0)) - { lzParams->deGapifyOutput = true; goto be_general; } - - if ((strcmp (arg, "--format=genseg-") == 0) - || (strcmp (arg, "--format=generalseg-") == 0)) - { lzParams->deGapifyOutput = true; goto be_general_no_header; } - - if ((strcmp (arg, "--format=gen") == 0) - || (strcmp (arg, "--format=GEN") == 0) - || (strcmp (arg, "--format=general") == 0) - || (strcmp (arg, "--format=GENERAL") == 0)) - { - be_general: - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpaf; - lzParams->outputInfo = copy_string (genpafStandardKeys); - goto next_arg; - } - - if ((strcmp (arg, "--format=gen-") == 0) - || (strcmp (arg, "--format=GEN-") == 0) - || (strcmp (arg, "--format=general-") == 0) - || (strcmp (arg, "--format=GENERAL-") == 0)) - { - be_general_no_header: - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - 
lzParams->outputFormat = fmtGenpafNoHeader; - lzParams->outputInfo = copy_string (genpafStandardKeys); - goto next_arg; - } - - if ((strcmp_prefix (arg, "--format=genseg:") == 0) - || (strcmp_prefix (arg, "--format=generalseg:") == 0)) - { lzParams->deGapifyOutput = true; goto parse_general; } - - if ((strcmp_prefix (arg, "--format=genseg-:") == 0) - || (strcmp_prefix (arg, "--format=generalseg-:") == 0)) - { lzParams->deGapifyOutput = true; goto parse_general_no_header; } - - if ((strcmp_prefix (arg, "--format=gen:") == 0) - || (strcmp_prefix (arg, "--format=GEN:") == 0) - || (strcmp_prefix (arg, "--format=general:") == 0) - || (strcmp_prefix (arg, "--format=GENERAL:") == 0)) - { - parse_general: - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - scan = strchr(arg,':') + 1; - if (*scan == 0) - suicidef ("empty keys string for --format=general:"); - lzParams->outputFormat = fmtGenpaf; - lzParams->outputInfo = parse_genpaf_keys (scan); - goto next_arg; - } - - if ((strcmp_prefix (arg, "--format=gen-:") == 0) - || (strcmp_prefix (arg, "--format=GEN-:") == 0) - || (strcmp_prefix (arg, "--format=general-:") == 0) - || (strcmp_prefix (arg, "--format=GENERAL-:") == 0)) - { - parse_general_no_header: - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - scan = strchr(arg,':') + 1; - if (*scan == 0) - suicidef ("empty keys string for --format=general-:"); - lzParams->outputFormat = fmtGenpafNoHeader; - lzParams->outputInfo = parse_genpaf_keys (scan); - goto next_arg; - } - - if ((strcmp (arg, "--format=mapping") == 0) - || (strcmp (arg, "--format=MAPPING") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpaf; - lzParams->outputInfo = copy_string (genpafMappingKeys); - goto next_arg; - } - - if ((strcmp (arg, "--format=mapping-") == 0) - || (strcmp (arg, "--format=MAPPING-") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = 
fmtGenpafNoHeader; - lzParams->outputInfo = copy_string (genpafMappingKeys); - goto next_arg; - } - - if ((strcmp (arg, "--format=blastn") == 0) - || (strcmp (arg, "--format=BLASTN") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpafBlast; - lzParams->outputInfo = copy_string (genpafBlastKeys); - goto next_arg; - } - - if ((strcmp (arg, "--format=blastn-") == 0) - || (strcmp (arg, "--format=BLASTN-") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpafBlastNoHeader; - lzParams->outputInfo = copy_string (genpafBlastKeys); - goto next_arg; - } - - if ((strcmp (arg, "--format=rdotplot") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpafNameHeader; - lzParams->outputInfo = copy_string (genpafRDotplotKeys); - lzParams->deGapifyOutput = true; - formatIsDotPlot = true; - goto next_arg; - } - - if ((strcmp (arg, "--format=rdotplot+score") == 0)) - { - free_if_valid ("lzParams->outputInfo", lzParams->outputInfo); - lzParams->outputFormat = fmtGenpafNameHeader; - lzParams->outputInfo = copy_string (genpafRDotplotScoreKeys); - lzParams->deGapifyOutput = true; - formatIsDotPlot = true; - goto next_arg; - } - - if (strcmp (arg, "--format=text") == 0) - { lzParams->outputFormat = fmtText; goto next_arg; } - - if ((strcmp (arg, "--format=ztext") == 0) - || (strcmp (arg, "--format=zerotext") == 0)) - { lzParams->outputFormat = fmtZeroText; goto next_arg; } - - if (strcmp (arg, "--format=comp") == 0) - { lzParams->outputFormat = fmtHspComp; goto next_arg; } - - if ((strcmp (arg, "--format=diff") == 0) - || (strcmp (arg, "--format=diffs") == 0) - || (strcmp (arg, "--format=difference") == 0) - || (strcmp (arg, "--format=differences") == 0)) - { lzParams->outputFormat = fmtDiffs; goto next_arg; } - - if ((strcmp (arg, "--format=diff-") == 0) - || (strcmp (arg, "--format=diffs-") == 0) - || (strcmp (arg, 
"--format=difference-") == 0) - || (strcmp (arg, "--format=differences-") == 0)) - { lzParams->outputFormat = fmtDiffsNoBlocks; goto next_arg; } - - if ((strcmp (arg, "--format=istats") == 0) - || (strcmp (arg, "--format=infstats") == 0)) - { - if (haveMaxIdentity) goto set_inf_stats_format; - maxPctId = 70.0; - goto set_inf_stats_info; - } - - if (((strcmp_prefix (arg, "--format=istats(") == 0) - || (strcmp_prefix (arg, "--format=infstats(") == 0)) - && (arg[strlen(arg)-1] == ')')) - { - scan = strchr(arg,'(') + 1; - scanned = -1; - sscanf (scan, "%f%n", &maxPctId, &scanned); - if (scanned == -1) goto cant_understand; - scan += scanned; - if (*scan == '%') scan++; - if (strcmp (scan, ")") != 0) goto cant_understand; - if ((maxPctId < 0) || (maxPctId > 100)) goto cant_understand; - set_inf_stats_info: - lzParams->maxIdentity = maxPctId / 100.0; - haveMaxIdentity = true; - set_inf_stats_format: - lzParams->outputFormat = fmtInfStats; - goto next_arg; - } - - if (strcmp (arg, "--format=identity") == 0) - { lzParams->outputFormat = fmtIdDist; goto next_arg; } - - if (strcmp (arg, "--format=deseed") == 0) - { lzParams->outputFormat = fmtDeseed; goto next_arg; } - - if (strcmp (arg, "--format=none") == 0) - { lzParams->outputFormat = fmtNone; goto next_arg; } - - if (strcmp (arg, "--markend") == 0) - { lzParams->endComment = true; goto next_arg; } - - // --rdotplot= - - if (strcmp_prefix (arg, "--rdotplot=") == 0) - { - if (lzParams->dotplotFilename != NULL) - goto duplicated_option; - lzParams->dotplotFilename = copy_string (argStr); - lzParams->dotplotKeys = copy_string (genpafRDotplotKeys); - goto next_arg; - } - - if (strcmp_prefix (arg, "--rdotplot+score=") == 0) - { - if (lzParams->dotplotFilename != NULL) - goto duplicated_option; - lzParams->dotplotFilename = copy_string (argStr); - lzParams->dotplotKeys = copy_string (genpafRDotplotScoreKeys); - goto next_arg; - } - - // --readgroup= - // tags should be tab separated but we do not check since we mainly - // .. 
just copy the string to SAM output - // if the user gives more than one --readgroup option we tab-concatenate - - if (strcmp_prefix (arg, "--readgroup=") == 0) - { - if (lzParams->readGroup == NULL) - lzParams->readGroup = copy_string (argStr); - else - { // (concatenate with a tab between) - wordLen = strlen(lzParams->readGroup); - argLen = strlen(argStr); - lzParams->readGroup = (char*) realloc_or_die ("lzParams->readGroup", lzParams->readGroup, wordLen+1+argLen+1); - lzParams->readGroup[wordLen] = '\t'; - ustrcpy (lzParams->readGroup+wordLen+1, argStr); - } - goto next_arg; - } - - // --[no]laj - - if (strcmp (arg, "--laj") == 0) - { lzParams->lajCompatible = true; goto next_arg; } - - if (strcmp (arg, "--nolaj") == 0) - { lzParams->lajCompatible = false; goto next_arg; } - - // (unadvertised) --expand= - // only applies to fmtText and variants (e.g. fmtZeroText, fmtLavText) - - if (strcmp_prefix (arg, "--expand=") == 0) - { - tempInt = string_to_int (argStr); - if (tempInt < 0) - suicidef ("--expand cannot be negative"); - else if (tempInt >= 1000) - suicidef ("--expand must be less than 1000"); - lzParams->textContext = (u32) tempInt; - goto next_arg; - } - - // additional file actions; the unbrackets versions are for use with - // shells that intercept square brackets for their own use; action1 and - // action2 are kept for backward compatibility with undocumented - // earlier versions of these options - - if ((strcmp_prefix (arg, "--action1=[") == 0) - && (strcmp_suffix (arg, "]") == 0)) - goto target_action; - - if ((strcmp_prefix (arg, "--action1=") == 0) - && (strcmp_suffix (arg, "]") != 0)) - goto target_action_no_brackets; - - if ((strcmp_prefix (arg, "--action2=[") == 0) - && (strcmp_suffix (arg, "]") == 0)) - goto query_action; - - if ((strcmp_prefix (arg, "--action2=") == 0) - && (strcmp_suffix (arg, "]") != 0)) - goto query_action_no_brackets; - - if ((strcmp_prefix (arg, "--action:target=[") == 0) - && (strcmp_suffix (arg, "]") == 0)) - { - 
target_action: - scan = concatenate_strings (seq1Actions, argStr); - free_if_valid ("parse_options_loop (seq1Actions)", seq1Actions); - seq1Actions = scan; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--action:target=") == 0) - && (strcmp_suffix (arg, "]") != 0)) - { - target_action_no_brackets: - scan = concatenate_four_strings (seq1Actions, "[", argStr, "]"); - free_if_valid ("parse_options_loop (seq1Actions)", seq1Actions); - seq1Actions = scan; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--action:query=[") == 0) - && (strcmp_suffix (arg, "]") == 0)) - { - query_action: - scan = concatenate_strings (seq2Actions, argStr); - free_if_valid ("parse_options_loop (seq2Actions)", seq2Actions); - seq2Actions = scan; - goto next_arg; - } - - if ((strcmp_prefix (arg, "--action:query=") == 0) - && (strcmp_suffix (arg, "]") != 0)) - { - query_action_no_brackets: - scan = concatenate_four_strings (seq2Actions, "[", argStr, "]"); - free_if_valid ("parse_options_loop (seq2Actions)", seq2Actions); - seq2Actions = scan; - goto next_arg; - } - - // --include= (read options from a file) - - if (strcmp_prefix (arg, "--include=") == 0) - { - if (!allowExpanders) - suicidef ("internal error, inclusion is not allowed in expanders (%s)", arg); - if (!allowInclude) - chastise ("nested inclusion is not allowed (%s)\n", arg); - parse_options_file (argStr, lzParams, izParams); - goto next_arg; - } - - // precanned expansion arguments - - for (ix=0 ; ix %s\n", arg, expanders[ix].expansion); - parse_options_string (expanders[ix].expansion, lzParams, izParams, - /* allow expanders */ false, - /* allow include */ false); - goto next_arg; - } - - // --verbosity= - - if (strcmp_prefix (arg, "v=0") == 0) - { lzParams->verbosity = 0; goto next_arg; } - - if (strcmp_prefix (arg, "v=1") == 0) - { lzParams->verbosity = 10; goto next_arg; } - - if (strcmp_prefix (arg, "--verbosity=") == 0) - { - lzParams->verbosity = string_to_int (argStr); - if (lzParams->verbosity < 0 ) 
lzParams->verbosity = 0; - else if (lzParams->verbosity > 10) lzParams->verbosity = 10; - goto next_arg; - } - - if (strcmp_prefix (arg, "--gexverbosity=") == 0) // (unadvertised) - { - gappedExtendVerbosity = string_to_int (argStr); - if (gappedExtendVerbosity < 0 ) gappedExtendVerbosity = 0; - else if (gappedExtendVerbosity > 10) gappedExtendVerbosity = 10; - goto next_arg; - } - - // --[no]runtime - - if (strcmp (arg, "--runtime") == 0) - { lzParams->reportTiming = true; goto next_arg; } - - if (strcmp (arg, "--noruntime") == 0) - { lzParams->reportTiming = false; goto next_arg; } - - // --tableonly[=count] unadvertised variants - - if (strcmp (arg, "--tableonly") == 0) - { - lzParams->doSeedSearch = false; - lzParams->showPosTable = spt_table; - goto next_arg; - } - - if (strcmp (arg, "--tableonly=count") == 0) - { - lzParams->doSeedSearch = false; - lzParams->showPosTable = spt_countsonly; - goto next_arg; - } - - if (strcmp (arg, "--tableonly=andcount") == 0) - { - lzParams->doSeedSearch = false; - lzParams->showPosTable = spt_withcounts; - goto next_arg; - } - - if (strcmp (arg, "--tableonly=distribution") == 0) - { - lzParams->doSeedSearch = false; - lzParams->showPosTable = spt_distribution; - goto next_arg; - } - - if (strcmp (arg, "--tableonly=stop") == 0) - { // (for speed comparisons vs other --tableonly settings) - lzParams->doSeedSearch = false; - goto next_arg; - } - - if (strcmp (arg, "--showtable") == 0) - { lzParams->showPosTable = spt_table; goto next_arg; } - - if (strcmp (arg, "--showtable=count") == 0) - { lzParams->showPosTable = spt_countsonly; goto next_arg; } - - // --writecapsule= - - if (strcmp_prefix (arg, "--writecapsule=") == 0) - { - if (lzParams->writeCapsule) - goto duplicated_option; - if (lzParams->capsuleFilename != NULL) - chastise ("can't use --writecapsule with --targetcapsule\n"); - lzParams->capsuleFilename = copy_string (argStr); - lzParams->writeCapsule = true; - lzParams->doSeedSearch = false; - goto next_arg; - } - - 
// --targetcapsule= - - if (strcmp_prefix (arg, "--targetcapsule=") == 0) - { - if (lzParams->readCapsule) - goto duplicated_option; - if (lzParams->capsuleFilename != NULL) - chastise ("can't use --targetcapsule with --writecapsule\n"); - if (lzParams->seq1Filename != NULL) - { - if (lzParams->seq2Filename != NULL) - chastise ("can't use --targetcapsule with two queries\n"); - lzParams->seq2Filename = lzParams->seq1Filename; - lzParams->seq1Filename = NULL; - } - lzParams->capsuleFilename = copy_string (argStr); - lzParams->readCapsule = true; - goto next_arg; - } - - // --[no]stats[=] or (unadvertised) --stats= - - if (strcmp (arg, "--stats=") == 0) - { lzParams->reportStats = true; goto next_arg; } - - if (strcmp (arg, "--stats") == 0) - { lzParams->showStats = true; goto next_arg; } - - if (strcmp (arg, "--nostats") == 0) - { lzParams->showStats = false; goto next_arg; } - - if (strcmp_prefix (arg, "--stats=") == 0) - { - if (lzParams->statsFilename != NULL) goto duplicated_option; - lzParams->statsFilename = copy_string (argStr); - lzParams->showStats = true; - goto next_arg; - } - - // --segments= - - if ((strcmp_prefix (arg, "--segments=") == 0) - || (strcmp_prefix (arg, "--anchors=") == 0)) // (old option) - { - if (lzParams->anchorsFilename != NULL) goto duplicated_option; - lzParams->anchorsFilename = copy_string (argStr); - goto next_arg; - } - - // --chores= - - if (strcmp_prefix (arg, "--chores=") == 0) - { - if (lzParams->choresFilename != NULL) goto duplicated_option; - lzParams->choresFilename = copy_string (argStr); - goto next_arg; - } - - // --notruncationreport (unadvertised) - - if (strcmp (arg, "--notruncationreport") == 0) - { gapped_extend_inhibitTruncationReport = true; goto next_arg; } - - // --version and (unadvertised) --version:noerror - - if (strcmp (arg, "--version:noerror") == 0) - { - // this allows batch script to report the version, without having - // to jump through hoops to ignore the exit code - exitVal = EXIT_SUCCESS; - goto 
report_version; - } - - if ((strcmp (arg, "--version") == 0) - || (strcmp (arg, "-v") == 0) - || (strcmp (arg, "-version") == 0)) - { - exitVal = EXIT_FAILURE; - report_version: - fprintf (helpout, "%s (version %s.%s.%s released %s)\n", - programName, - programVersionMajor, programVersionMinor, programVersionSubMinor, programRevisionDate); - - #if (scoreType == 'I') - fprintf (helpout, " score=int"); - #elif (scoreType == 'F') - fprintf (helpout, " score=float"); - #elif (scoreType == 'D') - fprintf (helpout, " score=double-float"); - #endif - - fprintf (helpout, ", sequence=%d-bit", maxSequenceIndex); - fprintf (helpout, ", alloc=%d-bit", maxMallocIndex); - -#ifdef allowBackToBackGaps - fprintf (helpout, ", allowBackToBackGaps=ON"); -#else - fprintf (helpout, ", allowBackToBackGaps=OFF"); -#endif // allowBackToBackGaps - - fprintf (helpout, "\n"); -#ifdef __GNUC__ - fprintf (helpout, " built with gcc-%d.%d.%d \"%s\"\n", - __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, __VERSION__); -#endif - exit (exitVal); - } - - // --help - - if ((strcmp (arg, "--all") == 0) - || (strcmp (arg, "--help") == 0) - || (strcmp (arg, "-help") == 0) - || (strcmp (arg, "--help=all") == 0) - || (strcmp (arg, "--h") == 0) - || (strcmp (arg, "-h") == 0)) - { all_options(); } - - // --help=files - - if ((strcmp (arg, "--help=files") == 0) - || (strcmp (arg, "--help=input") == 0)) - { file_options(); } - - // --help=format[s] - - if ((strcmp (arg, "--help=format") == 0) - || (strcmp (arg, "--help=formats") == 0) - || (strcmp (arg, "--help=output") == 0)) - { format_options(); } - - // --help=short[cuts] - - if ((strcmp (arg, "--short") == 0) - || (strcmp (arg, "--shortcuts") == 0) - || (strcmp (arg, "--help=short") == 0) - || (strcmp (arg, "--help=shortcuts") == 0) - || (strcmp (arg, "--blastz") == 0) - || (strcmp (arg, "--help=blastz") == 0)) - { shortcuts(); } - - // --help=defaults, --show=defaults, and (unadvertized) --show=defaults:stderr - - if (strcmp (arg, "--help=defaults") == 0) - 
{ - showDefaults = true; - showDefaultsStderr = false; - showDefaultsExit = true; - goto next_arg; - } - - if (strcmp (arg, "--show=defaults") == 0) - { - showDefaults = true; - showDefaultsStderr = false; - showDefaultsExit = false; - goto next_arg; - } - - if (strcmp (arg, "--show=defaults:stderr") == 0) - { - showDefaults = true; - showDefaultsStderr = true; - showDefaultsExit = false; - goto next_arg; - } - - // --help=yasra - - if ((strcmp (arg, "--yasra") == 0) - || (strcmp (arg, "--help=yasra") == 0)) - { expander_options ("yasra-specific options", "--yasra"); } - - // --tryout= (unadvertised) - -#ifdef tryout - if (strcmp (arg, "--tryout=immediategapped") == 0) - { - lzParams->hspImmediate = true; - goto next_arg; - } -#endif // tryout - - // --debug= (unadvertised) - - if (strcmp (arg, "--debug") == 0) - { debug = 100; goto next_arg; } - - if (strcmp (arg, "--debug=scorematrix") == 0) - { dbgShowMatrix = true; goto next_arg; } - - if (strcmp (arg, "--debug=sequence") == 0) - { sequences_dbgDumpSequence = true; goto next_arg; } - - if (strcmp (arg, "--debug=targetsequence") == 0) - { dbgDumpTargetSequence = true; goto next_arg; } - - if (strcmp (arg, "--debug=targetsequence2") == 0) - { dbgDumpTargetSequence2 = true; goto next_arg; } - - if (strcmp (arg, "--debug=querysequence") == 0) - { dbgDumpQuerySequence = true; goto next_arg; } - - if (strcmp (arg, "--debug=querysequence2") == 0) - { dbgDumpQuerySequence2 = true; goto next_arg; } - - if (strcmp (arg, "--debug=color") == 0) - { sequences_dbgAllowColors = true; goto next_arg; } - - if (strcmp (arg, "--debug=rawhits:aligned") == 0) - { seed_search_dbgDumpRawHits = true; goto next_arg; } - - if (strcmp (arg, "--debug=rawhits") == 0) - { seed_search_dbgDumpRawHits = seed_search_dbgShowRawHits = true; goto next_arg; } - - if (strcmp (arg, "--debug=words") == 0) - { pos_table_dbgShowWords = true; goto next_arg; } - - if (strcmp (arg, "--debug=maxwordcount") == 0) - { pos_table_dbgShowDiscards = true; goto 
next_arg; } - - if (strcmp (arg, "--debug=seedhits") == 0) - { seed_search_dbgShowHits = true; goto next_arg; } - - if (strcmp (arg, "--debug=seedbases") == 0) - { seed_search_dbgShowCoverage = true; goto next_arg; } - - if (strcmp (arg, "--debug=chaining") == 0) - { chain_dbgChaining = true; goto next_arg; } - - if (strcmp (arg, "--debug=chainingtree") == 0) - { chain_dbgDumpTree = true; goto next_arg; } - -#ifdef densityFiltering - if (strcmp (arg, "--debug=density") == 0) - { seed_search_dbgShowRejections = true; goto next_arg; } -#endif // densityFiltering - -#ifdef snoopHspSubrange - if (strcmp (arg, "--debug=subhsp") == 0) - { seed_search_dbgSubrangeHsps = true; goto next_arg; } -#endif // snoopHspSubrange - - if (strcmp (arg, "--debug=currParams") == 0) - { dbgShowParams = true; goto next_arg; } - - if (strcmp (arg, "--debug=hsps") == 0) - { dbgShowHsps = true; goto next_arg; } - - if (strcmp (arg, "--debug=hsps:count") == 0) - { dbgShowHspCountsMin = 0; goto next_arg; } - - if (strcmp_prefix (arg, "--debug=hsps:count:") == 0) - { - scan = strchr(argStr,':') + 1; - scan = strchr(scan, ':') + 1; - dbgShowHspCountsMin = string_to_int (scan); - goto next_arg; - } - - if ((strcmp (arg, "--debug=segments:parsing") == 0) - || (strcmp (arg, "--debug=anchors:parsing") == 0)) - { dbgAnchorParsing = true; goto next_arg; } - - if ((strcmp (arg, "--debug=segments:content") == 0) - || (strcmp (arg, "--debug=anchors:content") == 0)) - { dbgAnchorContent = true; goto next_arg; } - - if ((strcmp (arg, "--debug=segments") == 0) - || (strcmp (arg, "--debug=anchors") == 0)) - { dbgShowAnchors = true; goto next_arg; } - - if ((strcmp_prefix (arg, "--debug=segments:") == 0) - || (strcmp_prefix (arg, "--debug=anchors:") == 0)) - { - scan = strchr(argStr,':') + 1; - dbgShowAnchors = true; - dbgShowAnchorsHowOften = string_to_unitized_int (scan, true /*units of 1,000*/); - if (dbgShowAnchorsHowOften <= 0) - { - dbgShowAnchors = false; - dbgShowAnchorsHowOften = 0; - } - else if 
(dbgShowAnchorsHowOften == 1) - { - dbgShowAnchors = true; - dbgShowAnchorsHowOften = 0; - } - goto next_arg; - } - - if (strcmp (arg, "--debug=sort:diag") == 0) - { dbgSortAnchorsByDiag = true; goto next_arg; } - - if (strcmp (arg, "--debug=reduction") == 0) - { dbgInhibitSegmentReduction = true; goto next_arg; } - - if (strcmp (arg, "--debug=masking") == 0) - { dbgMasking = true; goto next_arg; } - - if (strcmp (arg, "--debug=pctid") == 0) - { - gapped_extend_dbgShowIdentity = true; - identity_dist_dbgShowIdentity = true; - infer_scores_dbgShowIdentity = true; - goto next_arg; - } - - if (strcmp (arg, "--debug=allowbatches") == 0) - { - gapped_extend_dbgAllowBatches = true; - goto next_arg; - } - - if (strcmp (arg, "--debug=qtobest") == 0) - { dna_utilities_dbgShowQToBest = true; goto next_arg; } - - if (strcmp (arg, "--debug=qball") == 0) - { quantum_dbgQuantumBall = true; goto next_arg; } - - if (strcmp (arg, "--debug=maf:diag") == 0) - { maf_dbgReportDiag = true; goto next_arg; } - - if (strcmp (arg, "--debug=text:diag") == 0) - { text_align_dbgReportDiag = true; goto next_arg; } - - if (strcmp_prefix (arg, "--debug=gapped:pairedbases=keep:") == 0) - { - argStr = strchr(argStr,'=') + 1; - argStr = strchr(argStr,':') + 1; - lzParams->overlyPairedWarn = true; - lzParams->overlyPairedKeep = true; - goto parse_max_paired_bases; - } - - if (strcmp_prefix (arg, "--debug=gapped:pairedbases=") == 0) - { - argStr = strchr(argStr,'=') + 1; - lzParams->overlyPairedWarn = true; - lzParams->overlyPairedKeep = false; - parse_max_paired_bases: - lzParams->maxPairedBases = string_to_unitized_int64 (argStr, true /*units of 1,000*/); - goto next_arg; - } - -#ifndef allowSeveralTargets - if ((strcmp (arg, "--progress") == 0) - || (strcmp (arg, "--debug=queryprogress") == 0)) - { dbgQueryProgress = 1; goto next_arg; } - - if (strcmp_prefix (arg, "--progress=") == 0) - { - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - goto next_arg; - } - - if 
(strcmp_prefix (arg, "--debug=queryprogress=") == 0) - { - argStr = strchr(argStr,'=') + 1; - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - goto next_arg; - } - - if ((strcmp (arg, "--progress+masking") == 0) - || (strcmp (arg, "--debug=queryprogress+masking") == 0)) - { - dbgQueryProgress = 1; - dbgQueryProgressWithMasking = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--progress+masking=") == 0) - { - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - dbgQueryProgressWithMasking = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--debug=queryprogress+masking=") == 0) - { - argStr = strchr(argStr,'=') + 1; - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - dbgQueryProgressWithMasking = true; - goto next_arg; - } - - if (strcmp (arg, "--debug=progressprefix") == 0) - { dbgQueryProgressPrefix = "==================== "; goto next_arg; } -#else // allowSeveralTargets - if ((strcmp (arg, "--progress") == 0) - || (strcmp (arg, "--debug=queryprogress") == 0)) - { dbgQueryProgress = dbgTargetProgress = 1; goto next_arg; } - - if (strcmp_prefix (arg, "--progress=") == 0) - { - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - if (dbgTargetProgress == 0) dbgTargetProgress = 1; - goto next_arg; - } - - if ((strcmp (arg, "--progress+masking") == 0) - || (strcmp (arg, "--debug=queryprogress+masking") == 0)) - { - argStr = strchr(argStr,'=')+1; - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - dbgQueryProgressWithMasking = true; - goto next_arg; - } - - if (strcmp_prefix (arg, "--debug=queryprogress=") == 0) - { - argStr = strchr(argStr,'=') + 1; - dbgQueryProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - if (dbgTargetProgress == 0) dbgTargetProgress = 1; - goto next_arg; - } - - if (strcmp_prefix (arg, "--debug=targetprogress=") == 0) - { - argStr = strchr(argStr,'=') + 1; - 
dbgTargetProgress = string_to_unitized_int (argStr, true /*units of 1,000*/); - goto next_arg; - } - - if (strcmp (arg, "--debug=progressprefix") == 0) - { dbgQueryProgressPrefix = dbgTargetProgressPrefix = "==================== "; goto next_arg; } -#endif // allowSeveralTargets - - if ((strcmp (arg, "--debug=converge") == 0) - || (strcmp (arg, "--debug=convergence") == 0)) - { infer_scores_watchConverge = true; goto next_arg; } - - if ((strcmp (arg, "--debug=converge+") == 0) - || (strcmp (arg, "--debug=convergence+") == 0)) - { infer_scores_watchConverge = infer_scores_snoopConverge = true; goto next_arg; } - - if (strcmp (arg, "--debug=showinferparams") == 0) - { infer_scores_showParams = true; goto next_arg; } - - if (strcmp (arg, "--debug=lav+infer") == 0) - { infer_scores_outputLav = true; goto next_arg; } - -#ifdef tryout - if (strcmp (arg, "--debug=triviality") == 0) - { gapped_extend_dbgTriviality = true; goto next_arg; } -#endif // tryout - - if (strcmp (arg, "--debug=reportfinish") == 0) - { dbgReportFinish = true; goto next_arg; } - - if (strcmp (arg, "--debug=filepointers") == 0) - { utilities_dbgDumpFilePointers = true; goto next_arg; } - - if (strcmp_prefix (arg, "--debug=") == 0) - { - debug = string_to_int (argStr); - if (debug < 0 ) debug = 0; - else if (debug > 100) debug = 100; - goto next_arg; - } - - // unknown -- argument - - if (strcmp_prefix (arg, "--") == 0) - goto cant_understand; - - // target file name - - if ((lzParams->seq1Filename == NULL) && (!lzParams->readCapsule)) - { - lzParams->seq1Filename = copy_string (arg); - goto next_arg_erase; - } - - // query file name - - if (lzParams->seq2Filename == NULL) - { - lzParams->seq2Filename = copy_string (arg); - if (arg[0] == '[') - waywardBracketArg = arg; - goto next_arg_erase; - } - - // (no option matched) - - goto cant_understand; - - // (bottom of loop) advance to the next argument - - next_arg_erase: - if (isTopLevel) lzParams->args[argsLen] = 0; // 'erase' the argument - next_arg: 
- argv++; argc--; - continue; - - // (failure points) - - cant_understand: - if (strcmp_prefix (arg, "--") == 0) - chastise ("Can't understand \"%s\"\n", arg); - else if (arg[0] == '[') - chastise ("Can't understand \"%s\"\n" - "(my guess) don't use a space between sequence file and bracketed \"action list\"\n", - arg); - else if (waywardBracketArg != NULL) - chastise ("Can't understand \"%s\"\n" - "(my guess) don't use a space between sequence file and %s\n", - arg, waywardBracketArg); - else - chastise ("Can't understand \"%s\"\n" - "(my guess) perhaps you have too many sequence files in the command\n", - arg); - - make_suggestion: - chastise ("Can't understand \"%s\"\nConsider \"%s\"\n", arg, suggestion); - - duplicated_option: - chastise ("Duplicated or conflicting option \"%s\"\n", arg); - } - - // dispose of argTemp - - free_if_valid ("temporary argument string", argTemp); - free_if_valid ("temporary argument substring", argTempSub); - } - -// parse_options_string-- parse options from a string - -static void parse_options_string - (char* s, - control* lzParams, - control* izParams, - int allowExpanders, - int allowInclude) - { - int argC, ix; - char** argV, *argS, *scan, *argEnd; - - for (scan=skip_whitespace(s),argC=0 ; *scan!=0 ; argC++) - { - scan = skip_darkspace (scan); - scan = skip_whitespace (scan); - } - - argV = malloc_or_die ("parse_options_string (argV)", argC * sizeof(char*)); - argS = copy_string (s); - - for (scan=skip_whitespace(argS),ix=0 ; *scan!=0 ; ix++) - { - argV[ix] = scan; - scan = skip_darkspace (scan); argEnd = scan; - scan = skip_whitespace (scan); - *argEnd = 0; - } - - parse_options_loop (argC, argV, lzParams, izParams, - /* top level */ false, - /* allow expanders */ allowExpanders, - /* allow include */ allowInclude); - - free_if_valid ("parse_options_string (argV)", argV); - free_if_valid ("parse_options_string (argS)", argS); - } - - -// parse_options_file-- parse options from a text file - -static void parse_options_file - 
(char* filename, - control* lzParams, - control* izParams) - { - char line[2001]; - FILE* f; - int lineNum, len, missingEol; - - f = fopen_or_die (filename, "rt"); - - lineNum = 0; - missingEol = false; - - while (fgets (line, sizeof(line), f) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way) - // $$$ this is not a perfect solution, since we will not discover the - // $$$ .. problem until after we have parsed the first part of the long - // $$$ .. line; this means we may report a parsing error instead of - // $$$ .. the line-too-long problem - - if (missingEol) - goto line_too_long; - - len = strlen (line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - // trim blanks and end of line, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - trim_string (line); - if (line[0] == 0) continue; - - // parse the line as command line options - - parse_options_string (line, lzParams, izParams, - /* allow expanders */ true, - /* allow include */ false); - } - - fclose_if_valid (f); - - return; - - ////////// - // failure exits - ////////// - -line_too_long: - suicidef ("included line is too long (%s: line %d)", filename, lineNum-1); - } - - -// parse_options-- overall options parsing - -static void parse_options - (int _argc, - char** _argv, - control* lzParams, - control* izParams) - { - int argc; - char** argv; - int argsLen, ix; - score maxScore; - int r, c; - u8 nuc1, nuc2; - //char* seq1Filename; (no longer used) - char* seq2Filename; - char* tempS; - score myUnitScores[4][4]; - exscoreset* xss; - u8 rCh, cCh; - - // skip program name - - argv = _argv+1; argc = _argc - 1; - - ////////// - // set defaults - ////////// - - *izParams = defaultParams; - *lzParams = defaultParams; - - seq1Actions = NULL; - seq2Actions = NULL; - seedString = NULL; - seedArg = NULL; - scoreFilename = NULL; - infControlFilename = NULL; - 
haveXDrop = false; - haveYDrop = false; - haveStep = false; - haveGappedOption = false; - haveHspThreshold = false; - haveGappedThreshold = false; - haveInterpThreshold = false; - haveEntropicHsp = false; - haveBallScore = false; - haveWithTrans = false; - haveWithTransForMatch = false; - haveMaxIdentity = false; - useUnitScores = false; - gappedExtendVerbosity = -1; - unitMatch = 1; - unitMismatch = -1; - haveGapOpen = false; - haveGapExtend = false; - gapOpenStr = NULL; - gapExtendStr = NULL; - gapOpen = 0; - gapExtend = 0; - twinsYes = defaultTwinsYes; - minGap = defaultTwinMinGap; - maxGap = defaultTwinMaxGap; - ballScoreFactor = -1; // (indicates we have no factor) - firstSpecialSub = NULL; - formatIsSegments = false; - formatIsDotPlot = false; - - // create a string to copy arguments into (this will be slightly bigger - // than we need) - - argsLen = 2; - for (ix=0 ; ixargs = malloc_or_die ("parse_options (lzParams->args)", argsLen); - lzParams->args[0] = 0; - - for (r=0 ; r<4 ; r++) - for (c=0 ; c<4 ; c++) - specialSubScores[r][c] = worstPossibleScore; - - ////////// - // scan arguments - ////////// - - parse_options_loop (argc, argv, lzParams, izParams, - /* top level */ true, - /* allow expanders */ true, - /* allow include */ true); - - if (lzParams->outputFilename == NULL) - lzParams->outputFile = stdout; - else - lzParams->outputFile = fopen_or_die (lzParams->outputFilename, "wt"); - - if ((lzParams->dotplotFilename) && (formatIsDotPlot)) - suicidef ("--format=rdotplot can't be used with --rdotplot="); - - if (lzParams->dotplotFilename != NULL) - lzParams->dotplotFile = fopen_or_die (lzParams->dotplotFilename, "wt"); - - if (lzParams->readGroup != NULL) - { - char* rgErrorText = NULL; - - if ((lzParams->outputFormat != fmtSoftSam) - && (lzParams->outputFormat != fmtSoftSamNoHeader) - && (lzParams->outputFormat != fmtHardSam) - && (lzParams->outputFormat != fmtHardSamNoHeader)) - suicidef ("--readgroup requires one of the SAM formats (e.g. 
--format=sam)"); - - lzParams->samRGTags = sam_rg_tags (lzParams->readGroup, &rgErrorText); - if (lzParams->samRGTags == NULL) - { - if (rgErrorText != NULL) - suicidef ("bad --readgroup string; %s", rgErrorText); - else - suicidef ("bad --readgroup string"); - } - } - - ////////// - // do some post-processing of sequence names - ////////// - - // bind 'extra' actions to sequence names - - if (seq1Actions != NULL) - { - // $$$ this is too stringent-- certain actions would be ok, so we - // $$$ .. really ought to just check for the actions that would be bad; - // $$$ .. however, we'd have no target filename string to bind the - // $$$ .. actions to - if (lzParams->readCapsule) - suicidef ("--action1 can't be used with --targetcapsule"); - tempS = concatenate_strings (lzParams->seq1Filename, seq1Actions); - free_if_valid ("parse_options (seq1Filename)", lzParams->seq1Filename); - free_if_valid ("parse_options (seq1Actions)", seq1Actions); seq1Actions = NULL; - lzParams->seq1Filename = tempS; - } - - if (seq2Actions != NULL) - { - if (lzParams->seq2Filename == NULL) - suicidef ("--action2 can't be used without query sequence file"); - tempS = concatenate_strings (lzParams->seq2Filename, seq2Actions); - free_if_valid ("parse_options (seq2Filename)", lzParams->seq2Filename); - free_if_valid ("parse_options (seq2Actions)", seq2Actions); seq2Actions = NULL; - lzParams->seq2Filename = tempS; - } - - // determine which sequences are quantum - - lzParams->targetIsQuantum = false; - if ((lzParams->seq1Filename != NULL) && (!lzParams->readCapsule)) - lzParams->targetIsQuantum = name_spec_is_quantum (lzParams->seq1Filename); - - if (lzParams->seq2Filename != NULL) - lzParams->queryIsQuantum = name_spec_is_quantum (lzParams->seq2Filename); - - if ((lzParams->targetIsQuantum) || (lzParams->queryIsQuantum)) - { - if (lzParams->inferScores) - suicide ("scoring inference cannot be performed with quantum DNA"); - if ((lzParams->minIdentity > 0) || (lzParams->maxIdentity < 1)) - 
suicide ("identity filtering cannot be used with quantum DNA"); - if ((lzParams->minMatchCountRatio != 0) || (lzParams->minMatchCount > 0)) - suicide ("match count filtering cannot be used with quantum DNA"); - if (lzParams->maxMismatchCount > 0) - suicide ("mismatch count filtering cannot be used with quantum DNA"); - if (lzParams->outputFormat == fmtIdDist) - suicide ("--format=identity cannot be used with quantum DNA"); -#ifdef densityFiltering - if (lzParams->maxDensity != 0) - suicide ("--density cannot be used with quantum DNA"); -#endif // densityFiltering - } - - ////////// - // check for sensibility - ////////// - -#ifdef disallowEntropy - if (lzParams->entropicHsp) - chastise ("--entropy is currently disabled\n"); -#endif - -#ifndef collect_stats - if (lzParams->showStats) - chastise ("--stats is not implemented in this build of the program\n"); -#endif // collect_stats - - if (lzParams->writeCapsule) - lzParams->outputFormat = fmtNone; - - if ((lzParams->seq1Filename == NULL) && (!lzParams->readCapsule)) - chastise ("You must specify a target file\n"); - - if (lzParams->inferOnly) - { - if (lzParams->noHitFiltering) - chastise ("--rawhits can't be used with --inferonly\n"); - - if (lzParams->dynamicMasking > 0) - chastise ("--masking can't be used with --inferonly\n"); - - if (lzParams->reportCensus) - chastise ("--census can't be used with --inferonly\n"); - - if (lzParams->outputFormat != defaultParams.outputFormat) - chastise ("--format=%s can't be used with --inferonly\n", - formatNames[lzParams->outputFormat]); - - if (lzParams->innerThreshold > 0) - chastise ("--inner can't be used with --inferonly\n"); - - if (lzParams->anchorsFilename != NULL) - chastise ("--segments can't be used with --inferonly\n"); - } - - if (lzParams->selfCompare) - { - if (lzParams->seq2Filename != NULL) - chastise ("--self can't be used when you specify a query file\n"); - if (lzParams->anchorsFilename != NULL) - chastise ("--segments can't be used with --self\n"); - if 
(lzParams->readCapsule) - chastise ("--self can't be used with --targetcapsule\n"); - if (lzParams->inferScores) - chastise ("--self can't be used with --infer\n"); - lzParams->seq2Filename = copy_string (lzParams->seq1Filename); - if (lzParams->mirrorHSP == -1) - { // selfCompare implies mirroring by default, so enable mirroring - // .. here if the user hasn't disabled it; we assume for the - // .. moment that mirroring will occur in the ungapped stage, but - // .. if we later discover that gappedExtend is true, we'll - // .. shift mirroring to the gapped state; the reason we delay - // .. is that, at this point, the value of gappdExtend may not be - // .. fully established - lzParams->mirrorHSP = true; - lzParams->mirrorGapped = false; - } - else if (lzParams->mirrorGapped == -1) - lzParams->mirrorGapped = false; - } - else if (lzParams->clonedQuery) - { - if (lzParams->seq2Filename != NULL) - chastise ("cloned query can't be used when you specify a query file\n"); - if (lzParams->anchorsFilename != NULL) - chastise ("--segments can't be used with cloned query\n"); - lzParams->seq2Filename = copy_string (lzParams->seq1Filename); - if (lzParams->mirrorHSP == -1) - { - lzParams->mirrorHSP = false; - lzParams->mirrorGapped = false; - } - else if (lzParams->mirrorGapped == -1) - lzParams->mirrorGapped = false; - } - else if (lzParams->mirrorHSP == true) - chastise ("--mirror can only be used with --self\n"); - else - lzParams->mirrorHSP = lzParams->mirrorGapped = false; - - if (lzParams->readCapsule) - { - capseed* seedCapsule; - - if (seedString != NULL) - { - if (seedArg == NULL) - chastise ("can't set word size or seed pattern with --targetcapsule\n"); - else if (strcmp_prefix (seedArg, "T=") == 0) - chastise ("can't set word size or seed pattern with --targetcapsule (%s)\n" - "(use --transition or --notransition instead)\n", - seedArg); - else - chastise ("can't set word size or seed pattern with --targetcapsule (%s)\n", - seedArg); - } - if (haveStep) - chastise 
("can't use --step with --targetcapsule\n"); - if (lzParams->dynamicMasking > 0) - chastise ("can't use --masking with --targetcapsule\n"); - if (lzParams->wordCountLimit > 0) - chastise ("can't use --maxwordcount with --targetcapsule\n"); - if (lzParams->maxIndexBits != defaultParams.maxIndexBits) - chastise ("can't use --word with --targetcapsule\n"); - if (lzParams->targetMem != 0) - chastise ("can't use --allocate:target with --targetcapsule\n"); - - lzParams->capsule = open_capsule_file (lzParams->capsuleFilename); - seedCapsule = locate_capsule_data (lzParams->capsule, cap_seed, - NULL, NULL); - if (seedCapsule == NULL) - suicide ("bad capsule file (missing seed)"); - lzParams->step = seedCapsule->step; - } - - if (lzParams->writeCapsule) - { - if (lzParams->seq2Filename != NULL) - chastise ("--writecapsule can't be used when you specify a query file\n"); - if (lzParams->inferScores) - chastise ("can't use --infer with --writecapsule\n"); - if (lzParams->anchorsFilename != NULL) - chastise ("can't use --segments with --writecapsule\n"); - if (haveXDrop) - chastise ("can't use --xdrop with --writecapsule\n"); - if (haveYDrop) - chastise ("can't use --ydrop with --writecapsule\n"); - if (haveHspThreshold) - chastise ("can't use --hspthresh with --writecapsule\n"); - if (haveGappedThreshold) - chastise ("can't use --gappedthresh with --writecapsule\n"); - if (haveInterpThreshold) - chastise ("can't use --inner with --writecapsule\n"); - if (haveEntropicHsp) - chastise ("can't use --entropy with --writecapsule\n"); - if (haveBallScore) - chastise ("can't use --ball with --writecapsule\n"); - if ((haveWithTrans) && (!haveWithTransForMatch)) - chastise ("can't use --transition with --writecapsule\n"); - if (haveMaxIdentity) - chastise ("can't use --identity with --writecapsule\n"); - if ((haveGapOpen) || (haveGapExtend)) - chastise ("can't use --gap with --writecapsule\n"); - } - - if (!lzParams->doSeedSearch) - { - if (lzParams->seq2Filename != NULL) - chastise 
("--tableonly can't be used when you specify a query file\n"); - if (lzParams->inferScores) - chastise ("--infer and --tableonly are not compatible\n"); - } - - if (lzParams->maxIndexBits < 8) - chastise ("--word doesn't allow so few bits (%d)\n", - lzParams->maxIndexBits); - - if (lzParams->tracebackMem < 100*1024) - chastise ("--allocate:traceback must be at least 100K (it's only %s)\n", - unitize(lzParams->tracebackMem,/*byThousands*/ false)); - -#ifndef noSeedHitQueue - if (lzParams->seedHitQueueSize < 0) - chastise ("--seedqueue can't be negative\n"); -#endif // not noSeedHitQueue - - if (lzParams->maskingFilename != NULL) - { - if (lzParams->dynamicMasking == 0) - chastise ("--outputmasking requires --masking\n"); - } - - if ((lzParams->reportCensus) - && (lzParams->censusFilename == NULL)) - { - if ((lzParams->outputFormat != fmtLav) - && (lzParams->outputFormat != fmtLavComment) - && (lzParams->outputFormat != fmtLavScore) - && (lzParams->outputFormat != fmtLavText)) - chastise ("--census with --format=%s requires --census=\n", - formatNames[lzParams->outputFormat]); - } - - if (lzParams->hspImmediate) - { - // $$$ hspImmediate should turn off second-stage gapped, although - // $$$ .. nothing will happen in the second stage, since the anchors - // $$$ .. 
list will be empty - - if (lzParams->inferScores) - chastise ("can't use --anyornone with --infer[only]\n"); - - if (lzParams->innerThreshold > 0) - chastise ("can't use --anyornone with --inner\n"); - - if (lzParams->anchorsFilename != NULL) - chastise ("can't use --anyornone with --segments\n"); - - if (lzParams->hspThreshold.t != 'S') - chastise ("can't use --anyornone with adaptive hsp score threshold\n"); - - if (lzParams->chain) - chastise ("can't use --anyornone with --chain\n"); - - if (lzParams->innerThreshold > 0) - chastise ("can't use --anyornone with --inner\n"); - } - - if (lzParams->searchLimit > 0) - { - if (lzParams->inferScores) - chastise ("can't use --anyornone or --queryhsplimit with --infer[only]\n"); - - if (lzParams->innerThreshold > 0) - chastise ("can't use --anyornone or --queryhsplimit with --inner\n"); - - if (lzParams->anchorsFilename != NULL) - chastise ("can't use --anyornone or --queryhsplimit with --segments\n"); - - if (lzParams->hspThreshold.t != 'S') - chastise ("can't use --anyornone or --queryhsplimit with adaptive hsp score threshold\n"); - - if ((lzParams->targetIsQuantum) || (lzParams->queryIsQuantum)) - chastise ("can't use --anyornone or --queryhsplimit with quantum dna\n"); - } - - if (lzParams->choresFilename != NULL) - { - if (lzParams->inferScores) - chastise ("can't use --chores with --infer[only]\n"); - if (lzParams->selfCompare) - chastise ("can't use --chores with --self\n"); - if (lzParams->anchorsFilename != NULL) - chastise ("can't use --chores with --segments\n"); - if (lzParams->readCapsule) - chastise ("can't use --chores with --targetcapsule\n"); - // the issue is that chores require use to the sequence in memory, - // and for a target capsule the sequence in memory is read only - } - - if ((formatIsSegments) && (!haveGappedOption)) - { - if (haveInterpThreshold) - chastise ("--inner cannot be used with --writesegments\n"); - lzParams->gappedExtend = false; - } - - ////////// - // set up score set - 
////////// - - if (lzParams->inferScores) - { -#if ((scoreType == 'I') && (!defined infer_anything)) - suicide ("scoring inference can't be performed with integer arithmetic; use lastz_D"); -#endif - - if (lzParams->anchorsFilename != NULL) - chastise ("--segments can't be used with --infer[only]\n"); - - if (scoreFilename != NULL) - chastise ("can't use --infer[only] and --scores together\n"); - - if (useUnitScores) - chastise ("can't use --infer[only] and --match (or --unitscores) together\n"); - - if (haveGapOpen) - chastise ("can't use --infer[only] and --gap (or O=) together\n"); - - if (haveGapExtend) - chastise ("can't use --infer[only] and --gap (or E=) together\n"); - - if (firstSpecialSub != NULL) - chastise ("can't use --infer[only] and special substitution scores together\n"); - } - - if (lzParams->gfExtend == gfexNoExtend) - { - if ((!lzParams->gappedExtend) - && (scoreFilename != NULL) - && (!lzParams->targetIsQuantum) - && (!lzParams->queryIsQuantum)) - chastise ("--scores requires --gfextend or --gapped\n"); - - if (haveXDrop) - chastise ("--xdrop requires --gfextend\n"); - - if (haveHspThreshold) - chastise ("--hspthresh requires --gfextend\n"); - - if (haveEntropicHsp) - chastise ("--entropy requires --gfextend\n"); - - if (lzParams->xDropUntrimmed) - chastise ("--noxtrim requires --gfextend\n"); - - lzParams->xDrop = 0; - lzParams->hspThreshold.t = 'S'; - lzParams->hspThreshold.s = 0; - lzParams->entropicHsp = false; - } - - if (!lzParams->chain) - { - if (lzParams->chainDiag != 0) - chastise ("G= requires --chain\n"); - - if (lzParams->chainAnti != 0) - chastise ("R= requires --chain\n"); - } - - if (lzParams->chain) - { - if (lzParams->anchorsFilename != NULL) - chastise ("--segments can't be used with --chain\n"); - } - - if (!lzParams->gappedExtend) - { - if ((haveGapOpen) || (haveGapExtend)) - chastise ("--gap (or O= or E=) requires --gapped\n"); - - if (haveYDrop) - chastise ("--ydrop requires --gapped\n"); - - if (haveGappedThreshold) - 
chastise ("--gappedThreshold requires --gapped\n"); - - if (haveInterpThreshold) - chastise ("--inner requires --gapped\n"); - - if (lzParams->yDropUntrimmed) - chastise ("--noytrim requires --gapped\n"); - - if ((lzParams->maxContinuity < 1) - && (!lzParams->doSeedSearch) - && (!lzParams->writeCapsule)) - chastise ("--continuity maximum less than 1 requires --gapped\n"); - - if (lzParams->gappedAllBounds) - chastise ("--allgappedbounds requires --gapped\n"); - } - - if (lzParams->gappedExtend) - { - if (formatIsSegments) - chastise ("can't used --writesegments with --gapped\n"); - if (lzParams->mirrorHSP) - { // for gapped alignments, mirroring shall be done at the - // .. gapped stage, so "shift" the mirror setting mirrorHSP - // .. to mirrorGapped - lzParams->mirrorHSP = false; - lzParams->mirrorGapped = true; - } - } - - if (lzParams->anchorsFilename != NULL) - { - if (haveHspThreshold) - chastise ("--segments can't be used with --hspthresh\n"); - if (haveXDrop) - chastise ("--segments can't be used with --xdrop\n"); - if (seedString != NULL) - { - if (seedArg == NULL) - chastise ("can't set word size or seed pattern with --segments\n"); - else if (strcmp_prefix (seedArg, "T=") == 0) - chastise ("can't set word size or seed pattern with --segments (%s)\n" - "(use --transition or --notransition instead)\n", - seedArg); - else - chastise ("can't set word size or seed pattern with --segments (%s)\n", - seedArg); - } - } - - if (haveXDrop && (lzParams->xDrop <= 0)) - chastise ("%d is not a legitimate x-drop threshold\n", lzParams->xDrop); - - if (haveYDrop && (lzParams->yDrop <= 0)) - chastise ("%d is not a legitimate y-drop threshold\n", lzParams->yDrop); - - if ((useUnitScores) && (scoreFilename != NULL)) - chastise ("can't use --match (or --unitscores) and --scores together\n"); - - if (scoreFilename != NULL) - { - // read scores and score-related parameters from file, allowing them - // to be overridden by the command line - - xss = read_score_set_by_name 
(scoreFilename); - lzParams->scoring = (scoreset*) xss; - - if (xss->seedSet) - { - if (seedString == NULL) // it contains params in command-line syntax - parse_options_loop (1, &xss->seed, lzParams, izParams, - /* top level */ false, - /* allow expanders */ false, - /* allow include */ false); - free_if_valid ("xss->seed", xss->seed); xss->seed = NULL; - } - - if (!haveGapOpen) - gapOpen = lzParams->scoring->gapOpen; - else - { - lzParams->scoring->gapOpen = gapOpen; - lzParams->scoring->gapOpenSet = true; - } - - if (!haveGapExtend) - gapExtend = lzParams->scoring->gapExtend; - else - { - lzParams->scoring->gapExtend = gapExtend; - lzParams->scoring->gapExtendSet = true; - } - - if ((!haveHspThreshold) && (xss->hspThresholdSet)) - { - lzParams->hspThreshold.t = 'S'; - lzParams->hspThreshold.s = xss->hspThreshold; - haveHspThreshold = true; - } - if ((!haveGappedThreshold) && (xss->gappedThresholdSet)) - { - lzParams->gappedThreshold.t = 'S'; - lzParams->gappedThreshold.s = xss->gappedThreshold; - haveGappedThreshold = true; - } - if ((!haveXDrop) && (xss->xDropSet)) - { - lzParams->xDrop = xss->xDrop; - haveXDrop = true; - } - if ((!haveYDrop) && (xss->yDropSet)) - { - lzParams->yDrop = xss->yDrop; - haveYDrop = true; - } - if ((!haveBallScore) && (ballScoreFactor < 0) && (xss->ballScoreSet)) - { - if (xss->ballScoreFactor < 0) - { lzParams->ballScore = xss->ballScore; haveBallScore = true; } - else - ballScoreFactor = xss->ballScoreFactor; - } - if ((!haveStep) && (xss->stepSet)) - { - lzParams->step = xss->step; - haveStep = true; - } - - if ((haveGapOpen) && (gapOpen + gapExtend <= 0)) - chastise ("%s is not a valid gap open penalty with extension penalty %s\n" - "(open can be negative but the sum has to be postive)\n", - gapOpenStr, gapExtendStr); - if ((haveGapExtend) && (gapExtend < 0)) - chastise ("%s is not a valid gap extension penalty\n", gapExtendStr); - } - else if (useUnitScores) - { - // use unit scoring matrix, scaled if requested - - 
scratchThreshold.t = 'S'; - if (scoreType == 'I') - scratchThreshold.s = (score) ceil (unitScores_thresh * unitMatch); - else - scratchThreshold.s = (score) (unitScores_thresh * unitMatch); - - if (!haveGapOpen) - { - if (scoreType == 'I') - gapOpen = (score) ceil (unitScores_open * -unitMismatch); - else - gapOpen = (score) (unitScores_open * -unitMismatch); - haveGapOpen = true; - } - if (!haveGapExtend) - { - if (scoreType == 'I') - gapExtend = (score) ceil (unitScores_extend * -unitMismatch); - else - gapExtend = (score) (unitScores_extend * -unitMismatch); - haveGapExtend = true; - } - - if (!haveHspThreshold) - { - lzParams->hspThreshold = scratchThreshold; - haveHspThreshold = true; - } - - if ((!haveGappedThreshold) && (lzParams->gfExtend == gfexExact)) - { - lzParams->gappedThreshold = scratchThreshold; - haveGappedThreshold = true; - } - - if ((!haveXDrop) && (!lzParams->inferScores)) - { - if (scoreType == 'I') - lzParams->xDrop = (score) ceil (10.0 * sqrt(-unitMismatch)); - else - lzParams->xDrop = (score) (10.0 * sqrt(-unitMismatch)); - haveXDrop = true; - } - - if ((!haveYDrop) && (!lzParams->inferScores)) - { - lzParams->yDrop = 2 * lzParams->xDrop; - haveYDrop = true; - } - - if ((haveGapOpen) && (gapOpen + gapExtend < 0)) - chastise ("%s is not a valid gap open penalty\n", gapOpenStr); - if ((haveGapExtend) && (gapExtend < 0)) - chastise ("%s is not a valid gap extension penalty\n", gapExtendStr); - - for (r=0 ; r<4 ; r++) - for (c=0 ; c<4 ; c++) - myUnitScores[r][c] = (r==c)? 
unitMatch : unitMismatch; - - lzParams->scoring = new_dna_score_set (myUnitScores, - unitScores_X * -unitMismatch, - unitScores_fill * -unitMismatch, - gapOpen, gapExtend); - } - else if (lzParams->inferScores) - { - ; // (do nothing, lzParams->scoring will be created by inference) - } - else - { - // use blastz default scoring matrix - if (!haveGapOpen) gapOpen = HOXD70_open; - if (!haveGapExtend) gapExtend = HOXD70_extend; - if ((haveGapOpen) && (gapOpen + gapExtend < 0)) - chastise ("%s is not a valid gap open penalty\n", gapOpenStr); - if ((haveGapExtend) && (gapExtend < 0)) - chastise ("%s is not a valid gap extension penalty\n", gapExtendStr); - lzParams->scoring = new_dna_score_set (HOXD70, - HOXD70_X, HOXD70_fill, - gapOpen, gapExtend); - } - - if (firstSpecialSub != NULL) - { - score worstScore = 0; - - if ((!lzParams->scoring->rowsAreDna) - || (!lzParams->scoring->colsAreDna)) - suicidef ("special substitution scores (e.g. %s) can't be used with quantum DNA scores", - firstSpecialSub); - - free_if_valid ("parse_options (firstSpecialSub)", firstSpecialSub); - - for (r=0 ; r<4 ; r++) - for (c=0 ; c<4 ; c++) - { - if (specialSubScores[r][c] == worstPossibleScore) - { - nuc1 = bits_to_nuc[r]; - nuc2 = bits_to_nuc[c]; - specialSubScores[r][c] = lzParams->scoring->sub[nuc1][nuc2]; - } - if (specialSubScores[r][c] < worstScore) - worstScore = specialSubScores[r][c]; - } - free_score_set ("lzParams->scoring (special subs)", lzParams->scoring); - lzParams->scoring = new_dna_score_set (specialSubScores, - 10*worstScore, worstScore, - gapOpen, gapExtend); - } - - ////////// - // convert seed string to a seed structure - ////////// - - if ((lzParams->targetIsQuantum) || (lzParams->queryIsQuantum)) - { - if ((haveWithTrans) && (lzParams->withTrans != 0)) - suicidef ("can't use --transitions with quantum DNA", - lzParams->seq2Filename); - lzParams->withTrans = 0; - } - - create_seed_structure (lzParams, &seedString, - haveWithTrans, twinsYes, minGap, maxGap); - - if 
((lzParams->targetIsQuantum) || (lzParams->queryIsQuantum)) - { - if (lzParams->hitSeed->type != 'S') - suicide ("quantum DNA requires a strict seed\n" - "(only 1s and 0s allowed, no Ts, no --seed=half)"); - } - - if (pos_table_dbgShowDiscards) - pos_table_dbgSeed = lzParams->hitSeed; - - ////////// - // compute default values for parameters that have not been set by the user - ////////// - - if (!haveXDrop) - { - if (lzParams->inferScores) - lzParams->xDrop = -1; // (will fill in after scoring inference) - else - { - rCh = lzParams->scoring->rowChars[0]; - cCh = lzParams->scoring->colChars[0]; - lzParams->xDrop = 10 * lzParams->scoring->sub[rCh][cCh]; - } - } - - if (!haveYDrop) - { - if (lzParams->inferScores) - lzParams->yDrop = -1; // (will fill in after scoring inference) - else - lzParams->yDrop = lzParams->scoring->gapOpen + 300 * lzParams->scoring->gapExtend; - } - - if (!haveGappedThreshold) - { - if (lzParams->gfExtend == gfexXDrop) - lzParams->gappedThreshold = lzParams->hspThreshold; - else - lzParams->gappedThreshold = defaultParams.hspThreshold; - } - - if ((scoreFilename != NULL) - && (((!haveHspThreshold) && (lzParams->gfExtend == gfexXDrop)) - || (!haveGappedThreshold)) - && (lzParams->scoring->rowsAreDna) - && (lzParams->scoring->colsAreDna)) - { - char minNuc, maxNuc; - score minSub, maxSub; - char* thresholdOption; - - if ((!haveHspThreshold) && (lzParams->gfExtend == gfexXDrop)) - thresholdOption = "--hspthresh"; - else if ((!haveHspThreshold) && (!haveGappedThreshold)) - thresholdOption = "--gappedthresh"; - else - goto threshold_check_done; - - minNuc = maxNuc = 'A'; minSub = maxSub = lzParams->scoring->sub['A']['A']; - if (lzParams->scoring->sub['C']['C'] < minSub) - { minNuc = 'C'; minSub = lzParams->scoring->sub['C']['C']; } - else if (lzParams->scoring->sub['C']['C'] > maxSub) - { maxNuc = 'C'; maxSub = lzParams->scoring->sub['C']['C']; } - if (lzParams->scoring->sub['G']['G'] < minSub) - { minNuc = 'G'; minSub = 
lzParams->scoring->sub['G']['G']; } - else if (lzParams->scoring->sub['G']['G'] > maxSub) - { maxNuc = 'G'; maxSub = lzParams->scoring->sub['G']['G']; } - if (lzParams->scoring->sub['T']['T'] < minSub) - { minNuc = 'T'; minSub = lzParams->scoring->sub['T']['T']; } - else if (lzParams->scoring->sub['T']['T'] > maxSub) - { maxNuc = 'T'; maxSub = lzParams->scoring->sub['T']['T']; } - - if (minSub < 70) - fprintf (stderr, "WARNING. Scores file may warrant setting of thresholds absent from %s.\n" - "Minimum match score is " scoreFmt ", for matrix entry (%c,%c).\n" - "This may not work well with default %s=" scoreFmt " (may result in few alignments).\n", - scoreFilename, minSub, minNuc, minNuc, - thresholdOption, defaultParams.hspThreshold.s); - else if (maxSub > 120) - fprintf (stderr, "WARNING. Scores file may warrant setting of thresholds absent from %s.\n" - "Maximum match score is " scoreFmt ", for matrix entry (%c,%c).\n" - "This may not work well with default %s=" scoreFmt " (may result in too many alignments).\n", - scoreFilename, maxSub, maxNuc, maxNuc, - thresholdOption, defaultParams.hspThreshold.s); - } - -threshold_check_done: - - ////////// - // set up others - ////////// - - // if we are doing quantum alignment, make certain that we are only - // aligning to the positive strand unless the scoring file provided a - // complement mapping - - if ((lzParams->scoring != NULL) - && (!lzParams->scoring->colsAreDna) // (query is quantum) - && (lzParams->scoring->qToComplement == NULL) // (we have no complement mapping) - && (lzParams->whichStrand != 0)) // (we're aligning to reverse or both) - suicide ("can't search minus strand if query is quantum and scores file does not\n" - "provide a way to map to complements"); - - // create a version of the scoring set that penalizes lowercase bases - - if (!lzParams->inferScores) - lzParams->maskedScoring = masked_score_set (lzParams->scoring); - - // make scores vs N be ambiguous, if desired and if rows and columns are - 
// both DNA - - if ((lzParams->nIsAmbiguous) - && (!lzParams->scoring->rowsAreDna) - && (!lzParams->scoring->colsAreDna)) - suicidef ("can't use --ambiguous if both target and query are quantum"); - - if (lzParams->allowAmbiDNA) - { - ambiguate_iupac (lzParams->scoring, lzParams->ambiMatch, -lzParams->ambiMismatch); - ambiguate_iupac (lzParams->maskedScoring, lzParams->ambiMatch, -lzParams->ambiMismatch); - } - - if (lzParams->nIsAmbiguous) - { - ambiguate_n (lzParams->scoring, lzParams->ambiMatch, -lzParams->ambiMismatch); - ambiguate_n (lzParams->maskedScoring, lzParams->ambiMatch, -lzParams->ambiMismatch); - } - - if (dbgShowMatrix) - { - // nota bene: Bb is representative of iupac ambiggies; F represents "fill" - fprintf (stderr, "lzParams->scoring:\n"); - dump_score_set (stderr, lzParams->scoring, (u8*)"ACGTacgtNnBbXF", (u8*)"ACGTacgtNnBbXF"); - fprintf (stderr, "\n"); - fprintf (stderr, "lzParams->maskedScoring:\n"); - dump_score_set (stderr, lzParams->maskedScoring, (u8*)"ACGTacgtNnBbXF", (u8*)"ACGTacgtNnBbXF"); - } - - if (lzParams->inferScores) - maxScore = 0; // (maxScore is not needed) - else - maxScore = max_in_score_matrix (lzParams->scoring); - - // (no longer used) - //if (lzParams->seq1Filename != NULL) seq1Filename = lzParams->seq1Filename; - // else seq1Filename = "(unnamed target file)"; - - if (lzParams->seq2Filename != NULL) seq2Filename = lzParams->seq2Filename; - else seq2Filename = "(unnamed query file)"; - - if (!lzParams->inferScores) - { - if ((!lzParams->targetIsQuantum) // DNA target - && (!lzParams->maskedScoring->rowsAreDna)) - suicidef ("row scores are for quantum DNA, but target is not"); - - if ((lzParams->doSeedSearch) - && (!lzParams->queryIsQuantum) // DNA query - && (!lzParams->maskedScoring->colsAreDna)) - suicidef ("column scores are for quantum DNA, but query is not"); - - if ((lzParams->targetIsQuantum) // quantum DNA target - && (lzParams->maskedScoring->rowsAreDna)) - suicidef ("target is quantum DNA, but row scores are 
not"); - - if ((lzParams->queryIsQuantum) // quantum DNA query - && (lzParams->maskedScoring->colsAreDna)) - suicidef ("query is quantum DNA, but column scores are not"); - - if (((haveBallScore) || (ballScoreFactor >= 0)) - && ((!lzParams->targetIsQuantum) && (!lzParams->queryIsQuantum))) - suicidef ("--ball can't be used with DNA target and query"); - - if ((lzParams->targetIsQuantum) - || (lzParams->queryIsQuantum)) - { - if ((haveBallScore) && (lzParams->ballScore < 0)) - chastise (scoreFmtSimple " is not a legitimate ball threshold\n", - lzParams->ballScore); - if (!haveBallScore) - { - if (ballScoreFactor < 0) ballScoreFactor = defaultBallScoreFactor; - lzParams->ballScore = ballScoreFactor * maxScore - * lzParams->hitSeed->weight/2; - } - if (lzParams->ballScore < 0) - suicidef ("quantum DNA (%s) requires --ball", seq2Filename); - if ((lzParams->outputFormat == fmtAxt) - || (lzParams->outputFormat == fmtAxtComment) - || (lzParams->outputFormat == fmtAxtGeneral)) - suicidef ("--axt doesn't support quantum DNA"); - if ((lzParams->outputFormat == fmtMaf) - || (lzParams->outputFormat == fmtMafComment) - || (lzParams->outputFormat == fmtMafNoComment)) - suicidef ("--maf doesn't support quantum DNA"); - if ((lzParams->outputFormat == fmtGenpaf) - || (lzParams->outputFormat == fmtGenpafNoHeader)) - { - if (strchr (lzParams->outputInfo, genpafText1) != NULL) - suicidef ("--format=general:text1 doesn't support quantum DNA"); - if (strchr (lzParams->outputInfo, genpafText2) != NULL) - suicidef ("--format=general:text2 doesn't support quantum DNA"); - if ((lzParams->targetIsQuantum) - && (strchr (lzParams->outputInfo, genpafTargetNucs) != NULL)) - suicidef ("--format=general:%s doesn't support quantum DNA", genpafTNucsName); - if ((lzParams->queryIsQuantum) - && (strchr (lzParams->outputInfo, genpafQueryNucs) != NULL)) - suicidef ("--format=general:%s doesn't support quantum DNA", genpafQNucsName); - if ((lzParams->targetIsQuantum) - && (strchr (lzParams->outputInfo, 
genpafTargetQuals) != NULL)) - suicidef ("--format=general:%s doesn't support quantum DNA", genpafTQualsName); - if ((lzParams->queryIsQuantum) - && (strchr (lzParams->outputInfo, genpafQueryQuals) != NULL)) - suicidef ("--format=general:%s doesn't support quantum DNA", genpafQQualsName); - } - } - } - - // build a seed for interpolation (an n-mer exact match) - - if (lzParams->innerThreshold > 0) - { - char seedString[innerWordSize+1]; - int i; - - for (i=0 ; iinnerSeed, 28); - lzParams->innerSeed->withTrans = 0; - } - - // decide whether we need to 'waste' time reading files just to get the - // true sequence length (some output formats require the sequence length, - // others don't) - - lzParams->needTrueLengths = false; - - if (lzParams->anchorsFilename != NULL) - lzParams->needTrueLengths = true; - else if (lzParams->capsule != NULL) - lzParams->needTrueLengths = true; - else if ((lzParams->minCoverage > 0) - || (lzParams->maxCoverage < 1)) - lzParams->needTrueLengths = true; - else if (lzParams->minMatchCountRatio != 0) - lzParams->needTrueLengths = true; - else if ((lzParams->anchorsFilename != NULL) - || (lzParams->outputFormat == fmtAxt) - || (lzParams->outputFormat == fmtAxtComment) - || (lzParams->outputFormat == fmtAxtGeneral) - || (lzParams->outputFormat == fmtMaf) - || (lzParams->outputFormat == fmtMafComment) - || (lzParams->outputFormat == fmtMafNoComment) - || (lzParams->outputFormat == fmtSoftSam) - || (lzParams->outputFormat == fmtHardSam) - || (lzParams->outputFormat == fmtCigar)) - lzParams->needTrueLengths = true; - else if ((lzParams->outputFormat == fmtGenpaf) - || (lzParams->outputFormat == fmtGenpafNoHeader) - || (lzParams->outputFormat == fmtGenpafNameHeader) - || (lzParams->outputFormat == fmtGenpafBlast) - || (lzParams->outputFormat == fmtGenpafBlastNoHeader)) - { - if ((strchr (lzParams->outputInfo, genpafSize1) != NULL) - || (strchr (lzParams->outputInfo, genpafSize2) != NULL) - || (strchr (lzParams->outputInfo, genpafCoverage) != 
NULL) - || (strchr (lzParams->outputInfo, genpafCoverageFrac) != NULL) - || (strchr (lzParams->outputInfo, genpafCoveragePct) != NULL)) - lzParams->needTrueLengths = true; - } - - // propagate some deep control to other modules - - if (lzParams->verbosity >= 2) - { - showProgress = true; - pos_table_showProgress = true; - seed_search_showProgress = true; - } - - gapped_extend_verbosity = (gappedExtendVerbosity>=0)? gappedExtendVerbosity - : lzParams->verbosity; - gapped_extend_dbgShowHsps = dbgShowHsps; - gapped_extend_dbgShowAnchors = dbgShowAnchors; - gapped_extend_dbgShowAnchorsHowOften = dbgShowAnchorsHowOften; - - sequences_keepFastaArrow = lzParams->lajCompatible; - - segment_dbgAnchorParsing = dbgAnchorParsing; - - ////////// - // set up inference parameters - // - // note 1: Ideally we want the default gapOpen and gapExtend penalties to - // be relative to the worst substitution score. Unfortunately, - // if scores are integers we don't have any mechanism to pass the - // ratios in the score variable (the ratios are not integers). So - // the code below does what we want when scores are non-integers, - // but makes a poor approximation attempt otherwise. This could be - // corrected with some effort, but performing inference with integer - // scores is undesirable, for other reasons, so the effort is not - // warranted. 
- ////////// - - if (lzParams->inferScores) - { - infcontrol tempIc; - - tempIc = izParams->ic; - *izParams = *lzParams; - izParams->ic = tempIc; - izParams->tracebackMem = 0; - izParams->outputFormat = fmtNone; - - izParams->hitSeed = copy_seeds (lzParams->hitSeed); - - if (izParams->scoring == NULL) - izParams->scoring = new_dna_score_set (unitScores, - unitScores_X, unitScores_fill, - worstPossibleScore, worstPossibleScore); - - if (infControlFilename != NULL) - read_control_file_by_name (infControlFilename, izParams); - - if ((izParams->ic.inferScale > 0) - && (izParams->ic.inferScale != 1)) - scale_score_set (izParams->scoring, izParams->ic.inferScale); - - if (izParams->scoring->gapOpen == worstPossibleScore) - { // (see note 1 about 20 lines above) - if (scoreType != 'I') - { - izParams->ic.gapOpenIsRatio = ratioMinSubScore; - izParams->scoring->gapOpen = unitScores_open; - } - else if (izParams->ic.inferScale > 0) - izParams->scoring->gapOpen = (score) ceil (unitScores_open * izParams->ic.inferScale); - else - izParams->scoring->gapOpen = unitScores_open; - } - - if (izParams->scoring->gapExtend == worstPossibleScore) - { // (see note 1 about 30 lines above) - if (scoreType != 'I') - { - izParams->ic.gapExtendIsRatio = ratioMinSubScore; - izParams->scoring->gapExtend = unitScores_extend; - } - else if (izParams->ic.inferScale > 0) - izParams->scoring->gapExtend = (score) ceil (unitScores_extend * izParams->ic.inferScale); - else - izParams->scoring->gapExtend = unitScores_extend; - } - - izParams->maskedScoring = masked_score_set (izParams->scoring); - } - - ////////// - // clean up - ////////// - - free_if_valid ("parse_options (seedString)", seedString); - free_if_valid ("parse_options (seedArg)", seedArg); - free_if_valid ("parse_options (scoreFilename)", scoreFilename); - free_if_valid ("parse_options (infControlFilename)", infControlFilename); - } - -//---------- -// -// create_seed_structure-- -// Convert the user-specified parameters into a seed and 
set any related -// variables. -// -//---------- -// -// Arguments: -// control* lzParams: Control data for the primary alignment. This -// .. routine will potnetially modify several fields. -// char** seedString: The user-defined seed pattern. This may be -// .. modified by this routine. Specifically, if the -// .. string is NULL, a nely allocated string is -// .. created and copied to this. -// -// Returns: -// (nothing) -// -//---------- - -static void create_seed_structure - (control* lzParams, - char** _seedString, - int haveWithTrans, - int twinsYes, - int minGap, - int maxGap) - { - char* seedString = *_seedString; - - // reconstruct seed from the capsule - - if (lzParams->capsule != NULL) - { - capseed* seedCapsule; - u64 seedCapsuleSize, expectedSize; - int shift[100]; // (100 is a safe upper bound; usually - u32* mask; // .. fewer than five are needed) - u32* transFlips; - int numParts, numFlips, ix; - u32* scan; - - seedCapsule = locate_capsule_data (lzParams->capsule, cap_seed, - NULL, &seedCapsuleSize); - if (seedCapsule == NULL) - suicide ("bad capsule file (missing seed)"); - - numParts = (int) seedCapsule->numParts; - if (numParts > (int) (sizeof(shift)/sizeof(shift[0]))) - suicidef ("internal error handling capsule file (numParts = %d)", - numParts); - - scan = &seedCapsule->shift0; - mask = &scan[numParts]; - transFlips = &mask[numParts]; - - for (ix=0 ; ixhitSeed = reconstruct_seed - ((char) seedCapsule->type, - (int) seedCapsule->length, - (int) seedCapsule->weight, - NULL, // $$$ add support for capsules containing seed pattern - seedCapsule->resolvingMask, - (int) seedCapsule->revComp, - (int) seedCapsule->isHalfweight, - (int) seedCapsule->numParts, - shift, mask, transFlips); - } - - // create seed from user params - - else - { - if (seedString == NULL) - *_seedString = seedString = copy_string (defaultSeedString); - - parse_seeds_string (seedString, &lzParams->hitSeed, lzParams->maxIndexBits); - } - - if ((lzParams->hitSeed->type == 'H') - 
&& (!haveWithTrans)) - lzParams->withTrans = 0; - - if ((lzParams->withTrans != 0) - && (lzParams->hitSeed->type != 'S') - && (lzParams->hitSeed->transFlips == NULL)) - chastise ("--transition can only be used with strict seeds (1s and 0s)\n"); - - if (lzParams->minMatches >= 0) - { - int numPositions; - char* pScan; - - if ((lzParams->filterCaresOnly) && (lzParams->hitSeed->pattern == NULL)) - chastise ("--filter=cares: cannot be used with a patternless seed\n"); - - if (!lzParams->filterCaresOnly) - numPositions = lzParams->hitSeed->length; - else - { - numPositions = 0; - for (pScan=lzParams->hitSeed->pattern ; *pScan!=0 ; ) - { if (*(pScan++) != '0') numPositions++; } - } - - if (lzParams->minMatches > numPositions) - chastise ("--filter can't require more matches (%d) than seed (%d)\n", - lzParams->minMatches, numPositions); - } - - pos_table_set_stat (wordWeight, lzParams->hitSeed->weight); - pos_table_set_stat (wordSpace, 1L << lzParams->hitSeed->weight); - - lzParams->hitSeed->withTrans = lzParams->withTrans; - seed_search_set_stat (withTrans, lzParams->hitSeed->withTrans); - seed_search_set_stat (minMatches, lzParams->minMatches); - seed_search_set_stat (maxTransversions, lzParams->maxTransversions); - seed_search_set_stat (filterCaresOnly, lzParams->filterCaresOnly); - - if (debug >= 90) - { - print_seeds (lzParams->outputFile, lzParams->hitSeed); - printf ("%s\n", seed_pattern (lzParams->hitSeed)); - } - - // set span for twins (complicated by the fact that we let the user set it - // in terms of gap length before we know the seed length) - - if ((lzParams->noHitFiltering) && (twinsYes)) - chastise ("--rawhits can't be used with --twins\n"); - - if ((lzParams->noHitFiltering) && (lzParams->gfExtend != gfexNoExtend)) - chastise ("--rawhits can't be used with --gfextend\n"); - - if (twinsYes) - { - if (minGap <= -lzParams->hitSeed->length) - chastise ("minGap for twins (%d) must be greater than negative of seed length (%d)\n", - minGap, 
-lzParams->hitSeed->length); - if (maxGap < minGap) - chastise ("maxGap for twins (%d) can't be less than min gap (%d)\n", - maxGap, minGap); - - lzParams->twinMinSpan = 2 * lzParams->hitSeed->length + minGap; - lzParams->twinMaxSpan = 2 * lzParams->hitSeed->length + maxGap; - } - else - lzParams->twinMinSpan = 0; // gaps not used - -#ifndef noSeedHitQueue - if (twinsYes) - { - seedHitQueueSize = lzParams->seedHitQueueSize; - seedHitQueueColumns = lzParams->twinMaxSpan - lzParams->hitSeed->length; - } - else - { - seedHitQueueSize = 0; - seedHitQueueColumns = -1; - } -#endif // not noSeedHitQueue - } - -//---------- -// -// print_params-- -// Dump some of the user-specified parameters to a file (for debugging). -// -// nota bene: a newer, similar routine is show_scoring_defaults() -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// control* params: Control data to print (some of). -// -// Returns: -// (nothing) -// -//---------- - -static void print_params - (FILE* f, - control* params) - { - if (params->seq1 != NULL) fprintf (f, "seq 1: %s\n", params->seq1->filename); - if (params->seq2 != NULL) fprintf (f, "seq 2: %s\n", params->seq2->filename); - if (params->selfCompare) fprintf (f, "--self\n"); - print_score_matrix (f, params->scoring, true); - if (params->whichStrand > 0) fprintf (f, "--strand=both\n"); - else if (params->whichStrand < 0) fprintf (f, "--strand=minus\n"); - else fprintf (f, "--strand=plus\n"); - fprintf (f, "--step=%u\n", params->step); - fprintf (f, "--seed=%s\n", seed_pattern(params->hitSeed)); - if (params->gfExtend == gfexXDrop) fprintf (f, "--gfextend\n"); - fprintf (f, "--hspthresh=%s\n", score_thresh_to_string (¶ms->hspThreshold)); - fprintf (f, "--gappedthresh=%s\n", score_thresh_to_string (¶ms->gappedThreshold)); - fprintf (f, "--xDrop=" scoreFmtSimple "\n", params->xDrop); - fprintf (f, "--yDrop=" scoreFmtSimple "\n", params->yDrop); - fprintf (f, "%s\n", (params->entropicHsp)? 
"--entropy" : "--noentropy"); - if (params->minMatches >= 0) - { - char* qualifier = (params->filterCaresOnly)? "cares:" : ""; - if (params->maxTransversions < 0) - fprintf (f, "--filter=%s%d\n", qualifier, params->minMatches); - else - fprintf (f, "--filter=%s%d,%d\n", qualifier, params->minMatches, params->maxTransversions); - } - if (params->twinMinSpan > 0) fprintf (f, "--twins=%d..%d\n", params->twinMinSpan-2*params->hitSeed->length, params->twinMaxSpan-2*params->hitSeed->length); - if (params->innerThreshold > 0) fprintf (f, "--innerthresh=" scoreFmtSimple "\n", params->innerThreshold); - if (params->tracebackMem != defaultParams.tracebackMem) fprintf (f, "--allocate:traceback=%u\n", params->tracebackMem); - } - -//---------- -// -// read_control_file_by_name, read_control_file-- -// Read control data from a file (see format description below). -// -//---------- -// -// Arguments: -// FILE* f: (read_control_file only) The file that control data is -// .. to be read from. This should already be open for -// .. text read. -// char* name: The name of the file that control data is to be read -// .. from. For read_control_file this is only used for -// .. reporting problems to the user (and may be NULL). -// control* params: Control data to fill in. -// -// Returns: -// (nothing) -// -//---------- -// -// Control Data File Format -// ======================== -// -// Here's an example: -// -// min_identity = 25.0% # 25th percentile -// max_identity = 75.0% # 75th percentile -// hsp_threshold = 3000 -// -// The control data consists of name-value settings. Valid names are as -// follows: -// -// min_identity The range of sequence identity upon which inference -// max_identity .. is based. Only alignment blocks within this -// .. range contribute to inference. If the value ends -// .. with a percent sign, the range is a percentile of -// .. the values found in the overall alignment (other- -// .. wise it is a fixed percentage. 
-// -// min_coverage The range of query coverage upon which inference -// max_coverage .. is based. Only alignment blocks within this -// .. range (as a percentage of the query sequence) -// .. contribute to inference. -// -// min_continuity The range of query continuity upon which inference -// max_continuity .. is based. Only alignment blocks within this -// .. range (as a percentage of the query sequence) -// .. contribute to inference. -// -// inference_scale The value for the largest substitution score (i.e. -// .. the score for the best match). All other scores -// .. are scaled by the same factor. If this is an -// .. integer (i.e. has no decimal point), then all -// .. scores will be rounded to an integer as well. -// -// hsp_threshold These correspond to the command line --hspthresh -// gapped_threshold .. and --gappedthresh options (also known as K and L -// .. in BLASTZ lingo). They can be specified as -// .. mulitples of the scale, e.g. -// .. hsp_threshold = 20*inference_scale -// .. Further, the gapped threshold can be specified -// .. as a multiple of the hsp threshold, e.g. -// .. gapped_threshold = 1.2*hsp_threshold -// -// max_sub_iterations Limits on the number of iterations that will be -// max_gap_iterations .. performed. For example, -// .. max_sub_iterations = 1 -// .. max_gap_iterations = 0 -// .. will just do one pass and create a substitution -// .. scoring matrix. -// -// gap_open_penalty Correspond to the command line --gap=<[open,]extend> -// gap_extend_penalty .. option (also known as O and E in BLASTZ lingo). -// .. These are the values used for the first iteration -// .. of gap-scoring inference. They can be specified -// .. as mulitples of the scale, and the extend penalty -// .. can be a multiple of the open penalty. -// -// step Corresponds to the command line --step option (also -// .. known as Z in BLASTZ lingo). -// -// entropy Corresponds to the command line --entropy option -// .. (also known as P in BLASTZ lingo). 
Legal values -// .. are "on" or "off". -// -//---------- - -static void read_control_file_by_name - (char* name, - control* params) - { - FILE* f; - - if (name == NULL) - suicide ("can't open NULL file in read_control_file_by_name()"); - - f = fopen_or_die (name, "rt"); - read_control_file (f, name, params); - fclose_if_valid (f); - } - -static void read_control_file - (FILE* f, - char* _name, - control* params) - { - char line[201]; - char* name = _name; - int lineNum, len, valLen, missingEol; - char* waffle; - char* valString = NULL; - int idIsPercentile = -1; - int haveMinIdentity = false; - int haveMaxIdentity = false; - int haveMinCoverage = false; - int haveMaxCoverage = false; - int haveMinContinuity = false; - int haveMaxContinuity = false; - int tempInt; - - if (name == NULL) - name = "(unnamed file)"; - - ////////// - // read assignments - ////////// - - lineNum = 0; - missingEol = false; - - while (fgets (line, sizeof(line), f) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way) - // $$$ this is not a perfect solution, since we will not discover the - // $$$ .. problem until after we have parsed the first part of the long - // $$$ .. line; this means we may report a parsing error instead of - // $$$ .. 
the line-too-long problem - - if (missingEol) - goto line_too_long; - - len = strlen (line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - // trim blanks, end of line, and comments, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // separate the value from the assignment - - valString = strchr (line, '='); - if (valString == NULL) - goto invalid_line; - - *(valString++) = 0; - trim_string (line); - trim_string (valString); - valLen = strlen (valString); - if (valLen == 0) - goto empty_assignment; - - // parse the assignment - - // inference_scale - - if (strcmp (line, "inference_scale") == 0) - { -#if (scoreType != 'I') - int v; - char extra; -#endif - - if (strcmp (valString, "none") == 0) - { - params->ic.inferScale = 0; - params->ic.writeAsInt = false; - } - else - { - params->ic.inferScale = string_to_score (valString); - -#if (scoreType == 'I') - params->ic.writeAsInt = false; // (no need to force it to an int) -#else - params->ic.writeAsInt = (sscanf (valString, "%d%c", &v, &extra) == 1); -#endif - } - continue; - } - - // hsp_threshold, gapped_threshold - // $$$ the use of expressions involving other settings should be - // $$$ .. 
expanded and generalized - - if (strcmp (line, "hsp_threshold") == 0) - { - params->ic.hspThresholdIsRatio = ratioNone; - if (strcmp_prefix (valString, "top") == 0) - params->hspThreshold = string_to_score_thresh (valString); - else if (strcmp_suffix (valString,"*inference_scale") == 0) - { - valString[valLen-strlen("*inference_scale")] = 0; - params->hspThreshold.t = 'S'; - params->hspThreshold.s = string_to_double (valString); - if (params->ic.inferScale > 0) - params->hspThreshold.s *= params->ic.inferScale; - else - params->ic.hspThresholdIsRatio = ratioMaxSubScore; - } - else if (strcmp_suffix (valString,"*worst_substitution") == 0) - { - valString[valLen-strlen("*worst_substitution")] = 0; - params->hspThreshold.t = 'S'; - params->hspThreshold.s = string_to_double (valString); - params->ic.hspThresholdIsRatio = ratioMinSubScore; - } - else - { - params->hspThreshold.t = 'S'; - params->hspThreshold.s = string_to_score (valString); - } - continue; - } - - if (strcmp (line, "gapped_threshold") == 0) - { - params->ic.gappedThresholdIsRatio = ratioNone; - if (strcmp_prefix (valString, "top") == 0) - params->gappedThreshold = string_to_score_thresh (valString); - else if (strcmp_suffix (valString,"*inference_scale") == 0) - { - valString[valLen-strlen("*inference_scale")] = 0; - params->gappedThreshold.t = 'S'; - params->gappedThreshold.s = string_to_double (valString); - if (params->ic.inferScale > 0) - params->gappedThreshold.s *= params->ic.inferScale; - else - params->ic.gappedThresholdIsRatio = ratioMaxSubScore; - } - else if (strcmp_suffix (valString,"*worst_substitution") == 0) - { - valString[valLen-strlen("*worst_substitution")] = 0; - params->gappedThreshold.t = 'S'; - params->gappedThreshold.s = string_to_double (valString); - params->ic.gappedThresholdIsRatio = ratioMinSubScore; - } - else if (strcmp (valString, "hsp_threshold") == 0) - params->gappedThreshold = params->hspThreshold; - else - { - params->gappedThreshold.t = 'S'; - 
params->gappedThreshold.s = string_to_score (valString); - } - continue; - } - - // gap_open_penalty, gap_extend_penalty - - if (strcmp (line, "gap_open_penalty") == 0) - { - params->ic.gapOpenIsRatio = ratioNone; - if (strcmp_suffix (valString,"*inference_scale") == 0) - { - valString[valLen-strlen("*inference_scale")] = 0; - params->scoring->gapOpen = string_to_double (valString); - if (params->ic.inferScale > 0) - params->scoring->gapOpen *= params->ic.inferScale; - else - params->ic.gapOpenIsRatio = ratioMaxSubScore; - } - else if (strcmp_suffix (valString,"*worst_substitution") == 0) - { - valString[valLen-strlen("*worst_substitution")] = 0; - params->scoring->gapOpen = string_to_double (valString); - params->ic.gapOpenIsRatio = ratioMinSubScore; - } - else - params->scoring->gapOpen = string_to_score (valString); - continue; - } - - if (strcmp (line, "gap_extend_penalty") == 0) - { - params->ic.gapExtendIsRatio = ratioNone; - if (strcmp_suffix (valString,"*inference_scale") == 0) - { - valString[valLen-strlen("*inference_scale")] = 0; - params->scoring->gapExtend = string_to_double (valString); - if (params->ic.inferScale > 0) - params->scoring->gapExtend *= params->ic.inferScale; - else - params->ic.gapExtendIsRatio = ratioMaxSubScore; - } - else if (strcmp_suffix (valString,"*worst_substitution") == 0) - { - valString[valLen-strlen("*worst_substitution")] = 0; - params->scoring->gapExtend = string_to_double (valString); - params->ic.gapExtendIsRatio = ratioMinSubScore; - } - else if (strcmp_suffix (valString,"*gap_open_penalty") == 0) - { - valString[valLen-strlen("*gap_open_penalty")] = 0; - params->scoring->gapExtend = string_to_double (valString) - * params->scoring->gapOpen; - params->ic.gapExtendIsRatio = params->ic.gapOpenIsRatio; - } - else - params->scoring->gapExtend = string_to_score (valString); - continue; - } - - // entropy - - if (strcmp (line, "entropy") == 0) - { - if (strcmp (valString, "on") == 0) - params->entropicHsp = true; - else if 
(strcmp (valString, "off") == 0) - params->entropicHsp = false; - else - goto on_off_mismatch; - continue; - } - - // max_sub_iterations, max_gap_iterations - - if (strcmp (line, "max_sub_iterations") == 0) - { - params->ic.subIterations = string_to_int (valString); - continue; - } - - if (strcmp (line, "max_gap_iterations") == 0) - { - params->ic.gapIterations = string_to_int (valString); - continue; - } - - // step - - if (strcmp (line, "step") == 0) - { - tempInt = string_to_int (valString); - if (tempInt <= 0) - goto bad_step; - params->step = tempInt; - continue; - } - - // min_identity, max_identity - // - // Ideally the user will set both of these; however, if one is set but - // not the other, we peg the other to the edge of the range; further, - // we validate that the user specifies percentile or non-percentile the - // same for both - - if ((strcmp (line, "min_identity") == 0) - || (strcmp (line, "max_identity") == 0)) - { - if (valString[valLen-1] == '%') - { - valString[--valLen] = 0; - if (idIsPercentile == false) - goto percentile_mismatch; - if (idIsPercentile == -1) - params->ic.idIsPercentile = idIsPercentile = true; - } - else if (idIsPercentile == true) - goto percentile_mismatch; - else if (idIsPercentile == -1) - params->ic.idIsPercentile = idIsPercentile = false; - - if (strcmp (line, "min_identity") == 0) - { - params->minIdentity = string_to_double (valString) / 100; - haveMinIdentity = true; - if (!haveMaxIdentity) params->maxIdentity = 1.0; - } - else - { - params->maxIdentity = string_to_double (valString) / 100; - haveMaxIdentity = true; - if (!haveMinIdentity) params->minIdentity = 0.0; - } - continue; - } - - // min_coverage, max_coverage - // - // Ideally the user will set both of these; however, if one is set but - // not the other, we peg the other to the edge of the range - - if ((strcmp (line, "min_coverage") == 0) - || (strcmp (line, "max_coverage") == 0)) - { - if (strcmp (line, "min_coverage") == 0) - { - params->minCoverage = 
string_to_double (valString) / 100; - haveMinCoverage = true; - if (!haveMaxCoverage) params->maxCoverage = 1.0; - } - else - { - params->maxCoverage = string_to_double (valString) / 100; - haveMaxCoverage = true; - if (!haveMinCoverage) params->minCoverage = 0.0; - } - continue; - } - - // min_continuity, max_continuity - // - // Ideally the user will set both of these; however, if one is set but - // not the other, we peg the other to the edge of the range - - if ((strcmp (line, "min_continuity") == 0) - || (strcmp (line, "max_continuity") == 0)) - { - if (strcmp (line, "min_continuity") == 0) - { - params->minContinuity = string_to_double (valString) / 100; - haveMinContinuity = true; - if (!haveMaxContinuity) params->maxContinuity = 1.0; - } - else - { - params->maxContinuity = string_to_double (valString) / 100; - haveMaxContinuity = true; - if (!haveMinContinuity) params->minContinuity = 0.0; - } - continue; - } - - // min_match_count - - if ((strcmp (line, "min_match_count") == 0) - || (strcmp (line, "min_nmatch") == 0)) - { - if ((valLen > 0) && (valString[valLen-1] == '%')) - params->minMatchCountRatio = pct_string_to_double (valString); - else - params->minMatchCount = string_to_int (valString); - continue; - } - - // max_mismatch_count - - if ((strcmp (line, "max_mismatch_count") == 0) - || (strcmp (line, "max_nmismatch") == 0)) - { - params->maxMismatchCount = string_to_int (valString); - continue; - } - - // max_gap_count - - if ((strcmp (line, "max_gap_count") == 0) - || (strcmp (line, "max_ngap") == 0)) - { - params->maxSeparateGapsCount = string_to_int (valString); - continue; - } - - // max_gap_column_count - - if ((strcmp (line, "max_gap_column_count") == 0) - || (strcmp (line, "max_cgap") == 0)) - { - params->maxGapColumnsCount = string_to_int (valString); - continue; - } - - goto unknown_assignment; - } - - return; - - ////////// - // failure exits - ////////// - -line_too_long: - suicidef ("line is too long (%s: line %d)", name, lineNum-1); - 
-invalid_line: - suicidef ("invalid line (%s: line %d) %s", name, lineNum, line); - -empty_assignment: - suicidef ("empty value in assignment (%s: line %d) %s=", - name, lineNum, line); - -percentile_mismatch: - suicidef ("assignment of identity percentile mismatches earlier setting" - " (%s: line %d) %s=", - name, lineNum, line); - -on_off_mismatch: - suicidef ("invalid on/off in assignment (%s: line %d) %s=%s", - name, lineNum, line, valString); - -unknown_assignment: - suicidef ("invalid name in assignment (%s: line %d) %s=%s", - name, lineNum, line, valString); - -bad_step: - suicidef ("invalid value for step (%s: line %d) %s=%s", - name, lineNum, line, valString); - } - -//---------- -// -// print_options-- -// Print some of the command line options in the output file. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -static void print_options - (void) - { - print_generic (currParams->outputFile, - "seed=%s%s", - seed_pattern(currParams->hitSeed), - (currParams->hitSeed->withTrans == 0)?"": - (currParams->hitSeed->withTrans == 1)?" w/transition" - :" w/2 transitions"); - //print_generic (currParams->outputFile, "--hspthresh=" scoreFmtSimple, currParams->hspThreshold); - //print_generic (currParams->outputFile, "--gappedthresh=" scoreFmtSimple, currParams->gappedThreshold); - //print_generic (currParams->outputFile, "--xDrop=" scoreFmtSimple, currParams->xDrop); - //print_generic (currParams->outputFile, "--yDrop=" scoreFmtSimple, currParams->yDrop); - //print_generic (currParams->outputFile, "%s", (currParams->entropicHsp)? 
"--entropy" : "--noentropy"); - //if (currParams->minMatches >= 0) print_generic (currParams->outputFile, 'z', "--filter=%d,%d", currParams->minMatches, currParams->maxTransversions); - //if (currParams->twinMinSpan > 0) print_generic (currParams->outputFile, "twins=%d..%d", currParams->twinMinSpan-2*currParams->hitSeed->length, currParams->twinMaxSpan-2*currParams->hitSeed->length); - // else print_generic (currParams->outputFile, "notwins"); - print_generic (currParams->outputFile, "step=%u", currParams->step); - } - -//---------- -// -// name_spec_is_quantum-- -// Determine if a sequence name specifier is describing a quantum sequence. -// -// It is deemed to be a quantum sequence if either the filename ends in ".qdna" -// or the action list contains the "quantum" action. -// Dump the nucleotides (from each sequence) for a gap-free alignment. -// -//---------- -// -// Arguments: -// char* spec: The sequence name specifier. This is of the form -// .. [], where each action list is a -// .. comma-separated list inside square brackets. -// -// Returns: -// (nothing) -// -//---------- - -static int name_spec_is_quantum - (char* spec) - { - char* nameEnd, *where; - char before, after; - - if (spec == NULL) return false; - - // see if file name ends with ".qdna" - - nameEnd = strchr(spec,'['); - if (nameEnd == NULL) - return (strcmp_suffix(spec,".qdna") == 0); - - if (strncmp_suffix(spec,".qdna",nameEnd-spec) == 0) return true; - - // see if action lists contain the keyword "quantum" - - where = strstr(nameEnd,"quantum"); // (nota bene: nameEnd != NULL) - if (where != NULL) - { - before = where[-1]; // (nota bene: where > nameEnd) - after = where[strlen("quantum")]; - if (((before == '[') || (before == ',')) - && ((after == ']') || (after == ',') || (after == '='))) - return true; - } - - return false; - } - -//---------- -// -// lastz_zero_stats-- -// Clear the statistics for this module. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void lastz_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&lastzStats, 0, sizeof(lastzStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// lastz_show_stats_before, lastz_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -static void lastz_show_stats_before - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, "-------------------\n"); - fprintf (f, " target length: %s\n", commatize (lzParams.seq1->len)); - if (lzParams.seq2 != NULL) - fprintf (f, " query length: %s\n", commatize (lzParams.seq2->len)); - fprintf (f, " step size: %u\n", currParams->step); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - - -static void lastz_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, " run time: %.3f seconds\n", lastzStats.runTime); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - diff --git a/programs/lastz/src/lastz.h b/programs/lastz/src/lastz.h deleted file mode 100755 index 3c781de..0000000 --- a/programs/lastz/src/lastz.h +++ /dev/null @@ -1,510 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: lastz.h -// -//---------- - -#ifndef lastz_H // (prevent multiple inclusion) -#define lastz_H - -// other files - -#include "dna_utilities.h" // dna/scoring stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff -#include "capsule.h" // multi-process sharing stuff -#include "seed_search.h" // seed 
hit search stuff -#include "gapped_extend.h" // gapped alignment stuff -#include "masking.h" // dynamic masking stuff -#include "diag_hash.h" // diagonals hashing stuff - -// establish ownership of global variables - -#ifdef lastz_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// data structures and types -// -//---------- - -global int debug; // how much debug info to show the programmer - // 0 => nothing - // 100 => everything - - -typedef struct infcontrol - { - char* inferFilename; // Name of file to write inferred scores to. If - // .. this includes "%s", then each score set we - // .. try during convergence will be written, - // .. with %s replaced by a string of the form - // .. sNNN or gNNN. The final score set will - // .. be written with %s replaced by an empty - // .. string - score inferScale; // The desired value for the maximum subsitution - // .. score. If this is zero, no scaling is - // .. performed. - int writeAsInt; // true => we should produce scores that are - // .. integers even if the score type - // .. is floating-point - int hspThresholdIsRatio;// non-zero => the hspThreshold and - int gappedThresholdIsRatio;// .. gappedThreshold value(s) passed into - // .. drive_scoring_inference is to be used - // .. as a ratio, as per ratioXXX value - int gapOpenIsRatio; // non-zero => the scoring->gapOpen and - int gapExtendIsRatio; // .. scoring->gapExtend value(s) passed - // .. into drive_scoring_inference is to be - // .. used as a ratio, as per ratioXXX value - int subIterations; // max number of iterations for inferring - // .. substitution scores - int gapIterations; // max number of iterations for inferring gap - // .. scores - int idIsPercentile; // true => control.minIdentity and - // .. control.maxIdentity are - // .. percentiles - // false => minIdentity and maxIdentity are - // .. 
the actual percentages - } infcontrol; - -enum - { - ratioNone = 0, - ratioMaxSubScore, - ratioMinSubScore, - }; - -// control-- -// data structure for providing global information to all parts of the lastz -// program; most of these are control parameters, but some information about -// the problem being solved is also passed here -// -// items marked here with an (I) are also valid for control of the scoring -// inference process; items marked (I-only) are *only* valid for inference; -// items marked (I-copy) are valid for inference but are copies of pointers -// to memory that belongs to the regular control set - -typedef struct control - { - // the sequences being processed - - char* seq1Filename; // target sequence - struct seq* seq1; - - char* seq2Filename; // query sequence - struct seq* seq2; - - u8* rev1; // (I) reverse of target and query sequences - u8* rev2; // .. (NOT reverse complement) - - // control parameters - - int inferScores; // true => infer scores from the sequences - int inferOnly; // true => prohibit alignment (other than - // .. what is performed for - // .. inference) - infcontrol ic; // (I-only) Additional inference controls. - - int selfCompare; // (I) true => align with the knowledge that - // .. seq1 and seq2 are the same - // .. sequence - int clonedQuery; // (I) true => seq1 and seq2 are cloned from - // .. the same sequence - int doSeedSearch; // (I) false => prohibit the seed search (and - // .. prohibit alignment too) - const s8* charToBits; // tables to map sequence characters to - const s8* upperCharToBits;// .. two-bit values, and illegal characters - // .. to -1; indexed by a u8 value, 0..255; - // .. charToBits will normally consider - // .. upper and lower case the same; - // .. upperCharToBits will normally reject - // .. lower case - int whichStrand; // (I) 0 => search + strand only - // < 0 => search - strand only - // > 0 => search both strands of target - u32 step; // (I) positional step size, indicating how - // .. 
often to store target word positions - - struct seed* hitSeed; // (I) seeding strategy for hits - int maxIndexBits; // (I) maximum number of index bits to use for - // .. the position table; if the seed - // .. weight (in bits) is larger than this, - // .. the additional bits (effective hash - // .. collisions) are resolved using the - // .. sequences - - int withTrans; // (I) number of allowed transitions in a seed - // .. hit (0, 1 or 2) - int noHitFiltering; // true => just report every raw seed hit, - // .. with no filtering or - // .. processing - u32 twinMinSpan; // (I) span threshold for hits to be considered - u32 twinMaxSpan; // (I) .. twins; twinMinSpan<=0 means we don't - // .. don't require twins (and we ignore - // .. twinMaxSpan); see the twininfo struct - // .. in seed_search.h for details - int basicHitType; // (I) the type of hit we require for a basic, - // .. single hit; one of the hitXXX values - // .. below - int minMatches; // (I) filter criteria for each seed hit; we - int maxTransversions;//(I) .. require at least minMatches matches - int filterCaresOnly;// (I) .. and no more than maxTransversions - // .. transversions; if minMatches<0 no - // .. filtering is performed; if - // .. maxTransversions<0 there is no - // .. transversion limit; if - // .. filterCaresOnly is true the filter - // .. criteria only applies to the seed's - // .. "care" positions (and not to any - // .. don't-care positions) -#ifndef noSeedHitQueue - int seedHitQueueSize;// number of entries to allocate for the - // .. seedHitQueue -#endif // not noSeedHitQueue - - int readCapsule; // true => read capsule file - int writeCapsule; // true => write capsule file - FILE* capsuleFile; // file to write capsule to (can be NULL) - char* capsuleFilename;// name of capsule file (can be NULL) - capinfo* capsule; // info about open capsule file - u64 targetMem; // number of bytes to pre-allocate for - // .. the target sequence record (zero - // .. 
indicates no pre-allocation) - u64 queryMem; // number of bytes to pre-allocate for - // .. the query sequence record (zero - // .. indicates no pre-allocation) - - FILE* anchorsFile; // file to read anchors from instead of - // .. discovering them via seeding (can be - // .. NULL) - char* anchorsFilename;// name of anchors file (can be NULL) - char* choresFilename; // name of chores file (can be NULL) - - int gfExtend; // whether to extend seed hits into HSPs - // .. (one of gfexXXX, from seed_search.h) - int mergeAnchors; // true => after seed hits/HSPs are found, - // .. merge any overlapping segments - int chain; // (I) true => perform chaining - score chainDiag; // (I) diagonal chaining penalty - score chainAnti; // (I) antidiagonal chaining penalty - int gappedExtend; // (I) true => perform gapped extension - - scoreset* scoring; // (I) scoring set - scoreset* maskedScoring; // (I) scoring set with lowercase penalized; in - // .. general, we treat lower case as bad - // .. during the search for HSPs, then treat - // .. upper/lower case as equivalent during - // .. the gapped alignment stage - score xDrop; // (I) threshold to stop extensions of - // .. *ungapped* matches; if the score - // .. drops off by more than xDrop, - // .. extension stops - score yDrop; // (I) threshold to stop extensions of *gapped* - // .. alignments (similar to xdrop) - int xDropUntrimmed; // (I) true => xDrop extension does *not* trim - // .. to the scoring peak if it - // .. happens to run into the end - // .. of either sequence - int yDropUntrimmed; // (I) true => yDrop extension does *not* trim - // .. to the scoring peak if it - // .. happens to run into the end - // .. of either sequence - sthresh hspThreshold; // (I) threshold for high scoring segment pairs - // .. (ungapped matches); an HSP is - // .. discarded if its score is less than - // .. this threshold - sthresh gappedThreshold;// (I) threshold for high scoring alignments; a - // .. 
gapped alignment is discarded if its - // .. score is less than this threshold - int entropicHsp; // (I) true => involve entropy in the decision - // .. to discard an HSP - int reportEntropy; // true => report any HSPs that are - // .. discarded due to entropy - int gappedAllBounds;// (I) true => bound gapped alignments by *all* - // .. gapped extensions of higher- - // .. scoring HSPs (a la blastz) - // false => bound gapped alignments only by - // .. gapped extensions that meet - // .. the score threshold - int mirrorHSP; // (I) true => each HSP or seed hit is output - // .. with its symmetric copy, - // .. reflected over the diagonal - int mirrorGapped; // (I) true => each gapped alignment is output - // .. with its symmetric copy, - // .. reflected over the diagonal - int inhibitTrivial; // (I) true => don't output the trivial self- - // .. alignment - u32 tracebackMem; // number of bytes to allocate to track - // .. gapped alignment traceback - tback* traceback; // memory in which to track gapped alignment - // .. traceback - int nIsAmbiguous; // true => N is an ambiguous nucleotide - // false => N is a sequence-splicer - int allowAmbiDNA; // true => permit ambiguous DNA characters - // .. B,D,H,K,M,R,S,V,W,Y - // false => only A,C,G,T,N,X permitted - score ambiMatch; // (non-negative) penalty for matches - // .. among ambiguous DNA - score ambiMismatch; // (non-negative) penalty for mismatches - // .. among ambiguous DNA - int hspImmediate; // true => process HSPs immediately (rather - // .. than collecting them in a - // .. table then processing en - // .. masse); if gapped extension - // .. is to be performed, it is - // .. done immediately (rather than - // .. collecting them in a table - // .. then extending en masse) - u32 searchLimit; // maximum number of "HSPs" allowed for a - // .. given query/strand; zero indicates - // .. "no limit"; "HSPs" means gapped - // .. 
alignments if hspImmediate is true - int searchLimitWarn;// true => warn user about reads that - // .. exceed searchLimit - int searchLimitKeep;// true => report alignments for reads that - // .. exceed searchLimit (up to the - // .. limit) - u32 numBestHsps; // maximum number of HSPs processed for a - // .. given query/strand; zero indicates - // .. "no limit"; if this is non-zero, only - // .. the best N HSPs are kept after sorting - float maxPairedDepth; // maximum alignment "depth" we'll allow - // .. in one call to gapped_extend(); this - // .. relates to P/L, were P is the number - // .. of "paired bases" and L is the length - // .. of the sequence; if this is zero it - // .. has no effect - u64 maxPairedBases; // maximum number of "paired bases" we'll - // .. allow in one call to gapped_extend(); - // .. if this is zero it has no effect, - // .. otherwise, it overrides maxPairedDepth - int overlyPairedWarn;// true => warn user about reads that - // .. exceed exceed the maximum - // .. alignment "depth" - int overlyPairedKeep;// false => we discard all alignments for - // .. reads that exceed the - // .. maximum alignment "depth" - // true => we output whatever alignments - // .. we happen to find prior to - // .. exceeding the limit - - float wordCountKeep; // if non-zero, this specifies how to set - // .. wordCountLimit adaptively; this value - // .. is between 0 and 1 and indicates a - // .. lower bound on the fraction of seed - // .. word positions that we will keep - u32 wordCountLimit; // the maximum number of positions that a - // .. particular seed word can occur in the - // .. target; words that occur more than - // .. this is deleted from the table; if - // .. wordCountKeep is non-zero, this - // .. variable is adaptively set; zero - // .. means no limit - u32 maxWordCountChasm;// if non-zero, this specifies the maximum - // .. length of an interval of discarded - // .. seed word positions (discarded due to - // .. 
wordCountKeep or wordCountLimit); if - // .. there is an interval longer than this, - // .. we protect some seed word positions - // .. that would otherwise be discarded - u32 dynamicMasking; // mask any position in target hit this many - // .. times; zero indicates no masking - FILE* maskingFile; // file to write masked intervals to (can be - // .. NULL) - char* maskingFilename;// name of masked intervals file (can be - // .. NULL) - int masking3Fields; // true => write 3-field masked intervals - // false => write 2-field masked intervals - FILE* softMaskedFile; // file to write soft-masked intervals to - // .. (can be NULL) - char* softMaskedFilename;// name of soft-masked intervals file (can - // .. be NULL) - int softMasked3Fields;// true => write 3-field soft-masked intervals - // false => write 2-field soft-masked intervals - int reportCensus; // true => report how many times each base - // .. in the target is aligned - FILE* censusFile; // file to write census to (can be NULL) - char* censusFilename; // name of census file (can be NULL) - char censusKind; // size of counting type to use for census - // .. (see the definition of the census type - // .. in masking.h). - - float minIdentity; // (I) the range of percent identity of HSPS we - float maxIdentity; // (I) .. will keep (or alignment blocks if we - // .. are doing gapped extension); these - // .. are values between 0.0 and 1.0 - float minCoverage; // (I) the range of query coverage of HSPS we - float maxCoverage; // (I) .. will keep (or alignment blocks if we - // .. are doing gapped extension); these - // .. are values between 0.0 and 1.0 - float minContinuity; // (I) the range of query continuity of HSPS we - float maxContinuity; // (I) .. will keep (or alignment blocks if we - // .. are doing gapped extension); these - // .. are values between 0.0 and 1.0 - double minMatchCountRatio;// (I) ratio of match-count/query-length; this - // .. is only valid if non-zero; and is used - // .. 
to reset minMatchCount as each query - // ... is loaded - u32 minMatchCount; // (I) the minimum match-count of HSPS we will - // .. keep (or alignment blocks if we are - // .. doing gapped extension) - s32 maxMismatchCount;//(I) the maximum number of mismatches we'll - // .. allow in HSPS we keep (or alignment - // .. blocks if we are doing gapped - // .. extension); -1 indicates we have no - // .. limit - s32 maxSeparateGapsCount;//(I) the maximum number of gaps we'll - // .. allow, counting each run of gapped - // .. columns as one gap, in HSPS we keep - // .. (or alignment blocks if we are doing - // .. gapped extension); -1 indicates we - // .. have no limit - s32 maxGapColumnsCount; // (I) the maximum number of gaps we'll - // .. allow, counting each gapped column - // .. as one gap, in HSPS we keep (or - // .. alignment blocks if we are doing - // .. gapped extension); -1 indicates we - // .. have no limit - -#ifdef densityFiltering - float maxDensity; // the maximum alignment density we will - // .. allow before discarding a query - // .. sequence; zero means no limit -#endif // densityFiltering - - char* outputFilename; // name of the file to write output to - FILE* outputFile; // file to write output to; this may be - // .. stdout - int outputFormat; // format in which to write alignments; one - // .. of fmtXXX values (in output.h) - void* outputInfo; // additional information for the particular - // .. output format chosen - char* readGroup; // additional information for SAM format - char* samRGTags; // additional information for SAM format - int endComment; // true => write a comment at the end of the - // .. output file, so that the user - // .. can tell we completed - int needTrueLengths;// true => we need seq->trueLen to be - // .. correct (e.g. because we have - // .. an anchors file, or because - // .. outputFormat requires it) - int deGapifyOutput; // true => convert gapped alignments back to - // .. 
ungapped segments on output - char* dotplotFilename;// name of the file to write dot plot to - FILE* dotplotFile; // file to write dot plot to - char* dotplotKeys; // genpaf keys for formatting dotplot - - // for inner alignment (interpolation) - - score innerThreshold; // threshold for HSPs during interpolation - // .. stage (equivalent to hspThreshold); - // .. zero or negative indicates no - // .. interpolation - struct seed* innerSeed; // seeding strategy for inner alignment hits - u32 innerWindow; // windowSize interpolation parameter (see - // .. inner_interpolate() and inner.c) - - // for quantum DNA query - - int targetIsQuantum;// true => the target is quantum DNA - int queryIsQuantum; // true => the query is quantum DNA - score ballScore; // minimum score required of a DNA word to - // .. be considered 'in' the ball - // .. surrounding a given quantum word; - // .. -1 indicates that this was never set - - // debug/info flags - - int lajCompatible; // true => backward compatibility for LAJ - u32 textContext; // the number of extra bp to print at the - // .. ends of matches for - // .. outputFormat=fmtText or fmtZeroText - char* args; // a copy of the argv options, less sequence - // .. names, suitable for printing - int verbosity; // (I) how much info to bombard the user with - // 0 => minimal - // 10 => everything - int reportTiming; // (I) true => report runtime to the output file - // .. (e.g as z records in a gfa - // .. file) - int reportStats; // (I) true => report search statistics to the - // .. output file (e.g as z records - // .. in a gfa file) - int showStats; // (I) true => show search statistics - FILE* statsFile; // (I-copy) file to write statistics to (can be - // .. NULL) - char* statsFilename; // name of stats file (can be NULL) - int showPosTable; // whether or not to show target positions - // .. 
table (one of spt_xxx values) - } control; - -// values for control parameters - -enum - { - hitBad = -1, - hit_min = 0, - hitSimple = hit_min, // simple hit, no hash collision detection - hitRecover, // simple hit with hash collision recovery - hit_max = hitRecover - }; - -enum - { - spt_dont = 0, // don't show target positions table - spt_table = 1, // show target positions table - spt_countsonly = 2, // show target positions table counts only - spt_withcounts = 3, // show target positions table and counts - spt_distribution = 4 // show position counts distribution - }; - -//---------- -// -// prototypes for routines in lastz.c -// -//---------- - -void set_up_hit_processor (control* params, int collectingCensus, - hitprocessor* hitProc, void** hitProcInfo); - -int start_one_strand (seq* target, postable* targPositions, seq* query, - int emptyAnchors, u32 prevAnchorsCount, - hitprocessor hitProc, void* hitProcInfo); -void finish_one_strand (seq* target, u8* targetRev, - postable* targPositions, - seq* query, u8* queryRev, - tback* traceback, census* targCensus); -void split_anchors (int id); -void swap_anchor_sets (void); -void print_job_header (void); - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - float runTime; - } lastzStats; - -// stats macros - -#define lastz_count_stat(field) ++lastzStats.field -#define lastz_uncount_stat(field) --lastzStats.field -#define lastz_set_stat(field,val) (lastzStats.field = val) -#else -#define lastz_count_stat(field) -#define lastz_uncount_stat(field) -#define lastz_set_stat(field,val) -#endif // collect_stats - -#undef global -#endif // lastz_H diff --git a/programs/lastz/src/lav.c b/programs/lastz/src/lav.c deleted file mode 100755 index b2ccc1d..0000000 --- a/programs/lastz/src/lav.c +++ /dev/null @@ -1,513 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: lav.c -// -//---------- -// -// lav-- 
-// Support for printing alignments in LAV format. -// -// LAV format is the well-established pairwise alignment format produced by -// blastz. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "masking.h" // dynamic masking stuff - -#define lav_owner // (make this the owner of its globals) -#include "lav.h" // interface to this module - -// debugging defines - -//#define snoopLavAlignList // if this is defined, extra code is added to - // .. track alignment lists - -//---------- -// -// print_lav_job_header-- -// Print lav format job header. -// -//---------- - -void print_lav_job_header - (FILE* f, - char* _programName, - char* _name1, - char* _name2, - char* _args, - scoreset* scoring, - sthresh* hspThreshold, - sthresh* gappedThreshold, - u8 dynamicMasking, - int withExtras, - score xDrop, - score yDrop) - { - char* programName = _programName; - char* name1 = _name1; - char* name2 = _name2; - char* args = _args; - - if (programName == NULL) programName = "(no name)"; - if (name1 == NULL) name1 = "(no name)"; - if (name2 == NULL) name2 = "(no name)"; - if (args == NULL) args = ""; - - fprintf (f, "#:lav\n"); - fprintf (f, "d {\n"); - fprintf (f, " \"%s %s %s %s\n", programName, name1, name2, args); - print_score_matrix (f, scoring, false); - fprintf (f, " O = " scoreFmtSimple - ", E = " scoreFmtSimple - ", K = %s" - ", L = %s" - ", M = %d", - scoring->gapOpen, scoring->gapExtend, - score_thresh_to_string (hspThreshold), - score_thresh_to_string (gappedThreshold), - dynamicMasking); - if (withExtras) - fprintf (f, ", X = " scoreFmtSimple - ", Y = " scoreFmtSimple, - xDrop, yDrop); - 
fprintf (f, "\"\n}\n"); - } - -//---------- -// -// print_lav_job_footer-- -// Print lav format job footer. -// -//---------- - -void print_lav_job_footer - (FILE* f) - { - fprintf (f, "#:eof\n"); - } - -//---------- -// -// print_lav_header-- -// Print lav format query header. -// -//---------- - -void print_lav_header - (FILE* f, - seq* seq1, - seq* seq2) - { - char* rcfShortSuffix[4] = { "", "~", "~-", "-" }; - char* rcfLongSuffix [4] = { "", // forward - "~", // complement - "~ (reverse complement)", // reverse - " (reverse complement)" }; // rev-comp - char* name1 = seq1->filename; - char* name2 = seq2->filename; - char* header1 = seq1->header; - char* header2 = seq2->header; - u32 contig1 = seq1->contig; - u32 contig2 = seq2->contig; - - if (name1 == NULL) name1 = "(no name)"; - if (name2 == NULL) name2 = "(no name)"; - if (header1 == NULL) header1 = "(no header)"; - if (header2 == NULL) header2 = "(no header)"; - - fprintf (f, "#:lav\n"); - fprintf (f, "s {\n"); - fprintf (f, " \"%s%s\" " unsposFmt " " unsposFmt " %d %u\n", - name1, rcfShortSuffix[seq1->revCompFlags], - seq1->startLoc, seq1->startLoc+seq1->len-1, - ((seq1->revCompFlags & rcf_rev) != 0)?1:0, contig1); - fprintf (f, " \"%s%s\" " unsposFmt " " unsposFmt " %d %u\n", - name2, rcfShortSuffix[seq2->revCompFlags], - seq2->startLoc, seq2->startLoc+seq2->len-1, - ((seq2->revCompFlags & rcf_rev) != 0)?1:0, contig2); - fprintf (f, "}\n"); - - fprintf (f, "h {\n"); - fprintf (f, " \"%s%s\"\n", header1, rcfLongSuffix[seq1->revCompFlags]); - fprintf (f, " \"%s%s\"\n", header2, rcfLongSuffix[seq2->revCompFlags]); - fprintf (f, "}\n"); - } - -//---------- -// -// print_lav_align_list-- -// Print a list of gapped alignments in lav format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopLavAlignList === - -#ifndef snoopLavAlignList -#define snoopLavAlignList_1 ; -#endif // not snoopLavAlignList - -#ifdef snoopLavAlignList - -#define snoopLavAlignList_1 \ - fprintf (stderr, "print_lav_align_list a=%08lX" \ - " a->seq1=%08lX a->seq2=%08lX\n", \ - (long) a, (long) a->seq1, (long) a->seq2); - -#endif // snoopLavAlignList - - -// print_lav_align_list-- - -void print_lav_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - alignel* a; - - if ((sp1->p != NULL) || (sp2->p != NULL)) - suicide ("lav format can't handle multi-sequences"); - // the issue is that we'd have to check if the partition changed - // since the previous alignment, and generate an s/h-stanza pair - - for (a=alignList ; a!=NULL ; a=a->next) - { - snoopLavAlignList_1; - print_lav_align (f, - a->seq1, a->beg1-1, a->end1, - a->seq2, a->beg2-1, a->end2, - a->script, a->s); - } - } - -//---------- -// -// print_lav_align-- -// Print a single gapped alignment in lav format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// const u8* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// const u8* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment takes -// .. in the DP matrix. -// score s: The alignment's score. 
-// -// Returns: -// (nothing) -// -//---------- - -static int align_match_percent (unspos run, unspos match); - -void print_lav_align - (FILE* f, - const u8* seq1, - unspos beg1, - unspos end1, - const u8* seq2, - unspos beg2, - unspos end2, - editscript* script, - score s) - { - unspos height, width, i, j, prevI, prevJ; - unspos run, match; - u32 opIx; - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - fprintf (f, "a {\n s " scoreFmtSimple "\n" - " b " unsposFmt " " unsposFmt "\n" - " e " unsposFmt " " unsposFmt "\n", - s, beg1, beg2, end1, end2); - - opIx = 0; - for (i=j=0 ; (i< height)||(j 0) -// { - i += run; j += run; - fprintf (f, " l " unsposFmt " " unsposFmt - " " unsposFmt " " unsposFmt " %d\n", - beg1+prevI, beg2+prevJ, beg1+i-1, beg2+j-1, - align_match_percent (run, match)); -// } - - if ((i < height) || (j < width)) - edit_script_indel_len (script, &opIx, &i, &j); - } - - fprintf (f, "}\n"); - } - - -static int align_match_percent (unspos run, unspos match) - { - possum numer, denom; - - if (run == 0) return 0; // (not clear what should be returned in this case) - - numer = 200 * ((possum) match) + ((possum) run); - denom = 2 * ((possum) run); - - return numer / denom; // 100*match/run, rounded - } - -//---------- -// -// print_lav_match-- -// Print an hsp in lav format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. -// unspos pos1: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the HSP. -// score s: The HSP's score. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_lav_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - unspos end1 = pos1 + length; - unspos end2 = pos2 + length; - int pctId; - - if ((sp1->p != NULL) || (sp2->p != NULL)) - suicide ("lav format can't handle multi-sequences"); - - // compute percent identity - - pctId = percent_identical (seq1, pos1, seq2, pos2, length); - - // print it - - fprintf (f, "a {\n"); - fprintf (f, " s " scoreFmtSimple "\n", s); - fprintf (f, " b " unsposFmt " " unsposFmt "\n", - pos1+1, pos2+1); - fprintf (f, " e " unsposFmt " " unsposFmt "\n", - end1, end2); - fprintf (f, " l " unsposFmt " " unsposFmt - " " unsposFmt " " unsposFmt " %d\n", - pos1+1, pos2+1, end1, end2, pctId); - fprintf (f, "}\n"); - } - - -void print_lavscore_match // same as regular lav except we output the score - (FILE* f, // .. wherever the pctid would normally go; this - seq* seq1, // .. is to allow compatibility with some very old - unspos pos1, // .. 
programs (Dblast, chain) - seq* seq2, - unspos pos2, - unspos length, - score s) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - unspos end1 = pos1 + length; - unspos end2 = pos2 + length; - - if ((sp1->p != NULL) || (sp2->p != NULL)) - suicide ("lav format can't handle multi-sequences"); - - // print it - - fprintf (f, "a {\n"); - fprintf (f, " s " scoreFmtSimple "\n", s); - fprintf (f, " b " unsposFmt " " unsposFmt "\n", - pos1+1, pos2+1); - fprintf (f, " e " unsposFmt " " unsposFmt "\n", - end1, end2); - fprintf (f, " l " unsposFmt " " unsposFmt " " unsposFmt " " unsposFmt " " scoreFmtSimple "\n", - pos1+1, pos2+1, end1, end2, s); - fprintf (f, "}\n"); - } - -//---------- -// -// print_lav_m_stanza-- -// print_lav_census_stanza-- -// print_lav_x_stanza-- -// -//---------- - -static void print_lav_m_interval (unspos b, unspos e, void* info); -static FILE* plmiFile = NULL; - -void print_lav_m_stanza (FILE* f, census* cen) - { - unspos n; - - plmiFile = f; - - fprintf (f, "m {\n"); - n = 0; - if (cen != NULL) - n = report_census_intervals (cen, print_lav_m_interval, NULL); - fprintf (f, " n " unsposFmt "\n", n); - fprintf (f, "}\n"); - } - -static void print_lav_m_interval - (unspos b, unspos e, arg_dont_complain(void* info)) - { fprintf (plmiFile, " x " unsposFmt " " unsposFmt "\n", b, e); } - - -void print_lav_census_stanza (FILE* f, census* cen) - { - fprintf (f, "Census {\n"); - print_census (f, NULL, cen, ' '); - fprintf (f, "}\n"); - } - - -void print_lav_x_stanza (FILE* f, unspos numMasked) - { fprintf (f, "x {\n n " unsposFmt "\n}\n", numMasked); } - -//---------- -// -// print_lav_comment_open, print_lav_comment_close-- -// Support general comment printing in a lav file. -// -// Note that the lav format does *not* support comments. Here we use a -// d-stanza, the presence of which may spoil the lav file for some downstream -// tools. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. 
-// -// Returns: -// (print_lav_comment_open only) A string which the caller should use as a -// prefix on all lines within the comment (this may be NULL). -// -//---------- - -char* print_lav_comment_open - (FILE* f) - { - fprintf (f, "#:lav\n"); - fprintf (f, "d {\n"); - - return NULL; - } - - -void print_lav_comment_close - (FILE* f) - { - fprintf (f, "}\n"); - } - - -//---------- -// -// print_lav_comment-- -// Print a comment in a lav file. -// -// Note that the lav format does *not* support comments, so these records will -// invariably spoil the lav file for most downstream tools. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// const char* format: A format string, as per printf. -// ...: (same as for printf) -// -// Returns: -// (nothing) -// -//---------- - -void print_lav_comment - (FILE* f, - const char* format, - ...) - { - va_list args; - - va_start (args, format); - vprint_lav_comment (f, format, args); - va_end (args); - } - -void vprint_lav_comment - (FILE* f, - const char* format, - va_list args) - { - fprintf (f, "# "); - if (format != NULL) - vfprintf (f, format, args); - fprintf (f, "\n"); - } - diff --git a/programs/lastz/src/lav.h b/programs/lastz/src/lav.h deleted file mode 100644 index 5b3b6e7..0000000 --- a/programs/lastz/src/lav.h +++ /dev/null @@ -1,55 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: lav.h -// -//---------- - -#ifndef lav_H // (prevent multiple inclusion) -#define lav_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "masking.h" // dynamic masking stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// prototypes for routines in lav.c -// -//---------- - -void print_lav_job_header (FILE* f, - char* programName, 
char* name1, char* name2, - char* args, scoreset* scoring, - sthresh* hspThreshold, sthresh* gappedThreshold, - u8 dynamicMasking, - int withExtras, score xDrop, score yDrop); -void print_lav_job_footer (FILE* f); -void print_lav_header (FILE* f, seq* seq1, seq* seq2); -void print_lav_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2); -void print_lav_align (FILE* f, - const u8* seq1, unspos beg1, unspos end1, - const u8* seq2, unspos beg2, unspos end2, - editscript* script, score s); -void print_lav_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s); -void print_lavscore_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s); -void print_lav_m_stanza (FILE* f, census* cen); -void print_lav_census_stanza (FILE* f, census* cen); -void print_lav_x_stanza (FILE* f, unspos numMasked); -char* print_lav_comment_open (FILE* f); -void print_lav_comment_close (FILE* f); -void print_lav_comment (FILE* f, const char* format, ...); -void vprint_lav_comment (FILE* f, const char* format, va_list args); - -#endif // lav_H diff --git a/programs/lastz/src/maf.c b/programs/lastz/src/maf.c deleted file mode 100755 index f956bbe..0000000 --- a/programs/lastz/src/maf.c +++ /dev/null @@ -1,721 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: maf.c -// -//---------- -// -// maf-- -// Support for printing alignments in MAF format. -// -// MAF format is a well-established multiple alignment format. 
As of Jan/2009, -// a spec for MAF files can be found at -// http://genome.ucsc.edu/FAQ/FAQformat#format5 -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "identity_dist.h" // identity distribution stuff -#include "coverage_dist.h" // query coverage distribution stuff -#include "continuity_dist.h" // query continuity distribution stuff - -#define maf_owner // (make this the owner of its globals) -#include "maf.h" // interface to this module - -static int max_digits (s64 num1, s64 num2); - -// debugging defines - -//#define debugSeq1Beg 858 // if defined, only alignments entirely within -//#define debugSeq1End 1153 // .. this range in sequence 1 are output; note - // .. that these positions are origin-zero - -//#define snoopMafAlignList // if this is defined, extra code is added to - // .. track alignment lists - -//---------- -// -// print_maf_job_header-- -// Print maf format job header. 
-// -//---------- - -void print_maf_job_header - (FILE* f, - char* _programName, - char* _args, - scoreset* scoring, - sthresh* hspThreshold, - sthresh* gappedThreshold, - score xDrop, - score yDrop, - int withComments) - { - char* programName = _programName; - char* args = _args; - - if (!withComments) return; - - if (programName == NULL) programName = "(no name)"; - if (args == NULL) args = ""; - - fprintf (f, "##maf version=1 scoring=%s\n", programName); - fprintf (f, "# %s %s\n", programName, args); - fprintf (f, "#\n"); - fprintf (f, "# hsp_threshold = %s\n", score_thresh_to_string (hspThreshold)); - if ((gappedThreshold->t == 'S') || (hspThreshold->t == 'S')) - fprintf (f, "# gapped_threshold = %s\n", score_thresh_to_string (gappedThreshold)); - else - fprintf (f, "# gapped_threshold = (derived from hsp_threshold)\n"); - fprintf (f, "# x_drop = " scoreFmtSimple "\n", xDrop); - fprintf (f, "# y_drop = " scoreFmtSimple "\n", yDrop); - print_score_matrix_prefix (f, scoring, true, "# "); - } - -//---------- -// -// print_maf_job_footer-- -// Print maf format job footer. -// -//---------- - -void print_maf_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_maf_header-- -// Print maf format query header. -// -//---------- - -void print_maf_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2)) - { - // (do nothing) - } - -//---------- -// -// print_maf_align_list-- -// Print a list of gapped alignments in maf format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// int withComments: true => print comments as well -// -// Returns: -// (nothing) -// -//---------- - -//=== stuff for snoopMafAlignList === - -#ifndef snoopMafAlignList -#define snoopMafAlignList_1 ; -#endif // not snoopMafAlignList - -#ifdef snoopMafAlignList - -#define snoopMafAlignList_1 \ - fprintf (stderr, "print_maf_align_list a=%08lX" \ - " a->seq1=%08lX a->seq2=%08lX\n", \ - (long) a, (long) a->seq1, (long) a->seq2); - -#endif // snoopMafAlignList - - -// print_maf_align_list-- - -void print_maf_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int withComments) - { - alignel* a; - unspos numer, denom; - - for (a=alignList ; a!=NULL ; a=a->next) - { - snoopMafAlignList_1; - if (withComments) - { - unspos height, width, i, j, prevI, prevJ, run; - u32 opIx; - - // report identity - alignment_identity (seq1, seq2, a, &numer, &denom); - fprintf (f, "# identity=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report coverage - alignment_coverage (seq1, seq2, a, &numer, &denom); - fprintf (f, "# coverage=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report continuity - alignment_continuity (a, &numer, &denom); - fprintf (f, "# continuity=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report alignment path - - fprintf (f, "# cigar="); - - height = a->end1 - a->beg1 + 1; - width = a->end2 - a->beg2 + 1; - - opIx = 0; - for (i=j=0 ; (i< height)||(jscript, &opIx); - if (run > 0) - { - fprintf (f, unsposFmt "m", run); - i += run; j += run; - } - - if ((i < height) || (j < width)) - { - prevI = i; prevJ = j; - edit_script_indel_len (a->script, &opIx, &i, &j); - if (i > prevI) - fprintf (f, unsposFmt "d", i - prevI); - if (j > prevJ) - fprintf (f, unsposFmt "i", j - prevJ); - } - } - fprintf (f, "\n"); - } 
- - print_maf_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s); - } - } - -//---------- -// -// print_maf_align-- -// Print a single gapped alignment in maf format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment -// .. takes in the DP matrix. -// score s: The alignment's score. -// -// Returns: -// (nothing) -// -//---------- - -static char* rcfSuffix[4] = { "", "~", "~", "" }; - -void print_maf_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - score s) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos height, width, i, j, run; - u32 opIx; - u8* p, *q; - unspos ix; - char* name1, *name2, *pref2, *suff1, *suff2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len, seq1True, seq2True; - char strand1, strand2; - unspos startI, startJ; - int len1, len2, nameW, startW, endW, lenW; - -#ifdef debugSeq1Beg - if ((beg1 < debugSeq1Beg) || (end1 > debugSeq1End)) return; -#endif // debugSeq1Beg - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - // report diagonal - - if (maf_dbgReportDiag) - fprintf (f, "# diagonal=" sgnposFmt "\n", diagNumber(beg1,beg2)); - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? 
seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1-1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2-1); - name2 = &sp2->pool[part->header]; - startLoc2 = part->startLoc; - offset2 = part->sepBefore + 1; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - ////////// - // print summary line - ////////// - - fprintf (f, "a score=" scoreFmt "\n", s); - - ////////// - // print aligning path in sequence 1 - ////////// - - // figure out fields and widths - - pref2 = ((maf_distinguishNames) && (strcmp (name1, name2) == 0))? "~" : ""; - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = beg1-1 - offset1 + startLoc1; - strand1 = '+'; - } - else - { - start1 = beg1-1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2-1 - offset2 + startLoc2; - strand2 = '+'; - } - else - { - start2 = beg2-1 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - strand2 = '-'; - } - - len1 = strlen (name1) + strlen (suff1); - len2 = strlen (pref2) + strlen (name2) + strlen (suff2); - nameW = (len1 >= len2)? 
len1 : len2; - - startW = max_digits (start1, start2); - endW = max_digits (end1+1-beg1, end2+1-beg2); - lenW = max_digits (seq1True, seq2True); - - // print aligning path in sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - - fprintf (f, "s %s%s%*s" unsposStarFmt " " unsposStarFmt " %c " unsposStarFmt " ", - name1, suff1, nameW+1-len1, " ", - startW, start1-1, endW, end1+1-beg1, strand1, lenW, seq1True); - - opIx = 0; - for (i=j=0 ; (iv+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIv+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startI print comments as well -// -// Returns: -// (nothing) -// -//---------- - -void print_maf_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - int withComments) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - char* name1, *name2, *pref2, *suff1, *suff2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq1Len, seq2Len, seq1True, seq2True; - char strand1, strand2; - int len1, len2, nameW, startW, lenW; - unspos ix; - segment seg; - unspos numer, denom; - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_maf_match"); - -#ifdef debugSeq1Beg - if ((pos1 < debugSeq1Beg) || (pos1+length > debugSeq1End)) return; -#endif // debugSeq1Beg - - // report diagonal - - if (maf_dbgReportDiag) - fprintf (f, "# diagonal=" sgnposFmt "\n", diagNumber(pos1,pos2)); - - if (withComments) - { - // report identity - segment_identity (seq1, pos1, seq2, 
pos2, length, &numer, &denom); - fprintf (f, "# identity=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report coverage - seg.pos1 = pos1; - seg.pos2 = pos2; - seg.length = length; - segment_coverage (seq1, seq2, &seg, &numer, &denom); - fprintf (f, "# coverage=" unsposSlashFmt, numer, denom); - if (denom != 0) fprintf (f, " (%.1f%%)", (100.0*numer) / denom); - fprintf (f, "\n"); - - // report alignment path - fprintf (f, "# cigar=" unsposFmt "m\n", length); - } - - // figure out position offsets and names - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - seq1Len = seq1->len; - seq1True = seq1->trueLen; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - seq1Len = part->sepAfter - offset1; - seq1True = part->trueLen; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - } - - // print summary line - - fprintf (f, "a score=" scoreFmt "\n", s); - - // figure out fields and widths - - pref2 = ((maf_distinguishNames) && (strcmp (name1, name2) == 0))? 
"~" : ""; - suff1 = rcfSuffix[seq1->revCompFlags]; - suff2 = rcfSuffix[seq2->revCompFlags]; - - if ((seq1->revCompFlags & rcf_rev) == 0) - { - start1 = pos1 - offset1 + startLoc1; - strand1 = '+'; - } - else - { - start1 = pos1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); - strand1 = '-'; - } - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = pos2 - offset2 + startLoc2; - strand2 = '+'; - } - else - { - start2 = pos2 - offset2 + seq2True+2 - (startLoc2 + seq2Len); - strand2 = '-'; - } - - len1 = strlen (name1) + strlen (suff1); - len2 = strlen (pref2) + strlen (name2) + strlen (suff2); - nameW = (len1 >= len2)? len1 : len2; - - startW = max_digits (start1, start2); - lenW = max_digits (seq1True, seq2True); - - // print aligning segment of sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - - fprintf (f, "s %s%s%*s" unsposStarFmt " " unsposFmt " %c " unsposStarFmt " ", - name1, suff1, nameW+1-len1, " ", - startW, start1-1, length, strand1, lenW, seq1True); - - for (ix=0 ; ix= w2)? 
w1 : w2; - } diff --git a/programs/lastz/src/maf.h b/programs/lastz/src/maf.h deleted file mode 100644 index c6b5c78..0000000 --- a/programs/lastz/src/maf.h +++ /dev/null @@ -1,64 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: maf.h -// -//---------- - -#ifndef maf_H // (prevent multiple inclusion) -#define maf_H - -// other files - -#include // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef maf_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef maf_owner -int maf_distinguishNames = false; // true => add a "~" prefix to the second - // sequence name when names are identical -int maf_dbgReportDiag = false; // true => report diagonal as a maf comment -#else -global int maf_distinguishNames; -global int maf_dbgReportDiag; -#endif - -//---------- -// -// prototypes for routines in maf.c -// -//---------- - -void print_maf_job_header (FILE* f, - char* programName, char* args, scoreset* scoring, - sthresh* hspThreshold, sthresh* gappedThreshold, - score xDrop, score yDrop, - int withComments); -void print_maf_job_footer (FILE* f); -void print_maf_header (FILE* f, seq* seq1, seq* seq2); -void print_maf_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - int withComments); -void print_maf_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s); -void print_maf_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, int withComments); -void print_maf_comment (FILE* f, const char* format, ...); -void vprint_maf_comment (FILE* f, const char* format, va_list args); - -#undef global -#endif // maf_H diff --git 
a/programs/lastz/src/masking.c b/programs/lastz/src/masking.c deleted file mode 100755 index 5a6e980..0000000 --- a/programs/lastz/src/masking.c +++ /dev/null @@ -1,827 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: masking.c -// -//---------- -// -// masking-- -// Provide dynamic masking for sequences that do not have their repeats -// masked. -// -// The basic premise is that if anything in the first sequence aligns to a -// repeat in the second sequence, it will align to all copies of the repeat. -// Thus, positions that align frequently are probably repeat elements. -// -// For every position in sequence 1, we keep a count of how many times it has -// been part of an alignment (even if it's part of an indel). When the count -// reaches the given threshold, we mark that position with an 'x' so that -// subsequent alignments will give it a large scoring penalty. -// -// This is most useful if the caller is comparing sequence 1 to many other -// sequences. The first comparison(s) will get no advantage-- nothing will be -// masked. But if enough sequences are compared, the later sequences will -// benefit from the masking. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -#define masking_owner // (make this the owner of its globals) -#include "masking.h" // interface to this module - -//---------- -// -// miscellaneous data -// -//---------- - -//#define noCallback // if this is defined, mask_interval doesn't - // bother to call the callback routine for - // masked intervals; this is to test whether - // there is any speed advantage to removing - // masked seeds from the position table; note - // that there are other good reasons to remove - // masked seeds, regardless of speed - -//---------- -// -// prototypes for private functions -// -//---------- - -static unspos mask_interval (u8* fwd, u8* rev, unspos beg, unspos end, - census* cen, - void(*func) (unspos beg, unspos end, void* info), - void* info); - -//---------- -// -// new_census-- -// Create a new census of a given length. -// -//---------- -// -// Arguments: -// new_census len: The length of the sequence to be censused. -// char kind: The type of count[] array to allocate (see the -// .. definition of the census type in masking.h). -// u32 maskThresh: The count threshold. Note that we don't range -// .. check this to compare its validity vs. kind. -// -// Returns: -// Pointer to the new census (failures are fatal); caller must eventually -// dispose of this memory with a call to free(). 
-// -//---------- - -census* new_census - (unspos len, - char kind, - u32 maskThresh) - { - census* cen; - size_t bytesNeeded; - - if (strchr ("BWL", kind) == NULL) kind = 'B'; - if (len == 0) len = 1; - - if (kind == 'B') - { - bytesNeeded = sizeof(census) + (len*sizeof(u8)); - cen = (census*) zalloc_or_die ("new_census", bytesNeeded); - cen->count8 = (u8*) (cen + 1); - } - else if (kind == 'W') - { - bytesNeeded = sizeof(census) + (len*sizeof(u16)); - cen = (census*) zalloc_or_die ("new_census", bytesNeeded); - cen->count16 = (u16*) (cen + 1); - } - else // if (kind == 'L') - { - bytesNeeded = sizeof(census) + (len*sizeof(u32)); - cen = (census*) zalloc_or_die ("new_census", bytesNeeded); - cen->count32 = (u32*) (cen + 1); - } - - cen->len = len; - cen->kind = kind; - cen->maskThresh = maskThresh; - - return cen; - } - -//---------- -// -// census_mask_segments-- -// Given a list of ungapped alignment segments, increment the count of all -// aligned locations and for any that meet or exceed the threshold, mask them -// from the sequence. -// -//---------- -// -// Arguments: -// segtable* st: The segments. -// u8* fwd, rev: The sequence and its reverse. rev may be NULL. -// census* cen: The census to update. -// void (*func): A callback function to report masked intervals to. -// (unspos beg, .. beg,end is the interval that is about to be -// unspos end, .. masked (origin 1, inclusive). info is passed -// void* info) .. through from our caller. -// void* info: Pass-thru for func. -// -// Returns: -// A count of the number of bases masked. This does not include any bases that -// were previously masked. 
-// -//---------- - -unspos census_mask_segments - (segtable* st, - u8* fwd, - u8* rev, - census* cen, - void (*func)(unspos beg, unspos end, void* info), - void* info) - { - segment* seg; - u32 ix; - unspos beg, end, pos; - unspos count = 0; - - if (cen == NULL) - return 0; - - for (ix=0,seg=st->seg ; ixlen ; ix++,seg++) - { - if ((seg->length > cen->len) || (seg->pos1 > cen->len - seg->length)) - suicide ("census_mask_segments, internal error"); - beg = seg->pos1; - end = beg + seg->length; - switch (cen->kind) - { - case 'B': - for (pos=beg ; poscount8[pos] < u8max) cen->count8 [pos]++; } - break; - case 'W': - for (pos=beg ; poscount16[pos] < u16max) cen->count16[pos]++; } - break; - case 'L': - for (pos=beg ; poscount32[pos] < u32max) cen->count32[pos]++; } - break; - } - if (cen->maskThresh > 0) - count += mask_interval (fwd, rev, beg+1, end, cen, func, info); - } - - masking_add_stat (maskedBases, count); - return count; - } - -//---------- -// -// census_mask_aligns-- -// Given a list of alignments, increment the count of all aligned locations -// and for any that meet or exceed the threshold, mask them from the sequence. -// -// Note that the entire aligned interval is counted/masked, even indels. -// -//---------- -// -// Arguments: -// alignel* alignList: The alignments. -// u8* fwd, rev: The sequence and its reverse. rev may be NULL. -// census* cen: The census to update. -// void (*func): A callback function to report masked intervals to. -// (unspos beg, .. beg,end is the interval that is about to be -// unspos end, .. masked (origin 1, inclusive). info is passed -// void* info) .. through from our caller. -// void* info: Pass-thru for func. -// -// Returns: -// A count of the number of bases masked. This does not include any bases that -// were previously masked. 
-// -//---------- - -unspos census_mask_aligns - (alignel* alignList, - u8* fwd, - u8* rev, - census* cen, - void (*func)(unspos beg, unspos end, void* info), - void* info) - { - alignel* a; - unspos beg, end, pos; - unspos count = 0; - - if (cen == NULL) - return 0; - - for (a=alignList ; a!=NULL ; a=a->next) - { - if (a->beg1 < 1) - suicide ("census_mask_aligns, internal error"); - if (a->end1 > cen->len) - suicide ("census_mask_aligns, internal error"); - beg = a->beg1 - 1; - end = a->end1; - switch (cen->kind) - { - case 'B': - for (pos=beg ; poscount8[pos] < u8max) cen->count8 [pos]++; } - break; - case 'W': - for (pos=beg ; poscount16[pos] < u16max) cen->count16[pos]++; } - break; - case 'L': - for (pos=beg ; poscount32[pos] < u32max) cen->count32[pos]++; } - break; - } - if (cen->maskThresh > 0) - count += mask_interval (fwd, rev, beg+1, end, cen, func, info); - } - - masking_add_stat (maskedBases, count); - return count; - } - -//---------- -// -// mask_interval-- -// Given an interval, mask any position that meets or exceeds the count -// threshold. -// -//---------- -// -// Arguments: -// u8* fwd, rev: The sequence and its reverse. rev may be NULL. -// unspos beg, end: The interval to check. Origin-1, inclusive. -// census* cen: The census describing which bases to mask. -// void (*func): A callback function to report masked intervals to. -// (unspos beg, .. beg,end is the interval that is about to be -// unspos end, .. masked, see note 1 (origin 1, inclusive). info -// void* info) .. is passed through from our caller. -// void* info: Pass-thru for func. -// -// Returns: -// A count of the number of bases masked. This does not include any bases that -// were previously masked. -// -//---------- -// -// Notes: -// (1) The intervals passed to the callback function are not the same as the -// interval passed to this routine. Only sub-intervals that reach the -// masking threshold for the first time are passed to the callback. 
-// -//---------- - -#ifndef noCallback // normal version, callback used - -static unspos mask_interval - (u8* fwd, - u8* rev, - unspos beg, - unspos end, - census* cen, - void (*func)(unspos beg, unspos end, void* info), - void* info) - { - static const u32 noRun = (u32) -1; - unspos revLen, runBeg, pos, j; - unspos basesMasked = 0; - u32 count; - - if (cen == NULL) - return 0; - - if ((beg < 1) || (end > cen->len)) - suicide ("mask_interval, internal error"); - - revLen = cen->len - 1; - - runBeg = noRun; - for (pos=beg-1 ; poskind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? cen->count16[pos] - : cen->count32[pos]; - if ((cen->maskThresh > 0) - && (count >= cen->maskThresh) - && (dna_isupper(fwd[pos]))) - { if (runBeg == noRun) runBeg = pos; } - else if (runBeg != noRun) - { - func (runBeg+1, pos, info); - for (j=runBeg ; jmaskThresh == 0)) - return 0; - - if ((beg < 1) || (end > cen->len)) - suicide ("mask_interval, internal error"); - - revLen = cen->len - 1; - for (pos=beg-1 ; poskind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? cen->count16[pos] - : cen->count32[pos]; - if (count >= cen->maskThresh) - { - if (dna_isupper(fwd[pos])) { fwd[pos] = 'x'; basesMasked++; } - if (rev != NULL) rev[revLen-pos] = 'x'; - } - } - - return basesMasked; - } - -#endif // noCallback - -//---------- -// -// count_masked_bases-- -// Report masked bases in a sequence. The masked bases could have been in the -// sequence when it was input, or they could have been added during running of -// the program (e.g. by dynamic masking). -// -//---------- -// -// Arguments: -// seq* seq: The sequence that has been masked. -// int maskChar: The character to consider as a mask. Normally this -// .. is a character (e.g. 'X' or 'N'). However, the -// .. value -1 means that we should mask by changing -// .. to lowercase. -// -// Returns: -// The number of bases masked. 
-// -//---------- - -unspos count_masked_bases - (seq* _seq, - int maskChar) - { - unspos ix; - unspos numMasked = 0; - int isMasked; - - for (ix=0 ; ix<=_seq->len ; ix++) - { - if (maskChar >= 0) - isMasked = (_seq->v[ix] == maskChar); - else - isMasked = ((_seq->v[ix] >= 'a') && (_seq->v[ix] <= 'z')); - - if (isMasked) numMasked++; - } - - return numMasked; - } - -//---------- -// -// report_census_intervals-- -// Report runs in a census of positions that meet or exceed the threshold. -// -//---------- -// -// Arguments: -// census* cen: The census to report. -// void (*func): The reporting function. beg,end is the interval -// (unspos beg, .. (origin 1, inclusive). info is passed through -// unspos end, .. from our caller. -// void* info) -// void* info: Pass-thru for func. -// -// Returns: -// The number of intervals reported. -// -//---------- - -unspos report_census_intervals - (census* cen, - void (*func)(unspos beg, unspos end, void* info), - void* info) - { - static const u32 noRun = (u32) -1; - unspos runBeg, pos; - unspos numIntervals = 0; - u32 count; - - if (cen == NULL) - return 0; - - runBeg = noRun; - for (pos=0 ; poslen ; pos++) - { - count = (cen->kind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? cen->count16[pos] - : cen->count32[pos]; - if (count >= cen->maskThresh) - { if (runBeg == noRun) runBeg = pos; } - else if (runBeg != noRun) - { - if (func != NULL) func (runBeg+1, pos, info); - numIntervals++; - runBeg = noRun; - } - } - - if (runBeg != noRun) - { - if (func != NULL) func (runBeg+1, cen->len, info); - numIntervals++; - } - - return numIntervals; - } - -//---------- -// -// report_masked_intervals-- -// Report masked runs in a sequence. The masked bases could have been in the -// sequence when it was input, or they could have been added during running of -// the program (e.g. by dynamic masking). -// -//---------- -// -// Arguments: -// seq* seq: The sequence that has been masked. 
-// int maskChar: The character to consider as a mask. Normally this -// .. is a character (e.g. 'X' or 'N'). However, the -// .. value -1 means that we should mask by changing -// .. to lowercase. -// void (*func): The reporting function. beg,end is the interval -// (unspos beg, .. (origin 1, inclusive). info is passed through -// unspos end, .. from our caller. -// void* info) -// void* info: Pass-thru for func. -// -// Returns: -// The number of intervals reported. -// -//---------- - -unspos report_masked_intervals - (seq* _seq, - int maskChar, - void (*func)(unspos beg, unspos end, void* info), - void* info) - { - static const u32 noRun = (u32) -1; - unspos runBeg, ix; - unspos numIntervals = 0; - int isMasked; - - runBeg = noRun; - for (ix=0 ; ix<=_seq->len ; ix++) - { - if (maskChar >= 0) - isMasked = (_seq->v[ix] == maskChar); - else - isMasked = ((_seq->v[ix] >= 'a') && (_seq->v[ix] <= 'z')); - - if (isMasked) - { if (runBeg == noRun) runBeg = ix; } - else if (runBeg != noRun) - { - if (func != NULL) func (runBeg+1, ix, info); - numIntervals++; - runBeg = noRun; - } - } - - if (runBeg != noRun) - { - if (func != NULL) func (runBeg+1, _seq->len, info); - numIntervals++; - } - - return numIntervals; - } - -//---------- -// [[-- a report_census_intervals or report_masked_intervals callback function --]] -// -// print_masking_interval-- -// Write a masking interval to a file, in a format compatible with being read -// by mask_sequence() or mask_sequence_keep(). -// print_masking_interval_3-- -// Write a masking interval to a file, including sequence names. This format -// is NOT compatible with mask_sequence() or mask_sequence_keep(). 
-// -// A typical masking file (resulting from several calls to this routine) looks -// like this (for print_masking_interval): -// -// 1527933 3184039 -// 4165389 6877343 -// 7374477 7902860 -// -// or like this (for print_masking_interval_3): -// -// chr1 1527933 3184039 -// chr1 4165389 6877343 -// chr3 7374477 7902860 -// -// Each line describes a region to be masked. Indexes are one-based, and -// inclusive on both ends. -// -//---------- -// -// Arguments: -// unspos beg, end: The interval, in the target sequence, that is to be -// .. printed. Origin-1, inclusive. -// void* info: (really pmiInfo*) -// info->f: The file to write to. -// info->seq: The sequence that has been masked. -// -// Returns: -// (nothing) -// -//---------- - -void print_masking_interval - (unspos beg, - unspos end, - void* info) - { - FILE* f = ((pmiInfo*) info)->f; - seq* _seq = ((pmiInfo*) info)->seq; - - beg += _seq->startLoc - 1; - end += _seq->startLoc - 1; - - fprintf (f, unsposFmt " " unsposFmt "\n", beg, end); - } - -void print_masking_interval_3 - (unspos beg, - unspos end, - void* info) - { - FILE* f = ((pmiInfo*) info)->f; - seq* _seq = ((pmiInfo*) info)->seq; - seqpartition* sp = &_seq->partition; - partition* part; - char* name; - unspos offset; - - // figure out position offsets and names - - if (sp->p == NULL) // sequence 1 is not partitioned - { - name = (_seq->useFullNames)? _seq->header : _seq->shortHeader; - if ((name == NULL) || (name[0] == 0)) name = "seq1"; - offset = 0; - } - else // sequence 1 is partitioned - { - part = lookup_partition (_seq, beg-1); - name = &sp->pool[part->header]; - offset = part->sepBefore + 1; - } - - beg += _seq->startLoc - offset - 1; - end += _seq->startLoc - offset - 1; - - // print the interval - - fprintf (f, "%s " unsposFmt " " unsposFmt "\n", name, beg, end); - } - -//---------- -// -// print_census-- -// Print all locations in a census that meet or exceed the threshold. 
-// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* _seq: The sequence being counted. If this is NULL, census -// .. positions are not named. If it is not NULL, the -// .. sequence name is shown with each census position. -// census* cen: The census to print. -// char delimiter: The character to use between fields (e.g. tab or space). -// -// Returns: -// (nothing) -// -//---------- - -void print_census - (FILE* f, - seq* _seq, - census* cen, - char delimiter) - { - seqpartition* sp; - partition* nextPart; - u32 nextIx; - char* name; - unspos offset; - unspos pos; - u32 count; - - if (cen == NULL) return; - - // simple print with no sequence names - - if (_seq == NULL) - { - for (pos=0 ; poslen ; pos++) - { - count = (cen->kind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? cen->count16[pos] - : cen->count32[pos]; - if (count >= cen->maskThresh) - fprintf(f, unsposFmt "%c%u\n", pos+1, delimiter, count); - } - return; - } - - // print with same sequence name, for non-partitioned sequence - - sp = &_seq->partition; - if (sp->p == NULL) - { - name = (_seq->useFullNames)? _seq->header : _seq->shortHeader; - if ((name == NULL) || (name[0] == 0)) name = "seq1"; - for (pos=0 ; poslen ; pos++) - { - count = (cen->kind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? cen->count16[pos] - : cen->count32[pos]; - if (count >= cen->maskThresh) - fprintf(f, "%s%c" unsposFmt "%c%u\n", - name, delimiter, pos+1, delimiter, count); - } - return; - } - - // print with sequence names, for partitioned sequence - - nextPart = sp->p; - nextIx = 0; - name = NULL; - offset = 0; - - for (pos=0 ; poslen ; pos++) - { - if (pos == nextPart->sepBefore) - { - if (nextIx < sp->len) - { - name = &sp->pool[nextPart->header]; - offset = nextPart->sepBefore + 1; - nextPart++; nextIx++; - } - else - name = NULL; - } - else if (name != NULL) - { - count = (cen->kind == 'B')? cen->count8 [pos] - : (cen->kind == 'W')? 
cen->count16[pos] - : cen->count32[pos]; - if (count >= cen->maskThresh) - fprintf(f, "%s%c" unsposFmt "%c%u\n", - name, delimiter, pos+1-offset, delimiter, count); - } - else - suicidef ("internal error in print_census\n"); - } - - } - -//---------- -// -// masking_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void masking_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&maskingStats, 0, sizeof(maskingStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// masking_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void masking_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, " masked bases: %s\n", commatize(maskingStats.maskedBases)); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void masking_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "masked_bases=%d\n", maskingStats.maskedBases); -#endif // collect_stats - } - diff --git a/programs/lastz/src/masking.h b/programs/lastz/src/masking.h deleted file mode 100644 index f2214dd..0000000 --- a/programs/lastz/src/masking.h +++ /dev/null @@ -1,116 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: masking.h -// -//---------- - -#ifndef masking_H // (prevent multiple inclusion) -#define masking_H - -// other files - -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include 
"segment.h" // segment table management stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef masking_owner -#define global -#else -#define global extern -#endif - -//---------- -// -// data structures and types -// -//---------- - -typedef struct census - { - unspos len; // the length of the sequence being censused - char kind; // the type of counting array - // .. 'B' => count8 is used - // .. 'W' => count16 is used - // .. 'L' => count32 is used - // .. (anthing else => count8) - u32 maskThresh; // the count threshold; any position with a count - // .. this high is to be masked - u8* count8; // how many times each base has been part of an - u16* count16; // .. alignment (variable-length array); the length - u32* count32; // .. of this array is len, and the entries - // .. correspond to sequence positions 1..len; - // .. only one of these is active, as indicated by - // .. census.kind; the memory for this array is - // .. 
allocated as part of this structure - } census; - - -// info struct for print_masking_interval (used for the "info" argument) - -typedef struct pmiInfo - { - FILE* f; // the file to write to - seq* seq; // the sequence that has been masked - } pmiInfo; - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int maskedBases; - } maskingStats; - -// stats macros - -#define masking_count_stat(field) ++maskingStats.field -#define masking_uncount_stat(field) --maskingStats.field -#define masking_set_stat(field,val) (maskingStats.field = val) -#define masking_add_stat(field,val) (maskingStats.field += val) -#else -#define masking_count_stat(field) -#define masking_uncount_stat(field) -#define masking_set_stat(field,val) -#define masking_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void masking_zero_stats (void); -void masking_show_stats (FILE* f); -void masking_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in masking.c -// -//---------- - -census* new_census (unspos len, char kind, u32 maskThresh); -unspos census_mask_segments (segtable* st, u8* fwd, u8* rev, census* cen, - void(*func) (unspos beg, unspos end, void* info), - void* info); -unspos census_mask_aligns (alignel* a, u8* fwd, u8* rev, census* cen, - void(*func) (unspos beg, unspos end, void* info), - void* info); -unspos count_masked_bases (seq* _seq, int maskChar); -unspos report_census_intervals (census* cen, - void(*func) (unspos beg, unspos end, void* info), - void* info); -unspos report_masked_intervals (seq* seq, int maskChar, - void(*func) (unspos beg, unspos end, void* info), - void* info); -void print_masking_interval (unspos beg, unspos end, void* info); -void print_masking_interval_3 (unspos beg, unspos end, void* info); -void print_census (FILE* f, seq* _seq, census* cen, char delimiter); - -#undef global -#endif // masking_H 
diff --git a/programs/lastz/src/output.c b/programs/lastz/src/output.c deleted file mode 100755 index f50f63b..0000000 --- a/programs/lastz/src/output.c +++ /dev/null @@ -1,1413 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: output.c -// -//---------- -// -// output-- -// Interface between the main program and he various output formats. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "sequences.h" // sequence stuff -#include "infer_scores.h" // scoring inference stuff -#include "identity_dist.h" // identity distribution stuff -#include "gfa.h" // gfa alignment format stuff -#include "lav.h" // lav alignment format stuff -#include "axt.h" // axt alignment format stuff -#include "maf.h" // maf alignment format stuff -#include "sam.h" // sam alignment format stuff -#include "cigar.h" // cigar alignment format stuff -#include "genpaf.h" // genpaf alignment format stuff -#include "text_align.h" // textual alignment format stuff -#include "align_diffs.h" // alignment differences format stuff -#include "lastz.h" // lastz program-wide stuff - -#define output_owner // (make this the owner of its globals) -#include "output.h" // interface to this module - -extern char* programName; // (from lastz.c) -extern char* programVersionMajor; // (from lastz.c) -extern char* programVersionMinor; // (from lastz.c) -extern char* programVersionSubMinor; // (from lastz.c) -extern char* programRevisionDate; // (from lastz.c) - -extern control* currParams; // (from lastz.c) - -// debugging defines - -//#define snoopGenpaf // if this is defined, extra code is added to - // .. track calls to print_genpaf_align() and - // .. print_genpaf_align(); note that another - // .. 
instance of this define is in genpaf.c - -//---------- -// -// private global data -// -//---------- - -u32 printedForQuery; // the number of alignments that have been printed - // .. for the current query (both strands together) - -int strandHeaderPrinted; // false => we have yet to print a header for the - // .. current strand-to-strand alignment - -// how often shall we flush the output? - -#define matchFlushFrequency 1000 - -//---------- -// -// prototypes for private functions -// -//---------- - -static void print_match_composition (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, - unspos length, - score s, - seed* hitSeed, u32 step); -static void dump_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, - unspos length); -static char* program_name (void); - -//---------- -// -// init_output_for_query, init_output_for_strand-- -// -//---------- - -void init_output_for_query (void) - { printedForQuery = 0; } - -void init_output_for_strand (void) - { strandHeaderPrinted = false; } - -//---------- -// -// print_align_list_segments-- -// -//---------- - -//=== stuff for snoopGenpaf === - -#ifndef snoopGenpaf -#define snoopGenpaf_1 ; -#endif // not snoopGenpaf - -#ifdef snoopGenpaf - -#define snoopGenpaf_1 \ - fprintf (stderr, "segmenting(" \ - "seq1:" unsposDotsFmt " seq2:" unsposDotsFmt ")\n", \ - beg1, end1, beg2, end2); - -#endif // snoopGenpaf - - -//=== print_align_list_segments === - -void print_align_list_segments (alignel* alignList) - { - alignel* a; - unspos beg1, end1, beg2, end2; - unspos height, width, i, j, prevI, prevJ, run; - u32 opIx; - score s; - - for (a=alignList ; a!=NULL ; a=a->next) - { - beg1 = a->beg1; - end1 = a->end1; - beg2 = a->beg2; - end2 = a->end2; - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - snoopGenpaf_1; - - // print the alignment's segments - - opIx = 0; - for (i=j=0 ; (i< height)||(jscript, &opIx); - i += run; j += run; - if ((i < height) || (j < width)) - edit_script_indel_len 
(a->script, &opIx, &i, &j); - - s = score_match (currParams->scoring, - currParams->seq1, beg1-1+prevI, - currParams->seq2, beg2-1+prevJ, - run); - print_match (beg1-1+prevI, beg2-1+prevJ, run, s, a->hspId); - } - } - - } - -//---------- -// -// print_job_header, print_job_footer, print_header, print_align_list, -// print_match, print_comment_open, print_comment_close, print_eof_comment-- -// -//---------- - -void print_job_header (void) - { - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_job_header - (currParams->outputFile, program_name(), - currParams->seq1->filename, currParams->seq2->filename); - break; - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_job_header - (currParams->outputFile, program_name(), - currParams->seq1->filename, currParams->seq2->filename, currParams->args, - currParams->scoring, &currParams->hspThreshold, &currParams->gappedThreshold, - currParams->dynamicMasking, - /*withExtras*/ (outputFormat==fmtLavComment), - currParams->xDrop, currParams->yDrop); - if (outputFormat == fmtLavText) - goto text_format; - if (outputFormat == fmtLavInfScores) - goto inf_scores_format; - break; - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - print_axt_job_header - (currParams->outputFile, program_name(), currParams->args, - currParams->scoring, - &currParams->hspThreshold, &currParams->gappedThreshold, - currParams->xDrop, currParams->yDrop); - break; - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - print_maf_job_header - (currParams->outputFile, program_name(), currParams->args, - currParams->scoring, - &currParams->hspThreshold, &currParams->gappedThreshold, - currParams->xDrop, currParams->yDrop, - (outputFormat != fmtMafNoComment)); - break; - case fmtSoftSam: - case fmtHardSam: - print_sam_job_header (currParams->outputFile,currParams->readGroup); - break; - case fmtSoftSamNoHeader: - 
case fmtHardSamNoHeader: - ; // (do nothing) - break; - case fmtCigar: - print_cigar_job_header (currParams->outputFile); - break; - case fmtGenpaf: - print_genpaf_job_header (currParams->outputFile, currParams->outputInfo); - break; - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - print_genpaf_job_header (NULL, NULL); - break; - case fmtGenpafBlast: - print_blast_job_header (currParams->outputFile); - break; - case fmtGenpafBlastNoHeader: - ; // (do nothing) - break; - case fmtText: - case fmtZeroText: - text_format: - print_text_align_job_header - (currParams->outputFile, program_name(), - currParams->seq1->filename, currParams->seq2->filename, - (outputFormat!=fmtZeroText)); - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - print_align_diffs_job_header - (currParams->outputFile, program_name(), - currParams->seq1->filename, currParams->seq2->filename); - break; - case fmtInfStats: - init_inference_stats_job (currParams->seq1, currParams->seq2); - break; - case fmtIdDist: - init_identity_dist_job (currParams->seq1, currParams->seq2); - break; - case fmtInfScores: - inf_scores_format: - ; // (do nothing) - break; - case fmtHspComp: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_job_header, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - } - -void print_job_footer (void) - { - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_job_footer (currParams->outputFile); - break; - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_job_footer (currParams->outputFile); - if (outputFormat == fmtLavText) - goto text_format; - if (outputFormat == fmtLavInfScores) - goto inf_scores_format; - break; - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - print_axt_job_footer (currParams->outputFile); - break; - case 
fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - print_maf_job_footer (currParams->outputFile); - break; - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - ; // (do nothing) - break; - case fmtCigar: - print_cigar_job_footer (currParams->outputFile); - break; - case fmtGenpaf: - print_genpaf_job_footer (currParams->outputFile); - break; - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - ; // (do nothing) - break; - case fmtGenpafBlast: - print_blast_job_footer (currParams->outputFile); - break; - case fmtGenpafBlastNoHeader: - ; // (do nothing) - break; - case fmtText: - case fmtZeroText: - text_format: - print_text_align_job_footer (currParams->outputFile); - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - print_align_diffs_job_footer (currParams->outputFile); - break; - case fmtInfStats: - print_inference_stats_job (currParams->outputFile); - break; - case fmtIdDist: - print_identity_dist_job (currParams->outputFile); - break; - case fmtInfScores: - inf_scores_format: - ; // (do nothing) - break; - case fmtHspComp: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_job_footer, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - } - -void print_header (void) - { - static char* prevName1 = NULL; - static char* prevName2 = NULL; - static char prevNameBuff1[maxSequenceName+1]; - static char prevNameBuff2[maxSequenceName+1]; - char* name1, *name2; - int outputFormat = currParams->outputFormat; - - if (prevName1 == NULL) - { prevName1 = prevNameBuff1; prevNameBuff1[0] = 0; } - if (prevName2 == NULL) - { prevName2 = prevNameBuff2; prevNameBuff2[0] = 0; } - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case 
fmtLavInfScores: - print_lav_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - if (outputFormat == fmtLavText) - goto text_format; - if (outputFormat == fmtLavInfScores) - goto inf_scores_format; - break; - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - print_axt_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - print_maf_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtSoftSam: - case fmtHardSam: - print_sam_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtSoftSamNoHeader: - case fmtHardSamNoHeader: - ; // (do nothing) - break; - case fmtCigar: - print_cigar_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtGenpaf: - print_genpaf_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtGenpafNoHeader: - ; // (do nothing) - break; - case fmtGenpafNameHeader: - { - name1 = name2 = NULL; - if (currParams->seq1->partition.p == NULL) // sequence 1 is not partitioned - name1 = (currParams->seq1->useFullNames)? currParams->seq1->header - : currParams->seq1->shortHeader; - if (currParams->seq2->partition.p == NULL) // sequence 2 is not partitioned - name2 = (currParams->seq1->useFullNames)? 
currParams->seq2->header - : currParams->seq2->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - if ((strcmp (name1, prevName1) != 0) - || (strcmp (name2, prevName2) != 0)) - { - if (strcmp(currParams->outputInfo,genpafRDotplotScoreKeys) == 0) - fprintf (currParams->outputFile, "%s\t%s\tscore\n", name1, name2); - else - fprintf (currParams->outputFile, "%s\t%s\n", name1, name2); - strncpy (/*to*/ prevName1, /*from*/ name1, sizeof(prevNameBuff1)); - strncpy (/*to*/ prevName2, /*from*/ name2, sizeof(prevNameBuff2)); - } - } - break; - case fmtGenpafBlast: - print_blast_header - (currParams->outputFile, program_name(), currParams->args, - currParams->seq1, currParams->seq2); - break; - case fmtGenpafBlastNoHeader: - ; // (do nothing) - break; - case fmtText: - case fmtZeroText: - text_format: - print_text_align_header (currParams->outputFile, - currParams->seq1, currParams->seq2, - (outputFormat!=fmtZeroText)); - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - print_align_diffs_header (currParams->outputFile, - currParams->seq1, currParams->seq2); - break; - case fmtHspComp: - case fmtInfStats: - case fmtInfScores: - inf_scores_format: - ; // (do nothing) - break; - case fmtIdDist: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_header, outputFormat=%d", outputFormat); - } - - if (currParams->dotplotFile != NULL) - { - name1 = name2 = NULL; - if (currParams->seq1->partition.p == NULL) // sequence 1 is not partitioned - name1 = (currParams->seq1->useFullNames)? currParams->seq1->header - : currParams->seq1->shortHeader; - if (currParams->seq2->partition.p == NULL) // sequence 2 is not partitioned - name2 = (currParams->seq1->useFullNames)? 
currParams->seq2->header - : currParams->seq2->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - if ((strcmp (name1, prevName1) != 0) - || (strcmp (name2, prevName2) != 0)) - { - if (strcmp(currParams->dotplotKeys,genpafRDotplotScoreKeys) == 0) - fprintf (currParams->dotplotFile, "%s\t%s\tscore\n", name1, name2); - else - fprintf (currParams->dotplotFile, "%s\t%s\n", name1, name2); - strncpy (/*to*/ prevName1, /*from*/ name1, sizeof(prevNameBuff1)); - strncpy (/*to*/ prevName2, /*from*/ name2, sizeof(prevNameBuff2)); - } - } - } - -void print_align_list (alignel* alignList) - { - int outputFormat = currParams->outputFormat; - alignel* a; - - if ((currParams->searchLimit > 0) - && (printedForQuery >= currParams->searchLimit)) - return; - printedForQuery++; - - if (!strandHeaderPrinted) - { print_header (); strandHeaderPrinted = true; } - - if (infer_scores_dbgShowIdentity) - { - unspos numer, denom; - u32 bin; - - for (a=alignList ; a!=NULL ; a=a->next) - { - alignment_identity (currParams->seq1, currParams->seq2, a, - &numer, &denom); - bin = identity_bin (numer, denom); - // nota bene: positions written as 1-based - print_generic (currParams->outputFile, - unsposSlashFmt - " pct_identity=" unsposSlashFmt - " (bin as " identityBinFormat ")", - a->beg1, a->beg2, - numer, denom, - bin_to_identity (bin)); - } - } - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_align_list (currParams->outputFile, - (outputFormat == fmtGfa)? 
currParams->scoring - : NULL, - alignList, - currParams->seq1, currParams->seq2); - break; - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavInfScores: - print_lav_align_list (currParams->outputFile, - alignList, - currParams->seq1, currParams->seq2); - if (outputFormat == fmtLavInfScores) - goto inf_scores_format; - break; - case fmtLavText: - for (a=alignList ; a!=NULL ; a=a->next) - { - print_lav_align (currParams->outputFile, - a->seq1, a->beg1-1, a->end1, - a->seq2, a->beg2-1, a->end2, - a->script, a->s); - print_text_align_align (currParams->outputFile, - currParams->seq1, a->beg1-1, a->end1, - currParams->seq2, a->beg2-1, a->end2, - a->script, a->s, - false, currParams->textContext); - } - break; - case fmtAxt: - case fmtAxtComment: - print_axt_align_list (currParams->outputFile, alignList, - currParams->seq1, currParams->seq2, - /* comments */ outputFormat==fmtAxtComment, - /* extras */ NULL); - break; - case fmtAxtGeneral: - print_axt_align_list (currParams->outputFile, alignList, - currParams->seq1, currParams->seq2, - /* comments */ false, - /* extras */ currParams->outputInfo); - break; - case fmtMaf: - case fmtMafNoComment: - print_maf_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - /* comments */ false); - break; - case fmtMafComment: - print_maf_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - /* comments */ true); - break; - case fmtSoftSam: - case fmtSoftSamNoHeader: - print_sam_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - /* softMasking */ true, - currParams->samRGTags); - break; - case fmtHardSam: - case fmtHardSamNoHeader: - print_sam_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - /* softMasking */ false, - currParams->samRGTags); - break; - case fmtCigar: - print_cigar_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - /* withInfo */ 
true, - /* markMismatches */ false, - /* letterAfter */ false, - /* hideSingles */ false, - /* lowerCase */ false, - /* withNewLine */ true); - break; - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - print_genpaf_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - currParams->outputInfo); - break; - case fmtText: - case fmtZeroText: - print_text_align_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - (outputFormat!=fmtZeroText), - currParams->textContext); - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - print_align_diffs_align_list (currParams->outputFile, - alignList, currParams->seq1, currParams->seq2, - (outputFormat == fmtDiffs), - currParams->nIsAmbiguous); - break; - case fmtInfStats: - infer_stats_from_align_list (alignList, currParams->seq1, currParams->seq2); - break; - case fmtInfScores: - inf_scores_format: - gather_stats_from_align_list (alignList, currParams->seq1, currParams->seq2); - break; - case fmtIdDist: - identity_dist_from_align_list (alignList, currParams->seq1, currParams->seq2); - break; - case fmtHspComp: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_align_list, outputFormat=%d", outputFormat); - } - - if (currParams->dotplotFile != NULL) - print_genpaf_align_list_segments (currParams->dotplotFile, - alignList, currParams->seq1, currParams->seq2, - currParams->dotplotKeys, - currParams->scoring); - } - -void print_match (unspos pos1, unspos pos2, unspos length, score s, u64 hspId) - // pos1 and pos2 are the positions of first character in the match, - // .. (origin-0). 
- { - static u32 printsUntilFlush = matchFlushFrequency; - int outputFormat = currParams->outputFormat; - - if ((currParams->searchLimit > 0) - && (printedForQuery >= currParams->searchLimit)) - return; - printedForQuery++; - - if (!strandHeaderPrinted) - { print_header (); strandHeaderPrinted = true; } - - if (infer_scores_dbgShowIdentity) - { - unspos numer, denom; - u32 bin; - - segment_identity (currParams->seq1, pos1, currParams->seq2, pos2, length, - &numer, &denom); - bin = identity_bin (numer, denom); - // nota bene: positions written as 1-based - print_generic (currParams->outputFile, - unsposSlashFmt - " pct_identity=" unsposSlashFmt - " (bin as " identityBinFormat ")", - pos1+1, pos2+1, - numer, denom, - bin_to_identity (bin)); - } - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - (outputFormat == fmtGfa)? s : 0); - break; - case fmtLav: - case fmtLavComment: - case fmtLavText: - case fmtLavInfScores: - print_lav_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s); - if (outputFormat == fmtLavText) - goto text_format; - if (outputFormat == fmtLavInfScores) - goto inf_scores_format; - break; - case fmtLavScore: - print_lavscore_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s); - break; - case fmtAxt: - case fmtAxtComment: - print_axt_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - /* comments */ outputFormat==fmtAxtComment, - /* extras */ NULL); - break; - case fmtAxtGeneral: - print_axt_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - /* comments */ false, - /* extras */ currParams->outputInfo); - break; - case fmtMaf: - case fmtMafNoComment: - print_maf_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, 
/* comments */ false); - break; - case fmtMafComment: - print_maf_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, /* comments */ true); - break; - case fmtSoftSam: - case fmtSoftSamNoHeader: - print_sam_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - /* softMasking */ true, - currParams->samRGTags); - break; - case fmtHardSam: - case fmtHardSamNoHeader: - print_sam_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - /* softMasking */ false, - currParams->samRGTags); - break; - case fmtCigar: - print_cigar_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - /* withInfo */ true, - /* markMismatches */ false, - /* letterAfter */ false, - /* hideSingles */ false, - /* lowerCase */ false, - /* withNewLine */ true); - break; - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - print_genpaf_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, hspId, currParams->outputInfo); - break; - case fmtText: - case fmtZeroText: - text_format: - print_text_align_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, - (outputFormat!=fmtZeroText), - currParams->textContext); - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - print_align_diffs_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - (outputFormat == fmtDiffs), - currParams->nIsAmbiguous); - break; - case fmtHspComp: - print_match_composition (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, currParams->hitSeed, currParams->step); - break; - case fmtInfStats: - infer_stats_from_match (currParams->seq1, pos1, - currParams->seq2, pos2, length); - break; - case fmtInfScores: - inf_scores_format: - 
gather_stats_from_match (currParams->seq1, pos1, - currParams->seq2, pos2, length); - break; - case fmtIdDist: - identity_dist_from_match (currParams->seq1, pos1, - currParams->seq2, pos2, length); - break; - case fmtDeseed: - dump_match (currParams->outputFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length); - printf ("\n"); - break; - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_match, outputFormat=%d", outputFormat); - } - - if (currParams->dotplotFile != NULL) - print_genpaf_match (currParams->dotplotFile, - currParams->seq1, pos1, - currParams->seq2, pos2, length, - s, (u64) 0, currParams->dotplotKeys); - - if (--printsUntilFlush == 0) - { - fflush (currParams->outputFile); - printsUntilFlush = matchFlushFrequency; - } - } - - -char* print_comment_open (void) - { - int outputFormat = currParams->outputFormat; - char* commentPrefix = NULL; - - switch (outputFormat) - { - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_comment_open (currParams->outputFile); - break; - case fmtGfa: - case fmtGfaNoScore: - commentPrefix = "#"; - break; - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - commentPrefix = "#"; - break; - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - fprintf (stderr, "WARNING. Output is not properly MAF format\n"); - commentPrefix = "#"; - break; - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - fprintf (stderr, "WARNING. Output is not properly SAM format\n"); - commentPrefix = "#"; - break; - case fmtCigar: - fprintf (stderr, "WARNING. 
Output is not properly CIGAR format\n"); - commentPrefix = "#"; - break; - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - commentPrefix = "#"; - break; - case fmtText: - case fmtZeroText: - ; // (do nothing) - break; - case fmtHspComp: - commentPrefix = "#"; - break; - case fmtDiffs: - case fmtDiffsNoBlocks: - ; // (do nothing) - break; - case fmtInfStats: - commentPrefix = "#"; - break; - case fmtInfScores: - commentPrefix = "#"; - break; - case fmtIdDist: - commentPrefix = "#"; - break; - case fmtDeseed: - commentPrefix = "#"; - break; - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_comment_open, outputFormat=%d", outputFormat); - } - - return commentPrefix; - } - - -void print_comment_close (void) - { - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_comment_close (currParams->outputFile); - break; - case fmtGfa: - case fmtGfaNoScore: - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - case fmtCigar: - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - case fmtText: - case fmtZeroText: - case fmtHspComp: - case fmtDiffs: - case fmtDiffsNoBlocks: - case fmtInfStats: - case fmtInfScores: - case fmtIdDist: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_comment_close, outputFormat=%d", outputFormat); - } - } - - -void print_eof_comment (void) - { - int outputFormat = currParams->outputFormat; - - if (outputFormat != fmtNone) - fprintf (currParams->outputFile, "# lastz end-of-file\n"); - } - - -void 
print_m_stanza (census* cen) - { // note that census might be NULL - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_m_stanza (currParams->outputFile, cen); - break; - case fmtGfa: - case fmtGfaNoScore: - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - case fmtCigar: - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - case fmtText: - case fmtZeroText: - case fmtHspComp: - case fmtDiffs: - case fmtDiffsNoBlocks: - case fmtInfStats: - case fmtInfScores: - case fmtIdDist: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_m_stanza, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - } - -void print_census_stanza (census* cen) - { - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_census_stanza (currParams->outputFile, cen); - break; - case fmtGfa: - case fmtGfaNoScore: - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - case fmtCigar: - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - case fmtText: - case fmtZeroText: - case fmtHspComp: - case fmtDiffs: - case fmtDiffsNoBlocks: - case fmtInfStats: - case fmtInfScores: - case fmtIdDist: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - 
suicidef ("internal error, in print_census_stanza, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - } - -void print_x_stanza (unspos numMasked) - { - int outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - print_gfa_generic (currParams->outputFile, - 'x', "num_masked=" unsposFmt, numMasked); - break; - case fmtLav: - case fmtLavComment: - case fmtLavScore: - case fmtLavText: - case fmtLavInfScores: - print_lav_x_stanza (currParams->outputFile, numMasked); - break; - case fmtAxt: - case fmtAxtComment: - case fmtAxtGeneral: - case fmtMaf: - case fmtMafComment: - case fmtMafNoComment: - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - case fmtCigar: - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - case fmtText: - case fmtZeroText: - case fmtHspComp: - case fmtDiffs: - case fmtDiffsNoBlocks: - case fmtInfStats: - case fmtIdDist: - case fmtDeseed: - print_generic (currParams->outputFile, - "num_masked=" unsposFmt, numMasked); - break; - case fmtInfScores: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_x_stanza, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - } - -void print_generic - (FILE* f, - const char* format, - ...) 
- { - int outputFormat; - va_list args; - - va_start (args, format); - - outputFormat = currParams->outputFormat; - - switch (outputFormat) - { - case fmtGfa: - case fmtGfaNoScore: - vprint_gfa_generic (f, 'z', format, args); - break; - case fmtLavComment: - vprint_lav_comment (f, format, args); - break; - case fmtLavText: - vprint_lav_comment (f, format, args); - if (format != NULL) - { - va_end (args); - va_start (args, format); - vfprintf (f, format, args); - fprintf (f, "\n"); - } - break; - case fmtAxtComment: - vprint_axt_comment (f, format, args); - break; - case fmtMafComment: - vprint_maf_comment (f, format, args); - break; - case fmtText: - case fmtZeroText: - if (format != NULL) - { - vfprintf (f, format, args); - fprintf (f, "\n"); - } - break; - case fmtLav: - case fmtLavScore: - case fmtLavInfScores: - case fmtAxt: - case fmtAxtGeneral: - case fmtMaf: - case fmtMafNoComment: - case fmtSoftSam: - case fmtSoftSamNoHeader: - case fmtHardSam: - case fmtHardSamNoHeader: - case fmtCigar: - case fmtGenpaf: - case fmtGenpafNoHeader: - case fmtGenpafNameHeader: - case fmtGenpafBlast: - case fmtGenpafBlastNoHeader: - case fmtHspComp: - case fmtDiffs: - case fmtDiffsNoBlocks: - case fmtInfStats: - case fmtInfScores: - case fmtIdDist: - case fmtDeseed: - case fmtNone: - ; // (do nothing) - break; - default: - suicidef ("internal error, in print_generic, outputFormat=%d", outputFormat); - } - -// if (currParams->dotplotFile != NULL) -// ; // (do nothing) - - va_end (args); - } - -//---------- -// -// print_match_composition-- -// Print a gap-free alignment including position and composition (counts of -// matched dna letter pairs). -// -// Typical output is shown below, with a header added. The first letter of the -// pairs is from sequence 1, the second from sequence 2. P is the 'discovery -// probability'-- the probability that this HSP would be discovered for this -// (seed,Z) combination, over random sequence positions. 
-// -// id score pos1/pos2 len p AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT -// 92 121 1475+/1395- 145 .750 27 0 4 0 1 38 2 0 3 0 45 0 1 1 0 23 -// 88 28 3374+/4837- 42 .200 18 0 0 0 1 7 0 0 2 1 7 1 0 0 0 5 -// ... -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The first aligned position in sequence 1. -// seq* seq2: The other sequence. -// unspos pos2: The first aligned position in sequence 2. -// unspos length: The length of the alignment. -// seed* hitSeed: Seeding strategy for the hits that found this match. -// u32 step: Positional step size in the search for those hits. -// -// Returns: -// (nothing) -// -//---------- - -static void print_match_composition - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - seed* hitSeed, - u32 step) - { - int pctId; - unspos count[4][4]; - float p; - char pstr[6]; - int ix, iy; - - // compute percent identity, match compostion, and discovery probability - - pctId = percent_identical (seq1, pos1, seq2, pos2, length); - match_composition (seq1, pos1, seq2, pos2, length, count); - p = discovery_probability (seq1, pos1+length, seq2, pos2+length, length, - hitSeed, step); - - // convert discovery probability to a string - - if (p < 0.0) p = 0.0; - else if (p > 1.0) p = 1.0; - - snprintf (pstr, sizeof(pstr), "%.3f", p); - if (pstr[0] == '1') // (1.000 -> 1.00) - pstr[4] = 0; - else // (0.XXX -> .XXX) - { - pstr[0] = pstr[1]; - pstr[1] = pstr[2]; - pstr[2] = pstr[3]; - pstr[3] = pstr[4]; - pstr[4] = 0; - } - - // print it - - fprintf (f, "%d " scoreFmtSimple " " unsposSlashSFmt " " unsposFmt " %s", - pctId, s, - pos1+1, ((seq1->revCompFlags & rcf_rev) != 0)? "-" : "+", - pos2+1, ((seq2->revCompFlags & rcf_rev) != 0)? 
"-" : "+", - length, pstr); - - for (ix=0 ; ix<4 ; ix++) - for (iy=0 ; iy<4 ; iy++) - fprintf (f, " " unsposFmt, count[ix][iy]); - - fprintf (f, "\n"); - } - -//---------- -// -// dump_match-- -// Dump the nucleotides (from each sequence) for a gap-free alignment. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The first aligned position in sequence 1. -// seq* seq2: The other sequence. -// unspos pos2: The first aligned position in sequence 2. -// unspos length: The length of the alignment. -// -// Returns: -// (nothing) -// -//---------- - -static void dump_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - print_prefix (f, (char*) seq1->v + pos1, length); - fprintf (f, "\n"); - print_prefix (f, (char*) seq2->v + pos2, length); - fprintf (f, "\n"); - } - -//---------- -// -// program_name-- -// Determnine the name of this program. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// A string describing the name of this program. This may point to static -// memory belonging to this routine, or it may point to memory in global -// memory space. But in any case, the caller should *not* deallocate the -// returned pointer. 
-// -//---------- - -static char* program_name - (void) - { - static char _progName[101]; - int n; - - n = snprintf (NULL, 0, "%s.v%s.%s.%s", - programName, programVersionMajor, programVersionMinor, programVersionSubMinor); - if (((unsigned) n) < sizeof(_progName)) - { - sprintf (_progName, - "%s.v%s.%s.%s", - programName, programVersionMajor, programVersionMinor, programVersionSubMinor); - return _progName; - } - else - return programName; - } - diff --git a/programs/lastz/src/output.h b/programs/lastz/src/output.h deleted file mode 100644 index c865ae1..0000000 --- a/programs/lastz/src/output.h +++ /dev/null @@ -1,113 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: output.h -// -//---------- - -#ifndef output_H // (prevent multiple inclusion) -#define output_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "masking.h" // dynamic masking stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// data structures and types -// -//---------- - -// internal codes for output formats -// -// nota bene: The entries in formatNames[] must 'line up with' the entries in -// this enum. Thus, if additional output types are added, they must be added -// to both the enum and to formatNames. fmt_max is not a real format, but can -// be used in the code to determine the largest index into formatNames. -// -// Unfortunately, I haven't figured out how to get the compiler to check for -// any inconsistencies here. - -enum - { - fmtBad = -1, - fmt_min = 0, - fmtGfa = fmt_min, // output alignments in GFA format - fmtGfaNoScore, // .. in GFA format but without scores - fmtLav, // .. in LAV format - fmtLavComment, // .. in LAV format with comments - fmtLavScore, // .. in LAV format with scores where pctid is - fmtLavText, // .. in LAV format with as-text - fmtAxt, // .. in AXT format - fmtAxtComment, // .. 
in AXT format with comments - fmtAxtGeneral, // .. in AXT format with extra (general) fields - fmtMaf, // .. in MAF format - fmtMafComment, // .. in MAF format with comments - fmtMafNoComment, // .. in MAF format with no comments at all - fmtSoftSam, // .. in SAM format, soft masking - fmtSoftSamNoHeader, // .. in SAM format, soft masking, no header - fmtHardSam, // .. in SAM format, hard masking - fmtHardSamNoHeader, // .. in SAM format, hard masking, no header - fmtCigar, // .. in standard CIGAR format - fmtGenpaf, // .. in 'standard' GENPAF format - fmtGenpafNoHeader, // .. in 'standard' GENPAF format, no header - fmtGenpafNameHeader, // .. in GENPAF format with names header line - fmtGenpafBlast, // .. in 'standard' BLASTN format - fmtGenpafBlastNoHeader, // .. in 'standard' BLASTN format, no header - fmtText, // .. as text - fmtZeroText, // .. as text (zero-based) - fmtHspComp, // .. as text, showing composition of each HSP - fmtDiffs, // .. as alignment differences - fmtDiffsNoBlocks, // .. as alignment differences, without blocks - fmtInfStats, // .. as scoring inference stats - fmtIdDist, // .. as identity distribution - fmtDeseed, // .. as text for deseed program - fmtInfScores, // .. collect scoring inference stats (this - // .. cannot be directly chosen by the user) - fmtLavInfScores, // .. 
fmtLav + fmtInfScores (debugging only) - fmtNone, // don't bother to output - fmt_max = fmtNone - }; - -#ifdef output_owner -char* formatNames[] = {"GFA","GFANOSCORE", - "LAV","lav+","LAVSCORE","lav+text", - "AXT","axt+",NULL,"MAF","maf+","maf-", - "sam","sam-","hardsam","hardsam-","cigar", - "general","general-",NULL,"blastn","blastn-", - "text", "ztext", "comp", "diffs", "diffs-", - "infstats","iddist","deseed", - "infscores","lav+infscores", - "none" }; -#else -extern char* formatNames[]; -#endif - -//---------- -// -// prototypes for routines in output.c -// -//---------- - -void init_output_for_query (void); -void init_output_for_strand (void); -void print_align_list_segments (alignel* alignList); -void print_job_header (void); -void print_job_footer (void); -void print_header (void); -void print_align_list (alignel* alignList); -char* print_comment_open (void); -void print_comment_close (void); -void print_eof_comment (void); -void print_match (unspos pos1, unspos pos2, unspos length, - score s, u64 hspId); -void print_m_stanza (census* cen); -void print_census_stanza (census* cen); -void print_x_stanza (unspos numMasked); -void print_generic (FILE* f, const char* format, ...); - -#undef global -#endif // output_H diff --git a/programs/lastz/src/pos_table.c b/programs/lastz/src/pos_table.c deleted file mode 100755 index b71fdde..0000000 --- a/programs/lastz/src/pos_table.c +++ /dev/null @@ -1,2249 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: pos_table.c -// -//---------- -// -// pos_table-- -// Support for creating a table of positions of words in genomic sequences. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C math stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed matching stuff - -#define pos_table_owner // (make this the owner of its globals) -#include "pos_table.h" // interface to this module - -// debugging defines - -//#define debugTablePos1 111 // if defined, breakdown what happens with this - // .. position in sequence 1 (this is the right - // .. end of the word; the left end is at - // .. X-(seedLength-1) counting from 1 at the - // .. start of the sequence - -//---------- -// -// private data -// -//---------- - -typedef struct dumpinfo - { - seed* seed; // the seed used to pack words - u8* bitsToAlphabet; // mapping from 2-bit values to characters (e.g. - // .. 
"ACGT") - } dumpinfo; - -//---------- -// -// stats to augment crude profiling -// -//---------- - -#ifndef dbgTiming -#define dbg_timing_set_stat(field,val) ; -#define dbg_timing_count_stat(field) ; -#define dbg_timing_report_stat(field,name) ; -#endif // not dbgTiming - -#ifdef dbgTiming -struct - { - int wordsInTable; - } posTableTimingStats; - -#define dbg_timing_set_stat(field,val) (posTableTimingStats.field = val) -#define dbg_timing_count_stat(field) ++posTableTimingStats.field -#define dbg_timing_report_stat(field,name) fprintf(stderr,"%-26s %d\n", \ - name":",posTableTimingStats.field) -#endif // dbgTiming - -//---------- -// -// prototypes for private functions -// -//---------- - -static void record_seed_positions - (postable* pt, seq* seq, - const s8 upperCharToBits[], seed* hitSeed); -static void record_seed_positions_halfweight - (postable* pt, seq* seq, - const s8 upperCharToBits[], seed* hitSeed); -static void record_seed_positions_bits - (postable* pt, seq* seq, - const s8 upperCharToBits[], seed* hitSeed); -static void record_seed_positions_quantum - (postable* pt, seq* seq, - const charvec qToBest[], seed* hitSeed); -static void mask_seed_positions - (postable* pt, seq* seq, unspos start, unspos end, - const s8 upperCharToBits[], seed* hitSeed); -static void mask_seed_positions_halfweight - (postable* pt, seq* seq, unspos start, unspos end, - const s8 upperCharToBits[], seed* hitSeed); -static int position_is_in_table (postable* pt, unspos position); -static void add_word (postable* pt, u32 word, unspos position); -static void remove_word (postable* pt, u32 word, unspos position); -static void dump_word_position - (FILE* f, postable* pt, int field, u64 fieldVal); -static void dump_seed_position - (FILE* f, postable* pt, int field, u64 fieldVal); -static void dump_quantum_seed_position - (FILE* f, postable* pt, int field, u64 fieldVal); - -//---------- -// -// build_seed_position_table-- -// Create a table of the positions of all seed-words in an 
interval of a -// sequence. The basic idea of a word is a series of W consecutive bases, but -// it is generalized to that of a seed containing W (specific) bits over L -// bases. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to build the position table of. -// unspos start: First sequence position to consider. Zero is -// .. the first possible position. -// unspos end: One past the last sequence position to consider. -// .. If this is zero, the sequence length is used. -// s8 upperCharToBits[]: Table to map sequence characters to two-bit -// .. values, and illegal characters to -1. -// seed* hitSeed: The seed-word to base the table on. -// u32 step: Positional step size indicating the granularity -// .. of the positions stored. For example, step=5 -// .. means only every 5th position will be -// .. stored. Step=1 means all positions are -// .. stored. -// -// Returns: -// A pointer to a newly allocated table of positions; failures result in -// program fatality. The caller must eventually dispose of the table, with a -// call to free_position_table(). 
-// -//---------- - -postable* build_seed_position_table - (seq* seq, - unspos start, - unspos end, - const s8 upperCharToBits[], - seed* hitSeed, - u32 step) - { - postable* pt; - dumpinfo di; - - // sanity check - - if (step < 1) - suicidef ("in build_seed_position_table(), step can't be %u", step); - - if (end == 0) - end = seq->len; - - if (end <= start) - suicidef ("in build_seed_position_table(), interval is void (" unsposDotsFmt ")", - start, end); - - if (end > seq->len) - suicidef ("in build_seed_position_table(), interval end is bad (" unsposFmt ">" unsposFmt ")", - end, seq->len); - - // create an empty table - - pt = new_position_table (hitSeed->weight, start, end, step, - true, true, (hitSeed->type == 'R')); - - // install dumper - - pt->dump = (posdumper) dump_seed_position; - pt->dumpInfo = &di; - di.seed = hitSeed; - di.bitsToAlphabet = NULL; - - // fill the table - - if (hitSeed->isHalfweight) - record_seed_positions_halfweight (pt, seq, upperCharToBits, hitSeed); - else if (pt->asBits != NULL) - record_seed_positions_bits (pt, seq, upperCharToBits, hitSeed); - else - record_seed_positions (pt, seq, upperCharToBits, hitSeed); - - return pt; - } - -//---------- -// -// build_quantum_seed_position_table-- -// Create a table of the positions of all seed-words in an interval of a -// sequence (similar to build_seed_position_table). Here a word consists of -// W quantum bases, which is reduced to the closest (highest scoring) word in -// the bottleneck alphabet (which can be thought of as A, C, G, T; see note -// below). -// -//---------- -// -// Arguments: -// seq* seq: The sequence to build the position table of. -// unspos start: First sequence position to consider. Zero is -// .. the first possible position. -// unspos end: One past the last sequence position to consider. -// .. If this is zero, the sequence length is used. -// u8* bottleneck: The bottleneck alphabet. -// charvec qToBest[]: Table to map a quantum character to the two-bit -// .. 
code(s) for the 'closest' bottleneck -// .. character(s). -// seed* hitSeed: The seed-word to base the table on. -// u32 step: Positional step size indicating the granularity -// .. of the positions stored. For example, step=5 -// .. means only every 5th position will be -// .. stored. Step=1 means all positions are -// .. stored. -// -// Returns: -// A pointer to a newly allocated table of positions; failures result in -// program fatality. The caller must eventually dispose of the table, with a -// call to free_position_table(). -// -//---------- -// -// (1) The bottleneck alphabet is invisible to this routine. Its effect is -// completely described by the qToBits[] table. -// -//---------- - -postable* build_quantum_seed_position_table - (seq* seq, - unspos start, - unspos end, - u8* bottleneck, - const charvec qToBest[], - seed* hitSeed, - u32 step) - { - postable* pt; - dumpinfo di; - - // sanity check - - if (step < 1) - suicidef ("in build_quantum_seed_position_table(), step can't be %u", step); - - if (end == 0) - end = seq->len; - - if (end <= start) - suicidef ("in build_quantum_seed_position_table(), interval is void (" unsposDotsFmt ")", - start, end); - - if (end > seq->len) - suicidef ("in build_quantum_seed_position_table(), interval end is bad (" unsposFmt ">" unsposFmt ")", - end, seq->len); - - if (hitSeed->type != 'S') - suicide ("(internal error in build_quantum_seed_position_table: strict seeds only)\n"); - - // create an empty table - - pt = new_position_table (hitSeed->weight, start, end, step, - true, true, (hitSeed->type == 'R')); - - // install dumper - - pt->dump = (posdumper) dump_quantum_seed_position; - pt->dumpInfo = &di; - di.seed = hitSeed; - di.bitsToAlphabet = bottleneck; - - // fill the table - - record_seed_positions_quantum (pt, seq, qToBest, hitSeed); - - return pt; - } - -//---------- -// -// record_seed_positions, -// record_seed_positions_halfweight, -// record_seed_positions_bits-- -// record_seed_positions_quantum-- -// 
Record the positions of all spaced-seed words in (a subinterval of) a -// sequence. The subinterval is defined by (pt->start,pt->end). The only -// difference between these versions is -// - the normal version encodes at two bits per nucleotide (before packing) -// - the half-weight version encodes only one bit per nucleotide -// - the bits version encodes as two bits and also makes a copy of the -// the sequence as a bit stream -// - the quantum version is just like the normal version, but breaks ties -// when mapping quantum characters to bit pairs -// -//---------- -// -// Arguments: -// postable* pt: The position table in which to record the -// .. positions. -// seq* seq: The sequence to build the position table of. -// s8 upperCharToBits[]: (see note 2) Table to map sequence characters -// .. to two-bit values,and illegal characters to -// .. -1. -// seed* hitSeed: The seed-word to base the table on. -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// -// (1) record_seed_positions_halfweight is dependent on the specific 2-bit -// encoding of nucleotides, which is defined (implicitly) in -// dna_utilities.c. We assume that the least significant of the two bits -// distinguishes between purines and pyramidines. -// -// (2) record_seed_positions_quantum replaces the upperCharToBits argument with -// qToBest: -// -// charvec qToBest[]: Table to map a quantum character to the two-bit -// .. code(s) for the 'closest' bottleneck -// .. character(s). -// -//---------- -// -// Normally we slide a 64-bit window along the sequence and collect a -// bit-packed version of the nucleotides in that window. 64 bits corresponds -// to 32 bases, or, for record_seed_positions_halfweight, 64 bases. When we -// have accumulated enough bits to satsify the seed, we pack them according to -// the seed and record the word/position pair in the table. -// -// Words that contain 'illegal' bases are excluded from the table. 
This is -// accomplished by restarting the collection of bits whenever an illegal base -// is encountered. The corresponding positions are never recorded in the table, -// thus their prev values remain zero. All positions recorded in the table have -// prev non-zero. -// -// A step size can be used to limit the number of locations stored in the -// table. This reduces memory needs and also increases overall speed (since -// later processing will have fewer matches to deal with). Only positions that -// are multiples of the step size are stored. For 'short' step sizes (step no -// longer than the seed) all bases are collected, but packing and recording are -// only performed at such positions. -// -// For 'long' step sizes (step longer than seed), we only collect bases that can -// possibly be part of the word for such a location. This is accomplished by -// "skip-ahead", moving the sequence pointer past several useless bases. There -// are two places we should skip ahead. The first is when we have recorded a -// word. The other is when we hit a bad base and restart collection. -// -// In the following examples, step size Z=15 and seed length L=10. oo indicates -// a good base, xx a bad base, and -- a base we don't care about. -// -// In the first case, after we have just recorded a word, we have something like -// this: -// -// oo oo -- -- -- -- --[oo oo oo oo oo oo oo oo oo oo]-- -- -- -- -- oo oo -// 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 -// s * * -// -// When s is 30 we report the word from 20..29. At this point the next word we -// would possibly report would occur when s=45. So we should skip ahead to -// 35. The formula is s' = s + L-Z. 
-// -// In the second case, we have something like this: -// -// -- --[oo oo xx -- -- -- -- -- -- --]-- -- -- -- -- oo oo oo oo oo oo oo -// 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 -// s * -// -// We were scanning along toward 45 and encounted a bad nucleotide at 37 (and s -// has already been incremented to 38). This kills any chance of having a word -// to record at 45. The next possible word will come when s=60, so we need to -// skip ahead to s=50. The formula is s' = s + Z-1 - (s+L-1 mod Z). -// -// Since the result of mod will be in the range 0..Z-1, s' will be in the range -// of s to s+Z-1. We want the result to be -L modulo Z so that the next L bases -// form a word when we reach Z. Below we prove this formula produces s' == -L, -// and since s <= s' < Z, s' is the desired position. -// -// s' == s + Z-1 - (s+L-1 mod Z) modulo Z -// == s - 1 - (s+L-1) -// == s - 1 - s - L + 1 -// == s-s - L + 1-1 -// == -L -// -//---------- - -static void record_seed_positions - (postable* pt, - seq* seq, - const s8 upperCharToBits[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + pt->start; - u8* seqStop = seq->v + pt->end; - u8* s; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - - seedLength = (unsigned) hitSeed->length; - - if (seq->len < seedLength) - return; // (nothing to search for) - - // scan the sequence, adding each packed word to the table - - for (s=seqStart ; s seedLength) // for large steps, skip - { // .. ahead to the next - pos = s - seq->v; // .. viable start position - s = s + (step-1) - ((pos+seedLength-1) % step); - } - - empty_skipped: - w = 0L; - for (nts=1 ; (nts start over - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; s start over - w = (w << 2) | ww; // append next nt - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) // .. 
on step boundary - { -#ifdef debugTablePos1 - if (pos == debugTablePos1) - printf ("seq 1 pos " unsposFmt " not on z-step boundary\n", pos); -#endif - continue; - } - - packed = apply_seed (hitSeed, w); // extract seed bits - add_word (pt, packed, pos); // add it to the table -#ifdef debugTablePos1 - if (pos == debugTablePos1) - printf ("recording %s at seq 1 pos " unsposFmt "\n", - seed_packed_to_string (hitSeed, packed), pos); -#endif - - if (step > seedLength) // for large steps, skip - { // .. directly to the start - s += step - seedLength; // .. of the next possible - goto empty_skipped; // .. word we'll record - } - } - } - - } - - -static void record_seed_positions_halfweight - (postable* pt, - seq* seq, - const s8 upperCharToBits[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + pt->start; - u8* seqStop = seq->v + pt->end; - u8* s; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - - seedLength = (unsigned) hitSeed->length; - - if (seq->len < seedLength) - return; // (nothing to search for) - - // scan the sequence, adding each packed word to the table - - for (s=seqStart ; s seedLength) // for large steps, skip - { // .. ahead to the next - pos = s - seq->v; // .. viable start position - s = s + (step-1) - ((pos+seedLength-1) % step); - } - - empty_skipped: - w = 0L; - for (nts=1 ; (nts start over - w = (w << 1) | (ww & 1); // append next R/Y - } - - // process each word of seedLength nucleotides - - for ( ; s start over - w = (w << 1) | (ww & 1); // append next R/Y - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) continue; // .. on step boundary - - packed = apply_seed (hitSeed, w); // extract seed bits - add_word (pt, packed, pos); // add it to the table - - if (step > seedLength) // for large steps, skip - { // .. directly to the start - s += step - seedLength; // .. of the next possible - goto empty_skipped; // .. 
word we'll record - } - } - } - - } - - -static void record_seed_positions_bits - (postable* pt, - seq* seq, - const s8 upperCharToBits[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + pt->start; - u8* seqStop = seq->v + pt->end; - u8* s; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - u32* tp; - u32 asBits; - int numNts; - - seedLength = (unsigned) hitSeed->length; - - if (seq->len < seedLength) - return; // (nothing to search for) - - // scan the sequence, adding each packed word to the table, and accum- - // ulating the 'top bits' for each nucleotide - // - // notes: (1) 'bad' characters are encoded the same as some good character - // (probably a T); this does not cause problems because no - // position is recorded (in the table) that would use any of - // those bad bits - // (2) unlike the other two record_seed_positions routines, here we - // cannot skip ahead when we have long step sizes, because we - // we miss encoding the intervening nucleotides - - tp = pt->asBits; - asBits = 0; - numNts = (int) (pt->start - pt->adjStart); - - for (s=seqStart ; s start over - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; s start over - w = (w << 2) | ww; // append next nt - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) continue; // .. 
on step boundary - - packed = apply_seed (hitSeed, w); // extract seed bits - add_word (pt, packed, pos); // add it to the table - } - } - - if (numNts > 0) - *tp = asBits << (2*(16-numNts)); - } - - -static void record_seed_positions_quantum - (postable* pt, - seq* seq, - const charvec qToBest[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + pt->start; - u8* seqStop = seq->v + pt->end; - u8* s; - u8 ch; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - int numTied; - - seedLength = (unsigned) hitSeed->length; - - if (seq->len < seedLength) - return; // (nothing to search for) - - // scan the sequence, adding each packed word to the table - - for (s=seqStart ; s seedLength) // for large steps, skip - { // .. ahead to the next - pos = s - seq->v; // .. viable start position - s = s + (step-1) - ((pos+seedLength-1) % step); - } - - empty_skipped: - w = 0L; - for (nts=1 ; (nts start over - if (numTied == 1) - ww = qToBest[ch].v[0]; // map next char - else - ww = qToBest[ch].v[(s-seq->v)%numTied]; // map next char - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; s start over - if (numTied == 1) - ww = qToBest[ch].v[0]; // map next char - else - ww = qToBest[ch].v[(s-seq->v)%numTied]; // map next char - w = (w << 2) | ww; // append next nt - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) // .. on step boundary - { -#ifdef debugTablePos1 - if (pos == debugTablePos1) - printf ("seq 1 pos " unsposFmt " not on z-step boundary\n", pos); -#endif - continue; - } - - packed = apply_seed (hitSeed, w); // extract seed bits - add_word (pt, packed, pos); // add it to the table -#ifdef debugTablePos1 - if (pos == debugTablePos1) - printf ("recording %s at seq 1 pos " unsposFmt "\n", - seed_packed_to_string (hitSeed, packed), pos); -#endif - - if (step > seedLength) // for large steps, skip - { // .. directly to the start - s += step - seedLength; // .. 
of the next possible - goto empty_skipped; // .. word we'll record - } - } - } - - } - -//---------- -// -// mask_seed_position_table-- -// Remove masked seeds from a position table. A masked seed is one that -// contains a masked base. -// -//---------- -// -// Arguments: -// postable* pt: The position table to operate on. -// unspos start,end: The range of sequence positions to consider. Any -// .. seed enclosed in this range is removed from the -// .. table. Origin-0, end-exclusive. If end==0, the -// .. sequence length is used. -// (all other arguments are as per build_seed_position_table) -// -// Returns: -// (nothing) -// -//---------- - -void mask_seed_position_table - (postable* pt, - seq* seq, - unspos start, - unspos end, - const s8 upperCharToBits[], - seed* hitSeed) - { - // sanity check - - if (end == 0) - end = seq->len; - - if (end <= start) - suicidef ("in mask_seed_position_table(), interval is void (" unsposFmt "-" unsposFmt ")", - start, end); - - if (end > seq->len) - suicidef ("in mask_seed_position_table(), interval end is bad (" unsposFmt ">" unsposFmt ")", - end, seq->len); - - pos_table_count_stat (intervalsMasked); - pos_table_add_stat (maskedIntervalBases, end-start); - - // mask the table - // - // note that if the table contains a copy of the the sequence as a bit - // stream (when pt->asBits is non NULL), we are unable to mask the bits in - // that bit stream (because it has just two bits per base and provides no - // way to encode a masked base); this causes no problem, since we remove - // the corresponding seed the bits we don't clear will never be used - - if (hitSeed->isHalfweight) - mask_seed_positions_halfweight (pt, seq, start, end, upperCharToBits, hitSeed); - else - mask_seed_positions (pt, seq, start, end, upperCharToBits, hitSeed); - } - -//---------- -// -// mask_seed_positions, -// mask_seed_positions_halfweight, -// Remove the positions of any masked spaced-seed words in (a subinterval of) -// a sequence. 
The subinterval is defined by the (start,end) arguments, not -// the start,end values in the position table structure. The only -// difference between these versions is -// - the normal version encodes at two bits per nucleotide (before packing) -// - the half-weight version encodes only one bit per nucleotide -// -//---------- -// -// Arguments: -// postable* pt: The position table in which to un-record the -// .. positions. -// seq* seq: The sequence the position table was built for. -// unspos start, end: The interval to check (same meaning as for -// .. mask_seed_position_table). Origin-0, end- -// .. exclusive. -// s8 upperCharToBits[]: Character to bit mapping that was used to build -// .. the table. -// seed* hitSeed: The seed-word to table is based on. -// -// Returns: -// (nothing) -// -//---------- - -static void mask_seed_positions - (postable* pt, - seq* seq, - unspos start, - unspos end, - const s8 upperCharToBits[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + start; - u8* seqStop = seq->v + end; - u8* s; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - - seedLength = (unsigned) hitSeed->length; - - if (end-start < seedLength) - return; // (nothing to search for) - - // scan the sequence, removing each packed word from the table - - for (s=seqStart ; s seedLength) // for large steps, skip - { // .. ahead to the next - pos = s - seq->v; // .. viable start position - s = s + (step-1) - ((pos+seedLength-1) % step); - } - - empty_skipped: - w = 0L; - for (nts=1 ; (nts start over - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; s start over - w = (w << 2) | ww; // append next nt - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) continue; // .. on step boundary - - if (!position_is_in_table(pt,pos)) // make sure position is - continue; // .. 
currently in the table - - packed = apply_seed (hitSeed, w); // extract seed bits - remove_word (pt, packed, pos); // remove it from the table - - if (step > seedLength) // for large steps, skip - { // .. directly to the start - s += step - seedLength; // .. of the next possible - goto empty_skipped; // .. word we'll record - } - } - } - - } - - -static void mask_seed_positions_halfweight - (postable* pt, - seq* seq, - unspos start, - unspos end, - const s8 upperCharToBits[], - seed* hitSeed) - { - u32 step = pt->step; - u32 seedLength; - u8* seqStart = seq->v + start; - u8* seqStop = seq->v + end; - u8* s; - u64 w; - s32 ww; - u32 packed; - u32 nts; - unspos pos; - - seedLength = (unsigned) hitSeed->length; - - if (end-start < seedLength) - return; // (nothing to search for) - - // scan the sequence, removing each packed word to the table - - for (s=seqStart ; s seedLength) // for large steps, skip - { // .. ahead to the next - pos = s - seq->v; // .. viable start position - s = s + (step-1) - ((pos+seedLength-1) % step); - } - - empty_skipped: - w = 0L; - for (nts=1 ; (nts start over - w = (w << 1) | (ww & 1); // append next R/Y - } - - // process each word of seedLength nucleotides - - for ( ; s start over - w = (w << 1) | (ww & 1); // append next R/Y - - pos = s - seq->v; // make sure position is - if ((pos % step) != 0) continue; // .. on step boundary - - if (!position_is_in_table(pt,pos)) // make sure position is - continue; // .. currently in the table - - packed = apply_seed (hitSeed, w); // extract seed bits - remove_word (pt, packed, pos); // remove it from the table - - if (step > seedLength) // for large steps, skip - { // .. directly to the start - s += step - seedLength; // .. of the next possible - goto empty_skipped; // .. word we'll record - } - } - } - - } - -//---------- -// -// new_position_table-- -// Allocate a new, empty, position table structure. 
-// -//---------- -// -// Arguments: -// int wordBits: The number of *bits* in a word (roughly speaking, twice -// .. the number of nucleotides). -// unspos start, end: Range of sequence positions that will be used. -// u32 step: The granularity of the positions that will be stored. -// int allocLast: true => allocate space for the last[] array. -// int allocPrev: true => allocate space for the prev[] array. -// int allocBits: true => allocate space in which to save 2 bits per bp. -// -// Returns: -// A pointer to the newly allocated position table, which the caller will -// have to dispose of eventually. The routine free_position_table() should -// be used for this purpose. -// -//---------- -// -// total memory used (give or take a few bytes) is -// 4 * (2^W + L/G) = 2^(W+2) + 4L/G -// where -// W = wordBits -// L = sequence length (end-start) -// G = granularity (step) -// -// When allocBits is true an additional L/4 bytes is used. This is usually -// insignificant relative to the rest. For example, this is equal to 4L/G if G -// is 16 (an absurdly large value for G). -// -// some examples: -// -// W | L | G | mem | -// ---+------+---+-------+ -// 20 | 250M | 1 | .94G | chr1, 10-of-L seed -// 20 | 250M | 2 | .47G | -// ---+------+---+-------+ -// 24 | 250M | 1 | .99G | chr1, 12-of-L seed -// 24 | 250M | 2 | .53G | -// ---+------+---+-------+ -// 24 | 50M | 1 | .25G | chr21, 12-of-L seed -// 24 | 50M | 2 | .16G | -// ---+------+---+-------+ -// 28 | 2M | 1 | 1.01G | ENCODE region, 14-of-L seed -// 28 | 2M | 2 | 1.00G | -// ---+------+---+-------+ -// -//---------- -// -// Relationship of start,end,adjStart,step -// -// start = 33 adjStart = start - (start%step) -// end = 47 " = 33 - 3 -// step = 5 " = 30 -// -// -// sequence: ..|28|29|30|31|32|33|34|35|36|37|38|39|40|41|42|43|44|45|46|47|.. 
-// interval: (-----------------------------------------( -// prev: [ 0] [ 1] [ 2] [ 3] -// ^ ^ ^ ^ ^ ^ ^ ^ ^ -// for word length = 6 | | | | | | | | | -// window end = 39, discarded (-----------------( + | | | | | | | | -// window end = 40, saved as 2 (-----------------( + | | | | | | | -// window end = 41, discarded (-----------------( + | | | | | | -// window end = 42, discarded (-----------------( + | | | | | -// window end = 43, discarded (-----------------( + | | | | -// window end = 44, discarded (-----------------( + | | | -// window end = 45, saved as 3 (-----------------( + | | -// window end = 46, discarded (-----------------( + | -// window end = 47, discarded (-----------------( + -// -//---------- - -postable* new_position_table - (int wordBits, - unspos start, - unspos end, - u32 step, - int allocLast, - int allocPrev, - int allocBits) - { - postable* pt; - unspos adjStart; - u32 wordEntries; - unspos prevEntries; - u64 bytesNeeded, bytesStruct, bytesLast, bytesPrev, bytesAsBits; - - if (wordBits > 28) - suicidef ("new_position_table can't support >28 seed bits (%d requested)", wordBits); - - // figger out how many bytes we need - - adjStart = start - (start % step); // (force adjStart down to a - // multiple of the granularity) - - wordEntries = ((u32) 1) << wordBits; - prevEntries = 1 + ((end-adjStart) / step); - - bytesStruct = round_up_16 (sizeof(postable)); - bytesNeeded = bytesStruct; - - bytesLast = 0; - bytesPrev = 0; - bytesAsBits = 0; - - if (allocLast) - { - bytesLast = round_up_16 (((u64) wordEntries) * sizeof(pt->last[0])); - bytesNeeded += bytesLast; - if (bytesLast > mallocLimit) goto overflow_last; - } - - if (allocPrev) - { - bytesPrev = round_up_16 (((u64) prevEntries) * sizeof(pt->prev[0])); - bytesNeeded += bytesPrev; - if (bytesPrev > mallocLimit) goto overflow_prev; - } - - if (allocBits) - { - bytesAsBits = round_up_16((end-adjStart+3) / 4); - bytesNeeded += bytesAsBits; - if (bytesAsBits > mallocLimit) goto overflow_as_bits; - 
} - - if (bytesNeeded > mallocLimit) goto overflow; - - //fprintf (stderr, "wordBits = %d\n", wordBits); - //fprintf (stderr, "wordEntries = %s\n", commatize(wordEntries)); - //fprintf (stderr, "bytesLast = %s\n", commatize(bytesLast)); - //fprintf (stderr, "\n"); - //fprintf (stderr, "start = %s\n", commatize(start)); - //fprintf (stderr, "end = %s\n", commatize(end)); - //fprintf (stderr, "prevEntries = %s\n", commatize(prevEntries)); - //fprintf (stderr, "bytesPrev = %s\n", commatize(bytesPrev)); - //fprintf (stderr, "\n"); - //fprintf (stderr, "bytesAsBits = %s\n", commatize(bytesAsBits)); - //fprintf (stderr, "\n"); - //fprintf (stderr, "bytesNeeded = %s\n", commatize(bytesNeeded)); - - // allocate - - pt = (postable*) zalloc_or_die ("new_position_table", bytesNeeded); - - // initialize control fields - - pt->allocLast = wordEntries; - pt->allocPrev = prevEntries; - pt->wordBits = wordBits; - pt->wordEntries = wordEntries; - pt->start = start; - pt->adjStart = adjStart; - pt->end = end; - pt->step = step; - pt->dump = NULL; - pt->dumpInfo = NULL; - - // hook up the internal arrays; note that we do not need to initialize - // their contents, since allocation filled them with zeros - - pt->last = (unspos*) (((char*) pt) + bytesStruct); - pt->prev = (unspos*) (((char*) pt->last) + bytesLast); - pt->asBits = (u32*) (((char*) pt->prev) + bytesPrev); - - if (bytesLast == 0) pt->last = NULL; - if (bytesPrev == 0) pt->prev = NULL; - if (bytesAsBits == 0) pt->asBits = NULL; - - return pt; - -// failure exits - -#define suggestions " consider using lastz_32," \ - " or setting max_malloc_index for a special build," \ - " or breaking your target sequence into smaller pieces" - - -overflow: - { - char* tempStruct = commatize(bytesStruct); - char* tempLast = commatize(bytesLast); - char* tempPrev = commatize(bytesPrev); - char* tempAsBits = commatize(bytesAsBits); - suicidef ("in new_position_table(), structure size (%s+%s+%s+%s = %s) exceeds allocation limit of %s;" - 
suggestions, - tempStruct, tempLast, tempPrev, tempAsBits, commatize(bytesNeeded), - commatize(mallocLimit)); - return NULL; // (doesn't get here) - } - -overflow_last: - suicidef ("in new_position_table(), last[] array size (%s) exceeds allocation limit of %s;" - suggestions, - commatize(bytesStruct), commatize(mallocLimit)); - return NULL; // (doesn't get here) - -overflow_prev: - suicidef ("in new_position_table(), prev[] array size (%s) exceeds allocation limit of %s;" - suggestions, - commatize(bytesPrev), commatize(mallocLimit)); - return NULL; // (doesn't get here) - -overflow_as_bits: - suicidef ("in new_position_table(), asBits[] array size (%s) exceeds allocation limit of %s;" - suggestions, - commatize(bytesAsBits), commatize(mallocLimit)); - return NULL; // (doesn't get here) - } - -//---------- -// -// free_position_table-- -// De-allocate a position table. -// -//---------- -// -// Arguments: -// postable* pt: The position table to de-allocate. -// -// Returns: -// (nothing) -// -//---------- - -void free_position_table (postable* pt) - { free_if_valid ("free_position_table (table)", pt); } - -//---------- -// -// fetch_resolving_bits-- -// Fetch a 16-nucleotide word from a position table's packed representation of -// sequence 1. -// -//---------- -// -// Arguments: -// postable* pt: The position table. We assume pt->asBits is non-NULL. -// unspos pos1: The position following the end of that word in the -// .. sequence (origin-0, relative to pt->adjStart). Note -// .. that this is relative to the adjusted subinterval. -// -// Returns: -// 16 consecutive nucleotides from the sequence, as *32* bits. From most -// significant to least, bit pairs represent positions P-16 to P-1 (where P is -// pos2). -// -//---------- -// -// Notes: -// -// (1) Schematic of the fetch. Each x is a bit, and the bars show the boundary -// of the 32-bit words in the asBits array. ix is an index into the array, -// while iy is an index into the sequence. -// -// ix: .. 5 6 .. 
-// seq: .. xx xx xx|xx xx xx xx xx xx xx xx xx xx xx xx xx xx xx xx|xx .. -// iy: .. 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 .. -// pos1=94: * -// result: [xx xx|xx xx xx xx xx xx xx xx xx xx xx xx xx xx] -// -//---------- - -#define wordSize (8*sizeof(pt->asBits[0])) // (must be 32 or the return type -#define halfSize (wordSize/2) // .. is wrong!) - -u32 fetch_resolving_bits - (postable* pt, - unspos pos1) - { - unspos ix; - int shift; - u32 seqBits; - - // split pos1 into array index and bit position - - ix = pos1 / halfSize; - pos1 %= halfSize; - - // if bit position is zero we just fetch and return - - if (ix == 0) seqBits = 0; - else seqBits = pt->asBits[ix-1]; - - if (pos1 == 0) return seqBits; - - // otherwise we have to shift and bring in more bits from the next - // array word - // - // when wordSize=32, halfSize=16, and - // pos1=15 gives a shift of 2 - // pos1=1 gives a shift of 30 - - shift = (int) (2*(halfSize-pos1)); - - return (seqBits << (wordSize-shift)) - + (pt->asBits[ix] >> shift); - } - -//---------- -// -// position_is_in_table-- -// Determine if a position has a word stored in a position table. -// -//---------- -// -// Arguments: -// postable* pt: The position table. -// unspos position: The position following the end of the word in the -// .. sequence (origin-0, and divided by step). Note -// .. that this is relative to the *sequence*, and not -// .. to the subinterval defined by pt->start,pt->end. -// .. We expect (but do not check) that this is an -// .. exact multiple of pt->step. 
-// -// Returns: -// (nothing) -// -//---------- - -static int position_is_in_table - (postable* pt, - unspos position) - { - // convert the position to a prev[] index; note that we expect (but do not - // check) that position and start are both exact multiples of step - - position = (position - pt->adjStart) / pt->step; - - // see if that position is part of any list - - return (pt->prev[position] != 0); - } - -//---------- -// -// add_word-- -// Add a word/position pair to a position table. -// -//---------- -// -// Arguments: -// postable* pt: The position table to add to. -// u32 word: The word to add. -// unspos position: The position following the end of that word in the -// .. sequence (origin-0, and divided by step). Note -// .. that this is relative to the *sequence*, and not -// .. to the subinterval defined by pt->start,pt->end. -// .. We expect (but do not check) that this is an -// .. exact multiple of pt->step. -// -// Returns: -// (nothing) -// -//---------- - -static void add_word - (postable* pt, - u32 word, - unspos position) - { - u32 step = pt->step; - unspos oldLast; - - // convert the position to a prev[] index; note that we expect (but do not - // check) that position and start are both exact multiples of step - - position = (position - pt->adjStart) / step; - - // add the node to the front of the appropriate list - - oldLast = pt->last[word]; - if (oldLast == 0) pt->prev[position] = noPreviousPos;// was empty => end-of-list - else pt->prev[position] = oldLast; // not empty => prepend - pt->last[word] = position; - - // track some stats - - pos_table_count_stat (wordsInTable); - dbg_timing_count_stat (wordsInTable); - - if (oldLast == 0) // (first occurence of this word) - { - pos_table_count_stat (wordsPresent); - pos_table_count_stat (singletonWords); - } - else if (pt->prev[oldLast] == noPreviousPos)// (second occurence of this word) - { - pos_table_uncount_stat (singletonWords); - } - - // debug - - if (pos_table_dbgShowWords) - { - 
posdumper dump = (pt->dump != NULL)? pt->dump - : (posdumper) dump_word_position; - - printf ("adding "); - (*dump) (stdout, pt, posdump_word, word); - printf ("/"); - (*dump) (stdout, pt, posdump_position, position); - printf (" to table, prev is " unsposFmt "\n", pt->prev[position]); - } - - } - -//---------- -// -// remove_word-- -// Remove a word/position pair from a position table. -// -//---------- -// -// Arguments: -// postable* pt: The position table to remove from. -// u32 word: The word to remove. -// unspos position: The position following the end of that word in the -// .. sequence (origin-0, and divided by step). Note -// .. that this is relative to the *sequence*, and not -// .. to the subinterval defined by pt->start,pt->end. -// .. We expect (but do not check) that this is an -// .. exact multiple of pt->step. -// -// Returns: -// (nothing) -// -//---------- - -static void remove_word - (postable* pt, - u32 word, - unspos position) - { - u32 step = pt->step; - unspos pos, predPos; - posdumper dump = NULL; - - if (pos_table_dbgShowWords) - { - dump = (pt->dump != NULL)? 
pt->dump - : (posdumper) dump_word_position; - - printf ("removing "); - (*dump) (stdout, pt, posdump_word, word); - printf ("/"); - (*dump) (stdout, pt, posdump_position, position); - printf (" from table"); - } - - // convert the position to a prev[] index; note that we expect (but do not - // check) that position and start are both exact multiples of step - - position = (position - pt->adjStart) / step; - - // make sure the list for this word isn't empty - - if (pt->last[word] == 0) - { - if (pos_table_dbgShowWords) printf (" (list was empty)\n"); - return; - } - - // find this position in the list - - predPos = noPreviousPos; - for (pos=pt->last[word] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - if (pos == position) break; - predPos = pos; - } - - if (pos != position) // this position wasn't in the list - { - if (pos_table_dbgShowWords) printf (" (not found in list)\n"); - return; - } - - // remove this position from the list - - if (predPos != noPreviousPos) // this position was *not* first in list - pt->prev[predPos] = pt->prev[pos]; - else // this position *was* the first in list - { - if (pt->prev[pos] == noPreviousPos) // the list is now empty - pt->last[word] = 0; - else - pt->last[word] = pt->prev[pos]; - } - - pt->prev[pos] = 0; // indicate position is no longer in table - - // track some stats - - pos_table_count_stat (wordsRemovedFromTable); - - if (pos_table_dbgShowWords) - { - if (predPos != noPreviousPos) // this position was *not* first in list - printf (", prev[" unsposFmt "] <- " unsposFmt "\n", - predPos, pt->prev[predPos]); - else - { - printf (", last["); - (*dump) (stdout, pt, posdump_word, word); - printf ("] <- " unsposFmt "\n", pt->last[word]); - } - } - } - -//---------- -// -// dump_position_table-- -// Dump the contents of a single position table. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// postable* pt: The position table to print. -// seed* hitSeed: The seed-word the table was built for. 
This is used -// .. solely to unpack the table indexes. This can be -// .. NULL. -// int showPositions: true => show a list of positions for each entry. -// int showCounts: true => show count for each entry -// -// Returns: -// (nothing) -// -//---------- - -// $$$ this needs to be updated to use the bottleneck alphabet rather than -// $$$ .. assuming ACGT - -void dump_position_table - (FILE* f, - postable* pt, - seed* hitSeed, - int showPositions, - int showCounts) - { - posdumper dump = (pt->dump != NULL)? pt->dump - : (posdumper) dump_word_position; - unspos adjStart = pt->adjStart; - u32 step = pt->step; - u32 w; - unspos pos; - unspos count; - char* s; - - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - if (pt->last[w] == noPreviousPos) continue; - (*dump) (f, pt, posdump_index, w); - - if (hitSeed == NULL) - fprintf (f, ":"); - else - { - s = seed_packed_to_string (hitSeed, w); - fprintf (f, "/%s:", s); - } - - if (showCounts) - { - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - fprintf (f, " " unsposFmt, count); - } - - if (showPositions) - { - fprintf (f, " "); - pos = pt->last[w]; - (*dump) (f, pt, posdump_position, adjStart+step*pos); - for (pos=pt->prev[pos] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - fprintf (f, ","); - (*dump) (f, pt, posdump_position, adjStart+step*pos); - } - } - - fprintf (f, "\n"); - } - - } - -//---------- -// -// dump_word_position-- -// Dump a word/position pair. -// -//---------- -// -// Arguments: -// (see posdumper description in pos_table.h) -// The pt->dumpInfo field is expected to contain an int* pointing to the -// .. 
word length -// -// Returns: -// (nothing) -// -//---------- - -static void dump_word_position - (FILE* f, - postable* pt, - int field, - u64 fieldVal) - { - int wordLen = *(int*) pt->dumpInfo; - - switch (field) - { - default: - break; - case posdump_index: - fprintf (f, "%0*X", (pt->wordBits+3)/4, (u32) fieldVal); - break; - case posdump_index_space: - fprintf (f, "%*s", ((pt->wordBits+3)/4), ""); - break; - case posdump_word: - fprintf (f, "%s", bits_to_nuc_string (fieldVal, wordLen)); - break; - case posdump_word_space: - fprintf (f, "%*s", wordLen, ""); - break; - case posdump_position: - fprintf (f, unsposFmt, (unspos) fieldVal); - break; - } - } - -//---------- -// -// dump_seed_position-- -// Dump a seed/position pair. -// -//---------- -// -// Arguments: -// (see posdumper description in pos_table.h) -// -// Returns: -// (nothing) -// -//---------- - -static void dump_seed_position - (FILE* f, - postable* pt, - int field, - u64 fieldVal) - { - dumpinfo* di = pt->dumpInfo; - seed* hitSeed = di->seed; - - switch (field) - { - default: - break; - case posdump_index: - fprintf (f, "%0*X", (pt->wordBits+3)/4, (u32) fieldVal); - break; - case posdump_index_space: - fprintf (f, "%*s", (pt->wordBits+3)/4, ""); - break; - case posdump_word: - fprintf (f, "%s", seed_packed_to_string (hitSeed, fieldVal)); - break; - case posdump_word_space: - fprintf (f, "%*s", hitSeed->length, ""); - break; - case posdump_position: - fprintf (f, unsposFmt, (unspos) fieldVal); - break; - } - } - -//---------- -// -// dump_quantum_seed_position-- -// Dump a seed/position pair for a quantum sequence. 
-// -//---------- -// -// Arguments: -// (see posdumper description in pos_table.h) -// The pt->dumpInfo field is expected to contain a seed* pointing to the seed -// -// Returns: -// (nothing) -// -//---------- - -static void dump_quantum_seed_position - (FILE* f, - postable* pt, - int field, - u64 fieldVal) - { - dumpinfo* di = pt->dumpInfo; - seed* hitSeed = di->seed; - u8* bitsToAlphabet = di->bitsToAlphabet; - char* s; - - switch (field) - { - default: - break; - case posdump_index: - fprintf (f, "%0*X", (pt->wordBits+3)/4, (u32) fieldVal); - break; - case posdump_index_space: - fprintf (f, "%*s", (pt->wordBits+3)/4, ""); - break; - case posdump_word: - s = seed_packed_to_string2 (hitSeed, fieldVal, NULL, bitsToAlphabet); - if (*s != 0) fprintf (f, "%02X", *(s++)); - while (*s != 0) fprintf (f, " %02X", *(s++)); - break; - case posdump_word_space: - fprintf (f, "%*s", hitSeed->length, ""); - break; - case posdump_position: - fprintf (f, unsposFmt, (unspos) fieldVal); - break; - } - } - - -//---------- -// -// count_position_table-- -// Count the number of positions in a table. -// -//---------- -// -// Arguments: -// postable* pt: The position table to count. -// -// Returns: -// The number of words in the table. -// -//---------- - -unspos count_position_table - (postable* pt) - { - u32 w; - unspos pos; - unspos count; - - count = 0; - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - } - - return count; - } - -//---------- -// -// limit_position_table-- -// Remove any words from a table that occur too frequently. -// -//---------- -// -// Arguments: -// postable* pt: The position table to modify. -// u32 limit: Words occurring more often than this are removed -// .. from the table. -// u32 maxChasm: The maximum length of an interval of discarded -// .. seed word positions that will be tolerated. -// .. Some seed word positions may be preotected from -// .. 
removal so that no interval will exceed this. -// .. The value zero indicates that there is no such -// .. limit. -// -// Returns: -// (nothing) -// -//---------- - -static void breakup_chasm (char* protected, unspos startPos, unspos endPos, - unspos maxChasm); - -void limit_position_table - (postable* pt, - u32 _limit, - unspos maxChasm) - { - u32 w; - unspos pos, next; - unspos count; - unspos limit = _limit; - char* protected = NULL; - - pos_table_set_stat (wordCountLimit, _limit); - pos_table_set_stat (maxWordCountChasm, maxChasm); - - maxChasm /= pt->step; // (convert maxChasm into the step realm) - - ////////// - // if we have a limit to the length of discard intervals, create a list of - // "protected" positions - ////////// - - if (maxChasm > 0) - { - size_t bytesNeeded; - int inChasm; - unspos chasmStart = 0; // (placation assignment) - - // create a list of position marks; at this point all positions are - // marked as "unprotected" - - bytesNeeded = pt->allocPrev * sizeof(char); - protected = (char*) zalloc_or_die ("protected seed word positions", bytesNeeded); - - // scan position table and mark any positions that we intend to discard; - // such positions will *all* temporarily be marked as "protected" - - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - if (count <= limit) continue; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - protected[pos] = true; - } - - // scan marks and mark some positions in long intervals as protected - - inChasm = false; - for (pos=0 ; posallocPrev ; pos++) - { - if (protected[pos]) - { - if (!inChasm) - { chasmStart = pos; inChasm = true; } - protected[pos] = false; // (breakup_chasm will set it back to - continue; // .. 
true if necessary) - } - if (!inChasm) - continue; - inChasm = false; - if (pos - chasmStart > maxChasm) - breakup_chasm (protected, chasmStart, pos, maxChasm); - } - - if ((inChasm) && (pos - chasmStart >= maxChasm)) - breakup_chasm (protected, chasmStart, pos, maxChasm); - } - - ////////// - // dump the positions that will be limited - ////////// - - if (pos_table_dbgShowDiscards) - { - unspos adjStart = pt->adjStart; - u32 step = pt->step; - posdumper dump = (pt->dump != NULL)? pt->dump - : (posdumper) dump_word_position; - char* s; - unspos numWords, numDiscarded; - - numWords = numDiscarded = 0; - - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - numWords += count; - if (count <= limit) continue; - if (maxChasm > 0) - { - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { if (protected[pos]) count--; } - } - numDiscarded += count; - } - - fprintf (stderr, "discarding %s/%s (%.2f%%) for maxwordcount=%d\n", - commatize(numDiscarded), commatize(numWords), - 100.0*numDiscarded/numWords, _limit); - - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - if (count <= limit) continue; - - (*dump) (stderr, pt, posdump_index, w); - - if (pos_table_dbgSeed == NULL) - fprintf (stderr, ":"); - else - { - s = seed_packed_to_string (pos_table_dbgSeed, w); - fprintf (stderr, "/%s:", s); - } - - fprintf (stderr, " "); - pos = pt->last[w]; - (*dump) (stderr, pt, posdump_position, adjStart+step*pos); - for (pos=pt->prev[pos] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - fprintf (stderr, ","); - (*dump) (stderr, pt, posdump_position, adjStart+step*pos); - if ((maxChasm > 0) && (protected[pos])) - fprintf (stderr, "*"); - } - fprintf (stderr, "\n"); - } - } - - ////////// - // discard positions from the table - ////////// - - for (w=0 ; 
wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - if (count <= limit) continue; - - pos_table_add_stat (discardedWords, count); - - if (maxChasm == 0) - { - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=next) - { next = pt->prev[pos]; pt->prev[pos] = noPreviousPos; } - pt->last[w] = noPreviousPos; - } - else - { - unspos* pred; - - pred = &pt->last[w]; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=next) - { - next = pt->prev[pos]; - if (protected[pos]) - pred = &pt->prev[pos]; - else - { - *pred = next; - pt->prev[pos] = noPreviousPos; - } - } - } - } - - ////////// - // erase any marks we made for the purpose of limiting discard intervals - ////////// - - free_if_valid ("protected seed word positions", protected); - } - - -// breakup_chasm-- mark enough points in an interval to meet the maximum-chasm -// criterion. The algorithm used is similar to Brensenham's line drawing -// algorithm, starting at position -1/2, stepping along the interval in steps -// of N/D, truncating to an integer, and marking that position. 
- -static void breakup_chasm - (char* protected, - unspos startPos, - unspos endPos, - unspos maxChasm) - { - unspos pos, len, markNum; - s64 numer; - u64 denom; - - len = endPos - startPos; - denom = 1 + (len / (maxChasm+1)); // (number of sub-intervals) - numer = (denom/2) - denom; // (intentionally wraps to 'negative') - for (markNum=1 ; markNumoccurrences!=0 ; pd++) - numPositions += pd->count * pd->occurrences; - - minToKeep = (unspos) ceil (numPositions * keep); - - // scan the list, counting up positions until we meet or exceed the - // requirement (note that the posDist entries are in order of increasing - // count) - - limit = 0; - for (pd=posDist ; pd->occurrences!=0 ; pd++) - { - if (pd->count * pd->occurrences >= minToKeep) - { limit = pd->count; break; } - minToKeep -= pd->count * pd->occurrences; - } - - free_if_valid ("seed word position counts distribution", posDist); - - if (limit > maxPossibleLimit) return maxPossibleLimit; // (bloody unlikely) - else return limit; - } - -//---------- -// -// position_table_count_distribution-- -// Determine the distribution of occurence counts in a table. -// -//---------- -// -// Arguments: -// postable* pt: The position table. -// -// Returns: -// A pointer to the newly allocated count distribution, which the caller will -// have to dispose of eventually (using free()). This is an array ordered by -// increasing count, and terminated by an entry with zero occurrences. 
-// -//---------- - -static int qIncreasingCount (const void* _a, const void* _b); -static int qIncreasingCount (const void* _a, const void* _b) - { - poscount* a = (poscount*) _a; - poscount* b = (poscount*) _b; - if (a->count < b->count) return -1; - else if (a->count > b->count) return 1; - else return 0; - } - - -poscount* position_table_count_distribution - (postable* pt) - { -#ifndef noMemoryWrappers - char* idString = "position_table_count_distribution"; -#endif // not noMemoryWrappers - poscount* posDist, *pd; - int countsAllocated, countsUsed; - size_t bytesNeeded; - u32 w; - unspos pos, count; - int ix; - - // allocate an array to hold the distribution; for hg18.chr1 with 13-mers, - // the number of distict counts is 3,259 - - countsAllocated = 3500; - countsUsed = 0; - - bytesNeeded = countsAllocated * sizeof(poscount); - posDist = (poscount*) malloc_or_die (idString, bytesNeeded); - - posDist[0].occurrences = 0; // (terminate list) - - // scan the table, count the locations for each word, and add to the - // distibution - // $$$ re-implement the list/search using heapsort - - for (w=0 ; wwordEntries ; w++) - { - if (pt->last[w] == 0) continue; - - count = 0; - for (pos=pt->last[w] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - count++; - - pd = NULL; - for (ix=0 ; ixcount = count; // (note that pd->occurrences == 0) - (pd+1)->occurrences = 0; // (terminate list) - } - - pd->occurrences++; - } - - // sort by decreasing count - - qsort (posDist, countsUsed, sizeof(poscount), qIncreasingCount); - - return posDist; - } - -//---------- -// -// pos_table_zero_stats-- -// Clear the statistics for this module. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void pos_table_zero_stats - (void) - { - dbg_timing_set_stat (wordsInTable, 0); - -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&posTableStats, 0, sizeof(posTableStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// pos_table_show_stats, -// pos_table_show_stats_after-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// postable* pt: The relevant position table (this can be NULL). -// -// Returns: -// (nothing) -// -//---------- - -void pos_table_show_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(postable* pt)) - { -#ifdef collect_stats - char weight[10]; -#endif // collect_stats - - dbg_timing_report_stat (wordsInTable, "DNA words in table"); - -#ifdef collect_stats - - if (f == NULL) return; - - if ((posTableStats.wordWeight % 2) == 0) - sprintf (weight, "%d", posTableStats.wordWeight / 2); - else - sprintf (weight, "%d.5", posTableStats.wordWeight / 2); - - fprintf (f, "word len or weight: %s\n", weight); - fprintf (f, " possible words: %s\n", commatize (posTableStats.wordSpace)); - fprintf (f, "-------------------\n"); - fprintf (f, "DNA words in table: %s\n", commatize (posTableStats.wordsInTable)); -// if (pt != NULL) -// fprintf (f, " (actual): %s\n", commatize (tableCount)); - fprintf (f, "distinct DNA words: %s\n", commatize (posTableStats.wordsPresent)); - fprintf (f, " singleton words: %s\n", commatize (posTableStats.singletonWords)); - if (posTableStats.discardedWords > 0) - { - fprintf (f, " kept words: %s\n", commatize (posTableStats.wordsInTable - posTableStats.discardedWords)); - fprintf (f, " discarded words: %s\n", commatize (posTableStats.discardedWords)); - } - if 
(posTableStats.wordSpace > 0) - fprintf (f, " distinct/possible: %.2f%%\n", (((u64) 100)*posTableStats.wordsPresent) / (float) posTableStats.wordSpace); - if (posTableStats.wordsInTable > 0) - { - fprintf (f, " distinct/words: %.2f%%\n", (((u64) 100)*posTableStats.wordsPresent) / (float) posTableStats.wordsInTable); - fprintf (f, " singleton/words: %.2f%%\n", (((u64) 100)*posTableStats.singletonWords) / (float) posTableStats.wordsInTable); - if (posTableStats.discardedWords > 0) - { - fprintf (f, " kept/words: %.2f%%", (((u64) 100)*(posTableStats.wordsInTable-posTableStats.discardedWords)) / (float) posTableStats.wordsInTable); - fprintf (f, " (word count limit = %u)\n", posTableStats.wordCountLimit); - fprintf (f, " (max chasm = %u)\n", posTableStats.maxWordCountChasm); - fprintf (f, " discarded/words: %.2f%%\n", (((u64) 100)*posTableStats.discardedWords) / (float) posTableStats.wordsInTable); - fprintf (f, " protected/words: %.2f%%\n", (((u64) 100)*posTableStats.protectedWords) / (float) posTableStats.wordsInTable); - } - } - fprintf (f, " bases parsed: %s\n", commatize (posTableStats.basesParsed)); - fprintf (f, "-------------------\n"); - -#endif // collect_stats - } - - -void pos_table_show_stats_after - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - - if (f == NULL) return; - - fprintf (f, " intervals masked: %s\n", commatize (posTableStats.intervalsMasked)); - fprintf (f, "masked i'val bases: %s\n", commatize (posTableStats.maskedIntervalBases)); - if (posTableStats.intervalsMasked > 0) - fprintf (f, "bases/masked i'val: %.1f\n", ((float) posTableStats.maskedIntervalBases) / posTableStats.intervalsMasked); - fprintf (f, " DNA words removed: %s\n", commatize (posTableStats.wordsRemovedFromTable)); - fprintf (f, " mask bases parsed: %s\n", commatize (posTableStats.maskBasesParsed)); - fprintf (f, "-------------------\n"); - -#endif // collect_stats - } - diff --git a/programs/lastz/src/pos_table.h b/programs/lastz/src/pos_table.h deleted file mode 
100644 index 563408f..0000000 --- a/programs/lastz/src/pos_table.h +++ /dev/null @@ -1,253 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: pos_table.h -// -//---------- - -#ifndef pos_table_H // (prevent multiple inclusion) -#define pos_table_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed matching stuff - -// establish ownership of global variables - -#ifdef pos_table_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access -// nota bene: showProgress is a relic which is not currently used - -#ifdef pos_table_owner -int pos_table_showProgress = false; // true => make periodic progress reports -int pos_table_dbgShowWords = false; // true => show 'words' in add_word() -int pos_table_dbgShowDiscards = false; // true => show 'words' discarded in -seed* pos_table_dbgSeed = NULL; // .. limit_position_table() -#else -global int pos_table_showProgress; -global int pos_table_dbgShowWords; -global int pos_table_dbgShowDiscards; -global seed* pos_table_dbgSeed; -#endif - -//---------- -// -// data structures and types -// -//---------- - -// position dumper functions-- -// Dump one position (from a position table) to a file. -// -// Arguments: -// FILE* f: The file to print to. -// postable* pt: The table containing the position. -// int field: Which field to dump (one of posdump_xxx). -// u64 fieldVal: The value of the field being dumped. - -typedef void (*posdumper) (FILE*, void*, int, u64); - -enum - { - posdump_index = 0, // dump index - posdump_index_space, // dump as much space as index - posdump_word, // dump word - posdump_word_space, // dump as much space as word - posdump_position // dump position - }; - -// position table-- -// A position table maps a word to a list of positions of that word in the -// associated sequence. 
Think of a word as a W-mer with two bits per -// nucleotide (though it may be bits selected from a larger window and then -// packed). -// -// Positions are recorded as the index (into a subinterval of the sequence) of -// the first character *after* the end of the word. For example, "abra" would -// be recorded as positions 4 and 11 in "abracadabra". Positions are relative -// to a (start,end) subinterval of the sequence. If the sequence is -// "elabrabracadabrazil" and the subinterval is (5,16(, the positions of "abra" -// are still 4 and 11. -// -// index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 -// sequence: e l a b r a b r a c a d a b r a z i l -// interval: (-------------------------------( -// position: 0 1 2 3 4 5 6 7 8 9 10 -// "abra": X X -// -// This implementation consists of two arrays-- last and prev. Last has an -// entry for every possible word and gives the rightmost position in the -// sequence where that word exists. Prev has an entry for each position in -// the sequence and gives the position of the first duplicate of the word, to -// its left. Thus all the positions of a given word can be found by traversing -// a chain of integers (the equivalent of a linked list), with a special value -// (noPrevious) indicating the end of the list. In contrast, an empty list has -// last set to zero. Any position for which a seed is *not* stored in the -// table (i.e. is not part of a linked list) has prev set to zero. -// -// The reason for the incongruity for the end marker (0 for empty list, -// noPrevious for end of list) is that it easier to allocate memory filled -// with zeros, thus the entries in last and prev that we never touch will -// contain zero. For dynamic masking, in order to remove seeds from the table, -// we need to be able to quickly determine whether a position (to be masked) is -// in the table or not. This is distinguishable by prev zero (not in table) or -// non-zero (in table). 
-// -// The positions in both last and prev are indexes into prev, and thus are -// positions (relative to granularity) in the *subinterval* of the sequence -// defined by start and end. Granularity (G) specifies a subset of the -// positions that can be stored. If G=1, every position can be stored; if -// G=2 only even positions can be stored; if G=5 only every 5th position can -// be stored. Since the values in last and prev are indexes into prev, the -// following conversions are helpful: -// P = S + G*I -// I = (P-S)/G -// where -// P = the position in the sequence -// S = the adjusted start of the subinvterval (adjStart) -// I = the index into prev -// G = the granularity -// -// note (1): The memory for last, prev, and (if needed) asBits, is usually -// allocated as part of the same block as the postable struct. -// Thus the whole kit and kaboodle can be deallocated with a -// single call to free (but you should use free_position_table). -// In some cases, however, the caller will install pointers for -// last, prev and asBits to point elsewhere. In this case the -// caller is responsible for making sure they get cleaned up. - -typedef struct postable - { - u32 allocLast; // actually number of entries allocated for the - unspos allocPrev; // .. last and prev arrays; some may be unused - - int wordBits; // number of *bits* in a word (for a simple - // .. W-mer match, this would be 2W) - u32 wordEntries;// number of entries in last[] - - posdumper dump; // function to dump a position to the console - // .. (only used for debugging; can be NULL) - void* dumpInfo; // custom argument for dump() - - unspos start; // first sequence position of interest - unspos end; // sequence position just beyond the last - // .. position of interest - unspos adjStart; // sequence position corresponding to first - // .. entry in prev[]; this is guaranteed to - // .. 
be a multiple of postable.step - u32 step; // granularity of positions (see note above) - - // (see note (1) about whether last[], prev[], and asBits[] arrays are - // allocated *within* this same malloc block, or not - - unspos* last; // array giving the rightmost position of each - // .. word (see text above) (also see note 1) - unspos* prev; // array giving the position of the first - // .. duplicate to the left of each position - // .. (see text above) (also see note 1); the - // .. first entry in this array corresponds to - // .. sequence postion postable.adjStart - u32* asBits; // a packed version of the sequence the table - // .. catalogs; these are two bits per bp - // .. (encoded using whatever upperCharToBits[] - // .. array is in use); the first word - // .. corresponds to sequence locations adjStart - // .. thru adjStart+15, with adjStart in the two - // .. most significant bits; this field can be - // .. NULL; (see note 1) - } postable; - -#define noPreviousPos ((unspos) -1) - -// position count distribution-- - -typedef struct poscount - { - unspos count; // the number of positions some particular seed - // .. word occurs at in the sequence - unspos occurrences;// the number of words which have that count; a - // .. 
zero indicates the end of a list - } poscount; - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int wordWeight; - int wordSpace; - int wordsInTable; - int wordsPresent; - int singletonWords; - int basesParsed; - u32 wordCountLimit; - u32 maxWordCountChasm; - int discardedWords; - int protectedWords; - int intervalsMasked; - int maskedIntervalBases; - int wordsRemovedFromTable; - int maskBasesParsed; - } posTableStats; - -// stats macros - -#define pos_table_count_stat(field) ++posTableStats.field -#define pos_table_uncount_stat(field) --posTableStats.field -#define pos_table_set_stat(field,val) (posTableStats.field = val) -#define pos_table_add_stat(field,val) (posTableStats.field += val) -#else -#define pos_table_count_stat(field) -#define pos_table_uncount_stat(field) -#define pos_table_set_stat(field,val) -#define pos_table_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void pos_table_zero_stats (void); -void pos_table_show_stats (FILE* f, postable* pt); -void pos_table_show_stats_after (FILE* f); - -//---------- -// -// prototypes for routines in pos_table.c -// -//---------- - -postable* build_seed_position_table (seq* seq, unspos start, unspos end, - const s8 upperCharToBits[], - seed* seed, u32 step); -postable* build_quantum_seed_position_table - (seq* seq, unspos start, unspos end, - u8* bottleneck, const charvec qToBest[], - seed* seed, u32 step); -void mask_seed_position_table (postable* pt, - seq* seq, unspos start, unspos end, - const s8 upperCharToBits[], seed* hitSeed); -postable* new_position_table (int wordBits, unspos start, unspos end, - u32 step, int allocLast, int allocPrev, - int allocBits); -void free_position_table (postable* pt); -u32 fetch_resolving_bits (postable* pt, unspos pos1); -void dump_position_table (FILE* f, postable* pt, seed* hitSeed, - int showPositions, int showCounts); -unspos count_position_table (postable* pt); 
-void limit_position_table (postable* pt, u32 limit, u32 maxChasm); -poscount* position_table_count_distribution (postable* pt); -u32 find_position_table_limit (postable* pt, float keep); - -#undef global -#endif // pos_table_H diff --git a/programs/lastz/src/quantum.c b/programs/lastz/src/quantum.c deleted file mode 100755 index 22fc86c..0000000 --- a/programs/lastz/src/quantum.c +++ /dev/null @@ -1,675 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: quantum.c -// -//---------- -// -// quantum-- -// Support for finding "high scoring segment pairs" between a quantum DNA -// sequence and a DNA sequence. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "seed_search.h" // seed hit search stuff - -#define quantum_owner // (make this the owner of its globals) -#include "quantum.h" // interface to this module - -//---------- -// -// data structures and types -// -//---------- - -//#define debugQuantumPartials // if defined (and if quantum_dbgQuantumBall is - // .. true), show the failed "partials" in the - // .. computation of the ball of DNA words close - // .. 
to each quantum word - -// private globals shared by all the routines under the umbrella of -// quantum_seed_hit_search() - -static seq* seq1; -static postable* pt; -static seq* seq2; -static unspos start; -static unspos end; -static const s8* charToBits; -static seed* hitSeed; -static scoreset* scoring; -static score ballScore; -static hitprocessor processor; -static void* processorInfo; -static const u8* bitsToSym; - -//---------- -// -// prototypes for private functions -// -//---------- - -static u32 private_quantum_word_hit_search (void); -static u32 private_quantum_seed_hit_search (void); - -static u32 generate_dna_ball (u8* goal, u32 wordLen, u32 matchLen, - unspos goalEnd, - scoreset* scoring, score ballScore, - qdjudger judger, void* info); -static u32 judge_qd (void* info, u8* qWord, u8* dWord, - u32 wordLen, u32 matchLen, unspos qEnd); - -//---------- -// -// quantum_seed_hit_search-- -// Search for high scoring segment pairs (HSPs) between a quantum sequence -// and a DNA sequence. HSPs are regions that align with high similarity. -// Substitutions are allowed, but insertions and deletions are not. -// -// The caller must already have built a table of word positions in the DNA -// sequence. -// -//---------- -// -// Arguments: -// seq* seq1: The sequence being searched. -// postable* pt: A table of positions of of words in seq1. -// s8 charToBits[]: Table to map DNA characters to two-bit -// .. values, and illegal characters to -1. -// seq* seq2: The quantum sequence being searched for. -// unspos start: First sequence position to consider. Zero is -// .. the first possible position. -// unspos end: One past the last sequence position to consider. -// .. If this is zero, the sequence length is used. -// s8 charToBits[]: Table to map sequence characters to two-bit -// .. values, and illegal characters to -1. -// seed* hitSeed: The seed-word the table is based on. -// scoreset* scoring: The alignment scoring parameters. It is assumed -// .. 
that a row corresponds to a DNA symbol (i.e. -// .. sequence1 is DNA). -// score ballScore: The minimum score required of a DNA word to be -// .. considered 'in' a quantum word's ball. -// hitprocessor processor: Function to call for each hit to determine if it -// .. is 'good enough'. -// void* processorInfo: A value to pass thru with each call to processor. -// -// Returns: -// The number of HSPs found. -// -// $$$ the return value should be changed to the number of bases covered, to be -// $$$ .. consistent with seed_hit_search() -// -//---------- -// -// Notes: -// -// (1) This routine allocates and reuses memory via global pointers. The -// caller should make a call to free_quantum_search() to de-allocate this -// memory, after all searches are complete. -// -//---------- - -u32 quantum_seed_hit_search - (seq* _seq1, - postable* _pt, - seq* _seq2, - unspos _start, - unspos _end, - const s8 _charToBits[], - seed* _hitSeed, - scoreset* _scoring, - score _ballScore, - hitprocessor _processor, - void* _processorInfo) - { - - // sanity check - - if (_hitSeed->resolvingMask != 0) - suicide ("quantum_seed_hit_search doesn't support overweight seeds"); - - if (_hitSeed->type != 'S') - suicide ("quantum_seed_hit_search only supports strict seeds" - " (1s and 0s only)"); - - if (_hitSeed->withTrans != 0) - suicide ("quantum_seed_hit_search doesn't support seeds with transitions"); - - if (_end == 0) - _end = _seq2->len; - - if (_end <= _start) - suicidef ("in quantum_seed_hit_search(), interval is void (" unsposFmt "-" unsposFmt ")", - _start, _end); - - if (_end > _seq2->len) - suicidef ("in quantum_seed_hit_search(), interval end is bad (" unsposFmt ">" unsposFmt ")", - _end, _seq2->len); - - // allocate (or re-use) memory - - empty_diag_hash (); - - // pass globals to the rest of the search - // note: this makes this module non-threadsafe - - seq1 = _seq1; - pt = _pt; - seq2 = _seq2; - start = _start; - end = _end; - charToBits = _charToBits; - hitSeed = _hitSeed; - 
scoring = _scoring; - ballScore = _ballScore; - processor = _processor; - processorInfo = _processorInfo; - - if (_scoring->rowsAreDna) bitsToSym = bits_to_nuc; - else bitsToSym = _scoring->bottleneck; - - // perform search separately for match-seeds vs spaced-seeds - - if (hitSeed->weight == 2*hitSeed->length) - return private_quantum_word_hit_search (); - else - return private_quantum_seed_hit_search (); - } - - -void free_quantum_search (void) { free_diag_hash (); } - - -static u32 private_quantum_word_hit_search - (void) - { - u32 wordLen = (unsigned) hitSeed->length; - u8* qStart = seq2->v; - u8* qStop = seq2->v + seq2->len; - u8* q; - unspos qPos; - u32 numHsps = 0; - - if (seq2->len < wordLen) - return 0; // (nothing to search for) - - // scan the sequence, finding the ball of DNA words 'close' to each quantum - // word, and processing each seed match therein - - for (q=qStart,qPos=wordLen ; q<=qStop-wordLen ; q++,qPos++) - numHsps += generate_dna_ball (q, wordLen, wordLen, qPos, - scoring, ballScore, judge_qd, NULL); - - return numHsps; - } - - -static u32 private_quantum_seed_hit_search - (void) - { - u8 word[maxSeedLen+1]; - u32 matchLen = (unsigned) hitSeed->length; - u32 wordLen = (unsigned) (hitSeed->weight / 2); - u8* qStart = seq2->v; - u8* qStop = seq2->v + seq2->len; - u8* q; - unspos qPos; - u32 ix; - u32* shuffle = NULL; - u32 numHsps = 0; - - if (seq2->len < matchLen) - return 0; // (nothing to search for) - - // get shuffle list for this seed - - shuffle = seed_shuffle_list (hitSeed); - if (shuffle[0] != wordLen) - suicide ("in hsp_quantum_seed_search(), internal error"); - - for (ix=0 ; ix= T } -// -// Where -// d is a dna word -// q is the goal quantum word -// T is a similarity score threshold -// s(.,.) is the similarity score between dna and quantum words, which is -// summed over the similarity scores for individual letters -// -//---------- -// -// Arguments: -// u8* goal: The word about which the ball will be generated. -// .. 
This is a string of quantum characters, but need -// .. not be zero-terminated. -// u32 wordLen: The word length (number of characters in the word). -// u32 matchLen: The length of the match the word represents. This -// .. can be longer than wordLen when a spaced seed is -// .. being used. -// unspos goalEnd: Position the goal represents in seq2. This is -// .. the index of the first position *after* the word. -// scoreset* scoring: The alignment scoring parameters. It is assumed -// .. that a row corresponds to a DNA symbol (i.e. -// .. sequence1 is DNA). -// score ballScore: The minimum score required of a DNA word to be -// .. considered 'in' the ball surrounding the -// .. goal quantum word. -// qdjudger judger: Function to call for each DNA word in the ball, -// .. to determine if the DNA word results in any -// .. high-scoring pairs. This can be NULL if the -// .. caller just wants to count the number of DNA -// .. words in the ball. -// void* info: Additional control/arguments specific to the judger -// .. being called. -// -// Returns: -// The number of 'good' DNA words in the ball. Which words are good is -// determined by the judger function. If there is no judger all words are -// considered good. -// -//---------- - -#define maxWord 16 -#define alphabetSize 4 - -static u32 generate_dna_ball - (u8* goal, - u32 wordLen, - u32 matchLen, - unspos goalEnd, - scoreset* scoring, - score ballScore, - qdjudger judger, - void* info) - { - score minNeeded [maxWord]; // minNeeded[i] is min score needed from - // .. bases 0..i to have a chance to - // .. 
reach ballScore - s8 citizenVal[maxWord]; // (each location is -1, 0, 1, 2, or 3) - u8 citizenDna[maxWord+1]; // (each location is A, C, G, T) - u32 dnaWords, goodWords; - int ix, sym; - score symScore, maxScore, wordScore, bestScore; - - if (wordLen == 0) - suicidef ("wordLen is zero in generate_dna_ball"); - else if (wordLen > 16) - suicidef ("wordLen=%u is too large in generate_dna_ball", wordLen); - - quantum_count_stat (qWordsExamined); - - ////////// - // precompute running minimum requirement - ////////// - - // the following #if clause is a workaround for gcc bug 37861, which - // .. erroneously reports: "array subscript is above array bounds"; see - // .. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37861 - // update dec/2009: since various versions of gcc complain about this, and - // .. since wordLen>0 due to the fact that it is unsigned and has failed - // .. the wordLen==0 test above, I've commented-out all the #if stuff, and - // .. hope that the optimizer is smart enough to remove the wordLen>0 check - -//#if ((defined(__GNUC__)) && (GCC_VERSION == 40302)) - if (wordLen > 0) minNeeded[wordLen-1] = ballScore; -//#else -// minNeeded[wordLen-1] = ballScore; -//#endif - - maxScore = 0; - - for (ix=(int)wordLen-1 ; ix>=0 ; ix--) - { - bestScore = scoring->sub[bitsToSym[0]][goal[ix]]; - for (sym=1 ; symsub[bitsToSym[sym]][goal[ix]]; - if (symScore > bestScore) bestScore = symScore; - } - - if (ix > 0) minNeeded[ix-1] = minNeeded[ix] - bestScore; - maxScore += bestScore; - } - - if (quantum_dbgQuantumBall) - { - if (maxScore >= ballScore) - fprintf (stderr, "candidate: %s maxScore=" scoreFmtSimple "\n", - quantum_word_string (goal, wordLen, 3), maxScore); -#ifdef debugQuantumPartials - else - fprintf (stderr, " no ball: %s" - " maxScore=" scoreFmtSimple - "<" scoreFmtSimple "\n", - quantum_word_string (goal, wordLen, 3), - maxScore, ballScore); -#endif // debugQuantumPartials - } - - if (maxScore < ballScore) - { - quantum_count_stat 
(dWordsInBallDistrib[0]); - return 0; - } - - ////////// - // generate the ball - ////////// - - dnaWords = goodWords = 0; - - citizenDna[wordLen] = 0; - citizenVal[0] = -1; - wordScore = 0; - - ix = 0; - while (ix >= 0) - { - // subtract the score for the symbol in this position - - if (citizenVal[ix] >= 0) - wordScore -= scoring->sub[citizenDna[ix]][goal[ix]]; - - // if we've tried all symbols in this position, backtrack - - if (citizenVal[ix] == alphabetSize-1) - { ix--; continue; } - - // try the next symbol in this position - - citizenVal[ix]++; - citizenDna[ix] = bitsToSym[(u8)citizenVal[ix]]; - - // add score for this symbol, and if it's not enough, prune (and go try - // the next symbol) - - wordScore += scoring->sub[citizenDna[ix]][goal[ix]]; - if (wordScore < minNeeded[ix]) - { -#ifdef debugQuantumPartials - if (quantum_dbgQuantumBall) - fprintf (stderr, " partial: %s score=" scoreFmtSimple "\n", - quantum_word_string (citizenDna, ix+1, 3), wordScore); -#endif // debugQuantumPartials - continue; - } - - // if we don't have a full word yet, advance to the next position - - if (ix < (int)wordLen-1) - { citizenVal[++ix] = -1; continue; } - - // we have a word that occupies the 'ball'-- report it (and then go try - // the next symbol) - - dnaWords++; - - if (judger == NULL) - goodWords++; - else - { - quantum_count_stat (dWordsInBall); - goodWords += (*judger) (info, goal, (u8*) citizenVal, - wordLen, matchLen, goalEnd); - } - } - -#ifdef collect_stats - if (dnaWords < qstatMaxDnaWords-1) - quantum_count_stat (dWordsInBallDistrib[dnaWords]); - else - quantum_count_stat (dWordsInBallDistrib[qstatMaxDnaWords-1]); -#endif // collect_stats - - return goodWords; - } - -//---------- -// [[-- qdjudger function --]] -// -// judge_qd-- -// Determine whether a quantum word and a dna word lead to any high-scoring -// pairs. -// -//---------- -// -// Arguments (as per qdjudger functions): -// void* info: (not used) -// u8* qWord: The quantum word. 
-// u8* dWord: The DNA word (2 bits per letter). -// u32 wordLen: The word length. -// u32 matchLen: The length of the match the word represents. -// unspos qEnd: Position the quantum word represents in seq2. This is -// .. the index of the first position *after* the word. -// -// Returns: -// The number of seed hits found. -// -// Notes: -// We assume the scoring parameters are such that a row corresponds to a DNA -// symbol (i.e. sequence1 is DNA). -// -//---------- - -static u32 judge_qd - (arg_dont_complain(void* info), - u8* qWord, - u8* dWord, - u32 wordLen, - u32 matchLen, - unspos qEnd) - { - unspos adjStart = pt->adjStart; - u32 step = pt->step; - u32 dnaPacked; - unspos pos, pos1; - u32 ix; - u32 numHits = 0; - - // pack the dna word into a table index - - dnaPacked = 0; - for (ix=0 ; ixsub[dna[ix]][qWord[ix]]; - } - dna[wordLen] = 0; - - fprintf (stderr, " in ball: %s score=" scoreFmtSimple, - quantum_word_string (dna, wordLen, 3), dnaScore); - - if (pt->last[dnaPacked] == 0) - fprintf (stderr, " no hits\n"); - } - - // process - - if (pt->last[dnaPacked] == 0) - return 0; - - for (pos=pt->last[dnaPacked] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - pos1 = adjStart + step*pos; - quantum_count_stat (wordHits); - numHits += (*processor) (processorInfo, pos1, qEnd, matchLen); - } - - if (quantum_dbgQuantumBall) - fprintf (stderr, " %u hits\n", numHits); - -#ifdef collect_stats - if (numHits < qstatMaxHits-1) - quantum_count_stat (wordLookupDistrib[numHits]); - else - quantum_count_stat (wordLookupDistrib[qstatMaxHits-1]); -#endif // collect_stats - - return numHits; - } - -//---------- -// -// quantum_zero_stats-- -// Clear the statistics for this module. 
-// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void quantum_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&quantumStats, 0, sizeof(quantumStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// quantum_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void quantum_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - int numEntries; - int ix, lastIx; - - if (f == NULL) return; - - fprintf (f, " max sym score: " scoreFmtSimple "\n", quantumStats.maxSymScore); - fprintf (f, " min sym score: " scoreFmtSimple "\n", quantumStats.minSymScore); - fprintf (f, " max word score: " scoreFmtSimple "\n", quantumStats.maxWordScore); - fprintf (f, " ball score: " scoreFmtSimple "\n", quantumStats.ballScore); - fprintf (f, " x drop: " scoreFmtSimple "\n", quantumStats.xDrop); - fprintf (f, " hsp threshold: %s\n", score_thresh_to_string (&quantumStats.hspThreshold)); - fprintf (f, "-------------------\n"); - fprintf (f, " qWords examined: %s\n", commatize (quantumStats.qWordsExamined)); - fprintf (f, " dWords in ball: %s\n", commatize (quantumStats.dWordsInBall)); - fprintf (f, " dWords per qWord: %.2f\n", quantumStats.dWordsInBall / (float) quantumStats.qWordsExamined); - fprintf (f, " word hits: %s\n", commatize (quantumStats.wordHits)); - fprintf (f, " hits per qWord: %.2f\n", quantumStats.wordHits / (float) quantumStats.qWordsExamined); - fprintf (f, " hits per dWord: %.2f\n", quantumStats.wordHits / (float) quantumStats.dWordsInBall); - fprintf (f, "word hits extended: %s\n", commatize (quantumStats.wordHitsExtended)); - fprintf (f, " extended/hits: %.2f%%\n", 
100*quantumStats.wordHitsExtended / (float) quantumStats.wordHits); - fprintf (f, " HSPs found: %s\n", commatize (quantumStats.hspsFound)); - fprintf (f, " HSPs/hits: %.2f%%\n", 100*quantumStats.hspsFound / (float) quantumStats.wordHits); - fprintf (f, "-------------------\n"); - - fprintf (f, "dWords in ball distribution (by actual usage)\n"); - lastIx = 0; - numEntries = qstatMaxDnaWords; - for (ix=0 ; ix%2d]: %s\n", numEntries-2, commatize (quantumStats.dWordsInBallDistrib[numEntries-1])); - - fprintf (f, "-------------------\n"); - fprintf (f, "dWords per lookup distribution (by actual usage)\n"); - lastIx = 0; - numEntries = qstatMaxHits; - for (ix=0 ; ix%2d]: %s\n", numEntries-2, commatize (quantumStats.wordLookupDistrib[numEntries-1])); - -#endif // collect_stats - } - -void quantum_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { - ; // add these later if desired - } - diff --git a/programs/lastz/src/quantum.h b/programs/lastz/src/quantum.h deleted file mode 100644 index 29ffb92..0000000 --- a/programs/lastz/src/quantum.h +++ /dev/null @@ -1,126 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: quantum.h -// -//---------- - -#ifndef quantum_H // (prevent multiple inclusion) -#define quantum_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff -#include "seed_search.h" // seed hit search stuff - -// establish ownership of global variables - -#ifdef quantum_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef quantum_owner -int quantum_dbgQuantumBall = false; // true => show details of the ball of DNA - // .. 
words close to each quantum word -#else -global int quantum_dbgQuantumBall; -#endif - - -//---------- -// -// data structures and types -// -//---------- - -// quantum-to-dna judger functions-- -// Judge whether a quantum-word and dna-word comprise a high-scoring pair. -// -// Arguments: -// void* info: (pass-thru argument) -// u8* qword: The quantum word. This is a string of characters, -// .. but is *not* zero-terminated. -// u8* dword: The DNA word, with nucleotides encoded as two bits. -// u32 wordLen: The word length (number of characters in the words). -// u32 matchLen: The length of the match the word represents; this -// .. can be longer than wordLen when a spaced seed is -// .. being used. -// unspos qEnd: Position the quantum word represents in some -// sequence. This is the index of the first position -// *after* the word. -// -// Returns: -// The number of HSP's derived from this pair. - -typedef u32 (*qdjudger) (void*, u8*, u8*, u32, u32, unspos); - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - score maxSymScore; - score minSymScore; - score maxWordScore; - score ballScore; - score xDrop; - sthresh hspThreshold; - u32 qWordsExamined; - u32 dWordsInBall; - u32 wordHits; - u32 wordHitsExtended; - u32 hspsFound; - u32 dWordsInBallDistrib[22]; - u32 wordLookupDistrib[152]; - } quantumStats; - -#define qstatMaxDnaWords entriesof(quantumStats.dWordsInBallDistrib) -#define qstatMaxHits entriesof(quantumStats.wordLookupDistrib) - -// stats macros - -#define quantum_count_stat(field) ++quantumStats.field -#define quantum_uncount_stat(field) --quantumStats.field -#define quantum_set_stat(field,val) (quantumStats.field = val) -#define quantum_add_stat(field,val) (quantumStats.field += val) -#else -#define quantum_count_stat(field) -#define quantum_uncount_stat(field) -#define quantum_set_stat(field,val) -#define quantum_add_stat(field,val) -#endif // collect_stats - -// prototypes for 
stats routines - -void quantum_zero_stats (void); -void quantum_show_stats (FILE* f); -void quantum_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in quantum.c -// -//---------- - -u32 quantum_seed_hit_search (seq* seq1, postable* pt, - seq* seq2, unspos start, unspos end, - const s8 charToBits[], seed* hitSeed, - scoreset* scoring, score ballScore, - hitprocessor processor, void* processorInfo); -void free_quantum_search (void); - -#undef global -#endif // quantum_H diff --git a/programs/lastz/src/sam.c b/programs/lastz/src/sam.c deleted file mode 100755 index 1c46a86..0000000 --- a/programs/lastz/src/sam.c +++ /dev/null @@ -1,792 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: sam.c -// -//---------- -// -// sam-- -// Support for printing alignments in SAM format. -// -// SAM format is a pairwise alignment format designed for short read alignments. -// A spec can be found at samtools.sourceforge.net/SAM1.pdf. 
-//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C variable argument list stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "identity_dist.h" // identity distribution "format" stuff -#include "coverage_dist.h" // query coverage distribution stuff - -#define sam_owner // (make this the owner of its globals) -#include "sam.h" // interface to this module - -// SAM bit-encoded flags - -#define BAM_FPAIRED 1 // the read is paired in sequencing, no matter whether it is mapped in a pair -#define BAM_FPROPER_PAIR 2 // the read is mapped in a proper pair -#define BAM_FUNMAP 4 // the read itself is unmapped; conflictive with BAM_FPROPER_PAIR -#define BAM_FMUNMAP 8 // the mate is unmapped -#define BAM_FREVERSE 16 // the read is mapped to the reverse strand -#define BAM_FMREVERSE 32 // the mate is mapped to the reverse strand -#define BAM_FREAD1 64 // this is read1 -#define BAM_FREAD2 128 // this is read2 -#define BAM_FSECONDARY 256 // not primary alignment -#define BAM_FQCFAIL 512 // QC failure -#define BAM_FDUP 1024 // optical or PCR duplicate - -//---------- -// -// prototypes for private functions -// -//---------- - -static void print_query_bases (FILE* f, seq* seq2, unspos pos2, unspos length, - int softMasked); -static void print_query_quals (FILE* f, seq* seq2, unspos pos2, unspos length, - int softMasked); - -//---------- -// -// sam_rg_tags-- -// Parse a readgroup string and extract the tags that should be attached to -// each read. -// -//---------- -// -// Arguments: -// char* readGroup: The readgroup string (e.g. "ID:TRWFT SM:BGDNCSA32"). -// char** errorText: Place to return any error text. The string pointer -// .. 
returned here is local to THIS routine, and does -// .. not need to be deallocated. This pointer may be -// .. NULL, in which case no error text is returned. -// -// Returns: -// A pointer to the RG tags, allocated from the heap. The caller is -// responsible for deallocating that string. If there is any error in parsing -// the string, NULLL is returned and the explaination of the error is copied -// to errorText (unless it is NULL). -// -//---------- - -char* sam_rg_tags - (char* readGroup, - char** errorText) - { - char* idTag; - int idLen; - u32 bytesNeeded; - char* s, *ss; - - ////////// - // parse and validate the important fields - ////////// - - idTag = find_tabbed_tag (readGroup, "ID"); - if (idTag == NULL) - { - if (errorText != NULL) *errorText = "ID is a required field"; - return NULL; - } - idLen = tabbed_tag_length (idTag); - if (idLen <= 3) - { - if (errorText != NULL) *errorText = "ID field cannot be empty"; - return NULL; - } - - // nota bene: SAM spec 0.1.2-draft 20090820 specified SM as a required - // tag. But SAM spec 1.4 shows it as an optional field. 
- - //smTag = find_tabbed_tag (readGroup, "SM"); - //if (smTag == NULL) - // { - // if (errorText != NULL) *errorText = "SM is a required field"; - // return NULL; - // } - //smLen = tabbed_tag_length (smTag); - //if (smLen <= 3) - // { - // if (errorText != NULL) *errorText = "SM field cannot be empty"; - // return NULL; - // } - - // nota bene: SAM spec 1.4 appears to indicate that LB and PU tags don't - // have to be propagated from the RG header line to alignment - // records - - //lbTag = find_tabbed_tag (readGroup, "LB"); - //lbLen = 0; // (placate complier) - //if (lbTag != NULL) - // { - // lbLen = tabbed_tag_length (lbTag); - // if (lbLen <= 3) - // { - // if (errorText != NULL) *errorText = "LB field cannot be empty"; - // return NULL; - // } - // } - - //puTag = find_tabbed_tag (readGroup, "PU"); - //puLen = 0; // (placate complier) - //if (puTag != NULL) - // { - // puLen = tabbed_tag_length (puTag); - // if (puLen <= 3) - // { - // if (errorText != NULL) *errorText = "PU field cannot be empty"; - // return NULL; - // } - // } - - ////////// - // collect the ID, LB and PU fields into a single string - ////////// - - bytesNeeded = idLen + 1; - //if (lbTag != NULL) bytesNeeded += 1 + lbLen; - //if (puTag != NULL) bytesNeeded += 1 + puLen; - - s = ss = malloc_or_die ("sam_rg_tags", bytesNeeded); - strncpy (/*to*/ ss, /*from*/ idTag, idLen); - ss += idLen; - - //if (lbTag != NULL) - // { - // *(ss++) = '\t'; - // strncpy (/*to*/ ss, /*from*/ lbTag, lbLen); - // ss += lbLen; - // } - - //if (puTag != NULL) - // { - // *(ss++) = '\t'; - // strncpy (/*to*/ ss, /*from*/ puTag, puLen); - // ss += puLen; - // } - - *ss = 0; - - if (errorText != NULL) *errorText = NULL; - return s; - } - -//---------- -// -// print_sam_job_header-- -// Print sam format job header. 
-// -//---------- - -static int headerPrinted = false; - -void print_sam_job_header - (FILE* f, - char* readGroup) - { - fprintf (f, "@HD\tVN:1.0\tSO:unsorted\n"); - if (readGroup != NULL) - fprintf (f, "@RG\t%s\n", readGroup); - headerPrinted = false; - } - -//---------- -// -// print_sam_header-- -// Print sam format query header. -// -//---------- - -void print_sam_header - (FILE* f, - seq* seq1, - arg_dont_complain(seq* seq2)) - { - seqpartition* sp1 = &seq1->partition; - partition* p; - u32 ix; - char* name1; - - if (headerPrinted) return; - - // if seq1 is not partitioned, just print the (single) header name - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - fprintf (f, "@SQ\tSN:%s\tLN:" unsposFmt "\n", - name1, seq1->trueLen); - } - - // otherwise, seq1 is partitioned, so print the header name for each - // partition - - else - { - p = sp1->p; - - for (ix=0 ; ixlen ; ix++) - fprintf (f, "@SQ\tSN:%s\tLN:" unsposFmt "\n", - &sp1->pool[p[ix].header], p[ix].trueLen); - } - - headerPrinted = true; - } - -//---------- -// -// print_sam_align_list-- -// Print a list of gapped alignments in sam format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. -// int softMasked: true => sequence ends should be soft masked -// false => sequence ends should be hard masked -// char* rgTags: Additional tags for RG (read group). This is a -// .. valid string per the format described in the SAM -// .. spec. This may be NULL. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_sam_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int softMasked, - char* rgTags) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - { - print_sam_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s, softMasked, rgTags); - } - - } - -//---------- -// -// print_sam_align-- -// Print a single gapped alignment in sam format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment takes -// .. in the DP matrix. -// score s: The alignment's score. -// int softMasked: true => sequence ends should be soft masked -// false => sequence ends should be hard masked -// char* rgTags: Additional tags for RG (read group). This is a -// .. valid string per the format described in the SAM -// .. spec. This may be NULL. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_sam_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - arg_dont_complain(score s), - int softMasked, - char* rgTags) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - unspos height, width, i, j, prevI, prevJ, run; - u32 opIx; - unspos len2; - char* name1, *name2; - unspos offset1, offset2, start1, start2; - unspos startLoc1, startLoc2; - unspos seq2Len, seq2True; - int flag; - char maskCh; - unspos preMask, postMask, tmp; - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_sam_align"); - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - len2 = width = end2 - beg2 + 1; - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1-1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - startLoc2 = seq2->startLoc; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2-1); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - startLoc2 = part->startLoc; - } - - ////////// - // print sam line (field names indicate below are per sam spec) - ////////// - - start1 = beg1-1 - offset1 + startLoc1; - - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = beg2-1 - offset2 + startLoc2; - end2 = start2-1 + len2; - flag = 0; - } - else - { - start2 = startLoc2 + offset2 + (seq2Len - beg2) - (len2-1); - end2 = startLoc2 + offset2 + (seq2Len - beg2); - flag = BAM_FREVERSE; - } - - // print qname, flag, rname, pos and mapq - - fprintf (f, "%s\t%d\t%s\t" unsposFmt "\t%d\t", - name2, flag, name1, start1, 255); - - // print cigar - - maskCh = (softMasked)? 
'S' : 'H'; - - preMask = postMask = 0; - if (start2 > 1) preMask = start2 - 1; - if (end2 < seq2True) postMask = seq2True - end2; - if ((seq2->revCompFlags & rcf_rev) != 0) - { tmp = preMask; preMask = postMask; postMask = tmp; } - - if (preMask != 0) fprintf (f, unsposFmt "%c", preMask, maskCh); - - opIx = 0; - for (i=j=0 ; (i< height)||(j 0) - { - fprintf (f, unsposFmt "M", run); - i += run; j += run; - } - - if ((i < height) || (j < width)) - { - prevI = i; prevJ = j; - edit_script_indel_len (script, &opIx, &i, &j); - if (i > prevI) - fprintf (f, unsposFmt "D", i - prevI); - if (j > prevJ) - fprintf (f, unsposFmt "I", j - prevJ); - } - } - - if (postMask != 0) fprintf (f, unsposFmt "%c", postMask, maskCh); - - // print mrnm, mpos, and isize - - fprintf (f, "\t%s\t%d\t%d\t", "*", 0, 0); - - // print seq (data from sequence 2) - - print_query_bases (f, seq2, beg2-1, len2, softMasked); - - // print qual (if we have no qual data, we print "*") - - if (seq2->vq == NULL) - fprintf (f, "\t%s", "*"); - else - { - fprintf (f, "\t"); - print_query_quals (f, seq2, beg2-1, len2, softMasked); - } - - // print tags - - if (rgTags != NULL) - fprintf (f, "\t%s", rgTags); - - fprintf (f, "\n"); - } - -//---------- -// -// print_sam_match-- -// Print an hsp in sam format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the -// .. match (origin-0). -// seq* seq2: Another sequence. -// unspos pos1: The position, in seq2, of first character in the -// .. match (origin-0). -// unspos length: The number of nucleotides in the HSP. -// score s: The HSP's score. -// int softMasked: true => sequence ends should be soft masked -// false => sequence ends should be hard masked -// char* rgTags: Additional tags for RG (read group). This is a valid -// .. string per the format described in the SAM spec. -// .. This may be NULL. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_sam_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - arg_dont_complain(score s), - int softMasked, - char* rgTags) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - char* name1, *name2; - unspos offset1, offset2, start1, start2, end2; - unspos startLoc1, startLoc2; - unspos seq2Len, seq2True; - int flag; - char maskCh; - unspos preMask, postMask, tmp; - - if (seq1->revCompFlags != rcf_forward) - suicide ("attempt to print - strand or complement for sequence 1 in print_sam_match"); - - ////////// - // figure out position offsets and names - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; - if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - name1 = &sp1->pool[part->header]; - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - offset2 = 0; - seq2Len = seq2->len; - seq2True = seq2->trueLen; - startLoc2 = seq2->startLoc; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - name2 = &sp2->pool[part->header]; - offset2 = part->sepBefore + 1; - seq2Len = part->sepAfter - offset2; - seq2True = part->trueLen; - startLoc2 = part->startLoc; - } - - ////////// - // print sam line (field names indicate below are per sam spec) - ////////// - - start1 = pos1 - offset1 + startLoc1; - - if ((seq2->revCompFlags & rcf_rev) == 0) - { - start2 = pos2 - offset2 + startLoc2; - end2 = start2-1 + length; - flag = 0; - } - else - { - start2 = startLoc2 + offset2 + (seq2Len - pos2) - length; - end2 = startLoc2 + offset2 + (seq2Len - pos2) - 1; - flag = BAM_FREVERSE; - } - - // print qname, flag, rname, pos and mapq (field names per sam spec) - - fprintf (f, "%s\t%d\t%s\t" unsposFmt "\t%d\t", - name2, flag, name1, start1, 255); - - // print cigar - - maskCh = (softMasked)? 
'S' : 'H'; - - preMask = postMask = 0; - if (start2 > 1) preMask = start2 - 1; - if (end2 < seq2True) postMask = seq2True - end2; - if ((seq2->revCompFlags & rcf_rev) != 0) - { tmp = preMask; preMask = postMask; postMask = tmp; } - - if (preMask != 0) fprintf (f, unsposFmt "%c", preMask, maskCh); - fprintf (f, unsposFmt "M", length); - if (postMask != 0) fprintf (f, unsposFmt "%c", postMask, maskCh); - - // print mrnm, mpos, and isize - - fprintf (f, "\t%s\t%d\t%d\t", "*", 0, 0); - - // print seq (data from sequence 2) - - print_query_bases (f, seq2, pos2, length, softMasked); - - // print qual (if we have no qual data, we print "*") - - if (seq2->vq == NULL) - fprintf (f, "\t%s", "*"); - else - { - fprintf (f, "\t"); - print_query_quals (f, seq2, pos2, length, softMasked); - } - - // print rgTags - - if (rgTags != NULL) - fprintf (f, "\t%s", rgTags); - - fprintf (f, "\n"); - } - -//---------- -// -// print_query_bases-- -// Print the "seq" field for sam format. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq2: The query sequence. -// unspos pos1: The position, in seq2, of first character in the -// .. match (origin-0). -// unspos length: The number of nucleotides in the HSP. -// int softMasked: true => sequence ends should be soft masked -// false => sequence ends should be hard masked -// -// Returns: -// (nothing) -// -//---------- - -static void print_query_bases - (FILE* f, - seq* seq2, - unspos pos2, - unspos length, - int softMasked) - { - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s2 = seq2->v + pos2; - u8* ss2; - unspos offset2, start2, end2; - unspos startLoc2; - unspos seq2True; - unspos ix; - - ////////// - // figure out position offsets, etc. 
- ////////// - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2True = part->trueLen; - } - - ////////// - // print seq (data from sequence 2) - ////////// - - start2 = pos2 - offset2 + startLoc2; - end2 = start2-1 + length; - - if ((softMasked) && (start2 > 1)) - { - ss2 = seq2->v + pos2 - (start2-1); - for (ix=0 ; ix sequence ends should be soft masked -// false => sequence ends should be hard masked -// -// Returns: -// (nothing) -// -//---------- - -static void print_query_quals - (FILE* f, - seq* seq2, - unspos pos2, - unspos length, - int softMasked) - { - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s2 = seq2->vq + pos2; - u8* ss2; - unspos offset2, start2, end2; - unspos startLoc2; - unspos seq2True; - unspos ix; - - ////////// - // figure out position offsets, etc. 
- ////////// - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - offset2 = 0; - startLoc2 = seq2->startLoc; - seq2True = seq2->trueLen; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - seq2True = part->trueLen; - } - - ////////// - // print qual (qualities from sequence 2) - ////////// - - start2 = pos2 - offset2 + startLoc2; - end2 = start2-1 + length; - - if ((softMasked) && (start2 > 1)) - { - ss2 = seq2->vq + pos2 - (start2-1); - for (ix=0 ; ix // standard C i/o stuff -#include // standard C variable argument list stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -//---------- -// -// prototypes for routines in sam.c -// -//---------- - -char* sam_rg_tags (char* readGroup, char** errorText); -void print_sam_job_header (FILE* f, char* readGroup); -void print_sam_header (FILE* f, seq* seq1, seq* seq2); -void print_sam_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, - int softMasked, char* rgTags); -void print_sam_align (FILE* f, - seq* seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s, - int softMasked, char* rgTags); -void print_sam_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, int softMasked, char* rgTags); - -#endif // sam_H diff --git a/programs/lastz/src/seed_search.c b/programs/lastz/src/seed_search.c deleted file mode 100755 index a391ae4..0000000 --- a/programs/lastz/src/seed_search.c +++ /dev/null @@ -1,4245 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: seed_search.c -// -//---------- -// -// seed_search-- -// Support for finding "seed hits" in genomic sequences. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C math stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed matching stuff -#include "pos_table.h" // position table stuff -#include "diag_hash.h" // diagonals hashing stuff -#include "segment.h" // segment table management stuff - -#define seed_search_owner // (make this the owner of its globals) -#include "seed_search.h" // interface to this module - -// debugging defines - -//#define debugDiag 97-60724168 // if defined, breakdown what happens with - // .. every seed hit on this pos1-pos2 diagonal - -//#define debugResolvingSeeds // if defined, show how partial seed hits are -//#define debugPos1 49615 // .. resolved for this seed hit (starting -//#define debugPos2 1345896 // .. positions of the hit) - -#ifdef debugDiag // if we are debugging a particular diagonal, -static int debugThisDiag; // .. debugThisDiag will be set true only on -#else // .. that diagonal; otherwise it is set false -#define debugThisDiag false // .. either at run-time or compile-time -#endif - -#ifdef debugResolvingSeeds // if we are debugging a particular seed hit, -static int debugThisHit; // .. debugThisHit will be set true only on -#else // .. that hit; otherwise it is set false -#define debugThisHit false // .. either at run-time or compile-time -#endif - -//#define debugSearchPos2 2412 // if defined, breakdown what happens with this - // .. position in sequence 2 (this is the right - // .. end of the word; the left end is at - // .. X-(seedLength-1) counting from 1 at the - // .. 
start of the sequence - -// other debugging defines - -//#define debugMismatchExtend // if defined, show process of mismatch_extend_seed_hit - -//#define snoopDiagHash // if this is defined, extra code is added to - // .. report on diag hash collisions - -//#define snoopXDrop // if this is defined, extra code is added to - // .. report on x-drop extensions - -//#define snoopEntropy // if this is defined, extra code is added to - // .. report entropy calculations - -//#define snoopHspSubrange // if this is defined, extra code is added to - // .. report situations where an HSP subrange - // .. scores higher than the HSP -//#define extendHspFromLeft // if this is defined, HSP extension is from - // .. the left edge of the hit, not the right - -//#define densityFiltering // if this is defined, we perform alignment - // .. density filtering; note that, as of this - // .. writing, it has not been shown that - // .. density filtering provides any benefit - -//#define snoopPosFilter // if this is defined, extra code is added to - // .. filter_seed_hit_by_pos to report whether - // .. or not each seed was discarded - -//#define snoopBelowDiagonal // if this is defined, extra code is added to - // .. seed_hit_below_diagonal, for debugging - -//#define snoopReporterCalls // if this is defined, extra code is added to - // .. to track calls to the seed hit reporter - // .. function - -#ifdef densityFiltering -#define densityCheckDepth2 // if this is defined, we check for density - // .. filtering at depth 2 - -#define densityCheckDepth3 // if this is defined, we check for density - // .. 
filtering at depth 3 -#endif // densityFiltering - -//---------- -// -// stats to augment crude profiling -// -//---------- - -#ifndef dbgTiming -#define dbg_timing_set_stat(field,val) ; -#define dbg_timing_count_stat(field) ; -#define dbg_timing_report_stat(field,name) ; -#define dbg_timing_report_big_stat(field,name) ; -#endif // not dbgTiming - -#ifdef dbgTiming -struct - { - int64 ungappedExtensions; - int hsps; - } seedSearchTimingStats; - -#define dbg_timing_set_stat(field,val) (seedSearchTimingStats.field = val) -#define dbg_timing_count_stat(field) ++seedSearchTimingStats.field -#define dbg_timing_report_stat(field,name) fprintf(stderr,"%-26s %d\n", name":",seedSearchTimingStats.field) -#define dbg_timing_report_big_stat(field,name) fprintf(stderr,"%-26s %" PRId64 "\n",name":",seedSearchTimingStats.field) -#endif // dbgTiming - -//---------- -// -// private data -// -//---------- - -// private globals shared by all the routines under the umbrella of -// seed_hit_search() - -static seq* seq1; -static postable* pt; -static seq* seq2; -static unspos start; -static unspos end; -static int selfCompare; -static int sameStrand; // (only meaningful if selfCompare is true) -static const s8* upperCharToBits; -static seed* hitSeed; -static u32 searchLimit; -static u32 reportSearchLimit; -static s32 searchToGo; -#ifdef densityFiltering -static u64 maxBasesAllowed; -#endif // densityFiltering -static hitprocessor processor; -static void* processorInfo; - -static int unblockedLeftExtension; // true => seed extension routines - // .. (e.g. xdrop_extend_seed_hit) shouldn't - // .. 
block left-extension at previous diagEnd - -// static data area for discovery_probability - -static u32 foldedSize = 0; -static u8* foldedHits = NULL; - -//---------- -// -// prototypes for private functions -// -//---------- - -static u64 private_hit_search (void); -static u64 private_hit_search_halfweight (void); -static u64 private_hit_search_resolve (void); - -static u64 find_table_matches (u32 packed2, unspos pos2); -static u64 find_table_matches_resolve (u32 packed2, unspos pos2, - u32 unpacked2, int transAllowed); - -static int seed_hit_below_diagonal (unspos pos1, unspos pos2); -static int filter_seed_hit_by_pos (hitprocinfo* hp, - unspos pos1, unspos pos2, unspos length); -static int filter_seed_hit_by_subs (hitprocinfo* hp, - unspos pos1, unspos pos2, unspos length); -static score xdrop_extend_seed_hit (hitprocinfo* hp, - unspos* pos1, unspos* pos2, unspos* length); -static score match_extend_seed_hit (hitprocinfo* hp, - unspos* pos1, unspos* pos2, unspos* length); -static score mismatch_extend_seed_hit(hitprocinfo* hp, - unspos* pos1, unspos* pos2, unspos* length); -static void warn_for_search_limit (void); -static void dump_raw_hit (FILE* f, unspos pos1, unspos pos2); - -#ifdef debugDiag - -static void dump_extended_match (FILE* f, - seq* seq1, seq* seq2, sgnpos diag, - u8* p1, u8* p2, u8* p3, - u8* p4, u8* p5, u8* p6); -static char* pair_diagonal_as_text (unspos pos1, unspos pos2); -static char* diagonal_as_text (sgnpos diag); - -#endif // debugDiag - -#if ((defined snoopDiagHash) && (!defined debugDiag)) -static char* pair_diagonal_as_text (unspos pos1, unspos pos2); -static char* diagonal_as_text (sgnpos diag); -#endif // snoopDiagHash && not debugDiag - -#if ((defined snoopXDrop) && (!defined debugDiag)) -static char* pair_diagonal_as_text (unspos pos1, unspos pos2); -static char* diagonal_as_text (sgnpos diag); -#endif // snoopXDrop && not debugDiag - -#if (defined snoopXDrop) -static char* display_sequence_character (seq* _seq, u8 ch); -#endif // 
snoopXDrop - -//---------- -// -// seed_hit_search-- -// Search for seed hits between one sequence and another. -// -// The caller must already have built a table of seed-word positions in one of -// the sequences. -// -//---------- -// -// Arguments: -// seq* seq1: The sequence being searched. -// postable* pt: A table of positions of words in seq1. -// seq* seq2: The sequence being searched for. -// unspos start: First sequence position to consider. Zero is -// .. the first possible position. -// unspos end: One past the last sequence position to consider. -// .. If this is zero, the sequence length is used. -// int selfCompare: true => seq1 and seq2 are the same sequence. -// s8 upperCharToBits[]: Table to map sequence characters to two-bit -// .. values,and illegal characters to -1. -// seed* hitSeed: The seed-word the table is based on. -// u32 searchLimit: The maximum number of "HSPs" allowed; zero -// .. indicates "no limit". See note (3) below. -// u32 reportSearchLimit: The number to report (to the user) as the -// .. search limit if searchLimit is reached. Note -// .. that searchLimit is per-search, whereas -// .. reportSearchLimit is per-query. The special -// .. value of zero indicates that we should not -// .. report this condition. -// double maxDensity: (only if densityFiltering is #defined) -// The maximum alignment density we will "allow" -// .. before discarding a query sequence; zero -// .. means there is no limit (see note 2 below). -// hitprocessor processor: Function to call for each hit to determine if it -// .. is 'good enough'. -// void* processorInfo: A value to pass thru with each call to processor. -// -// Returns: -// The number of bases in the seed hits (see note 2 below). -// -//---------- -// -// Notes: -// -// (1) This routine allocates and reuses memory via global pointers. The -// caller should make a call to free_seed_hit_search() to de-allocate this -// memory, after all searches are complete. 
-// -// (2) If the density limit is exceeded, the value u64max is returned. It is -// possible (in fact, a near certainty) that the processor (and associated -// reporter) function will have been called. It is up the the caller to -// dispose of any seed hits already reported. -// -// (3) searchLimit is not a hard limit, and there are many circumstances by -// which we will report more seeds/hits/HSPs/alignments than the limit. -// This allows us to avoid checking the limit after each and every hit. -// The limit should be taken as giving us the permission to stop once we -// have found that many hits. -// -//---------- - -#ifndef debugSearchPos2 -#define debugSearchPos2_1 ; -#define debugSearchPos2_2 ; -#define debugSearchPos2_3 ; -#define debugSearchPos2_4 ; -#endif // not debugSearchPos2 - -#ifdef debugSearchPos2 - -#define debugSearchPos2_1 \ - if (pos2 == debugSearchPos2) \ - printf ("checking %s at seq 2 pos " unsposFmt " (matches)\n", \ - seed_packed_to_string (hitSeed, packed), pos2); - -#define debugSearchPos2_2 \ - if (pos2 == debugSearchPos2) \ - printf ("checking %s at seq 2 pos " unsposFmt " (one transition)\n", \ - seed_packed_to_string (hitSeed, packedTrans), pos2); - -#define debugSearchPos2_3 \ - if (pos2 == debugSearchPos2) \ - printf ("checking %s at seq 2 pos " unsposFmt " (one transition)\n", \ - seed_packed_to_string (hitSeed, packedTrans), pos2); - -#define debugSearchPos2_4 \ - if (pos2 == debugSearchPos2) - printf ("checking %s at seq 2 pos " unsposFmt " (two transitions)\n", - seed_packed_to_string (hitSeed, packedTrans), pos2); - -#endif // debugSearchPos2 - - -u64 seed_hit_search - (seq* _seq1, - postable* _pt, - seq* _seq2, - unspos _start, - unspos _end, - int _selfCompare, - const s8 _upperCharToBits[], - seed* _hitSeed, - u32 _searchLimit, - u32 _reportSearchLimit, -#ifdef densityFiltering - double _maxDensity, -#endif // densityFiltering - hitprocessor _processor, - void* _processorInfo) - { - u64 basesHit; - seqpartition* sp2; - 
char* name2; - char strand2; - - // sanity check - - if (_end == 0) - _end = _seq2->len; - - if (_end <= _start) - suicidef ("in seed_hit_search(), interval is void (%d-%d)", - _start, _end); - - if (_end > _seq2->len) - suicidef ("in seed_hit_search(), interval end is bad (%d>%d)", - _end, _seq2->len); - - // allocate (or re-use) memory - - empty_diag_hash (); - - // pass globals to the rest of the search - // note: this makes this module non-threadsafe - - seq1 = _seq1; - pt = _pt; - seq2 = _seq2; - start = _start; - end = _end; - selfCompare = _selfCompare; - sameStrand = (selfCompare) && (seq1->revCompFlags == seq2->revCompFlags); - upperCharToBits = _upperCharToBits; - hitSeed = _hitSeed; - searchLimit = _searchLimit; - reportSearchLimit = _reportSearchLimit; - searchToGo = _searchLimit; -#ifdef densityFiltering - maxBasesAllowed = _maxDensity * seq2->len; -#endif // densityFiltering - processor = _processor; - processorInfo = _processorInfo; - - seed_search_set_stat (withTrans, hitSeed->withTrans); - seed_search_set_stat (searchLimit, searchLimit); - - // perform the search - - if (hitSeed->isHalfweight) basesHit = private_hit_search_halfweight (); - else if (pt->asBits != NULL) basesHit = private_hit_search_resolve (); - else basesHit = private_hit_search (); - - // cleanup - - if (foldedHits != NULL) - { free_if_valid ("folded hits", foldedHits); foldedHits = NULL; } - - if ((seed_search_dbgShowCoverage) && (basesHit > 0)) - { - sp2 = &seq2->partition; - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - } - else // sequence 2 is partitioned - name2 = "seq2"; - - strand2 = ((seq2->revCompFlags & rcf_rev) == 0)? 
'+' : '-'; - - printf ("# seed bases hit in %s%c: " u64Fmt, name2, strand2, basesHit); -#ifdef densityFiltering - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - printf (" (rejected)"); -#endif // densityFiltering - printf ("\n"); - } - -#ifdef densityFiltering - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - { - if (seed_search_dbgShowRejections) - { - sp2 = &seq2->partition; - if (sp2->p == NULL) // sequence 2 is not partitioned - { - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; - } - else // sequence 2 is partitioned - name2 = "seq2"; - - strand2 = ((seq2->revCompFlags & rcf_rev) == 0)? '+' : '-'; - - fprintf (stderr, "%s%c rejected due to excessive hsp density\n", - name2, strand2); - } - - return u64max; - } -#endif // densityFiltering - - return basesHit; - } - - -void free_seed_hit_search (void) { free_diag_hash (); } - - -// private_hit_search-- seed requires two bits per unpacked bp -// -// nota bene: I had expected that telling the compiler that the sequence is -// not changing (by declaring vars as follows) would produce faster -// code, but it was actually slightly slower. 
-// -// const u8* const qStart = seq2->v + start; -// const u8* const qStop = seq2->v + end; -// const u8* q; - -static u64 private_hit_search (void) - { - int seedLength; - u8* qStart = seq2->v + start; - u8* qStop = seq2->v + end; - u8* q; - u64 w; - s32 ww; - u32 packed, packedTrans; - int nts; - unspos pos2; - u32* f1, *f2; - u64 basesHit = 0; -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - u64 prevRawHits, hitsInColumn; -#endif // collect_stats && maxHitsPerColumn - - seedLength = hitSeed->length; - - if (seedLength < 2) - suicidef ("seed length must be at least two (yours is %d)", seedLength); - - if (seq2->len < (unsigned) seedLength) - return 0; // (nothing to search for) - - // scan the sequence, processing each seed match - - for (q=qStart ; q start over - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; q start over - w = (w << 2) | ww; // append next nt - - pos2 = q-seq2->v + 1; - packed = apply_seed (hitSeed, w); // extract seed bits - seed_search_count_stat (wordsInSequence); - - // generate seed hits for the complete seed match - -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - prevRawHits = seedSearchStats.rawSeedHits; -#endif // collect_stats && maxHitsPerColumn - debugSearchPos2_1; - basesHit += find_table_matches (packed, pos2); - - // generate seed hits for all seed matches with 1 or 2 transitions - - if (hitSeed->withTrans == 1) - { - for (f1=hitSeed->transFlips ; *f1!=0 ; f1++) - { - packedTrans = packed ^ (*f1); - debugSearchPos2_2; - basesHit += find_table_matches (packedTrans, pos2); - } - } - else if (hitSeed->withTrans >= 2) - { - for (f1=hitSeed->transFlips ; *f1!=0 ; f1++) - { - packedTrans = packed ^ (*f1); - debugSearchPos2_3; - basesHit += find_table_matches (packedTrans, pos2); - for (f2=f1+1 ; *f2!=0 ; f2++) - { - packedTrans = packed ^ (*f1) ^ (*f2); - debugSearchPos2_4; - basesHit += find_table_matches (packedTrans, pos2); - } - } - } - - if 
((searchLimit > 0) && (searchToGo < 0)) - { warn_for_search_limit (); return basesHit; } - -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - hitsInColumn = seedSearchStats.rawSeedHits - prevRawHits; - if (hitsInColumn <= maxHitsPerColumn) - seedSearchStats.hitsPerColumn[hitsInColumn]++; - else - { - seedSearchStats.hitsPerColumn[maxHitsPerColumn+1]++; - if (hitsInColumn > seedSearchStats.mostHitsInColumn) - seedSearchStats.mostHitsInColumn = hitsInColumn; - } -#endif // collect_stats && maxHitsPerColumn - -#ifdef densityCheckDepth2 - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - return basesHit; -#endif // densityCheckDepth2 - } - } - - return basesHit; - } - - -// private_hit_search_halfweight-- seed requires one bit per unpacked bp - -static u64 private_hit_search_halfweight (void) - { - int seedLength; - u8* qStart = seq2->v + start; - u8* qStop = seq2->v + end; - u8* q; - u64 w; - s32 ww; - u32 packed; - int nts; - unspos pos2; - u64 basesHit = 0; -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - u64 prevRawHits, hitsInColumn; -#endif // collect_stats && maxHitsPerColumn - - seedLength = hitSeed->length; - - if (seedLength < 2) - suicidef ("seed length must be at least two (yours is %d)", seedLength); - - if (seq2->len < (unsigned) seedLength) - return 0; // (nothing to search for) - - // scan the sequence, processing each seed match - - for (q=qStart ; q start over - w = (w << 1) | (ww & 1); // append next R/Y - } - - // process each word of seedLength nucleotides - - for ( ; q start over - w = (w << 1) | (ww & 1); // append next R/Y - - pos2 = q-seq2->v + 1; - packed = apply_seed (hitSeed, w); // extract seed bits - seed_search_count_stat (wordsInSequence); - - // generate seed hits for seed match - -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - prevRawHits = seedSearchStats.rawSeedHits; -#endif // collect_stats && maxHitsPerColumn - basesHit += find_table_matches (packed, pos2); - if ((searchLimit > 0) 
&& (searchToGo < 0)) - { warn_for_search_limit (); return basesHit; } - -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - hitsInColumn = seedSearchStats.rawSeedHits - prevRawHits; - if (hitsInColumn <= maxHitsPerColumn) - seedSearchStats.hitsPerColumn[hitsInColumn]++; - else - { - seedSearchStats.hitsPerColumn[maxHitsPerColumn+1]++; - if (hitsInColumn > seedSearchStats.mostHitsInColumn) - seedSearchStats.mostHitsInColumn = hitsInColumn; - } -#endif // collect_stats && maxHitsPerColumn - -#ifdef densityCheckDepth2 - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - return basesHit; -#endif // densityCheckDepth2 - } - } - - return basesHit; - } - - -// private_hit_search_resolve-- full seed requires two bits per unpacked bp, -// .. but not all seed bits are in the table, so -// .. the remaining bits must be resolved by direct -// .. comparison to sequence1 - -static u64 private_hit_search_resolve (void) - { - int seedLength; - int transAllowed = hitSeed->withTrans; - u8* qStart = seq2->v + start; - u8* qStop = seq2->v + end; - u8* q; - u64 w; - s32 ww; - u32 packed, packedTrans; - int nts; - unspos pos2; - u32* f1, *f2; - u64 basesHit = 0; -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - u64 prevRawHits, hitsInColumn; -#endif // collect_stats && maxHitsPerColumn - - seedLength = hitSeed->length; - - if (seedLength < 2) - suicidef ("seed length must be at least two (yours is %d)", seedLength); - - if (seq2->len < (unsigned) seedLength) - return 0; // (nothing to search for) - - // scan the sequence, processing each seed match - - for (q=qStart ; q start over - w = (w << 2) | ww; // append next nt - } - - // process each word of seedLength nucleotides - - for ( ; q start over - w = (w << 2) | ww; // append next nt - - pos2 = q-seq2->v + 1; - packed = apply_seed (hitSeed, w); // extract seed bits - seed_search_count_stat (wordsInSequence); - - // generate seed hits for the complete seed match - -#if ((defined collect_stats) && 
(defined maxHitsPerColumn)) - prevRawHits = seedSearchStats.rawSeedHits; -#endif // collect_stats && maxHitsPerColumn - basesHit += find_table_matches_resolve (packed, pos2, - w, transAllowed); - // generate seed hits for all seed matches with 1 or 2 transitions - - if (transAllowed == 1) - { - for (f1=hitSeed->transFlips ; *f1!=0 ; f1++) - { - packedTrans = packed ^ (*f1); - basesHit += find_table_matches_resolve (packedTrans, pos2, - w, 0); - } - } - else if (transAllowed >= 2) - { - for (f1=hitSeed->transFlips ; *f1!=0 ; f1++) - { - packedTrans = packed ^ (*f1); - basesHit += find_table_matches_resolve (packedTrans, pos2, - w, 1); - for (f2=f1+1 ; *f2!=0 ; f2++) - { - packedTrans = packed ^ (*f1) ^ (*f2); - basesHit += find_table_matches_resolve (packedTrans, pos2, - w, 0); - } - } - } - - if ((searchLimit > 0) && (searchToGo < 0)) - { warn_for_search_limit (); return basesHit; } - -#if ((defined collect_stats) && (defined maxHitsPerColumn)) - hitsInColumn = seedSearchStats.rawSeedHits - prevRawHits; - if (hitsInColumn <= maxHitsPerColumn) - seedSearchStats.hitsPerColumn[hitsInColumn]++; - else - { - seedSearchStats.hitsPerColumn[maxHitsPerColumn+1]++; - if (hitsInColumn > seedSearchStats.mostHitsInColumn) - seedSearchStats.mostHitsInColumn = hitsInColumn; - } -#endif // collect_stats && maxHitsPerColumn - -#ifdef densityCheckDepth2 - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - return basesHit; -#endif // densityCheckDepth2 - } - } - - return basesHit; - } - -//---------- -// -// find_table_matches, find_table_matches_resolve-- -// Given a packed word in sequence 2, find and process all its matches in the -// table of sequence 1 word positions. -// -// find_table_matches_resolve is used when the full seed is too big to use as -// an index (into the position table), and we have to resolve any remaining -// seed bits by comparing the sequences. 
-// -//---------- -// -// Arguments: -// u32 packed2: The packed word, representing some seed/window of -// .. nucleotides in sequence 2. -// unspos pos2: The hit position in sequence 2. This is the -// .. position following the end of the hit. -// u32 unpacked2: (find_table_matches_resolve only) The last 16 -// .. nucleotides from sequence 2, packed two bits per -// .. nucleotide (oldest nt in most significant bits). -// int transAllowed: (find_table_matches_resolve only) The maximum -// .. number of transitions allowed in the resolved -// .. bits; this is the same as the number of mis- -// .. matches, because resolved bases can only be -// .. matches or transitions (never transversions) -// -// Returns: -// The number of bases in the seed hits. -// -//---------- - -static u64 find_table_matches - (u32 packed2, - unspos pos2) - { - u32 seedLength, len1; - unspos adjStart = pt->adjStart; - u32 step = pt->step; - unspos pos, pos1; - u64 basesHit = 0; - - seedLength = (unsigned) hitSeed->length; - len1 = seedLength-1; - - if (pt->last[packed2] == 0) - { -#ifdef debugSearchPos2 - if (pos2 == debugSearchPos2) - printf (" no hits in sequence 1\n"); -#endif - return 0; - } - - for (pos=pt->last[packed2] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - pos1 = adjStart + step*pos; - -#ifdef debugSearchPos2 - if (pos2 == debugSearchPos2) - printf (" hit at pos " unsposFmt "\n", pos1); -#endif - - if ((selfCompare) && (seed_hit_below_diagonal (pos1, pos2))) - continue; - - if (seed_search_dbgDumpRawHits) - { - if (seed_search_dbgShowRawHits) - dump_raw_hit (stderr, pos1, pos2); - else - { - printf ("\nraw seed hit " unsposSlashFmt "\n", pos1-len1, pos2-len1); - dump_aligned_nucleotides (stdout, - seq1, pos1-seedLength, - seq2, pos2-seedLength, - seedLength); - } - } - - // call the seed hit processor for this seed hit - - seed_search_count_stat (rawSeedHits); - basesHit += (*processor) (processorInfo, pos1, pos2, seedLength); - -#ifdef densityCheckDepth3 - if 
((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - return basesHit; -#endif // densityCheckDepth3 - } - - return basesHit; - } - - -static u64 find_table_matches_resolve - (u32 packed2, - unspos pos2, - u32 unpacked2, - int transAllowed) - { - u32 seedLength, len1; - unspos adjStart = pt->adjStart; - u32 step = pt->step; - unspos pos, pos1, pos1Rel; - u64 basesHit = 0; - u32 unpacked1; - int mismatches; - - seedLength = (unsigned) hitSeed->length; - len1 = seedLength-1; - - if (pt->last[packed2] == 0) - return 0; - - for (pos=pt->last[packed2] ; pos!=noPreviousPos ; pos=pt->prev[pos]) - { - pos1Rel = step*pos; - pos1 = adjStart + pos1Rel; - - if ((selfCompare) && (seed_hit_below_diagonal (pos1, pos2))) - continue; - - // resolve the remaining seed bits - -#ifdef debugResolvingSeeds - debugThisHit = (pos1 == debugPos1+len1) && (pos2 == debugPos2+len1); -#endif - if (debugThisHit) - mismatches = 0; // (only to set a breakpoint here) - - unpacked1 = fetch_resolving_bits (pt, pos1Rel); - - if (debugThisHit) - { - printf ("\npartial seed hit " unsposSlashFmt "\n", pos1-len1, pos2-len1); - dump_aligned_nucleotides (stdout, - seq1, pos1-seedLength, - seq2, pos2-seedLength, - seedLength); - printf (" %08X %s\n", unpacked1, bits_to_nuc_string(unpacked1,16)); - printf (" %08X %s\n", unpacked2, bits_to_nuc_string(unpacked2,16)); - } - - unpacked1 ^= unpacked2; // combine bits into A-B-C-D- - unpacked1 &= hitSeed->resolvingMask; // ... 
where any 1 => mismatch - unpacked1 += unpacked1 >> 17; // shift bits to ----CADB - mismatches = bit_count_16(unpacked1); // count mismatches - if (mismatches > transAllowed) - { - if (debugThisHit) - printf (" rejected (%d mismatches)\n", mismatches); - seed_search_count_stat (unresolvedSeedHits); - continue; - } - - // seed is resolved, ship it - - if (debugThisHit) - printf (" accepted (%d mismatches)\n", mismatches); - - if (seed_search_dbgDumpRawHits) - { - if (seed_search_dbgShowRawHits) - dump_raw_hit (stderr, pos1, pos2); - else - { - printf ("\nraw seed hit " unsposSlashFmt "\n", pos1-len1, pos2-len1); - dump_aligned_nucleotides (stdout, - seq1, pos1-seedLength, - seq2, pos2-seedLength, - seedLength); - } - } - - // call the seed hit processor for this seed hit - - seed_search_count_stat (rawSeedHits); - basesHit += (*processor) (processorInfo, pos1, pos2, seedLength); - -#ifdef densityCheckDepth3 - if ((maxBasesAllowed > 0) && (basesHit > maxBasesAllowed)) - return basesHit; -#endif // densityCheckDepth3 - } - - return basesHit; - } - -//---------- -// [[-- a seed hit processor function --]] -// -// process_for_plain_hit-- -// Process a seed hit for a given word, without bothering to check for -// overlap with other hits, and without performing extension. -// -// Arguments and Return value: (see seed_search.h) -// -//---------- -// -// Implementation: -// -// This is the simplest seed hit processor. We do not use any of the diag -// hashing arrays (diagEnd, diagStart or diagActual). 
-// -//---------- - -u64 process_for_plain_hit - (void* _info, - unspos pos1, - unspos pos2, - unspos length) - { - hitprocsimple* info = (hitprocsimple*) _info; - u32 basesHit; - - // filter by position (if specified) - - if ((info->hp.posFilter) - && (filter_seed_hit_by_pos (&info->hp, pos1, pos2, length))) - return 0; - - // filter by match/transversion count (if specified) - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - return 0; - - if (seed_search_dbgShowHits) - printf ("plain seed hit " unsposSlashFmt " (diag " sgnposFmt ")\n", - pos1-(length-1), pos2-(length-1), diagNumber (pos1, pos2)); - - // report the hit - -#ifdef snoopReporterCalls - fprintf (stderr, "process_for_plain_hit reporting " unsposSlashFmt " #" unsposFmt " (to %p)\n", - pos1, pos2, length, info->hp.reporter); -#endif - basesHit = ((*info->hp.reporter) (info->hp.reporterInfo, pos1, pos2, length, 0)); - if (basesHit > 0) searchToGo--; - return basesHit; - } - -//---------- -// [[-- a seed hit processor function --]] -// -// process_for_simple_hit-- -// Process a seed hit for a given word, with one hit "good enough". -// -// Arguments and Return value: (see seed_search.h) -// -//---------- -// -// Implementation: -// -// Note that seed hits arrive in increasing positions on sequence 2, thus the -// arrivals on any particular diagonal are also increasing. -// -// We record and report seed hits as we encounter them, but we do not report -// overlapping hits. When a new seed hit arrives, if it overlaps the most -// recent on that hash-equivalent diagonal we record the new end but otherwise -// ignore the new seed hit. This treatment thus "suffers" from undetected hash -// collisions. -// -// Note that we do not use the diagStart or diagActual arrays. 
-// -//---------- - -u64 process_for_simple_hit - (void* _info, - unspos pos1, - unspos pos2, - unspos length) - { - hitprocsimple* info = (hitprocsimple*) _info; - u32 hDiag; - score s; - u32 basesHit; -#ifdef snoopDiagHash - unspos start2 = pos2 - length; -#endif // snoopDiagHash - - // filter by position (if specified) - - if ((info->hp.posFilter) - && (filter_seed_hit_by_pos (&info->hp, pos1, pos2, length))) - return 0; - - unblockedLeftExtension = false; - - // if we've already extended beyond this point on this hash-equivalent - // diagonal, ignore this hit - - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - printf ("simp: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " " unsposDotsFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, pos2-length, pos2, diagEnd[hDiag]); - if (diagEnd[hDiag] == hashInactiveEnd) printf ("simp: first hit on diagonal\n"); - else if (diagEnd[hDiag] > pos2-length) printf ("simp: hit discarded\n"); - } -#endif - - if (diagEnd[hDiag] == hashInactiveEnd) - { -#ifdef snoopDiagHash - fprintf (stderr, " activating diag %9s" - " " - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - activate_hashed_diag (hDiag); - diagEnd[hDiag] = 0; - } - - if (diagEnd[hDiag] > pos2-length) - { -#ifdef snoopDiagHash - fprintf (stderr, " ignoring diag %9s" - " " - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - return 0; - } - - // filter by match/transversion count (if specified) - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, 
pos2, length))) - return 0; - - // perform gap-free extension (if specified) and record the extent of seed - // hits on this diagonal; note that the extention routines (such as - // match_extend_seed_hit) will record the extent of the extended hit - - if ((info->hp.gfExtend != gfexNoExtend) && (seed_search_dbgShowHits)) - { - int isRev1 = ((seq1->revCompFlags & rcf_rev) != 0); - int isRev2 = ((seq2->revCompFlags & rcf_rev) != 0); - printf ("simple seed hit " unsposSlashCFmt " (diag " sgnposFmt ")\n", - pos1-(length-1), (isRev1)?'-':'+', - pos2-(length-1), (isRev2)?'-':'+', - diagNumber (pos1, pos2)); - } - - if (info->hp.gfExtend == gfexExact) - { - s = match_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else if (info->hp.gfExtend == gfexXDrop) - { - s = xdrop_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else if ((info->hp.gfExtend >= gfexMismatch_min) - && (info->hp.gfExtend <= gfexMismatch_max)) - { - s = mismatch_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else // if (info->hp.gfExtend == gfexNoExtend) - { - diagEnd[hDiag] = pos2; - s = 0; -#ifdef snoopDiagHash - fprintf (stderr, " setting diag %9s" - " " - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - - // report the hit - -#ifdef snoopReporterCalls - fprintf (stderr, "process_for_simple_hit reporting " unsposSlashFmt " #" unsposFmt " (to %p)\n", - pos1, pos2, length, info->hp.reporter); -#endif - basesHit = ((*info->hp.reporter) (info->hp.reporterInfo, pos1, pos2, length, s)); - if (basesHit > 0) searchToGo--; - return basesHit; - } - -//---------- -// [[-- a seed hit processor function --]] -// -// process_for_recoverable_hit-- -// Process a seed hit for a given word, with one hit "good enough", 
recovering -// from hash collisions. -// -// Arguments and Return value: (see seed_search.h) -// -//---------- -// -// Implementation: -// -// Note that seed hits arrive in increasing positions on sequence 2, thus the -// arrivals on any particular diagonal are also increasing. -// -// We record and report seed hits as we encounter them, but we may report hits -// that overlap. When a new seed hit arrives, if it overlaps the most recent -// on that hash-equivalent diagonal AND is on a different actual diagonal, we -// treat it as a new hit. This leads to potential reporting of duplicate HSPs -// (for example, if we get a hit on diag A, then B, then A again). Overlaps -// that have the same actual diagonal are reported on the first hit. -// -// Note that we do not use the diagStart array. -// -//---------- - -u64 process_for_recoverable_hit - (void* _info, - unspos pos1, - unspos pos2, - unspos length) - { - hitprocsimple* info = (hitprocsimple*) _info; - unspos start2 = pos2 - length; - sgnpos diag; - u32 hDiag; - score s; - u32 basesHit; -#ifdef debugDiag - static char prevSeq2Name[41] = ""; - static int prevRevCompFlags; -#endif -#ifdef snoopDiagHash - seq* seq1 = info->hp.seq1; - seq* seq2 = info->hp.seq2; -#endif // snoopDiagHash - - // filter by position (if specified) - - if ((info->hp.posFilter) - && (filter_seed_hit_by_pos (&info->hp, pos1, pos2, length))) - return 0; - - ////////// - // decide whether to discard this hit, based on the extent of previous hits - // along a hash-equivalent diagonal - ////////// - - unblockedLeftExtension = true; - - // get the diagonal's hash value - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - if ((prevSeq2Name[0] == 0) - || (strcmp (seq2->header, prevSeq2Name) != 0) - || (seq2->revCompFlags != prevRevCompFlags)) - { - strncpy (prevSeq2Name, seq2->header, sizeof(prevSeq2Name)); - prevRevCompFlags = 
seq2->revCompFlags; - if (prevRevCompFlags == rcf_forward) printf ("%s+\n", prevSeq2Name); - else if (prevRevCompFlags == rcf_revcomp) printf ("%s-\n", prevSeq2Name); - else printf ("%s\n", prevSeq2Name); - } - - printf ("sing: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " " unsposDotsFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, start2, pos2, diagEnd[hDiag]); - } -#endif - - // if the diagonal was inactive, we treat it as a fresh hit - - if (diagEnd[hDiag] == hashInactiveEnd) - { - activate_hashed_diag (hDiag); - diagEnd[hDiag] = 0; - goto fresh_hit; - } - - // if we have a collision, accept it as a fresh hit; note that accepting - // prevents us from being able to recognize a later hit on that same - // (previous) diagonal later, in which case we would end up reporting it - // more than once; the premise of this routine is that multiple-extension - // is preferable to missing a colliding hit - - if (diag != diagActual[hDiag]) - { -#ifdef snoopDiagHash - fprintf (stderr, "%s:\n", seq2->header); -#endif // snoopDiagHash - seed_search_count_stat (hashCollisions); - if (start2 < diagEnd[hDiag]) - { - seed_search_count_stat (hashFailures); -#ifdef debugDiag - if (debugThisDiag) - printf ("sing: accepted in spite of hash failure\n"); -#endif -#ifdef snoopDiagHash - fprintf (stderr, " recovery on diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); - } - else - { - fprintf (stderr, " collision on diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, 
pos2); -#endif // snoopDiagHash - } - goto fresh_hit; - } - - // if this hit overlaps an earlier hit on the same diagonal, reject it; - // note that we record the extent, but only if it increases it - - if (start2 < diagEnd[hDiag]) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("sing: rejected (diagEnd[%04X] blocks at " unsposFmt ")\n", - hDiag, diagEnd[hDiag]); -#endif - if (pos2 > diagEnd[hDiag]) - { - diagEnd [hDiag] = pos2; - diagActual[hDiag] = diag; -#ifdef debugDiag - if (debugThisDiag) - printf ("sing: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - } - return 0; - } - - ////////// - // this hit is a keeper, as far as diagonal extent is concerned; now - // perform whatever other filtering is specified, extend it, and record - // the extent - // - // note that we have to be careful and only record the extent if it would - // increase it; a previous hit on a hash-equivalent diagonal may have - // already extended further - ////////// - -fresh_hit: - diagActual[hDiag] = diag; - - // filter by match/transversion count (if specified) - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - { - if (pos2 > diagEnd[hDiag]) diagEnd[hDiag] = pos2; - return 0; - } - - // perform gap-free extension (if specified) and record the extent of seed - // hits on this diagonal; note that the extention routines (such as - // match_extend_seed_hit) will record the extent of the extended hit - - if ((info->hp.gfExtend != gfexNoExtend) && (seed_search_dbgShowHits)) - { - int isRev1 = ((seq1->revCompFlags & rcf_rev) != 0); - int isRev2 = ((seq2->revCompFlags & rcf_rev) != 0); - printf ("simple seed hit " unsposSlashCFmt " (diag " sgnposFmt ")\n", - pos1-(length-1), (isRev1)?'-':'+', - pos2-(length-1), (isRev2)?'-':'+', - diag); - } - - if (info->hp.gfExtend == gfexExact) - { - s = match_extend_seed_hit (&info->hp, &pos1, &pos2, 
&length); - if (s == noScore) - return 0; - } - else if (info->hp.gfExtend == gfexXDrop) - { - s = xdrop_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else if ((info->hp.gfExtend >= gfexMismatch_min) - && (info->hp.gfExtend <= gfexMismatch_max)) - { - s = mismatch_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else // if (info->hp.gfExtend == gfexNoExtend) - { - if (pos2 > diagEnd[hDiag]) diagEnd[hDiag] = pos2; - s = 0; - } - -#ifdef debugDiag - if (debugThisDiag) - printf ("sing: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - - if ((seed_search_dbgShowHits || debugThisDiag)) - dump_aligned_nucleotides (stdout, - seq1, pos1-length, - seq2, pos2-length, - length); - -#ifdef snoopReporterCalls - fprintf (stderr, "process_for_recoverable_hit reporting " unsposSlashFmt " #" unsposFmt " (to %p)\n", - pos1, pos2, length, info->hp.reporter); -#endif - basesHit = ((*info->hp.reporter) (info->hp.reporterInfo, pos1, pos2, length, s)); - if (basesHit > 0) searchToGo--; - return basesHit; - } - -//---------- -// [[-- a seed hit processor function --]] -// -// process_for_twin_hit-- -// Process a seed hit for a given word, with two nearby hits required for a -// hit to be "good enough". -// -// Arguments and Return value: (see seed_search.h) -// The info argument points to a hitproctwin record specifying the twin's span -// criteria. -// -//---------- -// -// Implementation: -// -// Note that seed hits arrive in increasing positions on sequence 2, thus the -// arrivals on any particular diagonal are also increasing. -// -// We record seed hits as we encounter them, but delay reporting them until we -// get a second hit within the (minSpan,maxSpan) criteria. The span of two hits -// is the length from the start of the first hit to the end of the second. 
-// -// When a new seed hit arrives, we check the distance between it and the most -// recent hit on the same diagonal. If it is too far we record the new hit (and -// erase the previous one). If it is too close we just extend the previous -// hit (record the new end). If it is within (minSpan,maxSpan) we extend the -// previous hit and report it. We will continue to extend the hit as long as -// new hits are close enough to the end, but will only report it once. -// -// Note that we expect that length < minSpan <= maxSpan. -// -//---------- -// -// Gap computation example: -// -// Case 1: In the figure below, we have a seed of length 10, and we get simple -// hits at (end points) 11 and 36 (stars show hits, arrows show the position -// reported to this routine). Suppose the criteria for twins is 20<=span<=30. -// The span of hits A and B is 35, so they are too far apart to qualify. -// -// 1 2 3 4 -// pos2: 1234567890123456789012345678901234567890 -// sequence: ..XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.. -// seed hit A: **********^ -// seed hit B: **********^ -// AB span (35): =================================== -// -// Case 2: Now suppose we had had an intervening simple hit at 18 (below). The -// span of CD is 17, which is too short. But the span of DE is 28, which sat- -// isfies the criteria. -// 1 2 3 4 -// pos2: 1234567890123456789012345678901234567890 -// sequence: ..XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX.. -// seed hit C: **********^ -// seed hit D: **********^ -// seed hit E: **********^ -// CD span (17): ================= -// DE span (28): ============================ -// -// The implementation here accumulates simple hits into an unresolved hit as -// long as the new hit is unsatisfactory when compared to the first hit and to -// the last hit. It may be possible to contrive a 4-hit example in which hits -// 2 and 4 are satifactory but which this implementation will not report. The -// author presently believes these are not a problem in practice. 
-// -//---------- - -//--- (implementation NOT using the seed hit queue) --- - -#ifdef noSeedHitQueue - -#error ***** non-seed-queue version of process_for_twin_hit() has a serious flaw ***** - -// The flaw is that diagEnd is used here to track the end of the most recent seed, -// while in xdrop_extend_seed_hit() it is used to block the left-extension of a -// seed hit. Thus when we do find a valid twin seed, left-extension stops at the -// RIGHT end of the first seed of the pair. So we miss some portion of the HSP -// and also incorrectly reject some HSPs because the right-extension is not enough, -// by itself, to meet the score threshold. See the two "(PROBLEM)" notes below; -// these indicate where diagEnd is set in a way that defeats extension. - -u64 process_for_twin_hit - (void* _info, - unspos pos1, - unspos pos2, - unspos length) - { - hitproctwin* info = (hitproctwin*) _info; - unspos start2 = pos2 - length; - sgnpos diag; - u32 hDiag; - u32 span; - score s; - - // filter by position (if specified) - - if ((info->hp.posFilter) - && (filter_seed_hit_by_pos (&info->hp, pos1, pos2, length))) - return 0; - - ////////// - // decide whether to discard this hit, based on the extent of previous hits - // and twinliness along a hash-equivalent diagonal - ////////// - - unblockedLeftExtension = false; - - // get the diagonal's hash value - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - printf ("twin: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " " unsposDotsFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, start2, pos2, diagEnd[hDiag]); - } -#endif - - // if the diagonal was inactive, we treat it as a fresh hit; note that we - // might not really have to activate the diagonal here, because this hit - // may get discarded; but the penalty for unecessary 
activation is only - // one extra pass through the loop in empty_diag_hash(), vs. having to do - // this test again after we filter at fresh_hit - - if (diagEnd[hDiag] == hashInactiveEnd) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: first hit on diagonal\n"); -#endif - activate_hashed_diag (hDiag); - diagEnd[hDiag] = 0; - goto fresh_hit; - } - - // if we have a collision, reject/accept it based on whether we have - // already reported a twin for the first hit; once we have reported a twin, - // it is safe to discard that hit in favor of the new one; but *before* - // that event, we must discard the new hit; the reason is that if we - // accept the new hit, we would be vulnerable to a situation in which two - // orthologies colliding on hash-equivalent diagonals cause us to discard - // the other before a twin is detected, resulting in us missing both of - // them - - if (diag != diagActual[hDiag]) - { - seed_search_count_stat (hashCollisions); - if (pos2 >= diagEnd[hDiag] - length + info->maxSpan) - goto fresh_hit; // (beyond maxSpan from end of previous hit) - - seed_search_count_stat (hashFailures); - - span = diagEnd[hDiag] - diagStart[hDiag]; // (span of old hit(s)) - if (span >= info->maxSpan) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: accepted in spite of hash failure" - " (span was " unsposFmt ", extent is " unsposFmt ")\n", - span, pos2 - (diagEnd[hDiag] - length)); -#endif - goto fresh_hit; // (old hit, on different diag, already reported) - } - -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: rejected (hash failure on pending twin," - " span was " unsposFmt ", extent is " unsposFmt ")\n", - span, pos2 - (diagEnd[hDiag] - length)); -#endif - - return 0; - } - - // this hit is on the same diagonal as the previous one; if the last hit - // already extends beyond this one, we can ignore this one; this can happen - // if it was extended (by xdrop_extend_seed_hit), in which case it has - // already been reported - - if 
(pos2 <= diagEnd[hDiag]) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: rejected (diagEnd[%04X] blocks at " unsposFmt ")\n", - hDiag, diagEnd[hDiag]); -#endif - return 0; - } - - // if the span of the last hit with the new hit is too large, record this - // as a fresh hit; this is like seed hit B in case 1 - - span = length + pos2 - diagEnd[hDiag]; - if (span > info->maxSpan) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: span too long (" unsposFmt " > " unsposFmt ")\n", span, info->maxSpan); -#endif - goto fresh_hit; - } - - // otherwise, we will extend the previous hit, but we have to decide whether - // to report it; if that hit was previously reported, we just extend it; - // this is like seed hits *after* E in case 2 - - if (diagEnd[hDiag] - diagStart[hDiag] >= info->minSpan) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: already reported (" unsposFmt " >= " unsposFmt ")\n", - diagEnd[hDiag]-diagStart[hDiag], info->minSpan); -#endif - goto simple_extend_hit; - } - - // if the combined length of the previous hit with this one added is too - // short, we have not reached minSpan yet, so we just extend it; this is - // like seed hit D in case 2 - - span = pos2 - diagStart[hDiag]; - if (span < info->minSpan) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: not long enough yet (" unsposFmt " < " unsposFmt ")\n", - span, info->minSpan); -#endif - goto simple_extend_hit; - } - - // otherwise, the gap has met the (minSpan,maxSpan) citeria for the first - // time; this is like seed hit E in case 2 - - goto fresh_twin_hit; - - ////////// - // this is a fresh hit, as far as diagonal extent is concerned; perform - // whatever other filtering is specified before recording it - ////////// - - // filter by match/transversion count (if specified) - -fresh_hit: - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - return 0; - - // record it - - diagStart [hDiag] = start2; - 
diagActual[hDiag] = diag; -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: (diag %9s) " unsposSlashFmt " diagStart[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagStart[hDiag]); -#endif - goto record_extent; - - ////////// - // this hit only requires that we record it's extent, as far as diagonal - // extent is concerned; perform whatever other filtering is specified - // before recording it - ////////// - - // filter by match/transversion count (if specified) - -simple_extend_hit: - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - return 0; - - // record it - -record_extent: - diagEnd[hDiag] = pos2; // (PROBLEM) -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - - return 0; - - ////////// - // this hit is a keeper, as far as diagonal extent is concerned; now - // perform whatever other filtering is specified, extend it, and record - // the extent - ////////// - -fresh_twin_hit: - - // filter by match/transversion count (if specified) - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - return 0; - - // perform gap-free extension (if specified) and record the extent of seed - // hits on this diagonal; note that the extention routines (such as - // match_extend_seed_hit) will record the extent of the extended hit - - length = span; - - if ((info->hp.gfExtend != gfexNoExtend) && (seed_search_dbgShowHits)) - printf ("twin seed hit " unsposSlashFmt "\n", pos1-(span-1), pos2-(span-1)); - - if (info->hp.gfExtend == gfexExact) - { - s = match_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else if (info->hp.gfExtend == gfexXDrop) - { - s = xdrop_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - 
else if ((info->hp.gfExtend >= gfexMismatch_min) - && (info->hp.gfExtend <= gfexMismatch_max)) - { - s = mismatch_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (s == noScore) - return 0; - } - else // if (info->hp.gfExtend == gfexNoExtend) - { - diagEnd[hDiag] = pos2; // (PROBLEM) - s = 0; - } - -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - - if (seed_search_dbgShowHits || debugThisDiag) - dump_aligned_nucleotides (stdout, - seq1, pos1-length, - seq2, pos2-length, - length); - -#ifdef snoopReporterCalls - fprintf (stderr, "process_for_twin_hit reporting " unsposSlashFmt " #" unsposFmt " (to %p)\n", - pos1, pos2, length, info->hp.reporter); -#endif - return ((*info->hp.reporter) (info->hp.reporterInfo, pos1, pos2, length, s)); - } - -#endif // noSeedHitQueue - - -//--- (implementation using the seed hit queue) --- - -#ifndef noSeedHitQueue - -u64 process_for_twin_hit - (void* _info, - unspos pos1, - unspos pos2, - unspos length) - { - hitproctwin* info = (hitproctwin*) _info; - sgnpos diag; - u32 hDiag; - unspos oldDiagEnd, extent; - u64 num; - shqhit* q; - u32 span; - unspos start2; - score s; -#ifdef debugDiag - u32 longestShortSpan = 0; -#endif - - // filter by position (if specified) - - if ((info->hp.posFilter) - && (filter_seed_hit_by_pos (&info->hp, pos1, pos2, length))) - return 0; - - // filter by match/transversion count (if specified) - - if ((info->hp.minMatches >= 0) - && (filter_seed_hit_by_subs (&info->hp, pos1, pos2, length))) - return 0; - - ////////// - // scan the seed hit queue for hits along this diagonal with valid span, or - // for 'blocks' placed at the end of previously extended diagonals - ////////// - - unblockedLeftExtension = false; - - // get the diagonal's hash value - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - 
debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - start2 = pos2 - length; - printf ("\ntwin: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " " unsposDotsFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, start2, pos2, diagEnd[hDiag]); - } -#endif - - // if the diagonal was inactive, activate it, then add this hit to the queue, - // and exit - - if (diagEnd[hDiag] == hashInactiveEnd) - { -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: first hit on diagonal\n"); -#endif - activate_hashed_diag (hDiag); - diagEnd[hDiag] = 0; - - enqueue_seed_hit (pos1, pos2, /*isBlock*/ false); - -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - - return 0; - } - - // scan the queue entries for this hashed diagonal, until one of the - // following occurs: - // - we find a twin hit with an acceptable span - // - we reach the end of previously extended seed hits on this diagonal; - // any additional seed hits in the queue would be older than this block - // and thus needn't be considered - // - we reach the 'end' of the queue; this is accomplished by checking - // the scanned record's 'number' to see if it would still be in the - // queue - - for (num = lastSeedHit[hDiag] ; - num > seedHitNum - seedHitQueueSize ; - num = q->prevHit) - { - q = &seedHitQueue[num % seedHitQueueSize]; - seed_search_count_stat (queueSeedsScanned); - span = pos2 - (q->pos2 - length); - if (span > info->maxSpan) - { // (span too long from previous hit) -#ifdef debugDiag - if ((debugThisDiag) && (q->diag == diag)) - printf ("twin: span too long (" unsposFmt " > " unsposFmt ")\n", span, info->maxSpan); -#endif - break; - } - - if (q->diag != diag) // (not on correct diagonal) - continue; - - seed_search_count_stat (queueSeedsExamined); - - if 
(q->isBlock) // (we've already extended this far) - { - start2 = pos2 - length; - if (start2 <= q->pos2) - { - seed_search_count_stat (queueSeedsBlocked); - return 0; // (this seed hit overlaps previous extension) - } - else - break; // (this seed hit is right of extension) - } - - if (span < info->minSpan) // (span not long enough from previous hit) - { -#ifdef debugDiag - if ((debugThisDiag) && (span > longestShortSpan)) - longestShortSpan = span; -#endif - continue; - } - - goto twin_hit; // (this hit is part of a twin with a desired span) - } - -#ifdef debugDiag - if ((debugThisDiag) && (longestShortSpan > 0)) - printf ("twin: not long enough yet (" unsposFmt " < " unsposFmt ")\n", - longestShortSpan, info->minSpan); -#endif - - // we don't have a twin; add this seed hit to the queue, and exit - - enqueue_seed_hit (pos1, pos2, /*isBlock*/ false); - return 0; - - ////////// - // this hit is part of a twin with a desired span; now extend it and - // record the extent - ////////// - - // perform gap-free extension (if specified) and record the extent of seed - // hits on this diagonal; note that the extention routines (such as - // match_extend_seed_hit) will record the extent of the extended hit - -twin_hit: -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: twin hit, length = %u\n", span); -#endif - - length = span; - - if ((info->hp.gfExtend != gfexNoExtend) && (seed_search_dbgShowHits)) - printf ("twin seed hit " unsposSlashFmt "\n", pos1-(span-1), pos2-(span-1)); - - if (info->hp.gfExtend == gfexExact) - { - oldDiagEnd = diagEnd[hDiag]; - s = match_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (diagEnd[hDiag] != oldDiagEnd) - { - extent = diagEnd[hDiag]; - enqueue_seed_hit (diagToPos1(diag,extent), extent, /*isBlock*/ true); - if (s == noScore) - enqueue_seed_hit (pos1, pos2, /*isBlock*/ false); - } - if (s == noScore) - return 0; - } - else if (info->hp.gfExtend == gfexXDrop) - { - oldDiagEnd = diagEnd[hDiag]; - s = xdrop_extend_seed_hit 
(&info->hp, &pos1, &pos2, &length); - if (diagEnd[hDiag] != oldDiagEnd) - { - extent = diagEnd[hDiag]; - enqueue_seed_hit (diagToPos1(diag,extent), extent, /*isBlock*/ true); - } - if (s == noScore) - return 0; - } - else if ((info->hp.gfExtend >= gfexMismatch_min) - && (info->hp.gfExtend <= gfexMismatch_max)) - { - oldDiagEnd = diagEnd[hDiag]; - s = mismatch_extend_seed_hit (&info->hp, &pos1, &pos2, &length); - if (diagEnd[hDiag] != oldDiagEnd) - { - extent = diagEnd[hDiag]; - enqueue_seed_hit (diagToPos1(diag,extent), extent, /*isBlock*/ true); - if (s == noScore) - enqueue_seed_hit (pos1, pos2, /*isBlock*/ false); - } - if (s == noScore) - return 0; - } - else // if (info->hp.gfExtend == gfexNoExtend) - { - diagEnd[hDiag] = pos2; - enqueue_seed_hit (pos1, pos2, /*isBlock*/ true); - s = 0; - } - -#ifdef debugDiag - if (debugThisDiag) - printf ("twin: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt "\n", - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); -#endif - - if (seed_search_dbgShowHits || debugThisDiag) - dump_aligned_nucleotides (stdout, - seq1, pos1-length, - seq2, pos2-length, - length); - -#ifdef snoopReporterCalls - fprintf (stderr, "process_for_twin_hit reporting " unsposSlashFmt " #" unsposFmt " (to %p)\n", - pos1, pos2, length, info->hp.reporter); -#endif - return ((*info->hp.reporter) (info->hp.reporterInfo, pos1, pos2, length, s)); - } - -#endif // not noSeedHitQueue - -//---------- -// -// seed_hit_below_diagonal-- -// Determine whether a raw seed hit is below (or on) the diagonal. -// -// Arguments: -// unspos pos1, pos2: The hit position in sequence 1 and 2. This is the -// .. position following the end of the hit (and origin -// .. zero). -// -// Returns: -// true if the hit is "below the digaonal", false if it is not. -// -//---------- -// -// Notes: [ similar notes appear in mirror_alignments() ] -// -// (1) We assume, without checking, that seq1 and seq2 are essentially the -// same. I.e. 
that they have the same length, and if one is partitioned, -// the other has the same partitions. -// -// (2) The DP matrix is viewed as having sequence 1 along the x axis and -// sequence 2 along the y axis, as in this diagram: -// -// +-------------+ -// ^ | . . . . . / | -// | | . . . . / | -// | | . . . / | -// seq 2 | . . / | -// | | . / | -// | | / | -// +-------------+ -// --- seq 1 --> -// -// (3) The diagonal runs from lower-left to upper-right, shown as the slashed -// line in the diagrom. -// -// (4) Alignments "above the diagonal" have pos1 < pos2. In the diagram, this -// is the region filled with dots. Alignments "below the diagonal" or "on -// the diagonal" have pos1 >= pos2; the region is empty in the diagram. -// -// (5) When sequence 2 is on the minus strand, pos2 is actually in counted in -// reverse. -// -// (conceptual) (actual) -// +-------------+ +-------------+ -// ^ | . . . . . / | ^ | \ . . . . . | -// | | . . . . / | | | \ . . . . | -// | | . . . / | | | \ . . . | -// seq 2 | . . / | seq 2 | \ . . | -// | | . / | | | \ . | -// | | / | | | \ | -// +-------------+ +-------------+ -// --- seq 1 --> --- seq 1 --> -// -// (6) When sequence 2 is partitioned, and on the minus strand, the situation -// with positions in complicated by the fact that the partitions have been -// reversed individually, not the sequence as a whole. -// -// (conceptual) (actual) -// +-------------+-------+ +---------------------+ -// ^ | . . . . . . | . . / | ^ | . . . . . . | \ . . | -// | | . . . . . . | . / | | | . . . . . . | \ . | -// | | . . . . . . | / | | | . . . . . . | \ | -// | +-------------+-------+ | +-------------+-------+ -// seq 2 | . . . . . / | | seq 2 | \ . . . . . | | -// | | . . . . / | | | | \ . . . . | | -// | | . . . / | | | | \ . . . | | -// | | . . / | | | | \ . . | | -// | | . / | | | | \ . 
| | -// | | / | | | | \ | | -// +---------------------+ +---------------------+ -// --- seq 1 --> --- seq 1 --> -// -//---------- - -//=== stuff for snoopBelowDiagonal === - -#ifndef snoopBelowDiagonal -#define snoopBelowDiagonal_1 ; -#define snoopBelowDiagonal_2 ; -#define snoopBelowDiagonal_3 ; -#define snoopBelowDiagonal_4 ; -#define snoopBelowDiagonal_5 ; -#define snoopBelowDiagonal_6 ; -#define snoopBelowDiagonal_7 ; -#define snoopBelowDiagonal_8 ; -#endif // not snoopBelowDiagonal - -#ifdef snoopBelowDiagonal - -#define snoopBelowDiagonal_1 \ - int nonTrivial = (pos1 != pos2); \ - if (nonTrivial) fprintf (stderr, unsposSlashFmt, \ - pos1-hitSeed->length, \ - pos2-hitSeed->length); - -#define snoopBelowDiagonal_2 \ - if (nonTrivial) fprintf (stderr, " (same strand)"); - -#define snoopBelowDiagonal_3 \ - if (nonTrivial) \ - fprintf (stderr, " --> %s\n", \ - (pos1 == pos2)? "on" : (pos1 >= pos2)? "below" : "above"); - -#define snoopBelowDiagonal_4 \ - if (!nonTrivial) fprintf (stderr, unsposSlashFmt, pos1, pos2); \ - nonTrivial = true; \ - fprintf (stderr, " (opposite strand)"); - -#define snoopBelowDiagonal_5 \ - if (nonTrivial) fprintf (stderr, " (partitioned)"); - -#define snoopBelowDiagonal_6 \ - if (nonTrivial) fprintf (stderr, " parts %ld/%ld", \ - part1 - sp1->p, part2 - sp2->p); - -#define snoopBelowDiagonal_7 \ - if (nonTrivial) fprintf (stderr, " --> %s (by part)\n", \ - (partIx1 >= partIx2)? "below" : "above"); - -#define snoopBelowDiagonal_8 \ - if (nonTrivial) \ - fprintf (stderr, " " unsposSlashFmt " --> %s\n", \ - pos1,pos2, \ - (pos1 == pos2)? "on" : (pos1 >= pos2)? 
"below" : "above"); - -#endif // snoopBelowDiagonal - - -//=== seed_hit_below_diagonal() === - -static int seed_hit_below_diagonal - (unspos pos1, - unspos pos2) - { - seqpartition* sp1, *sp2; - partition* part1, *part2; - int partIx1, partIx2; - - snoopBelowDiagonal_1; - - // same strand case; see notes (2) thru (4) - - if (sameStrand) - { - snoopBelowDiagonal_2; - snoopBelowDiagonal_3; - return (pos1 >= pos2); - } - - // opposite strand case; see note (5) - - pos1 -= hitSeed->length; // (note we don't bother to do this for same - pos2 -= hitSeed->length; // .. strand since it doesn't affect the test) - - snoopBelowDiagonal_4; - - sp2 = &seq2->partition; - if (sp2->p == NULL) // (seq2 is not partitioned) - { - pos2 = (seq2->len-1) - pos2; - snoopBelowDiagonal_8; - return (pos1 >= pos2); - } - - // parititioned opposite strand case; see note (6) - - snoopBelowDiagonal_5; - - sp1 = &seq1->partition; - part1 = lookup_partition (seq1, pos1); - part2 = lookup_partition (seq2, pos2); - partIx1 = part1 - sp1->p; - partIx2 = part2 - sp2->p; - - snoopBelowDiagonal_6; - - if (partIx1 != partIx2) - { - snoopBelowDiagonal_7; - return (partIx1 >= partIx2); - } - - pos2 = (part2->sepBefore + part2->sepAfter) - pos2; - snoopBelowDiagonal_8; - return (pos1 >= pos2); - } - -//---------- -// -// filter_seed_hit_by_pos-- -// Determine whether a raw seed hit should be filtered, based on its position -// in the target or query. -// -// Arguments: -// hitprocinfo* hp: Pointer to record containing (among other things) -// .. the filter criteria, tStart, tEnd, qStart and -// .. qEnd. -// unspos pos1: The hit position in sequence 1, relative to the -// .. entire sequence (not to the interval). This is -// .. the first letter following the end of the match -// .. (origin-0). -// unspos pos2: The hit position in sequence 2 (with details the -// .. same as for pos1). -// unspos length: The length of the hit (number of nucleotides). 
-// -// Returns: -// true if the hit should be filtered (discarded), false if it should be kept. -// -//---------- -// -// Notes: -// -// (1) The seed hit is discarded if it extends outside the allowable range on -// .. either target (pos1) or query (pos2). -// -//---------- - -static int filter_seed_hit_by_pos - (hitprocinfo* hp, - unspos pos1, - unspos pos2, - unspos length) - { - unspos tStart = hp->targetInterval.s; - unspos tEnd = hp->targetInterval.e; - unspos qStart = hp->queryInterval.s; - unspos qEnd = hp->queryInterval.e; - - pos1 -= length; // (move from first location AFTER the - pos2 -= length; // .. hit, to first location OF the hit) - -#ifdef snoopPosFilter - fprintf (stderr, "filter_seed_hit_by_pos(" unsposFmt "/" unsposFmt "#" unsposFmt ")" - " vs [" unsposFmt "," unsposFmt "] / [" unsposFmt "," unsposFmt "]", - pos1, pos2, length, - tStart, tEnd, qStart, qEnd); - - if (pos1 < tStart) - fprintf (stderr, " (discarded, before target start)\n"); - else if (pos1+length > tEnd) - fprintf (stderr, " (discarded, beyond target end)\n"); - else if (pos2 < qStart) - fprintf (stderr, " (discarded, before query start)\n"); - else if (pos2+length > qEnd) - fprintf (stderr, " (discarded, beyond query end)\n"); - else - fprintf (stderr, " (kept)\n"); - -#endif // snoopPosFilter - - // if the hit extends beyond target or query, discard it - - if ((pos1 < tStart) || (pos1+length > tEnd)) return true; - if ((pos2 < qStart) || (pos2+length > qEnd)) return true; - - // otherwise, keep it - - return false; - } - -//---------- -// -// filter_seed_hit_by_subs-- -// Determine whether a raw seed hit should be filtered, based on the number of -// matches and transversions it contains. -// -// Arguments: -// hitprocinfo* hp: Pointer to record containing (among other things) -// .. the filter criteria, minMatches and -// .. maxTransversions. -// unspos pos1: The hit position in sequence 1. -// unspos pos2: The hit position in sequence 2. 
-// unspos length: The length of the hit. -// -// Returns: -// true if the hit should be filtered (discarded), false if it should be kept. -// -//---------- -// -// Notes: -// -// (1) This is dependent on the specific 2-bit encoding of nucleotides, which -// is defined (implicitly) in dna_utilities.c. We assume that the least -// significant of the two bits distinguishes between purines and -// pyramidines, so if it is different in two nucleotides, they are a -// transversion. -// -// (2) This routine considers any substitution involving non-DNA characters to -// be a transversion (anything outside of ACGTacgt). -// -//---------- - -#define is_transversion(bits1,bits2) ((((bits1)^(bits2))&1)==1) - -static int filter_seed_hit_by_subs - (hitprocinfo* hp, - unspos pos1, - unspos pos2, - unspos length) - { - u8* scan1 = hp->seq1->v + pos1; - u8* scan2 = hp->seq2->v + pos2; - char* pScan; - unspos remaining; - int matches, transversions; - s8 bits1, bits2; - - // count the number of matches and transversions in this hit - - if (hp->filterPattern != NULL) - { - scan1 -= length; - scan2 -= length; - pScan = hp->filterPattern; - matches = transversions = 0; - for (remaining=length ; remaining>0 ; remaining--) - { - if (*(pScan++) != '0') - { - bits1 = hp->charToBits[*scan1]; - bits2 = hp->charToBits[*scan2]; - - if ((bits1 < 0) || (bits2 < 0)) // (negative => not ACGTacgt) - transversions++; - else if (bits1 == bits2) - matches++; - else if (is_transversion(bits1,bits2)) - transversions++; - } - scan1++; scan2++; - } - } - else - { - matches = transversions = 0; - for (remaining=length ; remaining>0 ; remaining--) - { - bits1 = hp->charToBits[*(--scan1)]; - bits2 = hp->charToBits[*(--scan2)]; - - if ((bits1 < 0) || (bits2 < 0)) // (negative => not ACGTacgt) - transversions++; - else if (bits1 == bits2) - matches++; - else if (is_transversion(bits1,bits2)) - transversions++; - } - } - - // if the counts don't meet the filter criteria, discard this hit - - if (matches < 
hp->minMatches) - { seed_search_count_stat (notEnoughMatches); return true; } - if ((hp->maxTransversions >= 0) && (transversions > hp->maxTransversions)) - { seed_search_count_stat (tooManyTransversions); return true; } - - // otherwise, keep it - - return false; - } - -//---------- -// -// xdrop_extend_seed_hit-- -// Perform gap-free extension on a seed hit, and discard those that don't -// score high enough. -// -// Gap-free extension extends the hit/match in each direction, along the -// diagonal, as long as the running score has not dropped too far below the -// maximum score. -// -// Low-scoring extensions are 'discarded' by the caller. This routine only -// makes the scoring decision. Further, if an adpative scoring threshold is -// being used, this routine will consider all extensions as being acceptible. -// -// Arguments: -// hitprocinfo* hp: Pointer to record containing (among other things) -// .. the extension controls and filtering criteria. -// unspos* pos1: The hit position in sequence 1. (see note 1 below) -// unspos* pos2: The hit position in sequence 2. (see note 1 below) -// unspos* length: The length of the hit. (see note 1 below) -// -// Returns: -// The score of the extended match. If the match should be filtered -// (discarded), noScore is returned. -// -//---------- -// -// Notes: -// -// (1) Upon return, the values of pos1, pos2, and length will have been changed -// to reflect the extended hit *IF* the match is to be kept (i.e. if the -// return value is not noScore). -// -// (2) We expect that the hits will arrive in increasing order on sequence 2. -// See note 5. -// -// (3) It may appear that we (incorrectly) assume that the entirety of -// sequences 1 and 2 are fair game for the HSP. However, we indirectly -// halt processing when we encounter a NUL character. This happens because -// the scoring matrix contains veryBadScore for any substitution with NUL. 
-// This in turn causes loop 1 or loop 2 to exit (because the running score -// exceeds the xdrop threshold). A NUL character indicates (1) the end of -// a partition, either in a partitioned sequence or X-separated sequence, -// or (2) the end of a chore. Further, positional seed filtering -// (performed by filter_seed_hit_by_pos) prevents us from being called for -// hits outside the range of a chore. -// -// (4) We assume hp->hspThreshold.t is either 'C' (count) or 'S' (score). -// -// (5) Though hits arrive in increasing order on sequence 2, it is possible -// that diagEnd[hDiag] > pos2. This happens when a previous seed hit on a -// hash-equivalent diagonal was extended (diagEnd records the righthand -// limit of that extension). In such case, left-extension of this seed hit -// is prematurely halted (at pos2-length). Right-extension is no affected. -// -//---------- - -#ifndef snoopXDrop -#define snoopXDrop_Pos ; -#define snoopXDrop_Left ; -#define snoopXDrop_Right ; -#define snoopXDrop_Score ; -#endif // not snoopXDrop - -#ifdef snoopXDrop - -#define snoopXDrop_Pos \ - fprintf (stderr, \ - "xdrop, seed hit at " unsposSlashFmt "\tdiag=%s\n", \ - pos1, pos2, pair_diagonal_as_text(pos1,pos2)); - -#define snoopXDrop_Left \ - fprintf (stderr, \ - " left: %s %s" \ - " " scoreFmtStar \ - " " scoreFmtStar "\n", \ - display_sequence_character (seq1, s1[-1]), \ - display_sequence_character (seq2, s2[-1]), \ - 8, scoring->sub[s1[-1]][s2[-1]], \ - 8, runScore + scoring->sub[s1[-1]][s2[-1]]); - -#define snoopXDrop_Right \ - fprintf (stderr, \ - " right: %s %s" \ - " " scoreFmtStar \ - " " scoreFmtStar "\n", \ - display_sequence_character (seq1, *s1), \ - display_sequence_character (seq2, *s2), \ - 8, scoring->sub[*s1][*s2], \ - 8, runScore + scoring->sub[*s1][*s2]); - -#define snoopXDrop_Score \ - fprintf (stderr, \ - " score=" scoreFmtSimple "\n", \ - similarity); - -static char* display_sequence_character (seq* _seq, u8 ch) - { - static char s1[4]; - static char s2[4]; 
- static char* s = s2; - - s = (s == s1)? s2 : s1; // (ping pong) - - if (_seq->fileType == seq_type_qdna) sprintf (s, "%02X", ch); - else sprintf (s, "%c", ch); - - return s; - } - -#endif // snoopXDrop - - -//--- xdrop_extend_seed_hit-- - -static score xdrop_extend_seed_hit - (hitprocinfo* hp, - unspos* _pos1, - unspos* _pos2, - unspos* _length) - { - unspos pos1 = *_pos1; - unspos pos2 = *_pos2; - unspos length = *_length; - seq* seq1 = hp->seq1; - seq* seq2 = hp->seq2; - scoreset* scoring = hp->scoring; - score xDrop = hp->xDrop; - sgnpos diag, block2; - unspos oldDiagEnd, extent; - u32 hDiag; - u8* s1, *s2, *stop, *leftStart, *rightStop, *rightBlock; - score similarity, runScore, leftScore, rightScore; - int adjustScore; -#ifdef debugDiag - unspos start2 = pos2 - length; - u8* p1, *p2, *p3, *p4; -#else -#ifdef collect_stats - u8* p1; -#endif -#endif -#ifdef snoopHspSubrange - score bestSubrangeScore; - unspos hspPos1, hspPos2, hspLen, subPos1, subPos2, subLen; -#endif -#ifdef snoopDiagHash - unspos start2 = pos2 - length; -#endif // snoopDiagHash - - ////////// - // get ready to extend the hit - ////////// - -#ifdef debugDiag - p1 = p2 = p3 = p4 = NULL; // (satisfy the compiler) -#endif - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - printf ("gfex: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " " unsposDotsFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, start2, pos2, diagEnd[hDiag]); - } -#endif - - snoopXDrop_Pos; - - ////////// - // extend to the left (loop 1) - // - // results: - // leftStart: position of 1st nucleotide in extended match, in seq1 - // leftScore: score of bp added by left extension - // length: possibly shortened (if the extension does not include the - // .. 
entire hit) - ////////// - - s1 = seq1->v + pos1; // start just past end of hit in both seq1 and seq2; - s2 = seq2->v + pos2; // .. we will pre-decrement before reads, so first - // .. bp read are the ones at the tail of the hit - -#ifdef extendHspFromLeft - s1 = seq1->v + pos1 - length; - s2 = seq2->v + pos2 - length; -#endif // extendHspFromLeft - - // determine stop location; this is at the start of sequence 1, except - // that if this diagonal ends (or is blocked) earlier in sequence 2, we - // stop there - // (see note 3; instead of zero, we should use subsequence's start) - - if (unblockedLeftExtension) oldDiagEnd = 0; - else oldDiagEnd = diagEnd[hDiag]; - block2 = (sgnpos) oldDiagEnd; - if (block2 + diag > 0) stop = seq1->v + block2 + diag; - else stop = seq1->v; - - // extend - - leftStart = s1; - runScore = leftScore = 0; - - while ((s1 > stop) && (runScore >= leftScore-xDrop)) - { - snoopXDrop_Left; - runScore += scoring->sub[*(--s1)][*(--s2)]; - if (runScore > leftScore) - { - leftStart = s1; - leftScore = runScore; - } - } - - // adjust length if the extension is shorter than the hit - -#ifndef extendHspFromLeft - s2 = seq1->v + pos1 - length; // (left end of hit) - if (leftStart > s2) - length -= leftStart - s2; -#endif // not extendHspFromLeft - -#ifdef debugDiag - p1 = s1; - if (debugThisDiag) - { - p2 = leftStart; - p3 = seq1->v+pos1-length; - p4 = seq1->v+pos1; - } -#endif -#ifdef collect_stats - p1 = s1; -#endif - - ////////// - // extend to the right (loop 2) - // - // results: - // rightStop: position of 1st nucleotide beyond extended match, in seq1 - // similarity: increased by score of bp added by left and right extension - ////////// - - s1 = seq1->v + pos1; // start just past end of hit in both seq1 - s2 = seq2->v + pos2; // .. 
and seq2 - -#ifdef extendHspFromLeft - s1 = seq1->v + pos1 - length; - s2 = seq2->v + pos2 - length; -#endif // extendHspFromLeft - - // determine stop location; this is at the end of sequence 1, except - // that if this diagonal ends earlier in sequence 2, we stop there - // (see note 3; instead of sequence's end, we should use subsequence's end) - - block2 = (sgnpos) seq2->len; - if ((sgnpos) seq1->len <= block2 + diag) stop = seq1->v + seq1->len; - else stop = seq1->v + block2 + diag; - - // extend - - rightStop = s1; - runScore = rightScore = 0; - - while ((s1 < stop) && (runScore >= rightScore-xDrop)) - { - snoopXDrop_Right; - runScore += scoring->sub[*(s1++)][*(s2++)]; - if (runScore > rightScore) - { - rightStop = s1; - rightScore = runScore; - } - } - rightBlock = s1; - - // adjust length if the extension is shorter than the hit - -#ifdef extendHspFromLeft - s2 = seq1->v + pos1; // (past right end of hit) - if (rightStop < s2) - length -= s2 - rightStop; -#endif // extendHspFromLeft - -#ifdef debugDiag - if (debugThisDiag) - dump_extended_match (stdout, seq1, seq2, diag, - p1, p2, p3, p4, rightStop, s1); -#endif - - similarity = leftScore + rightScore; - - ////////// - // (for debugging only) - // determine if some subrange of the HSP outscores the whole HSP - // - // We use the algorithm described in Bentley's "Programming Pearls" ("A - // scanning Algorithm", section 8.4, page 81 in the second edition). 
- // - // bestSubrangeScore == Bentley's maxSoFar - // subrangeScore == Bentley's maxEndingHere - ////////// - -#ifdef snoopHspSubrange - - { - score subrangeScore; - u8* currLeft, *subLeft, *subRight; - - s1 = leftStart; - s2 = seq2->v + diagToPos2 (diag, leftStart - seq1->v); - currLeft = s1; - subLeft = subRight = s1; - - subrangeScore = bestSubrangeScore = 0; - runScore = 0; - while (s1 < rightStop) - { - runScore = runScore + scoring->sub[*s1][*s2]; - subrangeScore = subrangeScore + scoring->sub[*(s1++)][*(s2++)]; - if (subrangeScore < 0) - { subrangeScore = 0; currLeft = s1; } - if (subrangeScore > bestSubrangeScore) - { - bestSubrangeScore = subrangeScore; - subLeft = currLeft; - subRight = s1; - } - - if (debugThisDiag) - printf (unsposFmt ":" - " %c%c " scoreFmtSimple - " " scoreFmtSimple - " " scoreFmtSimple " " unsposFmt - " " scoreFmtSimple " " unsposFmt ".." unsposFmt - "\n", - (s1-1) - seq1->v, - s1[-1], s2[-1], scoring->sub[s1[-1]][s2[-1]], - runScore, - subrangeScore, currLeft - seq1->v, - bestSubrangeScore, subLeft - seq1->v, subRight - seq1->v); - } - - hspPos1 = hspPos2 = hspLen = subPos1 = subPos2 = subLen = 0; - - if (bestSubrangeScore > similarity) - { - hspPos1 = leftStart - seq1->v; - hspPos2 = diagToPos2 (diag, leftStart - seq1->v); - hspLen = rightStop - leftStart; - subPos1 = subLeft - seq1->v; - subPos2 = diagToPos2 (diag, subLeft - seq1->v); - subLen = subRight - subLeft; - seed_search_count_stat (suboptimalHsp); - } - } - -#endif // snoopHspSubrange - - ////////// - // record the extent of HSP search on this diagonal - ////////// - - // record the extent - - extent = (unspos) (((sgnpos) (rightBlock-seq1->v)) - diag); - if (extent > diagEnd[hDiag]) - { - diagEnd [hDiag] = extent; - diagActual[hDiag] = diag; -#ifdef snoopDiagHash - fprintf (stderr, " setting diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - 
hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - -#ifdef debugDiag - if (debugThisDiag) - { - printf ("gfex: (diag %9s) " unsposSlashFmt " diagEnd[%04X] <-- " unsposFmt, - pair_diagonal_as_text(pos1,pos2), pos1, pos2, - hDiag, diagEnd[hDiag]); - printf (" (" unsposSlashFmt " -> " unsposSlashFmt ")\n", - (unspos) (rightStop-seq1->v), (unspos) (rightStop-seq1->v-diag), - (unspos) (rightBlock-seq1->v), diagEnd[hDiag]); - } -#endif - - snoopXDrop_Score; - -#ifdef collect_stats - seed_search_add_stat (bpExtended, rightBlock-p1); -#endif - - ////////// - // update length of hit - ////////// - - pos1 = (unspos) (rightStop - seq1->v); - pos2 = (unspos) (((sgnpos) pos1) - diag); - length = (unspos) (rightStop - leftStart); - - ////////// - // if the extended hit's score is acceptable, but not very high, adjust - // the score downward, based on the entropy of the sequences in the match; - // note that we only adjust positive scores (since hspZeroThreshold == - // max(0,hspZeroThreshold)), otherwise the entropy adjustment would - // *increase* the score when entropy is poor. - // - // When an adaptive scoring threshold is being used, we can't determine - // what a reasonable "high enough" threshold is to not bother to perform - // the entropy reduction, so we perform the reduction on any extended hit - // that could potentially make the hsp table. - // - // $$$ Heuristically, we could still estimate some reasonable "high enough" - // .. threshold based on the current low score threshold, how many hits - // .. we have accepted/rejected so far, and how much room is left in - // .. the table. Simpler schemes may also work well in practice. This - // .. should be considered if the entropy calculation ends up being a - // .. significant time factor. 
- ////////// - - // decide whether to adjust the score - - if (!hp->entropicHsp) - adjustScore = false; - else if (hp->hspThreshold.t == 'S') // (fixed score threshold) - adjustScore = (similarity >= hp->hspZeroThreshold) - && (similarity <= 3*hp->hspThreshold.s); - else if (similarity <= 0) // (adaptive score threshold, negative) - adjustScore = false; - else // (adaptive score threshold, positive) - { - segtable* anchors = *(hp->anchors); - adjustScore = (anchors->len > 0) - && (similarity >= anchors->lowScore); - } - - // adjust it - - if (adjustScore) - { - double q = entropy (hp->seq1->v + pos1 - length, - hp->seq2->v + pos2 - length, - length); - - score rawS = similarity; - similarity *= q; - if ((similarity < hp->hspThreshold.s) && (hp->reportEntropy)) - fprintf(stderr, "hit of score " scoreFmtSimple - " at " unsposSlashFmt "#" unsposFmt " (diag " sgnposFmt " had block at " unsposFmt ")" - " fails entropy filter (%f)\n", - rawS, - pos1-length, pos2-length, length, - diag, oldDiagEnd, - q); -#ifdef snoopEntropy - else - fprintf(stderr, "hit of score " scoreFmtSimple - " at " unsposSlashFmt "#" unsposFmt " (diag " sgnposFmt " had block at " unsposFmt ")" - " passes entropy filter (%f)\n", - rawS, - pos1-length, pos2-length, length, - diag, oldDiagEnd, - q); -#endif // snoopEntropy - -#ifdef snoopHspSubrange - bestSubrangeScore *= q; -#endif // snoopHspSubrange - } - - ////////// - // decide whether or not this extended seed hit is an hsp. 
- ////////// - - dbg_timing_count_stat (ungappedExtensions); - - // if it doesn't score high enough, discard it - - if ((hp->hspThreshold.t == 'S') // (fixed score threshold) - && (similarity < hp->hspThreshold.s)) - { - seed_search_count_stat (lowScoringHsps); -#ifdef snoopHspSubrange - if (bestSubrangeScore >= hp->hspThreshold.s) - { - fprintf (stderr, "WARNING: discarded HSP " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple - " but subrange " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple "\n", - hspPos1, hspPos2, hspLen, similarity, - subPos1, subPos2, subLen, bestSubrangeScore); - - seed_search_count_stat (suboptimalHspB); - } - else if ((seed_search_dbgSubrangeHsps) - && (bestSubrangeScore > similarity)) - fprintf (stderr, "INFO: HSP " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple - " but subrange " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple "\n", - hspPos1, hspPos2, hspLen, similarity, - subPos1, subPos2, subLen, bestSubrangeScore); -#endif // snoopHspSubrange - return noScore; - } - -#ifdef snoopHspSubrange - if ((seed_search_dbgSubrangeHsps) - && (bestSubrangeScore > similarity)) - fprintf (stderr, "INFO: HSP " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple - " but subrange " unsposSlashFmt "#" unsposFmt - " scores " scoreFmtSimple "\n", - hspPos1, hspPos2, hspLen, similarity, - subPos1, subPos2, subLen, bestSubrangeScore); -#endif // snoopHspSubrange - - // it's a keeper - - *_pos1 = pos1; - *_pos2 = pos2; - *_length = length; - - if (hp->anchors != NULL) - (*(hp->anchors))->haveScores = true; - - seed_search_count_stat (hsps); - dbg_timing_count_stat (hsps); - - return similarity; - } - -//---------- -// -// match_extend_seed_hit-- -// Perform exact match extension on a seed hit, and discard those that aren't -// long enough. -// -// Exact match extension extends the hit/match in each direction, along the -// diagonal, as long as it encounters matches. 
-// -// Short extensions are 'discarded' by the caller. This routine only makes the -// decision. -// -// Arguments: -// hitprocinfo* hp: Pointer to record containing (among other things) -// .. the extension controls and filtering criteria. -// unspos* pos1: The hit position in sequence 1. (see note 1 below) -// unspos* pos2: The hit position in sequence 2. (see note 1 below) -// unspos* length: The length of the hit. (see note 1 below) -// -// Returns: -// The "score" of the extended match; this is actually the number of matching -// bases (i.e. the length of the match). If the match should be filtered -// (discarded), noScore is returned. -// -//---------- -// -// Notes: -// -// (1) Upon return, the values of pos1, pos2, and length will have been changed -// to reflect the extended hit *IF* the match is to be kept (i.e. if the -// return value is not noScore). -// -// (2) We expect that the hits will arrive in increasing order on sequence 2. -// -// (3) It may appear that we (incorrectly) assume that the entirety of -// sequences 1 and 2 are fair game for the HSP. However, we halt -// processing when we encounter a NUL character, which indicates (1) the -// end of a partition, either in a partitioned sequence or X-separated -// sequence, or (2) the end of a chore. Further, positional seed filtering -// (performed by filter_seed_hit_by_pos) prevents us from being called for -// hits outside the range of a chore. -// -// (4) We assume hp->hspThreshold.t is 'S', but we treat it as the required -// length of the match. -// -// (5) Any non-ACGT is considered to be a mismatch (even if both sequences -// have the same letter). -// -// (6) Though hits arrive in increasing order on sequence 2, it is possible -// that diagEnd[hDiag] > pos2. This happens when a previous seed hit on a -// hash-equivalent diagonal was extended (diagEnd records the righthand -// limit of that extension). In such case, left-extension of this seed -// hit is prematurely halted (at pos2-length). 
Right-extension is not -// affected. -// -//---------- - -static score match_extend_seed_hit - (hitprocinfo* hp, - unspos* _pos1, - unspos* _pos2, - unspos* _length) - { - unspos pos1 = *_pos1; - unspos pos2 = *_pos2; - unspos length = *_length; - seq* seq1 = hp->seq1; - seq* seq2 = hp->seq2; - sgnpos diag, block2; - unspos oldDiagEnd, extent; - u32 hDiag; - u8* s1, *s2, *stop, *left, *right; - u8 nuc1, nuc2; - s8 bits1, bits2; -#ifdef snoopDiagHash - unspos start2 = pos2 - length; -#endif // snoopDiagHash - - ////////// - // get ready to extend the hit - ////////// - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); - - if (debugThisDiag) - { - printf ("gfex: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, diagEnd[hDiag]); - } -#endif - - ////////// - // validate that the hit is an exact match - ////////// - - s1 = seq1->v + pos1; - s2 = seq2->v + pos2; - stop = s1 - length; - - while (s1 > stop) - { - bits1 = hp->charToBits[*(--s1)]; - bits2 = hp->charToBits[*(--s2)]; - - if ((bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - { - extent = s2 - seq2->v; // (position of rightmost mismatch) - goto hit_isnt_a_match; - } - } - - ////////// - // extend to the left - ////////// - - s1 = seq1->v + pos1 - length; // start at start of hit in both seq1 and - s2 = seq2->v + pos2 - length; // .. seq2; will pre-decrement before - // .. reads, so first bp read are the ones - // .. 
immediately in front of the hit - - // determine stop location; this is at the start of sequence 1, except - // that if this diagonal ends (or is blocked) earlier in sequence 2, we - // stop there - // (see note 3; instead of zero, we should use subsequence's start) - - if (unblockedLeftExtension) oldDiagEnd = 0; - else oldDiagEnd = diagEnd[hDiag]; - block2 = (sgnpos) oldDiagEnd; - if (block2 + diag > 0) stop = seq1->v + block2 + diag; - else stop = seq1->v; - - // extend - - if (s1 < stop) - { - s1--; // if the new hit is left of the previous block (as can happen - s2--; // .. when called by process_for_recoverable_hit), the normal - // .. loop (below) will fail to step, and so won't stop on a - // .. mismatch; so in this case we need to push the position - // .. to one to the left of the start of the match - } - else - { - while (s1 >= stop) - { - if (s1 == stop) // (this test is necessary since we - { s1--; s2--; break; }// .. don't have a zero-terminator at - // .. the start of the sequence) - nuc1 = *(--s1); - bits1 = hp->charToBits[nuc1]; - nuc2 = *(--s2); - bits2 = hp->charToBits[nuc2]; - - if ((nuc1 == 0) // (NUL => end of partition or chore) - || (nuc2 == 0) - || (bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - break; - } - } - - left = s1; // the first mismatch, or just to the left of the stop - - ////////// - // extend to the right - ////////// - - s1 = seq1->v + pos1 - 1; // start at end of hit in both seq1 and seq2; - s2 = seq2->v + pos2 - 1; // .. will pre-increment before reads, so first - // .. bp read are the ones immediately after the - // .. 
hit - - // determine stop location; this is at the end of sequence 1, except - // that if this diagonal ends earlier in sequence 2, we stop there - // (see note 3; instead of sequence's end, we should use subsequence's end) - - block2 = (sgnpos) seq2->len; - if ((sgnpos) seq1->len <= block2 + diag) stop = seq1->v + seq1->len; - else stop = seq1->v + block2 + diag; - - // extend - - while (s1 < stop) - { - nuc1 = *(++s1); - bits1 = hp->charToBits[nuc1]; - nuc2 = *(++s2); - bits2 = hp->charToBits[nuc2]; - - if ((nuc1 == 0) // (NUL => end of partition or chore) - || (nuc2 == 0) - || (bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - break; - } - - right = s1; // the first mismatch, or at the stop - - ////////// - // record the extent of the search on this diagonal - ////////// - - // record the extent - - extent = (unspos) (((sgnpos) (right-seq1->v)) - diag); - if (extent > diagEnd[hDiag]) - { - diagEnd [hDiag] = extent; - diagActual[hDiag] = diag; -#ifdef snoopDiagHash - fprintf (stderr, " m setting diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - - ////////// - // update length of hit - ////////// - - pos1 = (unspos) (right - seq1->v); - pos2 = (unspos) (((sgnpos) pos1) - diag); - length = (unspos) (right - (left + 1)); - - ////////// - // decide whether or not this extended seed hit is long enough. 
- ////////// - - dbg_timing_count_stat (ungappedExtensions); - - // if it isn't long enough, discard it (see note 4) - - if (length < (unsigned) hp->hspThreshold.s) - { - seed_search_count_stat (lowScoringHsps); - return noScore; - } - - // it's a keeper - - *_pos1 = pos1; - *_pos2 = pos2; - *_length = length; - - seed_search_count_stat (hsps); - dbg_timing_count_stat (hsps); - - return (score) length; - - ////////// - // special exit for the case of the hit not being an exact match - ////////// - -hit_isnt_a_match: - - // record the extent of the search on this diagonal - - if (extent > diagEnd[hDiag]) - { - diagEnd [hDiag] = extent; - diagActual[hDiag] = diag; -#ifdef snoopDiagHash - fprintf (stderr, " nm setting diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - - seed_search_count_stat (lowScoringHsps); - dbg_timing_count_stat (ungappedExtensions); - return noScore; - } - -//---------- -// -// mismatch_extend_seed_hit-- -// Perform M-mismatch extension on a seed hit, and discard those that aren't -// long enough. -// -// Conceptually, M-mismatch extension extends the hit/match in each direction, -// along the diagonal, as long as it encounters fewer than M mismatches. In -// actuality, it considers more then M mismatches because it scans both left -// and right; then picks the longest of several M-mismatch segments. -// -// Short extensions are 'discarded' by the caller. This routine only makes the -// decision. -// -// Arguments: -// hitprocinfo* hp: Pointer to record containing (among other things) -// .. the extension controls and filtering criteria. -// unspos* pos1: The hit position in sequence 1. (see note 1 below) -// unspos* pos2: The hit position in sequence 2. 
(see note 1 below) -// unspos* length: The length of the hit. (see note 1 below) -// -// Returns: -// The "score" of the extended match; this is actually the length of the -// match (i.e. the number of matching bases plus the number of mismatches). -// If the match should be filtered (discarded), noScore is returned. -// -//---------- -// -// Notes: -// -// (1) Upon return, the values of pos1, pos2, and length will have been changed -// to reflect the extended hit *IF* the match is to be kept (i.e. if the -// return value is not noScore). -// -// (2) We expect that the hits will arrive in increasing order on sequence 2. -// -// (3) It may appear that we (incorrectly) assume that the entirety of -// sequences 1 and 2 are fair game for the HSP. However, we halt -// processing when we encounter a NUL character, which indicates (1) the -// end of a partition, either in a partitioned sequence or X-separated -// sequence, or (2) the end of a chore. Further, positional seed filtering -// (performed by filter_seed_hit_by_pos) prevents us from being called for -// hits outside the range of a chore. -// -// (4) We assume hp->hspThreshold.t is 'S', but we treat it as the required -// length of the match. -// -// (5) Any non-ACGT is considered to be a mismatch (even if both sequences -// have the same letter). -// -// (6) Though hits arrive in increasing order on sequence 2, it is possible -// that diagEnd[hDiag] > pos2. This happens when a previous seed hit on a -// hash-equivalent diagonal was extended (diagEnd records the righthand -// limit of that extension). In such case, left-extension of this seed -// hit is prematurely halted (at pos2-length). Right-extension is not -// affected. -// -//---------- -// -// Algorithm: -// -// The figure below shows an example of a 5 mismatch extension (M=5). The -// asterisks represent a 19 bp seed hit. Below that is the sequence of match -// (-) or mismatch (o) along the diagonal. The seed contains two mismatches -// (E=2). 
-// -// [--- seed hit ----] -// ******************* -// sequence: o-----o-----oo---------o------o-----------o--------------o--o---o -// (len=41) [----------------- 5-mm ----------------] -// (len=50) [--------------------- 5-mm ---------------------] -// (len=47) [-------------------- 5-mm -------------------] -// (len=50) [--------------------- 5-mm ---------------------] -// -// We scan left until we find the 4th mismatch (M+1-E = 4). Any of these can -// determine the start of a 5-mismatch segment containing the seed. Note that -// the segment actually starts at the first base *after* that mismatch. There -// are 4 possible starting points. We scan right to find the end points for -// each of these 4 intervals, and choose the longest. In the case of ties we -// prefer the one further to the left. -// -// The algorithm is complicated by the fact that we may hit either endpoint -// before seeing the requisite number of mismatches. The example below shows -// a case where we fall 2 mismatches short during the left scan. In this case -// we treat the stop point (x) as a mismatch. Since this leaves us still one -// mismatch short, we skip the first mismatch during right-scanning. 
-// -// stop [--- seed hit ----] -// | ******************* -// sequence: x---oo---------o------o-----------o--------------o--o---o -// (len=48) [-------------------- 5-mm --------------------] -// (len=47) [-------------------- 5-mm -------------------] -// (len=50) [--------------------- 5-mm ---------------------] -// -// The extent (which limits subsequent processing on teh same diagonal) is set -// to the first of these that is true -// (1) if seed hit had more than E mismatches, to the E+1st rightmost mismatch -// (2) if the segment is long enough, just beyond the mismatch at the right -// end of the segment -// (3) if seed hit had any mismatches, to the leftmost mismatch -// (4) otherwise, to the leftmost mismatch to the right of the seed -// -// The justifciation for (3) is that when we reject all possible intervals for -// this seed hit, we have considered all intervals starting with anything to the -// left of the leftmost mismatch in the seed hit. So we would like to consider -// intervals beginning from that point as part of a later seed hit. The -// justification for (1) and (4) is similar. -// -// The justification for (2) is that we don't want to 'report' segments that -// overlap. Note that in a long segment of high identity, there will be many -// M-mismatch intervals that are long enough, all them overlapping. Criteria -// (2) has the unfortunate side effect of often not reporting the longest -// acceptable interval in such a run. However, when the segments are used as -// anchors for gapped extension, this side effect is unlikely to matter. And -// the time savings gained from setting the extent as far to the right as -// possible, and from not reporting overlapping segments, can be substantial. 
-// -//---------- - -#ifndef debugMismatchExtend -#define debugMismatchExtend_1 ; -#define debugMismatchExtend_2 ; -#define debugMismatchExtend_3A ; -#define debugMismatchExtend_3B ; -#define debugMismatchExtend_3C ; -#endif // not debugMismatchExtend - -#ifdef debugMismatchExtend - -#ifdef debugDiag -static int debugMmExtendDiag; -#else -#define debugMmExtendDiag true -#endif - -#define debugMismatchExtend_1 \ - if (debugMmExtendDiag) \ - { \ - fprintf (stderr, "mmex: hit at " unsposSlashFmt " %d mm\n", \ - pos1-length, pos2-length, E); \ - } - -#define debugMismatchExtend_2 \ - if (debugMmExtendDiag) \ - { \ - u8** mm; \ - fprintf (stderr, "mmex: lefties at"); \ - for (mm=mmScan ; mmv); \ - } \ - if (mmShortfall > 0) \ - fprintf (stderr, " shortfall=%d", mmShortfall); \ - fprintf (stderr, "\n"); \ - } - -#define debugMismatchExtend_3A \ - if (debugMmExtendDiag) \ - { \ - unspos p1 = (*mmScan)+1 - seq1->v; \ - unspos p2 = diagToPos2 (diag,p1); \ - fprintf (stderr, "mmex: %dmm interval" \ - " at " unsposSlashFmt " length %d (shorty)\n", \ - M-mmShortfall, p1, p2, (s1-1)-*mmScan); \ - } - -#define debugMismatchExtend_3B \ - if (debugMmExtendDiag) \ - { \ - unspos p1 = (*mmScan)+1 - seq1->v; \ - unspos p2 = diagToPos2 (diag,p1); \ - fprintf (stderr, "mmex: %dmm interval" \ - " at " unsposSlashFmt " length %d\n", \ - M, p1, p2, thisLength-1); \ - } - -#define debugMismatchExtend_3C \ - if (debugMmExtendDiag) \ - { \ - unspos p1 = (*mmScan)+1 - seq1->v; \ - unspos p2 = diagToPos2 (diag,p1); \ - fprintf (stderr, "mmex: %dmm interval" \ - " at " unsposSlashFmt " length %d (right stop)\n", \ - M, p1, p2, thisLength-1); \ - } - - -#endif // debugMismatchExtend - - -// mismatch_extend_seed_hit-- - -static score mismatch_extend_seed_hit - (hitprocinfo* hp, - unspos* _pos1, - unspos* _pos2, - unspos* _length) - { - unspos pos1 = *_pos1; - unspos pos2 = *_pos2; - unspos length = *_length; - seq* seq1 = hp->seq1; - seq* seq2 = hp->seq2; - sgnpos diag, block2; - unspos 
oldDiagEnd, extent; - u32 hDiag; - u8* s1, *s2, *stop, *left, *right; - u8 nuc1, nuc2; - s8 bits1, bits2; - int M = hp->gfExtend; // (max allowed mismatches) - int E; // (number of mms in seed) - u8* mmLoc[gfexMismatch_max+1]; // (mm loc'ns as starters) - u8** mmScan, **mmStop; - int mmShortfall; - unspos thisLength, bestLength; -#ifdef snoopDiagHash - unspos start2 = pos2 - length; -#endif // snoopDiagHash - - ////////// - // get ready to extend the hit - ////////// - - diag = diagNumber (pos1, pos2); - hDiag = hashedDiag (pos1, pos2); - -#ifdef debugDiag - debugThisDiag = (hDiag == hashedDiag(debugDiag,0)); -#ifdef debugMismatchExtend - debugMmExtendDiag = debugThisDiag; -#endif - - if (debugThisDiag) - { - printf ("mmex: (diag %9s", pair_diagonal_as_text(pos1,pos2)); - printf ("|%9s|%04X) " unsposSlashFmt " end was " unsposFmt "\n", - diagonal_as_text(diagActual[hDiag]), hDiag, - pos1, pos2, diagEnd[hDiag]); - } -#endif - - ////////// - // count the number of mismatches in the hit - ////////// - - s1 = seq1->v + pos1; - s2 = seq2->v + pos2; - stop = s1 - length; - - E = 0; - extent = hashInactiveEnd; // (this will remain unchanged through - // .. the following loop iff there are - // .. no mismatches in the hit) - - while (s1 > stop) - { - bits1 = hp->charToBits[*(--s1)]; - bits2 = hp->charToBits[*(--s2)]; - - if ((bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - { - extent = s2 - seq2->v; // (leftmost interesting mismatch in hit) - if (++E > M) // seed contains too many mismatches - goto hit_isnt_a_match; - } - } - - debugMismatchExtend_1; - - ////////// - // extend left until the M+1-Eth mismatch, saving positions in an array; - // note that we might not find that many mismatches, since we may hit the - // stop first - ////////// - - s1 = seq1->v + pos1 - length; // start at start of hit in both seq1 and - s2 = seq2->v + pos2 - length; // .. seq2; will pre-decrement before - // .. reads, so first bp read are the ones - // .. 
immediately in front of the hit - - // determine stop location; this is at the start of sequence 1, except - // that if this diagonal ends (or is blocked) earlier in sequence 2, we - // stop there - // (see note 3; instead of zero, we should use subsequence's start) - - if (unblockedLeftExtension) oldDiagEnd = 0; - else oldDiagEnd = diagEnd[hDiag]; - block2 = (sgnpos) oldDiagEnd; - if (block2 + diag > 0) stop = seq1->v + block2 + diag; - else stop = seq1->v; - - // set up mmScan to the first location past the end of the mmLoc array; - // we view the array as being of size M+1-E, since this is the number of - // mismatches left of the seed that we have any interest in; we will - // pre-decrement this pointer as we collect mismatches into the array; - // entries in the array will point to the position of a mismatch (or a fake - // mismatch one left of the stop); note that since M+1-E > 0, we will - // always be looking for at least 1 mismatch - - mmScan = mmLoc + M+1-E; - mmStop = mmScan; // (this is used in the right-scanning stage) - - // extend - - if (s1 < stop) - { - s1--; // if the new hit is left of the previous block (as can happen - s2--; // .. when called by process_for_recoverable_hit), the normal - // .. loop (below) will fail to step, and so won't stop on a - // .. mismatch; so in this case we need to push the position - // .. to one to the left of the start of the match - } - else - { - while (s1 >= stop) - { - if (s1 == stop) // (this test is necessary since we - { s1--; s2--; break; }// .. don't have a zero-terminator at - // .. 
the start of the sequence) - - nuc1 = *(--s1); - bits1 = hp->charToBits[nuc1]; - nuc2 = *(--s2); - bits2 = hp->charToBits[nuc2]; - - if ((nuc1 == 0) // (NUL => end of partition or chore) - || (nuc2 == 0)) - break; - - if ((bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - { - *(--mmScan) = s1; // save this as left endpoint - if (mmScan == mmLoc) // if we have enough, quit scanning - break; - } - } - } - - // if we did not get enough mismatches, add the point at which we stopped; - // also record the number of mismatches that we *didn't* find, since we will - // need to skip that many during the right-scanning stage; note that this - // guarantees that the array of collected interval starts is never empty - - if (mmScan > mmLoc) - *(--mmScan) = s1; - - mmShortfall = mmScan - mmLoc; - debugMismatchExtend_2; - - ////////// - // extend to the right, finding an ending mismatch for each of our - // intervals - ////////// - - s1 = seq1->v + pos1 - 1; // start at end of hit in both seq1 and seq2; - s2 = seq2->v + pos2 - 1; // .. will pre-increment before reads, so first - // .. bp read are the ones immediately after the - // .. 
hit - - // determine stop location; this is at the end of sequence 1, except - // that if this diagonal ends earlier in sequence 2, we stop there - // (see note 3; instead of sequence's end, we should use subsequence's end) - - block2 = (sgnpos) seq2->len; - if ((sgnpos) seq1->len <= block2 + diag) stop = seq1->v + seq1->len; - else stop = seq1->v + block2 + diag; - - // extend - - bestLength = 0; - left = right = NULL; - - while (s1 < stop) - { - nuc1 = *(++s1); - bits1 = hp->charToBits[nuc1]; - nuc2 = *(++s2); - bits2 = hp->charToBits[nuc2]; - - if ((nuc1 == 0) // (NUL => end of partition or chore) - || (nuc2 == 0)) - break; - - if ((bits1 != bits2) - || (bits1 < 0) // (negative => not ACGTacgt) - || (bits2 < 0)) - { - if (extent == hashInactiveEnd) - extent = s2 - seq2->v; - if (mmShortfall > 0) - { - debugMismatchExtend_3A; - mmShortfall--; - continue; - } - thisLength = s1 - *mmScan; - debugMismatchExtend_3B; - if (thisLength > bestLength)// if this is the best interval so far - { // .. save the endpoints - bestLength = thisLength; - left = *mmScan; - right = s1; - } - if (++mmScan == mmStop) // if we have enough, quit scanning - break; - } - } - - // if we did not get enough mismatches, treat the point at which we stopped - // as another endpoint; note that we don't honor mmShortfall here, because - // the interval at this endpoint has less than M mismatches and if it is - // long enough it will be an acceptible extension - - if (mmScan < mmStop) - { - if (extent == hashInactiveEnd) - extent = s2 - seq2->v; - thisLength = s1 - *mmScan; - debugMismatchExtend_3C; - if (thisLength > bestLength) // if this is the best interval so far - { // .. 
save the endpoints - left = *mmScan; - right = s1; - } - } - - if (left == NULL) - suicide ("internal error (in mismatch_extend_seed_hit) found no interval"); - - ////////// - // update length of hit - ////////// - - pos1 = (unspos) (right - seq1->v); - pos2 = (unspos) (((sgnpos) pos1) - diag); - length = (unspos) (right - (left + 1)); - - ////////// - // record the extent of the search on this diagonal - ////////// - - if (length >= (unsigned) hp->hspThreshold.s) - extent = (unspos) (((sgnpos) pos1+1) - diag); - - // record the extent - - if (extent > diagEnd[hDiag]) - { - diagEnd [hDiag] = extent; - diagActual[hDiag] = diag; -#ifdef snoopDiagHash - fprintf (stderr, " m setting diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - - ////////// - // decide whether or not this extended seed hit is long enough. 
- ////////// - - dbg_timing_count_stat (ungappedExtensions); - - // if it isn't long enough, discard it (see note 4) - - if (length < (unsigned) hp->hspThreshold.s) - { - seed_search_count_stat (lowScoringHsps); - return noScore; - } - - // it's a keeper - - *_pos1 = pos1; - *_pos2 = pos2; - *_length = length; - - seed_search_count_stat (hsps); - dbg_timing_count_stat (hsps); - - return (score) length; - - ////////// - // special exit for the case of the hit not being an exact match - ////////// - -hit_isnt_a_match: - - // record the extent of the search on this diagonal - - if (extent > diagEnd[hDiag]) - { - diagEnd [hDiag] = extent; - diagActual[hDiag] = diag; -#ifdef snoopDiagHash - fprintf (stderr, " nm setting diag %9s" - ", diagActual[%04X] = %9s" - ", diagEnd[%04X] = " unsposFmt - ", seed end = " unsposSlashFmt - ", in seq 1: " unsposDotsFmt "\n", - pair_diagonal_as_text(pos1,pos2), - hDiag, diagonal_as_text(diagActual[hDiag]), - hDiag, diagEnd[hDiag], - pos1, pos2, start2, pos2); -#endif // snoopDiagHash - } - - seed_search_count_stat (lowScoringHsps); - dbg_timing_count_stat (ungappedExtensions); - return noScore; - } - -//---------- -// -// warn_for_search_limit-- -// Tell the user that this query exceeded the limit for HSPs. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -static void warn_for_search_limit - (void) - { - static int firstReport = true; - char* name2 ; - - seed_search_dbgSearchLimitExceeded++; - if (reportSearchLimit == 0) return; - - name2 = (seq2->useFullNames)? seq2->header : seq2->shortHeader; - fprintf (stderr, "WARNING. 
Query \"%s\" contains more than %s HSPs.\n", - name2, commatize(reportSearchLimit)); - - if (firstReport) - { - fprintf (stderr, "All HSPs for this query are discarded and the query is not processed further.\n"); - firstReport = false; - } - } - -//---------- -// -// discovery_probability-- -// Compute the probability that a particular HSP would be discovered by a given -// (seed,step) search strategy, if it was equally likely to have occured at any -// sequence position. -// -// For any step size Z greater than 1, our search process will miss some HSPs. -// For example, if an HSP contains only one seed hit, and that seed hit occurs -// in sequence 2 at an odd position, if Z=2 we will miss that seed hit (and thus -// the HSP). More generally, if a seed hit occurs at position X, we will only -// find it if X == 0 modulo Z. -// -// This routine scans an HSP for the positions of all the seed hits it contains, -// and counts how many positional shifts would put at least one seed hit on a -// multiple of Z. There are Z different positional shifts, so the probability -// that the HSP would have been discovered is C/Z, where C is that count. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// unspos pos1: The hit position in sequence 1. This is the position -// .. following the end of the hit. -// seq* seq2: The other sequence. -// unspos pos2: The hit position position in sequence 2 (same details as -// .. pos1). -// unspos length: The length of the alignment. -// seed* hitSeed: Seeding strategy for the hits that found this match. -// u32 step: Positional step size in the search for those hits. 
-// -// Returns: -// (nothing) -// -//---------- - -float discovery_probability - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - seed* hitSeed, - u32 step) - { - u8* aStart = seq1->v + pos1 - length; - u8* aStop = seq1->v + pos1; - u8* bStart = seq2->v + pos2 - length; - u8* a, *b; - int aa, bb; - u64 aUnpacked, bUnpacked; - u32 aPacked, bPacked; - int len; - u32 *flip, flipBits, trans; - u64 diffBits, transBits; - u32 i; - int foundCount; - - // allocate a scratch array - - if ((foldedSize > step) && (foldedHits != NULL)) - { free_if_valid ("folded hits", foldedHits); foldedHits = NULL; } - - if (foldedHits == NULL) - { - foldedHits = (u8*) malloc_or_die ("", step); - foldedSize = step; - } - - // build the transition bits mask - - flipBits = 0; - for (flip=hitSeed->transFlips ; (*flip)!=0 ; flip++) - flipBits += *flip; - - transBits = seed_unpack (hitSeed, flipBits, NULL); - - // scan the alignment, checking for seed matches - - foundCount = 0; - for (i=0 ; ilength)&&(awithTrans == 0) continue; - - diffBits = aUnpacked ^ bUnpacked; - trans = (diffBits<<1) & transBits; // (1=>transversion) - if (trans != 0) continue; - trans = (diffBits & ~(diffBits<<1)) & transBits; // (1=>transition) - if (bit_count(trans) > hitSeed->withTrans) continue; - - got_a_hit: - i = (a-(aStart+hitSeed->length)) % step; - if (foldedHits[i] == 0) - { foldedHits[i] = 1; foundCount++; } - } - } - - return foundCount / (float) step; - } - -//---------- -// -// dump_raw_hit-- -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// unspos pos1, pos2: The hit position in sequences 1 and 2. This is the -// .. position following the end of the hit. 
-// -// Returns: -// (nothing) -// -//---------- - -static void dump_raw_hit - (FILE* f, - unspos pos1, - unspos pos2) - { - int isRev1 = ((seq1->revCompFlags & rcf_rev) != 0); - int isRev2 = ((seq2->revCompFlags & rcf_rev) != 0); - u32 seedLength, len1; - - seedLength = (unsigned) hitSeed->length; - len1 = seedLength-1; - - fprintf (f, "raw seed hit " unsposSlashCFmt " ", - pos1-len1, (isRev1)?'-':'+', - pos2-len1, (isRev2)?'-':'+'); - print_prefix (f, (char*) seq1->v + pos1-seedLength, (int) seedLength); - fprintf (f, "/"); - print_prefix (f, (char*) seq2->v + pos2-seedLength, (int) seedLength); - fprintf (f, "\n"); - } - -//---------- -// -// dump_extended_match-- -// Show an extended match, with its negative scoring flanks. -// -// Example: -// -// 4100: TTGCAAGAAGG ACAT[GGAAGGAA]GA ACGGATCTA -// GCTGTTATCAA ACAA[GGAAGGAA]GA CTTCTAGGT -// -// - GGAAGGAA/GGAAGGAA is the seed match (in this case it was an 8-mer -// exact match) -// - ACAT/ACAA improves the score on the left -// - GA/GA improves the score on the right -// - TTGCAAGAAGG/GCTGTTATCAA would drop the score on the left -// - ACGGATCTA/CTTCTAGGT would drop the score on the right -// - 4100 is the index of ACGGATCTA in sequence 1 (origin-0) -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1, seq2: The sequences. -// sgnpos diag: The diagonal the match is on (pos1 - pos2). -// u8* p1..p6: Pointers into sequence 1, best described by the diagram -// .. below. 
-// -// 4100: TTGCAAGAAGG ACAT[GGAAGGAA]GA ACGGATCTA -// ^ ^ ^ ^ ^ ^ -// p1 p2 p3 p4 p5 p6 -// -// Returns: -// (nothing) -// -//---------- - -#ifdef debugDiag - -static void dump_extended_match - (FILE* f, - seq* seq1, - seq* seq2, - sgnpos diag, - u8* p1, - u8* p2, - u8* p3, - u8* p4, - u8* p5, - u8* p6) - { - u8* s1, *s2; - - fprintf (f, "\n"); - fprintf (f, "%9u: ", (unspos) (p1-seq1->v)); - for (s1=p1 ; s1v - diag)); - s2 = seq2->v + ((sgnpos) (p1-seq1->v)) - diag; - for (s1=p1 ; s1 0) sprintf (s, "+" sgnposFmt, diag); - else sprintf (s, sgnposFmt, diag); - - return s; - } - -#endif // debugDiag || snoopXDrop - -//---------- -// -// seed_search_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void seed_search_zero_stats - (void) - { - dbg_timing_set_stat (ungappedExtensions, 0); - dbg_timing_set_stat (hsps, 0); - -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&seedSearchStats, 0, sizeof(seedSearchStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// seed_search_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. -// -// Returns: -// (nothing) -// -//---------- - -void seed_search_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats -#ifdef maxHitsPerColumn - int hits; - int haveAnyHits; - char scratch[19]; -#endif // maxHitsPerColumn -#endif // collect_stats - - dbg_timing_report_big_stat (ungappedExtensions, "ungapped extensions"); - dbg_timing_report_stat (hsps, "HSPs"); - -#ifdef collect_stats - - if (f == NULL) return; - - fprintf (f, " allow transition: %s\n", (seedSearchStats.withTrans==0)? "no" : - (seedSearchStats.withTrans==1)? 
"yes" - : "two"); - fprintf (f, " words in seq 2: %s\n", commatize(seedSearchStats.wordsInSequence)); - fprintf (f, " false seed hits: %s\n", commatize(seedSearchStats.unresolvedSeedHits)); - fprintf (f, " raw seed hits: %s\n", commatize(seedSearchStats.rawSeedHits)); - if (seedSearchStats.rawSeedHits > 0) - { - fprintf (f, " hash collisions: %s (%.2f%%)\n", commatize(seedSearchStats.hashCollisions), - 100*seedSearchStats.hashCollisions / (float) seedSearchStats.rawSeedHits); - fprintf (f, " hash failures: %s (%.2f%%)\n", commatize(seedSearchStats.hashFailures), - 100*seedSearchStats.hashFailures / (float) seedSearchStats.rawSeedHits); - } - fprintf (f, " bp extended: %s\n", commatize(seedSearchStats.bpExtended)); - -#ifndef noSeedHitQueue - fprintf (f, " queue scanned: %s (%.1f)\n", commatize(seedSearchStats.queueSeedsScanned), - seedSearchStats.queueSeedsScanned / (float) seedSearchStats.rawSeedHits); - fprintf (f, " queue examined: %s (%.1f)\n", commatize(seedSearchStats.queueSeedsExamined), - seedSearchStats.queueSeedsExamined / (float) seedSearchStats.rawSeedHits); - fprintf (f, " queue blocked: %s (%.1f)\n", commatize(seedSearchStats.queueSeedsBlocked), - seedSearchStats.queueSeedsBlocked / (float) seedSearchStats.rawSeedHits); -#endif // not noSeedHitQueue - fprintf (f, "-------------------\n"); - - if (seedSearchStats.minMatches >= 0) - { - fprintf (f, " matches >= %2d: %s\n", seedSearchStats.minMatches, commatize(seedSearchStats.notEnoughMatches)); - if (seedSearchStats.maxTransversions >= 0) - fprintf (f, "transvers'ns <= %2d: %s\n", seedSearchStats.maxTransversions, commatize(seedSearchStats.tooManyTransversions)); - if (seedSearchStats.filterCaresOnly) - fprintf (f, " (cares only)\n"); - fprintf (f, "-------------------\n"); - } - if (seedSearchStats.searchLimit > 0) - fprintf (f, " search limit: %s\n", commatize(seedSearchStats.searchLimit)); - if (seedSearchStats.isHspSearch) - { - int64 numExt = seedSearchStats.hsps+seedSearchStats.lowScoringHsps; - 
fprintf (f, " GF extensions: %s\n", commatize(numExt)); - fprintf (f, " HSP wanna-bes: %s\n", commatize(seedSearchStats.lowScoringHsps)); - fprintf (f, " HSPs: %s\n", commatize(seedSearchStats.hsps)); - if (numExt > 0) - fprintf (f, " bp/extension: %s\n", commatize((2*seedSearchStats.bpExtended+numExt)/(2*numExt))); -#ifdef snoopHspSubrange - fprintf (f, " suboptimal hsps: %s (%.2f%%)\n", commatize(seedSearchStats.suboptimalHsp), - 100*seedSearchStats.suboptimalHsp / (float) seedSearchStats.rawSeedHits); - if (seedSearchStats.hsps + seedSearchStats.suboptimalHspB != 0) - fprintf (f, "unjustly discarded: %s (%.2f%%)\n", commatize(seedSearchStats.suboptimalHspB), - 100*seedSearchStats.suboptimalHspB / (float) (seedSearchStats.hsps + seedSearchStats.suboptimalHspB)); - else - fprintf (f, "unjustly discarded: 0\n"); -#endif // snoopHspSubrange - fprintf (f, "-------------------\n"); - } - -#ifdef maxHitsPerColumn - haveAnyHits = false; - for (hits=1 ; hits<=maxHitsPerColumn+1 ; hits++) - { - if (seedSearchStats.hitsPerColumn[hits] != 0) { haveAnyHits = true; break; } - } - - if (haveAnyHits) - { - fprintf (f, "(seq 2 words with N raw seed hits)\n"); - for (hits=0 ; hits<=maxHitsPerColumn ; hits++) - { - if (seedSearchStats.hitsPerColumn[hits] == 0) continue; - fprintf (f, "%18d: %s\n", hits, commatize(seedSearchStats.hitsPerColumn[hits])); - } - hits = maxHitsPerColumn + 1; - if (seedSearchStats.hitsPerColumn[hits] != 0) - { - sprintf (scratch, "> %d", maxHitsPerColumn); - fprintf (f, "%18s: %s\n", scratch, commatize(seedSearchStats.hitsPerColumn[hits])); - fprintf (f, " max hits for word: %s\n", commatize(seedSearchStats.mostHitsInColumn)); - } - fprintf (f, "-------------------\n"); - } -#endif // maxHitsPerColumn - -#endif // collect_stats - } - -void seed_search_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "raw_seed_hits=%" PRId64 "\n", 
seedSearchStats.rawSeedHits); - (*func) (f, "hash_collisions=%" PRId64 "\n", seedSearchStats.hashCollisions); - (*func) (f, "hash_failures=%" PRId64 "\n", seedSearchStats.hashFailures); - (*func) (f, "bp_extended=%" PRId64 "\n", seedSearchStats.bpExtended); -#endif // collect_stats - } - -#ifdef collect_stats -int64 seed_search_hsps (void) { return seedSearchStats.hsps; } -int64 seed_search_low_scoring_hsps (void) { return seedSearchStats.lowScoringHsps; } -int64 seed_search_bp_extended (void) { return seedSearchStats.bpExtended; } -#endif // collect_stats - diff --git a/programs/lastz/src/seed_search.h b/programs/lastz/src/seed_search.h deleted file mode 100644 index 3cdec0f..0000000 --- a/programs/lastz/src/seed_search.h +++ /dev/null @@ -1,288 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: seed_search.h -// -//---------- - -#ifndef seed_search_H // (prevent multiple inclusion) -#define seed_search_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "sequences.h" // sequence stuff -#include "pos_table.h" // position table stuff -#include "segment.h" // segment table management stuff - -// establish ownership of global variables - -#ifdef seed_search_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access -// nota bene: showProgress is a relic which is not currently used - -#ifdef seed_search_owner -int seed_search_showProgress = false; // true => make periodic progress reports -int seed_search_dbgDumpRawHits = false; // true => dump hits in find_table_matches() -int seed_search_dbgShowRawHits = false; // true => show hits in find_table_matches() -int seed_search_dbgShowHits = false; // true => show hits in process_for_plain_hit(), - // .. process_for_simple_hit(), - // .. process_for_recoverable_hit(), - // .. 
or process_for_twin_hit() -int seed_search_dbgShowCoverage= false; // true => show bases hit count in - // .. seed_hit_search(), -int seed_search_dbgShowRejections= false;// true => report queries rejected in - // .. seed_hit_search(), - - -int seed_search_dbgSearchLimitExceeded = 0; -#ifdef snoopHspSubrange -int seed_search_dbgSubrangeHsps = false;// true => show all suboptimal HSPs -#endif // snoopHspSubrange -#else -global int seed_search_showProgress; -global int seed_search_dbgDumpRawHits; -global int seed_search_dbgShowRawHits; -global int seed_search_dbgShowHits; -global int seed_search_dbgShowCoverage; -global int seed_search_dbgShowRejections; -global int seed_search_dbgSearchLimitExceeded; -#ifdef snoopHspSubrange -global int seed_search_dbgSubrangeHsps; -#endif // snoopHspSubrange -#endif - -//---------- -// -// seed hit reporter functions-- -// Report the discovery of a seed hit or HSP. -// -// Arguments: -// void* info: Additional control/arguments specific to the seed hit -// .. reporter being called. -// unspos pos1: position, in the first sequence, of first character -// .. *after* the last character in the match (origin-0) -// unspos pos2: position, in the second sequence, of first character -// .. *after* the last character in the match (origin-0) -// unspos length: number of nucleotides -// score s: the match's "score" -// -// Returns: -// The number of bases in the seed hit (or HSP); 0 if the hit is -// rejected. -// -//---------- - -typedef u32 (*hitreporter) (void*, unspos, unspos, unspos, score); - -//---------- -// -// seed hit processor functions-- -// Process a simple seed hit for a given seed pattern. -// -//---------- -// -// Arguments: -// void* info: Additional control/arguments specific to the seed hit -// .. processor being called. This will usually be of a -// .. type that includes at least the stuff in hitprocinfo. -// unspos pos1: The hit position in sequence 1, relative to the -// .. entire sequence (not to the interval). 
This is -// .. the first letter following the end of the match -// .. (origin-0). -// unspos pos2: The hit position in sequence 2 (with details the -// .. same as for pos1). -// unspos length: The length of the hit (number of nucleotides). -// -// Returns: -// The number of bases in the seed hit (or HSP); 0 if the hit is -// rejected. -// -//---------- - -typedef u64 (*hitprocessor) (void*, unspos, unspos, unspos); - -// basic data structure for all hit processors - -typedef struct hitprocinfo - { - hitreporter reporter; // function to call to report each hit that is - // .. 'good enough' - void* reporterInfo; // value to pass thru with each call to reporter - - // filtering - - int posFilter; // true => discard seed hits outside of - interval targetInterval; // .. tStart,tEnd in target->v[] or - interval queryInterval; // .. qStart,qEnd in query->v[]; the - // .. intervals are origin-zero closed - - int minMatches; // filter criteria for each seed hit; we - int maxTransversions;//.. require at least minMatches matches and no - char* filterPattern; // .. more than maxTransversions transversions; - // .. if minMatches<0 no filtering is performed; - // .. if maxTransversions<0 there is no - // .. transversion limit; if filterPattern is - // .. non-NULL the filter criteria only applies - // .. to the pattern's "care" positions (and - // .. not to any don't-care positions); - // .. filterPattern points directly into the - // .. active seed, and needn't be deallocated - const s8* charToBits; // table to map sequence characters to two-bit - // .. values, and illegal characters to -1; - // .. indexed by a u8 value, 0..255; normally - // .. this will consider upper and lower case to - // .. be the same - - // gap-free extension - - int gfExtend; // whether to extend seed hits into HSPs (one - // .. of gfexXXX) - seq* seq1; - seq* seq2; - scoreset* scoring; // the scoring scheme; usually this treats - // .. 
lowercase letters as being 'bad' - score xDrop; - sthresh hspThreshold; - score hspZeroThreshold; // max(0,hspThreshold.s) - segtable** anchors; - int entropicHsp; - int reportEntropy; - } hitprocinfo; - -// legal values for gfExtend; note that gfexMismatch_min has to be 1, since -// values in the range gfexMismatch_min..gfexMismatch_max are identical to the -// number of mismatches they represent - -enum - { - gfexNoExtend = -2, // simple gfex, no hash collision detection - gfexXDrop = -1, // extend seed hits into HSPs, using xdrop - gfexExact = 0, // extend seed hits using exact match - gfexMismatch_min = 1, // 1..50 => extend seed hits using N-mismatch - gfexMismatch_max = 50 - }; - -// special data structure for process_for_plain_hit(info,...) and -// process_for_simple_hit(info,...) - -typedef struct hitprocsimple - { - hitprocinfo hp; // basic info - } hitprocsimple; - -// special data structure for process_for_twin_hit(info,...) - -typedef struct hitproctwin - { - hitprocinfo hp; // basic info - u32 minSpan; // span threshold for hits to be considered - u32 maxSpan; // .. twins; we require two (or more) hits - // .. with end2-start1 between minSpan and - // .. 
maxSpan, inclusive - } hitproctwin; - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -//#define maxHitsPerColumn 1000 - -global struct - { - int withTrans; - int minMatches; - int maxTransversions; - int filterCaresOnly; - int isHspSearch; - u32 searchLimit; - - int wordsInSequence; - int64 unresolvedSeedHits; - int64 rawSeedHits; - int64 hashCollisions; - int64 hashFailures; - int64 notEnoughMatches; - int64 tooManyTransversions; - int64 bpExtended; - int64 lowScoringHsps; - int64 hsps; - -#ifdef snoopHspSubrange - int64 suboptimalHsp; - int64 suboptimalHspB; -#endif // snoopHspSubrange - -#ifndef noSeedHitQueue - int64 queueSeedsScanned; - int64 queueSeedsExamined; - int64 queueSeedsBlocked; -#endif // not noSeedHitQueue - -#ifdef maxHitsPerColumn - int64 hitsPerColumn[maxHitsPerColumn+2]; - u64 mostHitsInColumn; -#endif // maxHitsPerColumn - } seedSearchStats; - -// stats macros - -#define seed_search_count_stat(field) ++seedSearchStats.field -#define seed_search_uncount_stat(field) --seedSearchStats.field -#define seed_search_set_stat(field,val) (seedSearchStats.field = val) -#define seed_search_add_stat(field,val) (seedSearchStats.field += val) -#else -#define seed_search_count_stat(field) -#define seed_search_uncount_stat(field) -#define seed_search_set_stat(field,val) -#define seed_search_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void seed_search_zero_stats (void); -void seed_search_show_stats (FILE* f); -void seed_search_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); -int64 seed_search_hsps (void); -int64 seed_search_low_scoring_hsps (void); -int64 seed_search_bp_extended (void); - -//---------- -// -// prototypes for routines in seed_search.c -// -//---------- - -u64 seed_hit_search (seq* seq1, postable* pt, - seq* seq2, unspos start, unspos end, - int selfCompare, - const s8 charToBits[], seed* hitSeed, - u32 searchLimit, u32 
reportSearchLimit, -#ifdef densityFiltering - double maxDensity, -#endif // densityFiltering - hitprocessor processor, void* processorInfo); -void free_seed_hit_search (void); -u64 process_for_plain_hit (void* info, - unspos pos1, unspos pos2, unspos length); -u64 process_for_simple_hit (void* info, - unspos pos1, unspos pos2, unspos length); -u64 process_for_recoverable_hit (void* info, - unspos pos1, unspos pos2, unspos length); -u64 process_for_twin_hit (void* info, - unspos pos1, unspos pos2, unspos length); -float discovery_probability (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - seed* hitSeed, u32 step); - -#undef global -#endif // seed_search_H diff --git a/programs/lastz/src/seeds.c b/programs/lastz/src/seeds.c deleted file mode 100755 index a9620d7..0000000 --- a/programs/lastz/src/seeds.c +++ /dev/null @@ -1,1435 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: seeds.c -// -//---------- -// -// seeds-- -// Support for generalized spaced seeds, including transitions. -// -// A "seed pattern" (usually shortened here to "seed") describes a criteria -// for local matches (called "seed hits" or just "hits") between two DNA -// sequences. The seed specifies which positions in a small local window must -// match and which are allowed to differ. This is the basis of an anchor- -// finding heuristic for discovering orthology between the two sequences. -// -// A seed is described by a string over the symbols {1,0,T}, with 0 prohibited -// from each end of the string. A 1 indicates a position which must match, a 0 -// allows any mismatch, and a T allows transition mismatches. Seeds with only -// 1 and 0 bits are called "strict seeds". For the seed 1101100100101111, the -// two sequences below have a hit at the locations shown, because the sequences -// match at every 1 position in the seed. -// -// sequence 1: ..GGACCTCTTCTCGCTCTATATAAGCGGTGG.. 
-// match/trans: | || || | || |||| | : | -// sequence 2: ..CACTACTGTCGCTCTATATGAACGTGATGT.. -// seed: 1101100100101111 -// -// In addition to the basic criteria above, a seed can further specify that one -// or two transitions are allowed among its match bits (the 1's). Thus if the -// same seed allowed one transition, we would also have the hit shown below. If -// the seed did not allow a transition, this would *not* be a hit, in spite of -// the fact that the sequences have more matches in this window that in the -// window above; matches in the spaces (the 0s) are irrelevant. -// -// sequence 1: ..GGACCTCTTCTCGCTCTATATAAGCGGTGG.. -// match/trans: || :| |||||||||||:|:|| : -// sequence 2: ..CACTACTGTCGCTCTATATGAACGTGATGT.. -// 1101100100101111 -// transitions: * -// -// Another extension of the hit criteria is to allow T positions. A T specifies -// that the position must contain either a match or a transion, but not a -// transversion. So if our seed were 1101T00100101T11 we would have both of -// those hits. Note that the transitions allowed by a T location are separate -// from the one or two transitions that can be allowed at match positions. -// -// sequence 1: ..GGACCTCTTCTCGCTCTATATAAGCGGTGG.. -// match/trans: | || || | || |||| | : | -// sequence 2: ..CACTACTGTCGCTCTATATGAACGTGATGT.. -// seed: 1101T00100101T11 -// -// sequence 1: ..GGACCTCTTCTCGCTCTATATAAGCGGTGG.. -// match/trans: || :| |||||||||||:|:|| : -// sequence 2: ..CACTACTGTCGCTCTATATGAACGTGATGT.. -// 1101T00100101T11 -// -// The "length" (L) of a seed is the number of locations in its string. This -// corresponds to the length of a corresponding hit. The "weight" (W) of a -// strict seed is the number of 1s; for seeds with Ts we count each T as half -// of a 1. 1101100100101111 is called a "10 of 16" seed (length 16, weight 10). -// The same terminology doesn't apply to non-strict seeds; 1101T00100101T11 -// has weight 9 and length 16 but is not rightfully a 9 of 16 seed. 
The "bit -// weight" of a seed is twice its weight. -// -// Yet another extension is the concept of "half-weight" seeds. These are seeds -// consisting entirely of Ts and 0s. This facilitates a heuristic process in -// which the seed is used to identify hits with transitions or matches in the -// prescribed locations, and then the hits are further qualified by requiring -// a minimum number of matches over the length of the seed (including in the -// spaces). -// -// Seed hits are usually found by the following process (this is implemented -// in some other module, but is included in this discussion to give motivation -// for the discussion of "overweight seeds" below). A window of L nucleotides -// slides across sequence 1. The nucleotides in the window are converted to a -// two-bit-per-base word, the bits relevant to the seed are packed into a -// smaller word, and a list of all the locations at which that word occurs is -// kept. Then the second sequence is scanned in a similar manner, with each -// packed word used to locate the list of matching positions. In practice the -// packed word is used as an index into a table of lists, and so we often call -// it the index. -// -// Here's how a packing for 1101100100101111 might work. Since we have two bits -// per base, we really need to collect bits in pairs. The general idea is shown -// below, but we won't go into great detail here about how this is accomplished. -// The key concept is that we extract 2W bits from a word of size 2L, and -// combine the relevant bits to form a unique word of size 2W, the index. -// -// seed string: 1 1 0 1 1 0 0 1 0 0 1 0 1 1 1 1 -// seed bits: abcd--efgh----ij----kl--mnopqrst -// packed bits: ghijabcdklefmnopqrst -// -// In practice, the size of the index is limited by the amount of memory -// available for the sequence position table. On 32-bit machines with 1G of -// memory the practical limit is around 26 index bits. 
To allow for heavier -// seeds than this, we handle "overweight seeds". The actual index is limited -// to the practical maximum, and the remaining seed bits are resolved by -// comparison to the sequence. -// -// For example, consider the weight-14 seed 1110101100110010101111, and suppose -// the maximum index size is 23 bits. We need to reduce the seed's 28 bits to -// 23, so we change the last 5 1s to Ts: 1110101100110010T0TTTT (why we choose -// this particular reduction will become clear in a moment). Any hit for the -// full seed will be a hit for this partial seed. But (in random sequences) -// only 1 in 32 hits to the partial seed are true hits for the full seed. Upon -// detection of a hit (to the partial seed) we resolve whether it is a true hit -// by comparing the missing bits (the "resolving" bits). These are the most- -// signifcant bits from each of the five induced T positions. If these all -// match (as they do in the example below), then the hit is a true hit for the -// full seed. Due to the way we chose the resolving bits, if they don't match, -// the number of bits that don't match gives the number of transitions over -// those seed positions (mismatches can't be transversions since the least- -// signifcant bits would not have matched). -// -// sequence 1: .. C T C T T C T C G C T C T A T A T A A G C G .. -// match/trans: | | | | | | | : | | | : : | | : | | | | -// sequence 2: .. C T C G T C T C A C T C C G T C T G A G C G .. 
-// ------------------------------------------------------------- -// full seed: 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 -// partial seed: 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 T 0 T T T T -// ------------------------------------------------------------- -// seq1 bits: 01110111110111011001110111001100110000100110 -// seq2 bits: 01110110110111010001110101101101111000100110 -// seq1 r-bits: --------------------------------1---0-1-0-1- -// seq2 r-bits: --------------------------------1---0-1-0-1- -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include // standard C upper/lower stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff - -#define seeds_owner // (make this the owner of its globals) -#include "seeds.h" // interface to this module - -// complain if someone has tried to set limits higher than we can support - -#if maxSeedLen > 31 -#error ***** maxSeedLen is too large (this module only supports maxSeedLen <= 31) ***** -#endif - -#if maxHwSeedLen > 63 -#error ***** maxHwSeedLen is too large (this module only supports maxHwSeedLen <= 63) ***** -#endif - -#if maxSeedBitWeight > 31 -#error ***** maxSeedBitWeight is too large (this module only supports maxSeedBitWeight <= 31) ***** -#endif - -#if maxResolvedBits > 16 -#error ***** maxResolvedBits is too large (this module only supports maxResolvedBits <= 16) ***** -#endif - -#define maintainFlippedBitOrder // if defined, the transition flip order is - // .. the same for overweight seeds as for the - // .. 
equivalent full seed - -//---------- -// -// prototypes for private functions -// -//---------- - -static seed* parse_one_seed (char* s, char* e, int transitionsOk, - int maxIndexBits); -static seed* new_seed (int numParts, int patternLen, int numFlips); -static int best_shift (u32 uncoveredBits, u64 seedBits); - -//---------- -// -// parse_seeds_string, parse_strict_seeds_string-- -// Convert a seed(s) string into a collection of bits 'implementing' the -// seed(s). parse_strict_seeds_string() doesn't allow transitions ('T'). -// -//---------- -// -// Arguments: -// char* s: The seeds, represented as a comma-separated string. -// .. See parse_one_seed() for the format of each seed. -// seed** seed: A place to store the resulting implementation of the -// .. seeds. This will be a linked list of seed -// .. structures that have been allocated from the -// .. heap, and the caller must eventually dispose of -// .. it, with a call to free_seeds(). -// int maxIndexBits: The maximum weight of the seed (in bits) that can -// .. be directly supported. If the seed is heavier -// .. than this, it will be implemented as an -// .. overweight seed. Zero indicates that overweight -// .. seeds should not be created. -// -// Returns: -// The *bit* weight of the seed (the maximum of the weights if there are -// multiple seeds). -// -//---------- -// -// Notes: -// -// (1) Failures result in program fatality. -// -// (2) No seed can be longer than 31 locations. -// -// (3) Seed weights are limited by maxIndexBits. The weight of the seed's -// index will not exceed maxIndexBits. Seeds heavier than that will be -// made as overweight seeds. maxIndexBits is limited to 31. -// -// (4) The internal representation of the seed is dependent on the specific -// 2-bit encoding of nucleotides, which is defined (implicitly) in -// dna_utilities.c. It requires that the least significant of the two bits -// distinguishes between purines and pyramidines. 
-// -//---------- - -static int _parse_seeds_string (char* s, seed** seed, int transitionsOk, - int maxIndexBits); - -int parse_seeds_string (char* s, seed** seed, int maxIndexBits) - { return _parse_seeds_string (s, seed, /*transitionsOk*/ true, maxIndexBits); } - -int parse_strict_seeds_string (char* s, seed** seed, int maxIndexBits) - { return _parse_seeds_string (s, seed, /*transitionsOk*/ false, maxIndexBits); } - -static int _parse_seeds_string - (char* s, - seed** _seed, - int transitionsOk, - int maxIndexBits) - { - seed* tail = NULL; - seed* newSeed; - int maxWeight; - char* terminator; - - ////////// - // convert each comma-separated piece of the string into a seed, - // collecting them into a linked list - ////////// - - *_seed = NULL; - maxWeight = -1; - - while (true) - { - terminator = strchr (s, ','); - if (terminator == NULL) terminator = s + strlen (s); - - newSeed = parse_one_seed (s, terminator-1, transitionsOk, maxIndexBits); - if (*_seed == NULL) - { - tail = *_seed = newSeed; - maxWeight = newSeed->weight; - } - else - { - tail->next = newSeed; - tail = newSeed; - if (newSeed->weight > maxWeight) - maxWeight = newSeed->weight; - } - - if (*terminator == 0) - break; - s = terminator+1; - } - - // return weight, counted in bits - - return maxWeight; - } - -//---------- -// -// parse_one_seed-- -// Convert a single seed string into a collection of bits 'implementing' the -// seed. -// -//---------- -// -// Arguments: -// char* s: The seed, represented as a string. This is a string -// .. of 1s (representing matches), 0s (don't cares), -// .. and Ts (transition allows). X may be substituted -// .. for 0, and spaces may be used (they are ignored). -// .. Any leading or trailing don't cares will be -// .. removed. The string need not be terminated. -// char* e: The end of the seed (a pointer to the last character -// .. in the string). -// int transitionsOk: true => seed is allowed to contain 'T'. 
-// false => seed may only contain match and don't-care. -// int maxIndexBits: The maximum weight of the seed (in bits) that can -// .. be directly supported. If the seed is heavier -// .. than this, it will be implemented as an -// .. overweight seed. Zero indicates that overweight -// .. seeds should not be created. -// -// Returns: -// The resulting implementation of the seed. -// -//---------- -// -// Notes: -// -// (1) The same restrictions from parse_seed_string() are in force. -// -// (2) The seed 'implementation' (as a collection of shits and masks) is not -// optimal in general. -// -//---------- - -static seed* parse_one_seed - (char* _s, - char* e, - int transitionsOk, - int maxIndexBits) - { - char pattern[maxHwSeedLen+1]; - char* s, *ss, *p; - char type; - int isStrict, isHalfweight; - int length; // seed length, measured in locations - int weight; // seed weight, measured in bits - u64 seedBits; // the seed, as two bits per location - u64 flipBits; // the transition flip bits - u32 resolveBits; // bits removed from seed, which must be - // .. resolved by looking at the sequences - int shift, bitsPer; // number of bits to shift the seed - int matches; // number of matches in the seed, and how many - int matchesToKeep; // .. to keep for an overweight seed - u32 mask; // mask to apply to the shifted seed - u32 wBits; // the W least significant bits, where W=weight - u32 covered; // bits (of wBits) which we have covered so far - u64 remBits; // bits (of seedBits) which we haven't taken - // .. 
care of yet - int numParts; // number of masked-shifts needed - seed* seed; - - if (maxIndexBits > maxSeedBitWeight) - suicidef ("max index bits cannot exceed %d (it's %d).", - maxSeedBitWeight, maxIndexBits); - - ////////// - // determine the length and weight of the seed - ////////// - - // skip leading don't cares - - s = _s; - - while ((s <= e) && ((*s == '0') || (*s == 'X') || (*s == 'x'))) - s++; - - if (s > e) - suicide ("seed string is empty!"); - - // skip trailing don't cares - - while ((*e == '0') || (*e == 'X') || (*e == 'x')) - e--; - - // scan string, to determine if seed is "strict", "half-weight", or a mixture - - isStrict = true; - isHalfweight = true; - matches = 0; - weight = 0; - - for (ss=s ; ss<=e ; ss++) - { - switch (*ss) - { - case '1': - isHalfweight = false; - matches++; - weight += 2; - break; - - case 'T': - case 't': - isStrict = false; - weight++; - break; - - case '0': - case 'X': - case 'x': - break; - } - } - - if (isStrict) type = 'S'; - else if (isHalfweight) type = 'H'; - else type = '_'; - - ////////// - // if the seed will be too heavy, turn it into an overweight seed - ////////// - - matchesToKeep = matches; - if ((maxIndexBits > 0) && (weight > maxIndexBits)) - { - int toResolve = weight - maxIndexBits; - if (toResolve > matches) - suicidef ("seed (%s) requires more resolving bits (%d) than it has matches (%d).", - _s, toResolve, matches); - if (toResolve > maxResolvedBits) - suicidef ("seed (%s) requires more resolving bits (%d) than are allowed (%d).", - _s, toResolve, maxResolvedBits); - type = 'R'; - matchesToKeep -= toResolve; - } - - ////////// - // scan the string, converting each location into one bit (for half-weight - // seeds) or two bits (for seeds with matches) - ////////// - - resolveBits = 0; - seedBits = 0; - flipBits = 0; - bitsPer = (type=='H')? 
1 : 2; - matches = 0; - length = 0; - weight = 0; - - for (ss=s,p=pattern ; ss<=e ; ss++) - { - switch (*ss) - { - default: - bad_character: - if (isprint (*ss)) - suicidef ("seed string %s contains illegal character %c", - _s, *ss); - else - suicidef ("seed string %s contains illegal character %02X", - _s, *ss); - - case ' ': - case '\t': - case '\n': - break; - - case '1': - if (matches >= matchesToKeep) - { - if ((resolveBits << 2) < resolveBits) // (overflow) - suicidef ("resolving bits in seed string %s are spread too widely", - _s); - resolveBits = (resolveBits << bitsPer) + 2; - goto transition; - } - resolveBits <<= bitsPer; - seedBits = (seedBits << bitsPer) + 3; - flipBits = (flipBits << bitsPer) + 2; - matches++; - length++; - weight += 2; - *(p++) = '1'; - break; - - case 'T': - case 't': - if (!transitionsOk) goto bad_character; - resolveBits <<= bitsPer; - transition: - seedBits = (seedBits << bitsPer) + 1; - flipBits <<= bitsPer; - length++; - weight++; - *(p++) = 'T'; - break; - - case '0': - case 'X': - case 'x': - resolveBits <<= bitsPer; - seedBits = (seedBits << bitsPer) + 0; - flipBits <<= bitsPer; - length++; - *(p++) = '0'; - break; - } - } - - *p = 0; // terminae pattern string - - // sanity check on sizes - - if (type == 'H') - { - if (length > maxHwSeedLen) - suicidef ("half-weight seed string (%s) cannot have length exceeding %d (it's %d).", - _s, maxHwSeedLen, length); - } - else - { - if (length > maxSeedLen) - suicidef ("seed string (%s) cannot have length exceeding %d (it's %d).", - _s, maxSeedLen, length); - } - - if (weight > maxSeedBitWeight) - suicidef ("seed string (%s) cannot have bit weight exceeding %d (it's %d).", - _s, maxSeedBitWeight, weight); - - if (weight == 0) - suicidef ("seed string (%s) cannot have zero weight.", _s); - - ////////// - // figure out how to implement the seed - // - // we want to find the minimum set of masked-shifts that will bring all - // the seed bits from scattered positions 0..length-1 into a 
covering of - // the positions 0..weight-1; rather than try to find an optimal set, we - // use a greedy algorithm to find a good set - // - // note: for many seeds it is possible to find smaller sets of masked- - // shifts using more sophistocated (and time-consuming) algorithms - ////////// - - wBits = (1L << weight) - 1; - - // first masked-shift in the set will be shift-zero - - covered = seedBits & wBits; - remBits = seedBits - covered; - numParts = 1; - - // take whatever masked-shift will cover the most bits; keep doing so - // until all bits are covered - - while (covered != wBits) - { - shift = best_shift ((~covered) & wBits, remBits); - mask = (remBits >> shift) & (~covered) & wBits; - covered = covered + mask; - remBits = remBits - (((u64)mask) << shift); - numParts++; - } - - ////////// - // record the seed implementation - // - // we run the same algorithm again, depositing the masked-shifts into a - // seed structure - ////////// - - // allocate the seed structure - - if (type == 'H') - seed = new_seed (numParts, length, 0); - else - seed = new_seed (numParts, length, bit_count_64(flipBits)); - - seed->next = NULL; - seed->type = type; - seed->length = length; - seed->weight = weight; - seed->isHalfweight = (type == 'H'); - seed->withTrans = 0; - seed->resolvingMask = resolveBits; - seed->revComp = false; - - strcpy (seed->pattern, pattern); - - // first masked-shift in the set is shift-zero - - covered = seedBits & wBits; - remBits = seedBits - covered; - numParts = 1; - seed->shift[0] = 0; - seed->mask [0] = covered; - - // take whatever masked-shift will cover the most bits; keep doing so - // until all bits are covered - - while (covered != wBits) - { - shift = best_shift ((~covered) & wBits, remBits); - mask = (remBits >> shift) & (~covered) & wBits; - covered = covered + mask; - remBits = remBits - (((u64)mask) << shift); - seed->shift[numParts] = shift; - seed->mask [numParts] = mask; - numParts++; - } - - // separate the transition-flip bits 
into a list of single-bit values - - if (seed->transFlips != NULL) - { -#ifdef maintainFlippedBitOrder - u64 rightBit; - u32* f=seed->transFlips; - - while (flipBits != 0) - { - rightBit = flipBits-(flipBits&(flipBits-1));// isolate rightmost 1 - flipBits -= rightBit; // remove it - *(f++) = apply_seed (seed, rightBit); // add it to the list - } - *f = 0; // terminate the transFlips array -#else // not maintainFlippedBitOrder - u32 packed = apply_seed (seed, flipBits); - u32 rightBit; - u32* f=seed->transFlips; - - while (packed != 0) - { - rightBit = packed - (packed & (packed-1)); // isolate rightmost 1 - packed -= rightBit; // remove it - *(f++) = rightBit; // add it to the list - } - *f = 0; // terminate the transFlips array -#endif // maintainFlippedBitOrder - } - - // return the seed - - return seed; - } - -//---------- -// -// new_seed-- -// Allocate a new seed structure. -// -//---------- -// -// Arguments: -// int numParts: The number of masked-shifts that will be needed to -// .. implement the seed. -// int patternLen: The number of bytes to allow for a pattern (we allocate -// .. one additional byte to allow for a terminating -// .. zero). -// int numFlips: The number of transition flips to allow for (usually -// .. this should be the number of full match positions in -// .. seed). -// -// Returns: -// A pointer to the newly allocated seed, which the caller will have to -// dispose of eventually. The routine free_seeds() should be used for this -// purpose. 
-// -//---------- - -static seed* new_seed - (int numParts, - int patternLen, - int numFlips) - { - seed* s; - int bytesNeeded, bytesMain, bytesShift, bytesMask, bytesFlips; - - // figger out how many bytes we need - - bytesMain = round_up_8 (sizeof(seed)); - bytesShift = round_up_8 (numParts * sizeof(s->shift[0])); - bytesMask = round_up_8 (numParts * sizeof(s->mask [0])); - bytesFlips = 0; - if (numFlips > 0) bytesFlips = (numFlips+1) * sizeof(s->transFlips[0]); - if (patternLen > 0) patternLen += 1; - else if (patternLen < 0) patternLen = 0; - bytesNeeded = bytesMain + bytesShift + bytesMask + bytesFlips + patternLen; - - // allocate - - s = (seed*) zalloc_or_die ("new_seed", bytesNeeded); - - // hook up the internal arrays - - s->shift = (int*) (((char*) s) + bytesMain); - s->mask = (u32*) (((char*) s->shift) + bytesShift); - s->transFlips = NULL; - if (numFlips > 0) - s->transFlips = (u32*) (((char*) s->mask) + bytesMask); - if (patternLen > 0) - s->pattern = (char*) (((char*) s->mask) + bytesMask + bytesFlips); - - // initialize - - s->numParts = numParts; - - return s; - } - -//---------- -// -// reconstruct_seed-- -// Build a single seed string from information about a previously-constructed -// seed. (Usually this information would come from a file). -// -//---------- -// -// Arguments: -// (the arguemnts have the same meaning as in the seed structure definition) -// -// Returns: -// The resulting implementation of the seed. 
-// -//---------- - -seed* reconstruct_seed - (char type, - int length, - int weight, - char* pattern, - u32 resolvingMask, - int revComp, - int isHalfweight, - int numParts, - int* shift, - u32* mask, - u32* transFlips) - { - seed* s; - int numFlips, ix; - - // count flips - - for (numFlips=0 ; transFlips[numFlips]!=0 ; numFlips++) - ; - - // allocate (this links up arrays and sets numParts) - - if (pattern == NULL) - s = new_seed (numParts, 0, numFlips); - else - s = new_seed (numParts, strlen(pattern), numFlips); - - // copy fields - - s->type = type; - s->length = length; - s->weight = weight; - s->resolvingMask = resolvingMask; - s->revComp = revComp; - s->isHalfweight = isHalfweight; - - if (pattern != NULL) - strcpy (s->pattern, pattern); - - // copy arrays - - for (ix=0 ; ixshift[ix] = shift[ix]; - for (ix=0 ; ixmask[ix] = mask[ix]; - for (ix=0 ; ix<=numFlips ; ix++) s->transFlips[ix] = transFlips[ix]; - - return s; - } - -//---------- -// -// copy_seeds-- -// Make a copy of a list of seed structures. -// -//---------- -// -// Arguments: -// seed* seed: The linked list of seeds to copy. Note that all seeds in -// .. the list are copied. -// -// Returns: -// A pointer to the newly allocated seed, which the caller will have to -// dispose of eventually. The routine free_seeds() should be used for this -// purpose. 
-// -//---------- - -static seed* copy_seed (seed* _seed); - - -seed* copy_seeds - (seed* _seed) - { - seed* head = NULL; - seed* prev = NULL; - seed* s; - - for ( ; _seed!=NULL ; _seed=_seed->next) - { - s = copy_seed (_seed); - s->next = NULL; - if (prev == NULL) head = s; - else prev->next = s; - prev = s; - } - - return head; - } - - -static seed* copy_seed - (seed* _seed) - { - u32* f; - int numFlips = 0; - seed* s; - int ix; - - // allocate a seed with enough room - - numFlips = 0; - if (_seed->transFlips != NULL) - { for (f=_seed->transFlips ; *f!=0 ; f++) numFlips++; } - - if (_seed->pattern == NULL) - s = new_seed (_seed->numParts, 0, numFlips); - else - s = new_seed (_seed->numParts, strlen(_seed->pattern), numFlips); - - // copy the simple fields - - s->next = NULL; - s->type = _seed->type; - s->length = _seed->length; - s->weight = _seed->weight; - s->revComp = _seed->revComp; - s->isHalfweight = _seed->isHalfweight; - - if (_seed->pattern != NULL) - strcpy (s->pattern, _seed->pattern); - - // copy parts - - s->numParts = _seed->numParts; - s->resolvingMask = _seed->resolvingMask; - - for (ix=0 ; ix<_seed->numParts ; ix++) - { - s->shift[ix] = _seed->shift[ix]; - s->mask [ix] = _seed->mask [ix]; - } - - // copy transition flips - - s->withTrans = _seed->withTrans; - if (_seed->transFlips != NULL) - { - for (ix=0 ; ixtransFlips[ix] = _seed->transFlips[ix]; - s->transFlips[numFlips] = 0; - } - - return s; - } - -//---------- -// -// free_seeds-- -// De-allocate a list of seed structures. -// -//---------- -// -// Arguments: -// seed* seed: The linked list of seeds to de-allocate. -// -// Returns: -// (nothing) -// -//---------- - -void free_seeds - (seed* _seed) - { - seed* next; - - for ( ; _seed!=NULL ; _seed=next) - { next = _seed->next; free_if_valid ("free_seeds", _seed); } - } - -//---------- -// -// is_same_seed-- -// Determine whether two seeds are identical, including having the same index -// encoding. 
-// -//---------- -// -// Arguments: -// seed* seed1, seed2: The seeds to compare. Only the single seed pointed -// .. to is compared; any remaining seeds in the -// .. linked list are ignored. -// -// Returns: -// (nothing) -// -//---------- - -int is_same_seed - (seed* seed1, - seed* seed2) - { - int part; - - if (seed1 == seed2) return true; - if (seed1 == NULL) return false; - if (seed2 == NULL) return false; - - if (seed1->type != seed2->type) return false; - if (seed1->length != seed2->length) return false; - if (seed1->weight != seed2->weight) return false; - if (seed1->revComp != seed2->revComp) return false; - if (seed1->isHalfweight != seed2->isHalfweight) return false; - if (seed1->withTrans != seed2->withTrans) return false; - - if (seed1->numParts != seed2->numParts) return false; - for (part=0 ; partnumParts ; part++) - { - if (seed1->mask [part] != seed2->mask [part]) return false; - if (seed1->shift[part] != seed2->shift[part]) return false; - } - - if (seed1->type == 'R') - { if (seed1->resolvingMask != seed2->resolvingMask) return false; } - - return true; - } - -//---------- -// -// seed_pattern-- -// Create a string describing a list of seed structures. -// -//---------- -// -// Arguments: -// seed* seed: The linked list of seeds. -// -// Returns: -// A string containing the representation of the seed. This string is -// actually static data belonging to this routine, so the caller must copy -// it if more than one such string is to be used simultaneously. 
-// -//---------- - -char* seed_pattern - (seed* _seed) - { - static char s[70]; - char* ss; - seed* seed; - int firstInList; - int part; - u64 seedBits; - int bitsPer; - u32 mask; - int loc; - char ch; - - // convert each seed in the list to a string of 1TX's, and collect them - // in the pattern string - - ss = s; - firstInList = true; - - for (seed=_seed ; seed!=NULL ; seed=seed->next) - { - // recover this seed's bits - - seedBits = 0; - - for (part=0 ; partnumParts ; part++) - seedBits |= ((u64) seed->mask[part]) << seed->shift[part]; - - // convert it to a pattern string - - if (!firstInList) - { - if (ss-s >= (int) sizeof(s)-1) goto full; - *(ss++) = ','; - } - - bitsPer = (seed->type=='H')? 1 : 2; - mask = (seed->type=='H')? 1 : 3; - - for (loc=seed->length-1 ; loc>=0 ; loc--) - { - switch ((seedBits >> (bitsPer*loc)) & mask) - { - default: // (to placate compiler, can't happen) - case 3: ch = '1'; break; - case 2: ch = '?'; break; - case 1: ch = 'T'; break; - case 0: ch = '0'; break; - } - - if (ss-s >= (int) sizeof(s)-1) goto full; - *(ss++) = ch; - } - - firstInList = false; - } - - // add resolving bits - - seed = _seed; - if (seed->type == 'R') - { - for (loc=0 ; loc<16 ; loc++) - { if (seed->resolvingMask >> (2*loc) == 0) break; } - - if (loc > 0) - { - if (ss-s >= (int) sizeof(s)-1) goto full; - *(ss++) = '/'; - - for (loc-- ; loc>=0 ; loc--) - { - switch ((seed->resolvingMask >> (2*loc)) & 3) - { - default: // (to placate compiler, can't happen) - case 3: ch = '?'; break; - case 2: ch = 'R'; break; - case 1: ch = '?'; break; - case 0: ch = '0'; break; - } - - if (ss-s >= (int) sizeof(s)-1) goto full; - *(ss++) = ch; - } - } - } - - // terminate and return - - *ss = 0; - return s; - -full: - *ss = 0; - ss[-3] = '.'; - ss[-2] = '.'; - ss[-1] = '.'; - - return s; - } - -//---------- -// -// seed_shuffle_list-- -// Create a list of indexes describing how a seed shuffles locations when it -// is applied. 
-// -// Example: -// The seed 11101100101111 is implemented as a list of masks and shifts like -// this: -// -// seed | 1 1 1 0 1 1 0 0 1 0 1 1 1 1 -// bits | 1111110011110000110011111111 --> 11111111111111111111 -// ------------+------------------------------------------------------ -// mask shift | -// F0CFF 0 | 11110000110011111111 --> 11110000110011111111 -// 0F000 10 | 1111000000000000 --> 1111000000000000 -// 00300 18 | 1100000000 --> 1100000000 -// -// From the standpoint of rearrangements of a character string, this looks -// like this: -// -// seed | 11101100101111 -// characters | ABCDEFGHIJKLMN --> E F B C I A K L M N -// ------------+------------------------------------------------- -// F0CFF 0 | EF I KLMN --> E F I K L M N -// 0F000 10 | BC --> B C -// 00300 18 | A --> A -// ------------+------------------------------------------------- -// 4 5 1 2 8 0 10 11 12 13 -// -// If we consider a pointer pointing at the leftmost character of the original -// string (the A in this example), we can create the packed word by taking -// characters at indexes 4 (E), 5 (F), 1 (B), 2 (C), 8 (I), 0 (A), 10 (K), -// 11 (L), 12 (M), 13 (N). This is the list of indexes returned by this -// function. -// -//---------- -// -// Arguments: -// seed* seed: The seed (if it happens to be a list, only the first seed -// .. is processed). This must not contain transitions (only -// .. match and don't care are allowed). -// -// Returns: -// A list of indexes, in form (see note below). This is -// allocated from the heap, and the caller is responsible for disposing of it. -// Failure results in fatality. -// -// Note: -// form means the first entry in the list is the number of items -// in the list, NOT including this entry. So, for example, the list of primes -// less than ten would be stored as 4,2,3,5,7. 4 is the length and the array -// has total length 5. 
-// -//---------- - -u32* seed_shuffle_list - (seed* seed) - { - int length = seed->length; - int weight = seed->weight/2; - u32* indexes; - u32 mask, siteBits; - int part, origPos, listPos; - - // allocate the index list - - indexes = (u32*) malloc_or_die ("seed_shuffle_list", (1+weight)*sizeof(u32)); - - indexes[0] = (unsigned) weight; - - // deposit indexes - - for (part=0 ; partnumParts ; part++) - { - mask = seed->mask[part]; - origPos = (length-1) - (seed->shift[part]/2); - listPos = weight; - - for ( ; mask!=0 ; mask>>=2,origPos--,listPos--) - { - siteBits = mask & 3; - if (siteBits == 0) continue; - if (siteBits != 3) - suicide ("seed contains things other than don't-care and match"); - - if (listPos < 1) - suicide ("internal error, seed weight and masks conflict"); - - indexes[listPos] = (unsigned) origPos; - } - } - - return indexes; - } - -//---------- -// -// print_seeds-- -// Print a list of seed structures. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seed* seed: The linked list of seeds to print. -// -// Returns: -// (nothing) -// -//---------- - -void print_seeds - (FILE* f, - seed* seed) - { - int part; - u64 seedBits; - - for ( ; seed!=NULL ; seed=seed->next) - { - // recover the seed - - seedBits = 0; - - for (part=0 ; partnumParts ; part++) - seedBits |= ((u64) seed->mask[part]) << seed->shift[part]; - - fprintf (f, "%016llX\n", (unsigned long long) seedBits); - - // print the masked-shifts - - for (part=0 ; partnumParts ; part++) - fprintf (f, " ( >> %2d) & %08X\n", - seed->shift[part], seed->mask[part]); - - // print the resolving mask - - if (seed->resolvingMask != 0) - fprintf (f, " resolve: %08X\n", seed->resolvingMask); - } - } - -//---------- -// -// seed_packed_to_string, seed_packed_to_string2-- -// Convert bits, packed as per a seed, to a character string. -// -//---------- -// -// Arguments: -// seed* seed: The seed. This describes how the bits were packed. 
-// u32 word: The nucleotides, packed as per the seed. -// u8* bitToChar: Mapping from a 0/1 value to the character for that -// .. value (e.g. "RY"). -// u8* bitsToChar: Mapping from a 0/1/2/3 value to the character for that -// .. value (e.g. "ACGT"). -// -// Returns: -// A string containing the nucleotide characters. This string is actually -// static data belonging to this routine, so the caller must copy it if more -// than one such string is to be used simultaneously. -// -//---------- - -char* seed_packed_to_string (seed* seed, u32 word) - { return seed_packed_to_string2 (seed, word, bit_to_pur_pyr, bits_to_nuc); } - -char* seed_packed_to_string2 - (seed* seed, - u32 word, - const u8* bitToChar, - const u8* bitsToChar) - { - static char s[maxHwSeedLen+1]; - u64 unpackedWord, unpackedSeed; - int numChars; - int bitsPer; - char* ss; - u32 twoWordBits, twoSeedBits, mask; - - // unpack the bits - - unpackedWord = seed_unpack (seed, word, &unpackedSeed); - - // convert to characters - - numChars = seed->length; - if (numChars > (int) sizeof(s)-1) numChars = sizeof(s)-1; - - // convert each bit pair to a character - - ss = s; - bitsPer = (seed->type=='H')? 1 : 2; - mask = (seed->type=='H')? 1 : 3; - - while (numChars-- > 0) - { - twoWordBits = (unpackedWord >> (bitsPer*numChars)) & mask; - twoSeedBits = (unpackedSeed >> (bitsPer*numChars)) & mask; - - switch (twoSeedBits) - { - case 0: *(ss++) = 'x'; break; - case 1: if (twoWordBits < 2) - *(ss++) = bitToChar[twoWordBits]; - else - *(ss++) = '?'; break; - case 2: *(ss++) = '?'; break; - case 3: *(ss++) = bitsToChar[twoWordBits]; break; - } - } - - *ss = 0; - return s; - } - -//---------- -// -// seed_unpack-- -// Convert bits, packed as per a seed, to a character string. -// -//---------- -// -// Arguments: -// seed* seed: The seed. This describes how the bits were packed. -// u32 word: The nucleotides, packed as per the seed. -// u64* seedBits: Place to return the bits that are 'active' in the seed. -// .. 
This may be NULL. -// -// Returns: -// The nucleotides, unpacked to their original bit format. Any bits that are -// not 'active' in the seed are zero. -// -//---------- - -u64 seed_unpack - (seed* seed, - u32 word, - u64* seedBits) - { - u64 unpackedWord, unpackedSeed, partMask; - int part; - - ////////// - // unpack the bits - ////////// - - unpackedWord = 0; // the bits we have - unpackedSeed = 0; // the bits we could have had - - for (part=0 ; partnumParts ; part++) - { - partMask = (u64) seed->mask[part]; - unpackedWord |= (word & partMask) << seed->shift[part]; - unpackedSeed |= partMask << seed->shift[part]; - } - - if (seedBits != NULL) *seedBits = unpackedSeed; - return unpackedWord; - } - -//---------- -// -// apply_seed-- -// Apply a seed to a word, extracting and packing the seed bits. -// -//---------- -// -// Arguments: -// seed* seed: The seed. This describes how to extract bits from the -// .. word and pack them. -// u64 word: A word of consecutive nts. This must have at least as -// .. many nts as the seed needs (seed->length). If it has -// .. more nts than that, the extras are ignored. -// -// Returns: -// The packed word. 
-// -//---------- - -#ifndef hardCodedSeed - -u32 apply_seed - (seed* seed, - u64 word) - { - int part; - u64 rcWord = 0; - u32 packedWord = 0; - int seedBits = 0; - - // perform reverse-complement if necessary; for complementing seeds both - // a k-mer and its reverse complement are represented by whichever is - // numerically lowest - - if (seed->revComp) - { - if (seed->type == 'H') // half-weight seed - { - seedBits = seed->length; - rcWord = rev_comp_by_bits (word, seed->length); - } - else if (seed->type == 'R') // overweight seed - suicide ("internal error: overweight seeds cannot be complementing"); - else - { - seedBits = 2*seed->length; - rcWord = rev_comp_by_pairs (word, seed->length); - } - - word &= (((u64)1)< %0*llX/%s\n", - // (seedBits+3)/4, rcWord, seed_packed_to_string(seed,rcWord)); - - if (rcWord < word) word = rcWord; - } - - // apply the seed by combining the masked-shifts of the word - - for (part=0 ; partnumParts ; part++) - packedWord |= (word >> seed->shift[part]) & seed->mask[part]; - - return packedWord; - } - -#endif // not hardCodedSeed - -//---------- -// -// best_shift-- -// Determine the amount to shift a seed to fill in the most uncovered bits. -// -//---------- -// -// Arguments: -// u32 uncoveredBits: The bits we want to cover. -// u64 seedBits: The bits in the seed that we can shift into any -// .. coverage position. -// -// Returns: -// The best right shift count. -// -//---------- - -static int best_shift - (u32 uncoveredBits, - u64 seedBits) - { - int coverage, bestCoverage; - int shift, bestShift; - - bestCoverage = -1; - bestShift = -1; - - for (shift=0 ; seedBits!=0 ; seedBits>>=1,shift++) - { - coverage = bit_count (seedBits & uncoveredBits); - if (coverage > bestCoverage) - { bestCoverage = coverage; bestShift = shift; } - } - - return bestShift; - } - -// Scraps no longer need, saved for future use. 
-// -// unused routine that determines a seed's corresponding bit pattern -// -//static u64 seed_bits -// (seed* seed) -// { -// u64 seedBits; -// int part; -// -// if (seed->type == 'R') seedBits = seed->resolvingMask; -// else seedBits = 0; -// -// for (part=0 ; partnumParts ; part++) -// seedBits |= ((u64) seed->mask[part]) << seed->shift[part]; -// -// return seedBits; -// } - diff --git a/programs/lastz/src/seeds.h b/programs/lastz/src/seeds.h deleted file mode 100644 index 7a8b7ac..0000000 --- a/programs/lastz/src/seeds.h +++ /dev/null @@ -1,153 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: seeds.h -// -//---------- - -#ifndef seeds_H // (prevent multiple inclusion) -#define seeds_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff - -// debugging defines - -//#define hardCodedSeed // if this is defined, a hard-coded seed packing - // .. routine replaces apply_seed; see pack_seed - // .. below for more details - -//---------- -// -// data structures and types -// -//---------- - -// seeds-- -// A structure describing the implementation of a seed; note that "shift" -// and mask point to arrays included within the structure, so that only the -// structure itself, and any seeds down the "next" link, need be freed. 
To -// apply the seed to a sequence s (stored as as two bits per location), the -// following is computed (see apply_seed): -// sum over i=0..numParts-1 of (s>>shift[i] & mask[i]) - -// $$$ some of these should be unsigned - -typedef struct seed - { - struct seed* next; // next seed in a linked list - - char type; // the type of seed pattern - // 'S' => strict (only 1's and 0's) - // 'H' => half-weight (only T's and 0's) - // 'R' => overweight (use resolvingMask) - // '_' => nothing special - - int length; // seed length, measured in locations - int weight; // seed weight, measured in bits (half locs) - char* pattern; // the seed, represented as a string of 1s - // .. (matches), 0s (don't cares) and Ts - // .. (transition-allows); this may be NULL - int numParts; // length of shift[] and mask[] arrays - int* shift; // array of shift counts, indexed by - // .. 0..numParts-1 - u32* mask; // array of masks, indexed by 0..numParts-1 - - u32 resolvingMask; // mask (in unpacked bit order) to pick out - // .. the seed bits which are *not* accounted - // .. for by shift[] and mask[], and which have - // .. to be resolved by looking at the sequences - - int revComp; // true => pack such that k-mers are identical - // .. to their reverse-complements - - int isHalfweight; // false => each unpacked bp is 2-bit nucleotide - // true => each unpacked bp is 1-bit R/Y - int withTrans; // non-zero => we allow 1 or 2 transitions in - // .. any of the 'match' positions; assumes - // .. that the seed pattern is "strict" - u32* transFlips; // array of words (in packed bit order) for each - // .. bit that can be flipped to match a - // .. transition; each word has only one bit - // .. set; terminated by an empty word (zero); - // .. this can be NULL; space for this is part - // .. of this block - } seed; - -#define maxSeedLen 31 // maximum locations allowed in a seed -#define maxHwSeedLen 63 // maximum locations allowed in a half-weight - // .. 
seed -#define maxSeedBitWeight 31 // maximum bit weight allowed in a seed; bit - // .. weight is 2*matches+transitions -#define maxResolvedBits 16 // maximum number of "extra" bits in a resolving - // .. seed - -#define seed_12of19 "1110100110010101111" -#define seed_14of22 "1110101100110010101111" - -//---------- -// -// hard-coded seed packing routine -// -// The following gives us a way to plug in a specific seed-packing routine. -// This is provided only as a means to compare timing performance of seed -// packing optimized at compile-time vs the general packing routine. It is up -// to the user to make sure she specifies the same seed on the command line, or -// all bets are off. -// -// The program seed_function can be used to create the packing routine. -// -//---------- - -#ifndef hardCodedSeed -u32 apply_seed (seed* seed, u64 word); -#endif - -#ifdef hardCodedSeed -#define apply_seed(seed,word) pack_seed(word) -#ifdef straightforwardSeed -static inline u32 pack_seed (u64 word) // pack_1110100110010101111 - { - return ( word & 0x000000FF) - | ((word >> 2) & 0x00000300) - | ((word >> 4) & 0x00000C00) - | ((word >> 8) & 0x0000F000) - | ((word >> 12) & 0x00030000) - | ((word >> 14) & 0x00FC0000); - } -#else -static inline u32 pack_seed (u64 word) // pack_1110100110010101111 - { - return ( word & 0x00F0CCFF) - | ((word >> 16) & 0x000F3000) - | ((word >> 28) & 0x00000300); - } -#endif -#endif - -//---------- -// -// prototypes for routines in seeds.c -// -//---------- - -int parse_seeds_string (char* s, seed** seed, int maxIndexBits); -int parse_strict_seeds_string (char* s, seed** seed, int maxIndexBits); -seed* reconstruct_seed (char type, int length, int weight, - char* pattern, - u32 resolvingMask, int revComp, - int isHalfweight, int numParts, - int* shift, u32* mask, u32* transFlips); -seed* copy_seeds (seed* seed); -void free_seeds (seed* seed); -int is_same_seed (seed* seed1, seed* seed2); -char* seed_pattern (seed* seed); -u32* seed_shuffle_list (seed* 
seed); -void print_seeds (FILE* f, seed* seed); -char* seed_packed_to_string (seed* seed, u32 word); -char* seed_packed_to_string2 (seed* seed, u32 word, - const u8* bitToChar, const u8* bitsToChar); -u64 seed_unpack (seed* seed, u32 word, u64* seedBits); - -#endif // seeds_H diff --git a/programs/lastz/src/segment.c b/programs/lastz/src/segment.c deleted file mode 100755 index d73c4f2..0000000 --- a/programs/lastz/src/segment.c +++ /dev/null @@ -1,2139 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: segment.c -// -//---------- -// -// segment-- -// Support for lists of matched segments of DNA sequences. -// -// This module implements the collection of a table of segments of a pair of -// DNA sequences. A segment consists of an interval in each sequence (of the -// same length) and a score. The actual DNA content of the interval is of -// no importance to this module-- all that is important is represented by a -// segment's length and score. -// -// The caller can specify a limit on the total length of segments in the table. -// The intent is to give the user the ability to collect alignments covering -// some meaningful portion of the sequences, even though she does not know what -// score threshold would accomplish this. Without this capability, the user is -// forced to do some preliminary runs to try to estimate the score distribution, -// or set the score threshold so low that the process is overwhelmed by false -// positives. -// -// The limit (coverageLimit) is the number of bases of coverage that the caller -// wishes to allow. However, we allow the sum of all the segment lengths to -// exceed this, by no more than one segment. For example, if the caller -// specifies a limit of 10K bp, and if there is at least 10K bp of homologous -// segments in the two sequences, we will keep at least 10K bp in this table, -// but not much more. 
-// -//---------- -// -// $$$ Caveats -// -// The coverageLimit stuff doesn't consider the possibility that the incoming -// segments could contain overlaps. As of this writing (Mar/2008) we will only -// have overlaps if the caller is using a twin-hit seed. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "diag_hash.h" // diagonals hashing stuff - -#define segment_owner // (make this the owner of its globals) -#include "segment.h" // interface to this module - -// miscellany - -#define segtable_bytes(size) (sizeof(segtable) + (((size)-1)*sizeof(segment))) - -//#define debugBinaryHeap - -//---------- -// -// prototypes for private functions -// -//---------- - -static void remove_root (segtable* st); -static void record_tie_scores (segtable* st); -static int record_tie_score (segtable* st, int ix); - -#ifdef debugBinaryHeap -static void validate_heap (segtable* st, char* msg); -#endif // debugBinaryHeap - -//---------- -// -// new_segment_table-- -// Allocate an empty segment table. -// -//---------- -// -// Arguments: -// u32 size: The number of entries to provide for. This can be -// .. increased later. -// unspos coverageLimit: Limit on the total lengths of the segments in the -// .. table (see discussion in file header above); -// .. zero indicates no limit. -// -// Returns: -// A pointer to a newly allocated segment table; failures result in program -// fatality. The caller must eventually dispose of the table, with a call to -// free_segment_table(). 
-// -//---------- - -segtable* new_segment_table - (u32 size, - unspos coverageLimit) - { - segtable* st; - size_t bytesNeeded; - - // sanity check - - if (size < 1) - suicidef ("in new_segment_table(), size can't be %d", size); - - // allocate - - bytesNeeded = segtable_bytes (size); -#if (SIZE_MAX > mallocLimit) - if (bytesNeeded > mallocLimit) goto overflow; -#endif // overflow possible - st = (segtable*) malloc_or_die ("new_segment_table", bytesNeeded); - - // initialize - - st->size = size; - st->len = 0; - st->haveScores = false; - st->coverageLimit = coverageLimit; - st->coverage = 0; - st->lowScore = worstPossibleScore; - st->seg = st->_seg; // (hook up the segment table) - - return st; - -// failure exits - -#if (SIZE_MAX > mallocLimit) -overflow: - suicidef ("internal error, in new_segment_table()\n" - "table size (%s) exceeds allocation limit of %s;", - commatize(bytesNeeded), - commatize(mallocLimit)); - return NULL; // (doesn't get here) -#endif // overflow possible - } - -//---------- -// -// subset_segment_table-- -// Create a subset of a segment table, sharing the actual elements in the -// table. -// -// NOTE: This just fills the subset object with information that fools it into -// thinking it has its own seg[] array, but it actually points into the -// other obejct's seg[] array. Tread lightly! -// -//---------- -// -// Arguments: -// segtable* st: The segment table to "copy". -// u32 startIx: First entry in st->seg that should be included in -// the subset. -// u32 endIx: One past the last entry in st->seg that should be -// included in the subset. -// segtable* subset: The segment table object to write the subset to. -// Note that we don NOT expect this to have been -// allocated from the head, and in any event -// free_segment_table() should NOT be used to dispose -// of it. 
-// -// Returns: -// (nothing) -// -//---------- - -void subset_segment_table - (segtable* st, - u32 startIx, - u32 endIx, - segtable* subset) - { - subset->size = endIx - startIx; - subset->len = subset->size; - subset->haveScores = st->haveScores; - subset->coverageLimit = st->coverageLimit; - subset->coverage = st->coverage; - subset->lowScore = st->lowScore; - subset->seg = &st->seg[startIx]; - } - -//---------- -// -// empty_segment_table-- -// "Erase" all segments from a segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to empty. -// -// Returns: -// (nothing) -// -//---------- - -void empty_segment_table - (segtable* st) - { - st->len = 0; - st->haveScores = false; - st->coverage = 0; - st->lowScore = worstPossibleScore; - } - -//---------- -// -// limit_segment_table-- -// Change a segment table's total length limit. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to modify. -// unspos coverageLimit: Limit on the total lengths of the segments in -// .. the table; zero indicates no limit. 
-// -// Returns: -// (nothing) -// -//---------- - -void limit_segment_table - (segtable* st, - unspos coverageLimit) - { - int ix, newLen; - possum cov, newCov; - score prevScore, newLow; - segment* seg, *tail; - segment tempSeg; - - st->coverageLimit = coverageLimit; - - // if this leaves us below the limit, we're done - - if ((st->coverageLimit == 0) - || (st->coverage < st->coverageLimit) - || (st->len < 2)) - return; - - // otherwise, may need to reduce; begin by sorting segments by decreasing - // score - - sort_segments (st, qSegmentsByDecreasingScore); - - // find the score that reduces coverage to no more than the limit - - newLen = st->len; - newCov = st->coverage; - newLow = st->lowScore; - - ix = st->len; - seg = &st->seg[--ix]; - - cov = st->coverage - seg->length; - prevScore = seg->s; - - while (ix > 0) - { - seg = &st->seg[--ix]; - - // if this is the same as the segment after it, just keep looking (we - // treat segments with the same score as one unbreakable entity) - - if (seg->s == prevScore) - { cov -= seg->length; continue; } - - // if removing the segments above this one would be enough to get us - // within the limit, we are finished; note that what we remove will be - // the segments above the (score-tied) segments above this one - - if (cov < st->coverageLimit) - break; - - // potentially, we will remove the segments above this one, so record - // that information and keep going - - newLen = ix+1; - newCov = cov; - newLow = seg->s; - - cov -= seg->length; - prevScore = seg->s; - } - - st->len = newLen; - st->coverage = newCov; - st->lowScore = newLow; - - // now make the list a proper min-heap by simply reversing its order (which - // orders it by increasing score), and add the tied-score information - - seg = &st->seg[0]; - tail = &st->seg[st->len-1]; - while (seg < tail) - { - tempSeg = *seg; - *(seg++) = *tail; - *(tail--) = tempSeg; - } - - record_tie_scores (st); - } - -//---------- -// -// free_segment_table-- -// Deallocate a 
segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to dispose of. -// -// Returns: -// (nothing) -// -//---------- - -void free_segment_table - (segtable* st) - { - free_if_valid ("free_segment_table", st); - } - -//---------- -// -// read_segment_table-- -// Read a segment table from a file. -// -// The file consists of segments, one per line, like the ones below. Note that -// this is NOT the same format as is written by write_segments or dump_segments -// (though it is similar to that written by write_segments). There are three -// columns for the target-- name, start, end. These are followed by three -// columns for the query (with the same meaning), the query strand, and an -// optional score. -// -// E88BQJZ01A3EQH 151 225 E86HODY01D81JM 14 88 + 6875 -// E88BQJZ01D4L6V 26 100 E86HODY01D81JM 10 84 + 6808 -// E88BQJZ01EVLNU 19 93 E86HODY01D81JM 7 81 + 6842 -// E88BQJZ01CEBPD 8 81 E86HODY01D81JM 9 82 + 7108 -// E88BQJZ01BLO6X 132 205 E86HODY01D81JM 11 84 - 7339 -// E88BQJZ01A2W3P 162 214 E86HODY01D81JM 2 54 - 5024 -// E88BQJZ01A9395 62 136 E86HODY01A323K 18 92 + 7231 -// E88BQJZ01DNC74 18 82 E86HODY01A323K 2 66 + 6418 -// E88BQJZ01CTR26 83 167 E86HODY01ASA7F 19 103 + 8034 -// E88BQJZ01C2TAC 95 181 E86HODY01ASA7F 15 101 + 8272 -// -// Start and end are origin one, closed (thus the interval "154 228" has length -// 75 and is preceded by 153 bases in its sequence). Negative strand intervals -// are measured from the 5' end of the query's *negative* strand (e.g from the -// opposite end as that used for the positive strand). All target intervals are -// on the positive strand. The query interval length *must* match the target -// interval length. Segments without scores are given the score of zero. -// -// Sequence names for the query *must* appear in the same order as they do in -// the query file (but see note below about partitioned sequences). 
For a given -// query, all positive strand intervals must appear before any negative strand -// intervals. Sequence names for the target may appear in any order, and are -// only meaningful for partitioned sequences (see below); otherwise they are -// ignored. Intervals with names not found in the target or query are not -// allowed. -// -// A * can be used as a generic sequence name, in those cases where sequence -// names are either unknown or of no importance. -// -// A "#" is considered a comment. Anything following a "#" (on the same line) -// is ignored. Blank lines are also ignored. -// -// Note: Partitioned sequences (enabled at the command line by the "multi" file -// action) are internally treated as a single sequence. In this case, -// query names can appear in any order. However, *all* positive strands -// must appear before any negative strands. -// -//---------- -// -// Arguments: -// FILE* f: The file to read segments from. -// char* fName: The name of the file (used only for reporting errors); -// .. may be NULL. -// segtable* st: The segment table to fill. If this is NULL, it -// .. indicates that this is the final call for the file -// .. (see note about final call below). -// seq* target: The sequence being searched. -// seq* query: The sequence(s) being searched for. -// -// Returns: -// A pointer to a the segment table; if there was room in st for all the -// segments, this is the same as st; otherwise, this is a pointer to a new -// copy of st, and the previous st has been deallocated; reallocation -// failures result in program fatality. -// -//---------- -// -// Notes: -// (1) After the final query, this routine should be called with st == NULL, -// to verify that all segments have been read. -// -// (2) WARNING: Since the line buffer is used to save information between -// calls, this routine can not be used to handle multiple files in the -// same run of a program. 
-// -// (3) [this note is also referenced from resolve_chore_query] -// Example of position arithmetic for queries on negative strand in -// partitions. Suppose we have this partitioning of the query (note that -// startLoc is 1-based): -// -// sepBefore sepAfter startLoc trueLen contig header -// [ 0] 0 481 51 2000 1 query17 -// [ 1] 481 951 532 2000 1 query17 -// [ 2] 951 1481 1002 2000 1 query17 -// [ 3] 1481 1851 1532 2000 1 query17 -// [ 4] 1851 2332 51 2000 2 query20 -// [ 5] 2332 2802 532 2000 2 query20 -// [ 6] 2802 3332 1002 2000 2 query20 -// [ 7] 3332 3702 1532 2000 2 query20 -// [ 8] 3702 0 0 0 -// -// and this segment: -// -// target22 1392 1463 query17 272 343 - -// -// Since the segment is counted along the reverse strand and the full -// query is 2000 bp long, the interval 272..343 corresponds to the interval -// 1658..1729 along the forward strand (the segment file uses 1-based -// intervals, so 2001-343 gives 1658, and 2001-272 gives 1729). The reason -// we need the forward strand is so we can identify the partition that -// contains the interval. -// -// Now, when the query is reverse-complemented, each partition is reverse- -// complemented separately in place, so that the partition table remains -// unchanged. This means that (using 1-based intervals) partition 3 -// corresponds to forward-strand query17 1532..1900 and reverse-strand -// query17 101..469. It is in memory at 1482..1850, in reverse (where -// "memory" means seq->v[]). -// -// For this segment, we want the interval 1658..1729. Along a number line, -// this looks like this (1-based intervals): -// -// reverse-strand sequence: 101............... 272..343 ......... 469 -// forward-strand sequence: 1900...............1729..1658.........1532 -// memory: 1482...............1654..1725.........1850 -// -// So our interval is in memory at 1654..1725. In zero-based indexes this -// is 1653..1725. 
-// -//---------- - -segtable* read_segment_table - (FILE* f, - char* fName, - segtable* st, - seq* target, - seq* query) - { - // (parsing variables, preserve between calls) - static char line[1024]; - static int pendingRewind = false; - static int pendingLine = false; - static int pendingFirstAfterRewind = false; - static int lineNum = 0; - static char* tName, *qName, *tPrevName, *tPartName; - static unspos tStart, tEnd, qStart, qEnd; - static char qStrand; - static score s; - static int prevQueryWasPartitioned = false; - // (normal local variables) - int firstAfterRewind; - unspos tSeqStart, tSeqEnd, qSeqStart, qSeqEnd; - unspos tSegStart, qSegStart, segLen; - int len, missingEol; - char* scan, *field; - int numItems, charsUsed; - unspos tOffset, tLen, qOffset, qLen, qTrue, qNegStart; - char* queryName = ""; - char queryStrand = '+'; - seqpartition* tSp = &target->partition; - seqpartition* qSp = &query->partition; - partition* tNamePart, *tPart, *qNamePart, *qPart; - u32 tIx; - int err; - - if (fName == NULL) fName = "(filename not known)"; - - if (st != NULL) - { - if (qSp->p != NULL) // query is partitioned - { - queryName = "(partitioned query)"; - prevQueryWasPartitioned = true; - } - else // query is not partitioned - { - queryName = (query->useFullNames)? query->header - : query->shortHeader; - prevQueryWasPartitioned = false; - } - queryStrand = ((query->revCompFlags & rcf_rev) != 0)? 
'-' : '+'; - } - - // read the segments for this query/strand - - missingEol = false; - - firstAfterRewind = false; - if (pendingRewind) - { - if (st != NULL) - { - err = fseek (f, 0, SEEK_SET); - if (err != 0) goto rewind_failed; - lineNum = 0; - } - pendingRewind = false; - firstAfterRewind = true; - } - - if ((st == NULL) && (pendingLine) && (pendingFirstAfterRewind)) - pendingLine = pendingFirstAfterRewind = false; - - while (true) - { - // get the next line, if we need one; we also check for lines getting - // split by fgets (the final line in the file might not have a newline, - // but no internal lines can be that way) - - if (pendingLine) - { - pendingLine = false; - firstAfterRewind = pendingFirstAfterRewind; - goto parsing_finished; - } - else - { - if (fgets (line, sizeof(line), f) == NULL) break; - lineNum++; - - if (missingEol) - goto split_line; - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - } - - // trim blanks, end of line, and comments, and ignore blank lines - - if (line[len-1] == '\n') line[--len] = 0; - - field = strchr (line, '#'); - if (field != NULL) *field = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // see if this is a "rewind" command - // $$$ we should make sure there's nothing left in the file - - if (strcmp (line, "rewind") == 0) - { pendingRewind = true; break; } - - // parse the line - - if (segment_dbgAnchorParsing) - fprintf (stderr, "segment line: \"%s\"\n", line); - - scan = line; - - if (*scan == 0) goto not_enough_fields; - tName = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - - if (*scan == 0) goto not_enough_fields; - field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &tStart, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - - if (*scan == 0) goto not_enough_fields; - 
field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &tEnd, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (tEnd < tStart) goto bad_target_interval; - - if (*scan == 0) goto not_enough_fields; - qName = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - - if (*scan == 0) goto not_enough_fields; - field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &qStart, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - - if (*scan == 0) goto not_enough_fields; - field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &qEnd, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (qEnd < qStart) goto bad_query_interval; - if (qEnd-qStart != tEnd - tStart) goto interval_length_mismatch; - - if (*scan == 0) goto not_enough_fields; - field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - if (strlen(field) != 1) goto bad_field; - qStrand = *field; - if ((qStrand != '+') && (qStrand != '-')) goto bad_strand; - - s = 0; - if (*scan == 0) goto new_parse_finished; - field = scan; - scan = skip_darkspace (scan); - *(scan++) = 0; - scan = skip_whitespace (scan); - charsUsed = -1; - numItems = sscanf (field, scoreFmtScanf "%n", &s, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - - if (*scan != 0) goto too_many_fields; - - new_parse_finished: - - if (segment_dbgAnchorParsing) - fprintf (stderr, " (line parsed)\n"); - - parsing_finished: - - // it's a *syntactically* valid segment, but if we aren't accepting any - // 
more segments, this is a failure - - if (st == NULL) - goto extra_segments; - - // resolve query interval - // qSeqStart is in-file position of start of the resident piece-of-sequence (origin zero) - // qOffset is index into query v[] of the start of the resident piece-of-sequence - - if (qStrand != queryStrand) - { - if (segment_dbgAnchorParsing) - fprintf (stderr, " (query strand mismatch-- %s%c vs %s%c)\n", - qName, qStrand, queryName, queryStrand); - goto query_name_mismatch; - } - - if (qSp->p == NULL) // query is not partitioned - { - if (strcmp (qName, "*") != 0) - { - if ((queryName != NULL) - && (queryName[0] != 0) - && (strcmp (qName, queryName) != 0)) - { - if (segment_dbgAnchorParsing) - fprintf (stderr, " (query name mismatch-- %s%c vs %s%c)\n", - qName, qStrand, queryName, queryStrand); - goto query_name_mismatch; - } - } - - qSeqStart = query->startLoc - 1; - qOffset = 0; - qLen = query->len; - qSeqEnd = qSeqStart + qLen; - if (qStrand == '-') - { // negative strand - qTrue = query->trueLen; - qNegStart = qTrue - qSeqEnd; // (origin zero) - qSeqEnd = qTrue - qSeqStart; - qSeqStart = qNegStart; - } - if (qStart <= qSeqStart) goto query_interval_before_start; - if (qEnd > qSeqEnd) goto query_interval_after_end; - } - else if (strcmp (qName, "*") == 0) // query is partitioned and - goto query_wild_card; // .. name is wildcard - else // query is partitioned and - { // .. 
specific name is given - qNamePart = lookup_named_partition (query, qName); - if (qNamePart == NULL) goto bad_query_name; - if (qStrand != '-') - { // positive strand - qPart = lookup_partition_seq_pos (query, qNamePart, qStart); - if (qPart == NULL) goto bad_query_position_forward; - qSeqStart = qPart->startLoc - 1; - qOffset = qPart->sepBefore + 1; - qLen = qPart->sepAfter - qOffset; - qSeqEnd = qSeqStart + qLen; - if (qEnd > qSeqEnd) goto query_interval_after_end; - } - else - { // negative strand, see note 3 above - qTrue = qNamePart->trueLen; - qNegStart = qTrue+1 - qEnd; // (origin one) - qPart = lookup_partition_seq_pos (query, qNamePart, qNegStart); - if (qPart == NULL) goto bad_query_position_reverse; - qOffset = qPart->sepBefore + 1; - qLen = qPart->sepAfter - qOffset; - qSeqEnd = qTrue - (qPart->startLoc-1); - qSeqStart = qSeqEnd - qLen; // (origin zero) - if (qStart <= qSeqStart) goto query_interval_before_reversed_end; - } - } - - // resolve target interval - - if (tSp->p == NULL) // target is not partitioned - { - tSeqStart = target->startLoc - 1; - tOffset = 0; - tLen = target->len; - tSeqEnd = tSeqStart + tLen; - if (tStart <= tSeqStart) goto target_interval_before_start; - if (tEnd > tSeqEnd) goto target_interval_after_end; - } - else if (strcmp (tName, "*") == 0) // target is partitioned and - goto target_wild_card; // .. name is wildcard - else // target is partitioned and - { // .. specific name is given - tNamePart = lookup_named_partition (target, tName); - if (tNamePart == NULL) goto bad_target_name; - tPart = lookup_partition_seq_pos (target, tNamePart, tStart); - if (tPart == NULL) goto bad_target_position; - tSeqStart = tPart->startLoc - 1; - tOffset = tPart->sepBefore + 1; - tLen = tPart->sepAfter - tOffset; - tSeqEnd = tSeqStart + tLen; - if (tEnd > tSeqEnd) goto target_interval_after_end; - } - - // (phew!) 
it's a valid segment, add it to the table; note that we use - // the strand as an id - - tSegStart = tOffset + (tStart-1)-tSeqStart; - qSegStart = qOffset + (qStart-1)-qSeqStart; - segLen = tEnd+1 - tStart; - - if (segment_dbgAnchorParsing) - fprintf (stderr, " --> adding segment " unsposFmt ".." unsposFmt - "/" unsposFmt ".." unsposFmt "%c\n", - tSegStart, tSegStart + segLen, - qSegStart, qSegStart + segLen, qStrand); - - st = add_segment (st, tSegStart, qSegStart, segLen, - s, /*id*/ qStrand, /*hspId*/ 0); - continue; - - // the given target name is a wild card, and target is partitioned, so - // we have to add a segment for every sequence in the target; note that - // some partitions may have the same name (e.g. if the user is using a - // separator), so we have to look for the correct partition on a name- - // by-name basis - - target_wild_card: - - tPrevName = ""; - for (tIx=0 ; tIxlen ; tIx++) - { - tNamePart = &tSp->p[tIx]; - tPartName = &tSp->pool[tNamePart->header]; - if (strcmp (tPartName, tPrevName) == 0) - continue; // (this partititon has same name as previous) - tPrevName = tPartName; - - tPart = lookup_partition_seq_pos (target, tNamePart, tStart); - if (tPart == NULL) goto bad_target_position; - tSeqStart = tPart->startLoc - 1; - tOffset = tPart->sepBefore + 1; - tLen = tPart->sepAfter - tOffset; - tSeqEnd = tSeqStart + tLen; - if (tEnd > tSeqEnd) goto target_interval_after_end; - - // add the segment to the table; note that we use the strand as an - // id - - tSegStart = tOffset + (tStart-1)-tSeqStart; - qSegStart = qOffset + (qStart-1)-qSeqStart; - segLen = tEnd+1 - tStart; - - if (segment_dbgAnchorParsing) - fprintf (stderr, " --> adding segment " unsposFmt ".." unsposFmt - "/" unsposFmt ".." 
unsposFmt "%c\n", - tSegStart, tSegStart + segLen, - qSegStart, qSegStart + segLen, qStrand); - - st = add_segment (st, tSegStart, qSegStart, segLen, - s, /*id*/ qStrand, /*hspId*/ 0); - } - continue; - - // interval name or strand did not match query; this marks the end of - // the list; otherwise, we need to keep looking - - query_name_mismatch: - pendingLine = true; - pendingFirstAfterRewind = firstAfterRewind; - break; - } - - // success - - if (st != NULL) st->haveScores = true; - return st; - - ////////// - // failure exits - ////////// - -rewind_failed: - suicidef ("failed to rewind segments file\n" - "in read_segment_table for %s, index fseek(0) returned %d", - fName, err); - return NULL; - -split_line: - suicidef ("line is too long (%s: line %d)", fName, lineNum-1); - return NULL; - -not_enough_fields: - suicidef ("line has too few fields (%s: line %d)", fName, lineNum); - return NULL; - -too_many_fields: - suicidef ("line has too many fields (%s: line %d)", fName, lineNum); - return NULL; - -bad_field: - suicidef ("bad field (%s: line %d, %s)", fName, lineNum, field); - return NULL; - -bad_target_interval: - suicidef ("bad target interval (%s: line %d, " unsposFmt ">" unsposFmt ")", - fName, lineNum, tStart, tEnd); - return NULL; - -target_interval_before_start: - suicidef ("target interval out of range (%s: line %d, " unsposFmt "<" unsposFmt ")", - fName, lineNum, tStart, tSeqStart+1); - return NULL; - -target_interval_after_end: - // $$$ this should be more informative when a separator has been crossed - suicidef ("target interval out of range (%s: line %d, " unsposFmt ">" unsposFmt ")", - fName, lineNum, tEnd, tSeqEnd); - return NULL; - -bad_query_interval: - suicidef ("bad query interval (%s: line %d, " unsposFmt ">" unsposFmt ")", - fName, lineNum, qStart, qEnd); - return NULL; - -query_interval_before_start: - if (qStrand == '-') - suicidef ("query interval out of range (%s: line %d, " unsposFmt "<" unsposFmt")" - "\nminus strand subrange is " 
unsposFmt ".." unsposFmt, - fName, lineNum, qStart, qSeqStart+1, qSeqStart+1, qSeqEnd); - else - suicidef ("query interval out of range (%s: line %d, " unsposFmt "<" unsposFmt ")", - fName, lineNum, qStart, qSeqStart+1); - return NULL; - -query_interval_after_end: - // $$$ this should be more informative when a separator has been crossed - if (qStrand == '-') - suicidef ("query interval out of range (%s: line %d, " unsposFmt ">" unsposFmt")" - "\nminus strand subrange is " unsposFmt ".." unsposFmt, - fName, lineNum, qEnd, qSeqEnd, qSeqStart+1, qSeqEnd); - else - suicidef ("query interval out of range (%s: line %d, " unsposFmt ">" unsposFmt ")", - fName, lineNum, qEnd, qSeqEnd); - return NULL; - -query_interval_before_reversed_end: - suicidef ("query interval out of range (%s: line %d, " unsposFmt "<" unsposFmt ")" - "\nminus strand subrange is " unsposFmt ".." unsposFmt, - fName, lineNum, qStart, qSeqStart+1, qSeqStart+1, qSeqEnd); - return NULL; - -interval_length_mismatch: - suicidef ("intervals have different lengths (%s: line %d, " unsposFmt " vs. 
" unsposFmt ")", - fName, lineNum, tEnd+1-tStart, qEnd+1-qStart); - return NULL; - -bad_strand: - suicidef ("bad strand (%s: line %d, %c)", - fName, lineNum, qStrand); - return NULL; - -bad_target_name: - suicidef ("bad target sequence name (%s: line %d, %s)", - fName, lineNum, tName); - return NULL; - -bad_target_position: - suicidef ("bad target sequence name/position (%s: line %d, %s:" unsposFmt ")", - fName, lineNum, tName, tStart); - return NULL; - -bad_query_name: - suicidef ("bad query sequence name (%s: line %d, %s)", - fName, lineNum, qName); - return NULL; - -bad_query_position_forward: - suicidef ("bad query sequence name/position (%s: line %d, %s:" unsposFmt ")", - fName, lineNum, qName, qStart); - return NULL; - -bad_query_position_reverse: - suicidef ("bad query sequence name/position (%s: line %d, %s:" unsposFmt ")" - "\n" unsposFmt " on forward strand", - fName, lineNum, qName, qStart, qNegStart); - return NULL; - -query_wild_card: - suicidef ("bad query sequence name (%s: line %d, %s)\n" - "wildcard segment name (*) is not supported for queries with [multi]", - fName, lineNum, qName); - return NULL; - -extra_segments: - if (prevQueryWasPartitioned) - suicidef ("extra segments in file (%s: line %d, %s/%s%c)\n" - "(for this usage all + strand segments must appear before all - strand segments)", - fName, lineNum, tName, qName, qStrand); - else - suicidef ("extra segments in file (%s: line %d, %s/%s%c)\n" - "(for this usage segments must appear in the same order as the query file, with\n" - "all + strand segments before all - strand segments for each query)", - fName, lineNum, tName, qName, qStrand); - return NULL; - } - -//---------- -// -// add_segment-- -// Add a segment to a segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to add to. -// pos1, pos2, length, s, id, hspId: The segment to add. (Positions are -// .. origin-zero). 
-// -// Returns: -// A pointer to a the segment table; if there was room in st for the segment, -// this is the same as st; otherwise, this is a pointer to a new copy of st, -// and the previous st has been deallocated; reallocation failures result in -// program fatality. -// -//---------- - -segtable* add_segment - (segtable* st, - unspos pos1, - unspos pos2, - unspos length, - score s, - int id, - u64 hspId) - { - static u64 hspIdCounter; - u32 newSize; - size_t bytesNeeded; - segment* seg, *parent; - segment tempSeg; - int ix, pIx; - int tied, stopped; - -// fprintf (stderr, "add " unsposSlashSFmt " " unsposFmt " " scoreFmtSimple "; id %d\n", -// pos1+1, "+", -// pos2+1, ((id & rcf_rev) != 0)? "-" : "+", -// length, s, id); - - if (hspId == 0) - hspId = ++hspIdCounter; - - ////////// - // add the segment to the table, enlarging the table if needed, but - // discarding the segment if it is low-scoring and the table has met its - // coverage limit - ////////// - - // if the table is already full and this segment scores less than the - // lowest score in the table, discard it - - if ((st->coverageLimit != 0) - && (st->coverage >= st->coverageLimit) - && (st->len > 0) - && (s < st->lowScore)) - return st; - - // if there's no room for the new segment, re-allocate - - if (st->len >= st->size) - { - newSize = st->size + 100 + (st->size / 3); - bytesNeeded = segtable_bytes (newSize); -#if (SIZE_MAX > mallocLimit) - if (bytesNeeded > mallocLimit) goto overflow; -#endif // overflow possible - st = (segtable*) realloc_or_die ("add_segment", st, bytesNeeded); - st->size = newSize; - st->seg = st->_seg; // (hook up the segment table) - } - - // add the segment, by appending it at the end - - seg = &st->seg[st->len++]; - seg->pos1 = pos1; - seg->pos2 = pos2; - seg->length = length; - seg->s = s; - seg->id = id; - seg->hspId = hspId; - seg->filter = false; - seg->scoreCov = (possum) length; - - st->coverage += length; - if ((st->len == 1) || (s < st->lowScore)) st->lowScore 
= s; - - ////////// - // handle the transition between the two table states - // below-the-coverage-limit: table is kept as a simple list - // met-the-coverage-limit: table is kept as a proper min-heap - ////////// - - // if this segment leaves us below the limit, we're done - - if ((st->coverageLimit == 0) - || (st->coverage < st->coverageLimit)) - return st; - - // if this is the first time we've reached the limit, sort the segments to - // create a proper min-heap, and add the tied-score information - // nota bene: if we reach here, st->coverageLimit > 0 and - // st->coverage >= st->coverageLimit - - if (st->coverage - length < st->coverageLimit) - { - sort_segments (st, qSegmentsByIncreasingScore); - record_tie_scores (st); - #ifdef debugBinaryHeap - fprintf (stderr, "\nafter sort:\n"); - dump_segments (stderr, st, NULL, NULL); - validate_heap (st, "after sort"); - #endif // debugBinaryHeap - goto prune; - } - - ////////// - // maintain the min-heap property - ////////// - - #ifdef debugBinaryHeap - //fprintf (stderr, "\nbefore percolation:\n"); - //dump_segments (stderr, st, NULL, NULL); - #endif // debugBinaryHeap - - // the rest of the list is a proper min-heap, so percolate the new segment - // up the tree, while maintaining the tied-score information - // nota bene: if we reach here, length >= 2 - - tied = false; - for (ix=st->len-1 ; ix>0 ; ) - { - pIx = (ix-1) / 2; - seg = &st->seg[ix]; - parent = &st->seg[pIx]; - - if (seg->s >= parent->s) - { tied = (seg->s == parent->s); break; } - - // swap this segment with its parent, and adjust old parent's tied-score - // subheap - - tempSeg = *seg; *seg = *parent; *parent = tempSeg; - record_tie_score (st, ix); - - ix = pIx; - } - - record_tie_score (st, ix); - - // if the new segment tied an existing score, we must continue to percolate - // the tied-score info up the tree - - if (tied) - { - stopped = false; - for (ix=(ix-1)/2 ; ix>0 ; ix=(ix-1)/2) - { - if (!record_tie_score (st, ix)) - { stopped = true; 
break; } - } - if (!stopped) record_tie_score (st, 0); - } - - #ifdef debugBinaryHeap - fprintf (stderr, "\nafter percolation:\n"); - dump_segments (stderr, st, NULL, NULL); - validate_heap (st, "after percolation"); - #endif // debugBinaryHeap - - ////////// - // remove low-scoring segments - ////////// - -prune: - - // if removing the minimum scoring subheap would bring us below the - // limit, no pruning is necessary - - if (st->coverage - st->seg[0].scoreCov < st->coverageLimit) - return st; - - // otherwise, we must remove subheaps as long as doing so leaves us at or - // above the limit - - while (st->coverage - st->seg[0].scoreCov >= st->coverageLimit) - { - s = st->seg[0].s; - while (st->seg[0].s == s) - { - remove_root (st); - #ifdef debugBinaryHeap - fprintf (stderr, "\nafter a pruning:\n"); - dump_segments (stderr, st, NULL, NULL); - validate_heap (st, "after pruning"); - #endif // debugBinaryHeap - } - } - - st->lowScore = st->seg[0].s; - - #ifdef debugBinaryHeap - fprintf (stderr, "\nafter pruning:\n"); - dump_segments (stderr, st, NULL, NULL); - validate_heap (st, "after pruning"); - #endif // debugBinaryHeap - - return st; - -// failure exits - -#if (SIZE_MAX > mallocLimit) -#define suggestions " consider raising scoring threshold (--hspthresh or --exact)" \ - " or breaking your target sequence into smaller pieces" - -overflow: - suicidef ("in add_segment()\n" - "table size (%s for %s segments) exceeds allocation limit of %s;\n" - suggestions, - commatize(bytesNeeded), - commatize(newSize), - commatize(mallocLimit)); - return NULL; // (doesn't get here) -#endif // overflow possible - } - - -static void remove_root (segtable* st) - { - segment* seg, *detached, *child, *rgtChild; - u32 ix, childIx, rgtIx; - - // remove the root node's coverage - - seg = &st->seg[0]; - st->coverage -= seg->length; - - if (st->len <= 1) - { st->len = 0; return; } - - // detach the final segment; conceptually we will consider this as the new - // root, then percolate it 
down to a new position - - detached = &st->seg[--st->len]; - - if (st->len == 1) - { *seg = *detached; return; } - - // recompute tied-score info up the tree from the detached node - // nota bene: st->len > 1, so the initial value of ix, (st->len-1)/2 >= 0 - - for (ix=(st->len-1)/2 ; ix>0 ; ix=(ix-1)/2) - { if (!record_tie_score (st, ix)) break; } - - // the detached node may violate the min-heap property; percolate it down - // the tree until the property is re-established - - ix = 0; - while (true) - { - childIx = 2*ix+1; - if (childIx >= st->len) break; // (ix is a leaf position) - child = &st->seg[childIx]; - - rgtIx = childIx + 1; - if (rgtIx < st->len) - { - rgtChild = &st->seg[rgtIx]; - if (rgtChild->s < child->s) - { childIx = rgtIx; child = rgtChild; } - } - - if (detached->s <= child->s)// (if we place the detached node here - break; // .. we'll satisfy the min-heap property) - - st->seg[ix] = *child; // move the child up the tree - - ix = childIx; - } - - st->seg[ix] = *detached; // copy the detached node into the tree - - // now reverse that path, up the tree, to recompute tied-score info - - for ( ; ix>0 ; ix=(ix-1)/2) - record_tie_score (st, ix); - record_tie_score (st, 0); - } - -//---------- -// -// record_tie_scores-- -// Record tied-score information in a segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table. -// -// Returns: -// (nothing) -// -//---------- - -static void record_tie_scores - (segtable* st) - { - int ix; - - // determine the coverage of all equal-scoring subheaps; we start at the - // tail and percolate coverage up the tree whenever we have a tie; at the - // end, seg->scoreCov will be equal to the sum of the lengths of the - // subheap rooted at seg that consists of segments having the same score - - for (ix=st->len-1 ; ix>=0 ; ix--) - record_tie_score (st, ix); - } - -//---------- -// -// record_tie_score-- -// Record tied-score information for one node in a segment table. 
-// -//---------- -// -// Arguments: -// segtable* st: The segment table. -// int ix: The node index. -// -// Returns: -// true => score coverage for the node was changed by this operation; -// false => score coverage for the node remained the same -// -//---------- - -static int record_tie_score - (segtable* st, - int ix) - { - possum scoreCov; - segment* seg, *lftChild, *rgtChild; - u32 lftIx, rgtIx; - - seg = &st->seg[ix]; - - // scoreCov is the sum of the length of this segment, plus the left and - // right subheaps *if* they exist and have the same score as this segment - - scoreCov = (possum) seg->length; - - lftIx = 2*ix+1; - if (lftIx < st->len) - { - lftChild = &st->seg[lftIx]; - if (lftChild->s == seg->s) - scoreCov += lftChild->scoreCov; - - rgtIx = lftIx + 1; - if (rgtIx < st->len) - { - rgtChild = &st->seg[rgtIx]; - if (rgtChild->s == seg->s) - scoreCov += rgtChild->scoreCov; - } - } - - if (scoreCov != seg->scoreCov) - { seg->scoreCov = scoreCov; return true; } - - return false; - } - -//---------- -// -// split_segment_table-- -// Split a segment table into two tables. All segments with a particular id -// are left in the table, and all others are moved to a new table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to split. -// int id: The id of segments to keep in the incoming table. -// segtable** leftovers: The segment table in which to collect the leftovers. -// .. This table must already exist. 
-// -// Returns: -// (nothing) -// -//---------- - -void split_segment_table - (segtable* st, - int id, - segtable** _leftovers) - { - segtable* leftovers = *_leftovers; - possum cov; - score lowScore; - u32 srcIx, dstIx; - segment* seg; - - cov = 0; - lowScore = worstPossibleScore; - - //fprintf (stderr, "incoming\n"); - //dump_segments (stderr, st, NULL, NULL); - - dstIx = 0; - for (srcIx=0 ; srcIxlen ; srcIx++) - { - seg = &st->seg[srcIx]; - - if (seg->id != id) - { - leftovers = add_segment (leftovers, - seg->pos1, seg->pos2, seg->length, - seg->s, seg->id, seg->hspId); - continue; - } - - cov += seg->length; - if ((dstIx == 0) || (seg->s < lowScore)) lowScore = seg->s; - - if (dstIx != srcIx) st->seg[dstIx] = *seg; - dstIx++; - } - - st->len = dstIx; - st->coverage = cov; - st->lowScore = lowScore; - - //fprintf (stderr, "\nid=%d\n", id); - //dump_segments (stderr, st, NULL, NULL); - //fprintf (stderr, "\nileftovers\n"); - //dump_segments (stderr, leftovers, NULL, NULL); - //fprintf (stderr, "\n"); - - *_leftovers = leftovers; - } - -//---------- -// -// score_segments-- -// Score every segment in a table, as the sum of the substitution scores along -// the segment. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to score. -// seq* seq1: The sequence corresponding to the segments' pos1. -// seq* seq2: The sequence corresponding to the segments' pos2. -// scoreset* scoring: The scoring scheme; usually this treats lowercase -// .. letters as being 'bad'. 
-// -// Returns: -// (nothing) -// -//---------- - -void score_segments - (segtable* st, - seq* seq1, - seq* seq2, - scoreset* scoring) - { - u32 ix; - segment* seg; - u8* s1, *s2; - score s; - unspos togo; - - for (ix=0,seg=st->seg ; ixlen ; ix++,seg++) - { - s1 = seq1->v + seg->pos1; - s2 = seq2->v + seg->pos2; - - s = 0; - for (togo=seg->length ; togo>0 ; togo--) - s += scoring->sub[*(s1++)][*(s2++)]; - - seg->s = s; - } - - } - -//---------- -// -// sort_segments-- -// Sort a segment table. -// sort_some_segments-- -// Sort part of a segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to add to. -// u32 start: (sort_some_segments only) Index into the table of the -// .. first entry to be sorted. -// u32 end: (sort_some_segments only) Index into the table of the -// .. first entry NOT to be sorted (i.e. the one after the -// .. last index to be sorted). -// int (*qCompare): Comparison function to use (suitable for the standard -// .. c function qsort). For example, this could be -// .. qSegmentsByPos2. -// -// Returns: -// (nothing) -// -//---------- - -void sort_segments - (segtable* st, - int (*qCompare) (const void* el1, const void* el2)) - { - qsort (st->seg, st->len, sizeof(segment), qCompare); - } - - -void sort_some_segments - (segtable* st, - u32 start, - u32 end, - int (*qCompare) (const void* el1, const void* el2)) - { - if (end <= start) return; // (empty interval) - if (end > st->len) return; // (interval is beyond end of lest) - qsort (st->seg+start, end-start, sizeof(segment), qCompare); - } - -//---------- -// -// merge_segments-- -// Merge any overlapping segments in a segment table. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to operate upon. -// -// Returns: -// (nothing) -// -//---------- -// -// Notes: -// (1) Segments are considered to overlap if they are on the same diagonal and -// share any positions. Segments that adjoin, but do not overlap, are not -// merged. 
-// (2) Any merged segment is given the maximum score of the segments being -// merged. If this is not appropriate, the caller will need to rescore -// the segments afterwards. -// $$$ The problem with making the caller rescore is that the caller has -// $$$ .. no way to know which segments have been merged, thus *every* -// $$$ .. segment would have to be rescored. We could resolve this by -// $$$ .. having the caller provide a callback routine to rescore a -// $$$ .. segment. -// (3) The min-heap property is NOT maintained. We view this routine as a -// post-processing step, and we do not expect additional segments to be -// added to the table after this is called. -// -//---------- - -void merge_segments - (segtable* st) - { - u32 srcIx, dstIx; - segment* srcSeg, *dstSeg; - unspos pos2, end2, srcPos2, srcEnd2; - sgnpos diag, srcDiag; - score s, srcS; - - // if we have fewer than two segments, there's nothing to merge - - if (st->len < 2) return; - - // sort segments by increasing pos2 along each diagonal - - sort_segments (st, qSegmentsByDiag); - - // start first segment - - srcSeg = st->seg; - pos2 = srcSeg->pos2; - diag = diagNumber (srcSeg->pos1, pos2); - end2 = pos2 + srcSeg->length; - s = srcSeg->s; - srcSeg++; - - // scan segments, merging as needed; note that any segment written to the - // list is written to an earlier (or the same) index as the latest read - - dstIx = 0; - for (srcIx=1 ; srcIxlen ; srcIx++,srcSeg++) - { - srcPos2 = srcSeg->pos2; - srcDiag = diagNumber (srcSeg->pos1, srcPos2); - srcEnd2 = srcPos2 + srcSeg->length; - srcS = srcSeg->s; - - if ((srcDiag == diag) && (srcPos2 < end2)) - { // merge - if (srcEnd2 > end2) end2 = srcEnd2; - if (srcS > s) s = srcS; - continue; - } - - // deposit the previous segment - - dstSeg = &st->seg[dstIx++]; - dstSeg->pos1 = (unspos) (diag + pos2); - dstSeg->pos2 = pos2; - dstSeg->length = end2 - pos2; - dstSeg->s = s; - - // start a new segment - - pos2 = srcPos2; - diag = srcDiag; - end2 = srcEnd2; - s = 
srcS; - } - - // deposit the final segment - - dstSeg = &st->seg[dstIx++]; - dstSeg->pos1 = (unspos) (diag + pos2); - dstSeg->pos2 = pos2; - dstSeg->length = end2 - pos2; - dstSeg->s = s; - - // adjust the length of the list - - st->len = dstIx; - } - -//---------- -// -// filter_marked_segments-- -// Remove from a table any segments marked for filtering. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to operate upon. -// -// Returns: -// nothing; the segment table is modified in place, with segments NOT marked -// for filtering brought to the front, and the table shortened by modifying -// st->len. -// -//---------- - -void filter_marked_segments - (segtable* st) - { - segment* srcSeg, *dstSeg; - - if (st == NULL) return; -// if (st->seg == NULL) return; clang flags this; seg is an array, not a -// pointer, so it can never be NULL; unless the -// implementation of segments is changed, which -// is why I build in this sanity check - - for (dstSeg=srcSeg=st->seg ; ((u32)(srcSeg-st->seg))len ; srcSeg++) - { - if (srcSeg->filter) continue; - if (srcSeg != dstSeg) *dstSeg = *srcSeg; - dstSeg++; - } - - st->len = dstSeg - st->seg; - } - -//---------- -// [[-- comparison function for the standard c function qsort --]] -// -// qSegmentsByPos1-- -// Compare two segments by pos1, so that qsort will sort by increasing pos1. -// -//---------- -// -// Arguments: -// const void* (really segment*) segA: Pointer to one segment. -// const void* (really segment*) segB: Pointer to another. -// -// Returns: -// > 0 => segA is greater than segB. -// = 0 => segA and segB are the same. -// < 0 => segA is less than segB. 
-// -//---------- - -int qSegmentsByPos1 - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - - if (segA->pos1 < segB->pos1) return -1; - else if (segA->pos1 > segB->pos1) return 1; - - if (segA->length < segB->length) return -1; - else if (segA->length > segB->length) return 1; - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - if (segA->s < segB->s) return -1; - else if (segA->s > segB->s) return 1; - - return 0; - } - -//---------- -// [[-- comparison function for the standard c function qsort --]] -// -// qSegmentsByPos2-- -// Compare two segments by pos2, so that qsort will sort by increasing pos2. -// -//---------- -// -// Arguments: -// const void* (really segment*) segA: Pointer to one segment. -// const void* (really segment*) segB: Pointer to another. -// -// Returns: -// > 0 => segA is greater than segB. -// = 0 => segA and segB are the same. -// < 0 => segA is less than segB. -// -//---------- - -int qSegmentsByPos2 - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - if (segA->length < segB->length) return -1; - else if (segA->length > segB->length) return 1; - - if (segA->pos1 < segB->pos1) return -1; - else if (segA->pos1 > segB->pos1) return 1; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - if (segA->s < segB->s) return -1; - else if (segA->s > segB->s) return 1; - - return 0; - } - -//---------- -// [[-- comparison function for the standard c function qsort --]] -// -// qSegmentsByDecreasingScore-- -// Compare two segments by score, so that qsort will sort by decreasing score. 
-// qSegmentsByIncreasingScore-- -// Compare two segments by score, so that qsort will sort by increasing score. -// -//---------- -// -// Arguments: -// const void* (really segment*) segA: Pointer to one segment. -// const void* (really segment*) segB: Pointer to another. -// -// Returns: -// (for qSegmentsByDecreasingScore) (for qSegmentsByIncreasingScore) -// > 0 => segA is greater than segB. > 0 => segA is greater than segB. -// = 0 => segA and segB are the same. = 0 => segA and segB are the same. -// < 0 => segA is less than segB. < 0 => segA is less than segB. -// -//---------- - -int qSegmentsByDecreasingScore - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - - if (segA->s < segB->s) return 1; - else if (segA->s > segB->s) return -1; - - if (segA->length < segB->length) return -1; // if scores are equal we - else if (segA->length > segB->length) return 1; // .. prefer shorter length - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - if (segA->pos1 < segB->pos1) return -1; - else if (segA->pos1 > segB->pos1) return 1; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - return 0; - } - - -int qSegmentsByIncreasingScore - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - - if (segA->s < segB->s) return -1; - else if (segA->s > segB->s) return 1; - - if (segA->length < segB->length) return -1; // if scores are equal we - else if (segA->length > segB->length) return 1; // .. 
prefer shorter length - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - if (segA->pos1 < segB->pos1) return -1; - else if (segA->pos1 > segB->pos1) return 1; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - return 0; - } - -//---------- -// [[-- comparison function for the standard c function qsort --]] -// -// qSegmentsByDiag-- -// Compare two segments by diagonal, so that qsort will sort by increasing -// diagonal, and by increasing pos2 along each diagonal. -// -//---------- -// -// Arguments: -// const void* (really segment*) segA: Pointer to one segment. -// const void* (really segment*) segB: Pointer to another. -// -// Returns: -// > 0 => segA is greater than segB. -// = 0 => segA and segB are the same. -// < 0 => segA is less than segB. -// -//---------- - -int qSegmentsByDiag - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - sgnpos diagA, diagB; - - // compare by diagonal - - diagA = diagNumber (segA->pos1, segA->pos2); - diagB = diagNumber (segB->pos1, segB->pos2); - - if (diagA < diagB) return -1; - else if (diagA > diagB) return 1; - - // resort to tiebreakers - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - if (segA->length < segB->length) return -1; - else if (segA->length > segB->length) return 1; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - if (segA->s < segB->s) return -1; - else if (segA->s > segB->s) return 1; - - return 0; - } - -//---------- -// [[-- comparison function for the standard c function qsort --]] -// -// qSegmentsById-- -// Compare two segments by id, so that qsort will sort by increasing id. -// -//---------- -// -// Arguments: -// const void* (really segment*) segA: Pointer to one segment. -// const void* (really segment*) segB: Pointer to another. 
-// -// Returns: -// > 0 => segA is greater than segB. -// = 0 => segA and segB are the same. -// < 0 => segA is less than segB. -// -//---------- - -int qSegmentsById - (const void* _segA, - const void* _segB) - { - segment* segA = (segment*) _segA; - segment* segB = (segment*) _segB; - - if (segA->id < segB->id) return -1; - else if (segA->id > segB->id) return 1; - - if (segA->s < segB->s) return -1; - else if (segA->s > segB->s) return 1; - - if (segA->pos1 < segB->pos1) return -1; - else if (segA->pos1 > segB->pos1) return 1; - - if (segA->length < segB->length) return -1; - else if (segA->length > segB->length) return 1; - - if (segA->pos2 < segB->pos2) return -1; - else if (segA->pos2 > segB->pos2) return 1; - - return 0; - } - -//---------- -// -// write_segments-- -// Write a segment table to a file, for debugging. -// -// The table will look something like what is shown below. Intervals are -// origin-zero, half-open. -// -// [0] apple 1126 1177 orange 411 461 + 0 -// [1] apple 1224 1267 orange 498 540 + 0 -// [2] apple 1262 1322 orange 562 621 + 0 -// [3] apple 1340 1377 orange 655 691 + 0 -// [4] apple 1471 1530 orange 807 865 + 0 -// -// [0] apple 576 624 orange 222 269 - 0 -// [1] apple 666 724 orange 326 383 - 0 -// [2] apple 856 930 orange 482 555 - 0 -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// segtable* st: The segment table to dump. -// seq* target: The target sequence the table relates to. -// seq* query: The query sequence the table relates to. 
-// int withText: true => add nucleotide text (for debug only) -// int subsample: <=0 => write all segments -// >0 => write every Nth segment -// -// Returns: -// (nothing) -// -//---------- - -void write_segments - (FILE* f, - segtable* st, - seq* target, - seq* query, - int withText, - int subsample) - { - u32 segIx; - segment* seg; - static char* tName, *qName; - static unspos tStart, qStart; - seqpartition* tSp = &target->partition; - seqpartition* qSp = &query->partition; - partition* tPart, *qPart; - u8* tV, *qV; - u32 ix; - - for (segIx=0,seg=st->seg ; segIxlen ; segIx++,seg++) - { - if ((subsample != 0) && (segIx % subsample != 0)) continue; - - tPart = NULL; - if (tSp->p == NULL) // target is not partitioned - { - tName = (target->useFullNames)? target->header : target->shortHeader; - if ((tName == NULL) || (tName[0] == 0)) tName = "target"; - tStart = seg->pos1; - } - else // target is partitioned - { - tPart = lookup_partition (target, seg->pos1); - tName = &tSp->pool[tPart->header]; - tStart = seg->pos1 - (tPart->sepBefore + 1); - } - - qPart = NULL; - if (qSp->p == NULL) // query is not partitioned - { - qName = (query->useFullNames)? 
query->header : query->shortHeader; - if ((qName == NULL) || (qName[0] == 0)) qName = "query"; - qStart = seg->pos2; - } - else // query is partitioned - { - qPart = lookup_partition (query, seg->pos2); - qName = &qSp->pool[qPart->header]; - qStart = seg->pos2 - (qPart->sepBefore + 1); - } - - fprintf (f, "[%d]", segIx); - if (tPart == NULL) - fprintf (f, " %s " unsposFmt " " unsposFmt, - tName, tStart, tStart+seg->length); - else - fprintf (f, " %s " unsposFmt "+" unsposFmt " " unsposFmt "+" unsposFmt, - tName, tPart->startLoc-1, tStart, tPart->startLoc-1, tStart+seg->length); - if (qPart == NULL) - fprintf (f, " %s " unsposFmt " " unsposFmt, - qName, qStart, qStart+seg->length); - else - fprintf (f, " %s " unsposFmt "+" unsposFmt " " unsposFmt "+" unsposFmt, - qName, qPart->startLoc-1, qStart, qPart->startLoc-1, qStart+seg->length); - fprintf (f, " %c " scoreFmtSimple, seg->id, seg->s); - - if (withText) - { - tV = target->v + seg->pos1; - qV = query->v + seg->pos2; - - fprintf (f, " %lu:", (unsigned long) (tV - target->v)); - for (ix=0 ; ixlength ; ix++) - fprintf (f, "%c", dna_toprint(tV[ix])); - fprintf (f, " %lu:", (unsigned long) (qV - query->v)); - for (ix=0 ; ixlength ; ix++) - fprintf (f, "%c", dna_toprint(qV[ix])); - } - - fprintf (f, "\n"); - } - } - -//---------- -// -// dump_segments-- -// Dump a segment table, for debugging. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// segtable* st: The segment table to dump. -// char* sym1,sym2: Identifying strings to attach to pos1 and pos2, -// .. respectively. E.g. "+" or "-" for DNA strand. -// .. These can be NULL, in which case we presume that -// .. segment id fields are the second sequence's -// .. revCompFlags, and we take the strand from that. 
-// -// Returns: -// (nothing) -// -//---------- - -//#define show_separation - -void dump_segments - (FILE* f, - segtable* st, - char* _sym1, - char* _sym2) - { - char* sym1, *sym2; - u32 ix; - segment* seg; - sgnpos diag; -#ifdef show_separation - unspos prevPos2 = 0; - sgnpos prevDiag = 0; -#endif // show_separation - - sym1 = _sym1; - sym2 = _sym2; - - for (ix=0,seg=st->seg ; ixlen ; ix++,seg++) - { - if (_sym1 == NULL) sym1 = "+"; - if (_sym2 == NULL) sym2 = ((seg->id & rcf_rev) != 0)? "-" : "+"; - - diag = diagNumber (seg->pos1, seg->pos2); -#ifdef show_separation - if ((ix != 0) && (diag == prevDiag)) - fprintf (f, "[" unsposFmt "] " unsposSlashSFmt " %d " scoreFmtSimple - "; id %d, diag " sgnposFmt " * (" unsposFmt "), tied_length=" possumFmt "\n", - ix, seg->pos1+1, sym1, seg->pos2+1, sym2, seg->length, - seg->s, seg->id, diag, seg->pos2-prevPos2, seg->scoreCov); - else -#endif // show_separation - fprintf (f, "[%d] " unsposSlashSFmt " " unsposFmt " " scoreFmtSimple - "; id %d, diag " sgnposFmt ", tied_length=" possumFmt "\n", - ix, seg->pos1+1, sym1, seg->pos2+1, sym2, seg->length, - seg->s, seg->id, diag, seg->scoreCov); -#ifdef show_separation - prevDiag = diag; - prevPos2 = seg->pos2; -#endif // show_separation - } - - } - -//---------- -// -// validate_heap-- -// Check whether a segment table is a valid min-heap, for debugging. -// -//---------- -// -// Arguments: -// segtable* st: The segment table to check. -// char* msg: A message to output if validate fails. 
-// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -#ifdef debugBinaryHeap - -static void validate_heap - (segtable* st, - char* msg) - { - possum scoreCov; - segment* seg, *lftChild, *rgtChild; - int ix, lftIx, rgtIx; - - if (st->coverage < st->coverageLimit) - suicidef ("%s, below coverage limit", msg); - - for (ix=0,seg=st->seg ; ixlen ; ix++,seg++) - { - scoreCov = (possum) seg->length; - - lftIx = 2*ix+1; - if (lftIx < st->len) - { - lftChild = &st->seg[lftIx]; - if (lftChild->s < seg->s) - suicidef ("%s, node %d > node %d", msg, ix, lftIx); - if (lftChild->s == seg->s) - scoreCov += lftChild->scoreCov; - - rgtIx = lftIx + 1; - if (rgtIx < st->len) - { - rgtChild = &st->seg[rgtIx]; - if (rgtChild->s < seg->s) - suicidef ("%s, node %d > node %d", msg, ix, rgtIx); - if (rgtChild->s == seg->s) - scoreCov += rgtChild->scoreCov; - } - } - - if (scoreCov != seg->scoreCov) - suicidef ("%s, node %d has bad score coverage", msg, ix); - } - - } - -#endif // debugBinaryHeap - diff --git a/programs/lastz/src/segment.h b/programs/lastz/src/segment.h deleted file mode 100755 index e29ac45..0000000 --- a/programs/lastz/src/segment.h +++ /dev/null @@ -1,137 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: segment.h -// -//---------- - -#ifndef segment_H // (prevent multiple inclusion) -#define segment_H - -//---------- -// -// other files -// -//---------- - -#include // standard C i/o stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff - -// establish ownership of global variables - -#ifdef segment_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef segment_owner -int segment_dbgAnchorParsing = false; // true => debug anchor parsing -#else -global int segment_dbgAnchorParsing; -#endif - -//---------- -// -// data structures and types -// -//---------- - -// segment-- -// A segment is a 
pair of intervals with the same length. This can represent -// many things, for example an ungapped alignment between two sequences. - -typedef struct segment - { - u64 hspId; // (for debugging) uniquely identifies this hsp - // .. within one run of the program - unspos pos1; // start of the interval, in one sequence (this - // .. is origin-zero) - unspos pos2; // start of the interval, in the other sequence - // .. (this is origin-zero) - unspos length; // length of the interval (e.g. number of - // .. nucleotides - score s; // segment score, (e.g. score of an ungapped - // .. match) - int id; // identifier to be used at caller's will - possum scoreCov; // total lengths of the subheap (rooted at this - // .. node) that has the same score as this - // .. segment; this is only valid when the - // .. segtable is a proper min-heap - int filter; // true => this segment should be discarded - } segment; - -// segment table-- -// A segment table is a list of segments. -// -// Note that normally the segment array _seg[] is allocated allocated as part -// of this heap block, and seg points to that array. Access to the list should -// be through seg, not _seg. In rare cases, one segtable object will not have -// its own _seg[] array -- seg will point into another segtable object's _seg[] -// array. This is done to implement subsets of the array, see the function -// subset_segment_table for more information. - -typedef struct segtable - { - u32 size; // the number of entries allocated for seq[] - u32 len; // the number of entries in seq[] that are - // .. actually used - int haveScores; // true => the segments have been scored - // false => they have not been scored - unspos coverageLimit; // 'suggested' limit on the total lengths of - // .. the segments in the table (see discussion - // .. in file header of segment.c about how the - // .. limit is honored); zero indicates no - // .. 
limit - possum coverage; // total lengths of the segments in the table - score lowScore; // score of lowest segment in the table; if - // .. there are no segments in the table this - // .. is worstPossibleScore - segment* seg; // pointer to the segment table (which is - // .. allocated as part of this heap block) - segment _seg[1]; // the segment table (variable-length array) - } segtable; - -//---------- -// -// prototypes for routines in segment.c -// -//---------- - -segtable* new_segment_table (u32 size, unspos coverageLimit); -void subset_segment_table (segtable* st, u32 startIx, u32 endIx, - segtable* subset); -void empty_segment_table (segtable* st); -void limit_segment_table (segtable* st, unspos coverageLimit); -void free_segment_table (segtable* st); -segtable* read_segment_table (FILE* f, char* fName, segtable* st, - seq* target, seq* query); -segtable* add_segment (segtable* st, - unspos pos1, unspos pos2, unspos length, - score s, int id, u64 hspId); -void split_segment_table (segtable* st, int id, segtable** leftovers); -void score_segments (segtable* st, seq* seq1, seq* seq2, - scoreset* scoring); -void sort_segments (segtable* st, - int (*qCompare) (const void* el1, const void* el2)); -void sort_some_segments (segtable* st, - u32 start, u32 end, - int (*qCompare) (const void* el1, const void* el2)); -void merge_segments (segtable* st); -void filter_marked_segments (segtable* st); -int qSegmentsByPos1 (const void* segA, const void* segB); -int qSegmentsByPos2 (const void* segA, const void* segB); -int qSegmentsByDecreasingScore (const void* segA, const void* segB); -int qSegmentsByIncreasingScore (const void* segA, const void* segB); -int qSegmentsByDiag (const void* segA, const void* segB); -int qSegmentsById (const void* segA, const void* segB); -void write_segments (FILE* f, segtable* st, - seq* target, seq* query, - int withText, int subsample); -void dump_segments (FILE* f, segtable* st, - char* sym1, char* sym2); - -#undef global -#endif // 
segment_H diff --git a/programs/lastz/src/sequences.c b/programs/lastz/src/sequences.c deleted file mode 100755 index fac7ab6..0000000 --- a/programs/lastz/src/sequences.c +++ /dev/null @@ -1,10020 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: sequences.c -// -//---------- -// -// sequences-- -// Support for DNA sequences. -// -// FASTA format stores DNA sequences as plain text. A single file can store -// multiple sequences, which we call contigs. Each contig begins with a header -// line, which begins with a ">". -// -// FASTQ format stores DNA sequences and associated base-call quality scores as -// plain text. A single file can store multiple sequences (contigs). Each -// contig consists of four lines-- a header line (which begins with a "@"), a -// sequence line (nucleotides), a separator line (which begins with a "+"), and -// a qualities line. As of Apr/2011, a spec for FASTQ files can be found at -// http://maq.sourceforge.net/fastq.shtml -// Note that there has been much confusion with FASTQ quality values, since -// different variants of the format encode them in different ways. As of -// Apr/2011, the only thing we don't interpret quality values, so we are immune -// to that confusion. -// -// NIB format stores a single DNA sequence, containing {A,C,G,T,a,c,g,t,N,N} in -// two bases per byte. As of Jan/2008, a spec for NIB files can be found at -// http://genome.ucsc.edu/FAQ/FAQformat#format8 -// -// 2BIT format stores multiple DNA sequences encoded as four bases per byte -// with some additional information describing runs of masked bases or Ns. As -// of Jan/2008, a spec for 2BIT files can be found at -// http://genome.ucsc.edu/FAQ/FAQformat#format7 -// -// HSX format is a hashed sequence index that consists of references to -// sequences in other files. This allows random access. See the file format -// spec in the lastz readme file for more information. 
-// -// Quantum-dna format stores each base as a byte, with the meaning of the -// byte values essentially defined by the scoring matrix. See the file format -// spec in the lastz readme file for more information. -// -//---------- -// -// WARNING: As of this writing (Apr/2008), the code to read sequences is a -// mess. The additions of rewindability and contigs-of-interest did -// not fit well with the original design, and have been patched in -// with bandaids. The author hopes to rectify this in the future. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C i/o stuff -#include // standard C string stuff -#include // standard C upper/lower stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff - -#define sequences_owner // (make this the owner of its globals) -#include "sequences.h" // interface to this module - - -#define pathSlash '/' -#ifdef compileForWindows -#undef pathSlash -#define pathSlash '\\' -#endif - -//---------- -// -// debug set ups -// -//---------- - -#define sequence_filename(seq) (((seq)->filename != NULL)? (seq)->filename : "(unnamed sequence file)") - -//--- debug set up for debugging the contigs-of-interest names file --- -// .. 
and also chores file - -//#define debugNamesFile - -#ifndef debugNamesFile -#define debugNamesFile_1 ; -#define debugNamesFile_2 ; -#define debugNamesFile_3 ; -#define debugNamesFile_4 ; -#define debugNamesFile_5 ; -#define debugNamesFile_6 ; -#define debugNamesFile_7 ; -#define debugNamesFile_8 ; -#define debugNamesFile_9 ; -#define debugNamesFile_10 ; -#define debugNamesFile_11 ; -#define debugNamesFile_12 ; -#define debugNamesFile_13 ; -#define debugNamesFile_14 ; -#define debugNamesFile_15 ; -#define debugNamesFile_16 ; -#define debugNamesFile_17 ; -#define debugNamesFile_18 ; -#define debugNamesFile_19 ; -#define debugNamesFile_20 ; -#define debugNamesFile_21 ; -#define debugNamesFile_22 ; -#define debugNamesFile_23 ; -#endif // not debugNamesFile - -#ifdef debugNamesFile - -#define debugNamesFile_1 \ - fprintf (stderr, "load_sequence (%s):\n", sequence_filename(_seq)); \ - if ((_seq != NULL) && (_seq->preLoaded)) \ - fprintf (stderr, " preloaded, header: %s\n", _seq->header); - -#define debugNamesFile_2 \ - fprintf (stderr, " (back in load_sequence)\n"); - -#define debugNamesFile_3 \ - fprintf (stderr, "load_sequence_core (%s):\n", sequence_filename(_seq)); \ - if (!keeper) \ - fprintf (stderr, " (not a keeper)\n"); - -#define debugNamesFile_4 \ - fprintf (stderr, " header: %s\n", _seq->header); - -#define debugNamesFile_5 \ - fprintf (stderr, "locate_hsx_first_sequence:" \ - " single contig-of-interest" \ - " hsx.contigFilePos <- %010lX\n", \ - (long unsigned int) _seq->hsx.contigFilePos); - -#define debugNamesFile_6 \ - fprintf (stderr, "locate_hsx_first_sequence:" \ - " no contig names" \ - " hsx.contigFilePos <- %010lX\n", \ - (long unsigned int) _seq->hsx.contigFilePos); - -#define debugNamesFile_7 \ - fprintf (stderr, "load_hsx_sequence:" \ - " hsx.contigFilePos == %010lX\n", \ - (long unsigned int) _seq->hsx.contigFilePos); - -#define debugNamesFile_8 \ - fprintf (stderr, "load_hsx_sequence:" \ - " hsx.contigFilePos <- %010lX\n", \ - (long unsigned 
int) _seq->hsx.contigFilePos); - -#define debugNamesFile_9 \ - fprintf (stderr, "lookup_hsx_sequence(%s):\n", name); \ - fprintf (stderr, " bucket == %08X\n hash entry == %010lX\n", \ - (unsigned int) bucket, (long unsigned int) fileOffset); - -#define debugNamesFile_10 \ - fprintf (stderr, " bucketStart == %010lX\n", \ - (long unsigned int) bucketStart); - -#define debugNamesFile_11 \ - fprintf (stderr, "another_sequence (%s):\n", sequence_filename(_seq)); - -#define debugNamesFile_12 \ - long int fpos; \ - fprintf (stderr, "find_next_general_fasta_coi (%s):\n", sequence_filename(_seq)); - -#define debugNamesFile_13 \ - long int fpos; \ - fprintf (stderr, "find_next_fastq_coi (%s):\n", sequence_filename(_seq)); - -#define debugNamesFile_14 \ - fpos = ftell (_seq->f) - _seq->pendingLen - 1; - -#define debugNamesFile_15 \ - { \ - char headerSaveCh = header[headerLen]; \ - header[headerLen] = 0; \ - fprintf (stderr, " test: [%08lX] %s\n", fpos, header); \ - header[headerLen] = headerSaveCh; \ - } - -#define debugNamesFile_16 \ - fprintf (stderr, " found: [%08lX] %s\n", fpos, _seq->nextContigName); - -#define debugNamesFile_17 \ - fprintf (stderr, "find_next_2bit_coi (%s): [%s]\n", \ - sequence_filename(_seq), _seq->nextContigName); - -#define debugNamesFile_18 \ - fprintf (stderr, "find_next_hsx_coi (%s): [%s]\n", \ - sequence_filename(_seq), _seq->nextContigName); - -#define debugNamesFile_19 \ - fprintf (stderr, "load_hsx_sequence:" \ - " hsx.contigFilePos <- %010lX\n", \ - (long unsigned int) _seq->hsx.contigFilePos); - -#define debugNamesFile_20 \ - fprintf (stderr, "read_contig_name: %s\n", line); - -#define debugNamesFile_21 \ - static int choreNumber = 0; - -#define debugNamesFile_22 \ - choreNumber++; \ - fprintf (stderr, "read_chore #%d: %s\n", choreNumber, line); - -#define debugNamesFile_23 \ - fprintf (stderr, "read_chore -->"); \ - \ - if (_seq->chore.tSubrange) \ - fprintf (stderr, " %s " unsposFmt " " unsposFmt, \ - _seq->chore.tName, \ - 
_seq->chore.tStart, _seq->chore.tEnd); \ - else \ - fprintf (stderr, " %s * *", \ - _seq->chore.tName); \ - \ - if (_seq->chore.qSubrange) \ - fprintf (stderr, " %s " unsposFmt " " unsposFmt, \ - _seq->nextContigName, \ - _seq->chore.qStart, _seq->chore.qEnd); \ - else \ - fprintf (stderr, " %s * *", \ - _seq->nextContigName); \ - \ - if (_seq->chore.qStrand == 0) fprintf (stderr, " +"); \ - else if (_seq->chore.qStrand < 0) fprintf (stderr, " -"); \ - \ - if (_seq->chore.idTag[0] != 0) \ - fprintf (stderr, " id=%s", _seq->chore.idTag); \ - \ - if ((header != NULL) && (strcmp (header, _seq->nextContigName) == 0)) \ - fprintf (stderr, " (same as existing header)"); \ - \ - fprintf (stderr, "\n"); - -#endif // debugNamesFile - - -//--- debug set up for debugging partitioned sequences --- - -//#define debugPartitions - -#ifndef debugPartitions -#define debugPartitions_1a ; -#define debugPartitions_1b ; -#define debugPartitions_2 ; -#define debugPartitions_3 ; -#define debugPartitions_4 ; -#endif // not debugPartitions - -#ifdef debugPartitions - -#define debugPartitions_1a \ - if (sp->p != NULL) \ - { \ - print_partition_table (stderr, _seq); \ - print_sequence (stderr, _seq, "", 100); \ - } - -#define debugPartitions_1b \ - if (sp->p != NULL) \ - { \ - print_partition_table (stderr, _seq); \ - } - -#define debugPartitions_2 \ - if (_seq->filename == NULL) \ - fprintf (stderr, "lookup_partition(.," unsposFmt ")\n", \ - pos); \ - else \ - fprintf (stderr, "lookup_partition(%s," unsposFmt ")\n", \ - _seq->filename, pos); \ - fprintf (stderr, " p[%d].sepBefore=" unsposFmt \ - " p[%d].sepBefore=" unsposFmt "\n", \ - lo, p[lo].sepBefore, hi, p[hi].sepBefore); - -#define debugPartitions_3 \ - fprintf (stderr, " p[%d].sepBefore=" unsposFmt \ - " p[%d].sepBefore=" unsposFmt \ - " p[%d].sepBefore=" unsposFmt "\n", \ - lo, p[lo].sepBefore, ix, p[ix].sepBefore, hi, p[hi].sepBefore); - -#define debugPartitions_4 \ - fprintf (stderr, " -> %u." 
unsposFmt "\n", ix, pos); - -#endif // debugPartitions - - -//#define debugSeparation - -#ifndef debugSeparation -#define debugSeparation_1 ; -#define debugSeparation_2 ; -#define debugSeparation_3 ; -#define debugSeparation_4 ; -#define debugSeparation_5 ; -#define debugSeparation_6 ; -#define debugSeparation_7 ; -#define debugSeparation_8 ; -#define debugSeparation_9 ; -#define debugSeparation_10 ; -#endif // not debugSeparation - -#ifdef debugSeparation - -#define debugSeparation_1 \ - if (sp->p != NULL) \ - { \ - print_partition_table (stderr, _seq); \ - print_sequence (stderr, _seq, "", 100); \ - } - -#define debugSeparation_2 \ - fprintf (stderr, " incoming separator at " unsposFmt "\n", \ - (unspos) ((scan-1)-_seq->v)); - -#define debugSeparation_3 \ - fprintf (stderr, " extraPieces=%d\n", \ - extraPieces); - -#define debugSeparation_4 \ - fprintf (stderr, " copying sentinel from partition %d" \ - " to partition %d\n", \ - fromIx, toIx); - -#define debugSeparation_5 \ - fprintf (stderr, " scanning partition %d," \ - " from " unsposFmt \ - " thru " unsposFmt \ - " (to partition = %d)\n", \ - fromIx, sepPrefix+1, sepSuffix-1, toIx); - -#define debugSeparation_6 \ - fprintf (stderr, " seq->v[" unsposFmt "]=%c\n", \ - scan-_seq->v, ch); - -#define debugSeparation_7 \ - fprintf (stderr, " sepAfter=" unsposFmt \ - " (ch=%c)\n", \ - sepAfter, ch); - -#define debugSeparation_8 \ - fprintf (stderr, " copying from partition %d" \ - " to partition %d" \ - " sepBefore=" unsposFmt \ - " sepAfter=" unsposFmt \ - " startLoc=" unsposFmt "\n", \ - fromIx, toIx, sepBefore, sepAfter, startLoc); - -#define debugSeparation_9 \ - fprintf (stderr, " done scanning partitions," \ - " (to partition = %d)\n", \ - toIx); - -#define debugSeparation_10 \ - print_partition_table (stderr, _seq); \ - print_sequence (stderr, _seq, "", 100); - - -#endif // debugSeparation - - -//--- debug set up for debugging problems with reading binary files --- - -//#define debugBinaryFile - -#ifdef 
debugBinaryFile - -static FILE* dbg_fopen_or_die (const char* name, const char* mode); -static FILE* dbg_fopen_or_die (const char* name, const char* mode) - { - FILE* f = fopen_or_die (name, mode); - fprintf (stderr, "fopen (\"%s\", \"%s\") = %08X\n", - name, mode, (u32)f); - return f; - } - -static void dbg_rewind (FILE *f); -static void dbg_rewind (FILE *f) - { - fprintf (stderr, "rewind (%08X)\n", (u32)f); - rewind (f); - } - -static int dbg_fseek (FILE *f, long int offset, int whence); -static int dbg_fseek (FILE *f, long int offset, int whence) - { - fprintf (stderr, "fseek (%08X, %08X, %d)\n", - (u32)f, (u32)offset, whence); - return fseek (f, offset, whence); - } - -static size_t dbg_fread (void *ptr, size_t size, size_t nmemb, FILE* f); -static size_t dbg_fread (void *ptr, size_t size, size_t nmemb, FILE* f) - { - fprintf (stderr, "fread (%08X, %08X, %08X, %08X)\n", - (u32)ptr, (u32)size, (u32)nmemb, (u32)f); - return fread (ptr, size, nmemb, f); - } - -#define fopen_or_die dbg_fopen_or_die -#define rewind dbg_rewind -#define fseek dbg_fseek -#define fread dbg_fread - -#endif // debugBinaryFile - - -//--- debug set up for debugging problems with reading text files --- - -//#define debugTextFile - -#ifndef debugTextFile -#define debugTextFile_1 ; -#define debugTextFile_2 ; -#define debugTextFile_3 ; -#define debugTextFile_4 ; -#define debugTextFile_5 ; -#define debugTextFile_6 ; -#define debugTextFile_7 ; -#endif // not debugTextFile - -#ifdef debugTextFile - -#define debugTextFile_1 \ - fprintf (stderr, "(for %s) rewinding\n", \ - _seq->filename); - -#define debugTextFile_2 \ - fprintf (stderr, "(for %s) add_partition\n", \ - _seq->filename); - -#define debugTextFile_3 \ - fprintf (stderr, "(for %s) from file: %d --> %s\n", \ - _seq->filename, ch, char_to_description(ch)); - -#define debugTextFile_4 \ - fprintf (stderr, "(for %s) from buff: %d --> %s\n", \ - _seq->filename, ch, char_to_description(ch)); - -#define debugTextFile_5 \ - fprintf (stderr, "(for 
%s) to buff: %d --> %s\n", \ - _seq->filename, ch, char_to_description(ch)); - -#define debugTextFile_6 \ - fprintf (stderr, "(for %s) saving file state\n", \ - _seq->filename); - -#define debugTextFile_7 \ - fprintf (stderr, "(for %s) restoring file state\n", \ - _seq->filename); - -#endif // debugTextFile - - -//--- debug set up for debugging problems with reading text files --- - -//#define debugFastaFile - -#ifndef debugFastaFile -#define debugFastaFile_1 ; -#define debugFastaFile_2 ; -#endif // not debugFastaFile - -#ifdef debugFastaFile - -#define debugFastaFile_1 \ - if (keeper) \ - fprintf (stderr, "load_fasta_sequence(%s,keeper)\n", \ - _seq->filename); \ - else \ - fprintf (stderr, "load_fasta_sequence(%s,NOT keeper)\n", \ - _seq->filename); - -#define debugFastaFile_2 \ - fprintf (stderr, " header = \"%s\"\n", _seq->header); - -#endif // debugFastaFile - - -//--- debug set up for debugging sequence cloning --- - -//#define debugCloning - -#ifndef debugCloning -#define debugCloning_1 ; -#define debugCloning_2 ; -#endif // not debugCloning - -#ifdef debugCloning - -#define debugCloning_1 \ - fprintf (stderr, "cloning this seq:\n-----------------\n"); \ - dump_sequence_state (stderr, _seq); \ - fprintf (stderr, "\n"); - -#define debugCloning_2 \ - fprintf (stderr, "newSeq:\n-------\n"); \ - dump_sequence_state (stderr, newSeq); \ - fprintf (stderr, "\n"); - -#endif // debugCloning - - -//--- debug set up for debugging "fencing" used to facilitate chores --- - -//#define debugFencing - -#ifndef debugFencing -#define debugFencing_1 ; -#define debugFencing_2 ; -#define debugFencing_3 ; -#define debugFencing_4 ; -#define debugFencing_5 ; -#define debugFencing_6 ; -#define debugFencing_7 ; -#define debugFencing_8 ; -#endif // not debugFencing - -#ifdef debugFencing - -#define debugFencing_1 \ - fprintf (stderr, "fence: "); - -#define debugFencing_2 \ - fprintf (stderr, " left: [" unsposFmt "]=%02X", \ - _seq->leftFencePos, _seq->leftFenceCh); - -#define 
debugFencing_3 \ - fprintf (stderr, " right: [" unsposFmt "]=%02X", \ - _seq->rightFencePos, _seq->rightFenceCh); - -#define debugFencing_4 \ - fprintf (stderr, "\n"); - -#define debugFencing_5 \ - fprintf (stderr, "unfence:"); - -#define debugFencing_6 \ - fprintf (stderr, " left: [" unsposFmt "]=%02X", \ - _seq->leftFencePos, _seq->leftFenceCh); - -#define debugFencing_7 \ - fprintf (stderr, " right: [" unsposFmt "]=%02X", \ - _seq->rightFencePos, _seq->rightFenceCh); - -#define debugFencing_8 \ - fprintf (stderr, "\n"); - -#endif // debugFencing - -//---------- -// -// private global data relating to fasta and csfasta format -// -//---------- - -// tables to map 8-bit ascii character to fasta or csfasta character type -// "nucleotide" characters are the A, C, G, T and N -// "ambiguous" characters are the remaining IUPAC 15-letter alphabet - -enum - { _bad = 0, _whitespace, _newline, _nucleotide, _ambiguous, _color }; - -#define __ _bad -#define _w _whitespace -#define _l _newline -#define _n _nucleotide -#define _a _ambiguous -#define _c _color - -static const u8 char_to_fasta_type[256] = - { - __,__,__,__,__,__,__,__,__,_w,_l,__,_w,_l,__,__, // 0x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 1x - _w,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 2x - _w,_w,_w,_w,_w,_w,_w,_w,_w,_w,__,__,__,__,__,__, // 3x (numbers) - __,_n,_a,_n,_a,__,__,_n,_a,__,__,_a,__,_a,_n,__, // 4x (upper case) - __,__,_a,_a,_n,__,_a,_a,_n,_a,__,__,__,__,__,__, // 5x (upper case) - __,_n,_a,_n,_a,__,__,_n,_a,__,__,_a,__,_a,_n,__, // 6x (lower case) - __,__,_a,_a,_n,__,_a,_a,_n,_a,__,__,__,__,__,__, // 7x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 8x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 9x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ax - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Bx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Cx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Dx - 
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ex - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__ // Fx - }; - -static const u8 char_to_csfasta_type[256] = - { - __,__,__,__,__,__,__,__,__,_w,_l,__,_w,_l,__,__, // 0x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 1x - _w,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 2x - _c,_c,_c,_c,__,__,__,__,__,__,__,__,__,__,__,__, // 3x (numbers) - __,_n,__,_n,__,__,__,_n,__,__,__,__,__,__,__,__, // 4x (upper case) - __,__,__,__,_n,__,__,__,__,__,__,__,__,__,__,__, // 5x (upper case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 6x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 7x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 8x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 9x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ax - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Bx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Cx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Dx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ex - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__ // Fx - }; - -#undef __ -#undef _w -#undef _l -#undef _n -#undef _a -#undef _c - -//---------- -// -// private global data relating to nib format -// -//---------- - -// nib file magic number(s) - -static const u32 nibMagicBig = 0x6BE93D3A; // in big endian format -static const u32 nibMagicLittle = 0x3A3DE96B; // in little endian format - -// tables to map 4-bit nybbles from nib file to a character -// -// nybbles map as follows: -// nybble: 0 1 2 3 4 5 6 7 8 9 A B C D E F -// character: T C A G N X X X t c a g n x x x -// For (alleged) efficiency's sake, we use separate lookup tables for the left -// and right nybble mapping. 
- -static const unsigned char nibTo1stChar[256] = - "TTTTTTTTTTTTTTTT" - "CCCCCCCCCCCCCCCC" - "AAAAAAAAAAAAAAAA" - "GGGGGGGGGGGGGGGG" - "NNNNNNNNNNNNNNNN" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX" - "tttttttttttttttt" - "cccccccccccccccc" - "aaaaaaaaaaaaaaaa" - "gggggggggggggggg" - "nnnnnnnnnnnnnnnn" - "xxxxxxxxxxxxxxxx" - "xxxxxxxxxxxxxxxx" - "xxxxxxxxxxxxxxxx"; - -static const unsigned char nibTo2ndChar[256] = - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx" - "TCAGNXXXtcagnxxx"; - -static const unsigned char nibTo1stCharUnmasked[256] = - "TTTTTTTTTTTTTTTT" - "CCCCCCCCCCCCCCCC" - "AAAAAAAAAAAAAAAA" - "GGGGGGGGGGGGGGGG" - "NNNNNNNNNNNNNNNN" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX" - "TTTTTTTTTTTTTTTT" - "CCCCCCCCCCCCCCCC" - "AAAAAAAAAAAAAAAA" - "GGGGGGGGGGGGGGGG" - "NNNNNNNNNNNNNNNN" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX" - "XXXXXXXXXXXXXXXX"; - -static const unsigned char nibTo2ndCharUnmasked[256] = - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX" - "TCAGNXXXTCAGNXXX"; - -//---------- -// -// private global data relating to 2bit format -// -//---------- - -// 2bit file magic number(s) - -static const u32 twobitMagicBig = 0x1A412743; // in big endian format -static const u32 twobitMagicLittle = 0x4327411A; // in little endian format - -static const char* twobitToChars[256] = - { - "TTTT","TTTC","TTTA","TTTG","TTCT","TTCC","TTCA","TTCG", - "TTAT","TTAC","TTAA","TTAG","TTGT","TTGC","TTGA","TTGG", - 
"TCTT","TCTC","TCTA","TCTG","TCCT","TCCC","TCCA","TCCG", - "TCAT","TCAC","TCAA","TCAG","TCGT","TCGC","TCGA","TCGG", - "TATT","TATC","TATA","TATG","TACT","TACC","TACA","TACG", - "TAAT","TAAC","TAAA","TAAG","TAGT","TAGC","TAGA","TAGG", - "TGTT","TGTC","TGTA","TGTG","TGCT","TGCC","TGCA","TGCG", - "TGAT","TGAC","TGAA","TGAG","TGGT","TGGC","TGGA","TGGG", - - "CTTT","CTTC","CTTA","CTTG","CTCT","CTCC","CTCA","CTCG", - "CTAT","CTAC","CTAA","CTAG","CTGT","CTGC","CTGA","CTGG", - "CCTT","CCTC","CCTA","CCTG","CCCT","CCCC","CCCA","CCCG", - "CCAT","CCAC","CCAA","CCAG","CCGT","CCGC","CCGA","CCGG", - "CATT","CATC","CATA","CATG","CACT","CACC","CACA","CACG", - "CAAT","CAAC","CAAA","CAAG","CAGT","CAGC","CAGA","CAGG", - "CGTT","CGTC","CGTA","CGTG","CGCT","CGCC","CGCA","CGCG", - "CGAT","CGAC","CGAA","CGAG","CGGT","CGGC","CGGA","CGGG", - - "ATTT","ATTC","ATTA","ATTG","ATCT","ATCC","ATCA","ATCG", - "ATAT","ATAC","ATAA","ATAG","ATGT","ATGC","ATGA","ATGG", - "ACTT","ACTC","ACTA","ACTG","ACCT","ACCC","ACCA","ACCG", - "ACAT","ACAC","ACAA","ACAG","ACGT","ACGC","ACGA","ACGG", - "AATT","AATC","AATA","AATG","AACT","AACC","AACA","AACG", - "AAAT","AAAC","AAAA","AAAG","AAGT","AAGC","AAGA","AAGG", - "AGTT","AGTC","AGTA","AGTG","AGCT","AGCC","AGCA","AGCG", - "AGAT","AGAC","AGAA","AGAG","AGGT","AGGC","AGGA","AGGG", - - "GTTT","GTTC","GTTA","GTTG","GTCT","GTCC","GTCA","GTCG", - "GTAT","GTAC","GTAA","GTAG","GTGT","GTGC","GTGA","GTGG", - "GCTT","GCTC","GCTA","GCTG","GCCT","GCCC","GCCA","GCCG", - "GCAT","GCAC","GCAA","GCAG","GCGT","GCGC","GCGA","GCGG", - "GATT","GATC","GATA","GATG","GACT","GACC","GACA","GACG", - "GAAT","GAAC","GAAA","GAAG","GAGT","GAGC","GAGA","GAGG", - "GGTT","GGTC","GGTA","GGTG","GGCT","GGCC","GGCA","GGCG", - "GGAT","GGAC","GGAA","GGAG","GGGT","GGGC","GGGA","GGGG" - }; - -//---------- -// -// private global data relating to hsx format -// -//---------- - -// hsx file magic number(s) - -static const u32 hsxMagicBig = 0xD2527095; // in big endian format -static const u32 hsxMagicLittle = 
0x957052D2; // in little endian format - -static const u64 hsxMsBit5 = ((u64) 0x80) << (4*8); -static const u64 hsxMaxFilePos = (u64) ((long int) -1); - -//---------- -// -// private global data relating to quantum-dna format -// -//---------- - -// quantum-dna file magic number(s) - -static const u32 qdnaMagicBig = 0xC4B47197; // in big endian format -static const u32 qdnaMagicLittle = 0x9771B4C4; // in little endian format -static const u32 oldQdnaMagicBig = 0xF656659E; // in big endian format -static const u32 oldQdnaMagicLittle = 0x9E6556F6; // in little endian format - -//---------- -// -// prototypes for private functions -// -//---------- - -static seq* alloc_sequence_record (char* id); -static void skip_sequences (seq* _seq, int skipCount); -static void load_sequence_core (seq* _seq, int keeper); -static void load_fasta_sequence (seq* _seq, int keeper); -static void parse_fasta_header (seq* _seq); -static unspos parse_fasta (seq* _seq, int storeEm); -static void load_fastq_sequence (seq* _seq, int keeper); -static void parse_fastq_header (seq* _seq); -static unspos parse_fastq (seq* _seq); -static int fastq_skip_content (seq* _seq); -static void load_csfasta_sequence (seq* _seq, int keeper); -static void parse_csfasta_header (seq* _seq); -static unspos parse_csfasta (seq* _seq, int storeEm); -static void load_nib_sequence (seq* _seq, int keeper); -static void read_2bit_header (seq* _seq); -static void load_2bit_sequence (seq* _seq, int keeper); -static void read_hsx_header (seq* _seq); -static void locate_hsx_first_sequence (seq* _seq); -static void load_hsx_sequence (seq* _seq, int keeper); -static void load_qdna_sequence (seq* _seq, int keeper); -static int another_sequence_core (seq* _seq); -static void create_short_header (seq* _seq); -static int find_next_fasta_coi (seq* _seq); -static int find_next_fastq_coi (seq* _seq); -static int find_next_csfasta_coi (seq* _seq); -static int find_next_general_fasta_coi (seq* _seq, int allowComments); -static int 
find_next_2bit_coi (seq* _seq); -static int find_next_hsx_coi (seq* _seq); -static int read_contig_name (seq* _seq); -static int read_chore (seq* _seq); -static void shorten_header (char* src, int nameParseType, int skipPath, - char** dst, u32* dstSize); -static void whitespace_to_under (char* s, int sLen); -static void expand_nickname (char* src, u32 contigNumber, - char** dst, u32* dstSize); -static void separate_sequence (seq* _seq, char sepCh); -static void add_partition (seq* _seq, unspos sepPos, - unspos startLoc, unspos trueLenOffset); -static void copy_partitions (seq* seqTo, seq* seqFrom); -static void enough_partitions (seq* _seq, u32 numPartitions, u32 poolSize, - int anticipate, int roundUp); -static void parse_sequence_name (const char* name, - char** filename, char** nickname, - char** contigOfInterest, - char** namesFilename, - char** choresFilename, - int* subsampleK, int* subsampleN, - char** softMaskFilename, int* softMaskComplement, - char** xMaskFilename, int* xMaskComplement, - char** nMaskFilename, int* nMaskComplement, - int* nameParseType, - char** nameTrigger, - int* doRevCompFlags, - int* doUnmask, - int* doPartitioning, int* doJoin, - char* separatorCh, - int* useFullNames, - int* fileType, - int* isQuantum, char** qCodingFilename, - unspos* _start, unspos* _end, - int* endIsSoft); -static int detect_file_type (seq* _seq); -static u32 read_4 (seq* _seq, int asBigEndian); -static u32 read_4_big (seq* _seq); -static u32 read_4_little (seq* _seq); -static u64 read_5 (seq* _seq, int asBigEndian); -static u64 read_5_big (seq* _seq); -static u64 read_5_little (seq* _seq); -static u64 read_6 (seq* _seq, int asBigEndian); -static u64 read_6_big (seq* _seq); -static u64 read_6_little (seq* _seq); -static int skip_seq_whitespace (seq* _seq); -static int seq_getc (seq* _seq); -static void seq_ungetc (char ch, seq* _seq); -static int skip_chars (seq* _seq, u32 toSkip); -static int test_rewindability (seq* _seq); -static void save_fstate (seq* _seq); 
-static void restore_fstate (seq* _seq); - -//---------- -// -// open_sequence_file-- -// Open a sequence file for read operations. -// open_rewindable_sequence_file-- -// Open a sequence file for read operations, which may be rewound later. Note -// that if the actual file is not rewindable, but only contains one sequence, -// we can still satisfy the caller's desire for rewindability. -// -//---------- -// -// Arguments: -// char* name: The name of the file that sequence data is to be -// .. read from. This can be NULL, which indicates -// .. that data is to be read from stdin. Note that -// .. the name may have actions attached as a suffix. -// int fileType: The type of file, e.g. seq_type_nib. This can be -// .. seq_type_unknown if the caller would like the -// .. type to be determined from the file' contents. -// int choresAllowed: true => an "alignment chores" action is allowed. -// char* choresFilename: The name of the file to read "alignment chores" -// .. from. This can be NULL, in which case there -// .. are no *specific* chores to perform. Note that, -// .. even if this is NULL, a chores file may be -// .. specified by an action attached to the file -// .. name. -// unspos allocLen: If non-zero, pre-allocate for a sequence of this -// .. length. Zero indicates that the caller doesn't -// .. have any pre-allocation preference. -// int needTrueLen: true => set seq->trueLen correctly, even if this -// .. means reading additional characters -// .. outside the desired (sub)interval -// false => the value written to trueLen is unimportant -// int allowAmbiDNA: true => permit ambiguous DNA characters -// .. B,D,H,K,M,R,S,V,W,Y -// false => only A,C,G,T,N,X permitted -// u8* qToComplement: (similar to nuc_to_complement) array to map a -// .. quantum base to its complement. This is only -// .. used if the sequence is quantum DNA. This may -// .. be NULL (in which case the sequence can not be -// .. reverse-complemented). 
-// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). -// -//---------- -// -// Implementation notes: -// -// To satisfy the caller's request that the file be rewindable, we perform the -// initial sequence load here (and set a flag to let load_sequence know this -// has happened). Then we check whether the file contains any additional data. -// If it doesn't, then we treat the file as rewindable regardless of whether -// the underlying file actualy is. Only if the file contains additional data -// do we then perform a test for rewindability, by attempting to set the file's -// position back to the end of the first sequence. -// -//---------- - -static seq* private_open_sequence_file (char* name, int fileType, - int choresAllowed, char* choresFilename, - unspos allocLen, - int beRewindable, int needTrueLen, int allowAmbiDNA, - u8* qToComplement); - -seq* open_sequence_file (char* name, int fileType, int choresAllowed, char* choresFilename, unspos allocLen, int needTrueLen, int allowAmbiDNA, u8* qToComplement) - { return private_open_sequence_file (name, fileType, choresAllowed, choresFilename, allocLen, false, needTrueLen, allowAmbiDNA, qToComplement); } - -seq* open_rewindable_sequence_file (char* name, int fileType, int choresAllowed, char* choresFilename, unspos allocLen, int needTrueLen, int allowAmbiDNA, u8* qToComplement) - { return private_open_sequence_file (name, fileType, choresAllowed, choresFilename, allocLen, true, needTrueLen, allowAmbiDNA, qToComplement); } - -static seq* private_open_sequence_file - (char* name, - int fileType, - int choresAllowed, - char* choresFilename, - unspos allocLen, - int beRewindable, - int needTrueLen, - int allowAmbiDNA, - u8* qToComplement) - { - seq* _seq; - int isQuantum = false; - char* qCodingFilename = NULL; - int forcedfileType = seq_type_unknown; - char* header; - int searchForContig; - int err; - - // allocate the 
sequence tracking structure - - _seq = alloc_sequence_record ("open_sequence"); - _seq->vOwner = true; // (even though _seq->v is NULL, we - _seq->vcOwner = true; // .. still will be considered as the - _seq->vqOwner = true; // .. 'owner' so we can resize it) - _seq->partition.poolOwner = true; // (similarly, we are owner of partition - // .. names pool so we can resize it) - _seq->headerOwner = true; // (and we're owner of header[], - _seq->shortHeaderOwner = true; // .. shortHeader[] and trueHeader[] - _seq->trueHeaderOwner = true; // .. so we can resize them) - - _seq->pendingChars = zalloc_or_die ("open_sequence (pendingChars)", - seqBufferSize); - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - _seq->pendingLen = 0; - - _seq->lockedHeader = false; - _seq->needTrueLen = needTrueLen; - _seq->allowAmbiDNA = allowAmbiDNA; - - // if we have no name, use stdin - - if (name == NULL) - { - _seq->filename = copy_string ("(stdin)"); - _seq->f = stdin; - } - - // otherwise, open the file - // nota bene: we'd like to open the file as "rt" instead of "rb" if it is - // a fasta file; unfortunately we don't know what the file - // type is until we open it - - else - { - if (choresFilename != NULL) _seq->choresFilename = copy_string (choresFilename); - else _seq->choresFilename = NULL; - - parse_sequence_name (name, - &_seq->filename, &_seq->header, - &_seq->contigOfInterest, - &_seq->namesFilename, - &_seq->choresFilename, - &_seq->subsampleK, &_seq->subsampleN, - &_seq->softMaskFilename, &_seq->softMaskComplement, - &_seq->xMaskFilename, &_seq->xMaskComplement, - &_seq->nMaskFilename, &_seq->nMaskComplement, - &_seq->nameParseType, - &_seq->nameTrigger, - &_seq->doRevCompFlags, - &_seq->doUnmask, - &_seq->doPartitioning, &_seq->doJoin, - &_seq->separatorCh, - &_seq->useFullNames, - &forcedfileType, - &isQuantum, &qCodingFilename, - &_seq->startLimit, &_seq->endLimit, - &_seq->endIsSoft); - _seq->f = fopen_or_die (_seq->filename, "rb"); - if (_seq->header != NULL) 
- { - _seq->headerSize = strlen (_seq->header) + 1; - _seq->lockedHeader = true; - _seq->hasNickname = true; - } - } - - if ((!choresAllowed) && (_seq->choresFilename != NULL)) - suicidef ("can't use [chores] for the target file (%s)\n" - "move [chores] to the query file, or use the --chores option", - sequence_filename(_seq)); - - if ((_seq->doJoin) && (_seq->choresFilename != NULL)) - { - if (choresAllowed) suicidef ("can't use --chores with [multiple]"); - else suicidef ("can't use [chores] with [multiple]"); - } - - // init any non-zero fields - - _seq->hasSavedState = false; - _seq->rewindable = -1; // (rewindability unknown at this point) - _seq->contig = 0; - - if (isQuantum) - { - if ((fileType != seq_type_qdna) && (fileType != seq_type_unknown)) - suicidef ("clashing file type for %s ([quantum] used for %s file)", - sequence_filename(_seq), seqTypeNames[fileType]); - _seq->fileType = seq_type_qdna; - } - else if (forcedfileType != seq_type_unknown) - { - if ((fileType != forcedfileType) && (fileType != seq_type_unknown)) - suicidef ("clashing file type for %s (%s used for %s file)", - sequence_filename(_seq), seqTypeNames[forcedfileType], seqTypeNames[fileType]); - _seq->fileType = forcedfileType; - } - else - _seq->fileType = fileType; - - // initialize subsampling - - if (_seq->subsampleN == 1) - _seq->subsampleN = 0; // (subsampling 1 of 1 is meaningless) - - if (_seq->subsampleN == 0) - _seq->subsampleSkip = 0; - else - _seq->subsampleSkip = _seq->subsampleK - 1; - - // if the sequences in this file are to be joined into a parititioned - // sequence, initialize that - - if (_seq->doPartitioning) - { - enough_partitions (_seq, /*numPartitions*/ 100, /*poolSize*/ 0, - /*anticipate*/ false, /*round up*/ true); - _seq->partition.state = seqpart_empty; - } - - // if the file type is not yet known, figure out what it is - - if (_seq->fileType == seq_type_unknown) - _seq->fileType = detect_file_type (_seq); - - if ((!sequences_dbgAllowColors) - && 
(_seq->fileType == seq_type_csfasta)) - suicidef ("sorry, color space is not fully implemented yet"); - - if ((_seq->fileType != seq_type_2bit) - && (_seq->fileType != seq_type_hsx) - && (_seq->contigOfInterest != NULL)) - suicidef ("specific contig-of-interest only valid for 2bit or hsx files (%s)", - _seq->contigOfInterest); - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_csfasta) - && (_seq->fileType != seq_type_2bit) - && (_seq->fileType != seq_type_hsx) - && (_seq->namesFilename != NULL)) - suicidef ("sequence-subset file only valid for fasta, fastq, csfasta, 2bit or hsx files\n(%s)", - _seq->namesFilename); - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_csfasta) - && (_seq->fileType != seq_type_2bit) - && (_seq->fileType != seq_type_hsx) - && (_seq->choresFilename != NULL)) - suicidef ("chores file only valid for fasta, fastq, csfasta, 2bit or hsx files\n(%s)", - _seq->choresFilename); - - if ((_seq->fileType == seq_type_hsx) - && (parse_type(_seq->nameParseType) != name_parse_type_core)) - suicidef ("\"nameparse=\" is not valid for hsx files"); - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_csfasta) - && (_seq->fileType != seq_type_2bit) - && (parse_type(_seq->nameParseType) == name_parse_type_alnum)) - suicidef ("\"nameparse=alphanum\" only valid for fasta, fastq, csfasta or 2bit files"); - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_csfasta) - && (_seq->fileType != seq_type_2bit) - && (parse_type(_seq->nameParseType) == name_parse_type_darkspace)) - suicidef ("\"nameparse=darkspace\" only valid for fasta, fastq, csfasta or 2bit files"); - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_csfasta) - && (_seq->nameTrigger != 
NULL)) - suicidef ("\"nameparse=tag:%s\" only valid for fasta, fastq or csfasta files", _seq->nameTrigger); - - if ((_seq->fileType == seq_type_csfasta) - && (_seq->separatorCh != 0)) - suicidef ("[separator=%c] is not allowed for csfasta files", _seq->separatorCh); - - // for fastq files, bind allocation for vq (qualities) to v (nucleotides) - - if (_seq->fileType == seq_type_fastq) - _seq->needsVq = true; - - // pre-allocate; note that we can't do this earlier, since we wouldn't - // have known whether to allocate quality values - - if (allocLen != 0) - sequence_long_enough (_seq, allocLen, false); - - // for quantum DNA files, attach the complement mapping - - if (_seq->fileType == seq_type_qdna) - _seq->qToComplement = qToComplement; - - // make sure the file is rewindable if the caller requires it to be - - if (beRewindable) - { - if (load_sequence (_seq)) - { - _seq->preLoaded = true; - if (another_sequence (_seq)) - { - err = test_rewindability (_seq); - if (err != 0) goto not_rewindable; - _seq->rewindable = true; - } - } - } - - // for 2bit or hsx files, we need to read the header - // $$$ should this be moved to before the pre-load above? - - if (_seq->fileType == seq_type_2bit) - read_2bit_header (_seq); - else if (_seq->fileType == seq_type_hsx) - read_hsx_header (_seq); - - // if we have a contigs-of-interest file, open it, read the first line, and - // decide whether we'll need to advance to that contig - - _seq->contigPending = false; - searchForContig = false; - - if (_seq->namesFilename != NULL) - { - _seq->namesFile = fopen_or_die (_seq->namesFilename, "rt"); - if (!read_contig_name (_seq)) - suicidef ("contigs-of-interest file is empty: %s", _seq->namesFilename); - - searchForContig = true; - if (_seq->preLoaded) - { - header = (_seq->useFullNames)? 
_seq->header : _seq->shortHeader; - if (strcmp (header, _seq->nextContigName) == 0) - searchForContig = false; - } - } - - // if we have a chores file, open it, read the first chore, and decide - // whether we'll need to advance to that contig - - else if (_seq->choresFilename != NULL) - { - _seq->choresFile = fopen_or_die (_seq->choresFilename, "rt"); - _seq->choresLineNum = 0; - if (!read_chore (_seq)) - suicidef ("chores file is empty: %s", _seq->choresFilename); - - header = (_seq->useFullNames)? _seq->header : _seq->shortHeader; - - searchForContig = true; - if (_seq->preLoaded) - { - if (strcmp (header, _seq->nextContigName) == 0) - searchForContig = false; - } - else // if (!_seq->preLoaded) - { - if ((header != NULL) && (strcmp (header, _seq->nextContigName) == 0)) - { - searchForContig = false; - _seq->preLoaded = true; - validate_rev_comp (_seq); - } - } - } - - // if necessary, advance to the first specified contig - - if (searchForContig) - { - if (_seq->fileType == seq_type_fasta) - find_next_fasta_coi (_seq); - else if (_seq->fileType == seq_type_fastq) - find_next_fastq_coi (_seq); - else if (_seq->fileType == seq_type_csfasta) - find_next_csfasta_coi (_seq); - else if (_seq->fileType == seq_type_2bit) - find_next_2bit_coi (_seq); - else // if (_seq->fileType == seq_type_hsx) - find_next_hsx_coi (_seq); - } - - // if we have a quantum coding file, read it - - if (qCodingFilename != NULL) - { - _seq->qCoding = read_quantum_code_by_name (qCodingFilename); - free_if_valid ("open_sequence_file (qCodingFilename)", qCodingFilename); - } - - return _seq; - -// failure exits - -not_rewindable: - if (name == NULL) name = "(stdin)"; - suicidef_with_perror ("sequence file \"%s\" is not rewindable" - " (fseek returned %d): %s", - name, err, sequence_filename(_seq)); - return NULL; // (never gets here) - } - -//---------- -// -// rewind_sequence_file-- -// Rewind a sequence file, so that the sequence(s) within it can be read again. 
-// -//---------- -// -// Arguments: -// seq* _seq: The sequence to rewind. -// -// Returns: -// (nothing; failures cause program termination) -// -//---------- - -void rewind_sequence_file - (seq* _seq) - { - int err; - - if (_seq->f == NULL) return; // (no file to rewind) - - // decide whether we can take a short cut and ignore the rewind request - - if ((_seq->fileType == seq_type_2bit) // (we have only one - && (_seq->contigOfInterest != NULL)) // .. sequence in the file) - { - if (_seq->contig != 0) _seq->preLoaded = true; - return; - } - - if ((_seq->fileType == seq_type_hsx) // (we have only one - && (_seq->contigOfInterest != NULL)) // .. sequence in the file) - { - if (_seq->contig != 0) _seq->preLoaded = true; - return; - } - - if (_seq->contig < 2) // (we haven't read more than - { // .. one sequence from file) - if (_seq->contig != 0) _seq->preLoaded = true; - return; - } - - if (_seq->fileType == seq_type_2bit) - { - _seq->twoBit.contigFilePos = _seq->twoBit.indexFilePos; - _seq->twoBit.contigLoaded = false; - goto reset_file_data; - } - - if (_seq->fileType == seq_type_hsx) - { - _seq->hsx.contigLoaded = false; - goto reset_file_data; - } - - if (_seq->rewindable == -1) - _seq->rewindable = (test_rewindability (_seq) == 0); - - if (_seq->rewindable == false) - suicidef ("sequence file is not rewindable: %s", sequence_filename(_seq)); - - // rewind the file and reset the data - - debugTextFile_1; - rewind (_seq->f); - -reset_file_data: - _seq->len = 0; - _seq->contig = 0; - _seq->preLoaded = false; - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - _seq->pendingLen = 0; - _seq->hasSavedState = false; - - // re-initialize subsampling - - if (_seq->subsampleN == 0) - _seq->subsampleSkip = 0; - else - _seq->subsampleSkip = _seq->subsampleK - 1; - - // rewind the contigs-of-interest file - - if (_seq->namesFile != NULL) - { - err = fseek (_seq->namesFile, 0, SEEK_SET); - if (err != 0) - suicidef ("failed to seek to position in file\n" - "in 
rewind_sequence_file for %s, index fseek(%08lX) returned %d", - _seq->namesFilename, 0, err); - _seq->contigPending = false; - } - - // rewind the chores file - - if (_seq->choresFile != NULL) - { - err = fseek (_seq->choresFile, 0, SEEK_SET); - if (err != 0) - suicidef ("failed to seek to position in file\n" - "in rewind_sequence_file for %s, index fseek(%08lX) returned %d", - _seq->choresFilename, 0, err); - _seq->contigPending = false; - } - - if (_seq->fileType == seq_type_hsx) - locate_hsx_first_sequence (_seq); - } - -//---------- -// -// clone_sequence-- -// Make a copy of a sequence, so that the copy is in the same state as if -// open_rewindable_sequence_file() had been called. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to make a clone of. -// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). -// -//---------- - -seq* clone_sequence - (seq* _seq) - { - seq* newSeq; - - debugCloning_1; - newSeq = copy_sequence (_seq); - newSeq->preLoaded = true; - debugCloning_2; - - return newSeq; - } - -//---------- -// -// copy_sequence-- -// Make a copy of a sequence. Note that the copy is not as functional as the -// original. For example, file operations cannot be performed on the copy. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to make a copy of. -// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). 
-// -//---------- - -seq* copy_sequence - (seq* _seq) - { - seqpartition* sp = &_seq->partition; - seq* newSeq; - unspos ix; - - // allocate the sequence tracking structure - - newSeq = alloc_sequence_record ("copy_sequence"); - - newSeq->pendingChars = zalloc_or_die ("copy_sequence (pendingChars)", - seqBufferSize); - newSeq->pendingStack = newSeq->pendingChars + seqBufferSize; - newSeq->pendingLen = 0; - - // allocate the sequence content - - if ((_seq->v == NULL) || (_seq->len < 1)) - { - newSeq->v = NULL; - newSeq->vOwner = true; - newSeq->size = 0; - newSeq->len = 0; - newSeq->trueLen = 0; - } - else - { - if (_seq->len > maxSequenceLen) - suicidef ("in copy_sequence, " - "sequence length " unsposFmt " exceeds maximum (" unsposFmt ")", - _seq->len, maxSequenceLen); - - newSeq->v = malloc_or_die ("copy_sequence (v)", _seq->len+1); - newSeq->vOwner = true; - newSeq->size = _seq->len+1; - newSeq->len = _seq->len; - newSeq->trueLen = _seq->trueLen; - } - - if (_seq->vc == NULL) newSeq->vc = NULL; - else newSeq->vc = malloc_or_die ("copy_sequence (vc)", _seq->len+1); - newSeq->vcOwner = true; - - if (_seq->vq == NULL) newSeq->vq = NULL; - else newSeq->vq = malloc_or_die ("copy_sequence (vq)", _seq->len+1); - newSeq->vqOwner = true; - - // set up file info - - newSeq->fileType = _seq->fileType; - - if (_seq->filename != NULL) - newSeq->filename = copy_string (_seq->filename); - - // copy the sequence content (including a terminating zero) - - if (newSeq->v != NULL) - { - for (ix=0 ; ix<=newSeq->len ; ix++) - newSeq->v[ix] = _seq->v[ix]; - } - - newSeq->preLoaded = _seq->preLoaded; - newSeq->contig = _seq->contig; - - // copy the partition info - - if (sp->p != NULL) - { - if (sp->state != seqpart_ready) - suicidef ("internal error, attempt to copy sequence (%s) in partition state %d", - _seq->filename, sp->state); - newSeq->partition.state = seqpart_ready; - copy_partitions (/*to*/ newSeq, /*from*/ _seq); - } - - // copy other fields - - newSeq->startLoc = 
_seq->startLoc; - newSeq->revCompFlags = _seq->revCompFlags; - newSeq->contig = _seq->contig; - newSeq->lockedHeader = _seq->lockedHeader; - newSeq->allowAmbiDNA = _seq->allowAmbiDNA; - - if (_seq->header == NULL) - { - newSeq->header = NULL; - newSeq->headerSize = 0; - } - else - { - newSeq->header = copy_string (_seq->header); - newSeq->headerSize = strlen (newSeq->header) + 1; - } - newSeq->headerOwner = true; - - if (_seq->shortHeader == NULL) - { - newSeq->shortHeader = NULL; - newSeq->shortHeaderSize = 0; - } - else - { - newSeq->shortHeader = copy_string (_seq->shortHeader); - newSeq->shortHeaderSize = strlen (newSeq->shortHeader) + 1; - } - newSeq->shortHeaderOwner = true; - - if (_seq->trueHeader == NULL) - { - newSeq->trueHeader = NULL; - newSeq->trueHeaderSize = 0; - } - else - { - newSeq->trueHeader = copy_string (_seq->trueHeader); - newSeq->trueHeaderSize = strlen (newSeq->trueHeader) + 1; - } - newSeq->trueHeaderOwner = true; - - newSeq->startLimit = _seq->startLimit; - newSeq->endLimit = _seq->endLimit; - newSeq->endIsSoft = _seq->endIsSoft; - newSeq->useFullNames = _seq->useFullNames; - - if (_seq->contigOfInterest == NULL) - newSeq->contigOfInterest = NULL; - else - newSeq->contigOfInterest = copy_string (_seq->contigOfInterest); - - return newSeq; - } - -//---------- -// -// new_sequence-- -// Create a new, empty, sequence. -// -//---------- -// -// Arguments: -// unspos allocLen: The sequence length to allocate for. The special -// .. value seqposInfinity indicates that no memory should -// .. be allocated for the sequence vector. -// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). 
-// -//---------- - -seq* new_sequence - (unspos allocLen) - { - seq* _seq; - - // allocate the sequence tracking structure - - _seq = alloc_sequence_record ("new_sequence"); - - _seq->pendingChars = zalloc_or_die ("new_sequence (pendingChars)", - seqBufferSize); - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - _seq->pendingLen = 0; - - // allocate space for sequence data (but leave it empty) - - if (allocLen == seqposInfinity) - { - _seq->v = NULL; - _seq->vc = NULL; - _seq->vq = NULL; - _seq->vOwner = false; - _seq->vcOwner = false; - _seq->vqOwner = false; - _seq->size = 0; - _seq->len = 0; - } - else - { - _seq->v = malloc_or_die ("new_sequence (v)", allocLen+1); - _seq->vc = NULL; - _seq->vq = NULL; - _seq->vOwner = true; - _seq->vcOwner = true; - _seq->vqOwner = true; - _seq->size = allocLen+1; - _seq->len = 0; - _seq->v[0] = 0; - } - - // initialize the other fields - - _seq->fileType = seq_type_nofile; - _seq->contig = 1; - _seq->startLoc = 1; - _seq->trueLen = 0; - - return _seq; - } - -//---------- -// -// alloc_sequence_record-- -// Allocate a new sequence tracking structure, and make sure all pointer -// fields are NULL. -// -//---------- -// -// Arguments: -// char* id: an identifying string to be used when trackMemoryUsage is -// .. turned on; this can be NULL. -// -// Returns: -// A pointer to the sequence; failures result in fatality. The caller must -// eventually de-allocate this by calling free_sequence(). 
-// -//---------- - -static seq* alloc_sequence_record - (arg_dont_complain(char* id)) - { - seq* _seq; - - _seq = zalloc_or_die (id, sizeof(*_seq)); - - _seq->v = NULL; - _seq->vc = NULL; - _seq->vq = NULL; - _seq->needsVq = false; - _seq->pendingChars = NULL; - _seq->filename = NULL; - _seq->header = NULL; - _seq->shortHeader = NULL; - _seq->trueHeader = NULL; - _seq->f = NULL; - _seq->namesFile = NULL; - _seq->namesFilename = NULL; - _seq->choresFile = NULL; - _seq->choresFilename = NULL; - _seq->subsampleK = 0; - _seq->subsampleN = 0; - _seq->softMaskFilename = NULL; - _seq->xMaskFilename = NULL; - _seq->nMaskFilename = NULL; - _seq->nameTrigger = NULL; - _seq->contigOfInterest = NULL; - _seq->twoBit.nBlockStarts = NULL; - _seq->twoBit.nBlockSizes = NULL; - _seq->twoBit.mBlockstarts = NULL; - _seq->twoBit.mBlocksizes = NULL; - _seq->partition.p = NULL; - _seq->partition.pool = NULL; - _seq->allowAmbiDNA = false; - _seq->qToComplement = NULL; - - return _seq; - } - -//---------- -// -// sequence_long_enough-- -// Make sure a sequence has enough room (including an extra byte for a -// terminating zero). -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to check. -// unspos allocLen: The sequence length to allocate for (not including the -// .. terminator). -// int anticipate: true => allocate extra, anticipating the need for more -// false => don't -// -// Returns: -// nothing; the sequence's v[] (and possible vq, see note 1) may be modified; -// failures result in fatality. -// -//---------- -// -// Notes: -// -// (1) Normally we only allocate v[]. But if _seq->needsVq is true, we also -// allocate vq[], and keep it the same length as v[]. -// -//---------- - -void sequence_long_enough - (seq* _seq, - unspos allocLen, - int anticipate) - { - char* name; - - if (_seq->size >= allocLen+1) - return; - - allocLen += 2; // (add space for a terminating zero, - if (anticipate) // .. etc.) 
- allocLen += 30 + allocLen / 8; // anticipatory, grow by about 13% - allocLen = round_up_16K (allocLen); // we expect that allocation in - // .. multiples of 16K is better for - // .. the heap manager - - if (!_seq->vOwner) - { - name = (_seq->filename != NULL)? _seq->filename - : _seq->header; - suicidef ("internal error, attempt to resize external sequence (%s)", - name); - } - - if ((_seq->needsVq) && (!_seq->vqOwner)) - { - name = (_seq->filename != NULL)? _seq->filename - : _seq->header; - suicidef ("internal error, attempt to resize external qualities (%s)", - name); - } - - // fprintf (stderr, "re-allocating " unsposFmt " bytes\n", allocLen); - _seq->v = realloc_or_die ("sequence_long_enough", _seq->v, allocLen); - _seq->size = allocLen; - - if (_seq->needsVq) - _seq->vq = realloc_or_die ("sequence_long_enough (qualities)", _seq->vq, allocLen); - } - -//---------- -// -// free_sequence-- -// Deallocate a sequence, along with any associated memory or files. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to dispose of. 
-// -// Returns: -// (nothing) -// -//---------- - -void free_sequence - (seq* _seq) - { - if (_seq == NULL) return; - - if (_seq->vOwner) free_if_valid ("free_sequence (v)", _seq->v); - if (_seq->vcOwner) free_if_valid ("free_sequence (vc)", _seq->vc); - if (_seq->vqOwner) free_if_valid ("free_sequence (vq)", _seq->vq); - free_if_valid ("free_sequence (pendingChars)", _seq->pendingChars); - free_if_valid ("free_sequence (filename)", _seq->filename); - if (_seq->headerOwner) free_if_valid ("free_sequence (header)", _seq->header); - if (_seq->shortHeaderOwner) free_if_valid ("free_sequence (shortHeader)", _seq->shortHeader); - if (_seq->trueHeaderOwner) free_if_valid ("free_sequence (trueHeader)", _seq->trueHeader); - free_if_valid ("free_sequence (qCoding)", _seq->qCoding); - - free_if_valid ("free_sequence (contigOfInterest)", _seq->contigOfInterest); - - if (_seq->fileType != seq_type_nofile) - { - fclose_if_valid (_seq->f); - fclose_if_valid (_seq->namesFile); - free_if_valid ("free_sequence (namesFilename)", _seq->namesFilename); - fclose_if_valid (_seq->choresFile); - free_if_valid ("free_sequence (choresFilename)", _seq->choresFilename); - free_if_valid ("free_sequence (softMaskFilename)", _seq->softMaskFilename); - free_if_valid ("free_sequence (xMaskFilename)", _seq->xMaskFilename); - free_if_valid ("free_sequence (nMaskFilename)", _seq->nMaskFilename); - free_if_valid ("free_sequence (nameTrigger)", _seq->nameTrigger); - } - - free_if_valid ("free_sequence (twoBit.nBlockStarts)", _seq->twoBit.nBlockStarts); - free_if_valid ("free_sequence (twoBit.nBlockSizes)", _seq->twoBit.nBlockSizes); - free_if_valid ("free_sequence (twoBit.mBlockstarts)", _seq->twoBit.mBlockstarts); - free_if_valid ("free_sequence (twoBit.mBlocksizes)", _seq->twoBit.mBlocksizes); - - if (_seq->hsx.fileInfo != NULL) - { - u32 fileNum; - for (fileNum=0 ; fileNum<_seq->hsx.numFiles ; fileNum++) - fclose_if_valid (_seq->hsx.fileInfo[fileNum].f); - free_if_valid ("free_sequence 
(hsx.fileInfo)", _seq->hsx.fileInfo); - } - - free_if_valid ("free_sequence (partition.p)", _seq->partition.p); - if (_seq->partition.poolOwner) free_if_valid ("free_sequence (partition.pool)", _seq->partition.pool); - - free_if_valid ("free_sequence (_seq)", _seq); - } - -//---------- -// -// load_sequence-- -// Load the next sequence from the associated file. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. -// -// Returns: -// true if there was another sequence to load, false if not; failures other -// than eof result in fatality. -// -//---------- - -int load_sequence - (seq* _seq) - { - seqpartition* sp; - partition* p; - unspos sepPos; - unspos oldTrueLen; - - debugNamesFile_1; - - if (_seq == NULL) suicide ("load_sequence(NULL)"); - if (_seq->preLoaded) { _seq->preLoaded = false; return true; } - if (_seq->fileType == seq_type_nofile) return false; - if (!another_sequence (_seq)) return false; - // (another_sequence() may set _seq->preLoaded, so we must check a second time) - if (_seq->preLoaded) { _seq->preLoaded = false; return true; } - - debugNamesFile_2; - - // get rid of sequence data from previous load - - _seq->len = 0; - _seq->trueLen = 0; - - ////////// - // read the sequence data, either - // o sequence is not partitioned => read just one sequence from the file - // o sequence is partitioned => read all sequences from the file - ////////// - - sp = &_seq->partition; - if (!_seq->doPartitioning) - { - // not partitioned, just read the next sequence - - if (_seq->subsampleN > 0) - { // we're subsampling, skip sequences as appropriate - if (_seq->subsampleSkip > 0) - skip_sequences (_seq, _seq->subsampleSkip); - _seq->subsampleSkip = _seq->subsampleN-1; - } - - load_sequence_core (_seq, /*keeper*/ true); - } - - else if (_seq->doJoin) - { - // partitioned, read all the sequences - - sp->state = seqpart_loading; - - // write first separator - - sequence_long_enough (_seq, 1, false); - _seq->v[0] = 0; - _seq->len = 0; - sp->len 
= 0; - - // read all the sequences - - while (another_sequence (_seq)) - { - // load the next sequence - - sepPos = _seq->len++; // (advance length past the separator) - oldTrueLen = _seq->trueLen; - - if (_seq->subsampleN > 0) - { // we're subsampling, skip sequences as appropriate - if (_seq->subsampleSkip > 0) - skip_sequences (_seq, _seq->subsampleSkip); - _seq->subsampleSkip = _seq->subsampleN-1; - } - load_sequence_core (_seq, /*keeper*/ true); - add_partition (_seq, sepPos, _seq->startLoc, _seq->trueLen-oldTrueLen); - } - - // add final separator to the table - - p = &sp->p[sp->len-1]; - p->sepAfter = _seq->len; - - p = &sp->p[sp->len]; - p->sepBefore = _seq->len; - - sp->state = seqpart_ready; - } - - else if (!_seq->doJoin) - { - // partitioned, but only so as to allow separators, just read the next - // sequence - - sp->state = seqpart_loading; - - // write first separator - - sequence_long_enough (_seq, 1, false); - _seq->v[0] = 0; - _seq->len = 0; - sp->len = 0; - - // load the sequence - - sepPos = _seq->len++; // (advance length past the separator) - oldTrueLen = _seq->trueLen; - - if (_seq->subsampleN > 0) - { // we're subsampling, skip sequences as appropriate - if (_seq->subsampleSkip > 0) - skip_sequences (_seq, _seq->subsampleSkip); - _seq->subsampleSkip = _seq->subsampleN-1; - } - load_sequence_core (_seq, /*keeper*/ true); - add_partition (_seq, sepPos, _seq->startLoc, _seq->trueLen-oldTrueLen); - - // add final separator to the table - - p = &sp->p[sp->len-1]; - p->sepAfter = _seq->len; - - p = &sp->p[sp->len]; - p->sepBefore = _seq->len; - - sp->state = seqpart_ready; - } - - // apply any required operators to it; note that nib and 2bit sequences - // are unmasked (if desired) during the earlier call to load_nib_sequence - // or load_2bit_sequence, so there is no need to unmask them here; further, - // csfasta files do not have the concept of masking - - if ((_seq->doUnmask) - && ((_seq->fileType == seq_type_fasta) - || (_seq->fileType == 
seq_type_fastq) - || (_seq->fileType == seq_type_hsx))) - upper_sequence (_seq); - - if (_seq->fileType == seq_type_qdna) - { - if ((_seq->softMaskFilename != NULL) - || (_seq->xMaskFilename != NULL) - || (_seq->nMaskFilename != NULL)) - suicidef ("masking not allowed for %s", sequence_filename(_seq)); - if (_seq->doUnmask) - suicidef ("unmasking not allowed for %s", sequence_filename(_seq)); - if (((_seq->doRevCompFlags & rcf_comp) != 0) - && (_seq->qToComplement == NULL)) - suicidef ("reverse complement not allowed for %s\n", - "(the score file lacks complements)", - sequence_filename(_seq)); - } - - if (_seq->softMaskFilename != NULL) - { - if (_seq->softMaskComplement) - mask_sequence_keep (_seq, _seq->softMaskFilename, -1); - else - mask_sequence (_seq, _seq->softMaskFilename, -1); - } - if (_seq->xMaskFilename != NULL) - { - if (_seq->xMaskComplement) - mask_sequence_keep (_seq, _seq->xMaskFilename, 'X'); - else - mask_sequence (_seq, _seq->xMaskFilename, 'X'); - } - if (_seq->nMaskFilename != NULL) - { - if (_seq->nMaskComplement) - mask_sequence_keep (_seq, _seq->nMaskFilename, 'N'); - else - mask_sequence (_seq, _seq->nMaskFilename, 'N'); - } - - if (_seq->separatorCh != 0) - separate_sequence (_seq, _seq->separatorCh); - - if (_seq->doRevCompFlags == rcf_revcomp) - rev_comp_sequence (_seq, _seq->qToComplement); - else if (_seq->doRevCompFlags == rcf_rev) - backward_sequence (_seq); - else if (_seq->doRevCompFlags == rcf_comp) - { - backward_sequence (_seq); - rev_comp_sequence (_seq, _seq->qToComplement); - } - - //debugPartitions_1a; - debugPartitions_1b; - - if (sequences_dbgDumpSequence) - dump_sequence (stderr, _seq); - - return true; - } - - -//-- skip_sequences-- - -static void skip_sequences - (seq* _seq, - int skipCount) - { - while ((skipCount-- > 0) && another_sequence_core (_seq)) - load_sequence_core (_seq, /*keeper*/ false); - } - - -//-- load_sequence_core -- - -static void load_sequence_core - (seq* _seq, - int keeper) - { - 
debugNamesFile_3; - - // get rid of header data from previous load - - if (!_seq->lockedHeader) - { - if ((_seq->header != NULL) && (_seq->headerSize != 0)) - _seq->header[0] = 0; - if ((_seq->shortHeader != NULL) && (_seq->shortHeaderSize != 0)) - _seq->shortHeader[0] = 0; - if ((_seq->trueHeader != NULL) && (_seq->trueHeaderSize != 0)) - _seq->trueHeader[0] = 0; - } - - // read the next sequence for this type - - _seq->revCompFlags = rcf_forward; - _seq->contig++; - - switch (_seq->fileType) - { - case seq_type_fasta: - load_fasta_sequence (_seq, keeper); - break; - - case seq_type_fastq: - load_fastq_sequence (_seq, keeper); - break; - - case seq_type_csfasta: - load_csfasta_sequence (_seq, keeper); - break; - - case seq_type_nib: - load_nib_sequence (_seq, keeper); - break; - - case seq_type_2bit: - if (_seq->contigOfInterest != NULL) - _seq->contig--; // (cancel earlier increment) - load_2bit_sequence (_seq, keeper); - break; - - case seq_type_hsx: - if (_seq->contigOfInterest != NULL) - _seq->contig--; // (cancel earlier increment) - load_hsx_sequence (_seq, keeper); - break; - - case seq_type_qdna: - load_qdna_sequence (_seq, keeper); - break; - - default: - suicidef ("unknown sequence type: %X", _seq->fileType); - } - - debugNamesFile_4; - - _seq->contigPending = false; - - if ((_seq->header != NULL) - && (_seq->headerSize != 0) - && ((_seq->shortHeader == NULL) - || (_seq->shortHeaderSize == 0) - || (_seq->shortHeader[0] == 0) - || (_seq->hasNickname))) - create_short_header (_seq); - - if ((_seq->header != NULL) - && (_seq->headerSize != 0) - && ((_seq->nameParseType & name_parse_fill_white) != 0)) - whitespace_to_under (_seq->header, strlen(_seq->header)); - } - -//---------- -// -// load_fasta_sequence-- -// Load the next fasta sequence from the associated file. -// -// A typical file looks like this: -// -// > some header for the first sequence -// GCGGTATCGCGCACAAGATTTAGGGATAGATCGTTTTGATGACCTCTCGCCACCTGGCAA -// ... 
-// AAAAAAGGTAGGCCCATTAGCCCCCC -// -// The header line is optional. However, if several sequences are included in -// the same file, the header lines are necessary as separators. Nucleotides -// can be upper or lower case. X can be used to indicate a masked position. -// whitespace can be added as desired (and so line breaks can be anywhere). -// digits are ignored so it is easy to use numerically annotated sequences. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. -// int keeper: true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void load_fasta_sequence - (seq* _seq, - int keeper) - { - int ch; - unspos length; - - if (_seq == NULL) suicide ("load_fasta_sequence(NULL)"); - - debugFastaFile_1; - - ////////// - // read the header - ////////// - - ch = skip_seq_whitespace (_seq); - - if (ch != '>') seq_ungetc (ch, _seq); - else parse_fasta_header (_seq); - - debugFastaFile_2; - - ////////// - // read ahead to determine the length of the sequence and to pre-allocate - // the vector - ////////// - - if (_seq->rewindable == -1) - _seq->rewindable = (test_rewindability (_seq) == 0); - - if ((_seq->rewindable == true) && (keeper)) - { - // read ahead, counting chars needed - - save_fstate (_seq); - length = parse_fasta (_seq, /*storeEm*/ false); - restore_fstate (_seq); - - // allocate the vector - - if ((length > maxSequenceLen) || (_seq->len > maxSequenceLen - length)) - suicidef ("in load_fasta_sequence for %s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), - commatize(_seq->len), commatize(length), - commatize(maxSequenceLen)); - - sequence_long_enough (_seq, _seq->len+length, false); - } - - ////////// - // read the sequence - ////////// - - parse_fasta (_seq, /*storeEm*/ keeper); - } - -//---------- -// -// parse_fasta_header-- -// Parse a fasta header from the associated file. 
This assumes that the -// sequence's file is positioned at the first character in the header, *after* -// the '>' character. -// -// Upon return, the file is positioned at the start of the first line -// following the header. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence being parsed. -// -// Returns: -// (nothing; the header is written to _seq->header) -// -//---------- - -static void parse_fasta_header - (seq* _seq) - { - u32 headerLen; - int ch; - - if (_seq->lockedHeader) - { - ch = seq_getc (_seq); - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - ch = seq_getc (_seq); - } - else - { - headerLen = 0; - _seq->headerOwner = _seq->shortHeaderOwner = true; - if (sequences_keepFastaArrow) - { - append_char (&_seq->header, &_seq->headerSize, &headerLen, '>'); - ch = seq_getc (_seq); - while ((ch == ' ') || (ch == '\t')) - { - append_char (&_seq->header, &_seq->headerSize, &headerLen, ch); - ch = seq_getc (_seq); - } - } - else - ch = skip_seq_whitespace (_seq); - - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - { - append_char (&_seq->header, &_seq->headerSize, &headerLen, ch); - ch = seq_getc (_seq); - } - append_char (&_seq->header, &_seq->headerSize, &headerLen, 0); - - if (_seq->nameTrigger != NULL) - { - char* triggerFound, *src, *dst; - triggerFound = strstr (_seq->header, _seq->nameTrigger); - if (triggerFound != NULL) - { - triggerFound += strlen (_seq->nameTrigger); - for (src=triggerFound,dst=_seq->header ; *src!=0 ; ) - { - ch = *(src++); - if ((!isalnum(ch)) && (ch != '_')) - break; - *(dst++) = ch; - } - *dst = 0; - } - } - } - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - } - -//---------- -// -// parse_fasta-- -// Parse a fasta sequence from the associated file. This assumes that the -// sequence's file is positioned at the first character in the sequence, -// *after* the sequence header line. 
-// -// (see load_fasta_sequence() for info about the file format) -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to parse. -// int storeEm: true => store the results (in the sequence) -// false => just count -// -// Returns: -// The number of characters read into the sequence; failure causes program -// fatality. -// -//---------- - -static unspos parse_fasta - (seq* _seq, - int storeEm) - { - unspos index, startLimit, endLimit, count; - int prevCh, ch; - char* description; - - index = 0; - count = 0; - startLimit = _seq->startLimit; - endLimit = _seq->endLimit; - - // scan the file, keeping characters that are (a) nucleotides and (b) are - // within our index limits - - prevCh = '\n'; - ch = skip_seq_whitespace (_seq); - while (ch != EOF) - { - if ((prevCh == '\n') && (ch == '>')) // (start of next sequence) - { seq_ungetc (ch, _seq); break; } - - if ((_seq->separatorCh == 0) || (ch != _seq->separatorCh)) - { - switch (char_to_fasta_type[(u8)ch]) - { - case _nucleotide: - break; - case _ambiguous: - if (!_seq->allowAmbiDNA) - goto bad_char; - break; - case _newline: - ch = '\n'; // (allow for unix, mac, or pc line ends) - goto next_char; - case _bad: - goto bad_char; - } - } - - // this is a nucleotide (or separator), do we want it? - - index++; - - if ((startLimit != 0) && (index < startLimit)) goto next_char; - if ((endLimit != 0) && (index > endLimit)) goto next_char; - - // we want it; are we just counting? 
- - if ((!storeEm) && (count+1 < count)) - suicidef ("in parse_fasta, " - "sequence length " unsposFmt "+1 overflows internal data type", - count); - - count++; - if (!storeEm) goto next_char; - - // ok, let's store it - - if (_seq->len > maxSequenceLen - 1) - suicidef ("in parse_fasta, " - "sequence length " unsposFmt "+1 exceeds maximum (" unsposFmt ")", - _seq->len, maxSequenceLen); - - sequence_long_enough (_seq, _seq->len+1, true); - _seq->v[_seq->len++] = ch; - - // go try the next character - - next_char: - prevCh = ch; - ch = skip_seq_whitespace (_seq); - } - - if (storeEm) - { - if (_seq->v == NULL) - sequence_long_enough (_seq, _seq->len+1, true); - _seq->v[_seq->len] = 0; // (set the terminating zero) - _seq->trueLen += index; // (account for the characters - // .. we've read so far) - } - - // make sure we got somethin' useful - - if ((startLimit != 0) && (startLimit > index)) - goto beyond_start; - - if ((endLimit != 0) && (endLimit > index)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - goto beyond_end; - } - - if ((count == 0) && (storeEm)) - { - if (_seq->header == NULL) - fprintf (stderr, "WARNING. %s contains an empty sequence\n", - sequence_filename(_seq)); - else - fprintf (stderr, "WARNING. 
%s contains an empty sequence:\n%s\n", - sequence_filename(_seq), _seq->header); - } - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - -// (no longer needed; the loop above exits only at end-of-file or end-of-seq) -// // skip to the next sequence -// -// if ((storeEm) && (_seq->needTrueLen)) -// { -// prevCh = '\n'; -// ch = skip_seq_whitespace (_seq); -// while (ch != EOF) -// { -// if ((prevCh == '\n') && (ch == '>')) // (start of next sequence) -// { seq_ungetc (ch, _seq); break; } -// -// switch (char_to_fasta_type[(u8)ch]) -// { -// case _nucleotide: -// case _ambiguous: -// _seq->trueLen++; -// break; -// case _newline: -// ch = '\n'; // (allow for unix, mac, or pc line ends) -// break; -// case _bad: -// goto bad_char; -// } -// -// // go try the next character -// -// prevCh = ch; -// ch = skip_seq_whitespace (_seq); -// } -// } - - return count; - - // failure exits - // $$$ report line number here - -bad_char: - description = char_to_description (ch); - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("bad fasta character in %s (%s)\n" - "remove or replace non-ACGTN characters or consider using --ambiguous=iupac", - sequence_filename(_seq), description); - else - suicidef ("bad fasta character in %s, %s (%s)\n" - "remove or replace non-ACGTN characters or consider using --ambiguous=iupac", - sequence_filename(_seq), _seq->header, description); - -beyond_start: - if (_seq->header == NULL) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), startLimit, index); - else - suicidef ("beyond end in %s, %s (%ld > %ld)", - sequence_filename(_seq), _seq->header, startLimit, index); - -beyond_end: - if (_seq->header == NULL) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), endLimit, index); - else - suicidef ("beyond end in %s, %s (%ld > %ld)", - sequence_filename(_seq), _seq->header, endLimit, index); - - return 0; // (never gets here) - } - -//---------- -// -// 
load_fastq_sequence-- -// Load the next fastq sequence from the associated file. -// -// A typical file looks like this: -// -// @HWI-ST407_110227_0090_A80FT9ABXX:1:1:1190:2064#0/1 -// AGCTAAGGAATGACACAATTTGTCCTAATGGCAAATGCAGGGATTGTGATAAATATATCCNATATCTTA -// + -// cccWXccbcc[aZ_bRaaa`edadeefffff\cdaIXPWZdd`adcXaXN_BBBBBBBBBBBBBBBBBB -// @HWI-ST407_110227_0090_A80FT9ABXX:1:1:1120:2089#0/1 -// TAGAACATGAGGGAAAGGAACAACCCTGCTGACTGACATGAGGCTGCCTGCCGCGGGGGGATGGGCAGG -// + -// c_aaeaaWcdddcaNZa`baaXa^VYX\V_[[[]TVJJOYbbcBBBBBBBBBBBBBBBBBBBBBBBBBB -// ... -// -// In each four-line block, the first line is @name. The third line may also -// contain the name, in which case it much match the first line except that it -// starts with a plus sign. The second line is the nucleotides. The fourth -// line is qualities. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. -// int keeper: true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void load_fastq_sequence - (seq* _seq, - int keeper) - { - int ch; - char* description; - - if (_seq == NULL) suicide ("load_fastq_sequence(NULL)"); - - ////////// - // read the header - ////////// - - ch = seq_getc (_seq); - if (ch == EOF) goto end_of_file; - if (ch != '@') goto bad_fastq_header; - - parse_fastq_header (_seq); - - ////////// - // read the sequence - ////////// - - if (_seq->rewindable == -1) - _seq->rewindable = (test_rewindability (_seq) == 0); - - sequence_long_enough (_seq, _seq->len+maxFastqSequenceLen, false); - - if (!keeper) fastq_skip_content (_seq); - else parse_fastq (_seq); - - return; - - // failure exits - // $$$ report line number here - -end_of_file: - suicidef ("premature end of fastq file %s\n", - sequence_filename(_seq)); - -bad_fastq_header: - description = char_to_description (ch); - suicidef ("bad fastq header character in %s (expected \"@\" but read \"%s\")\n", - 
sequence_filename(_seq), description); - - return; // (never gets here) - } - -//---------- -// -// parse_fastq_header-- -// Parse a fastq header from the associated file. This assumes that -// the sequence's file is positioned at the first character in the header, -// *after* the '@' character. -// -// Upon return, the file is positioned at the start of the first line -// following the header. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence being parsed. -// -// Returns: -// (nothing; the header is written to _seq->header) -// -//---------- - -static void parse_fastq_header - (seq* _seq) - { - u32 headerLen; - int ch; - char* s; - - // parse the complete header and save it in trueHeader - - _seq->trueHeaderOwner = true; - - headerLen = 0; - ch = seq_getc (_seq); - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - { - append_char (&_seq->trueHeader, &_seq->trueHeaderSize, &headerLen, ch); - ch = seq_getc (_seq); - } - append_char (&_seq->trueHeader, &_seq->trueHeaderSize, &headerLen, 0); - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - // copy from trueHeader into the header (unless the header is locked), then - // if a name trigger is active and matched, handle it - - if (_seq->lockedHeader) - { - ch = seq_getc (_seq); - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - ch = seq_getc (_seq); - } - else - { - _seq->headerOwner = _seq->shortHeaderOwner = true; - - headerLen = 0; - for (s=_seq->trueHeader ; (*s)!=0 ; s++) - append_char (&_seq->header, &_seq->headerSize, &headerLen, *s); - append_char (&_seq->header, &_seq->headerSize, &headerLen, 0); - - if (_seq->nameTrigger != NULL) - { - char* triggerFound, *src, *dst; - triggerFound = strstr (_seq->header, _seq->nameTrigger); - if (triggerFound != NULL) - { - triggerFound += strlen (_seq->nameTrigger); - for (src=triggerFound,dst=_seq->header ; *src!=0 ; ) - { - ch = *(src++); - if ((!isalnum(ch)) && 
(ch != '_')) - break; - *(dst++) = ch; - } - *dst = 0; - } - } - } - - } - -//---------- -// -// parse_fastq-- -// Parse a fastq sequence from the associated file. This assumes that the -// sequence's file is positioned at the first character in the sequence, -// *after* the sequence header line. -// -// (see load_fastq_sequence() for info about the file format) -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to parse. -// -// Returns: -// The number of characters read into the sequence; failure causes program -// fatality. -// -//---------- - -static unspos parse_fastq - (seq* _seq) - { - unspos index, startLimit, endLimit, nucCount; - unspos qualCount, qualLen; - int ch; - int checkVsHeader; - u32 headerIx; - char* description; - - headerIx = qualCount = 0; // (placate compiler) - - qualLen = _seq->len; - startLimit = _seq->startLimit; - endLimit = _seq->endLimit; - - ////////// - // read nucleotides and keep any that are within our index limits - ////////// - - index = 0; - nucCount = 0; - - ch = seq_getc (_seq); - while (ch != EOF) - { - if ((_seq->separatorCh == 0) || (ch != _seq->separatorCh)) - { - switch (char_to_fasta_type[(u8)ch]) - { - case _nucleotide: - break; - case _ambiguous: - if (!_seq->allowAmbiDNA) goto bad_nucleotide; - break; - case _newline: - goto end_of_nucleotides; - case _bad: - goto bad_nucleotide; - } - } - - // this is a nucleotide (or separator), do we want it? 
- - index++; - if ((startLimit != 0) && (index < startLimit)) goto next_nucleotide; - if ((endLimit != 0) && (index > endLimit)) goto next_nucleotide; - - // we want it; let's store it - - nucCount++; - - if (_seq->len > maxSequenceLen - 1) - suicidef ("in parse_fastq, " - "sequence length " unsposFmt "+1 exceeds maximum (" unsposFmt ")", - _seq->len, maxSequenceLen); - - sequence_long_enough (_seq, _seq->len+1, true); - _seq->v[_seq->len++] = ch; - - // go try the next nucleotide character - - next_nucleotide: - ch = seq_getc (_seq); - } - - // we've read all the nucleotides, finish up the sequence info - -end_of_nucleotides: - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - _seq->v[_seq->len] = 0; // (set the terminating zero) - _seq->trueLen += index; // (account for the characters - // .. we've read so far) - - // make sure we got somethin' useful - - if ((startLimit != 0) && (startLimit > index)) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), startLimit, index); - - if ((endLimit != 0) && (endLimit > index)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), endLimit, index); - } - - if (nucCount == 0) - { - if (_seq->header == NULL) - fprintf (stderr, "WARNING. %s contains an empty sequence\n", - sequence_filename(_seq)); - else - fprintf (stderr, "WARNING. 
%s contains an empty sequence:\n%s\n", - sequence_filename(_seq), _seq->header); - } - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - - ////////// - // read and validate the 3rd line - ////////// - - // make sure it starts with a plus sign - - ch = seq_getc (_seq); - if (ch == EOF) goto end_of_file; - if (ch != '+') goto bad_fastq_third_line; - - // if there's nothing else on the line, we're done with it - - ch = seq_getc (_seq); - if ((ch == '\n') || (ch == '\r')) goto end_of_third_line; - - // otherwise, it has to match the name that was given in the header - - checkVsHeader = (_seq->trueHeader != NULL) && (_seq->trueHeader[0] != 0); - - headerIx = 0; - while ((ch != '\n') && (ch != '\r')) - { - if (ch == EOF) goto end_of_file; - if (checkVsHeader) - { if (ch != _seq->trueHeader[headerIx]) goto third_line_mismatch; } - headerIx++; - ch = seq_getc (_seq); - } - - if (checkVsHeader) - { if (headerIx != strlen(_seq->trueHeader)) goto third_line_short; } - -end_of_third_line: - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - ////////// - // read qualities and keep those that are within our index limits - ////////// - - index = 0; - qualCount = 0; - - ch = seq_getc (_seq); - while (ch != EOF) - { - if ((ch == '\n') || (ch == '\r')) goto end_of_qualities; - if ((ch < minFastqCh) || (ch > maxFastqCh)) goto bad_quality; - - // this is a quality character, do we want it? 
- - index++; - if ((startLimit != 0) && (index < startLimit)) goto next_quality; - if ((endLimit != 0) && (index > endLimit)) goto next_quality; - - // we want it; let's store it; note that we don't need to check - // whether the array is long enough, since vq[] has been allocated in - // lock step with v[] - - qualCount++; - if (qualCount > nucCount) goto too_many_qualities; - _seq->vq[qualLen++] = ch; - - // go try the next quality character - - next_quality: - ch = seq_getc (_seq); - } - - // we've read all the qualities, finish up the sequence info - -end_of_qualities: - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - if (qualCount < nucCount) goto not_enough_qualities; - _seq->vq[qualLen] = 0; // (set the terminating zero) - - return nucCount; - - // failure exits - // $$$ report line number here - -bad_nucleotide: - description = char_to_description (ch); - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("bad fastq nucleotide character in %s (%s)\n" - "remove or replace non-ACGTN characters or consider using --ambiguous=iupac", - sequence_filename(_seq), description); - else - suicidef ("bad fastq nucleotide character in %s, %s (%s)\n" - "remove or replace non-ACGTN characters or consider using --ambiguous=iupac", - sequence_filename(_seq), _seq->header, description); - -bad_quality: - description = char_to_description (ch); - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("bad fastq quality character in %s (%s)\n", - sequence_filename(_seq), description); - else - suicidef ("bad fastq quality character in %s, %s (%s)\n", - sequence_filename(_seq), _seq->header, description); - -not_enough_qualities: - description = char_to_description (ch); - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("not enough fastq quality characters in %s\n" - unsposFmt " nucleotides and only " unsposFmt " quality characters\n" - "(this may 
be a line-wrapped FASTQ file, which is not supported)", - sequence_filename(_seq), nucCount, qualCount); - else - suicidef ("not enough fastq quality characters in %s, %s\n" - unsposFmt " nucleotides and only " unsposFmt " quality characters\n" - "(this may be a line-wrapped FASTQ file, which is not supported)", - sequence_filename(_seq), _seq->header, nucCount, qualCount); - -too_many_qualities: - description = char_to_description (ch); - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("too many fastq quality characters in %s\n" - unsposFmt " nucleotides and at least " unsposFmt " quality characters\n", - sequence_filename(_seq),nucCount,qualCount); - else - suicidef ("too many fastq quality characters in %s, %s\n" - unsposFmt " nucleotides and at least " unsposFmt " quality characters\n", - sequence_filename(_seq), _seq->header, nucCount, qualCount); - -bad_fastq_third_line: - description = char_to_description (ch); - suicidef ("bad fastq third line character in %s (expected \"+\" but read \"%s\")\n" - "(this may be a line-wrapped FASTQ file, which is not supported)", - sequence_filename(_seq), description); - -third_line_mismatch: - if (headerIx >= strlen(_seq->trueHeader)) goto third_line_long; - description = char_to_description (ch); - suicidef ("fastq third line mismatch in %s (character %d is \"%s\")\n(expected \"+%s\")\n", - sequence_filename(_seq), headerIx+2, description, _seq->trueHeader); - -third_line_long: - suicidef ("fastq third line mismatch in %s (line has more than %d characters)\n(expected \"+%s\")\n", - sequence_filename(_seq), strlen(_seq->trueHeader)+1, _seq->trueHeader); - -third_line_short: - suicidef ("fastq third line mismatch in %s (line has only %d characters)\n(expected \"+%s\")\n", - sequence_filename(_seq), headerIx+1, _seq->trueHeader); - -end_of_file: - if ((_seq->header == NULL) || (_seq->header[0] == 0)) - suicidef ("premature end of fastq file %s\n", - sequence_filename(_seq)); - else - suicidef ("premature 
end of fastq file %s, %s\n", - sequence_filename(_seq), _seq->header); - - return 0; // (never gets here) - } - -//---------- -// -// fastq_skip_content-- -// Skip over the content of one sequence in a fastq file. This assumes that -// the sequence's file is positioned at the start of the line following the -// header. -// -// Upon return, the file is positioned at the start of the header for the next -// contig (or at the end of file). -// -//---------- -// -// Arguments: -// seq* _seq: The sequence being parsed. -// -// Returns: -// true if we were successful; false if we hit the end-of-file before -// reading all the content. -// -//---------- - -static int fastq_skip_content - (seq* _seq) - { - int linesToSkip; - int prevCh, ch; - - linesToSkip = 3; - prevCh = 0; - while (linesToSkip > 0) - { - ch = seq_getc (_seq); - if (ch == EOF) return false; - if ((ch == '\n') && (prevCh == '\r')) - { prevCh = 0; continue; } - if ((ch == '\n') || (ch == '\r')) linesToSkip--; - prevCh = ch; - } - - return true; - } - -//---------- -// -// load_csfasta_sequence-- -// Load the next fasta color sequence from the associated file. -// -// A typical file looks like this: -// -// # Wed Apr 22 15:07:58 2009 ... -// >538_743_229_F7 -// T013131021212033022020113200231003030002 -// >538_4021_559_F7 -// T002120310210323111000110101233231231210 -// >534_6488_139_F7 -// T112211320333111020130303120302210313113 -// -// Line beginning with '#' are comments and are ignored, but they can only -// occur immediately in front of a header line. Lines beginning with ">" are -// header lines. If the file contains only one sequence, the header line is -// optional. Sequences must begin with a nucleotide and thereafter consist -// only of the digits '0', '1', '2' and '3'. Sequences may occupy multiple -// lines. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. 
-// int keeper: true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void load_csfasta_sequence - (seq* _seq, - int keeper) - { - int ch; - unspos length; - - if (_seq == NULL) suicide ("load_csfasta_sequence(NULL)"); - - ////////// - // read the header - ////////// - - while (true) // skip comment lines - { - ch = skip_seq_whitespace (_seq); - if (ch != '#') break; - - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - ch = seq_getc (_seq); - } - - if (ch != '>') seq_ungetc (ch, _seq); - else parse_csfasta_header (_seq); - - ////////// - // read ahead to determine the length of the sequence and to pre-allocate - // the vector - ////////// - - if (_seq->rewindable == -1) - _seq->rewindable = (test_rewindability (_seq) == 0); - - if ((_seq->rewindable == true) && (keeper)) - { - // read ahead, counting chars needed - - save_fstate (_seq); - length = parse_csfasta (_seq, /*storeEm*/ false); - restore_fstate (_seq); - - // allocate the vector - - if ((length > maxSequenceLen) || (_seq->len > maxSequenceLen - length)) - suicidef ("in load_csfasta_sequence for %s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), - commatize(_seq->len), commatize(length), - commatize(maxSequenceLen)); - - sequence_long_enough (_seq, _seq->len+length, false); - } - - ////////// - // read the sequence - ////////// - - parse_csfasta (_seq, /*storeEm*/ keeper); - } - -//---------- -// -// parse_csfasta_header-- -// Parse a csfasta header from the associated file. This assumes that the -// sequence's file is positioned at the first character in the header, *after* -// the '>' character. -// -// Upon return, the file is positioned at the start of the first line -// following the header. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence being parsed. 
-// -// Returns: -// (nothing; the header is written to _seq->header) -// -//---------- - -static void parse_csfasta_header - (seq* _seq) - { - u32 headerLen; - int ch; - - if (_seq->lockedHeader) - { - ch = seq_getc (_seq); - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - ch = seq_getc (_seq); - } - else - { - headerLen = 0; - _seq->headerOwner = _seq->shortHeaderOwner = true; - ch = seq_getc (_seq); - - while ((ch != '\n') && (ch != '\r') && (ch != EOF)) - { - append_char (&_seq->header, &_seq->headerSize, &headerLen, ch); - ch = seq_getc (_seq); - } - append_char (&_seq->header, &_seq->headerSize, &headerLen, 0); - - if (_seq->nameTrigger != NULL) - { - char* triggerFound, *src, *dst; - triggerFound = strstr (_seq->header, _seq->nameTrigger); - if (triggerFound != NULL) - { - triggerFound += strlen (_seq->nameTrigger); - for (src=triggerFound,dst=_seq->header ; *src!=0 ; ) - { - ch = *(src++); - if ((!isalnum(ch)) && (ch != '_')) - break; - *(dst++) = ch; - } - *dst = 0; - } - } - } - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - } - -//---------- -// -// parse_csfasta-- -// Parse a csfasta sequence from the associated file. This assumes that the -// sequence's file is positioned at the first character in the sequence, -// *after* the sequence header line. -// -// (see load_csfasta_sequence() for info about the file format) -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to parse. -// int storeEm: true => store the results (in the sequence) -// false => just count -// -// Returns: -// The number of characters read into the sequence; failure causes program -// fatality. -// -//---------- -// -// Notes: -// -// (1) Unlike fasta and fastq, we do not allow csfasta to include separator -// characters. The reasoning is that following a separator we would not -// have the primer nucleotide that we have at the start of the sequence. 
-// -//---------- - -static unspos parse_csfasta - (seq* _seq, - int storeEm) - { - unspos index, startLimit, endLimit, count; - int prevCh, ch; - u8 chType; - - index = 0; - count = 0; - startLimit = _seq->startLimit; - endLimit = _seq->endLimit; - - // scan the file, keeping characters that are (a) colors (or an initial - // nucleotide) and (b) are within our index limits - - prevCh = '\n'; - ch = skip_seq_whitespace (_seq); - while (ch != EOF) - { - if ((prevCh == '\n') - && ((ch == '#') || (ch == '>'))) // (start of next sequence) - { seq_ungetc (ch, _seq); break; } - - chType = char_to_csfasta_type[(u8)ch]; - switch (chType) - { - case _nucleotide: - case _color: - break; - case _newline: - ch = '\n'; // (allow for unix, mac, or pc line ends) - goto next_char; - case _bad: - goto bad_char; - } - - // this is a color or nucleotide, do we want it? - - if ((index == 0) != (chType == _nucleotide)) - { - if (index == 0) goto bad_nucleotide; - else goto bad_color; - } - - index++; - - if ((startLimit != 0) && (index < startLimit)) goto next_char; - if ((endLimit != 0) && (index > endLimit)) goto next_char; - - // we want it; are we just counting? - - if ((!storeEm) && (count+1 < count)) - suicidef ("in parse_csfasta, " - "sequence length " unsposFmt "+1 overflows internal data type", - count); - - count++; - if (!storeEm) goto next_char; - - // ok, let's store it - - if (_seq->len > maxSequenceLen - 1) - suicidef ("in parse_csfasta, " - "sequence length " unsposFmt "+1 exceeds maximum (" unsposFmt ")", - _seq->len, maxSequenceLen); - - sequence_long_enough (_seq, _seq->len+1, true); - _seq->v[_seq->len++] = ch; - - // go try the next character - - next_char: - prevCh = ch; - ch = skip_seq_whitespace (_seq); - } - - if (storeEm) - { - _seq->v[_seq->len] = 0; // (set the terminating zero) - _seq->trueLen += index; // (account for the characters - // .. 
we've read so far) - } - - // make sure we got somethin' useful - - if ((startLimit != 0) && (startLimit > index)) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), startLimit, index); - - if ((endLimit != 0) && (endLimit > index)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), endLimit, index); - } - - if ((count == 0) && (storeEm)) - { - if (_seq->header == NULL) - fprintf (stderr, "WARNING. %s contains an empty sequence\n", - sequence_filename(_seq)); - else - fprintf (stderr, "WARNING. %s contains an empty sequence:\n%s\n", - sequence_filename(_seq), _seq->header); - } - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - -// (no longer needed; the loop above exits only at end-of-file or end-of-seq) -// // skip to the next sequence -// -// if ((storeEm) && (_seq->needTrueLen)) -// { -// prevCh = '\n'; -// ch = skip_seq_whitespace (_seq); -// while (ch != EOF) -// { -// if ((prevCh == '\n') -// && ((ch == '#') || (ch == '>'))) // (start of next sequence) -// { seq_ungetc (ch, _seq); break; } -// -// switch (char_to_csfasta_type[(u8)ch]) -// { -// case _nucleotide: -// goto bad_color; -// case _color: -// _seq->trueLen++; -// break; -// case _newline: -// ch = '\n'; // (allow for unix, mac, or pc line ends) -// break; -// case _bad: -// goto bad_char; -// } -// -// // go try the next character -// -// prevCh = ch; -// ch = skip_seq_whitespace (_seq); -// } -// } - - return count; - - // failure exits - // $$$ report line number and sequence name here - -bad_char: - if (dna_isprint(ch)) - suicidef ("bad csfasta character in %s: %c", - sequence_filename(_seq), (int) ch); - else - suicidef ("bad csfasta character in %s (ascii %02X)", - sequence_filename(_seq), (u8) ch); - return 0; // (never gets here) - -bad_nucleotide: - if (dna_isprint(ch)) - suicidef ("bad csfasta nucleotide in %s: %c", - 
sequence_filename(_seq), (u8) ch); - else - suicidef ("bad csfasta nucleotide in %s (ascii %02X)", - sequence_filename(_seq), (int) ch); - return 0; // (never gets here) - -bad_color: - if (dna_isprint(ch)) - suicidef ("bad csfasta color in %s: %c", - sequence_filename(_seq), (int) ch); - else - suicidef ("bad csfasta color in %s (ascii %02X)", - sequence_filename(_seq), (u8) ch); - return 0; // (never gets here) - } - -//---------- -// -// load_nib_sequence-- -// Load a nib sequence from the associated file. -// -// A nib file stores each nucleotide in four bits (one nybble). The file -// consists of a 4 byte magic number, followed by a 4 byte length, followed by -// the nucleotides. The magic number is in the file as -// (first byte) 3A 3D E9 6B (third byte) -// The length field is in little-endian order, so -// (first byte) C0 E1 E4 00 (third byte) -// means 0x00E4E1C0 bytes (15 million). The length is the number of -// nucleotides. The first nucleotide is in the most significant nybble of the -// 9th byte, the second one is in the least significant nybble, the third in -// the 10th byte (msnybble), and so on. Nybble bits are mapped to characters -// as per the tables nibTo1stChar[] and nibTo2ndChar[]. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. 
-// int keeper: true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void load_nib_sequence - (seq* _seq, - int keeper) - { - u32 magic, length; - unspos newSeqLen, ix; - u32 startLimit, endLimit, startIndex; - u32 bytesLeft, bytesToRead, bytesRead; - u8 ch; - const u8* to1stChar, *to2ndChar; - u8* dst; - - if (!keeper) return; - if (_seq == NULL) suicide ("load_nib_sequence(NULL)"); - - ////////// - // get the sequence length - ////////// - - // check the magic number and decide if it's little or big endian - - magic = read_4_big (_seq); - - // read the length - - if (magic == nibMagicLittle) - length = read_4_little (_seq); - else if (magic == nibMagicBig) - length = read_4_big (_seq); - else - { - length = 0; // (placate compiler) - suicidef ("bad nib magic number in %s (%08lX)", - sequence_filename(_seq), magic); - } - - if ((length == 0) || (((s32) length) == -1)) - suicidef ("bad nib length in %s (%08lX)", sequence_filename(_seq), length); - - // validate sequence limits - - if ((_seq->startLimit != 0) && (_seq->startLimit > (unspos) length)) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->startLimit, length); - - if ((_seq->endLimit != 0) && (_seq->endLimit > (unspos) length)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->endLimit, length); - } - - startLimit = (u32) _seq->startLimit; - endLimit = (u32) _seq->endLimit; - - _seq->trueLen += length; - - // skip ahead to the first desired base, and determine how many bases - // we'll read - - if (startLimit == 0) startLimit = 1; - startIndex = startLimit - 1; - - bytesLeft = length; - if (startIndex > 0) - { skip_chars (_seq, startIndex/2); bytesLeft -= 2*(startIndex/2); } - - length = bytesLeft; - if ((startIndex&1) != 0) // start offset is odd - length--; - 
if (endLimit != 0) - { - if (length > endLimit - startIndex) - length = endLimit - startIndex; - } - - ////////// - // allocate the vector, including an extra byte since we may overshoot by - // 1 when unpacking - ////////// - -#if (maxSequenceIndex <= 32) // otherwise compiler complains that this test is - // .. always false - if ((length > maxSequenceLen) || (_seq->len > maxSequenceLen - length)) - suicidef ("in load_nib_sequence for %s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), - commatize(_seq->len), commatize(length), - commatize(maxSequenceLen)); -#endif - - newSeqLen = _seq->len + length; - sequence_long_enough (_seq, newSeqLen+1, false); - - ////////// - // read the sequence - ////////// - - // decide which lookup tables we'll use - - if (_seq->doUnmask) - { - to1stChar = nibTo1stCharUnmasked; - to2ndChar = nibTo2ndCharUnmasked; - } - else - { - to1stChar = nibTo1stChar; - to2ndChar = nibTo2ndChar; - } - - // read the first, partial, byte - - ix = _seq->len; - if ((startIndex&1) != 0) // start offset is odd - { - ch = seq_getc (_seq); bytesLeft -= 2; - _seq->v[ix++] = to2ndChar[ch]; - } - - // process any bytes in the pending buffer, one at a time - - while ((ix < newSeqLen) && (_seq->pendingLen > 0)) - { - ch = seq_getc (_seq); bytesLeft -= 2; - _seq->v[ix++] = to1stChar[ch]; - _seq->v[ix++] = to2ndChar[ch]; - } - - // read the remaining bytes to the tail end of the buffer - - bytesToRead = ((newSeqLen-ix) + 1) / 2; - dst = _seq->v + _seq->size - bytesToRead; - if (bytesToRead > 0) - { - bytesRead = fread (dst, 1, bytesToRead, _seq->f); - if (bytesRead != bytesToRead) - suicidef ("in load_nib_sequence(%s), block read\n" - "wanted %d bytes, only got %d", - sequence_filename(_seq), bytesToRead, bytesRead); - } - - // unpack those bytes; note that although we are writing into the same - // buffer that we are reading from, and writing two bytes for each one read, - // the write pointer will not overtake the read pointer; 
further note that - // we may unpack an extra nybble (this will be overwritten when we set the - // terminating zero) - - while (ix < newSeqLen) - { - ch = *(dst++); bytesLeft--; - _seq->v[ix++] = to1stChar[ch]; - _seq->v[ix++] = to2ndChar[ch]; - } - - _seq->v[newSeqLen] = 0; // (set the terminating zero) - _seq->len = newSeqLen; - - skip_chars (_seq, bytesLeft); // skip the rest of the sequence - _seq->pendingLen = 0; // (discard any pending chars) - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - - ////////// - // create a header - ////////// - - if (!_seq->lockedHeader) - { - length = snprintf (_seq->header, 0, "%s:" unsposDashFmt, - sequence_filename(_seq), - _seq->startLoc, - _seq->startLoc + _seq->len-1); - - if (_seq->headerSize < length+1) - { - _seq->header = realloc_or_die ("load_nib_sequence (header)", - _seq->header, length+1); - _seq->headerSize = length+1; - } - _seq->headerOwner = _seq->shortHeaderOwner = true; - - snprintf (_seq->header, length+1, "%s:" unsposDashFmt, - sequence_filename(_seq), - _seq->startLoc, - _seq->startLoc + _seq->len-1); - } - } - -//---------- -// -// read_2bit_header, load_2bit_sequence-- -// Load a 2bit sequence from the associated file. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. 
-// int keeper: (load_2bit_sequence only) -// true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static int find_2bit_sequence (seq* _seq, char* name); -static u32 read_2bit_index_entry (seq* _seq, char seqName[256], u32 seqNum); - -//--- read_2bit_header --- - -static void read_2bit_header - (seq* _seq) - { - u32 magic, version, reserved; - - // read and validate the header - - magic = read_4_big (_seq); - - if (magic == twobitMagicLittle) - _seq->twoBit.bigEndian = false; - else if (magic == twobitMagicBig) - _seq->twoBit.bigEndian = true; - else - suicidef ("bad 2bit magic number in %s (%08lX)", - sequence_filename(_seq), magic); - - version = read_4 (_seq, _seq->twoBit.bigEndian); - _seq->twoBit.numContigs = read_4 (_seq, _seq->twoBit.bigEndian); - reserved = read_4 (_seq, _seq->twoBit.bigEndian); - - if (version != 0) - suicidef ("bad 2bit version in %s (%08lX)", - sequence_filename(_seq), version); - if (reserved != 0) - suicidef ("bad 2bit header word 4 in %s (%08lX)", - sequence_filename(_seq), reserved); - - if (_seq->twoBit.numContigs == 0) - suicidef ("empty 2bit file %s", sequence_filename(_seq)); - - // save index's file position - - _seq->twoBit.indexFilePos = _seq->twoBit.contigFilePos = ftell (_seq->f); - - // if we have a single contig-of-interest, locate it - - if (_seq->contigOfInterest != NULL) - { - if (!find_2bit_sequence (_seq, _seq->contigOfInterest)) - suicidef ("2bit file %s doesn't contain %s", - sequence_filename(_seq), _seq->contigOfInterest); - } - - } - -//--- load_2bit_sequence --- - -static void load_2bit_sequence - (seq* _seq, - int keeper) - { - char seqName[maxSequenceName+1]; - int numChars; - u32 dnaSize, reserved; - u32 nBlockCount, maskBlockCount; - u32 seqDataPos, seekPos; - unspos oldSeqLen, ix; - u32 startLimit, endLimit, length, startIndex, endIndex; - u32 basesToGo, bytesToSkip, bytesToRead, bytesRead; - u32 blockIx, 
s, e, scanIx; - u8 ch; - u8* data, *dst; - char* seekType; - int err; - - _seq->pendingLen = 0; // (discard any pending chars) - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - - ////////// - // read the sequence's index table entry - ////////// - - err = fseek (_seq->f, _seq->twoBit.contigFilePos, SEEK_SET); - if (err != 0) - { seekType = "index"; seekPos = _seq->twoBit.contigFilePos; goto fseek_failed; } - - seqDataPos = read_2bit_index_entry (_seq, seqName, _seq->contig); - _seq->twoBit.contigFilePos = ftell (_seq->f); - - if (!keeper) return; - - // copy the sequence name as our header (unless the header is locked) - - if (!_seq->lockedHeader) - { - numChars = strlen (seqName); - if (_seq->headerSize < (unsigned) (numChars+1)) - { - _seq->header = realloc_or_die ("load_2bit_sequence (header)", - _seq->header, numChars+1); - _seq->headerSize = numChars+1; - _seq->headerOwner = _seq->shortHeaderOwner = true; - } - - strcpy (/*to*/ _seq->header, /*from*/ seqName); - } - - ////////// - // make sure we have enough room for the sequence's data - ////////// - - err = fseek (_seq->f, seqDataPos, SEEK_SET); - if (err != 0) - { seekType = "header data"; seekPos = seqDataPos; goto fseek_failed; } - - dnaSize = read_4 (_seq, _seq->twoBit.bigEndian); - seqDataPos += 4; - - if ((dnaSize == 0) || (((s32) dnaSize) == -1)) - suicidef ("bad 2bit length in %s (%08lX)", sequence_filename(_seq), dnaSize); - - if ((_seq->startLimit != 0) && (_seq->startLimit > (unspos) dnaSize)) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->startLimit, dnaSize); - - if ((_seq->endLimit != 0) && (_seq->endLimit > (unspos) dnaSize)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->endLimit, dnaSize); - } - - startLimit = (u32) _seq->startLimit; - endLimit = (u32) _seq->endLimit; - - if (startLimit == 0) startLimit = 1; - if (endLimit == 0) endLimit 
= dnaSize; - - _seq->trueLen += dnaSize; - - // allocate the vector; we ask for an additional three characters because - // during unpacking we write 4 bytes at a time, and thus may overshoot the - // end by 3 - // $$$ note that this may be way more than needed, if start and end limits - // $$$ .. reduce the number of characters we actually want; this could be - // $$$ .. improved if it causes problems - -#if (maxSequenceIndex <= 32) // otherwise compiler complains that this test is - // .. always false - if ((dnaSize+3 > maxSequenceLen) || (_seq->len > maxSequenceLen - (dnaSize+3))) - goto sequence_too_big; -#endif - - sequence_long_enough (_seq, _seq->len + dnaSize+3, false); - - ////////// - // read and save the intervening block-marking fields - ////////// - - // make sure we have enough room for the n-blocks - - nBlockCount = read_4 (_seq, _seq->twoBit.bigEndian); - seqDataPos += 4; - - if (nBlockCount > _seq->twoBit.nBlocksSize) - { - _seq->twoBit.nBlockStarts = (u32*) realloc_or_die ("nBlockStarts", _seq->twoBit.nBlockStarts, nBlockCount * sizeof(u32)); - _seq->twoBit.nBlockSizes = (u32*) realloc_or_die ("nBlockSizes", _seq->twoBit.nBlockSizes, nBlockCount * sizeof(u32)); - _seq->twoBit.nBlocksSize = nBlockCount; - } - - // read the n-blocks - - for (blockIx=0 ; blockIxtwoBit.nBlockStarts[blockIx] = read_4 (_seq, _seq->twoBit.bigEndian); - - for (blockIx=0 ; blockIxtwoBit.nBlockSizes[blockIx] = read_4 (_seq, _seq->twoBit.bigEndian); - - seqDataPos += 4 * (2 * nBlockCount); - - // make sure we have enough room for the mask-blocks; note that if we are - // unmasking the sequence, then we skip over the mask-blocks - - maskBlockCount = read_4 (_seq, _seq->twoBit.bigEndian); - seqDataPos += 4; - - if (_seq->doUnmask) - { - seqDataPos += 4 * (2 * maskBlockCount); - err = fseek (_seq->f, seqDataPos, SEEK_SET); - if (err != 0) - { seekType = "mask data"; seekPos = seqDataPos; goto fseek_failed; } - maskBlockCount = 0; - } - else - { - if (maskBlockCount > 
_seq->twoBit.mBlocksSize) - { - _seq->twoBit.mBlockstarts = (u32*) realloc_or_die ("mBlockstarts", _seq->twoBit.mBlockstarts, maskBlockCount * sizeof(u32)); - _seq->twoBit.mBlocksizes = (u32*) realloc_or_die ("mBlocksizes", _seq->twoBit.mBlocksizes, maskBlockCount * sizeof(u32)); - _seq->twoBit.mBlocksSize = maskBlockCount; - } - } - - // read the mask-blocks - - for (blockIx=0 ; blockIxtwoBit.mBlockstarts[blockIx] = read_4 (_seq, _seq->twoBit.bigEndian); - - for (blockIx=0 ; blockIxtwoBit.mBlocksizes[blockIx] = read_4 (_seq, _seq->twoBit.bigEndian); - - seqDataPos += 4 * (2 * maskBlockCount); - - // skip the reserved data prefix - - reserved = read_4 (_seq, _seq->twoBit.bigEndian); - if (reserved != 0) - suicidef ("bad 2bit reserved data prefix in %s\n" - " (data at %08lX is %08lX)", - sequence_filename(_seq), seqDataPos, reserved); - - ////////// - // read the sequence's data - ////////// - - // skip to the first byte containing data of interest - - startIndex = startLimit-1; - length = basesToGo = endLimit+1 - startLimit; - - bytesToSkip = startIndex / 4; - if (bytesToSkip != 0) - { - err = fseek (_seq->f, bytesToSkip, SEEK_CUR); - if (err != 0) - { seekType = "data"; seekPos = _seq->twoBit.contigFilePos; goto fseek_failed; } - startIndex -= 4*bytesToSkip; - } - - // read the leading partial byte (if any) - - ix = oldSeqLen = _seq->len; - if (startIndex > 0) - { - ch = (u8) seq_getc (_seq); - data = (u8*) (twobitToChars[ch] + startIndex); - while (*data != 0) - { - if (basesToGo-- <= 0) break; - _seq->v[ix++] = *(data++); - } - } - - // process any bytes in the pending buffer; note that we may end up writing - // as many as 3 bytes beyond the end of the sequence, but will correct this - // when we write the terminating zero; also note that we have to separate - // the last iteration of this loop, since basesToGo is unsigned - - for ( ; (basesToGo>=4)&&(_seq->pendingLen>0) ; basesToGo-=4) - { - ch = (u8) seq_getc (_seq); - data = (u8*) twobitToChars[ch]; - 
_seq->v[ix++] = data[0]; - _seq->v[ix++] = data[1]; - _seq->v[ix++] = data[2]; - _seq->v[ix++] = data[3]; - } - - if ((basesToGo > 0) && (_seq->pendingLen > 0)) - { - ch = (u8) seq_getc (_seq); - data = (u8*) twobitToChars[ch]; - _seq->v[ix++] = data[0]; - _seq->v[ix++] = data[1]; - _seq->v[ix++] = data[2]; - _seq->v[ix++] = data[3]; - basesToGo = 0; - } - - // read the remaining bytes to the tail end of the buffer - - bytesToRead = (basesToGo + 3) / 4; - dst = _seq->v + _seq->size - bytesToRead; - if (bytesToRead > 0) - { - bytesRead = fread (dst, 1, bytesToRead, _seq->f); - if (bytesRead != bytesToRead) goto read_failed; - } - - // unpack those bytes; note that although we are writing into the same - // buffer that we are reading from, and writing four bytes for each one - // read, the write pointer will not overtake the read pointer; as above, - // we may end up writing as many as 3 bytes beyond the end of the sequence - - for ( ; basesToGo>=4 ; basesToGo-=4) - { - ch = *(dst++); - data = (u8*) twobitToChars[ch]; - _seq->v[ix++] = data[0]; - _seq->v[ix++] = data[1]; - _seq->v[ix++] = data[2]; - _seq->v[ix++] = data[3]; - } - - if (basesToGo > 0) - { - ch = *(dst++); - data = (u8*) twobitToChars[ch]; - _seq->v[ix++] = data[0]; - _seq->v[ix++] = data[1]; - _seq->v[ix++] = data[2]; - _seq->v[ix++] = data[3]; - basesToGo = 0; - } - - _seq->len += length; - _seq->v[_seq->len] = 0; // (set the terminating zero) - - ////////// - // mark the Ns and masked bases - ////////// - - startIndex = startLimit-1; - endIndex = endLimit; - - for (blockIx=0 ; blockIxtwoBit.nBlockStarts[blockIx]; - e = s + _seq->twoBit.nBlockSizes[blockIx]; - if (e <= startIndex) continue; - if (s >= endIndex) continue; - if (s < startIndex) s = startIndex; - if (e > endIndex) e = endIndex; - s -= startIndex; - e -= startIndex; - for (scanIx=s ; scanIxv[oldSeqLen+scanIx] = 'N'; - } - - for (blockIx=0 ; blockIxtwoBit.mBlockstarts[blockIx]; - e = s + _seq->twoBit.mBlocksizes[blockIx]; - if (e <= 
startIndex) continue; - if (s >= endIndex) continue; - if (s < startIndex) s = startIndex; - if (e > endIndex) e = endIndex; - s -= startIndex; - e -= startIndex; - for (scanIx=s ; scanIxv[oldSeqLen+scanIx] = dna_tolower (_seq->v[oldSeqLen+scanIx]); - } - - _seq->twoBit.contigLoaded = true; - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - return; - -// failure exits - -fseek_failed: - suicidef ("failed to seek to position in \"%s\"\n" - "in load_2bit_sequence, %s fseek(%08lX) returned %d", - sequence_filename(_seq), seekType, seekPos, err); - return; // (never gets here) - -sequence_too_big: - suicidef ("in load_2bit_sequence for %s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), - commatize(_seq->len),commatize(dnaSize+3), - commatize(maxSequenceLen)); - return; // (never gets here) - -read_failed: - suicidef ("in load_2bit_sequence for %s," - " block read for sequence %u\n" - "wanted %d bytes, only got %d", - sequence_filename(_seq), _seq->contig, bytesToRead, bytesRead); - return; // (never gets here) - } - -//--- find_2bit_sequence --- - -static int find_2bit_sequence - (seq* _seq, - char* name) - { - char seqName[maxSequenceName+1]; - u32 ix; - - for (ix=0 ; ix<_seq->twoBit.numContigs ; ix++) - { - _seq->twoBit.contigFilePos = ftell (_seq->f); - read_2bit_index_entry (_seq, seqName, ix+1); - if (strcmp (seqName, name) == 0) - { _seq->contig = ix + 1; return true; } - } - - return false; - } - -//--- read_2bit_index_entry --- - -static u32 read_2bit_index_entry - (seq* _seq, - char seqName[maxSequenceName+1], - u32 seqNum) - { - unsigned int nameSize; - size_t bytesRead; - - // read the name - - nameSize = getc_or_die (_seq->f, _seq->filename); - if (nameSize > 0) - { - bytesRead = fread (seqName, 1, nameSize, _seq->f); - if (bytesRead != nameSize) - suicidef ("in load_2bit_sequence for %s, short read for sequence %u\n" - "wanted %d bytes, only got %d", - sequence_filename(_seq), seqNum, nameSize, 
bytesRead); - } - seqName[nameSize] = 0; - - // read the data offset - - return read_4 (_seq, _seq->twoBit.bigEndian); - } - -//---------- -// -// read_hsx_header, load_hsx_sequence-- -// Load a sequence from the associated hsx file. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. -// int keeper: (load_hsx_sequence only) -// true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static u64 lookup_hsx_sequence (seq* _seq, char* name); -static u64 find_hsx_sequence (seq* _seq, char* name, - u64 bucketStart, u64 bucketEnd); -static char* read_hsx_index_entry (seq* _seq); -static char* read_hsx_string (seq* _seq, FILE* f); - -//--- read_hsx_header --- - -static void read_hsx_header - (seq* _seq) - { - u32 fileInfoOffset[255]; - u32 magic, headerLength; - char* s; - char extension[10]; - u32 infoBytes, nameBytes; - char* slash, *dot, *nameScan; - int baseLen, pathLen; - u32 fileNum; - int err; - - // read and validate the header - - magic = read_4_big (_seq); - - if (magic == hsxMagicLittle) - _seq->hsx.bigEndian = false; - else if (magic == hsxMagicBig) - _seq->hsx.bigEndian = true; - else - suicidef ("bad hsx magic number in %s (%08lX)", - sequence_filename(_seq), magic); - - _seq->hsx.version = read_4 (_seq, _seq->hsx.bigEndian); - if (_seq->hsx.version != 0x00000100L) - suicidef ("bad hsx version in %s (%08lX)", - sequence_filename(_seq), _seq->hsx.version); - - headerLength = read_4 (_seq, _seq->hsx.bigEndian); - if (headerLength != 0x1C) - suicidef ("bad hsx header length in %s (%08lX)", - sequence_filename(_seq), headerLength); - - _seq->hsx.numFiles = read_4 (_seq, _seq->hsx.bigEndian); - _seq->hsx.fileTableOffset = (u64) read_4 (_seq, _seq->hsx.bigEndian); - _seq->hsx.numBuckets = read_4 (_seq, _seq->hsx.bigEndian); - _seq->hsx.hashTableOffset = (u64) read_4 (_seq, _seq->hsx.bigEndian); - _seq->hsx.numContigs = read_4 (_seq, 
_seq->hsx.bigEndian); - _seq->hsx.seqTableOffset = (u64) read_4 (_seq, _seq->hsx.bigEndian); - - if (_seq->hsx.numFiles == 0) - suicidef ("empty file table in hsx file %s", sequence_filename(_seq)); - - if (_seq->hsx.numFiles > 255) - suicidef ("corrupt header in hsx file %s (numFiles > 255; %d)", - sequence_filename(_seq), _seq->hsx.numFiles); - - if (_seq->hsx.numBuckets == 0) - suicidef ("corrupt header in hsx file %s (numBuckets = 0)", - sequence_filename(_seq)); - - // read and validate the file table - - err = fseek (_seq->f, (long int) _seq->hsx.fileTableOffset, SEEK_SET); - if (err != 0) - suicidef ("in read_hsx_header for %s, file table fseek(%08lX) returned %d", - sequence_filename(_seq), _seq->hsx.fileTableOffset, err); - - for (fileNum=0 ; fileNum<_seq->hsx.numFiles ; fileNum++) - fileInfoOffset[fileNum] = (u64) read_4 (_seq, _seq->hsx.bigEndian); - - slash = strrchr (_seq->filename, pathSlash); - dot = strrchr (_seq->filename, '.'); - if ((dot == NULL) || ((slash != NULL) && (dot < slash))) - baseLen = strlen(_seq->filename); - else - baseLen = dot - _seq->filename; - if (slash == NULL) - pathLen = 0; - else - pathLen = slash+1 - _seq->filename; - - infoBytes = sizeof(hsxfileinfo) * _seq->hsx.numFiles; - nameBytes = 0; - for (fileNum=0 ; fileNum<_seq->hsx.numFiles ; fileNum++) - { - err = fseek (_seq->f, (long int) fileInfoOffset[fileNum], SEEK_SET); - if (err != 0) - suicidef ("in read_hsx_header for %s, file table fseek(%08lX) returned %d", - sequence_filename(_seq), fileInfoOffset[fileNum], err); - - s = read_hsx_string (_seq, _seq->f); - if ((strcmp (s, "fa") != 0) - && (strcmp (s, "fasta") != 0)) - suicidef ("in read_hsx_header for %s, unsupported file type: %s", - sequence_filename(_seq), s); - strncpy (/*to*/ extension, /*from*/ s, sizeof(extension)); - - s = read_hsx_string (_seq, _seq->f); - if (s[0] != 0) - nameBytes += pathLen + strlen(s) + 1 + strlen(extension) + 1; - else - nameBytes += baseLen + 1 + strlen(extension) + 1; - } - - 
_seq->hsx.fileInfo = (hsxfileinfo*) zalloc_or_die ("read_hsx_header", infoBytes + nameBytes); - - nameScan = ((char*) _seq->hsx.fileInfo) + infoBytes; - for (fileNum=0 ; fileNum<_seq->hsx.numFiles ; fileNum++) - { - _seq->hsx.fileInfo[fileNum].name = nameScan; - _seq->hsx.fileInfo[fileNum].f = NULL; - - err = fseek (_seq->f, (long int) fileInfoOffset[fileNum], SEEK_SET); - if (err != 0) - suicidef ("in read_hsx_header for %s, file table fseek(%08lX) returned %d", - sequence_filename(_seq), fileInfoOffset[fileNum], err); - - s = read_hsx_string (_seq, _seq->f); - strncpy (/*to*/ extension, /*from*/ s, sizeof(extension)); - - s = read_hsx_string (_seq, _seq->f); - if (s[0] != 0) - { - strncpy (/*to*/ nameScan, - /*from*/ _seq->filename, - /*limit*/ pathLen); - strcpy (/*to*/ nameScan + pathLen, - /*from*/ s); - nameScan[pathLen+strlen(s)] = '.'; - strcpy (/*to*/ nameScan + pathLen+strlen(s) + 1, - /*from*/ extension); - nameScan += pathLen + strlen(s) + 1 + strlen(extension) + 1; - } - else - { - strncpy (/*to*/ nameScan, - /*from*/ _seq->filename, - /*limit*/ baseLen); - nameScan[baseLen] = '.'; - strcpy (/*to*/ nameScan + baseLen + 1, - /*from*/ extension); - nameScan += baseLen + 1 + strlen(extension) + 1; - } - } - - // locate the first contig - - locate_hsx_first_sequence (_seq); - } - -//--- locate_hsx_first_sequence --- - -static void locate_hsx_first_sequence - (seq* _seq) - { - u64 fileOffset; - long int bucketOffset; - int err; - - // if we have a single contig-of-interest, locate it - - if (_seq->contigOfInterest != NULL) - { - fileOffset = lookup_hsx_sequence (_seq, _seq->contigOfInterest); - if ((fileOffset & hsxMsBit5) != 0) - suicidef ("hsx file %s doesn't contain %s", - sequence_filename(_seq), _seq->contigOfInterest); - if (fileOffset > hsxMaxFilePos) - suicidef ("in read_hsx_header for %s," - " file pos for %s (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), _seq->contigOfInterest, - fileOffset, hsxMaxFilePos); - _seq->hsx.contigFilePos = 
fileOffset; - debugNamesFile_5; - } - - // otherwise, if we have no contig names or chores, locate the first - // sequence in the index - - else if ((_seq->namesFilename == NULL) && (_seq->choresFilename == NULL)) - { - bucketOffset = (long int) _seq->hsx.hashTableOffset; - err = fseek (_seq->f, bucketOffset, SEEK_SET); - if (err != 0) - suicidef ("in read_hsx_header for %s," - " file table fseek(%010lX) returned %d", - sequence_filename(_seq), 0, err); - - fileOffset = read_5 (_seq, _seq->hsx.bigEndian) & ~hsxMsBit5; - if (fileOffset > hsxMaxFilePos) - suicidef ("in read_hsx_header for %s," - " file pos for index 0 (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), fileOffset, hsxMaxFilePos); - _seq->hsx.contigFilePos = fileOffset; - debugNamesFile_6; - } - - } - -//--- load_hsx_sequence --- - -static void load_hsx_sequence - (seq* _seq, - int keeper) - { - int err; - char* seqName; - int numChars; - unspos dnaSize; - unspos index, startLimit, endLimit; - int prevCh, ch; - char* seqFName; - FILE* seqF; - char* description; - - _seq->pendingLen = 0; // (discard any pending chars) - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - - debugNamesFile_7; - - ////////// - // read the sequence's index table entry - ////////// - - if (_seq->hsx.contigFilePos > hsxMaxFilePos) - suicidef ("in load_hsx_sequence for %s," - " file pos for contig %u (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), _seq->contig, - _seq->hsx.contigFilePos, hsxMaxFilePos); - err = fseek (_seq->f, (long int) _seq->hsx.contigFilePos, SEEK_SET); - if (err != 0) - suicidef ("in load_hsx_sequence for %s, index fseek(%010lX) returned %d", - sequence_filename(_seq), _seq->hsx.contigFilePos, err); - - seqName = read_hsx_index_entry (_seq); - _seq->hsx.contigFilePos = (u64) ftell (_seq->f); - debugNamesFile_8; - - if (!keeper) return; - - if (_seq->hsx.seqLength > (u64) maxSequenceLen) - suicidef ("in load_hsx_sequence for %s, " - "sequence length " unsposFmt " for %s " - 
"exceeds maximum (" unsposFmt ")", - sequence_filename(_seq), _seq->hsx.seqLength, seqName, - maxSequenceLen); - - // copy the sequence name as our header - - numChars = strlen (seqName); - if (_seq->headerSize < (unsigned) (numChars+1)) - { - _seq->header = realloc_or_die ("load_hsx_sequence (header)", - _seq->header, numChars+1); - _seq->headerSize = numChars+1; - _seq->headerOwner = _seq->shortHeaderOwner = true; - } - - strcpy (/*to*/ _seq->header, /*from*/ seqName); - - ////////// - // make sure we have enough room for the sequence's data - // - // $$$ note that the allocated vector may be way more than needed, if start - // $$$ .. and end limits reduce the number of characters we actually want; - // $$$ .. this could be improved if it causes problems - ////////// - - dnaSize = (unspos) _seq->hsx.seqLength; - - if ((_seq->startLimit != 0) && (_seq->startLimit > dnaSize)) - suicidef ("beyond end in %s/%s (%ld > " unsposFmt ")", - sequence_filename(_seq), seqName, _seq->startLimit, dnaSize); - - if ((_seq->endLimit != 0) && (_seq->endLimit > dnaSize)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s/%s (%ld > " unsposFmt ")", - sequence_filename(_seq), seqName, _seq->endLimit, dnaSize); - } - - startLimit = (u32) _seq->startLimit; - endLimit = (u32) _seq->endLimit; - - if (startLimit == 0) startLimit = 1; - if (endLimit == 0) endLimit = dnaSize; - - _seq->trueLen += dnaSize; - -#if (maxSequenceIndex <= 32) // otherwise compiler complains that this test is - // .. 
always false - if ((dnaSize > maxSequenceLen) || (_seq->len > maxSequenceLen - dnaSize)) - suicidef ("in load_hsx_sequence for %s/%s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), seqName, - commatize(_seq->len),commatize(dnaSize), - commatize(maxSequenceLen)); -#endif - - sequence_long_enough (_seq, _seq->len + dnaSize, false); - - // if the sequence is empty, warn the user but return the empty sequence to - // our caller - - if (_seq->hsx.seqLength == 0) - { - if (_seq->header == NULL) - fprintf (stderr, "WARNING. %s contains an empty sequence\n", - sequence_filename(_seq)); - else - fprintf (stderr, "WARNING. %s contains an empty sequence:\n%s\n", - sequence_filename(_seq), _seq->header); - return; - } - - ////////// - // read the sequence's data - ////////// - - seqFName = _seq->hsx.fileInfo[_seq->hsx.seqFileIx].name; - seqF = _seq->hsx.fileInfo[_seq->hsx.seqFileIx].f; - if (seqF == NULL) - { - // $$$ we should probably keep track of the number of open files and - // $$$ close some (by LRU) if too many are open - seqF = fopen_or_die (seqFName, "rb"); - _seq->hsx.fileInfo[_seq->hsx.seqFileIx].f = seqF; - } - - if (_seq->hsx.seqFilePos > hsxMaxFilePos) - suicidef ("in load_hsx_sequence for %s/%s," - " file pos for sequence %s (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), seqName, _seq->header, - _seq->hsx.seqFilePos, hsxMaxFilePos); - err = fseek (seqF, _seq->hsx.seqFilePos, SEEK_SET); - if (err != 0) - suicidef ("in load_hsx_sequence for %s/s," - " data fseek(%s,%08lX) returned %d", - sequence_filename(_seq), seqName, - seqFName, _seq->hsx.seqFilePos, err); - - // if the first character is a '>' (and the length is non-zero), we have to - // skip this sequence header - // $$$ it might be a good idea to validate that the header we are reading - // $$$ .. 
matches the name of the sequence we think we're going to be reading - - prevCh = '\n'; - - ch = getc_or_die (seqF, seqFName); - if ((ch == '>') && (dnaSize != 0)) - { - while (ch != '\n') // (skip line) - ch = getc_or_die (seqF, seqFName); - // get first character of next line - ch = getc_or_die (seqF, seqFName); - } - - while ((ch == ' ') || (ch == '\t')) // (skip whitespace) - ch = getc_or_die (seqF, seqFName); - - // scan the file, keeping characters that are (a) nucleotides and (b) are - // within our index limits - - index = 0; - while (ch != EOF) - { - if ((prevCh == '\n') && (ch == '>')) // (start of next sequence) - break; - - if ((_seq->separatorCh == 0) || (ch != _seq->separatorCh)) - { - switch (char_to_fasta_type[(u8)ch]) - { - case _nucleotide: - break; - case _ambiguous: - if (!_seq->allowAmbiDNA) goto bad_char; - break; - case _newline: - ch = '\n'; // (allow for unix, mac, or pc line ends) - goto next_char; - case _bad: - goto bad_char; - } - } - - // this is a nucleotide (or separator), do we want it? 
- - index++; - - if ((startLimit != 0) && (index < startLimit)) goto next_char; - if ((endLimit != 0) && (index > endLimit)) goto next_char; - - // ok, let's store it - - _seq->v[_seq->len++] = ch; - - // go try the next character - - next_char: - prevCh = ch; - do // (skip whitespace) - { - ch = getc_or_die (seqF, seqFName); - } while ((ch == ' ') || (ch == '\t')); - } - - _seq->v[_seq->len] = 0; // (set the terminating zero) - - // $$$ we should make sure the sequence was as long as it said it was - - _seq->hsx.contigLoaded = true; - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - - return; - - // failure exits - // $$$ report line number and sequence name here - -bad_char: - description = char_to_description (ch); - suicidef ("bad fasta character in %s, %s (%s)\n" - "remove or replace non-ACGTN characters or consider using --ambiguous=iupac", - sequence_filename(_seq), seqName, description); - } - -//--- lookup_hsx_sequence --- - -static u64 lookup_hsx_sequence - (seq* _seq, - char* name) - { - u32 bucket; - u64 fileOffset; - u64 bucketStart, bucketEnd; - int err; - - bucket = hassock_hash (name, strlen(name)) % _seq->hsx.numBuckets; - fileOffset = _seq->hsx.hashTableOffset + (5 * (u64) bucket); - debugNamesFile_9; - if (fileOffset > hsxMaxFilePos) - suicidef ("in lookup_hsx_sequence for %s," - " file pos for %s hash bucket %d (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), bucket, fileOffset, hsxMaxFilePos); - err = fseek (_seq->f, (long int) fileOffset, SEEK_SET); - if (err != 0) - suicidef ("in lookup_hsx_sequence for %s, file table fseek(%010lX) returned %d", - sequence_filename(_seq), fileOffset, err); - - bucketStart = read_5 (_seq, _seq->hsx.bigEndian); - debugNamesFile_10; - if ((bucketStart & hsxMsBit5) != 0) // (bucket is empty) - return hsxMsBit5; // (not found) - if (bucketStart > hsxMaxFilePos) - suicidef ("in lookup_hsx_sequence for %s," - " file pos for %s bucket start (%010lX) exceeds max (%010lX)", - 
sequence_filename(_seq), bucketStart, hsxMaxFilePos); - - bucketEnd = read_5 (_seq, _seq->hsx.bigEndian) & ~hsxMsBit5; - if (bucketEnd > hsxMaxFilePos) - suicidef ("in lookup_hsx_sequence for %s," - " file pos for %s bucket end (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), bucketEnd, hsxMaxFilePos); - - return find_hsx_sequence (_seq, name, bucketStart, bucketEnd); - } - -//--- find_hsx_sequence --- - -static u64 find_hsx_sequence - (seq* _seq, - char* name, - u64 bucketStart, - u64 bucketEnd) - { - u64 bucketOffset = bucketStart; - char* seqName; - int diff, err; - - err = fseek (_seq->f, (unsigned long) bucketOffset, SEEK_SET); - if (err != 0) - suicidef ("in find_hsx_sequence for %s," - " file table fseek(%010lX) returned %d", - sequence_filename(_seq), bucketOffset, err); - - while (bucketOffset < bucketEnd) - { - seqName = read_hsx_index_entry (_seq); - diff = strcmp (seqName, name); - if (diff == 0) return bucketOffset; // (sequence name found) - if (diff > 0) break; // (sequence name not found) - bucketOffset += 1 + 6 + 5 + strlen(seqName) + 1; - } - - return hsxMsBit5; // (not found) - } - -//--- read_hsx_index_entry --- - -static char* read_hsx_index_entry - (seq* _seq) - { - _seq->hsx.seqLength = read_5 (_seq, _seq->hsx.bigEndian); - _seq->hsx.seqFileIx = seq_getc (_seq); - _seq->hsx.seqFilePos = read_6 (_seq, _seq->hsx.bigEndian); - return read_hsx_string (_seq, _seq->f); - } - -//--- read_hsx_string --- - -static char* read_hsx_string - (seq* _seq, - FILE* f) - { - static char s[256]; - unsigned int stringSize; - size_t bytesRead; - - // read the name - - stringSize = getc_or_die (_seq->f, _seq->filename); - if (stringSize == 0) - { s[0] = 0; return s; } - - bytesRead = fread (s, 1, stringSize, f); - if (bytesRead != stringSize) - suicidef ("in read_hsx_string for %s, short read\n" - "wanted %d bytes, only got %d", - sequence_filename(_seq), stringSize, bytesRead); - - s[stringSize] = 0; - return s; - } - -//---------- -// -// 
load_qdna_sequence-- -// Load a quantum-dna sequence from the associated file. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to load. -// int keeper: true => actually load the sequence -// false => just skip the sequence -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- -// -// Qdna file format: -// -// Fields can be in big- or little-endian format; they must match the -// endianess of the magic number. -// -// Version 2 (name ignored; files with named properites not supported): -// -// offset 0x00: C4 B4 71 97 big endian magic number (97 71 B4 C4 => little endian) -// offset 0x04: 00 00 02 00 version 2.0 (fourth byte is sub version) -// offset 0x08: 00 00 00 14 header length (in bytes, including this field) -// offset 0x0C: xx xx xx xx S, offset (from file start) to data sequence -// offset 0x10: xx xx xx xx N, offset to name, 0 indicates no name -// offset 0x14: xx xx xx xx length of data sequence (counted in 'items') -// offset 0x18: xx xx xx xx (for version >= 2.0) P, offset to named -// .. properties, 0 indicates no properties -// offset N: ... name (zero-terminated string) -// offset S: ... data sequence -// offset P: ... named properties (see below) -// -// The named properties section is not allowed in this implementation. -// -// Version 1 (name ignored): -// -// offset 0x00: C4 B4 71 97 big endian magic number (97 71 B4 C4 => little endian) -// offset 0x04: 00 00 01 00 version (fourth byte will be sub version) -// offset 0x08: 00 00 00 10 header length (in bytes, including this field) -// offset 0x0C: xx xx xx xx S, offset (from file start) to data sequence -// offset 0x10: xx xx xx xx N, offset to name, 0 indicates no name -// offset 0x14: xx xx xx xx length of data sequence (counted in 'items') -// offset N: ... name (zero-terminated string) -// offset S: ... data sequence -// -// Version 0: -// -// offset 0x00: 9E 65 56 F6 magic number -// offset 0x04: ... 
data sequence -// -// Additionally, we will accept any binary file and interpret it as the data -// sequence. Note that if the data sequence happens to begin with one of the -// magic numbers above, we will fail to read the file properly. Further, if -// the file contains newlines that are not part of the sequence, we will fail -// to read the file properly. -// -//---------- - -// $$$ why don't we use the name from the file????? - -static void load_qdna_sequence - (seq* _seq, - int keeper) - { - u32 magic, version, seqOffset, propOffset; - u32 length, startLimit, startIndex, endLimit; - unspos newSeqLen; - int oldFormat, bigEndian, lengthKnown; - int ch; - int err, numChars; - - if (!keeper) return; - if (_seq == NULL) suicide ("load_qdna_sequence(NULL)"); - - ////////// - // process the header - ////////// - - // validate the magic number - - oldFormat = bigEndian = false; - - magic = read_4_big (_seq); - if (magic == qdnaMagicLittle) { ; } - else if (magic == qdnaMagicBig) { bigEndian = true; } - else if (magic == oldQdnaMagicLittle) { oldFormat = true; } - else if (magic == oldQdnaMagicBig) { oldFormat = bigEndian = true; } - else - { - seq_ungetc ((magic >> 24) & 0xFF, _seq); - seq_ungetc ((magic >> 16) & 0xFF, _seq); - seq_ungetc ((magic >> 8) & 0xFF, _seq); - seq_ungetc ( magic & 0xFF, _seq); - oldFormat = true; - } - - // skip the header (unless it's the old format) - - if (oldFormat) - { - lengthKnown = false; - length = 0; - } - else - { - version = read_4 (_seq, bigEndian); - if (((version >> 8) != 1) && ((version >> 8) != 2)) - suicidef ("unsupported qdna version in %s (%08lX)", - sequence_filename(_seq), version); - - /*headerLen=*/ read_4 (_seq, bigEndian); - seqOffset = read_4 (_seq, bigEndian); - /*nameOffset=*/ read_4 (_seq, bigEndian); - length = read_4 (_seq, bigEndian); lengthKnown = true; - - if ((version >> 8) == 1) - skip_chars (_seq, seqOffset - 0x18); - if ((version >> 8) == 2) - { - propOffset = read_4 (_seq, bigEndian); - if (propOffset 
!= 0) - suicidef ("qdna named properties are not supported in %s", - sequence_filename(_seq)); - skip_chars (_seq, seqOffset - 0x1C); - } - - _seq->trueLen += length; - } - - ////////// - // skip ahead to the first desired base, and try to determine how many - // bases we'll read - ////////// - - if (lengthKnown) - { - if ((_seq->startLimit != 0) && (_seq->startLimit > (unspos) length)) - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->startLimit, length); - - if ((_seq->endLimit != 0) && (_seq->endLimit > (unspos) length)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("beyond end in %s (%ld > %ld)", - sequence_filename(_seq), _seq->endLimit, length); - } - } - else - { - if ((_seq->startLimit != 0) && (_seq->startLimit > (unspos) 0xFFFFFFFF)) - suicidef ("invalid start limit in %s (%ld > %ld)", - sequence_filename(_seq), _seq->startLimit, 0xFFFFFFFF); - - if ((_seq->endLimit != 0) && (_seq->endLimit > (unspos) 0xFFFFFFFF)) - { - if (_seq->endIsSoft) - { _seq->endLimit = 0; _seq->endIsSoft = false; } - else - suicidef ("invalid end limit in %s (%ld > %ld)", - sequence_filename(_seq), _seq->endLimit, 0xFFFFFFFF); - } - } - - startLimit = (u32) _seq->startLimit; - endLimit = (u32) _seq->endLimit; - - if (startLimit == 0) startLimit = 1; - startIndex = startLimit - 1; - - if (startIndex > 0) - { - if (!skip_chars (_seq, startIndex)) - suicidef ("bad start index for %s: %d", - sequence_filename(_seq), startIndex); - } - - if (endLimit != 0) - { length = endLimit - startIndex; lengthKnown = true; } - - ////////// - // allocate the vector (if we know the length) - ////////// - - newSeqLen = 0; - if (lengthKnown) - { -#if (maxSequenceIndex <= 32) // otherwise compiler complains that this test is - // .. 
always false - if ((length > maxSequenceLen) || (_seq->len > maxSequenceLen - length)) - suicidef ("in load_qdna_sequence for %s, " - "sequence length %s+%s exceeds maximum (%s)", - sequence_filename(_seq), - commatize(_seq->len), commatize(length), - commatize(maxSequenceLen)); -#endif - - newSeqLen = _seq->len + length; - sequence_long_enough (_seq, newSeqLen, false); - } - - ////////// - // read the sequence - ////////// - - while (true) - { - if ((newSeqLen != 0) && (_seq->len >= newSeqLen)) - break; - - // read the next character from the sequence - - ch = seq_getc (_seq); - if (ch == EOF) break; - - if (ch == 0) - suicidef ("in load_qdna_sequence(), file contains a zero"); - - // allocate more room in the vector if we need it, and deposit the - // character in the sequence - - if (!lengthKnown) - sequence_long_enough (_seq, _seq->len+1, true); - - _seq->v[_seq->len++] = (u8) ch; - } - - _seq->v[_seq->len] = 0; - - if (oldFormat) - _seq->trueLen += _seq->len + startIndex; - else - { ; } // (for new format, we already added the file length to _seq->trueLen - - if ((newSeqLen != 0) && (_seq->len < newSeqLen)) - suicidef ("beyond end in %s (%ld > end of file)", - sequence_filename(_seq), endLimit); - - _seq->pendingLen = 0; // (discard any pending chars) - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - - if (startLimit == 0) _seq->startLoc = 1; - else _seq->startLoc = startLimit; - - // skip the rest of the sequence - - if ((oldFormat) && (_seq->needTrueLen)) - { - while ((ch = seq_getc (_seq)) != EOF) - _seq->trueLen++; - } - else - { - err = fseek (_seq->f, 0, SEEK_END); // skip the rest of the sequence - if (err != 0) - suicidef_with_perror ("in load_qdna_sequence(), fseek returned %d", - err); - } - - ////////// - // create a sequence header - ////////// - - if (!_seq->lockedHeader) - { - numChars = snprintf (_seq->header, 0, "%s:" unsposDashFmt, - sequence_filename(_seq), - _seq->startLoc, - _seq->startLoc + _seq->len-1); - - if (_seq->headerSize 
< (unsigned) numChars+1) - { - _seq->header = realloc_or_die ("load_qdna_sequence (header)", - _seq->header, numChars+1); - _seq->headerSize = numChars+1; - _seq->headerOwner = _seq->shortHeaderOwner = true; - } - - snprintf (_seq->header, numChars+1, "%s:" unsposDashFmt, - sequence_filename(_seq), - _seq->startLoc, - _seq->startLoc + _seq->len-1); - } - } - -//---------- -// -// another_sequence-- -// Determine if the associated file has another sequence (see note 1 for -// clarification). -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to dispose of. -// -// Returns: -// true if there is another sequence to load, false if not. -// -//---------- -// -// Notes: -// -// (1) If a chores file is active, "another sequence" means "another chore", -// even if any remaining chores are for the current sequence. -// -//---------- - -int another_sequence - (seq* _seq) - { - debugNamesFile_11; - - if (_seq == NULL) suicide ("another_sequence(NULL)"); - if (_seq->fileType == seq_type_nofile) return false; - - // if we're subsampling the file's sequences, skip past sequences as needed - - if (_seq->subsampleN > 0) - { - if (_seq->subsampleSkip > 0) - skip_sequences (_seq, _seq->subsampleSkip); - _seq->subsampleSkip = 0; - } - - return another_sequence_core (_seq); - } - -static int another_sequence_core - (seq* _seq) - { - seqpartition* sp; - int ch; - char* header; - int haveNamesFile, inhibitSearch; - - // if this is a partitioned sequence and we've finished loading it, there's - // never another sequence - - sp = &_seq->partition; - if ((sp->p != NULL) && (sp->state >= seqpart_ready)) - { - if (_seq->doJoin) return false; - sp->state = seqpart_reusable; - } - - // if we've previously positioned the file to a contig, but have yet to - // load that contig, then we have another sequence - - if (_seq->contigPending) return true; - - // if we pre-loaded a sequence then that sequence counts as "another" - // sequence since the caller doesn't know we did so - - if 
(_seq->preLoaded) - { - if ((sp->p != NULL) && (sp->state == seqpart_reusable)) - sp->state = seqpart_ready; - return true; - } - - // if we have a contigs-of-interest file, get the next contig name ... - - inhibitSearch = false; - - if (_seq->namesFile != NULL) - { - if (!read_contig_name (_seq)) - return false; - } - - // ... or, if we have a chores file, get the next chore - - else if (_seq->choresFile != NULL) - { - if (!read_chore (_seq)) - return false; - - header = (_seq->useFullNames)? _seq->header : _seq->shortHeader; - if ((header != NULL) && (strcmp (header, _seq->nextContigName) == 0)) - { - _seq->preLoaded = true; - if ((sp->p != NULL) && (sp->state == seqpart_reusable)) - sp->state = seqpart_ready; - inhibitSearch = true; - validate_rev_comp (_seq); - } - } - - // for 2bit or hsx files it's a matter of whether we're at the end of the - // index list - - haveNamesFile = (_seq->namesFile != NULL) || (_seq->choresFile != NULL); - - if (_seq->fileType == seq_type_2bit) - { - if ((!haveNamesFile) - && (!_seq->twoBit.contigLoaded)) return (_seq->twoBit.numContigs > 0); - if (_seq->contigOfInterest != NULL) return false; - if ((haveNamesFile) - && (!inhibitSearch)) return find_next_2bit_coi (_seq); - if ((_seq->contig >= _seq->twoBit.numContigs) - && (!inhibitSearch)) return false; - return true; - } - else if (_seq->fileType == seq_type_hsx) - { - if ((!haveNamesFile) - && (!_seq->hsx.contigLoaded)) return (_seq->hsx.numContigs > 0); - if (_seq->contigOfInterest != NULL) return false; - if ((haveNamesFile) - && (!inhibitSearch)) return find_next_hsx_coi (_seq); - if ((_seq->contig >= _seq->hsx.numContigs) - && (!inhibitSearch)) return false; - return true; - } - - // otherwise it's a matter of having data left in the file - - if (_seq->f == NULL) return false; // we've have no file to read from - if (feof (_seq->f)) return false; // we've previously hit end of file - if (ferror (_seq->f)) return false; // we've previously had a problem - - if 
((_seq->fileType == seq_type_fasta) // we've got the next contig-of- - && (haveNamesFile)) // .. interest - return find_next_fasta_coi (_seq); - if ((_seq->fileType == seq_type_fastq) && (haveNamesFile)) - return find_next_fastq_coi (_seq); - if ((_seq->fileType == seq_type_csfasta) && (haveNamesFile)) - return find_next_csfasta_coi (_seq); - - if (_seq->pendingLen > 0) return true; // we have characters to process - - ch = getc_or_die (_seq->f, // take a peek and see what's left - _seq->filename); - if (ch == EOF) return false; // we're at end of file now - - seq_ungetc (ch, _seq); // save what we peeked at - return true; // we have characters to process - } - - -// find_next_fasta_coi, find_next_csfasta_coi-- -// advance to the next contig-of-interest in fasta or csfasta file -// (always returns true) - -static int find_next_fasta_coi (seq* _seq) - { return find_next_general_fasta_coi (_seq, false); } - -static int find_next_csfasta_coi (seq* _seq) - { return find_next_general_fasta_coi (_seq, true); } - -static int find_next_general_fasta_coi - (seq* _seq, - int allowComments) - { - char buffer[maxSequenceHeader+1]; - char* header; - int headerLen; - int mustBeHeader; - int leadingWhite; - char ch, *s; - int ix; - - debugNamesFile_12; - - mustBeHeader = true; - - while (true) - { - // find the next header - - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - - if ((allowComments) && (ch == '#')) - { // comment, skip to end-of-line and go back and try again - while (ch != '\n') - { - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - } - continue; - } - - if (ch != '>') - { - if (mustBeHeader) - suicidef ("internal error in find_next_fasta_coi\n" - "processing %s, looking for \"%s\"\n", - sequence_filename(_seq), _seq->nextContigName); - continue; - } - - if (!mustBeHeader) _seq->contig++; - mustBeHeader = false; - - // skip leading white space - - debugNamesFile_14; - - leadingWhite = 0; - - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - while 
((ch != '\n') && (isspace (ch))) - { - leadingWhite++; - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - } - - if (ch == '\n') - continue; // (unnamed sequence) - - // read the header - - s = buffer; - while (ch != '\n') - { - if (s - buffer >= maxSequenceHeader) // (overflow; - break; // .. truncate the header) - *(s++) = ch; - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - } - *s = 0; - - // if we have a name trigger, locate the sequence's name - - if (_seq->nameTrigger != NULL) - { - header = strstr (buffer, _seq->nameTrigger); - if (header == NULL) continue; // (effectively unnamed sequence) - header += strlen (_seq->nameTrigger); - - s = header; - while ((*s != 0) && ((isalnum(*s)) || (*s == '_'))) - s++; - headerLen = s-header; - } - else if (!_seq->useFullNames) - { - shorten_header (/* from */ buffer, _seq->nameParseType, false, - /* to */ NULL, NULL); - header = buffer; - headerLen = strlen(buffer); - } - else - { - header = buffer; - headerLen = strlen(buffer); - } - - if ((_seq->nameParseType & name_parse_fill_white) != 0) - whitespace_to_under (header, headerLen); - - // compare header to the contig-of-interest - - debugNamesFile_15; - - if (strncmp (header, _seq->nextContigName, headerLen) != 0) - continue; - if ((int) strlen (_seq->nextContigName) != headerLen) - continue; - - break; // found a match! 
- } - - debugNamesFile_16; - - // unget the header - - seq_ungetc (ch, _seq); // (ch terminated the header) - - for (ix=strlen(buffer) ; ix>0 ; ) - seq_ungetc (buffer[--ix], _seq); - - while (leadingWhite-- > 0) seq_ungetc (' ', _seq); - seq_ungetc ('>', _seq); - - _seq->contigPending = true; - return true; - - // failure, the contig name was not found - -failure: - suicidef ("%s does not contain (or contains out of order)\n" - " the sequence \"%s\"", - sequence_filename(_seq), _seq->nextContigName); - return false; // (will never reach here) - } - -// find_next_fastq_coi-- -// advance to the next contig-of-interest in fastq file; note that we don't -// completely validate the file format here-- if we can locate a suitable -// sequence header we use it; validation is left to the sequence parser -// (always returns true) - -static int find_next_fastq_coi - (seq* _seq) - { - char buffer[maxSequenceHeader+1]; - char* header; - int headerLen; - char ch, *s; - int ix; - int ok; - - debugNamesFile_13; - - while (true) - { - // find the next header - - debugNamesFile_14; - - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - - if (ch != '@') - suicidef ("internal error in find_next_fastq_coi\n" - "processing %s, looking for \"%s\"\n", - sequence_filename(_seq), _seq->nextContigName); - - // read the header - - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - - s = buffer; - while ((ch != '\n') && (ch != '\r')) - { - if (s - buffer >= maxSequenceHeader) // (overflow; - break; // .. 
truncate the header) - *(s++) = ch; - ch = seq_getc (_seq); - if (ch == EOF) goto failure; - } - *s = 0; - - if (ch == '\r') // handle possible DOS CR-LF line ending - { - ch = seq_getc (_seq); - if (ch != '\n') seq_ungetc (ch, _seq); - } - - // if we have a name trigger, locate the sequence's name - - if (_seq->nameTrigger != NULL) - { - header = strstr (buffer, _seq->nameTrigger); - if (header == NULL) goto skip_content; // (effectively unnamed sequence) - header += strlen (_seq->nameTrigger); - - s = header; - while ((*s != 0) && ((isalnum(*s)) || (*s == '_'))) - s++; - headerLen = s-header; - } - else if (!_seq->useFullNames) - { - shorten_header (/* from */ buffer, _seq->nameParseType, false, - /* to */ NULL, NULL); - header = buffer; - headerLen = strlen(buffer); - } - else - { - header = buffer; - headerLen = strlen(buffer); - } - - if ((_seq->nameParseType & name_parse_fill_white) != 0) - whitespace_to_under (header, headerLen); - - // compare header to the contig-of-interest; if this is not the - // contig-of-interest, skip the sequence content - - debugNamesFile_15; - - if ((strncmp (header, _seq->nextContigName, headerLen) != 0) - || ((int) strlen (_seq->nextContigName) != headerLen)) - { - skip_content: - ok = fastq_skip_content (_seq); - if (!ok) goto failure; - continue; - } - - break; // found a match! 
- } - - debugNamesFile_16; - - // unget the header - - seq_ungetc (ch, _seq); // (ch terminated the header) - - for (ix=strlen(buffer) ; ix>0 ; ) - seq_ungetc (buffer[--ix], _seq); - - seq_ungetc ('@', _seq); - - _seq->contigPending = true; - return true; - - // failure, the contig name was not found - -failure: - suicidef ("%s does not contain (or contains out of order)\n" - " the sequence \"%s\"", - sequence_filename(_seq), _seq->nextContigName); - return false; // (will never reach here) - } - -// find_next_2bit_coi-- -// advance to the next contig-of-interest in 2bit header -// (always returns true) - -static int find_next_2bit_coi - (seq* _seq) - { - char seqName[maxSequenceName+1]; - long int savedContigFilePos; - int err; - - debugNamesFile_17; - - // position to the sequence's next index table entry - - err = fseek (_seq->f, _seq->twoBit.contigFilePos, SEEK_SET); - if (err != 0) - suicidef ("in find_next_2bit_coi(%s), index fseek(%08lX) returned %d", - sequence_filename(_seq), _seq->twoBit.contigFilePos, err); - - // read index table entries until we find the one we're looking for - - while (true) - { - if (_seq->contig >= _seq->twoBit.numContigs) - suicidef ("%s does not contain (or contains out of order)\n" - " the sequence \"%s\"", - sequence_filename(_seq), _seq->nextContigName); - - // read the sequence's next index table entry - - savedContigFilePos = ftell (_seq->f); - /*seqDataPos=*/ read_2bit_index_entry (_seq, seqName, _seq->contig); - if (strcmp (seqName, _seq->nextContigName) == 0) break; - - _seq->contig++; - } - - _seq->twoBit.contigFilePos = savedContigFilePos; - _seq->contigPending = true; - return true; - } - - -// find_next_hsx_coi-- -// advance to the next contig-of-interest in hsx header -// (always returns true) - -static int find_next_hsx_coi - (seq* _seq) - { - u64 fileOffset; - - debugNamesFile_18; - - fileOffset = lookup_hsx_sequence (_seq, _seq->nextContigName); - - if ((fileOffset & hsxMsBit5) != 0) - suicidef ("hsx file %s 
doesn't contain %s", - sequence_filename(_seq), _seq->nextContigName); - if (fileOffset > hsxMaxFilePos) - suicidef ("in find_next_hsx_coi for %s," - " file pos for %s (%010lX) exceeds max (%010lX)", - sequence_filename(_seq), _seq->nextContigName, fileOffset); - - _seq->hsx.contigFilePos = fileOffset; - _seq->contigPending = true; - debugNamesFile_19; - return true; - } - -//---------- -// -// read_contig_name-- -// Read the next name from a contigs-of-interest file. -// -// The file will contain one name per line. Any leading whitespace is ignored, -// any comment lines are ignored (# is the comment character), and the name is -// only up to the first whitespace character. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// -// Returns: -// true if we were successful; false if there are no more names in the file. -// -//---------- - -static int read_contig_name - (seq* _seq) - { - char* line = _seq->nextContigName; - int lineSize = sizeof(_seq->nextContigName); - char discard[maxSequenceName+1]; - int len; - int missingEol; - char* waffle, *s; - - while (fgets (line, lineSize, _seq->namesFile) != NULL) - { - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way); - // if the line was split we simply read ahead until we find the end of - // the line (and discard the extra) - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - if (missingEol) - { - while (fgets (discard, sizeof(discard), _seq->namesFile) != NULL) - { - len = strlen(discard); - if (len == 0) break; - if (discard[len-1] == '\n') break; - } - } - - // trim blanks, end of line, and comments, and ignore blank lines - // nota bene: since illumina read names contain our comment character - // (#), and to maintain backward compatibility for lines that contain - // a contig name *and* a comment, the comment character now requires a - // space or tab just before it 
(unless it is at the start of the line) - - len = strlen(line); - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - while (waffle != NULL) - { - if (waffle == line) - { *waffle = 0; break; } - else if ((waffle[-1] == ' ') || (waffle[-1] == '\t')) - { *waffle = 0; break; } - waffle = strchr (waffle+1, '#'); - } - - trim_string (line); - if (line[0] == 0) continue; - - // ok, the line has something in it - - s = skip_darkspace(line); - *s = 0; - - debugNamesFile_20; - - return true; - } - - return false; - } - -//---------- -// -// read_chore-- -// Read the next chore from a chores file. -// -// The file contains one chore per line, and any comment lines are ignored. "#" -// is the comment character, but only if it appears at the beginning of a line -// or immediately after whitespace. -// -// A chore contains the following fields (some of which are optional): -// -// [ ] [] [id=] -// -// In cases where the target name is irrelevant (i.e. there is only one name in -// the target sequence file), "*" can replace . Similarly, if we don't -// have a target (or query) subrange, "* *" can be used. Note that the query -// subrange and strand are optional, as is the tag. -// -// The tag can be any short string (but without whitespace) the user wants to -// associate with the chore (maximum length is defined by maxChoreTagLen). -// This tag can be reported along with alignments for the chore, in the general -// tab-delimited format. -// -// Note that intervals in the file are origin-one half open, and they are -// *not* altered as written into the chore struct. -// -// Typical lines: -// -// chr11 5931512 5931843 APPLE_READ_00009 -// chr11 5931512 5931843 APPLE_READ_00009 + -// chr11 5931512 5931843 APPLE_READ_00009 - -// * 5931512 5931843 APPLE_READ_00036 -// chr22 * * APPLE_READ_00087 -// chr11 2878300 1933292 chr11 1486276 1741268 + -// chr11 2878300 1933292 chr2 6865671 7149925 - -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. 
-// -// Returns: -// true if we were successful; false if there are no more chores in the file. -// -//---------- - -static int read_chore - (seq* _seq) - { - char line[511+1], discard[511+1]; - int len; - int missingEol; - char* scan, *waffle, *field; - int numItems, charsUsed; - char* tName, *tStart, *tEnd; - char* qName, *qStart, *qEnd, *qStrand; - char* idTag; - char* header; - - debugNamesFile_21; - - while (fgets (line, sizeof(line), _seq->choresFile) != NULL) - { - _seq->choresLineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way); - // if the line was split we simply read ahead until we find the end of - // the line (and discard the extra) - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - if (missingEol) - { - while (fgets (discard, sizeof(discard), _seq->choresFile) != NULL) - { - len = strlen(discard); - if (len == 0) break; - if (discard[len-1] == '\n') break; - } - } - - // trim blanks, end of line, and comments, and ignore blank lines - // nota bene: since illumina read names contain our comment character - // (#), and to allow lines that contain a chore *and* a comment, the - // comment character now requires a space or tab just before it (unless - // it is at the start of the line) - - len = strlen(line); - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - while (waffle != NULL) - { - if (waffle == line) - { *waffle = 0; break; } - else if ((waffle[-1] == ' ') || (waffle[-1] == '\t')) - { *waffle = 0; break; } - waffle = strchr (waffle+1, '#'); - } - - trim_string (line); - if (line[0] == 0) continue; - - // ok, the line has something in it; parse to find the chore fields - - debugNamesFile_22; - - tName = scan = line; - - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - if (*scan == 0) goto not_enough_fields; - tStart = scan; - - scan = skip_darkspace (scan); scan = 
skip_whitespace (scan); - if (*scan == 0) goto not_enough_fields; - tEnd = scan; - - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - if (*scan == 0) goto not_enough_fields; - qName = scan; - - qStart = qEnd = qStrand = idTag = NULL; - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - - if ((*scan != 0) - && ((scan[0] != '+') || ((scan[1] != 0) && (!isspace (scan[1])))) - && ((scan[0] != '-') || ((scan[1] != 0) && (!isspace (scan[1])))) - && (strncmp (scan, "id=", strlen("id+")) != 0)) - { - qStart = scan; - - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - if (*scan == 0) goto missing_query_end; - qEnd = scan; - - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - } - - if (((scan[0] == '+') && ((scan[1] == 0) || (isspace (scan[1])))) - || ((scan[0] == '-') && ((scan[1] == 0) || (isspace (scan[1]))))) - { - qStrand = scan; - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - } - - if (strncmp (scan, "id=", strlen("id+")) == 0) - { - idTag = scan + strlen("id+"); - scan = skip_darkspace (scan); scan = skip_whitespace (scan); - } - - if (*scan != 0) goto extra_fields; - - // interpret the chore fields - - _seq->chore.tSubrange = false; - _seq->chore.qSubrange = false; - - field = tStart; - scan = skip_darkspace (field); *scan = 0; - if (strcmp (field, "*") != 0) - { - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &_seq->chore.tStart, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (_seq->chore.tStart == 0) goto bad_target_start1; - _seq->chore.tSubrange = true; - } - - field = tEnd; - scan = skip_darkspace (field); *scan = 0; - if (strcmp (field, "*") == 0) - { if (_seq->chore.tSubrange) goto bad_target_end; } - else - { - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &_seq->chore.tEnd, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (!_seq->chore.tSubrange) goto 
bad_target_start2; - if (_seq->chore.tEnd <= _seq->chore.tStart) goto bad_target_interval; - } - - if (qStart != NULL) - { - field = qStart; - scan = skip_darkspace (field); *scan = 0; - if (strcmp (field, "*") != 0) - { - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &_seq->chore.qStart, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (_seq->chore.qStart == 0) goto bad_query_start1; - _seq->chore.qSubrange = true; - } - - field = qEnd; - scan = skip_darkspace (field); *scan = 0; - if (strcmp (field, "*") == 0) - { if (_seq->chore.qSubrange) goto bad_query_end; } - else - { - charsUsed = -1; - numItems = sscanf (field, unsposFmtScanf "%n", &_seq->chore.qEnd, &charsUsed); - if ((numItems != 1) || (((u32)charsUsed) != strlen(field))) goto bad_field; - if (!_seq->chore.qSubrange) goto bad_query_start2; - if (qEnd <= qStart) goto bad_query_interval; - } - } - - if (qStrand == NULL) - _seq->chore.qStrand = 1; // (both strands) - else - { - scan = skip_darkspace (qStrand); *scan = 0; - if (strcmp (qStrand, "+") == 0) _seq->chore.qStrand = 0; // (forward strand only) - else _seq->chore.qStrand = -1; // (reverse strand only) - } - - scan = skip_darkspace (tName); *scan = 0; - if (strcmp (tName, "*") == 0) - _seq->chore.tName[0] = 0; - else if (strlen(tName) >= sizeof(_seq->chore.tName)) - goto target_name_too_long; - else - strcpy (/*to*/ _seq->chore.tName, /*from*/ tName); - - scan = skip_darkspace (qName); *scan = 0; - if (strlen(qName) >= sizeof(_seq->nextContigName)) - goto query_name_too_long; - else - strcpy (/*to*/ _seq->nextContigName, /*from*/ qName); - - header = (_seq->useFullNames)? 
_seq->header : _seq->shortHeader; - if ((header == NULL) || (strcmp (header, _seq->nextContigName) != 0)) - _seq->chore.num = 1; - else - _seq->chore.num++; - - if (idTag == NULL) - _seq->chore.idTag[0] = 0; - else - { - scan = skip_darkspace (idTag); *scan = 0; - if (strlen(idTag) >= sizeof(_seq->chore.idTag)) - goto id_tag_too_long; - strcpy (/*to*/ _seq->chore.idTag, /*from*/ idTag); - } - - debugNamesFile_23; - - return true; - } - - return false; - -// failure exits - -not_enough_fields: - suicidef ("bad chore (in %s, line %d): \"%s\"\n" - "not enough fields in line", - _seq->choresFilename, _seq->choresLineNum, line); - return false; // (never gets here) - -extra_fields: - suicidef ("bad chore (in %s, line %d): \"%s\"\n" - "extra fields in line: \"%s\"", - _seq->choresFilename, _seq->choresLineNum, line, scan); - return false; // (never gets here) - -missing_query_end: - suicidef ("bad chore (in %s, line %d): \"%s\"\n" - "has start of query subrange but not end", - _seq->choresFilename, _seq->choresLineNum, line); - return false; // (never gets here) - -bad_field: - suicidef ("bad chore field (in %s, line %d): \"%s\"", - _seq->choresFilename, _seq->choresLineNum, field); - return false; // (never gets here) - -bad_target_start1: - suicidef ("bad chore target interval (in %s, line %d)\n" - "start cannot be zero", - _seq->choresFilename, _seq->choresLineNum); - return false; // (never gets here) - -bad_target_start2: - suicidef ("bad chore target interval (in %s, line %d): * " unsposFmt "\n" - "can't wildcard start and not end", - _seq->choresFilename, _seq->choresLineNum, _seq->chore.tEnd); - return false; // (never gets here) - -bad_target_end: - suicidef ("bad chore target interval (in %s, line %d): " unsposFmt " *\n" - "can't wildcard end and not start", - _seq->choresFilename, _seq->choresLineNum, _seq->chore.tStart); - return false; // (never gets here) - -bad_target_interval: - suicidef ("bad chore target interval (in %s, line %d): " unsposFmt ">=" 
unsposFmt, - _seq->choresFilename, _seq->choresLineNum, _seq->chore.tStart, _seq->chore.tEnd); - return false; // (never gets here) - -bad_query_start1: - suicidef ("bad chore query interval (in %s, line %d)\n" - "start cannot be zero", - _seq->choresFilename, _seq->choresLineNum); - return false; // (never gets here) - -bad_query_start2: - suicidef ("bad chore query interval (in %s, line %d): * " unsposFmt "\n" - "can't wildcard start and not end", - _seq->choresFilename, _seq->choresLineNum, _seq->chore.qEnd); - return false; // (never gets here) - -bad_query_end: - suicidef ("bad chore query interval (in %s, line %d): " unsposFmt " *\n" - "can't wildcard end and not start", - _seq->choresFilename, _seq->choresLineNum, _seq->chore.qStart); - return false; // (never gets here) - -bad_query_interval: - suicidef ("bad chore query interval (in %s, line %d): " unsposFmt ">=" unsposFmt, - _seq->choresFilename, _seq->choresLineNum, _seq->chore.qStart, _seq->chore.qEnd); - return false; // (never gets here) - -target_name_too_long: - suicidef ("chore target name too long (in %s, line %d): \"%s\"", - _seq->choresFilename, _seq->choresLineNum, tName); - return false; // (never gets here) - -query_name_too_long: - suicidef ("chore query name too long (in %s, line %d): \"%s\"", - _seq->choresFilename, _seq->choresLineNum, qName); - return false; // (never gets here) - -id_tag_too_long: - suicidef ("chore id tag too long, allowed length is %d (in %s, line %d): \"%s\"", - sizeof(_seq->chore.idTag)-1, _seq->choresFilename, _seq->choresLineNum, idTag); - return false; // (never gets here) - } - -//---------- -// -// create_short_header-- -// Convert a sequence's header into a shorter version. The shorter version is -// intended to be useful as a sequence's name in maf or axt files. -// -// Examples: -// -// >~username/human/hg18/_seq/chr16.nib:120000-190000 chr16 -// owl_monkey 122000-180000 of owl_monkey.ENm008.fa owl_monkey -// > armadillo|ENm001|JAN-2006|9361|NISC|...|1|1|. 
armadillo -// >reverse complement of ~username/human/hg18/_seq/chr14.nib chr14 -// >positions 180000-250000 of armadillo|ENm008|... armadillo -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// -// Returns: -// (nothing; _seq->shortHeader and _seq->shortHeaderSize are modified) -// -//---------- -// -// Note: It is possible for the resulting short header name to be an empty -// string. -// -//---------- - -static void create_short_header - (seq* _seq) - { - int skipPath; - - if (_seq->header == NULL) - { - if ((_seq->shortHeader != NULL) && (_seq->shortHeaderSize != 0)) - _seq->shortHeader[0] = 0; - return; - } - - if ((!_seq->headerOwner) || (!_seq->shortHeaderOwner)) - { - char* name = (_seq->filename != NULL)? _seq->filename - : _seq->header; - suicidef ("internal error, attempt to shorten external sequence header (%s)", - name); - } - - if (strstr(_seq->header,"{number}") != NULL) - expand_nickname( /* from */ _seq->header, _seq->contig, - /* to */ &_seq->shortHeader, &_seq->shortHeaderSize); - else - { - skipPath = (_seq->fileType == seq_type_nib); - shorten_header (/* from */ _seq->header, _seq->nameParseType, skipPath, - /* to */ &_seq->shortHeader, &_seq->shortHeaderSize); - } - } - - -static void shorten_header - (char* src, - int nameParseType, - int skipPath, - char** _dst, // (NULL => write it in place) - u32* _dstSize) - { - char* dst; - u32 dstSize; - char* h, *hh, *s; - u32 len, sLen; - - // skip fasta '>', leading whitespace, and/or a path - - h = src; - if (h[0] == '>') h++; - h = skip_whitespace (h); - - // skip "reverse complement of" and/or "positions A-B of" - - s = "reverse complement of "; - if (strcmp_prefix (h, s) == 0) - h = skip_whitespace (h + strlen (s)); - - s = "positions "; - if (strcmp_prefix (h, s) == 0) - { - hh = skip_whitespace (h + strlen (s)); - hh = skip_darkspace (hh); - hh = skip_whitespace (hh); - s = "of "; - if (strcmp_prefix (hh, s) == 0) // (we only change h if - h = skip_whitespace (hh + strlen (s)); 
// .. "of" is present) - } - - // skip a path - - if (skipPath) - { - while (true) - { - hh = strchr (h, pathSlash); - if (hh == NULL) break; - h = hh + 1; - } - } - - h = skip_whitespace (h); - - // figure out the length to copy; we'll truncate at the first space or - // "funny" character - - if (parse_type(nameParseType) == name_parse_type_alnum) - { - len = strspn (h, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789" "_"); - goto skip_suffix_trim; - } - else if (parse_type(nameParseType) == name_parse_type_darkspace) - len = strcspn (h, " \t"); - else // if (parse_type(nameParseType) == name_parse_type_core) - len = strcspn (h, " \t|:"); - - // if the suffix is ".nib", ".fasta", etc., remove it - - s = ".nib"; - sLen = strlen(s); - if ((len > sLen) && (strncmp (h+len-sLen,s,sLen) == 0)) - len -= sLen; - - s = ".2bit"; - sLen = strlen(s); - if ((len > sLen) && (strncmp (h+len-sLen,s,sLen) == 0)) - len -= sLen; - - s = ".hsx"; - sLen = strlen(s); - if ((len > sLen) && (strncmp (h+len-sLen,s,sLen) == 0)) - len -= sLen; - - s = ".fasta"; - sLen = strlen(s); - if ((len > sLen) && (strncmp (h+len-sLen,s,sLen) == 0)) - len -= sLen; - - s = ".fa"; - sLen = strlen(s); - if ((len > sLen) && (strncmp (h+len-sLen,s,sLen) == 0)) - len -= sLen; - -skip_suffix_trim: - - // create the header - - if (_dst == NULL) - { - strncpy (src, h, len); - src[len] = 0; - - if ((nameParseType & name_parse_fill_white) != 0) - whitespace_to_under (src, strlen(src)); - } - else - { - dst = *_dst; - dstSize = *_dstSize; - - if (len+1 > dstSize) - { - dst = realloc_or_die ("shorten_header", dst, len+1); - dstSize = len+1; - } - - strncpy (dst, h, len); - dst[len] = 0; - *_dst = dst; - *_dstSize = dstSize; - - if ((nameParseType & name_parse_fill_white) != 0) - whitespace_to_under (dst, strlen(dst)); - } - - } - - -static void whitespace_to_under (char* s, int sLen) - { for ( ; sLen-->0 ; s++) { if (isspace (*s)) *s = '_'; } } - - -static void expand_nickname - (char* 
src, - u32 contigNumber, - char** _dst, - u32* _dstSize) - { - char* dst; - u32 dstSize; - char* s, *d, *expand; - u32 len; - - // determine the size of the resulting header - - len = strlen (src) - - strlen ("{number}") - + snprintf (NULL, 0, unsposFmt, contigNumber); - - // allocate the header (if necessary) - - dst = *_dst; - dstSize = *_dstSize; - - if (len+1 > dstSize) - { - dst = realloc_or_die ("expand_nickname", dst, len+1); - dstSize = len+1; - } - - // create the header - - s = src; - d = dst; - - expand = strstr (src, "{number}"); - if (expand > src) - { - strncpy (d, s, expand-src); - d += expand-src; - s = expand + strlen("{number}"); - } - - sprintf (d, unsposFmt, contigNumber); - d += strlen(d); - - strcpy (d, s); - - *_dst = dst; - *_dstSize = dstSize; - } - -//---------- -// -// separate_sequence-- -// Separate each of a sequence's subsequences, spliting them into pieces -// wherever a specified character occurs. -// -// We require that the sequence is already partitioned (though it may have only -// one subsequence), and we introduce additional partition blocks whenever a -// subsequence is split into pieces. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to separate. -// char sepCh: The character at which to separate the sequence. -// -// Returns: -// nothing -// -//---------- -// -// Notes: -// -// (1) The term 'separator' has two similar meanings in this routine. The -// NUL characters between partitions are called separators, as are the -// user-defined separator characters in the incoming sequence. One of the -// results of this routine is to turn those latter characters into the -// former. 
-// -//---------- - -static void separate_sequence - (seq* _seq, - char _sepCh) - { - seqpartition* sp = &_seq->partition; - u8 sepCh = (u8) _sepCh; - u32 extraPieces; - int inSepRun; - u8 ch, chBefore, chAfter; - u32 newSpLen; - partition* p, *pFrom, *pTo; - u32 fromIx, toIx; - unspos sepPrefix, sepSuffix, sepBefore, sepAfter; - unspos startLoc; - u8* scan; - - if (sp->p == NULL) - suicidef ("internal error in separate_sequence\n" - "sequence has no partition table"); - if (sp->state == seqpart_empty) - suicidef ("internal error in separate_sequence\n" - "partition table is in empty state"); - if (sp->state == seqpart_reusable) - suicidef ("internal error in separate_sequence\n" - "partition table is in re-usable state"); - if (sp->state == seqpart_loading) - suicidef ("internal error in separate_sequence\n" - "partition table is in loading state"); - - debugSeparation_1; - - // scan the sequence, looking for runs of separators, to see how many - // additional partitions we'll need; in rough terms, we will need an - // extra partition whenever we see a transition from nuc to sep or sep to - // nuc; however, if the sep has a NUL on the other end (marking the start - // or end of the current partition), then we will not need an extra - // partition - // - // examples (0 is NUL, X is a separator, and n is a nucleotide): - // 0XXXXXnnnnnnnnnnXXXXX0 no extras needed - // 0XXXXXnnnnnnnnnnXXXXXXnnnnnnnnnnXXXXX0 one extra needed - - extraPieces = 0; - inSepRun = false; - chBefore = 0; - for (scan=_seq->v ; scan<_seq->v+_seq->len ; scan++) - { - ch = *scan; - if (ch == 0) - inSepRun = false; - else if (ch == sepCh) - { if ((chBefore != 0) && (chBefore != sepCh)) inSepRun = true; } - else - { - if (inSepRun) - { - extraPieces++; - debugSeparation_2; - } - inSepRun = false; - } - chBefore = ch; - } - - debugSeparation_3; - - // allocate the extra partitions; note that we won't need any additional - // space in the header pool, since the new partitions will share existing - 
// names - - newSpLen = sp->len + extraPieces; - if ((newSpLen < sp->len) || (newSpLen == u32max)) - suicidef ("in separate_sequence, " - "number of partitions overflows internal data type"); - - if (extraPieces != 0) - enough_partitions (_seq, newSpLen, sp->poolLen, - /*anticipate*/ false, /*round up*/ true); - - // scan the current partitions from last to first, dowing the following: - // (1) expanding any partition that contains multiple pieces - // (2) adjusting partition bounds whenever a separator exists at an end - // (3) replacing all separators with NUL characters - - p = sp->p; - fromIx = sp->len; pFrom = &p[fromIx]; - toIx = newSpLen; pTo = &p[toIx]; - pTo->sepBefore = pFrom->sepBefore; // copy sentinel - debugSeparation_4; - - sepPrefix = pFrom->sepBefore; - while (fromIx-- > 0) - { - pFrom--; - sepSuffix = sepPrefix; // (separator at end of this partition) - sepPrefix = pFrom->sepBefore; // (separator at start of this partition) - - debugSeparation_5; - - sepAfter = 0; - chAfter = 0; - for (scan=_seq->v+sepSuffix-1 ; scan>_seq->v+sepPrefix ; scan--) - { - ch = *scan; - //debugSeparation_6; - if (ch == 0) - { - suicidef ("internal error in separate_sequence\n" - "seq->v[" unsposFmt "]=0x00", (unspos) (scan-_seq->v)); - } - else if (ch == sepCh) - { - *scan = 0; // replace separator with NUL - if ((chAfter != 0) && (chAfter != sepCh) && (sepAfter != 0)) - { - toIx--; pTo--; - - sepBefore = scan - _seq->v; - startLoc = pFrom->startLoc + (sepBefore - pFrom->sepBefore); - - pTo->sepBefore = sepBefore; - pTo->sepAfter = sepAfter; - pTo->contig = pFrom->contig; - pTo->startLoc = startLoc; - pTo->trueLen = pFrom->trueLen; - pTo->header = pFrom->header; - debugSeparation_8; - - sepAfter = 0; - } - } - else if (sepAfter == 0) - { - sepAfter = (scan - _seq->v) + 1; - debugSeparation_7; - } - chAfter = ch; - } - - if (sepAfter != 0) - { - toIx--; pTo--; - - sepBefore = scan - _seq->v; - startLoc = pFrom->startLoc + (sepBefore - pFrom->sepBefore); - - 
pTo->sepBefore = sepBefore; - pTo->sepAfter = sepAfter; - pTo->contig = pFrom->contig; - pTo->startLoc = startLoc; - pTo->trueLen = pFrom->trueLen; - pTo->header = pFrom->header; - debugSeparation_8; - } - } - debugSeparation_9; - - if (toIx != 0) - suicidef ("internal error in separate_sequence\n" - "toIx=%d", toIx); - - sp->len = newSpLen; - debugSeparation_10; - } - -//---------- -// -// add_partition-- -// Add a partition block for a sequence. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to add the partition to. -// unspos sepPos: The position of the partition's prefix separator; see -// .. note (1) below. -// unspos startLoc: The partition's startLoc field. -// unspos trueLen: The partition's trueLen field. -// -// Returns: -// nothing -// -//---------- -// -// Notes: -// -// (1) We use sepPos as this partition's prefix separator (sepBefore) *and* as -// the previous partition's suffix separator (sepAfter). -// -// (2) We initially write a zero to this partition's sepAfter. It is replaced -// with the correct on the next call (the call to add the next partition). -// However, this leaves the final partition's sepAfter field unset. So the -// caller *must* set it. -// -//---------- - -static void add_partition - (seq* _seq, - unspos sepPos, - unspos startLoc, - unspos trueLen) - { - seqpartition* sp = &_seq->partition; - partition* p; - char* header; - int headerLen; - - debugTextFile_2; - - header = (_seq->useFullNames)? 
_seq->header : _seq->shortHeader; - headerLen = strlen(header); - - enough_partitions (_seq, sp->len+1, sp->poolLen+headerLen+1, - /*anticipate*/ true, /*round up*/ true); - - if (sp->len > 0) - { - p = &sp->p[sp->len-1]; - p->sepAfter = sepPos; - } - - p = &sp->p[sp->len]; - p->sepBefore = sepPos; - p->sepAfter = 0; // (see note 2) - p->contig = _seq->contig; - p->startLoc = startLoc; - p->trueLen = trueLen; - p->header = sp->poolLen; - strcpy (/*to*/ &sp->pool[p->header], /*from*/ header); - sp->poolLen += headerLen+1; - sp->len++; - } - -//---------- -// -// copy_partitions-- -// Make a copy of a sequence's parititions. -// -//---------- -// -// Arguments: -// seq* seqTo: The sequence to copy partition info to. -// seq* seqFrom: The sequence to copy partition info from. -// -// Returns: -// nothing; failures result in fatality. -// -//---------- - -static void copy_partitions - (seq* seqTo, - seq* seqFrom) - { - seqpartition* spFrom = &seqFrom->partition; - seqpartition* spTo = &seqTo->partition; - u32 len, poolLen; - u32 ix; - - // make sure we have space for the partitions - - poolLen = spFrom->poolLen; - spTo->poolOwner = true; - enough_partitions (seqTo, spFrom->len, poolLen, - /*anticipate*/ false, /*round up*/ false); - - // copy the pool directly - - spTo->poolLen = poolLen; - memcpy (spTo->pool, spFrom->pool, poolLen); - - // copy the partition records - - len = spTo->len = spFrom->len; - for (ix=0 ; ix<=len ; ix++) - spTo->p[ix] = spFrom->p[ix]; - } - -//---------- -// -// enough_partitions-- -// Make sure a sequence has enough room for a specified number of partitions. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence to check. -// u32 numPartitions: The number of partitions to allocate for (not -// .. including the extra partition used as a -// .. sentinel). -// u32 poolSize: The number of bytes to allocate for a pool of -// .. headers. If this is zero, we will estimate the -// .. pool size from the number of partitions. 
-// int anticipate: true => allocate extra, anticipating the need for -// .. more -// false => don't -// int roundUp: true => round up the allocation size to some -// .. convenient size -// false => don't -// -// Returns: -// nothing; _seq->partition.p and _seq->partition.pool may be modified; -// failures result in fatality. -// -//---------- - -#define averageHeaderSize 20 - -static void enough_partitions - (seq* _seq, - u32 numPartitions, - u32 poolSize, - int anticipate, - int roundUp) - { - seqpartition* sp = &_seq->partition; - u32 bytesNeeded; - - // if we have enough already, just return - - if ((sp->p != NULL) - && (sp->size > numPartitions) - && (sp->pool != NULL) - && ((poolSize > 0) && (sp->poolSize >= poolSize))) - return; - - if (!sp->poolOwner) - { - char* name = (_seq->filename != NULL)? _seq->filename - : _seq->header; - suicidef ("internal error, attempt to resize external partition names pool (%s)", - name); - } - - if (sp->p == NULL) sp->len = 0; - if (sp->pool == NULL) sp->poolLen = 0; - - if (poolSize == 0) poolSize = numPartitions * (averageHeaderSize + 1); - - // allocate partition array; note that we bump up the number of records - // allocated to as many as can fit in a multiple of 16K - - if (sp->size <= numPartitions) - { - numPartitions++; // (extra one for a sentinel) - if (anticipate) // anticipatory, grow by about 13% - numPartitions += 30 + numPartitions / 8; - bytesNeeded = numPartitions * sizeof(partition); - - if (roundUp) - { - bytesNeeded = round_up_16K (bytesNeeded); - numPartitions = bytesNeeded / sizeof(partition); - bytesNeeded = numPartitions * sizeof(partition); - } - - sp->p = realloc_or_die ("enough_partitions (p)", sp->p, bytesNeeded); - sp->size = numPartitions; - } - - // allocate pool for partition headers - - if (sp->poolSize < poolSize) - { - if (anticipate) // anticipatory, grow by about 13% - poolSize += 30*(averageHeaderSize+1) + poolSize / 8; - bytesNeeded = round_up_16K (poolSize); - sp->pool = realloc_or_die 
("enough_partitions (pool)", - sp->pool, bytesNeeded); - sp->poolSize = bytesNeeded; - sp->poolOwner = true; - } - - } - -//---------- -// -// lookup_partition, lookup_partition_no_die-- -// Map a position in a partitioned sequence to its partition record. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// unspos pos: The position to look up. This is an index into _seq->v, -// .. (origin zero). -// -// Returns: -// A pointer to the partition record (see note (1)); lookup_partition_no_die -// returns NULL on failure; for lookup_partition, failures result in fatality. -// -//---------- -// -// Notes: -// -// (1) The pointer p returned points to the partition record which contains the -// contig, header, sepBefore and sepAfter for the partition. sepBefore is -// the bounding NUL at the left end (lower index) of the sequence, and -// sepAfter is the bounding NUL at the right end (higher index) of the -// sequence. -// -// (2) It has been suggested that there are cases for which caching the latest -// lookup is beneficial. Code to do this can be enabled by #defining -// cache_partition_lookups. However, in the author's tests the cached -// lookup was rarely hit. 
-// -//---------- - -static partition* lookup_partition_core (seq* _seq, unspos pos, int dieOnFailure); - -partition* lookup_partition_no_die (seq* _seq, unspos pos) - { return lookup_partition_core (_seq, pos, /* dieOnFailure */ false); } - -partition* lookup_partition (seq* _seq, unspos pos) - { return lookup_partition_core (_seq, pos, /* dieOnFailure */ true); } - - -partition* lookup_partition_core - (seq* _seq, - unspos pos, - int dieOnFailure) - { -#ifdef cache_partition_lookups // see note (2) - static seq* cachedSeq = NULL; - static unspos cachedPos = ((unspos) -1); - static partition* cachedResult = NULL; -#endif // cache_partition_lookups - seqpartition* sp = &_seq->partition; - partition* p; - u32 hi, lo, ix; - char* reason; - - if (sp->p == NULL) goto no_partitions; - if (sp->len == 0) goto no_partitions; - - sequence_count_stat (partitionLookups); - -#ifdef cache_partition_lookups - if ((_seq == cachedSeq) && (pos == cachedPos)) - { - sequence_count_stat (partitionHits); - return cachedResult; - } -#endif // cache_partition_lookups - - p = sp->p; - - lo = 0; - hi = sp->len; - - debugPartitions_2; - - if (pos <= p[lo].sepBefore) goto before_first; - if (pos >= p[hi].sepBefore) goto after_last; - - // perform binary search for the position - // - // loop invariants: loop termination: - // 0 <= lo < hi <= len ix = lo = hi-1 - // pos > p[lo].sepBefore pos > p[ix].sepBefore - // pos < p[hi].sepBefore pos < p[ix+1].sepBefore - - while (true) - { - sequence_count_stat (lookupIterations); - ix = (lo + hi) / 2; // when hi==lo+1, ix==lo - debugPartitions_3; - if (hi == lo+1) break; - if (pos < p[ix].sepBefore) hi = ix; - else if (pos > p[ix].sepBefore) lo = ix; - else goto on_separator; // pos == p[ix].sepBefore, which is illegal - } - - // make sure the position was within the actual partition - // nota bene: we use ">" rather than ">=" to allow the caller to position - // on the open end of the sequence - - if (pos > p[ix].sepAfter) goto not_in_partition; - - 
// success - - debugPartitions_4; - -#ifdef cache_partition_lookups - cachedSeq = _seq; - cachedPos = pos; - cachedResult = &sp->p[ix]; -#endif // cache_partition_lookups - - return &sp->p[ix]; - - // failure - -no_partitions: - reason = "there are no partitions"; - goto failure; - -before_first: - reason = "before first partition"; - goto failure; - -after_last: - reason = "after last partition"; - goto failure; - -on_separator: - reason = "on partition separator prefix"; - goto failure; - -not_in_partition: - reason = "not within partition"; - goto failure; - -failure: - if (!dieOnFailure) return NULL; - if (_seq->filename == NULL) - suicidef ("lookup_partition could not locate position " unsposFmt "\n%s", - pos, reason); - else - suicidef ("lookup_partition could not locate position " unsposFmt " in %s\n%s", - pos, _seq->filename, reason); - - - return NULL; // (never gets here) - } - -//---------- -// -// lookup_named_partition-- -// Map a name to the corresponding partition record in a partitioned sequence. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// name name: The name to look up. -// -// Returns: -// A pointer to the partition record (see note 1); NULL if not found. -// -//---------- -// -// Notes: -// -// (1) The parition record returned is the first with the given name. If -// there are other partitions with the same name, lookup_partition_seq_pos -// should then be used to determine the partition containing that position. -// Or, last_partition_with_name can then be used to locate the last -// partition with that same given name. 
-// -//---------- - -partition* lookup_named_partition - (seq* _seq, - char* name) - { - seqpartition* sp = &_seq->partition; - partition* part; - u32 ix; - int found; - - if (sp->p == NULL) return NULL; - if (sp->len == 0) return NULL; - - // perform linear search for the name - - found = false; - for (ix=0 ; ixlen ; ix++) - { - part = &sp->p[ix]; - if (strcmp (&sp->pool[part->header], name) == 0) - { found = true; break; } - } - if (!found) return NULL; - - return part; - } - -//---------- -// -// last_partition_with_name-- -// Find the last partition record, in a partitioned sequence, that has the -// same name as a given partition record. -// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// partition* part: The first partition with the desired name. -// -// Returns: -// A pointer to the partition record. -// -//---------- - -partition* last_partition_with_name - (seq* _seq, - partition* firstPart) - { - seqpartition* sp = &_seq->partition; - partition* scanPart, *prevPart; - u32 ix; - char* name; - - if (sp->p == NULL) return NULL; - if (sp->len == 0) return NULL; - - // determine the index into the partition list - // $$$ perhaps to be safe we should use integer arithmetic to make sure - // .. firstPart - sp->p is a multiple of sizeof(partition) - - ix = firstPart - sp->p; - if (ix >= sp->len) - suicidef ("internal error in lookup_named_partition\n" - "invalid partition pointer"); - - // get the name of this partition - - name = &sp->pool[firstPart->header]; - - // scan forward along the partition list until we find a different name - - prevPart = firstPart; - for (ix=ix+1 ; ixlen ; ix++) - { - scanPart = &sp->p[ix]; - if (strcmp (&sp->pool[scanPart->header], name) != 0) break; - prevPart = scanPart; - } - - return prevPart; - } - -//---------- -// -// lookup_partition_seq_pos-- -// Given the first partition for a given name, map a sequence position to the -// corresponding partition record. 
-// -//---------- -// -// Arguments: -// seq* _seq: The sequence. -// partition* part: The first partition for a given sequence (by name). -// unspos pos: The position (within the sequence) to look for. This -// .. is origin-one. -// -// Returns: -// A pointer to the partition record; NULL if not found. -// -//---------- - -partition* lookup_partition_seq_pos - (seq* _seq, - partition* _part, - unspos pos) - { - seqpartition* sp = &_seq->partition; - partition* part = _part; - char* name; - unspos endLoc; - u32 ix; - int found; - - if (sp->p == NULL) return NULL; - if (sp->len == 0) return NULL; - if (_part == NULL) return NULL; - - name = &sp->pool[part->header]; - ix = part - sp->p; - - found = false; - while (true) - { - if (pos >= part->startLoc) - { - endLoc = part->startLoc + part->sepAfter - part->sepBefore; - if (pos < endLoc) - { found = (pos < endLoc-1); break; } - } - if (++ix >= sp->len) break; // (not found) - part = &sp->p[ix]; - if (strcmp (&sp->pool[part->header], name) != 0) break; // (not found) - } - if (!found) return NULL; - - return part; - } - -//---------- -// -// print_sequence-- -// Print a sequence to a file. -// -//---------- -// -// Arguments: -// FILE* f: The file to write to. -// seq* seq: The sequence to print. -// char* header: The header to give the sequence. If this is NULL, -// .. _seq->header is used. If this is empty, we assume -// .. the caller has already taken care of printing the -// .. header. -// int perLine: The number of nucleotides to print per line. Zero -// .. indicates that they should all be printed on one -// .. line. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_sequence - (FILE* f, - seq* _seq, - char* header, - int perLine) - { - unspos ix; - - if (_seq == NULL) - { fprintf (f, "(null sequence)\n"); return; } - - if ((header == NULL) || (header[0] != 0)) - { - if (header == NULL) - { - header = _seq->header; - if (header == NULL) - header = ""; - } - - if (header[0] == '>') - header = skip_whitespace(header+1); - - if (header[0] == 0) fprintf (f, ">\n"); - else fprintf (f, "> %s\n", header); - } - - if (_seq->fileType == seq_type_qdna) - { - for (ix=0 ; ix<_seq->len ; ix++) - { - if ((ix != 0) && ((ix % perLine) == 0)) fprintf (f, "\n"); - fprintf (f, " %02X", _seq->v[ix]); - } - fprintf (f, "\n"); - } - else - { - for (ix=0 ; ix<_seq->len ; ix++) - { - if ((ix != 0) && ((ix % perLine) == 0)) fprintf (f, "\n"); - if (_seq->v[ix] == 0) fprintf (f, "*"); - else fprintf (f, "%c", _seq->v[ix]); - } - fprintf (f, "\n"); - } - } - -//---------- -// -// print_partition_table-- -// Print a sequence's partition table. (for debugging only) -// -//---------- -// -// Arguments: -// FILE* f: The file to write to. -// seq* seq: The sequence to print. 
-// -// Returns: -// (nothing) -// -//---------- - -void print_partition_table - (FILE* f, - seq* _seq) - { - seqpartition* sp = &_seq->partition; - partition* p; - u32 ix; - - if (sp->p == NULL) - { fprintf (f, "sequence has no partition\n"); return; } - if (sp->state == seqpart_empty) - { fprintf (f, "partition table is in empty state\n"); return; } - if (sp->state == seqpart_reusable) - { fprintf (f, "partition table is in re-usable state\n"); return; } - if (sp->state == seqpart_loading) - { fprintf (f, "partition table is in loading state\n"); return; } - - fprintf (f, " %8s %8s %8s %8s %6s %s\n", - "sepBefore", "sepAfter", "startLoc", "trueLen", "contig", "header"); - - p = sp->p; - for (ix=0 ; ix<=sp->len ; ix++) - { - fprintf (f, "[%2d] %8u %8u " unsposStarFmt " " unsposStarFmt, - ix, p[ix].sepBefore, p[ix].sepAfter, - 8, p[ix].startLoc, - 8, p[ix].trueLen); - if (ix < sp->len) - fprintf (f, " %6d %s", p[ix].contig, &sp->pool[p[ix].header]); - fprintf (f, "\n"); - } - - } - -//---------- -// -// mask_sequence, mask_sequence_keep-- -// Mask a sequence, in place, as prescribed by some file. -// -// mask_sequence() replaces any base in the prescribed intervals. -// mask_sequence_keep() replaces any base NOT in the prescribed intervals. -// -// A typical masking file looks like this: -// -// 1527933 3184039 -// 4165389 6877343 -// 7374477 7902860 -// -// Each line describes a region to be masked. Indexes are one-based, and -// inclusive on both ends. Numbers are free format. Comment lines (beginning -// with #) are ignored, as are blank lines. Additional information after the -// first two columns is also ignored. -// -// Note that if the sequence has been reverse complemented, the masking -// intervals are relative to the reverse strand. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to mask. -// char* maskFilename: The name of the file containing masking -// .. information. -// int maskChar: The character to ask as a mask. 
Normally this is a -// .. character (e.g. 'X' or 'N'). However, the value -// .. -1 means that we should mask by changing to -// .. lowercase. -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -void mask_sequence - (seq* _seq, - char* maskFilename, - int _maskChar) - { - char line[511+1], discard[511+1]; - char maskChar = 0; - int toLower = false; - FILE* maskF; - int len; - int lineNum, missingEol; - char* waffle; - char extra; - int numItems; - seqpartition* sp = &_seq->partition; - partition* part; - unspos b, e, pB, pE, pOffset, pLen; - u32 ix; - - if (_maskChar == -1) toLower = true; - else maskChar = (u8) _maskChar; - - // read the masking intervals and deposit the masking character at every - // contained base - - maskF = fopen_or_die (maskFilename, "rt"); - - lineNum = 0; - while (fgets (line, sizeof(line), maskF) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way); - // if the line was split we simply read ahead until we find the end of - // the line (and discard the extra) - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - if (missingEol) - { - while (fgets (discard, sizeof(discard), maskF) != NULL) - { - len = strlen(discard); - if (len == 0) break; - if (discard[len-1] == '\n') break; - } - } - - // trim blanks, end of line, and comments, and ignore blank lines - - len = strlen(line); - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // ok, the line has something in it; parse it as an interval - - numItems = sscanf (line, unsposFmtScanf " " unsposFmtScanf "%c", - &b, &e, &extra); - - if ((numItems == 3) && ((extra == ' ') || (extra == '\t'))) - numItems = 2; - - if (numItems != 2) - { - char* scan = skip_whitespace (line); - numItems = 0; - if 
(*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - - if (numItems == 3) - suicidef ("bad interval (in %s, line %d): \"%s\"\n" - "three-column masking intervals are not supported for this operation", - maskFilename, lineNum, line); - else - suicidef ("bad interval (in %s, line %d): \"%s\"", - maskFilename, lineNum, line); - } - - // trim the left end of the interval to our subsequence - - if (e < _seq->startLoc) continue; - if (b < _seq->startLoc) b = _seq->startLoc; - b -= _seq->startLoc; // (nota bene, b is zero-based afterwards) - e -= _seq->startLoc-1; // (nota bene, e is open interval afterwards) - - if (sp->p == NULL) //=== sequence is not partitioned === - { - // trim the right end of the interval to our subsequence - - if (b >= _seq->len) continue; - if (e >= _seq->len) e = _seq->len; - if (e <= b) continue; - - // mask the interval - - if (toLower) - { - for ( ; bv[b] >= 'A') && (_seq->v[b] <= 'Z')) - _seq->v[b] = _seq->v[b] + 'a' - 'A'; - } - } - else - memset (_seq->v+b, maskChar, (size_t) (e-b)); - } - else //=== sequence is partitioned === - { - for (ix=0 ; ixlen ; ix++) - { - part = &sp->p[ix]; - pOffset = part->sepBefore + 1; - pLen = (part+1)->sepBefore - pOffset; - - // trim the right end of the interval to our subsequence - - pB = b; - pE = e; - - if (pB >= pLen) continue; - if (pE >= pLen) pE = pLen; - if (pE <= pB) continue; - - // mask the interval - - pB += pOffset; - pE += pOffset; - - if (toLower) - { - for ( ; pBv[pB] >= 'A') && (_seq->v[pB] <= 'Z')) - _seq->v[pB] = _seq->v[pB] + 'a' - 'A'; - } - } - else - memset (_seq->v+pB, maskChar, (size_t) (pE-pB)); - } - } - } - - fclose(maskF); - } - -void mask_sequence_keep - 
(seq* _seq, - char* maskFilename, - int _maskChar) - { - char line[511+1], discard[511+1]; - char maskChar = 0; - int toLower = false; - FILE* maskF; - int len; - int lineNum, missingEol; - char* waffle; - char extra; - int numItems; - seqpartition* sp = &_seq->partition; - partition* part; - unspos b, e, pB, pE, pOffset, pLen; - u32 ix; - - if ((_seq->fileType != seq_type_fasta) - && (_seq->fileType != seq_type_fastq) - && (_seq->fileType != seq_type_nib) - && (_seq->fileType != seq_type_2bit) - && (_seq->fileType != seq_type_hsx)) - suicidef ("masking of interval complements only valid for DNA sequences\n" - " (%s is %s file)", - sequence_filename(_seq), seqTypeNames[_seq->fileType]); - - if (_maskChar == -1) toLower = true; - else maskChar = (u8) _maskChar; - - // read the masking intervals and mark the most-significant bit of every - // contained base - - maskF = fopen_or_die (maskFilename, "rt"); - - lineNum = 0; - while (fgets (line, sizeof(line), maskF) != NULL) - { - lineNum++; - - // check for lines getting split by fgets (the final line in the file - // might not have a newline, but no internal lines can be that way); - // if the line was split we simply read ahead until we find the end of - // the line (and discard the extra) - - len = strlen(line); - if (len == 0) continue; - missingEol = (line[len-1] != '\n'); - - if (missingEol) - { - while (fgets (discard, sizeof(discard), maskF) != NULL) - { - len = strlen(discard); - if (len == 0) break; - if (discard[len-1] == '\n') break; - } - } - - // trim blanks, end of line, and comments, and ignore blank lines - - len = strlen(line); - if (line[len-1] == '\n') line[--len] = 0; - - waffle = strchr (line, '#'); - if (waffle != NULL) *waffle = 0; - - trim_string (line); - if (line[0] == 0) continue; - - // ok, the line has something in it; parse it as an interval - - numItems = sscanf (line, unsposFmtScanf " " unsposFmtScanf "%c", - &b, &e, &extra); - - if ((numItems == 3) && ((extra == ' ') || (extra == 
'\t'))) - numItems = 2; - - if (numItems != 2) - { - char* scan = skip_whitespace (line); - numItems = 0; - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - if (*scan != 0) { scan = skip_darkspace (scan); scan = skip_whitespace (scan); numItems++; } - - if (numItems == 3) - suicidef ("bad interval (in %s, line %d): \"%s\"\n" - "three-column masking intervals are not supported for this operation", - maskFilename, lineNum, line); - else - suicidef ("bad interval (in %s, line %d): \"%s\"", - maskFilename, lineNum, line); - } - - // trim the left end of the interval to our subsequence - - if (e < _seq->startLoc) continue; - if (b < _seq->startLoc) b = _seq->startLoc; - b -= _seq->startLoc; // (nota bene, b is zero-based afterwards) - e -= _seq->startLoc-1; // (nota bene, e is open interval afterwards) - - if (sp->p == NULL) //=== sequence is not partitioned === - { - // trim the right end of the interval to our subsequence - - if (b >= _seq->len) continue; - if (e >= _seq->len) e = _seq->len; - - // mark the interval - - while (bv[b++] |= 0x80; - } - else //=== sequence is partitioned === - { - for (ix=0 ; ixlen ; ix++) - { - part = &sp->p[ix]; - pOffset = part->sepBefore + 1; - pLen = (part+1)->sepBefore - pOffset; - - // trim the right end of the interval to our subsequence - - pB = b; - pE = e; - - if (pB >= pLen) continue; - if (pE >= pLen) pE = pLen; - - // mark the interval - - pB += pOffset; - pE += pOffset; - - while (pBv[pB++] |= 0x80; - } - } - - } - - fclose(maskF); - - // scan the sequence, replacing unmarked bases with the masking character, - // and removing the marks - - for (b=0 ; b<_seq->len ; b++) - { - if (_seq->v[b] == 0) continue; - if ((_seq->v[b] & 0x80) != 0) // marked => erase mark - _seq->v[b] &= ~0x80; - else 
if (!toLower) // unmarked => mask it - _seq->v[b] = maskChar; - else // unmarked => change to lower case - { - if ((_seq->v[b] >= 'A') && (_seq->v[b] <= 'Z')) - _seq->v[b] = _seq->v[b] + 'a' - 'A'; - } - } - - } - -//---------- -// -// colorize_sequence-- -// Create a sequence's color-space equivalent (see description below). -// -//---------- -// -// Arguments: -// seq* seq: The sequence to modify. -// -// Returns: -// (nothing) -// -//---------- -// -// We colorize by mapping each pair of nucleotides to a single "color" value -// according to the table below. This extends the normal color space -// definition to account for lower case, N, and other characters, in a way that -// makes sense in the context of lastz's alignment operations. The DNA sequence -// may contain upper and lower case nucleotides, N's, and other stuff. Further, -// a partitioned sequence will contain NUL character separators (shown as '*' in -// the table below). We also treat the DNA sequence as if it had an 'X' -// prepended to it. 
-// -// second in pair -// | A | C | G | T | a | c | g | t | N/n | other | * | -// ------+---+---+---+---+---+---+---+---+-----+-------+---+ -// A | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | N | X | * | -// C | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | N | X | * | -// G | 2 | 3 | 0 | 1 | 2 | 3 | 0 | 1 | N | X | * | -// first T | 3 | 1 | 2 | 0 | 3 | 1 | 2 | 0 | N | X | * | -// in a | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | N | X | * | -// pair c | 1 | 0 | 3 | 2 | 5 | 4 | 7 | 6 | N | X | * | -// g | 2 | 3 | 0 | 1 | 6 | 7 | 4 | 5 | N | X | * | -// t | 3 | 1 | 2 | 0 | 7 | 6 | 5 | 4 | N | X | * | -// N/n | N | N | N | N | N | N | N | N | N | X | * | -// ------+---+---+---+---+---+---+---+---+-----+-------+---+ -// other | X | X | X | X | X | X | X | X | X | X | * | -// ------+---+---+---+---+---+---+---+---+-----+-------+---+ -// * | X | X | X | X | X | X | X | X | X | X | * | -// ------+---+---+---+---+---+---+---+---+-----+-------+---+ -// -// Example: -// -// dna: G T C G A A C C C G * C A A C C G T A T T * T A A T A A G T T T -// color: X 1 2 3 2 0 1 0 0 3 * X 1 0 1 0 3 1 3 3 0 * X 3 0 3 3 0 2 1 0 0 -// -//---------- - -static void do_colorize (u8* seq, u8* colorSeq, unspos seqLen); - -void colorize_sequence - (seq* _seq) - { - char* name; - - if (_seq == NULL) suicide ("colorize_sequence(NULL)"); - if (_seq->len < 1) return; - - if (!_seq->vcOwner) - { - name = (_seq->filename != NULL)? _seq->filename : _seq->header; - suicidef ("internal error, attempt to colorize external sequence (%s)", - name); - } - - if (_seq->vc != NULL) - { - name = (_seq->filename != NULL)? 
_seq->filename : _seq->header; - suicidef ("internal error, attempt to re-colorize sequence (%s)", - name); - } - - _seq->vc = malloc_or_die ("colorize_sequence (vc)", _seq->len+1); - do_colorize (_seq->v, _seq->vc, _seq->len); - } - -#define A_ 0 -#define C_ 1 -#define G_ 2 -#define T_ 3 -#define a_ 4 -#define c_ 5 -#define g_ 6 -#define t_ 7 -#define N_ 8 -#define X_ 9 -#define __ X_ -#define Z_ 10 - -const s8 nuc_to_color_bits[256] = - { - Z_,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 0x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 1x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 2x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 3x (numbers) - __,A_,__,C_,__,__,__,G_,__,__,__,__,__,__,N_,__, // 4x (upper case) - __,__,__,__,T_,__,__,__,__,__,__,__,__,__,__,__, // 5x (upper case) - __,a_,__,c_,__,__,__,g_,__,__,__,__,__,__,N_,__, // 6x (lower case) - __,__,__,__,t_,__,__,__,__,__,__,__,__,__,__,__, // 7x (lower case) - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 8x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // 9x - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ax - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Bx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Cx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Dx - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, // Ex - __,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__ // Fx - }; - -static void do_colorize - (u8* dnaSeq, - u8* colorSeq, - unspos seqLen) - { - u8 bits1, bits2; - - bits1 = nuc_to_color_bits[X_]; - - while (seqLen-- > 0) - { - bits2 = nuc_to_color_bits[*(dnaSeq++)]; - - if (bits1 == Z_) - *(colorSeq++) = 0; - else if ((bits1 == X_) || (bits2 == X_)) - *(colorSeq++) = 'X'; - else if ((bits1 == N_) || (bits2 == N_)) - *(colorSeq++) = 'N'; - else if ((bits1 <= T_) || (bits2 <= T_)) - *(colorSeq++) = '0' + ((bits1 ^ bits2) & 3); - else - *(colorSeq++) = '4' + ((bits1 ^ bits2) & 3); - - bits1 = bits2; - } - - *colorSeq = 0; - } - 
-#undef A_ -#undef C_ -#undef G_ -#undef T_ -#undef a_ -#undef c_ -#undef g_ -#undef t_ -#undef N_ -#undef X_ -#undef __ -#undef Z_ - -//---------- -// -// validate_rev_comp-- -// Validate that a sequence is in the same direction as it would normally be -// after operations were performed at initial load. If it is not, it is -// reversed and/or complemented as needed to bring it back to that state. -// -// v IS v should be ->> | F | C | R | RC | -// --------------------------+----+----+----+----+ -// forward (F) | - | C | R | RC | <<- correction -// complemented (C) | C | - | RC | R | -// reversed (R) | R | RC | - | C | -// reverse-complemented (RC) | RC | R | C | - | -// --------------------------+----+----+----+----+ -// -//---------- -// -// Arguments: -// seq* seq: The sequence to operate upon. -// -// Returns: -// (nothing) -// -//---------- - -void validate_rev_comp - (seq* _seq) - { - int operation = _seq->revCompFlags ^ _seq->doRevCompFlags; - - if (operation == rcf_revcomp) - rev_comp_sequence (_seq, _seq->qToComplement); - else if (operation == rcf_rev) - backward_sequence (_seq); - else if (operation == rcf_comp) - { - backward_sequence (_seq); - rev_comp_sequence (_seq, _seq->qToComplement); - } - } - -//---------- -// -// rev_comp_sequence-- -// Convert a sequence to its reverse-complement, in place. Note that in -// partitioned sequences, each partition is reverse-complemented separately -// (independent of the others), so that the partition table can remain -// unchanged. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to reverse-complement. -// const u8* nucToComplement: Array to map a nucleotide to its complement. -// .. If this is NULL, nuc_to_complement is -// .. used for DNA sequences. For quantum -// .. sequences this cannot be NULL. 
-// -// Returns: -// (nothing) -// -//---------- - -static void revcomp_in_place (u8* seq, unspos seqLen, const u8* nucToComplement); -static void reverse_in_place (u8* seq, unspos seqLen); - - -void rev_comp_sequence - (seq* _seq, - const u8* _nucToComplement) - { - seqpartition* sp = &_seq->partition; - partition* p; - u32 ix; - const u8* nucToComplement; - - if (_seq == NULL) suicide ("rev_comp_sequence(NULL)"); - if (_seq->len < 1) return; - - if ((_seq->fileType == seq_type_qdna) - && (_nucToComplement == NULL)) - { - suicidef ("quantum DNA cannot be complemented (%s)\n" - "(the score file lacks complements)", - sequence_filename(_seq)); - return; // (we never reach here) - } - - if (_nucToComplement != NULL) nucToComplement = _nucToComplement; - else nucToComplement = nuc_to_complement; - - if (sp->p == NULL) - { - revcomp_in_place (_seq->v, _seq->len, nucToComplement); - if (_seq->vq != NULL) - reverse_in_place (_seq->vq, _seq->len); - } - else - { - p = sp->p; - for (ix=0 ; ixlen ; ix++) - { - revcomp_in_place (/*start*/ _seq->v + p[ix].sepBefore+1, - /*length*/ p[ix].sepAfter - (p[ix].sepBefore+1), - /*how*/ nucToComplement); - if (_seq->vq != NULL) - reverse_in_place (/*start*/ _seq->vq + p[ix].sepBefore+1, - /*length*/ p[ix].sepAfter - (p[ix].sepBefore+1)); - } - } - - if (_seq->vc != NULL) - do_colorize (_seq->v, _seq->vc, _seq->len); - - _seq->revCompFlags = _seq->revCompFlags ^ rcf_revcomp; - } - - -static void revcomp_in_place - (u8* _seq, - unspos seqLen, - const u8* nucToComplement) - { - u8* left, *right; - u8 nuc; - - left = _seq; - right = left + seqLen-1; - for ( ; left<=right ; left++,right--) - { - nuc = nucToComplement[*left ]; - *left = nucToComplement[*right]; - *right = nuc; - } - } - - -static void reverse_in_place - (u8* _seq, - unspos seqLen) - { - u8* left, *right; - u8 nuc; - - left = _seq; - right = left + seqLen-1; - for ( ; left<=right ; left++,right--) - { nuc = *left; *left = *right; *right = nuc; } - } - -//---------- -// -// 
backward_sequence-- -// Convert a sequence to its reverse (without complement), in place. Note -// that in partitioned sequences, each partition is reversed separately. -// -// [[ see also copy_reverse_of_string and strncpy_reverse ]] -// -//---------- -// -// Arguments: -// seq* seq: The sequence to reverse. -// -// Returns: -// (nothing) -// -//---------- - -static void backward_in_place (u8* seq, unspos seqLen); - -void backward_sequence - (seq* _seq) - { - seqpartition* sp = &_seq->partition; - partition* p; - u32 ix; - - if (_seq->fileType == seq_type_csfasta) - suicidef ("color space cannot be reversed (%s)", sequence_filename(_seq)); - - if (_seq == NULL) suicide ("backward_sequence(NULL)"); - if (_seq->len < 1) return; - - if (sp->p == NULL) - backward_in_place (_seq->v, _seq->len); - else - { - p = sp->p; - for (ix=0 ; ixlen ; ix++) - backward_in_place (/*start*/ _seq->v + p[ix].sepBefore+1, - /*length*/ p[ix].sepAfter - (p[ix].sepBefore+1)); - } - - _seq->revCompFlags = _seq->revCompFlags ^ rcf_rev; - } - - -static void backward_in_place - (u8* _seq, - unspos seqLen) - { - u8* left, *right; - u8 nuc; - - left = _seq; - right = left + seqLen-1; - for ( ; left<=right ; left++,right--) - { nuc = *left; *left = *right; *right = nuc; } - } - -//---------- -// -// upper_sequence-- -// Convert a sequence to its upper-case equivalent, in place. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to modify. 
-// -// Returns: -// (nothing) -// -//---------- - -static void upper_in_place (u8* seq, unspos seqLen); - -void upper_sequence - (seq* _seq) - { - if (_seq == NULL) suicide ("upper_sequence(NULL)"); - if (_seq->len < 1) return; - - if (_seq->fileType == seq_type_csfasta) - suicidef ("color space cannot be upper-cased (%s)", sequence_filename(_seq)); - else if (_seq->fileType == seq_type_qdna) - suicidef ("quantum DNA cannot be upper-cased (%s)", sequence_filename(_seq)); - - upper_in_place (_seq->v, _seq->len); - } - -static void upper_in_place - (u8* _seq, - unspos seqLen) - { - u8* left, *right; - - left = _seq; - right = left + seqLen; - for ( ; left=r ; t--) - *t = *s++; - - r[len] = 0; - return r; - } - -//---------- -// -// strncpy_reverse-- -// Copy of a string, in reversed order. -// -// [[ see also backward_sequence and copy_reverse_of_string ]] -// -//---------- -// -// Arguments: -// char* d: The place to copy the string to. -// char* s: The string to copy. -// unspos len: Its length (not including the terminating zero). -// -// Returns: -// (nothing) -// The copy, newly allocated from the heap (including a terminating zero). -// Caller must eventually dispose of this with a call to free(). -// -//---------- - -void strncpy_reverse - (char* d, - char* s, - unspos len) - { - char* t; - - for (t=d+len-1 ; t>=d ; t--) - *t = *s++; - - d[len] = 0; - } - -//---------- -// -// fence_sequence_interval-- -// Place two markers in a sequence, bracketing the ends of an interval. -// -// Note: the markers can later be removed by calling unfence_sequence_interval. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to modify. -// interval interval: The interval to mark. We expect this to be origin- -// .. zero, half-open. We mark the positions s-1 -// .. and e. It is legal for the interval to indicate -// .. points beyond the sequence. -// u8 ch: The character to mark with. 
-// -// Returns: -// (nothing) -// -//---------- - -void fence_sequence_interval - (seq* _seq, - interval _interval, - u8 ch) - { - unspos s, e; - u8 sCh, eCh; - - if (_seq == NULL) suicide ("fence_sequence_interval(NULL)"); - - if ((_seq->hasLeftFence) || (_seq->hasRightFence)) - suicide ("INTERNAL ERROR-- sequence already has fences"); - - debugFencing_1 - - s = _interval.s; - if (s >= 1) - { - s--; - sCh = _seq->v[s]; - _seq->v[s] = ch; - _seq->hasLeftFence = true; - _seq->leftFencePos = s; - _seq->leftFenceCh = sCh; - debugFencing_2 - } - - e = _interval.e; - if (e <= _seq->len) - { - eCh = _seq->v[e]; - _seq->v[e] = ch; - _seq->hasRightFence = true; - _seq->rightFencePos = e; - _seq->rightFenceCh = eCh; - debugFencing_3 - } - - debugFencing_4 - } - -//---------- -// -// unfence_sequence_interval-- -// Remove the markers placed by fence_sequence_interval. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to modify. -// -// Returns: -// (nothing) -// -//---------- - -void unfence_sequence_interval - (seq* _seq) - { - if ((!_seq->hasLeftFence) && (!_seq->hasRightFence)) - suicide ("INTERNAL ERROR-- sequence has no fences to tear down"); - - debugFencing_5 - - if (_seq->hasLeftFence) - { - _seq->v[_seq->leftFencePos] = _seq->leftFenceCh; - _seq->hasLeftFence = false; - debugFencing_6 - } - - if (_seq->hasRightFence) - { - _seq->v[_seq->rightFencePos] = _seq->rightFenceCh; - _seq->hasRightFence = false; - debugFencing_7 - } - - debugFencing_8 - } - -//---------- -// -// parse_sequence_name-- -// Parse a sequence name -// -// The seqence name is the name of a file, plus some control options. The -// basic form is -// -// nickname::sequence_file/contig_name{mask_file}[actions]- -// -//---------- -// -// Arguments: -// const char* name: The name string to parse -// char** filename: Place to return the file name. -// char** nickname: Place to return the nickname, if any. 
-// char** contigOfInterest: Place to return the name of the particular -// contig-of-interest, if any. -// char** namesFilename: Place to return the contigs-of-interest -// .. file name, if any. -// char** choresFilename: Place to return the chores file name, if -// .. any. Note that, unlike the other -// .. filename arguments, this *must* be set -// .. to either NULL or a filename upon entry. -// int* subsampleK: Place to return K-of-N subsampling -// int* subsampleN: .. specification. -// char** softMaskFilename: Place to return the soft-mask file name, if -// .. any. -// int* softMaskComplement: Place to return true/false for soft-masking. -// char** xMaskFilename: Place to return the x-mask file name, if any. -// int* xMaskComplement: Place to return true/false for x-masking. -// char** nMaskFilename: Place to return the n-mask file name, if any. -// int* xMaskComplement: Place to return true/false for n-masking. -// int* nameParseType: Place to return the name parse type, if any. -// char** nameTrigger: Place to return the mask name trigger, if -// .. any. -// int* doRevCompFlags: Place to return rcf_forward/rcf_revcomp. -// .. Actually can return any combination of -// .. rcf_xxx flags. -// int* doUnmask: (see field in the sequence structure). -// int* doPartitioning: (see field in the sequence structure). -// int* doJoin: (see field in the sequence structure). -// char* separatorCh: Place to return a separator character if -// .. any is specified. -// int* useFullNames: (see field in the sequence structure). -// int* fileType: Place to return file type if any is -// .. specified (one of seq_type_xxx). -// int* isQuantum: Place to return true/false for whether the -// .. sequence is to be quantum DNA. -// char** qCodingFilename: Place to return the quantum coding file -// .. name, if any. -// unspos* start: Place to return the starting index (origin-1). -// .. If no start is specified, 0 is placed -// .. here. 
(see note 1 below) -// unspos* end: Place to return the ending index (inclusive). -// .. If no end is specified, 0 is placed here. -// .. (see note 1 below) -// int* endIsSoft: Place to return a flag telling the caller -// .. that, while the end has been specified, -// .. it is a soft specification and the caller -// .. can trim it to the actual end of the -// .. sequence. -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- -// -// Notes: -// -// (1) Start and end are origin-1, inclusive on both ends. So, for example, -// start=10 and end=15 defines a 6 letter sequence, the 10th thru 15th -// letters of the file. The first 9 characters from the file should be -// skipped. -// -//---------- - -void print_file_actions (FILE* f) - { - int typeIx, indent, nameLen, needComma, lineWidth; - char* action, *description, *name; - - fprintf (f, "Supported actions:\n"); - fprintf (f, " only process a subrange of the file (see below)\n"); - fprintf (f, " revcomp reverse complement\n"); - fprintf (f, " multiple file's sequences are internally treated as a single\n"); - fprintf (f, " sequence\n"); - fprintf (f, " separator= file's sequences are internally separated by the given\n"); - fprintf (f, " character; no alignments will cross a separator\n"); - fprintf (f, " (this forces multiple)\n"); - fprintf (f, " subset= process only the sequences listed in namesfile\n"); - fprintf (f, " (only valid for fasta, fastq, 2bit and hsx)\n"); - fprintf (f, " chores= process \"alignment chores\" listed in choresfile\n"); - fprintf (f, " (only valid for fasta, fastq, 2bit and hsx)\n"); - fprintf (f, " subsample=/ process only the kth sequence of every group of n\n"); - fprintf (f, " sequences. 
k ranges from 1 to n\n"); - fprintf (f, " (only valid for fasta, 2bit and hsx)\n"); - fprintf (f, " unmask convert any lowercase bases to uppercase\n"); - fprintf (f, " softmask= mask segments specified in , replacing them with\n"); - fprintf (f, " lowercase equivalents\n"); - fprintf (f, " softmask=keep: mask bases NOT in segments specified in , with Xs\n"); - fprintf (f, " xmask= mask segments specified in , replacing them with Xs\n"); - fprintf (f, " xmask=keep: mask bases NOT in segments specified in , with Xs\n"); - fprintf (f, " nmask= mask segments specified in , replacing them with Ns\n"); - fprintf (f, " nmask=keep: mask bases NOT in segments specified in , with Ns\n"); - fprintf (f, " nickname= name to use for this sequence in any output files\n"); - fprintf (f, " nameparse=full report full names in alignments instead of short names\n"); - fprintf (f, " nameparse=alphanum pull short name from sequence header, alphanumeric only\n"); - fprintf (f, " nameparse=darkspace pull short name from sequence header, non-whitespace only\n"); - fprintf (f, " nameparse=tag: pull a short name from sequence header, starting from\n"); - fprintf (f, " marker (only valid for fasta)\n"); - fprintf (f, " quantum the sequence contains quantum DNA\n"); - fprintf (f, " quantum= the sequence contains quantum DNA, and \n"); - fprintf (f, " describes the mapping from symbols to probabilities (only\n"); - fprintf (f, " meaningful for --format=text)\n"); - - action = " format= "; - description = "override auto-format detect; is one of "; - fprintf (f, "%s%s", action, description); - indent = strlen (action); - lineWidth = indent + strlen (description); - for (typeIx=seq_type_unknown+1 ; typeIx= 79) - { - fprintf (f, ",\n%*s", indent, " "); - lineWidth = indent; - needComma = false; - } - - if (needComma) fprintf (f, ", "); - fprintf (f, "%s", name); - if (needComma) lineWidth += 2; - lineWidth += nameLen; - } - fprintf (f, "\n\n"); - - fprintf (f, "Subranges:\n"); - fprintf (f, " 
start,end same as start..end (for BLASTZ compatibility)\n"); - fprintf (f, " start..end process from start thru end, inclusive\n"); - fprintf (f, " start.. process from given start thru the end of the sequence\n"); - fprintf (f, " ..end process from the start of the sequence thru given end\n"); - fprintf (f, " start#length same as start..start+length-1\n"); - fprintf (f, " center^length same as center-length/2..center+length/2-1\n"); - fprintf (f, " start..end+zoom%% process from start thru end, zoomed out by zoom%%\n"); - fprintf (f, " (subrange indices begin with 1 and are inclusive)\n"); - } - - -//--- parse_sequence_name-- - -static void parse_sequence_name - (const char* name, - char** filename, - char** nickname, - char** contigOfInterest, - char** namesFilename, - char** choresFilename, - int* subsampleK, - int* subsampleN, - char** softMaskFilename, - int* softMaskComplement, - char** xMaskFilename, - int* xMaskComplement, - char** nMaskFilename, - int* nMaskComplement, - int* nameParseType, - char** nameTrigger, - int* doRevCompFlags, - int* doUnmask, - int* doPartitioning, - int* doJoin, - char* separatorCh, - int* useFullNames, - int* fileType, - int* isQuantum, - char** qCodingFilename, - unspos* _start, - unspos* _end, - int* endIsSoft) - { - int len; - char* fname, *bracket, *mask, *actions, *action, *actionName; - char* parse, *slashParse, *extParse; - int numItems, charsUsed; - unspos start, end, pendingStart, temp, length, mid; - int tempInt; - float size, zoom, fLength; - int parsed; - - *namesFilename = NULL; - *contigOfInterest = NULL; - *qCodingFilename = NULL; - - *_start = *_end = 0; - *endIsSoft = false; - - *doRevCompFlags = rcf_forward; - *doUnmask = false; - *doPartitioning = false; - *doJoin = false; - *separatorCh = 0; - *useFullNames = false; - *fileType = seq_type_unknown; - *isQuantum = false; - - ////////// - // copy the name, splitting out the nickname if present; we will shorten - // this copy if other components are present, so 
we are potentially - // allocating more memory for the copy than is really needed - ////////// - - if (name == NULL) suicide ("parse_sequence_name(NULL)"); - - parse = strstr (name, "::"); - actions = strchr (name, '['); - if ((parse == NULL) // no "::" - || ((actions != NULL) && (parse > actions))) // "::" is after "[" - { - *nickname = NULL; - *filename = fname = copy_string (name); - } - else - { - if (parse-name == 0) goto empty_species_name; - *nickname = copy_prefix (name, parse-name); - *filename = fname = copy_string (parse+2); - } - - len = strlen (fname); - if (len < 1) goto empty_file_name; - - ////////// - // see if we are to reverse the sequence - ////////// - - switch (fname[len-1]) - { - case '-': *doRevCompFlags = rcf_revcomp; fname[--len] = 0; break; - case '+': fname[--len] = 0; break; - } - - if (len < 1) goto empty_file_name; - - ////////// - // split the file name string into its components - ////////// - - mask = strchr (fname, '{'); - bracket = strchr (fname, '['); - if ((bracket != NULL) && (mask != NULL)) - { if (mask > bracket) mask = NULL; } - - if (mask == fname) goto empty_file_name; - - if (mask != NULL) - *(mask++) = 0; - - parse = (mask == NULL)? 
fname : mask; - - actions = strchr (parse, '['); - if (actions == parse) goto empty_file_name; - - if (actions != NULL) - *(actions++) = 0; - - ////////// - // parse the mask file name - ////////// - - *softMaskFilename = NULL; - *xMaskFilename = NULL; - *nMaskFilename = NULL; - - if (mask != NULL) - { - len = strlen (mask); - if (mask[--len] != '}') goto bad_mask; - if (len == 0) goto empty_mask_file_name; - mask[len] = 0; - *xMaskFilename = copy_string (mask); - } - - ////////// - // split out the contig-of-interest if present - ////////// - - slashParse = NULL; - - extParse = strstr (fname, ".2bit/"); - if (extParse != NULL) - slashParse = extParse+5; - else - { - extParse = strstr (fname, ".hsx/"); - if (extParse != NULL) - slashParse = extParse+4; - } - - if (slashParse != NULL) - { - if (strchr (slashParse+1, pathSlash) != NULL) extParse = NULL; - else if (strstr (slashParse+1, ".2bit") != NULL) extParse = NULL; - } - - if (extParse != NULL) - { - *contigOfInterest = copy_string (slashParse+1); - *slashParse = 0; - } - - ////////// - // parse the actions list - ////////// - - if (actions != NULL) - { - len = strlen (actions); - if (len == 0) - goto bad_action_list; - else if (actions[len-1] != ']') - { - if (strchr (actions, ']') == NULL) goto bad_action_list; - else goto actions_not_at_end; - } - actions[--len] = 0; - - start = end = pendingStart = 0; - while (actions != NULL) - { - //fprintf(stderr,"actions=\"%s\"\n", actions); - action = actions; - actions = strchr (actions, ','); - if (actions != NULL) - *(actions++) = 0; - else - { - actions = strstr (action, "]["); - if (actions != NULL) - { *(actions++) = 0; actions++; } - } - - //fprintf(stderr," action=\"%s\"\n", action); - len = strlen (action); - if (len == 0) goto blank_action; - - // parse simple actions - - if (strcmp (action, "unmask") == 0) - { - if (pendingStart != 0) goto unsatisfied_start; - *doUnmask = true; - continue; - } - - if (strcmp (action, "revcomp") == 0) - { - if (pendingStart != 
0) goto unsatisfied_start; - *doRevCompFlags ^= rcf_revcomp; - continue; - } - - if ((strcmp (action, "backward") == 0) // (unadvertised) - || (strcmp (action, "backwards") == 0)) - { - if (pendingStart != 0) goto unsatisfied_start; - *doRevCompFlags ^= rcf_rev; - continue; - } - - if ((strcmp (action, "multi") == 0) - || (strcmp (action, "multiple") == 0)) - { - if (pendingStart != 0) goto unsatisfied_start; - *doPartitioning = *doJoin = true; - continue; - } - - if (strcmp_prefix (action, "sep=") == 0) - { - actionName = action + strlen("sep="); - goto action_separator; - } - - if (strcmp_prefix (action, "separator=") == 0) - { - actionName = action + strlen("separator="); - action_separator: - if (pendingStart != 0) goto unsatisfied_start; - if (actionName[0] == 0) goto bad_separator; - if (actionName[1] != 0) goto bad_separator; - if (*separatorCh != 0) goto many_separators; - *separatorCh = actionName[0]; - *doPartitioning = true; - continue; - } - - if ((strcmp (action, "nameparse=full") == 0) - || (strcmp (action, "fullname") == 0) - || (strcmp (action, "fullnames") == 0)) - { - if (pendingStart != 0) goto unsatisfied_start; - *useFullNames = true; - continue; - } - - if (action[0] == '@') - { - actionName = action + strlen("@"); - goto action_subset; - } - - if (strcmp_prefix (action, "subset=") == 0) - { - actionName = action + strlen("subset="); - action_subset: - if (pendingStart != 0) goto unsatisfied_start; - if (*namesFilename != NULL) goto many_name_files; - if (strlen(actionName) == 0) goto bad_name_file; - *namesFilename = copy_string (actionName); - continue; - } - - if (strcmp_prefix (action, "chores=") == 0) - { - actionName = action + strlen("chores="); - if (pendingStart != 0) goto unsatisfied_start; - if (*choresFilename != NULL) goto many_chore_files; - if (strlen(actionName) == 0) goto bad_chore_file; - *choresFilename = copy_string (actionName); - continue; - } - - if (strcmp_prefix (action, "subsample=") == 0) - { - actionName = action + 
strlen("subsample="); - if (pendingStart != 0) goto unsatisfied_start; - if (*subsampleN != 0) goto many_subsamples; - if (strlen(actionName) == 0) goto bad_subsample; - slashParse = strchr (actionName, '/'); - if (slashParse == NULL) goto bad_subsample; - - len = slashParse - actionName; - *slashParse = ']'; // (write a sentinel for parsing K) - charsUsed = -1; - numItems = sscanf (actionName, "%d]%n", &tempInt, &charsUsed); - if ((numItems != 1) || (charsUsed != len+1) || (tempInt < 1)) - { *slashParse = '/'; goto bad_subsample; } - *subsampleK = tempInt; - - *(slashParse++) = '/'; - len = strlen(slashParse); - slashParse[len] = ']'; // (write a sentinel for parsing N) - charsUsed = -1; - numItems = sscanf (slashParse, "%d]%n", &tempInt, &charsUsed); - if ((numItems != 1) || (charsUsed != len+1) || (tempInt < *subsampleK)) - { slashParse[len] = 0; goto bad_subsample; } - *subsampleN = tempInt; - continue; - } - - if ((strcmp (action, "nameparse=alnum") == 0) - || (strcmp (action, "nameparse=alphanum") == 0) - || (strcmp (action, "name:alnum") == 0) - || (strcmp (action, "name:alphanum") == 0)) - { - if (pendingStart != 0) goto unsatisfied_start; - if (*nameTrigger != NULL) goto many_name_parse_types; - *nameParseType = name_parse_type_alnum - | (*nameParseType & name_parse_fill_white); - continue; - } - - if (strcmp (action, "nameparse=darkspace") == 0) - { - if (pendingStart != 0) goto unsatisfied_start; - if (*nameTrigger != NULL) goto many_name_parse_types; - *nameParseType = name_parse_type_darkspace - | (*nameParseType & name_parse_fill_white); - continue; - } - - if (strcmp_prefix (action, "nickname=") == 0) - { - actionName = action + strlen("nickname="); - if (pendingStart != 0) goto unsatisfied_start; - if (*nickname != NULL) goto many_nicknames; - if (strlen(actionName) == 0) goto bad_nickname; - *nickname = copy_string (actionName); - continue; - } - - if (strcmp_prefix (action, "name=") == 0) - { - actionName = action + strlen("name="); - goto 
action_tag; - } - - if (strcmp_prefix (action, "nameparse=tag:") == 0) - { - actionName = action + strlen("nameparse=tag:"); - action_tag: - if (pendingStart != 0) goto unsatisfied_start; - if (*nameTrigger != NULL) goto many_name_triggers; - if (strlen(actionName) == 0) goto bad_name_trigger; - if (parse_type(*nameParseType) != name_parse_type_core) - goto many_name_parse_types; - *nameParseType = name_parse_type_trigger - | (*nameParseType & name_parse_fill_white); - *nameTrigger = copy_string (actionName); - continue; - } - - if (strcmp (action, "namejoin") == 0) - { - *nameParseType |= name_parse_fill_white; - continue; - } - - if (strcmp_prefix (action, "soft=keep:") == 0) - { - actionName = action + strlen("soft=keep:"); - goto action_softmaskkeep; - } - - if (strcmp_prefix (action, "softmask=keep:") == 0) - { - actionName = action + strlen("softmask=keep:"); - action_softmaskkeep: - if (pendingStart != 0) goto unsatisfied_start; - if (*softMaskFilename != NULL) goto many_soft_mask_files; - if (strlen(actionName) == 0) goto bad_soft_mask_file; - *softMaskFilename = copy_string (actionName); - *softMaskComplement = true; - continue; - } - - if (strcmp_prefix (action, "soft=") == 0) - { - actionName = action + strlen("soft="); - goto action_softmask; - } - - if (strcmp_prefix (action, "softmask=") == 0) - { - actionName = action + strlen("softmask="); - action_softmask: - if (pendingStart != 0) goto unsatisfied_start; - if (*softMaskFilename != NULL) goto many_soft_mask_files; - if (strlen(actionName) == 0) goto bad_soft_mask_file; - *softMaskFilename = copy_string (actionName); - *softMaskComplement = false; - continue; - } - - if (strcmp_prefix (action, "xmask=keep:") == 0) - { - actionName = action + strlen("xmask=keep:"); - if (pendingStart != 0) goto unsatisfied_start; - if (*xMaskFilename != NULL) goto many_x_mask_files; - if (strlen(actionName) == 0) goto bad_x_mask_file; - *xMaskFilename = copy_string (actionName); - *xMaskComplement = true; - continue; 
- } - - if (strcmp_prefix (action, "xmask=") == 0) - { - actionName = action + strlen("xmask="); - if (pendingStart != 0) goto unsatisfied_start; - if (*xMaskFilename != NULL) goto many_x_mask_files; - if (strlen(actionName) == 0) goto bad_x_mask_file; - *xMaskFilename = copy_string (actionName); - *xMaskComplement = false; - continue; - } - - if (strcmp_prefix (action, "nmask=keep:") == 0) - { - actionName = action + strlen("nmask=keep:"); - if (pendingStart != 0) goto unsatisfied_start; - if (*nMaskFilename != NULL) goto many_n_mask_files; - if (strlen(actionName) == 0) goto bad_n_mask_file; - *nMaskFilename = copy_string (actionName); - *nMaskComplement = true; - continue; - } - - if (strcmp_prefix (action, "nmask=") == 0) - { - actionName = action + strlen("nmask="); - if (pendingStart != 0) goto unsatisfied_start; - if (*nMaskFilename != NULL) goto many_n_mask_files; - if (strlen(actionName) == 0) goto bad_n_mask_file; - *nMaskFilename = copy_string (actionName); - *nMaskComplement = false; - continue; - } - - if (strcmp (action, "quantum") == 0) - { - if (pendingStart != 0) goto unsatisfied_start; - if (*isQuantum) goto many_quantums; - if (*fileType != seq_type_unknown) goto many_file_types; - *isQuantum = true; - *fileType = seq_type_qdna; - continue; - } - - if (strcmp_prefix (action, "quantum=") == 0) - { - actionName = action + strlen("quantum="); - if (pendingStart != 0) goto unsatisfied_start; - if (*isQuantum) goto many_quantums; - if (strlen(actionName) == 0) goto bad_code_file; - *qCodingFilename = copy_string (actionName); - *isQuantum = true; - continue; - } - - if (strcmp_prefix (action, "format=") == 0) - { - int fType, typeIx; - actionName = action + strlen("format="); - if (pendingStart != 0) goto unsatisfied_start; - if (*fileType != seq_type_unknown) goto many_file_types; - fType = seq_type_unknown; - for (typeIx=seq_type_unknown+1 ; typeIx= maxSequenceLen) end = maxSequenceLen; - else end = mid + length/2; - *endIsSoft = true; - parsed = 
true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposCommaFmtScanf "]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as two, comma\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "#" unsposFmtScanf "]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as two, waffle\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end += start-1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "#" unsposFmtScanf "K]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, waffle with K\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end *= 1000; - end += start-1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "#%fK]%n", &start, &size, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, waffle with K\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end = (size * 1000) + 1; - end += start-1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "#" unsposFmtScanf "M]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, waffle with M\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end *= 1000 * 1000; - end += start-1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "#%fM]%n", &start, 
&size, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, waffle with M\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end = (size * 1000 * 1000) + 1; - end += start-1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "^" unsposFmtScanf "]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as two, caret\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - start -= end / 2; - end += start-1; - if (start < 1) start = 1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "^" unsposFmtScanf "K]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, caret with K\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end *= 1000; - start -= end / 2; - end += start-1; - if (start < 1) start = 1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "^%fK]%n", &start, &size, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, caret with K\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end = (size * 1000) + 1; - start -= end / 2; - end += start-1; - if (start < 1) start = 1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "^" unsposFmtScanf "M]%n", &start, &end, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, caret with M\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end *= 1000 * 1000; - start -= end / 2; - end += start-1; - if (start < 1) start 
= 1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "^%fM]%n", &start, &size, &charsUsed); - if ((numItems == 2) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, caret with M\n"); - if ((start == 0) || (end == 0)) - { action[len] = 0; goto bad_limits; } - end = (size * 1000 * 1000) + 1; - start -= end / 2; - end += start-1; - if (start < 1) start = 1; - *endIsSoft = true; - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, unsposFmtScanf "..]%n", &start, &charsUsed); - if ((numItems == 1) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, dots at end\n"); - if (start == 0) - { action[len] = 0; goto bad_limits; } - parsed = true; - } - } - - if (!parsed) - { - charsUsed = -1; - numItems = sscanf (action, ".." unsposFmtScanf "]%n", &end, &charsUsed); - if ((numItems == 1) && (charsUsed == len+1)) - { - //fprintf(stderr," parsed as one, dots at start\n"); - if (end == 0) - { action[len] = 0; goto bad_limits; } - parsed = true; - } - } - - if (!parsed) - { - action[len] = 0; // (clear the sentinel) - goto bad_action; - } - } - - if (pendingStart != 0) goto unsatisfied_start_2; - - if ((start != 0) && (end != 0) && (start > end)) - { - temp = start; start = end; end = temp; - *doRevCompFlags ^= rcf_revcomp; - } - - if ((start != 0) || (end != 0)) - { - *_start = start; - *_end = end; - } - } - - ////////// - // sanity checks - ////////// - - if ((*contigOfInterest != NULL) && (*namesFilename != NULL)) - suicidef ("(for %s) can't use these together:\n %s\n %s", - fname, *contigOfInterest, *namesFilename); - - if ((*contigOfInterest != NULL) && (*choresFilename != NULL)) - suicidef ("(for %s) can't use these together:\n %s\n %s", - fname, *contigOfInterest, *choresFilename); - - if ((*namesFilename != NULL) && (*choresFilename != NULL)) - suicidef ("(for %s) can't use these together:\n %s\n %s", - fname, *namesFilename, 
*choresFilename); - -// (these are no longer prohibited combinations) -// -// if ((*doPartitioning) && (*softMaskFilename != NULL)) -// { -// if (*softMaskComplement) -// suicidef ("(for %s) can't use [multi] with [softmask=keep:%s]", -// fname, *softMaskFilename); -// else -// suicidef ("(for %s) can't use [multi] with [softmask=%s]", -// fname, *softMaskFilename); -// } -// -// if ((*doPartitioning) && (*xMaskFilename != NULL)) -// { -// if (*xMaskComplement) -// suicidef ("(for %s) can't use [multi] with [xmask=keep:%s]", -// fname, *xMaskFilename); -// else -// suicidef ("(for %s) can't use [multi] with [xmask=%s]", -// fname, *xMaskFilename); -// } -// -// if ((*doPartitioning) && (*nMaskFilename != NULL)) -// { -// if (*nMaskComplement) -// suicidef ("(for %s) can't use [multi] with [nmask=keep:%s]", -// fname, *nMaskFilename); -// else -// suicidef ("(for %s) can't use [multi] with [nmask=%s]", -// fname, *nMaskFilename); -// } - - return; - - ////////// - // failure exits - ////////// - -empty_file_name: - suicidef ("sequence file name is absent from \"%s\"", name); - return; // (can't reach here) - -empty_species_name: - suicidef ("(for %s) empty nickname", parse+2); - return; // (can't reach here) - -bad_mask: - suicidef ("(for %s) mask name needs closing } (%s)", fname, mask); - return; // (can't reach here) - -empty_mask_file_name: - suicidef ("(for %s) use [unmask] instead of {}", fname); - return; // (can't reach here) - -actions_not_at_end: - suicidef ("(for %s[%s)\n" - "The action list is not at the end of the sequence specifier. See the README\n" - "section on sequence specifiers. 
Perhaps you forgot a space after the closing\n" - "square bracket?", - fname,actions); - return; // (can't reach here) - -bad_action_list: - suicidef ("(for %s) bad action list", fname); - return; // (can't reach here) - -blank_action: - suicidef ("(for %s) blank action", fname); - return; // (can't reach here) - -bad_action: - suicidef ("(for %s) bad action \"%s\"", fname, action); - return; // (can't reach here) - -bad_sequence_position: - suicidef ("(for %s) bad limit \"%s\"", fname, action); - return; // (can't reach here) - -bad_limits: - suicidef ("(for %s) bad limits \"%s\"", fname, action); - return; // (can't reach here) - -unsatisfied_start: - suicidef ("(for %s) incomplete limits (%d,%s)", fname, pendingStart, action); - return; // (can't reach here) - -unsatisfied_start_2: - suicidef ("(for %s) incomplete limits (%d)", fname, pendingStart); - return; // (can't reach here) - -many_separators: - suicidef ("(for %s) only one separator allowed:\n" - " separator=%c\n" - " separator=%c", - fname, *separatorCh, actionName[0]); - return; // (can't reach here) - -bad_separator: - if (actionName[0] == 0) - suicidef ("(for %s) separator= requires a character", fname, actionName); - else - suicidef ("(for %s) bad separator, separator=%s, only one character allowed", fname); - return; // (can't reach here) - -many_name_files: - suicidef ("(for %s) only one names file allowed:\n" - " subset=%s\n" - " subset=%s", - fname, *namesFilename, actionName); - return; // (can't reach here) - -bad_name_file: - suicidef ("(for %s) subset= requires a names file", fname); - return; // (can't reach here) - -many_chore_files: - suicidef ("(for %s) only one chores file allowed:\n" - " chores=%s\n" - " chores=%s", - fname, *choresFilename, actionName); - return; // (can't reach here) - -bad_chore_file: - suicidef ("(for %s) chores= requires a chores file", fname); - return; // (can't reach here) - -many_subsamples: - suicidef ("(for %s) only one subsampling allowed:\n" - " 
subsample=%d/%d\n" - " subsample=%s", - fname, *subsampleK, *subsampleN, actionName); - return; // (can't reach here) - -bad_subsample: - suicidef ("(for %s) bad subsample \"%s\"", fname, actionName); - return; // (can't reach here) - -many_nicknames: - suicidef ("(for %s) only one nickname allowed:\n" - " nickname=%s\n" - " nickname=%s", - fname, *nickname, actionName); - return; // (can't reach here) - -bad_nickname: - suicidef ("(for %s) nickname= requires a non-empty string", fname); - return; // (can't reach here) - -many_name_parse_types: - suicidef ("(for %s) only one name parsing allowed\n", fname); - return; // (can't reach here) - -many_name_triggers: - suicidef ("(for %s) only one name trigger allowed:\n" - " nameparse=tag:%s\n" - " nameparse=tag:%s", - fname, *nameTrigger, actionName); - return; // (can't reach here) - -bad_name_trigger: - suicidef ("(for %s) nameparse=tag: requires a non-empty string", fname); - return; // (can't reach here) - -many_soft_mask_files: - suicidef ("(for %s) only one softmask allowed:\n" - " softmask=%s\n" - " softmask=%s", - fname, *softMaskFilename, actionName); - return; // (can't reach here) - -bad_soft_mask_file: - suicidef ("(for %s) softMask= or softMask=keep: require a non-empty string", fname); - return; // (can't reach here) - -many_x_mask_files: - suicidef ("(for %s) only one xmask allowed:\n" - " xmask=%s\n" - " xmask=%s", - fname, *xMaskFilename, actionName); - return; // (can't reach here) - -bad_x_mask_file: - suicidef ("(for %s) xmask= or xmask=keep: require a non-empty string", fname); - return; // (can't reach here) - -many_n_mask_files: - suicidef ("(for %s) only one nmask allowed:\n" - " nmask=%s\n" - " nmask=%s", - fname, *nMaskFilename, actionName); - return; // (can't reach here) - -bad_n_mask_file: - suicidef ("(for %s) nmask= or nmask=keep: require a non-empty string", fname); - return; // (can't reach here) - -many_file_types: - suicidef ("(for %s) more than one file type is defined", fname); - 
return; // (can't reach here) - -bad_file_format: - suicidef ("(for %s) unknown file format: %s", fname, actionName); - return; // (can't reach here) - -many_quantums: - suicidef ("(for %s) only one instance of quantum allowed", fname); - return; // (can't reach here) - -bad_code_file: - suicidef ("(for %s) quantum= requires a non-empty string", fname); - return; // (can't reach here) - } - -//---------- -// -// detect_file_type-- -// Attempt to determine what type of file we are dealing with (e.g. fasta, -// nib, quantum, etc.). -// -//---------- -// -// Arguments: -// seq* seq: The sequence. -// -// Returns: -// The type of file being read (one of seq_type_xxx); failure causes program -// fatality. -// -//---------- - -static int detect_file_type - (seq* _seq) - { - int type = seq_type_unknown; - char buffer[maxSequenceHeader+3]; - u32 bufferLen; - u32 magic; - u8 ch; - int intCh; - - ////////// - // determine if it's a nib file (from the magic number) - ////////// - - // read the first four bytes; if it's a recognizable magic number then it - // must be the corresponding type of file; otherwise we assume it must be - // a fasta file (if not, it will die later) - - magic = read_4_little (_seq); - - if ((magic == nibMagicLittle) || (magic == nibMagicBig)) - type = seq_type_nib; - else if ((magic == twobitMagicLittle) || (magic == twobitMagicBig)) - type = seq_type_2bit; - else if ((magic == hsxMagicLittle) || (magic == hsxMagicBig)) - type = seq_type_hsx; - else if ((magic == qdnaMagicLittle) || (magic == qdnaMagicBig)) - type = seq_type_qdna; - else if ((magic == oldQdnaMagicLittle) || (magic == oldQdnaMagicBig)) - type = seq_type_qdna; - - // put those four bytes back in the file (in reverse of the read order) - - seq_ungetc (magic >> 24, _seq); - seq_ungetc (magic >> 16, _seq); - seq_ungetc (magic >> 8, _seq); - seq_ungetc (magic , _seq); - - if (type != seq_type_unknown) - return type; - - ////////// - // determine if it's a fastq file - ////////// - - ch = 
seq_getc (_seq); - seq_ungetc (ch, _seq); - if (ch == '@') - return seq_type_fastq; - - ////////// - // determine if it's a fasta or csfasta file; these are very similar - // formats; if the first character is a '#' we know it is csfasta (because - // regular fasta doesn't allow comments); if the first character is a '>' - // then it can still be either, in which case we need to skip the header - // line and read the first two sequence characters - ////////// - - ch = seq_getc (_seq); - if (ch == '#') - { - seq_ungetc (ch, _seq); - return seq_type_csfasta; - } - - if (ch != '>') - seq_ungetc (ch, _seq); - else - { - // read header - - bufferLen = 0; - buffer[bufferLen++] = ch; - - do - { - intCh = seq_getc (_seq); - if (intCh == EOF) goto unknown; - if (bufferLen >= sizeof(buffer)-3) goto unknown; - buffer[bufferLen++] = (char) intCh; - } while (intCh != '\n'); - - // first character must be a nucleotide or it is not fasta - // $$$ we are ignoring the slim chance of an empty first sequence, or - // $$$ .. that the first line contains just a single nucleotide - - intCh = seq_getc (_seq); - if (intCh == EOF) goto unknown; - buffer[bufferLen++] = intCh; - if (ustrchr ("ACGTacgtNn", intCh) != NULL) - { - intCh = seq_getc (_seq); - if (intCh == EOF) goto unknown; - buffer[bufferLen++] = intCh; - if (ustrchr ("ACGTacgtNn", intCh) != NULL) - type = seq_type_fasta; - else if (ustrchr ("0123", intCh) != NULL) - type = seq_type_csfasta; - } - - unknown: - while (bufferLen > 0) - seq_ungetc (buffer[--bufferLen], _seq); - - if (type != seq_type_unknown) - return type; - } - - ////////// - // if all else fails, assume it's a fasta file (for compatibility with - // blastz) - ////////// - - if (type == seq_type_unknown) - type = seq_type_fasta; - - return type; - } - -//---------- -// -// read_4, read_4_big, read_4_little-- -// Read four bytes from a file, in big or little endian order. 
-// read_5, read_5_big, read_5_little-- -// Read five bytes from a file, in big or little endian order. -// read_6, read_6_big, read_6_little-- -// Read six bytes from a file, in big or little endian order. -// -//---------- -// -// Arguments: -// seq* seq: The sequence. -// int asBigEndian: true => read 'em as big endian -// false => read 'em as little endian -// -// Returns: -// The magic number read. -// -//---------- - -static u32 read_4 - (seq* _seq, - int asBigEndian) - { - if (asBigEndian) return read_4_big (_seq); - else return read_4_little (_seq); - } - -static u32 read_4_big - (seq* _seq) - { - u32 val; - - val = seq_getc (_seq) << 24; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq); - - return val; - } - -static u32 read_4_little - (seq* _seq) - { - u32 val; - - val = seq_getc (_seq); - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 24; - - return val; - } - - -static u64 read_5 - (seq* _seq, - int asBigEndian) - { - if (asBigEndian) return read_5_big (_seq); - else return read_5_little (_seq); - } - -static u64 read_5_big - (seq* _seq) - { - u64 val; - - val = ((u64) seq_getc (_seq)) << 32; - val |= seq_getc (_seq) << 24; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq); - - return val; - } - -static u64 read_5_little - (seq* _seq) - { - u64 val; - - val = seq_getc (_seq); - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 24; - val |= ((u64) seq_getc (_seq)) << 32; - - return val; - } - - -static u64 read_6 - (seq* _seq, - int asBigEndian) - { - if (asBigEndian) return read_6_big (_seq); - else return read_6_little (_seq); - } - -static u64 read_6_big - (seq* _seq) - { - u64 val; - - val = ((u64) seq_getc (_seq)) << 40; - val |= ((u64) seq_getc (_seq)) << 32; - val |= seq_getc (_seq) << 24; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq); - - return val; 
- } - -static u64 read_6_little - (seq* _seq) - { - u64 val; - - val = seq_getc (_seq); - val |= seq_getc (_seq) << 8; - val |= seq_getc (_seq) << 16; - val |= seq_getc (_seq) << 24; - val |= ((u64) seq_getc (_seq)) << 32; - val |= ((u64) seq_getc (_seq)) << 40; - - return val; - } - -//---------- -// -// skip_seq_whitespace-- -// Read characters from the associated sequence until we get something that -// ain't whitespace. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to read. -// -// Returns: -// (same as for getc()) -// -//---------- - -static int skip_seq_whitespace - (seq* _seq) - { - int ch; - - do - { - ch = seq_getc (_seq); - } while ((ch == ' ') || (ch == '\t')); - - return ch; - } - -//---------- -// -// seq_getc-- -// Read the next character from the associated file. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to read. -// -// Returns: -// (same as for getc(); the character read, or EOF) -// -//---------- - -static int seq_getc - (seq* _seq) - { - int ch; - - // if there are characters pending, get one from the pending buffer; - // otherwise, feed one straight from the file - - if (_seq->pendingLen == 0) - { - ch = getc_or_die (_seq->f, _seq->filename); - debugTextFile_3; - return ch; - } - - _seq->pendingLen--; - ch = (int) (u8) *(_seq->pendingStack++); - - debugTextFile_4; - return ch; - } - -//---------- -// -// seq_ungetc-- -// Give back a character to the associated file. -// -// WARNING: This routine is not an exact drop in replacement for the standard -// c routine ungetc(). -// -//---------- -// -// Arguments: -// char ch: The character to return. Characters should be returned in -// .. the opposite order of that read (ie newest char returned -// .. first). -// seq* seq: The sequence being read. 
-// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void seq_ungetc - (char ch, - seq* _seq) - { - debugTextFile_5; - - if (_seq->pendingLen >= seqBufferSize) - suicide ("seq_ungetc() buffer is already full"); - - _seq->pendingLen++; - *(--_seq->pendingStack) = ch; - } - -//---------- -// -// skip_chars-- -// Skip the next so many characters from the associated file. -// -//---------- -// -// Arguments: -// seq* seq: The sequence being read. -// u32 toSkip: The number of characters to skip. -// -// Returns: -// true if successful, false if there's a problem (such as reaching premature -// end-of-file). -// -//---------- - -static int skip_chars - (seq* _seq, - u32 toSkip) - { - int ch; - - // see if we have any to skip in the pending buffer - - if (_seq->pendingLen >= toSkip) - { - // we have all we need in the pending buffer - - _seq->pendingLen -= toSkip; - _seq->pendingStack += toSkip; - return true; - } - - if (_seq->pendingLen > 0) - { - // everything in the pending buffer will be skipped - - toSkip -= _seq->pendingLen; - _seq->pendingLen = 0; - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - } - - if (toSkip == 0) return true; // (none left to skip) - - // skip the rest by seeking past the characters - - if (fseek (_seq->f, toSkip, SEEK_CUR) != 0) - { - // seek failed, so let's try reading instead - - while (toSkip-- > 0) - { - ch = getc (_seq->f); - if ((ch == EOF) || (ferror (_seq->f))) - return false; - } - } - - return true; - } - -//---------- -// -// test_rewindability-- -// Test whether a sequence's underflying file is rewinable. -// -//---------- -// -// Arguments: -// seq* seq: The sequence being read. -// -// Returns: -// An fseek error code; zero indicates the underlying file is rewindable; -// any other value indicates that it is not. 
-// -//---------- - -static int test_rewindability - (seq* _seq) - { - long int savedFilePos; - - savedFilePos = ftell (_seq->f); - return fseek (_seq->f, savedFilePos, SEEK_SET); - } - -//---------- -// -// save_fstate, restore_fstate-- -// Save and restore the state of the associated file. This lets the caller -// read ahead in a file, then return to the original point. -// -//---------- -// -// Arguments: -// seq* seq: The sequence being read. -// -// Returns: -// (nothing; failure causes program fatality) -// -//---------- - -static void save_fstate - (seq* _seq) - { - if (_seq == NULL) suicide ("save_fstate(NULL)"); - - // save read head - - debugTextFile_6; - - _seq->savedFilePos = ftell (_seq->f) - _seq->pendingLen; - _seq->hasSavedState = true; - } - -static void restore_fstate - (seq* _seq) - { - int err; - - if (_seq == NULL) suicide ("restore_fstate(NULL)"); - if (!_seq->hasSavedState) suicide ("restore_fstate(), no state saved"); - - // restore read head - - debugTextFile_7; - - err = fseek (_seq->f, _seq->savedFilePos, SEEK_SET); - if (err != 0) - suicidef_with_perror ("restore_fstate(), fseek returned %d", err); - - // restore pending character state - - _seq->pendingLen = 0; - _seq->pendingStack = _seq->pendingChars + seqBufferSize; - } - -//---------- -// -// match_composition-- -// Count the number of matched DNA letter pairs in a gap free alignment. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// unspos count[4][4]:Place to return the counts of each matched DNA letter -// .. pair. Indexing is as per nuc_to_bits. -// -// Returns: -// nothing; composition is returned in the count[][] array -// -//---------- -// -// Note: Masked (lowercase) bp do not contribute to the results. 
-// -//---------- - -void match_composition - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - unspos count[4][4]) - { - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - unspos ix; - int r, c; - - for (r=0 ; r<4 ; r++) - for (c=0 ; c<4 ; c++) - count[r][c] = 0; - - for (ix=0 ; ix= 0) && (c >= 0)) - count[r][c]++; - } - } - -//---------- -// -// percent_identical-- -// Determine the percentage of bases that match in two subsequences. -// -//---------- -// -// Arguments: -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. -// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. -// -// Returns: -// The percentage (an integer in the range 0..100). -// -//---------- -// -// Note: Masked (lowercase) bp *do* contribute to the results, but illegal -// values like N do not (and they are not counted in the denominator -// either). -// -//---------- - -int percent_identical - (seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - s8 c1, c2; - unspos numMatches = 0; - unspos denom = 0; - unspos ix; - - if (length == 0) - return 0; - - if ((seq1->fileType == seq_type_qdna) - || (seq2->fileType == seq_type_qdna)) - return 0; - - for (ix=0 ; ix= 0) && (c2 >= 0)) - { - if (c1 == c2) numMatches++; - denom++; - } - } - - if (denom == 0) - return 0; - else - return (200*numMatches + denom) / (2*denom); // 100*numMatches/denom, rounded - } - -//---------- -// -// score_match-- -// Determine the substitution score of aligned bases in two subsequences. -// -//---------- -// -// Arguments: -// scoreset* scoring: The scoring scheme to use. -// seq* seq1: The first sequence. -// unspos pos1: The subsequence start position in seq1 (origin-0). -// seq* seq2: The second sequence. 
-// unspos pos2: The subsequence start position in seq2 (origin-0). -// unspos length: The length of the subsequence. Note that this may -// .. be zero. -// -// Returns: -// The substitution score of the two subsequences. -// -//---------- - -score score_match - (scoreset* scoring, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - u8* stop = s1 + length; - score similarity = 0; - - if (length == 0) - return (score) 0; - - while (s1 < stop) - similarity += scoring->sub[*(s1++)][*(s2++)]; - - return similarity; - } - -//---------- -// -// dump_aligned_nucleotides-- -// Dump the nucleotides (from each sequence) for a gap-free alignment. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The first aligned position in sequence 1. -// seq* seq2: The other sequence. -// unspos pos2: The first aligned position in sequence 2. -// unspos length: The length of the alignment. -// -// Returns: -// (nothing) -// -//---------- - -void dump_aligned_nucleotides - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length) - { - int isRev1 = ((seq1->revCompFlags & rcf_rev) != 0); - int isRev2 = ((seq2->revCompFlags & rcf_rev) != 0); - char* start1 = (char*) seq1->v + pos1; - char* start2 = (char*) seq2->v + pos2; - int digits = 10; - - fprintf (f, unsposStarFmt "%c:", digits, pos1+1, (isRev1)?'-':'+'); - print_prefix (f, (char*) seq1->v + pos1, (int) length); - fprintf (f, "\n"); - - fprintf (f, "%*s ", digits, ""); - print_dna_similarities - (f, start1, start2, (int) length); - fprintf (f, "\n"); - - fprintf (f, unsposStarFmt "%c:", digits, pos2+1, (isRev2)?'-':'+'); - print_prefix (f, (char*) seq2->v + pos2, (int) length); - fprintf (f, "\n"); - } - -//---------- -// -// dump_sequence-- -// Write a sequence to a file (for debugging). -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. 
-// seq* seq: The sequence to print. -// -// Returns: -// (nothing) -// -//---------- - -void dump_sequence - (FILE* f, - seq* _seq) - { - char buffer[101]; - unspos ix, start = 0; - int width; - char ch; - int bx; - int needSeparator; - - start = _seq->len; - width = 1; - while (start > 9) { start/=10; width++; } - - needSeparator = false; - - bx = 0; - for (ix=0 ; ix<_seq->len ; ix++) - { - ch = (char) _seq->v[ix]; - - if (ch == 0) - { - if (bx > 0) - { - if (needSeparator) - { fprintf (f, "%*s =====\n", width, ""); needSeparator = false; } - buffer[bx] = 0; - fprintf (f, unsposStarFmt ": %s\n", width, start, buffer); - } - bx = 0; - needSeparator = true; - continue; - } - - if (bx == sizeof(buffer)-1) - { - if (needSeparator) - { fprintf (f, "%*s =====\n", width, ""); needSeparator = false; } - buffer[bx] = 0; - fprintf (f, unsposStarFmt ": %s\n", width, start, buffer); - bx = 0; - } - - if (bx == 0) start = ix; - buffer[bx++] = ch; - } - - if (bx > 0) - { - if (needSeparator) - { fprintf (f, "%*s =====\n", width, ""); needSeparator = false; } - buffer[bx] = 0; - fprintf (f, unsposStarFmt ": %s\n", width, start, buffer); - } - } - -//---------- -// -// dump_sequence_state-- -// Write a sequence's state information to a file (for debugging). -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq: The sequence to print. 
-// -// Returns: -// (nothing) -// -//---------- - -void dump_sequence_state - (FILE* f, - seq* _seq) - { - seqpartition* sp = &_seq->partition; - u32 ix; - - fprintf (f, "size: %s\n", commatize(_seq->size)); - fprintf (f, "len: %s\n", commatize(_seq->len)); - fprintf (f, "needsVq: %d\n", _seq->needsVq); - - fprintf (f, "v: %s%p\n", (_seq->vOwner)?"[owner] ":"", _seq->v); - if (_seq->vc != NULL) fprintf (f, "vc: %s%p\n", (_seq->vcOwner)?"[owner] ":"", _seq->vc); - if (_seq->vq != NULL) fprintf (f, "vq: %s%p\n", (_seq->vqOwner)?"[owner] ":"", _seq->vq); - - fprintf (f, "startLoc: %s\n", commatize(_seq->startLoc)); - fprintf (f, "trueLen: %s%s\n", (_seq->needTrueLen)?"[need] ":"", commatize(_seq->trueLen)); - fprintf (f, "revCompFlags: %d\n", _seq->revCompFlags); - - if (_seq->contigOfInterest != NULL) fprintf (f, "vc: \"%s\"\n", _seq->contigOfInterest); - fprintf (f, "contig: %u\n", _seq->contig); - fprintf (f, "preLoaded: %d\n", _seq->preLoaded); - - fprintf (f, "lockedHeader: %d\n", _seq->lockedHeader); - fprintf (f, "headerSize: %s\n", commatize(_seq->headerSize)); - if (_seq->header != NULL) fprintf (f, "header: %s\"%s\"\n", (_seq->headerOwner)?"[owner] ":"", _seq->header); - else fprintf (f, "header: %s(null)\n", (_seq->headerOwner)?"[owner] ":""); - fprintf (f, "shortHeaderSize: %s\n", commatize(_seq->shortHeaderSize)); - if (_seq->shortHeader != NULL) fprintf (f, "shortHeader: %s\"%s\"\n", (_seq->shortHeaderOwner)?"[owner] ":"", _seq->shortHeader); - else fprintf (f, "shortHeader: %s(null)\n", (_seq->shortHeaderOwner)?"[owner] ":""); - fprintf (f, "hasNickname: %d\n", _seq->hasNickname); - fprintf (f, "trueHeaderSize: %s\n", commatize(_seq->trueHeaderSize)); - if (_seq->trueHeader != NULL) fprintf (f, "trueHeader: %s\"%s\"\n", (_seq->trueHeaderOwner)?"[owner] ":"", _seq->trueHeader); - else fprintf (f, "trueHeader: %s(null)\n", (_seq->trueHeaderOwner)?"[owner] ":""); - - if (_seq->hasLeftFence) fprintf (f, "leftFence: " unsposFmt " %02X\n", 
_seq->leftFencePos, _seq->leftFenceCh); - if (_seq->hasRightFence) fprintf (f, "rightFence: " unsposFmt " %02X\n", _seq->rightFencePos, _seq->rightFenceCh); - - if (_seq->filename != NULL) fprintf (f, "filename: \"%s\"\n", _seq->filename); - else fprintf (f, "filename: (null)\n"); - if (_seq->f != NULL) fprintf (f, "f: %p\n", _seq->f); - else fprintf (f, "f: (null)\n"); - fprintf (f, "fileType: %s (%d)\n", seqTypeNames[_seq->fileType], _seq->fileType); - fprintf (f, "rewindable: %d\n", _seq->rewindable); - - fprintf (f, "pending: "); - if (_seq->pendingLen == 0) - fprintf (f, "(empty)\n"); - else - { - fprintf (f, "\""); - for (ix=0 ; ix<_seq->pendingLen ; ix++) - fprintf (f, "%c", _seq->pendingStack[ix]); - fprintf (f, "\"\n"); - } - - if (_seq->hasSavedState) fprintf (f, "savedFilePos: %016lX\n", _seq->savedFilePos); - - if (_seq->namesFile != NULL) - { - if (_seq->namesFilename != NULL) fprintf (f, "namesFilename: \"%s\"\n", _seq->namesFilename); - else fprintf (f, "namesFilename: (null)\n"); - fprintf (f, "namesFile: %p\n", _seq->namesFile); - fprintf (f, "nextContigName: \"%s\"\n", _seq->nextContigName); - fprintf (f, "contigPending: %d\n", _seq->contigPending); - } - - if (_seq->choresFile != NULL) - { - if (_seq->choresFilename != NULL) fprintf (f, "choresFilename: \"%s\"\n", _seq->choresFilename); - else fprintf (f, "choresFilename: (null)\n"); - fprintf (f, "choresFile: %p\n", _seq->choresFile); - fprintf (f, "choresLineNum: %d\n", _seq->choresLineNum); - fprintf (f, "chore.num: %d\n", _seq->chore.num); - fprintf (f, "chore.tName: \"%s\"\n", _seq->chore.tName); - if (_seq->chore.tSubrange) fprintf (f, "chore.tSubrange: " unsposDotsFmt "\n", _seq->chore.tStart, _seq->chore.tEnd); - else fprintf (f, "chore.tSubrange: (whole sequence)\n"); - if (_seq->chore.qSubrange) fprintf (f, "chore.qSubrange: " unsposDotsFmt "\n", _seq->chore.qStart, _seq->chore.qEnd); - else fprintf (f, "chore.qSubrange: (whole sequence)\n"); - fprintf (f, "chore.qStrand: %s\n", 
(_seq->chore.qStrand<0)?"- strand":((_seq->chore.qStrand>0)?"+ strand":"both strands")); - fprintf (f, "chore.idTag: \"%s\"\n", _seq->chore.idTag); - - fprintf (f, "chore.targetInt: " unsposDotsFmt "\n", _seq->chore.targetInterval.s, _seq->chore.targetInterval.e); - fprintf (f, "chore.queryInt: " unsposDotsFmt "\n", _seq->chore.queryInterval.s, _seq->chore.queryInterval.e); - } - - if (_seq->subsampleN > 0) fprintf (f, "subsample: %d/%d [skip %d]\n", _seq->subsampleK, _seq->subsampleN, _seq->subsampleN); - - if (_seq->softMaskFilename != NULL) fprintf (f, "softMaskFilename: %s\"%s\"\n", (_seq->softMaskComplement)?"[keep] ":"", _seq->softMaskFilename); - if (_seq->xMaskFilename != NULL) fprintf (f, "xMaskFilename: %s\"%s\"\n", (_seq->xMaskComplement)?"[keep] ":"", _seq->xMaskFilename); - if (_seq->nMaskFilename != NULL) fprintf (f, "nMaskFilename: %s\"%s\"\n", (_seq->nMaskComplement)?"[keep] ":"", _seq->nMaskFilename); - - if ((_seq->startLimit != 0) || (_seq->endLimit != 0)) - fprintf (f, "limits: " unsposDotsFmt "%s\n", _seq->startLimit, _seq->endLimit, (_seq->endIsSoft)?"[soft] ":""); - - fprintf (f, "doRevCompFlags: %d\n", _seq->doRevCompFlags); - fprintf (f, "doUnmask: %d\n", _seq->doUnmask); - fprintf (f, "doPartitioning: %d\n", _seq->doPartitioning); - fprintf (f, "doJoin: %d\n", _seq->doJoin); - if (_seq->separatorCh != 0) - { - if ((' ' < _seq->separatorCh) && (_seq->separatorCh <= '~')) fprintf (f, "separatorCh: '%c' %02X\n", _seq->separatorCh, _seq->separatorCh); - else fprintf (f, "separatorCh: %02X\n", _seq->separatorCh); - } - fprintf (f, "useFullNames: %d\n", _seq->useFullNames); - fprintf (f, "nameParseType: %d\n", _seq->nameParseType); - if (_seq->nameTrigger != NULL) fprintf (f, "nameTrigger: \"%s\"\n", _seq->nameTrigger); - fprintf (f, "allowAmbiDNA: %d\n", _seq->allowAmbiDNA); - if (_seq->qToComplement != NULL) fprintf (f, "qToComplement: %p\n", _seq->qToComplement); - if (_seq->qCoding != NULL) fprintf (f, "qCoding: %p\n", _seq->qCoding); - - if 
(_seq->fileType == seq_type_2bit) - { - ; // $$$ dump _seq->twoBit - } - - else if (_seq->fileType == seq_type_hsx) - { - ; // $$$ dump _seq->hsx - } - - if (sp->p != NULL) - { - fprintf (f, "partition.state: %d\n", sp->state); - fprintf (f, "partition.size: %s\n", commatize(sp->size)); - fprintf (f, "partition.len: %s\n", commatize(sp->len)); - fprintf (f, "partition.p: %p\n", sp->p); - - fprintf (f, "partition.poolSize: %s\n", commatize(sp->poolSize)); - fprintf (f, "partition.poolLen: %s\n", commatize(sp->poolLen)); - fprintf (f, "partition.pool: %s%p\n", (sp->poolOwner)?"[owner] ":"", sp->pool); - - print_partition_table (f, _seq); - } - - } - -//---------- -// -// sequence_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void sequence_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&sequenceStats, 0, sizeof(sequenceStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// sequence_show_stats, -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. 
-// -// Returns: -// (nothing) -// -//---------- - -void sequence_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - - fprintf (f, " partition lookups: %s\n", commatize (sequenceStats.partitionLookups)); - if (sequenceStats.partitionHits != 0) - fprintf (f, " partition hits: %s\n", commatize (sequenceStats.partitionHits)); - if (sequenceStats.partitionLookups != 0) - fprintf (f, " lookup iterations: %s (%.1f per)\n", - commatize (sequenceStats.lookupIterations), - sequenceStats.lookupIterations / ((double) sequenceStats.partitionLookups)); - fprintf (f, "-------------------\n"); - -#endif // collect_stats - } diff --git a/programs/lastz/src/sequences.h b/programs/lastz/src/sequences.h deleted file mode 100755 index 51cfe96..0000000 --- a/programs/lastz/src/sequences.h +++ /dev/null @@ -1,721 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: sequences.h -// -//---------- - -#ifndef sequences_H // (prevent multiple inclusion) -#define sequences_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff - -// establish ownership of global variables - -#ifdef sequences_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef sequences_owner -int sequences_keepFastaArrow = false; // true => keep ">" on header in - // .. load_fasta_sequence() -int sequences_dbgDumpSequence = false; // true => dump sequences to stderr - // .. after input, masking, etc. 
-int sequences_dbgAllowColors = false; // true => allow color space -#else -global int sequences_keepFastaArrow; -global int sequences_dbgDumpSequence; -global int sequences_dbgAllowColors; -#endif - -//---------- -// -// sequence sizing types -// (see also "malloc sizing range" in utilties.h) -// These types control the range of sequence lengths we can handle before some -// values will overflow. For the sake of efficiency, we generally do not check -// for overflow when indexing sequences. -// -// Sequence lengths are normally assumed to be small enough to fit into a -// 31-bit integer. This gives a maximum length of about 2.1 billion bp, which -// is half the length of a (hypothetical) monoploid human genome. The -// programmer can override this at compile time by defining max_sequence_index -// as 32 or 63. -// -// maxSequenceLen (L) is the longest sequence that we intend to support. -// -// unspos is an unsigned type that can represent 0..L. -// -// sgnpos is a signed type that can represent -L..L. -// -// possum is an unsigned type that can represent a sum of values in the 0..L -// range. 
-// -//---------- - -#define maxSequenceOverrun 10 - -#if defined(max_sequence_index) -#define maxSequenceIndex max_sequence_index -#else -#define maxSequenceIndex 31 -#endif - -#if (maxSequenceIndex == 31) -#define unspos_sz 4 -typedef u32 unspos; -typedef s32 sgnpos; -typedef u64 possum; -#elif (maxSequenceIndex == 32) -#define unspos_sz 4 -typedef u32 unspos; -typedef s64 sgnpos; -typedef u64 possum; -#elif (maxSequenceIndex == 63) -#define unspos_sz 8 -typedef u64 unspos; -typedef s64 sgnpos; -typedef u64 possum; -#else -#error ***** undecipherable max sequence length definition ***** -#endif - -#define maxSequenceLen ((unspos)(((1ULL)< -#if (maxSequenceIndex == 31) -#define unsposFmt "%" PRIu32 -#define unsposStarFmt "%*" PRIu32 -#define sgnposFmt "%" PRId32 -#define possumFmt "%" PRIu64 -#define unsposFmtScanf "%" SCNu32 -#define sgnposFmtScanf "%" SCNd32 -#define possumFmtScanf "%" SCNu64 -#elif (maxSequenceIndex == 32) -#define unsposFmt "%" PRIu32 -#define unsposStarFmt "%*" PRIu32 -#define sgnposFmt "%" PRId64 -#define possumFmt "%" PRIu64 -#define unsposFmtScanf "%" SCNu32 -#define sgnposFmtScanf "%" SCNd64 -#define possumFmtScanf "%" SCNu64 -#elif (maxSequenceIndex == 63) -#define unsposFmt "%" PRIu64 -#define unsposStarFmt "%*" PRIu64 -#define sgnposFmt "%" PRId64 -#define possumFmt "%" PRIu64 -#define unsposFmtScanf "%" SCNu64 -#define sgnposFmtScanf "%" SCNd64 -#define possumFmtScanf "%" SCNu64 -#endif - -#endif // override_inttypes - -#define unsposSlashFmt unsposFmt "/" unsposFmt -#define unsposSlashSFmt unsposFmt "%s/" unsposFmt "%s" -#define unsposSlashCFmt unsposFmt "%c/" unsposFmt "%c" -#define unsposCommaFmt unsposFmt "," unsposFmt -#define unsposDashFmt unsposFmt "-" unsposFmt -#define unsposDotsFmt unsposFmt ".." 
unsposFmt - -#define unsposSlashFmtScanf unsposFmtScanf "/" unsposFmtScanf -#define unsposSlashSFmtScanf unsposFmtScanf "%s/" unsposFmtScanf "%s" -#define unsposSlashCFmtScanf unsposFmtScanf "%c/" unsposFmtScanf "%c" -#define unsposCommaFmtScanf unsposFmtScanf "," unsposFmtScanf -#define unsposDashFmtScanf unsposFmtScanf "-" unsposFmtScanf -#define unsposDotsFmtScanf unsposFmtScanf ".." unsposFmtScanf - -// interval (start-end position pair) -// note that we do not define here whether the the interval is origin-zero/one -// .. or open/closed - -typedef struct interval { unspos s; unspos e; } interval; - -//---------- -// -// sequence data structures and types -// -//---------- -// -// sequences-- -// A sequence is a vector of characters (unsigned 8-bit values). -// -//---------- -// -// Notes: -// (1) In the struct definitions, an (H) indicates pointers to items allocated -// separately, and which therefore must be deallocated when the sequence is -// deallocated. (O) indicates pointers to items for which another field -// tells us whether or not should be deallocated here. (I) indicates -// pointers to items within the same structure. (X) indicates pointers to -// other things but which do not need to be deallocated. (F) indicates -// files which must be closed when the sequence is deallocated. -// -// (2) For partitioned sequences, p[0].sepBefore is always 0 and -// p[len].sepBefore is always seq->len. seq->v[p[ix].sepBefore] is a NUL, -// for all ix (even for 0 and len). All partitions are bounded on both -// sides by a NUL, even the first partition. -// -//---------- - -#define maxSequenceName 100 -#define maxSequenceHeader 992 -#define seqBufferSize (maxSequenceHeader+32) -#define maxFastqSequenceLen 10000 -#define maxChoreTagLen 15 - -typedef struct chore - { - int num; // the number of this chore among chores on the - // .. same query; the first chore is 1 - - char tName // name of the target sequence; an empty - [maxSequenceName+1];// .. 
string indicates a wildcard name; - // .. note that the query sequence name is - // .. stored in seq.nextContigName - - int tSubrange; // true => tStart,tEnd are meaningful - // false => the entire target is to be used - unspos tStart, tEnd; // origin-one half-open - - int qSubrange; // true => qStart,qEnd are meaningful - // false => the entire query is to be used - unspos qStart, qEnd; // origin-one half-open - int qStrand; // 0 => search + strand only - // < 0 => search - strand only - // > 0 => search both strands - - char idTag // user-specified tag to identify this chore - [maxChoreTagLen+1];//.. an empty string indicates "no tag" - - interval targetInterval; // corresponding index range in target->v[] or - interval queryInterval; // and query->v[]; the intervals are origin- - // .. zero closed - } chore; - - -typedef struct partition - { // .. layout must match struct cappartition (capsule.h) - unspos sepBefore; // the position (in seq->v) of the - // .. separating NUL preceding this - // .. partition; valid only for entries - // .. 0..len in seqpartition->p; for - // .. entry seqpartition.len, it is the - // .. position of the final NUL in seq->v; - // .. see note (2) above - unspos sepAfter; // the position (in seq->v) of the - // .. separating NUL following this - // .. partition; valid only for entries - // .. 0..len-1 in seqpartition->p - u32 contig; // the contig number of this partition - // .. in the actual sequence file; valid - // .. only for entries 0..len-1 in - // .. seqpartition->p - unspos startLoc; // 1-based starting location; this is only - // .. valid after a sequence is loaded - unspos trueLen; // the number of characters in the actual - // .. sequence for this partition (similar - // .. to seq->trueLen); valid only for - // .. entries 0..len-1 in seqpartition->p - u32 header; // the header of this partition; valid - // .. only for entries 0..len-1 in - // .. seqpartition->p; points into - // .. 
seqpartition->pool - } partition; - - -typedef struct seqpartition - { - // information for a partitioned sequence-- a single sequence which holds - // several actual sequences, each separated by a NUL character (a zero) - - int state; // the current initialization state; one - // .. of seqpart_xxx below - - u32 size; // the number of entries allocated for - // .. p[]; note that this must always be - // .. at least 1 more than len - u32 len; // the number of partitions of the sequence - - partition* p; // (H) an array, indexed by 0..len, of the - // .. partitions - - u32 poolSize; // the number of bytes allocated (size) and - u32 poolLen; // .. actually used (len) for pool[] - char* pool; // (O) an array containing the characters for - // .. header[] - int poolOwner; // true => we must deallocate pool[] - // false => we must *not* deallocate it - } seqpartition; - -enum // states for a partitioned sequence (only valid - { // .. if seqpartition.p != NULL - seqpart_empty = 0, // created but no partitions loaded yet - seqpart_reusable, // all partitions loaded, ready for use if same - // .. sequence is to be used again - seqpart_loading, // some partitions loaded, but not all - seqpart_ready // all partitions loaded - }; - - -typedef struct twobit - { - // information for 2bit files only - - int bigEndian; // true => file was written as big endian - // false => written as little endian - u32 numContigs; // the number of contigs in the file (this - // .. is the sequenceCount field) - long int indexFilePos; // position (as per ftell) of the beginning - // .. of the file index - long int contigFilePos; // position (as per ftell) of the next entry - // .. in the file index - - int contigLoaded; // true => the sequence corresponding to - // .. seq.contig has been loaded - // .. into v[] - - u32 nBlocksSize; // "temporary" arrays to hold n-blocks; - u32* nBlockStarts; // (H) .. nBlocksSize is the number of entries - u32* nBlockSizes; // (H) .. 
*allocated* for nBlockStarts[] and - // .. nBlockSizes[] - - u32 mBlocksSize; // "temporary" arrays to hold mask-blocks; - u32* mBlockstarts; // (H) .. mBlocksSize is the number of entries - u32* mBlocksizes; // (H) .. *allocated* for mBlockstarts[] and - // .. mBlocksizes[] - } twobit; - - -typedef struct hsxfileinfo // (see hsx.fileInfo) - { - char* name; // file's name - FILE* f; // (if non-NULL), pointer to open file - } hsxfileinfo; - -typedef struct hsx - { - // information for hsx files only - - int bigEndian; // true => file was written as big endian - // false => written as little endian - - u32 version; - u32 numContigs; // the number of contigs in the file (this - // .. is the numSequences field) - u32 numFiles; // the number of files in the file index - u32 numBuckets; // the number of hash buckets (not including - // .. the sentinel bucket) - - u64 fileTableOffset; // position of the beginning of the file - // .. table - u64 hashTableOffset; // position of the beginning of the hash - // .. table - u64 seqTableOffset; // position of the beginning of the sequence - // .. index table - - int contigLoaded; // true => the sequence corresponding to - // .. seq.contig has been loaded - // .. into v[] - - u64 contigFilePos; // position of the next entry in the - // .. sequence table - - hsxfileinfo* fileInfo; // (H) internal copy of the file info table; - // .. indexed by 0..numFiles-1; - // .. *seqFileInfo[i].name is the - // .. reconstructed name of file i; - // .. *seqFileInfo[i].f is the file pointer - // .. to file i, if non-NULL; - int seqFileIx; // index (into fileInfo) of the current - // .. sequence - u64 seqFilePos; // position of the current sequence, in - // .. seqFile - u64 seqLength; // length (in bytes) of the current sequence - } hsx; - - -typedef struct seq - { - // sequence content - - unspos size; // the number of bytes allocated for v[] - unspos len; // the number of characters in the sequence, - // .. 
not including a terminating zero - int needsVq; // true => we allocate for vq with v - // fasle => we don't - u8* v; // (O) the sequence content; if v is not NULL, - // .. v[len] is a terminating zero - u8* vc; // (O) the sequence content in color space; - // .. if vc is not NULL, vc[len] is a - // .. terminating zero - u8* vq; // (O) the sequence's base-call qualities (ascii) - // .. if vq is not NULL, vq[len] is a - // .. terminating zero - - int vOwner; // true => we must deallocate v[] - // false => we must *not* deallocate it - int vcOwner; // true => we must deallocate vc[] - // false => we must *not* deallocate it - int vqOwner; // true => we must deallocate vq[] - // false => we must *not* deallocate it - - unspos startLoc; // 1-based starting location; this is only - // .. valid after a sequence is loaded - unspos trueLen; // the number of characters in the actual - // .. sequence, including those not stored - // .. in v[] - int needTrueLen; // true => trueLen must be set correctly, - // .. even if this means reading - // .. additional characters outside - // .. the desired (sub)interval - int revCompFlags; // two bits describing how this sequence - // .. relates to what was read from the - // .. file; the four values are the rcf_xxx - // .. values defined below - char* contigOfInterest; // (H) the name of the only sequence of - // .. interest; NULL means we're interested - // .. in every sequence - u32 contig; // the number of the subsequence of the - // .. actual file (provided for fasta, 2bit - // .. and hsx files); the first contig is 1 - int preLoaded; // true => the data currently in v[] has - // .. been pre-loaded from the file - // .. behind the caller's back - - int lockedHeader; // true => don't change header or - // .. shortHeader - u32 headerSize; // number of bytes allocated for header[] - char* header; // (O) the sequence's header (e.g. for each - // .. sequence in a fasta file); this may - // .. 
be NULL - int headerOwner; // true => we must deallocate header[] - // false => we must *not* deallocate it - u32 shortHeaderSize; // number of bytes allocated for - // .. shortHeader[] - char* shortHeader; // (O) short version of the header; this may be - // .. NULL - int shortHeaderOwner; // true => we must deallocate shortHeader[] - // false => we must *not* deallocate it - int hasNickname; // true => header is a nickname, and should - // .. be copied into shortHeader - // .. without stripping paths - u32 trueHeaderSize; // number of bytes allocated for trueHeader[] - char* trueHeader; // (O) the sequence's true header (primarily - // .. used to validate fastq files); this - // .. may be NULL - int trueHeaderOwner; // true => we must deallocate fauxHeader[] - // false => we must *not* deallocate it - - int hasLeftFence; // a 'fence' at the left end of some - unspos leftFencePos; // .. interval; if hasLeftFence is true, - u8 leftFenceCh; // .. a marker has been written to - // .. v[leftFencePos]; leftFenceCh is the - // .. character that was there before - int hasRightFence; // a 'fence' at the right end of some - unspos rightFencePos; // .. interval (similar ot the left fence) - u8 rightFenceCh; // - - // file containing the sequence - - char* filename; // (H) the name of the file associated with this - // .. sequence; this may be NULL - FILE* f; // (F) the file associated with this sequence; - // .. this can be NULL; with NULL, a - // .. fileType of seq_type_nofile indicates - // .. this sequence was manufactured in - // .. memory; any other fileType indicates - // .. a cloned seqeunce - int fileType; // the type of file being read (one of - // .. seq_type_xxx) - int rewindable; // true => we can rewind the file - // false => we cannot rewind the file - // -1 => we do not know if we can rewind - // .. the file or not - - u32 pendingLen; // characters that have been read from the - char* pendingChars; // (H) .. 
file but which have not been consumed; - char* pendingStack; // (X) .. pendingChars is an array of size - // .. seqBufferSize; pendingLen gives the - // .. number of non-consumed characters - // .. remaining; pendingStack points to the - // .. top of the stack, which builds down - // .. from pendingChars+seqBufferSize - - twobit twoBit; // additional info for 2bit files (only - // .. valid if fileType == seq_type_2bit) - hsx hsx; // additional info for hsx files (only - // .. valid if fileType == seq_type_hsx) - seqpartition partition; // additional info for partitioned - // .. sequences (only valid if partition.p - // .. is not NULL) - qcode* qCoding; // (H) table to map quantum symbols to - // .. probabilities (this may be NULL) - - // saved file state - - int hasSavedState; // true => savedFilePos is meaningful - long int savedFilePos; // (as per ftell) - - // sequence read control options - - char* namesFilename; // (H) the name of a file containing a list of - // .. contig names; this may be NULL - FILE* namesFile; // (F) file corresponding to namesFilename - char nextContigName // the name of the next contig-of-interest; - [maxSequenceName+1];// .. only valid if namesFile is not NULL - int contigPending; // true => the sequence file has already - // .. been scanned to locate the - // .. sequence for nextContigName, - // .. is positioned appropriately, - // .. and that sequence hasn't been - // .. loaded yet; only valid if - // .. namesFile is not NULL - - char* choresFilename; // (H) the name of a file containing a list of - // .. "alignment chores"; this may be NULL - FILE* choresFile; // (F) file corresponding to choresFilename - int choresLineNum; // line number of the current chore - chore chore; // the current alignment chore (valid only - // .. if choresFile is non-NULL) - - int subsampleK; // specifier for K-of-N subsampling; only - int subsampleN; // the Kth sequence of every group of N are - // .. processed, with 1<=K<=N; only valid - // .. 
if subsampleN is greater than zero - int subsampleSkip; // the current subsampling state; this is - // .. the number of sequences that we will - // .. skip before the next one that we keep - - char* softMaskFilename; // (H) the name of a file containing - // .. soft-masking info to apply; this - // .. may be NULL - int softMaskComplement; // false => soft-masking replaces any bases - // .. in the intervals - // true => soft-masking replaces any bases - // .. NOT in the intervals - char* xMaskFilename; // (H) the name of a file containing X-masking - // .. info to apply; this may be NULL - int xMaskComplement; // false => x-masking replaces any bases in - // .. the intervals - // true => x-masking replaces any bases - // .. NOT in the intervals - char* nMaskFilename; // (H) the name of a file containing N-masking - // .. info to apply; this may be NULL - int nMaskComplement; // (similar to xMaskComplement) - unspos startLimit, // 1-based starting and ending locations - endLimit; // .. (inclusive) limiting the part of the - // .. sequence we are to read; these are - // .. zero if there are no limits - int endIsSoft; // true => while the endLimit has been - // .. specified, it is a soft limit - // .. and can be trimmed to the - // .. actual end of the sequence - - int doRevCompFlags; // two bits describing whether any sequence - // .. we read is to be reversed or - // .. complemented; the four values are the - // .. rcf_xxx values defined below - int doUnmask; // true => any sequence we read is to be - // .. unmasked (converted to upper - // .. case) - int doPartitioning; // true => the sequence will be partitioned - int doJoin; // true => combine the file's sequences - // .. into a partitioned sequence - char separatorCh; // (if not NUL) the file's sequence should - // .. be separated into partitions at any - // .. 
run of this character - int useFullNames; // true => report full names in alignments - // false => report short names instead - int nameParseType; // how to parse sequence headers, if at all; - // .. one of name_parse_type_xxx - char* nameTrigger; // a string to trigger the fetch of the - // .. short sequence name from a fasta - // .. header line; e.g. "name:" means to - // .. fetch the name stating after the ":"; - // .. NULL indicates an absent trigger - int allowAmbiDNA; // (this applies only to reading fasta - // .. files) - // true => permit ambiguous DNA characters - // .. B,D,H,K,M,R,S,V,W,Y - // false => only A,C,G,T,N,X permitted - u8* qToComplement; // (X) (similar to nuc_to_complement) array to - // .. map a quantum base to its complement; - // .. this may be NULL - } seq; - - -enum - { - seq_type_nofile=0, - seq_type_unknown, - seq_type_fasta, // dna, permitting ambiguity - seq_type_fastq, // dna with base-call quality scores - seq_type_csfasta, // color-space dna - seq_type_nib, // nybble-coded dna - seq_type_2bit, // 2bit-coded dna - seq_type_hsx, // hashed sequence index - seq_type_qdna, // quantum-dna (byte-coded prob'listic ambiguity) - seq_type_max // (sentinel for enum of seq_type_xxx) - }; - -#ifdef sequences_owner -char* seqTypeNames[seq_type_max] = - {"(no file)", "unknown", - "fasta", "fastq", "csfasta", "nib", "2bit", "hsx", "qdna" }; -#else -extern char* seqTypeNames[]; -#endif - -enum - { - rcf_forward = 0, // sequence is in normal order, uncomplemented - rcf_comp = 1, // sequence is complemented but not reversed - rcf_rev = 2, // sequence is reversed but not complemented - rcf_revcomp = 3 // sequence is reversed and complemented - }; - -enum - { - name_parse_fill_white=1, // (modifier for types below) convert any - // whitespace in the resulting name to - // underline character - name_parse_type_core=0, // trim path on left, junk and ext on right - name_parse_type_alnum=2, // trim path on left, use only alphanumeric and - // .. 
underscore - name_parse_type_darkspace=4,// trim path on left, use anything but - // .. whitespace - name_parse_type_trigger=6 // use nameTrigger - }; - -#define parse_type(nameParseType) ((nameParseType) & (~name_parse_fill_white)) - -#define minFastqCh '!' -#define maxFastqCh '~' - -//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - u64 partitionLookups; - u64 partitionHits; - u64 lookupIterations; - } sequenceStats; - -// stats macros - -#define sequence_count_stat(field) ++sequenceStats.field -#define sequence_uncount_stat(field) --sequenceStats.field -#define sequence_set_stat(field,val) (sequenceStats.field = val) -#define sequence_add_stat(field,val) (sequenceStats.field += val) -#else -#define sequence_count_stat(field) -#define sequence_uncount_stat(field) -#define sequence_set_stat(field,val) -#define sequence_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void sequence_zero_stats (void); -void sequence_show_stats (FILE* f); - -//---------- -// -// prototypes for routines in sequences.c -// -//---------- - -seq* open_sequence_file (char* name, int fileType, - int choresAllowed, char* choresFilename, - unspos allocLen, - int needTrueLen, int prohibitAmbiDNA, - u8* qToComplement); -seq* open_rewindable_sequence_file (char* name, int fileType, - int choresAllowed, char* choresFilename, - unspos allocLen, - int needTrueLen, int prohibitAmbiDNA, - u8* qToComplement); -void rewind_sequence_file (seq* seq); -seq* clone_sequence (seq* seq); -seq* copy_sequence (seq* seq); -seq* new_sequence (unspos allocLen); -void sequence_long_enough (seq* seq, unspos allocLen, int anticipate); -void free_sequence (seq* seq); -int load_sequence (seq* seq); -int another_sequence (seq* seq); -partition* lookup_partition_no_die (seq* seq, unspos pos); -partition* lookup_partition (seq* seq, unspos pos); -partition* lookup_named_partition (seq* seq, char* name); -partition* 
last_partition_with_name (seq* seq, partition* firstPart); -partition* lookup_partition_seq_pos (seq* seq, partition* part, unspos pos); -void print_sequence (FILE* f, seq* seq, char* header, int perLine); -void print_partition_table (FILE* f, seq* _seq); -void mask_sequence (seq* seq, char* maskFilename, int maskChar); -void mask_sequence_keep (seq* seq, char* maskFilename, int maskChar); -void colorize_sequence (seq* seq); -void validate_rev_comp (seq* seq); -void rev_comp_sequence (seq* seq, const u8* nucToComplement); -void backward_sequence (seq* seq); -void upper_sequence (seq* seq); -char* copy_reverse_of_string (char* s, unspos len); -void strncpy_reverse (char* d, char* s, unspos len); -void fence_sequence_interval (seq* seq, interval interval, u8 ch); -void unfence_sequence_interval (seq* seq); -void print_file_actions (FILE* f); -void match_composition (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - unspos count[4][4]); -int percent_identical (seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); -score score_match (scoreset* scoring, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); -void dump_aligned_nucleotides (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length); -void dump_sequence (FILE* f, seq* _seq); -void dump_sequence_state (FILE* f, seq* _seq); - -#undef global -#endif // sequences_H diff --git a/programs/lastz/src/text_align.c b/programs/lastz/src/text_align.c deleted file mode 100755 index c5749be..0000000 --- a/programs/lastz/src/text_align.c +++ /dev/null @@ -1,1093 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: text_align.c -// -//---------- -// -// text_align-- -// Support for printing alignments in a textual alignment format. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff -#include "diag_hash.h" // diagonals hashing stuff - -#define text_align_owner // (make this the owner of its globals) -#include "text_align.h" // interface to this module - -//---------- -// -// private data -// -//---------- - -#define maxDisplayPerRow 50 -#define dnaDisplayPerRow 50 -#define quantumDisplayPerRow 20 - -typedef struct aligndisplay - { - FILE* f; // the file to print to - int displayPerRow; // number of characters in a displayed row - int rev1, rev2; // true => the corresponding sequence is - // .. a reverse-complement - unspos beg1, beg2; // index into seq1 and seq2 of start of - // .. current line (origin1, inclusive) - unspos loc1, loc2; // current index into seq1 and seq2 - int ix; // current index into row1 and row2 - int quantum1; // sequence 1 is quantum DNA - qcode* qCoding1; // table to map sequence 1 quantum symbols - // .. to probabilities (this may be NULL) - int quantum2; // sequence 2 is quantum DNA - qcode* qCoding2; // table to map sequence 2 quantum symbols - // .. 
to probabilities (this may be NULL) - u8 gap1, gap2; // character for gap in seq1 and seq2 - u8 row1[maxDisplayPerRow+1]; // current row of seq1 - u8 row2[maxDisplayPerRow+1]; // current row of seq2 - } aligndisplay; - -//---------- -// -// prototypes for private functions -// -//---------- - -static void expand_segment (seq* seq1, unspos* pos1, - seq* seq2, unspos* pos2, unspos* length, - u32 expandLeft, u32 expandRight); -static aligndisplay* display_init (FILE* f, - unspos beg1, int rev1, - unspos beg2, int rev2, - int quantum1, qcode* qCoding1, - int quantum2, qcode* qCoding2); -static void display_finish (aligndisplay* disp); -static void display_add (aligndisplay* disp, u8 ch1, u8 ch2); -static void display_print (aligndisplay* disp); - -//---------- -// -// print_text_align_job_header-- -// Print a textual alignment job header. -// -//---------- - -void print_text_align_job_header - (arg_dont_complain(FILE* f), - arg_dont_complain(char* programName), - arg_dont_complain(char* name1), - arg_dont_complain(char* name2), - arg_dont_complain(int oneBased)) - { - } - -//---------- -// -// print_text_align_job_footer-- -// Print a textual alignment job footer. -// -//---------- - -void print_text_align_job_footer - (arg_dont_complain(FILE* f)) - { - // (do nothing) - } - -//---------- -// -// print_text_align_header-- -// Print a textual alignment query header. -// -//---------- - -void print_text_align_header - (arg_dont_complain(FILE* f), - arg_dont_complain(seq* seq1), - arg_dont_complain(seq* seq2), - arg_dont_complain(int oneBased)) - { - // (do nothing) - } - -//---------- -// -// print_text_align_align_list-- -// Print a list of gapped alignments, textually. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// alignel* alignList: The list of alignments to print. -// seq* seq1: One sequence. -// seq* seq2: Another sequence. 
-// int oneBased: true => show positions as origin 1 -// false => show positions as origin 0 -// u32 expand: Number of extra bp to print at the ends of matches, -// .. to provide context. -// -// Returns: -// (nothing) -// -//---------- - -void print_text_align_align_list - (FILE* f, - alignel* alignList, - seq* seq1, - seq* seq2, - int oneBased, - u32 expand) - { - alignel* a; - - for (a=alignList ; a!=NULL ; a=a->next) - print_text_align_align (f, - seq1, a->beg1-1, a->end1, - seq2, a->beg2-1, a->end2, - a->script, a->s, oneBased, expand); - } - -//---------- -// -// print_text_align_align-- -// Print a single gapped alignment, textually. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos beg1, end1: Range of positions in sequence 1 (origin 0). -// seq* seq2: Another sequence. -// unspos beg2, end2: Range of positions in sequence 2 (origin 0). -// editscript* script: The script describing the path the alignment takes -// .. in the DP matrix. -// score s: The alignment's score. -// int oneBased: true => show positions as origin 1 -// false => show positions as origin 0 -// u32 expand: Number of extra bp to print at the ends of matches, -// .. to provide context. 
-// -// Returns: -// (nothing) -// -//---------- -// -// Typical output: -// -// 61 TCTATCGGTAACCTAATAGA--GACTGAAGCTTACCCCTATGATCTTTGA -// ||||||||||||||||||| ||| | || | | | | || -// 41 TCTATCGGTAACCTAATAGTTTGACGACGGTGTATCTATGTTAGTGTTAT -// -// 109 CTGGAGTTGTTACGCGATAT-CTTTACCTGTTATCTGGCAC -// ||| | | | | || | |||||||||||||||||||| -// 91 CTG---CTATAAAGACATCTACTTTACCTGTTATCTGGCAC -// -//---------- - -void print_text_align_align - (FILE* f, - seq* seq1, - unspos beg1, - unspos end1, - seq* seq2, - unspos beg2, - unspos end2, - editscript* script, - score s, - int oneBased, - u32 expand) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - const int rev1 = ((seq1->revCompFlags & rcf_rev) != 0); - const int rev2 = ((seq2->revCompFlags & rcf_rev) != 0); - unspos extra1, extra2; - u32 expandLeft, expandRight; - unspos offset1, offset2; - unspos seq1Len, seq2Len; - unspos dispBeg1, dispBeg2; - aligndisplay* disp; - unspos height, width, i, j, run; - u32 opIx; - u8* p, *q; - u32 ix; - int bo = (oneBased)? 
0 : -1; - - expandLeft = expandRight = 0; - if (expand > 0) - { - expandLeft = (u32) beg1; - if (((u32) beg2) < expandLeft) expandLeft = (u32) beg2; - if (expand < expandLeft) expandLeft = expand; - beg1 -= expandLeft; - beg2 -= expandLeft; - - extra1 = seq1->len - end1; - extra2 = seq2->len - end2; - expandRight = (u32) extra1; - if (((u32) extra2) < expandRight) expandRight = (u32) extra2; - if (expand < expandRight) expandRight = expand; - end1 += expandRight; - end2 += expandRight; - } - - beg1++; // (internally, we want origin 1, inclusive) - beg2++; - - height = end1 - beg1 + 1; - width = end2 - beg2 + 1; - - // report diagonal - - if (text_align_dbgReportDiag) - fprintf (f, "# diagonal=" sgnposFmt "\n", diagNumber(beg1,beg2)); - - ////////// - // figure out the alignment's length - ////////// - - opIx = 0; - for (i=j=0 ; (ilen) run += expandRight; - i += run; j += run; - - // handle the next indel - - if ((i < height) || (j < width)) - edit_script_indel_len (script, &opIx, &i, &j); - } - - fprintf (f, "score:" scoreFmt " length:(" unsposFmt " " unsposFmt ")\n", - s, i, j); - - ////////// - // figure out position offsets - ////////// - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - offset1 = 0; - seq1Len = seq1->len; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, beg1); - offset1 = part->sepBefore + 1; - seq1Len = part->sepAfter - offset1; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - offset2 = 0; - seq2Len = seq2->len; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, beg2); - offset2 = part->sepBefore + 1; - seq2Len = part->sepAfter - offset2; - } - - ////////// - // draw the alignment (non-printables are printed as '*' but such should - // never be seen unless there is a problem elsewhere) - ////////// - - dispBeg1 = (rev1)? (seq1Len+1 + bo - beg1) : (beg1 + bo - offset1); - dispBeg2 = (rev2)? 
(seq2Len+1 + bo - beg2) : (beg2 + bo - offset2); - - disp = display_init (f, dispBeg1, rev1, dispBeg2, rev2, - (seq1->fileType == seq_type_qdna), seq1->qCoding, - (seq2->fileType == seq_type_qdna), seq2->qCoding); - if (disp == NULL) - return; - - opIx = 0; - for (i=j=0 ; (ilen) run += expandRight; - - p = seq1->v+beg1+i-1; - q = seq2->v+beg2+j-1; - for (ix=0 ; ixv+beg1+i-1; - startJ = j; q = seq2->v+beg2+j-1; - - edit_script_indel_len (script, &opIx, &i, &j); - - if (i != startI) - { - for ( ; startIgap2); p++; } - } - - if (j != startJ) - { - for ( ; startJgap1, dna_toprint(*q)); q++; } - } - } - } - - display_finish (disp); - } - -//---------- -// -// print_text_align_match-- -// Print an hsp in a textual alignment. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// seq* seq1: One sequence. -// unspos pos1: The position, in seq1, of first character in the match -// .. (origin-0). -// seq* seq2: Another sequence. -// unspos pos2: The position, in seq2, of first character in the match -// .. (origin-0). -// unspos length: The number of nucleotides in the HSP. -// score s: The HSP's score. -// int oneBased: true => show positions as origin 1 -// false => show positions as origin 0 -// u32 expand: Number of extra bp to print at the ends of matches, to -// .. provide context. -// -// Returns: -// (nothing) -// -//---------- - -static void print_quantum_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, int oneBased); - -static char quantum_match_char (qcode* qCoding1, u8 ch1, - qcode* qCoding2, u8 ch2); - - -void print_text_align_match - (FILE* f, - seq* seq1, - unspos pos1, - seq* seq2, - unspos pos2, - unspos length, - score s, - int oneBased, - u32 expand) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part; - u8* s1, *s2; - s8 b1, b2; - unspos offset1, offset2; - unspos startLoc1, startLoc2; - char c; - unspos ix; - int bo = (oneBased)? 
0 : -1; - - if (expand > 0) - expand_segment (seq1, &pos1, seq2, &pos2, &length, expand, expand); - s1 = seq1->v + pos1; - s2 = seq2->v + pos2; - - if ((seq1->fileType == seq_type_qdna) - || (seq2->fileType == seq_type_qdna)) - { - print_quantum_match (f, seq1, pos1, seq2, pos2, length, s, oneBased); - return; - } - - // report diagonal - - if (text_align_dbgReportDiag) - fprintf (f, "# diagonal=" sgnposFmt "\n", diagNumber(pos1,pos2)); - - fprintf (f, "score:" scoreFmt " length:" unsposFmt "\n", s, length); - - // figure out position offsets - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - offset2 = 0; - startLoc2 = seq2->startLoc; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - } - - // print aligning segment of sequence 1 (non-printables are printed as '*' - // but such should never be seen unless there is a problem elsewhere) - - fprintf (f, unsposStarFmt ": ", 10, pos1 + bo - offset1 + startLoc1); - for (ix=0 ; ixpartition; - seqpartition* sp2 = &seq2->partition; - int quantum1 = (seq1->fileType == seq_type_qdna); - int quantum2 = (seq2->fileType == seq_type_qdna); - qcode* qCoding1 = seq1->qCoding; - qcode* qCoding2 = seq2->qCoding; - partition* part; - u8* s1 = seq1->v + pos1; - u8* s2 = seq2->v + pos2; - unspos offset1, offset2; - unspos startLoc1, startLoc2; - unspos ix; - char3 pField; - u8 nucIx, ch1, ch2; - int bo = (oneBased)? 
0 : -1; - - fprintf (f, "score:" scoreFmt " length:" unsposFmt "\n", s, length); - - // figure out position offsets - - if (sp1->p == NULL) // sequence 1 is not partitioned - { - offset1 = 0; - startLoc1 = seq1->startLoc; - } - else // sequence 1 is partitioned - { - part = lookup_partition (seq1, pos1); - offset1 = part->sepBefore + 1; - startLoc1 = part->startLoc; - } - - if (sp2->p == NULL) // sequence 2 is not partitioned - { - offset2 = 0; - startLoc2 = seq2->startLoc; - } - else // sequence 2 is partitioned - { - part = lookup_partition (seq2, pos2); - offset2 = part->sepBefore + 1; - startLoc2 = part->startLoc; - } - - // print sequence 1 probabilities - - if (qCoding1 != NULL) - { - for (nucIx=0 ; nucIxdna) ; nucIx++) - { - fprintf (f, "%10c:", qCoding1->dna[nucIx]); - for (ix=0 ; ixp[ch1][nucIx]); - fprintf (f, " %s", pField.s); - } - fprintf (f, "\n"); - } - } - - // print aligning segment of sequence 1 - - fprintf (f, unsposStarFmt ":", 10, pos1 + bo - offset1 + startLoc1); - if (seq1->fileType == seq_type_qdna) - { for (ix=0 ; ixfileType == seq_type_qdna) - { for (ix=0 ; ixdna) ; nucIx++) - { - fprintf (f, "%10c:", qCoding2->dna[nucIx]); - for (ix=0 ; ixp[ch2][nucIx]); - fprintf (f, " %s", pField.s); - } - fprintf (f, "\n"); - } - } - - fprintf (f, "\n"); - } - - -static char quantum_match_char - (qcode* qCoding1, - u8 ch1, - qcode* qCoding2, - u8 ch2) - { - double pDiff, pDiffSum; - int chIx; - char* lookup1, *lookup2; - u8 nucIx1, nucIx2, ch; - - // if we have no coding, just return a blank - - if ((qCoding1 == NULL) && (qCoding2 == NULL)) - return ' '; - - // if one of the codings is absent, make sure it is qCoding2 - - if (qCoding1 == NULL) - { - qcode* tqc; - u8 tch; - tqc = qCoding1; qCoding1 = qCoding2; qCoding2 = tqc; - tch = ch1; ch1 = ch2; ch1 = tch; - } - - // handle the one coding case - - if (qCoding2 == NULL) - { - lookup1 = strchr(qCoding1->dna,(char)ch2); - if (lookup1 != NULL) - { - nucIx1 = lookup1 - qCoding1->dna; - if 
(qCoding1->p[ch1][nucIx1] >= .75) return '|'; - else if (qCoding1->p[ch1][nucIx1] >= .40) return ':'; - } - return ' '; - } - - // handle the two coding case - - pDiffSum = 0.0; - for (chIx=0 ; chIx<4 ; chIx++) - { - ch = bits_to_nuc[chIx]; - lookup1 = strchr(qCoding1->dna,(char)ch); - lookup2 = strchr(qCoding2->dna,(char)ch); - if ((lookup1 != NULL) && (lookup2 != NULL)) - { - nucIx1 = lookup1 - qCoding1->dna; - nucIx2 = lookup2 - qCoding2->dna; - pDiff = qCoding1->p[ch1][nucIx1] - qCoding2->p[ch2][nucIx2]; - if (pDiff < 0) pDiff = -pDiff; - pDiffSum += pDiff; - } - } - - if (1-pDiffSum >= .75) return '|'; - else if (1-pDiffSum >= .40) return ':'; - - return ' '; - } - -//---------- -// -// expand_segment-- -// Expand a segment by adding bases to its ends. -// -//---------- -// -// Arguments: -// seq* seq1: One sequence. -// unspos* pos1: The position, in seq1, of first character in the -// .. match (origin-0). -// seq* seq2: Another sequence. -// unspos* pos1: The position, in seq2, of first character in the -// .. match (origin-0). -// unspos* length: The number of nucleotides in the HSP. -// u32 expandLeft: Number of extra bp to print add at the left end. -// u32 expandRight: Number of extra bp to print add at the right end. 
-// -// Returns: -// (nothing) -// -//---------- - -static void expand_segment - (seq* seq1, - unspos* pos1, - seq* seq2, - unspos* pos2, - unspos* length, - u32 expandLeft, - u32 expandRight) - { - unspos beg1 = *pos1; - unspos beg2 = *pos2; - unspos end1 = beg1 + *length; - unspos end2 = beg2 + *length; - unspos extra1, extra2; - - if (expandLeft > 0) - { - if (beg1 < (unspos) expandLeft) expandLeft = beg1; - if (beg2 < (unspos) expandLeft) expandLeft = beg2; - beg1 -= expandLeft; - beg2 -= expandLeft; - } - - if (expandRight > 0) - { - extra1 = seq1->len - end1; - extra2 = seq2->len - end2; - if (extra1 < (unspos) expandRight) expandRight = extra1; - if (extra2 < (unspos) expandRight) expandRight = extra2; - end1 += expandRight; - // end2 += expandRight; (not needed) - } - - *pos1 = beg1; - *pos2 = beg2; - *length = end1 - beg1; - } - -//---------- -// -// display_init-- -// Initialize an alignment display. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// unspos beg1: Location of the start of the display, for sequence 1 -// (origin1, inclusive). -// int rev1: true => sequence 1 is a reverse-complement. -// unspos beg2: Location of the start of the display, for sequence 2. -// int rev2: true => sequence 2 is a reverse-complement. -// int quantum1: true => sequence 1 is quantum DNA -// qcode* qCoding1: Table to map sequence 1 quantum symbols to -// .. probabilities (this may be NULL) -// int quantum2: true => sequence 2 is quantum DNA -// qcode* qCoding2: Table to map sequence 2 quantum symbols to -// .. probabilities (this may be NULL) -// -// Returns: -// A pointer to the newly allocated display. If there is a failure, it is -// reported to the user and NULL is returned. 
-// -//---------- - -static aligndisplay* display_init - (FILE* f, - unspos beg1, - int rev1, - unspos beg2, - int rev2, - int quantum1, - qcode* qCoding1, - int quantum2, - qcode* qCoding2) - { - aligndisplay* disp; - - // allocate the display - // note that we don't call malloc_or_die here, because we don't want to kill - // an alignment in this case. - - disp = (aligndisplay*) malloc (sizeof(aligndisplay)); - if (disp == NULL) - { - fprintf (stderr, "unable to allocate alignment display for " unsposSlashFmt "\n", - beg1, beg2); - return NULL; - } - - // initialize it - - disp->f = f; - disp->beg1 = disp->loc1 = beg1; disp->rev1 = rev1; - disp->beg2 = disp->loc2 = beg2; disp->rev2 = rev2; - disp->ix = 0; - - disp->gap1 = disp->gap2 = '-'; - disp->displayPerRow = dnaDisplayPerRow; - - disp->quantum1 = quantum1; - disp->qCoding1 = NULL; - disp->quantum2 = quantum2; - disp->qCoding2 = NULL; - - if (quantum1) - { - disp->qCoding1 = qCoding1; - disp->gap1 = 0; - disp->displayPerRow = quantumDisplayPerRow; - } - - if (quantum2) - { - disp->qCoding2 = qCoding2; - disp->gap2 = 0; - disp->displayPerRow = quantumDisplayPerRow; - } - - return disp; - } - -//---------- -// -// display_finish-- -// Finish an alignment display. The final pair of lines is printed and the -// display is de-allocated. -// -//---------- -// -// Arguments: -// aligndisplay* disp: The display to finish. -// -// Returns: -// (nothing) -// -//---------- - -static void display_finish - (aligndisplay* disp) - { - if (disp->ix > 0) - { display_print (disp); printf ("\n"); } - - free (disp); - } - -//---------- -// -// display_add-- -// Add an aligned pair of characters to an alignment display. -// -//---------- -// -// Arguments: -// aligndisplay* disp: The display to add to. -// u8 ch1, ch2: The aligned characters. If either of these is -// .. '-', this is considered an indel. 
-// -// Returns: -// (nothing) -// -//---------- - -static void display_add - (aligndisplay* disp, - u8 ch1, - u8 ch2) - { - - // if there's no more room, push stuff out to the console - - if (disp->ix >= disp->displayPerRow) - display_print (disp); - - // add these characters - - disp->row1[disp->ix] = ch1; - disp->row2[disp->ix] = ch2; - disp->ix++; - - // update the sequence positions - - if (ch1 != disp->gap1) { if (disp->rev1) disp->loc1--; - else disp->loc1++; } - if (ch2 != disp->gap2) { if (disp->rev2) disp->loc2--; - else disp->loc2++; } - } - -//---------- -// -// display_print-- -// Print one pair of lines of an alignment display. This should only be -// called by display_add() or display_finish(). -// -//---------- -// -// Arguments: -// aligndisplay* disp: The display to print. -// -// Returns: -// (nothing) -// -//---------- - -static void quantum_display_print (aligndisplay* disp); - - -static void display_print - (aligndisplay* disp) - { - FILE* f = disp->f; - int digits = 10; - s8 b1, b2; - char c; - int ix; - - if ((disp->quantum1) || (disp->quantum2)) - { quantum_display_print (disp); return; } - - // terminate the lines - - disp->row1[disp->ix] = disp->row2[disp->ix] = 0; - - // print a top spacer - - fprintf (f, "\n"); - - // print the top (first) sequence - - fprintf (f, unsposStarFmt " %s\n", digits, disp->beg1, disp->row1); - - // print the match/mismatch row - - fprintf (f, "%*s ", digits, ""); - for (ix=0 ; ixix ; ix++) - { - b1 = nuc_to_bits[disp->row1[ix]]; - b2 = nuc_to_bits[disp->row2[ix]]; - - if ((disp->row1[ix] == disp->gap1) - || (disp->row2[ix] == disp->gap2)) - c = '-'; - else if ((b1 < 0) || (b2 < 0)) - c = ' '; - else if (b1 == b2) - c = '|'; - else if (bits_to_pur_pyr[(u8)b1] == bits_to_pur_pyr[(u8)b2]) - c = ':'; - else - c = ' '; - - fprintf (f, "%c", c); - } - fprintf (f, "\n"); - - // print the bottom (second) sequence - - fprintf (f, unsposStarFmt " %s\n", digits, disp->beg2, disp->row2); - - // prepare for the next line 
- - disp->beg1 = disp->loc1; - disp->beg2 = disp->loc2; - disp->ix = 0; - } - - -static void quantum_display_print - (aligndisplay* disp) - { - FILE* f = disp->f; - int quantum1 = disp->quantum1; - int quantum2 = disp->quantum2; - qcode* qCoding1 = disp->qCoding1; - qcode* qCoding2 = disp->qCoding2; - int digits = 10; - int ix; - char3 pField; - u8 nuc, ch1, ch2; - - // print a top spacer - - fprintf (f, "\n"); - - // print sequence 1 probabilities - - if (qCoding1 != NULL) - { - for (nuc=0 ; nucdna) ; nuc++) - { - fprintf (f, "%*c:", digits, qCoding1->dna[nuc]); - for (ix=0 ; ixix ; ix++) - { - ch1 = disp->row1[ix]; - if (ch1 == disp->gap1) - { fprintf (f, " .."); continue; } - ch1 = disp->row1[ix]; - if (ch1 == disp->gap1) - { fprintf (f, " ,,"); continue; } - pField = prob_to_string(qCoding1->p[ch1][nuc]); - fprintf (f, " %s", pField.s); - } - fprintf (f, "\n"); - } - } - - // print aligning text for sequence 1 - - fprintf (f, unsposStarFmt " ", digits, disp->beg1); - for (ix=0 ; ixix ; ix++) - { - if (disp->row1[ix] == disp->gap1) fprintf (f, " --"); - else if (quantum1) fprintf (f, " %02X", disp->row1[ix]); - else fprintf (f, " %c ", disp->row1[ix]); - } - fprintf (f, "\n"); - - // print the match/mismatch row - - if ((( quantum1) && (qCoding1 != NULL) && ( quantum2) && (qCoding2 != NULL)) - || ((!quantum1) && (qCoding1 == NULL) && ( quantum2) && (qCoding2 != NULL)) - || (( quantum1) && (qCoding1 != NULL) && (!quantum2) && (qCoding2 == NULL))) - { - fprintf (f, "%*s ", digits, ""); - for (ix=0 ; ixix ; ix++) - fprintf (f, " %c ", quantum_match_char (qCoding1, disp->row1[ix], - qCoding2, disp->row2[ix])); - fprintf (f, "\n"); - } - - // print aligning text for sequence 2 - - fprintf (f, unsposStarFmt " ", digits, disp->beg2); - for (ix=0 ; ixix ; ix++) - { - if (disp->row2[ix] == disp->gap2) fprintf (f, " --"); - else if (quantum2) fprintf (f, " %02X", disp->row2[ix]); - else fprintf (f, " %c ", disp->row2[ix]); - } - fprintf (f, "\n"); - - // print sequence 2 
probabilities - - if (qCoding2 != NULL) - { - for (nuc=0 ; nucdna) ; nuc++) - { - fprintf (f, "%*c:", digits, qCoding2->dna[nuc]); - for (ix=0 ; ixix ; ix++) - { - ch1 = disp->row1[ix]; - if (ch1 == disp->gap1) - { fprintf (f, " .."); continue; } - ch2 = disp->row2[ix]; - if (ch2 == disp->gap2) - { fprintf (f, " ,,"); continue; } - pField = prob_to_string(qCoding2->p[ch2][nuc]); - fprintf (f, " %s", pField.s); - } - fprintf (f, "\n"); - } - } - - // prepare for the next line - - disp->beg1 = disp->loc1; - disp->beg2 = disp->loc2; - disp->ix = 0; - } - diff --git a/programs/lastz/src/text_align.h b/programs/lastz/src/text_align.h deleted file mode 100644 index 1ab9c55..0000000 --- a/programs/lastz/src/text_align.h +++ /dev/null @@ -1,60 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: text_align.h -// -//---------- - -#ifndef text_align_H // (prevent multiple inclusion) -#define text_align_H - -// other files - -#include // standard C i/o stuff -#include "utilities.h" // utility stuff -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef text_align_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef text_align_owner -int text_align_dbgReportDiag = false; // true => report diagonal -#else -global int text_align_dbgReportDiag; -#endif - -//---------- -// -// prototypes for routines in text_align.c -// -//---------- - -void print_text_align_job_header (FILE* f, - char* programName, char* name1, char* name2, - int oneBased); -void print_text_align_job_footer (FILE* f); -void print_text_align_header (FILE* f, seq* seq1, seq* seq2, - int oneBased); -void print_text_align_align_list (FILE* f, - alignel* alignList, seq* seq1, seq* seq2, - int oneBased, u32 expand); -void print_text_align_align (FILE* f, - seq* 
seq1, unspos beg1, unspos end1, - seq* seq2, unspos beg2, unspos end2, - editscript* script, score s, - int oneBased, u32 expand); -void print_text_align_match (FILE* f, - seq* seq1, unspos pos1, - seq* seq2, unspos pos2, unspos length, - score s, int oneBased, u32 expand); - -#undef global -#endif // text_align_H diff --git a/programs/lastz/src/tweener.c b/programs/lastz/src/tweener.c deleted file mode 100755 index 0819bd5..0000000 --- a/programs/lastz/src/tweener.c +++ /dev/null @@ -1,1182 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: tweener.c -// -//---------- -// -// tweener-- -// Given a list of alignments, search for alignments between them at a -// higher-sensitivity ("weak" alignments). We search for such "in-between -// alignments" between properly ordered pairs of alignments and/or sequence -// end points. -// -// We process the outer alignments in order and maintain list of alignments -// that have been processed but whose end point in seq1 is still potentially -// active (within window size of the start of the current alignment). -// -// For each alignment A we do the following: -// -// (1) Dismiss any now inactive alignments (alignments in the active list that -// end too far before A starts). If such an alignment is the end of a -// chain of (1 or more) alignments, look for weak alignments to its -// right. -// -// (2) Look for a current alignment, B, that overlaps A, i.e., B's end point is -// within windowSize diagonals of where A starts, and follows A in one of -// the sequences. In the abnormal case that B ends after A ends (relative -// to one of the sequences, mark A as "not the right end of a chain". -// -// (3) If no alignments overlap A, look for the alignment B that ends before A -// (in both sequences) and is closest. If it comes within windowSize bp in -// both sequences, search for weak alignments between A and B. 
If no such -// B exists, think of A as being on the left end of a chain of (1 or more) -// outer alignments, and search for weak alignments to its left. -// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C string stuff -#include "build_options.h" // build options -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed strategy stuff -#include "pos_table.h" // position table stuff -#include "seed_search.h" // seed hit search stuff -#include "segment.h" // segment table management stuff -#include "chain.h" // segment chaining stuff -#include "gapped_extend.h" // gapped alignment stuff -#include "edit_script.h" // alignment edit script stuff - -#define tweener_owner // (make this the owner of its globals) -#include "tweener.h" // interface to this module - -//---------- -// -// private global data -// -//---------- - -int dbgShowInnerHsps = false; - -#define verifyOrDie -//#define debugTweener - -static alignel* innerList; - -// set of alignments that have been checked, but may lie within windowSize bp -// of the start of a future alignment, relative to sequence #1 - -typedef struct active - { - alignel* align; - int isRightEnd; // right end of seen-so-far chain - struct active* next; - } active; - -static active* activeList; -static active* activePool; - -#undef min -#define min(x,y) ((x)<(y)?(x):(y)) -#undef max -#define max(x,y) ((x)>(y)?(x):(y)) - -// private globals shared by all the routines under the umbrella of tweener() - -static seq* seq1; -static seq* seq2; -static int selfCompare; -static int inhibitTrivial; -static const s8* upperCharToBits; -static seed* innerSeed; -static scoreset* scoring; -static scoreset* maskedScoring; -static tback* tb; -static score xDrop; -static int gappedAllBounds; -static score yDrop; -static int trimToPeak; -static sthresh scoreThresh; -static score diagPen, 
antiPen; -static int scale; -static chainer konnekt; // (may/2014; was "connect" but with clang that -static u32 windowSize; // .. apparently collides with something else - // .. in the namespace) - -hitprocsimple simpleInfo; - -#define numDefaultAnchors 100 -static segtable* innerAnchors = NULL; - -static seq* tweenSeq1; -static seq* tweenSeq2; -static seq* tweenRev1; -static seq* tweenRev2; - -//---------- -// -// prototypes for private functions -// -//---------- - -static void try_bounded_align (unspos b1, unspos e1, - unspos b2, unspos e2); -static void bounded_align (unspos b1, unspos e1, - unspos b2, unspos e2); -static u32 collect_inner_hsps (void* info, - unspos pos1, unspos pos2, unspos length, - score s); -static alignel* merge_align (alignel* a, alignel* b); -static void activate (alignel* a, int isRightEnd); -static active* dismiss (active* c); -static void extract_subsequence (seq* sf, unspos b, unspos e, seq* dst); -static void copy_reverse_sequence (seq* sf, seq* dst); - -//---------- -// -// tweener_interpolate-- -// Interpolate "in-between" alignments in a chain of outer alignments. -// -//---------- -// -// Arguments: -// alignel* alignList: The list of 'outer' alignments. -// ... -// u32 windowSize: (see description in this file's header) -// -// The other arguments are all pass-thru's to the following routines: -// build_seed_position_table -// seed_hit_search -// reduce_to_chain -// reduce_to_points -// gapped_extend -// -// Returns: -// A pointer to the updated alignment list. This includes the outer -// alignments (in the same memory as they were in in the input list) and any -// inner alignments we find. -// -//---------- - -#define pairFmt unsposFmt ".." 
unsposFmt -#define pairsFmt "(" unsposFmt "," unsposFmt ")..(" unsposFmt "," unsposFmt ")" - -// macros for tweener_interpolate() - -#ifndef verifyOrDie -#define verifyOrDie_1 ; -#define verifyOrDie_2 ; -#endif // not verifyOrDie - -#ifdef verifyOrDie - -#define verifyOrDie_1 \ - for (a=alignList ; a!=NULL ; a=a->next) \ - { \ - if ((a->next != NULL) && (a->next->beg1 < a->beg1)) \ - suicide ("outer alignments out of order"); \ - } - -#define verifyOrDie_2 \ - for (c=activeList ; c!=NULL ; c=c->next) \ - { \ - b1 = c->align->end1; \ - if (a1 > b1 + windowSize) \ - suicidef ("tweener: impossible"); \ - } - -#endif // verifyOrDie - - -#ifndef debugTweener -#define debugTweener_1 ; -#define debugTweener_2 ; -#define debugTweener_3A ; -#define debugTweener_3B ; -#define debugTweener_3C ; -#endif // not debugTweener - -#ifdef debugTweener - -#define debugTweener_1 \ - fprintf(stderr, "tweening these alignments:\n"); \ - for (a=alignList ; a!=NULL ; a=a->next) \ - fprintf(stderr, " " pairsFmt "\n", \ - a->beg1, a->beg2, a->end1, a->end2); \ - if (seq1->partition.p != NULL) \ - print_partition_table (stderr, seq1); \ - if (seq2->partition.p != NULL) \ - print_partition_table (stderr, seq2); \ - else \ - fprintf(stderr, "seq2 = \"%s\"\n", seq2->header); - -#define debugTweener_2 \ - fprintf(stderr, "considering alignment " pairsFmt "\n", \ - a1, a2, a->end1, a->end2); - -#define debugTweener_3A \ - fprintf(stderr," overlap with " pairsFmt "?", \ - b->beg1, b->beg2, b1, b2); - -#define debugTweener_3B \ - fprintf(stderr, " (yes)\n"); - -#define debugTweener_3C \ - else fprintf(stderr, " (no)\n"); - -#endif // debugTweener - -// tweener_interpolate-- - -alignel* tweener_interpolate - (alignel* alignList, - seq* _seq1, - seq* _seq2, - int _selfCompare, - int _inhibitTrivial, - const s8 _upperCharToBits[], - seed* _innerSeed, - scoreset* _scoring, - scoreset* _maskedScoring, - tback* _tb, - score _xDrop, - int _gappedAllBounds, - score _yDrop, - int _trimToPeak, - score 
_scoreThresh, - score _diagPen, - score _antiPen, - int _scale, - chainer _connect, - u32 _windowSize) - { - active* c, *cNext; - alignel* a, *b; - unspos a1, a2, a1Lft; - unspos b1, b2; - sgnpos distToB, distToC; - unspos distD; - int isLeftEnd, isRightEnd, hasOverlap; - - // make sure we have something to do - - if (alignList == NULL) - return NULL; - - ////////// - // setup - ////////// - - // copy parameters into globals - - seq1 = _seq1; - seq2 = _seq2; - selfCompare = _selfCompare; - inhibitTrivial = _inhibitTrivial; - upperCharToBits = _upperCharToBits; - innerSeed = _innerSeed; - scoring = _scoring; - maskedScoring = _maskedScoring; - tb = _tb; - xDrop = _xDrop; - gappedAllBounds = _gappedAllBounds; - yDrop = _yDrop; - trimToPeak = _trimToPeak; - scoreThresh.t = 'S'; - scoreThresh.s = _scoreThresh; - diagPen = _diagPen; - antiPen = _antiPen; - scale = _scale; - konnekt = _connect; - windowSize = round_up_2 (_windowSize); // (rounded up to an even number) - - // create sequence structures to use for each in-between block's - // subsequences - - tweenSeq1 = new_sequence (windowSize); - tweenSeq2 = new_sequence (windowSize); - tweenRev1 = new_sequence (windowSize); - tweenRev2 = new_sequence (windowSize); - - tweenSeq1->needTrueLen = tweenRev1->needTrueLen = seq1->needTrueLen; - tweenSeq2->needTrueLen = tweenRev2->needTrueLen = seq2->needTrueLen; - - // set up part of the info record for process_for_simple_hit - - simpleInfo.hp.reporter = collect_inner_hsps; - simpleInfo.hp.reporterInfo = NULL; - simpleInfo.hp.minMatches = -1; // (no filtering) - simpleInfo.hp.gfExtend = gfexXDrop; - simpleInfo.hp.seq1 = tweenSeq1; - simpleInfo.hp.seq2 = tweenSeq2; - simpleInfo.hp.scoring = maskedScoring; - simpleInfo.hp.xDrop = xDrop; - simpleInfo.hp.hspThreshold = scoreThresh; - simpleInfo.hp.hspZeroThreshold = (scoreThresh.t !='S')? 0 - : (scoreThresh.s > 0 )? 
scoreThresh.s - : 0; - simpleInfo.hp.anchors = NULL; - simpleInfo.hp.entropicHsp = false; - simpleInfo.hp.reportEntropy = false; - - // create the table in which to collect the HSPs - - innerAnchors = new_segment_table (numDefaultAnchors, 0); - - // set up a pool of active elements (initially empty) - - activePool = NULL; - - ////////// - // process each viable in-between block (each inter-block gap, plus end - // gaps) - ////////// - - activeList = NULL; - innerList = NULL; - - verifyOrDie_1 - debugTweener_1 - - // process each outer alignment alignment in turn - - for (a=alignList ; a!=NULL ; a=a->next) - { - a1 = a->beg1; - a2 = a->beg2; - a1Lft = (a1-1 < windowSize)? 0 : (a1 - windowSize); - - debugTweener_2 - - // dismiss alignments that are too far from A - - while ((activeList != NULL) && (activeList->align->end1) < a1Lft) - activeList = dismiss (activeList); - - c = activeList; - while ((c != NULL) && ((cNext = c->next) != NULL)) - { - if (a1Lft > cNext->align->end1) c->next = dismiss (cNext); - else c = cNext; - } - - verifyOrDie_2 - - // look for an active alignment that overlaps A - - hasOverlap = false; - for (c=activeList ; c!=NULL ; c=c->next) - { - b = c->align; - b1 = b->end1; - b2 = b->end2; - distD = abs((((sgnpos)b2) - ((sgnpos)b1)) // (distance between - - (((sgnpos)a2) - ((sgnpos)a1))); // .. 
diagonals) - - debugTweener_3A - - if ((distD <= windowSize) - && ((b1 >= a1) || (b2 >= a2))) - { - hasOverlap = true; - debugTweener_3B - if ((b1 < a->end1) && (b2 < a->end2)) - c->isRightEnd = false; // B ends properly -- before A ends - else - break; - } - debugTweener_3C - } - - if (hasOverlap) - { - // if c is NULL, all overlaps were proper, so we're at the - // right end of a chain - - isRightEnd = (c == NULL); - activate (a, isRightEnd); - continue; // don't try to tween on the left side of A - } - - // find the closest alignment B to A such that B is active and ends - // before the start of A in both sequences, but doesn't end more than - // windowSize bp before the start of A - - b = NULL; - distToB = (sgnpos) (3*windowSize); - isLeftEnd = true; // A is the first outer alignment in a chain - for (c=activeList ; c!=NULL ; c=c->next) - { - b1 = c->align->end1; - b2 = c->align->end2; - if ((b1 < a1) && (b2 < a2) && (a2 < b2 + windowSize)) - { - isLeftEnd = false; - if (c->isRightEnd) - { - distToC = ((sgnpos)a1) - ((sgnpos)b1) // (manhattan - + ((sgnpos)a2) - ((sgnpos)b2); // .. distance) - if (distToC < distToB) - { b = c->align; distToB = distToC; } - } - c->isRightEnd = false; - } - } - - if (b != NULL) - { - b1 = b->end1; - b2 = b->end2; - try_bounded_align (b1, a1, b2, a2); - } - else if (isLeftEnd) - { - // A could be the first outer alignment in a chain; look for - // "in-between" alignments to the left - - b1 = (a1 <= windowSize/2)? 1 : (a1-windowSize/2); - b2 = (a2 <= windowSize/2)? 
1 : (a2-windowSize/2); - try_bounded_align (b1, a1, b2, a2); - } - - activate (a, true); - } - - // align in windows after each chain-ending active alignment - - while (activeList != NULL) - activeList = dismiss (activeList); - -#ifdef collect_stats - a1 = 0; - for (a=innerList ; a!=NULL ; a=a->next) - a1 += (a->end1 - a->beg1 + 1); - tweener_add_stat (tweenerCoverage, a1); -#endif - - alignList = merge_align (alignList, innerList); - -#ifdef collect_stats - a1 = 0; - for (a=alignList ; a!=NULL ; a=a->next) - a1 += (a->end1 - a->beg1 + 1); - tweener_add_stat (totalCoverage, a1); -#endif - - ////////// - // cleanup - ////////// - - free_sequence (tweenSeq1); - free_sequence (tweenSeq2); - free_sequence (tweenRev1); - free_sequence (tweenRev2); - free_segment_table (innerAnchors); - - for (c=activeList ; c!=NULL ; c=cNext) { cNext = c->next; free(c); } - for (c=activePool ; c!=NULL ; c=cNext) { cNext = c->next; free(c); } - - return alignList; - } - -//---------- -// -// try_bounded_align-- -// This routine is a wrapper for bounded_align(), to handle cases that arise -// when either of sequence 1 or sequence 2 is partitioned. In this case, we -// must check whether the interval is split by a sequence boundary and break -// apart the interval(s) passed to bounded_align(). -// -//---------- -// -// Arguments: -// unspos b1,e1: Range of the rectangle in sequence 1 (origin 1, inclusive). -// unspos b2,e2: Range of the rectangle in sequence 2. 
-// -// Returns: -// (nothing) -// -//---------- - -// macros for bounded_align() - -#ifndef debugTweener -#define debugTweener_E1 ; -#define debugTweener_E2 ; -#define debugTweener_E3 ; -#define debugTweener_E4 ; -#define debugTweener_E5 ; -#define debugTweener_E6 ; -#define debugTweener_E7 ; -#endif // not debugTweener - -#ifdef debugTweener - -#define debugTweener_E1 \ - fprintf(stderr, " try_bounded_align: " pairsFmt "\n", \ - b1, b2, e1, e2); - -#define debugTweener_E2 \ - fprintf(stderr, " seq1 partition1=" pairFmt " %s\n", \ - part1->sepBefore, part1->sepAfter, \ - &sp1->pool[part1->header]); \ - fprintf(stderr, " seq1 partition2=" pairFmt " %s\n", \ - part2->sepBefore, part2->sepAfter, \ - &sp1->pool[part2->header]); \ - fprintf(stderr, " part2-part1=%ld\n", part2-part1); - -#define debugTweener_E3 \ - fprintf(stderr, " (E3) splitting " pairFmt \ - " into " pairFmt "," pairFmt "\n", \ - b1, e1, b1, e1left, b1right, e1); - -#define debugTweener_E4 \ - fprintf(stderr, " seq2 partition1=" pairFmt " %s\n", \ - part1->sepBefore, part1->sepAfter, \ - &sp2->pool[part1->header]); \ - fprintf(stderr, " seq2 partition2=" pairFmt " %s\n", \ - part2->sepBefore, part2->sepAfter, \ - &sp2->pool[part2->header]); \ - fprintf(stderr, " part2-part1=%ld\n", part2-part1); - -#define debugTweener_E5 \ - fprintf(stderr, " (E5) splitting " pairFmt \ - " into " pairFmt "," pairFmt "\n", \ - b2, e2, b2, e2left, b2right, e2); - -#define debugTweener_E6 \ - fprintf(stderr, " seq1 partitionX=" pairFmt " %s\n", \ - partX->sepBefore, partX->sepAfter, \ - &sp1->pool[partX->header]); - -#define debugTweener_E7 \ - fprintf(stderr, " seq2 partitionX=" pairFmt " %s\n", \ - partY->sepBefore, partY->sepAfter, \ - &sp2->pool[partY->header]); - -#endif // debugTweener - -// try_bounded_align-- - -static void try_bounded_align - (unspos b1, - unspos e1, - unspos b2, - unspos e2) - { - seqpartition* sp1 = &seq1->partition; - seqpartition* sp2 = &seq2->partition; - partition* part1, *part2; - 
partition* partX1, *partX2, *partY1, *partY2, *partX, *partY; - int split1, split2; - unspos b1right, e1left, b2right, e2left; - unspos b1x, e1x, b2y, e2y; - - debugTweener_E1 - - // if either interval is empty, let's not bother with it. - - if ((b1 == e1) || (b2 == e2)) return; - - // if neither sequence is partitioned, just pass the interval along to - // bounded_align() - - if ((sp1->p == NULL) // sequence 1 is not partitioned - && (sp2->p == NULL)) // sequence 2 is not partitioned - { - bounded_align (b1, e1, b2, e2); - return; - } - - // determine whether the interval is split in sequence 1 - - split1 = false; - e1left = e1; - b1right = b1; - partX1 = partX2 = NULL; - - if (sp1->p != NULL) // sequence 1 is partitioned - { - if (seq1->v[b1-1] == 0) b1 += 1; - else if (seq1->v[b1 ] == 0) b1 += 2; - if (seq1->v[e1-1] == 0) e1 -= 1; - if (b1 >= e1) return; - - part1 = lookup_partition (seq1, b1-1); - part2 = lookup_partition (seq1, e1-1); - if (part1 != part2) - { - // split is b1 / part1->sepAfter / part2->sepBefore / e1 - // --> b1 / e1left / b1right / e1 - debugTweener_E2 - e1left = part1->sepAfter; - b1right = part2->sepBefore+2; - split1 = true; - debugTweener_E3 - if (part2 - part1 > 1) - { partX1 = part1+1; partX2 = part2-1; } - } - } - - // determine whether the interval is split in sequence 2 - - split2 = false; - e2left = e2; - b2right = b2; - partY1 = partY2 = NULL; - - if (sp2->p != NULL) // sequence 2 is partitioned - { - if (seq2->v[b2-1] == 0) b2 += 1; - else if (seq2->v[b2 ] == 0) b2 += 2; - if (seq2->v[e2-1] == 0) e2 -= 1; - if (b2 >= e2) return; - - part1 = lookup_partition (seq2, b2-1); - part2 = lookup_partition (seq2, e2-1); - if (part1 != part2) - { - // split is b2 / part1->sepAfter / part2->sepBefore / e2 - // --> b2 / e2left / b2right / e2 - debugTweener_E4 - e2left = part1->sepAfter; - b2right = part2->sepBefore+2; - split2 = true; - debugTweener_E5 - if (part2 - part1 > 1) - { partY1 = part1+1; partY2 = part2-1; } - } - } - - // if 
neither sequence is split, just pass the interval along to - // bounded_align() - - if ((!split1) && (!split2)) - { - bounded_align (b1, e1, b2, e2); - return; - } - - // otherwise, send the split intervals to bounded_align(), plus intervals - // for any intervening partitions - - bounded_align (b1, e1left, b2, e2left); - bounded_align (b1right, e1, b2right, e2); - - if ((partX1 != NULL) && (partY1 == NULL)) - { - // loop over intervening partitions in seq1 - for (partX=partX1 ; partX<=partX2 ; partX++) - { - debugTweener_E6 - b1x = partX->sepBefore+2; - e1x = partX->sepAfter; - bounded_align (b1x, e1x, b2, e2left); - } - } - else if ((partX1 == NULL) && (partY1 != NULL)) - { - // loop over intervening partitions in seq2 - for (partY=partY1 ; partY<=partY2 ; partY++) - { - debugTweener_E7 - b2y = partY->sepBefore+2; - e2y = partY->sepAfter; - bounded_align (b1, e1left, b2y, e2y); - } - } - else if ((partX1 != NULL) && (partY1 != NULL)) - { - // loop over intervening partitions in both sequences - for (partX=partX1 ; partX<=partX2 ; partX++) - for (partY=partY1 ; partY<=partY2 ; partY++) - { - b1x = partX->sepBefore+2; - e1x = partX->sepAfter; - b2y = partY->sepBefore+2; - e2y = partY->sepAfter; - bounded_align (b1x, e1x, b2y, e2y); - } - } - } - -//---------- -// -// bounded_align-- -// Perform a high-sensitivity alignment within a specified rectangle. -// -//---------- -// -// Arguments: -// unspos b1,e1: Range of the rectangle in sequence 1 (origin 1, -// .. inclusive). -// unspos b2,e2: Range of the rectangle in sequence 2. 
-// -// Returns: -// (nothing) -// -//---------- - -// macros for bounded_align() - -#ifndef debugTweener -#define debugTweener_F1 ; -#define debugTweener_F2 ; -#define debugTweener_F3 ; -#define debugTweener_F4 ; -#endif // not debugTweener - -#ifdef debugTweener - -#define debugTweener_F1 \ - fprintf(stderr, " bounded_align: " pairsFmt "\n", b1, b2, e1, e2); - -#define debugTweener_F2 \ - { \ - segtable* st = innerAnchors; \ - segment* seg; \ - for (seg=st->seg ; (seg-st->seg)len ; seg++) \ - fprintf(stderr, " HSP " unsposSlashFmt " " unsposFmt " " scoreFmt "\n", \ - seg->pos1, seg->pos2, seg->length, seg->s); \ - } - -#define debugTweener_F3 \ - { \ - segtable* st = innerAnchors; \ - segment* seg; \ - for (seg=st->seg ; (seg-st->seg)len ; seg++) \ - fprintf(stderr, " chained " unsposSlashFmt " " unsposFmt " " scoreFmt "\n", \ - seg->pos1, seg->pos2, seg->length, seg->s); \ - } - -#define debugTweener_F4 \ - for (aa=a ; aa!=NULL ; aa=aa->next) \ - fprintf(stderr, " new alignment " pairsFmt "\n", \ - aa->beg1, aa->beg2, aa->end1, aa->end2); - -#endif // debugTweener - -// bounded_align-- - -static void bounded_align - (unspos b1, - unspos e1, - unspos b2, - unspos e2) - { - postable* seq1Positions = NULL; - alignel* a = NULL; - alignel* aa; - - tweener_count_stat (numTweeners); - tweener_add_stat (totalArea, (e1-((u64)b1-1)) * (e2-((u64)b2-1))); - - debugTweener_F1 - - // create the tiny subsequences - - extract_subsequence (seq1, b1-1, e1, tweenSeq1); - extract_subsequence (seq2, b2-1, e2, tweenSeq2); - - // build word position table for the first sequence - - seq1Positions = build_seed_position_table - (tweenSeq1, 0, tweenSeq1->len, upperCharToBits, - innerSeed, /*step*/ 1); - - // find HSPs; they get collected into innerAnchors[] by - // collect_inner_hsps(), which is called from process_for_simple_hit() - - empty_segment_table (innerAnchors); - seed_hit_search (tweenSeq1, seq1Positions, - tweenSeq2, 0, tweenSeq2->len, /*selfCompare*/ false, - upperCharToBits, 
innerSeed, - /* searchLimit */ 0, 0, -#ifdef densityFiltering - /*maxDensity*/ 0.0, -#endif // densityFiltering - process_for_simple_hit, (void*) &simpleInfo); - - free_position_table (seq1Positions); - - debugTweener_F2 - - // chain - - reduce_to_chain (innerAnchors, diagPen, antiPen, scale, konnekt); - sort_segments (innerAnchors, qSegmentsByPos1); - - debugTweener_F3 - - // gapped extension - - if ((innerAnchors != NULL) && (innerAnchors->len != 0)) - { - copy_reverse_sequence (tweenSeq1, tweenRev1); - copy_reverse_sequence (tweenSeq2, tweenRev2); - reduce_to_points (tweenSeq1, tweenSeq2, scoring, innerAnchors); - a = gapped_extend (tweenSeq1, tweenRev1->v, tweenSeq2, tweenRev2->v, - inhibitTrivial, - scoring, innerAnchors, tb, - gappedAllBounds, yDrop, trimToPeak, scoreThresh, - /* no pairs limit */ 0, false, false); - } - - // shift the positions, from subsequence back to sequence - - for (aa=a ; aa!=NULL ; aa=aa->next) - { - aa->seq1 = seq1->v; - aa->seq2 = seq2->v; - aa->beg1 += b1-1; - aa->end1 += b1-1; - aa->beg2 += b2-1; - aa->end2 += b2-1; - } - - debugTweener_F4 - - innerList = merge_align (a, innerList); - } - -//---------- -// [[-- a seed hit reporter function --]] -// -// collect_inner_hsps-- -// Collect a seed hit or HSP. -// -// Arguments and Return value: (see seed_search.h) -// -//---------- - -static u32 collect_inner_hsps - (arg_dont_complain(void* info), - unspos pos1, - unspos pos2, - unspos length, - score s) - { - innerAnchors = add_segment (innerAnchors, - pos1-length, pos2-length, length, s, - /*id*/ 0, /*hspId*/ 0); - - if (dbgShowInnerHsps) - { - fprintf (stderr, "\n"); - dump_aligned_nucleotides (stderr, - tweenSeq1, pos1-length, - tweenSeq2, pos2-length, - length); - } - - return 1; - } - -//---------- -// -// merge_align-- -// Merge two beg1-ordered lists of alignments into a single beg1-ordered list. -// -//---------- -// -// Arguments: -// alignel* a: One input list. -// alignel* b: The other. 
-// -// Returns: -// A pointer to the merged list, which uses the same allocated memory as the -// two input lists. -// -//---------- - -// macros for merge_align() - -#ifndef verifyOrDie -#define verifyOrDie_3 ; -#endif // not verifyOrDie - -#ifdef verifyOrDie - -#define verifyOrDie_3 \ - for (tail=a ; tail!=NULL ; tail=tail->next) \ - { \ - if ((tail->next != NULL) && (tail->next->beg1 < tail->beg1)) \ - suicidef ("merge_align: first list out of order at " unsposFmt, \ - tail->next->beg1); \ - } \ - for (tail=b ; tail!=NULL ; tail=tail->next) \ - { \ - if ((tail->next != NULL) && (tail->next->beg1 < tail->beg1)) \ - suicidef ("merge_align: second list out of order at " unsposFmt, \ - tail->next->beg1); \ - } - -#endif // verifyOrDie - -// merge_align-- - -static alignel* merge_align - (alignel* a, - alignel* b) - { - alignel* ret, *tail; - - verifyOrDie_3 - - if (b == NULL) return a; - if (a == NULL) return b; - - if (a->beg1 <= b->beg1) { ret = tail = a; a = a->next; } - else { ret = tail = b; b = b->next; } - - while (a != NULL && b != NULL) - { - if (a->beg1 <= b->beg1) { tail = tail->next = a; a = a->next; } - else { tail = tail->next = b; b = b->next; } - } - - if (a == NULL) tail->next = b; - else tail->next = a; - - return ret; - } - -//---------- -// -// activate-- -// Put an alignment at the front of the active list. -// -//---------- -// -// Arguments: -// alignel* a: The alignment. -// int isRightEnd: true => the alignment is the right end of a chain. -// false => it isn't. 
-// -// Returns: -// (nothing) -// -//---------- - -static void activate - (alignel* a, - int isRightEnd) - { - active* c; - - // reclaim an element from the pool, otherwise allocate a new one - - if (activePool == NULL) c = malloc_or_die ("activate", sizeof(*c)); - else { c = activePool; activePool = c->next; } - - // create the element and add it to the head of the active list - - c->align = a; - c->isRightEnd = isRightEnd; - c->next = activeList; - activeList = c; - } - -//---------- -// -// dismiss-- -// Remove an alignment from the active list. If it is the outer alignment at -// the right end of a chain, we look for "in-between" alignments beyond the -// end. -// -//---------- -// -// Arguments: -// active* c: The alignment to dismiss. -// -// Returns: -// c->next -// -//---------- - -static active* dismiss - (active* c) - { - active* next; - unspos a1, a2, b1, b2; - - next = c->next; - - if (c->isRightEnd) - { - b1 = c->align->end1; - b2 = c->align->end2; - a1 = min (b1+windowSize/2, seq1->len); - a2 = min (b2+windowSize/2, seq2->len); - try_bounded_align (b1, a1, b2, a2); - } - - // return c to the pool - - c->next = activePool; - activePool = c; - - return next; - } - -//---------- -// -// extract_subsequence-- -// Make a copy of a subsequence. -// -//---------- -// -// Arguments: -// seq* seq: The sequence to make a copy of. -// unspos b,e: The index range of the subsequence to copy, origin-zero, -// .. open end. -// seq* dst: The sequence in which to place the copy. Any previous -// .. contents will be destroyed. -// -// Returns: -// (nothing) -// -//---------- - -static void extract_subsequence - (seq* sf, - unspos b, - unspos e, - seq* dst) - { - u8* s = sf->v; - unspos len, i; - - if (e <= b) - suicidef ("internal error in extract_subsequence\n" - " interval " unsposFmt ".." 
unsposFmt " is empty", b, e); - - len = e - b; - sequence_long_enough (dst, len, false); - - for (i=0 ; iv[i] = s[b+i]; - dst->v[len] = 0; - dst->len = len; - - dst->fileType = seq_type_nofile; - dst->contig = 1; - dst->startLoc = 1; - } - -//---------- -// -// copy_reverse_sequence-- -// Make a copy of a sequence, in reverse (*not* reverse complement). -// -//---------- -// -// Arguments: -// seq* seq: The sequence to make a copy of. -// seq* dst: The sequence in which to place the copy. Any previous -// .. contents will be destroyed. -// -// Returns: -// (nothing) -// -//---------- - -static void copy_reverse_sequence - (seq* sf, - seq* dst) - { - u32 len = sf->len; - u8* s, *d; - - sequence_long_enough (dst, len, false); - - for (s=sf->v+len,d=dst->v ; s>sf->v ; ) - *(d++) = *(--s); - - *d = 0; - dst->len = len; - } - -//---------- -// -// tweener_zero_stats-- -// Clear the statistics for this module. -// -//---------- -// -// Arguments: -// (none) -// -// Returns: -// (nothing) -// -//---------- - -void tweener_zero_stats - (void) - { -#ifdef collect_stats - - // set 'em en masse to zero - - memset (&tweenerStats, 0, sizeof(tweenerStats)); - - // set any values that might be floating point to zero (fp bit pattern for - // zero may not be all-bits-zero) - - // (none to set, yet) - -#endif // collect_stats - } - -//---------- -// -// tweener_show_stats-- -// Show the statistics that have been collected for this module. -// -//---------- -// -// Arguments: -// FILE* f: The file to print the stats to. 
-// -// Returns: -// (nothing) -// -//---------- - -void tweener_show_stats - (arg_dont_complain(FILE* f)) - { -#ifdef collect_stats - if (f == NULL) return; - fprintf (f, "number of tweeners: %s\n", commatize(tweenerStats.numTweeners)); - fprintf (f, "tweener total area: %s\n", commatize(tweenerStats.totalArea)); - if (tweenerStats.numTweeners > 0) - fprintf (f, " avg tweener area: %s\n", commatize((tweenerStats.totalArea / ((float) tweenerStats.numTweeners)) + .5)); - fprintf (f, " total coverage: %s bp\n", commatize(tweenerStats.totalCoverage)); - fprintf (f, " tweener coverage: %s bp\n", commatize(tweenerStats.tweenerCoverage)); - fprintf (f, "-------------------\n"); -#endif // collect_stats - } - -void tweener_generic_stats - (arg_dont_complain(FILE* f), - arg_dont_complain(void (*func) (FILE*, const char*, ...))) - { -#ifdef collect_stats - if (f == NULL) return; - (*func) (f, "num_tweeners=%d\n", tweenerStats.numTweeners); - (*func) (f, "total_area=%" PRId64 "\n", tweenerStats.totalArea); - (*func) (f, "total_coverage=%d\n", tweenerStats.totalCoverage); - (*func) (f, "tweener_coverage=%d\n", tweenerStats.tweenerCoverage); -#endif // collect_stats - } - diff --git a/programs/lastz/src/tweener.h b/programs/lastz/src/tweener.h deleted file mode 100644 index 8b1d144..0000000 --- a/programs/lastz/src/tweener.h +++ /dev/null @@ -1,78 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: tweener.h -// -//---------- - -#ifndef tweener_H // (prevent multiple inclusion) -#define tweener_H - -// other files - -#include "dna_utilities.h" // dna/scoring stuff -#include "sequences.h" // sequence stuff -#include "seeds.h" // seed matching stuff -#include "chain.h" // segment chaining stuff -#include "gapped_extend.h" // gapped alignment stuff -#include "edit_script.h" // alignment edit script stuff - -// establish ownership of global variables - -#ifdef tweener_owner -#define global -#else -#define global extern -#endif - 
-//---------- -// -// statistics for events in this module -// -//---------- - -#ifdef collect_stats - -global struct - { - int numTweeners; - u64 totalArea; - int totalCoverage; - int tweenerCoverage; - } tweenerStats; - -// stats macros - -#define tweener_count_stat(field) ++tweenerStats.field -#define tweener_uncount_stat(field) --tweenerStats.field -#define tweener_set_stat(field,val) (tweenerStats.field = val) -#define tweener_add_stat(field,val) (tweenerStats.field += val) -#else -#define tweener_count_stat(field) -#define tweener_uncount_stat(field) -#define tweener_set_stat(field,val) -#define tweener_add_stat(field,val) -#endif // collect_stats - -// prototypes for stats routines - -void tweener_zero_stats (void); -void tweener_show_stats (FILE* f); -void tweener_generic_stats (FILE* f, void (*func) (FILE*, const char*, ...)); - -//---------- -// -// prototypes for routines in tweener.c -// -//---------- - -alignel* tweener_interpolate (alignel* a, seq* seq1, seq* seq2, - int selfCompare, int inhibitTrivial, - const s8 charToBits[], seed* tweenSeed, - scoreset* scoring, scoreset* maskedScoring, - tback* tb, score xDrop, int gappedAllBounds, - score yDrop, int trimToPeak, score scoreThresh, - score diagPen, score antiPen, - int scale, chainer connect, u32 windowSize); - -#undef global -#endif // tweener_H diff --git a/programs/lastz/src/utilities.c b/programs/lastz/src/utilities.c deleted file mode 100755 index 3ba990d..0000000 --- a/programs/lastz/src/utilities.c +++ /dev/null @@ -1,1908 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: utilities.c -// -//---------- -// -// utilities-- -// Miscellaneous utility functions. 
-// -//---------- - -//---------- -// -// other files -// -//---------- - -#include // standard C stuff -#define true 1 -#define false 0 -#include // standard C i/o stuff -#include // standard C string stuff -#include // standard C upper/lower stuff -#include // standard C variable arg list stuff -#include // standard C value limit stuff -#include // standard C error number stuff -#include "build_options.h" // build options - -#define utilities_owner // (make this the owner of its globals) -#include "utilities.h" // interface to this module - -//---------- -// -// memory allocation tracking macros -// -// if trackMemoryUsage is #defined (e.g. in the makefile), reportAlloc, -// reportRealloc, and reportFree report memory usage to stderr. If it is not -// #defined, the functions compile to nothing. -// -//---------- - -#ifdef trackMemoryUsage - -#define reportAlloc(id,p,sz) fprintf (stderr, "[[ memory alloc %s %08lX #%s ]]\n", (((id)==NULL)? "" : (id)), (long)(p), commatize(sz)); -#define reportRealloc(id,oldp,p,sz) fprintf (stderr, "[[ memory realloc %s %08lX->%08lX #%s ]]\n", (((id)==NULL)? "" : (id)), (long)(oldp), (long)(p), commatize(sz)); -#define reportFree(id,p) fprintf (stderr, "[[ memory free %s %08lX ]]\n", (((id)==NULL)? "" : (id)), (long)(p)); - -#else - -#define reportAlloc(id,p,sz) ; -#define reportRealloc(id,oldp,p,sz) ; -#define reportFree(id,p) ; - -#endif // trackMemoryUsage - -//---------- -// -// fopen_or_die-- -// Open a file. -// -//---------- -// -// Arguments: -// (same as for fopen()) -// -// Returns: -// A pointer to file; failures result in fatality. 
-// -//---------- - -FILE* fopen_or_die - (const char* name, - const char* mode) - { - FILE* f; - - f = fopen (name, mode); - if (f == NULL) - suicidef ("fopen_or_die failed to open \"%s\" for \"%s\"", name, mode); - - if (utilities_dbgDumpFilePointers) - fprintf (stderr, "fopen_or_die(\"%s\",\"%s\") returns %p\n", name, mode, f); - - return f; - } - -//---------- -// -// fclose_if_valid-- -// Close a file previously opened with fopen(). -// -//---------- -// -// Arguments: -// (same as for fclose()) -// -// Returns: -// (same as for fclose()) -// -//---------- - -int fclose_if_valid - (FILE* f) - { - if ((f == NULL) || (f == stdin) || (f == stdout) || (f == stderr)) - return 0; - - if (utilities_dbgDumpFilePointers) - fprintf (stderr, "fclose_if_valid(%p)\n", f); - - return fclose (f); - } - -//---------- -// -// getc_or_die-- -// Read a character from a file. -// -//---------- -// -// Arguments: -// FILE* f: (same as for getc()) -// char* filename: The name of the file associated with f. This is used -// .. only for error reporting, and may be NULL. -// -// Returns: -// (same as for getc(); except that errors result in program fatality) -// -//---------- - -int getc_or_die - (FILE* f, - char* filename) - { - int ch; - - ch = getc (f); - if (ch != EOF) return ch & 0xFF; - - if (ferror (f)) - { - if (filename == NULL) filename = "(unnamed file)"; - if (utilities_dbgDumpFilePointers) - fprintf (stderr, "getc_or_die(%p) (filename \"%s\") reported errno=%d\n", f, filename, errno); - suicidef_with_perror ("I/O failure for %s (getc reported errno=%d)", - filename, errno); - } - - return EOF; - } - -//---------- -// -// print_prefix-- -// Print a prefix of a string. -// -//---------- -// -// Arguments: -// FILE* f: The file to print to. -// const char* s: The string to print the prefix of. -// int n: The number of characters to print. -// -// Returns: -// The number of characters printed. This is expected to be n. 
However, if -// the string is shorter than n, then printing terminates when the string -// does, and the string length is returned. Also, if n is less than 1, -// nothing is printed and zero is returned. -// -//---------- - -int print_prefix - (FILE* f, - const char* s, - int n) - { - int ix; - - if (n < 1) return 0; - - for (ix=0 ; ix mallocLimit) - if (size > mallocLimit) - { - if (id == NULL) - suicidef ("malloc_or_die blocked large request, for %s bytes (max is %s)", - ucommatize(size), ucommatize(mallocLimit)); - else - suicidef ("malloc_or_die blocked large request, for %s bytes (max is %s), for %s", - ucommatize(size), ucommatize(mallocLimit), id); - } -#endif // overflow possible - - if (size == 0) size = 1; - - // allocate the memory - - p = malloc (size); - if (p == NULL) - { - if (id == NULL) - suicidef ("call to malloc failed to allocate %s bytes", - ucommatize(size)); - else - suicidef ("call to malloc failed to allocate %s bytes, for %s", - ucommatize(size), id); - } - - reportAlloc (id, p, size); - - return p; - } - -void* zalloc_or_die - (char* id, - size_t size) - { - void* p; - - // make sure size is legit - - if (size == 0) size = 1; - - // allocate the memory and clear it - - p = malloc_or_die (id, size); - memset (p, 0, size); - - return p; - } - -void* realloc_or_die - (char* id, - void* _p, - size_t size) - { - void* p; - - // make sure size is legit - -#if (SIZE_MAX > mallocLimit) - if (size > mallocLimit) - { - if (id == NULL) - suicidef ("realloc_or_die blocked large request, for %s bytes (max is %s)", - ucommatize(size), ucommatize(mallocLimit)); - else - suicidef ("realloc_or_die blocked large request, for %s bytes (max is %s), for %s", - ucommatize(size), ucommatize(mallocLimit), id); - } -#endif // overflow possible - - if (size == 0) size = 1; - - // allocate the memory - - p = realloc (_p, size); - if (p == NULL) - { - if (id == NULL) - suicidef ("call to realloc failed to allocate %s bytes", - ucommatize(size)); - else - suicidef 
("call to realloc failed to allocate %s bytes, for %s", - ucommatize(size), id); - } - - reportRealloc (id, _p, p, size); - - return p; - } - -#endif // not noMemoryWrappers - -//---------- -// -// free_if_valid-- -// De-allocate a block of memory previously allocated from the heap, but -// first checking to make sure the pointer is not NULL. -// -//---------- -// -// Arguments: -// (same as for free(), except for the extra id argument) -// char* id: an identifying string to be used when trackMemoryUsage is -// .. turned on; this can be NULL. -// -// Returns: -// (nothing) -// -//---------- - -#ifndef noMemoryWrappers - -void free_if_valid (arg_dont_complain(char* id), void* p) - { - if (p == NULL) return; - - free (p); - reportFree (id, p); - } - -#endif // not noMemoryWrappers - -//---------- -// -// copy_string, copy_prefix-- -// Create (in the heap) a copy of a string or a prefix of a string. -// -//---------- -// -// Arguments: -// const char* s: The string to copy. -// int n: (copy_prefix only) the number of characters to copy. -// -// Returns: -// A pointer to new string; failures result in fatality. -// -//---------- - -char* copy_string - (const char* s) - { - char* ss; - - if (s == NULL) return NULL; - - ss = malloc_or_die ("copy_string", strlen(s) + 1); - return strcpy (/*to*/ ss, /*from*/ s); - } - -char* copy_prefix - (const char* s, - int n) - { - char* ss; - - if (s == NULL) return NULL; - - ss = malloc_or_die ("copy_prefix", n + 1); - memcpy (/*to*/ ss, /*from*/ s, /*how much*/ n); - ss[n] = 0; - return ss; - } - -//---------- -// -// concatenate_strings, concatenate_four_strings-- -// Create (in the heap) concatenation of two (or four) strings. -// -//---------- -// -// Arguments: -// const char* s1, s2, s3, s4: The strings to copy. -// -// Returns: -// A pointer to new string; failures result in fatality. 
-// -//---------- - -char* concatenate_strings - (const char* s1, - const char* s2) - { - char* s, *scan; - size_t len = 0; - - if (s1 != NULL) len += strlen (s1); - if (s2 != NULL) len += strlen (s2); - - s = malloc_or_die ("concatenate_strings", len + 1); - - scan = s; - if (s1 != NULL) { strcpy (scan, s1); scan += strlen (s1); } - if (s2 != NULL) { strcpy (scan, s2); scan += strlen (s2); } - *scan = 0; - - return s; - } - - -char* concatenate_four_strings - (const char* s1, - const char* s2, - const char* s3, - const char* s4) - { - char* s, *scan; - size_t len = 0; - - if (s1 != NULL) len += strlen (s1); - if (s2 != NULL) len += strlen (s2); - if (s3 != NULL) len += strlen (s3); - if (s4 != NULL) len += strlen (s4); - - s = malloc_or_die ("concatenate_strings", len + 1); - - scan = s; - if (s1 != NULL) { strcpy (scan, s1); scan += strlen (s1); } - if (s2 != NULL) { strcpy (scan, s2); scan += strlen (s2); } - if (s3 != NULL) { strcpy (scan, s3); scan += strlen (s3); } - if (s4 != NULL) { strcpy (scan, s4); scan += strlen (s4); } - *scan = 0; - - return s; - } - -//---------- -// -// append_char, append_u8-- -// Append a character to a growable string. -// -//---------- -// -// Arguments: -// type** s: (Pointer to) The string to append to. The string may be -// .. NULL. type is either char or u8. -// u32* size: (Pointer to) The number of bytes currently allocated for -// .. s[]. This may be zero. -// u32* len: (Pointer to) The number of characters currently in the -// .. string. Note that this routine does not consider -// .. terminating zeros. If the string has one it counts, -// .. if it doesn't have one, it doesn't get counted. -// type ch: The character to append. type is either char or u8. -// -// Returns: -// Nothing; failures result in fatality. However, the locations pointed to -// by s, size, and len may be altered. -// -//---------- -// -// Note: We have here two routines that are identical except for the type of -// .. 
the characters in the string (char or u8). The reson for two -// .. such routines is to satisfy certain compilers that would require -// .. a cast from u8** to char** (if we only had a char version of this -// .. function) but then complain about type-punning when such cast is -// .. made. -// -//---------- - -#define create_append_function(function_name,function_string,char_type) \ -void function_name \ - (char_type** s, \ - u32* size, \ - u32* len, \ - char_type ch) \ - { \ - /* if we don't have enough room, try to grow */ \ - \ - if (*len >= *size) \ - { \ - *size = *size + (*size >> 3) + 30; \ - *s = realloc_or_die (function_string, *s, *size); \ - } \ - \ - /* deposit the character */ \ - \ - (*s)[(*len)++] = ch; \ - } - -create_append_function(append_char,"append_char",char) -create_append_function(append_u8, "append_u8", u8) - -//---------- -// -// strcmp_prefix-- -// Determine if a string contains another as a prefix. -// -//---------- -// -// Arguments: -// const char* str1: The string. -// const char* str2: The prefix string. -// -// Returns: -// The same as strcmp(prefix1,str2) would, where prefix1 is str1 truncated -// to be no longer than str2. -// -//---------- - -int strcmp_prefix - (const char* str1, - const char* str2) - { - return strncmp (str1, str2, strlen (str2)); - } - -//---------- -// -// strcmp_suffix, strncmp_suffix-- -// Determine if a string contains another as a suffix. -// -//---------- -// -// Arguments: -// const char* str1: The string. -// const char* str2: The suffix string. -// size_t n: (strncmp_suffix only) The max length of str1. -// -// Returns: -// The same as strcmp(suffix1,str2) or strncmp(suffix1,str2,n) would, where -// suffix1 is the last N characters of str1, and N is the length of str2. If -// str2 is longer than str1, it cannot be a suffix (in this case we compare to -// the entirety of str1). 
-// -//---------- - -int strcmp_suffix - (const char* str1, - const char* str2) - { - size_t len1 = strlen(str1); - size_t len2 = strlen(str2); - - if (len2 <= len1) return strcmp (str1+len1-len2, str2); - else return strcmp (str1, str2); - } - -int strncmp_suffix - (const char* str1, - const char* str2, - size_t n) - { - size_t len1 = strlen(str1); - size_t len2 = strlen(str2); - - if (len1 > n) len1 = n; - - if (len2 <= len1) return strcmp (str1+len1-len2, str2); - else return strcmp (str1, str2); - } - -//---------- -// -// is_blank_string-- -// Determine if a string contains only blank characters. -// -//---------- -// -// Arguments: -// -// const char* s: The string. -// -// Returns: -// true if the string contains only characters for which isspace is true (e.g. -// spaces, tabs, line feeds); false otherwise -// -//---------- - -int is_blank_string - (const char* s) - { - char* ss = (char*) s; - - while (*ss != 0) - { if (!isspace(*(ss++))) return false; } - - return true; - } - -//---------- -// -// string_to_int-- -// Parse a string for the integer value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The integer value of the string. Note that the string *must not* contain -// anything other than a valid integer-- failures result in fatality. 
-// -//---------- - -int string_to_int - (const char* s) - { - char* ss; - int v; - char extra; - - // skip to first non-blank - - ss = (char*) s; - while ((*ss == ' ') || (*ss == '\t') || (*ss == '\n')) - ss++; - if (*ss == 0) goto empty_string; - - // convert to number - - if (sscanf (ss, "%d%c", &v, &extra) != 1) goto not_an_integer; - - // make sure signs match - - if ((v < 0) && (*ss != '-')) goto out_of_range; - if ((v > 0) && (*ss == '-')) goto out_of_range; - - return v; - - ////////// - // failure exits - ////////// - -empty_string: - suicidef ("an empty string is not an integer"); - -not_an_integer: - suicidef ("\"%s\" is not an integer", s); - -out_of_range: - suicidef ("\"%s\" is outside the range of a signed integer", s); - - return 0; - } - -//---------- -// -// string_to_unitized_int, string_to_unitized_int64-- -// Parse a string for the integer value it contains, allowing K, M, and G -// suffixes. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// int byThousands: true => K means one thousand -// false => K means 1,024. -// -// Returns: -// The integer value of the string. Note that the string *must not* contain -// anything (except for an opptional suffix) other than a valid integer-- -// failures result in fatality. -// -//---------- - -int string_to_unitized_int - (const char* s, - int byThousands) - { - char ss[20]; - int len = strlen (s); - char* parseMe; - int v; - float vf; - char extra; - int mult; - int isFloat; - - mult = 1; - - if (len >= (int) sizeof (ss)) - parseMe = (char*) s; - else - { - parseMe = ss; - strcpy (ss, s); - - if (len > 0) - { - switch (ss[len-1]) - { - case 'K': case 'k': - mult = (byThousands)? 1000 : 1024; - break; - case 'M': case 'm': - mult = (byThousands)? 1000000 : 1024L * 1024L; - break; - case 'G': case 'g': - mult = (byThousands)? 
1000000000 : 1024L * 1024L * 1024L; - break; - } - - if (mult != 1) - ss[len-1] = 0; - } - } - - isFloat = false; - if (sscanf (parseMe, "%d%c", &v, &extra) != 1) - { - if (sscanf (parseMe, "%f%c", &vf, &extra) != 1) goto bad; - isFloat = true; - } - - if (isFloat) - { - if ((vf > 0) && ( vf*mult > INT_MAX)) goto overflow; - if ((vf < 0) && (-vf*mult > INT_MAX)) goto overflow; - v = (vf * mult) + .5; - } - else if (mult != 1) - { - if ((v > 0) && ( v > INT_MAX / mult)) goto overflow; - if ((v < 0) && (-v > INT_MAX / mult)) goto overflow; - v *= mult; - } - - return v; - -bad: - suicidef ("\"%s\" is not an integer", s); - return 0; - -overflow: - suicidef ("\"%s\" is out of range for an integer", s); - return 0; - } - - -int64 string_to_unitized_int64 - (const char* s, - int byThousands) - { - char ss[20]; - int len = strlen (s); - char* parseMe; - int64 v; - float vf; - char extra; - int64 mult; - int isFloat; - - mult = 1; - - if (len >= (int) sizeof (ss)) - parseMe = (char*) s; - else - { - parseMe = ss; - strcpy (ss, s); - - if (len > 0) - { - switch (ss[len-1]) - { - case 'K': case 'k': - mult = (byThousands)? 1000 : 1024; - break; - case 'M': case 'm': - mult = (byThousands)? 1000000 : 1024L * 1024L; - break; - case 'G': case 'g': - mult = (byThousands)? 
1000000000 : 1024L * 1024L * 1024L; - break; - } - - if (mult != 1) - ss[len-1] = 0; - } - } - - isFloat = false; - if (sscanf (parseMe, s64Fmt "%c", &v, &extra) != 1) - { - if (sscanf (parseMe, "%f%c", &vf, &extra) != 1) goto bad; - isFloat = true; - } - - if (isFloat) - { - if ((vf > 0) && ( vf*mult > s64max)) goto overflow; - if ((vf < 0) && (-vf*mult > s64max)) goto overflow; - v = (vf * mult) + .5; - } - else if (mult != 1) - { - if ((v > 0) && ( v > s64max / mult)) goto overflow; - if ((v < 0) && (-v > s64max / mult)) goto overflow; - v *= mult; - } - - return v; - -bad: - suicidef ("\"%s\" is not an integer", s); - return 0; - -overflow: - suicidef ("\"%s\" is out of range for an integer", s); - return 0; - } - -//---------- -// -// hex_string_to_int-- -// Parse a string for the hexadecimal integer value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The integer value of the string. Note that the string *must not* contain -// anything other than a valid integer-- failures result in fatality. -// -//---------- - -int hex_string_to_int - (const char* s) - { - int v; - char extra; - - if (sscanf (s, "%X%c", &v, &extra) != 1) - suicidef ("\"%s\" is not an integer", s); - - return v; - } - -//---------- -// -// string_to_double-- -// Parse a string for the double floating point value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The value of the string. Note that the string *must not* contain anything -// other than a valid number-- failures result in fatality. -// -//---------- - -double string_to_double - (const char* s) - { - double v; - char extra; - - if (sscanf (s, "%lf%c", &v, &extra) != 1) - suicidef ("\"%s\" is not a number", s); - - return v; - } - -//---------- -// -// string_to_unitized_double-- -// Parse a string for the floating point value it contains, allowing K, M, and -// G suffixes. 
-// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// int byThousands: true => K means one thousand -// false => K means 1,024. -// -// Returns: -// The value of the string. Note that the string *must not* contain anything -// other than a valid number-- failures result in fatality. -// -//---------- - -double string_to_unitized_double - (const char* s, - int byThousands) - { - char ss[20]; - int len = strlen (s); - char* parseMe; - double v; - char extra; - int mult; - - mult = 1; - - if (len >= (int) sizeof (ss)) - parseMe = (char*) s; - else - { - parseMe = ss; - strcpy (ss, s); - - if (len > 0) - { - switch (ss[len-1]) - { - case 'K': case 'k': - mult = (byThousands)? 1000 : 1024; - break; - case 'M': case 'm': - mult = (byThousands)? 1000000 : 1024L * 1024L; - break; - case 'G': case 'g': - mult = (byThousands)? 1000000000 : 1024L * 1024L * 1024L; - break; - } - - if (mult != 1) - ss[len-1] = 0; - } - } - - if (sscanf (parseMe, "%lf%c", &v, &extra) != 1) - suicidef ("\"%s\" is not a number", s); - - return v * mult; - } - -//---------- -// -// pct_string_to_double-- -// Parse a percentage string for the double floating point value it contains. -// -//---------- -// -// Arguments: -// const char* s: The string to parse. -// -// Returns: -// The value of the string. Note that the string *must not* contain anything -// other than a valid percentage-- failures result in fatality. -// -//---------- - -double pct_string_to_double - (const char* s) - { - double v; - char pct, extra; - - if ((sscanf (s, "%lf%c%c", &v, &pct, &extra) != 2) - || (pct != '%')) - suicidef ("\"%s\" is not a percentage", s); - - return v / 100.0; - } - -//---------- -// -// is_valid_lastz_version-- -// Determine if a string is in proper format to be a lastz version number. -// -//---------- -// -// Arguments: -// char* s: The string to test. -// -// Returns: -// true if the string is valid; false if not. 
-// -//---------- -// -// notes: -// -// (1) We don't check whether the version ever existed, only that it is a -// properly formatted version string. This means that it is of the form -// .[.]. -// -//---------- - -int is_valid_lastz_version - (char* s) - { - int major, minor, subminor; - char extra; - - if (sscanf (s, "%d.%d%c", &major, &minor, &extra) == 2) - { - if (major < 0) return false; - if (minor < 0) return false; - return true; - } - - if (sscanf (s, "%d.%d.%d%c", &major, &minor, &subminor, &extra) == 3) - { - if (major < 0) return false; - if (minor < 0) return false; - if (subminor < 0) return false; - return true; - } - - return false; - } - -//---------- -// -// is_later_lastz_version-- -// Determine if a string is in proper format to be a lastz version number. -// -//---------- -// -// Arguments: -// char* s1, s2: The strings to test. It is assumed that these are both -// .. valid version strings (as per is_valid_lastz_version). -// -// Returns: -// true if s1 is a later version than s2; false if not. -// -//---------- - -int is_later_lastz_version - (char* s1, - char* s2) - { - int major1, minor1, subminor1; - int major2, minor2, subminor2; - char extra; - - if (sscanf (s1, "%d.%d%c", &major1, &minor1, &extra) == 2) - subminor1 = 0; - else - sscanf (s1, "%d.%d.%d", &major1, &minor1, &subminor1); - - if (sscanf (s2, "%d.%d%c", &major2, &minor2, &extra) == 2) - subminor2 = 0; - else - sscanf (s2, "%d.%d.%d", &major2, &minor2, &subminor2); - - if (major1 > major2) return true; - else if (major1 < major2) return false; - - if (minor1 > minor2) return true; - else if (minor1 < minor2) return false; - - return (subminor1 > subminor2); - } - -//---------- -// -// commatize, ucommatize-- -// Convert an integer to a string, including commas. -// -//---------- -// -// Arguments: -// const int64 v: The number to convert. -// -// Returns: -// A string representing that number, including commas. 
(see note 1) -// -//---------- -// -// notes: -// -// (1) The memory containing the returned string belongs to this routine, as -// static memory. There are only five such memory blocks, and they are -// used on alternate calls. So when you make more than five calls, the -// results of previous calls are clobbered. -// -//---------- - -char* commatize - (const int64 v) - { - static char s1[53];// (big enough for 128-bit decimal value with commas, - static char s2[53];// .. the biggest being - static char s3[53];// .. -170,141,183,460,469,231,731,687,303,715,884,105,728) - static char s4[53]; - static char s5[53]; - static char* s = s5; - int len, commas; - char* src, *dst; - - if (s == s1) s = s2; // (ping pong) - else if (s == s2) s = s3; - else if (s == s3) s = s4; - else if (s == s4) s = s5; - else s = s1; - - sprintf (s, "%jd", (intmax_t) v); // $$$ this could overflow the buffer - // $$$ .. if int_max_t > 128 bits - - len = strlen (s); - - if (s[0] == '-') commas = (len-2) / 3; - else commas = (len-1) / 3; - - if (commas != 0) - { - src = s + len - 1; - dst = s + len + commas; *(dst--) = 0; - - while (dst > src) - { - *(dst--) = *(src--); - *(dst--) = *(src--); - *(dst--) = *(src--); - *(dst--) = ','; - } - - } - - return s; - } - - -char* ucommatize - (const u64 v) - { - static char s1[52];// (big enough for 128-bit decimal value with commas, - static char s2[52];// .. the biggest being - static char s3[52];// .. 340,282,366,920,938,463,463,374,607,431,768,211,455) - static char s4[52]; - static char s5[52]; - static char* s = s5; - int len, commas; - char* src, *dst; - - if (s == s1) s = s2; // (ping pong) - else if (s == s2) s = s3; - else if (s == s3) s = s4; - else if (s == s4) s = s5; - else s = s1; - - sprintf (s, "%jd", (intmax_t) v); // $$$ this could overflow the buffer - // $$$ .. 
if int_max_t > 128 bits - - len = strlen (s); - commas = (len-1) / 3; - - if (commas != 0) - { - src = s + len - 1; - dst = s + len + commas; *(dst--) = 0; - - while (dst > src) - { - *(dst--) = *(src--); - *(dst--) = *(src--); - *(dst--) = *(src--); - *(dst--) = ','; - } - - } - - return s; - } - -//---------- -// -// unitize-- -// Convert an integer to a string, in units of K, M, or G. -// -//---------- -// -// Arguments: -// const int64 v: The number to convert. -// int byThousands: true => K means one thousand -// false => K means 1,024. -// -// Returns: -// A string representing that number, expressed as units. (see note 1) -// -//---------- -// -// notes: -// -// (1) The memory containing the returned string belongs to this routine, as -// static memory. There are only two such memory blocks, and they are -// used on alternate calls. So when you make more than two calls, the -// results of previous calls are clobbered. -// -//---------- - -// SI unit prefixes (see, e.g., http://en.wikipedia.org/wiki/SI_prefix) - -char* unitName[] = { "", "K", "M", "G", "T", "P", "E", "Z" }; - - -char* unitize - (const int64 v, - int byThousands) - { - static char s1[10]; - static char s2[10]; - static char* s = s2; - int sign, unit; - int64 vv, divisor; - float rep; - - s = (s == s1)? s2 : s1; // (ping pong) - - if (byThousands) divisor = 1000; - else divisor = 1024; - - - if (v >= 0) { sign = '\0'; vv = v; } - else { sign = '-'; vv = -v; } - - unit = 0; - for (rep=vv ; vv>1023 ; vv/=divisor,rep/=divisor) - unit++; - - if (rep > 99) { rep /= divisor; unit++; } - - if (sign < 0) sprintf (s, "-%.1f%s", rep, unitName[unit]); - else sprintf (s, "%.1f%s", rep, unitName[unit]); - - return s; - } - -//---------- -// -// hex_64_string-- -// Convert an integer to a 64-bit hexadecimal string. -// -//---------- -// -// Arguments: -// const int64 v: The number to convert. -// -// Returns: -// A string representing that number. 
(see note 1) -// -//---------- -// -// notes: -// -// (1) The memory containing the returned string belongs to this routine, as -// static memory. There are only two such memory blocks, and they are -// used on alternate calls. So when you make more than two calls, the -// results of previous calls are clobbered. -// -//---------- - -char* hex_64_string - (const int64 v) - { - static char s1[17]; - static char s2[17]; - static char* s = s2; - u64 vv = (u64) v; - char* dst; - - s = (s == s1)? s2 : s1; // (ping pong) - - dst = s + sizeof(s1); - *(--dst) = 0; - while (dst > s) - { - *(--dst) = "0123456789ABCDEF"[((u8) vv) & 0xF]; - vv >>= 4; - } - - return s; - } - -//---------- -// -// prob_to_string-- -// Convert a proability to a 3-digit string. -// -//---------- -// -// Arguments: -// double p: The probability value. -// -// Returns: -// The string, which always contains 3 characters (plus a terminating zero). -// -//---------- - -char3 prob_to_string - (double p) - { - char3 s; - char field[5]; // "0.xx" plus a terminator - - if (p > 1.0) strcpy (/*to*/ s.s, /*from*/ ">??"); - else if (p >= 0.995) strcpy (/*to*/ s.s, /*from*/ " 1 "); - else if (p < 0.005) strcpy (/*to*/ s.s, /*from*/ " ~~"); - else if (p < 0.0) strcpy (/*to*/ s.s, /*from*/ "= len-1) - return false; - - // if the replacement is smaller, move the tail toward the start - - if (repLen < subLen) - { - src = pos + subLen; - dst = pos + repLen; - while (*src != 0) *(dst++) = *(src++); - *dst = 0; - } - - // if the replacement is larger, move the tail toward the end - - else if (repLen > subLen) - { - src = s + sLen; - dst = src + (repLen - subLen); - *dst = 0; - while (src >= pos+subLen) - *(--dst) = *(--src); - } - - // copy the replacement (note that if the replacement is the same size, - // we haven't moved the tail at all) - - memcpy (/*to*/ pos, /*from*/ rep, /*how much*/ repLen); - - return true; - } - -//---------- -// -// trim_string-- -// Remove blanks (and end-of-line) from both ends of a 
string. -// -//---------- -// -// Arguments: -// char* s: The string. -// -// Returns: -// The string (the same as s). Leading blanks are removed by copying -// characters forward. Trailing blanks are removed by depositing a -// terminating zero. -// -//---------- - -char* trim_string - (char* s) - { - char* ss, *dd, *lastInk; - - // skip to first non-blank - - ss = s; - while ((*ss == ' ') || (*ss == '\t') || (*ss == '\n')) - ss++; - - if (*ss == 0) // (string has nothing but blanks) - { *s = 0; return s; } - - // copy the rest of the string (except the terminating zero) - - dd = lastInk = s; - while (*ss != 0) - { - *(dd++) = *(ss++); - - if ((*ss != 0) && (*ss != ' ') && (*ss != '\t') && (*ss != '\n')) - lastInk = dd; - } - - // poke a terminating zero just past the last non-blank - - lastInk[1] = 0; - - return s; - } - -//---------- -// -// skip_whitespace-- -// Skip characters until we get something that ain't whitespace. -// skip_darkspace-- -// Skip characters until we get something that ain't darkspace. -// skip_til-- -// Skip characters until we get something in a specified set of characters. -// skip_while-- -// Skip characters until we get something that ain't in a specified set of -// characters. -// -//---------- -// -// Arguments: -// char* s: The sequence to read. -// char* chars: (if needed) The set of characters. -// -// Returns: -// Pointer to the first character at or beyond s that meets the stopping -// criteria. Note that we never scan beyond the end of the string. 
-// -//---------- - -char* skip_whitespace (char* s) - { while ((*s != 0) && (isspace (*s))) s++; return s; } - -char* skip_darkspace (char* s) - { while ((*s != 0) && (!isspace (*s))) s++; return s; } - -char* skip_til (char* s, char* chars) - { while ((*s != 0) && (strchr (chars, *s) == NULL)) s++; return s; } - -char* skip_while (char* s, char* chars) - { while ((*s != 0) && (strchr (chars, *s) != NULL)) s++; return s; } - -//---------- -// -// find_tabbed_tag-- -// Locate a tag in a tab-delimited tagged string. -// -// An example of a tagged string is "ID:TRWFT\tSM:BGDNCSA32". The two tags are -// ID and SM and their respective values are TRWFT and BGDNCSA32. -// -//---------- -// -// Arguments: -// char* s: The tab-delimited tagged string to search. -// char* tag: The tag to search for. For example, "ID". Note that the -// .. colon should NOT be included in the tag. -// -// Returns: -// Pointer to the first character of the tag, for example to the "I" or "S" in -// the example given above. If the tag is not properly found, NULL is returned. -// -//---------- -// -// notes: -// -// (1) If the same tag occurs more than once, we only find the first instance -// .. of the tag. -// -//---------- - -char* find_tabbed_tag - (char* s, - char* tag) - { - char* t; - - t = s; - while (true) - { - if (*t == 0) return NULL; - - t = strstr (t, tag); - if (t == NULL) return NULL; - - if (t[2] != ':') { t++; continue; } - if ((t != s) && (t[-1] != '\t')) { t++; continue; } - break; - } - - return t; - } - -//---------- -// -// tabbed_tag_length-- -// Determine the length of a tag in a tab-delimited tagged string. -// -//---------- -// -// Arguments: -// char* tag: The tag. -// -// Returns: -// The number of characters in the tag, including the tag's name, colon and -// value, but not any terminating character(s). 
-// -//---------- - -int tabbed_tag_length - (char* tag) - { - char* t; - - t = strchr (tag, '\t'); - if (t != NULL) return t - tag; - else return strlen(tag); - } - -//---------- -// -// swap_64_halves, swap_two32_endian-- -// Perform endian-type shuffling on 64-bit or 32-bit values. -// -// swap_64_halves: ABCDEFGH IJKLMNOP --> IJKLMNOP ABCDEFGH -// swap_two32_endian: ABCDEFGH IJKLMNOP --> GHEFCDAB OPMNKLIJ -// swap_32_endian: ABCDEFGH --> GHEFCDAB -// -//---------- -// -// Arguments: -// u64/u32 v: The value to shuffle. -// -// Returns: -// The shuffled value. -// -//---------- - -u64 swap_64_halves (const u64 v) - { - u32 a = (u32) (v >> 32); - u32 b = (u32) v; - - return (((u64) b) << 32) + a; - } - -u64 swap_two32_endian (const u64 v) - { - u32 a = (u32) (v >> 32); - u32 b = (u32) v; - - return (((u64) swap_32_endian(a)) << 32) + swap_32_endian(b); - } - -u32 swap_32_endian (const u32 v) - { - return (( v & 0x000000FF) << 24) - + (((v >> 8) & 0x000000FF) << 16) - + (((v >> 16) & 0x000000FF) << 8) - + ((v >> 24) & 0x000000FF); - } - -//---------- -// -// bit_count, bit_count64, bit_count16, bit_count8-- -// Count the '1' bits in a 32-bit value. -// -//---------- -// -// Arguments: -// bits: The value to count the '1' bits of. -// -// Returns: -// The number of bits that are '1'. -// -//---------- -// -// Notes: -// -// (1) This algorithm was adapated from one written by Glenn C. Rhoads -// of the Computer Science Deptartment at Rutgers. 
-// -//---------- - -int bit_count - (u32 bits) - { - const u32 allBits = ~0L; - const u32 mask10 = (allBits/ 3) << 1; - const u32 mask0011 = allBits/ 5; - const u32 mask00001111 = allBits/17; - - // convert each pair to a count in the range 0..2 - // 00 => 00 - // 01 => 01 - // 10 => 01 - // 11 => 10 - - bits -= (bits & mask10) >> 1; - - // convert each nybble to a count in the range 0..4 - - bits = (bits & mask0011) + ((bits>>2) & mask0011); - - // convert each byte to a count in the range 0..8 - - bits = (bits + (bits >> 4)) & mask00001111; - - // sum counts over bytes, then 16-bit words - - bits += bits >> 8; - bits += bits >> 16; - - return bits & 0x000000FF; - } - - -int bit_count_64 - (u64 bits) - { - const u64 allBits = ~0L; - const u64 mask10 = (allBits/ 3) << 1; - const u64 mask0011 = allBits/ 5; - const u64 mask00001111 = allBits/17; - - // convert each pair to a count in the range 0..2 - // 00 => 00 - // 01 => 01 - // 10 => 01 - // 11 => 10 - - bits -= (bits & mask10) >> 1; - - // convert each nybble to a count in the range 0..4 - - bits = (bits & mask0011) + ((bits>>2) & mask0011); - - // convert each byte to a count in the range 0..8 - - bits = (bits + (bits >> 4)) & mask00001111; - - // sum counts over bytes, then 16-bit words, then 32-bit words - - bits += bits >> 8; - bits += bits >> 16; - bits += bits >> 32; - - return bits & 0x000000FF; - } - - -int bit_count_16 - (u32 bits) - { - const u32 allBits = ~0; - const u32 mask10 = (allBits/ 3) << 1; - const u32 mask0011 = allBits/ 5; - const u32 mask00001111 = allBits/17; - - // convert each pair to a count in the range 0..2 - // 00 => 00 - // 01 => 01 - // 10 => 01 - // 11 => 10 - - bits -= (bits & mask10) >> 1; - - // convert each nybble to a count in the range 0..4 - - bits = (bits & mask0011) + ((bits>>2) & mask0011); - - // convert each byte to a count in the range 0..8 - - bits = (bits + (bits >> 4)) & mask00001111; - - // sum counts over the two bytes - - bits += bits >> 8; - - return bits & 
0x00FF; - } - - -int bit_count_8 - (u8 bits) - { - const u8 allBits = ~0; - const u8 mask10 = (allBits/ 3) << 1; - const u8 mask0011 = allBits/ 5; - const u8 mask00001111 = allBits/17; - - // convert each pair to a count in the range 0..2 - // 00 => 00 - // 01 => 01 - // 10 => 01 - // 11 => 10 - - bits -= (bits & mask10) >> 1; - - // convert each nybble to a count in the range 0..4 - - bits = (bits & mask0011) + ((bits>>2) & mask0011); - - // convert each byte to a count in the range 0..8 - - bits = (bits + (bits >> 4)) & mask00001111; - - return bits; - } - -//---------- -// -// hassock_hash-- -// Compute a variant of Austin Appleby's MurmurHash2. -// -//---------- -// -// Arguments: -// const void* key: The data block to hash. -// u32 len: The length of that block. -// -// Returns: -// A hash of the block. -// -//---------- -// -// Notes: -// -// (1) As of Apr/2009, information about this hash function can be found at -// murmurhash.googlepages.com -// (2) This implementation is based on an implementation found at -// murmurhash.googlepages.com/MurmurHashNeutral2.cpp -// It differs in the following ways: -// (a) The "seed" is hardwired. -// (b) We parse the data block in reverse; this allows the caller to -// prepend an additional seed pattern to his buffer, potentially -// getting better mixing for the bits in the final incorporated -// bytes. -// (c) The last three bytes are incorporated in a different order than -// they were in MurmurHash2, because the code just works out better -// this way. 
-// -//---------- - -u32 hassock_hash - (const void* key, - u32 len) - { - const u32 seed = 0x5C3FC4D3; - const u32 m = 0x87C10417; - const int r = 24; - const u8* data = ((const u8*) key) + len; - const u8* stop = ((const u8*) key) + 4; - u32 h, k; - - h = seed ^ len; - while (data >= stop) - { - k = *(--data); - k |= *(--data) << 8; - k |= *(--data) << 16; - k |= *(--data) << 24; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - len -= 4; - } - - switch (len) - { - case 3: h ^= *(--data) << 16; - case 2: h ^= *(--data) << 8; - case 1: h ^= *(--data); - h *= m; - }; - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - //printf ("%08X %s\n", h, (char*) key); - - return h; - } - -//---------- -// -// suicide, suicidef, suicide_with_perror, suicidef_with_perror-- -// Cause program fatality, after pushing a message out to the user. -// -//---------- -// -// Arguments for suicide(): -// const char* message: The message to write to stderr before death. This -// .. may be NULL. -// -// Arguments for suicidef(): -// const char* format: A format string, as per printf. This may be NULL. -// ...: (same as for printf) -// -// Returns: -// (nothing; it does not return). -// -//---------- - -void suicide - (const char* message) - { - if (message == NULL) suicidef (NULL, NULL); - else suicidef ("%s", message); - } - -void suicidef - (const char* format, - ...) - { - va_list args; - - va_start (args, format); - - fflush (stdout); - fprintf (stderr, "FAILURE: "); - if (format != NULL) - { - vfprintf (stderr, format, args); - fprintf (stderr, "\n"); - } - - va_end (args); - - exit (EXIT_FAILURE); - } - -// _with_perror adds a call to the system routine perror() - -void suicide_with_perror - (const char* message) - { - if (message == NULL) suicidef_with_perror (NULL, NULL); - else suicidef_with_perror ("%s", message); - } - -void suicidef_with_perror - (const char* format, - ...) 
- { - va_list args; - - va_start (args, format); - - fflush (stdout); - fprintf (stderr, "FAILURE: "); - if (format != NULL) - { - vfprintf (stderr, format, args); - fprintf (stderr, "\n"); - } - - va_end (args); - - perror ("file I/O error"); - - exit (EXIT_FAILURE); - } - diff --git a/programs/lastz/src/utilities.h b/programs/lastz/src/utilities.h deleted file mode 100755 index 7711436..0000000 --- a/programs/lastz/src/utilities.h +++ /dev/null @@ -1,288 +0,0 @@ -//-------+---------+---------+---------+---------+---------+---------+--------= -// -// File: utilities.h -// -//---------- - -#ifndef utilities_H // (prevent multiple inclusion) -#define utilities_H - -#include // standard C i/o stuff - -// GNU compiler version - -#ifdef __GNUC__ -#define GCC_VERSION (10000 * __GNUC__ \ - + 100 * __GNUC_MINOR__ \ - + __GNUC_PATCHLEVEL__) -#endif - -// establish ownership of global variables - -#ifdef utilities_owner -#define global -#else -#define global extern -#endif - -// "deep link" control variable access - -#ifdef utilities_owner -int utilities_dbgDumpFilePointers = false; // true => dump file pointers in, e.g, fopen_or_die() and getc_or_die() -#else -global int utilities_dbgDumpFilePointers; -#endif - -//---------- -// -// data structures and types -// -//---------- - -// sized data types; these generally come from stdint.h but on some older -// platforms it may not exist, in which case override_stdint can be enabled -// and the proper types set up here - -#ifdef override_stdint - -typedef signed char int8; -typedef signed char s8; -typedef unsigned char u8; -typedef short int int16; -typedef short int s16; -typedef unsigned short int u16; -typedef long int int32; -typedef long int s32; -typedef unsigned long int u32; -typedef long long int64; -typedef long long s64; -typedef unsigned long long u64; - -#else - -#include -typedef int8_t int8; -typedef int8_t s8; -typedef uint8_t u8; -typedef int16_t int16; -typedef int16_t s16; -typedef uint16_t u16; -typedef 
int32_t int32; -typedef int32_t s32; -typedef uint32_t u32; -typedef int64_t int64; -typedef int64_t s64; -typedef uint64_t u64; - -#endif // override_stdint - -#define u8max 255U -#define u16max 65535U -#define u32max 4294967295U -#define u64max 18446744073709551615LLU - -#define s8max 127 -#define s16max 32767 -#define s32max 2147483647 -#define s64max 9223372036854775807LL - -// short strings - -typedef struct char3 { char s[4]; } char3; - -// macro to round data structure sizes to the next larger multiple of 8, 16, -// or m - -#define round_up_2(b) ((((u64) (b))+1)&(~1)) -#define round_up_4(b) ((((u64) (b))+3)&(~3)) -#define round_up_8(b) ((((u64) (b))+7)&(~7)) -#define round_up_16(b) ((((u64) (b))+15)&(~15)) -#define round_up_32(b) ((((u64) (b))+31)&(~31)) -#define round_up_64(b) ((((u64) (b))+63)&(~63)) -#define round_up_128(b) ((((u64) (b))+127)&(~127)) -#define round_up_256(b) ((((u64) (b))+255)&(~255)) -#define round_up_512(b) ((((u64) (b))+511)&(~511)) -#define round_up_1K(b) ((((u64) (b))+1023)&(~1023)) -#define round_up_2K(b) ((((u64) (b))+2047)&(~2047)) -#define round_up_4K(b) ((((u64) (b))+4095)&(~4095)) -#define round_up_8K(b) ((((u64) (b))+8191)&(~8191)) -#define round_up_16K(b) ((((u64) (b))+16383)&(~16383)) -#define round_up(b,m) (((((u64) (b))+((m)-1))/(m))*(m)) - -// macro to count the number of entries in a staticly declared array - -#define entriesof(array) (sizeof(array)/sizeof((array)[0])) - -// silly type check defeaters - -#define ustrlen(s) (strlen((char*)(s))) -#define ustrcmp(s1,s2) (strcmp((char*)(s1),(char*)(s2))) -#define ustrcpy(s1,s2) (strcpy((char*)(s1),(char*)(s2))) -#define ustrchr(s,c) (strchr((char*)(s),(char)(c))) - -#define strleni(s) ((int)(strlen(s))) - -// macro to convince gnu c compiler not to complain about unusued function -// arguments - -#ifdef __GNUC__ -#define arg_dont_complain(arg) arg __attribute__ ((unused)) -#else -#define arg_dont_complain(arg) arg -#endif // __GNUC__ - -// printf macros for sized integers - 
-#ifdef override_inttypes -#define s64Fmt "%jd" -#define u64Fmt "%ju" -#define u64xFmt "%jX" -#else -#include -#define s64Fmt "%" PRId64 -#define u64Fmt "%" PRIu64 -#define u64xFmt "%" PRIX64 -#endif // override_inttypes - -//---------- -// -// memory allocation routines in utilities.c -// -// These routines wrap the memory allcoation routines in the standard library, -// and permit us to use some post-processing tools to inspect memory usage. -// They are affected by two compile-time #defines: -// -// trackMemoryUsage: If defined, the routines will write a detailed -// memory de/re/allocation history to stderr, which -// can be processed by memory_sniffer (part of the -// lastz tools). -// -// noMemoryWrappers: If defined, calls to the memory wrappers are -// replaced by calls to the standard library routines. -// This is useful in conjunction with Valgrind's -// heap profiler, Massif. -// -//---------- - -#ifdef noMemoryWrappers -#define malloc_or_die(id,size) malloc (size) -#define zalloc_or_die(id,size) calloc (1,size) -#define realloc_or_die(id,p,size) realloc (p,size) -#define free_if_valid(id,p) free (p) -#endif // noMemoryWrappers - -#ifndef noMemoryWrappers -void* malloc_or_die (char* id, size_t size); -void* zalloc_or_die (char* id, size_t size); -void* realloc_or_die (char* id, void* _p, size_t size); -void free_if_valid (char* id, void* p); -#endif // not noMemoryWrappers - - -#ifdef trackMemoryUsage -#define memory_checkpoint(fmt) fprintf(stderr,fmt) -#define memory_checkpoint_1(fmt,i) fprintf(stderr,fmt,i) -#define memory_checkpoint_2(fmt,i,s) fprintf(stderr,fmt,i,s) -#endif // trackMemoryUsage - -#ifndef trackMemoryUsage -#define memory_checkpoint(fmt) ; -#define memory_checkpoint_1(fmt,i) ; -#define memory_checkpoint_2(fmt,i,s) ; -#endif // not trackMemoryUsage - -//---------- -// -// malloc sizing range -// (see also "sequence sizing types" in sequences.h) -// These types control the range of dynamically allocated block sizes we can -// handle. 
-// -// Allocation lengths are normally assumed to be small enough to fit into a -// 32-bit integer. This gives a maximum length of about 4 billion bytes. -// Since the biggest allocation expected is four bytes per each base in a -// sequence, this limits the maximum sequence to about 1 billion bp (long -// even for possum chromosomes). The programmer can override this at compile -// time by defining max_malloc_index as 31 or 40 (we also allow 20, but that -// is only to test whether the mechanism actually works). -// -//---------- - -#define mallocOverhead 16 - -#if defined(max_malloc_index) -#define maxMallocIndex max_malloc_index -#else -#define maxMallocIndex 32 -#endif - -#if (maxMallocIndex == 31) -#define mallocLimit ((u32max/2)-mallocOverhead) -#elif (maxMallocIndex == 32) -#define mallocLimit (u32max-mallocOverhead) -#elif (maxMallocIndex == 40) -#define mallocLimit (1099511627776LLU-mallocOverhead) -#elif (maxMallocIndex == 20) // for debug only -#define mallocLimit ((u32max/(1<<12))-mallocOverhead) -#else -#error ***** undecipherable max malloc length definition ***** -#endif - -//---------- -// -// prototypes for routines in utilities.c -// -//---------- - -FILE* fopen_or_die (const char* name, const char* mode); -int fclose_if_valid (FILE* f); -int getc_or_die (FILE* f, char* filename); -int print_prefix (FILE* f, const char* s, int n); -char* copy_string (const char* s); -char* copy_prefix (const char* s, int n); -char* concatenate_strings (const char* s1, const char* s2); -char* concatenate_four_strings (const char* s1, const char* s2, - const char* s3, const char* s4); -void append_char (char** s, u32* size, u32* len, char ch); -void append_u8 (u8** s, u32* size, u32* len, u8 ch); -int strcmp_prefix (const char* str1, const char* str2); -int strcmp_suffix (const char* str1, const char* str2); -int strncmp_suffix (const char* str1, const char* str2, size_t n); -int is_blank_string (const char* s); -int string_to_int (const char* s); -int 
string_to_unitized_int (const char* s, int byThousands); -int64 string_to_unitized_int64 (const char* s, int byThousands); -int hex_string_to_int (const char* s); -double string_to_double (const char* s); -double string_to_unitized_double (const char* s, int byThousands); -double pct_string_to_double (const char* s); -char3 prob_to_string (double p); -int string_replace (char* s, int len, char* sub, char* rep); -char* trim_string (char* s); -char* skip_whitespace (char* s); -char* skip_darkspace (char* s); -char* skip_til (char* s, char* chars); -char* skip_while (char* s, char* chars); -char* find_tabbed_tag (char* s, char* tag); -int tabbed_tag_length (char* tag); -int is_valid_lastz_version (char* s); -int is_later_lastz_version (char* s1, char* s2); -char* commatize (const int64 v); -char* ucommatize (const u64 v); -char* unitize (const int64 v, int byThousands); -char* hex_64_string (const int64 v); -u64 swap_64_halves (const u64 v); -u64 swap_two32_endian (const u64 v); -u32 swap_32_endian (const u32 v); -int bit_count (u32 bits); -int bit_count_64 (u64 bits); -int bit_count_16 (u32 bits); -int bit_count_8 (u8 bits); -u32 hassock_hash (const void* key, u32 len); -void suicide (const char* message); -void suicidef (const char* format, ...); -void suicide_with_perror (const char* message); -void suicidef_with_perror (const char* format, ...); - -#undef global -#endif // utilities_H diff --git a/programs/lastz/src/version.mak b/programs/lastz/src/version.mak deleted file mode 100644 index a081f25..0000000 --- a/programs/lastz/src/version.mak +++ /dev/null @@ -1,5 +0,0 @@ -VERSION_MAJOR=1 -VERSION_MINOR=04 -VERSION_SUBMINOR=00 -REVISION_DATE=20170312 -SUBVERSION_REV=1881:1893M diff --git a/programs/lastz/test_data/base_test.default.lav b/programs/lastz/test_data/base_test.default.lav deleted file mode 100644 index 83bf97c..0000000 --- a/programs/lastz/test_data/base_test.default.lav +++ /dev/null @@ -1,521 +0,0 @@ -#:lav -d { - ../test_data/pseudocat.fa 
../test_data/pseudopig.fa - A C G T - 91 -114 -31 -123 - -114 100 -125 -31 - -31 -125 100 -114 - -123 -31 -114 91 - O = 400, E = 30, K = 3000, L = 3000, M = 0" -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 1 -} -h { - "> cat" - "> pig1" -} -a { - s 5643 - b 4901 21309 - e 5171 21537 - l 4901 21309 4924 21332 67 - l 4925 21334 5024 21433 68 - l 5027 21434 5042 21449 75 - l 5088 21450 5116 21478 62 - l 5117 21483 5171 21537 64 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 1 -} -h { - "> cat" - "> pig1 (reverse complement)" -} -a { - s 9369 - b 1 1 - e 718 767 - l 1 1 43 43 58 - l 47 44 62 59 75 - l 63 62 71 70 67 - l 72 72 78 78 57 - l 79 81 83 85 80 - l 84 91 120 127 54 - l 121 133 135 147 73 - l 140 148 166 174 56 - l 169 175 181 187 62 - l 182 197 357 372 58 - l 361 373 416 428 66 - l 417 433 422 438 83 - l 423 442 545 564 55 - l 552 565 567 580 56 - l 568 582 575 589 88 - l 579 590 605 616 59 - l 606 630 612 636 71 - l 613 648 623 658 73 - l 624 670 643 689 65 - l 644 693 718 767 59 -} -a { - s 11526 - b 9232 6509 - e 9773 7071 - l 9232 6509 9264 6541 58 - l 9267 6542 9357 6632 58 - l 9358 6644 9365 6651 75 - l 9368 6652 9387 6671 70 - l 9388 6677 9424 6713 57 - l 9425 6717 9435 6727 64 - l 9436 6732 9443 6739 75 - l 9445 6740 9462 6757 44 - l 9463 6769 9500 6806 58 - l 9501 6809 9543 6851 72 - l 9553 6852 9559 6858 86 - l 9562 6859 9584 6881 74 - l 9585 6884 9652 6951 71 - l 9654 6952 9773 7071 62 -} -a { - s 15503 - b 10825 8530 - e 11401 9096 - l 10825 8530 10877 8582 70 - l 10878 8591 10887 8600 60 - l 10895 8601 10950 8656 77 - l 10951 8658 11030 8737 63 - l 11031 8740 11060 8769 67 - l 11061 8775 11151 8865 62 - l 11164 8866 11210 8912 57 - l 11218 8913 11401 9096 62 -} -a { - s 82556 - b 13866 16063 - e 17594 19721 - l 13866 16063 13902 16099 68 - l 13903 16105 13912 16114 90 - l 13913 16124 13959 16170 60 - l 13960 16173 14162 16375 59 - l 14165 16376 
14178 16389 57 - l 14181 16390 14191 16400 73 - l 14192 16402 14201 16411 80 - l 14202 16415 14408 16621 61 - l 14413 16622 14532 16741 63 - l 14533 16750 14606 16823 66 - l 14607 16834 14613 16840 100 - l 14619 16841 14644 16866 54 - l 14654 16867 14723 16936 60 - l 14724 16938 14753 16967 63 - l 14767 16968 14889 17090 64 - l 14890 17092 14899 17101 80 - l 14900 17103 14936 17139 65 - l 14937 17142 14988 17193 58 - l 14989 17197 15038 17246 68 - l 15041 17247 15105 17311 57 - l 15107 17312 15114 17319 75 - l 15115 17321 15205 17411 59 - l 15209 17412 15327 17530 58 - l 15329 17531 15357 17559 55 - l 15364 17560 15488 17684 62 - l 15489 17689 15498 17698 70 - l 15499 17700 15598 17799 63 - l 15599 17801 15605 17807 100 - l 15607 17808 15655 17856 67 - l 15657 17857 15697 17897 56 - l 15700 17898 15725 17923 65 - l 15726 17925 15804 18003 63 - l 15805 18006 15919 18120 66 - l 15922 18121 16096 18295 62 - l 16103 18296 16122 18315 60 - l 16134 18316 16157 18339 67 - l 16158 18342 16406 18590 59 - l 16408 18591 16439 18622 72 - l 16440 18624 16495 18679 61 - l 16497 18680 16572 18755 54 - l 16573 18760 16613 18800 61 - l 16616 18801 16639 18824 67 - l 16642 18825 16682 18865 66 - l 16703 18866 16728 18891 54 - l 16732 18892 16841 19001 57 - l 16844 19002 16875 19033 72 - l 16887 19034 17024 19171 58 - l 17030 19172 17042 19184 77 - l 17044 19185 17052 19193 78 - l 17057 19194 17088 19225 53 - l 17093 19226 17101 19234 78 - l 17106 19235 17118 19247 62 - l 17126 19248 17152 19274 78 - l 17153 19276 17164 19287 67 - l 17166 19288 17184 19306 58 - l 17185 19311 17244 19370 58 - l 17246 19371 17306 19431 62 - l 17307 19435 17327 19455 57 - l 17328 19462 17352 19486 68 - l 17353 19489 17362 19498 80 - l 17364 19499 17445 19580 56 - l 17448 19581 17464 19597 59 - l 17475 19598 17503 19626 59 - l 17504 19630 17510 19636 71 - l 17511 19638 17594 19721 62 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 2 -} -h { - "> cat" - "> 
pig2" -} -a { - s 4637 - b 4901 17469 - e 5024 17593 - l 4901 17469 4924 17492 71 - l 4925 17494 5024 17593 68 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 2 -} -h { - "> cat" - "> pig2 (reverse complement)" -} -a { - s 10451 - b 86 3933 - e 717 4606 - l 86 3933 116 3963 71 - l 117 3969 130 3982 71 - l 135 3983 165 4013 55 - l 169 4014 181 4026 54 - l 182 4037 344 4199 64 - l 347 4200 359 4212 46 - l 361 4213 414 4266 59 - l 415 4274 499 4358 59 - l 500 4360 511 4371 92 - l 514 4372 520 4378 71 - l 521 4380 555 4414 66 - l 559 4415 565 4421 71 - l 568 4422 577 4431 50 - l 581 4432 608 4459 64 - l 609 4494 613 4498 100 - l 614 4500 631 4517 56 - l 632 4521 717 4606 56 -} -a { - s 48163 - b 6629 14413 - e 9280 17100 - l 6629 14413 6645 14429 88 - l 6647 14430 6669 14452 61 - l 6680 14453 6749 14522 46 - l 6750 14524 6872 14646 54 - l 6873 14648 6895 14670 61 - l 6898 14671 6915 14688 72 - l 6917 14689 7044 14816 55 - l 7048 14817 7095 14864 65 - l 7099 14865 7113 14879 67 - l 7114 14882 7257 15025 63 - l 7260 15026 7289 15055 60 - l 7290 15059 7299 15068 60 - l 7300 15070 7374 15144 60 - l 7394 15145 7402 15153 56 - l 7404 15154 7454 15204 61 - l 7460 15205 7503 15248 66 - l 7504 15252 7506 15254 100 - l 7509 15255 7521 15267 77 - l 7526 15268 7550 15292 56 - l 7553 15293 7604 15344 62 - l 7605 15347 7624 15366 60 - l 7625 15369 7654 15398 70 - l 7657 15399 7753 15495 64 - l 7754 15497 7838 15581 58 - l 7840 15582 7899 15641 67 - l 7900 15648 7949 15697 46 - l 7950 15704 7953 15707 100 - l 7954 15713 7968 15727 73 - l 7969 15732 7975 15738 86 - l 7976 15786 7986 15796 64 - l 7987 15818 8014 15845 50 - l 8015 15861 8060 15906 74 - l 8063 15907 8146 15990 58 - l 8147 15992 8151 15996 80 - l 8154 15997 8175 16018 64 - l 8176 16022 8184 16030 78 - l 8190 16031 8196 16037 71 - l 8202 16038 8226 16062 72 - l 8227 16064 8257 16094 71 - l 8261 16095 8335 16169 61 - l 8339 16170 8392 16223 56 - l 8394 16224 8403 16233 50 - 
l 8404 16235 8411 16242 88 - l 8413 16243 8551 16381 61 - l 8555 16382 8576 16403 68 - l 8577 16405 8776 16604 56 - l 8778 16605 8811 16638 47 - l 8815 16639 8832 16656 78 - l 8839 16657 8858 16676 55 - l 8860 16677 8879 16696 65 - l 8880 16699 8943 16762 63 - l 8944 16764 8984 16804 59 - l 8985 16808 9065 16888 53 - l 9067 16889 9222 17044 59 - l 9224 17045 9260 17081 81 - l 9263 17082 9267 17086 80 - l 9268 17088 9280 17100 69 -} -a { - s 14884 - b 10825 19810 - e 11394 20369 - l 10825 19810 10907 19892 65 - l 10908 19894 10950 19936 77 - l 10951 19938 11029 20016 67 - l 11030 20019 11065 20054 58 - l 11066 20060 11156 20150 57 - l 11169 20151 11211 20193 53 - l 11219 20194 11394 20369 61 -} -a { - s 74814 - b 13866 7543 - e 17595 11202 - l 13866 7543 13901 7578 67 - l 13902 7584 13915 7597 50 - l 13916 7607 13959 7650 55 - l 13960 7653 14141 7834 59 - l 14148 7835 14159 7846 75 - l 14160 7851 14176 7867 65 - l 14179 7868 14189 7878 73 - l 14190 7883 14416 8109 58 - l 14421 8110 14540 8229 61 - l 14541 8238 14606 8303 65 - l 14607 8309 14635 8337 52 - l 14645 8338 14716 8409 61 - l 14717 8411 14742 8436 46 - l 14757 8437 14764 8444 63 - l 14765 8446 14889 8570 53 - l 14890 8572 14899 8581 50 - l 14900 8583 14936 8619 68 - l 14937 8622 14994 8679 60 - l 14995 8683 15026 8714 72 - l 15027 8716 15044 8733 61 - l 15048 8734 15106 8792 68 - l 15107 8799 15112 8804 100 - l 15119 8805 15202 8888 55 - l 15207 8889 15216 8898 60 - l 15217 8900 15335 9018 65 - l 15340 9019 15357 9036 56 - l 15361 9037 15490 9166 61 - l 15491 9172 15657 9338 57 - l 15659 9339 15697 9377 67 - l 15700 9378 15725 9403 42 - l 15726 9405 15809 9488 63 - l 15810 9491 15924 9605 58 - l 15926 9606 15931 9611 67 - l 15933 9612 16095 9774 63 - l 16096 9776 16114 9794 58 - l 16133 9795 16166 9828 59 - l 16167 9831 16406 10070 62 - l 16408 10071 16440 10103 58 - l 16441 10105 16496 10160 50 - l 16498 10161 16567 10230 57 - l 16568 10235 16612 10279 69 - l 16617 10280 16690 10353 59 - l 16692 10354 
16710 10372 63 - l 16733 10373 16841 10481 59 - l 16844 10482 16878 10516 63 - l 16890 10517 17027 10654 50 - l 17028 10660 17035 10667 75 - l 17036 10669 17046 10679 64 - l 17050 10680 17072 10702 61 - l 17078 10703 17093 10718 50 - l 17117 10719 17178 10780 60 - l 17179 10785 17253 10859 59 - l 17255 10860 17306 10911 75 - l 17307 10915 17328 10936 73 - l 17329 10943 17350 10964 45 - l 17351 10966 17448 11063 58 - l 17457 11064 17480 11087 63 - l 17485 11088 17505 11108 62 - l 17506 11113 17595 11202 74 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 3 -} -h { - "> cat" - "> pig3" -} -a { - s 7835 - b 4884 18350 - e 5171 18597 - l 4884 18350 4899 18365 75 - l 4900 18368 4932 18400 76 - l 4933 18402 5024 18493 75 - l 5027 18494 5035 18502 89 - l 5081 18503 5116 18538 64 - l 5117 18543 5171 18597 62 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 3 -} -h { - "> cat" - "> pig3 (reverse complement)" -} -a { - s 49940 - b 6629 17053 - e 9260 19721 - l 6629 17053 6645 17069 65 - l 6647 17070 6664 17087 72 - l 6665 17089 6697 17121 73 - l 6709 17122 6749 17162 51 - l 6750 17164 6872 17286 55 - l 6873 17288 6906 17321 53 - l 6910 17322 7037 17449 63 - l 7041 17450 7111 17520 62 - l 7113 17521 7257 17665 59 - l 7260 17666 7292 17698 58 - l 7293 17703 7379 17789 66 - l 7400 17790 7449 17839 46 - l 7455 17840 7490 17875 64 - l 7493 17876 7522 17905 50 - l 7524 17906 7565 17947 48 - l 7568 17948 7602 17982 51 - l 7603 17985 7622 18004 60 - l 7623 18007 7652 18036 70 - l 7655 18037 7749 18131 63 - l 7750 18133 7838 18221 54 - l 7840 18222 7883 18265 64 - l 7885 18266 7894 18275 70 - l 7895 18277 7916 18298 55 - l 7917 18305 7950 18338 71 - l 7951 18411 7961 18421 64 - l 7962 18438 8001 18477 53 - l 8002 18479 8012 18489 55 - l 8013 18499 8053 18539 61 - l 8056 18540 8146 18630 57 - l 8147 18632 8151 18636 80 - l 8154 18637 8181 18664 68 - l 8184 18665 8196 18677 54 - l 8202 
18678 8226 18702 64 - l 8227 18704 8239 18716 69 - l 8240 18718 8253 18731 64 - l 8258 18732 8335 18809 62 - l 8339 18810 8386 18857 63 - l 8388 18858 8404 18874 76 - l 8405 18876 8411 18882 71 - l 8413 18883 8552 19022 65 - l 8556 19023 8575 19042 70 - l 8576 19044 8776 19244 63 - l 8778 19245 8809 19276 69 - l 8813 19277 8832 19296 70 - l 8839 19297 8879 19337 54 - l 8880 19339 8942 19401 62 - l 8943 19403 8985 19445 56 - l 8986 19449 9062 19525 62 - l 9064 19526 9222 19684 55 - l 9224 19685 9260 19721 76 -} -a { - s 10019 - b 9281 20477 - e 9775 20993 - l 9281 20477 9291 20487 91 - l 9293 20488 9361 20556 61 - l 9362 20566 9389 20593 54 - l 9390 20599 9449 20658 62 - l 9450 20676 9503 20729 59 - l 9504 20732 9546 20774 65 - l 9557 20775 9579 20797 61 - l 9582 20798 9600 20816 53 - l 9601 20820 9652 20871 60 - l 9654 20872 9775 20993 62 -} -a { - s 74886 - b 13874 10191 - e 17594 13841 - l 13874 10191 13904 10221 68 - l 13905 10227 13913 10235 89 - l 13914 10245 13961 10292 63 - l 13962 10295 14134 10467 60 - l 14136 10468 14143 10475 50 - l 14149 10476 14159 10486 82 - l 14160 10491 14178 10509 68 - l 14181 10510 14199 10528 47 - l 14200 10533 14412 10745 62 - l 14417 10746 14483 10812 64 - l 14484 10814 14497 10827 64 - l 14499 10828 14527 10856 59 - l 14528 10863 14542 10877 60 - l 14543 10880 14572 10909 63 - l 14574 10910 14586 10922 69 - l 14587 10924 14601 10938 67 - l 14602 10943 14617 10958 81 - l 14618 10960 14639 10981 68 - l 14649 10982 14716 11049 56 - l 14735 11050 14771 11086 51 - l 14772 11093 14889 11210 59 - l 14890 11212 14899 11221 60 - l 14900 11223 14936 11259 70 - l 14937 11262 14989 11314 60 - l 14990 11318 15031 11359 67 - l 15033 11360 15046 11373 64 - l 15048 11374 15205 11531 63 - l 15209 11532 15338 11661 58 - l 15340 11662 15352 11674 77 - l 15359 11675 15487 11803 56 - l 15488 11809 15657 11978 59 - l 15659 11979 15697 12017 72 - l 15700 12018 15725 12043 58 - l 15726 12045 15805 12124 64 - l 15806 12127 15926 12247 61 - l 15929 
12248 16104 12423 63 - l 16112 12424 16132 12444 48 - l 16143 12445 16162 12464 60 - l 16163 12467 16406 12710 60 - l 16408 12711 16446 12749 67 - l 16447 12751 16496 12800 68 - l 16498 12801 16567 12870 57 - l 16568 12876 16572 12880 80 - l 16574 12881 16605 12912 69 - l 16610 12913 16690 12993 69 - l 16714 12994 16840 13120 55 - l 16843 13121 16877 13155 54 - l 16889 13156 17011 13278 63 - l 17012 13282 17016 13286 100 - l 17023 13287 17029 13293 57 - l 17032 13294 17043 13305 83 - l 17076 13306 17093 13323 72 - l 17094 13325 17109 13340 50 - l 17110 13350 17120 13360 73 - l 17121 13363 17184 13426 53 - l 17185 13431 17244 13490 52 - l 17246 13491 17306 13551 61 - l 17307 13555 17316 13564 70 - l 17318 13565 17347 13594 50 - l 17348 13603 17447 13702 57 - l 17456 13703 17474 13721 58 - l 17479 13722 17499 13742 67 - l 17500 13747 17594 13841 65 -} -m { - n 0 -} -#:eof diff --git a/programs/lastz/test_data/pseudocat.fa b/programs/lastz/test_data/pseudocat.fa deleted file mode 100644 index c7a452d..0000000 --- a/programs/lastz/test_data/pseudocat.fa +++ /dev/null @@ -1,378 +0,0 @@ -> cat -TTGGCATCTATCCTATCACAAATTGAATGCTAGGAAGACAAAATTTGGTC -TATGACCACAGGGACCTTATACATGTTGTCATTCTGATTAACAATTAAAT -AATTGACGGCTCTTTACATTCTTAATGGGTGTACTTATTCTTTATACTGC -TCTCAGGTTTGATATGACTCATTAATCACTTGGGTGTCATGGAACAGAGT -ACATTGGTAAGATAGTGGTAACTTTGATTCCTATCGAGTACATACATTAC -TACAGAAATTGTCATGTATTTTCAGTTAGATAACCCTTACGAGCAATTGC -TTGTACTTTCAGAGactgcttacaaggcatgcaatgggcgtgtggagctc -ttCTAAtacaaagcccctctggtaattcccctaggggagcataggaatca -gaaggctatgaggGctctccacaaggtctaagtgtcaatggttataactt -ctgaagccccaATAAgtgggggtttgttaaaaggggattggctagtttac -cctctggcttcttttcttaaccagcttacttctaacttctggctgtgtgc -aaggaaagatggctttgataggtcctattgcaccagtgtactcaatgcgt -ggacaaatgtagtcttggggaatgaaggaatagggatggagtgagaaggt -ggttttcccatcacaccacttaggagtgtggctttattctaacgccaaat -ctctaatggttcccctgactatagcctcctataccatgtgtgttattcag -cttgggtgtttctcttcacttccaacaattcaagccattggaagcagcct -aggttttcagagaaaaggagtgggagaatcaagggctaggggtggtcaag 
-cgaaatgcaacagttgtacaggaatacagataatctgatagataggccaa -acttggatgtgaaaagtagggaatcctaATCAattgatgattccagaaat -gtgcatcaaagttgcaataatataccatacTGGACcacccccataaaatg -gaatatatcaaataaacaggcaataacaatgaaagggaggtgcagaataa -tgtaaccttggggttggtgtgattgtaaattagtagaacactatgtagaa -cagtttggagtttcctcacaaacgtaaatgggcgcttgtctccctgtgct -tggtttaatcttctgtatgcttcagtaataccatgctcggtatactccca -agatatgttaggttggagaactaatacaagttaacttacagtagctaata -tttgtaatcaacctagatgcccattgcaggtaaatggacggagaatacct -ggtactcattctctaataagcaatgtccagtcagaactaagattgaaatt -cccccatttgaaaaaaaaaggaggaaaacatcatgataactgttataatc -caggactaatagacatacataagaCagttctcacttatcgttatctagaa -atcagaagaactgaGTcctgatggacatatactatagaagaatcagttcc -ttggtgacacacggtttggcacctgcctttggcccagggagtgaccctgg -aaaccccaaatggaaacccacactgggctcccagcgcaagaagccagctt -ctccctctgcctgtgcgtctggttatgagggggagtggaaaggtactaga -TAcgacagatggaatacaccagattttttgatccagacatgccttgcata -ataagcacaacaaggtgtataggttatcATActtcccttcaagtatttat -cctttgagttacacacaatccaaacacactctttaggttactataaaatg -aaaaattcagttgatattgattataacccttttgtgctataaatagtagt -tctatcacatgctatcttttttgtaccggttaaccatcccagcccccctc -ccaaacccgctacccttccctgcctcATCTGtggatacatcttctgctct -tcatgtttatggatttaataatttctatttgcTcgatcctacaaataagt -gagatcatgtgatgtgtaattttgtgTaaaatgtcttatttcaattatgc -catgtgtatatgtacctaattttctttatgttctcactgttaaagtatac -gtaggttcatgccaagtcttagctatggtagacagtcctgtggcgaacat -aggagtgcagatatatcaattgtgcccaccagagggattgctcatatgta -gcttaatctttacttctttgaggaacttctgagatgttctccatatcttt -tgtacttatttacattcccaccagccttgtacaatGggttaccttctatc -cacatccttggcagcatttattattgcctgtctTTttccaatataagcca -ctttaactggggtgaggagagagaagcaggctccatgcaccgggagcctg -atgtgggatttgatcccaggtctccaggatagctcccttggccaaaggca -ggcgccaaactgatgaaaaaagaaagatagaaggagacccagcatttGat -tagcataacatggtccattaaaaatggattttacattttaaaacaccgag -catgttgtcagattgtttgtgatacagccagctcgcttagcggcctgacc -ccaggggaaactcataagtgaaataagccaggcacagaaagacatacatc -acatgttctcacttatctgtggaatctaaatatcaaaacaattgagctag -tgtacatagagagtagaaggatggatatcagaggcttgtaagggtagtag 
-gggtttggtgggaaggtggcgatgttaatgggtacaaaaaacatgataaa -gaattaataagacctactatttgttagtaaacgaacttaaagagtgtaat -tggattgtttgtgactcaaaggataaatgcttgaggggatggatacccta -ttcttggtgatgtgcttatttcgaaggatatatgcttgatggaactcaGG -AGTCTGGAGGTAAGGCAAGATTACTGAACAAGATAGTCCTCCATTACACA -TCTCATAATGGCTTGGGTAGCTGTACTTTAGTATCTGTTGTTAACACTGA -GGGTAAGACCTGATGAGCCTGGTGCACCAAATTAAATATGAGATGATTTT -TGTTACCCCATCTTTACCACTTCACTAGAAAAAGCAAAATATGAAAATGT -CAACATGCTTTTTAATCTCCTTATCTGATAAAATTGATGGGATTTAGCAG -TTAATGTATAACTGACGATCCCTCCTTCATCTAGTTGAAATAGAGAGACA -GCAAATTGTACTGCAGGGTTTACCTAGAATGAACATTAAGCAGAGCAGAG -ATGCTTGTGCTCTAATCTTGAAGGATGAAAGCGTACTCACCATCCCACTT -ATTTGCTTACTTACATGATTCTTATAGCCCAGATGATAGAATTATTTGAT -ATTGATATTGGAGGCAGCCATGGAgcatcagcaagatcacctgggaggtt -CttttaaatagagattactctgacaagaccacagtCTggctgcttcagaa -gtttcatCAGTGGggtagggtgcaggagGGTGCATCTTTAGCAAGAACAT -GGACAGTAACCTTTAAGAGAATCATCATGTGATTCTAAAAGTTCTGCCAA -TGTGCTGATCAGTGAAGATTATTTAAACAGAGAAAACCCACCTAACCGGT -GTTGGGGGCCAGTATGAGAGGGTATTTGGTGTCCATCACAAATATGGTCT -CTAAGACAATCAGTTACTTCTTCGGATGCTTTATCtttatgccctgggga -gtcattagtgtaatgaacagcttcattcttctcttgcagagtccactgct -cactggggcaaataaaattgcagtcaggcctataacccagagtaacagtg -taggacgcaggagaCAACAGAAAATGATCATATTCTGGAGTTATCTCTAC -CATTGAAATTGTTTAACTCTAATTTTCTGAGCATACATAATCTTACTAAG -ATGTTCCCAgctgatacATCACACATTTTTACTCAAAAACACCGTATGAT -TAAACTAGTCTTGCTACTAAGCATATGACATTTCGGTTTTTACTACTAAC -CATTCAACTTTCATTTGGGACATGGAACCCCGATGTCTGAAGAGAATAAA -ACTTAATTGGAAcaatgattattaaagtgtggtcatagaaccagcagcta -agacatagcctgagaacttatgaaaactgttacttaagttgagcatgcgt -cagagtactgggaggttatgttgcgaaagagattaccattctTCTAATTA -ACCGTATATCCAgtcacccTccatattgaatcagagacgTTTACTAGGTC -cgatcccgggtctccaggatcgctccTTTctgggccagggctcaagAATA -ACACATTTAAGGACCCAGAAGGCTGCAATTTTCAGACAAATGTATGAAAG -AAATGCACAACATTGTCTAAGTAGAATATGGTGGCTAGTATTGCAAGATG -ACTTCGTACTGAAAAACTACTGTATATAGCATAACACAGTATTATGGGAA -AATAAAGGCAGCACAAATCGTGTATATCCATAAATATGGTGGGTCCCTTC -CCACAGTTTGACTGAAAAGTACTTTGACTTATTTGTATTAAGGGTAGGAC -AACCTGAATCCTTTTATAAAAATTCTTGAGCAACAGCTACATGGGCCTCA 
-GCAGAAtatccagctgcagatgtccatagtggttatcaaacctagattgt -gcacctgggttaaaactcctccactaaatcctccccatttgttattgcct -ggattggggaaacaagccatcttacctagggtccttGACAAGTGATGTAA -ATGTATGGGATTAAAAATTTAACTGAGTTATTAAAAGAAGGTTTAGATAC -GTACTAAGTAAAGGCAATTGTGAACATCTACCCTTTTCATCAAGATTAAC -GTGAGTAACTGAGCAAAGAAGATAACCATTTTCTTGAGTTTAGGTTCTTA -GATCCTGCACAGTCTAGTTATGCATATGTAATGGAAAGCATCAACTCTAC -TTTCTTACTCCTTCCTCAGCCAAAACTAAGCACATAAACCTGTTCTTTAA -TTAGCCTGCACCTCAAGGTGTAATATTTGCACAGTAAAGATCAGGTTTTT -AACTCTTTAGAAAAGTCAAGAGCTCCCACAGTTTCATCTTGTGAAGGATA -TAGATTAAAGTTCTGACAACACTACAATTTGTAATTGTGTAGGCTAGTCt -gagctacaaacagcacaatcacactctttaagctattctttaatgtccaa -tcaagtaggtattgactatatttaccctattactctcaagtagtaggtct -catttacttcttcttgccattttttaggcattaacccttctcacttctcc -ccgaaaccctaggcttcccagccactgatatacatgcttctactctctat -gaccatgaattcaattgttttgaaatatagttcaaacagatatgtgaaac -caagtcatgcctatctttgtgtttctCTgactaagtGAtcacttaccata -atgatctgcagttttattcgtgttgtttcatgtatgaaccaagggtgggg -caggaaacccagCTATAGTTGCATAGTGGCCTGTCTTAGTAGCATACAAC -ATTGCTTATATAAAGTTCTTTAAGAGATGGTAACAACTATAATCACAAAG -CACTGTTTTAAAATGATTACCTACATATGCACGCACATGTATATACCTTG -TTAGTCTTATGCtagctaatactttggagacacataggtgtccattaaca -gaccatggataaaaatgtgacaggtatgcacaactgagttatacacacgc -ataaagaagtggaagatctgagtgcaaagttttaccaaaggtatagctcg -gagcagatgggggtaccaaaaggaacatggagtggaatggtagttttccc -ataaagtaaattggcttagccgcctgcaggtaattcagttgacctcctgt -cagctctgcccaatacctcacgcttctcctcctccttagcctgcaagcca -atattggtggaattttcagcaggtcagcacctttttaagtccagcactta -tgcttgagcttgctagagtactctttaaaggtatggacgaagctactaag -ggatgggcttggacaatggtacttgtaggtagcataatgaaattgcccat -ccTATtcacctgagtaggCTAtcttcacagacattatgatggagccctag -ccagactgctgtttaccaactgctTACAATTTGACTTAGGTCACCATGAT -TTCAAGAATATCTTCAAGCATGAGTATTTCCATTCCTTCTAATCTTGTAG -TCATTTTTAGTTCCCTTAAGTCCCCTATATTCAACAATATACATTATCTT -AAACAGATTACTCTAGAGAAAGTTAATGACATCTCTTGAGCCTCCTGTAA -TGAAATTGTTCTACTTACGCAAAACTGGAACCTATAAGATAGCATGATCA -TTTAGAATCtctatctttctcttagcaatcccAGtctaggCCAGGTGCAA -TTTGATAGTTGATTAGCTCTTTATTTGAACAGGAACCAACAATAAAACCT 
-TCTGCTTAGACGATTTAGTGACTGGAGTAATTCTCTGATGAATCAACCAT -GATGAATAATATATTTAACCATAAGCACCAGATTTGTATATCTATTAACA -TACTATTCTAAACTTTTTACAAATAGAAAGTCAAGTTAGGACTTGTCATG -GTTCTTTCTAAACAACTCCATAACTATTTCAATTGTGCTTGATGCAGATT -GATGGACTAGCTAAAAAAGGTGATCCTTAGGAGTGGTACGATGAGTCAGG -AGGATAATATGTCCTGTTCTCTTTCAGACAGTAAAGGAACATCTTATTTT -AGGTTTAATATCATGACAATcagaaatactgggagAtggggaatggACTT -TTCTTATGAGTAAAATGTATAtcaacatcattgatccttcgtgaaattca -aatcagaactacaataagctgttatctcaccccagttaaaattccttata -tccaaaagacatgcaataacggctTgctaactaggatctggaGTAggtaa -gagagctgttgtacactgttggtgggaatgtaaagtggtacaatcTATgc -tatggagtaaagttaaaagattccacaaaaaactacaaattgagctatcc -tatgatccaacaatctcactgtggggtatatacccataagaaaggaaatc -agtatattaaagagacatccacactccccagtttgttacaacactgttga -caatagcccagatttggaagcaacaaaagtatccatTcaatggataaagg -gattaagaatatgtggaaaatatacaaaatagagaacttatcagccataa -agtagaatgagatgcagtaatttggaacagaataggtggaacCTGCTATT -TCACTGAGAGATCCAGTTCATGTTAGCTACAAGTCATCAGTTGTAATTAG -AAAATTGTAAACCATTGGGATTGTTTAATTAAAACTGGTGCCTGAATCAA -ATTCAAGGTGTATTTCCCAGTATGTCTGCCTGTTGGGTTATATTGCTTTA -GATGCGTCACTGTAGACTGGAAGTAGAGTGTTTTTTAGTGGTCTGCTTCT -TTTAGAGCTCTGAATTGAGATGTGCACACAGTGATCCCTGTACTGAATCT -ACTTATTAAGTACATAAAGTCAACATAGCATAGCATTGTTTTCCACCTAT -TATACCTGATAAAGTGAAATGTTAGTATAGCTTCCTGAAAGGTTACATTA -TATAGGGAATCATGAGAGTATACAACAAGGTTATAACCCTCCAAcattgc -cctggataagcttaaagtccagtggtgcagGTATGTTAtctatttggtta -caggcactaatagaccaggAgtggctgggttagaGctaggaggtccacag -gcgtgacactggcccatcagtATACcagttaaggcgtcttggaggaagtt -gaagtgggaagggctaAGTTTAAAAGAGTTGTTTTGAGTATTCGCCCAAT -AGTGGTGTCACCGTATTTGAAATTAAAGCTACTTATTACTATAATAACCA -ACATAGAGAGTTCACCCAAGAAGGCTTAACAAACTTCATGAGAAAAGAGC -TATACAATGAATTATATTGCTCAATAATTCATAGCACTAAAACAATCTTA -GACTGTGGGTGTTTAAGCATTTCCAAAATGTCTATTAAAGACCAAGAATA -TTTTATCTCTAACTTCAATTCAATGTTAGAGGGGCCAGCTTGGAAAATTG -ATTGCAATCAAGATGCAAAAAAATAACTAACTGTGTTACAACTGAGATAG -GCTAGCATCATGTGATTGTATTTTGAGAGGGTTTGAGCCTTAGATTTAGT -CCATCACAGAGACTTTCATTCATTTACTGCCCTAGTTCAACAATCTTGCA -AATGTATTGAGGATACCTTTCTTAGTTGACACCCATCAGAGATTACTTAA 
-AAACTTGTTAATGGTATCATGTCTCAAGGATTCATTTTTAAGTCTTTTCA -ACATATAAATGATTGAATAGCCTAGAGGTCTTTTGTGAGCCTCTGGCATC -AAACATATAATTGCACCAAGAGTATATGAGTTAGCAATACAATTGTGGTA -CAAGACAAGGGGTAGGAGCTTATTACCCTATGATCCAATGGTAAAATGCA -AAGTTGGATGTCATTTCACTTCATAATCCTAGGAAACTGGGAGTAGACAC -TATCTCATAGATGTAATCAGTTCTACAGGGAAGAGAATCTATCACCGCCT -TCTAGAAATTATTTGGCCAATAGTTCAGATGTTATATGTTACTTAAAACT -GGAGAGTTTCTACCTTCACACCCTAATAGTATACTACCTATGATGTGATG -ATATTACAGGTCCAGCTTCTCTCACTATATCGGATACAATACATTACATC -AGTTAGTTATTTCTCTGTGGATGAGTGCCTTCTTAATTACCTCTGTTTCT -GATATTAACGTACTCAAGGGTTGCCAGGAGTACATATTCTGCCTCCTCGG -ATGTGTCTTACAGCTTCTTACCAACAATTACTACTTTCGGTAAAAGTAAA -TATTTGGACTACATCACATGATAAAAttgatttggctacaggaaaaggtc -acgagattaggagaaatagtgcaaagtttggtagggccatgatgggcatg -gcaggggacgacagtgacaaatcacagtgctgtaaggcagggcctagctg -acttccatctgtcttcaagccaGcacgagcagaattctCAAAAAtaatgt -ttcaccagttTGCAGCTCATTGGGCTGATCAAAATATTCAAATGACTTTG -TTACTACCTCTAATCAGGTTTTATCATGGGTCATTCCATTGGCTTTGGTT -AATCTATTTTGCATGTCACAGTATTGTAGGATGTACCATGCATTACTATG -AATATCATTCTGATGATAGGGTTATTGACTTCTGTCAAAGGGTATTATAA -AATTATTTCAATTCTTAGAGCTGTAGAGATCACTGTTTATCAACCAGATA -TGTAAAGTGAAATGACATCCCGGtggaagggagctggcatgtatgatcaA -CACTGAgaagcaagcaagggatTGttcccggaggaaagtgagaaaTGAGA -GCCTATTGCTTAATCAAGTGGATTATGAACATACACATATTCTAAAACCT -ATGTTGGCCACTCCAGAAGTATAGGATTATTGGATAAATCTAAGTCCAAG -TGATATTACTCACTACAATATAGCTTAAGGATCATGAACTTGTTATATGA -TCTatggagaaaggcttggaagtaccaaaaagaaccaaaaattaagctac -cataagatcaagGCTAAGTGACACCAcaattccactgctaggtctacacg -cacaagaaaagaaatcagtacattaaagataCAtatctatcatacaggct -gaaggagaagcaggttccatgcaccgggagccagatgtgggattttatcc -ctggtctcaaggatagctccctcggccacaggcacgcgccaaactgagcc -ccacccagggattcctcctttagtggcagtatgtttacattatcaacaac -ttggaagcaaactcacagtcaatcaacaggtgtatgggacaagaaaatgt -cgtacatataccAcaatggagttcaattcatctgtaaagaagtatgagag -tcagtcatttgcaacaacactggtggatccagagcccactatgtttatgg -aaataagtcaggcTcaACGGGGATgaccatatgatccagcaatcccactg -ctgggtatatacccaaaagaaaagaaatcagtatatcaaagagatatctg -cactcctatgtttgttgcagcactatttacaatagctaagatttggaagc 
-aacctaagtgtccataaacagatgaatggataaagaaaatgtggtacata -tacacaatggagtactattcagccataaaaaagaaagagatccagtcatt -tgcaacaacatggatggaactggtgatcattgtgttaagtgaatagtcag -gcacggaaagacaaacatcacatgttctcacttatttgtgggatctaaaa -gtcaaaacaattaaactcatgggcttagagagtagaatggtggttaccag -aggctgggaaaggtagtgggggtttgggggggaggttgggatggttgatg -ggtacaaaaaaaataagaaagaatgaataagacctagtatttgatagcac -aacagggtgactatagtcaataataacttaattgtacgggtggcgcacgg -tttggcgcctgcctttggcccagggagcgatcctggagacccgggatcga -atcccacatcgggctcccggtattCCCCATCTAAATTACAGAGACTTGTG -TATTTGTTTGATATATAGAGGATCTTTCAGAACCCTCGAAATAATTTTGA -ACAGAAACAGTGTCCCAATCTGATGTCAAGGATGATAGTTGCAGAATTTA -AGCTTAAGCCCATGTATGCTAGTTTTCTTTACTCTGCCATCCAAAAATAG -CAATGAAATGGACCATTTTAGCCTCTCTGACTAGCATAATATGGTTTCAT -GTGCTAATTTTATAAATATGCCCTATTTTTGATTTACATGTGAAATTATT -GAAAAGCAGGATAATGGTCAAATTTAGGCGTCCCTTCACAAACTCTAAAA -GTTACAAAGGCCCCTCTCTGATGGCCCATAGCAAGCATAATAGCATCAAG -CTGACCTTGACTCTATGGGAGTCCATAGAGGAGCTTGTTCCGGATATTCA -CTACTTCCTGGTAGGCAATTCCCTAAGTTATCTAAGCATTATAATAAATT -CTTCTGATATGTTTCAGTTAGCTAAGCAATGTGCATGTGTGCTATAACAA -ATCTAACGATGGACTGATACCTATAAAGATAACTTGGCAAGTTTACTTCA -ACACTCAAGCCAGATATGGGAGCAATACAGTAATAGAATGGTCAACATAG -AATATTCCGTAGCTGCCTACAACAATATGAATTCTATCTTCTATGCCAGG -GTTTTGGCTTAAGGTGAGCATCAGTTCTTTATTATAGCTACAAATCTATC -CAATCAATCAGTAAGTAAAGATAAATggcagcaagagtacagctaaacac -ccgcccgtgtgctaccaaattccagaagactaccatAtttccgactctaa -ttccaaggccctcttctgcttgctctaatctcccagcccacctgtgatct -gattttttcccatctaataacACcatctctctactctctttgtatacgtt -ggttttaatttttagatctcaaaaattaccaggaacatttgatattcttt -gtttagtgcctagcttattttactgaactgacagatcaccagttttagcc -atgttcttgtgactagacctcatacttttttacggctaaagagtattcct -tggtctatatgtacaacaaactctttacccattgatctgctgaagggcac -ttaggAttgcttttaaatcttaaacaTATTGtaatgctgttgcaactacc -atgggagtgcaaatattacttttcccagttcatgaagcctgcttctcctg -taatattaatttcctctctatagggtatatatccaggtgggtatgctcca -tctaatgagttgttttaaggaacctctaaaatgATAGAGACATATTTGTG -ACAGGTTCAATTCAAGTATGATACTTCCACACCCTCAAGTGAGTGCAGTG -GTTGACACTTAGGTCTTTTATACTCTACCTGCAAGTCAATTTTATTCTCT 
-gtgtttttcgatgtaggaatagtatgtatcacactctatttcacctatgt -ctatttatacactctggatccTTAATATTTTTAAtgctatatctatacca -ttataaaaggttgcagcattacagagcataagtctcttttatgagcttat -gtcctttcatgtgacatatatctaacagaatgaaagttaaatcatgagtt -ggttcaaattttaaatatttgaggaattgccatttatatctcttataaca -agaattctcttccattcttccattgaagaattttaatattaatttattat -agaattctctgagctccatgtcTTTGACCAAATTGCCTCTAACATAGTAA -CTATTGCCATGATTTATGTCCATAGcccgcttattccttcatatcaaggt -cagagaccttgtgatatataaagctcttatggagttacccccttgtcccc -ctttatccccttgccatgggggtgtggctctccagtctcctcaaacctcc -aagggatactacaaagtggcaggtttgcggcacctaccccaaagccTTGg -tgactctgggtTcaatttttgcggcttaagaagaccctttgggcatgtgt -ttcaggatgtttttttagtttgccttctatctaccacatgatttacccaa -ctaagtcagaatttctatcataaggaaagagACTggaattctatatccag -gagttccttgtctcaatgtcccagaagaatcagatgagcccttgtacact -gttcgtcagagcatcagtgcaaagttttagtgagtggtggagctctgagc -acatgtgggagcaggaagggagatggagtaggaagatgttctccccttgc -gtctggttccCGGGGTTTttatcacacagcagaaagtctaactgagtggg -gttacacaatctccctgtagttgggaattaataagagcaccctgtaacac -acctccactggctcttaaggaactgtaaacattaacccttcggtatcatt -gtggggtcatgacccaacagactgtatttcagtatgctctcccagaggtt -caagcagtggggccctctagaaggaagctacatgtaaagagcaagtcctg -tACAgcagaggacaatgaggtatttcccatttcCAATCCAAGTAGATGAG -AGATTTGAGATGTAAAGGACAGTTACCAGGCCAACCATTAATAAACTAAT -AATATCAGCAAACATTCAGCTACAAGTTTATGATGCCAACCAATTGGATA -TTAACAGACTGCTTGCAATCCTGGGACCCAGAAACAGATGGTttagatac -ctaaaagaaatgttatgtttgcctcaaaagattacatttagtccaaagtc -tctttctgacttgtatacaatatctaagatcaaatgaatggacaactagt -atgtgttacatacacatataggaaccccttttggcttgcataatgaagag -atctatccatttgtcatggcatgggtgggactggagaggactttgtttgg -caggatagaccagttacagaaggactactgtaacatatcttcaataaggt -gtcaacttatattaaaattactgctAGaagttaatattgactatagtgtc -ccttactggctattTTTGATGTGTCTTTGCCGGAGCACTCTATTCTACAT -GGAAAACTATAAGTCTACCAGATCTGATACCATTTTACCCTTTGGCAAAT -ATTTCTTTCATAATCCAAGAGTACTATTAACCTGCTAACTCCAATGATTA -TTGCCTGGTTAGCCATGCCTTAAAATAAATAGAGCCCAACTGCTAGGCAC -AGATGAAGCACAAAAAAAATGAAAACAATAATACCTACTCCTAACTTGGC -ATTTGATTGGAGGACTGGACTATATCATGCAGGCTACCTCAAAAATCTGA 
-GTTGTATCTGTATAATGATCTCAATTACCTGCCTTACTAGACAAAGGGAC -CAAGTTCTCCCCAAATATCTAACAATGGTGCTGATATGTAGACATGTTTA -AAAGTGGTTGTACACTATCAGCTTCTTCAATTGAACAACATCAACCATAA -CTAAGTTGTAAGAATAATATGAATCAGACAGAGGTGTATTTTTAACATCC -CTTAAGGAGTGCCTAATAATGAATTGTTTTTAATGGCTGGATTCTTTGTA -CTTCtagAATGcataggatctagttaataaggcagATGGTattttaggtc -TTTTCAAAATGTGCAGCCAACCCTCTTGAATTCACTTGGAACACTCCTTT -AATTTAATGATTGAGAACCTTCAGAATAAGACAGTGTTTACTTCAGTAGG -GCAAAATGCTACCTCATAGCTACTATCAGATGCTGTATTCAATTCAAATG -AATATATTTGTTTTTGACTATCCATAGGATCCATCTACAATTGATTAGTA -CCCAGGTTAGGCTCCTTGGGCCAAAAACATTACTCAGGACTAAAGCATTA -GCACACCTTATGCTAGAGAAAGGTTGATTGACCATCAAAGATAGCATGCT -ACCATCAATCAATGGTCTGCACAAGTATACCCCTTGTATCTAAAGAGTGG -ACATTAGTTTAGAACAGAGAATCCAGATAAATATGCCCTCCAAATCAAAT -TTGAAGCTCTGTACCAGTGGAATCGTCACAAAGTTAATATATGGTTATAG -GCCTGCCACAGCCCATGCTGCTTCAATGATATTCTCGGTAGTCAAATTCT -GGCCATCAACAGATGAATATGAAAGTGGCATGTTTAtccaaaaatttact -gagtgcctcctaggtaccagccattgtgctaggcgctttcgttatattgg -tgaacaaatggtcactgttcatgtcatctaagagctcactgtccaTCCCC -AAGCTAACATCTCAGTGAGTTGATTACATACAAACTGTTGGACATAGTTA -GGCTTTGTGCTGTCTTTGTGCTCCTACCATGGTCATAATTGAAATATGGT -GGAACCGATTTGTCTATTGTGACAGTAATGTTATTTTAACCAAAGGGGTT -CTCTGGAAGCTGGTTTCCTCTTTAGGTATTTGGAAGTACTCCCTCAGCAA -TAGGGTAAGCCAGAAATGAGATATTTACTCTTGGTTGTCATTGTAGCATC -CTACTCCCCATCATCAAATGCATCTGATTATGCTTAAGATGATAAAGCTG -ATGTTGACTATCTTTtCATTcatgcattctttctcattttttatgatcca -ctatataatcaccccctctTTTATGCACATTtctataaaccttcccagca -aatgtTACCTTCTcatccagtcttctacaccttGtgtccatgagctcaat -ggttttgatcactggttctcacaaataaaggagaagatttgatgtttttt -ttgctgtgtctggattattcacttgacataatggtaaacagttccttcct -tggcattatctatgactggagctaaatttttgttaagcctgattagtagT -GatgagtgtgtagaattcacGcaacctttcatatagtgatggacTactta -agaagctgccaaattttggcTCtattgtaaaaaTtactaagcacaatagg -ggggtggaaatctcattggtatattcatttcctttcttatgagTTTTTta -taacctaagacttggtcccctggatcaggcggtgaagaattttCCCTCCT -AAAGAGCCAAATCCTTACGCCTTATGGTATCAGCCATAAATCAGGACCTG -AGCATATGGTACTTAAGGAGTATCCTACTATGTTAATGAGCAGTAGGCTG -CCTAAGTGTAGTTGAGTGAAATAGCTCTCTGGCCGCTCAAACTATGCAAA 
-AATTCACAAATCTTGAAAATGTGCATTCTCAATTCCCGTAAGCCTATGTA -TTTTCTAGCCCCTCAActctccActgggccaaaaCAATTGGTATTTATGA -CATTTTTGAtcataagttgtcttgatataggcatgcaatgcaaaataagg -gtatcattgacaatgacttacccattccttcaggagtgtaccttttaagt -tccaaagtctcccaatacaccctttgagtaatgttgtaatgtacaataaa -gttcttatccattatagtcactcagatggtacctatagtaggtcttactc -attcctccttaatttatttgtacccatcaatcatccccccctaatctgcc -actaccattcccaaactcaggtaggaatcctcctcctctttgtttcgatg -agtactgtttggatttttggatcccgcaaatgagtgagaacatgtactgt -ttgtctttctgtccatatattatttcactaaacataatgatctccagatg -catttatgttgttgctacaacctctctGttcttatggcagaaaagagctg -cattgtgtatatgtagaacattttctttatccatccatctgttgatggac -attaagattgtttccaaatctgggcaactgtaaacagtgcttcaaaaaac -ccaggagtgctgatattgttttgacttacAtgattaccttttttttgcgt -atatacacagtaatggggttacaggagcatatggtaaatcaaggtctagc -accatggggaacatcataactgttatccataatagttgtagtcatgtaca -ttctcactaacagtgtataaggaTtCATTAGGGATGtgcctgtcccctac -atcttcaccagcatatgttattgcctgtcttttggacatAaagcctttat -acctggtgtgagataaaagcttattgttgtgatagtttgcaactctctgt -tgcttaatggtgtgtaacaccctttcatatgcttgtttgtagtttgtatg -acttcttactaggaatgacttttgaaaccttttgcccaTgACCATTTTAT -TTAGAAACCtttttgaacagattcttacatttttacctacagagttgTGA -GATAAGGTGTCTGGTAAGGCTATCAAAACTCCAGCAGATTGGGCTAACAC -GAGTTTTTAGCTTTAAGATCCTGAAATATGCCACCAACAGAATTCCTAAC -TTGCAAGAAGGACATGGAAGGAAACGGATCTATACATTAGAGTTTGAAAA -CAGGATGCATTGGTATCTTTTTACATTTGGAAACTGAACTATGTAGTAGT -TTTTAATTTTTTAGTAGCCATAATTCAAACCAAGTAATCACGGAACTACA -CATGCATGGACTAAGTTGTAATCTAGTTGGGAGGCTCCATTCCTCTGCAG -GATCCAAGTATACTGGTGGTTTAGATTTAGTCATAGGTAACATACTTGAG -GCTCGGAATAATTTTATCATCTATTCTATGAACCTTGTTTAGATATTGGA -GGTTATATGTAGGGGCCCAAGTCCTGGCCCTCATAATACAAAGTTGGAtg -ccttatcccaggatagagagctgtctgcatcccagcgtttccagccttga -tggaggaaggttcagatcataaatgagcttggagaatgtggccagagttt -tttagattggaagaagctcagcagatggggaagctagatagggaattgag -tgggcaagtggttctccccaggagtccatgcactcagtgacctgcctgat -ttccagctgccctcctgcatcccctgttaaatATGTAATGTGCATtccca -tgggtactaccagttagtgtcctgcagactccactgtcagtggcttcttc -cccaggtgtgttcccctatactcctggcagcctgtgtgtgtgcCctTTCC 
-TAGAGGCATGGGAGCTCAGGTTCTAACATACAAGACTGTGAAAGATAAAG -AGCACACTAAACACTTCTTTCAAACATCTACATAAACTCCATAATAAATG -CGGACACACGTCAGAAACCCTACTTCTTGTAAAAGGCACGACTTAGTAGT -TACTACCTTGTAATAcccaccccagacctactgaatcagaaactctggga -CAATAGCGTCTGGGGGCTTCATagagagacacatctccaggggaaaaata -ccttcccactgaatccccccttggcagcctgcacctactgacagttactt -ccactcattaaaaccttgcactcattttccaagctcagctgtgacctgtt -gctttcagtaaatcaaggtccctctgttcttgagataagcagaggGGAGA -AAAGTCTAAACAATTTAGTTCGATATTTTACTAGAAACAAAAGGAGAGGT -GATGAACTTTAGTCCCACTGTATCATTTGCCTTAATATTACCACATGCGA -GAGTACCCCCTAATACTTGTCCCACACATAAATGAGTAGTACACCTACCA -AACTGACATGACATTACTGCAGGGATGATACATAGCCTGTTACCTTCTAT -GCCATTTTAGGAAACTTACTCCACATAACCCATTAATATTTAGTATCAAC -ATATAATATGACTAGGGCCCTAAAAATTGGGCATGCACACATTTGTTTCT -CATTATCTGGCATTCATTGTTAATGGCTCCATTGTGTCTTCCCTGATTAG -TCTATGGCCCTTAAACTACACAAAGAAAAGGCATAATTTTCAAAAAAGCT -TTTAACTTGAGGTGCCCACTCCAAATTGTAAGAATTTTGTAGGGGAATTG -CTC diff --git a/programs/lastz/test_data/pseudopig.fa b/programs/lastz/test_data/pseudopig.fa deleted file mode 100644 index 63ef5a2..0000000 --- a/programs/lastz/test_data/pseudopig.fa +++ /dev/null @@ -1,693 +0,0 @@ -> pig1 -AAGCAATTCTCCTACGAAGTCCTGGAACTTTGAGGTGGGCGTAGTGAGTTATGGGCTTTTTTCAATTGTCTATTCTTTCCTCCactatacattgactcat -gaacagcggacctattaactaccagtgtacttctcgttgttctattggcagggacacTAGCGtgtacatctgacTTAAATgttgcgcactgggtacaagg -tttagttcatcgagagttacactcgatgtgcggggttagcataaggggtgtttagaaaagaggtaggtcttccctggacatcgatctcttgGCTATGTCC -CTAGACCCTGTTACCAGACCAGGACAAAATCTCTAGCTCTTGCCTTAGATAAGAATTAGAGTTCCTTACATTTGGTATGTCATTGAGTTTGTAAGGTGAT -GTTGAATTTACTCTTTTTCTATCACAGCGAGGTATGCAAAGTCCTGAGACGATGCCTTTGGGTTagAgggcgcgcaccatcccccggcagctcgagagta -tgaacttcaggaagggactgtaagcgcttcagcctgcggaatactagctgctaaataccatgaaaATAACGCGGTATAGgtccagcaccgagttctgggg -ggcagatggaaaacgaccagtctggtgagaggatagcctcccggggaagacgacacccccgttgcatgcccgatccggcttcctcttttgttaaacttct -agcctttaatagaaccccgtgcccatattcccagccctttaataacttgatcctcccaatacgtcacggcaggagcgtctcgaacaaagtaaactctcag 
-tcctggtcaatacagcagatcgggccgaaaagatgcacatcagctgccatagataacaggctagaggagctcccggtaaaattgagccatagaaacttcc -agagttgtggatattcactattcgacaagaccctcaatttcccgttcacccctgtcttccttgaaatttcgcagagcggattacAACACAATTTGATTAA -GGTATCCAGTTGAAAGAGAGTGCATTGTTCGAACATTGTTATCATAAAGAGTTACGAATGTTCTGACTTAGGAACGTAAAAtacgcaggagtgaaagcta -ggtaGCAagtatacattaagaataggggatccgcacccttaaacattcacttcgtgaatcacgagcaacgccattacattttgtaagttCattttaagac -aacgaatcatattgttgagacatcaagaagtgagcttcaagcgtacctttttatctttcttgtgctcagtaaatcgtcccacctgggccctaaatcctcg -ttctccttcccagccttcagtgggtattttgagtgtccctcaggtaaccaaagtcacccaatagttcggaaaccccgctgctcagcacaggcccgtgacg -aaggaaattggtgtattaatgtgatatctgtgcccccgtgggtgttgcgatgctgtctgcaatagtttagatgcagtcataacttaagtctccgtcaact -tctggataagtgaaggatccgtggcaaatatgcgtaattgagtactgttcaaccatgaaaaagtgaaacttagttatttgggacaacatgggtgggactg -gagatcatcatgttaagtgaggtaagcctggcatgggagggagacaccgggtgctgtcagtcaattgtgggatccagtaattaaattgattgaacccacg -gagatggcgagggggacgatgatcagaggaagtgtgacagaggctaagcacatcatagggagtagggtatcccttctgtcgagcatttgcttcttgagtt -aaaatcacccccaccacacctaataagttgtcttaaaatatacaaatcaattactattgctgataatcacctcgtcgtgcgaacaaagagcaggtctttt -tcatactctcttgtttgctttatgtccaattaccagccctcccttccccccaaaaccctagtacccgcaccatcctcggttcaccgttcttctggtcgct -atggtcacagtgtaggcgtgctggttattagatccctcaattaagggagatcctttgatgcatgtctttctgtagttgactttatttactcgccatagtg -tctttcagctcccttcacgctgtttcaaaagacaggagctaatgctatcttatggccggatcgtgctccagtgtatctatagagcgcgtgctctgtatgc -ttccatctcttaatatacacccaggctgtttcgaggccttagctatcgtaggTAAcattgcggaaacaggttggggagtacggacatccttttcatacac -caatcttctgtcttttggatatgtgcccaacagtggggttgctagatcatattgcaacctaatctttggtttttagagaagcctacatgctaattccaag -agtagtcgtacaaatcagcttgtcccgtggtagttgacaagtgttaccttttgaccatatacgcaccggcacctgctgccgcctatcgttcgcttacaag -ccattttagctggggtaaaatggtatctcgttgcagttgtcacttcgactcccctgatgattaatggtactggccacccctacacatgcctggttgtcgg -ttggatatcctcctttagggagcgcgtaatcaaatcttttgcttatctgttaatcagatcattacgcttcctaccacagagaaatttgagccgggaggca 
-gagtaaggctcaggagcttcgtggggatagttgctggttacaaagcatgtgagtaagagtgagtaaggcctccaatcctaaagcacaacagggtaattat -agcaataggagcttgcatgtgcgttctatagtaactcaacatgtgtgactgtgttgtttggaacctgaagggtaagtgttcgagggagtgggtacctcgt -tctctatggtgtggttattgcgccttgcatgcccgtataaacgcatcttctgtagcgcacgaatatatacacgtattgtttatcctttatcctttgcctc -cGAGaggagtcccgtcttactggttgttagataccataaacaaatgggaacaaagagtgtttgtactttcttatgcttttccttcatttaacgtaatcat -ttctagccctaggcatgtcatcgcgatggcccggatctcattctatattaaaaccgagtagtactctCAAGCCGTTGACAGTAACGGACGTTATAACTTA -GGGCCATTATTTTGTATATTAGGGGAGAGGAATTAGGCCTCCCCAGACACCATCCGCGGTCCAGGTAAGGTCCATATAATGGACACTGAAACGTGTTTCG -GTTAGCTACTAATATATTAGCTGTTGGTATGTACCGTGCTTGTACCCGGAAAGCATGGCGTCGGTCAGTTTCCTTACGCTCTAGCTCCACTATACATAGT -TTAGAAGTTACTAGGTTTGCCTTATTGATACAAAAGCTTAAGGAAACCACTACCTAATACACTTTTTTCGTTTTAAGTTACATAGTCCCTGATGCTTTAC -CCTTAGTTCAAACCTCGACATATAGATCCTTTCCTGTCCATATTTTTCTCTCTGTGTAGAGCTAGGTTCATAGTGCGTTTCGGGAACTTAATGCCAACAA -CTCCAGCCTGCACGCCCTGGCATTGGGTGCTTTCCCAGCCCTCCAGTCCCATCAATCCCAtaacgttgtacgtaaaaatgtggtactaaatGAtcaaaaa -cATGcgggtagtaaatttctgtcgccgaccctaatagccagacaactgcaaacaaataaagaatgtacaacgccgttcggtaagagagagatgtaaatag -agggttggacaagccataacatccctccaggaatactggactTatatccgtaagaggagaaatcacaattgctaacggtgacccaggggaaggggaaAtc -tttacgtaatgttgttgagaatttgcatttacaactattaagtggaacggtcttaagggccggagttgcgccagataaggatctcgcgtgagcgccttta -attccagtgttgtatacatacccaggatggatgtgttctaccacgaagtagcaacagtatttcttagaactggcaggttttggaagcagttttgctgcac -atcaagatagagagagagaaaagatgtgccgtatacacactgaaattcaatcctcagagaaggaCaagggtctgggtgtcggtatcgtcataaccccatc -tagggactaccacgttaggtagggagatacgttaaccggaacataaacaaacggacacttacttgtttgcgaaagctaaaaatccaagcatggctcatcA -gcgtgagagaggaggaggatgactcccaggcaatgggaggttagaggcagatgcggtgggggtgactgatgggtatggccagcatacgtacgaatacctt -aaacctaccgtacgaactatagcagtcatcaaaatgggtgtccattaaatcttacctctcagaatagcataaaaaacgtaattagggagtttatgtctcg -aagagtaaacccttggaggtgcgggtgctccactgtccataatccccctgttctacacagcacgcctgaaccgagaccgctaatatccGGACAAACATTA 
-TGAATTTCACTTGTGGAGGGATAAGAAAGGAGAATAGACTTATGAGATTTAAAAATCCACTCTATTCGGATTTGTGATTCTTTACACGATCAGGGCAGCT -CAAGGATAACTTCACCCAGCTGTATTTGGGCAGCTTATTGATCCTTAACCTGATGAGATACTGCCGATGTATAATGAGATAAATTCCTGATATAGGGCGG -CTTCCGGTATGTACTGATTCAACTATTTCGGATAGagaatcctctacgatcttatccagcgtaccaagtgttggattattAGAAActtataagaggaaga -atgaaaataaggaaaagatatcgaagcgctcctgtcgtgcctagggAtgactattatgTAgccaaaagcccgaaaccccttaagtAgtccgttagtagcc -gaaaaCgtcgGgagaactttatacaatcaccgacttaccagacctagcacacacgcaatgctgatcatatacatgctaagagagtaactttttcccacta -taacgagctgataagctggacaagcaaaatcaagctccgcaCtcccgcttttattcatgagaaccaatgatggagctcattgagctcaccgataGcagat -atagaacactggattTGAAAATCtcacacgtccggtcaggtacctttaagcgGATTCGTGCAATaaaggaggcgactgtgtagtggttcataagagctga -gaaaagctgattaGGTCaAAATATAGACAACAATGGCGTCTCTCTCCTGAGCCTAACCAGATGCGTCTGAGGAAGATCACACAACGGTGCCGCCCAGGAC -AAATACTTCGTCCCTACGTAACTAACTAGCTGGAGGCATACTATCAAACATCTAAGGGGGAAATCTGGCCTCAACAGGTCCCTCTGGCTGATATACTATT -AATATCGCAAGGGGAAAAGCTCAACAATATATCGATCGTACCTACAATAGGAGTACAAACCCGATGGGAAGCTCAGCTACGTACAAGAATCTGACCGTGA -TAAAATCATGTCAGACGTGGCATACGGTtggtccttgggtctctaaaccacatgcgcagtgacagttcattctccagccttccattgacCcagaacatcg -gctggtccctaggaggtactccacagatgtttattagtTAAACAAACCACTCCCATACGCAATTTTGGGCCGTTAAGGTTTGGATACTCAAAGAATATTA -TCAAAGCGATGTGGGCTCCAGCGGTCCATATAATCAAAGTGTTGACTTTATGATGGTGCCGCCAGCACACACCCTCACATTTGATTTGACAGGTATTATC -GTCTTGGTGGTTTCTTCCAAACTCATGCCCACCCCTTCGATACTAGTTAGACTCCAGTGCGGGTGCACACCCTATCCTTAGTGGTCTATCTATCCTTGTT -TTAGTGATAGGTGAGATCGTGGCTTAGTCCAGGATGAGATTTTGCTCTCAAGAGTTCTAATCTCCTTCCGAAAGTGAACCTTATGGCTAAAACAAAGTGG -AGGATGTACGTTCATTCGAAATGCGCGCATCAATAGTGGGCAGCTATCAACTAGCTTTTTGCTCTACAGAAGGGAATACTCCAATAATAATTTATTCTGC -ACGCGTGGTATCATTCAATTAAGAAAATTACTCGGCTGTATTCAAGTGAGTTGATTCTACATTTTGAAAAggtctagaacACCGTctgctttactcgctg -tttcatactgtaGAAGTGTAAGGAGTTCGGATGTGAGAAACAGACGGTTATCAGATAGCGACTGTGGAATGTTGAAATTGCATGTCTGTCCGATCCTCTT -AATTCGTATGATAGAATTAGGGTTGATGTTGTTCCACAGGGGGAGTCGATTTCGTACACCAACTTCTATACACCCTTACTTTCTAGCGCTCTTTTGAAAT 
-GACTGGGAAGAGCTTAGAAGACCCTTTGCTCCAATAAGTCTCATTTGAACCCATTACAGTCACAAATGCGATGCATGAGGGAGTCTACATAATGTATCCC -AGCCCTGCCAACAAAAGTTCAGCCAAGATTCGATGTGATTATTGTCACTTTCTATGTTCTTCAATTGTGCCCAATGTTCAGACGTTTTTCACCTTGTGGC -CCGTCCAGCCAGGGATTCGTTGTCAAATTCGACAGTTTCATCAGAGTCTCCGATCACCGGCTGGAGGTGTTTACCAGAATGTGACATTGGACTATACCTG -GTGGACTTTACCGGTTTCTAGTTTTTTGTACCATCTAGCGCAGAACGCTCCGGTGAAGACGTGTAAATTGATGACATGTCGATTCCAAAAGCCAAGTTCC -TTTGTCTTTCTGATGACAAAAGCAGATTAAGAGGTTGCACAGATTTAGGGATCCTCTCTCATGATTCCATTACGGAGCGCACAgggctTCGCATTGgttt -aggcaGGCTGCGGTGACAAACGCCTAAGTCCCGCATCAGCTATCTTAACGTTGTGGGTATACGTAGGTAAAATTTATTATTTACGTGTTTCTATCTTGAT -AGGATGGCGAGACGTTCTTACGTAGGATTTACTCAGGTTTACATATCATATAGTACTCATATtctgaaaatagtgcacatgtgtacattaaaggggccta -tggcatgtctttataggagggtaaggttactgatgacaacTaaacaagagaaggggcgaccaggacctagagcgaggcacatcaaagtcactgcaaatga -taatatgttaacgggcatacaggtgtaagctaatcctgtgatgtgattagggaagagcaatccatgcatagctgatctgccatactgagtaaagtacatt -caacttgcatccaagttgggttaaatcactaaatctcttcctagctatgccagaaaggggctaaatatggtgacttgatgtagatcccttatctacctat -ggaGTAGCAtctttaggcacttagactcgtccgaacctcattcttcatggataggccagaggcaagtagggtactaagtttatttcttgaaggtacggtt -tactttacttttggataactaaATCATCTGTCTAGGGCCCTACGTCACAAGGAGTCCTTTAATGCTCCCCTTGGCTGGTATCATCAACTAGTAGTTGGCC -ATCGGCTTAAGTCGCCAGTTACCTACTGGATGACCTGAGTGCTGTCAGCCCCCCTACATCTTATACCTTTCATCTGCTTGCATTGgaggttgggaatatc -gctggtctccatCGTacgggtcatgtctcttaggtgtacttggttttctgtacactcccccgccgcgaccatggggagagcacatgggcatacaggtctg -aggtcatcatcaaGTTTAATCCgcaaaattatcttagggcccacgcttacagttccataaatcccggtgtcggggtatgacagggtcccaatgcaatgct -ctaacggcagggggaatgtgttattccattcgaataaatagtttgcccggctagaagcagtatactataaaggtaagtggtactcccactcagtcttcat -ctctgggccctcatacgctactacttactcgagtgtcgtgttcactatctaggacgtctaattacctagttaaaagatcctgggttataaccgcccttgg -cacacggcggagacccatactgaatttggtcaccatgtagcacttcaaacagctaagaacacatcctatgacgtatatctgcaagaccttctacggccgt -ggaaatcgGggtcagggtcacCAAaaactttaggtgaggacttccgagtatgttaccccctacgatcccgtagagatcggaccacgctggagaggctctc 
-acccgtggttctggggttaagatgaacaaggtggtctgtcaactcacagctcttggtatgaagaacagagcaggTTTTGTACGTGAAGCTAGTCGTCGCG -ATAATTCCTATATCGGCGAGTAAACACAAATTGTTTGATGCAGATTCCCCTCAAGTTttgaaaatagactcgggtgattttaccagaatttggtaataaa -acccgtctctggacaagtttcgagaagtgaatcctcgttggcctagacacgagtaccagactttcatctgaaagggtgagcgaaagcataccacagtaag -cttgtaCCCCAAGTACGTGaaatcAATTgCAgaacacaatagcaaatggtattgtttgcaaaaatgggctTaaatcaaaaaagagtataattcatactgg -ttaaagaatgtaggaaaggaagctctgtgtactgcgTtggggaatataagcgccgcagcgactctgtagaggagttggcgatttctctagatacctagaa -ctcaagtcgactcaccgccaagccttcatctaacgtacatagtcacgacTCTaaaggaggtcagttcTccgcaagagacttaggcattgctatgccgcag -ccccttatgaaaacgcagcatagagTTATAAATACTGACgcaacctggccggtccagtagaagcaaacaaaaacagaacgggcacataatcagagcAGga -agtattattttcccatcgagtctgccAGAGAATAATCTTAACAAAGTTGACCTTAAGGTATTGTAAAAAGTCAAACACGCTTAAACCGCACCCATCATAC -GTATGTCTCACGccttgcaaagtttctagacaacgggttattagatggtgaacagccattgtaagatatatactccatatcggtaaaattggtactgcag -agttttccgtactcctacggtagttgagtcagaattacatttaagcatcggaagaactctatacgccacgcaattgatcgacgggtctacatcatctgtc -acgtataggtaatggggcgtacattcgccgtgggaaagcgtatgatctaatcagaggaacatgagtacaacttgcacttgttccattaagtgaaatagga -cagacacctagcaaagaaaacgaaggatccctgtcaaatttcgagatctgacaaaaaaaactaaagtaaaaggagtgagcagaaaggtgATgttatgaca -tgcgaacagatgagccggACcaggtgggcgctgagaattaaggagagcgaaagggagtctcgggctcggcatggaatctcctggggttgctagctcaggg -acccgagcattcaattacattccttatatccatcgaggactttgaatagttctgtcagcaaaaattattctgtgccaatgattatttctgttcctctagc -acgcgcctcttaaggttgagctgattgcgtatatgcggatatgcgtcgcgagtttttatcgttacaggaagtcagcatagcccccagggtcgttttggct -aagtcaatttttgagcaggaaaatgaacagtcctgttgtcactaagctggggtgattgtactctgggatggaggtgctagcggttagaagatatagaaga -agagactaagaaatatcaagactctcattgtttccgtaactaggagcctttgcttcatagtgtctcaatgatgcacctaacttcgactaaaaTAGACCTT -AAACCACCAGCTCAATGAACAGCGGCATATGATTTAATAAATAGACCTGTGTACACATCTACATTTAATGTTACGCAAGAGGGCTCCATAAGTTGCATAA -AATCCTCTATTCTAGCACACGCACATCCATTTCTGGACTAGCATGGCTTGTCGGGTGATAACGCTTAAAGCTCTATGACTAACGGCCACCCTGACATTTG 
-ATGTCTCCCAAGCACGCATTGCGGTAAATATATACGTGTCACTAACTGGTGGATCAGTAActttctgctaggatcAaccgaatgactatgatggctattc -atttagagagatgttgcgataatttggtagGGGAATCGAATCTTTGAAGTATGATCGTCTGAGTGTGATTTTTATTTTTATTCAACCAAGGGGCATATGC -GAATTACCATTGCCTTGAATGAGCCCAACTACTGACCCATGCAGGGCACACGAGGGAGATTAGAAGGCTTATAACGCTATAAACTCGCAATCTTAGAGAC -GGAGCGCACTCACCTAAATATGTTACCTTTGCGGATGATTAAAAATATTTCATAGATGCAATAGGTTCTAAACAAGCACCGGACGTGTGGAACCGATCAC -GTGAAGGGGCTGATAGCTTAACCTAGGGATGCTACCCCCGTTCAATTACTAAGCGGATAGGTAGCATGATCAAAGGCAATAGCCTAAACTCATAGAGTTA -ATGCTCTCTCTCAGCAGAGAATTCAGGATGGAACATAGAATTGGTATGGATGTGGGGAGCTAGAGACTTTCCATATTGTTTATTTAACTATTGAGTTGTC -TCCAAATCCGGAACGGGTGCAGTAGGCAAAGAAGCTCCCCTCATTGTTGTCTAGCCGTAGACAGGACTACGAAATAGTGTTAGATATTAatcgaaggggc -aagaagtaataaaggtcaagaagccctgAGTCAGTACggtcTTAGTTTCATAATACAGTTGAGGTAATGCCTGAGGTGATCTTGAGAAACTCGTGAGTGT -GACGTGCATGTATCAATGTCAGTATGTTAGTTAAGGTTAGTCTAGATACTATCCTGACTGAAGGTTGGCTAACGGTCACCACATCGGCAGCCCGGCTACG -TATGTACGCCTGTTGGGCAAGTGGAGTCTCAGTTTGACACAGGGTGCTTAAATGTAGGCTGGATCGATGACAGCATCTCCTACATTAGCACAATAGTTTT -GGGCGTGGTACTTGCTTTAGTAATTTCAAGATGCCCTTATTCTAACTCAATGAAATTGAAAGATAGACTTACCTTTCCACTGGCCATCTATTATAGAGCA -ATTCTCCGCAAGGTGACAAATACTTGCTACGTGTTACTGAAAGTGTTTTGCATACGTCGGAGCTGATGAACATGGAAGATATTAAAGCTTGCTCCCATGA -ACATCACCAGCACATGCGTCCCATAAAATTACTAGGTGCAAAAGTGTAGTGACTGGTATACTCACAAACAGTACTAACTTGAGGGACCCAACCGAGAATT -CATGCTGATGTCTACATTATCGGTGTTCGTCATCAATATGATTACCAGCCACTGGATAAGGCCGTGATCATAGGGAATAGCACGCAAAGCTTACCATATT -ATAGCTATTCATGTCTTTTAGCTGAAATTTTCGCGATATACAAATGGTCTACTTAAAAAGATTTACCTGACCAATGAAAGACGGTGGATCCAACCGCGGT -GCTGAGGCATAGATGGGATGCAAATAGtttatctagcggggattgtgttcttagcgctggagAacggaaacttacgagtaggagccgcgatgcggcttgg -tctgcactaagagttagacttctaagttgaacagggcagcttatttatttgaaagtagggctgatacgggtagaaagacgctggtagtatacatttttta -cggctgaagaaactgtagatcagtgaggttaactggtctaccccgggctaaaaagctcatctaccgtaatacagtgatttgagactacctatgctagttT -CaggggtgacatgaactaataccctccacagggtTCGATGGAGTTACACCTCCCTCAGACACTAGCTATGGAGTCTTCGCCCGCATTGGGCAACTTTTCT 
-AGTACTGCGATACAACCGAACCAACTGATCACACTCGCCGTTCCCTAGCTCCATTCGTCTTGAATTGATTCTGGGCAATTGCATTATAGCTACAGAAGGC -CTTTTAGCGATGTTCCATAGCCAGAAGCAAATCGTAAAAACAGGACTAGGGAAAGCGAGGCTGTAGAAGCGTGAAAAAATCTTGCAAGACGTTTACTTAG -GAGTGACTCAAGGTCTCACCATAACTCATTCGACTGTAACGCTAAGGTGTGGTTCAGACCCAACTTTAGTGATGACTTAAGTACGACTTTCATGTATTGA -ATTTGGCCAGGAAGCTtgaaagctcgTAattatcaaccctgtcaatctgactctctgggcgGGATTTTCTACTCTTAGTCACTCAGGTCTAGTCTTAGAA -CATGCTCCTATTGTATTTCTGACGCGCACAAAACAAAACACATCTGGGTCTTCTGGTGAAACTTTCTGGTTAATGGACCTTCATACTACATTTTGAATAT -CAAAGAGTCAAAGTTGAAACACAGATTCTAATTTGTATATAGCCTGGGCGCAGGATTATAACTGGGCTTGCGAATTGAATTCTAAACTGTTCTTTACGTT -GGCCGCATTTCAGCCTAAATTTAGTGAGTCATGCGAGACGTTCACCAGATCTTCATACTCCTAGGAGGACTATATGGGAACCAAAAGAATTGTACGTCAG -GGTTTTATATACAGGAACTGAGAGAGAGACCTTATTTCTTAGCTAAGTTAAGCCAGAAACATGGGAGATACTATTCTTAAGTTTAGTCTCGTGACGAATG -GACTTATGACCAATCATCTGCTAGTGCGGGTTTTCCATTTAGCCATTCAGATAATTCCCCTCTCGTGCCGGGAATTAAACAATGATAACTGTAGGCTATT -TTAGACAACGGGTGTCAGCCTCTAGACTAAGGCCCTACTAATGTATGCGCTATGTTGTGCAGACTATTACCGCGGAGGGTACAATTGGAGCGAAgtggat -cccaaaatgtcgccttctgacaaacagtgttagtgccacgttggatcgtgcgagaaatacagaatctcgagtcttaccccagatcttccacctcagggtt -cttagacgtggggtcctacaatgtgtattttcagaaagTTcttccaggtggtgctaagacatgcttaagttcaagaaccacTGTCCCCAAGGGCCAGAAC -CCAGGTCCGCATATCTGATATCGGCTGGGAGGAATCGATACAATCCCGTATGGTGAGATGCACACATTTTTGTACTCCCTCGAGTTCTATGGGACATTCT -GGCTGCGCGAGCATGTACATACCAGTAAGACATCTTGGACAAAGTCACGCTTGCTCTTTTGACTCTTCTCGGTATCACCCCCATCATTAGCAGTGTATAG -CTACTCACCTGTGCGACCTCTAGGGTTCCTGCTGATAGTATTCATTAGAAAGCCACCTTTAAAAATATAAATAACCTTCCTCGGATCCGACCTCTCTCTG -CGGGGGGTCAATTTACGCCTCTCGAACCGGAGCGTTCAGCCGGAAAGGTTCTGAACTTGTTGTACAAATCCATTCTACTTCGGGATACTCGTTCATATCC -ACTGGGAAACATTTAATGGAGTTTAATATGAATTTTACATAGACTTCGCAGGTCATATGTATATGTATCATTCCAGTATTAGGTGTCATATAATACACCT -GCTGATTCTCCAGCAAATGCAGAAATATCAAAAAGCCTCGTTATAGTGCTTGAGCAAATTGGGGTATTGCCTATGCTGTAGCAATGGAGACATGGGGCAA -AAGCCACAACGGATTCCTCAGGGTCGGTTCGATAATATAGCACTTACCCAAAGCAACCGGAGTGCGTCTAGAAATTATGCTGGGACACAACAATTTTACT 
-CTCACCCATTATTTCAATAGTATCATGTGCGCCTGAAAAATTGGGCGTTTCTACGGGACCATATTAGCGCATGAGCCCTTAACGTGCTGACTCCATTGAG -TCGGGATTGGTTGTTCCGATGGCTATTCCTAAATGTTAGATTAAGGAGAACTAGTATACGTTGGTTAAGGCTGTACATGCTGAAACCATAATCCGTGACA -CCAAATTGGGAGACCGATTTTGTTCAAAGAGAGGTTCAGTTAATCGCTAAGATTTCCTATATATTCAGCAACTACATGAGTATCCATAATTTAAATAGAG -CCTTTTTCTCACTTGTTAGTAATAACGTGCCGCAAGTAATCAGAAAGCAAGAGTCGCATCCTGATTAACTGTGAATCATATAGTATCAATCTAATTAGTG -CATAATTTCTGAATATGCTCTGTTATTATCCATAATCGTAGTGCCTTATTTTATTGTACGCCTCGAGAGCGCACTTAACTTTTCGTTGGAGACTAATTTT -CacagtcaaagaggatatcgtgaggtgggcttttggagtcttatcccGacgtgatccgtttttcagctgctgtctaaggttacAcGActttaAcgcgtct -aactaaggccatttaggccctcactctccccccttagtagaatctataatatGCtcccgcgcgcagctactcagaGTATATTTAAGATACCTGAGTTGTA -ACGATGGCAACTTATTTAATGGTCCAGGATATACTGCACTAATAGCAGTAGTTCTATTGCCACAAAGAAGGGTATACTAAGTCCAAGAAGAGGTAACTAC -AAGAGCACGTGGGGAAAGCTCCCATATAGTGAGAAAAGACTGGGGGGGGTGATGTAAGTGTTAAGTTGTTCATGAGGACGTCACTGCTGTATCAGTTTGG -GAAACTGGACTTGTGACCTTTTTACTGGGTTTCCCTCAGACTTACTTTTAGTTCTAAGGACTGAAGAGCCTACCCTTCTACTATAGCAAGTGTTGTGGAA -AAAATTACCGGTTGCATGTTACATTCACAAGATGCAAGAGCTAGTAGACTACGATATTTAACTAAAGGAACAATATGGATAACTTCATAAATACTAGATT -GAGGGATCATACTGTCCTTAGTAtcccgtgatgaaatgagaacatacgacttttgttgccgGTTAGCGTtgAtactctACAACTgctcgtcgaggtaaac -aAGTTtgatcgtctcaagtccaaccggttcgatagatattgatcggatctcatcctctacagcgcTGtgaactaaaccacacaggccaggcgtcccacac -ccccttaatcgcgacacctcttaatccaccgtgggtttgctgttaaacttgtttgtcgtagacatggctcgtctaaagaaggagatttgtatcactaata -tgcaacctcctttcttttgggtgcatacccagcggtgcagttaCGATtttatgcaggcaggcaactttcagtgggCATTcacttagcgccgaggaggcta -ttcacctccttgagcattcatcatctgaattataaccgattgagttacactctctagttcgttacaagatgtatagctacgtagcccttgcctatagtcg -tcctgctatgccaacggttattcggatgtaataattgttttctggtttacttgtgtccattaactgtccctacctcccccccaggtcccgacccttcctt -ccatgtacttgttgtcatccttccaacctccacgcTCGcgctaagtttgattgcgttaatctttaagtccctttccttAATCCGATTGTCAGCTTCATGA -CAAATAAGCTGTATGGTGTTGAATGGCATTACTTTGAGTTAGATCCATCCCATAATTCTATATTTCAAAAATTGCCTGATAGGTTTCAGAATGTATGTGT 
-ATACGTAATCCACATGGTGGATCATTAAGCTCCCCttgcttattatcaccttctaAaccattcccttgcttttcccagtacatcccagctaccttttaCC -ATCATGTAAGTTCATTGTAAATGTATTGGGTCGATTAGTATGGATTCCCACAGTATTGCGAGTGATAAAACCGTTGGGTTAGCTTCGCAGACACGTTGGA -TCATAAAAGCTAAACTCAACTTTATCGTCGTAGTCTTATCTATATCGCTCCATTGGACTGCACACACTTCAATATTGTCATGTAAGGAATACCTGACATA -GGTCAGCTTAATTAACTACAACGCGCCATGCTGTAATCTGACTAGGGATCGTAATAAAGTTGTTAAAGATCTACGATCGGCTCACCCATTCCAGAAGGTA -TAATTTTAGTTTGGAGAGCCCCCAACTTGCCTATAACTAATAATGCCATCTTAGGATTTATATATCGGTACGTAGTACTGAAGCGCGATCTCCAGTTATA -AATACATTCGCGTCAAAGTGAAATTTTCTTTGGTGTTACGAAGGTTGATTCCCTCACTAAGATGGACAATTACCGagtTTCCAAGAtctgcagattatcc -taatactgctggctcaaaGGCGTCCTCAAGTCAAAATCTTCCATGCTATGCCTAGTCAGGAAAGAATATACACTAAAATAACACGTGTAACAGTAAAGCT -GTCTATGTCCCATTTTGCTTAAAGCAGGTTCCGTTAGGCTATTAGTGTAATCTGGAGGCCTCATCCTCAGTAATGTGCGGTCCGTGCGTCAGACAAGGCA -AAAGCGCACACTCGTGATGCTGGTAAGTTGGAGTCGCGCCTAATCATGTTTGACTACCTTCGGCACCTCAGCGGATCCCAGGATGGATTACCTATCAGTC -GCAGATAGAAATCAGCCCAATCTGTTGTGAAAACCCGCAGAAATGATTAGCGCTACAATGGCACAGACCCCTTATACCACGCCATACAAACTATCATGAG -AGATGCCGCTAGGACGAGAGGTCTACTAGGCTTTAAAGTAAGTTAAACTGCTCatgcactagagggtataattaatggcggtacgctggatcgtaattac -actagaggagatcaacctaacaCATGcgatccgctgggtaactcTTTTTAtgaattccctttatgaggcttaacaccaaatagtagacgtcttagaccat -tttaacagcccggtggtttagtgcagttagcctccaccactcctattacagacctgccgattcttggttcgctcattcgcgcccaatggttttttttgtc -aagtttatcaaCTTCAAAACATAACTGATCTTAGCATTTCGTTTTGCCCAAAGTAGCCATCGTGGATAAGAAACTGGAATCCCTGTCAGTGGAAGTAGGA -ACCTCTACATTTCACGAAACTCTTGGGTCAGCTGACAAAACAGATATGGGTTTATCGTGAGAAGACCCCCCCACGAAGCCATGACTAACTAGAGTACTGT -AGTGTGTATTATTTCGTTGGGATGATGCGTGGCAGAGGTGTCATCACACCACGGTGTTCTCGAGATGTGGAGTAAACTGTCTGGCCTAAACTGCGAAGTT -ACATTGGACTTCAGGTTAAAAAGTTCGTAGGAGGATGCGATAGCCGCCCTTACCGAAAGGAGGGGTTACCTTTATCAGATCATCACAATTTCTTTCTTCT -TACCACAATGCACTGATATGGAATCCGGCCTCGCAGTTTAACAGTAGTGCATAGTATATTACGTTTTTCCTCCATACCCTACTCTACAATCCCACTGTTA -AAGCTTCCTAATCAGGGTGCAGCCACATTGACGCTTTAGTCTCAATAAAGACCTTTTGGCTTCATACTCCGTTGTAGGCCCGAAAGGTTTAGCAACTAAT 
-TCTTGGAATGTGATAACATTACAGGGCTTTTGGGTAATTTTAGGTATATCACAACTAGGAGGGGTGTCTCAATATCATTTCCAGAATATCTACTTAGGAT -GATAGATGAATGCACTTCTCTATCACCGACTATATCTGAACGAAGCCTCTCAAAATTCATAATCATGATGATCGTATATTTCAATCGAGACATAGTTAGT -CGTTCTTTATAATCCTTGCAACCAATTTTCTAAACTAGCCTCCCCACGTATTAAGTTAAAATTTGGGTTGAATTTATTCCTGATAGACATCTGGTAATGC -CCAAGTTTCCACATTAAATTTCGTTATGGCGAGAAACATTATTAATCAGAACTATTTCTTGTAGAGTACAACAATAATGGCATCTGTTACGCCTTCATGG -GCTATTTCTATATAAATTATCACCATTCCCGATCAATTTGATTTAAAGCACCACTGGTAAGGAGGCATAACAATTCTGGATCTTCATGAAATAGAGATCA -CTATTCCCATCTACATGATCAAATATTCTCTCTTAAAGTGATTTGAGTTTTCTATCATGTTCTACTCGCGATTCAAGATGTCGTgcgtcccgcataccta -agcccgtactccacatcatcccggcccttttgactcGTATactaatgagtcagtgccttcgcttaggtcgtgaggcctccttgtgtaaacaagccgtTtc -agggttattagtgcctctccaaaaatagaAAAACATttccaccattcaatccggtacgctcgcaaggcaaagCTGGAATATTGCGAACTAATAGTATGTT -ACGAGGTTTTCTTATGCCAATAAAATCCGTCAAGCACGAATAGTAGCGCTGTTCTACACCCTGTATCATAGTCGGAGGACTACGCTATGTAATGATGACG -CCTTTTACTTAGTGTATAATCCGTATCAGGTTTACCGTTGGTAGACATCTCATTTCCCATCTTTAAAGGGAACAAACATTTCCCAAAGGTTCACAGCCAC -CACTGTTAGGTTGTACTTGCACCAATATGGTCCTACAGGTAGACTGCGTACTACGCTGCCTAATGTTTTAAAGATAACAGTTCTAATTAGTCACTCTGAA -CGGTTCTATACTTGGGGAACCAACAACCTCTGATTAACTTAAACTACATTCGTCAAATGATGattcagacagctggatctcgtcttctctcatagctaac -gaccctctcattttgtacatatacctgattctcctATTATagtccacttgttatctgaagagcattgagagcgctgtcaaaccgattgtcgtcaactgtg -cggcaagaagcaggtgtgtgtggaactctcttcatcatgctgtcttcctttattggggggctgtgcccaaaggtcgaatccttagtccatagaacagccc -agtttggtgtttttcgattaacccctaaactTTccactccatagtgattgtaccaccttacatggccactaatattgcgtaacagcctccttattcccac -gccctagccaccAtgctttagttgcgggttttttggacataggggatttcaaccgggatgggataaaagttcagcgtgtttctaatttacatcacataag -ggatcagtggtgttcaTATACCGTGAACCACAAACTCCGcagcccccaTatcccagtagttcGtaATTCTAGTGTTAATTACGGCGAGATAAGAAGCTTG -TTAACTGCCTCAAGTGGATCAGGGCTCATTTCCTGACGTGGCTAAAGTACCTCCCCTAAGATTTTTCCTACTCAGACAGTCCATCTCATCTACATTAAGT -ACATCTGAAGTAACTCTTAAGTTGACTCAGCACCGATCGAGTTAACAATATTCTTGTTTAAAAAATTGGATTAAAATGATAGCGATATATAACTCCGCAA 
-GCTGTGGCTGGATTTCCTATCCTTTATGAAAGATTTGCCCGAAAATTACGTTACTCACTAGAGTGTGAACAGCAAGTTATTTTGCCGAATTTTAGTCAAA -CGGAGTGCAAAGATATTCTCAAATCTCATCTAAccggggACggaaatgttagaagagagcaggtATTTCTGAACGATCGTAACAGGTTATGGGTCATAGT -TTTAAATGAACGGCACTTTGCAATTGTAGGACGTTCAATACGAGTTGCTAACAAATTACTTCAATCGAATCGGTTAGAACCTTCTGGATAACATAATGTG -CATTTCCAAGTACAGGGATGAAAAGGAACCAAAACTCACCACAAGCTCGAAAGTAGTGGAGTCACTATCGCCAGAAGGGGTCCTCGACACGCGCCTCATC -TAAAAAAGTGacctgtagtaaaagggcggccgagctggggctccatacttgggtctataaagaCGGtgtacgcgagccaATAcgttagattatccggtcg -tccaacctaagtgttgcgtgtcgctagtcgacccctgagcgttgcacgcttttaccgcagcgataaaggctttaagcgctggtaagctgaatcatgagag -gtggagtttaaaTCACGCAaggtccccgcacactgaagattccagcgaccttagcctcaggggcgaggagtgagagcgttattgaaaactacagggagtt -gaccgaggggatctaagcgatctctagggggctgggcctatccaatctctggtaagactacttttccctcttctcatacgcttggcgcccccagtttttc -taggtcatacatctaagaaaactttttacacattctccgtgcccgaaAtgacCtcgaacctgcgcttctttatggttggacatgacttagccatatgtgc -cgcccatctttgtacccagtcctctattgatcgattcctgtgtgtcttctacaTtactctgtaACACAGGATAAGAAAGATGTATACATGTGCATGCATA -CGCAACTAATCATGCCCAAGCAGTGCCTGTCAGTCATGGTTAGTACAAAATCATAACACGCTTTTCATTAAACAATTCGGATTGCACAATTAGTACAATC -AAccaggtttcgatcccatccccaatacctgaatcataggtgtaggagCGATGAAcatcgattaggcggtgagttatcaccgtaaacctACctcttgccC -Ggaacacgataagaggacattatctagcctaactcatctgtttgttatacaattcgaaattctcgagttgactgccggggagagtagatgcacataacga -tgatcgggcaagttaggatttgggtgcgggttagaaagtattagtgggtaaaagagggcgacaagacctggatgagaactactaggtgagaatcgctgtg -taatacggttgattatagctagaaggttcctagttgaatgtctaaagagagtgactgcgctgtattcggttcaGACTAGTATATATAAAACTTTGTTATG -GTGGAACTGTGAACGAATGGCCAAAGTAGACACACCTCTTGCGATTGTTTAGATGACTGGACGCAGTACAGTTCAGTTGTGCACATGTAATAGGAAACAT -CAACTCTAGTCCCTTCCTCGCTCCACGGCGAAATAAGCGCACGAACCGGAATTTAGATTCTTAACACGCTTGAGTAGTAGTGCACGAGCTTGGGCAGTTA -CGTGTTGTGGGAGACCGAAAGTAAATTTTTGATAATATGTTTAGTAACAAACTATCGACCATCTTAAGAATCTATTTGAGCCATCCTCGATCAAGTCATG -CAGATTTTGTACTTGTGACAGTTCCATACAATTCTCgttggacccaggtcacgctgcccgttctccgaagacaggcaatcacaaatacctcgggtattat 
-cagggaattttaacccaactacgctccgcaactctctaaggacatttgccgtcctctagcCACTGGTGAGGTCTGTCAAGCTAGTGATCGAAACACAAGT -ATGAAAGCAAGATGCCCCACCTTTGGTTCGGATAGGGTGGTCAGATGCTAGCCAGTTTACCAAGGGGGCACCCACCAAGTATAGGGGTATTCAGAATCCT -AATCGCACCCATTTAATCCCTACCCTTGCCTATCACGATGTGCATGGTGTTTCTCCAGTTCGCATCCGCCTCGAAATAATATCCACCTTCTTTCATACAC -TTGTCTCGAACCTGTAGCCTTCTAATTTCTCAAGGTCGCTAGTttggacccctagcccggGAGGGTCTAATGCAaatTGCCAATccctcaattgatgtaa -tcgAggGTTTATTCCCTTGGAgcgagactctaactatacctggcttgtgtctctcccagatttctagggaaccccttgggatgtgttgccggagggtagt -cacgcgcttacataatctcatggtaaaactggctcctggccatcccatcgctcctcccacgtaattaagtaattatctaagatcggtagtgtcctatact -acccctttcaccttatcaatgcgtgggtcataccggcgcaggacccatacaaaaccctgccccccccgctgaggccttgatgtgagacggatgaaataag -acgcaatagtagagattaatcagcccttcataaacaaaccgctacTCGTaagtgtttaaggatgtcgaaaaattgaatcttggacgcttcggtggggtcg -cggCcctcacatattccgaggtcatatgcttccctcagacttttaccagatgggcttaagagaaaacagccacggctcgcttgcgtaacaagtgaacttC -CCTTTATAGCGTCAGCGGAAGCTGCTAAGCCCAGTCTAATTAAGAAGCCGTACTAGTATCAGTAGCAATACACGTAAATGACAGGAAGCAACATTATCAC -AATATAACCACAATACTCTGTTGGATGACACCTCTTTTCGTGAGGGGATTCACGGCCGAACGGAACTGACTGCAGCGTAGGGAGCACCTCCATTCAGATC -CCAGACTAACAAGCTATCATTGGTTGTATCATCAATCTTCTCCCAATTATCCGATATTCTACAAGATTCGCCAGTGGTGATGGAACCTTTCTCTTTCTTT -GGTTCAACTGGCGGAGAGATAACTATCAA -> pig2 -CACACTAGTAGAGTATTCCTGAACGAAGTGATCACTTCGATGGTATGAAGGTGCTACCATTGTAAGTTGACGAAATTCTGATGTAGACTCTCCACCTCGT -CGGCCACCTAACGCGTTGCGTTTCTCAGCAATAAGGCCAATACTTACTCTCCGCTACTAAAAGCGATTTACTTGCATTAAGGCTGATGAACATGGGAAAC -TACGATGTTTGTCACAATGGAAATCGTCACTACGTATATCCTATAAGATTACCAGATAGGGAAGTTCCATTCCCGAGATGCTCATAAATAGGGTTGACCT -AAGGAACCCAAACTAGAATGAGTAATGGTCTCGACACTACCACCCCTCGTCATTAAAATATTTGCCAGCACACAGGATGAATTTTGATAAGAGCTAATAC -CATGTAAGATCCGCAATGTCATTATCATCTGTACTCTCTCTCTGAACCCTTTTCGATACAGGAAATGCCAGTTCAAAGAGAACTTCATGGACAATCGAGG -ATGTTATACAAGACAGTGGCATAAAGACATAAGTAGTGGGAGAATAGtccatacagttgacacagtgtcgctagtgtcgaagAtcggaaaagttaaacaa -ggcgctgcggtaggactgagattgcagtgtgagtcaaacttctgatttggacaaaggaattaactcagttaaagctcggactagtgcctgaagaaagact 
-ctactcgctaggatttttttgagctaagagaactacaggcctgagaggtcaagcgacttcctataggttacaaggcttattaggtatggttctagaattt -gaatccactgacttcagcaCGatagtcggtatgggctaaaatcgcttgcaagacCTGATAAGGCTCTGCGCGCTCAAAACATTGGCCAAGAGATCTTCGT -TGATATCTGATCATTCCTATCCTACCGTGCTGCGGCGAGGATGATCGAATGTACCTGCTAGTTGTAAACTCTGTCGTCCTTGAGACGACTTTGAGCAACT -CCGCCACAACGAAGAAGGACCTGTGAGCGGTGTTGCATAACCGGTGGCAAGTTGTAATATCAGACCTACGGAAAGTTAAGTCAGTGGAGCGTCAATAGGT -CTCAGAACATGACTATTTTATTAGTATCCAAAGCTTCAAAGTTTCCTGCCTGATGGTTGATTTAGTGTATGAATTAGATACGGCTTTAGCGGAGACCGAG -ATATAAATTCCATGTACGGGGCTCAATCGGGACGTTtgagagttccGAactctcgcttcatttagttcgatattttgggtaCTCAATTCTAACCCTCGTC -CCTTAGGTCGAGTCTCAGAGCGTTTTTTAGTTGTGTTTCACTGACTGATCTGAAAAGGCACATCGGCCTCTTTTCGTGGAACGTTTTGGTGAATAAACCC -CAGTATTAGATAGCGAACACGTCGGCCCCGACGTCGAAGTCCGGGCGAGAATCTCTTCAGCCGGTGAGTACGAGCATCTAGATGGGCTTGCGAATCAAAC -CTTATGCTATTTCGTACGTTGACACTGTCTCTGCTCAAGTCTAGAAAGTTTCGGAAGGAATTCACCAGTCTTGTAAATGAACGAGAGGGCTACCCGCGAG -CTGAGACATTGTCACATTGGGATTGTACATGTAGGAAATGAGAGAGGAGCCACGTCCCTCTGCTAAGCTACGTTAGCAGCATGGATGGGCACCTTCTTAA -CTTTGGTCTTATGATGAGCGACTTTATTACGCATCTTCGGTCTGCCCGGTTCGTTCATTCAGTCATTCAGATGATAGTCTGCCTGTGTTGGCACACAAAC -TTTGGCAATTCTAGCATATTCCAGACAACTCATGTTTACCTCAGCCTCAAGAATTAACTAACGTCTGCGTCGTGCACTGCGAACTGCTACCATAAAGGCT -AGAGGTGATTGTTAagagatctcagaacgtaggttcttaatgaatggcagtaatatggcttaatagttcgcccaccgtgcgaattcccggattctgcccc -ctacacaccgaacacaagactctgggagtggagccgtgaaatgcatgtttttagaaatCTaccgcgggggaagctaatacattcctagctgtaacagtca -tTGTTCTAGAGGACTAGAACTTGGCTCCACACATGTGGTGCCGGCTAGGAGGGACCGATTAATTCTCAATTTATGCGTGATATTCATTTCAATACTCACT -TTAATTTTATAGAATATCTTAACTATACTCGTGTGTACATGCATTTGCGCTACCTTCGAAAAAACCACATGTGCTTCTCTGGCTTCTGTGAGTTTCACCA -CCATTACTTACAGAGAATAACCACTCGCTCGGATAACCACTAGCAATTCCGATGGCAGATTACTTTAGAAAGCCAACTTCTAAGGCATAGGTGATTTTCT -AGAAATCCAACCTCTCTCTGTGTGAGTTCCGAGCAGAGCCTTTTAACTCACCGACTGATCTCGAAAGGTTTTAAGTTTATTGTGTAGATCCGCTATACTC -CAAGGCTCTAGTCCATATCCGTTAGCAACCTTCTGATGAATTTAAAAACGGACGTTGTCCAGGCTCGCCGAGTAAACCCTGTGCCCGTTATCCCATTATA 
-AGGTGTGCTATAACACACGGTCTCGGTTCCTAGCAAACCCAAAAATATCAAAAGATTTTGTCATAATGCTTAGATAATTTGGAGTAATGTCTGCCCTATA -GCAGTTAAGACTTGGGGCGAGTCCTGCAGTGGGTTCAGCGGGGACAACCTAATGCAGAAATGCCTACTTCAAGCAACCATTCCGCGTTTGGAGTTTGTGA -AGCTGCGCCAAAATCCCATGACTCTGCAGTATGTGAATATGATCATGAGAGGCTCTAAGCTTAGGCGTATAGGCGTGATTTCATGAACGCAAGAACCCAT -AACCCGATGGTCCCAGTGAGTCAAAAATGATATTTGCCATTCCTCTTGTCGGATGTTAAAATGAAGACAACTGGCATAAGTCGGTTAAGGCTCTAAATGC -CCTAACCATCATCCATGACGTCAGATCAGGACACCGCTTTCGGTCAAAAATTGTTTCTAGTATCTAGAATAACTTCCTATATCTCAGGCAACTACACGAG -TTTTCACAACTTGGATAGGGTTCCTTTCAAACTTATCAGGGGAACCATGATGCAGGTGACCAATAAGTGAATGTGATACCCTTATCGACTGTAAACCATG -TAGTTTGTGCCCGCTCAGCGCTTAATCTTTAAAGATGCTGCATTGCCGGCGACAATCGGAGTACTCTATTGCGCAATGCATACTAAGGGTGTATACGACG -TCTGAGAAAAAAATGACCTTCaaaggcagagtgattactatatggtggttacccttgttatcatacgAacatgaccttcttctcagctgcgctcccaggt -catAcACtttcaAtatgccggactagatcggtttagtgtattaccctttacgataagcgcagtccgacacatACcctcagcggtcacgcactgtaTCGTA -TCTACGAAATCTGGGGTACAACAACATTGGCTCGGACAATGGTCTAAGACTCAGCGCGTAAATGGGTTTAGCTCTACACACTCAAGCAATAACGTAATGA -TTTCAAAAGCGGATATCTGTAAAAGCAAATGAAAAAAGCTTACATATAATACAAAAAAATTAGGGAAGATGTCGAAAATGCCAAATGGTCCATAACGCCT -CCAGTGCCACATTAGTTCAGTAAGCTCAACATTTCTGCTAAATAGAGAGTCCCTCGCAGCCCTACCGGTAGTCATAAGGGCGCAAGAGATCTCCCCTTCA -CTATCGCGAATATTATAACAAATACTCCCAGTGATATTCTCTATTCACAGTAGGTTAGTCTGATCAGCTCCCACCATTTCATTACATCGACAAATAAGGA -GACCCAATAAATGCAGGCATGAGGAGTTACACTCCATCTGGCGtccaataattaattgggagcatatacgattagccctttTCCACGGTtgAtatcctGC -AGTTgttcctttccttacacaAGCTtagtggtcttaggttccgcaggtgttgttgtagcttactgaatttcatcttgcctagagtCAtgagctgagcgac -gcaggtccggtgtaccacgctctctctgtagatactcctattggtgaaccataagcttgcccccaaattttcgcgtcgtatatacacttcttctaaagaa -atggagatacctcttggatatgccatttgttatattttgagtatttgctctcccgagtaatctTGGTatcacacgggcccgcagttcgaactgagCATTg -acactatgccggagagggtgtccctcccctcaaccatttattctctgagtttcgaataatccaggtagattttttaattctctctagaatgcatcattta -gctgttattgactattggcatctcctaatgctatcaagtactagggtttcttagttcctgcctacctttattgtcctgattgaccatccccacctcgccc 
-ccgaaccccgacttccctttccatcagcgagtactcattcttctatagtctgtgcTCAttataggttggacagttacaactttaagatctcactcaagGT -CCCGCTTAACAGTTTCGTGACCTCTGAGCTGCATTGTGTTAAGCACTAATTCCCGGAGTCGGGTTTACCGCGTCATCCTACGTCTCAGAAATGGTTAAGT -CGGCTCCAGAATCTATGTATGTGCACAATCCACGTGTTCAAGCTTTGGGCTTCTCctgcctactgtcctcctctaAattacctccccgctactcctcata -caccctagctatccccaaTACCGACGTCACTTCATTGTGGACGCATCTGGTAAATGAGGAGAGATCTCAGCGACTATAAGAATCGTAAGAATGTTTGGCA -ACTTTTGCAACTCTATTCCATCATAAAACCATAGCTCAGCTCTATCGTTGGAACATTATACCTAATCGTGCATTGGACTGAACATACGCCAGTTGCGTGG -CACGAGGAACGTATAAACAGGATTCGCGCAAGTTAGTGCAGTGGGCCATGATAAAATCTGATCAGATGTAACAATGAAGTCATGGGATATCTTTAATTAA -CGCGCCCATTCGAGAAGGTACAATTTCAGTCCAGTATGTCGACCATTTATTGATAATTATCGATGTCGCCGTAAGGCTAATATGTTGGCAGATACTTCTA -AAGCACCATCTTCAGCTATAAGTGCGTTCACAACGGCATGATGGATACCTTGGAGTTGTAAAAACGAGTACCTTTACCAATAGAGACAATCACCGgacCT -CGGGTAtctccagaatgctctcgtccggccgacccaagAGTCTTCTTAAAGCAAAAATCCCCGTACTTCACCCAGCAAAAAATAAATATAAGCTCAAAGA -TGGCGTGTAGCAGAAGCGCCGTCGATGTACTATCATTTTCAAAGCTTACATCACTAGGATATAATTCCAATGCATTGGGATCCTTTTTGGCGGTTTGCAT -TACTTGCGCCATACAAGGAAGAAAAATACGCAGCTGATACTGGGAATTTATAGCCGAGCTCGACCAGATGTAGCTACCCTTGTCAATACAATGAATTCGG -TAATATATCGCTAACCATTCGTAGGCGACAACCTACCCCTTTCCCTGTAGACCTTTATAAAACTATGCACAACTGCAATGGAAAATATCTTTTACACACT -GCGTTATGAACTATCCGGAGATACGCGCGTCCAACGAAATACCTAGTAAGTGGCGAGTTTAATTTAACTGGTCatgccctggaggatctaggtactagcc -atattggggctagaagatgtacagatggaggccaccccaacgTAACtgaagtggcgaaacattaTTCTTGtgaatcctattcgtggagcttgacaaaaga -tggacgatagcttggccctccctggcactgctgtgatttattataacaacacatcgctatccgtgttgtgatcccacgaagccttgaatggtttttgccg -cccgagtgaccttttccgccgatcgtatcagTCTTATAATGAACAAAACCTCAATGGTTAACCCTTCCTGAAGTGGTCGCTATGAATAAGAGACTGTCGG -GCCTCTCAGCGAAGGCGGGATACTCTGTTGTTTACGAAATCCTTGGGTTCTTCAATATTATAATTGTGAGTCAGTTATGAGAGCTCTCATTCACACAGAA -GCAATTAAGTGAAATGGTGCGGAGAACCTGAATTCTCGGGAAAGATGCCTGACTGACATGTCTCCCTATCGTGATGCTATTGGAGTGTGTAATAAACCTC -GCACCTAGTAGTAGGCGACGATACCGGATTTTAGGCCTCGCATTTCATGATGAGAAGCGACAAATCCTCACTCTTAAAGAGAGGATCACCTCTATGAGGT 
-CATCTCTGGTTCCCTCTTCTTTTAATAACGAGCTGCAATCAATTTTAATTATGGAGCTTATCGATGGGCCACTGTGTACCAAGCTCCTTTCCGGAGTTGT -GTATCGCAAATGCACAATTGAATCACTCTCATCTTAGTGTGATTATACCGCCGCGAGGTCCTTATTAAAGACGTCCGGGCTATCCACTTCTTTATATAAA -GAAGAAGTTTAACAGTAAATCCTTGAGGTGTAATAGCATTACCGAGGCTCCAGGTAAGTTGCGGTGCGTGCCGGCCAGGGAGCGGATCTCGATACTACCT -ACGGGATGTATGCCGGGAGTGGAATCTGACTCTACGTCTTAGTACTGGACCTAACCTAAATAAAGTTCCACAAAATATGTAATCCTGGTATTAGCGTATC -TCAGCTTTCACGTGGTTAGTTAACTTCTCGAATATTTGCAGCCAATTTGTCAATTCGGTCCCTCCATATATTGTTTCGTAGTTTGAGATGAACTTATTCT -AAACATTCATGCAGACATGCCTAGCCACCTACGCTAGGCCTTGTCTGAGTGCAATGCGACGATCATCAGGGATATGCGTTGTAGGCCTCATCGTTCATGA -CATGTGTTAAGCTTCCATCAGTTTTCTCTAAGTTGGTTATCATAATGAGCAGTCGGCTTAATTCCAAACAACGATCATGGAGGAACAGCAAACTCCCGAT -CAATCGTAAACTACAGCTAAATGTTCCCATTCACAGGATAACTTGCTTTCTCTGAGGGTGATTCAAGTTTGATATTTTATACCGTTTATAATTTGAAATG -TTCTtagcccaacatacgtgagttcatctcacgcttcctttcgtctaccctggctgGTTCactaccagggtggtgtgggattctattatgtggcgctccc -tcatttaacccagccatTttagggtcgttagtacctgtcggcaagtagcAAAATAActccacgacttagcttggtacttgtccgaagtagggCCAGAGAA -ATATGAACTTGTTACATACTACAAGAATTTACTAATTTAATGTAACTCATCTAGAAATAATGCTAGCGCTACGCTCTATCATGTAGAATAAATGGACAAC -TATTCTATGTAGCGCTGGTATTATATGGCTAAATTAGAATTCGTACAGGGTGTACTATGGGACCTTATCCCATCTCCTAGCCCTTTAAAGAGAATACACT -ACTTAGAAACTCTCCACTGCCATCCTTGTGTGGAGTATGAACCAGCATGATTCGATACGTAGACTGTTAAATATATTGCATAATTCTACGGAAGAAGCTG -TCTTAATAAAGTACGCCCAATGGTTTTTTCTGTACCTAACCGCTTACCCCTAGTCAATTTGGTCTGCATCCCTCAGACATTAacacaaatggctgggcct -caacctttcccatgactaaagagtcttccatttagtacgtatactgcatactcttGCTCTaatgtaatcattcgtgtgtggacacttaaagtacccccta -atatattattgtcatctacgcggcaaaaaacaagcgagtgtggaagtctccttatcatactgactttccttctcgcgagtttttacctagccgtaaaaac -ttagattcgtaggatggctcaacctcttgtttttcaacgcctccctaaataTGtcactccataatgattgtgtcgtgtagcatttccaccaatgctgcat -aacagtttcctcaactgcatactctgtctcccGaaccctgatcggagattttctaaacggaggagtttcaggccggaatgggataacagcacattgcatt -tctaacttgtattgtctggagagtgggcgacattggTATGCCGTTACCCATAAGAATTGcaagatccgTgccctcgtatttcAttACTTTCAGGTTACTT 
-AGAGTGGGATAAGGAGTGTGTTATCTAACTGAAAGGTATTGGCATAAACGATTCTCTGTATTCATAGGGCTTACCCCTAGGTTTATCCTTTCTCGATAGC -TTATCACGTTCTCTTAGAAAACATGGGAGTTGATTCCTAAGCTGAGTAAATAGGATCTCAGCAAACCCGATCCTTGCTAACAAGTCAGAAATGGAAGGCT -ACAGATATACAAGTCTGTAACAACTTTTCTCCTATAGATTTGCGGAAGTTCAGGCTCGGCGCATTGGGGTATAAGCTCTCTTAAGATACCTGTTTTATTT -CCCgttatacatcgtctcagaaccagagggttcgcggtccacctcgacgcccattgttactccatcgaaacgatcgcTGAGGcatccatctcagTTGAGT -ctggcaaaccgcgtaagccgttttattgattaatagcacccttcaataggcgaggttgccggaagcaggatccagcgaagaggtagatttccccgggcca -cccgcgtcttgAGAACGTCCCCAGAGTCTGCCATCAAGCAGAGAAGGTGCCCCCCGTTCCTGTACTATATACGGATTCGTGCTGATTTCGCGCAATTTAC -CATCCGGTTTGAACGTAAAAATCGAATTGGCCCTTAACCAATCCCAGTTATTTATGTAAAGTCCTGACAGTACGACTATGGGAAagGgggtatgcaccga -ccacatgcggtttgggggacagcatcttcggaaaaaatcacttacagtttagcgcgcacgatagtaactgatggatcctgtcgggATAGCACATTACGTa -ttctacaatatgagctggaagataggtaacaaacggacaaattaacaagcgtgcagtctccaacgcacaaccaccttctcttcgctttcctggtctagtt -ttttcttttgttgagctgcttaccctcaatacaactctgtgctcactttcctagctcgcctttgatctgattcgttcaataagtcaagccaggaggccgt -gggatacggaaagccctttcccatgagcactgctgtagtttaagccgcaattattcacataggctgctatggatgcaaaacgtaaagagctcactgtaac -agtcgatgatagggatttccgaagctaccgtcactcattattggacaatacacctaactccctgaccatctttttgatcggcgaagggttgaagaggggg -gcacGGCGCCATTCAGTAAGGTGATGCAGCTCATTGCAAGGAGAGCGCTTCTAGAACACTGCTCTAATGCGCCGCGAACGTACTGATCTAACGGCGAGGT -GtatacaggtatagaagttaaatcGTGagcacaggttcagaataggggacccgtgcctttatgcgcttaattttctatttacggacaatccacaaacaat -agctatcccCgtattaggatgattgatgaaagtgtttcgctatcaggaggaagtccttaatcatgctgtttcatccgttttgttcacggtaggcatgtct -ccctgcgcctcgagcccctactcttttgcgcgacatttggtaggtcttgaaacggttcctcaaaaaattaaagttatcatttgacccactacttccactg -ctaggttcacacccaaaagaaacgggaagtgtttataagcgagatacctgccctcatatgcttgttgcagcacagtttactacggtttgagcttagaact -aacttagatatccaacaaaagatggatagatagaggatctctggtaagtatacacaatgaaatgctattctgccataaaagaacgaaacctagctctttg -tgacaacctggatgggggtaaatattatcatgtcaagtaggataaacacagtacttaaagcaatcattatactccctcatttatccgtggggtctaaaaa 
-ttaaaacaattgagcttgtggacatggagagtggaaaaattgtgacagcggtcgcagttggggccaagtatattataagtaatggggtaacagtcccctg -aagcatttgcctcctgagttacgaaagatccgattacaccggatcagacgtgttagaatgtgcgaagaagttactaatcaccacagttagccagctgtac -tggttggttataggttttactcatctttacctgtttattttatatccatttaccactcccactacccccccgcacgccgactaccctgcccggctttcgc -gaatggtcttgctatccactatttctatggatcagatcctttgatttttaggccgcacaaacaggtatggatgcgtgaagtaaccctttctgtaaccggc -ccatttcacttaccataatggtttccggtttaatttaacttgctgtgaacgaccggctcgagttgtctttcccagccaaatagtatcccatcctgcacac -ctagcgtattttctatgtcgattcgtttattggcggatacctgggctgctctcaaacccgagtgagtacatgCAAtggtataggagcgaacatagtagtg -cagatatctctctaaaggactagtttcctctttcggagatatatatcccccagtgggacaacttgatcagatagcaactcaacctttagtttcgtaggaa -gtatatagaccgtactccgtggtggttgtactagttttccttcccaccagcaacggacaggggttcccttttcatgatattttcagcaatatttggtatt -gcctaccttatcggtacaagccctgtaaaacggagtgatatcatctctcattgcagttgtgactcctccgtttaacataaccaatgatgttgagcacctc -tttacgagcctatttgttttctgaacgtcttttttcgtgagatgcatattctaaacttttgcccactttcccgtaggatgattcccctttctcctatagc -gctagccgggctgaaaagcgtggcgggagttcaggggtatctgaaaatggataataggtacaagcggcataatagaggacaagccgaatcgactatctgg -aagtataacagactgactagcatgacactcgcgtgctagtagatcttaagatagctgaaggtgtgaaattgtattattgtgaactcaggggaaaattatt -ggaggaaaaagatatcccactatctatgatgtgctttgttcactttgcatgcctatataaacgcgcgtagcgtaccctataaagatatgcacgtccggcg -taccccttatcctgtatttgcACGgcgagtttacttgtattaattgtcaaaccctaccgataagcgagaagggacgatgttagcctttcttcgcgttgtt -cttttgctgggcgtacagatctctagtcacatgcatgtcgctacgaaggcttgtatctcattccctactaaagctaaacagcattccCGAGTCGATGACG -GTTATGATGACAGGCCATTAAGACGGTAACTGTATACTTTAGGGGATAGGCCTCAGGCCGCCTCATATAACCTTCAATACTTAAACAAGGTTCATACAAT -TGATTCTAAAAGTACTCATGGCCAGTTTTTAGAATGTTTTCTCCTAATATAGTCTACCGCTATACGACGGAGGAATGATGCCTCACAACTTGGTTACATT -TCTGCTCCAGGGTGCGTTTTCTAAGGTTTGCTCGGTCTAGATTATTGAAGCGAAAGCTTTTAGATGATGTTACGTGATATATTCTCTCCATTTAGAGCTA -AAAAAATTCCATTGCTTTATCTTATGTTCAAACATTGATGGATAGATCCATTTTTGTCCATGTTCTTATTCCTGTCTGAAATTCGGTGCGCGACTTACGG 
-CAGGACCTCACGGTTAAAAACTTTTTCTAGCATATCCTGACACAGGCGTTTTCCATAGCCCCACTAGACCCCAGATCTCGgaatgtctgaggtaattata -tctcaacgtgcGAttgtggacACGtagaaaataaatccatgtagctaatctgaacagcaaaacagtcataaaccactataatgtgcctgacatcattagg -caacaaaaggccaccagtcgactcttagataagtcgtaaccacataccgaaagcaacagatcTacatccactaaagaagaagtcgcaaaggccgacggtc -gcttaagggaaagcggaAcctttacgcagtgatgttgagaacgtagacctactgtcaacacggaggacagttctgagggacggtattgtgctaaatactg -gtccaacatgtttgcccttaatcctactggcgtatatatactcaaagtaaagtcgttgaattcctaagtattaagagcattttctgcgactggtagggtc -tgggagctactttactggtcatcgacagatagacggggaatcgatatgttgcatatacgacgaagctcttctcacccataagaaCagaagcggggccatt -aatactactgcgaacgcgactgaggactattacgttgagtagaaagatacatggatccaaacgcggacacatatactcttacttagttaaaaaagctaga -agccagatcattactcaatAgactgtcagagggtaaggattgtcacttagggctgagaggatagcggcaggtcgggggagcgtgattaatgagtgcaaga -tagttaggccagaatggctaagagctactgcacaaacaatcggagtgactataacaaataagagccaaactgtaatttctagaatttttcgaagagcgga -actagggagtttacattttaatgggtaagcgcgggtagggacggataccccattatcgacggttcccccatcttacactgtgtgtgtatgttaagaccct -tcccctgcTCATAAACATAATTAACTCCACTTATAGGACGGTTATAAAACGGATTAGGGTTATCGGGTTTGAAAATCTAAACTTTCATGCTTTTTGACTT -TTAGCATGATTTAAACAGCGCAATGGAAATATCACTAAACTACACTTAGACAGCTTGCTAATCTGTTCCAAAATAAGATACTGTCCAGGTACCGCGAGTA -ATACCTCTGATACAGGGCGATTTCTAGTGAGCACAGAATCAGTGCACTGAGAGGGgaggttatcggccgtttcatccagccgattaagtattgggtcatt -AGAAAcccataaccaatgcgatcaaggtgcaaaagggttaagtgcattctgctgttgtggttaagaAtatttgtagcgGAggcaaccgttggcggtttgt -tgagtAttctaaatctagctgaaagCaccgGgcaagttttatacgacgattgacttattgggcttagcaggaatttcgtaccatttaaataaatgccagg -gggggcatttctctccattgtgacacataagtaggccggacacgataggacaaacatcacaAtctctttctccctatcaggaaccggtgatgggaacctc -tggtctcgcaggtaTtagctgtcaaatactgggaaTGAAAACCttatacgatggggagagttcgcttaagaaGACGTGCATAATagacggggggactata -taattttttgaaaagactaggaaaggatgggggGGCGaATGTACAGGCAACATCAGCTTCACTATCTCCGGTCTCAATGGATAAAATTAACGCGGGGCAA -GGTGCAGTGATACGTAGAATAAATACGTTATTTCTACGCGACTTAATTGCCGAAGGAGCATTTCCAATTGTCCCAGGAGGAAGGCAGGCCCCAGCAAACC 
-CCATTAGTTGTTATAGCATGACTACTACGAGTGGAGGAATACAATAATACCTCGCCCGTGATTGCTTTAGGGGTGTAAAAATAGTGTGGAGTTTAAGTAC -ATGCAGCAGTCTGAATGTAATGAGCAGATGTCAGATGTAGCATGGTGAcgggcgggggactccagaatgacatgaataacggctgtttgatcaccatacc -cttagcgctTaacctcattcatgggtacctaggacgcgcgctgtaaacgcttgttggaCGAATAAGCCACTTCCGCATAAACCTTTGGTTGTTCAGATTC -CGGACGCCTGGACAATATCATTGAAACCATATAGATTATGACTGTCCATACGGACGTAATAGCAACTCTACAATGGTACGACTGGTACCCACTCTTGAGT -TTGATATGATAGGTGTGATTGTATTGGTCGTGTGTCCCAAGCTTATGTCTGCCGGCTTTTTGCAGTTAAGATTTCTGTACTGGCACATCGCTCATATTCA -ACAGTCTATTTATCGCCGGGCTAGCAATAGATGAGACTGTGTTTTAATTCGGAGCGATGCTCGAGTTCCTAGGATCCCGACCCGCCTACCAAAGCCGATC -CTTCAGGTAAAAACAGGCAAAAAATGTAAATTCATTGGAGGTGGGTGCACCAACGGGAGACAGCAACAACGTAGCCTTTTATCCTACAGAAGGGGATATG -TTAACCGCGGCTCCTACTGAACGCCCTGTATCCTTAGATCAAAAGTGGTGCCCAACTGCTTTCAAGGTAGTTGACTTCACATGTTGGATAggttcgaaac -GCTATtcgcctttctagccacttggcacagtaGAAGCACGTTGAATTCAGATAAGAATGACGGGCGGCTATAAAGAAGTGCCCATGGGCGACTTAAAATG -CACCTATATCTATTTCTTATTACTCACACGGCTTAGTTGAGGCTGATCTTATATCGTTGAAAATAGGGTCTCCACTCAACTATTTCTATGCATCTCTACC -TATCAATACCACTGTCAAACGTCAAGAGGGAACTCGGGAGTGCCGTCCCCGTAATAAGTCCGAATTGAACTAACTAAGCTAAGAGATACTATGCATGCAC -GAGAATATAGGTTATAGTGAAGCCGTCTAATCACGTGCGAACCTAGCAGTAGATATTATCGTTCCCACTTTTTCCGTTCTCCAACTACATCTAACAGTAG -AGTTCAAACGGCTTTCAAGCGTATCCTACCAGGTAACCATTATGGAACTAGGCAGGTTGACCAGAACTTCGAACTGCCGGTAAATAATATCTACCAGAAT -ATTGCACCGAGTTAGGTCTTGTGTGTATCATCAGTCTCCAGTATTCGTCAGTATACGGGATGGGGTTCTCTGGCAAAAACATATGAATTGGTAATCCATC -GATTTCAATATCCAAGTCCTTTTGCCTTACTAACGGCGAAAACAAATTAAGATTTTGTGCATTTGCGGATAATATCCCTTTTAGTTTTATTACGCGGTTC -ACAgaagaCCCTATTAgttaaggtaGGCTACAGAGACAGACTACTCGATCCCATACTAATTTCTTTAGTGTTGCGAGTACATACGTGTTTAGTATGTGAT -TTATGTACTTTTTTCTTGGTGAAGTAACTAGCCTTCCTTGCATAAGTTATTCTTAAACTCACGTGAAGTGCGCGATATATACtctgagtacaaggtagtt -gtgtgtgtcaaagagctccatggtatgtctctataagtggctatggtcagtgaatccaaaTaaaggtgataacaggttactgagatctaggctagtccac -attagggccatcgtagaaagtgacatctttactgtaaagtaaatctaagatgacattttagtgggaatatgttatagatatttctgcattgccggactat 
-ctagccgaaagggatgatctacggttacgtctaaggcatggtaaggtactaaatcccttttgctccaaacaccaaatgggtcaggtatgttgatgtggca -cagatcccatctgcattcatttgATAGCGtttgagggcacttagactcttgttaaccttatttatcatggaaaggctaaaaggggaaatacatctaagct -catcccttgaaggcgcacgttgctttattcttaggtgcctgaGTAATATGCCTAGCGCCCTACCTCACATGAAATCGTTTGATGGTCCGCCTGGCTAGTA -TGGCCAAGCCGCGGACAAATGATTGCTTAGGATACTAGGAATTCAGTGCGTACACTGGGCATTGTTGACACTCCTACATCTCACATTCCTCATCTACTTG -GCTTGgcgatcggagatttcattattctctacTGTacgggacctacccctgaggcgtacctggctgcttgagcgccacttcgcccagaccattgaggaag -cgtactcagagacaggtgtgaagccattattccACTGGGCCCatcagaggacgtaggggctaatgattacagaaccaaaagccccagtgccagagcgctt -cagggtactcacgtaacttcccaacaacacaaggagggtagtcttttacctagttaggtagtctatcctgttacaaggaacatactgcgagaaggaacga -tcttcccaccacgcctctttttctgagttcctatgtgctcccgctcgctcaaaccttttgctgacgccccggcagatcggattcccttcttgagggatcc -ctgactaagaccacccttaactcacgatagaaagccagactctatctagtaagcatgctcgggcttgaggaactgaaattctatccgataacacatgttc -acaagccttcctcaaactgtaaagattgAtgtcagcagcaaCAAggtcggtgggtaggaggtcgccagtatgcccctctgtaatacctcatgggtgtatg -actgcactggagggcctctttcccccgcctctgagttaaagaagaacaaggagactgctaaacacattagccttgatatgaagaacaaaacaacTTGTTG -ACGGAAACCTAGACGTAGCTATGGTCGCCACATTAACGAGACTGTGCAACTGAAATAGGGAAttggctcacccgctccctaactgtgtgtacatggttag -tgcaccaaCAAAAACcatggataaagctgtggatcattatagtaagtgaTTacttcgccCAaaagaagaaaggagggcattacttagtatcacgctcctg -tgtaattcaagctccaagacgattgaatttatcgccaacgagaacagaagcacatagccgttcgcgggaaagcccaggtctgcgggagaggttaaaaggg -ttagcgggtgaaaaaacgtgataaaacgcgaatgaaatttattactgcagagcaacggcgtaccatagatgtctgcggctgcaagggacttcgtaagatt -ttccaggtgcattactatattgtctctagttcaGAGGAGTTTATAAATAGCTTCGTTGAGTAGAAACTGTGAGCACTTAAGCGAAGTAGACATGTCCTTT -GCTATAGTTTCGGAAATTGCCATTTGCACAATCTAGTGACGCACATGTAATGCAAAGAGTCAACTTTAGTTTCCTCCCCTTCCCTCTGCCTAATAGATAT -AAAATCTAAAGATCACGCCTGTAACAGTTCAGAGTGGTCCTATGACGCCTTTGAGGGCTCTATTTCATCTAGCACACGTATTGAAAGTCCACGAACCTGT -CTCATAAAAGATTGTCGCCTGGATTGGGAGCTTTTCGAATCCCTCCGTTAACGAGAGATATATACGATCTCTCGGTTGGACTTTCGTGCAGTTCCCaccg 
-gcaccggacgcgaccacttctatcccaacaaaaggcaataaccaatatttaacatgctatcaaggattattaacttaggtaaacgatgccagttaccatg -gaaatttgctcttttggagaTCTTAGTAAGACTCGATAGATTGGTAAGTAAAGAACGATGGTGCATTTAGTGTGTTCTATCTTTGACTCAAATGGGACAT -CGGATTGCTTCACAATTCACTAGGGGTAGACTCATCACTAATGTGGATAGTTGTAGTTCTAATTACGCTTATTTGACCCGCTCCCCTGACTATTATGCTA -TATACGGTGGTTTTCCGCGGCAGAGCTATCTTCTAATGACATCCACCGCTTCCCATGCATTGGAATCAAAGCTACCGACTTCTAGTCCCCGGAACGTGAT -GATctgggatcctaacccagACAAGCCCCACACAagtCGAGAAAttatgggtcagtatgtgcgAatACGCATTATTTTGGAatacacgcttggccgcact -tagagtgtggttctcccaccttccgggagtacttttagggaagcattgtcggatgctagctagtctttttagtcgtttggcggggacactaaccccacac -tacctcctttagactgcttcatctttgagggactacctccgacagaaaatatcctgcactgctcaccccaacccgtcaacgtattgattgcagcgggggg -gaacccacacgcgattcctttcatctgagaagaagtttgaagtcagcgagattaagtcgaagccagagagactaactaaacgatcctccctatacggggc -cccacTTGTgagggtttgaggagttgtaatcttcgaagtatgaatcctgtagtggggtcgaacCctccgtacgttcccaaaccctatgattcccgtgggc -ctttcacgaaggggcttgaacggaaggagtcacatccctctggcatgccatgtaaactcCTCCTTAAAGTATTAGCGTTTGCAATTAAACGTTGGCTAAT -CTAAATTACATACGGATTCTAGCGGTAATGCCCTTATATAATAGGAACCAGTGTCGACACTATATAACTAATGTGCTCTGCTCCATGATATCTTTTACTG -TAACGGTGTTACCTGACTACGGATCCTGATCGTGGCCTAGCGAATAGCCCCGATAAGATTGGTAATCAACAAGCTGTCATTTATTAAGATGTCAATCTTA -TCCTTATCATCCGAGATAGTGCACCATTCGCTTGTAAGAATAGAGTCCTTTTCCTCCTTGTATTCCGCTATTGATACCGTTTAACACAGACTACTGACTT -CGAACTACTCTTAGATTttgaacccaagcccaggaagttctatagcagattagcactcaaattccagggtggagtaatctaaggagatgaatctttgtca -atagagagctaagcggcgcgctttcatgtgagagggcgtacgtgatcatataaagataagactgctCCTCAAGCAAATTtagctGATTgCAaggtacaac -cctaaacagtctggcccgtttaactgagccTaaatcagagaggggcttgactaatactggccgcgctgcggaggaaaagaaactctgcgcaccgttTtta -ggtctacagttcacacagggactatgtagtcgaggtgatactttcctcaaatacctaaaacctgagccagcccgtgacttggctcccagtttggatacat -tgtctcagtTGCaggataggtcagtttTgtgaaagaagcttaggcattaagccacctcaacctcttataaagacggtgtataagaTCACAAAGATATGCg -cgaactaagtgtatcgatagacataagaggaaagagaataggcatattaacaagacCGggaggagtactcgcccgttgaagcgcccAAGGAATGAGACTA 
-AGGAAATTAAATTCAGCGTACTGTATAAGGTCAGTCTCGCTTGGATCCAACTGGCTACAGATATGTTTAGTGcaccttgtgggtacccagcgactgctca -ttagttggtgtatttacaccgctgtatatttaccctacgctggggtacccagtttctgagagagttctgcactcccacgtttattgaatcaaaggtattt -tcagggtcgagaaacgctctaaatcccatatgacagattaacgagtaaacaatacttggtacatgtagacaatggaccgcactttagtcataggaagggc -tgaagctcgaccaaaagtatatgtgtagaactggatattattttatcaggtgaaacaagttaggcactcaacttagaagaccaaacggttctgctgattt -gtgtgatcgaaaggtagagactaaagtgtataaagaaaggggaatggttACgttcccacctttgaaaaaacgagcccaGTcaggtgggcatggagtaatg -aggaaagcagaaagggaattcggagtcggaggcacatctcccacagttaacagctcaggggcccagggatgcagttgttctctctgtgtttttaaagaac -tctgaactactgcgccagtgtataaaccccgatgccattgtctatcttgcttcccccggcatactgattattaagtcgaattccctgtgtgtatgactac -tcgatgattgggtctttgaggtcccaggggattggagtggcacccgggattgccttggcttattcaaattttggccagcaaaataaataagccagcctcc -accaaactggagtagtcgttcgtgggattagagttgtcgatagttaaaggtcataaaagaggtatgcgggcaagattgaaatttattttgctgctatatt -agggaacctttacttgacattgtctcaattatgcggtcaactcgagccaagaTGGGTCTTGTTGCACTGGTCCATGGGACGGGGGGTCTCAAGCTGATAA -ATGTTCCGGTGAGTCCACCTGCACTTAATTCAATACAAGAGAGCTCTATAGGTTGTACAGATCTCCCCTTCATCGCCCATGTTTGTTCGTGCCTGGGTGA -GCATATTCCGGCAAAGGCGAGCTATTAGATTAGTATAATTGAATACTAGCCCGACCAGAAACATCTTACAAGTACACATGGTAGCCCGCGACTATGTGGC -ACCTACAAATGAATCAGCGTgtctctgacagcactAgttgggcgactgtagtgactacttagttaaaaacattccgaaacagctcggtatAATATTGAAA -AGTTTGACGCGTGGCCAACTTATGGTGATTCTGCTCCTTATTCGTTGAAAGAGGTCATCCGAACTACCCCGACCGTTGATTACTCCAAATCCTGTTTCAG -ACAAAGTATAAGCTGGATGTAAGAAAATTTGTAATGCTATAAGCCAGTAATCCCAGAGATAGAACACATCCACTTAAATACGTTACATTTACGAATTATG -ATCGGTAGTTTCTGGTTGCAGAACCTTCCATGCAAGGACAAAACATGTGATGATAATCAACTGGAGCGGCTGTGAGTTTAGAGCGGAAATATTAGCGATG -CTTACTTACTAATCAGATGACTAGCATGATCAGGGATAGTAGGTTAAATACAAAAGGCCGATGGACCTCGTAGACTGAAAGCTCAGCACGGAGAATCAAG -CTGATACGGATGTGGGTGCTGAAGAGCTTCCCATCTCTATAATTCGCCTACTGAGTTAATCCCGTATTTGGAATGGATGCACTAAATAAGGACGCTGCTC -CTTTTGTTGTCAAAACCAGAATAGATCTGTGAAATAGCGGTATATGTTGtacacggagccatcaagtaccaggacccaaaaagcccccGGTGAGCCCtgc 
-cTCGGCTTTGCACCAGAGTAGCAATGACATCTAAGGCGATGTTCAGAAGCTAGTTAATGTATCTCGCATGCACTTATACCGTCGTAATAGTCAAGCTGCC -TGCTCAAGTTAATCGGCGTGAAGGTTGAAGAATGCTCTTCTTATCGTCGTCTTGGACAGGTATGTATGCCCAATAGGCGACGGGAACCTCAACTCAGTAA -AGTGTGCCCGAAGGCAGAGAGGGTCAAGTAGATTACTTTCTCTACTCTTCTATCTATTACGCCTCATCGTTGGTTGAATAGAGGATCACACCACTCATTA -GAACCTATGCAGAGGATTTGTTTATTGGAGCCTGTGCAAACGAAGAGTTGGACTGTTGTCACATTTCATCTCTcccggaCTggagtcattgaaaaagaac -agaaGTTTCTGGATGATCATGGTAAATTGTAAGTTATGGTCCCAGGCGAGTAAGACGCAATTATTACAAGATGATTAAGATACGTCATTAATGAGCGGCG -GAGATACGGCTCCCAGCAGAACTCTGGATAAGATAATGCATGGTGTCAATCACAAAGCTCCAAGGAGAATAAGACCTGGTGCTAGCTTAGGAGAAGCGCA -ATTACTGTCTCCCAATGACGTTCTTGAAAGCTGGCCTAAGTTCAATTGTAggcatgtcggagagggtgtccttgatagagttccagacttaggtcagtaa -aggTAGcctacttgcgctaAGAgaatgcgtgtttttgttactcgacctaaaattggcagttcccaagactatctatgggtactagaccccgttcccatag -atataaacgcttaaagtgttagaaagcccgagtgtgggtgctagacttcaaaGCTGGGGaatttctgccttactgcggataccaccggtttcggtccccg -ggccatggaatgggagaggtgctaggaaccggggggaaccgcacgtggtcatctgagttgctagcacgagtctagcccgatggtatctatggcgaatcaa -ccttttcacttgcccttcccgttcgtacttccagccgctccagactataagcttcaccaaatttgcaattcactctcgtaccccaaaTcgatCctgaatt -tccatttttgtatggaagggtaaaatttagctgtccatacctttgtcatttgtatccattatcctgcgaatggacttctacatgtcccctatgTtatttt -tgaACATAACACCAAATCGGTATGCTAACATGCATGCAGACGTAGATGATCGTATTAAAGGCGCATCTGTAAGACATGATCGTTGTAGCATTGCAAGAAA -CTCTTCGCTATACTTATTGAAGCAACTAA -> pig3 -TGAATTATCAAACGATCTTGATATGACATATACATAACTTCGGCCGGCGCTTCTCCTACAGCCGCGAAAATATGAGATAAAAGCCGCAACAATTCCACCA -CAGTCAGCTTGATCTTGTAACACATAACATAGGTAGGAAAGCAACTCTTGTAGTCTATGAAAGTAGACTGGAATTCTATTATTAGACAGCACATCCATAC -TAGAATATGTGACTCAAAGATTGGACATATTCGTGGGATTCTATGAGCGCGAGGGCCTATAAGATGATGATCCGAAGGAGTTTGAGATGGTCCTTACCGT -TACTTTTGGTGGAACCCAGATTACACAAAACGAATAAATGCTGGTTGGAGCTCCTAGTGCTACAGATATCACCCACGACATCAGGTTTAGAGACTGGTTT -CACTCAAACGTCGTCTTAAATAATCCCAAGGGTTTTCTACATGTCACACGATGACACAAGTTTCTACAGTTCCGAATGGGTATTTTGCGGACTTTTGAGA -GGAAACGTGCTGTAGCTAATCTAGCAGTGAATGTGAGATCCTCACTGTCTACAAAGCATATGGAATCAATAAACTCAGTGCCCAACATTTAGGAATACTC 
-CACTATTGGAGACAATCGGGATCCCATAATTCACCATAATCCTTACGGCTATACATAGTATCTCCTTAAGCAACAACTTTTgtgacctaggtaagtacct -caggctgaatatccctcgtcgtaaaccAatctgaccttttttctagcggccttgtccgattgtAcCCtttaaAcgtgtcgaaccaagtcctcttagttta -tgcctccttcccctaaactaaatctgtcatatGCctcctggagtgactccctcgaGTCGATCTATGCTATCGGTAATGGAACAATAGCTGTTTAGGCAAA -AGTCTAAGACTTGTCATCTGGATAAAGCTAGTTCTGCTCACAAAAGCAAGGGTACATCAATTCTAGAGACAGACTCGAACAAATATAAACGCAAAAAGCT -CATACGTAGTATGGACGAATTACGAGGGATGACGAATGGGCCAAGCCGTTCCCAACATCTCCCTTGTTGTAGCAATGCGGAAAGCATAACATGGAGCCTC -AATAGAGTGTCTCCCTTGGTCATACTAATAGTCCTGGAAACTGAAGAGCTTATCCCTCTGCTACAGCAAGTGTTATAATAGTAACGTCCAGTTACATTTT -TTATTCACAATATGTCAAACCTGCCAACCTACACTGTCTAGCTATAATGATGAGTAGGAAAAACCAAACAATACCATAACAAGGAATTACATTATATTTG -GCGtcccataattaaataagcacatgtggctcttatgttccGCCAGCGTtcAtgacccGTGGCTgccgacttcaataactaGGTTcgctggtctccaagt -ccagcaggttggttgccattgactgaagcactttctctttaagatTAcgaacttagtcacataagccagatatacctacttatgttggttgaggcacctc -tcagtaggccaagagcttgcttccgagtttgtgttttgggagcgcactttctcctagaaaggagggatgtctctttattattgcgatctctctcctgtaa -acggatacccagcagttcaattaCGACtccaaatacgcatgaaatttgaaataagTATTcacgctatgtaagagggggtatccataccctcaaaagttca -tcctttaggttactaacactgcaattatactgattgagttatcttaaagttgataattagggtattattgcttataggcatcttgttataccatctatta -gcaggtcttattagttcttttatagttttgttgtaatcactggctgtctcatccccccactcaaacccccggttcctctccgatctcctgataatgattt -ttctactttatatgtTTAacataaatccgatcgcttgcacttctaaataccactcgacCACTTATTCAACAGTTTCATGATCTTAAAGCTGTGCTGTGGT -GAGTAGTATTACAAAGGACTGGCCTCATCCCCTAGTCCTGCATTTCCGAAATGGGCAAATAAGCGTTAGAGTCGATGAGTGTACGTGGTCTGCATAGGTA -AGCATTACGCACACCtcgcttgccgcctccttataAacaatcttcttgttacttcccatacgtttcagctaccttccaCATACTCGTTACATCACTATAA -ACGCATCTGATCGGAAGACTTAAACCCGTACAATTATAAGAATGGGAAAAACGTCCGGTGAATTTTGCAGATATATTTCGCTGTTAGGCGAGACTCTAGC -CCTCTCGCCGGAATGCTGTACTCATTAATGCCTTTGACTGCTCATAGTTCATTAGTGCGCTTTGGGTAATCTCCAAAATAAACAAACTATGGCCACTGAA -GTATGCCATAATAAGACCCAACTAGATGCTTCCGCGAAATTATTGGATATTTTTAATTTGCTTGCGCATTCAAAACATCAAAGTTTTAATCTAGAAAGCC 
-GCCCCGTCGTTGGCAATTATCGACACCACCGAAATCTCTATATATCAGCATACAGTATCAAGGGAGCGTTTTCAGCCATGAGTATACTCTTATCAACGTG -AAATCATCCTCGGTATCAAGGGAGTTAAGACCTTTACCAAAAATCAGAATTACCCcacCTTCAGTAtctccagattgccctgatgctgttgatcagaaAG -CATCTTTAGATCTGAGAAATTCATACTATGTCTGAAAGAAATTAAATGCGTGTTGAAAGCTTGTTCGGAGCAGGGACCCCACGTGCGTGCTATTGCTTTT -TCAGCTTATACCACTAGGGCTTAGATCTGATTTACAAGCTTTACCCTTGGTAGTCCGCAATCTTTGTGCTAAGTAACATGGAAGAACATATTAGTGCTAC -TAGGACCCCAGAGTCAAGCTAGATCGAACCTGGCTACCCCGGTCATCATGGTGGATTTCACGATGTGCTACTAATAATTCACAGACAGGTATTTGTCCAT -TTTGATGTGGAAACCATTTAAACTAAGTTATATTGCAATGGAAAGTGTTTTTTTTACCCTGTACTATGAACTACCTTGAGATATGGCCACAAAGCTAAAT -ACCCAATCGGTTTTGTGGTAAGTTCAACTACTCatgcttcaagtgatatgtttaccaacaacacacgggactgcagacatactgaaggagatcaccgtaa -cgTATCtgaacgggtgaaacattgTATTTGtaaattctgtgcccggggcccgggctcatgaagatgacgtttagtccctacttaacagctctaggactta -atacagtcaccccccggcagtctcactacagacctatcaaacgttaggacattcttcctaccccggtgaccttttttgccaagtgcattaaTTTTGCGAC -CCAAACAGTTTGAATGCATATTTTTAACTGAGGTATCCATTGTGGGAAAGAAACTGTAACCCCGATCAGCGGAGATGGAACTCGCTGCTGTACTTAAAAC -CCTTGGGCCACTCAATAGCACAGATATTATTCAACTAGAAGGGAACTCGTCCACAGGAAAAGAATTGATTGGGCTAGTTCGATTTATCCGAACTTACCGA -GGACATGCCAAGCTGGTATGTTACCATATCATAAAACTATTCGGCTGTGTATTAAGCTCTCTAGTTCAGACTGAGAGACAACATCGGGTTCTATACCAAA -TAATTTCTAGATACGTGCAATAGATCCTCGTCCCTGAGGCGTTGTATACATTTATGGGGTTATCCATACTCTCTGATTCTTTCGATAGCGAACTAGTATG -AGAGTTAACTTTGCACTCTACCGACGGATCATAGTGCAGCGAGCTCCCTTAACAGGTCTTGTACCGCAACTGCCCTAATAAATCTTCATAGTCTTCACTC -GATTATATTAACGCCAGAAGCCTATTGAAGACCTATGGGTTACTCGTTCCTTTGTATGATGCAGACGTTTGACTATGAATCCTCCAGATGTGAAAGCATT -ATTAGGCGCTTAGGTATTCTTAGCTGGGTGGCAACTAAATGGAGCATCGCAGTATCATATACAAGATACTTGCCTAGAGAAGGCGCTGAATAAATGTCCC -TACGGCACACTGAATCTAAACAAAGCCTCACCAGCCCCGTAGTCAGGGCGTTAGCGTATCCTAGCTTTAAGATGGTAAGTTGTTTTCTAGAATCTTTAAC -TTTATTCTTCAACCCTGGCCTTTTTATATATTAAGTTGAAGTGTAAGTCGAATTTAATCTTCAGCTTAATACGGAGATGCTCAAGCACCCGCACTGAGCA -TCGGCCTAATGCAGTGCATTATGCATCAAGAGTGTTCATTGTAGAGCCCATTGAACGCAGTCCTCCCTAAGCCCTTATGGGTTGCCTGTTTGTCGATTAT 
-GGTAATAAGCTGTCGTTCTTGTTTCAGACATCAATGATGGGAGGATCTCACATTTTCGGCAAATTGTGAAGTAGACCCCTCCATTCCGATCCGCACGGCG -CTATTTTTTTTTTCACTACGATTTGAGTTCCAGATCAGGCTTTACTTATAATTCGGAGCGTTTAcagtcctgtacacccgcgctcatatgagacttcctc -cgggcagccttggctgGTATgccaacatagctccaccatattctagtctgcggtacgctctagcctggcatagcaatTttacggtcgttagtgcctgtca -gcaaatagcGAGATAGctccgacactgagctgggtgccgggctaaggtaaacTCGTAATATTGTGAACTGGTAATATACTATAAGGGTTTCTGATTTCAG -GGTAACTCATCAGGAAAAAATATTAGCATTGTGCTTCTTTACGTATAGTAGGCGGACGACAGTGTTAGGTGGAGTTGGTATTATAACCTTACTGCAGAGT -TTGTGCATGGTTCACTGTGCGTAGGTATTCCATTTCTTCTCTCTAGGGAGAACACAAATTAGCCAAAAGCTCTCAAATGTCACTATTATGTGCTACTGAA -CCCGACACAATCCGATGGATGGACTGCTGGAGACCCTACCTGATTTTCGGAACATATCAGCTTTAACTAACTGCTCTCAATGATTTCCGGAACATGTAGC -CAATCACTCTCGATTGAGTTGGACTACATCGTTTGATAAATGatgtaaatgactggtcctcatttctttttatggcggacgggtgctgcactttgcacat -ataccacaggttcttATCCTattcatcttatcccccgatagatacttagaatgcttgtagatccgctatcgttaaccatgcagcgcaaaataaatgacta -tggtcacctacttatcttactaaactcctttcgtaagggcctgttcccagaagtaggatctccggttcacaagatagttcaacctgtcggttctagtcta -atccctcaattTAccacaccgtagtggttctgctacttcgcattaccaccggaaatgtataagagcttccttggctgtgcacctctgctaccGacttttt -attgcacgcttttcggatggaagaagtcctaataggggtgggataatgactcatcgtatttctacgttacattacatagaggaccaatggtattgaTATA -CTGCTAGCTGCAAAGATTGcgacctccaAcgaccggtacctcAtaATCCTCGCGTTAGTTAAGCCGGGACGAGGGGCATGTGAGCGGTCTGATAGAGAAC -AAGATATGTTACGCTTTGAAACCACAGCAGCTTTCTCTAGGTTTATCTCTCTTATATAGTCCACAACACTTATGACAAGCAGGTGCGACACAATTTTTAA -CTTGCTTGCGCAAGGCCTAAATTGACCTTGTATTCGTGAGATAGGTTGAAATAGAAAGTTACAAATATATGAGTGTGCAAAGGCAACCCTCCCATAAAAT -TTTGGGAGTTTAGGTAAGGCAAATTAGGTTGGGATTCTTTTTCGATCGTCCGTCCTTTTCTCCaccataaattcatttaaaggcaggcagtacgttgttt -accgggatatctcccgttgcgtgacgaaaagggttatTCAGGgagacatctgagTTAAGCgatgtaagtgaagcccatagttctattcgtcgccagtagc -tagtggtggataagatcaccagaagggagatttaaacgggagaaccgcctttcctcgacatctctctcttgAAGATGGCCCCAGTCTCGATGACCAATTA -AAGAAGGAACCTATCAGCACCGCCCTGTATAGAAATTAAAATTACTCACTGATAACTTAGTATCGAGTTCAGATGGAGAAATCTAGTATGCTCTTTTTCT 
-ATCCGAGCACCGTCTGGGATATCTTTAGTCCATGCGACCGGGGAagGtggcacataccagtctccggcagtccgggagcagatacactagagaggagtca -ctctcagtacaggctgcggaacgcagactggtaaatcccatgggcCTGGTACATTACGTgtcaaggagtggatgttggagggaaaatgaagagccgttag -gtctatgagcaaatagagtccaggggagaaccatctccctcccgcacggccaaccaagtttcttcttttgctaagcttcttacactccacgcagctctgc -gattattctaaccgcctcctcgtgaccttgtccctttaatacctcaagagagttgaccatgagatgtaaaaagctttgtatcccggagtaaggagcggct -aaaacagaagtgaaaaactcaagctgctatagacgagaaacccgtagggcaccatataacaatcggccgctgggacttgcacagctgtggacgctcacta -cccgacgatgcctttacttgtctgaccactatttcgttctttgaaggtttggagagagtaaggcGACATCTCTTGGTTATATCGCTCAGCTCGAAGCAAC -TGGATGATTTCGGAATTGTTATTATAAAGAGCCTAAAGTATTCCGATCTAGGAACCGAATGtatacagacatcgaaatcaagtaTTAcacctacagtaag -gataaactatcagcgccccaacgcatttaccctttgtgtgataaacaatcacaacgcagcatttaagccCattttagaatcatcgactagaatgttgggc -taccgggaagcgggcctcgcagaaacttttctgcaccttttgcatctagccagcgtccccatagggactctaaacccccattgttcgtcccactctttgg -ccaatcttttggaagtcactaaaagaacgaaagctattatatggctcagcgtcctttttgctaggtgtataccgaaaaaggacagctttaatataccaaa -gcgatacctgcagtcccatgttagtcacagaaatgtttagaacggttcagatttgggggcaacttaaattgctatcaacaagtgaatcggtgaagggtct -gtggcgagtacatgcagtgtaggaccatacagacacaaaaaaataaagttcagtcgtctgcgtcggcatggatggagccgaaactcattatacaaggtgc -aagaagtcgggcacatagaaaagacatcacacgtccttacttgattatgggatttgggaatcaagacaattgaactcatcgacggattgagttgaaggat -gttaaccccaaacgcactagaacgttagcatgtcatggaaaatgcggtgtccgttcccttaaacctttatccttcagggcgtaaacagccctattatacc -ttacaagctggtttaaaatctacgaataagctattacagatcacggttaccccactgtggcctcacataatgagtcttattcactttttctcgctcattc -cgaactcactaaccgtccccacttcccccctaaactctcgcgaccctccgcggcctctgtcagccatgattatcctttctcaactcacgcttcacgtgtt -tcaatttttagattctacagataaatgaaactaagtgatgcttgtttttttgttcctgggatagcccgctcaccatagcgactcgcagtttcaacctttt -tgctgcagtagaccggattacgttcgatttcatgaatgggtattactccagcgtatatgtgcaccgtattctcttcgttcctccatatagtggtggacgc -ttagggtgcctacaaattagagacgtcgttgaCACcagcgtaggaataaagataggggtgcagatatccctttgcgatattgaacccttgccctttggat 
-gcatgcccaccaatcagactgccaaatcatcggttagctcaagtcgttgtcgttttactaacccacagactgctgtccgtagtcattgtactaatccaca -tacttaccggaggcgaaaaggggttacctcgttactacattgtcaccagcgtttcatatctcttcccttatggacgctatccattttaacccgggtgagg -tgacttatcatcgtattcgagatttccatgaccctgatcgcaaattatgttgagcagttttccacatacctgtccgccatttaaatttcttttttcgatg -aacgggtatccaaatctcctgcccattttgcaatcggattgttacatttttctttatagaggtatttgaggtggaaggggcagcaggcgttcaaggatag -acgaagatgtttgatgggtacaagaaacatacaaaagaatgaataagacctactgttggatagcacgagaaggtcaccatagcgataataatttgattct -acatcttaagatgataccagaagtgcattcgtgttattgggagctcggcagattagtgttggaggacttggatctccaattcactatttagcgctcgttt -tacgttacatggctgtataagcaggtcttgtgtactccatagctatacatacttatggtataccccttattatctatttccACGatgaatcccctcgtat -tgattgccggatcccacagaaaagtcggaagaagcaatgcttatccttctttattgtactgcttcatttaacataatgacttccaatcccatgcaagtgg -tcacaaaagtgtagatctcatcctttattaaggctgattaatattccCAAGTTCATGACAGTTATATTAACCAATCAATAAGAGTATGACCTTGTATTTC -GTGTCTAGAGAATCAGGCCTCCCCATAGAACATCCAACGTCTAGATATGATCCATACAACAGATGATAAGACTGTTCCCGGCTAATATCTATGATGTTAG -CTTTTATTTTACGCCGCCATTAGTCATTAGGGGGATGGTCCCTCTCGTCTTGGTTCCGTTCTAATCCCATGACGTAGAATACAGAGGCTGCTCAGTTCGC -GTGATTGCTCCAAAAATCTTAATAACCCACGATAGAATATGTTGTTTCTGATCTAAGCTTGACAAATTCCAACACTTTGTCCCGTTTTCAGACTCTGAAA -TACGCGTACATTCCGGCCCATGTTCTTTTACCCGACTAGAATTAGACTCGTTGGGTCTTTCAGGCTTCCAAGGCAAGTAACTTTGACGAGCATGCCTTGG -TATCCAGGACCCCCGTTGCTCTGCCAGACCTCTTATCCCAtaacgtcgcgggtgtcagtgacataatacctCAtgaaaaatGCGtggcaggtagctctgt -agcaccggtcataagcaggagattactacaggcaagtgcggaatacacggcaccgctcggcaacagacaggtgcaaattagcgcttagataagccattgt -cgcactcagggagtagtgggccTatgtccttgagaagtgaactgacaaatgctggcgacgacccaaggagtagggtaAttctaacacggcattgctgaga -gcttagatatagagctgttatggaaaagggtcatgtggagagccttcgaactggacgccgatctcccttaggtgtcgtgaatcctgctactgcgtgccta -tccatgttataggtacttagctcataagaaatagtagtacggctcgaagtttccaagatttggaggcaatctcaccgtccgtcaacatccgaatcgagga -cgaatatgctgtacatacaatgtagctcgatccattcggaagaaCggaggtttgatcactagttccgtttcaactgcaagtggggaccatcacgtctaga 
-ggaaagacatatagacagcgtgatgcatctacacatgctcatgcatttaaggaagccaagaaaccaaatcacacacatcAgactcaaagaggaggaggat -tgtagcgtgaaacgtggagagcggggacgaattggggcggagtgactggtaggtatgacaagattacgcagaaatgagagaaacctactgtacggacgat -aacagaaactataatggctaagaatccaattcgactcttcagaccaattcgaagcgtgtaatcagagaatatgtgccacgaggggcatacgcgtgaagat -agagacacctcatcgcccgtagtttcctcatcgcgcactgcacgcgtatatcaggatcatctacttctTCACGAATATCGAGAATTTGGCTTGAAGAAAG -GTTACAAAAATGAATGGGCTTATGGGGCCTAGGAATCCACTTTTTCCAGCTCGATAATTTTAATTACGGATTGGGCAGCCCAGTGGCAGCCTCACTCACC -TACACTTAATCGGCCTGCCGGTCCTCAACATAATAGAGTGCTACTTAAGTAATGTGGGCCGCCTCCCTGATATAGGGCGGCTTACAGCGAGTACGGATTC -AGTTCCTAAGGAGGAggagttccctaacacttctaccatccgaccaagtgctaggttatcCAAAAcccatagaaacgtttacgggtatacaaaagagctg -cctgtactcccctattatacttaaagAgatttacaacgAAgccatcactcgagactctcttgagtAgcccgtcgccaggcgaaggCgctgGgagggcgtg -acacaatcataaatttatcaggtttcgtaaagattttgctccatttatggaaacctcaaagggggatccctttcagctgctggctagtcgataatcaaga -tactgaaaaataagcatcgcaAtctgtcaagcatcagtgagagctattgatgggaaccgctgaactcacaaacgGttaatgtatagtactagatgTGGAG -GTCtaatttggcgggcagagtacccataaggtGCCGTGCACAGCaagaatggtaaatgtgtagtggtttaaaaagacgaagaatggaggaacaAATGaAA -GTGGAGCCGATAGCAGCTGCTCGGTATAAAATATGACCAGACGTATCTAATAAAGAGCATATTACAATACTACCTAGGGCGGGTATTTTGTCATTGTATG -GCACAATTGCGAAGGGAGCACTCCCAAAAACCTAAGCAAGAGGTTAGTCCCCATCAGGCCCCCTTGGTTGACAAATTGCTAATGTTACAGAGGGAAAAGT -TTAGGAATGTCTCAATTGCGGCCATACCAGGAGTACAAACACAGCATGTAGGCTAGCTATTCGTTATAATTGGAATGCAATCGACCCGTATCAGTTGTAG -TATGGGGTaggcctttaagctcctaaatgacaggaacagtaactgttggtccaccctatatctagtgtgCgggcgcatgggttggtatctacgtagcgct -tagtaaatacttaccggaAAAGCATGCCACATCTACATCCATCGGCGTGCGGCAGGTACGTGGGTACTTGGCGAACGTCATCCAAGCAGTATAGGCTTTG -GCAAGTTATGTATCTATCCTATCAACTTTACCATGGTACCACTAGTAGACACGCTCAATTCCAACCTGGCAGGTATTGCCTTCTTGTTAGTTTGTTCTAA -ACCTATGCTCACTGATTTTTTACTAGACTAATTCCTGAGCTGATACAGATACTTAATCTAGGGGGGTATCTTCCACTGGACCAGAAATAGGTAGGACTAT -GTCCCTATTGAGAGTAATGTTTTAGTCCCACGCATTGTGATCGGTTTATGAAGATGGACCTTGCGGCTAAATCAAAGAGAAAGATATATAGTCGTTAGAA 
-GTGCCTGAAAGCACTGGATACAGCCATGGGGCAGATCTATGCACTGCAGAGAAGGACGCACCAACTTTAATCTATTGTGCATGCGCTGTACCATCGACTT -ACGACTAGTACCAAGTTGCATTGAAGTTGGCTGGTTCTATATTGTGAGCAataccaaaacACTATctatcttactagctagtccacgttgtaGAAGTGCA -TAGGGTTTAGGCAAGAAAAGCAAAAGGTTTTAAAGAATCACCTTTGGAGCGTTTGAAATGCATCTTCGTCTGATTCCGATCATGCACATGACCTAGTTAG -GGCTGACGTTGTACTATTGAGGATGTTGCTTAGCTGTGCCCACTTTGCTACTTTCCTACTTGTCAGCATCCTTGCTAAATGCCTGAGAGGACCTTGGAAG -TCTCTTTTACCCAGCAGTTCTTAACAGAGTTCATTAAATTGGGAGATACATCGCCCTCTGGTGCCTACACGATACAGTTTGACCCCTTAAGCAAACATTG -GAATATGATTAGATTTTATTATTTTGATTTCCTCTGTTTTCCAACTGTGCCCAACAGTAGGATGGTATTTATCTTGAGTTACGCCAATCCAGGCAACTGT -CATCAAATTTGGCGGCGTGCTCACAATCTCCATTTGCAGGAAAAAGATATTTTCCACAACATAAAGCTAAATAACATCCGGTGAATTTTATTTCCCTATA -GTTCCCCGTGCCACGCCGTATAGAGTGTTTTGGCGAGGAGAAGCGTACTGAGAGTCTGTTACTTTCAGGAACCGAGTTCCCTTGCATTACTAATAGTAAA -GGCAAGTGCAGGGTTTATACAATTGCAGGGAACCTCTCTGTTAACTCTTTTACAGAATGCGCAgcatgTTTCATTGatttaggtaGGCGGTTGAGACAGA -CACCGAGGTCCCACATCAATTCTCCTAGTGTTGTGAGTATACAGGAGTATCACACGCTATTGATCCGTACGTACCCTAGTAGTGCAGCTAGCGTTTCTTA -TATAAGGTTTTCATAAGTGCAGATTTAGCCAGGTATTTTCACcctaggattagtgtatctgcagatgttatacgggtccaTACATTAACACAATGCTTTC -GAACGTGGCTGTTACTTCGACAGTGTAACGATAATACTATTCTAAGTGACCAAAATTTTGGGGCAAACTTTCTTTCCTCTAGGCCAGCCGCGCTAGGGCA -TTTCTAAACGGCGGGGCGAACACTTGCTCCTTGCGACTTGTGATGGCTTATGTGCGCTGGAGCCGTTGAACATAAGGGGCTCCAATATTTGTCAGCCTGG -GCATCGCCAGCATGTGCGTTTTATATGATTGTCAAATAGAAGATTTTAATTGCCAGGTCGCTTGTGGGTATTTTTGATATGAGGTATTCAGTCTAGAGCT -TGAGATGAGTTTTACATTTTTACTCCTCATCAGTAAGGGGATCGACAACGGATATAAAGAGTCTTGACGCAAGTCGATAGCATCTGGCGTCCGTCACCCC -ATTGCCATCTATTCCTTTTCTCGGGTCTTATTGCAATATAGATCAAATCAACTTCACAAGACCCTTCCGGTCAGTAAGAGATGGTATATCAAGCGTTCGC -GCCAGGACGTAAGTAGAATGGATAGGCAGTGGGTTATGATTTTGGGAAAGTAAAATACTGCCATTATTGGGCGACCGAGATATATGATTCACGTATCATA -TATACCCCACTTGTTAGAACATTCTGGATAAGATTGTGTACACTATCAATTGTAGGGATGCAGGAGGACTAAAACTCACCTTAAACTTAGGCAGAGTGGA -ATCCCTAGCTCTCGAGGATACAGTTGATATCTTACCTAAGTAAAAAATTAagtagttacagtcggcctgtcccactagggttccacgcctggatttctcg 
-aggTAGtctacttacgccaGTGggacatgaaatttcagtatccgaccttagaatagcattttacaggcctgtccctcaatattaaacccccttcccactg -ttgcaaaggtcggaagtattaccaggatcaagaacaaatctaggaattcagaCCCGTAAgggctgctacacattaaaagtttcaccgatcttagccttag -agccgtggtgtaggaggattatcgggaactgaggggggttgtaggaggacatctggattatctgtagaaggtgtcgccggtatagtgtatgcaaaagtct -attttccaccctccgatctacgtggtccctccatttattccaggctacacgtatggcgaaacttttacctcatcttacttgccgagaCcgagTtggcacc -cacatctgttcatgcacagatatggtttagctgcatatacctttcacgttcgtacctcttcttccgccgatagactcctatgtgtctcccgcgTtatttg -ccaACAGAAAATTAGCAAGATGTATGCATGTGCATATAAACATAGATAATTATGCTATATAAGCGCCTCTTGATCGTGGTTGTCAAAACCCCATGCAACA -GCCACAGTTATGCTGTCTAGAAGGCAGtttattctgctgattctgtgtgcctggcactggcgAgcggaaacgttctgccacgagctagggaaaaagtagg -actgcaccgagttgtgaatttctgagtttaaacaagaaaataattcgtttaaaattcttaccactacctggaggggggcattaacaatatgcgtttttta -catttaagtaaagtgtatgtcagagaggttaactgatatccgctgggctacaaggctcattacttatagcactacgatttgaaaccacctactcgagtcC -AggggctgacgtgaaataaaacgtcacgtaaagcTCGAAGTAGTCGTCCCTCTCCAAAGCACTAACTAAGAACTCTTCGCAGACACTTGGTCATACTTAT -TGCCCCATAATATAGCTGAAAGAATTAGGCATACCAACCATGTTTGAACTCCGTCTATTCTGATGAAGCTTTAAGTACCTGCACAATAATTAAAAGTGAC -ACTGGAGTGATGTGACATGATTAGTGATGACTTGCCATAACAAAGTGAGCGAAAATAATGTTAGTGAAACGCGGAAAGGTCTTAAAGCAGGATCATCTTG -ATATGATCCAAGGTTTTAAAGTCACTTTATAGATTGTTATCTCAACGTGCGAATCGGACACAACTTTGGCGTAGACCTCAGGACTAGTTTGATGTGCAGA -ATCCAAATGGGGTATTtaaaagccctGAagcaccagcacacctagtctggtagttttaatcGAATATACTCGTTCTGGTTTACTCAGTTCAGTGTTAGAA -TATTGTTTTGCTCGAATCCGGGGGACAACCAGACGTGTTACACTTAGTTCTTTTGGTAAATTCTTTTAGTCAATTATCCCTCATATTACGCTGCCGATTT -CAAAGTCCTAAGGCCGCAATACACACGCCAACTATTTTCGCCACTAGGCACAAGGCGTTACAAGAGCTTGCAAATTGAATCCTGATCTATTTTCTACGTC -GGTAACCTATTAGTATGAGTTGAGTGGGCTTCACATAGGGTTCACTAGATTTTCGTATTCACGATGGGATTATGTTGAGGCCAAAGCTATGTTATATTAG -GGGTTTGCACACAGAAATTGAGCGTAGAACCCCATCCATTCGCTAAGTTGTACTAGATGCTTGAAGGCGTCCTTTTTCATGTTGGACCTAATGATCAAGG -GGTTTCCGGCATACCATCCGGTTAACAGGATATTCTATTTAGTCATTCAGAGAATTGGGTCCTTATGGTTTGAAGAAAACAATAATGGCTCTTGGAGATC 
-CCAGGCAACAGATGCTCCTTTCTCGTTCAACACTTGACTAATGTGTGTGCTATGCTCTTCAGATTATTGTCATAAAGACGGAAAGTGCATCTAAgtggct -ctcccagtgcggtcctctggtaagcagcattagcatcagctaggagctcgttaaaagtgcagatccgccggtcttaccccagatccgctgtgttagacag -cctgaggatagggccctacattgtccgctatgaaaagcCTccttcagggaattctaatccgtgtttggattctggaacccgTGTCCCAGAGAATGAGAAC -TTGGTTGCACTTGTTGGATCTCAGTCGGGAGTAATCAACGGATTCCTATTTGGTGTAATGAATGCGTTCCTATCATCAATCTAGTCTTATAGAAAGTCTT -CGCTATACATGGATGTGCATACCCCTAAATTATCATAGGAAAGGTTACGTTTGCTTCTTCGGCTCTCTTTAGTATCCTTGCCGTTACTCGCAGAGTGTGG -CTATCCGCTCCAATGACAGCTATGGTCCCTTTTGAAGGAGTTTACTACAAGGCCATTCTTTAAAGGGCAGGCAACCCTTAATCGATCCAGGGTAATCCTG -TGTGAATTTCTTTCAGGCTCTTTCAACGAGCACATTGGGACTTGAGGGGCCGGAATTTATTGTATTTGTCTGTTATCCTTCAGGGCTCTGGGTTATATTT -ACTAGTAGGAATTCACAAGAGTTCTAAATTAATATTCGACAGGCTTCCTAAGTTAGATACAGATTGATCACTCCCTGGTTGAATGCGAAATAGCAACCCC -GCTCATTTCCTAACGAATCTcgttgtgcctttacagcagggtacagttactggatgcggaTgaaaataaaaagcagtgactaacgtctaggctaggacat -atcacggtcaacacagatgctaataactcaacagtaggtaaaatgtgagttaatatcctaatgggaatatgtaatggtaatcttcatagcgttggtttgt -cttgtcaaataggagactcacccgttagcgtaaagacacgatgactcattgggtatcttcccaattatggcttaaaaggaccagagatgtacacgtaagg -caggtcccttacccattaaatcgGTTGTCtctacggactcttaggttcttccaatcctcagctatcgtctataggtcagaaaagagcagagagttagaga -tattccctaaaggtacactttgtcttaactttaggtatttaaACCATCTATCTAGGGCCCCGCATTACGAAGAATCATCTGTCGACCCACTTAATTGGTA -TGATTATCTGGTAACTAACTCTTTCTTGACGTTACAAGCTGTTTAATGGACGGCCTGTAGACGGTCCGCACATTGACGCTTCTTACGACATATCGAGCTG -GATTGtggattagggatttcattgctgccagtTATataggactagccctttagatgcatctggcttcgagagcacgcctctgcccatacaattgagctaa -cgcatagggagacaggcgtaaggtcactactccACTGGATCGgtagcccggtccaagagataacgatcaccgttcggtacgcacaagtgtcggagtctta -cggagcgctcgtgtagtataccggggacagtggcaaatcgtacttctgcttggctaaaaagcccgtcgcgttacaaagagtctactgtgaatgagaacga -ctttttcactccatatccccccctgagttcccacgtagtatcactttcttaaacctatcacgtatacacccggaggtcgaattgcctagtcaagagaact -ctaggtaacactcttcttaatcttcgacggacaatttgactggatctattaattacgcagcagacagaagaaccagagaaataccttgttacttgtgtgc 
-acaagaccctctaggactgtaaaagttgAggtcaacgtcacTAAggcctttgaataggacgacatcaatatgcccctctgcaatattctatgggtgtttg -gcgacagtgcgaatcctctgacccatggctttgggttgaggtgtggcaaattaatctctcagcacgctacccttgatatgaagagcggagtatcTTGTGG -ACGTAAAACTAATCGTGGTAAGGGTTCTTATGTTAAGGGGACTGTGTGACTGATACAACTAGctcgctttacggctaccttagcaagttcagatcgtgaa -tactacaaCGAAAGGcatgggtaaaccggtggatgtttctgaagagcgaCCattccacaCAcgaaacgaaaagggggaactaccccgtcttgcttatttg -gatgatctatggctcagagcagaagcacccgttgcttgagaatgttggagcgaatggggacgactggggaatttaggagtcgggggagaggatgaaaggg -ttcatcggtaggaaagggcaacaaaaggtgaatgacagccactaattgaaggcagtgacgtaagatagataattgcgactaagggtaacttgatagaatg -tctaaaggccgtaatcgaattacctctaggttaGTCTTGTTTCCAGATATCTTCATTGAGTTTAAATTGTGAGTAACTGGGCATAGACGATATGCTTCTT -ACAATAGTTTAAGTTATTGGATCCTGCGCGGCCGGGTTATGCACACGTGATCGGGAGCTTCAACTCTAGTGTCTCACTCTTTCCTCGGCCAAATAAGTAC -ATACGACACAGGTCAGATTTTTGACACATGAAAATGGTTATCTAAGTACCCTTCCAGTTTCAAATTGTTGAGAACATACCTTAAGGATTCAATAACATAC -ATAGTGATAAACTATTGCTCGTCTTCGGAATTTTTCCAATCCTTCTTCTAGCCGCCCATAGACATCGCCCATATATTTCAGTGATGTGCAATTTTCactg -gtcccaggtaaaactactcctggaccaatgacaggcaacaaccgttcgcggcgatcttatcaaggccaatcaaactaaatacacagtacagcccgttgat -gaaatttgctgtcctctaagTTTTTGTGAGGCCAATCAAGTTCATGACCAGGTAATCACTCTGGATGCAGGGTGTCTTCTCATCAACCCGAAGAGATTGT -CTAGGTACCGTCCGATACGCAGTGAGGGGGCCCGTCACATATATGGATATTCATAATTTTACGGGCCCTTATCTAAGCCACGCTGGTGACTATCAGCATG -TATACAGTGGTTCTCTACCATCAAATCATTCCATAATGGTGCCCACCTTCTTTAGTACGTTTGCTTCAACTTTGAAGCTCTCTGATTTCCTAAATGTGTT -AATtggaagccttagttttgTCCGTTCCATTACAtgcCGACCAGctttgaatcaatttgttcgAgaATACCTCACTTTGGGgcaattgttcgatcgtgcc -cagcgtgtggccctcacactgttcaaggggatccttagggagtcgttgttgaaaaacagccaggctcctaagttgcccgctaggaaaaccaactccttgc -taccgcatccctgctcctccgtgtttagggaggtagacctagttcgtgatatcccgcgccggctcctctcacctgtcagtggattgatcgcaccgatgtc -gggcccgtacaaagcgtcctcaacccaaccaggagttcggaatgaccaggacaaattgagggacagcaaggtaagctgactatcccaccttaaacgtatc -cccatTCGGaggggtttcacgacccacaatcattcaaccttcgtacttgtggtgggattggagTcttcacatgtctcattatccaatgcttccgttacag 
-tttcatcagagagacctgagaaaagaatgtcgcacccgccgtatatgccaagcacgtctCTCCTTATGATGCCAGCATCTGTCAGTAAACGGCACCTGAA -CAGAAATTTATGCCAGCTTCCGGGGTAGGATATGGATATTGTGGGAATTGGCATGAACGACACATGACTAATGTGTTCTGTTGCATGACGCCTATTAGCG -CATGGGGATCGATTCGTTACCCAGCCTGGTAGCGGCGTAATGAGCACCTCCATTCATGTTCTCGACCAACAAGTTTTCTGTTGTTAGACGGTCAATCAAG -TCCTAAGTATCCGATATTTCATAAAATTCGCGCGTGATGCTGGAAGATTGTTATCCTTCATATTCATTTAACAATGGAATTAGACCCACAACGACAGCCG -CGAGCTGCCCGGTAATCttaaaagagagcccagggagattcataataaatgaacattcaaattccacaatggtcagatttaaggaagtaaggttatgttt -gcatgaaaataaaagggaagctctcatatgtaagggtgtacttagctatatcaagataaaactaccCATCAGGTAAATAaaactAATTgCAacatgcaac -agtaaataacagtgcctgttaacacctatcCccgtcagatatgggcacaatgaacatcggctaagatacgggggaaaagaggcccagtttactactTtta -tgaactcagctcactgagaggctatgtaggtcggatcacggtttttctaaatacttaaaccttggggcaactcataatttaactttcactttagatgcat -ggcaacaagCGCaaaggaggttggtttTgtaaaagagacttagtcaccgcaatgccgtatcctcctagagagtcacaatagagacTCGAAAATACCTGCg -caacgtgaggctattaataaatatgaggggacagaaaacagttacgtcaatggagcAAagaatgctcttcctggctagagatggacAGATGGCAACGCCA -GCAGCATTGACTTCAGGAAATTATACCAAATCAGTCAGGCCTACAGCCGGCCCGTTCGGTACATGTCTCGCGgattttatcggtttctaataacagctcg -ttggatagtgtattactaccgctgaatgtgtactctaaatcggggaaaccagtgtcgcaaagatctcgatattcccatggtggatacatcggaattgttt -ttaggacttggaactgggctaaataccatccagcagaccagtagataagcagtgttcgccacaaatcaatggtgaagtgcacctaagccataaagaaggg -tgagttctaatcacagtaatatgggtagggctagcatttactcgattaaatgaaacagggcaggaggtttacaaagcaagccgaacgtctttgtaagttt -ttgaggttcgaaagtagaaaccaaagcgaacaaaaggtgtagagagccgATaatatagcacgtaggaaaaaaagctcaACcaggtgggtgccgaaaatta -tactaggcagaaagaaaccttggaacctgagcgggacctcctaggctcagcgacccagaagcccctggatctaaccgtactcctcacactcttacagaat -tatgaacaacattgacactggacgtccttctgagtcagtgtctgatctgccttctccggcatgcccgtcttgaactcgagctgcttgcttgtatacctac -ttgagttttagggccccatagtcacaggaaattaatgttatgcccggggttgtctaagctcattcaacttttcgacgggaacacaaacatgcctgttccc -accgagttgtggtcatgatctcctgaagcgatggcggtaatggttgggaaacgtaaacgaagacagtgagtaggattagggcttattctgtttgcgcagc 
-agggcatcccgatttcacagtggctcgggtatgcaagtaaccttagctaacaTGGACCTTCATCTACCGTCGCAAGAGATATTAGCTGTTTATTTACCAA -TCATATCCATGGACACGTCTGCATTTAATTCAGGCCTATAGGGCATCGTTCACTATCCAGACCCCACTGCAACCACCCAGATACGACCAGCTCTGGTTAG -GCATATTTTGGCAAAGGATAGCAACTGAAGTTGTATAATCGATGGTTACTCAAGCTATTGACATCTAAACAGTATGCATGGTCGCAAATGTCTGTAGGAT -GCTAACGGCTGAATCGATTCctttttagcggagatAatccgattattgcagcggtgaattacataaaaacattttaaaatgtgtcagtagAGTAATCAAA -TGTTAAAAGGATGAGCACTCCAGAGAAGCTTTACTCGTCTTCCACCCAAAAGAAGCATGCAAACTCCTCTGACGTTTTGTCCGCCCAATTCGATACGTAA -GTGAAGCACATGCTGCATACGATAGAACTTGTAACGTCGTCAACTACTAATACTAATATCAGAATACATTTGCTTGAGTATACCGCCCCCCCGCGTAACA -TTTAATATTTCACAAGGACAAAATTTTCCGAGTAAAGAAAAATCCTGTGGTAACAATCGGGCGGGGTGGGTCATAACTTGGCGCGGGGGTATGGTGTACA -CTCATTGACCAGTCTGGCGGGTAACATAATCCAGAACGATAGCTTAAATGCACAGGACAGACGCTCTCCGTAAAGAGAATACCCAGCATAGAGAGAGGAG -TTAGTATGACTGCAGGTAATAAGGAGTTGTACTTGTCGGCCATTCGTTTACTGAAATATCCCCGGACACGGAATCAGTGAATTAGTTGAGGAACCTGTCT -CCTTTTTCATCAAAACCAAGACAAGGCTGTGAAAAGTTGACAAATGTTAggcacgaacgcatgaaataatagaattgaaataatctttAGTCAGTACtgt -cCTTGTTTTTTGTTATAATAGAAGTAGTATTTTGGGCGATATTGAGCAATCAGTAAATGTGCGACCAATCCACCGATACTAATGTGTTGGATAAAGTTAC -TATTGCTGTTTTGCTGGGTAGTTGTTTATTAGGGCTCCCCATGTTGTTAGCTCGGATAAGTGTCTGTGCTAATTAAATTAGAAAGATCTCCTCCGCGCTA -AAGGTGCCTTTATGAAGGGTTAATTGATGTAATCACCTTTGTTGTGGTCAAGTTTATTCTTCTCCATAGTTTATTGGCCGGAGAATCACACCGGACACTA -GGTTCTGAGTAGAAAGTTTTATTTTTGGGGCCTATGCAAATAGAATCTCTACGTATTGTCGAGTTTCACTTGGcctagtCTtgaaacgtccaaggcaggg -aagaGTTTTGTGATGGCGATGGCTACTAG diff --git a/programs/lastz/test_results/base_test.default.lav b/programs/lastz/test_results/base_test.default.lav deleted file mode 100644 index 83bf97c..0000000 --- a/programs/lastz/test_results/base_test.default.lav +++ /dev/null @@ -1,521 +0,0 @@ -#:lav -d { - ../test_data/pseudocat.fa ../test_data/pseudopig.fa - A C G T - 91 -114 -31 -123 - -114 100 -125 -31 - -31 -125 100 -114 - -123 -31 -114 91 - O = 400, E = 30, K = 3000, L = 3000, M = 0" -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 
22929 0 1 -} -h { - "> cat" - "> pig1" -} -a { - s 5643 - b 4901 21309 - e 5171 21537 - l 4901 21309 4924 21332 67 - l 4925 21334 5024 21433 68 - l 5027 21434 5042 21449 75 - l 5088 21450 5116 21478 62 - l 5117 21483 5171 21537 64 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 1 -} -h { - "> cat" - "> pig1 (reverse complement)" -} -a { - s 9369 - b 1 1 - e 718 767 - l 1 1 43 43 58 - l 47 44 62 59 75 - l 63 62 71 70 67 - l 72 72 78 78 57 - l 79 81 83 85 80 - l 84 91 120 127 54 - l 121 133 135 147 73 - l 140 148 166 174 56 - l 169 175 181 187 62 - l 182 197 357 372 58 - l 361 373 416 428 66 - l 417 433 422 438 83 - l 423 442 545 564 55 - l 552 565 567 580 56 - l 568 582 575 589 88 - l 579 590 605 616 59 - l 606 630 612 636 71 - l 613 648 623 658 73 - l 624 670 643 689 65 - l 644 693 718 767 59 -} -a { - s 11526 - b 9232 6509 - e 9773 7071 - l 9232 6509 9264 6541 58 - l 9267 6542 9357 6632 58 - l 9358 6644 9365 6651 75 - l 9368 6652 9387 6671 70 - l 9388 6677 9424 6713 57 - l 9425 6717 9435 6727 64 - l 9436 6732 9443 6739 75 - l 9445 6740 9462 6757 44 - l 9463 6769 9500 6806 58 - l 9501 6809 9543 6851 72 - l 9553 6852 9559 6858 86 - l 9562 6859 9584 6881 74 - l 9585 6884 9652 6951 71 - l 9654 6952 9773 7071 62 -} -a { - s 15503 - b 10825 8530 - e 11401 9096 - l 10825 8530 10877 8582 70 - l 10878 8591 10887 8600 60 - l 10895 8601 10950 8656 77 - l 10951 8658 11030 8737 63 - l 11031 8740 11060 8769 67 - l 11061 8775 11151 8865 62 - l 11164 8866 11210 8912 57 - l 11218 8913 11401 9096 62 -} -a { - s 82556 - b 13866 16063 - e 17594 19721 - l 13866 16063 13902 16099 68 - l 13903 16105 13912 16114 90 - l 13913 16124 13959 16170 60 - l 13960 16173 14162 16375 59 - l 14165 16376 14178 16389 57 - l 14181 16390 14191 16400 73 - l 14192 16402 14201 16411 80 - l 14202 16415 14408 16621 61 - l 14413 16622 14532 16741 63 - l 14533 16750 14606 16823 66 - l 14607 16834 14613 16840 100 - l 14619 16841 14644 16866 54 - l 14654 16867 
14723 16936 60 - l 14724 16938 14753 16967 63 - l 14767 16968 14889 17090 64 - l 14890 17092 14899 17101 80 - l 14900 17103 14936 17139 65 - l 14937 17142 14988 17193 58 - l 14989 17197 15038 17246 68 - l 15041 17247 15105 17311 57 - l 15107 17312 15114 17319 75 - l 15115 17321 15205 17411 59 - l 15209 17412 15327 17530 58 - l 15329 17531 15357 17559 55 - l 15364 17560 15488 17684 62 - l 15489 17689 15498 17698 70 - l 15499 17700 15598 17799 63 - l 15599 17801 15605 17807 100 - l 15607 17808 15655 17856 67 - l 15657 17857 15697 17897 56 - l 15700 17898 15725 17923 65 - l 15726 17925 15804 18003 63 - l 15805 18006 15919 18120 66 - l 15922 18121 16096 18295 62 - l 16103 18296 16122 18315 60 - l 16134 18316 16157 18339 67 - l 16158 18342 16406 18590 59 - l 16408 18591 16439 18622 72 - l 16440 18624 16495 18679 61 - l 16497 18680 16572 18755 54 - l 16573 18760 16613 18800 61 - l 16616 18801 16639 18824 67 - l 16642 18825 16682 18865 66 - l 16703 18866 16728 18891 54 - l 16732 18892 16841 19001 57 - l 16844 19002 16875 19033 72 - l 16887 19034 17024 19171 58 - l 17030 19172 17042 19184 77 - l 17044 19185 17052 19193 78 - l 17057 19194 17088 19225 53 - l 17093 19226 17101 19234 78 - l 17106 19235 17118 19247 62 - l 17126 19248 17152 19274 78 - l 17153 19276 17164 19287 67 - l 17166 19288 17184 19306 58 - l 17185 19311 17244 19370 58 - l 17246 19371 17306 19431 62 - l 17307 19435 17327 19455 57 - l 17328 19462 17352 19486 68 - l 17353 19489 17362 19498 80 - l 17364 19499 17445 19580 56 - l 17448 19581 17464 19597 59 - l 17475 19598 17503 19626 59 - l 17504 19630 17510 19636 71 - l 17511 19638 17594 19721 62 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 2 -} -h { - "> cat" - "> pig2" -} -a { - s 4637 - b 4901 17469 - e 5024 17593 - l 4901 17469 4924 17492 71 - l 4925 17494 5024 17593 68 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 2 -} -h { - "> cat" - "> pig2 (reverse 
complement)" -} -a { - s 10451 - b 86 3933 - e 717 4606 - l 86 3933 116 3963 71 - l 117 3969 130 3982 71 - l 135 3983 165 4013 55 - l 169 4014 181 4026 54 - l 182 4037 344 4199 64 - l 347 4200 359 4212 46 - l 361 4213 414 4266 59 - l 415 4274 499 4358 59 - l 500 4360 511 4371 92 - l 514 4372 520 4378 71 - l 521 4380 555 4414 66 - l 559 4415 565 4421 71 - l 568 4422 577 4431 50 - l 581 4432 608 4459 64 - l 609 4494 613 4498 100 - l 614 4500 631 4517 56 - l 632 4521 717 4606 56 -} -a { - s 48163 - b 6629 14413 - e 9280 17100 - l 6629 14413 6645 14429 88 - l 6647 14430 6669 14452 61 - l 6680 14453 6749 14522 46 - l 6750 14524 6872 14646 54 - l 6873 14648 6895 14670 61 - l 6898 14671 6915 14688 72 - l 6917 14689 7044 14816 55 - l 7048 14817 7095 14864 65 - l 7099 14865 7113 14879 67 - l 7114 14882 7257 15025 63 - l 7260 15026 7289 15055 60 - l 7290 15059 7299 15068 60 - l 7300 15070 7374 15144 60 - l 7394 15145 7402 15153 56 - l 7404 15154 7454 15204 61 - l 7460 15205 7503 15248 66 - l 7504 15252 7506 15254 100 - l 7509 15255 7521 15267 77 - l 7526 15268 7550 15292 56 - l 7553 15293 7604 15344 62 - l 7605 15347 7624 15366 60 - l 7625 15369 7654 15398 70 - l 7657 15399 7753 15495 64 - l 7754 15497 7838 15581 58 - l 7840 15582 7899 15641 67 - l 7900 15648 7949 15697 46 - l 7950 15704 7953 15707 100 - l 7954 15713 7968 15727 73 - l 7969 15732 7975 15738 86 - l 7976 15786 7986 15796 64 - l 7987 15818 8014 15845 50 - l 8015 15861 8060 15906 74 - l 8063 15907 8146 15990 58 - l 8147 15992 8151 15996 80 - l 8154 15997 8175 16018 64 - l 8176 16022 8184 16030 78 - l 8190 16031 8196 16037 71 - l 8202 16038 8226 16062 72 - l 8227 16064 8257 16094 71 - l 8261 16095 8335 16169 61 - l 8339 16170 8392 16223 56 - l 8394 16224 8403 16233 50 - l 8404 16235 8411 16242 88 - l 8413 16243 8551 16381 61 - l 8555 16382 8576 16403 68 - l 8577 16405 8776 16604 56 - l 8778 16605 8811 16638 47 - l 8815 16639 8832 16656 78 - l 8839 16657 8858 16676 55 - l 8860 16677 8879 16696 65 - l 8880 16699 
8943 16762 63 - l 8944 16764 8984 16804 59 - l 8985 16808 9065 16888 53 - l 9067 16889 9222 17044 59 - l 9224 17045 9260 17081 81 - l 9263 17082 9267 17086 80 - l 9268 17088 9280 17100 69 -} -a { - s 14884 - b 10825 19810 - e 11394 20369 - l 10825 19810 10907 19892 65 - l 10908 19894 10950 19936 77 - l 10951 19938 11029 20016 67 - l 11030 20019 11065 20054 58 - l 11066 20060 11156 20150 57 - l 11169 20151 11211 20193 53 - l 11219 20194 11394 20369 61 -} -a { - s 74814 - b 13866 7543 - e 17595 11202 - l 13866 7543 13901 7578 67 - l 13902 7584 13915 7597 50 - l 13916 7607 13959 7650 55 - l 13960 7653 14141 7834 59 - l 14148 7835 14159 7846 75 - l 14160 7851 14176 7867 65 - l 14179 7868 14189 7878 73 - l 14190 7883 14416 8109 58 - l 14421 8110 14540 8229 61 - l 14541 8238 14606 8303 65 - l 14607 8309 14635 8337 52 - l 14645 8338 14716 8409 61 - l 14717 8411 14742 8436 46 - l 14757 8437 14764 8444 63 - l 14765 8446 14889 8570 53 - l 14890 8572 14899 8581 50 - l 14900 8583 14936 8619 68 - l 14937 8622 14994 8679 60 - l 14995 8683 15026 8714 72 - l 15027 8716 15044 8733 61 - l 15048 8734 15106 8792 68 - l 15107 8799 15112 8804 100 - l 15119 8805 15202 8888 55 - l 15207 8889 15216 8898 60 - l 15217 8900 15335 9018 65 - l 15340 9019 15357 9036 56 - l 15361 9037 15490 9166 61 - l 15491 9172 15657 9338 57 - l 15659 9339 15697 9377 67 - l 15700 9378 15725 9403 42 - l 15726 9405 15809 9488 63 - l 15810 9491 15924 9605 58 - l 15926 9606 15931 9611 67 - l 15933 9612 16095 9774 63 - l 16096 9776 16114 9794 58 - l 16133 9795 16166 9828 59 - l 16167 9831 16406 10070 62 - l 16408 10071 16440 10103 58 - l 16441 10105 16496 10160 50 - l 16498 10161 16567 10230 57 - l 16568 10235 16612 10279 69 - l 16617 10280 16690 10353 59 - l 16692 10354 16710 10372 63 - l 16733 10373 16841 10481 59 - l 16844 10482 16878 10516 63 - l 16890 10517 17027 10654 50 - l 17028 10660 17035 10667 75 - l 17036 10669 17046 10679 64 - l 17050 10680 17072 10702 61 - l 17078 10703 17093 10718 50 - l 17117 10719 
17178 10780 60 - l 17179 10785 17253 10859 59 - l 17255 10860 17306 10911 75 - l 17307 10915 17328 10936 73 - l 17329 10943 17350 10964 45 - l 17351 10966 17448 11063 58 - l 17457 11064 17480 11087 63 - l 17485 11088 17505 11108 62 - l 17506 11113 17595 11202 74 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 3 -} -h { - "> cat" - "> pig3" -} -a { - s 7835 - b 4884 18350 - e 5171 18597 - l 4884 18350 4899 18365 75 - l 4900 18368 4932 18400 76 - l 4933 18402 5024 18493 75 - l 5027 18494 5035 18502 89 - l 5081 18503 5116 18538 64 - l 5117 18543 5171 18597 62 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa-" 1 22929 1 3 -} -h { - "> cat" - "> pig3 (reverse complement)" -} -a { - s 49940 - b 6629 17053 - e 9260 19721 - l 6629 17053 6645 17069 65 - l 6647 17070 6664 17087 72 - l 6665 17089 6697 17121 73 - l 6709 17122 6749 17162 51 - l 6750 17164 6872 17286 55 - l 6873 17288 6906 17321 53 - l 6910 17322 7037 17449 63 - l 7041 17450 7111 17520 62 - l 7113 17521 7257 17665 59 - l 7260 17666 7292 17698 58 - l 7293 17703 7379 17789 66 - l 7400 17790 7449 17839 46 - l 7455 17840 7490 17875 64 - l 7493 17876 7522 17905 50 - l 7524 17906 7565 17947 48 - l 7568 17948 7602 17982 51 - l 7603 17985 7622 18004 60 - l 7623 18007 7652 18036 70 - l 7655 18037 7749 18131 63 - l 7750 18133 7838 18221 54 - l 7840 18222 7883 18265 64 - l 7885 18266 7894 18275 70 - l 7895 18277 7916 18298 55 - l 7917 18305 7950 18338 71 - l 7951 18411 7961 18421 64 - l 7962 18438 8001 18477 53 - l 8002 18479 8012 18489 55 - l 8013 18499 8053 18539 61 - l 8056 18540 8146 18630 57 - l 8147 18632 8151 18636 80 - l 8154 18637 8181 18664 68 - l 8184 18665 8196 18677 54 - l 8202 18678 8226 18702 64 - l 8227 18704 8239 18716 69 - l 8240 18718 8253 18731 64 - l 8258 18732 8335 18809 62 - l 8339 18810 8386 18857 63 - l 8388 18858 8404 18874 76 - l 8405 18876 8411 18882 71 - l 8413 18883 8552 19022 65 - l 8556 19023 8575 19042 70 
- l 8576 19044 8776 19244 63 - l 8778 19245 8809 19276 69 - l 8813 19277 8832 19296 70 - l 8839 19297 8879 19337 54 - l 8880 19339 8942 19401 62 - l 8943 19403 8985 19445 56 - l 8986 19449 9062 19525 62 - l 9064 19526 9222 19684 55 - l 9224 19685 9260 19721 76 -} -a { - s 10019 - b 9281 20477 - e 9775 20993 - l 9281 20477 9291 20487 91 - l 9293 20488 9361 20556 61 - l 9362 20566 9389 20593 54 - l 9390 20599 9449 20658 62 - l 9450 20676 9503 20729 59 - l 9504 20732 9546 20774 65 - l 9557 20775 9579 20797 61 - l 9582 20798 9600 20816 53 - l 9601 20820 9652 20871 60 - l 9654 20872 9775 20993 62 -} -a { - s 74886 - b 13874 10191 - e 17594 13841 - l 13874 10191 13904 10221 68 - l 13905 10227 13913 10235 89 - l 13914 10245 13961 10292 63 - l 13962 10295 14134 10467 60 - l 14136 10468 14143 10475 50 - l 14149 10476 14159 10486 82 - l 14160 10491 14178 10509 68 - l 14181 10510 14199 10528 47 - l 14200 10533 14412 10745 62 - l 14417 10746 14483 10812 64 - l 14484 10814 14497 10827 64 - l 14499 10828 14527 10856 59 - l 14528 10863 14542 10877 60 - l 14543 10880 14572 10909 63 - l 14574 10910 14586 10922 69 - l 14587 10924 14601 10938 67 - l 14602 10943 14617 10958 81 - l 14618 10960 14639 10981 68 - l 14649 10982 14716 11049 56 - l 14735 11050 14771 11086 51 - l 14772 11093 14889 11210 59 - l 14890 11212 14899 11221 60 - l 14900 11223 14936 11259 70 - l 14937 11262 14989 11314 60 - l 14990 11318 15031 11359 67 - l 15033 11360 15046 11373 64 - l 15048 11374 15205 11531 63 - l 15209 11532 15338 11661 58 - l 15340 11662 15352 11674 77 - l 15359 11675 15487 11803 56 - l 15488 11809 15657 11978 59 - l 15659 11979 15697 12017 72 - l 15700 12018 15725 12043 58 - l 15726 12045 15805 12124 64 - l 15806 12127 15926 12247 61 - l 15929 12248 16104 12423 63 - l 16112 12424 16132 12444 48 - l 16143 12445 16162 12464 60 - l 16163 12467 16406 12710 60 - l 16408 12711 16446 12749 67 - l 16447 12751 16496 12800 68 - l 16498 12801 16567 12870 57 - l 16568 12876 16572 12880 80 - l 16574 12881 
16605 12912 69 - l 16610 12913 16690 12993 69 - l 16714 12994 16840 13120 55 - l 16843 13121 16877 13155 54 - l 16889 13156 17011 13278 63 - l 17012 13282 17016 13286 100 - l 17023 13287 17029 13293 57 - l 17032 13294 17043 13305 83 - l 17076 13306 17093 13323 72 - l 17094 13325 17109 13340 50 - l 17110 13350 17120 13360 73 - l 17121 13363 17184 13426 53 - l 17185 13431 17244 13490 52 - l 17246 13491 17306 13551 61 - l 17307 13555 17316 13564 70 - l 17318 13565 17347 13594 50 - l 17348 13603 17447 13702 57 - l 17456 13703 17474 13721 58 - l 17479 13722 17499 13742 67 - l 17500 13747 17594 13841 65 -} -m { - n 0 -} -#:eof diff --git a/programs/lastz/test_results/base_test.hits.lav b/programs/lastz/test_results/base_test.hits.lav deleted file mode 100644 index 9e8b81a..0000000 --- a/programs/lastz/test_results/base_test.hits.lav +++ /dev/null @@ -1,39305 +0,0 @@ -#:lav -d { - "lastz.v1.03.73 ../test_data/pseudocat.fa ../test_data/pseudopig.fa W=8 T=0 --plus --nogfextend --nogapped - A C G T - 91 -114 -31 -123 - -114 100 -125 -31 - -31 -125 100 -114 - -123 -31 -114 91 - O = 400, E = 30, K = 0, L = 3000, M = 0" -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 1 -} -h { - "> cat" - "> pig1" -} -a { - s 0 - b 17569 17 - e 17576 24 - l 17569 17 17576 24 100 -} -a { - s 0 - b 17570 18 - e 17577 25 - l 17570 18 17577 25 100 -} -a { - s 0 - b 220 26 - e 227 33 - l 220 26 227 33 100 -} -a { - s 0 - b 18757 30 - e 18764 37 - l 18757 30 18764 37 100 -} -a { - s 0 - b 15114 43 - e 15121 50 - l 15114 43 15121 50 100 -} -a { - s 0 - b 4824 45 - e 4831 52 - l 4824 45 4831 52 100 -} -a { - s 0 - b 9456 60 - e 9463 67 - l 9456 60 9463 67 100 -} -a { - s 0 - b 6727 60 - e 6734 67 - l 6727 60 6734 67 100 -} -a { - s 0 - b 14276 61 - e 14283 68 - l 14276 61 14283 68 100 -} -a { - s 0 - b 6728 61 - e 6735 68 - l 6728 61 6735 68 100 -} -a { - s 0 - b 6729 62 - e 6736 69 - l 6729 62 6736 69 100 -} -a { - s 0 - b 4412 65 - e 4419 72 - l 4412 
65 4419 72 100 -} -a { - s 0 - b 15210 66 - e 15217 73 - l 15210 66 15217 73 100 -} -a { - s 0 - b 15211 67 - e 15218 74 - l 15211 67 15218 74 100 -} -a { - s 0 - b 8179 67 - e 8186 74 - l 8179 67 8186 74 100 -} -a { - s 0 - b 17520 69 - e 17527 76 - l 17520 69 17527 76 100 -} -a { - s 0 - b 13889 69 - e 13896 76 - l 13889 69 13896 76 100 -} -a { - s 0 - b 136 71 - e 143 78 - l 136 71 143 78 100 -} -a { - s 0 - b 18536 307 - e 18543 314 - l 18536 307 18543 314 100 -} -a { - s 0 - b 18537 308 - e 18544 315 - l 18537 308 18544 315 100 -} -a { - s 0 - b 13422 310 - e 13429 317 - l 13422 310 13429 317 100 -} -a { - s 0 - b 37 323 - e 44 330 - l 37 323 44 330 100 -} -a { - s 0 - b 6514 334 - e 6521 341 - l 6514 334 6521 341 100 -} -a { - s 0 - b 18427 340 - e 18434 347 - l 18427 340 18434 347 100 -} -a { - s 0 - b 8337 342 - e 8344 349 - l 8337 342 8344 349 100 -} -a { - s 0 - b 8338 343 - e 8345 350 - l 8338 343 8345 350 100 -} -a { - s 0 - b 276 345 - e 283 352 - l 276 345 283 352 100 -} -a { - s 0 - b 18779 350 - e 18786 357 - l 18779 350 18786 357 100 -} -a { - s 0 - b 17286 355 - e 17293 362 - l 17286 355 17293 362 100 -} -a { - s 0 - b 17287 356 - e 17294 363 - l 17287 356 17294 363 100 -} -a { - s 0 - b 17321 366 - e 17328 373 - l 17321 366 17328 373 100 -} -a { - s 0 - b 17322 367 - e 17329 374 - l 17322 367 17329 374 100 -} -a { - s 0 - b 17323 368 - e 17330 375 - l 17323 368 17330 375 100 -} -a { - s 0 - b 8658 377 - e 8665 384 - l 8658 377 8665 384 100 -} -a { - s 0 - b 15336 378 - e 15343 385 - l 15336 378 15343 385 100 -} -a { - s 0 - b 4934 383 - e 4941 390 - l 4934 383 4941 390 100 -} -a { - s 0 - b 11510 392 - e 11517 399 - l 11510 392 11517 399 100 -} -a { - s 0 - b 6767 393 - e 6774 400 - l 6767 393 6774 400 100 -} -a { - s 0 - b 18397 394 - e 18404 401 - l 18397 394 18404 401 100 -} -a { - s 0 - b 15399 397 - e 15406 404 - l 15399 397 15406 404 100 -} -a { - s 0 - b 15400 398 - e 15407 405 - l 15400 398 15407 405 100 -} -a { - s 0 - b 15323 406 - e 
15330 413 - l 15323 406 15330 413 100 -} -a { - s 0 - b 15324 407 - e 15331 414 - l 15324 407 15331 414 100 -} -a { - s 0 - b 10978 407 - e 10985 414 - l 10978 407 10985 414 100 -} -a { - s 0 - b 15325 408 - e 15332 415 - l 15325 408 15332 415 100 -} -a { - s 0 - b 8738 418 - e 8745 425 - l 8738 418 8745 425 100 -} -a { - s 0 - b 13 419 - e 20 426 - l 13 419 20 426 100 -} -a { - s 0 - b 16043 433 - e 16050 440 - l 16043 433 16050 440 100 -} -a { - s 0 - b 8646 434 - e 8653 441 - l 8646 434 8653 441 100 -} -a { - s 0 - b 8647 435 - e 8654 442 - l 8647 435 8654 442 100 -} -a { - s 0 - b 17569 439 - e 17576 446 - l 17569 439 17576 446 100 -} -a { - s 0 - b 6226 988 - e 6233 995 - l 6226 988 6233 995 100 -} -a { - s 0 - b 5174 988 - e 5181 995 - l 5174 988 5181 995 100 -} -a { - s 0 - b 6498 989 - e 6505 996 - l 6498 989 6505 996 100 -} -a { - s 0 - b 6227 989 - e 6234 996 - l 6227 989 6234 996 100 -} -a { - s 0 - b 6499 990 - e 6506 997 - l 6499 990 6506 997 100 -} -a { - s 0 - b 14101 991 - e 14108 998 - l 14101 991 14108 998 100 -} -a { - s 0 - b 7411 1004 - e 7418 1011 - l 7411 1004 7418 1011 100 -} -a { - s 0 - b 3233 1008 - e 3240 1015 - l 3233 1008 3240 1015 100 -} -a { - s 0 - b 7683 1022 - e 7690 1029 - l 7683 1022 7690 1029 100 -} -a { - s 0 - b 4409 1032 - e 4416 1039 - l 4409 1032 4416 1039 100 -} -a { - s 0 - b 18665 1034 - e 18672 1041 - l 18665 1034 18672 1041 100 -} -a { - s 0 - b 17995 1044 - e 18002 1051 - l 17995 1044 18002 1051 100 -} -a { - s 0 - b 14791 1045 - e 14798 1052 - l 14791 1045 14798 1052 100 -} -a { - s 0 - b 7973 1046 - e 7980 1053 - l 7973 1046 7980 1053 100 -} -a { - s 0 - b 5160 1060 - e 5167 1067 - l 5160 1060 5167 1067 100 -} -a { - s 0 - b 6232 1064 - e 6239 1071 - l 6232 1064 6239 1071 100 -} -a { - s 0 - b 6233 1065 - e 6240 1072 - l 6233 1065 6240 1072 100 -} -a { - s 0 - b 15220 3176 - e 15227 3183 - l 15220 3176 15227 3183 100 -} -a { - s 0 - b 15221 3177 - e 15228 3184 - l 15221 3177 15228 3184 100 -} -a { - s 0 - b 6827 
3177 - e 6834 3184 - l 6827 3177 6834 3184 100 -} -a { - s 0 - b 3552 3177 - e 3559 3184 - l 3552 3177 3559 3184 100 -} -a { - s 0 - b 3553 3178 - e 3560 3185 - l 3553 3178 3560 3185 100 -} -a { - s 0 - b 7780 3190 - e 7787 3197 - l 7780 3190 7787 3197 100 -} -a { - s 0 - b 18784 3209 - e 18791 3216 - l 18784 3209 18791 3216 100 -} -a { - s 0 - b 6633 3211 - e 6640 3218 - l 6633 3211 6640 3218 100 -} -a { - s 0 - b 6634 3212 - e 6641 3219 - l 6634 3212 6641 3219 100 -} -a { - s 0 - b 3338 3248 - e 3345 3255 - l 3338 3248 3345 3255 100 -} -a { - s 0 - b 2958 3263 - e 2965 3270 - l 2958 3263 2965 3270 100 -} -a { - s 0 - b 18600 3273 - e 18607 3280 - l 18600 3273 18607 3280 100 -} -a { - s 0 - b 8554 3273 - e 8561 3280 - l 8554 3273 8561 3280 100 -} -a { - s 0 - b 4031 3296 - e 4038 3303 - l 4031 3296 4038 3303 100 -} -a { - s 0 - b 11317 3301 - e 11324 3308 - l 11317 3301 11324 3308 100 -} -a { - s 0 - b 7422 3301 - e 7429 3308 - l 7422 3301 7429 3308 100 -} -a { - s 0 - b 7423 3302 - e 7430 3309 - l 7423 3302 7430 3309 100 -} -a { - s 0 - b 14567 3303 - e 14574 3310 - l 14567 3303 14574 3310 100 -} -a { - s 0 - b 14568 3304 - e 14575 3311 - l 14568 3304 14575 3311 100 -} -a { - s 0 - b 4013 3305 - e 4020 3312 - l 4013 3305 4020 3312 100 -} -a { - s 0 - b 14885 3310 - e 14892 3317 - l 14885 3310 14892 3317 100 -} -a { - s 0 - b 6607 3310 - e 6614 3317 - l 6607 3310 6614 3317 100 -} -a { - s 0 - b 14601 3311 - e 14608 3318 - l 14601 3311 14608 3318 100 -} -a { - s 0 - b 6608 3311 - e 6615 3318 - l 6608 3311 6615 3318 100 -} -a { - s 0 - b 298 3337 - e 305 3344 - l 298 3337 305 3344 100 -} -a { - s 0 - b 299 3338 - e 306 3345 - l 299 3338 306 3345 100 -} -a { - s 0 - b 4983 3348 - e 4990 3355 - l 4983 3348 4990 3355 100 -} -a { - s 0 - b 4984 3349 - e 4991 3356 - l 4984 3349 4991 3356 100 -} -a { - s 0 - b 15912 3371 - e 15919 3378 - l 15912 3371 15919 3378 100 -} -a { - s 0 - b 15913 3372 - e 15920 3379 - l 15913 3372 15920 3379 100 -} -a { - s 0 - b 17280 3390 - e 
17287 3397 - l 17280 3390 17287 3397 100 -} -a { - s 0 - b 18528 3392 - e 18535 3399 - l 18528 3392 18535 3399 100 -} -a { - s 0 - b 15142 3394 - e 15149 3401 - l 15142 3394 15149 3401 100 -} -a { - s 0 - b 14805 3397 - e 14812 3404 - l 14805 3397 14812 3404 100 -} -a { - s 0 - b 14806 3398 - e 14813 3405 - l 14806 3398 14813 3405 100 -} -a { - s 0 - b 14807 3399 - e 14814 3406 - l 14807 3399 14814 3406 100 -} -a { - s 0 - b 18098 3406 - e 18105 3413 - l 18098 3406 18105 3413 100 -} -a { - s 0 - b 4291 3408 - e 4298 3415 - l 4291 3408 4298 3415 100 -} -a { - s 0 - b 4292 3409 - e 4299 3416 - l 4292 3409 4299 3416 100 -} -a { - s 0 - b 18426 3416 - e 18433 3423 - l 18426 3416 18433 3423 100 -} -a { - s 0 - b 18427 3417 - e 18434 3424 - l 18427 3417 18434 3424 100 -} -a { - s 0 - b 3400 3423 - e 3407 3430 - l 3400 3423 3407 3430 100 -} -a { - s 0 - b 18744 3432 - e 18751 3439 - l 18744 3432 18751 3439 100 -} -a { - s 0 - b 10950 3434 - e 10957 3441 - l 10950 3434 10957 3441 100 -} -a { - s 0 - b 10951 3435 - e 10958 3442 - l 10951 3435 10958 3442 100 -} -a { - s 0 - b 9722 3435 - e 9729 3442 - l 9722 3435 9729 3442 100 -} -a { - s 0 - b 11507 3436 - e 11514 3443 - l 11507 3436 11514 3443 100 -} -a { - s 0 - b 9723 3436 - e 9730 3443 - l 9723 3436 9730 3443 100 -} -a { - s 0 - b 15962 3437 - e 15969 3444 - l 15962 3437 15969 3444 100 -} -a { - s 0 - b 14351 3437 - e 14358 3444 - l 14351 3437 14358 3444 100 -} -a { - s 0 - b 9724 3437 - e 9731 3444 - l 9724 3437 9731 3444 100 -} -a { - s 0 - b 17268 3440 - e 17275 3447 - l 17268 3440 17275 3447 100 -} -a { - s 0 - b 8833 3449 - e 8840 3456 - l 8833 3449 8840 3456 100 -} -a { - s 0 - b 18459 3453 - e 18466 3460 - l 18459 3453 18466 3460 100 -} -a { - s 0 - b 8486 3471 - e 8493 3478 - l 8486 3471 8493 3478 100 -} -a { - s 0 - b 11149 3475 - e 11156 3482 - l 11149 3475 11156 3482 100 -} -a { - s 0 - b 18690 3486 - e 18697 3493 - l 18690 3486 18697 3493 100 -} -a { - s 0 - b 3725 3491 - e 3732 3498 - l 3725 3491 3732 3498 
100 -} -a { - s 0 - b 3726 3492 - e 3733 3499 - l 3726 3492 3733 3499 100 -} -a { - s 0 - b 13934 3496 - e 13941 3503 - l 13934 3496 13941 3503 100 -} -a { - s 0 - b 13935 3497 - e 13942 3504 - l 13935 3497 13942 3504 100 -} -a { - s 0 - b 8383 3504 - e 8390 3511 - l 8383 3504 8390 3511 100 -} -a { - s 0 - b 17374 3507 - e 17381 3514 - l 17374 3507 17381 3514 100 -} -a { - s 0 - b 5148 3520 - e 5155 3527 - l 5148 3520 5155 3527 100 -} -a { - s 0 - b 4949 3523 - e 4956 3530 - l 4949 3523 4956 3530 100 -} -a { - s 0 - b 12617 3535 - e 12624 3542 - l 12617 3535 12624 3542 100 -} -a { - s 0 - b 12325 3540 - e 12332 3547 - l 12325 3540 12332 3547 100 -} -a { - s 0 - b 8912 3549 - e 8919 3556 - l 8912 3549 8919 3556 100 -} -a { - s 0 - b 4100 3585 - e 4107 3592 - l 4100 3585 4107 3592 100 -} -a { - s 0 - b 13483 3591 - e 13490 3598 - l 13483 3591 13490 3598 100 -} -a { - s 0 - b 6711 3596 - e 6718 3603 - l 6711 3596 6718 3603 100 -} -a { - s 0 - b 6712 3597 - e 6719 3604 - l 6712 3597 6719 3604 100 -} -a { - s 0 - b 6713 3598 - e 6720 3605 - l 6713 3598 6720 3605 100 -} -a { - s 0 - b 17177 3599 - e 17184 3606 - l 17177 3599 17184 3606 100 -} -a { - s 0 - b 17178 3600 - e 17185 3607 - l 17178 3600 17185 3607 100 -} -a { - s 0 - b 5053 3605 - e 5060 3612 - l 5053 3605 5060 3612 100 -} -a { - s 0 - b 5054 3606 - e 5061 3613 - l 5054 3606 5061 3613 100 -} -a { - s 0 - b 18657 3617 - e 18664 3624 - l 18657 3617 18664 3624 100 -} -a { - s 0 - b 7513 3631 - e 7520 3638 - l 7513 3631 7520 3638 100 -} -a { - s 0 - b 14835 3638 - e 14842 3645 - l 14835 3638 14842 3645 100 -} -a { - s 0 - b 14752 3649 - e 14759 3656 - l 14752 3649 14759 3656 100 -} -a { - s 0 - b 14753 3650 - e 14760 3657 - l 14753 3650 14760 3657 100 -} -a { - s 0 - b 13459 4592 - e 13466 4599 - l 13459 4592 13466 4599 100 -} -a { - s 0 - b 14675 4593 - e 14682 4600 - l 14675 4593 14682 4600 100 -} -a { - s 0 - b 9622 4597 - e 9629 4604 - l 9622 4597 9629 4604 100 -} -a { - s 0 - b 11476 4599 - e 11483 4606 - l 
11476 4599 11483 4606 100 -} -a { - s 0 - b 8663 4604 - e 8670 4611 - l 8663 4604 8670 4611 100 -} -a { - s 0 - b 7398 4604 - e 7405 4611 - l 7398 4604 7405 4611 100 -} -a { - s 0 - b 8664 4605 - e 8671 4612 - l 8664 4605 8671 4612 100 -} -a { - s 0 - b 14481 4606 - e 14488 4613 - l 14481 4606 14488 4613 100 -} -a { - s 0 - b 18390 4626 - e 18397 4633 - l 18390 4626 18397 4633 100 -} -a { - s 0 - b 6903 4639 - e 6910 4646 - l 6903 4639 6910 4646 100 -} -a { - s 0 - b 3087 4641 - e 3094 4648 - l 3087 4641 3094 4648 100 -} -a { - s 0 - b 4811 4649 - e 4818 4656 - l 4811 4649 4818 4656 100 -} -a { - s 0 - b 13886 4658 - e 13893 4665 - l 13886 4658 13893 4665 100 -} -a { - s 0 - b 13887 4659 - e 13894 4666 - l 13887 4659 13894 4666 100 -} -a { - s 0 - b 13888 4660 - e 13895 4667 - l 13888 4660 13895 4667 100 -} -a { - s 0 - b 12144 4670 - e 12151 4677 - l 12144 4670 12151 4677 100 -} -a { - s 0 - b 3578 4673 - e 3585 4680 - l 3578 4673 3585 4680 100 -} -a { - s 0 - b 3579 4674 - e 3586 4681 - l 3579 4674 3586 4681 100 -} -a { - s 0 - b 3366 4675 - e 3373 4682 - l 3366 4675 3373 4682 100 -} -a { - s 0 - b 14390 4676 - e 14397 4683 - l 14390 4676 14397 4683 100 -} -a { - s 0 - b 137 4677 - e 144 4684 - l 137 4677 144 4684 100 -} -a { - s 0 - b 10975 4678 - e 10982 4685 - l 10975 4678 10982 4685 100 -} -a { - s 0 - b 111 4679 - e 118 4686 - l 111 4679 118 4686 100 -} -a { - s 0 - b 9262 4695 - e 9269 4702 - l 9262 4695 9269 4702 100 -} -a { - s 0 - b 8473 4699 - e 8480 4706 - l 8473 4699 8480 4706 100 -} -a { - s 0 - b 10926 4700 - e 10933 4707 - l 10926 4700 10933 4707 100 -} -a { - s 0 - b 8474 4700 - e 8481 4707 - l 8474 4700 8481 4707 100 -} -a { - s 0 - b 11378 4705 - e 11385 4712 - l 11378 4705 11385 4712 100 -} -a { - s 0 - b 8210 4707 - e 8217 4714 - l 8210 4707 8217 4714 100 -} -a { - s 0 - b 8061 4711 - e 8068 4718 - l 8061 4711 8068 4718 100 -} -a { - s 0 - b 14582 4719 - e 14589 4726 - l 14582 4719 14589 4726 100 -} -a { - s 0 - b 10849 4721 - e 10856 4728 - l 
10849 4721 10856 4728 100 -} -a { - s 0 - b 15276 4722 - e 15283 4729 - l 15276 4722 15283 4729 100 -} -a { - s 0 - b 3672 4722 - e 3679 4729 - l 3672 4722 3679 4729 100 -} -a { - s 0 - b 8617 4732 - e 8624 4739 - l 8617 4732 8624 4739 100 -} -a { - s 0 - b 6771 4739 - e 6778 4746 - l 6771 4739 6778 4746 100 -} -a { - s 0 - b 6772 4740 - e 6779 4747 - l 6772 4740 6779 4747 100 -} -a { - s 0 - b 13977 4745 - e 13984 4752 - l 13977 4745 13984 4752 100 -} -a { - s 0 - b 3058 4748 - e 3065 4755 - l 3058 4748 3065 4755 100 -} -a { - s 0 - b 3059 4749 - e 3066 4756 - l 3059 4749 3066 4756 100 -} -a { - s 0 - b 3060 4750 - e 3067 4757 - l 3060 4750 3067 4757 100 -} -a { - s 0 - b 15316 4753 - e 15323 4760 - l 15316 4753 15323 4760 100 -} -a { - s 0 - b 3204 4767 - e 3211 4774 - l 3204 4767 3211 4774 100 -} -a { - s 0 - b 14159 4768 - e 14166 4775 - l 14159 4768 14166 4775 100 -} -a { - s 0 - b 14160 4769 - e 14167 4776 - l 14160 4769 14167 4776 100 -} -a { - s 0 - b 14161 4770 - e 14168 4777 - l 14161 4770 14168 4777 100 -} -a { - s 0 - b 15315 4773 - e 15322 4780 - l 15315 4773 15322 4780 100 -} -a { - s 0 - b 15316 4774 - e 15323 4781 - l 15316 4774 15323 4781 100 -} -a { - s 0 - b 17148 4775 - e 17155 4782 - l 17148 4775 17155 4782 100 -} -a { - s 0 - b 14825 4777 - e 14832 4784 - l 14825 4777 14832 4784 100 -} -a { - s 0 - b 11569 4777 - e 11576 4784 - l 11569 4777 11576 4784 100 -} -a { - s 0 - b 11294 4779 - e 11301 4786 - l 11294 4779 11301 4786 100 -} -a { - s 0 - b 7750 4790 - e 7757 4797 - l 7750 4790 7757 4797 100 -} -a { - s 0 - b 7639 4810 - e 7646 4817 - l 7639 4810 7646 4817 100 -} -a { - s 0 - b 4052 4817 - e 4059 4824 - l 4052 4817 4059 4824 100 -} -a { - s 0 - b 6722 4821 - e 6729 4828 - l 6722 4821 6729 4828 100 -} -a { - s 0 - b 6723 4822 - e 6730 4829 - l 6723 4822 6730 4829 100 -} -a { - s 0 - b 11469 5327 - e 11476 5334 - l 11469 5327 11476 5334 100 -} -a { - s 0 - b 14221 5329 - e 14228 5336 - l 14221 5329 14228 5336 100 -} -a { - s 0 - b 9492 5355 
- e 9499 5362 - l 9492 5355 9499 5362 100 -} -a { - s 0 - b 7550 5359 - e 7557 5366 - l 7550 5359 7557 5366 100 -} -a { - s 0 - b 7551 5360 - e 7558 5367 - l 7551 5360 7558 5367 100 -} -a { - s 0 - b 3968 5376 - e 3975 5383 - l 3968 5376 3975 5383 100 -} -a { - s 0 - b 8274 5419 - e 8281 5426 - l 8274 5419 8281 5426 100 -} -a { - s 0 - b 8275 5420 - e 8282 5427 - l 8275 5420 8282 5427 100 -} -a { - s 0 - b 3409 5431 - e 3416 5438 - l 3409 5431 3416 5438 100 -} -a { - s 0 - b 6649 5437 - e 6656 5444 - l 6649 5437 6656 5444 100 -} -a { - s 0 - b 14571 5439 - e 14578 5446 - l 14571 5439 14578 5446 100 -} -a { - s 0 - b 17170 5441 - e 17177 5448 - l 17170 5441 17177 5448 100 -} -a { - s 0 - b 8548 5443 - e 8555 5450 - l 8548 5443 8555 5450 100 -} -a { - s 0 - b 18020 5444 - e 18027 5451 - l 18020 5444 18027 5451 100 -} -a { - s 0 - b 8549 5444 - e 8556 5451 - l 8549 5444 8556 5451 100 -} -a { - s 0 - b 18021 5445 - e 18028 5452 - l 18021 5445 18028 5452 100 -} -a { - s 0 - b 18022 5446 - e 18029 5453 - l 18022 5446 18029 5453 100 -} -a { - s 0 - b 18023 5447 - e 18030 5454 - l 18023 5447 18030 5454 100 -} -a { - s 0 - b 4873 5447 - e 4880 5454 - l 4873 5447 4880 5454 100 -} -a { - s 0 - b 8856 5474 - e 8863 5481 - l 8856 5474 8863 5481 100 -} -a { - s 0 - b 8540 5481 - e 8547 5488 - l 8540 5481 8547 5488 100 -} -a { - s 0 - b 14230 5487 - e 14237 5494 - l 14230 5487 14237 5494 100 -} -a { - s 0 - b 6650 5493 - e 6657 5500 - l 6650 5493 6657 5500 100 -} -a { - s 0 - b 13972 5494 - e 13979 5501 - l 13972 5494 13979 5501 100 -} -a { - s 0 - b 13973 5495 - e 13980 5502 - l 13973 5495 13980 5502 100 -} -a { - s 0 - b 18582 5498 - e 18589 5505 - l 18582 5498 18589 5505 100 -} -a { - s 0 - b 6855 5499 - e 6862 5506 - l 6855 5499 6862 5506 100 -} -a { - s 0 - b 8387 5521 - e 8394 5528 - l 8387 5521 8394 5528 100 -} -a { - s 0 - b 6331 5521 - e 6338 5528 - l 6331 5521 6338 5528 100 -} -a { - s 0 - b 11470 5522 - e 11477 5529 - l 11470 5522 11477 5529 100 -} -a { - s 0 - b 6537 
5522 - e 6544 5529 - l 6537 5522 6544 5529 100 -} -a { - s 0 - b 6332 5522 - e 6339 5529 - l 6332 5522 6339 5529 100 -} -a { - s 0 - b 11471 5523 - e 11478 5530 - l 11471 5523 11478 5530 100 -} -a { - s 0 - b 6333 5523 - e 6340 5530 - l 6333 5523 6340 5530 100 -} -a { - s 0 - b 9715 5524 - e 9722 5531 - l 9715 5524 9722 5531 100 -} -a { - s 0 - b 6334 5524 - e 6341 5531 - l 6334 5524 6341 5531 100 -} -a { - s 0 - b 5668 5539 - e 5675 5546 - l 5668 5539 5675 5546 100 -} -a { - s 0 - b 9713 5542 - e 9720 5549 - l 9713 5542 9720 5549 100 -} -a { - s 0 - b 8976 5549 - e 8983 5556 - l 8976 5549 8983 5556 100 -} -a { - s 0 - b 8977 5550 - e 8984 5557 - l 8977 5550 8984 5557 100 -} -a { - s 0 - b 13466 5573 - e 13473 5580 - l 13466 5573 13473 5580 100 -} -a { - s 0 - b 8598 5581 - e 8605 5588 - l 8598 5581 8605 5588 100 -} -a { - s 0 - b 9069 5598 - e 9076 5605 - l 9069 5598 9076 5605 100 -} -a { - s 0 - b 3176 5598 - e 3183 5605 - l 3176 5598 3183 5605 100 -} -a { - s 0 - b 3177 5599 - e 3184 5606 - l 3177 5599 3184 5606 100 -} -a { - s 0 - b 8466 5605 - e 8473 5612 - l 8466 5605 8473 5612 100 -} -a { - s 0 - b 4375 5759 - e 4382 5766 - l 4375 5759 4382 5766 100 -} -a { - s 0 - b 3980 5785 - e 3987 5792 - l 3980 5785 3987 5792 100 -} -a { - s 0 - b 8194 5791 - e 8201 5798 - l 8194 5791 8201 5798 100 -} -a { - s 0 - b 6254 5791 - e 6261 5798 - l 6254 5791 6261 5798 100 -} -a { - s 0 - b 11449 5792 - e 11456 5799 - l 11449 5792 11456 5799 100 -} -a { - s 0 - b 8195 5792 - e 8202 5799 - l 8195 5792 8202 5799 100 -} -a { - s 0 - b 18600 5827 - e 18607 5834 - l 18600 5827 18607 5834 100 -} -a { - s 0 - b 8554 5827 - e 8561 5834 - l 8554 5827 8561 5834 100 -} -a { - s 0 - b 5638 5829 - e 5645 5836 - l 5638 5829 5645 5836 100 -} -a { - s 0 - b 15402 5840 - e 15409 5847 - l 15402 5840 15409 5847 100 -} -a { - s 0 - b 13477 5847 - e 13484 5854 - l 13477 5847 13484 5854 100 -} -a { - s 0 - b 13478 5848 - e 13485 5855 - l 13478 5848 13485 5855 100 -} -a { - s 0 - b 12178 5868 - e 
12185 5875 - l 12178 5868 12185 5875 100 -} -a { - s 0 - b 8817 5868 - e 8824 5875 - l 8817 5868 8824 5875 100 -} -a { - s 0 - b 12179 5869 - e 12186 5876 - l 12179 5869 12186 5876 100 -} -a { - s 0 - b 12180 5870 - e 12187 5877 - l 12180 5870 12187 5877 100 -} -a { - s 0 - b 18638 5876 - e 18645 5883 - l 18638 5876 18645 5883 100 -} -a { - s 0 - b 14100 5878 - e 14107 5885 - l 14100 5878 14107 5885 100 -} -a { - s 0 - b 14101 5879 - e 14108 5886 - l 14101 5879 14108 5886 100 -} -a { - s 0 - b 11078 5880 - e 11085 5887 - l 11078 5880 11085 5887 100 -} -a { - s 0 - b 12149 5887 - e 12156 5894 - l 12149 5887 12156 5894 100 -} -a { - s 0 - b 9441 5892 - e 9448 5899 - l 9441 5892 9448 5899 100 -} -a { - s 0 - b 17464 5905 - e 17471 5912 - l 17464 5905 17471 5912 100 -} -a { - s 0 - b 17465 5906 - e 17472 5913 - l 17465 5906 17472 5913 100 -} -a { - s 0 - b 7622 5963 - e 7629 5970 - l 7622 5963 7629 5970 100 -} -a { - s 0 - b 12178 5966 - e 12185 5973 - l 12178 5966 12185 5973 100 -} -a { - s 0 - b 8817 5966 - e 8824 5973 - l 8817 5966 8824 5973 100 -} -a { - s 0 - b 8818 5967 - e 8825 5974 - l 8818 5967 8825 5974 100 -} -a { - s 0 - b 6773 5974 - e 6780 5981 - l 6773 5974 6780 5981 100 -} -a { - s 0 - b 7585 5978 - e 7592 5985 - l 7585 5978 7592 5985 100 -} -a { - s 0 - b 7586 5979 - e 7593 5986 - l 7586 5979 7593 5986 100 -} -a { - s 0 - b 7587 5980 - e 7594 5987 - l 7587 5980 7594 5987 100 -} -a { - s 0 - b 46 5982 - e 53 5989 - l 46 5982 53 5989 100 -} -a { - s 0 - b 11544 5988 - e 11551 5995 - l 11544 5988 11551 5995 100 -} -a { - s 0 - b 6 5988 - e 13 5995 - l 6 5988 13 5995 100 -} -a { - s 0 - b 7 5989 - e 14 5996 - l 7 5989 14 5996 100 -} -a { - s 0 - b 17533 5994 - e 17540 6001 - l 17533 5994 17540 6001 100 -} -a { - s 0 - b 7583 5999 - e 7590 6006 - l 7583 5999 7590 6006 100 -} -a { - s 0 - b 6564 6000 - e 6571 6007 - l 6564 6000 6571 6007 100 -} -a { - s 0 - b 8346 6024 - e 8353 6031 - l 8346 6024 8353 6031 100 -} -a { - s 0 - b 148 6043 - e 155 6050 - l 148 
6043 155 6050 100 -} -a { - s 0 - b 6287 6055 - e 6294 6062 - l 6287 6055 6294 6062 100 -} -a { - s 0 - b 6288 6056 - e 6295 6063 - l 6288 6056 6295 6063 100 -} -a { - s 0 - b 3311 6056 - e 3318 6063 - l 3311 6056 3318 6063 100 -} -a { - s 0 - b 3163 6058 - e 3170 6065 - l 3163 6058 3170 6065 100 -} -a { - s 0 - b 3164 6059 - e 3171 6066 - l 3164 6059 3171 6066 100 -} -a { - s 0 - b 3165 6060 - e 3172 6067 - l 3165 6060 3172 6067 100 -} -a { - s 0 - b 5008 6062 - e 5015 6069 - l 5008 6062 5015 6069 100 -} -a { - s 0 - b 9504 6071 - e 9511 6078 - l 9504 6071 9511 6078 100 -} -a { - s 0 - b 7711 6071 - e 7718 6078 - l 7711 6071 7718 6078 100 -} -a { - s 0 - b 17529 6075 - e 17536 6082 - l 17529 6075 17536 6082 100 -} -a { - s 0 - b 14705 6078 - e 14712 6085 - l 14705 6078 14712 6085 100 -} -a { - s 0 - b 15920 6079 - e 15927 6086 - l 15920 6079 15927 6086 100 -} -a { - s 0 - b 8137 6087 - e 8144 6094 - l 8137 6087 8144 6094 100 -} -a { - s 0 - b 8138 6088 - e 8145 6095 - l 8138 6088 8145 6095 100 -} -a { - s 0 - b 9378 6101 - e 9385 6108 - l 9378 6101 9385 6108 100 -} -a { - s 0 - b 9379 6102 - e 9386 6109 - l 9379 6102 9386 6109 100 -} -a { - s 0 - b 7997 6131 - e 8004 6138 - l 7997 6131 8004 6138 100 -} -a { - s 0 - b 7998 6132 - e 8005 6139 - l 7998 6132 8005 6139 100 -} -a { - s 0 - b 17169 6143 - e 17176 6150 - l 17169 6143 17176 6150 100 -} -a { - s 0 - b 250 6164 - e 257 6171 - l 250 6164 257 6171 100 -} -a { - s 0 - b 13988 6178 - e 13995 6185 - l 13988 6178 13995 6185 100 -} -a { - s 0 - b 14076 6182 - e 14083 6189 - l 14076 6182 14083 6189 100 -} -a { - s 0 - b 17507 6186 - e 17514 6193 - l 17507 6186 17514 6193 100 -} -a { - s 0 - b 10890 6186 - e 10897 6193 - l 10890 6186 10897 6193 100 -} -a { - s 0 - b 12241 6191 - e 12248 6198 - l 12241 6191 12248 6198 100 -} -a { - s 0 - b 8985 6193 - e 8992 6200 - l 8985 6193 8992 6200 100 -} -a { - s 0 - b 15925 6206 - e 15932 6213 - l 15925 6206 15932 6213 100 -} -a { - s 0 - b 8462 6206 - e 8469 6213 - l 8462 6206 
8469 6213 100 -} -a { - s 0 - b 8463 6207 - e 8470 6214 - l 8463 6207 8470 6214 100 -} -a { - s 0 - b 18420 6208 - e 18427 6215 - l 18420 6208 18427 6215 100 -} -a { - s 0 - b 9403 6209 - e 9410 6216 - l 9403 6209 9410 6216 100 -} -a { - s 0 - b 14587 6213 - e 14594 6220 - l 14587 6213 14594 6220 100 -} -a { - s 0 - b 7449 6222 - e 7456 6229 - l 7449 6222 7456 6229 100 -} -a { - s 0 - b 14582 6235 - e 14589 6242 - l 14582 6235 14589 6242 100 -} -a { - s 0 - b 14583 6236 - e 14590 6243 - l 14583 6236 14590 6243 100 -} -a { - s 0 - b 14584 6237 - e 14591 6244 - l 14584 6237 14591 6244 100 -} -a { - s 0 - b 14585 6238 - e 14592 6245 - l 14585 6238 14592 6245 100 -} -a { - s 0 - b 12160 6240 - e 12167 6247 - l 12160 6240 12167 6247 100 -} -a { - s 0 - b 12185 6242 - e 12192 6249 - l 12185 6242 12192 6249 100 -} -a { - s 0 - b 12186 6243 - e 12193 6250 - l 12186 6243 12193 6250 100 -} -a { - s 0 - b 12187 6244 - e 12194 6251 - l 12187 6244 12194 6251 100 -} -a { - s 0 - b 15114 6245 - e 15121 6252 - l 15114 6245 15121 6252 100 -} -a { - s 0 - b 15115 6246 - e 15122 6253 - l 15115 6246 15122 6253 100 -} -a { - s 0 - b 15116 6247 - e 15123 6254 - l 15116 6247 15123 6254 100 -} -a { - s 0 - b 15117 6248 - e 15124 6255 - l 15117 6248 15124 6255 100 -} -a { - s 0 - b 15118 6249 - e 15125 6256 - l 15118 6249 15125 6256 100 -} -a { - s 0 - b 6507 6249 - e 6514 6256 - l 6507 6249 6514 6256 100 -} -a { - s 0 - b 3580 6252 - e 3587 6259 - l 3580 6252 3587 6259 100 -} -a { - s 0 - b 13892 6254 - e 13899 6261 - l 13892 6254 13899 6261 100 -} -a { - s 0 - b 13893 6255 - e 13900 6262 - l 13893 6255 13900 6262 100 -} -a { - s 0 - b 10894 6261 - e 10901 6268 - l 10894 6261 10901 6268 100 -} -a { - s 0 - b 17293 6263 - e 17300 6270 - l 17293 6263 17300 6270 100 -} -a { - s 0 - b 15964 6319 - e 15971 6326 - l 15964 6319 15971 6326 100 -} -a { - s 0 - b 14353 6319 - e 14360 6326 - l 14353 6319 14360 6326 100 -} -a { - s 0 - b 3721 6326 - e 3728 6333 - l 3721 6326 3728 6333 100 -} -a { - s 
0 - b 8997 6327 - e 9004 6334 - l 8997 6327 9004 6334 100 -} -a { - s 0 - b 8998 6328 - e 9005 6335 - l 8998 6328 9005 6335 100 -} -a { - s 0 - b 13530 6336 - e 13537 6343 - l 13530 6336 13537 6343 100 -} -a { - s 0 - b 10903 6336 - e 10910 6343 - l 10903 6336 10910 6343 100 -} -a { - s 0 - b 13531 6337 - e 13538 6344 - l 13531 6337 13538 6344 100 -} -a { - s 0 - b 14574 6349 - e 14581 6356 - l 14574 6349 14581 6356 100 -} -a { - s 0 - b 8151 6360 - e 8158 6367 - l 8151 6360 8158 6367 100 -} -a { - s 0 - b 3234 6371 - e 3241 6378 - l 3234 6371 3241 6378 100 -} -a { - s 0 - b 8017 6372 - e 8024 6379 - l 8017 6372 8024 6379 100 -} -a { - s 0 - b 3903 6372 - e 3910 6379 - l 3903 6372 3910 6379 100 -} -a { - s 0 - b 6401 6373 - e 6408 6380 - l 6401 6373 6408 6380 100 -} -a { - s 0 - b 3904 6373 - e 3911 6380 - l 3904 6373 3911 6380 100 -} -a { - s 0 - b 9359 6378 - e 9366 6385 - l 9359 6378 9366 6385 100 -} -a { - s 0 - b 9360 6379 - e 9367 6386 - l 9360 6379 9367 6386 100 -} -a { - s 0 - b 8931 6397 - e 8938 6404 - l 8931 6397 8938 6404 100 -} -a { - s 0 - b 3993 6405 - e 4000 6412 - l 3993 6405 4000 6412 100 -} -a { - s 0 - b 12166 6406 - e 12173 6413 - l 12166 6406 12173 6413 100 -} -a { - s 0 - b 3383 6408 - e 3390 6415 - l 3383 6408 3390 6415 100 -} -a { - s 0 - b 3384 6409 - e 3391 6416 - l 3384 6409 3391 6416 100 -} -a { - s 0 - b 3385 6410 - e 3392 6417 - l 3385 6410 3392 6417 100 -} -a { - s 0 - b 3386 6411 - e 3393 6418 - l 3386 6411 3393 6418 100 -} -a { - s 0 - b 3387 6412 - e 3394 6419 - l 3387 6412 3394 6419 100 -} -a { - s 0 - b 15399 6424 - e 15406 6431 - l 15399 6424 15406 6431 100 -} -a { - s 0 - b 56 6434 - e 63 6441 - l 56 6434 63 6441 100 -} -a { - s 0 - b 17279 6465 - e 17286 6472 - l 17279 6465 17286 6472 100 -} -a { - s 0 - b 283 6472 - e 290 6479 - l 283 6472 290 6479 100 -} -a { - s 0 - b 4998 6477 - e 5005 6484 - l 4998 6477 5005 6484 100 -} -a { - s 0 - b 16102 6480 - e 16109 6487 - l 16102 6480 16109 6487 100 -} -a { - s 0 - b 8016 6493 - e 
8023 6500 - l 8016 6493 8023 6500 100 -} -a { - s 0 - b 9508 6495 - e 9515 6502 - l 9508 6495 9515 6502 100 -} -a { - s 0 - b 9509 6496 - e 9516 6503 - l 9509 6496 9516 6503 100 -} -a { - s 0 - b 9290 6497 - e 9297 6504 - l 9290 6497 9297 6504 100 -} -a { - s 0 - b 8728 6505 - e 8735 6512 - l 8728 6505 8735 6512 100 -} -a { - s 0 - b 13937 6521 - e 13944 6528 - l 13937 6521 13944 6528 100 -} -a { - s 0 - b 13909 6534 - e 13916 6541 - l 13909 6534 13916 6541 100 -} -a { - s 0 - b 6523 6543 - e 6530 6550 - l 6523 6543 6530 6550 100 -} -a { - s 0 - b 18577 6548 - e 18584 6555 - l 18577 6548 18584 6555 100 -} -a { - s 0 - b 18578 6549 - e 18585 6556 - l 18578 6549 18585 6556 100 -} -a { - s 0 - b 2991 6551 - e 2998 6558 - l 2991 6551 2998 6558 100 -} -a { - s 0 - b 14875 6559 - e 14882 6566 - l 14875 6559 14882 6566 100 -} -a { - s 0 - b 16054 6560 - e 16061 6567 - l 16054 6560 16061 6567 100 -} -a { - s 0 - b 3686 6560 - e 3693 6567 - l 3686 6560 3693 6567 100 -} -a { - s 0 - b 16 6560 - e 23 6567 - l 16 6560 23 6567 100 -} -a { - s 0 - b 18027 6584 - e 18034 6591 - l 18027 6584 18034 6591 100 -} -a { - s 0 - b 18028 6585 - e 18035 6592 - l 18028 6585 18035 6592 100 -} -a { - s 0 - b 3934 6586 - e 3941 6593 - l 3934 6586 3941 6593 100 -} -a { - s 0 - b 3202 6590 - e 3209 6597 - l 3202 6590 3209 6597 100 -} -a { - s 0 - b 18386 6610 - e 18393 6617 - l 18386 6610 18393 6617 100 -} -a { - s 0 - b 3587 6613 - e 3594 6620 - l 3587 6613 3594 6620 100 -} -a { - s 0 - b 5016 6619 - e 5023 6626 - l 5016 6619 5023 6626 100 -} -a { - s 0 - b 8842 6632 - e 8849 6639 - l 8842 6632 8849 6639 100 -} -a { - s 0 - b 8310 6633 - e 8317 6640 - l 8310 6633 8317 6640 100 -} -a { - s 0 - b 3577 6633 - e 3584 6640 - l 3577 6633 3584 6640 100 -} -a { - s 0 - b 13995 6636 - e 14002 6643 - l 13995 6636 14002 6643 100 -} -a { - s 0 - b 13996 6637 - e 14003 6644 - l 13996 6637 14003 6644 100 -} -a { - s 0 - b 9675 6637 - e 9682 6644 - l 9675 6637 9682 6644 100 -} -a { - s 0 - b 14273 6657 - e 
14280 6664 - l 14273 6657 14280 6664 100 -} -a { - s 0 - b 14274 6658 - e 14281 6665 - l 14274 6658 14281 6665 100 -} -a { - s 0 - b 14275 6659 - e 14282 6666 - l 14275 6659 14282 6666 100 -} -a { - s 0 - b 8213 6659 - e 8220 6666 - l 8213 6659 8220 6666 100 -} -a { - s 0 - b 14276 6660 - e 14283 6667 - l 14276 6660 14283 6667 100 -} -a { - s 0 - b 6728 6660 - e 6735 6667 - l 6728 6660 6735 6667 100 -} -a { - s 0 - b 6729 6661 - e 6736 6668 - l 6729 6661 6736 6668 100 -} -a { - s 0 - b 8590 6662 - e 8597 6669 - l 8590 6662 8597 6669 100 -} -a { - s 0 - b 6730 6662 - e 6737 6669 - l 6730 6662 6737 6669 100 -} -a { - s 0 - b 4865 6662 - e 4872 6669 - l 4865 6662 4872 6669 100 -} -a { - s 0 - b 6731 6663 - e 6738 6670 - l 6731 6663 6738 6670 100 -} -a { - s 0 - b 14941 6722 - e 14948 6729 - l 14941 6722 14948 6729 100 -} -a { - s 0 - b 11117 6722 - e 11124 6729 - l 11117 6722 11124 6729 100 -} -a { - s 0 - b 14942 6723 - e 14949 6730 - l 14942 6723 14949 6730 100 -} -a { - s 0 - b 7497 6723 - e 7504 6730 - l 7497 6723 7504 6730 100 -} -a { - s 0 - b 5128 6732 - e 5135 6739 - l 5128 6732 5135 6739 100 -} -a { - s 0 - b 5129 6733 - e 5136 6740 - l 5129 6733 5136 6740 100 -} -a { - s 0 - b 5130 6734 - e 5137 6741 - l 5130 6734 5137 6741 100 -} -a { - s 0 - b 5131 6735 - e 5138 6742 - l 5131 6735 5138 6742 100 -} -a { - s 0 - b 4885 6736 - e 4892 6743 - l 4885 6736 4892 6743 100 -} -a { - s 0 - b 8434 6739 - e 8441 6746 - l 8434 6739 8441 6746 100 -} -a { - s 0 - b 2954 6761 - e 2961 6768 - l 2954 6761 2961 6768 100 -} -a { - s 0 - b 8158 6766 - e 8165 6773 - l 8158 6766 8165 6773 100 -} -a { - s 0 - b 14534 6767 - e 14541 6774 - l 14534 6767 14541 6774 100 -} -a { - s 0 - b 9054 6787 - e 9061 6794 - l 9054 6787 9061 6794 100 -} -a { - s 0 - b 14116 6788 - e 14123 6795 - l 14116 6788 14123 6795 100 -} -a { - s 0 - b 14117 6789 - e 14124 6796 - l 14117 6789 14124 6796 100 -} -a { - s 0 - b 7701 6793 - e 7708 6800 - l 7701 6793 7708 6800 100 -} -a { - s 0 - b 10969 6818 - e 
10976 6825 - l 10969 6818 10976 6825 100 -} -a { - s 0 - b 17347 6819 - e 17354 6826 - l 17347 6819 17354 6826 100 -} -a { - s 0 - b 9382 6827 - e 9389 6834 - l 9382 6827 9389 6834 100 -} -a { - s 0 - b 8245 6865 - e 8252 6872 - l 8245 6865 8252 6872 100 -} -a { - s 0 - b 3181 6865 - e 3188 6872 - l 3181 6865 3188 6872 100 -} -a { - s 0 - b 3182 6866 - e 3189 6873 - l 3182 6866 3189 6873 100 -} -a { - s 0 - b 8171 6883 - e 8178 6890 - l 8171 6883 8178 6890 100 -} -a { - s 0 - b 14200 6892 - e 14207 6899 - l 14200 6892 14207 6899 100 -} -a { - s 0 - b 8946 6907 - e 8953 6914 - l 8946 6907 8953 6914 100 -} -a { - s 0 - b 9408 6908 - e 9415 6915 - l 9408 6908 9415 6915 100 -} -a { - s 0 - b 9409 6909 - e 9416 6916 - l 9409 6909 9416 6916 100 -} -a { - s 0 - b 6584 6909 - e 6591 6916 - l 6584 6909 6591 6916 100 -} -a { - s 0 - b 11102 6918 - e 11109 6925 - l 11102 6918 11109 6925 100 -} -a { - s 0 - b 17183 6921 - e 17190 6928 - l 17183 6921 17190 6928 100 -} -a { - s 0 - b 5077 6935 - e 5084 6942 - l 5077 6935 5084 6942 100 -} -a { - s 0 - b 14047 6937 - e 14054 6944 - l 14047 6937 14054 6944 100 -} -a { - s 0 - b 17473 6941 - e 17480 6948 - l 17473 6941 17480 6948 100 -} -a { - s 0 - b 8342 6941 - e 8349 6948 - l 8342 6941 8349 6948 100 -} -a { - s 0 - b 16878 6945 - e 16885 6952 - l 16878 6945 16885 6952 100 -} -a { - s 0 - b 3364 6960 - e 3371 6967 - l 3364 6960 3371 6967 100 -} -a { - s 0 - b 9333 6964 - e 9340 6971 - l 9333 6964 9340 6971 100 -} -a { - s 0 - b 2990 6966 - e 2997 6973 - l 2990 6966 2997 6973 100 -} -a { - s 0 - b 16000 7022 - e 16007 7029 - l 16000 7022 16007 7029 100 -} -a { - s 0 - b 9690 7024 - e 9697 7031 - l 9690 7024 9697 7031 100 -} -a { - s 0 - b 6317 7025 - e 6324 7032 - l 6317 7025 6324 7032 100 -} -a { - s 0 - b 6345 7041 - e 6352 7048 - l 6345 7041 6352 7048 100 -} -a { - s 0 - b 8640 7066 - e 8647 7073 - l 8640 7066 8647 7073 100 -} -a { - s 0 - b 4814 7069 - e 4821 7076 - l 4814 7069 4821 7076 100 -} -a { - s 0 - b 11528 7073 - e 
11535 7080 - l 11528 7073 11535 7080 100 -} -a { - s 0 - b 3618 7076 - e 3625 7083 - l 3618 7076 3625 7083 100 -} -a { - s 0 - b 11482 7089 - e 11489 7096 - l 11482 7089 11489 7096 100 -} -a { - s 0 - b 11483 7090 - e 11490 7097 - l 11483 7090 11490 7097 100 -} -a { - s 0 - b 15323 7127 - e 15330 7134 - l 15323 7127 15330 7134 100 -} -a { - s 0 - b 3978 7128 - e 3985 7135 - l 3978 7128 3985 7135 100 -} -a { - s 0 - b 14680 7129 - e 14687 7136 - l 14680 7129 14687 7136 100 -} -a { - s 0 - b 14681 7130 - e 14688 7137 - l 14681 7130 14688 7137 100 -} -a { - s 0 - b 17965 7132 - e 17972 7139 - l 17965 7132 17972 7139 100 -} -a { - s 0 - b 152 7132 - e 159 7139 - l 152 7132 159 7139 100 -} -a { - s 0 - b 9314 7133 - e 9321 7140 - l 9314 7133 9321 7140 100 -} -a { - s 0 - b 5091 7133 - e 5098 7140 - l 5091 7133 5098 7140 100 -} -a { - s 0 - b 153 7133 - e 160 7140 - l 153 7133 160 7140 100 -} -a { - s 0 - b 18612 7632 - e 18619 7639 - l 18612 7632 18619 7639 100 -} -a { - s 0 - b 18613 7633 - e 18620 7640 - l 18613 7633 18620 7640 100 -} -a { - s 0 - b 18614 7634 - e 18621 7641 - l 18614 7634 18621 7641 100 -} -a { - s 0 - b 18615 7635 - e 18622 7642 - l 18615 7635 18622 7642 100 -} -a { - s 0 - b 14874 7643 - e 14881 7650 - l 14874 7643 14881 7650 100 -} -a { - s 0 - b 14495 7655 - e 14502 7662 - l 14495 7655 14502 7662 100 -} -a { - s 0 - b 14496 7656 - e 14503 7663 - l 14496 7656 14503 7663 100 -} -a { - s 0 - b 15925 7676 - e 15932 7683 - l 15925 7676 15932 7683 100 -} -a { - s 0 - b 8462 7676 - e 8469 7683 - l 8462 7676 8469 7683 100 -} -a { - s 0 - b 8463 7677 - e 8470 7684 - l 8463 7677 8470 7684 100 -} -a { - s 0 - b 15360 7680 - e 15367 7687 - l 15360 7680 15367 7687 100 -} -a { - s 0 - b 4989 7682 - e 4996 7689 - l 4989 7682 4996 7689 100 -} -a { - s 0 - b 18094 7689 - e 18101 7696 - l 18094 7689 18101 7696 100 -} -a { - s 0 - b 17344 7689 - e 17351 7696 - l 17344 7689 17351 7696 100 -} -a { - s 0 - b 9653 7694 - e 9660 7701 - l 9653 7694 9660 7701 100 -} -a { 
- s 0 - b 14950 7696 - e 14957 7703 - l 14950 7696 14957 7703 100 -} -a { - s 0 - b 11506 7704 - e 11513 7711 - l 11506 7704 11513 7711 100 -} -a { - s 0 - b 6315 7706 - e 6322 7713 - l 6315 7706 6322 7713 100 -} -a { - s 0 - b 13420 7716 - e 13427 7723 - l 13420 7716 13427 7723 100 -} -a { - s 0 - b 5667 7719 - e 5674 7726 - l 5667 7719 5674 7726 100 -} -a { - s 0 - b 14082 7720 - e 14089 7727 - l 14082 7720 14089 7727 100 -} -a { - s 0 - b 15945 7732 - e 15952 7739 - l 15945 7732 15952 7739 100 -} -a { - s 0 - b 15157 7739 - e 15164 7746 - l 15157 7739 15164 7746 100 -} -a { - s 0 - b 6839 7756 - e 6846 7763 - l 6839 7756 6846 7763 100 -} -a { - s 0 - b 6840 7757 - e 6847 7764 - l 6840 7757 6847 7764 100 -} -a { - s 0 - b 7700 7761 - e 7707 7768 - l 7700 7761 7707 7768 100 -} -a { - s 0 - b 5692 7762 - e 5699 7769 - l 5692 7762 5699 7769 100 -} -a { - s 0 - b 8413 7763 - e 8420 7770 - l 8413 7763 8420 7770 100 -} -a { - s 0 - b 8414 7764 - e 8421 7771 - l 8414 7764 8421 7771 100 -} -a { - s 0 - b 5132 7768 - e 5139 7775 - l 5132 7768 5139 7775 100 -} -a { - s 0 - b 13509 7774 - e 13516 7781 - l 13509 7774 13516 7781 100 -} -a { - s 0 - b 13510 7775 - e 13517 7782 - l 13510 7775 13517 7782 100 -} -a { - s 0 - b 17241 8503 - e 17248 8510 - l 17241 8503 17248 8510 100 -} -a { - s 0 - b 227 8504 - e 234 8511 - l 227 8504 234 8511 100 -} -a { - s 0 - b 8875 8508 - e 8882 8515 - l 8875 8508 8882 8515 100 -} -a { - s 0 - b 8876 8509 - e 8883 8516 - l 8876 8509 8883 8516 100 -} -a { - s 0 - b 17 8525 - e 24 8532 - l 17 8525 24 8532 100 -} -a { - s 0 - b 18 8526 - e 25 8533 - l 18 8526 25 8533 100 -} -a { - s 0 - b 18772 8527 - e 18779 8534 - l 18772 8527 18779 8534 100 -} -a { - s 0 - b 3252 8527 - e 3259 8534 - l 3252 8527 3259 8534 100 -} -a { - s 0 - b 6403 8528 - e 6410 8535 - l 6403 8528 6410 8535 100 -} -a { - s 0 - b 3906 8528 - e 3913 8535 - l 3906 8528 3913 8535 100 -} -a { - s 0 - b 14372 8529 - e 14379 8536 - l 14372 8529 14379 8536 100 -} -a { - s 0 - b 3907 
8529 - e 3914 8536 - l 3907 8529 3914 8536 100 -} -a { - s 0 - b 10854 8531 - e 10861 8538 - l 10854 8531 10861 8538 100 -} -a { - s 0 - b 10855 8532 - e 10862 8539 - l 10855 8532 10862 8539 100 -} -a { - s 0 - b 6739 8535 - e 6746 8542 - l 6739 8535 6746 8542 100 -} -a { - s 0 - b 6740 8536 - e 6747 8543 - l 6740 8536 6747 8543 100 -} -a { - s 0 - b 6741 8537 - e 6748 8544 - l 6741 8537 6748 8544 100 -} -a { - s 0 - b 6742 8538 - e 6749 8545 - l 6742 8538 6749 8545 100 -} -a { - s 0 - b 6743 8539 - e 6750 8546 - l 6743 8539 6750 8546 100 -} -a { - s 0 - b 16109 8547 - e 16116 8554 - l 16109 8547 16116 8554 100 -} -a { - s 0 - b 12182 8548 - e 12189 8555 - l 12182 8548 12189 8555 100 -} -a { - s 0 - b 12183 8549 - e 12190 8556 - l 12183 8549 12190 8556 100 -} -a { - s 0 - b 11060 9026 - e 11067 9033 - l 11060 9026 11067 9033 100 -} -a { - s 0 - b 11061 9027 - e 11068 9034 - l 11061 9027 11068 9034 100 -} -a { - s 0 - b 4091 9127 - e 4098 9134 - l 4091 9127 4098 9134 100 -} -a { - s 0 - b 4092 9128 - e 4099 9135 - l 4092 9128 4099 9135 100 -} -a { - s 0 - b 14311 9129 - e 14318 9136 - l 14311 9129 14318 9136 100 -} -a { - s 0 - b 3937 9132 - e 3944 9139 - l 3937 9132 3944 9139 100 -} -a { - s 0 - b 3938 9133 - e 3945 9140 - l 3938 9133 3945 9140 100 -} -a { - s 0 - b 8075 9137 - e 8082 9144 - l 8075 9137 8082 9144 100 -} -a { - s 0 - b 8076 9138 - e 8083 9145 - l 8076 9138 8083 9145 100 -} -a { - s 0 - b 17588 9141 - e 17595 9148 - l 17588 9141 17595 9148 100 -} -a { - s 0 - b 14878 9141 - e 14885 9148 - l 14878 9141 14885 9148 100 -} -a { - s 0 - b 17589 9142 - e 17596 9149 - l 17589 9142 17596 9149 100 -} -a { - s 0 - b 8649 9142 - e 8656 9149 - l 8649 9142 8656 9149 100 -} -a { - s 0 - b 14350 9151 - e 14357 9158 - l 14350 9151 14357 9158 100 -} -a { - s 0 - b 11508 9152 - e 11515 9159 - l 11508 9152 11515 9159 100 -} -a { - s 0 - b 9371 9158 - e 9378 9165 - l 9371 9158 9378 9165 100 -} -a { - s 0 - b 7454 9160 - e 7461 9167 - l 7454 9160 7461 9167 100 -} -a { - 
s 0 - b 18077 9161 - e 18084 9168 - l 18077 9161 18084 9168 100 -} -a { - s 0 - b 5111 9166 - e 5118 9173 - l 5111 9166 5118 9173 100 -} -a { - s 0 - b 7666 9167 - e 7673 9174 - l 7666 9167 7673 9174 100 -} -a { - s 0 - b 6677 9167 - e 6684 9174 - l 6677 9167 6684 9174 100 -} -a { - s 0 - b 5112 9167 - e 5119 9174 - l 5112 9167 5119 9174 100 -} -a { - s 0 - b 8430 9189 - e 8437 9196 - l 8430 9189 8437 9196 100 -} -a { - s 0 - b 8431 9190 - e 8438 9197 - l 8431 9190 8438 9197 100 -} -a { - s 0 - b 15357 9191 - e 15364 9198 - l 15357 9191 15364 9198 100 -} -a { - s 0 - b 7520 9201 - e 7527 9208 - l 7520 9201 7527 9208 100 -} -a { - s 0 - b 8469 9203 - e 8476 9210 - l 8469 9203 8476 9210 100 -} -a { - s 0 - b 18709 10097 - e 18716 10104 - l 18709 10097 18716 10104 100 -} -a { - s 0 - b 3278 10114 - e 3285 10121 - l 3278 10114 3285 10121 100 -} -a { - s 0 - b 4021 10125 - e 4028 10132 - l 4021 10125 4028 10132 100 -} -a { - s 0 - b 12609 10129 - e 12616 10136 - l 12609 10129 12616 10136 100 -} -a { - s 0 - b 13438 10134 - e 13445 10141 - l 13438 10134 13445 10141 100 -} -a { - s 0 - b 18042 10135 - e 18049 10142 - l 18042 10135 18049 10142 100 -} -a { - s 0 - b 11292 10135 - e 11299 10142 - l 11292 10135 11299 10142 100 -} -a { - s 0 - b 14024 10136 - e 14031 10143 - l 14024 10136 14031 10143 100 -} -a { - s 0 - b 14025 10137 - e 14032 10144 - l 14025 10137 14032 10144 100 -} -a { - s 0 - b 14026 10138 - e 14033 10145 - l 14026 10138 14033 10145 100 -} -a { - s 0 - b 2995 10152 - e 3002 10159 - l 2995 10152 3002 10159 100 -} -a { - s 0 - b 2996 10153 - e 3003 10160 - l 2996 10153 3003 10160 100 -} -a { - s 0 - b 18024 10155 - e 18031 10162 - l 18024 10155 18031 10162 100 -} -a { - s 0 - b 4874 10155 - e 4881 10162 - l 4874 10155 4881 10162 100 -} -a { - s 0 - b 18025 10156 - e 18032 10163 - l 18025 10156 18032 10163 100 -} -a { - s 0 - b 14632 10156 - e 14639 10163 - l 14632 10156 14639 10163 100 -} -a { - s 0 - b 18026 10157 - e 18033 10164 - l 18026 10157 18033 10164 
100 -} -a { - s 0 - b 4353 10161 - e 4360 10168 - l 4353 10161 4360 10168 100 -} -a { - s 0 - b 14502 10163 - e 14509 10170 - l 14502 10163 14509 10170 100 -} -a { - s 0 - b 15226 10166 - e 15233 10173 - l 15226 10166 15233 10173 100 -} -a { - s 0 - b 6415 10171 - e 6422 10178 - l 6415 10171 6422 10178 100 -} -a { - s 0 - b 18675 10182 - e 18682 10189 - l 18675 10182 18682 10189 100 -} -a { - s 0 - b 17433 10182 - e 17440 10189 - l 17433 10182 17440 10189 100 -} -a { - s 0 - b 18037 10184 - e 18044 10191 - l 18037 10184 18044 10191 100 -} -a { - s 0 - b 6716 10184 - e 6723 10191 - l 6716 10184 6723 10191 100 -} -a { - s 0 - b 5567 10191 - e 5574 10198 - l 5567 10191 5574 10198 100 -} -a { - s 0 - b 5568 10192 - e 5575 10199 - l 5568 10192 5575 10199 100 -} -a { - s 0 - b 13888 10205 - e 13895 10212 - l 13888 10205 13895 10212 100 -} -a { - s 0 - b 17520 10206 - e 17527 10213 - l 17520 10206 17527 10213 100 -} -a { - s 0 - b 13889 10206 - e 13896 10213 - l 13889 10206 13896 10213 100 -} -a { - s 0 - b 17521 10207 - e 17528 10214 - l 17521 10207 17528 10214 100 -} -a { - s 0 - b 13890 10207 - e 13897 10214 - l 13890 10207 13897 10214 100 -} -a { - s 0 - b 6653 10207 - e 6660 10214 - l 6653 10207 6660 10214 100 -} -a { - s 0 - b 14699 10213 - e 14706 10220 - l 14699 10213 14706 10220 100 -} -a { - s 0 - b 5680 10218 - e 5687 10225 - l 5680 10218 5687 10225 100 -} -a { - s 0 - b 5681 10219 - e 5688 10226 - l 5681 10219 5688 10226 100 -} -a { - s 0 - b 14115 10233 - e 14122 10240 - l 14115 10233 14122 10240 100 -} -a { - s 0 - b 6753 10234 - e 6760 10241 - l 6753 10234 6760 10241 100 -} -a { - s 0 - b 6754 10235 - e 6761 10242 - l 6754 10235 6761 10242 100 -} -a { - s 0 - b 11029 10236 - e 11036 10243 - l 11029 10236 11036 10243 100 -} -a { - s 0 - b 11030 10237 - e 11037 10244 - l 11030 10237 11037 10244 100 -} -a { - s 0 - b 3008 10243 - e 3015 10250 - l 3008 10243 3015 10250 100 -} -a { - s 0 - b 8023 10265 - e 8030 10272 - l 8023 10265 8030 10272 100 -} -a { - s 0 - 
b 49 10272 - e 56 10279 - l 49 10272 56 10279 100 -} -a { - s 0 - b 18607 10274 - e 18614 10281 - l 18607 10274 18614 10281 100 -} -a { - s 0 - b 16148 10292 - e 16155 10299 - l 16148 10292 16155 10299 100 -} -a { - s 0 - b 4026 10292 - e 4033 10299 - l 4026 10292 4033 10299 100 -} -a { - s 0 - b 14100 10295 - e 14107 10302 - l 14100 10295 14107 10302 100 -} -a { - s 0 - b 13865 10297 - e 13872 10304 - l 13865 10297 13872 10304 100 -} -a { - s 0 - b 9046 10324 - e 9053 10331 - l 9046 10324 9053 10331 100 -} -a { - s 0 - b 8276 10341 - e 8283 10348 - l 8276 10341 8283 10348 100 -} -a { - s 0 - b 7483 10344 - e 7490 10351 - l 7483 10344 7490 10351 100 -} -a { - s 0 - b 17462 10345 - e 17469 10352 - l 17462 10345 17469 10352 100 -} -a { - s 0 - b 11557 10353 - e 11564 10360 - l 11557 10353 11564 10360 100 -} -a { - s 0 - b 12164 10447 - e 12171 10454 - l 12164 10447 12171 10454 100 -} -a { - s 0 - b 12165 10448 - e 12172 10455 - l 12165 10448 12172 10455 100 -} -a { - s 0 - b 3094 10466 - e 3101 10473 - l 3094 10466 3101 10473 100 -} -a { - s 0 - b 17095 10470 - e 17102 10477 - l 17095 10470 17102 10477 100 -} -a { - s 0 - b 14337 10473 - e 14344 10480 - l 14337 10473 14344 10480 100 -} -a { - s 0 - b 12326 10473 - e 12333 10480 - l 12326 10473 12333 10480 100 -} -a { - s 0 - b 12240 10476 - e 12247 10483 - l 12240 10476 12247 10483 100 -} -a { - s 0 - b 6328 10479 - e 6335 10486 - l 6328 10479 6335 10486 100 -} -a { - s 0 - b 3898 10505 - e 3905 10512 - l 3898 10505 3905 10512 100 -} -a { - s 0 - b 14032 10521 - e 14039 10528 - l 14032 10521 14039 10528 100 -} -a { - s 0 - b 14033 10522 - e 14040 10529 - l 14033 10522 14040 10529 100 -} -a { - s 0 - b 14034 10523 - e 14041 10530 - l 14034 10523 14041 10530 100 -} -a { - s 0 - b 4465 10527 - e 4472 10534 - l 4465 10527 4472 10534 100 -} -a { - s 0 - b 14126 10538 - e 14133 10545 - l 14126 10538 14133 10545 100 -} -a { - s 0 - b 8069 10562 - e 8076 10569 - l 8069 10562 8076 10569 100 -} -a { - s 0 - b 4367 10562 - e 
4374 10569 - l 4367 10562 4374 10569 100 -} -a { - s 0 - b 8070 10563 - e 8077 10570 - l 8070 10563 8077 10570 100 -} -a { - s 0 - b 8071 10564 - e 8078 10571 - l 8071 10564 8078 10571 100 -} -a { - s 0 - b 18032 10579 - e 18039 10586 - l 18032 10579 18039 10586 100 -} -a { - s 0 - b 8143 10588 - e 8150 10595 - l 8143 10588 8150 10595 100 -} -a { - s 0 - b 8144 10589 - e 8151 10596 - l 8144 10589 8151 10596 100 -} -a { - s 0 - b 8145 10590 - e 8152 10597 - l 8145 10590 8152 10597 100 -} -a { - s 0 - b 9463 10591 - e 9470 10598 - l 9463 10591 9470 10598 100 -} -a { - s 0 - b 14828 10615 - e 14835 10622 - l 14828 10615 14835 10622 100 -} -a { - s 0 - b 11063 10615 - e 11070 10622 - l 11063 10615 11070 10622 100 -} -a { - s 0 - b 4531 10615 - e 4538 10622 - l 4531 10615 4538 10622 100 -} -a { - s 0 - b 3083 10615 - e 3090 10622 - l 3083 10615 3090 10622 100 -} -a { - s 0 - b 8784 10618 - e 8791 10625 - l 8784 10618 8791 10625 100 -} -a { - s 0 - b 8785 10619 - e 8792 10626 - l 8785 10619 8792 10626 100 -} -a { - s 0 - b 18538 10621 - e 18545 10628 - l 18538 10621 18545 10628 100 -} -a { - s 0 - b 18539 10622 - e 18546 10629 - l 18539 10622 18546 10629 100 -} -a { - s 0 - b 3996 10635 - e 4003 10642 - l 3996 10635 4003 10642 100 -} -a { - s 0 - b 3997 10636 - e 4004 10643 - l 3997 10636 4004 10643 100 -} -a { - s 0 - b 4809 10637 - e 4816 10644 - l 4809 10637 4816 10644 100 -} -a { - s 0 - b 4810 10638 - e 4817 10645 - l 4810 10638 4817 10645 100 -} -a { - s 0 - b 4811 10639 - e 4818 10646 - l 4811 10639 4818 10646 100 -} -a { - s 0 - b 9281 10642 - e 9288 10649 - l 9281 10642 9288 10649 100 -} -a { - s 0 - b 13947 10643 - e 13954 10650 - l 13947 10643 13954 10650 100 -} -a { - s 0 - b 9048 10643 - e 9055 10650 - l 9048 10643 9055 10650 100 -} -a { - s 0 - b 13948 10644 - e 13955 10651 - l 13948 10644 13955 10651 100 -} -a { - s 0 - b 8705 10650 - e 8712 10657 - l 8705 10650 8712 10657 100 -} -a { - s 0 - b 8706 10651 - e 8713 10658 - l 8706 10651 8713 10658 100 -} -a 
{ - s 0 - b 8261 10654 - e 8268 10661 - l 8261 10654 8268 10661 100 -} -a { - s 0 - b 15297 10658 - e 15304 10665 - l 15297 10658 15304 10665 100 -} -a { - s 0 - b 4941 10662 - e 4948 10669 - l 4941 10662 4948 10669 100 -} -a { - s 0 - b 17968 10663 - e 17975 10670 - l 17968 10663 17975 10670 100 -} -a { - s 0 - b 17969 10664 - e 17976 10671 - l 17969 10664 17976 10671 100 -} -a { - s 0 - b 6706 10666 - e 6713 10673 - l 6706 10666 6713 10673 100 -} -a { - s 0 - b 6656 10666 - e 6663 10673 - l 6656 10666 6663 10673 100 -} -a { - s 0 - b 18355 10667 - e 18362 10674 - l 18355 10667 18362 10674 100 -} -a { - s 0 - b 6707 10667 - e 6714 10674 - l 6707 10667 6714 10674 100 -} -a { - s 0 - b 18356 10668 - e 18363 10675 - l 18356 10668 18363 10675 100 -} -a { - s 0 - b 6708 10668 - e 6715 10675 - l 6708 10668 6715 10675 100 -} -a { - s 0 - b 15199 10687 - e 15206 10694 - l 15199 10687 15206 10694 100 -} -a { - s 0 - b 15200 10688 - e 15207 10695 - l 15200 10688 15207 10695 100 -} -a { - s 0 - b 15201 10689 - e 15208 10696 - l 15201 10689 15208 10696 100 -} -a { - s 0 - b 15202 10690 - e 15209 10697 - l 15202 10690 15209 10697 100 -} -a { - s 0 - b 9272 10707 - e 9279 10714 - l 9272 10707 9279 10714 100 -} -a { - s 0 - b 9720 10713 - e 9727 10720 - l 9720 10713 9727 10720 100 -} -a { - s 0 - b 9721 10714 - e 9728 10721 - l 9721 10714 9728 10721 100 -} -a { - s 0 - b 16879 10724 - e 16886 10731 - l 16879 10724 16886 10731 100 -} -a { - s 0 - b 14746 10729 - e 14753 10736 - l 14746 10729 14753 10736 100 -} -a { - s 0 - b 14556 10729 - e 14563 10736 - l 14556 10729 14563 10736 100 -} -a { - s 0 - b 12155 10740 - e 12162 10747 - l 12155 10740 12162 10747 100 -} -a { - s 0 - b 14171 10742 - e 14178 10749 - l 14171 10742 14178 10749 100 -} -a { - s 0 - b 9025 10743 - e 9032 10750 - l 9025 10743 9032 10750 100 -} -a { - s 0 - b 9026 10744 - e 9033 10751 - l 9026 10744 9033 10751 100 -} -a { - s 0 - b 3943 10746 - e 3950 10753 - l 3943 10746 3950 10753 100 -} -a { - s 0 - b 4015 
10747 - e 4022 10754 - l 4015 10747 4022 10754 100 -} -a { - s 0 - b 6440 10762 - e 6447 10769 - l 6440 10762 6447 10769 100 -} -a { - s 0 - b 6441 10763 - e 6448 10770 - l 6441 10763 6448 10770 100 -} -a { - s 0 - b 6442 10764 - e 6449 10771 - l 6442 10764 6449 10771 100 -} -a { - s 0 - b 6443 10765 - e 6450 10772 - l 6443 10765 6450 10772 100 -} -a { - s 0 - b 9276 10767 - e 9283 10774 - l 9276 10767 9283 10774 100 -} -a { - s 0 - b 4860 10772 - e 4867 10779 - l 4860 10772 4867 10779 100 -} -a { - s 0 - b 4861 10773 - e 4868 10780 - l 4861 10773 4868 10780 100 -} -a { - s 0 - b 8516 10778 - e 8523 10785 - l 8516 10778 8523 10785 100 -} -a { - s 0 - b 8517 10779 - e 8524 10786 - l 8517 10779 8524 10786 100 -} -a { - s 0 - b 8704 10789 - e 8711 10796 - l 8704 10789 8711 10796 100 -} -a { - s 0 - b 6371 10796 - e 6378 10803 - l 6371 10796 6378 10803 100 -} -a { - s 0 - b 3199 10796 - e 3206 10803 - l 3199 10796 3206 10803 100 -} -a { - s 0 - b 4647 10810 - e 4654 10817 - l 4647 10810 4654 10817 100 -} -a { - s 0 - b 4648 10811 - e 4655 10818 - l 4648 10811 4655 10818 100 -} -a { - s 0 - b 3294 10813 - e 3301 10820 - l 3294 10813 3301 10820 100 -} -a { - s 0 - b 14815 10815 - e 14822 10822 - l 14815 10815 14822 10822 100 -} -a { - s 0 - b 189 10828 - e 196 10835 - l 189 10828 196 10835 100 -} -a { - s 0 - b 11444 10832 - e 11451 10839 - l 11444 10832 11451 10839 100 -} -a { - s 0 - b 8050 10832 - e 8057 10839 - l 8050 10832 8057 10839 100 -} -a { - s 0 - b 11445 10833 - e 11452 10840 - l 11445 10833 11452 10840 100 -} -a { - s 0 - b 11446 10834 - e 11453 10841 - l 11446 10834 11453 10841 100 -} -a { - s 0 - b 3386 10835 - e 3393 10842 - l 3386 10835 3393 10842 100 -} -a { - s 0 - b 16136 10839 - e 16143 10846 - l 16136 10839 16143 10846 100 -} -a { - s 0 - b 17309 10840 - e 17316 10847 - l 17309 10840 17316 10847 100 -} -a { - s 0 - b 16137 10840 - e 16144 10847 - l 16137 10840 16144 10847 100 -} -a { - s 0 - b 14712 10859 - e 14719 10866 - l 14712 10859 14719 10866 
100 -} -a { - s 0 - b 10840 10862 - e 10847 10869 - l 10840 10862 10847 10869 100 -} -a { - s 0 - b 8358 10862 - e 8365 10869 - l 8358 10862 8365 10869 100 -} -a { - s 0 - b 8359 10863 - e 8366 10870 - l 8359 10863 8366 10870 100 -} -a { - s 0 - b 8360 10864 - e 8367 10871 - l 8360 10864 8367 10871 100 -} -a { - s 0 - b 7470 10875 - e 7477 10882 - l 7470 10875 7477 10882 100 -} -a { - s 0 - b 3908 10875 - e 3915 10882 - l 3908 10875 3915 10882 100 -} -a { - s 0 - b 17096 10879 - e 17103 10886 - l 17096 10879 17103 10886 100 -} -a { - s 0 - b 3619 10880 - e 3626 10887 - l 3619 10880 3626 10887 100 -} -a { - s 0 - b 6612 10881 - e 6619 10888 - l 6612 10881 6619 10888 100 -} -a { - s 0 - b 4817 10882 - e 4824 10889 - l 4817 10882 4824 10889 100 -} -a { - s 0 - b 12598 10885 - e 12605 10892 - l 12598 10885 12605 10892 100 -} -a { - s 0 - b 6721 10885 - e 6728 10892 - l 6721 10885 6728 10892 100 -} -a { - s 0 - b 12599 10886 - e 12606 10893 - l 12599 10886 12606 10893 100 -} -a { - s 0 - b 14148 10892 - e 14155 10899 - l 14148 10892 14155 10899 100 -} -a { - s 0 - b 18769 10900 - e 18776 10907 - l 18769 10900 18776 10907 100 -} -a { - s 0 - b 14838 10900 - e 14845 10907 - l 14838 10900 14845 10907 100 -} -a { - s 0 - b 14839 10901 - e 14846 10908 - l 14839 10901 14846 10908 100 -} -a { - s 0 - b 15907 10902 - e 15914 10909 - l 15907 10902 15914 10909 100 -} -a { - s 0 - b 15990 10919 - e 15997 10926 - l 15990 10919 15997 10926 100 -} -a { - s 0 - b 15991 10920 - e 15998 10927 - l 15991 10920 15998 10927 100 -} -a { - s 0 - b 11261 10922 - e 11268 10929 - l 11261 10922 11268 10929 100 -} -a { - s 0 - b 4913 10926 - e 4920 10933 - l 4913 10926 4920 10933 100 -} -a { - s 0 - b 4914 10927 - e 4921 10934 - l 4914 10927 4921 10934 100 -} -a { - s 0 - b 18664 10941 - e 18671 10948 - l 18664 10941 18671 10948 100 -} -a { - s 0 - b 14685 10962 - e 14692 10969 - l 14685 10962 14692 10969 100 -} -a { - s 0 - b 275 10979 - e 282 10986 - l 275 10979 282 10986 100 -} -a { - s 0 - b 
17539 10980 - e 17546 10987 - l 17539 10980 17546 10987 100 -} -a { - s 0 - b 17540 10981 - e 17547 10988 - l 17540 10981 17547 10988 100 -} -a { - s 0 - b 5130 11044 - e 5137 11051 - l 5130 11044 5137 11051 100 -} -a { - s 0 - b 13956 11046 - e 13963 11053 - l 13956 11046 13963 11053 100 -} -a { - s 0 - b 13957 11047 - e 13964 11054 - l 13957 11047 13964 11054 100 -} -a { - s 0 - b 8670 11047 - e 8677 11054 - l 8670 11047 8677 11054 100 -} -a { - s 0 - b 17581 11048 - e 17588 11055 - l 17581 11048 17588 11055 100 -} -a { - s 0 - b 17582 11049 - e 17589 11056 - l 17582 11049 17589 11056 100 -} -a { - s 0 - b 17583 11050 - e 17590 11057 - l 17583 11050 17590 11057 100 -} -a { - s 0 - b 11424 11052 - e 11431 11059 - l 11424 11052 11431 11059 100 -} -a { - s 0 - b 18396 11073 - e 18403 11080 - l 18396 11073 18403 11080 100 -} -a { - s 0 - b 6768 11074 - e 6775 11081 - l 6768 11074 6775 11081 100 -} -a { - s 0 - b 11331 11104 - e 11338 11111 - l 11331 11104 11338 11111 100 -} -a { - s 0 - b 7518 11120 - e 7525 11127 - l 7518 11120 7525 11127 100 -} -a { - s 0 - b 7831 11122 - e 7838 11129 - l 7831 11122 7838 11129 100 -} -a { - s 0 - b 7719 11124 - e 7726 11131 - l 7719 11124 7726 11131 100 -} -a { - s 0 - b 8902 11126 - e 8909 11133 - l 8902 11126 8909 11133 100 -} -a { - s 0 - b 5700 11136 - e 5707 11143 - l 5700 11136 5707 11143 100 -} -a { - s 0 - b 18697 11137 - e 18704 11144 - l 18697 11137 18704 11144 100 -} -a { - s 0 - b 4559 11155 - e 4566 11162 - l 4559 11155 4566 11162 100 -} -a { - s 0 - b 6239 11174 - e 6246 11181 - l 6239 11174 6246 11181 100 -} -a { - s 0 - b 7528 11208 - e 7535 11215 - l 7528 11208 7535 11215 100 -} -a { - s 0 - b 7529 11209 - e 7536 11216 - l 7529 11209 7536 11216 100 -} -a { - s 0 - b 7530 11210 - e 7537 11217 - l 7530 11210 7537 11217 100 -} -a { - s 0 - b 9615 11218 - e 9622 11225 - l 9615 11218 9622 11225 100 -} -a { - s 0 - b 4554 11230 - e 4561 11237 - l 4554 11230 4561 11237 100 -} -a { - s 0 - b 4555 11231 - e 4562 11238 - l 
4555 11231 4562 11238 100 -} -a { - s 0 - b 4798 11249 - e 4805 11256 - l 4798 11249 4805 11256 100 -} -a { - s 0 - b 5188 11253 - e 5195 11260 - l 5188 11253 5195 11260 100 -} -a { - s 0 - b 15993 11254 - e 16000 11261 - l 15993 11254 16000 11261 100 -} -a { - s 0 - b 14385 11257 - e 14392 11264 - l 14385 11257 14392 11264 100 -} -a { - s 0 - b 17283 11281 - e 17290 11288 - l 17283 11281 17290 11288 100 -} -a { - s 0 - b 14696 11283 - e 14703 11290 - l 14696 11283 14703 11290 100 -} -a { - s 0 - b 14697 11284 - e 14704 11291 - l 14697 11284 14704 11291 100 -} -a { - s 0 - b 14698 11285 - e 14705 11292 - l 14698 11285 14705 11292 100 -} -a { - s 0 - b 8768 11291 - e 8775 11298 - l 8768 11291 8775 11298 100 -} -a { - s 0 - b 15957 11306 - e 15964 11313 - l 15957 11306 15964 11313 100 -} -a { - s 0 - b 7543 11312 - e 7550 11319 - l 7543 11312 7550 11319 100 -} -a { - s 0 - b 7544 11313 - e 7551 11320 - l 7544 11313 7551 11320 100 -} -a { - s 0 - b 3026 11315 - e 3033 11322 - l 3026 11315 3033 11322 100 -} -a { - s 0 - b 6249 11323 - e 6256 11330 - l 6249 11323 6256 11330 100 -} -a { - s 0 - b 6250 11324 - e 6257 11331 - l 6250 11324 6257 11331 100 -} -a { - s 0 - b 8258 11326 - e 8265 11333 - l 8258 11326 8265 11333 100 -} -a { - s 0 - b 8259 11327 - e 8266 11334 - l 8259 11327 8266 11334 100 -} -a { - s 0 - b 134 11336 - e 141 11343 - l 134 11336 141 11343 100 -} -a { - s 0 - b 9638 11338 - e 9645 11345 - l 9638 11338 9645 11345 100 -} -a { - s 0 - b 6654 11338 - e 6661 11345 - l 6654 11338 6661 11345 100 -} -a { - s 0 - b 11001 11348 - e 11008 11355 - l 11001 11348 11008 11355 100 -} -a { - s 0 - b 11002 11349 - e 11009 11356 - l 11002 11349 11009 11356 100 -} -a { - s 0 - b 6399 11349 - e 6406 11356 - l 6399 11349 6406 11356 100 -} -a { - s 0 - b 6400 11350 - e 6407 11357 - l 6400 11350 6407 11357 100 -} -a { - s 0 - b 6401 11351 - e 6408 11358 - l 6401 11351 6408 11358 100 -} -a { - s 0 - b 3904 11351 - e 3911 11358 - l 3904 11351 3911 11358 100 -} -a { - s 0 - b 
20 11353 - e 27 11360 - l 20 11353 27 11360 100 -} -a { - s 0 - b 15187 11354 - e 15194 11361 - l 15187 11354 15194 11361 100 -} -a { - s 0 - b 17989 11357 - e 17996 11364 - l 17989 11357 17996 11364 100 -} -a { - s 0 - b 17990 11358 - e 17997 11365 - l 17990 11358 17997 11365 100 -} -a { - s 0 - b 14737 11359 - e 14744 11366 - l 14737 11359 14744 11366 100 -} -a { - s 0 - b 8414 11370 - e 8421 11377 - l 8414 11370 8421 11377 100 -} -a { - s 0 - b 14949 11380 - e 14956 11387 - l 14949 11380 14956 11387 100 -} -a { - s 0 - b 14950 11381 - e 14957 11388 - l 14950 11381 14957 11388 100 -} -a { - s 0 - b 17518 11385 - e 17525 11392 - l 17518 11385 17525 11392 100 -} -a { - s 0 - b 6640 11386 - e 6647 11393 - l 6640 11386 6647 11393 100 -} -a { - s 0 - b 7697 11388 - e 7704 11395 - l 7697 11388 7704 11395 100 -} -a { - s 0 - b 11530 11389 - e 11537 11396 - l 11530 11389 11537 11396 100 -} -a { - s 0 - b 291 11396 - e 298 11403 - l 291 11396 298 11403 100 -} -a { - s 0 - b 18462 11419 - e 18469 11426 - l 18462 11419 18469 11426 100 -} -a { - s 0 - b 4010 11423 - e 4017 11430 - l 4010 11423 4017 11430 100 -} -a { - s 0 - b 2971 11434 - e 2978 11441 - l 2971 11434 2978 11441 100 -} -a { - s 0 - b 4457 11435 - e 4464 11442 - l 4457 11435 4464 11442 100 -} -a { - s 0 - b 14970 11438 - e 14977 11445 - l 14970 11438 14977 11445 100 -} -a { - s 0 - b 7577 11442 - e 7584 11449 - l 7577 11442 7584 11449 100 -} -a { - s 0 - b 9357 11446 - e 9364 11453 - l 9357 11446 9364 11453 100 -} -a { - s 0 - b 15396 11461 - e 15403 11468 - l 15396 11461 15403 11468 100 -} -a { - s 0 - b 6585 11463 - e 6592 11470 - l 6585 11463 6592 11470 100 -} -a { - s 0 - b 18400 11464 - e 18407 11471 - l 18400 11464 18407 11471 100 -} -a { - s 0 - b 9625 11466 - e 9632 11473 - l 9625 11466 9632 11473 100 -} -a { - s 0 - b 3279 11466 - e 3286 11473 - l 3279 11466 3286 11473 100 -} -a { - s 0 - b 3545 11468 - e 3552 11475 - l 3545 11468 3552 11475 100 -} -a { - s 0 - b 3546 11469 - e 3553 11476 - l 3546 
11469 3553 11476 100 -} -a { - s 0 - b 17262 11470 - e 17269 11477 - l 17262 11470 17269 11477 100 -} -a { - s 0 - b 13897 11470 - e 13904 11477 - l 13897 11470 13904 11477 100 -} -a { - s 0 - b 4070 11470 - e 4077 11477 - l 4070 11470 4077 11477 100 -} -a { - s 0 - b 17263 11471 - e 17270 11478 - l 17263 11471 17270 11478 100 -} -a { - s 0 - b 13497 11478 - e 13504 11485 - l 13497 11478 13504 11485 100 -} -a { - s 0 - b 8951 11478 - e 8958 11485 - l 8951 11478 8958 11485 100 -} -a { - s 0 - b 8183 11480 - e 8190 11487 - l 8183 11480 8190 11487 100 -} -a { - s 0 - b 8022 11481 - e 8029 11488 - l 8022 11481 8029 11488 100 -} -a { - s 0 - b 8023 11482 - e 8030 11489 - l 8023 11482 8030 11489 100 -} -a { - s 0 - b 9625 11497 - e 9632 11504 - l 9625 11497 9632 11504 100 -} -a { - s 0 - b 3279 11497 - e 3286 11504 - l 3279 11497 3286 11504 100 -} -a { - s 0 - b 4871 11498 - e 4878 11505 - l 4871 11498 4878 11505 100 -} -a { - s 0 - b 18442 11511 - e 18449 11518 - l 18442 11511 18449 11518 100 -} -a { - s 0 - b 9447 11523 - e 9454 11530 - l 9447 11523 9454 11530 100 -} -a { - s 0 - b 3178 11523 - e 3185 11530 - l 3178 11523 3185 11530 100 -} -a { - s 0 - b 9448 11524 - e 9455 11531 - l 9448 11524 9455 11531 100 -} -a { - s 0 - b 9026 11527 - e 9033 11534 - l 9026 11527 9033 11534 100 -} -a { - s 0 - b 4291 11529 - e 4298 11536 - l 4291 11529 4298 11536 100 -} -a { - s 0 - b 4292 11530 - e 4299 11537 - l 4292 11530 4299 11537 100 -} -a { - s 0 - b 6493 11534 - e 6500 11541 - l 6493 11534 6500 11541 100 -} -a { - s 0 - b 16004 11542 - e 16011 11549 - l 16004 11542 16011 11549 100 -} -a { - s 0 - b 16005 11543 - e 16012 11550 - l 16005 11543 16012 11550 100 -} -a { - s 0 - b 6566 11547 - e 6573 11554 - l 6566 11547 6573 11554 100 -} -a { - s 0 - b 6567 11548 - e 6574 11555 - l 6567 11548 6574 11555 100 -} -a { - s 0 - b 6568 11549 - e 6575 11556 - l 6568 11549 6575 11556 100 -} -a { - s 0 - b 11136 11562 - e 11143 11569 - l 11136 11562 11143 11569 100 -} -a { - s 0 - b 
17246 11574 - e 17253 11581 - l 17246 11574 17253 11581 100 -} -a { - s 0 - b 14091 11574 - e 14098 11581 - l 14091 11574 14098 11581 100 -} -a { - s 0 - b 18753 11575 - e 18760 11582 - l 18753 11575 18760 11582 100 -} -a { - s 0 - b 18754 11576 - e 18761 11583 - l 18754 11576 18761 11583 100 -} -a { - s 0 - b 18755 11577 - e 18762 11584 - l 18755 11577 18762 11584 100 -} -a { - s 0 - b 17494 11577 - e 17501 11584 - l 17494 11577 17501 11584 100 -} -a { - s 0 - b 13523 11583 - e 13530 11590 - l 13523 11583 13530 11590 100 -} -a { - s 0 - b 15397 11604 - e 15404 11611 - l 15397 11604 15404 11611 100 -} -a { - s 0 - b 10920 11605 - e 10927 11612 - l 10920 11605 10927 11612 100 -} -a { - s 0 - b 7744 11613 - e 7751 11620 - l 7744 11613 7751 11620 100 -} -a { - s 0 - b 6340 11613 - e 6347 11620 - l 6340 11613 6347 11620 100 -} -a { - s 0 - b 6341 11614 - e 6348 11621 - l 6341 11614 6348 11621 100 -} -a { - s 0 - b 11473 11634 - e 11480 11641 - l 11473 11634 11480 11641 100 -} -a { - s 0 - b 3995 11637 - e 4002 11644 - l 3995 11637 4002 11644 100 -} -a { - s 0 - b 5663 11638 - e 5670 11645 - l 5663 11638 5670 11645 100 -} -a { - s 0 - b 5664 11639 - e 5671 11646 - l 5664 11639 5671 11646 100 -} -a { - s 0 - b 3875 11665 - e 3882 11672 - l 3875 11665 3882 11672 100 -} -a { - s 0 - b 7752 11670 - e 7759 11677 - l 7752 11670 7759 11677 100 -} -a { - s 0 - b 7753 11671 - e 7760 11678 - l 7753 11671 7760 11678 100 -} -a { - s 0 - b 11530 11697 - e 11537 11704 - l 11530 11697 11537 11704 100 -} -a { - s 0 - b 11531 11698 - e 11538 11705 - l 11531 11698 11538 11705 100 -} -a { - s 0 - b 11532 11699 - e 11539 11706 - l 11532 11699 11539 11706 100 -} -a { - s 0 - b 11533 11700 - e 11540 11707 - l 11533 11700 11540 11707 100 -} -a { - s 0 - b 8467 11709 - e 8474 11716 - l 8467 11709 8474 11716 100 -} -a { - s 0 - b 12213 11713 - e 12220 11720 - l 12213 11713 12220 11720 100 -} -a { - s 0 - b 7598 11714 - e 7605 11721 - l 7598 11714 7605 11721 100 -} -a { - s 0 - b 17205 11716 - e 
17212 11723 - l 17205 11716 17212 11723 100 -} -a { - s 0 - b 14761 11743 - e 14768 11750 - l 14761 11743 14768 11750 100 -} -a { - s 0 - b 7648 11748 - e 7655 11755 - l 7648 11748 7655 11755 100 -} -a { - s 0 - b 6410 11748 - e 6417 11755 - l 6410 11748 6417 11755 100 -} -a { - s 0 - b 8790 11750 - e 8797 11757 - l 8790 11750 8797 11757 100 -} -a { - s 0 - b 8444 11750 - e 8451 11757 - l 8444 11750 8451 11757 100 -} -a { - s 0 - b 8791 11751 - e 8798 11758 - l 8791 11751 8798 11758 100 -} -a { - s 0 - b 8445 11751 - e 8452 11758 - l 8445 11751 8452 11758 100 -} -a { - s 0 - b 8446 11752 - e 8453 11759 - l 8446 11752 8453 11759 100 -} -a { - s 0 - b 11001 11772 - e 11008 11779 - l 11001 11772 11008 11779 100 -} -a { - s 0 - b 4394 11774 - e 4401 11781 - l 4394 11774 4401 11781 100 -} -a { - s 0 - b 17450 11786 - e 17457 11793 - l 17450 11786 17457 11793 100 -} -a { - s 0 - b 14227 11798 - e 14234 11805 - l 14227 11798 14234 11805 100 -} -a { - s 0 - b 8706 11808 - e 8713 11815 - l 8706 11808 8713 11815 100 -} -a { - s 0 - b 3186 11812 - e 3193 11819 - l 3186 11812 3193 11819 100 -} -a { - s 0 - b 8262 11817 - e 8269 11824 - l 8262 11817 8269 11824 100 -} -a { - s 0 - b 3886 12139 - e 3893 12146 - l 3886 12139 3893 12146 100 -} -a { - s 0 - b 15289 12150 - e 15296 12157 - l 15289 12150 15296 12157 100 -} -a { - s 0 - b 15290 12151 - e 15297 12158 - l 15290 12151 15297 12158 100 -} -a { - s 0 - b 8695 12157 - e 8702 12164 - l 8695 12157 8702 12164 100 -} -a { - s 0 - b 6756 12161 - e 6763 12168 - l 6756 12161 6763 12168 100 -} -a { - s 0 - b 9268 12184 - e 9275 12191 - l 9268 12184 9275 12191 100 -} -a { - s 0 - b 18626 12185 - e 18633 12192 - l 18626 12185 18633 12192 100 -} -a { - s 0 - b 6897 12193 - e 6904 12200 - l 6897 12193 6904 12200 100 -} -a { - s 0 - b 16101 12195 - e 16108 12202 - l 16101 12195 16108 12202 100 -} -a { - s 0 - b 6533 12218 - e 6540 12225 - l 6533 12218 6540 12225 100 -} -a { - s 0 - b 18676 12248 - e 18683 12255 - l 18676 12248 18683 12255 
100 -} -a { - s 0 - b 17434 12248 - e 17441 12255 - l 17434 12248 17441 12255 100 -} -a { - s 0 - b 17435 12249 - e 17442 12256 - l 17435 12249 17442 12256 100 -} -a { - s 0 - b 14474 12258 - e 14481 12265 - l 14474 12258 14481 12265 100 -} -a { - s 0 - b 14475 12259 - e 14482 12266 - l 14475 12259 14482 12266 100 -} -a { - s 0 - b 7611 12261 - e 7618 12268 - l 7611 12261 7618 12268 100 -} -a { - s 0 - b 14639 12263 - e 14646 12270 - l 14639 12263 14646 12270 100 -} -a { - s 0 - b 8246 12263 - e 8253 12270 - l 8246 12263 8253 12270 100 -} -a { - s 0 - b 4863 12274 - e 4870 12281 - l 4863 12274 4870 12281 100 -} -a { - s 0 - b 293 12275 - e 300 12282 - l 293 12275 300 12282 100 -} -a { - s 0 - b 11286 12281 - e 11293 12288 - l 11286 12281 11293 12288 100 -} -a { - s 0 - b 11531 12283 - e 11538 12290 - l 11531 12283 11538 12290 100 -} -a { - s 0 - b 11532 12284 - e 11539 12291 - l 11532 12284 11539 12291 100 -} -a { - s 0 - b 11533 12285 - e 11540 12292 - l 11533 12285 11540 12292 100 -} -a { - s 0 - b 14566 12286 - e 14573 12293 - l 14566 12286 14573 12293 100 -} -a { - s 0 - b 11534 12286 - e 11541 12293 - l 11534 12286 11541 12293 100 -} -a { - s 0 - b 11535 12287 - e 11542 12294 - l 11535 12287 11542 12294 100 -} -a { - s 0 - b 7424 12287 - e 7431 12294 - l 7424 12287 7431 12294 100 -} -a { - s 0 - b 250 12290 - e 257 12297 - l 250 12290 257 12297 100 -} -a { - s 0 - b 4366 12293 - e 4373 12300 - l 4366 12293 4373 12300 100 -} -a { - s 0 - b 3950 12309 - e 3957 12316 - l 3950 12309 3957 12316 100 -} -a { - s 0 - b 15308 12319 - e 15315 12326 - l 15308 12319 15315 12326 100 -} -a { - s 0 - b 4514 12328 - e 4521 12335 - l 4514 12328 4521 12335 100 -} -a { - s 0 - b 17297 12337 - e 17304 12344 - l 17297 12337 17304 12344 100 -} -a { - s 0 - b 17298 12338 - e 17305 12345 - l 17298 12338 17305 12345 100 -} -a { - s 0 - b 14685 12341 - e 14692 12348 - l 14685 12341 14692 12348 100 -} -a { - s 0 - b 18610 12344 - e 18617 12351 - l 18610 12344 18617 12351 100 -} -a { - s 
0 - b 9470 12360 - e 9477 12367 - l 9470 12360 9477 12367 100 -} -a { - s 0 - b 14141 12375 - e 14148 12382 - l 14141 12375 14148 12382 100 -} -a { - s 0 - b 16058 12377 - e 16065 12384 - l 16058 12377 16065 12384 100 -} -a { - s 0 - b 8392 12378 - e 8399 12385 - l 8392 12378 8399 12385 100 -} -a { - s 0 - b 8393 12379 - e 8400 12386 - l 8393 12379 8400 12386 100 -} -a { - s 0 - b 8394 12380 - e 8401 12387 - l 8394 12380 8401 12387 100 -} -a { - s 0 - b 17250 12381 - e 17257 12388 - l 17250 12381 17257 12388 100 -} -a { - s 0 - b 17251 12382 - e 17258 12389 - l 17251 12382 17258 12389 100 -} -a { - s 0 - b 4441 12382 - e 4448 12389 - l 4441 12382 4448 12389 100 -} -a { - s 0 - b 14536 12391 - e 14543 12398 - l 14536 12391 14543 12398 100 -} -a { - s 0 - b 11391 12391 - e 11398 12398 - l 11391 12391 11398 12398 100 -} -a { - s 0 - b 6776 12396 - e 6783 12403 - l 6776 12396 6783 12403 100 -} -a { - s 0 - b 6777 12397 - e 6784 12404 - l 6777 12397 6784 12404 100 -} -a { - s 0 - b 6778 12398 - e 6785 12405 - l 6778 12398 6785 12405 100 -} -a { - s 0 - b 8962 12406 - e 8969 12413 - l 8962 12406 8969 12413 100 -} -a { - s 0 - b 5062 12407 - e 5069 12414 - l 5062 12407 5069 12414 100 -} -a { - s 0 - b 3336 12415 - e 3343 12422 - l 3336 12415 3343 12422 100 -} -a { - s 0 - b 14294 12418 - e 14301 12425 - l 14294 12418 14301 12425 100 -} -a { - s 0 - b 14295 12419 - e 14302 12426 - l 14295 12419 14302 12426 100 -} -a { - s 0 - b 6718 12419 - e 6725 12426 - l 6718 12419 6725 12426 100 -} -a { - s 0 - b 17153 12443 - e 17160 12450 - l 17153 12443 17160 12450 100 -} -a { - s 0 - b 18405 12462 - e 18412 12469 - l 18405 12462 18412 12469 100 -} -a { - s 0 - b 18406 12463 - e 18413 12470 - l 18406 12463 18413 12470 100 -} -a { - s 0 - b 3025 12463 - e 3032 12470 - l 3025 12463 3032 12470 100 -} -a { - s 0 - b 6564 12465 - e 6571 12472 - l 6564 12465 6571 12472 100 -} -a { - s 0 - b 4447 12471 - e 4454 12478 - l 4447 12471 4454 12478 100 -} -a { - s 0 - b 7656 12477 - e 7663 12484 
- l 7656 12477 7663 12484 100 -} -a { - s 0 - b 8361 12485 - e 8368 12492 - l 8361 12485 8368 12492 100 -} -a { - s 0 - b 8362 12486 - e 8369 12493 - l 8362 12486 8369 12493 100 -} -a { - s 0 - b 4057 12486 - e 4064 12493 - l 4057 12486 4064 12493 100 -} -a { - s 0 - b 11045 12488 - e 11052 12495 - l 11045 12488 11052 12495 100 -} -a { - s 0 - b 262 12490 - e 269 12497 - l 262 12490 269 12497 100 -} -a { - s 0 - b 263 12491 - e 270 12498 - l 263 12491 270 12498 100 -} -a { - s 0 - b 8402 12492 - e 8409 12499 - l 8402 12492 8409 12499 100 -} -a { - s 0 - b 8403 12493 - e 8410 12500 - l 8403 12493 8410 12500 100 -} -a { - s 0 - b 8761 12501 - e 8768 12508 - l 8761 12501 8768 12508 100 -} -a { - s 0 - b 8762 12502 - e 8769 12509 - l 8762 12502 8769 12509 100 -} -a { - s 0 - b 16100 12564 - e 16107 12571 - l 16100 12564 16107 12571 100 -} -a { - s 0 - b 4005 12589 - e 4012 12596 - l 4005 12589 4012 12596 100 -} -a { - s 0 - b 5702 12590 - e 5709 12597 - l 5702 12590 5709 12597 100 -} -a { - s 0 - b 14809 12595 - e 14816 12602 - l 14809 12595 14816 12602 100 -} -a { - s 0 - b 3544 12597 - e 3551 12604 - l 3544 12597 3551 12604 100 -} -a { - s 0 - b 3152 12599 - e 3159 12606 - l 3152 12599 3159 12606 100 -} -a { - s 0 - b 15169 12603 - e 15176 12610 - l 15169 12603 15176 12610 100 -} -a { - s 0 - b 8315 12610 - e 8322 12617 - l 8315 12610 8322 12617 100 -} -a { - s 0 - b 8316 12611 - e 8323 12618 - l 8316 12611 8323 12618 100 -} -a { - s 0 - b 7509 12612 - e 7516 12619 - l 7509 12612 7516 12619 100 -} -a { - s 0 - b 14058 12626 - e 14065 12633 - l 14058 12626 14065 12633 100 -} -a { - s 0 - b 18385 12631 - e 18392 12638 - l 18385 12631 18392 12638 100 -} -a { - s 0 - b 2996 12638 - e 3003 12645 - l 2996 12638 3003 12645 100 -} -a { - s 0 - b 9346 12667 - e 9353 12674 - l 9346 12667 9353 12674 100 -} -a { - s 0 - b 18669 12669 - e 18676 12676 - l 18669 12669 18676 12676 100 -} -a { - s 0 - b 8457 12669 - e 8464 12676 - l 8457 12669 8464 12676 100 -} -a { - s 0 - b 11007 
12672 - e 11014 12679 - l 11007 12672 11014 12679 100 -} -a { - s 0 - b 10894 12690 - e 10901 12697 - l 10894 12690 10901 12697 100 -} -a { - s 0 - b 9399 12694 - e 9406 12701 - l 9399 12694 9406 12701 100 -} -a { - s 0 - b 9400 12695 - e 9407 12702 - l 9400 12695 9407 12702 100 -} -a { - s 0 - b 14734 12699 - e 14741 12706 - l 14734 12699 14741 12706 100 -} -a { - s 0 - b 17589 12709 - e 17596 12716 - l 17589 12709 17596 12716 100 -} -a { - s 0 - b 8649 12709 - e 8656 12716 - l 8649 12709 8656 12716 100 -} -a { - s 0 - b 3233 12712 - e 3240 12719 - l 3233 12712 3240 12719 100 -} -a { - s 0 - b 3581 12724 - e 3588 12731 - l 3581 12724 3588 12731 100 -} -a { - s 0 - b 3918 12727 - e 3925 12734 - l 3918 12727 3925 12734 100 -} -a { - s 0 - b 5176 12730 - e 5183 12737 - l 5176 12730 5183 12737 100 -} -a { - s 0 - b 6632 12731 - e 6639 12738 - l 6632 12731 6639 12738 100 -} -a { - s 0 - b 4581 12731 - e 4588 12738 - l 4581 12731 4588 12738 100 -} -a { - s 0 - b 6633 12732 - e 6640 12739 - l 6633 12732 6640 12739 100 -} -a { - s 0 - b 6634 12733 - e 6641 12740 - l 6634 12733 6641 12740 100 -} -a { - s 0 - b 5688 12734 - e 5695 12741 - l 5688 12734 5695 12741 100 -} -a { - s 0 - b 4471 12734 - e 4478 12741 - l 4471 12734 4478 12741 100 -} -a { - s 0 - b 4472 12735 - e 4479 12742 - l 4472 12735 4479 12742 100 -} -a { - s 0 - b 4473 12736 - e 4480 12743 - l 4473 12736 4480 12743 100 -} -a { - s 0 - b 18532 12739 - e 18539 12746 - l 18532 12739 18539 12746 100 -} -a { - s 0 - b 9673 12752 - e 9680 12759 - l 9673 12752 9680 12759 100 -} -a { - s 0 - b 3207 12757 - e 3214 12764 - l 3207 12757 3214 12764 100 -} -a { - s 0 - b 21 12773 - e 28 12780 - l 21 12773 28 12780 100 -} -a { - s 0 - b 14476 12775 - e 14483 12782 - l 14476 12775 14483 12782 100 -} -a { - s 0 - b 11478 12776 - e 11485 12783 - l 11478 12776 11485 12783 100 -} -a { - s 0 - b 11479 12777 - e 11486 12784 - l 11479 12777 11486 12784 100 -} -a { - s 0 - b 9639 12779 - e 9646 12786 - l 9639 12779 9646 12786 100 
-} -a { - s 0 - b 6655 12779 - e 6662 12786 - l 6655 12779 6662 12786 100 -} -a { - s 0 - b 3582 12779 - e 3589 12786 - l 3582 12779 3589 12786 100 -} -a { - s 0 - b 6706 12780 - e 6713 12787 - l 6706 12780 6713 12787 100 -} -a { - s 0 - b 6656 12780 - e 6663 12787 - l 6656 12780 6663 12787 100 -} -a { - s 0 - b 6657 12781 - e 6664 12788 - l 6657 12781 6664 12788 100 -} -a { - s 0 - b 15132 12784 - e 15139 12791 - l 15132 12784 15139 12791 100 -} -a { - s 0 - b 5040 12787 - e 5047 12794 - l 5040 12787 5047 12794 100 -} -a { - s 0 - b 5041 12788 - e 5048 12795 - l 5041 12788 5048 12795 100 -} -a { - s 0 - b 11524 12789 - e 11531 12796 - l 11524 12789 11531 12796 100 -} -a { - s 0 - b 5615 12789 - e 5622 12796 - l 5615 12789 5622 12796 100 -} -a { - s 0 - b 5042 12789 - e 5049 12796 - l 5042 12789 5049 12796 100 -} -a { - s 0 - b 10975 12790 - e 10982 12797 - l 10975 12790 10982 12797 100 -} -a { - s 0 - b 11120 12817 - e 11127 12824 - l 11120 12817 11127 12824 100 -} -a { - s 0 - b 18362 12818 - e 18369 12825 - l 18362 12818 18369 12825 100 -} -a { - s 0 - b 6563 12819 - e 6570 12826 - l 6563 12819 6570 12826 100 -} -a { - s 0 - b 6564 12820 - e 6571 12827 - l 6564 12820 6571 12827 100 -} -a { - s 0 - b 18444 12830 - e 18451 12837 - l 18444 12830 18451 12837 100 -} -a { - s 0 - b 18445 12831 - e 18452 12838 - l 18445 12831 18452 12838 100 -} -a { - s 0 - b 6626 12843 - e 6633 12850 - l 6626 12843 6633 12850 100 -} -a { - s 0 - b 13917 12844 - e 13924 12851 - l 13917 12844 13924 12851 100 -} -a { - s 0 - b 13918 12845 - e 13925 12852 - l 13918 12845 13925 12852 100 -} -a { - s 0 - b 14086 12856 - e 14093 12863 - l 14086 12856 14093 12863 100 -} -a { - s 0 - b 8677 12859 - e 8684 12866 - l 8677 12859 8684 12866 100 -} -a { - s 0 - b 14109 12864 - e 14116 12871 - l 14109 12864 14116 12871 100 -} -a { - s 0 - b 14117 12867 - e 14124 12874 - l 14117 12867 14124 12874 100 -} -a { - s 0 - b 14118 12868 - e 14125 12875 - l 14118 12868 14125 12875 100 -} -a { - s 0 - b 11414 
12872 - e 11421 12879 - l 11414 12872 11421 12879 100 -} -a { - s 0 - b 4493 12873 - e 4500 12880 - l 4493 12873 4500 12880 100 -} -a { - s 0 - b 6532 12877 - e 6539 12884 - l 6532 12877 6539 12884 100 -} -a { - s 0 - b 3254 12888 - e 3261 12895 - l 3254 12888 3261 12895 100 -} -a { - s 0 - b 11497 12898 - e 11504 12905 - l 11497 12898 11504 12905 100 -} -a { - s 0 - b 3264 12898 - e 3271 12905 - l 3264 12898 3271 12905 100 -} -a { - s 0 - b 11498 12899 - e 11505 12906 - l 11498 12899 11505 12906 100 -} -a { - s 0 - b 9317 12901 - e 9324 12908 - l 9317 12901 9324 12908 100 -} -a { - s 0 - b 6529 12911 - e 6536 12918 - l 6529 12911 6536 12918 100 -} -a { - s 0 - b 8290 12916 - e 8297 12923 - l 8290 12916 8297 12923 100 -} -a { - s 0 - b 7403 12917 - e 7410 12924 - l 7403 12917 7410 12924 100 -} -a { - s 0 - b 7404 12918 - e 7411 12925 - l 7404 12918 7411 12925 100 -} -a { - s 0 - b 3242 12923 - e 3249 12930 - l 3242 12923 3249 12930 100 -} -a { - s 0 - b 63 12928 - e 70 12935 - l 63 12928 70 12935 100 -} -a { - s 0 - b 8907 12932 - e 8914 12939 - l 8907 12932 8914 12939 100 -} -a { - s 0 - b 13950 12933 - e 13957 12940 - l 13950 12933 13957 12940 100 -} -a { - s 0 - b 8418 12935 - e 8425 12942 - l 8418 12935 8425 12942 100 -} -a { - s 0 - b 11318 12939 - e 11325 12946 - l 11318 12939 11325 12946 100 -} -a { - s 0 - b 11319 12940 - e 11326 12947 - l 11319 12940 11326 12947 100 -} -a { - s 0 - b 11273 12943 - e 11280 12950 - l 11273 12943 11280 12950 100 -} -a { - s 0 - b 15306 12949 - e 15313 12956 - l 15306 12949 15313 12956 100 -} -a { - s 0 - b 15307 12950 - e 15314 12957 - l 15307 12950 15314 12957 100 -} -a { - s 0 - b 11407 12950 - e 11414 12957 - l 11407 12950 11414 12957 100 -} -a { - s 0 - b 15308 12951 - e 15315 12958 - l 15308 12951 15315 12958 100 -} -a { - s 0 - b 15309 12952 - e 15316 12959 - l 15309 12952 15316 12959 100 -} -a { - s 0 - b 13528 12953 - e 13535 12960 - l 13528 12953 13535 12960 100 -} -a { - s 0 - b 13529 12954 - e 13536 12961 - l 13529 
12954 13536 12961 100 -} -a { - s 0 - b 10902 12954 - e 10909 12961 - l 10902 12954 10909 12961 100 -} -a { - s 0 - b 17957 12960 - e 17964 12967 - l 17957 12960 17964 12967 100 -} -a { - s 0 - b 6650 12968 - e 6657 12975 - l 6650 12968 6657 12975 100 -} -a { - s 0 - b 6651 12969 - e 6658 12976 - l 6651 12969 6658 12976 100 -} -a { - s 0 - b 6652 12970 - e 6659 12977 - l 6652 12970 6659 12977 100 -} -a { - s 0 - b 118 12973 - e 125 12980 - l 118 12973 125 12980 100 -} -a { - s 0 - b 51 13005 - e 58 13012 - l 51 13005 58 13012 100 -} -a { - s 0 - b 7688 13030 - e 7695 13037 - l 7688 13030 7695 13037 100 -} -a { - s 0 - b 6277 13032 - e 6284 13039 - l 6277 13032 6284 13039 100 -} -a { - s 0 - b 14009 13039 - e 14016 13046 - l 14009 13039 14016 13046 100 -} -a { - s 0 - b 11161 13057 - e 11168 13064 - l 11161 13057 11168 13064 100 -} -a { - s 0 - b 18357 13076 - e 18364 13083 - l 18357 13076 18364 13083 100 -} -a { - s 0 - b 14924 13080 - e 14931 13087 - l 14924 13080 14931 13087 100 -} -a { - s 0 - b 5188 13090 - e 5195 13097 - l 5188 13090 5195 13097 100 -} -a { - s 0 - b 5189 13091 - e 5196 13098 - l 5189 13091 5196 13098 100 -} -a { - s 0 - b 6845 13097 - e 6852 13104 - l 6845 13097 6852 13104 100 -} -a { - s 0 - b 182 13110 - e 189 13117 - l 182 13110 189 13117 100 -} -a { - s 0 - b 3202 13139 - e 3209 13146 - l 3202 13139 3209 13146 100 -} -a { - s 0 - b 4800 13140 - e 4807 13147 - l 4800 13140 4807 13147 100 -} -a { - s 0 - b 4389 13140 - e 4396 13147 - l 4389 13140 4396 13147 100 -} -a { - s 0 - b 10962 13141 - e 10969 13148 - l 10962 13141 10969 13148 100 -} -a { - s 0 - b 9649 13150 - e 9656 13157 - l 9649 13150 9656 13157 100 -} -a { - s 0 - b 14636 13180 - e 14643 13187 - l 14636 13180 14643 13187 100 -} -a { - s 0 - b 8588 13180 - e 8595 13187 - l 8588 13180 8595 13187 100 -} -a { - s 0 - b 13491 13182 - e 13498 13189 - l 13491 13182 13498 13189 100 -} -a { - s 0 - b 15096 13384 - e 15103 13391 - l 15096 13384 15103 13391 100 -} -a { - s 0 - b 10878 13395 
- e 10885 13402 - l 10878 13395 10885 13402 100 -} -a { - s 0 - b 14650 13399 - e 14657 13406 - l 14650 13399 14657 13406 100 -} -a { - s 0 - b 3172 13412 - e 3179 13419 - l 3172 13412 3179 13419 100 -} -a { - s 0 - b 13922 13413 - e 13929 13420 - l 13922 13413 13929 13420 100 -} -a { - s 0 - b 3173 13413 - e 3180 13420 - l 3173 13413 3180 13420 100 -} -a { - s 0 - b 11303 13414 - e 11310 13421 - l 11303 13414 11310 13421 100 -} -a { - s 0 - b 8948 13414 - e 8955 13421 - l 8948 13414 8955 13421 100 -} -a { - s 0 - b 8883 13437 - e 8890 13444 - l 8883 13437 8890 13444 100 -} -a { - s 0 - b 18633 13458 - e 18640 13465 - l 18633 13458 18640 13465 100 -} -a { - s 0 - b 18634 13459 - e 18641 13466 - l 18634 13459 18641 13466 100 -} -a { - s 0 - b 7623 13459 - e 7630 13466 - l 7623 13459 7630 13466 100 -} -a { - s 0 - b 18635 13460 - e 18642 13467 - l 18635 13460 18642 13467 100 -} -a { - s 0 - b 18636 13461 - e 18643 13468 - l 18636 13461 18643 13468 100 -} -a { - s 0 - b 3970 13461 - e 3977 13468 - l 3970 13461 3977 13468 100 -} -a { - s 0 - b 18637 13462 - e 18644 13469 - l 18637 13462 18644 13469 100 -} -a { - s 0 - b 4351 13462 - e 4358 13469 - l 4351 13462 4358 13469 100 -} -a { - s 0 - b 3971 13462 - e 3978 13469 - l 3971 13462 3978 13469 100 -} -a { - s 0 - b 3972 13463 - e 3979 13470 - l 3972 13463 3979 13470 100 -} -a { - s 0 - b 16150 13464 - e 16157 13471 - l 16150 13464 16157 13471 100 -} -a { - s 0 - b 3973 13464 - e 3980 13471 - l 3973 13464 3980 13471 100 -} -a { - s 0 - b 16151 13465 - e 16158 13472 - l 16151 13465 16158 13472 100 -} -a { - s 0 - b 3096 13466 - e 3103 13473 - l 3096 13466 3103 13473 100 -} -a { - s 0 - b 14395 13469 - e 14402 13476 - l 14395 13469 14402 13476 100 -} -a { - s 0 - b 15286 13472 - e 15293 13479 - l 15286 13472 15293 13479 100 -} -a { - s 0 - b 15287 13473 - e 15294 13480 - l 15287 13473 15294 13480 100 -} -a { - s 0 - b 15288 13474 - e 15295 13481 - l 15288 13474 15295 13481 100 -} -a { - s 0 - b 11212 13486 - e 11219 13493 
- l 11212 13486 11219 13493 100 -} -a { - s 0 - b 11213 13487 - e 11220 13494 - l 11213 13487 11220 13494 100 -} -a { - s 0 - b 4066 13490 - e 4073 13497 - l 4066 13490 4073 13497 100 -} -a { - s 0 - b 14946 13496 - e 14953 13503 - l 14946 13496 14953 13503 100 -} -a { - s 0 - b 238 13515 - e 245 13522 - l 238 13515 245 13522 100 -} -a { - s 0 - b 14612 13557 - e 14619 13564 - l 14612 13557 14619 13564 100 -} -a { - s 0 - b 11207 13559 - e 11214 13566 - l 11207 13559 11214 13566 100 -} -a { - s 0 - b 14932 13566 - e 14939 13573 - l 14932 13566 14939 13573 100 -} -a { - s 0 - b 14933 13567 - e 14940 13574 - l 14933 13567 14940 13574 100 -} -a { - s 0 - b 15356 13579 - e 15363 13586 - l 15356 13579 15363 13586 100 -} -a { - s 0 - b 15357 13580 - e 15364 13587 - l 15357 13580 15364 13587 100 -} -a { - s 0 - b 14696 13585 - e 14703 13592 - l 14696 13585 14703 13592 100 -} -a { - s 0 - b 3194 13587 - e 3201 13594 - l 3194 13587 3201 13594 100 -} -a { - s 0 - b 7725 13595 - e 7732 13602 - l 7725 13595 7732 13602 100 -} -a { - s 0 - b 11533 13596 - e 11540 13603 - l 11533 13596 11540 13603 100 -} -a { - s 0 - b 14566 13597 - e 14573 13604 - l 14566 13597 14573 13604 100 -} -a { - s 0 - b 11534 13597 - e 11541 13604 - l 11534 13597 11541 13604 100 -} -a { - s 0 - b 14567 13598 - e 14574 13605 - l 14567 13598 14574 13605 100 -} -a { - s 0 - b 3334 13602 - e 3341 13609 - l 3334 13602 3341 13609 100 -} -a { - s 0 - b 14229 13630 - e 14236 13637 - l 14229 13630 14236 13637 100 -} -a { - s 0 - b 7446 13644 - e 7453 13651 - l 7446 13644 7453 13651 100 -} -a { - s 0 - b 3559 13655 - e 3566 13662 - l 3559 13655 3566 13662 100 -} -a { - s 0 - b 4811 13659 - e 4818 13666 - l 4811 13659 4818 13666 100 -} -a { - s 0 - b 8503 13665 - e 8510 13672 - l 8503 13665 8510 13672 100 -} -a { - s 0 - b 11061 13666 - e 11068 13673 - l 11061 13666 11068 13673 100 -} -a { - s 0 - b 5010 13675 - e 5017 13682 - l 5010 13675 5017 13682 100 -} -a { - s 0 - b 8994 13678 - e 9001 13685 - l 8994 13678 
9001 13685 100 -} -a { - s 0 - b 8995 13679 - e 9002 13686 - l 8995 13679 9002 13686 100 -} -a { - s 0 - b 12235 13707 - e 12242 13714 - l 12235 13707 12242 13714 100 -} -a { - s 0 - b 15915 13713 - e 15922 13720 - l 15915 13713 15922 13720 100 -} -a { - s 0 - b 14718 13743 - e 14725 13750 - l 14718 13743 14725 13750 100 -} -a { - s 0 - b 7737 13743 - e 7744 13750 - l 7737 13743 7744 13750 100 -} -a { - s 0 - b 9735 13752 - e 9742 13759 - l 9735 13752 9742 13759 100 -} -a { - s 0 - b 9736 13753 - e 9743 13760 - l 9736 13753 9743 13760 100 -} -a { - s 0 - b 9737 13754 - e 9744 13761 - l 9737 13754 9744 13761 100 -} -a { - s 0 - b 8452 13754 - e 8459 13761 - l 8452 13754 8459 13761 100 -} -a { - s 0 - b 14257 13759 - e 14264 13766 - l 14257 13759 14264 13766 100 -} -a { - s 0 - b 11539 13763 - e 11546 13770 - l 11539 13763 11546 13770 100 -} -a { - s 0 - b 6409 13773 - e 6416 13780 - l 6409 13773 6416 13780 100 -} -a { - s 0 - b 4353 13809 - e 4360 13816 - l 4353 13809 4360 13816 100 -} -a { - s 0 - b 14502 13811 - e 14509 13818 - l 14502 13811 14509 13818 100 -} -a { - s 0 - b 14379 13812 - e 14386 13819 - l 14379 13812 14386 13819 100 -} -a { - s 0 - b 6853 13821 - e 6860 13828 - l 6853 13821 6860 13828 100 -} -a { - s 0 - b 6854 13822 - e 6861 13829 - l 6854 13822 6861 13829 100 -} -a { - s 0 - b 18604 13824 - e 18611 13831 - l 18604 13824 18611 13831 100 -} -a { - s 0 - b 14315 13824 - e 14322 13831 - l 14315 13824 14322 13831 100 -} -a { - s 0 - b 14966 13825 - e 14973 13832 - l 14966 13825 14973 13832 100 -} -a { - s 0 - b 14316 13825 - e 14323 13832 - l 14316 13825 14323 13832 100 -} -a { - s 0 - b 11474 13825 - e 11481 13832 - l 11474 13825 11481 13832 100 -} -a { - s 0 - b 3138 13825 - e 3145 13832 - l 3138 13825 3145 13832 100 -} -a { - s 0 - b 14317 13826 - e 14324 13833 - l 14317 13826 14324 13833 100 -} -a { - s 0 - b 11475 13826 - e 11482 13833 - l 11475 13826 11482 13833 100 -} -a { - s 0 - b 11476 13827 - e 11483 13834 - l 11476 13827 11483 13834 100 
-} -a { - s 0 - b 17319 13833 - e 17326 13840 - l 17319 13833 17326 13840 100 -} -a { - s 0 - b 4973 13854 - e 4980 13861 - l 4973 13854 4980 13861 100 -} -a { - s 0 - b 5687 13857 - e 5694 13864 - l 5687 13857 5694 13864 100 -} -a { - s 0 - b 17554 13860 - e 17561 13867 - l 17554 13860 17561 13867 100 -} -a { - s 0 - b 18419 13864 - e 18426 13871 - l 18419 13864 18426 13871 100 -} -a { - s 0 - b 18420 13865 - e 18427 13872 - l 18420 13865 18427 13872 100 -} -a { - s 0 - b 9403 13866 - e 9410 13873 - l 9403 13866 9410 13873 100 -} -a { - s 0 - b 9331 13868 - e 9338 13875 - l 9331 13868 9338 13875 100 -} -a { - s 0 - b 4487 13874 - e 4494 13881 - l 4487 13874 4494 13881 100 -} -a { - s 0 - b 183 13882 - e 190 13889 - l 183 13882 190 13889 100 -} -a { - s 0 - b 18600 13887 - e 18607 13894 - l 18600 13887 18607 13894 100 -} -a { - s 0 - b 8554 13887 - e 8561 13894 - l 8554 13887 8561 13894 100 -} -a { - s 0 - b 18601 13888 - e 18608 13895 - l 18601 13888 18608 13895 100 -} -a { - s 0 - b 17583 13890 - e 17590 13897 - l 17583 13890 17590 13897 100 -} -a { - s 0 - b 17179 13908 - e 17186 13915 - l 17179 13908 17186 13915 100 -} -a { - s 0 - b 3249 13911 - e 3256 13918 - l 3249 13911 3256 13918 100 -} -a { - s 0 - b 15365 13914 - e 15372 13921 - l 15365 13914 15372 13921 100 -} -a { - s 0 - b 17171 13926 - e 17178 13933 - l 17171 13926 17178 13933 100 -} -a { - s 0 - b 6735 13946 - e 6742 13953 - l 6735 13946 6742 13953 100 -} -a { - s 0 - b 4625 13949 - e 4632 13956 - l 4625 13949 4632 13956 100 -} -a { - s 0 - b 4626 13950 - e 4633 13957 - l 4626 13950 4633 13957 100 -} -a { - s 0 - b 4910 13951 - e 4917 13958 - l 4910 13951 4917 13958 100 -} -a { - s 0 - b 3250 13953 - e 3257 13960 - l 3250 13953 3257 13960 100 -} -a { - s 0 - b 3251 13954 - e 3258 13961 - l 3251 13954 3258 13961 100 -} -a { - s 0 - b 18624 13956 - e 18631 13963 - l 18624 13956 18631 13963 100 -} -a { - s 0 - b 13999 13965 - e 14006 13972 - l 13999 13965 14006 13972 100 -} -a { - s 0 - b 14580 13973 - 
e 14587 13980 - l 14580 13973 14587 13980 100 -} -a { - s 0 - b 10998 13979 - e 11005 13986 - l 10998 13979 11005 13986 100 -} -a { - s 0 - b 14549 13995 - e 14556 14002 - l 14549 13995 14556 14002 100 -} -a { - s 0 - b 5012 14014 - e 5019 14021 - l 5012 14014 5019 14021 100 -} -a { - s 0 - b 18368 14027 - e 18375 14034 - l 18368 14027 18375 14034 100 -} -a { - s 0 - b 6803 14031 - e 6810 14038 - l 6803 14031 6810 14038 100 -} -a { - s 0 - b 6606 14032 - e 6613 14039 - l 6606 14032 6613 14039 100 -} -a { - s 0 - b 9717 14034 - e 9724 14041 - l 9717 14034 9724 14041 100 -} -a { - s 0 - b 4474 14035 - e 4481 14042 - l 4474 14035 4481 14042 100 -} -a { - s 0 - b 8131 14037 - e 8138 14044 - l 8131 14037 8138 14044 100 -} -a { - s 0 - b 8751 14067 - e 8758 14074 - l 8751 14067 8758 14074 100 -} -a { - s 0 - b 8752 14068 - e 8759 14075 - l 8752 14068 8759 14075 100 -} -a { - s 0 - b 8753 14069 - e 8760 14076 - l 8753 14069 8760 14076 100 -} -a { - s 0 - b 8754 14070 - e 8761 14077 - l 8754 14070 8761 14077 100 -} -a { - s 0 - b 11092 14071 - e 11099 14078 - l 11092 14071 11099 14078 100 -} -a { - s 0 - b 8755 14071 - e 8762 14078 - l 8755 14071 8762 14078 100 -} -a { - s 0 - b 15377 14074 - e 15384 14081 - l 15377 14074 15384 14081 100 -} -a { - s 0 - b 11469 14087 - e 11476 14094 - l 11469 14087 11476 14094 100 -} -a { - s 0 - b 9022 14088 - e 9029 14095 - l 9022 14088 9029 14095 100 -} -a { - s 0 - b 18359 14089 - e 18366 14096 - l 18359 14089 18366 14096 100 -} -a { - s 0 - b 12237 14091 - e 12244 14098 - l 12237 14091 12244 14098 100 -} -a { - s 0 - b 18375 14093 - e 18382 14100 - l 18375 14093 18382 14100 100 -} -a { - s 0 - b 3977 14094 - e 3984 14101 - l 3977 14094 3984 14101 100 -} -a { - s 0 - b 15324 14095 - e 15331 14102 - l 15324 14095 15331 14102 100 -} -a { - s 0 - b 10978 14095 - e 10985 14102 - l 10978 14095 10985 14102 100 -} -a { - s 0 - b 18578 14104 - e 18585 14111 - l 18578 14104 18585 14111 100 -} -a { - s 0 - b 9452 14108 - e 9459 14115 - l 9452 
14108 9459 14115 100 -} -a { - s 0 - b 9453 14109 - e 9460 14116 - l 9453 14109 9460 14116 100 -} -a { - s 0 - b 9454 14110 - e 9461 14117 - l 9454 14110 9461 14117 100 -} -a { - s 0 - b 6725 14110 - e 6732 14117 - l 6725 14110 6732 14117 100 -} -a { - s 0 - b 9455 14111 - e 9462 14118 - l 9455 14111 9462 14118 100 -} -a { - s 0 - b 6726 14111 - e 6733 14118 - l 6726 14111 6733 14118 100 -} -a { - s 0 - b 8825 14116 - e 8832 14123 - l 8825 14116 8832 14123 100 -} -a { - s 0 - b 18591 14118 - e 18598 14125 - l 18591 14118 18598 14125 100 -} -a { - s 0 - b 8464 14120 - e 8471 14127 - l 8464 14120 8471 14127 100 -} -a { - s 0 - b 8465 14121 - e 8472 14128 - l 8465 14121 8472 14128 100 -} -a { - s 0 - b 8307 14122 - e 8314 14129 - l 8307 14122 8314 14129 100 -} -a { - s 0 - b 3574 14122 - e 3581 14129 - l 3574 14122 3581 14129 100 -} -a { - s 0 - b 11047 14123 - e 11054 14130 - l 11047 14123 11054 14130 100 -} -a { - s 0 - b 4459 14133 - e 4466 14140 - l 4459 14133 4466 14140 100 -} -a { - s 0 - b 18622 14136 - e 18629 14143 - l 18622 14136 18629 14143 100 -} -a { - s 0 - b 18623 14137 - e 18630 14144 - l 18623 14137 18630 14144 100 -} -a { - s 0 - b 18624 14138 - e 18631 14145 - l 18624 14138 18631 14145 100 -} -a { - s 0 - b 18625 14139 - e 18632 14146 - l 18625 14139 18632 14146 100 -} -a { - s 0 - b 8806 14147 - e 8813 14154 - l 8806 14147 8813 14154 100 -} -a { - s 0 - b 18707 14175 - e 18714 14182 - l 18707 14175 18714 14182 100 -} -a { - s 0 - b 4896 14179 - e 4903 14186 - l 4896 14179 4903 14186 100 -} -a { - s 0 - b 18677 14191 - e 18684 14198 - l 18677 14191 18684 14198 100 -} -a { - s 0 - b 15331 14207 - e 15338 14214 - l 15331 14207 15338 14214 100 -} -a { - s 0 - b 11235 14211 - e 11242 14218 - l 11235 14211 11242 14218 100 -} -a { - s 0 - b 17242 14225 - e 17249 14232 - l 17242 14225 17249 14232 100 -} -a { - s 0 - b 7717 14231 - e 7724 14238 - l 7717 14231 7724 14238 100 -} -a { - s 0 - b 8222 14232 - e 8229 14239 - l 8222 14232 8229 14239 100 -} -a { - 
s 0 - b 7718 14232 - e 7725 14239 - l 7718 14232 7725 14239 100 -} -a { - s 0 - b 8223 14233 - e 8230 14240 - l 8223 14233 8230 14240 100 -} -a { - s 0 - b 5151 14237 - e 5158 14244 - l 5151 14237 5158 14244 100 -} -a { - s 0 - b 15963 14241 - e 15970 14248 - l 15963 14241 15970 14248 100 -} -a { - s 0 - b 14352 14241 - e 14359 14248 - l 14352 14241 14359 14248 100 -} -a { - s 0 - b 8827 14252 - e 8834 14259 - l 8827 14252 8834 14259 100 -} -a { - s 0 - b 9345 14261 - e 9352 14268 - l 9345 14261 9352 14268 100 -} -a { - s 0 - b 14293 14284 - e 14300 14291 - l 14293 14284 14300 14291 100 -} -a { - s 0 - b 6617 14284 - e 6624 14291 - l 6617 14284 6624 14291 100 -} -a { - s 0 - b 13959 14287 - e 13966 14294 - l 13959 14287 13966 14294 100 -} -a { - s 0 - b 8672 14287 - e 8679 14294 - l 8672 14287 8679 14294 100 -} -a { - s 0 - b 9818 14295 - e 9825 14302 - l 9818 14295 9825 14302 100 -} -a { - s 0 - b 9819 14296 - e 9826 14303 - l 9819 14296 9826 14303 100 -} -a { - s 0 - b 3075 14299 - e 3082 14306 - l 3075 14299 3082 14306 100 -} -a { - s 0 - b 12577 14300 - e 12584 14307 - l 12577 14300 12584 14307 100 -} -a { - s 0 - b 3076 14300 - e 3083 14307 - l 3076 14300 3083 14307 100 -} -a { - s 0 - b 18771 14301 - e 18778 14308 - l 18771 14301 18778 14308 100 -} -a { - s 0 - b 12578 14301 - e 12585 14308 - l 12578 14301 12585 14308 100 -} -a { - s 0 - b 18624 14303 - e 18631 14310 - l 18624 14303 18631 14310 100 -} -a { - s 0 - b 15204 14313 - e 15211 14320 - l 15204 14313 15211 14320 100 -} -a { - s 0 - b 11313 14335 - e 11320 14342 - l 11313 14335 11320 14342 100 -} -a { - s 0 - b 271 14335 - e 278 14342 - l 271 14335 278 14342 100 -} -a { - s 0 - b 3198 14337 - e 3205 14344 - l 3198 14337 3205 14344 100 -} -a { - s 0 - b 6326 14360 - e 6333 14367 - l 6326 14360 6333 14367 100 -} -a { - s 0 - b 6269 14375 - e 6276 14382 - l 6269 14375 6276 14382 100 -} -a { - s 0 - b 6270 14376 - e 6277 14383 - l 6270 14376 6277 14383 100 -} -a { - s 0 - b 15968 14378 - e 15975 14385 - l 
15968 14378 15975 14385 100 -} -a { - s 0 - b 14619 14381 - e 14626 14388 - l 14619 14381 14626 14388 100 -} -a { - s 0 - b 4525 14381 - e 4532 14388 - l 4525 14381 4532 14388 100 -} -a { - s 0 - b 4526 14382 - e 4533 14389 - l 4526 14382 4533 14389 100 -} -a { - s 0 - b 18038 14383 - e 18045 14390 - l 18038 14383 18045 14390 100 -} -a { - s 0 - b 17368 14384 - e 17375 14391 - l 17368 14384 17375 14391 100 -} -a { - s 0 - b 18732 14385 - e 18739 14392 - l 18732 14385 18739 14392 100 -} -a { - s 0 - b 14500 14387 - e 14507 14394 - l 14500 14387 14507 14394 100 -} -a { - s 0 - b 14026 14392 - e 14033 14399 - l 14026 14392 14033 14399 100 -} -a { - s 0 - b 14027 14393 - e 14034 14400 - l 14027 14393 14034 14400 100 -} -a { - s 0 - b 3238 14393 - e 3245 14400 - l 3238 14393 3245 14400 100 -} -a { - s 0 - b 14028 14394 - e 14035 14401 - l 14028 14394 14035 14401 100 -} -a { - s 0 - b 14029 14395 - e 14036 14402 - l 14029 14395 14036 14402 100 -} -a { - s 0 - b 9738 14411 - e 9745 14418 - l 9738 14411 9745 14418 100 -} -a { - s 0 - b 8453 14411 - e 8460 14418 - l 8453 14411 8460 14418 100 -} -a { - s 0 - b 5697 14412 - e 5704 14419 - l 5697 14412 5704 14419 100 -} -a { - s 0 - b 5698 14413 - e 5705 14420 - l 5698 14413 5705 14420 100 -} -a { - s 0 - b 7720 14414 - e 7727 14421 - l 7720 14414 7727 14421 100 -} -a { - s 0 - b 17381 14433 - e 17388 14440 - l 17381 14433 17388 14440 100 -} -a { - s 0 - b 17382 14434 - e 17389 14441 - l 17382 14434 17389 14441 100 -} -a { - s 0 - b 17383 14435 - e 17390 14442 - l 17383 14435 17390 14442 100 -} -a { - s 0 - b 8713 14436 - e 8720 14443 - l 8713 14436 8720 14443 100 -} -a { - s 0 - b 18692 14460 - e 18699 14467 - l 18692 14460 18699 14467 100 -} -a { - s 0 - b 84 14461 - e 91 14468 - l 84 14461 91 14468 100 -} -a { - s 0 - b 85 14462 - e 92 14469 - l 85 14462 92 14469 100 -} -a { - s 0 - b 8278 14466 - e 8285 14473 - l 8278 14466 8285 14473 100 -} -a { - s 0 - b 17985 14468 - e 17992 14475 - l 17985 14468 17992 14475 100 -} -a { 
- s 0 - b 18591 14481 - e 18598 14488 - l 18591 14481 18598 14488 100 -} -a { - s 0 - b 18592 14482 - e 18599 14489 - l 18592 14482 18599 14489 100 -} -a { - s 0 - b 4243 14490 - e 4250 14497 - l 4243 14490 4250 14497 100 -} -a { - s 0 - b 18731 14500 - e 18738 14507 - l 18731 14500 18738 14507 100 -} -a { - s 0 - b 18732 14501 - e 18739 14508 - l 18732 14501 18739 14508 100 -} -a { - s 0 - b 14964 14510 - e 14971 14517 - l 14964 14510 14971 14517 100 -} -a { - s 0 - b 14619 14527 - e 14626 14534 - l 14619 14527 14626 14534 100 -} -a { - s 0 - b 4525 14527 - e 4532 14534 - l 4525 14527 4532 14534 100 -} -a { - s 0 - b 4526 14528 - e 4533 14535 - l 4526 14528 4533 14535 100 -} -a { - s 0 - b 18038 14529 - e 18045 14536 - l 18038 14529 18045 14536 100 -} -a { - s 0 - b 8924 14540 - e 8931 14547 - l 8924 14540 8931 14547 100 -} -a { - s 0 - b 6843 14545 - e 6850 14552 - l 6843 14545 6850 14552 100 -} -a { - s 0 - b 15231 14546 - e 15238 14553 - l 15231 14546 15238 14553 100 -} -a { - s 0 - b 6844 14546 - e 6851 14553 - l 6844 14546 6851 14553 100 -} -a { - s 0 - b 8199 14547 - e 8206 14554 - l 8199 14547 8206 14554 100 -} -a { - s 0 - b 17094 14548 - e 17101 14555 - l 17094 14548 17101 14555 100 -} -a { - s 0 - b 12239 14548 - e 12246 14555 - l 12239 14548 12246 14555 100 -} -a { - s 0 - b 11054 14593 - e 11061 14600 - l 11054 14593 11061 14600 100 -} -a { - s 0 - b 3919 14593 - e 3926 14600 - l 3919 14593 3926 14600 100 -} -a { - s 0 - b 18734 14594 - e 18741 14601 - l 18734 14594 18741 14601 100 -} -a { - s 0 - b 3920 14594 - e 3927 14601 - l 3920 14594 3927 14601 100 -} -a { - s 0 - b 6610 14777 - e 6617 14784 - l 6610 14777 6617 14784 100 -} -a { - s 0 - b 6611 14778 - e 6618 14785 - l 6611 14778 6618 14785 100 -} -a { - s 0 - b 17212 14781 - e 17219 14788 - l 17212 14781 17219 14788 100 -} -a { - s 0 - b 7702 14787 - e 7709 14794 - l 7702 14787 7709 14794 100 -} -a { - s 0 - b 14147 14791 - e 14154 14798 - l 14147 14791 14154 14798 100 -} -a { - s 0 - b 14148 
14792 - e 14155 14799 - l 14148 14792 14155 14799 100 -} -a { - s 0 - b 14149 14793 - e 14156 14800 - l 14149 14793 14156 14800 100 -} -a { - s 0 - b 17414 14794 - e 17421 14801 - l 17414 14794 17421 14801 100 -} -a { - s 0 - b 14304 14794 - e 14311 14801 - l 14304 14794 14311 14801 100 -} -a { - s 0 - b 7439 14794 - e 7446 14801 - l 7439 14794 7446 14801 100 -} -a { - s 0 - b 11354 14799 - e 11361 14806 - l 11354 14799 11361 14806 100 -} -a { - s 0 - b 11355 14800 - e 11362 14807 - l 11355 14800 11362 14807 100 -} -a { - s 0 - b 4577 14810 - e 4584 14817 - l 4577 14810 4584 14817 100 -} -a { - s 0 - b 3347 14810 - e 3354 14817 - l 3347 14810 3354 14817 100 -} -a { - s 0 - b 3619 14812 - e 3626 14819 - l 3619 14812 3626 14819 100 -} -a { - s 0 - b 14502 14814 - e 14509 14821 - l 14502 14814 14509 14821 100 -} -a { - s 0 - b 14379 14815 - e 14386 14822 - l 14379 14815 14386 14822 100 -} -a { - s 0 - b 8458 14816 - e 8465 14823 - l 8458 14816 8465 14823 100 -} -a { - s 0 - b 11112 14817 - e 11119 14824 - l 11112 14817 11119 14824 100 -} -a { - s 0 - b 5145 14826 - e 5152 14833 - l 5145 14826 5152 14833 100 -} -a { - s 0 - b 143 14830 - e 150 14837 - l 143 14830 150 14837 100 -} -a { - s 0 - b 11188 14840 - e 11195 14847 - l 11188 14840 11195 14847 100 -} -a { - s 0 - b 15989 14844 - e 15996 14851 - l 15989 14844 15996 14851 100 -} -a { - s 0 - b 12601 14854 - e 12608 14861 - l 12601 14854 12608 14861 100 -} -a { - s 0 - b 12602 14855 - e 12609 14862 - l 12602 14855 12609 14862 100 -} -a { - s 0 - b 18719 14861 - e 18726 14868 - l 18719 14861 18726 14868 100 -} -a { - s 0 - b 18720 14862 - e 18727 14869 - l 18720 14862 18727 14869 100 -} -a { - s 0 - b 4914 14863 - e 4921 14870 - l 4914 14863 4921 14870 100 -} -a { - s 0 - b 4834 14864 - e 4841 14871 - l 4834 14864 4841 14871 100 -} -a { - s 0 - b 9438 14868 - e 9445 14875 - l 9438 14868 9445 14875 100 -} -a { - s 0 - b 8829 14872 - e 8836 14879 - l 8829 14872 8836 14879 100 -} -a { - s 0 - b 4852 14875 - e 4859 14882 
- l 4852 14875 4859 14882 100 -} -a { - s 0 - b 9690 14877 - e 9697 14884 - l 9690 14877 9697 14884 100 -} -a { - s 0 - b 9691 14878 - e 9698 14885 - l 9691 14878 9698 14885 100 -} -a { - s 0 - b 9692 14879 - e 9699 14886 - l 9692 14879 9699 14886 100 -} -a { - s 0 - b 9693 14880 - e 9700 14887 - l 9693 14880 9700 14887 100 -} -a { - s 0 - b 8066 14883 - e 8073 14890 - l 8066 14883 8073 14890 100 -} -a { - s 0 - b 17998 14901 - e 18005 14908 - l 17998 14901 18005 14908 100 -} -a { - s 0 - b 5121 14917 - e 5128 14924 - l 5121 14917 5128 14924 100 -} -a { - s 0 - b 8089 14930 - e 8096 14937 - l 8089 14930 8096 14937 100 -} -a { - s 0 - b 18347 14931 - e 18354 14938 - l 18347 14931 18354 14938 100 -} -a { - s 0 - b 8090 14931 - e 8097 14938 - l 8090 14931 8097 14938 100 -} -a { - s 0 - b 8091 14932 - e 8098 14939 - l 8091 14932 8098 14939 100 -} -a { - s 0 - b 4792 14949 - e 4799 14956 - l 4792 14949 4799 14956 100 -} -a { - s 0 - b 4793 14950 - e 4800 14957 - l 4793 14950 4800 14957 100 -} -a { - s 0 - b 17412 14962 - e 17419 14969 - l 17412 14962 17419 14969 100 -} -a { - s 0 - b 14302 14962 - e 14309 14969 - l 14302 14962 14309 14969 100 -} -a { - s 0 - b 8085 14969 - e 8092 14976 - l 8085 14969 8092 14976 100 -} -a { - s 0 - b 7555 14979 - e 7562 14986 - l 7555 14979 7562 14986 100 -} -a { - s 0 - b 14581 14985 - e 14588 14992 - l 14581 14985 14588 14992 100 -} -a { - s 0 - b 18418 14987 - e 18425 14994 - l 18418 14987 18425 14994 100 -} -a { - s 0 - b 17329 15000 - e 17336 15007 - l 17329 15000 17336 15007 100 -} -a { - s 0 - b 8682 15000 - e 8689 15007 - l 8682 15000 8689 15007 100 -} -a { - s 0 - b 8683 15001 - e 8690 15008 - l 8683 15001 8690 15008 100 -} -a { - s 0 - b 8796 15002 - e 8803 15009 - l 8796 15002 8803 15009 100 -} -a { - s 0 - b 6422 15002 - e 6429 15009 - l 6422 15002 6429 15009 100 -} -a { - s 0 - b 14114 15004 - e 14121 15011 - l 14114 15004 14121 15011 100 -} -a { - s 0 - b 6689 15007 - e 6696 15014 - l 6689 15007 6696 15014 100 -} -a { - s 0 
- b 10843 15008 - e 10850 15015 - l 10843 15008 10850 15015 100 -} -a { - s 0 - b 17317 15018 - e 17324 15025 - l 17317 15018 17324 15025 100 -} -a { - s 0 - b 6663 15018 - e 6670 15025 - l 6663 15018 6670 15025 100 -} -a { - s 0 - b 4037 15019 - e 4044 15026 - l 4037 15019 4044 15026 100 -} -a { - s 0 - b 3976 15019 - e 3983 15026 - l 3976 15019 3983 15026 100 -} -a { - s 0 - b 15290 15032 - e 15297 15039 - l 15290 15032 15297 15039 100 -} -a { - s 0 - b 6305 15046 - e 6312 15053 - l 6305 15046 6312 15053 100 -} -a { - s 0 - b 18364 15047 - e 18371 15054 - l 18364 15047 18371 15054 100 -} -a { - s 0 - b 6306 15047 - e 6313 15054 - l 6306 15047 6313 15054 100 -} -a { - s 0 - b 4087 15061 - e 4094 15068 - l 4087 15061 4094 15068 100 -} -a { - s 0 - b 9598 15065 - e 9605 15072 - l 9598 15065 9605 15072 100 -} -a { - s 0 - b 4878 15070 - e 4885 15077 - l 4878 15070 4885 15077 100 -} -a { - s 0 - b 11178 15083 - e 11185 15090 - l 11178 15083 11185 15090 100 -} -a { - s 0 - b 9359 15112 - e 9366 15119 - l 9359 15112 9366 15119 100 -} -a { - s 0 - b 7742 15118 - e 7749 15125 - l 7742 15118 7749 15125 100 -} -a { - s 0 - b 114 15119 - e 121 15126 - l 114 15119 121 15126 100 -} -a { - s 0 - b 16052 15123 - e 16059 15130 - l 16052 15123 16059 15130 100 -} -a { - s 0 - b 8259 15128 - e 8266 15135 - l 8259 15128 8266 15135 100 -} -a { - s 0 - b 8260 15129 - e 8267 15136 - l 8260 15129 8267 15136 100 -} -a { - s 0 - b 8261 15130 - e 8268 15137 - l 8261 15130 8268 15137 100 -} -a { - s 0 - b 5117 15135 - e 5124 15142 - l 5117 15135 5124 15142 100 -} -a { - s 0 - b 8095 15136 - e 8102 15143 - l 8095 15136 8102 15143 100 -} -a { - s 0 - b 18371 15152 - e 18378 15159 - l 18371 15152 18378 15159 100 -} -a { - s 0 - b 15320 15153 - e 15327 15160 - l 15320 15153 15327 15160 100 -} -a { - s 0 - b 6611 15154 - e 6618 15161 - l 6611 15154 6618 15161 100 -} -a { - s 0 - b 6612 15155 - e 6619 15162 - l 6612 15155 6619 15162 100 -} -a { - s 0 - b 4817 15156 - e 4824 15163 - l 4817 15156 
4824 15163 100 -} -a { - s 0 - b 6832 15163 - e 6839 15170 - l 6832 15163 6839 15170 100 -} -a { - s 0 - b 6833 15164 - e 6840 15171 - l 6833 15164 6840 15171 100 -} -a { - s 0 - b 6834 15165 - e 6841 15172 - l 6834 15165 6841 15172 100 -} -a { - s 0 - b 11471 15169 - e 11478 15176 - l 11471 15169 11478 15176 100 -} -a { - s 0 - b 6333 15169 - e 6340 15176 - l 6333 15169 6340 15176 100 -} -a { - s 0 - b 11472 15170 - e 11479 15177 - l 11472 15170 11479 15177 100 -} -a { - s 0 - b 11378 15178 - e 11385 15185 - l 11378 15178 11385 15185 100 -} -a { - s 0 - b 8210 15180 - e 8217 15187 - l 8210 15180 8217 15187 100 -} -a { - s 0 - b 8082 15181 - e 8089 15188 - l 8082 15181 8089 15188 100 -} -a { - s 0 - b 8668 15182 - e 8675 15189 - l 8668 15182 8675 15189 100 -} -a { - s 0 - b 8669 15183 - e 8676 15190 - l 8669 15183 8676 15190 100 -} -a { - s 0 - b 4529 15186 - e 4536 15193 - l 4529 15186 4536 15193 100 -} -a { - s 0 - b 14268 15889 - e 14275 15896 - l 14268 15889 14275 15896 100 -} -a { - s 0 - b 8084 15893 - e 8091 15900 - l 8084 15893 8091 15900 100 -} -a { - s 0 - b 6861 15895 - e 6868 15902 - l 6861 15895 6868 15902 100 -} -a { - s 0 - b 6862 15896 - e 6869 15903 - l 6862 15896 6869 15903 100 -} -a { - s 0 - b 9390 15926 - e 9397 15933 - l 9390 15926 9397 15933 100 -} -a { - s 0 - b 4570 15930 - e 4577 15937 - l 4570 15930 4577 15937 100 -} -a { - s 0 - b 8577 15935 - e 8584 15942 - l 8577 15935 8584 15942 100 -} -a { - s 0 - b 274 15937 - e 281 15944 - l 274 15937 281 15944 100 -} -a { - s 0 - b 4948 15939 - e 4955 15946 - l 4948 15939 4955 15946 100 -} -a { - s 0 - b 14628 15942 - e 14635 15949 - l 14628 15942 14635 15949 100 -} -a { - s 0 - b 3340 15945 - e 3347 15952 - l 3340 15945 3347 15952 100 -} -a { - s 0 - b 17368 15950 - e 17375 15957 - l 17368 15950 17375 15957 100 -} -a { - s 0 - b 17369 15951 - e 17376 15958 - l 17369 15951 17376 15958 100 -} -a { - s 0 - b 11480 15954 - e 11487 15961 - l 11480 15954 11487 15961 100 -} -a { - s 0 - b 9454 15961 - e 
9461 15968 - l 9454 15961 9461 15968 100 -} -a { - s 0 - b 6725 15961 - e 6732 15968 - l 6725 15961 6732 15968 100 -} -a { - s 0 - b 18738 15963 - e 18745 15970 - l 18738 15963 18745 15970 100 -} -a { - s 0 - b 14452 15963 - e 14459 15970 - l 14452 15963 14459 15970 100 -} -a { - s 0 - b 18739 15964 - e 18746 15971 - l 18739 15964 18746 15971 100 -} -a { - s 0 - b 14139 15965 - e 14146 15972 - l 14139 15965 14146 15972 100 -} -a { - s 0 - b 16047 15966 - e 16054 15973 - l 16047 15966 16054 15973 100 -} -a { - s 0 - b 18622 15967 - e 18629 15974 - l 18622 15967 18629 15974 100 -} -a { - s 0 - b 12580 15969 - e 12587 15976 - l 12580 15969 12587 15976 100 -} -a { - s 0 - b 12581 15970 - e 12588 15977 - l 12581 15970 12588 15977 100 -} -a { - s 0 - b 14000 15971 - e 14007 15978 - l 14000 15971 14007 15978 100 -} -a { - s 0 - b 10875 15984 - e 10882 15991 - l 10875 15984 10882 15991 100 -} -a { - s 0 - b 14520 15985 - e 14527 15992 - l 14520 15985 14527 15992 100 -} -a { - s 0 - b 4800 15990 - e 4807 15997 - l 4800 15990 4807 15997 100 -} -a { - s 0 - b 4389 15990 - e 4396 15997 - l 4389 15990 4396 15997 100 -} -a { - s 0 - b 15196 16014 - e 15203 16021 - l 15196 16014 15203 16021 100 -} -a { - s 0 - b 3284 16023 - e 3291 16030 - l 3284 16023 3291 16030 100 -} -a { - s 0 - b 8306 16100 - e 8313 16107 - l 8306 16100 8313 16107 100 -} -a { - s 0 - b 3573 16100 - e 3580 16107 - l 3573 16100 3580 16107 100 -} -a { - s 0 - b 18663 16111 - e 18670 16118 - l 18663 16111 18670 16118 100 -} -a { - s 0 - b 15338 16112 - e 15345 16119 - l 15338 16112 15345 16119 100 -} -a { - s 0 - b 7454 16114 - e 7461 16121 - l 7454 16114 7461 16121 100 -} -a { - s 0 - b 4796 16116 - e 4803 16123 - l 4796 16116 4803 16123 100 -} -a { - s 0 - b 4797 16117 - e 4804 16124 - l 4797 16117 4804 16124 100 -} -a { - s 0 - b 4798 16118 - e 4805 16125 - l 4798 16118 4805 16125 100 -} -a { - s 0 - b 8400 16119 - e 8407 16126 - l 8400 16119 8407 16126 100 -} -a { - s 0 - b 6913 16119 - e 6920 16126 - l 6913 
16119 6920 16126 100 -} -a { - s 0 - b 4799 16119 - e 4806 16126 - l 4799 16119 4806 16126 100 -} -a { - s 0 - b 4388 16119 - e 4395 16126 - l 4388 16119 4395 16126 100 -} -a { - s 0 - b 8401 16120 - e 8408 16127 - l 8401 16120 8408 16127 100 -} -a { - s 0 - b 8402 16121 - e 8409 16128 - l 8402 16121 8409 16128 100 -} -a { - s 0 - b 14643 16133 - e 14650 16140 - l 14643 16133 14650 16140 100 -} -a { - s 0 - b 4548 16145 - e 4555 16152 - l 4548 16145 4555 16152 100 -} -a { - s 0 - b 5124 16146 - e 5131 16153 - l 5124 16146 5131 16153 100 -} -a { - s 0 - b 4549 16146 - e 4556 16153 - l 4549 16146 4556 16153 100 -} -a { - s 0 - b 5125 16147 - e 5132 16154 - l 5125 16147 5132 16154 100 -} -a { - s 0 - b 4550 16147 - e 4557 16154 - l 4550 16147 4557 16154 100 -} -a { - s 0 - b 9367 16149 - e 9374 16156 - l 9367 16149 9374 16156 100 -} -a { - s 0 - b 4485 16149 - e 4492 16156 - l 4485 16149 4492 16156 100 -} -a { - s 0 - b 9368 16150 - e 9375 16157 - l 9368 16150 9375 16157 100 -} -a { - s 0 - b 4486 16150 - e 4493 16157 - l 4486 16150 4493 16157 100 -} -a { - s 0 - b 9369 16151 - e 9376 16158 - l 9369 16151 9376 16158 100 -} -a { - s 0 - b 4437 16152 - e 4444 16159 - l 4437 16152 4444 16159 100 -} -a { - s 0 - b 9069 16163 - e 9076 16170 - l 9069 16163 9076 16170 100 -} -a { - s 0 - b 3176 16163 - e 3183 16170 - l 3176 16163 3183 16170 100 -} -a { - s 0 - b 6542 16165 - e 6549 16172 - l 6542 16165 6549 16172 100 -} -a { - s 0 - b 7532 16173 - e 7539 16180 - l 7532 16173 7539 16180 100 -} -a { - s 0 - b 7533 16174 - e 7540 16181 - l 7533 16174 7540 16181 100 -} -a { - s 0 - b 4054 16216 - e 4061 16223 - l 4054 16216 4061 16223 100 -} -a { - s 0 - b 5702 16231 - e 5709 16238 - l 5702 16231 5709 16238 100 -} -a { - s 0 - b 5703 16232 - e 5710 16239 - l 5703 16232 5710 16239 100 -} -a { - s 0 - b 8875 16240 - e 8882 16247 - l 8875 16240 8882 16247 100 -} -a { - s 0 - b 18676 16247 - e 18683 16254 - l 18676 16247 18683 16254 100 -} -a { - s 0 - b 17434 16247 - e 17441 16254 
- l 17434 16247 17441 16254 100 -} -a { - s 0 - b 18677 16248 - e 18684 16255 - l 18677 16248 18684 16255 100 -} -a { - s 0 - b 9335 16249 - e 9342 16256 - l 9335 16249 9342 16256 100 -} -a { - s 0 - b 18634 16259 - e 18641 16266 - l 18634 16259 18641 16266 100 -} -a { - s 0 - b 7623 16259 - e 7630 16266 - l 7623 16259 7630 16266 100 -} -a { - s 0 - b 8212 16266 - e 8219 16273 - l 8212 16266 8219 16273 100 -} -a { - s 0 - b 258 16274 - e 265 16281 - l 258 16274 265 16281 100 -} -a { - s 0 - b 6693 16275 - e 6700 16282 - l 6693 16275 6700 16282 100 -} -a { - s 0 - b 259 16275 - e 266 16282 - l 259 16275 266 16282 100 -} -a { - s 0 - b 260 16276 - e 267 16283 - l 260 16276 267 16283 100 -} -a { - s 0 - b 261 16277 - e 268 16284 - l 261 16277 268 16284 100 -} -a { - s 0 - b 7702 16289 - e 7709 16296 - l 7702 16289 7709 16296 100 -} -a { - s 0 - b 8932 16307 - e 8939 16314 - l 8932 16307 8939 16314 100 -} -a { - s 0 - b 7475 16308 - e 7482 16315 - l 7475 16308 7482 16315 100 -} -a { - s 0 - b 4245 16309 - e 4252 16316 - l 4245 16309 4252 16316 100 -} -a { - s 0 - b 14913 16326 - e 14920 16333 - l 14913 16326 14920 16333 100 -} -a { - s 0 - b 14580 16328 - e 14587 16335 - l 14580 16328 14587 16335 100 -} -a { - s 0 - b 17417 16332 - e 17424 16339 - l 17417 16332 17424 16339 100 -} -a { - s 0 - b 11026 16337 - e 11033 16344 - l 11026 16337 11033 16344 100 -} -a { - s 0 - b 11027 16338 - e 11034 16345 - l 11027 16338 11034 16345 100 -} -a { - s 0 - b 18609 16339 - e 18616 16346 - l 18609 16339 18616 16346 100 -} -a { - s 0 - b 18610 16340 - e 18617 16347 - l 18610 16340 18617 16347 100 -} -a { - s 0 - b 5610 16354 - e 5617 16361 - l 5610 16354 5617 16361 100 -} -a { - s 0 - b 3037 16359 - e 3044 16366 - l 3037 16359 3044 16366 100 -} -a { - s 0 - b 5085 16364 - e 5092 16371 - l 5085 16364 5092 16371 100 -} -a { - s 0 - b 9662 16390 - e 9669 16397 - l 9662 16390 9669 16397 100 -} -a { - s 0 - b 4365 16391 - e 4372 16398 - l 4365 16391 4372 16398 100 -} -a { - s 0 - b 18733 
16400 - e 18740 16407 - l 18733 16400 18740 16407 100 -} -a { - s 0 - b 17508 16400 - e 17515 16407 - l 17508 16400 17515 16407 100 -} -a { - s 0 - b 10891 16400 - e 10898 16407 - l 10891 16400 10898 16407 100 -} -a { - s 0 - b 17509 16401 - e 17516 16408 - l 17509 16401 17516 16408 100 -} -a { - s 0 - b 11055 16401 - e 11062 16408 - l 11055 16401 11062 16408 100 -} -a { - s 0 - b 6305 16404 - e 6312 16411 - l 6305 16404 6312 16411 100 -} -a { - s 0 - b 14297 16433 - e 14304 16440 - l 14297 16433 14304 16440 100 -} -a { - s 0 - b 8273 16433 - e 8280 16440 - l 8273 16433 8280 16440 100 -} -a { - s 0 - b 13444 16435 - e 13451 16442 - l 13444 16435 13451 16442 100 -} -a { - s 0 - b 13445 16436 - e 13452 16443 - l 13445 16436 13452 16443 100 -} -a { - s 0 - b 14363 16437 - e 14370 16444 - l 14363 16437 14370 16444 100 -} -a { - s 0 - b 13446 16437 - e 13453 16444 - l 13446 16437 13453 16444 100 -} -a { - s 0 - b 14364 16438 - e 14371 16445 - l 14364 16438 14371 16445 100 -} -a { - s 0 - b 8876 16461 - e 8883 16468 - l 8876 16461 8883 16468 100 -} -a { - s 0 - b 7640 16475 - e 7647 16482 - l 7640 16475 7647 16482 100 -} -a { - s 0 - b 4456 16475 - e 4463 16482 - l 4456 16475 4463 16482 100 -} -a { - s 0 - b 11060 16496 - e 11067 16503 - l 11060 16496 11067 16503 100 -} -a { - s 0 - b 11061 16497 - e 11068 16504 - l 11061 16497 11068 16504 100 -} -a { - s 0 - b 8888 16501 - e 8895 16508 - l 8888 16501 8895 16508 100 -} -a { - s 0 - b 9504 16515 - e 9511 16522 - l 9504 16515 9511 16522 100 -} -a { - s 0 - b 7711 16515 - e 7718 16522 - l 7711 16515 7718 16522 100 -} -a { - s 0 - b 9505 16516 - e 9512 16523 - l 9505 16516 9512 16523 100 -} -a { - s 0 - b 7712 16516 - e 7719 16523 - l 7712 16516 7719 16523 100 -} -a { - s 0 - b 16015 16517 - e 16022 16524 - l 16015 16517 16022 16524 100 -} -a { - s 0 - b 9506 16517 - e 9513 16524 - l 9506 16517 9513 16524 100 -} -a { - s 0 - b 7713 16517 - e 7720 16524 - l 7713 16517 7720 16524 100 -} -a { - s 0 - b 11090 16518 - e 11097 
16525 - l 11090 16518 11097 16525 100 -} -a { - s 0 - b 3921 16522 - e 3928 16529 - l 3921 16522 3928 16529 100 -} -a { - s 0 - b 4928 16523 - e 4935 16530 - l 4928 16523 4935 16530 100 -} -a { - s 0 - b 10973 16524 - e 10980 16531 - l 10973 16524 10980 16531 100 -} -a { - s 0 - b 3675 16529 - e 3682 16536 - l 3675 16529 3682 16536 100 -} -a { - s 0 - b 14720 16541 - e 14727 16548 - l 14720 16541 14727 16548 100 -} -a { - s 0 - b 14721 16542 - e 14728 16549 - l 14721 16542 14728 16549 100 -} -a { - s 0 - b 14722 16543 - e 14729 16550 - l 14722 16543 14729 16550 100 -} -a { - s 0 - b 224 16545 - e 231 16552 - l 224 16545 231 16552 100 -} -a { - s 0 - b 3945 16556 - e 3952 16563 - l 3945 16556 3952 16563 100 -} -a { - s 0 - b 3946 16557 - e 3953 16564 - l 3946 16557 3953 16564 100 -} -a { - s 0 - b 9024 16566 - e 9031 16573 - l 9024 16566 9031 16573 100 -} -a { - s 0 - b 14172 16567 - e 14179 16574 - l 14172 16567 14179 16574 100 -} -a { - s 0 - b 12183 16624 - e 12190 16631 - l 12183 16624 12190 16631 100 -} -a { - s 0 - b 12232 16627 - e 12239 16634 - l 12232 16627 12239 16634 100 -} -a { - s 0 - b 6794 16654 - e 6801 16661 - l 6794 16654 6801 16661 100 -} -a { - s 0 - b 8194 16662 - e 8201 16669 - l 8194 16662 8201 16669 100 -} -a { - s 0 - b 6254 16662 - e 6261 16669 - l 6254 16662 6261 16669 100 -} -a { - s 0 - b 6336 16665 - e 6343 16672 - l 6336 16665 6343 16672 100 -} -a { - s 0 - b 18004 16670 - e 18011 16677 - l 18004 16670 18011 16677 100 -} -a { - s 0 - b 8135 16671 - e 8142 16678 - l 8135 16671 8142 16678 100 -} -a { - s 0 - b 14021 16674 - e 14028 16681 - l 14021 16674 14028 16681 100 -} -a { - s 0 - b 8270 16675 - e 8277 16682 - l 8270 16675 8277 16682 100 -} -a { - s 0 - b 4347 16677 - e 4354 16684 - l 4347 16677 4354 16684 100 -} -a { - s 0 - b 6828 16690 - e 6835 16697 - l 6828 16690 6835 16697 100 -} -a { - s 0 - b 5081 16690 - e 5088 16697 - l 5081 16690 5088 16697 100 -} -a { - s 0 - b 6829 16691 - e 6836 16698 - l 6829 16691 6836 16698 100 -} -a 
{ - s 0 - b 5082 16691 - e 5089 16698 - l 5082 16691 5089 16698 100 -} -a { - s 0 - b 15393 16694 - e 15400 16701 - l 15393 16694 15400 16701 100 -} -a { - s 0 - b 3352 16714 - e 3359 16721 - l 3352 16714 3359 16721 100 -} -a { - s 0 - b 9606 16715 - e 9613 16722 - l 9606 16715 9613 16722 100 -} -a { - s 0 - b 11103 16721 - e 11110 16728 - l 11103 16721 11110 16728 100 -} -a { - s 0 - b 15147 16733 - e 15154 16740 - l 15147 16733 15154 16740 100 -} -a { - s 0 - b 14656 16733 - e 14663 16740 - l 14656 16733 14663 16740 100 -} -a { - s 0 - b 17417 16746 - e 17424 16753 - l 17417 16746 17424 16753 100 -} -a { - s 0 - b 2953 16751 - e 2960 16758 - l 2953 16751 2960 16758 100 -} -a { - s 0 - b 15223 16768 - e 15230 16775 - l 15223 16768 15230 16775 100 -} -a { - s 0 - b 15224 16769 - e 15231 16776 - l 15224 16769 15231 16776 100 -} -a { - s 0 - b 17835 16770 - e 17842 16777 - l 17835 16770 17842 16777 100 -} -a { - s 0 - b 17836 16771 - e 17843 16778 - l 17836 16771 17843 16778 100 -} -a { - s 0 - b 4382 16789 - e 4389 16796 - l 4382 16789 4389 16796 100 -} -a { - s 0 - b 8603 16791 - e 8610 16798 - l 8603 16791 8610 16798 100 -} -a { - s 0 - b 17161 16820 - e 17168 16827 - l 17161 16820 17168 16827 100 -} -a { - s 0 - b 17591 16825 - e 17598 16832 - l 17591 16825 17598 16832 100 -} -a { - s 0 - b 8651 16825 - e 8658 16832 - l 8651 16825 8658 16832 100 -} -a { - s 0 - b 4556 16848 - e 4563 16855 - l 4556 16848 4563 16855 100 -} -a { - s 0 - b 14613 16849 - e 14620 16856 - l 14613 16849 14620 16856 100 -} -a { - s 0 - b 18102 16853 - e 18109 16860 - l 18102 16853 18109 16860 100 -} -a { - s 0 - b 8810 16854 - e 8817 16861 - l 8810 16854 8817 16861 100 -} -a { - s 0 - b 5058 16863 - e 5065 16870 - l 5058 16863 5065 16870 100 -} -a { - s 0 - b 5665 16886 - e 5672 16893 - l 5665 16886 5672 16893 100 -} -a { - s 0 - b 5666 16887 - e 5673 16894 - l 5666 16887 5673 16894 100 -} -a { - s 0 - b 10914 16916 - e 10921 16923 - l 10914 16916 10921 16923 100 -} -a { - s 0 - b 10915 
16917 - e 10922 16924 - l 10915 16917 10922 16924 100 -} -a { - s 0 - b 3033 16920 - e 3040 16927 - l 3033 16920 3040 16927 100 -} -a { - s 0 - b 3034 16921 - e 3041 16928 - l 3034 16921 3041 16928 100 -} -a { - s 0 - b 3632 16929 - e 3639 16936 - l 3632 16929 3639 16936 100 -} -a { - s 0 - b 15311 16938 - e 15318 16945 - l 15311 16938 15318 16945 100 -} -a { - s 0 - b 15312 16939 - e 15319 16946 - l 15312 16939 15319 16946 100 -} -a { - s 0 - b 8507 16941 - e 8514 16948 - l 8507 16941 8514 16948 100 -} -a { - s 0 - b 5661 16941 - e 5668 16948 - l 5661 16941 5668 16948 100 -} -a { - s 0 - b 13993 16942 - e 14000 16949 - l 13993 16942 14000 16949 100 -} -a { - s 0 - b 5662 16942 - e 5669 16949 - l 5662 16942 5669 16949 100 -} -a { - s 0 - b 6510 16944 - e 6517 16951 - l 6510 16944 6517 16951 100 -} -a { - s 0 - b 14046 16960 - e 14053 16967 - l 14046 16960 14053 16967 100 -} -a { - s 0 - b 65 16970 - e 72 16977 - l 65 16970 72 16977 100 -} -a { - s 0 - b 15127 16983 - e 15134 16990 - l 15127 16983 15134 16990 100 -} -a { - s 0 - b 15128 16984 - e 15135 16991 - l 15128 16984 15135 16991 100 -} -a { - s 0 - b 15129 16985 - e 15136 16992 - l 15129 16985 15136 16992 100 -} -a { - s 0 - b 16038 16987 - e 16045 16994 - l 16038 16987 16045 16994 100 -} -a { - s 0 - b 6859 16992 - e 6866 16999 - l 6859 16992 6866 16999 100 -} -a { - s 0 - b 7759 16993 - e 7766 17000 - l 7759 16993 7766 17000 100 -} -a { - s 0 - b 8086 16994 - e 8093 17001 - l 8086 16994 8093 17001 100 -} -a { - s 0 - b 7760 16994 - e 7767 17001 - l 7760 16994 7767 17001 100 -} -a { - s 0 - b 7761 16995 - e 7768 17002 - l 7761 16995 7768 17002 100 -} -a { - s 0 - b 13396 16996 - e 13403 17003 - l 13396 16996 13403 17003 100 -} -a { - s 0 - b 13397 16997 - e 13404 17004 - l 13397 16997 13404 17004 100 -} -a { - s 0 - b 7405 16997 - e 7412 17004 - l 7405 16997 7412 17004 100 -} -a { - s 0 - b 3297 16999 - e 3304 17006 - l 3297 16999 3304 17006 100 -} -a { - s 0 - b 8524 17017 - e 8531 17024 - l 8524 17017 8531 
17024 100 -} -a { - s 0 - b 15149 17027 - e 15156 17034 - l 15149 17027 15156 17034 100 -} -a { - s 0 - b 3208 17321 - e 3215 17328 - l 3208 17321 3215 17328 100 -} -a { - s 0 - b 8166 17333 - e 8173 17340 - l 8166 17333 8173 17340 100 -} -a { - s 0 - b 17363 17352 - e 17370 17359 - l 17363 17352 17370 17359 100 -} -a { - s 0 - b 17364 17353 - e 17371 17360 - l 17364 17353 17371 17360 100 -} -a { - s 0 - b 8683 17370 - e 8690 17377 - l 8683 17370 8690 17377 100 -} -a { - s 0 - b 8796 17371 - e 8803 17378 - l 8796 17371 8803 17378 100 -} -a { - s 0 - b 6422 17371 - e 6429 17378 - l 6422 17371 6429 17378 100 -} -a { - s 0 - b 6423 17372 - e 6430 17379 - l 6423 17372 6430 17379 100 -} -a { - s 0 - b 7634 17379 - e 7641 17386 - l 7634 17379 7641 17386 100 -} -a { - s 0 - b 14865 17387 - e 14872 17394 - l 14865 17387 14872 17394 100 -} -a { - s 0 - b 15281 17390 - e 15288 17397 - l 15281 17390 15288 17397 100 -} -a { - s 0 - b 7568 17390 - e 7575 17397 - l 7568 17390 7575 17397 100 -} -a { - s 0 - b 7569 17391 - e 7576 17398 - l 7569 17391 7576 17398 100 -} -a { - s 0 - b 8662 17408 - e 8669 17415 - l 8662 17408 8669 17415 100 -} -a { - s 0 - b 15327 17419 - e 15334 17426 - l 15327 17419 15334 17426 100 -} -a { - s 0 - b 13532 17438 - e 13539 17445 - l 13532 17438 13539 17445 100 -} -a { - s 0 - b 6351 17438 - e 6358 17445 - l 6351 17438 6358 17445 100 -} -a { - s 0 - b 11411 17441 - e 11418 17448 - l 11411 17441 11418 17448 100 -} -a { - s 0 - b 9495 17441 - e 9502 17448 - l 9495 17441 9502 17448 100 -} -a { - s 0 - b 11412 17442 - e 11419 17449 - l 11412 17442 11419 17449 100 -} -a { - s 0 - b 11413 17443 - e 11420 17450 - l 11413 17443 11420 17450 100 -} -a { - s 0 - b 13968 17491 - e 13975 17498 - l 13968 17491 13975 17498 100 -} -a { - s 0 - b 10847 17503 - e 10854 17510 - l 10847 17503 10854 17510 100 -} -a { - s 0 - b 9452 17508 - e 9459 17515 - l 9452 17508 9459 17515 100 -} -a { - s 0 - b 18522 17519 - e 18529 17526 - l 18522 17519 18529 17526 100 -} -a { - s 0 
- b 14329 17533 - e 14336 17540 - l 14329 17533 14336 17540 100 -} -a { - s 0 - b 14330 17534 - e 14337 17541 - l 14330 17534 14337 17541 100 -} -a { - s 0 - b 183 17537 - e 190 17544 - l 183 17537 190 17544 100 -} -a { - s 0 - b 7617 17562 - e 7624 17569 - l 7617 17562 7624 17569 100 -} -a { - s 0 - b 6573 17568 - e 6580 17575 - l 6573 17568 6580 17575 100 -} -a { - s 0 - b 11149 17596 - e 11156 17603 - l 11149 17596 11156 17603 100 -} -a { - s 0 - b 7742 17598 - e 7749 17605 - l 7742 17598 7749 17605 100 -} -a { - s 0 - b 200 17600 - e 207 17607 - l 200 17600 207 17607 100 -} -a { - s 0 - b 3587 17619 - e 3594 17626 - l 3587 17619 3594 17626 100 -} -a { - s 0 - b 6798 17629 - e 6805 17636 - l 6798 17629 6805 17636 100 -} -a { - s 0 - b 18539 17665 - e 18546 17672 - l 18539 17665 18546 17672 100 -} -a { - s 0 - b 14574 17673 - e 14581 17680 - l 14574 17673 14581 17680 100 -} -a { - s 0 - b 3684 17681 - e 3691 17688 - l 3684 17681 3691 17688 100 -} -a { - s 0 - b 13951 17688 - e 13958 17695 - l 13951 17688 13958 17695 100 -} -a { - s 0 - b 13952 17689 - e 13959 17696 - l 13952 17689 13959 17696 100 -} -a { - s 0 - b 6702 17690 - e 6709 17697 - l 6702 17690 6709 17697 100 -} -a { - s 0 - b 9014 17696 - e 9021 17703 - l 9014 17696 9021 17703 100 -} -a { - s 0 - b 9015 17697 - e 9022 17704 - l 9015 17697 9022 17704 100 -} -a { - s 0 - b 9016 17698 - e 9023 17705 - l 9016 17698 9023 17705 100 -} -a { - s 0 - b 18438 17700 - e 18445 17707 - l 18438 17700 18445 17707 100 -} -a { - s 0 - b 14231 17713 - e 14238 17720 - l 14231 17713 14238 17720 100 -} -a { - s 0 - b 11304 17713 - e 11311 17720 - l 11304 17713 11311 17720 100 -} -a { - s 0 - b 5570 17748 - e 5577 17755 - l 5570 17748 5577 17755 100 -} -a { - s 0 - b 8826 17751 - e 8833 17758 - l 8826 17751 8833 17758 100 -} -a { - s 0 - b 2987 17768 - e 2994 17775 - l 2987 17768 2994 17775 100 -} -a { - s 0 - b 18067 17776 - e 18074 17783 - l 18067 17776 18074 17783 100 -} -a { - s 0 - b 12221 17780 - e 12228 17787 - l 
12221 17780 12228 17787 100 -} -a { - s 0 - b 14634 17783 - e 14641 17790 - l 14634 17783 14641 17790 100 -} -a { - s 0 - b 3342 17789 - e 3349 17796 - l 3342 17789 3349 17796 100 -} -a { - s 0 - b 18413 17790 - e 18420 17797 - l 18413 17790 18420 17797 100 -} -a { - s 0 - b 18414 17791 - e 18421 17798 - l 18414 17791 18421 17798 100 -} -a { - s 0 - b 8023 17798 - e 8030 17805 - l 8023 17798 8030 17805 100 -} -a { - s 0 - b 7729 17802 - e 7736 17809 - l 7729 17802 7736 17809 100 -} -a { - s 0 - b 9310 17808 - e 9317 17815 - l 9310 17808 9317 17815 100 -} -a { - s 0 - b 9311 17809 - e 9318 17816 - l 9311 17809 9318 17816 100 -} -a { - s 0 - b 14461 17817 - e 14468 17824 - l 14461 17817 14468 17824 100 -} -a { - s 0 - b 14462 17818 - e 14469 17825 - l 14462 17818 14469 17825 100 -} -a { - s 0 - b 18407 17834 - e 18414 17841 - l 18407 17834 18414 17841 100 -} -a { - s 0 - b 8120 17841 - e 8127 17848 - l 8120 17841 8127 17848 100 -} -a { - s 0 - b 8186 17846 - e 8193 17853 - l 8186 17846 8193 17853 100 -} -a { - s 0 - b 11502 17854 - e 11509 17861 - l 11502 17854 11509 17861 100 -} -a { - s 0 - b 11503 17855 - e 11510 17862 - l 11503 17855 11510 17862 100 -} -a { - s 0 - b 18165 17858 - e 18172 17865 - l 18165 17858 18172 17865 100 -} -a { - s 0 - b 14718 17882 - e 14725 17889 - l 14718 17882 14725 17889 100 -} -a { - s 0 - b 7737 17882 - e 7744 17889 - l 7737 17882 7744 17889 100 -} -a { - s 0 - b 4838 17884 - e 4845 17891 - l 4838 17884 4845 17891 100 -} -a { - s 0 - b 4839 17885 - e 4846 17892 - l 4839 17885 4846 17892 100 -} -a { - s 0 - b 3537 17888 - e 3544 17895 - l 3537 17888 3544 17895 100 -} -a { - s 0 - b 4621 17898 - e 4628 17905 - l 4621 17898 4628 17905 100 -} -a { - s 0 - b 14676 17915 - e 14683 17922 - l 14676 17915 14683 17922 100 -} -a { - s 0 - b 8891 17916 - e 8898 17923 - l 8891 17916 8898 17923 100 -} -a { - s 0 - b 8853 17918 - e 8860 17925 - l 8853 17918 8860 17925 100 -} -a { - s 0 - b 17509 17935 - e 17516 17942 - l 17509 17935 17516 17942 100 
-} -a { - s 0 - b 11055 17935 - e 11062 17942 - l 11055 17935 11062 17942 100 -} -a { - s 0 - b 18554 17937 - e 18561 17944 - l 18554 17937 18561 17944 100 -} -a { - s 0 - b 6846 17937 - e 6853 17944 - l 6846 17937 6853 17944 100 -} -a { - s 0 - b 6847 17938 - e 6854 17945 - l 6847 17938 6854 17945 100 -} -a { - s 0 - b 15271 17939 - e 15278 17946 - l 15271 17939 15278 17946 100 -} -a { - s 0 - b 15272 17940 - e 15279 17947 - l 15272 17940 15279 17947 100 -} -a { - s 0 - b 14 17947 - e 21 17954 - l 14 17947 21 17954 100 -} -a { - s 0 - b 8470 17965 - e 8477 17972 - l 8470 17965 8477 17972 100 -} -a { - s 0 - b 9401 17971 - e 9408 17978 - l 9401 17971 9408 17978 100 -} -a { - s 0 - b 6857 17971 - e 6864 17978 - l 6857 17971 6864 17978 100 -} -a { - s 0 - b 9402 17972 - e 9409 17979 - l 9402 17972 9409 17979 100 -} -a { - s 0 - b 18421 17973 - e 18428 17980 - l 18421 17973 18428 17980 100 -} -a { - s 0 - b 8168 17976 - e 8175 17983 - l 8168 17976 8175 17983 100 -} -a { - s 0 - b 6255 17983 - e 6262 17990 - l 6255 17983 6262 17990 100 -} -a { - s 0 - b 6256 17984 - e 6263 17991 - l 6256 17984 6263 17991 100 -} -a { - s 0 - b 14214 17985 - e 14221 17992 - l 14214 17985 14221 17992 100 -} -a { - s 0 - b 7647 17988 - e 7654 17995 - l 7647 17988 7654 17995 100 -} -a { - s 0 - b 7648 17989 - e 7655 17996 - l 7648 17989 7655 17996 100 -} -a { - s 0 - b 6410 17989 - e 6417 17996 - l 6410 17989 6417 17996 100 -} -a { - s 0 - b 10929 17996 - e 10936 18003 - l 10929 17996 10936 18003 100 -} -a { - s 0 - b 18523 17997 - e 18530 18004 - l 18523 17997 18530 18004 100 -} -a { - s 0 - b 10930 17997 - e 10937 18004 - l 10930 17997 10937 18004 100 -} -a { - s 0 - b 10931 17998 - e 10938 18005 - l 10931 17998 10938 18005 100 -} -a { - s 0 - b 9412 17998 - e 9419 18005 - l 9412 17998 9419 18005 100 -} -a { - s 0 - b 3382 17998 - e 3389 18005 - l 3382 17998 3389 18005 100 -} -a { - s 0 - b 3383 17999 - e 3390 18006 - l 3383 17999 3390 18006 100 -} -a { - s 0 - b 14961 18004 - e 14968 
18011 - l 14961 18004 14968 18011 100 -} -a { - s 0 - b 8738 18019 - e 8745 18026 - l 8738 18019 8745 18026 100 -} -a { - s 0 - b 8739 18020 - e 8746 18027 - l 8739 18020 8746 18027 100 -} -a { - s 0 - b 8740 18021 - e 8747 18028 - l 8740 18021 8747 18028 100 -} -a { - s 0 - b 14118 18028 - e 14125 18035 - l 14118 18028 14125 18035 100 -} -a { - s 0 - b 14119 18029 - e 14126 18036 - l 14119 18029 14126 18036 100 -} -a { - s 0 - b 8874 18029 - e 8881 18036 - l 8874 18029 8881 18036 100 -} -a { - s 0 - b 11020 18044 - e 11027 18051 - l 11020 18044 11027 18051 100 -} -a { - s 0 - b 16049 18052 - e 16056 18059 - l 16049 18052 16056 18059 100 -} -a { - s 0 - b 8126 18054 - e 8133 18061 - l 8126 18054 8133 18061 100 -} -a { - s 0 - b 13957 18056 - e 13964 18063 - l 13957 18056 13964 18063 100 -} -a { - s 0 - b 8670 18056 - e 8677 18063 - l 8670 18056 8677 18063 100 -} -a { - s 0 - b 13958 18057 - e 13965 18064 - l 13958 18057 13965 18064 100 -} -a { - s 0 - b 8671 18057 - e 8678 18064 - l 8671 18057 8678 18064 100 -} -a { - s 0 - b 7758 18061 - e 7765 18068 - l 7758 18061 7765 18068 100 -} -a { - s 0 - b 6598 18064 - e 6605 18071 - l 6598 18064 6605 18071 100 -} -a { - s 0 - b 9454 18077 - e 9461 18084 - l 9454 18077 9461 18084 100 -} -a { - s 0 - b 6725 18077 - e 6732 18084 - l 6725 18077 6732 18084 100 -} -a { - s 0 - b 9455 18078 - e 9462 18085 - l 9455 18078 9462 18085 100 -} -a { - s 0 - b 6726 18078 - e 6733 18085 - l 6726 18078 6733 18085 100 -} -a { - s 0 - b 12137 18087 - e 12144 18094 - l 12137 18087 12144 18094 100 -} -a { - s 0 - b 15141 18089 - e 15148 18096 - l 15141 18089 15148 18096 100 -} -a { - s 0 - b 15142 18090 - e 15149 18097 - l 15142 18090 15149 18097 100 -} -a { - s 0 - b 15143 18091 - e 15150 18098 - l 15143 18091 15150 18098 100 -} -a { - s 0 - b 15144 18092 - e 15151 18099 - l 15144 18092 15151 18099 100 -} -a { - s 0 - b 11524 18102 - e 11531 18109 - l 11524 18102 11531 18109 100 -} -a { - s 0 - b 5615 18102 - e 5622 18109 - l 5615 18102 5622 
18109 100 -} -a { - s 0 - b 5042 18102 - e 5049 18109 - l 5042 18102 5049 18109 100 -} -a { - s 0 - b 11525 18103 - e 11532 18110 - l 11525 18103 11532 18110 100 -} -a { - s 0 - b 138 18103 - e 145 18110 - l 138 18103 145 18110 100 -} -a { - s 0 - b 139 18104 - e 146 18111 - l 139 18104 146 18111 100 -} -a { - s 0 - b 8673 18109 - e 8680 18116 - l 8673 18109 8680 18116 100 -} -a { - s 0 - b 13487 18119 - e 13494 18126 - l 13487 18119 13494 18126 100 -} -a { - s 0 - b 13488 18120 - e 13495 18127 - l 13488 18120 13495 18127 100 -} -a { - s 0 - b 4376 18123 - e 4383 18130 - l 4376 18123 4383 18130 100 -} -a { - s 0 - b 3921 18124 - e 3928 18131 - l 3921 18124 3928 18131 100 -} -a { - s 0 - b 16100 18125 - e 16107 18132 - l 16100 18125 16107 18132 100 -} -a { - s 0 - b 6705 18127 - e 6712 18134 - l 6705 18127 6712 18134 100 -} -a { - s 0 - b 6706 18128 - e 6713 18135 - l 6706 18128 6713 18135 100 -} -a { - s 0 - b 6656 18128 - e 6663 18135 - l 6656 18128 6663 18135 100 -} -a { - s 0 - b 6657 18129 - e 6664 18136 - l 6657 18129 6664 18136 100 -} -a { - s 0 - b 4001 18131 - e 4008 18138 - l 4001 18131 4008 18138 100 -} -a { - s 0 - b 4585 18148 - e 4592 18155 - l 4585 18148 4592 18155 100 -} -a { - s 0 - b 7654 18149 - e 7661 18156 - l 7654 18149 7661 18156 100 -} -a { - s 0 - b 40 18158 - e 47 18165 - l 40 18158 47 18165 100 -} -a { - s 0 - b 41 18159 - e 48 18166 - l 41 18159 48 18166 100 -} -a { - s 0 - b 17160 18190 - e 17167 18197 - l 17160 18190 17167 18197 100 -} -a { - s 0 - b 17565 18199 - e 17572 18206 - l 17565 18199 17572 18206 100 -} -a { - s 0 - b 14675 18234 - e 14682 18241 - l 14675 18234 14682 18241 100 -} -a { - s 0 - b 171 18241 - e 178 18248 - l 171 18241 178 18248 100 -} -a { - s 0 - b 6722 18250 - e 6729 18257 - l 6722 18250 6729 18257 100 -} -a { - s 0 - b 6723 18251 - e 6730 18258 - l 6723 18251 6730 18258 100 -} -a { - s 0 - b 13950 18253 - e 13957 18260 - l 13950 18253 13957 18260 100 -} -a { - s 0 - b 18074 18256 - e 18081 18263 - l 18074 18256 
18081 18263 100 -} -a { - s 0 - b 6293 18257 - e 6300 18264 - l 6293 18257 6300 18264 100 -} -a { - s 0 - b 196 18263 - e 203 18270 - l 196 18263 203 18270 100 -} -a { - s 0 - b 11468 18267 - e 11475 18274 - l 11468 18267 11475 18274 100 -} -a { - s 0 - b 7771 18267 - e 7778 18274 - l 7771 18267 7778 18274 100 -} -a { - s 0 - b 11469 18268 - e 11476 18275 - l 11469 18268 11476 18275 100 -} -a { - s 0 - b 11470 18269 - e 11477 18276 - l 11470 18269 11477 18276 100 -} -a { - s 0 - b 6537 18269 - e 6544 18276 - l 6537 18269 6544 18276 100 -} -a { - s 0 - b 6332 18269 - e 6339 18276 - l 6332 18269 6339 18276 100 -} -a { - s 0 - b 14074 18270 - e 14081 18277 - l 14074 18270 14081 18277 100 -} -a { - s 0 - b 6538 18270 - e 6545 18277 - l 6538 18270 6545 18277 100 -} -a { - s 0 - b 14075 18271 - e 14082 18278 - l 14075 18271 14082 18278 100 -} -a { - s 0 - b 3005 18274 - e 3012 18281 - l 3005 18274 3012 18281 100 -} -a { - s 0 - b 2 18278 - e 9 18285 - l 2 18278 9 18285 100 -} -a { - s 0 - b 15915 18287 - e 15922 18294 - l 15915 18287 15922 18294 100 -} -a { - s 0 - b 15916 18288 - e 15923 18295 - l 15916 18288 15923 18295 100 -} -a { - s 0 - b 7395 18301 - e 7402 18308 - l 7395 18301 7402 18308 100 -} -a { - s 0 - b 5607 18309 - e 5614 18316 - l 5607 18309 5614 18316 100 -} -a { - s 0 - b 8503 18310 - e 8510 18317 - l 8503 18310 8510 18317 100 -} -a { - s 0 - b 6498 18334 - e 6505 18341 - l 6498 18334 6505 18341 100 -} -a { - s 0 - b 6227 18334 - e 6234 18341 - l 6227 18334 6234 18341 100 -} -a { - s 0 - b 6499 18335 - e 6506 18342 - l 6499 18335 6506 18342 100 -} -a { - s 0 - b 14101 18336 - e 14108 18343 - l 14101 18336 14108 18343 100 -} -a { - s 0 - b 11078 18337 - e 11085 18344 - l 11078 18337 11085 18344 100 -} -a { - s 0 - b 11079 18338 - e 11086 18345 - l 11079 18338 11086 18345 100 -} -a { - s 0 - b 6623 18346 - e 6630 18353 - l 6623 18346 6630 18353 100 -} -a { - s 0 - b 17161 18354 - e 17168 18361 - l 17161 18354 17168 18361 100 -} -a { - s 0 - b 17162 18355 - 
e 17169 18362 - l 17162 18355 17169 18362 100 -} -a { - s 0 - b 18729 18363 - e 18736 18370 - l 18729 18363 18736 18370 100 -} -a { - s 0 - b 4479 18365 - e 4486 18372 - l 4479 18365 4486 18372 100 -} -a { - s 0 - b 89 18368 - e 96 18375 - l 89 18368 96 18375 100 -} -a { - s 0 - b 14945 18372 - e 14952 18379 - l 14945 18372 14952 18379 100 -} -a { - s 0 - b 3882 18373 - e 3889 18380 - l 3882 18373 3889 18380 100 -} -a { - s 0 - b 8084 18382 - e 8091 18389 - l 8084 18382 8091 18389 100 -} -a { - s 0 - b 16017 18387 - e 16024 18394 - l 16017 18387 16024 18394 100 -} -a { - s 0 - b 3236 18387 - e 3243 18394 - l 3236 18387 3243 18394 100 -} -a { - s 0 - b 3237 18388 - e 3244 18395 - l 3237 18388 3244 18395 100 -} -a { - s 0 - b 14027 18389 - e 14034 18396 - l 14027 18389 14034 18396 100 -} -a { - s 0 - b 3238 18389 - e 3245 18396 - l 3238 18389 3245 18396 100 -} -a { - s 0 - b 3239 18390 - e 3246 18397 - l 3239 18390 3246 18397 100 -} -a { - s 0 - b 9474 18392 - e 9481 18399 - l 9474 18392 9481 18399 100 -} -a { - s 0 - b 9475 18393 - e 9482 18400 - l 9475 18393 9482 18400 100 -} -a { - s 0 - b 9476 18394 - e 9483 18401 - l 9476 18394 9483 18401 100 -} -a { - s 0 - b 9477 18395 - e 9484 18402 - l 9477 18395 9484 18402 100 -} -a { - s 0 - b 10826 18406 - e 10833 18413 - l 10826 18406 10833 18413 100 -} -a { - s 0 - b 14631 18407 - e 14638 18414 - l 14631 18407 14638 18414 100 -} -a { - s 0 - b 18025 18408 - e 18032 18415 - l 18025 18408 18032 18415 100 -} -a { - s 0 - b 14632 18408 - e 14639 18415 - l 14632 18408 14639 18415 100 -} -a { - s 0 - b 18026 18409 - e 18033 18416 - l 18026 18409 18033 18416 100 -} -a { - s 0 - b 13894 18410 - e 13901 18417 - l 13894 18410 13901 18417 100 -} -a { - s 0 - b 3362 18412 - e 3369 18419 - l 3362 18412 3369 18419 100 -} -a { - s 0 - b 6443 18414 - e 6450 18421 - l 6443 18414 6450 18421 100 -} -a { - s 0 - b 9276 18416 - e 9283 18423 - l 9276 18416 9283 18423 100 -} -a { - s 0 - b 13946 18420 - e 13953 18427 - l 13946 18420 13953 
18427 100 -} -a { - s 0 - b 9282 18421 - e 9289 18428 - l 9282 18421 9289 18428 100 -} -a { - s 0 - b 14929 18423 - e 14936 18430 - l 14929 18423 14936 18430 100 -} -a { - s 0 - b 12243 18424 - e 12250 18431 - l 12243 18424 12250 18431 100 -} -a { - s 0 - b 9503 18434 - e 9510 18441 - l 9503 18434 9510 18441 100 -} -a { - s 0 - b 7710 18434 - e 7717 18441 - l 7710 18434 7717 18441 100 -} -a { - s 0 - b 13402 18440 - e 13409 18447 - l 13402 18440 13409 18447 100 -} -a { - s 0 - b 4934 18443 - e 4941 18450 - l 4934 18443 4941 18450 100 -} -a { - s 0 - b 10971 18446 - e 10978 18453 - l 10971 18446 10978 18453 100 -} -a { - s 0 - b 8465 18453 - e 8472 18460 - l 8465 18453 8472 18460 100 -} -a { - s 0 - b 6407 18458 - e 6414 18465 - l 6407 18458 6414 18465 100 -} -a { - s 0 - b 6408 18459 - e 6415 18466 - l 6408 18459 6415 18466 100 -} -a { - s 0 - b 8258 18473 - e 8265 18480 - l 8258 18473 8265 18480 100 -} -a { - s 0 - b 13444 18686 - e 13451 18693 - l 13444 18686 13451 18693 100 -} -a { - s 0 - b 8823 18688 - e 8830 18695 - l 8823 18688 8830 18695 100 -} -a { - s 0 - b 8824 18689 - e 8831 18696 - l 8824 18689 8831 18696 100 -} -a { - s 0 - b 8825 18690 - e 8832 18697 - l 8825 18690 8832 18697 100 -} -a { - s 0 - b 7831 18694 - e 7838 18701 - l 7831 18694 7838 18701 100 -} -a { - s 0 - b 8785 18695 - e 8792 18702 - l 8785 18695 8792 18702 100 -} -a { - s 0 - b 10972 18706 - e 10979 18713 - l 10972 18706 10979 18713 100 -} -a { - s 0 - b 6899 18707 - e 6906 18714 - l 6899 18707 6906 18714 100 -} -a { - s 0 - b 6900 18708 - e 6907 18715 - l 6900 18708 6907 18715 100 -} -a { - s 0 - b 6901 18709 - e 6908 18716 - l 6901 18709 6908 18716 100 -} -a { - s 0 - b 5705 18710 - e 5712 18717 - l 5705 18710 5712 18717 100 -} -a { - s 0 - b 6540 18718 - e 6547 18725 - l 6540 18718 6547 18725 100 -} -a { - s 0 - b 6407 18750 - e 6414 18757 - l 6407 18750 6414 18757 100 -} -a { - s 0 - b 8720 18751 - e 8727 18758 - l 8720 18751 8727 18758 100 -} -a { - s 0 - b 18418 18761 - e 18425 
18768 - l 18418 18761 18425 18768 100 -} -a { - s 0 - b 18419 18762 - e 18426 18769 - l 18419 18762 18426 18769 100 -} -a { - s 0 - b 14109 18774 - e 14116 18781 - l 14109 18774 14116 18781 100 -} -a { - s 0 - b 4975 18786 - e 4982 18793 - l 4975 18786 4982 18793 100 -} -a { - s 0 - b 17833 18787 - e 17840 18794 - l 17833 18787 17840 18794 100 -} -a { - s 0 - b 4976 18787 - e 4983 18794 - l 4976 18787 4983 18794 100 -} -a { - s 0 - b 6396 18788 - e 6403 18795 - l 6396 18788 6403 18795 100 -} -a { - s 0 - b 14159 18813 - e 14166 18820 - l 14159 18813 14166 18820 100 -} -a { - s 0 - b 9313 18825 - e 9320 18832 - l 9313 18825 9320 18832 100 -} -a { - s 0 - b 5090 18825 - e 5097 18832 - l 5090 18825 5097 18832 100 -} -a { - s 0 - b 9314 18826 - e 9321 18833 - l 9314 18826 9321 18833 100 -} -a { - s 0 - b 5091 18826 - e 5098 18833 - l 5091 18826 5098 18833 100 -} -a { - s 0 - b 153 18826 - e 160 18833 - l 153 18826 160 18833 100 -} -a { - s 0 - b 3267 18829 - e 3274 18836 - l 3267 18829 3274 18836 100 -} -a { - s 0 - b 14238 18841 - e 14245 18848 - l 14238 18841 14245 18848 100 -} -a { - s 0 - b 6378 18844 - e 6385 18851 - l 6378 18844 6385 18851 100 -} -a { - s 0 - b 15107 18845 - e 15114 18852 - l 15107 18845 15114 18852 100 -} -a { - s 0 - b 2998 18845 - e 3005 18852 - l 2998 18845 3005 18852 100 -} -a { - s 0 - b 2999 18846 - e 3006 18853 - l 2999 18846 3006 18853 100 -} -a { - s 0 - b 7512 18852 - e 7519 18859 - l 7512 18852 7519 18859 100 -} -a { - s 0 - b 3107 18856 - e 3114 18863 - l 3107 18856 3114 18863 100 -} -a { - s 0 - b 3108 18857 - e 3115 18864 - l 3108 18857 3115 18864 100 -} -a { - s 0 - b 3533 18858 - e 3540 18865 - l 3533 18858 3540 18865 100 -} -a { - s 0 - b 3109 18858 - e 3116 18865 - l 3109 18858 3116 18865 100 -} -a { - s 0 - b 13459 18873 - e 13466 18880 - l 13459 18873 13466 18880 100 -} -a { - s 0 - b 7512 18878 - e 7519 18885 - l 7512 18878 7519 18885 100 -} -a { - s 0 - b 14256 18909 - e 14263 18916 - l 14256 18909 14263 18916 100 -} -a { - 
s 0 - b 14396 18911 - e 14403 18918 - l 14396 18911 14403 18918 100 -} -a { - s 0 - b 301 18911 - e 308 18918 - l 301 18911 308 18918 100 -} -a { - s 0 - b 8561 18917 - e 8568 18924 - l 8561 18917 8568 18924 100 -} -a { - s 0 - b 8562 18918 - e 8569 18925 - l 8562 18918 8569 18925 100 -} -a { - s 0 - b 3073 18918 - e 3080 18925 - l 3073 18918 3080 18925 100 -} -a { - s 0 - b 15193 18924 - e 15200 18931 - l 15193 18924 15200 18931 100 -} -a { - s 0 - b 11038 18924 - e 11045 18931 - l 11038 18924 11045 18931 100 -} -a { - s 0 - b 4533 18924 - e 4540 18931 - l 4533 18924 4540 18931 100 -} -a { - s 0 - b 4425 18924 - e 4432 18931 - l 4425 18924 4432 18931 100 -} -a { - s 0 - b 3691 18924 - e 3698 18931 - l 3691 18924 3698 18931 100 -} -a { - s 0 - b 3692 18925 - e 3699 18932 - l 3692 18925 3699 18932 100 -} -a { - s 0 - b 7562 18939 - e 7569 18946 - l 7562 18939 7569 18946 100 -} -a { - s 0 - b 15997 18955 - e 16004 18962 - l 15997 18955 16004 18962 100 -} -a { - s 0 - b 11462 18955 - e 11469 18962 - l 11462 18955 11469 18962 100 -} -a { - s 0 - b 15998 18956 - e 16005 18963 - l 15998 18956 16005 18963 100 -} -a { - s 0 - b 14360 18957 - e 14367 18964 - l 14360 18957 14367 18964 100 -} -a { - s 0 - b 5654 18964 - e 5661 18971 - l 5654 18964 5661 18971 100 -} -a { - s 0 - b 5655 18965 - e 5662 18972 - l 5655 18965 5662 18972 100 -} -a { - s 0 - b 11566 18969 - e 11573 18976 - l 11566 18969 11573 18976 100 -} -a { - s 0 - b 11374 18969 - e 11381 18976 - l 11374 18969 11381 18976 100 -} -a { - s 0 - b 17991 18970 - e 17998 18977 - l 17991 18970 17998 18977 100 -} -a { - s 0 - b 11567 18970 - e 11574 18977 - l 11567 18970 11574 18977 100 -} -a { - s 0 - b 11375 18970 - e 11382 18977 - l 11375 18970 11382 18977 100 -} -a { - s 0 - b 11376 18971 - e 11383 18978 - l 11376 18971 11383 18978 100 -} -a { - s 0 - b 4919 18971 - e 4926 18978 - l 4919 18971 4926 18978 100 -} -a { - s 0 - b 8718 18978 - e 8725 18985 - l 8718 18978 8725 18985 100 -} -a { - s 0 - b 4243 18982 - e 4250 
18989 - l 4243 18982 4250 18989 100 -} -a { - s 0 - b 6532 19016 - e 6539 19023 - l 6532 19016 6539 19023 100 -} -a { - s 0 - b 6533 19017 - e 6540 19024 - l 6533 19017 6540 19024 100 -} -a { - s 0 - b 8046 19018 - e 8053 19025 - l 8046 19018 8053 19025 100 -} -a { - s 0 - b 6534 19018 - e 6541 19025 - l 6534 19018 6541 19025 100 -} -a { - s 0 - b 9020 19019 - e 9027 19026 - l 9020 19019 9027 19026 100 -} -a { - s 0 - b 6535 19019 - e 6542 19026 - l 6535 19019 6542 19026 100 -} -a { - s 0 - b 15373 19029 - e 15380 19036 - l 15373 19029 15380 19036 100 -} -a { - s 0 - b 83 19029 - e 90 19036 - l 83 19029 90 19036 100 -} -a { - s 0 - b 84 19030 - e 91 19037 - l 84 19030 91 19037 100 -} -a { - s 0 - b 85 19031 - e 92 19038 - l 85 19031 92 19038 100 -} -a { - s 0 - b 18710 19038 - e 18717 19045 - l 18710 19038 18717 19045 100 -} -a { - s 0 - b 18711 19039 - e 18718 19046 - l 18711 19039 18718 19046 100 -} -a { - s 0 - b 4000 19039 - e 4007 19046 - l 4000 19039 4007 19046 100 -} -a { - s 0 - b 18712 19040 - e 18719 19047 - l 18712 19040 18719 19047 100 -} -a { - s 0 - b 18713 19041 - e 18720 19048 - l 18713 19041 18720 19048 100 -} -a { - s 0 - b 14594 19053 - e 14601 19060 - l 14594 19053 14601 19060 100 -} -a { - s 0 - b 9288 19053 - e 9295 19060 - l 9288 19053 9295 19060 100 -} -a { - s 0 - b 11137 19529 - e 11144 19536 - l 11137 19529 11144 19536 100 -} -a { - s 0 - b 11138 19530 - e 11145 19537 - l 11138 19530 11145 19537 100 -} -a { - s 0 - b 8933 19575 - e 8940 19582 - l 8933 19575 8940 19582 100 -} -a { - s 0 - b 17149 19586 - e 17156 19593 - l 17149 19586 17156 19593 100 -} -a { - s 0 - b 11232 19595 - e 11239 19602 - l 11232 19595 11239 19602 100 -} -a { - s 0 - b 8454 19597 - e 8461 19604 - l 8454 19597 8461 19604 100 -} -a { - s 0 - b 3038 19598 - e 3045 19605 - l 3038 19598 3045 19605 100 -} -a { - s 0 - b 12183 19608 - e 12190 19615 - l 12183 19608 12190 19615 100 -} -a { - s 0 - b 12184 19609 - e 12191 19616 - l 12184 19609 12191 19616 100 -} -a { - s 0 - 
b 9614 19610 - e 9621 19617 - l 9614 19610 9621 19617 100 -} -a { - s 0 - b 9615 19611 - e 9622 19618 - l 9615 19611 9622 19618 100 -} -a { - s 0 - b 9616 19612 - e 9623 19619 - l 9616 19612 9623 19619 100 -} -a { - s 0 - b 15349 19666 - e 15356 19673 - l 15349 19666 15356 19673 100 -} -a { - s 0 - b 14324 19672 - e 14331 19679 - l 14324 19672 14331 19679 100 -} -a { - s 0 - b 6824 19672 - e 6831 19679 - l 6824 19672 6831 19679 100 -} -a { - s 0 - b 6825 19673 - e 6832 19680 - l 6825 19673 6832 19680 100 -} -a { - s 0 - b 8348 19678 - e 8355 19685 - l 8348 19678 8355 19685 100 -} -a { - s 0 - b 2999 19682 - e 3006 19689 - l 2999 19682 3006 19689 100 -} -a { - s 0 - b 18025 19687 - e 18032 19694 - l 18025 19687 18032 19694 100 -} -a { - s 0 - b 14632 19687 - e 14639 19694 - l 14632 19687 14639 19694 100 -} -a { - s 0 - b 18026 19688 - e 18033 19695 - l 18026 19688 18033 19695 100 -} -a { - s 0 - b 3283 19692 - e 3290 19699 - l 3283 19692 3290 19699 100 -} -a { - s 0 - b 7655 19694 - e 7662 19701 - l 7655 19694 7662 19701 100 -} -a { - s 0 - b 7656 19695 - e 7663 19702 - l 7656 19695 7663 19702 100 -} -a { - s 0 - b 7657 19696 - e 7664 19703 - l 7657 19696 7664 19703 100 -} -a { - s 0 - b 7658 19697 - e 7665 19704 - l 7658 19697 7665 19704 100 -} -a { - s 0 - b 5100 19711 - e 5107 19718 - l 5100 19711 5107 19718 100 -} -a { - s 0 - b 88 19742 - e 95 19749 - l 88 19742 95 19749 100 -} -a { - s 0 - b 11471 19744 - e 11478 19751 - l 11471 19744 11478 19751 100 -} -a { - s 0 - b 6333 19744 - e 6340 19751 - l 6333 19744 6340 19751 100 -} -a { - s 0 - b 17534 19753 - e 17541 19760 - l 17534 19753 17541 19760 100 -} -a { - s 0 - b 7471 19754 - e 7478 19761 - l 7471 19754 7478 19761 100 -} -a { - s 0 - b 3909 19754 - e 3916 19761 - l 3909 19754 3916 19761 100 -} -a { - s 0 - b 14245 19755 - e 14252 19762 - l 14245 19755 14252 19762 100 -} -a { - s 0 - b 14246 19756 - e 14253 19763 - l 14246 19756 14253 19763 100 -} -a { - s 0 - b 7968 19756 - e 7975 19763 - l 7968 19756 7975 
19763 100 -} -a { - s 0 - b 18622 19761 - e 18629 19768 - l 18622 19761 18629 19768 100 -} -a { - s 0 - b 18623 19762 - e 18630 19769 - l 18623 19762 18630 19769 100 -} -a { - s 0 - b 13492 19764 - e 13499 19771 - l 13492 19764 13499 19771 100 -} -a { - s 0 - b 4808 19768 - e 4815 19775 - l 4808 19768 4815 19775 100 -} -a { - s 0 - b 4809 19769 - e 4816 19776 - l 4809 19769 4816 19776 100 -} -a { - s 0 - b 5658 19771 - e 5665 19778 - l 5658 19771 5665 19778 100 -} -a { - s 0 - b 5659 19772 - e 5666 19779 - l 5659 19772 5666 19779 100 -} -a { - s 0 - b 5660 19773 - e 5667 19780 - l 5660 19773 5667 19780 100 -} -a { - s 0 - b 3871 19773 - e 3878 19780 - l 3871 19773 3878 19780 100 -} -a { - s 0 - b 10860 19784 - e 10867 19791 - l 10860 19784 10867 19791 100 -} -a { - s 0 - b 14384 19805 - e 14391 19812 - l 14384 19805 14391 19812 100 -} -a { - s 0 - b 14385 19806 - e 14392 19813 - l 14385 19806 14392 19813 100 -} -a { - s 0 - b 14386 19807 - e 14393 19814 - l 14386 19807 14393 19814 100 -} -a { - s 0 - b 228 19814 - e 235 19821 - l 228 19814 235 19821 100 -} -a { - s 0 - b 14968 19825 - e 14975 19832 - l 14968 19825 14975 19832 100 -} -a { - s 0 - b 4393 19825 - e 4400 19832 - l 4393 19825 4400 19832 100 -} -a { - s 0 - b 4394 19826 - e 4401 19833 - l 4394 19826 4401 19833 100 -} -a { - s 0 - b 17989 19827 - e 17996 19834 - l 17989 19827 17996 19834 100 -} -a { - s 0 - b 9706 19851 - e 9713 19858 - l 9706 19851 9713 19858 100 -} -a { - s 0 - b 9707 19852 - e 9714 19859 - l 9707 19852 9714 19859 100 -} -a { - s 0 - b 9708 19853 - e 9715 19860 - l 9708 19853 9715 19860 100 -} -a { - s 0 - b 3122 19855 - e 3129 19862 - l 3122 19855 3129 19862 100 -} -a { - s 0 - b 7574 19859 - e 7581 19866 - l 7574 19859 7581 19866 100 -} -a { - s 0 - b 4869 19864 - e 4876 19871 - l 4869 19864 4876 19871 100 -} -a { - s 0 - b 8905 19875 - e 8912 19882 - l 8905 19875 8912 19882 100 -} -a { - s 0 - b 15230 19876 - e 15237 19883 - l 15230 19876 15237 19883 100 -} -a { - s 0 - b 9355 19878 
- e 9362 19885 - l 9355 19878 9362 19885 100 -} -a { - s 0 - b 17476 19891 - e 17483 19898 - l 17476 19891 17483 19898 100 -} -a { - s 0 - b 14939 19893 - e 14946 19900 - l 14939 19893 14946 19900 100 -} -a { - s 0 - b 14736 19908 - e 14743 19915 - l 14736 19908 14743 19915 100 -} -a { - s 0 - b 14928 19912 - e 14935 19919 - l 14928 19912 14935 19919 100 -} -a { - s 0 - b 14929 19913 - e 14936 19920 - l 14929 19913 14936 19920 100 -} -a { - s 0 - b 16075 19915 - e 16082 19922 - l 16075 19915 16082 19922 100 -} -a { - s 0 - b 9326 19989 - e 9333 19996 - l 9326 19989 9333 19996 100 -} -a { - s 0 - b 9327 19990 - e 9334 19997 - l 9327 19990 9334 19997 100 -} -a { - s 0 - b 5655 19999 - e 5662 20006 - l 5655 19999 5662 20006 100 -} -a { - s 0 - b 4864 20020 - e 4871 20027 - l 4864 20020 4871 20027 100 -} -a { - s 0 - b 9373 20023 - e 9380 20030 - l 9373 20023 9380 20030 100 -} -a { - s 0 - b 9374 20024 - e 9381 20031 - l 9374 20024 9381 20031 100 -} -a { - s 0 - b 11345 20049 - e 11352 20056 - l 11345 20049 11352 20056 100 -} -a { - s 0 - b 14538 20056 - e 14545 20063 - l 14538 20056 14545 20063 100 -} -a { - s 0 - b 11393 20056 - e 11400 20063 - l 11393 20056 11400 20063 100 -} -a { - s 0 - b 11394 20057 - e 11401 20064 - l 11394 20057 11401 20064 100 -} -a { - s 0 - b 8212 20058 - e 8219 20065 - l 8212 20058 8219 20065 100 -} -a { - s 0 - b 14514 20076 - e 14521 20083 - l 14514 20076 14521 20083 100 -} -a { - s 0 - b 14515 20077 - e 14522 20084 - l 14515 20077 14522 20084 100 -} -a { - s 0 - b 6546 20078 - e 6553 20085 - l 6546 20078 6553 20085 100 -} -a { - s 0 - b 6547 20079 - e 6554 20086 - l 6547 20079 6554 20086 100 -} -a { - s 0 - b 17836 20094 - e 17843 20101 - l 17836 20094 17843 20101 100 -} -a { - s 0 - b 17837 20095 - e 17844 20102 - l 17837 20095 17844 20102 100 -} -a { - s 0 - b 16068 20095 - e 16075 20102 - l 16068 20095 16075 20102 100 -} -a { - s 0 - b 14458 20095 - e 14465 20102 - l 14458 20095 14465 20102 100 -} -a { - s 0 - b 11328 20095 - e 11335 
20102 - l 11328 20095 11335 20102 100 -} -a { - s 0 - b 17838 20096 - e 17845 20103 - l 17838 20096 17845 20103 100 -} -a { - s 0 - b 16069 20096 - e 16076 20103 - l 16069 20096 16076 20103 100 -} -a { - s 0 - b 11329 20096 - e 11336 20103 - l 11329 20096 11336 20103 100 -} -a { - s 0 - b 16070 20097 - e 16077 20104 - l 16070 20097 16077 20104 100 -} -a { - s 0 - b 8167 20100 - e 8174 20107 - l 8167 20100 8174 20107 100 -} -a { - s 0 - b 8168 20101 - e 8175 20108 - l 8168 20101 8175 20108 100 -} -a { - s 0 - b 8169 20102 - e 8176 20109 - l 8169 20102 8176 20109 100 -} -a { - s 0 - b 17453 20105 - e 17460 20112 - l 17453 20105 17460 20112 100 -} -a { - s 0 - b 13387 20105 - e 13394 20112 - l 13387 20105 13394 20112 100 -} -a { - s 0 - b 8724 20111 - e 8731 20118 - l 8724 20111 8731 20118 100 -} -a { - s 0 - b 18520 20113 - e 18527 20120 - l 18520 20113 18527 20120 100 -} -a { - s 0 - b 18521 20114 - e 18528 20121 - l 18521 20114 18528 20121 100 -} -a { - s 0 - b 3323 20116 - e 3330 20123 - l 3323 20116 3330 20123 100 -} -a { - s 0 - b 6833 20122 - e 6840 20129 - l 6833 20122 6840 20129 100 -} -a { - s 0 - b 6531 20124 - e 6538 20131 - l 6531 20124 6538 20131 100 -} -a { - s 0 - b 6532 20125 - e 6539 20132 - l 6532 20125 6539 20132 100 -} -a { - s 0 - b 5020 20129 - e 5027 20136 - l 5020 20129 5027 20136 100 -} -a { - s 0 - b 17174 20130 - e 17181 20137 - l 17174 20130 17181 20137 100 -} -a { - s 0 - b 3335 20134 - e 3342 20141 - l 3335 20134 3342 20141 100 -} -a { - s 0 - b 4365 20171 - e 4372 20178 - l 4365 20171 4372 20178 100 -} -a { - s 0 - b 10828 20197 - e 10835 20204 - l 10828 20197 10835 20204 100 -} -a { - s 0 - b 6761 20200 - e 6768 20207 - l 6761 20200 6768 20207 100 -} -a { - s 0 - b 6762 20201 - e 6769 20208 - l 6762 20201 6769 20208 100 -} -a { - s 0 - b 11107 20767 - e 11114 20774 - l 11107 20767 11114 20774 100 -} -a { - s 0 - b 68 20783 - e 75 20790 - l 68 20783 75 20790 100 -} -a { - s 0 - b 69 20784 - e 76 20791 - l 69 20784 76 20791 100 -} -a { - 
s 0 - b 11085 20785 - e 11092 20792 - l 11085 20785 11092 20792 100 -} -a { - s 0 - b 17838 20788 - e 17845 20795 - l 17838 20788 17845 20795 100 -} -a { - s 0 - b 16069 20788 - e 16076 20795 - l 16069 20788 16076 20795 100 -} -a { - s 0 - b 11329 20788 - e 11336 20795 - l 11329 20788 11336 20795 100 -} -a { - s 0 - b 11330 20789 - e 11337 20796 - l 11330 20789 11337 20796 100 -} -a { - s 0 - b 6695 20833 - e 6702 20840 - l 6695 20833 6702 20840 100 -} -a { - s 0 - b 6696 20834 - e 6703 20841 - l 6696 20834 6703 20841 100 -} -a { - s 0 - b 4480 20852 - e 4487 20859 - l 4480 20852 4487 20859 100 -} -a { - s 0 - b 4883 20861 - e 4890 20868 - l 4883 20861 4890 20868 100 -} -a { - s 0 - b 18357 20869 - e 18364 20876 - l 18357 20869 18364 20876 100 -} -a { - s 0 - b 18358 20870 - e 18365 20877 - l 18358 20870 18365 20877 100 -} -a { - s 0 - b 14644 20889 - e 14651 20896 - l 14644 20889 14651 20896 100 -} -a { - s 0 - b 5607 21282 - e 5614 21289 - l 5607 21282 5614 21289 100 -} -a { - s 0 - b 4096 21285 - e 4103 21292 - l 4096 21285 4103 21292 100 -} -a { - s 0 - b 4097 21286 - e 4104 21293 - l 4097 21286 4104 21293 100 -} -a { - s 0 - b 9295 21290 - e 9302 21297 - l 9295 21290 9302 21297 100 -} -a { - s 0 - b 9296 21291 - e 9303 21298 - l 9296 21291 9303 21298 100 -} -a { - s 0 - b 15195 21297 - e 15202 21304 - l 15195 21297 15202 21304 100 -} -a { - s 0 - b 4535 21297 - e 4542 21304 - l 4535 21297 4542 21304 100 -} -a { - s 0 - b 4427 21297 - e 4434 21304 - l 4427 21297 4434 21304 100 -} -a { - s 0 - b 15196 21298 - e 15203 21305 - l 15196 21298 15203 21305 100 -} -a { - s 0 - b 15197 21299 - e 15204 21306 - l 15197 21299 15204 21306 100 -} -a { - s 0 - b 15198 21300 - e 15205 21307 - l 15198 21300 15205 21307 100 -} -a { - s 0 - b 17985 21306 - e 17992 21313 - l 17985 21306 17992 21313 100 -} -a { - s 0 - b 8692 21325 - e 8699 21332 - l 8692 21325 8699 21332 100 -} -a { - s 0 - b 8693 21326 - e 8700 21333 - l 8693 21326 8700 21333 100 -} -a { - s 0 - b 7469 21343 - e 
7476 21350 - l 7469 21343 7476 21350 100 -} -a { - s 0 - b 7470 21344 - e 7477 21351 - l 7470 21344 7477 21351 100 -} -a { - s 0 - b 3908 21344 - e 3915 21351 - l 3908 21344 3915 21351 100 -} -a { - s 0 - b 17535 21345 - e 17542 21352 - l 17535 21345 17542 21352 100 -} -a { - s 0 - b 17536 21346 - e 17543 21353 - l 17536 21346 17543 21353 100 -} -a { - s 0 - b 17537 21347 - e 17544 21354 - l 17537 21347 17544 21354 100 -} -a { - s 0 - b 17469 21347 - e 17476 21354 - l 17469 21347 17476 21354 100 -} -a { - s 0 - b 4841 21347 - e 4848 21354 - l 4841 21347 4848 21354 100 -} -a { - s 0 - b 7547 21348 - e 7554 21355 - l 7547 21348 7554 21355 100 -} -a { - s 0 - b 4446 21351 - e 4453 21358 - l 4446 21351 4453 21358 100 -} -a { - s 0 - b 6569 21354 - e 6576 21361 - l 6569 21354 6576 21361 100 -} -a { - s 0 - b 14113 21355 - e 14120 21362 - l 14113 21355 14120 21362 100 -} -a { - s 0 - b 7437 21373 - e 7444 21380 - l 7437 21373 7444 21380 100 -} -a { - s 0 - b 7621 21378 - e 7628 21385 - l 7621 21378 7628 21385 100 -} -a { - s 0 - b 5683 21381 - e 5690 21388 - l 5683 21381 5690 21388 100 -} -a { - s 0 - b 5684 21382 - e 5691 21389 - l 5684 21382 5691 21389 100 -} -a { - s 0 - b 18558 21391 - e 18565 21398 - l 18558 21391 18565 21398 100 -} -a { - s 0 - b 8680 21391 - e 8687 21398 - l 8680 21391 8687 21398 100 -} -a { - s 0 - b 14287 21396 - e 14294 21403 - l 14287 21396 14294 21403 100 -} -a { - s 0 - b 14288 21397 - e 14295 21404 - l 14288 21397 14295 21404 100 -} -a { - s 0 - b 4989 21398 - e 4996 21405 - l 4989 21398 4996 21405 100 -} -a { - s 0 - b 4990 21399 - e 4997 21406 - l 4990 21399 4997 21406 100 -} -a { - s 0 - b 4991 21400 - e 4998 21407 - l 4991 21400 4998 21407 100 -} -a { - s 0 - b 4992 21401 - e 4999 21408 - l 4992 21401 4999 21408 100 -} -a { - s 0 - b 11130 21409 - e 11137 21416 - l 11130 21409 11137 21416 100 -} -a { - s 0 - b 4543 21409 - e 4550 21416 - l 4543 21409 4550 21416 100 -} -a { - s 0 - b 4544 21410 - e 4551 21417 - l 4544 21410 4551 21417 
100 -} -a { - s 0 - b 5010 21412 - e 5017 21419 - l 5010 21412 5017 21419 100 -} -a { - s 0 - b 10887 21429 - e 10894 21436 - l 10887 21429 10894 21436 100 -} -a { - s 0 - b 17470 21452 - e 17477 21459 - l 17470 21452 17477 21459 100 -} -a { - s 0 - b 3367 21456 - e 3374 21463 - l 3367 21456 3374 21463 100 -} -a { - s 0 - b 118 21457 - e 125 21464 - l 118 21457 125 21464 100 -} -a { - s 0 - b 18483 21471 - e 18490 21478 - l 18483 21471 18490 21478 100 -} -a { - s 0 - b 18484 21472 - e 18491 21479 - l 18484 21472 18491 21479 100 -} -a { - s 0 - b 17343 21474 - e 17350 21481 - l 17343 21474 17350 21481 100 -} -a { - s 0 - b 9043 21518 - e 9050 21525 - l 9043 21518 9050 21525 100 -} -a { - s 0 - b 9044 21519 - e 9051 21526 - l 9044 21519 9051 21526 100 -} -a { - s 0 - b 16152 21525 - e 16159 21532 - l 16152 21525 16159 21532 100 -} -a { - s 0 - b 11075 21525 - e 11082 21532 - l 11075 21525 11082 21532 100 -} -a { - s 0 - b 11076 21526 - e 11083 21533 - l 11076 21526 11083 21533 100 -} -a { - s 0 - b 6803 21531 - e 6810 21538 - l 6803 21531 6810 21538 100 -} -a { - s 0 - b 18603 21532 - e 18610 21539 - l 18603 21532 18610 21539 100 -} -a { - s 0 - b 14314 21532 - e 14321 21539 - l 14314 21532 14321 21539 100 -} -a { - s 0 - b 11036 21532 - e 11043 21539 - l 11036 21532 11043 21539 100 -} -a { - s 0 - b 6804 21532 - e 6811 21539 - l 6804 21532 6811 21539 100 -} -a { - s 0 - b 6805 21533 - e 6812 21540 - l 6805 21533 6812 21540 100 -} -a { - s 0 - b 11307 21535 - e 11314 21542 - l 11307 21535 11314 21542 100 -} -a { - s 0 - b 8077 21545 - e 8084 21552 - l 8077 21545 8084 21552 100 -} -a { - s 0 - b 8078 21546 - e 8085 21553 - l 8078 21546 8085 21553 100 -} -a { - s 0 - b 16038 21548 - e 16045 21555 - l 16038 21548 16045 21555 100 -} -a { - s 0 - b 8734 21567 - e 8741 21574 - l 8734 21567 8741 21574 100 -} -a { - s 0 - b 8735 21568 - e 8742 21575 - l 8735 21568 8742 21575 100 -} -a { - s 0 - b 9351 21569 - e 9358 21576 - l 9351 21569 9358 21576 100 -} -a { - s 0 - b 9352 
21570 - e 9359 21577 - l 9352 21570 9359 21577 100 -} -a { - s 0 - b 8332 21575 - e 8339 21582 - l 8332 21575 8339 21582 100 -} -a { - s 0 - b 7430 21592 - e 7437 21599 - l 7430 21592 7437 21599 100 -} -a { - s 0 - b 14125 21596 - e 14132 21603 - l 14125 21596 14132 21603 100 -} -a { - s 0 - b 6742 21598 - e 6749 21605 - l 6742 21598 6749 21605 100 -} -a { - s 0 - b 6743 21599 - e 6750 21606 - l 6743 21599 6750 21606 100 -} -a { - s 0 - b 18784 21604 - e 18791 21611 - l 18784 21604 18791 21611 100 -} -a { - s 0 - b 14395 21606 - e 14402 21613 - l 14395 21606 14402 21613 100 -} -a { - s 0 - b 14396 21607 - e 14403 21614 - l 14396 21607 14403 21614 100 -} -a { - s 0 - b 301 21607 - e 308 21614 - l 301 21607 308 21614 100 -} -a { - s 0 - b 15217 21613 - e 15224 21620 - l 15217 21613 15224 21620 100 -} -a { - s 0 - b 12146 21613 - e 12153 21620 - l 12146 21613 12153 21620 100 -} -a { - s 0 - b 15218 21614 - e 15225 21621 - l 15218 21614 15225 21621 100 -} -a { - s 0 - b 12147 21614 - e 12154 21621 - l 12147 21614 12154 21621 100 -} -a { - s 0 - b 15219 21615 - e 15226 21622 - l 15219 21615 15226 21622 100 -} -a { - s 0 - b 8587 21626 - e 8594 21633 - l 8587 21626 8594 21633 100 -} -a { - s 0 - b 9431 21772 - e 9438 21779 - l 9431 21772 9438 21779 100 -} -a { - s 0 - b 14770 21794 - e 14777 21801 - l 14770 21794 14777 21801 100 -} -a { - s 0 - b 14771 21795 - e 14778 21802 - l 14771 21795 14778 21802 100 -} -a { - s 0 - b 12163 21796 - e 12170 21803 - l 12163 21796 12170 21803 100 -} -a { - s 0 - b 12164 21797 - e 12171 21804 - l 12164 21797 12171 21804 100 -} -a { - s 0 - b 4392 21799 - e 4399 21806 - l 4392 21799 4399 21806 100 -} -a { - s 0 - b 14968 21800 - e 14975 21807 - l 14968 21800 14975 21807 100 -} -a { - s 0 - b 4393 21800 - e 4400 21807 - l 4393 21800 4400 21807 100 -} -a { - s 0 - b 3325 21801 - e 3332 21808 - l 3325 21801 3332 21808 100 -} -a { - s 0 - b 4443 21807 - e 4450 21814 - l 4443 21807 4450 21814 100 -} -a { - s 0 - b 8259 21808 - e 8266 21815 - 
l 8259 21808 8266 21815 100 -} -a { - s 0 - b 9343 21821 - e 9350 21828 - l 9343 21821 9350 21828 100 -} -a { - s 0 - b 9415 21831 - e 9422 21838 - l 9415 21831 9422 21838 100 -} -a { - s 0 - b 14576 21841 - e 14583 21848 - l 14576 21841 14583 21848 100 -} -a { - s 0 - b 14577 21842 - e 14584 21849 - l 14577 21842 14584 21849 100 -} -a { - s 0 - b 17379 21874 - e 17386 21881 - l 17379 21874 17386 21881 100 -} -a { - s 0 - b 17454 21875 - e 17461 21882 - l 17454 21875 17461 21882 100 -} -a { - s 0 - b 17455 21876 - e 17462 21883 - l 17455 21876 17462 21883 100 -} -a { - s 0 - b 14772 21876 - e 14779 21883 - l 14772 21876 14779 21883 100 -} -a { - s 0 - b 9667 21877 - e 9674 21884 - l 9667 21877 9674 21884 100 -} -a { - s 0 - b 9668 21878 - e 9675 21885 - l 9668 21878 9675 21885 100 -} -a { - s 0 - b 14520 21890 - e 14527 21897 - l 14520 21890 14527 21897 100 -} -a { - s 0 - b 4878 21920 - e 4885 21927 - l 4878 21920 4885 21927 100 -} -a { - s 0 - b 12 21929 - e 19 21936 - l 12 21929 19 21936 100 -} -a { - s 0 - b 7619 21937 - e 7626 21944 - l 7619 21937 7626 21944 100 -} -a { - s 0 - b 17838 21938 - e 17845 21945 - l 17838 21938 17845 21945 100 -} -a { - s 0 - b 16069 21938 - e 16076 21945 - l 16069 21938 16076 21945 100 -} -a { - s 0 - b 11329 21938 - e 11336 21945 - l 11329 21938 11336 21945 100 -} -a { - s 0 - b 11330 21939 - e 11337 21946 - l 11330 21939 11337 21946 100 -} -a { - s 0 - b 18644 21948 - e 18651 21955 - l 18644 21948 18651 21955 100 -} -a { - s 0 - b 7412 21954 - e 7419 21961 - l 7412 21954 7419 21961 100 -} -a { - s 0 - b 10884 21969 - e 10891 21976 - l 10884 21969 10891 21976 100 -} -a { - s 0 - b 10885 21970 - e 10892 21977 - l 10885 21970 10892 21977 100 -} -a { - s 0 - b 10886 21971 - e 10893 21978 - l 10886 21971 10893 21978 100 -} -a { - s 0 - b 10887 21972 - e 10894 21979 - l 10887 21972 10894 21979 100 -} -a { - s 0 - b 10888 21973 - e 10895 21980 - l 10888 21973 10895 21980 100 -} -a { - s 0 - b 14313 21975 - e 14320 21982 - l 14313 21975 
14320 21982 100 -} -a { - s 0 - b 13448 21975 - e 13455 21982 - l 13448 21975 13455 21982 100 -} -a { - s 0 - b 6605 21975 - e 6612 21982 - l 6605 21975 6612 21982 100 -} -a { - s 0 - b 13449 21976 - e 13456 21983 - l 13449 21976 13456 21983 100 -} -a { - s 0 - b 18014 21987 - e 18021 21994 - l 18014 21987 18021 21994 100 -} -a { - s 0 - b 18015 21988 - e 18022 21995 - l 18015 21988 18022 21995 100 -} -a { - s 0 - b 13953 21988 - e 13960 21995 - l 13953 21988 13960 21995 100 -} -a { - s 0 - b 13954 21989 - e 13961 21996 - l 13954 21989 13961 21996 100 -} -a { - s 0 - b 13955 21990 - e 13962 21997 - l 13955 21990 13962 21997 100 -} -a { - s 0 - b 8747 22017 - e 8754 22024 - l 8747 22017 8754 22024 100 -} -a { - s 0 - b 6285 22018 - e 6292 22025 - l 6285 22018 6292 22025 100 -} -a { - s 0 - b 6286 22019 - e 6293 22026 - l 6286 22019 6293 22026 100 -} -a { - s 0 - b 3918 22021 - e 3925 22028 - l 3918 22021 3925 22028 100 -} -a { - s 0 - b 8472 22028 - e 8479 22035 - l 8472 22028 8479 22035 100 -} -a { - s 0 - b 5062 22029 - e 5069 22036 - l 5062 22029 5069 22036 100 -} -a { - s 0 - b 18153 22606 - e 18160 22613 - l 18153 22606 18160 22613 100 -} -a { - s 0 - b 15255 22617 - e 15262 22624 - l 15255 22617 15262 22624 100 -} -a { - s 0 - b 10955 22626 - e 10962 22633 - l 10955 22626 10962 22633 100 -} -a { - s 0 - b 4243 22635 - e 4250 22642 - l 4243 22635 4250 22642 100 -} -a { - s 0 - b 4244 22636 - e 4251 22643 - l 4244 22636 4251 22643 100 -} -a { - s 0 - b 18591 22654 - e 18598 22661 - l 18591 22654 18598 22661 100 -} -a { - s 0 - b 8582 22663 - e 8589 22670 - l 8582 22663 8589 22670 100 -} -a { - s 0 - b 11421 22664 - e 11428 22671 - l 11421 22664 11428 22671 100 -} -a { - s 0 - b 8583 22664 - e 8590 22671 - l 8583 22664 8590 22671 100 -} -a { - s 0 - b 11422 22665 - e 11429 22672 - l 11422 22665 11429 22672 100 -} -a { - s 0 - b 8584 22665 - e 8591 22672 - l 8584 22665 8591 22672 100 -} -a { - s 0 - b 9510 22676 - e 9517 22683 - l 9510 22676 9517 22683 100 -} -a { 
- s 0 - b 6341 22691 - e 6348 22698 - l 6341 22691 6348 22698 100 -} -a { - s 0 - b 14 22695 - e 21 22702 - l 14 22695 21 22702 100 -} -a { - s 0 - b 9715 22699 - e 9722 22706 - l 9715 22699 9722 22706 100 -} -a { - s 0 - b 6334 22699 - e 6341 22706 - l 6334 22699 6341 22706 100 -} -a { - s 0 - b 15135 22718 - e 15142 22725 - l 15135 22718 15142 22725 100 -} -a { - s 0 - b 8653 22720 - e 8660 22727 - l 8653 22720 8660 22727 100 -} -a { - s 0 - b 17390 22760 - e 17397 22767 - l 17390 22760 17397 22767 100 -} -a { - s 0 - b 2988 22787 - e 2995 22794 - l 2988 22787 2995 22794 100 -} -a { - s 0 - b 17435 22788 - e 17442 22795 - l 17435 22788 17442 22795 100 -} -a { - s 0 - b 15331 22820 - e 15338 22827 - l 15331 22820 15338 22827 100 -} -a { - s 0 - b 14255 22821 - e 14262 22828 - l 14255 22821 14262 22828 100 -} -a { - s 0 - b 14151 22823 - e 14158 22830 - l 14151 22823 14158 22830 100 -} -a { - s 0 - b 18419 22825 - e 18426 22832 - l 18419 22825 18426 22832 100 -} -a { - s 0 - b 15360 22828 - e 15367 22835 - l 15360 22828 15367 22835 100 -} -a { - s 0 - b 14753 22830 - e 14760 22837 - l 14753 22830 14760 22837 100 -} -a { - s 0 - b 14928 22853 - e 14935 22860 - l 14928 22853 14935 22860 100 -} -a { - s 0 - b 9637 22854 - e 9644 22861 - l 9637 22854 9644 22861 100 -} -a { - s 0 - b 13891 22855 - e 13898 22862 - l 13891 22855 13898 22862 100 -} -a { - s 0 - b 13892 22856 - e 13899 22863 - l 13892 22856 13899 22863 100 -} -a { - s 0 - b 4072 22880 - e 4079 22887 - l 4072 22880 4079 22887 100 -} -a { - s 0 - b 6426 22881 - e 6433 22888 - l 6426 22881 6433 22888 100 -} -a { - s 0 - b 8415 22885 - e 8422 22892 - l 8415 22885 8422 22892 100 -} -a { - s 0 - b 6817 22889 - e 6824 22896 - l 6817 22889 6824 22896 100 -} -a { - s 0 - b 6818 22890 - e 6825 22897 - l 6818 22890 6825 22897 100 -} -a { - s 0 - b 9343 22897 - e 9350 22904 - l 9343 22897 9350 22904 100 -} -a { - s 0 - b 11377 22917 - e 11384 22924 - l 11377 22917 11384 22924 100 -} -a { - s 0 - b 6720 22919 - e 6727 
22926 - l 6720 22919 6727 22926 100 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 2 -} -h { - "> cat" - "> pig2" -} -a { - s 0 - b 7572 7 - e 7579 14 - l 7572 7 7579 14 100 -} -a { - s 0 - b 7986 11 - e 7993 18 - l 7986 11 7993 18 100 -} -a { - s 0 - b 7732 16 - e 7739 23 - l 7732 16 7739 23 100 -} -a { - s 0 - b 14747 52 - e 14754 59 - l 14747 52 14754 59 100 -} -a { - s 0 - b 14748 53 - e 14755 60 - l 14748 53 14755 60 100 -} -a { - s 0 - b 3897 54 - e 3904 61 - l 3897 54 3904 61 100 -} -a { - s 0 - b 3898 55 - e 3905 62 - l 3898 55 3905 62 100 -} -a { - s 0 - b 18775 59 - e 18782 66 - l 18775 59 18782 66 100 -} -a { - s 0 - b 14944 73 - e 14951 80 - l 14944 73 14951 80 100 -} -a { - s 0 - b 9407 75 - e 9414 82 - l 9407 75 9414 82 100 -} -a { - s 0 - b 81 75 - e 88 82 - l 81 75 88 82 100 -} -a { - s 0 - b 9408 76 - e 9415 83 - l 9408 76 9415 83 100 -} -a { - s 0 - b 10919 77 - e 10926 84 - l 10919 77 10926 84 100 -} -a { - s 0 - b 14236 81 - e 14243 88 - l 14236 81 14243 88 100 -} -a { - s 0 - b 7561 82 - e 7568 89 - l 7561 82 7568 89 100 -} -a { - s 0 - b 3638 104 - e 3645 111 - l 3638 104 3645 111 100 -} -a { - s 0 - b 3639 105 - e 3646 112 - l 3639 105 3646 112 100 -} -a { - s 0 - b 18645 120 - e 18652 127 - l 18645 120 18652 127 100 -} -a { - s 0 - b 15293 124 - e 15300 131 - l 15293 124 15300 131 100 -} -a { - s 0 - b 15294 125 - e 15301 132 - l 15294 125 15301 132 100 -} -a { - s 0 - b 15295 126 - e 15302 133 - l 15295 126 15302 133 100 -} -a { - s 0 - b 8765 135 - e 8772 142 - l 8765 135 8772 142 100 -} -a { - s 0 - b 18564 142 - e 18571 149 - l 18564 142 18571 149 100 -} -a { - s 0 - b 4013 153 - e 4020 160 - l 4013 153 4020 160 100 -} -a { - s 0 - b 6585 183 - e 6592 190 - l 6585 183 6592 190 100 -} -a { - s 0 - b 18400 184 - e 18407 191 - l 18400 184 18407 191 100 -} -a { - s 0 - b 9625 186 - e 9632 193 - l 9625 186 9632 193 100 -} -a { - s 0 - b 3279 186 - e 3286 193 - l 3279 186 3286 193 100 -} -a { - 
s 0 - b 3545 188 - e 3552 195 - l 3545 188 3552 195 100 -} -a { - s 0 - b 4494 192 - e 4501 199 - l 4494 192 4501 199 100 -} -a { - s 0 - b 4980 216 - e 4987 223 - l 4980 216 4987 223 100 -} -a { - s 0 - b 14871 222 - e 14878 229 - l 14871 222 14878 229 100 -} -a { - s 0 - b 14872 223 - e 14879 230 - l 14872 223 14879 230 100 -} -a { - s 0 - b 4253 233 - e 4260 240 - l 4253 233 4260 240 100 -} -a { - s 0 - b 4522 234 - e 4529 241 - l 4522 234 4529 241 100 -} -a { - s 0 - b 4254 234 - e 4261 241 - l 4254 234 4261 241 100 -} -a { - s 0 - b 9 237 - e 16 244 - l 9 237 16 244 100 -} -a { - s 0 - b 6431 240 - e 6438 247 - l 6431 240 6438 247 100 -} -a { - s 0 - b 6432 241 - e 6439 248 - l 6432 241 6439 248 100 -} -a { - s 0 - b 6433 242 - e 6440 249 - l 6433 242 6440 249 100 -} -a { - s 0 - b 2967 245 - e 2974 252 - l 2967 245 2974 252 100 -} -a { - s 0 - b 13916 250 - e 13923 257 - l 13916 250 13923 257 100 -} -a { - s 0 - b 9493 251 - e 9500 258 - l 9493 251 9500 258 100 -} -a { - s 0 - b 7752 256 - e 7759 263 - l 7752 256 7759 263 100 -} -a { - s 0 - b 6278 265 - e 6285 272 - l 6278 265 6285 272 100 -} -a { - s 0 - b 17436 266 - e 17443 273 - l 17436 266 17443 273 100 -} -a { - s 0 - b 6279 266 - e 6286 273 - l 6279 266 6286 273 100 -} -a { - s 0 - b 3298 275 - e 3305 282 - l 3298 275 3305 282 100 -} -a { - s 0 - b 4529 283 - e 4536 290 - l 4529 283 4536 290 100 -} -a { - s 0 - b 14025 284 - e 14032 291 - l 14025 284 14032 291 100 -} -a { - s 0 - b 15299 287 - e 15306 294 - l 15299 287 15306 294 100 -} -a { - s 0 - b 9416 288 - e 9423 295 - l 9416 288 9423 295 100 -} -a { - s 0 - b 3274 313 - e 3281 320 - l 3274 313 3281 320 100 -} -a { - s 0 - b 3275 314 - e 3282 321 - l 3275 314 3282 321 100 -} -a { - s 0 - b 18481 317 - e 18488 324 - l 18481 317 18488 324 100 -} -a { - s 0 - b 6906 318 - e 6913 325 - l 6906 318 6913 325 100 -} -a { - s 0 - b 11112 323 - e 11119 330 - l 11112 323 11119 330 100 -} -a { - s 0 - b 14761 324 - e 14768 331 - l 14761 324 14768 331 100 -} 
-a { - s 0 - b 3694 325 - e 3701 332 - l 3694 325 3701 332 100 -} -a { - s 0 - b 14020 353 - e 14027 360 - l 14020 353 14027 360 100 -} -a { - s 0 - b 9281 355 - e 9288 362 - l 9281 355 9288 362 100 -} -a { - s 0 - b 13947 356 - e 13954 363 - l 13947 356 13954 363 100 -} -a { - s 0 - b 9048 356 - e 9055 363 - l 9048 356 9055 363 100 -} -a { - s 0 - b 9049 357 - e 9056 364 - l 9049 357 9056 364 100 -} -a { - s 0 - b 5072 357 - e 5079 364 - l 5072 357 5079 364 100 -} -a { - s 0 - b 5073 358 - e 5080 365 - l 5073 358 5080 365 100 -} -a { - s 0 - b 7624 368 - e 7631 375 - l 7624 368 7631 375 100 -} -a { - s 0 - b 17300 372 - e 17307 379 - l 17300 372 17307 379 100 -} -a { - s 0 - b 3322 374 - e 3329 381 - l 3322 374 3329 381 100 -} -a { - s 0 - b 18782 379 - e 18789 386 - l 18782 379 18789 386 100 -} -a { - s 0 - b 10893 380 - e 10900 387 - l 10893 380 10900 387 100 -} -a { - s 0 - b 8095 389 - e 8102 396 - l 8095 389 8102 396 100 -} -a { - s 0 - b 17214 405 - e 17221 412 - l 17214 405 17221 412 100 -} -a { - s 0 - b 8658 416 - e 8665 423 - l 8658 416 8665 423 100 -} -a { - s 0 - b 18650 419 - e 18657 426 - l 18650 419 18657 426 100 -} -a { - s 0 - b 17514 422 - e 17521 429 - l 17514 422 17521 429 100 -} -a { - s 0 - b 17515 423 - e 17522 430 - l 17515 423 17522 430 100 -} -a { - s 0 - b 11164 438 - e 11171 445 - l 11164 438 11171 445 100 -} -a { - s 0 - b 11023 438 - e 11030 445 - l 11023 438 11030 445 100 -} -a { - s 0 - b 4880 446 - e 4887 453 - l 4880 446 4887 453 100 -} -a { - s 0 - b 4881 447 - e 4888 454 - l 4881 447 4888 454 100 -} -a { - s 0 - b 7413 468 - e 7420 475 - l 7413 468 7420 475 100 -} -a { - s 0 - b 8082 481 - e 8089 488 - l 8082 481 8089 488 100 -} -a { - s 0 - b 8083 482 - e 8090 489 - l 8083 482 8090 489 100 -} -a { - s 0 - b 3548 486 - e 3555 493 - l 3548 486 3555 493 100 -} -a { - s 0 - b 8778 500 - e 8785 507 - l 8778 500 8785 507 100 -} -a { - s 0 - b 8779 501 - e 8786 508 - l 8779 501 8786 508 100 -} -a { - s 0 - b 17978 506 - e 17985 513 - 
l 17978 506 17985 513 100 -} -a { - s 0 - b 17979 507 - e 17986 514 - l 17979 507 17986 514 100 -} -a { - s 0 - b 8599 507 - e 8606 514 - l 8599 507 8606 514 100 -} -a { - s 0 - b 8600 508 - e 8607 515 - l 8600 508 8607 515 100 -} -a { - s 0 - b 14528 510 - e 14535 517 - l 14528 510 14535 517 100 -} -a { - s 0 - b 14529 511 - e 14536 518 - l 14529 511 14536 518 100 -} -a { - s 0 - b 14974 515 - e 14981 522 - l 14974 515 14981 522 100 -} -a { - s 0 - b 16034 871 - e 16041 878 - l 16034 871 16041 878 100 -} -a { - s 0 - b 14674 876 - e 14681 883 - l 14674 876 14681 883 100 -} -a { - s 0 - b 8763 882 - e 8770 889 - l 8763 882 8770 889 100 -} -a { - s 0 - b 3172 904 - e 3179 911 - l 3172 904 3179 911 100 -} -a { - s 0 - b 6445 908 - e 6452 915 - l 6445 908 6452 915 100 -} -a { - s 0 - b 227 913 - e 234 920 - l 227 913 234 920 100 -} -a { - s 0 - b 228 914 - e 235 921 - l 228 914 235 921 100 -} -a { - s 0 - b 8 917 - e 15 924 - l 8 917 15 924 100 -} -a { - s 0 - b 15971 918 - e 15978 925 - l 15971 918 15978 925 100 -} -a { - s 0 - b 10929 938 - e 10936 945 - l 10929 938 10936 945 100 -} -a { - s 0 - b 13980 953 - e 13987 960 - l 13980 953 13987 960 100 -} -a { - s 0 - b 10967 956 - e 10974 963 - l 10967 956 10974 963 100 -} -a { - s 0 - b 17414 960 - e 17421 967 - l 17414 960 17421 967 100 -} -a { - s 0 - b 14304 960 - e 14311 967 - l 14304 960 14311 967 100 -} -a { - s 0 - b 7439 960 - e 7446 967 - l 7439 960 7446 967 100 -} -a { - s 0 - b 7455 962 - e 7462 969 - l 7455 962 7462 969 100 -} -a { - s 0 - b 4626 991 - e 4633 998 - l 4626 991 4633 998 100 -} -a { - s 0 - b 4627 992 - e 4634 999 - l 4627 992 4634 999 100 -} -a { - s 0 - b 17255 1012 - e 17262 1019 - l 17255 1012 17262 1019 100 -} -a { - s 0 - b 17256 1013 - e 17263 1020 - l 17256 1013 17263 1020 100 -} -a { - s 0 - b 15943 1016 - e 15950 1023 - l 15943 1016 15950 1023 100 -} -a { - s 0 - b 3647 1028 - e 3654 1035 - l 3647 1028 3654 1035 100 -} -a { - s 0 - b 5568 1032 - e 5575 1039 - l 5568 1032 5575 1039 
100 -} -a { - s 0 - b 3643 1038 - e 3650 1045 - l 3643 1038 3650 1045 100 -} -a { - s 0 - b 3644 1039 - e 3651 1046 - l 3644 1039 3651 1046 100 -} -a { - s 0 - b 11385 1045 - e 11392 1052 - l 11385 1045 11392 1052 100 -} -a { - s 0 - b 11386 1046 - e 11393 1053 - l 11386 1046 11393 1053 100 -} -a { - s 0 - b 17413 1049 - e 17420 1056 - l 17413 1049 17420 1056 100 -} -a { - s 0 - b 14303 1049 - e 14310 1056 - l 14303 1049 14310 1056 100 -} -a { - s 0 - b 17414 1050 - e 17421 1057 - l 17414 1050 17421 1057 100 -} -a { - s 0 - b 14304 1050 - e 14311 1057 - l 14304 1050 14311 1057 100 -} -a { - s 0 - b 7439 1050 - e 7446 1057 - l 7439 1050 7446 1057 100 -} -a { - s 0 - b 17415 1051 - e 17422 1058 - l 17415 1051 17422 1058 100 -} -a { - s 0 - b 7440 1051 - e 7447 1058 - l 7440 1051 7447 1058 100 -} -a { - s 0 - b 18108 1052 - e 18115 1059 - l 18108 1052 18115 1059 100 -} -a { - s 0 - b 5069 1053 - e 5076 1060 - l 5069 1053 5076 1060 100 -} -a { - s 0 - b 13450 1055 - e 13457 1062 - l 13450 1055 13457 1062 100 -} -a { - s 0 - b 6856 1055 - e 6863 1062 - l 6856 1055 6863 1062 100 -} -a { - s 0 - b 13451 1056 - e 13458 1063 - l 13451 1056 13458 1063 100 -} -a { - s 0 - b 6368 1071 - e 6375 1078 - l 6368 1071 6375 1078 100 -} -a { - s 0 - b 14880 1072 - e 14887 1079 - l 14880 1072 14887 1079 100 -} -a { - s 0 - b 6369 1072 - e 6376 1079 - l 6369 1072 6376 1079 100 -} -a { - s 0 - b 3544 1104 - e 3551 1111 - l 3544 1104 3551 1111 100 -} -a { - s 0 - b 8199 1114 - e 8206 1121 - l 8199 1114 8206 1121 100 -} -a { - s 0 - b 17094 1115 - e 17101 1122 - l 17094 1115 17101 1122 100 -} -a { - s 0 - b 12239 1115 - e 12246 1122 - l 12239 1115 12246 1122 100 -} -a { - s 0 - b 18590 1121 - e 18597 1128 - l 18590 1121 18597 1128 100 -} -a { - s 0 - b 3028 1121 - e 3035 1128 - l 3028 1121 3035 1128 100 -} -a { - s 0 - b 14722 1155 - e 14729 1162 - l 14722 1155 14729 1162 100 -} -a { - s 0 - b 11079 1157 - e 11086 1164 - l 11079 1157 11086 1164 100 -} -a { - s 0 - b 17474 1159 - e 17481 
1166 - l 17474 1159 17481 1166 100 -} -a { - s 0 - b 8343 1159 - e 8350 1166 - l 8343 1159 8350 1166 100 -} -a { - s 0 - b 6562 1159 - e 6569 1166 - l 6562 1159 6569 1166 100 -} -a { - s 0 - b 6563 1160 - e 6570 1167 - l 6563 1160 6570 1167 100 -} -a { - s 0 - b 4391 1166 - e 4398 1173 - l 4391 1166 4398 1173 100 -} -a { - s 0 - b 11476 1168 - e 11483 1175 - l 11476 1168 11483 1175 100 -} -a { - s 0 - b 8107 1169 - e 8114 1176 - l 8107 1169 8114 1176 100 -} -a { - s 0 - b 4843 1174 - e 4850 1181 - l 4843 1174 4850 1181 100 -} -a { - s 0 - b 4844 1175 - e 4851 1182 - l 4844 1175 4851 1182 100 -} -a { - s 0 - b 8503 1201 - e 8510 1208 - l 8503 1201 8510 1208 100 -} -a { - s 0 - b 11294 1203 - e 11301 1210 - l 11294 1203 11301 1210 100 -} -a { - s 0 - b 16078 1282 - e 16085 1289 - l 16078 1282 16085 1289 100 -} -a { - s 0 - b 9458 1283 - e 9465 1290 - l 9458 1283 9465 1290 100 -} -a { - s 0 - b 7784 1290 - e 7791 1297 - l 7784 1290 7791 1297 100 -} -a { - s 0 - b 10881 1291 - e 10888 1298 - l 10881 1291 10888 1298 100 -} -a { - s 0 - b 11129 1297 - e 11136 1304 - l 11129 1297 11136 1304 100 -} -a { - s 0 - b 12208 1302 - e 12215 1309 - l 12208 1302 12215 1309 100 -} -a { - s 0 - b 6235 1302 - e 6242 1309 - l 6235 1302 6242 1309 100 -} -a { - s 0 - b 7580 1322 - e 7587 1329 - l 7580 1322 7587 1329 100 -} -a { - s 0 - b 17357 1323 - e 17364 1330 - l 17357 1323 17364 1330 100 -} -a { - s 0 - b 7581 1323 - e 7588 1330 - l 7581 1323 7588 1330 100 -} -a { - s 0 - b 17358 1324 - e 17365 1331 - l 17358 1324 17365 1331 100 -} -a { - s 0 - b 7582 1324 - e 7589 1331 - l 7582 1324 7589 1331 100 -} -a { - s 0 - b 6304 1324 - e 6311 1331 - l 6304 1324 6311 1331 100 -} -a { - s 0 - b 6305 1325 - e 6312 1332 - l 6305 1325 6312 1332 100 -} -a { - s 0 - b 7399 1336 - e 7406 1343 - l 7399 1336 7406 1343 100 -} -a { - s 0 - b 7400 1337 - e 7407 1344 - l 7400 1337 7407 1344 100 -} -a { - s 0 - b 4562 1350 - e 4569 1357 - l 4562 1350 4569 1357 100 -} -a { - s 0 - b 18725 1352 - e 18732 
1359 - l 18725 1352 18732 1359 100 -} -a { - s 0 - b 18726 1353 - e 18733 1360 - l 18726 1353 18733 1360 100 -} -a { - s 0 - b 18081 1353 - e 18088 1360 - l 18081 1353 18088 1360 100 -} -a { - s 0 - b 18082 1354 - e 18089 1361 - l 18082 1354 18089 1361 100 -} -a { - s 0 - b 7516 1399 - e 7523 1406 - l 7516 1399 7523 1406 100 -} -a { - s 0 - b 4487 1401 - e 4494 1408 - l 4487 1401 4494 1408 100 -} -a { - s 0 - b 8733 1447 - e 8740 1454 - l 8733 1447 8740 1454 100 -} -a { - s 0 - b 288 1469 - e 295 1476 - l 288 1469 295 1476 100 -} -a { - s 0 - b 7494 1492 - e 7501 1499 - l 7494 1492 7501 1499 100 -} -a { - s 0 - b 14705 1499 - e 14712 1506 - l 14705 1499 14712 1506 100 -} -a { - s 0 - b 14706 1500 - e 14713 1507 - l 14706 1500 14713 1507 100 -} -a { - s 0 - b 14707 1501 - e 14714 1508 - l 14707 1501 14714 1508 100 -} -a { - s 0 - b 14708 1502 - e 14715 1509 - l 14708 1502 14715 1509 100 -} -a { - s 0 - b 7394 1505 - e 7401 1512 - l 7394 1505 7401 1512 100 -} -a { - s 0 - b 7395 1506 - e 7402 1513 - l 7395 1506 7402 1513 100 -} -a { - s 0 - b 12201 1518 - e 12208 1525 - l 12201 1518 12208 1525 100 -} -a { - s 0 - b 8425 1518 - e 8432 1525 - l 8425 1518 8432 1525 100 -} -a { - s 0 - b 12202 1519 - e 12209 1526 - l 12202 1519 12209 1526 100 -} -a { - s 0 - b 8751 1540 - e 8758 1547 - l 8751 1540 8758 1547 100 -} -a { - s 0 - b 6674 1542 - e 6681 1549 - l 6674 1542 6681 1549 100 -} -a { - s 0 - b 6367 1543 - e 6374 1550 - l 6367 1543 6374 1550 100 -} -a { - s 0 - b 17266 1553 - e 17273 1560 - l 17266 1553 17273 1560 100 -} -a { - s 0 - b 14478 1558 - e 14485 1565 - l 14478 1558 14485 1565 100 -} -a { - s 0 - b 18075 1569 - e 18082 1576 - l 18075 1569 18082 1576 100 -} -a { - s 0 - b 18076 1570 - e 18083 1577 - l 18076 1570 18083 1577 100 -} -a { - s 0 - b 4796 1572 - e 4803 1579 - l 4796 1572 4803 1579 100 -} -a { - s 0 - b 4410 1606 - e 4417 1613 - l 4410 1606 4417 1613 100 -} -a { - s 0 - b 7463 1615 - e 7470 1622 - l 7463 1615 7470 1622 100 -} -a { - s 0 - b 7464 
1616 - e 7471 1623 - l 7464 1616 7471 1623 100 -} -a { - s 0 - b 7465 1617 - e 7472 1624 - l 7465 1617 7472 1624 100 -} -a { - s 0 - b 7466 1618 - e 7473 1625 - l 7466 1618 7473 1625 100 -} -a { - s 0 - b 7467 1619 - e 7474 1626 - l 7467 1619 7474 1626 100 -} -a { - s 0 - b 15313 1635 - e 15320 1642 - l 15313 1635 15320 1642 100 -} -a { - s 0 - b 15314 1636 - e 15321 1643 - l 15314 1636 15321 1643 100 -} -a { - s 0 - b 13396 1638 - e 13403 1645 - l 13396 1638 13403 1645 100 -} -a { - s 0 - b 11227 1643 - e 11234 1650 - l 11227 1643 11234 1650 100 -} -a { - s 0 - b 8928 1692 - e 8935 1699 - l 8928 1692 8935 1699 100 -} -a { - s 0 - b 8929 1693 - e 8936 1700 - l 8929 1693 8936 1700 100 -} -a { - s 0 - b 219 1698 - e 226 1705 - l 219 1698 226 1705 100 -} -a { - s 0 - b 44 1702 - e 51 1709 - l 44 1702 51 1709 100 -} -a { - s 0 - b 5704 1706 - e 5711 1713 - l 5704 1706 5711 1713 100 -} -a { - s 0 - b 6902 1707 - e 6909 1714 - l 6902 1707 6909 1714 100 -} -a { - s 0 - b 13478 1709 - e 13485 1716 - l 13478 1709 13485 1716 100 -} -a { - s 0 - b 3061 1712 - e 3068 1719 - l 3061 1712 3068 1719 100 -} -a { - s 0 - b 11527 1722 - e 11534 1729 - l 11527 1722 11534 1729 100 -} -a { - s 0 - b 8365 1754 - e 8372 1761 - l 8365 1754 8372 1761 100 -} -a { - s 0 - b 8774 1766 - e 8781 1773 - l 8774 1766 8781 1773 100 -} -a { - s 0 - b 3380 1768 - e 3387 1775 - l 3380 1768 3387 1775 100 -} -a { - s 0 - b 15387 1769 - e 15394 1776 - l 15387 1769 15394 1776 100 -} -a { - s 0 - b 3381 1769 - e 3388 1776 - l 3381 1769 3388 1776 100 -} -a { - s 0 - b 10931 1770 - e 10938 1777 - l 10931 1770 10938 1777 100 -} -a { - s 0 - b 9412 1770 - e 9419 1777 - l 9412 1770 9419 1777 100 -} -a { - s 0 - b 3382 1770 - e 3389 1777 - l 3382 1770 3389 1777 100 -} -a { - s 0 - b 10932 1771 - e 10939 1778 - l 10932 1771 10939 1778 100 -} -a { - s 0 - b 7524 1777 - e 7531 1784 - l 7524 1777 7531 1784 100 -} -a { - s 0 - b 7525 1778 - e 7532 1785 - l 7525 1778 7532 1785 100 -} -a { - s 0 - b 7526 1779 - e 7533 
1786 - l 7526 1779 7533 1786 100 -} -a { - s 0 - b 11137 1794 - e 11144 1801 - l 11137 1794 11144 1801 100 -} -a { - s 0 - b 8079 1795 - e 8086 1802 - l 8079 1795 8086 1802 100 -} -a { - s 0 - b 13940 1800 - e 13947 1807 - l 13940 1800 13947 1807 100 -} -a { - s 0 - b 13941 1801 - e 13948 1808 - l 13941 1801 13948 1808 100 -} -a { - s 0 - b 11264 1804 - e 11271 1811 - l 11264 1804 11271 1811 100 -} -a { - s 0 - b 11031 1811 - e 11038 1818 - l 11031 1811 11038 1818 100 -} -a { - s 0 - b 14979 1831 - e 14986 1838 - l 14979 1831 14986 1838 100 -} -a { - s 0 - b 14243 1831 - e 14250 1838 - l 14243 1831 14250 1838 100 -} -a { - s 0 - b 3268 1834 - e 3275 1841 - l 3268 1834 3275 1841 100 -} -a { - s 0 - b 5014 1839 - e 5021 1846 - l 5014 1839 5021 1846 100 -} -a { - s 0 - b 6252 1847 - e 6259 1854 - l 6252 1847 6259 1854 100 -} -a { - s 0 - b 8274 1855 - e 8281 1862 - l 8274 1855 8281 1862 100 -} -a { - s 0 - b 14038 1882 - e 14045 1889 - l 14038 1882 14045 1889 100 -} -a { - s 0 - b 14747 1885 - e 14754 1892 - l 14747 1885 14754 1892 100 -} -a { - s 0 - b 14748 1886 - e 14755 1893 - l 14748 1886 14755 1893 100 -} -a { - s 0 - b 4502 1892 - e 4509 1899 - l 4502 1892 4509 1899 100 -} -a { - s 0 - b 8522 1899 - e 8529 1906 - l 8522 1899 8529 1906 100 -} -a { - s 0 - b 18395 1901 - e 18402 1908 - l 18395 1901 18402 1908 100 -} -a { - s 0 - b 18396 1902 - e 18403 1909 - l 18396 1902 18403 1909 100 -} -a { - s 0 - b 8312 1905 - e 8319 1912 - l 8312 1905 8319 1912 100 -} -a { - s 0 - b 14093 2118 - e 14100 2125 - l 14093 2118 14100 2125 100 -} -a { - s 0 - b 11381 2118 - e 11388 2125 - l 11381 2118 11388 2125 100 -} -a { - s 0 - b 18674 2122 - e 18681 2129 - l 18674 2122 18681 2129 100 -} -a { - s 0 - b 18472 2127 - e 18479 2134 - l 18472 2127 18479 2134 100 -} -a { - s 0 - b 6578 2160 - e 6585 2167 - l 6578 2160 6585 2167 100 -} -a { - s 0 - b 16075 2162 - e 16082 2169 - l 16075 2162 16082 2169 100 -} -a { - s 0 - b 16076 2163 - e 16083 2170 - l 16076 2163 16083 2170 100 -} 
-a { - s 0 - b 16077 2164 - e 16084 2171 - l 16077 2164 16084 2171 100 -} -a { - s 0 - b 14169 2164 - e 14176 2171 - l 14169 2164 14176 2171 100 -} -a { - s 0 - b 9700 2177 - e 9707 2184 - l 9700 2177 9707 2184 100 -} -a { - s 0 - b 14927 2178 - e 14934 2185 - l 14927 2178 14934 2185 100 -} -a { - s 0 - b 11243 2179 - e 11250 2186 - l 11243 2179 11250 2186 100 -} -a { - s 0 - b 8480 2182 - e 8487 2189 - l 8480 2182 8487 2189 100 -} -a { - s 0 - b 8368 2182 - e 8375 2189 - l 8368 2182 8375 2189 100 -} -a { - s 0 - b 8661 2184 - e 8668 2191 - l 8661 2184 8668 2191 100 -} -a { - s 0 - b 9455 2186 - e 9462 2193 - l 9455 2186 9462 2193 100 -} -a { - s 0 - b 6726 2186 - e 6733 2193 - l 6726 2186 6733 2193 100 -} -a { - s 0 - b 9707 2193 - e 9714 2200 - l 9707 2193 9714 2200 100 -} -a { - s 0 - b 14497 2199 - e 14504 2206 - l 14497 2199 14504 2206 100 -} -a { - s 0 - b 5045 2199 - e 5052 2206 - l 5045 2199 5052 2206 100 -} -a { - s 0 - b 17352 2200 - e 17359 2207 - l 17352 2200 17359 2207 100 -} -a { - s 0 - b 14498 2200 - e 14505 2207 - l 14498 2200 14505 2207 100 -} -a { - s 0 - b 17353 2201 - e 17360 2208 - l 17353 2201 17360 2208 100 -} -a { - s 0 - b 17509 2202 - e 17516 2209 - l 17509 2202 17516 2209 100 -} -a { - s 0 - b 11055 2202 - e 11062 2209 - l 11055 2202 11062 2209 100 -} -a { - s 0 - b 17510 2203 - e 17517 2210 - l 17510 2203 17517 2210 100 -} -a { - s 0 - b 12238 2203 - e 12245 2210 - l 12238 2203 12245 2210 100 -} -a { - s 0 - b 11056 2203 - e 11063 2210 - l 11056 2203 11063 2210 100 -} -a { - s 0 - b 11057 2204 - e 11064 2211 - l 11057 2204 11064 2211 100 -} -a { - s 0 - b 11447 2209 - e 11454 2216 - l 11447 2209 11454 2216 100 -} -a { - s 0 - b 11448 2210 - e 11455 2217 - l 11448 2210 11455 2217 100 -} -a { - s 0 - b 4422 2210 - e 4429 2217 - l 4422 2210 4429 2217 100 -} -a { - s 0 - b 6255 2211 - e 6262 2218 - l 6255 2211 6262 2218 100 -} -a { - s 0 - b 6256 2212 - e 6263 2219 - l 6256 2212 6263 2219 100 -} -a { - s 0 - b 6257 2213 - e 6264 2220 - l 
6257 2213 6264 2220 100 -} -a { - s 0 - b 6345 2215 - e 6352 2222 - l 6345 2215 6352 2222 100 -} -a { - s 0 - b 17400 2237 - e 17407 2244 - l 17400 2237 17407 2244 100 -} -a { - s 0 - b 9387 2238 - e 9394 2245 - l 9387 2238 9394 2245 100 -} -a { - s 0 - b 8810 2250 - e 8817 2257 - l 8810 2250 8817 2257 100 -} -a { - s 0 - b 18440 2264 - e 18447 2271 - l 18440 2264 18447 2271 100 -} -a { - s 0 - b 11048 2268 - e 11055 2275 - l 11048 2268 11055 2275 100 -} -a { - s 0 - b 8865 2273 - e 8872 2280 - l 8865 2273 8872 2280 100 -} -a { - s 0 - b 15249 2275 - e 15256 2282 - l 15249 2275 15256 2282 100 -} -a { - s 0 - b 16026 2276 - e 16033 2283 - l 16026 2276 16033 2283 100 -} -a { - s 0 - b 8442 2303 - e 8449 2310 - l 8442 2303 8449 2310 100 -} -a { - s 0 - b 3357 2304 - e 3364 2311 - l 3357 2304 3364 2311 100 -} -a { - s 0 - b 3358 2305 - e 3365 2312 - l 3358 2305 3365 2312 100 -} -a { - s 0 - b 10836 2308 - e 10843 2315 - l 10836 2308 10843 2315 100 -} -a { - s 0 - b 10837 2309 - e 10844 2316 - l 10837 2309 10844 2316 100 -} -a { - s 0 - b 14814 2310 - e 14821 2317 - l 14814 2310 14821 2317 100 -} -a { - s 0 - b 3627 2310 - e 3634 2317 - l 3627 2310 3634 2317 100 -} -a { - s 0 - b 14815 2311 - e 14822 2318 - l 14815 2311 14822 2318 100 -} -a { - s 0 - b 4091 2312 - e 4098 2319 - l 4091 2312 4098 2319 100 -} -a { - s 0 - b 4092 2313 - e 4099 2320 - l 4092 2313 4099 2320 100 -} -a { - s 0 - b 8043 2316 - e 8050 2323 - l 8043 2316 8050 2323 100 -} -a { - s 0 - b 4921 2332 - e 4928 2339 - l 4921 2332 4928 2339 100 -} -a { - s 0 - b 11265 2343 - e 11272 2350 - l 11265 2343 11272 2350 100 -} -a { - s 0 - b 6354 2356 - e 6361 2363 - l 6354 2356 6361 2363 100 -} -a { - s 0 - b 8440 2357 - e 8447 2364 - l 8440 2357 8447 2364 100 -} -a { - s 0 - b 6355 2357 - e 6362 2364 - l 6355 2357 6362 2364 100 -} -a { - s 0 - b 2968 2357 - e 2975 2364 - l 2968 2357 2975 2364 100 -} -a { - s 0 - b 8441 2358 - e 8448 2365 - l 8441 2358 8448 2365 100 -} -a { - s 0 - b 3024 2361 - e 3031 2368 - l 
3024 2361 3031 2368 100 -} -a { - s 0 - b 5105 2363 - e 5112 2370 - l 5105 2363 5112 2370 100 -} -a { - s 0 - b 17100 2364 - e 17107 2371 - l 17100 2364 17107 2371 100 -} -a { - s 0 - b 5106 2364 - e 5113 2371 - l 5106 2364 5113 2371 100 -} -a { - s 0 - b 18728 2382 - e 18735 2389 - l 18728 2382 18735 2389 100 -} -a { - s 0 - b 16100 2394 - e 16107 2401 - l 16100 2394 16107 2401 100 -} -a { - s 0 - b 16101 2395 - e 16108 2402 - l 16101 2395 16108 2402 100 -} -a { - s 0 - b 8750 2397 - e 8757 2404 - l 8750 2397 8757 2404 100 -} -a { - s 0 - b 8751 2398 - e 8758 2405 - l 8751 2398 8758 2405 100 -} -a { - s 0 - b 8752 2399 - e 8759 2406 - l 8752 2399 8759 2406 100 -} -a { - s 0 - b 8912 2415 - e 8919 2422 - l 8912 2415 8919 2422 100 -} -a { - s 0 - b 3293 2431 - e 3300 2438 - l 3293 2431 3300 2438 100 -} -a { - s 0 - b 3289 2432 - e 3296 2439 - l 3289 2432 3296 2439 100 -} -a { - s 0 - b 18749 2440 - e 18756 2447 - l 18749 2440 18756 2447 100 -} -a { - s 0 - b 18750 2441 - e 18757 2448 - l 18750 2441 18757 2448 100 -} -a { - s 0 - b 5097 2441 - e 5104 2448 - l 5097 2441 5104 2448 100 -} -a { - s 0 - b 5098 2442 - e 5105 2449 - l 5098 2442 5105 2449 100 -} -a { - s 0 - b 3912 2442 - e 3919 2449 - l 3912 2442 3919 2449 100 -} -a { - s 0 - b 14718 2463 - e 14725 2470 - l 14718 2463 14725 2470 100 -} -a { - s 0 - b 7737 2463 - e 7744 2470 - l 7737 2463 7744 2470 100 -} -a { - s 0 - b 9316 2466 - e 9323 2473 - l 9316 2466 9323 2473 100 -} -a { - s 0 - b 8486 2469 - e 8493 2476 - l 8486 2469 8493 2476 100 -} -a { - s 0 - b 13474 2473 - e 13481 2480 - l 13474 2473 13481 2480 100 -} -a { - s 0 - b 5184 2479 - e 5191 2486 - l 5184 2479 5191 2486 100 -} -a { - s 0 - b 5185 2480 - e 5192 2487 - l 5185 2480 5192 2487 100 -} -a { - s 0 - b 8347 2509 - e 8354 2516 - l 8347 2509 8354 2516 100 -} -a { - s 0 - b 11220 2510 - e 11227 2517 - l 11220 2510 11227 2517 100 -} -a { - s 0 - b 8580 2521 - e 8587 2528 - l 8580 2521 8587 2528 100 -} -a { - s 0 - b 6546 2527 - e 6553 2534 - l 
6546 2527 6553 2534 100 -} -a { - s 0 - b 6547 2528 - e 6554 2535 - l 6547 2528 6554 2535 100 -} -a { - s 0 - b 11301 2530 - e 11308 2537 - l 11301 2530 11308 2537 100 -} -a { - s 0 - b 9408 2531 - e 9415 2538 - l 9408 2531 9415 2538 100 -} -a { - s 0 - b 9409 2532 - e 9416 2539 - l 9409 2532 9416 2539 100 -} -a { - s 0 - b 6584 2532 - e 6591 2539 - l 6584 2532 6591 2539 100 -} -a { - s 0 - b 6585 2533 - e 6592 2540 - l 6585 2533 6592 2540 100 -} -a { - s 0 - b 6600 2534 - e 6607 2541 - l 6600 2534 6607 2541 100 -} -a { - s 0 - b 6586 2534 - e 6593 2541 - l 6586 2534 6593 2541 100 -} -a { - s 0 - b 10944 2538 - e 10951 2545 - l 10944 2538 10951 2545 100 -} -a { - s 0 - b 8447 2542 - e 8454 2549 - l 8447 2542 8454 2549 100 -} -a { - s 0 - b 11287 2594 - e 11294 2601 - l 11287 2594 11294 2601 100 -} -a { - s 0 - b 17152 2598 - e 17159 2605 - l 17152 2598 17159 2605 100 -} -a { - s 0 - b 17153 2599 - e 17160 2606 - l 17153 2599 17160 2606 100 -} -a { - s 0 - b 11337 2603 - e 11344 2610 - l 11337 2603 11344 2610 100 -} -a { - s 0 - b 11338 2604 - e 11345 2611 - l 11338 2604 11345 2611 100 -} -a { - s 0 - b 11339 2605 - e 11346 2612 - l 11339 2605 11346 2612 100 -} -a { - s 0 - b 11340 2606 - e 11347 2613 - l 11340 2606 11347 2613 100 -} -a { - s 0 - b 11341 2607 - e 11348 2614 - l 11341 2607 11348 2614 100 -} -a { - s 0 - b 11342 2608 - e 11349 2615 - l 11342 2608 11349 2615 100 -} -a { - s 0 - b 4481 2610 - e 4488 2617 - l 4481 2610 4488 2617 100 -} -a { - s 0 - b 4348 2610 - e 4355 2617 - l 4348 2610 4355 2617 100 -} -a { - s 0 - b 10991 2639 - e 10998 2646 - l 10991 2639 10998 2646 100 -} -a { - s 0 - b 10992 2640 - e 10999 2647 - l 10992 2640 10999 2647 100 -} -a { - s 0 - b 17171 2646 - e 17178 2653 - l 17171 2646 17178 2653 100 -} -a { - s 0 - b 15182 2660 - e 15189 2667 - l 15182 2660 15189 2667 100 -} -a { - s 0 - b 3003 2661 - e 3010 2668 - l 3003 2661 3010 2668 100 -} -a { - s 0 - b 6553 2667 - e 6560 2674 - l 6553 2667 6560 2674 100 -} -a { - s 0 - b 276 
2670 - e 283 2677 - l 276 2670 283 2677 100 -} -a { - s 0 - b 6573 2680 - e 6580 2687 - l 6573 2680 6580 2687 100 -} -a { - s 0 - b 6574 2681 - e 6581 2688 - l 6574 2681 6581 2688 100 -} -a { - s 0 - b 15224 2683 - e 15231 2690 - l 15224 2683 15231 2690 100 -} -a { - s 0 - b 7522 2687 - e 7529 2694 - l 7522 2687 7529 2694 100 -} -a { - s 0 - b 7523 2688 - e 7530 2695 - l 7523 2688 7530 2695 100 -} -a { - s 0 - b 8377 2691 - e 8384 2698 - l 8377 2691 8384 2698 100 -} -a { - s 0 - b 11069 2692 - e 11076 2699 - l 11069 2692 11076 2699 100 -} -a { - s 0 - b 3195 2699 - e 3202 2706 - l 3195 2699 3202 2706 100 -} -a { - s 0 - b 3196 2700 - e 3203 2707 - l 3196 2700 3203 2707 100 -} -a { - s 0 - b 3197 2701 - e 3204 2708 - l 3197 2701 3204 2708 100 -} -a { - s 0 - b 12194 2725 - e 12201 2732 - l 12194 2725 12201 2732 100 -} -a { - s 0 - b 4597 2742 - e 4604 2749 - l 4597 2742 4604 2749 100 -} -a { - s 0 - b 4598 2743 - e 4605 2750 - l 4598 2743 4605 2750 100 -} -a { - s 0 - b 15311 2755 - e 15318 2762 - l 15311 2755 15318 2762 100 -} -a { - s 0 - b 4399 2756 - e 4406 2763 - l 4399 2756 4406 2763 100 -} -a { - s 0 - b 18069 2763 - e 18076 2770 - l 18069 2763 18076 2770 100 -} -a { - s 0 - b 11394 2765 - e 11401 2772 - l 11394 2765 11401 2772 100 -} -a { - s 0 - b 6262 2767 - e 6269 2774 - l 6262 2767 6269 2774 100 -} -a { - s 0 - b 6263 2768 - e 6270 2775 - l 6263 2768 6270 2775 100 -} -a { - s 0 - b 13432 2774 - e 13439 2781 - l 13432 2774 13439 2781 100 -} -a { - s 0 - b 4048 2775 - e 4055 2782 - l 4048 2775 4055 2782 100 -} -a { - s 0 - b 5139 2795 - e 5146 2802 - l 5139 2795 5146 2802 100 -} -a { - s 0 - b 17443 2822 - e 17450 2829 - l 17443 2822 17450 2829 100 -} -a { - s 0 - b 7518 2827 - e 7525 2834 - l 7518 2827 7525 2834 100 -} -a { - s 0 - b 14964 2834 - e 14971 2841 - l 14964 2834 14971 2841 100 -} -a { - s 0 - b 14965 2835 - e 14972 2842 - l 14965 2835 14972 2842 100 -} -a { - s 0 - b 9745 2837 - e 9752 2844 - l 9745 2837 9752 2844 100 -} -a { - s 0 - b 6444 
2839 - e 6451 2846 - l 6444 2839 6451 2846 100 -} -a { - s 0 - b 3874 2839 - e 3881 2846 - l 3874 2839 3881 2846 100 -} -a { - s 0 - b 9730 2841 - e 9737 2848 - l 9730 2841 9737 2848 100 -} -a { - s 0 - b 7759 2842 - e 7766 2849 - l 7759 2842 7766 2849 100 -} -a { - s 0 - b 8086 2843 - e 8093 2850 - l 8086 2843 8093 2850 100 -} -a { - s 0 - b 7760 2843 - e 7767 2850 - l 7760 2843 7767 2850 100 -} -a { - s 0 - b 7761 2844 - e 7768 2851 - l 7761 2844 7768 2851 100 -} -a { - s 0 - b 3664 2845 - e 3671 2852 - l 3664 2845 3671 2852 100 -} -a { - s 0 - b 10949 2856 - e 10956 2863 - l 10949 2856 10956 2863 100 -} -a { - s 0 - b 11124 2861 - e 11131 2868 - l 11124 2861 11131 2868 100 -} -a { - s 0 - b 6247 2876 - e 6254 2883 - l 6247 2876 6254 2883 100 -} -a { - s 0 - b 9732 2881 - e 9739 2888 - l 9732 2881 9739 2888 100 -} -a { - s 0 - b 3541 2889 - e 3548 2896 - l 3541 2889 3548 2896 100 -} -a { - s 0 - b 18574 2898 - e 18581 2905 - l 18574 2898 18581 2905 100 -} -a { - s 0 - b 15113 2914 - e 15120 2921 - l 15113 2914 15120 2921 100 -} -a { - s 0 - b 14139 2921 - e 14146 2928 - l 14139 2921 14146 2928 100 -} -a { - s 0 - b 14065 2923 - e 14072 2930 - l 14065 2923 14072 2930 100 -} -a { - s 0 - b 5660 2924 - e 5667 2931 - l 5660 2924 5667 2931 100 -} -a { - s 0 - b 3871 2924 - e 3878 2931 - l 3871 2924 3878 2931 100 -} -a { - s 0 - b 14925 2926 - e 14932 2933 - l 14925 2926 14932 2933 100 -} -a { - s 0 - b 14926 2927 - e 14933 2934 - l 14926 2927 14933 2934 100 -} -a { - s 0 - b 8848 2927 - e 8855 2934 - l 8848 2927 8855 2934 100 -} -a { - s 0 - b 5073 2930 - e 5080 2937 - l 5073 2930 5080 2937 100 -} -a { - s 0 - b 17437 2937 - e 17444 2944 - l 17437 2937 17444 2944 100 -} -a { - s 0 - b 6280 2937 - e 6287 2944 - l 6280 2937 6287 2944 100 -} -a { - s 0 - b 17438 2938 - e 17445 2945 - l 17438 2938 17445 2945 100 -} -a { - s 0 - b 17439 2939 - e 17446 2946 - l 17439 2939 17446 2946 100 -} -a { - s 0 - b 15265 2940 - e 15272 2947 - l 15265 2940 15272 2947 100 -} -a { - s 0 
- b 8997 2949 - e 9004 2956 - l 8997 2949 9004 2956 100 -} -a { - s 0 - b 5658 2956 - e 5665 2963 - l 5658 2956 5665 2963 100 -} -a { - s 0 - b 5659 2957 - e 5666 2964 - l 5659 2957 5666 2964 100 -} -a { - s 0 - b 14066 2958 - e 14073 2965 - l 14066 2958 14073 2965 100 -} -a { - s 0 - b 34 2963 - e 41 2970 - l 34 2963 41 2970 100 -} -a { - s 0 - b 11068 2997 - e 11075 3004 - l 11068 2997 11075 3004 100 -} -a { - s 0 - b 4046 3002 - e 4053 3009 - l 4046 3002 4053 3009 100 -} -a { - s 0 - b 15358 3006 - e 15365 3013 - l 15358 3006 15365 3013 100 -} -a { - s 0 - b 18058 3018 - e 18065 3025 - l 18058 3018 18065 3025 100 -} -a { - s 0 - b 5088 3023 - e 5095 3030 - l 5088 3023 5095 3030 100 -} -a { - s 0 - b 15940 3025 - e 15947 3032 - l 15940 3025 15947 3032 100 -} -a { - s 0 - b 9034 3037 - e 9041 3044 - l 9034 3037 9041 3044 100 -} -a { - s 0 - b 14139 3044 - e 14146 3051 - l 14139 3044 14146 3051 100 -} -a { - s 0 - b 16047 3045 - e 16054 3052 - l 16047 3045 16054 3052 100 -} -a { - s 0 - b 18622 3046 - e 18629 3053 - l 18622 3046 18629 3053 100 -} -a { - s 0 - b 7451 3047 - e 7458 3054 - l 7451 3047 7458 3054 100 -} -a { - s 0 - b 6403 3048 - e 6410 3055 - l 6403 3048 6410 3055 100 -} -a { - s 0 - b 3906 3048 - e 3913 3055 - l 3906 3048 3913 3055 100 -} -a { - s 0 - b 14372 3049 - e 14379 3056 - l 14372 3049 14379 3056 100 -} -a { - s 0 - b 3907 3049 - e 3914 3056 - l 3907 3049 3914 3056 100 -} -a { - s 0 - b 18643 3051 - e 18650 3058 - l 18643 3051 18650 3058 100 -} -a { - s 0 - b 3029 3058 - e 3036 3065 - l 3029 3058 3036 3065 100 -} -a { - s 0 - b 18714 3090 - e 18721 3097 - l 18714 3090 18721 3097 100 -} -a { - s 0 - b 17394 3090 - e 17401 3097 - l 17394 3090 17401 3097 100 -} -a { - s 0 - b 17197 3094 - e 17204 3101 - l 17197 3094 17204 3101 100 -} -a { - s 0 - b 17198 3095 - e 17205 3102 - l 17198 3095 17205 3102 100 -} -a { - s 0 - b 17199 3096 - e 17206 3103 - l 17199 3096 17206 3103 100 -} -a { - s 0 - b 17200 3097 - e 17207 3104 - l 17200 3097 17207 3104 
100 -} -a { - s 0 - b 9415 3114 - e 9422 3121 - l 9415 3114 9422 3121 100 -} -a { - s 0 - b 9416 3115 - e 9423 3122 - l 9416 3115 9423 3122 100 -} -a { - s 0 - b 18017 3124 - e 18024 3131 - l 18017 3124 18024 3131 100 -} -a { - s 0 - b 18018 3125 - e 18025 3132 - l 18018 3125 18025 3132 100 -} -a { - s 0 - b 6595 3143 - e 6602 3150 - l 6595 3143 6602 3150 100 -} -a { - s 0 - b 6596 3144 - e 6603 3151 - l 6596 3144 6603 3151 100 -} -a { - s 0 - b 6243 3144 - e 6250 3151 - l 6243 3144 6250 3151 100 -} -a { - s 0 - b 6597 3145 - e 6604 3152 - l 6597 3145 6604 3152 100 -} -a { - s 0 - b 6740 3148 - e 6747 3155 - l 6740 3148 6747 3155 100 -} -a { - s 0 - b 7456 3190 - e 7463 3197 - l 7456 3190 7463 3197 100 -} -a { - s 0 - b 7457 3191 - e 7464 3198 - l 7457 3191 7464 3198 100 -} -a { - s 0 - b 7458 3192 - e 7465 3199 - l 7458 3192 7465 3199 100 -} -a { - s 0 - b 9608 3220 - e 9615 3227 - l 9608 3220 9615 3227 100 -} -a { - s 0 - b 14578 3233 - e 14585 3240 - l 14578 3233 14585 3240 100 -} -a { - s 0 - b 13887 3263 - e 13894 3270 - l 13887 3263 13894 3270 100 -} -a { - s 0 - b 8089 3303 - e 8096 3310 - l 8089 3303 8096 3310 100 -} -a { - s 0 - b 14063 3307 - e 14070 3314 - l 14063 3307 14070 3314 100 -} -a { - s 0 - b 8267 3307 - e 8274 3314 - l 8267 3307 8274 3314 100 -} -a { - s 0 - b 14064 3308 - e 14071 3315 - l 14064 3308 14071 3315 100 -} -a { - s 0 - b 14065 3309 - e 14072 3316 - l 14065 3309 14072 3316 100 -} -a { - s 0 - b 11468 3517 - e 11475 3524 - l 11468 3517 11475 3524 100 -} -a { - s 0 - b 7771 3517 - e 7778 3524 - l 7771 3517 7778 3524 100 -} -a { - s 0 - b 14284 3520 - e 14291 3527 - l 14284 3520 14291 3527 100 -} -a { - s 0 - b 5596 3521 - e 5603 3528 - l 5596 3521 5603 3528 100 -} -a { - s 0 - b 4407 3521 - e 4414 3528 - l 4407 3521 4414 3528 100 -} -a { - s 0 - b 5597 3522 - e 5604 3529 - l 5597 3522 5604 3529 100 -} -a { - s 0 - b 4408 3522 - e 4415 3529 - l 4408 3522 4415 3529 100 -} -a { - s 0 - b 9337 3525 - e 9344 3532 - l 9337 3525 9344 3532 100 
-} -a { - s 0 - b 17500 3529 - e 17507 3536 - l 17500 3529 17507 3536 100 -} -a { - s 0 - b 14222 3536 - e 14229 3543 - l 14222 3536 14229 3543 100 -} -a { - s 0 - b 14760 3537 - e 14767 3544 - l 14760 3537 14767 3544 100 -} -a { - s 0 - b 14761 3538 - e 14768 3545 - l 14761 3538 14768 3545 100 -} -a { - s 0 - b 3700 3543 - e 3707 3550 - l 3700 3543 3707 3550 100 -} -a { - s 0 - b 6513 3568 - e 6520 3575 - l 6513 3568 6520 3575 100 -} -a { - s 0 - b 11401 3578 - e 11408 3585 - l 11401 3578 11408 3585 100 -} -a { - s 0 - b 11402 3579 - e 11409 3586 - l 11402 3579 11409 3586 100 -} -a { - s 0 - b 11403 3580 - e 11410 3587 - l 11403 3580 11410 3587 100 -} -a { - s 0 - b 14505 3595 - e 14512 3602 - l 14505 3595 14512 3602 100 -} -a { - s 0 - b 6246 3597 - e 6253 3604 - l 6246 3597 6253 3604 100 -} -a { - s 0 - b 6247 3598 - e 6254 3605 - l 6247 3598 6254 3605 100 -} -a { - s 0 - b 6248 3599 - e 6255 3606 - l 6248 3599 6255 3606 100 -} -a { - s 0 - b 18738 3601 - e 18745 3608 - l 18738 3601 18745 3608 100 -} -a { - s 0 - b 14452 3601 - e 14459 3608 - l 14452 3601 14459 3608 100 -} -a { - s 0 - b 14155 3614 - e 14162 3621 - l 14155 3614 14162 3621 100 -} -a { - s 0 - b 18078 3618 - e 18085 3625 - l 18078 3618 18085 3625 100 -} -a { - s 0 - b 3130 3621 - e 3137 3628 - l 3130 3621 3137 3628 100 -} -a { - s 0 - b 3131 3622 - e 3138 3629 - l 3131 3622 3138 3629 100 -} -a { - s 0 - b 14595 3626 - e 14602 3633 - l 14595 3626 14602 3633 100 -} -a { - s 0 - b 14067 3627 - e 14074 3634 - l 14067 3627 14074 3634 100 -} -a { - s 0 - b 14068 3628 - e 14075 3635 - l 14068 3628 14075 3635 100 -} -a { - s 0 - b 18742 3632 - e 18749 3639 - l 18742 3632 18749 3639 100 -} -a { - s 0 - b 18743 3633 - e 18750 3640 - l 18743 3633 18750 3640 100 -} -a { - s 0 - b 18744 3634 - e 18751 3641 - l 18744 3634 18751 3641 100 -} -a { - s 0 - b 18599 3642 - e 18606 3649 - l 18599 3642 18606 3649 100 -} -a { - s 0 - b 8553 3642 - e 8560 3649 - l 8553 3642 8560 3649 100 -} -a { - s 0 - b 8501 3642 - e 
8508 3649 - l 8501 3642 8508 3649 100 -} -a { - s 0 - b 18600 3643 - e 18607 3650 - l 18600 3643 18607 3650 100 -} -a { - s 0 - b 8554 3643 - e 8561 3650 - l 8554 3643 8561 3650 100 -} -a { - s 0 - b 18601 3644 - e 18608 3651 - l 18601 3644 18608 3651 100 -} -a { - s 0 - b 17583 3646 - e 17590 3653 - l 17583 3646 17590 3653 100 -} -a { - s 0 - b 17584 3647 - e 17591 3654 - l 17584 3647 17591 3654 100 -} -a { - s 0 - b 17585 3648 - e 17592 3655 - l 17585 3648 17592 3655 100 -} -a { - s 0 - b 14060 3651 - e 14067 3658 - l 14060 3651 14067 3658 100 -} -a { - s 0 - b 14061 3652 - e 14068 3659 - l 14061 3652 14068 3659 100 -} -a { - s 0 - b 8266 3652 - e 8273 3659 - l 8266 3652 8273 3659 100 -} -a { - s 0 - b 14063 3653 - e 14070 3660 - l 14063 3653 14070 3660 100 -} -a { - s 0 - b 8267 3653 - e 8274 3660 - l 8267 3653 8274 3660 100 -} -a { - s 0 - b 16877 3659 - e 16884 3666 - l 16877 3659 16884 3666 100 -} -a { - s 0 - b 8727 3662 - e 8734 3669 - l 8727 3662 8734 3669 100 -} -a { - s 0 - b 6717 3689 - e 6724 3696 - l 6717 3689 6724 3696 100 -} -a { - s 0 - b 14801 3709 - e 14808 3716 - l 14801 3709 14808 3716 100 -} -a { - s 0 - b 14802 3710 - e 14809 3717 - l 14802 3710 14809 3717 100 -} -a { - s 0 - b 8771 3713 - e 8778 3720 - l 8771 3713 8778 3720 100 -} -a { - s 0 - b 11558 3717 - e 11565 3724 - l 11558 3717 11565 3724 100 -} -a { - s 0 - b 14026 3739 - e 14033 3746 - l 14026 3739 14033 3746 100 -} -a { - s 0 - b 14027 3740 - e 14034 3747 - l 14027 3740 14034 3747 100 -} -a { - s 0 - b 3238 3740 - e 3245 3747 - l 3238 3740 3245 3747 100 -} -a { - s 0 - b 3239 3741 - e 3246 3748 - l 3239 3741 3246 3748 100 -} -a { - s 0 - b 8053 3742 - e 8060 3749 - l 8053 3742 8060 3749 100 -} -a { - s 0 - b 3240 3742 - e 3247 3749 - l 3240 3742 3247 3749 100 -} -a { - s 0 - b 8054 3743 - e 8061 3750 - l 8054 3743 8061 3750 100 -} -a { - s 0 - b 14936 3766 - e 14943 3773 - l 14936 3766 14943 3773 100 -} -a { - s 0 - b 14937 3767 - e 14944 3774 - l 14937 3767 14944 3774 100 -} -a { 
- s 0 - b 6297 3768 - e 6304 3775 - l 6297 3768 6304 3775 100 -} -a { - s 0 - b 17478 3769 - e 17485 3776 - l 17478 3769 17485 3776 100 -} -a { - s 0 - b 11132 3794 - e 11139 3801 - l 11132 3794 11139 3801 100 -} -a { - s 0 - b 3120 3796 - e 3127 3803 - l 3120 3796 3127 3803 100 -} -a { - s 0 - b 9443 3811 - e 9450 3818 - l 9443 3811 9450 3818 100 -} -a { - s 0 - b 11343 3814 - e 11350 3821 - l 11343 3814 11350 3821 100 -} -a { - s 0 - b 11344 3815 - e 11351 3822 - l 11344 3815 11351 3822 100 -} -a { - s 0 - b 11345 3816 - e 11352 3823 - l 11345 3816 11352 3823 100 -} -a { - s 0 - b 9699 3830 - e 9706 3837 - l 9699 3830 9706 3837 100 -} -a { - s 0 - b 9700 3831 - e 9707 3838 - l 9700 3831 9707 3838 100 -} -a { - s 0 - b 14927 3832 - e 14934 3839 - l 14927 3832 14934 3839 100 -} -a { - s 0 - b 14928 3833 - e 14935 3840 - l 14928 3833 14935 3840 100 -} -a { - s 0 - b 14929 3834 - e 14936 3841 - l 14929 3834 14936 3841 100 -} -a { - s 0 - b 12243 3835 - e 12250 3842 - l 12243 3835 12250 3842 100 -} -a { - s 0 - b 13888 3839 - e 13895 3846 - l 13888 3839 13895 3846 100 -} -a { - s 0 - b 9366 3845 - e 9373 3852 - l 9366 3845 9373 3852 100 -} -a { - s 0 - b 5700 3854 - e 5707 3861 - l 5700 3854 5707 3861 100 -} -a { - s 0 - b 3605 3860 - e 3612 3867 - l 3605 3860 3612 3867 100 -} -a { - s 0 - b 5121 3866 - e 5128 3873 - l 5121 3866 5128 3873 100 -} -a { - s 0 - b 5122 3867 - e 5129 3874 - l 5122 3867 5129 3874 100 -} -a { - s 0 - b 8892 3880 - e 8899 3887 - l 8892 3880 8899 3887 100 -} -a { - s 0 - b 8893 3881 - e 8900 3888 - l 8893 3881 8900 3888 100 -} -a { - s 0 - b 18043 3906 - e 18050 3913 - l 18043 3906 18050 3913 100 -} -a { - s 0 - b 18044 3907 - e 18051 3914 - l 18044 3907 18051 3914 100 -} -a { - s 0 - b 14127 3911 - e 14134 3918 - l 14127 3911 14134 3918 100 -} -a { - s 0 - b 8074 4604 - e 8081 4611 - l 8074 4604 8081 4611 100 -} -a { - s 0 - b 5128 4609 - e 5135 4616 - l 5128 4609 5135 4616 100 -} -a { - s 0 - b 18680 4632 - e 18687 4639 - l 18680 4632 18687 
4639 100 -} -a { - s 0 - b 8133 4642 - e 8140 4649 - l 8133 4642 8140 4649 100 -} -a { - s 0 - b 16081 4648 - e 16088 4655 - l 16081 4648 16088 4655 100 -} -a { - s 0 - b 3266 4661 - e 3273 4668 - l 3266 4661 3273 4668 100 -} -a { - s 0 - b 3267 4662 - e 3274 4669 - l 3267 4662 3274 4669 100 -} -a { - s 0 - b 15347 4674 - e 15354 4681 - l 15347 4674 15354 4681 100 -} -a { - s 0 - b 15311 4686 - e 15318 4693 - l 15311 4686 15318 4693 100 -} -a { - s 0 - b 9661 4704 - e 9668 4711 - l 9661 4704 9668 4711 100 -} -a { - s 0 - b 8734 4708 - e 8741 4715 - l 8734 4708 8741 4715 100 -} -a { - s 0 - b 8735 4709 - e 8742 4716 - l 8735 4709 8742 4716 100 -} -a { - s 0 - b 16094 4713 - e 16101 4720 - l 16094 4713 16101 4720 100 -} -a { - s 0 - b 7620 4719 - e 7627 4726 - l 7620 4719 7627 4726 100 -} -a { - s 0 - b 7621 4720 - e 7628 4727 - l 7621 4720 7628 4727 100 -} -a { - s 0 - b 8666 4827 - e 8673 4834 - l 8666 4827 8673 4834 100 -} -a { - s 0 - b 8667 4828 - e 8674 4835 - l 8667 4828 8674 4835 100 -} -a { - s 0 - b 18663 4831 - e 18670 4838 - l 18663 4831 18670 4838 100 -} -a { - s 0 - b 17160 4846 - e 17167 4853 - l 17160 4846 17167 4853 100 -} -a { - s 0 - b 18479 4851 - e 18486 4858 - l 18479 4851 18486 4858 100 -} -a { - s 0 - b 7406 4859 - e 7413 4866 - l 7406 4859 7413 4866 100 -} -a { - s 0 - b 13906 4873 - e 13913 4880 - l 13906 4873 13913 4880 100 -} -a { - s 0 - b 6432 4874 - e 6439 4881 - l 6432 4874 6439 4881 100 -} -a { - s 0 - b 18778 4885 - e 18785 4892 - l 18778 4885 18785 4892 100 -} -a { - s 0 - b 14308 4885 - e 14315 4892 - l 14308 4885 14315 4892 100 -} -a { - s 0 - b 13941 4894 - e 13948 4901 - l 13941 4894 13948 4901 100 -} -a { - s 0 - b 4992 4908 - e 4999 4915 - l 4992 4908 4999 4915 100 -} -a { - s 0 - b 13887 4910 - e 13894 4917 - l 13887 4910 13894 4917 100 -} -a { - s 0 - b 13888 4911 - e 13895 4918 - l 13888 4911 13895 4918 100 -} -a { - s 0 - b 6542 4923 - e 6549 4930 - l 6542 4923 6549 4930 100 -} -a { - s 0 - b 14486 4948 - e 14493 4955 - l 
14486 4948 14493 4955 100 -} -a { - s 0 - b 3281 4951 - e 3288 4958 - l 3281 4951 3288 4958 100 -} -a { - s 0 - b 7745 4953 - e 7752 4960 - l 7745 4953 7752 4960 100 -} -a { - s 0 - b 7699 4955 - e 7706 4962 - l 7699 4955 7706 4962 100 -} -a { - s 0 - b 7700 4956 - e 7707 4963 - l 7700 4956 7707 4963 100 -} -a { - s 0 - b 17306 4969 - e 17313 4976 - l 17306 4969 17313 4976 100 -} -a { - s 0 - b 11360 4974 - e 11367 4981 - l 11360 4974 11367 4981 100 -} -a { - s 0 - b 2973 4977 - e 2980 4984 - l 2973 4977 2980 4984 100 -} -a { - s 0 - b 9626 4979 - e 9633 4986 - l 9626 4979 9633 4986 100 -} -a { - s 0 - b 9627 4980 - e 9634 4987 - l 9627 4980 9634 4987 100 -} -a { - s 0 - b 17298 5015 - e 17305 5022 - l 17298 5015 17305 5022 100 -} -a { - s 0 - b 17299 5016 - e 17306 5023 - l 17299 5016 17306 5023 100 -} -a { - s 0 - b 6682 5029 - e 6689 5036 - l 6682 5029 6689 5036 100 -} -a { - s 0 - b 12192 5035 - e 12199 5042 - l 12192 5035 12199 5042 100 -} -a { - s 0 - b 12193 5036 - e 12200 5043 - l 12193 5036 12200 5043 100 -} -a { - s 0 - b 12194 5037 - e 12201 5044 - l 12194 5037 12201 5044 100 -} -a { - s 0 - b 12606 5045 - e 12613 5052 - l 12606 5045 12613 5052 100 -} -a { - s 0 - b 9067 5047 - e 9074 5054 - l 9067 5047 9074 5054 100 -} -a { - s 0 - b 15389 5048 - e 15396 5055 - l 15389 5048 15396 5055 100 -} -a { - s 0 - b 9068 5048 - e 9075 5055 - l 9068 5048 9075 5055 100 -} -a { - s 0 - b 9069 5049 - e 9076 5056 - l 9069 5049 9076 5056 100 -} -a { - s 0 - b 3176 5049 - e 3183 5056 - l 3176 5049 3183 5056 100 -} -a { - s 0 - b 3177 5050 - e 3184 5057 - l 3177 5050 3184 5057 100 -} -a { - s 0 - b 14142 5053 - e 14149 5060 - l 14142 5053 14149 5060 100 -} -a { - s 0 - b 14143 5054 - e 14150 5061 - l 14143 5054 14150 5061 100 -} -a { - s 0 - b 10917 5055 - e 10924 5062 - l 10917 5055 10924 5062 100 -} -a { - s 0 - b 3605 5058 - e 3612 5065 - l 3605 5058 3612 5065 100 -} -a { - s 0 - b 14575 5061 - e 14582 5068 - l 14575 5061 14582 5068 100 -} -a { - s 0 - b 8775 5062 - e 
8782 5069 - l 8775 5062 8782 5069 100 -} -a { - s 0 - b 13408 5064 - e 13415 5071 - l 13408 5064 13415 5071 100 -} -a { - s 0 - b 8709 5064 - e 8716 5071 - l 8709 5064 8716 5071 100 -} -a { - s 0 - b 14220 5069 - e 14227 5076 - l 14220 5069 14227 5076 100 -} -a { - s 0 - b 8104 5071 - e 8111 5078 - l 8104 5071 8111 5078 100 -} -a { - s 0 - b 5044 5090 - e 5051 5097 - l 5044 5090 5051 5097 100 -} -a { - s 0 - b 14497 5091 - e 14504 5098 - l 14497 5091 14504 5098 100 -} -a { - s 0 - b 5045 5091 - e 5052 5098 - l 5045 5091 5052 5098 100 -} -a { - s 0 - b 7474 5092 - e 7481 5099 - l 7474 5092 7481 5099 100 -} -a { - s 0 - b 5046 5092 - e 5053 5099 - l 5046 5092 5053 5099 100 -} -a { - s 0 - b 7475 5093 - e 7482 5100 - l 7475 5093 7482 5100 100 -} -a { - s 0 - b 4245 5094 - e 4252 5101 - l 4245 5094 4252 5101 100 -} -a { - s 0 - b 6225 5119 - e 6232 5126 - l 6225 5119 6232 5126 100 -} -a { - s 0 - b 5173 5119 - e 5180 5126 - l 5173 5119 5180 5126 100 -} -a { - s 0 - b 7517 5131 - e 7524 5138 - l 7517 5131 7524 5138 100 -} -a { - s 0 - b 3659 5131 - e 3666 5138 - l 3659 5131 3666 5138 100 -} -a { - s 0 - b 7518 5132 - e 7525 5139 - l 7518 5132 7525 5139 100 -} -a { - s 0 - b 7519 5133 - e 7526 5140 - l 7519 5133 7526 5140 100 -} -a { - s 0 - b 11011 5141 - e 11018 5148 - l 11011 5141 11018 5148 100 -} -a { - s 0 - b 3400 5148 - e 3407 5155 - l 3400 5148 3407 5155 100 -} -a { - s 0 - b 17164 5171 - e 17171 5178 - l 17164 5171 17171 5178 100 -} -a { - s 0 - b 17165 5172 - e 17172 5179 - l 17165 5172 17172 5179 100 -} -a { - s 0 - b 6805 5178 - e 6812 5185 - l 6805 5178 6812 5185 100 -} -a { - s 0 - b 9650 5181 - e 9657 5188 - l 9650 5181 9657 5188 100 -} -a { - s 0 - b 9651 5182 - e 9658 5189 - l 9651 5182 9658 5189 100 -} -a { - s 0 - b 12170 5191 - e 12177 5198 - l 12170 5191 12177 5198 100 -} -a { - s 0 - b 14690 5198 - e 14697 5205 - l 14690 5198 14697 5205 100 -} -a { - s 0 - b 6623 5201 - e 6630 5208 - l 6623 5201 6630 5208 100 -} -a { - s 0 - b 13465 5211 - e 13472 
5218 - l 13465 5211 13472 5218 100 -} -a { - s 0 - b 13907 5216 - e 13914 5223 - l 13907 5216 13914 5223 100 -} -a { - s 0 - b 8411 5243 - e 8418 5250 - l 8411 5243 8418 5250 100 -} -a { - s 0 - b 8412 5244 - e 8419 5251 - l 8412 5244 8419 5251 100 -} -a { - s 0 - b 5693 5245 - e 5700 5252 - l 5693 5245 5700 5252 100 -} -a { - s 0 - b 14149 5253 - e 14156 5260 - l 14149 5253 14156 5260 100 -} -a { - s 0 - b 17414 5254 - e 17421 5261 - l 17414 5254 17421 5261 100 -} -a { - s 0 - b 14304 5254 - e 14311 5261 - l 14304 5254 14311 5261 100 -} -a { - s 0 - b 7439 5254 - e 7446 5261 - l 7439 5254 7446 5261 100 -} -a { - s 0 - b 18077 5256 - e 18084 5263 - l 18077 5256 18084 5263 100 -} -a { - s 0 - b 3112 5272 - e 3119 5279 - l 3112 5272 3119 5279 100 -} -a { - s 0 - b 3239 5279 - e 3246 5286 - l 3239 5279 3246 5286 100 -} -a { - s 0 - b 12134 5280 - e 12141 5287 - l 12134 5280 12141 5287 100 -} -a { - s 0 - b 12135 5281 - e 12142 5288 - l 12135 5281 12142 5288 100 -} -a { - s 0 - b 3704 5284 - e 3711 5291 - l 3704 5284 3711 5291 100 -} -a { - s 0 - b 3705 5285 - e 3712 5292 - l 3705 5285 3712 5292 100 -} -a { - s 0 - b 8929 5342 - e 8936 5349 - l 8929 5342 8936 5349 100 -} -a { - s 0 - b 3131 5348 - e 3138 5355 - l 3131 5348 3138 5355 100 -} -a { - s 0 - b 3132 5349 - e 3139 5356 - l 3132 5349 3139 5356 100 -} -a { - s 0 - b 16046 5351 - e 16053 5358 - l 16046 5351 16053 5358 100 -} -a { - s 0 - b 14140 5352 - e 14147 5359 - l 14140 5352 14147 5359 100 -} -a { - s 0 - b 8061 5367 - e 8068 5374 - l 8061 5367 8068 5374 100 -} -a { - s 0 - b 8265 5375 - e 8272 5382 - l 8265 5375 8272 5382 100 -} -a { - s 0 - b 8268 5377 - e 8275 5384 - l 8268 5377 8275 5384 100 -} -a { - s 0 - b 8269 5378 - e 8276 5385 - l 8269 5378 8276 5385 100 -} -a { - s 0 - b 14022 5379 - e 14029 5386 - l 14022 5379 14029 5386 100 -} -a { - s 0 - b 4499 5379 - e 4506 5386 - l 4499 5379 4506 5386 100 -} -a { - s 0 - b 14023 5380 - e 14030 5387 - l 14023 5380 14030 5387 100 -} -a { - s 0 - b 14024 5381 - 
e 14031 5388 - l 14024 5381 14031 5388 100 -} -a { - s 0 - b 14827 5382 - e 14834 5389 - l 14827 5382 14834 5389 100 -} -a { - s 0 - b 11062 5382 - e 11069 5389 - l 11062 5382 11069 5389 100 -} -a { - s 0 - b 4530 5382 - e 4537 5389 - l 4530 5382 4537 5389 100 -} -a { - s 0 - b 14735 5394 - e 14742 5401 - l 14735 5394 14742 5401 100 -} -a { - s 0 - b 14571 5428 - e 14578 5435 - l 14571 5428 14578 5435 100 -} -a { - s 0 - b 18421 5431 - e 18428 5438 - l 18421 5431 18428 5438 100 -} -a { - s 0 - b 18736 5435 - e 18743 5442 - l 18736 5435 18743 5442 100 -} -a { - s 0 - b 18737 5436 - e 18744 5443 - l 18737 5436 18744 5443 100 -} -a { - s 0 - b 14451 5436 - e 14458 5443 - l 14451 5436 14458 5443 100 -} -a { - s 0 - b 8894 5446 - e 8901 5453 - l 8894 5446 8901 5453 100 -} -a { - s 0 - b 9060 5447 - e 9067 5454 - l 9060 5447 9067 5454 100 -} -a { - s 0 - b 5145 5456 - e 5152 5463 - l 5145 5456 5152 5463 100 -} -a { - s 0 - b 8555 5459 - e 8562 5466 - l 8555 5459 8562 5466 100 -} -a { - s 0 - b 17305 5470 - e 17312 5477 - l 17305 5470 17312 5477 100 -} -a { - s 0 - b 17306 5471 - e 17313 5478 - l 17306 5471 17313 5478 100 -} -a { - s 0 - b 7463 5473 - e 7470 5480 - l 7463 5473 7470 5480 100 -} -a { - s 0 - b 7464 5474 - e 7471 5481 - l 7464 5474 7471 5481 100 -} -a { - s 0 - b 4608 5480 - e 4615 5487 - l 4608 5480 4615 5487 100 -} -a { - s 0 - b 9389 5496 - e 9396 5503 - l 9389 5496 9396 5503 100 -} -a { - s 0 - b 9390 5497 - e 9397 5504 - l 9390 5497 9397 5504 100 -} -a { - s 0 - b 17977 5510 - e 17984 5517 - l 17977 5510 17984 5517 100 -} -a { - s 0 - b 3372 5548 - e 3379 5555 - l 3372 5548 3379 5555 100 -} -a { - s 0 - b 4878 5574 - e 4885 5581 - l 4878 5574 4885 5581 100 -} -a { - s 0 - b 8585 5584 - e 8592 5591 - l 8585 5584 8592 5591 100 -} -a { - s 0 - b 8586 5585 - e 8593 5592 - l 8586 5585 8593 5592 100 -} -a { - s 0 - b 8102 5586 - e 8109 5593 - l 8102 5586 8109 5593 100 -} -a { - s 0 - b 8103 5587 - e 8110 5594 - l 8103 5587 8110 5594 100 -} -a { - s 0 - b 8104 
5588 - e 8111 5595 - l 8104 5588 8111 5595 100 -} -a { - s 0 - b 8105 5589 - e 8112 5596 - l 8105 5589 8112 5596 100 -} -a { - s 0 - b 14368 5590 - e 14375 5597 - l 14368 5590 14375 5597 100 -} -a { - s 0 - b 8106 5590 - e 8113 5597 - l 8106 5590 8113 5597 100 -} -a { - s 0 - b 11477 5591 - e 11484 5598 - l 11477 5591 11484 5598 100 -} -a { - s 0 - b 9036 5596 - e 9043 5603 - l 9036 5596 9043 5603 100 -} -a { - s 0 - b 14885 5601 - e 14892 5608 - l 14885 5601 14892 5608 100 -} -a { - s 0 - b 6607 5601 - e 6614 5608 - l 6607 5601 6614 5608 100 -} -a { - s 0 - b 4046 5611 - e 4053 5618 - l 4046 5611 4053 5618 100 -} -a { - s 0 - b 4923 5612 - e 4930 5619 - l 4923 5612 4930 5619 100 -} -a { - s 0 - b 4047 5612 - e 4054 5619 - l 4047 5612 4054 5619 100 -} -a { - s 0 - b 4048 5613 - e 4055 5620 - l 4048 5613 4055 5620 100 -} -a { - s 0 - b 4598 5627 - e 4605 5634 - l 4598 5627 4605 5634 100 -} -a { - s 0 - b 14778 5634 - e 14785 5641 - l 14778 5634 14785 5641 100 -} -a { - s 0 - b 7635 5642 - e 7642 5649 - l 7635 5642 7642 5649 100 -} -a { - s 0 - b 7560 5645 - e 7567 5652 - l 7560 5645 7567 5652 100 -} -a { - s 0 - b 11059 5654 - e 11066 5661 - l 11059 5654 11066 5661 100 -} -a { - s 0 - b 4613 5654 - e 4620 5661 - l 4613 5654 4620 5661 100 -} -a { - s 0 - b 9445 5655 - e 9452 5662 - l 9445 5655 9452 5662 100 -} -a { - s 0 - b 4614 5655 - e 4621 5662 - l 4614 5655 4621 5662 100 -} -a { - s 0 - b 4096 5657 - e 4103 5664 - l 4096 5657 4103 5664 100 -} -a { - s 0 - b 13903 5659 - e 13910 5666 - l 13903 5659 13910 5666 100 -} -a { - s 0 - b 16039 5660 - e 16046 5667 - l 16039 5660 16046 5667 100 -} -a { - s 0 - b 16040 5661 - e 16047 5668 - l 16040 5661 16047 5668 100 -} -a { - s 0 - b 16041 5662 - e 16048 5669 - l 16041 5662 16048 5669 100 -} -a { - s 0 - b 15472 5664 - e 15479 5671 - l 15472 5664 15479 5671 100 -} -a { - s 0 - b 4403 5665 - e 4410 5672 - l 4403 5665 4410 5672 100 -} -a { - s 0 - b 4404 5666 - e 4411 5673 - l 4404 5666 4411 5673 100 -} -a { - s 0 - b 4980 
5677 - e 4987 5684 - l 4980 5677 4987 5684 100 -} -a { - s 0 - b 13899 5678 - e 13906 5685 - l 13899 5678 13906 5685 100 -} -a { - s 0 - b 8241 5679 - e 8248 5686 - l 8241 5679 8248 5686 100 -} -a { - s 0 - b 4497 5680 - e 4504 5687 - l 4497 5680 4504 5687 100 -} -a { - s 0 - b 14213 5683 - e 14220 5690 - l 14213 5683 14220 5690 100 -} -a { - s 0 - b 6257 5684 - e 6264 5691 - l 6257 5684 6264 5691 100 -} -a { - s 0 - b 17314 5686 - e 17321 5693 - l 17314 5686 17321 5693 100 -} -a { - s 0 - b 9623 5704 - e 9630 5711 - l 9623 5704 9630 5711 100 -} -a { - s 0 - b 17335 5707 - e 17342 5714 - l 17335 5707 17342 5714 100 -} -a { - s 0 - b 14080 5738 - e 14087 5745 - l 14080 5738 14087 5745 100 -} -a { - s 0 - b 7473 5757 - e 7480 5764 - l 7473 5757 7480 5764 100 -} -a { - s 0 - b 17352 5758 - e 17359 5765 - l 17352 5758 17359 5765 100 -} -a { - s 0 - b 14498 5758 - e 14505 5765 - l 14498 5758 14505 5765 100 -} -a { - s 0 - b 14499 5759 - e 14506 5766 - l 14499 5759 14506 5766 100 -} -a { - s 0 - b 14500 5760 - e 14507 5767 - l 14500 5760 14507 5767 100 -} -a { - s 0 - b 4816 5761 - e 4823 5768 - l 4816 5761 4823 5768 100 -} -a { - s 0 - b 4817 5762 - e 4824 5769 - l 4817 5762 4824 5769 100 -} -a { - s 0 - b 4818 5763 - e 4825 5770 - l 4818 5763 4825 5770 100 -} -a { - s 0 - b 14161 6035 - e 14168 6042 - l 14161 6035 14168 6042 100 -} -a { - s 0 - b 14366 6036 - e 14373 6043 - l 14366 6036 14373 6043 100 -} -a { - s 0 - b 3278 6038 - e 3285 6045 - l 3278 6038 3285 6045 100 -} -a { - s 0 - b 14759 6051 - e 14766 6058 - l 14759 6051 14766 6058 100 -} -a { - s 0 - b 281 6059 - e 288 6066 - l 281 6059 288 6066 100 -} -a { - s 0 - b 7731 6064 - e 7738 6071 - l 7731 6064 7738 6071 100 -} -a { - s 0 - b 7732 6065 - e 7739 6072 - l 7732 6065 7739 6072 100 -} -a { - s 0 - b 9396 6080 - e 9403 6087 - l 9396 6080 9403 6087 100 -} -a { - s 0 - b 9397 6081 - e 9404 6088 - l 9397 6081 9404 6088 100 -} -a { - s 0 - b 6602 6082 - e 6609 6089 - l 6602 6082 6609 6089 100 -} -a { - s 0 - b 
14524 6084 - e 14531 6091 - l 14524 6084 14531 6091 100 -} -a { - s 0 - b 3034 6124 - e 3041 6131 - l 3034 6124 3041 6131 100 -} -a { - s 0 - b 3035 6125 - e 3042 6132 - l 3035 6125 3042 6132 100 -} -a { - s 0 - b 15909 6137 - e 15916 6144 - l 15909 6137 15916 6144 100 -} -a { - s 0 - b 14663 6140 - e 14670 6147 - l 14663 6140 14670 6147 100 -} -a { - s 0 - b 14273 6148 - e 14280 6155 - l 14273 6148 14280 6155 100 -} -a { - s 0 - b 14274 6149 - e 14281 6156 - l 14274 6149 14281 6156 100 -} -a { - s 0 - b 9443 6156 - e 9450 6163 - l 9443 6156 9450 6163 100 -} -a { - s 0 - b 11288 6157 - e 11295 6164 - l 11288 6157 11295 6164 100 -} -a { - s 0 - b 8556 6159 - e 8563 6166 - l 8556 6159 8563 6166 100 -} -a { - s 0 - b 5182 6161 - e 5189 6168 - l 5182 6161 5189 6168 100 -} -a { - s 0 - b 4866 6162 - e 4873 6169 - l 4866 6162 4873 6169 100 -} -a { - s 0 - b 6792 6167 - e 6799 6174 - l 6792 6167 6799 6174 100 -} -a { - s 0 - b 3663 6176 - e 3670 6183 - l 3663 6176 3670 6183 100 -} -a { - s 0 - b 7604 6180 - e 7611 6187 - l 7604 6180 7611 6187 100 -} -a { - s 0 - b 9505 6207 - e 9512 6214 - l 9505 6207 9512 6214 100 -} -a { - s 0 - b 7712 6207 - e 7719 6214 - l 7712 6207 7719 6214 100 -} -a { - s 0 - b 16015 6208 - e 16022 6215 - l 16015 6208 16022 6215 100 -} -a { - s 0 - b 9506 6208 - e 9513 6215 - l 9506 6208 9513 6215 100 -} -a { - s 0 - b 7713 6208 - e 7720 6215 - l 7713 6208 7720 6215 100 -} -a { - s 0 - b 9507 6209 - e 9514 6216 - l 9507 6209 9514 6216 100 -} -a { - s 0 - b 7714 6209 - e 7721 6216 - l 7714 6209 7721 6216 100 -} -a { - s 0 - b 11004 6210 - e 11011 6217 - l 11004 6210 11011 6217 100 -} -a { - s 0 - b 14224 6213 - e 14231 6220 - l 14224 6213 14231 6220 100 -} -a { - s 0 - b 14513 6222 - e 14520 6229 - l 14513 6222 14520 6229 100 -} -a { - s 0 - b 4601 6225 - e 4608 6232 - l 4601 6225 4608 6232 100 -} -a { - s 0 - b 4602 6226 - e 4609 6233 - l 4602 6226 4609 6233 100 -} -a { - s 0 - b 11478 6229 - e 11485 6236 - l 11478 6229 11485 6236 100 -} -a { - s 0 
- b 14931 6232 - e 14938 6239 - l 14931 6232 14938 6239 100 -} -a { - s 0 - b 18502 6252 - e 18509 6259 - l 18502 6252 18509 6259 100 -} -a { - s 0 - b 18503 6253 - e 18510 6260 - l 18503 6253 18510 6260 100 -} -a { - s 0 - b 8468 6257 - e 8475 6264 - l 8468 6257 8475 6264 100 -} -a { - s 0 - b 17544 6278 - e 17551 6285 - l 17544 6278 17551 6285 100 -} -a { - s 0 - b 3406 6278 - e 3413 6285 - l 3406 6278 3413 6285 100 -} -a { - s 0 - b 5068 6287 - e 5075 6294 - l 5068 6287 5075 6294 100 -} -a { - s 0 - b 13439 6290 - e 13446 6297 - l 13439 6290 13446 6297 100 -} -a { - s 0 - b 5034 6292 - e 5041 6299 - l 5034 6292 5041 6299 100 -} -a { - s 0 - b 18554 6328 - e 18561 6335 - l 18554 6328 18561 6335 100 -} -a { - s 0 - b 6846 6328 - e 6853 6335 - l 6846 6328 6853 6335 100 -} -a { - s 0 - b 6598 6346 - e 6605 6353 - l 6598 6346 6605 6353 100 -} -a { - s 0 - b 15908 6361 - e 15915 6368 - l 15908 6361 15915 6368 100 -} -a { - s 0 - b 10867 6380 - e 10874 6387 - l 10867 6380 10874 6387 100 -} -a { - s 0 - b 7596 6416 - e 7603 6423 - l 7596 6416 7603 6423 100 -} -a { - s 0 - b 7597 6417 - e 7604 6424 - l 7597 6417 7604 6424 100 -} -a { - s 0 - b 4372 6432 - e 4379 6439 - l 4372 6432 4379 6439 100 -} -a { - s 0 - b 8253 6434 - e 8260 6441 - l 8253 6434 8260 6441 100 -} -a { - s 0 - b 8254 6435 - e 8261 6442 - l 8254 6435 8261 6442 100 -} -a { - s 0 - b 14756 6436 - e 14763 6443 - l 14756 6436 14763 6443 100 -} -a { - s 0 - b 11551 6436 - e 11558 6443 - l 11551 6436 11558 6443 100 -} -a { - s 0 - b 12236 6439 - e 12243 6446 - l 12236 6439 12243 6446 100 -} -a { - s 0 - b 12237 6440 - e 12244 6447 - l 12237 6440 12244 6447 100 -} -a { - s 0 - b 17351 6443 - e 17358 6450 - l 17351 6443 17358 6450 100 -} -a { - s 0 - b 7474 6444 - e 7481 6451 - l 7474 6444 7481 6451 100 -} -a { - s 0 - b 5046 6444 - e 5053 6451 - l 5046 6444 5053 6451 100 -} -a { - s 0 - b 8615 6453 - e 8622 6460 - l 8615 6453 8622 6460 100 -} -a { - s 0 - b 8616 6454 - e 8623 6461 - l 8616 6454 8623 6461 100 
-} -a { - s 0 - b 10846 6497 - e 10853 6504 - l 10846 6497 10853 6504 100 -} -a { - s 0 - b 15365 6507 - e 15372 6514 - l 15365 6507 15372 6514 100 -} -a { - s 0 - b 4401 6508 - e 4408 6515 - l 4401 6508 4408 6515 100 -} -a { - s 0 - b 4402 6509 - e 4409 6516 - l 4402 6509 4409 6516 100 -} -a { - s 0 - b 4403 6510 - e 4410 6517 - l 4403 6510 4410 6517 100 -} -a { - s 0 - b 14637 6514 - e 14644 6521 - l 14637 6514 14644 6521 100 -} -a { - s 0 - b 14278 6515 - e 14285 6522 - l 14278 6515 14285 6522 100 -} -a { - s 0 - b 21 6516 - e 28 6523 - l 21 6516 28 6523 100 -} -a { - s 0 - b 7699 6542 - e 7706 6549 - l 7699 6542 7706 6549 100 -} -a { - s 0 - b 7652 6561 - e 7659 6568 - l 7652 6561 7659 6568 100 -} -a { - s 0 - b 4828 6562 - e 4835 6569 - l 4828 6562 4835 6569 100 -} -a { - s 0 - b 8183 6563 - e 8190 6570 - l 8183 6563 8190 6570 100 -} -a { - s 0 - b 8184 6564 - e 8191 6571 - l 8184 6564 8191 6571 100 -} -a { - s 0 - b 8185 6565 - e 8192 6572 - l 8185 6565 8192 6572 100 -} -a { - s 0 - b 18012 6585 - e 18019 6592 - l 18012 6585 18019 6592 100 -} -a { - s 0 - b 18013 6586 - e 18020 6593 - l 18013 6586 18020 6593 100 -} -a { - s 0 - b 11525 6588 - e 11532 6595 - l 11525 6588 11532 6595 100 -} -a { - s 0 - b 138 6588 - e 145 6595 - l 138 6588 145 6595 100 -} -a { - s 0 - b 139 6589 - e 146 6596 - l 139 6589 146 6596 100 -} -a { - s 0 - b 5606 6592 - e 5613 6599 - l 5606 6592 5613 6599 100 -} -a { - s 0 - b 5607 6593 - e 5614 6600 - l 5607 6593 5614 6600 100 -} -a { - s 0 - b 5608 6594 - e 5615 6601 - l 5608 6594 5615 6601 100 -} -a { - s 0 - b 11372 6595 - e 11379 6602 - l 11372 6595 11379 6602 100 -} -a { - s 0 - b 4915 6598 - e 4922 6605 - l 4915 6598 4922 6605 100 -} -a { - s 0 - b 6828 6612 - e 6835 6619 - l 6828 6612 6835 6619 100 -} -a { - s 0 - b 5081 6612 - e 5088 6619 - l 5081 6612 5088 6619 100 -} -a { - s 0 - b 15909 6617 - e 15916 6624 - l 15909 6617 15916 6624 100 -} -a { - s 0 - b 18756 6622 - e 18763 6629 - l 18756 6622 18763 6629 100 -} -a { - s 0 - 
b 18757 6623 - e 18764 6630 - l 18757 6623 18764 6630 100 -} -a { - s 0 - b 14331 6625 - e 14338 6632 - l 14331 6625 14338 6632 100 -} -a { - s 0 - b 5066 6626 - e 5073 6633 - l 5066 6626 5073 6633 100 -} -a { - s 0 - b 5067 6627 - e 5074 6634 - l 5067 6627 5074 6634 100 -} -a { - s 0 - b 5068 6628 - e 5075 6635 - l 5068 6628 5075 6635 100 -} -a { - s 0 - b 11188 6631 - e 11195 6638 - l 11188 6631 11195 6638 100 -} -a { - s 0 - b 11189 6632 - e 11196 6639 - l 11189 6632 11196 6639 100 -} -a { - s 0 - b 7680 6633 - e 7687 6640 - l 7680 6633 7687 6640 100 -} -a { - s 0 - b 17431 6644 - e 17438 6651 - l 17431 6644 17438 6651 100 -} -a { - s 0 - b 17432 6645 - e 17439 6652 - l 17432 6645 17439 6652 100 -} -a { - s 0 - b 8831 6692 - e 8838 6699 - l 8831 6692 8838 6699 100 -} -a { - s 0 - b 18101 6693 - e 18108 6700 - l 18101 6693 18108 6700 100 -} -a { - s 0 - b 9302 6693 - e 9309 6700 - l 9302 6693 9309 6700 100 -} -a { - s 0 - b 8832 6693 - e 8839 6700 - l 8832 6693 8839 6700 100 -} -a { - s 0 - b 8833 6694 - e 8840 6701 - l 8833 6694 8840 6701 100 -} -a { - s 0 - b 10962 6706 - e 10969 6713 - l 10962 6706 10969 6713 100 -} -a { - s 0 - b 14866 6718 - e 14873 6725 - l 14866 6718 14873 6725 100 -} -a { - s 0 - b 14867 6719 - e 14874 6726 - l 14867 6719 14874 6726 100 -} -a { - s 0 - b 11208 6727 - e 11215 6734 - l 11208 6727 11215 6734 100 -} -a { - s 0 - b 5583 6736 - e 5590 6743 - l 5583 6736 5590 6743 100 -} -a { - s 0 - b 5584 6737 - e 5591 6744 - l 5584 6737 5591 6744 100 -} -a { - s 0 - b 3640 6749 - e 3647 6756 - l 3640 6749 3647 6756 100 -} -a { - s 0 - b 4500 6758 - e 4507 6765 - l 4500 6758 4507 6765 100 -} -a { - s 0 - b 5610 6760 - e 5617 6767 - l 5610 6760 5617 6767 100 -} -a { - s 0 - b 5611 6761 - e 5618 6768 - l 5611 6761 5618 6768 100 -} -a { - s 0 - b 5156 6761 - e 5163 6768 - l 5156 6761 5163 6768 100 -} -a { - s 0 - b 9280 6771 - e 9287 6778 - l 9280 6771 9287 6778 100 -} -a { - s 0 - b 3135 6771 - e 3142 6778 - l 3135 6771 3142 6778 100 -} -a { - s 
0 - b 3136 6772 - e 3143 6779 - l 3136 6772 3143 6779 100 -} -a { - s 0 - b 9498 6775 - e 9505 6782 - l 9498 6775 9505 6782 100 -} -a { - s 0 - b 4974 6775 - e 4981 6782 - l 4974 6775 4981 6782 100 -} -a { - s 0 - b 4975 6776 - e 4982 6783 - l 4975 6776 4982 6783 100 -} -a { - s 0 - b 8711 6777 - e 8718 6784 - l 8711 6777 8718 6784 100 -} -a { - s 0 - b 13517 6781 - e 13524 6788 - l 13517 6781 13524 6788 100 -} -a { - s 0 - b 11256 6783 - e 11263 6790 - l 11256 6783 11263 6790 100 -} -a { - s 0 - b 8902 6815 - e 8909 6822 - l 8902 6815 8909 6822 100 -} -a { - s 0 - b 8196 6830 - e 8203 6837 - l 8196 6830 8203 6837 100 -} -a { - s 0 - b 9049 6831 - e 9056 6838 - l 9049 6831 9056 6838 100 -} -a { - s 0 - b 5072 6831 - e 5079 6838 - l 5072 6831 5079 6838 100 -} -a { - s 0 - b 5073 6832 - e 5080 6839 - l 5073 6832 5080 6839 100 -} -a { - s 0 - b 5074 6833 - e 5081 6840 - l 5074 6833 5081 6840 100 -} -a { - s 0 - b 14462 6837 - e 14469 6844 - l 14462 6837 14469 6844 100 -} -a { - s 0 - b 14463 6838 - e 14470 6845 - l 14463 6838 14470 6845 100 -} -a { - s 0 - b 5175 6843 - e 5182 6850 - l 5175 6843 5182 6850 100 -} -a { - s 0 - b 13404 6882 - e 13411 6889 - l 13404 6882 13411 6889 100 -} -a { - s 0 - b 13405 6883 - e 13412 6890 - l 13405 6883 13412 6890 100 -} -a { - s 0 - b 7615 6883 - e 7622 6890 - l 7615 6883 7622 6890 100 -} -a { - s 0 - b 3089 6884 - e 3096 6891 - l 3089 6884 3096 6891 100 -} -a { - s 0 - b 18401 6887 - e 18408 6894 - l 18401 6887 18408 6894 100 -} -a { - s 0 - b 18402 6888 - e 18409 6895 - l 18402 6888 18409 6895 100 -} -a { - s 0 - b 9734 6888 - e 9741 6895 - l 9734 6888 9741 6895 100 -} -a { - s 0 - b 133 6892 - e 140 6899 - l 133 6892 140 6899 100 -} -a { - s 0 - b 134 6893 - e 141 6900 - l 134 6893 141 6900 100 -} -a { - s 0 - b 9638 6895 - e 9645 6902 - l 9638 6895 9645 6902 100 -} -a { - s 0 - b 6654 6895 - e 6661 6902 - l 6654 6895 6661 6902 100 -} -a { - s 0 - b 9639 6896 - e 9646 6903 - l 9639 6896 9646 6903 100 -} -a { - s 0 - b 6655 6896 
- e 6662 6903 - l 6655 6896 6662 6903 100 -} -a { - s 0 - b 3582 6896 - e 3589 6903 - l 3582 6896 3589 6903 100 -} -a { - s 0 - b 6706 6897 - e 6713 6904 - l 6706 6897 6713 6904 100 -} -a { - s 0 - b 6656 6897 - e 6663 6904 - l 6656 6897 6663 6904 100 -} -a { - s 0 - b 18355 6898 - e 18362 6905 - l 18355 6898 18362 6905 100 -} -a { - s 0 - b 6707 6898 - e 6714 6905 - l 6707 6898 6714 6905 100 -} -a { - s 0 - b 13460 6901 - e 13467 6908 - l 13460 6901 13467 6908 100 -} -a { - s 0 - b 13461 6902 - e 13468 6909 - l 13461 6902 13468 6909 100 -} -a { - s 0 - b 14125 6907 - e 14132 6914 - l 14125 6907 14132 6914 100 -} -a { - s 0 - b 12190 6946 - e 12197 6953 - l 12190 6946 12197 6953 100 -} -a { - s 0 - b 6861 6995 - e 6868 7002 - l 6861 6995 6868 7002 100 -} -a { - s 0 - b 18507 6996 - e 18514 7003 - l 18507 6996 18514 7003 100 -} -a { - s 0 - b 10948 7007 - e 10955 7014 - l 10948 7007 10955 7014 100 -} -a { - s 0 - b 11519 7016 - e 11526 7023 - l 11519 7016 11526 7023 100 -} -a { - s 0 - b 8897 7016 - e 8904 7023 - l 8897 7016 8904 7023 100 -} -a { - s 0 - b 7435 7016 - e 7442 7023 - l 7435 7016 7442 7023 100 -} -a { - s 0 - b 10971 7020 - e 10978 7027 - l 10971 7020 10978 7027 100 -} -a { - s 0 - b 3698 7025 - e 3705 7032 - l 3698 7025 3705 7032 100 -} -a { - s 0 - b 17411 7028 - e 17418 7035 - l 17411 7028 17418 7035 100 -} -a { - s 0 - b 14301 7028 - e 14308 7035 - l 14301 7028 14308 7035 100 -} -a { - s 0 - b 3003 7040 - e 3010 7047 - l 3003 7040 3010 7047 100 -} -a { - s 0 - b 15984 7043 - e 15991 7050 - l 15984 7043 15991 7050 100 -} -a { - s 0 - b 15985 7044 - e 15992 7051 - l 15985 7044 15992 7051 100 -} -a { - s 0 - b 15986 7045 - e 15993 7052 - l 15986 7045 15993 7052 100 -} -a { - s 0 - b 15987 7046 - e 15994 7053 - l 15987 7046 15994 7053 100 -} -a { - s 0 - b 3248 7085 - e 3255 7092 - l 3248 7085 3255 7092 100 -} -a { - s 0 - b 13456 7086 - e 13463 7093 - l 13456 7086 13463 7093 100 -} -a { - s 0 - b 18712 7107 - e 18719 7114 - l 18712 7107 18719 7114 100 
-} -a { - s 0 - b 18713 7108 - e 18720 7115 - l 18713 7108 18720 7115 100 -} -a { - s 0 - b 3951 7121 - e 3958 7128 - l 3951 7121 3958 7128 100 -} -a { - s 0 - b 3952 7122 - e 3959 7129 - l 3952 7122 3959 7129 100 -} -a { - s 0 - b 11107 7134 - e 11114 7141 - l 11107 7134 11114 7141 100 -} -a { - s 0 - b 11378 7137 - e 11385 7144 - l 11378 7137 11385 7144 100 -} -a { - s 0 - b 11379 7138 - e 11386 7145 - l 11379 7138 11386 7145 100 -} -a { - s 0 - b 17247 7139 - e 17254 7146 - l 17247 7139 17254 7146 100 -} -a { - s 0 - b 8910 7147 - e 8917 7154 - l 8910 7147 8917 7154 100 -} -a { - s 0 - b 6581 7148 - e 6588 7155 - l 6581 7148 6588 7155 100 -} -a { - s 0 - b 3047 7152 - e 3054 7159 - l 3047 7152 3054 7159 100 -} -a { - s 0 - b 12160 7161 - e 12167 7168 - l 12160 7161 12167 7168 100 -} -a { - s 0 - b 10856 7167 - e 10863 7174 - l 10856 7167 10863 7174 100 -} -a { - s 0 - b 157 7167 - e 164 7174 - l 157 7167 164 7174 100 -} -a { - s 0 - b 10857 7168 - e 10864 7175 - l 10857 7168 10864 7175 100 -} -a { - s 0 - b 3395 7168 - e 3402 7175 - l 3395 7168 3402 7175 100 -} -a { - s 0 - b 158 7168 - e 165 7175 - l 158 7168 165 7175 100 -} -a { - s 0 - b 3402 7169 - e 3409 7176 - l 3402 7169 3409 7176 100 -} -a { - s 0 - b 3396 7169 - e 3403 7176 - l 3396 7169 3403 7176 100 -} -a { - s 0 - b 18372 7171 - e 18379 7178 - l 18372 7171 18379 7178 100 -} -a { - s 0 - b 18373 7172 - e 18380 7179 - l 18373 7172 18380 7179 100 -} -a { - s 0 - b 8198 7172 - e 8205 7179 - l 8198 7172 8205 7179 100 -} -a { - s 0 - b 8199 7173 - e 8206 7180 - l 8199 7173 8206 7180 100 -} -a { - s 0 - b 11057 7174 - e 11064 7181 - l 11057 7174 11064 7181 100 -} -a { - s 0 - b 12216 7175 - e 12223 7182 - l 12216 7175 12223 7182 100 -} -a { - s 0 - b 14848 7190 - e 14855 7197 - l 14848 7190 14855 7197 100 -} -a { - s 0 - b 8015 7191 - e 8022 7198 - l 8015 7191 8022 7198 100 -} -a { - s 0 - b 8016 7192 - e 8023 7199 - l 8016 7192 8023 7199 100 -} -a { - s 0 - b 7715 7194 - e 7722 7201 - l 7715 7194 7722 7201 
100 -} -a { - s 0 - b 7716 7195 - e 7723 7202 - l 7716 7195 7723 7202 100 -} -a { - s 0 - b 3628 7394 - e 3635 7401 - l 3628 7394 3635 7401 100 -} -a { - s 0 - b 17223 7398 - e 17230 7405 - l 17223 7398 17230 7405 100 -} -a { - s 0 - b 15191 7398 - e 15198 7405 - l 15191 7398 15198 7405 100 -} -a { - s 0 - b 3137 7399 - e 3144 7406 - l 3137 7399 3144 7406 100 -} -a { - s 0 - b 3084 7399 - e 3091 7406 - l 3084 7399 3091 7406 100 -} -a { - s 0 - b 14966 7400 - e 14973 7407 - l 14966 7400 14973 7407 100 -} -a { - s 0 - b 14316 7400 - e 14323 7407 - l 14316 7400 14323 7407 100 -} -a { - s 0 - b 11474 7400 - e 11481 7407 - l 11474 7400 11481 7407 100 -} -a { - s 0 - b 3138 7400 - e 3145 7407 - l 3138 7400 3145 7407 100 -} -a { - s 0 - b 18402 7403 - e 18409 7410 - l 18402 7403 18409 7410 100 -} -a { - s 0 - b 9734 7403 - e 9741 7410 - l 9734 7403 9741 7410 100 -} -a { - s 0 - b 9735 7404 - e 9742 7411 - l 9735 7404 9742 7411 100 -} -a { - s 0 - b 9736 7405 - e 9743 7412 - l 9736 7405 9743 7412 100 -} -a { - s 0 - b 9737 7406 - e 9744 7413 - l 9737 7406 9744 7413 100 -} -a { - s 0 - b 8452 7406 - e 8459 7413 - l 8452 7406 8459 7413 100 -} -a { - s 0 - b 9738 7407 - e 9745 7414 - l 9738 7407 9745 7414 100 -} -a { - s 0 - b 8453 7407 - e 8460 7414 - l 8453 7407 8460 7414 100 -} -a { - s 0 - b 15124 7412 - e 15131 7419 - l 15124 7412 15131 7419 100 -} -a { - s 0 - b 6648 7414 - e 6655 7421 - l 6648 7414 6655 7421 100 -} -a { - s 0 - b 18780 7423 - e 18787 7430 - l 18780 7423 18787 7430 100 -} -a { - s 0 - b 10943 7424 - e 10950 7431 - l 10943 7424 10950 7431 100 -} -a { - s 0 - b 14500 7433 - e 14507 7440 - l 14500 7433 14507 7440 100 -} -a { - s 0 - b 14501 7434 - e 14508 7441 - l 14501 7434 14508 7441 100 -} -a { - s 0 - b 14502 7435 - e 14509 7442 - l 14502 7435 14509 7442 100 -} -a { - s 0 - b 3201 7437 - e 3208 7444 - l 3201 7437 3208 7444 100 -} -a { - s 0 - b 3227 7447 - e 3234 7454 - l 3227 7447 3234 7454 100 -} -a { - s 0 - b 8751 7450 - e 8758 7457 - l 8751 7450 
8758 7457 100 -} -a { - s 0 - b 8752 7451 - e 8759 7458 - l 8752 7451 8759 7458 100 -} -a { - s 0 - b 10888 7454 - e 10895 7461 - l 10888 7454 10895 7461 100 -} -a { - s 0 - b 26 7459 - e 33 7466 - l 26 7459 33 7466 100 -} -a { - s 0 - b 8465 7477 - e 8472 7484 - l 8465 7477 8472 7484 100 -} -a { - s 0 - b 4421 7483 - e 4428 7490 - l 4421 7483 4428 7490 100 -} -a { - s 0 - b 4093 7485 - e 4100 7492 - l 4093 7485 4100 7492 100 -} -a { - s 0 - b 18043 7487 - e 18050 7494 - l 18043 7487 18050 7494 100 -} -a { - s 0 - b 11006 7490 - e 11013 7497 - l 11006 7490 11013 7497 100 -} -a { - s 0 - b 5633 7496 - e 5640 7503 - l 5633 7496 5640 7503 100 -} -a { - s 0 - b 6652 7499 - e 6659 7506 - l 6652 7499 6659 7506 100 -} -a { - s 0 - b 17521 7500 - e 17528 7507 - l 17521 7500 17528 7507 100 -} -a { - s 0 - b 13890 7500 - e 13897 7507 - l 13890 7500 13897 7507 100 -} -a { - s 0 - b 6653 7500 - e 6660 7507 - l 6653 7500 6660 7507 100 -} -a { - s 0 - b 17522 7501 - e 17529 7508 - l 17522 7501 17529 7508 100 -} -a { - s 0 - b 17523 7502 - e 17530 7509 - l 17523 7502 17530 7509 100 -} -a { - s 0 - b 17339 7505 - e 17346 7512 - l 17339 7505 17346 7512 100 -} -a { - s 0 - b 9441 7517 - e 9448 7524 - l 9441 7517 9448 7524 100 -} -a { - s 0 - b 9442 7518 - e 9449 7525 - l 9442 7518 9449 7525 100 -} -a { - s 0 - b 10831 7529 - e 10838 7536 - l 10831 7529 10838 7536 100 -} -a { - s 0 - b 7445 7532 - e 7452 7539 - l 7445 7532 7452 7539 100 -} -a { - s 0 - b 127 7548 - e 134 7555 - l 127 7548 134 7555 100 -} -a { - s 0 - b 128 7549 - e 135 7556 - l 128 7549 135 7556 100 -} -a { - s 0 - b 11213 7555 - e 11220 7562 - l 11213 7555 11220 7562 100 -} -a { - s 0 - b 61 7559 - e 68 7566 - l 61 7559 68 7566 100 -} -a { - s 0 - b 62 7560 - e 69 7567 - l 62 7560 69 7567 100 -} -a { - s 0 - b 63 7561 - e 70 7568 - l 63 7561 70 7568 100 -} -a { - s 0 - b 8731 7588 - e 8738 7595 - l 8731 7588 8738 7595 100 -} -a { - s 0 - b 4090 7588 - e 4097 7595 - l 4090 7588 4097 7595 100 -} -a { - s 0 - b 3565 
7588 - e 3572 7595 - l 3565 7588 3572 7595 100 -} -a { - s 0 - b 4091 7589 - e 4098 7596 - l 4091 7589 4098 7596 100 -} -a { - s 0 - b 11249 7597 - e 11256 7604 - l 11249 7597 11256 7604 100 -} -a { - s 0 - b 17101 7603 - e 17108 7610 - l 17101 7603 17108 7610 100 -} -a { - s 0 - b 10984 7617 - e 10991 7624 - l 10984 7617 10991 7624 100 -} -a { - s 0 - b 10985 7618 - e 10992 7625 - l 10985 7618 10992 7625 100 -} -a { - s 0 - b 17527 7636 - e 17534 7643 - l 17527 7636 17534 7643 100 -} -a { - s 0 - b 6441 7644 - e 6448 7651 - l 6441 7644 6448 7651 100 -} -a { - s 0 - b 3364 7646 - e 3371 7653 - l 3364 7646 3371 7653 100 -} -a { - s 0 - b 4846 7654 - e 4853 7661 - l 4846 7654 4853 7661 100 -} -a { - s 0 - b 7562 7659 - e 7569 7666 - l 7562 7659 7569 7666 100 -} -a { - s 0 - b 8149 7660 - e 8156 7667 - l 8149 7660 8156 7667 100 -} -a { - s 0 - b 3082 7667 - e 3089 7674 - l 3082 7667 3089 7674 100 -} -a { - s 0 - b 14601 7670 - e 14608 7677 - l 14601 7670 14608 7677 100 -} -a { - s 0 - b 6608 7670 - e 6615 7677 - l 6608 7670 6615 7677 100 -} -a { - s 0 - b 8113 7672 - e 8120 7679 - l 8113 7672 8120 7679 100 -} -a { - s 0 - b 7539 7672 - e 7546 7679 - l 7539 7672 7546 7679 100 -} -a { - s 0 - b 18731 7678 - e 18738 7685 - l 18731 7678 18738 7685 100 -} -a { - s 0 - b 17369 7679 - e 17376 7686 - l 17369 7679 17376 7686 100 -} -a { - s 0 - b 15159 7697 - e 15166 7704 - l 15159 7697 15166 7704 100 -} -a { - s 0 - b 5581 7698 - e 5588 7705 - l 5581 7698 5588 7705 100 -} -a { - s 0 - b 13438 7703 - e 13445 7710 - l 13438 7703 13445 7710 100 -} -a { - s 0 - b 7993 7714 - e 8000 7721 - l 7993 7714 8000 7721 100 -} -a { - s 0 - b 8635 7717 - e 8642 7724 - l 8635 7717 8642 7724 100 -} -a { - s 0 - b 14858 7729 - e 14865 7736 - l 14858 7729 14865 7736 100 -} -a { - s 0 - b 3640 7734 - e 3647 7741 - l 3640 7734 3647 7741 100 -} -a { - s 0 - b 3641 7735 - e 3648 7742 - l 3641 7735 3648 7742 100 -} -a { - s 0 - b 12234 7752 - e 12241 7759 - l 12234 7752 12241 7759 100 -} -a { - s 0 
- b 12235 7753 - e 12242 7760 - l 12235 7753 12242 7760 100 -} -a { - s 0 - b 42 7756 - e 49 7763 - l 42 7756 49 7763 100 -} -a { - s 0 - b 43 7757 - e 50 7764 - l 43 7757 50 7764 100 -} -a { - s 0 - b 44 7758 - e 51 7765 - l 44 7758 51 7765 100 -} -a { - s 0 - b 14763 7760 - e 14770 7767 - l 14763 7760 14770 7767 100 -} -a { - s 0 - b 7589 7760 - e 7596 7767 - l 7589 7760 7596 7767 100 -} -a { - s 0 - b 14764 7761 - e 14771 7768 - l 14764 7761 14771 7768 100 -} -a { - s 0 - b 15290 7769 - e 15297 7776 - l 15290 7769 15297 7776 100 -} -a { - s 0 - b 18779 8251 - e 18786 8258 - l 18779 8251 18786 8258 100 -} -a { - s 0 - b 305 8286 - e 312 8293 - l 305 8286 312 8293 100 -} -a { - s 0 - b 8788 8294 - e 8795 8301 - l 8788 8294 8795 8301 100 -} -a { - s 0 - b 15964 8310 - e 15971 8317 - l 15964 8310 15971 8317 100 -} -a { - s 0 - b 14353 8310 - e 14360 8317 - l 14353 8310 14360 8317 100 -} -a { - s 0 - b 14354 8311 - e 14361 8318 - l 14354 8311 14361 8318 100 -} -a { - s 0 - b 11277 8320 - e 11284 8327 - l 11277 8320 11284 8327 100 -} -a { - s 0 - b 11278 8321 - e 11285 8328 - l 11278 8321 11285 8328 100 -} -a { - s 0 - b 14216 8322 - e 14223 8329 - l 14216 8322 14223 8329 100 -} -a { - s 0 - b 7735 8329 - e 7742 8336 - l 7735 8329 7742 8336 100 -} -a { - s 0 - b 7736 8330 - e 7743 8337 - l 7736 8330 7743 8337 100 -} -a { - s 0 - b 6580 8351 - e 6587 8358 - l 6580 8351 6587 8358 100 -} -a { - s 0 - b 8911 8352 - e 8918 8359 - l 8911 8352 8918 8359 100 -} -a { - s 0 - b 14583 8356 - e 14590 8363 - l 14583 8356 14590 8363 100 -} -a { - s 0 - b 14584 8357 - e 14591 8364 - l 14584 8357 14591 8364 100 -} -a { - s 0 - b 8127 8360 - e 8134 8367 - l 8127 8360 8134 8367 100 -} -a { - s 0 - b 6850 8378 - e 6857 8385 - l 6850 8378 6857 8385 100 -} -a { - s 0 - b 9720 8396 - e 9727 8403 - l 9720 8396 9727 8403 100 -} -a { - s 0 - b 6816 8408 - e 6823 8415 - l 6816 8408 6823 8415 100 -} -a { - s 0 - b 7447 8414 - e 7454 8421 - l 7447 8414 7454 8421 100 -} -a { - s 0 - b 5107 8414 - 
e 5114 8421 - l 5107 8414 5114 8421 100 -} -a { - s 0 - b 17957 8422 - e 17964 8429 - l 17957 8422 17964 8429 100 -} -a { - s 0 - b 11215 8423 - e 11222 8430 - l 11215 8423 11222 8430 100 -} -a { - s 0 - b 15117 8427 - e 15124 8434 - l 15117 8427 15124 8434 100 -} -a { - s 0 - b 15118 8428 - e 15125 8435 - l 15118 8428 15125 8435 100 -} -a { - s 0 - b 6507 8428 - e 6514 8435 - l 6507 8428 6514 8435 100 -} -a { - s 0 - b 224 8430 - e 231 8437 - l 224 8430 231 8437 100 -} -a { - s 0 - b 225 8431 - e 232 8438 - l 225 8431 232 8438 100 -} -a { - s 0 - b 226 8432 - e 233 8439 - l 226 8432 233 8439 100 -} -a { - s 0 - b 17242 8433 - e 17249 8440 - l 17242 8433 17249 8440 100 -} -a { - s 0 - b 6907 8443 - e 6914 8450 - l 6907 8443 6914 8450 100 -} -a { - s 0 - b 9045 8445 - e 9052 8452 - l 9045 8445 9052 8452 100 -} -a { - s 0 - b 14624 8450 - e 14631 8457 - l 14624 8450 14631 8457 100 -} -a { - s 0 - b 15293 8457 - e 15300 8464 - l 15293 8457 15300 8464 100 -} -a { - s 0 - b 13455 8458 - e 13462 8465 - l 13455 8458 13462 8465 100 -} -a { - s 0 - b 13456 8459 - e 13463 8466 - l 13456 8459 13463 8466 100 -} -a { - s 0 - b 7429 8480 - e 7436 8487 - l 7429 8480 7436 8487 100 -} -a { - s 0 - b 18060 8484 - e 18067 8491 - l 18060 8484 18067 8491 100 -} -a { - s 0 - b 15311 8486 - e 15318 8493 - l 15311 8486 15318 8493 100 -} -a { - s 0 - b 11005 8488 - e 11012 8495 - l 11005 8488 11012 8495 100 -} -a { - s 0 - b 17264 8491 - e 17271 8498 - l 17264 8491 17271 8498 100 -} -a { - s 0 - b 7428 8508 - e 7435 8515 - l 7428 8508 7435 8515 100 -} -a { - s 0 - b 5630 8517 - e 5637 8524 - l 5630 8517 5637 8524 100 -} -a { - s 0 - b 5631 8518 - e 5638 8525 - l 5631 8518 5638 8525 100 -} -a { - s 0 - b 6897 8523 - e 6904 8530 - l 6897 8523 6904 8530 100 -} -a { - s 0 - b 5149 8533 - e 5156 8540 - l 5149 8533 5156 8540 100 -} -a { - s 0 - b 17499 8552 - e 17506 8559 - l 17499 8552 17506 8559 100 -} -a { - s 0 - b 16023 8574 - e 16030 8581 - l 16023 8574 16030 8581 100 -} -a { - s 0 - b 
15383 8580 - e 15390 8587 - l 15383 8580 15390 8587 100 -} -a { - s 0 - b 5653 8590 - e 5660 8597 - l 5653 8590 5660 8597 100 -} -a { - s 0 - b 17095 8593 - e 17102 8600 - l 17095 8593 17102 8600 100 -} -a { - s 0 - b 7511 8596 - e 7518 8603 - l 7511 8596 7518 8603 100 -} -a { - s 0 - b 10983 8827 - e 10990 8834 - l 10983 8827 10990 8834 100 -} -a { - s 0 - b 10984 8828 - e 10991 8835 - l 10984 8828 10991 8835 100 -} -a { - s 0 - b 14952 8830 - e 14959 8837 - l 14952 8830 14959 8837 100 -} -a { - s 0 - b 11194 8832 - e 11201 8839 - l 11194 8832 11201 8839 100 -} -a { - s 0 - b 3288 8836 - e 3295 8843 - l 3288 8836 3295 8843 100 -} -a { - s 0 - b 3294 8837 - e 3301 8844 - l 3294 8837 3301 8844 100 -} -a { - s 0 - b 18760 8845 - e 18767 8852 - l 18760 8845 18767 8852 100 -} -a { - s 0 - b 7637 8859 - e 7644 8866 - l 7637 8859 7644 8866 100 -} -a { - s 0 - b 14228 8879 - e 14235 8886 - l 14228 8879 14235 8886 100 -} -a { - s 0 - b 3602 8879 - e 3609 8886 - l 3602 8879 3609 8886 100 -} -a { - s 0 - b 18705 8928 - e 18712 8935 - l 18705 8928 18712 8935 100 -} -a { - s 0 - b 18706 8929 - e 18713 8936 - l 18706 8929 18713 8936 100 -} -a { - s 0 - b 18707 8930 - e 18714 8937 - l 18707 8930 18714 8937 100 -} -a { - s 0 - b 15236 8934 - e 15243 8941 - l 15236 8934 15243 8941 100 -} -a { - s 0 - b 8905 8946 - e 8912 8953 - l 8905 8946 8912 8953 100 -} -a { - s 0 - b 16142 8949 - e 16149 8956 - l 16142 8949 16149 8956 100 -} -a { - s 0 - b 12612 8950 - e 12619 8957 - l 12612 8950 12619 8957 100 -} -a { - s 0 - b 9499 8953 - e 9506 8960 - l 9499 8953 9506 8960 100 -} -a { - s 0 - b 13410 8954 - e 13417 8961 - l 13410 8954 13417 8961 100 -} -a { - s 0 - b 9500 8954 - e 9507 8961 - l 9500 8954 9507 8961 100 -} -a { - s 0 - b 9501 8955 - e 9508 8962 - l 9501 8955 9508 8962 100 -} -a { - s 0 - b 17569 8959 - e 17576 8966 - l 17569 8959 17576 8966 100 -} -a { - s 0 - b 15220 8965 - e 15227 8972 - l 15220 8965 15227 8972 100 -} -a { - s 0 - b 11213 8976 - e 11220 8983 - l 11213 8976 
11220 8983 100 -} -a { - s 0 - b 4493 8977 - e 4500 8984 - l 4493 8977 4500 8984 100 -} -a { - s 0 - b 11558 9513 - e 11565 9520 - l 11558 9513 11565 9520 100 -} -a { - s 0 - b 11510 9517 - e 11517 9524 - l 11510 9517 11517 9524 100 -} -a { - s 0 - b 6767 9518 - e 6774 9525 - l 6767 9518 6774 9525 100 -} -a { - s 0 - b 18397 9519 - e 18404 9526 - l 18397 9519 18404 9526 100 -} -a { - s 0 - b 6740 9522 - e 6747 9529 - l 6740 9522 6747 9529 100 -} -a { - s 0 - b 9261 9525 - e 9268 9532 - l 9261 9525 9268 9532 100 -} -a { - s 0 - b 9262 9526 - e 9269 9533 - l 9262 9526 9269 9533 100 -} -a { - s 0 - b 9263 9527 - e 9270 9534 - l 9263 9527 9270 9534 100 -} -a { - s 0 - b 9264 9528 - e 9271 9535 - l 9264 9528 9271 9535 100 -} -a { - s 0 - b 9265 9529 - e 9272 9536 - l 9265 9529 9272 9536 100 -} -a { - s 0 - b 4440 9533 - e 4447 9540 - l 4440 9533 4447 9540 100 -} -a { - s 0 - b 18391 9538 - e 18398 9545 - l 18391 9538 18398 9545 100 -} -a { - s 0 - b 8749 9548 - e 8756 9555 - l 8749 9548 8756 9555 100 -} -a { - s 0 - b 8750 9549 - e 8757 9556 - l 8750 9549 8757 9556 100 -} -a { - s 0 - b 146 9558 - e 153 9565 - l 146 9558 153 9565 100 -} -a { - s 0 - b 3308 9560 - e 3315 9567 - l 3308 9560 3315 9567 100 -} -a { - s 0 - b 3309 9561 - e 3316 9568 - l 3309 9561 3316 9568 100 -} -a { - s 0 - b 8957 9577 - e 8964 9584 - l 8957 9577 8964 9584 100 -} -a { - s 0 - b 4455 9579 - e 4462 9586 - l 4455 9579 4462 9586 100 -} -a { - s 0 - b 11351 9586 - e 11358 9593 - l 11351 9586 11358 9593 100 -} -a { - s 0 - b 13478 11702 - e 13485 11709 - l 13478 11702 13485 11709 100 -} -a { - s 0 - b 4469 11729 - e 4476 11736 - l 4469 11729 4476 11736 100 -} -a { - s 0 - b 3024 11735 - e 3031 11742 - l 3024 11735 3031 11742 100 -} -a { - s 0 - b 14897 11746 - e 14904 11753 - l 14897 11746 14904 11753 100 -} -a { - s 0 - b 4644 11749 - e 4651 11756 - l 4644 11749 4651 11756 100 -} -a { - s 0 - b 14516 11769 - e 14523 11776 - l 14516 11769 14523 11776 100 -} -a { - s 0 - b 8790 11778 - e 8797 
11785 - l 8790 11778 8797 11785 100 -} -a { - s 0 - b 8444 11778 - e 8451 11785 - l 8444 11778 8451 11785 100 -} -a { - s 0 - b 6348 11780 - e 6355 11787 - l 6348 11780 6355 11787 100 -} -a { - s 0 - b 7774 11784 - e 7781 11791 - l 7774 11784 7781 11791 100 -} -a { - s 0 - b 7775 11785 - e 7782 11792 - l 7775 11785 7782 11792 100 -} -a { - s 0 - b 8587 11794 - e 8594 11801 - l 8587 11794 8594 11801 100 -} -a { - s 0 - b 14636 11795 - e 14643 11802 - l 14636 11795 14643 11802 100 -} -a { - s 0 - b 8588 11795 - e 8595 11802 - l 8588 11795 8595 11802 100 -} -a { - s 0 - b 14637 11796 - e 14644 11803 - l 14637 11796 14644 11803 100 -} -a { - s 0 - b 14638 11797 - e 14645 11804 - l 14638 11797 14645 11804 100 -} -a { - s 0 - b 14639 11798 - e 14646 11805 - l 14639 11798 14646 11805 100 -} -a { - s 0 - b 8246 11798 - e 8253 11805 - l 8246 11798 8253 11805 100 -} -a { - s 0 - b 3580 11801 - e 3587 11808 - l 3580 11801 3587 11808 100 -} -a { - s 0 - b 3581 11802 - e 3588 11809 - l 3581 11802 3588 11809 100 -} -a { - s 0 - b 9639 11803 - e 9646 11810 - l 9639 11803 9646 11810 100 -} -a { - s 0 - b 6655 11803 - e 6662 11810 - l 6655 11803 6662 11810 100 -} -a { - s 0 - b 3582 11803 - e 3589 11810 - l 3582 11803 3589 11810 100 -} -a { - s 0 - b 9640 11804 - e 9647 11811 - l 9640 11804 9647 11811 100 -} -a { - s 0 - b 3583 11804 - e 3590 11811 - l 3583 11804 3590 11811 100 -} -a { - s 0 - b 11144 11805 - e 11151 11812 - l 11144 11805 11151 11812 100 -} -a { - s 0 - b 3584 11805 - e 3591 11812 - l 3584 11805 3591 11812 100 -} -a { - s 0 - b 11145 11806 - e 11152 11813 - l 11145 11806 11152 11813 100 -} -a { - s 0 - b 3585 11806 - e 3592 11813 - l 3585 11806 3592 11813 100 -} -a { - s 0 - b 9041 11807 - e 9048 11814 - l 9041 11807 9048 11814 100 -} -a { - s 0 - b 4565 11808 - e 4572 11815 - l 4565 11808 4572 11815 100 -} -a { - s 0 - b 4566 11809 - e 4573 11816 - l 4566 11809 4573 11816 100 -} -a { - s 0 - b 15284 11810 - e 15291 11817 - l 15284 11810 15291 11817 100 -} -a { - s 
0 - b 17348 11824 - e 17355 11831 - l 17348 11824 17355 11831 100 -} -a { - s 0 - b 17202 11824 - e 17209 11831 - l 17202 11824 17209 11831 100 -} -a { - s 0 - b 17203 11825 - e 17210 11832 - l 17203 11825 17210 11832 100 -} -a { - s 0 - b 6451 11828 - e 6458 11835 - l 6451 11828 6458 11835 100 -} -a { - s 0 - b 13912 11851 - e 13919 11858 - l 13912 11851 13919 11858 100 -} -a { - s 0 - b 13480 11875 - e 13487 11882 - l 13480 11875 13487 11882 100 -} -a { - s 0 - b 7741 11892 - e 7748 11899 - l 7741 11892 7748 11899 100 -} -a { - s 0 - b 7742 11893 - e 7749 11900 - l 7742 11893 7749 11900 100 -} -a { - s 0 - b 17321 11894 - e 17328 11901 - l 17321 11894 17328 11901 100 -} -a { - s 0 - b 3616 11939 - e 3623 11946 - l 3616 11939 3623 11946 100 -} -a { - s 0 - b 13996 11940 - e 14003 11947 - l 13996 11940 14003 11947 100 -} -a { - s 0 - b 9675 11940 - e 9682 11947 - l 9675 11940 9682 11947 100 -} -a { - s 0 - b 11095 11941 - e 11102 11948 - l 11095 11941 11102 11948 100 -} -a { - s 0 - b 11096 11942 - e 11103 11949 - l 11096 11942 11103 11949 100 -} -a { - s 0 - b 18745 11953 - e 18752 11960 - l 18745 11953 18752 11960 100 -} -a { - s 0 - b 18746 11954 - e 18753 11961 - l 18746 11954 18753 11961 100 -} -a { - s 0 - b 18747 11955 - e 18754 11962 - l 18747 11955 18754 11962 100 -} -a { - s 0 - b 7599 11957 - e 7606 11964 - l 7599 11957 7606 11964 100 -} -a { - s 0 - b 7547 11959 - e 7554 11966 - l 7547 11959 7554 11966 100 -} -a { - s 0 - b 10859 11975 - e 10866 11982 - l 10859 11975 10866 11982 100 -} -a { - s 0 - b 14929 11979 - e 14936 11986 - l 14929 11979 14936 11986 100 -} -a { - s 0 - b 12243 11980 - e 12250 11987 - l 12243 11980 12250 11987 100 -} -a { - s 0 - b 6449 11989 - e 6456 11996 - l 6449 11989 6456 11996 100 -} -a { - s 0 - b 7601 11991 - e 7608 11998 - l 7601 11991 7608 11998 100 -} -a { - s 0 - b 9465 11992 - e 9472 11999 - l 9465 11992 9472 11999 100 -} -a { - s 0 - b 7602 11992 - e 7609 11999 - l 7602 11992 7609 11999 100 -} -a { - s 0 - b 6759 
11996 - e 6766 12003 - l 6759 11996 6766 12003 100 -} -a { - s 0 - b 6760 11997 - e 6767 12004 - l 6760 11997 6767 12004 100 -} -a { - s 0 - b 6761 11998 - e 6768 12005 - l 6761 11998 6768 12005 100 -} -a { - s 0 - b 14063 12000 - e 14070 12007 - l 14063 12000 14070 12007 100 -} -a { - s 0 - b 8267 12000 - e 8274 12007 - l 8267 12000 8274 12007 100 -} -a { - s 0 - b 16048 12002 - e 16055 12009 - l 16048 12002 16055 12009 100 -} -a { - s 0 - b 4618 12002 - e 4625 12009 - l 4618 12002 4625 12009 100 -} -a { - s 0 - b 9333 12006 - e 9340 12013 - l 9333 12006 9340 12013 100 -} -a { - s 0 - b 9334 12007 - e 9341 12014 - l 9334 12007 9341 12014 100 -} -a { - s 0 - b 5600 12010 - e 5607 12017 - l 5600 12010 5607 12017 100 -} -a { - s 0 - b 7542 12011 - e 7549 12018 - l 7542 12011 7549 12018 100 -} -a { - s 0 - b 7543 12012 - e 7550 12019 - l 7543 12012 7550 12019 100 -} -a { - s 0 - b 3727 12013 - e 3734 12020 - l 3727 12013 3734 12020 100 -} -a { - s 0 - b 3728 12014 - e 3735 12021 - l 3728 12014 3735 12021 100 -} -a { - s 0 - b 6344 12017 - e 6351 12024 - l 6344 12017 6351 12024 100 -} -a { - s 0 - b 18019 12027 - e 18026 12034 - l 18019 12027 18026 12034 100 -} -a { - s 0 - b 18020 12028 - e 18027 12035 - l 18020 12028 18027 12035 100 -} -a { - s 0 - b 8549 12028 - e 8556 12035 - l 8549 12028 8556 12035 100 -} -a { - s 0 - b 13459 12029 - e 13466 12036 - l 13459 12029 13466 12036 100 -} -a { - s 0 - b 6748 12034 - e 6755 12041 - l 6748 12034 6755 12041 100 -} -a { - s 0 - b 3183 12034 - e 3190 12041 - l 3183 12034 3190 12041 100 -} -a { - s 0 - b 6749 12035 - e 6756 12042 - l 6749 12035 6756 12042 100 -} -a { - s 0 - b 16151 12049 - e 16158 12056 - l 16151 12049 16158 12056 100 -} -a { - s 0 - b 3096 12050 - e 3103 12057 - l 3096 12050 3103 12057 100 -} -a { - s 0 - b 4084 12073 - e 4091 12080 - l 4084 12073 4091 12080 100 -} -a { - s 0 - b 15942 12101 - e 15949 12108 - l 15942 12101 15949 12108 100 -} -a { - s 0 - b 8447 12114 - e 8454 12121 - l 8447 12114 8454 12121 
100 -} -a { - s 0 - b 8448 12115 - e 8455 12122 - l 8448 12115 8455 12122 100 -} -a { - s 0 - b 8449 12116 - e 8456 12123 - l 8449 12116 8456 12123 100 -} -a { - s 0 - b 6660 12118 - e 6667 12125 - l 6660 12118 6667 12125 100 -} -a { - s 0 - b 6661 12119 - e 6668 12126 - l 6661 12119 6668 12126 100 -} -a { - s 0 - b 16101 12123 - e 16108 12130 - l 16101 12123 16108 12130 100 -} -a { - s 0 - b 16102 12124 - e 16109 12131 - l 16102 12124 16109 12131 100 -} -a { - s 0 - b 11031 12127 - e 11038 12134 - l 11031 12127 11038 12134 100 -} -a { - s 0 - b 7688 12148 - e 7695 12155 - l 7688 12148 7695 12155 100 -} -a { - s 0 - b 13918 12171 - e 13925 12178 - l 13918 12171 13925 12178 100 -} -a { - s 0 - b 4245 13118 - e 4252 13125 - l 4245 13118 4252 13125 100 -} -a { - s 0 - b 13986 13122 - e 13993 13129 - l 13986 13122 13993 13129 100 -} -a { - s 0 - b 3345 13127 - e 3352 13134 - l 3345 13127 3352 13134 100 -} -a { - s 0 - b 7779 13140 - e 7786 13147 - l 7779 13140 7786 13147 100 -} -a { - s 0 - b 9445 13142 - e 9452 13149 - l 9445 13142 9452 13149 100 -} -a { - s 0 - b 4614 13142 - e 4621 13149 - l 4614 13142 4621 13149 100 -} -a { - s 0 - b 17272 13147 - e 17279 13154 - l 17272 13147 17279 13154 100 -} -a { - s 0 - b 9417 13155 - e 9424 13162 - l 9417 13155 9424 13162 100 -} -a { - s 0 - b 9418 13156 - e 9425 13163 - l 9418 13156 9425 13163 100 -} -a { - s 0 - b 8329 13165 - e 8336 13172 - l 8329 13165 8336 13172 100 -} -a { - s 0 - b 17292 13167 - e 17299 13174 - l 17292 13167 17299 13174 100 -} -a { - s 0 - b 17293 13168 - e 17300 13175 - l 17293 13168 17300 13175 100 -} -a { - s 0 - b 16063 13169 - e 16070 13176 - l 16063 13169 16070 13176 100 -} -a { - s 0 - b 11349 13173 - e 11356 13180 - l 11349 13173 11356 13180 100 -} -a { - s 0 - b 9686 13173 - e 9693 13180 - l 9686 13173 9693 13180 100 -} -a { - s 0 - b 6657 13176 - e 6664 13183 - l 6657 13176 6664 13183 100 -} -a { - s 0 - b 6658 13177 - e 6665 13184 - l 6658 13177 6665 13184 100 -} -a { - s 0 - b 6659 13178 - 
e 6666 13185 - l 6659 13178 6666 13185 100 -} -a { - s 0 - b 4056 13180 - e 4063 13187 - l 4056 13180 4063 13187 100 -} -a { - s 0 - b 8362 13181 - e 8369 13188 - l 8362 13181 8369 13188 100 -} -a { - s 0 - b 4057 13181 - e 4064 13188 - l 4057 13181 4064 13188 100 -} -a { - s 0 - b 3154 13186 - e 3161 13193 - l 3154 13186 3161 13193 100 -} -a { - s 0 - b 3155 13187 - e 3162 13194 - l 3155 13187 3162 13194 100 -} -a { - s 0 - b 3156 13188 - e 3163 13195 - l 3156 13188 3163 13195 100 -} -a { - s 0 - b 14611 13191 - e 14618 13198 - l 14611 13191 14618 13198 100 -} -a { - s 0 - b 14612 13192 - e 14619 13199 - l 14612 13192 14619 13199 100 -} -a { - s 0 - b 6230 13193 - e 6237 13200 - l 6230 13193 6237 13200 100 -} -a { - s 0 - b 4573 13193 - e 4580 13200 - l 4573 13193 4580 13200 100 -} -a { - s 0 - b 6440 13202 - e 6447 13209 - l 6440 13202 6447 13209 100 -} -a { - s 0 - b 6441 13203 - e 6448 13210 - l 6441 13203 6448 13210 100 -} -a { - s 0 - b 12608 13205 - e 12615 13212 - l 12608 13205 12615 13212 100 -} -a { - s 0 - b 6245 13205 - e 6252 13212 - l 6245 13205 6252 13212 100 -} -a { - s 0 - b 12609 13206 - e 12616 13213 - l 12609 13206 12616 13213 100 -} -a { - s 0 - b 3621 13209 - e 3628 13216 - l 3621 13209 3628 13216 100 -} -a { - s 0 - b 3622 13210 - e 3629 13217 - l 3622 13210 3629 13217 100 -} -a { - s 0 - b 6349 13211 - e 6356 13218 - l 6349 13211 6356 13218 100 -} -a { - s 0 - b 3623 13211 - e 3630 13218 - l 3623 13211 3630 13218 100 -} -a { - s 0 - b 4980 13222 - e 4987 13229 - l 4980 13222 4987 13229 100 -} -a { - s 0 - b 18005 13233 - e 18012 13240 - l 18005 13233 18012 13240 100 -} -a { - s 0 - b 18712 13236 - e 18719 13243 - l 18712 13236 18719 13243 100 -} -a { - s 0 - b 18713 13237 - e 18720 13244 - l 18713 13237 18720 13244 100 -} -a { - s 0 - b 18714 13238 - e 18721 13245 - l 18714 13238 18721 13245 100 -} -a { - s 0 - b 17394 13238 - e 17401 13245 - l 17394 13238 17401 13245 100 -} -a { - s 0 - b 12205 13242 - e 12212 13249 - l 12205 13242 12212 
13249 100 -} -a { - s 0 - b 8171 13266 - e 8178 13273 - l 8171 13266 8178 13273 100 -} -a { - s 0 - b 8172 13267 - e 8179 13274 - l 8172 13267 8179 13274 100 -} -a { - s 0 - b 6434 13273 - e 6441 13280 - l 6434 13273 6441 13280 100 -} -a { - s 0 - b 14079 13299 - e 14086 13306 - l 14079 13299 14086 13306 100 -} -a { - s 0 - b 8938 13302 - e 8945 13309 - l 8938 13302 8945 13309 100 -} -a { - s 0 - b 13923 13306 - e 13930 13313 - l 13923 13306 13930 13313 100 -} -a { - s 0 - b 7402 13345 - e 7409 13352 - l 7402 13345 7409 13352 100 -} -a { - s 0 - b 7403 13346 - e 7410 13353 - l 7403 13346 7410 13353 100 -} -a { - s 0 - b 8324 13348 - e 8331 13355 - l 8324 13348 8331 13355 100 -} -a { - s 0 - b 3665 13348 - e 3672 13355 - l 3665 13348 3672 13355 100 -} -a { - s 0 - b 14286 13848 - e 14293 13855 - l 14286 13848 14293 13855 100 -} -a { - s 0 - b 14267 13852 - e 14274 13859 - l 14267 13852 14274 13859 100 -} -a { - s 0 - b 14268 13853 - e 14275 13860 - l 14268 13853 14275 13860 100 -} -a { - s 0 - b 3120 13857 - e 3127 13864 - l 3120 13857 3127 13864 100 -} -a { - s 0 - b 8698 13860 - e 8705 13867 - l 8698 13860 8705 13867 100 -} -a { - s 0 - b 8699 13861 - e 8706 13868 - l 8699 13861 8706 13868 100 -} -a { - s 0 - b 9681 13878 - e 9688 13885 - l 9681 13878 9688 13885 100 -} -a { - s 0 - b 3177 13880 - e 3184 13887 - l 3177 13880 3184 13887 100 -} -a { - s 0 - b 9447 13881 - e 9454 13888 - l 9447 13881 9454 13888 100 -} -a { - s 0 - b 3178 13881 - e 3185 13888 - l 3178 13881 3185 13888 100 -} -a { - s 0 - b 9448 13882 - e 9455 13889 - l 9448 13882 9455 13889 100 -} -a { - s 0 - b 12193 13902 - e 12200 13909 - l 12193 13902 12200 13909 100 -} -a { - s 0 - b 4846 13909 - e 4853 13916 - l 4846 13909 4853 13916 100 -} -a { - s 0 - b 4421 13914 - e 4428 13921 - l 4421 13914 4428 13921 100 -} -a { - s 0 - b 4093 13916 - e 4100 13923 - l 4093 13916 4100 13923 100 -} -a { - s 0 - b 14024 13918 - e 14031 13925 - l 14024 13918 14031 13925 100 -} -a { - s 0 - b 8906 13927 - e 8913 
13934 - l 8906 13927 8913 13934 100 -} -a { - s 0 - b 8907 13928 - e 8914 13935 - l 8907 13928 8914 13935 100 -} -a { - s 0 - b 4101 13941 - e 4108 13948 - l 4101 13941 4108 13948 100 -} -a { - s 0 - b 4102 13942 - e 4109 13949 - l 4102 13942 4109 13949 100 -} -a { - s 0 - b 8166 13957 - e 8173 13964 - l 8166 13957 8173 13964 100 -} -a { - s 0 - b 8167 13958 - e 8174 13965 - l 8167 13958 8174 13965 100 -} -a { - s 0 - b 8168 13959 - e 8175 13966 - l 8168 13959 8175 13966 100 -} -a { - s 0 - b 8169 13960 - e 8176 13967 - l 8169 13960 8176 13967 100 -} -a { - s 0 - b 18467 13968 - e 18474 13975 - l 18467 13968 18474 13975 100 -} -a { - s 0 - b 6797 13974 - e 6804 13981 - l 6797 13974 6804 13981 100 -} -a { - s 0 - b 13456 13992 - e 13463 13999 - l 13456 13992 13463 13999 100 -} -a { - s 0 - b 14802 14002 - e 14809 14009 - l 14802 14002 14809 14009 100 -} -a { - s 0 - b 9740 14008 - e 9747 14015 - l 9740 14008 9747 14015 100 -} -a { - s 0 - b 4475 14012 - e 4482 14019 - l 4475 14012 4482 14019 100 -} -a { - s 0 - b 14741 14013 - e 14748 14020 - l 14741 14013 14748 14020 100 -} -a { - s 0 - b 6439 14013 - e 6446 14020 - l 6439 14013 6446 14020 100 -} -a { - s 0 - b 6440 14014 - e 6447 14021 - l 6440 14014 6447 14021 100 -} -a { - s 0 - b 8586 14038 - e 8593 14045 - l 8586 14038 8593 14045 100 -} -a { - s 0 - b 8884 14039 - e 8891 14046 - l 8884 14039 8891 14046 100 -} -a { - s 0 - b 14075 14041 - e 14082 14048 - l 14075 14041 14082 14048 100 -} -a { - s 0 - b 14076 14042 - e 14083 14049 - l 14076 14042 14083 14049 100 -} -a { - s 0 - b 14077 14043 - e 14084 14050 - l 14077 14043 14084 14050 100 -} -a { - s 0 - b 14078 14044 - e 14085 14051 - l 14078 14044 14085 14051 100 -} -a { - s 0 - b 14079 14045 - e 14086 14052 - l 14079 14045 14086 14052 100 -} -a { - s 0 - b 7542 14061 - e 7549 14068 - l 7542 14061 7549 14068 100 -} -a { - s 0 - b 7543 14062 - e 7550 14069 - l 7543 14062 7550 14069 100 -} -a { - s 0 - b 7544 14063 - e 7551 14070 - l 7544 14063 7551 14070 100 -} 
-a { - s 0 - b 10993 14077 - e 11000 14084 - l 10993 14077 11000 14084 100 -} -a { - s 0 - b 7656 14093 - e 7663 14100 - l 7656 14093 7663 14100 100 -} -a { - s 0 - b 7657 14094 - e 7664 14101 - l 7657 14094 7664 14101 100 -} -a { - s 0 - b 7658 14095 - e 7665 14102 - l 7658 14095 7665 14102 100 -} -a { - s 0 - b 17833 14115 - e 17840 14122 - l 17833 14115 17840 14122 100 -} -a { - s 0 - b 4976 14115 - e 4983 14122 - l 4976 14115 4983 14122 100 -} -a { - s 0 - b 6396 14116 - e 6403 14123 - l 6396 14116 6403 14123 100 -} -a { - s 0 - b 15984 14118 - e 15991 14125 - l 15984 14118 15991 14125 100 -} -a { - s 0 - b 15985 14119 - e 15992 14126 - l 15985 14119 15992 14126 100 -} -a { - s 0 - b 15986 14120 - e 15993 14127 - l 15986 14120 15993 14127 100 -} -a { - s 0 - b 8775 14131 - e 8782 14138 - l 8775 14131 8782 14138 100 -} -a { - s 0 - b 15342 14136 - e 15349 14143 - l 15342 14136 15349 14143 100 -} -a { - s 0 - b 5033 14276 - e 5040 14283 - l 5033 14276 5040 14283 100 -} -a { - s 0 - b 5034 14277 - e 5041 14284 - l 5034 14277 5041 14284 100 -} -a { - s 0 - b 9343 14283 - e 9350 14290 - l 9343 14283 9350 14290 100 -} -a { - s 0 - b 15331 14285 - e 15338 14292 - l 15331 14285 15338 14292 100 -} -a { - s 0 - b 8773 14291 - e 8780 14298 - l 8773 14291 8780 14298 100 -} -a { - s 0 - b 9401 14314 - e 9408 14321 - l 9401 14314 9408 14321 100 -} -a { - s 0 - b 6857 14314 - e 6864 14321 - l 6857 14314 6864 14321 100 -} -a { - s 0 - b 9402 14315 - e 9409 14322 - l 9402 14315 9409 14322 100 -} -a { - s 0 - b 3901 14319 - e 3908 14326 - l 3901 14319 3908 14326 100 -} -a { - s 0 - b 5148 14329 - e 5155 14336 - l 5148 14329 5155 14336 100 -} -a { - s 0 - b 5149 14330 - e 5156 14337 - l 5149 14330 5156 14337 100 -} -a { - s 0 - b 5150 14331 - e 5157 14338 - l 5150 14331 5157 14338 100 -} -a { - s 0 - b 9621 14334 - e 9628 14341 - l 9621 14334 9628 14341 100 -} -a { - s 0 - b 12617 14343 - e 12624 14350 - l 12617 14343 12624 14350 100 -} -a { - s 0 - b 11188 14357 - e 11195 14364 
- l 11188 14357 11195 14364 100 -} -a { - s 0 - b 10996 14358 - e 11003 14365 - l 10996 14358 11003 14365 100 -} -a { - s 0 - b 4992 14363 - e 4999 14370 - l 4992 14363 4999 14370 100 -} -a { - s 0 - b 4993 14364 - e 5000 14371 - l 4993 14364 5000 14371 100 -} -a { - s 0 - b 14634 14367 - e 14641 14374 - l 14634 14367 14641 14374 100 -} -a { - s 0 - b 14222 14370 - e 14229 14377 - l 14222 14370 14229 14377 100 -} -a { - s 0 - b 8636 14371 - e 8643 14378 - l 8636 14371 8643 14378 100 -} -a { - s 0 - b 6784 14374 - e 6791 14381 - l 6784 14374 6791 14381 100 -} -a { - s 0 - b 6383 14392 - e 6390 14399 - l 6383 14392 6390 14399 100 -} -a { - s 0 - b 4932 14393 - e 4939 14400 - l 4932 14393 4939 14400 100 -} -a { - s 0 - b 4933 14394 - e 4940 14401 - l 4933 14394 4940 14401 100 -} -a { - s 0 - b 4934 14395 - e 4941 14402 - l 4934 14395 4941 14402 100 -} -a { - s 0 - b 17290 14397 - e 17297 14404 - l 17290 14397 17297 14404 100 -} -a { - s 0 - b 10856 14399 - e 10863 14406 - l 10856 14399 10863 14406 100 -} -a { - s 0 - b 157 14399 - e 164 14406 - l 157 14399 164 14406 100 -} -a { - s 0 - b 10857 14400 - e 10864 14407 - l 10857 14400 10864 14407 100 -} -a { - s 0 - b 3395 14400 - e 3402 14407 - l 3395 14400 3402 14407 100 -} -a { - s 0 - b 158 14400 - e 165 14407 - l 158 14400 165 14407 100 -} -a { - s 0 - b 159 14401 - e 166 14408 - l 159 14401 166 14408 100 -} -a { - s 0 - b 160 14402 - e 167 14409 - l 160 14402 167 14409 100 -} -a { - s 0 - b 9413 14406 - e 9420 14413 - l 9413 14406 9420 14413 100 -} -a { - s 0 - b 8311 14414 - e 8318 14421 - l 8311 14414 8318 14421 100 -} -a { - s 0 - b 8312 14415 - e 8319 14422 - l 8312 14415 8319 14422 100 -} -a { - s 0 - b 8313 14416 - e 8320 14423 - l 8313 14416 8320 14423 100 -} -a { - s 0 - b 8314 14417 - e 8321 14424 - l 8314 14417 8321 14424 100 -} -a { - s 0 - b 8315 14418 - e 8322 14425 - l 8315 14418 8322 14425 100 -} -a { - s 0 - b 10910 14432 - e 10917 14439 - l 10910 14432 10917 14439 100 -} -a { - s 0 - b 10911 14433 - 
e 10918 14440 - l 10911 14433 10918 14440 100 -} -a { - s 0 - b 15098 14436 - e 15105 14443 - l 15098 14436 15105 14443 100 -} -a { - s 0 - b 7521 14444 - e 7528 14451 - l 7521 14444 7528 14451 100 -} -a { - s 0 - b 7522 14445 - e 7529 14452 - l 7522 14445 7529 14452 100 -} -a { - s 0 - b 7523 14446 - e 7530 14453 - l 7523 14446 7530 14453 100 -} -a { - s 0 - b 3197 14462 - e 3204 14469 - l 3197 14462 3204 14469 100 -} -a { - s 0 - b 7638 14475 - e 7645 14482 - l 7638 14475 7645 14482 100 -} -a { - s 0 - b 3878 14492 - e 3885 14499 - l 3878 14492 3885 14499 100 -} -a { - s 0 - b 9284 14494 - e 9291 14501 - l 9284 14494 9291 14501 100 -} -a { - s 0 - b 6327 14494 - e 6334 14501 - l 6327 14494 6334 14501 100 -} -a { - s 0 - b 6328 14495 - e 6335 14502 - l 6328 14495 6335 14502 100 -} -a { - s 0 - b 6329 14496 - e 6336 14503 - l 6329 14496 6336 14503 100 -} -a { - s 0 - b 4959 14501 - e 4966 14508 - l 4959 14501 4966 14508 100 -} -a { - s 0 - b 8300 14519 - e 8307 14526 - l 8300 14519 8307 14526 100 -} -a { - s 0 - b 8582 14522 - e 8589 14529 - l 8582 14522 8589 14529 100 -} -a { - s 0 - b 15296 14523 - e 15303 14530 - l 15296 14523 15303 14530 100 -} -a { - s 0 - b 13393 14528 - e 13400 14535 - l 13393 14528 13400 14535 100 -} -a { - s 0 - b 13394 14529 - e 13401 14536 - l 13394 14529 13401 14536 100 -} -a { - s 0 - b 17983 14534 - e 17990 14541 - l 17983 14534 17990 14541 100 -} -a { - s 0 - b 8150 14534 - e 8157 14541 - l 8150 14534 8157 14541 100 -} -a { - s 0 - b 8280 14536 - e 8287 14543 - l 8280 14536 8287 14543 100 -} -a { - s 0 - b 5654 14540 - e 5661 14547 - l 5654 14540 5661 14547 100 -} -a { - s 0 - b 17351 14542 - e 17358 14549 - l 17351 14542 17358 14549 100 -} -a { - s 0 - b 8677 14567 - e 8684 14574 - l 8677 14567 8684 14574 100 -} -a { - s 0 - b 14625 14570 - e 14632 14577 - l 14625 14570 14632 14577 100 -} -a { - s 0 - b 18494 14585 - e 18501 14592 - l 18494 14585 18501 14592 100 -} -a { - s 0 - b 18495 14586 - e 18502 14593 - l 18495 14586 18502 
14593 100 -} -a { - s 0 - b 17297 14610 - e 17304 14617 - l 17297 14610 17304 14617 100 -} -a { - s 0 - b 8265 14617 - e 8272 14624 - l 8265 14617 8272 14624 100 -} -a { - s 0 - b 14064 14619 - e 14071 14626 - l 14064 14619 14071 14626 100 -} -a { - s 0 - b 6912 14621 - e 6919 14628 - l 6912 14621 6919 14628 100 -} -a { - s 0 - b 4795 14624 - e 4802 14631 - l 4795 14624 4802 14631 100 -} -a { - s 0 - b 18662 14630 - e 18669 14637 - l 18662 14630 18669 14637 100 -} -a { - s 0 - b 17545 14634 - e 17552 14641 - l 17545 14634 17552 14641 100 -} -a { - s 0 - b 14106 14634 - e 14113 14641 - l 14106 14634 14113 14641 100 -} -a { - s 0 - b 3407 14634 - e 3414 14641 - l 3407 14634 3414 14641 100 -} -a { - s 0 - b 17546 14635 - e 17553 14642 - l 17546 14635 17553 14642 100 -} -a { - s 0 - b 3071 14644 - e 3078 14651 - l 3071 14644 3078 14651 100 -} -a { - s 0 - b 3072 14645 - e 3079 14652 - l 3072 14645 3079 14652 100 -} -a { - s 0 - b 8562 14646 - e 8569 14653 - l 8562 14646 8569 14653 100 -} -a { - s 0 - b 3073 14646 - e 3080 14653 - l 3073 14646 3080 14653 100 -} -a { - s 0 - b 3245 14657 - e 3252 14664 - l 3245 14657 3252 14664 100 -} -a { - s 0 - b 3246 14658 - e 3253 14665 - l 3246 14658 3253 14665 100 -} -a { - s 0 - b 3247 14659 - e 3254 14666 - l 3247 14659 3254 14666 100 -} -a { - s 0 - b 4610 14675 - e 4617 14682 - l 4610 14675 4617 14682 100 -} -a { - s 0 - b 15971 14680 - e 15978 14687 - l 15971 14680 15978 14687 100 -} -a { - s 0 - b 250 14684 - e 257 14691 - l 250 14684 257 14691 100 -} -a { - s 0 - b 11306 14695 - e 11313 14702 - l 11306 14695 11313 14702 100 -} -a { - s 0 - b 8784 14696 - e 8791 14703 - l 8784 14696 8791 14703 100 -} -a { - s 0 - b 15979 14697 - e 15986 14704 - l 15979 14697 15986 14704 100 -} -a { - s 0 - b 15170 14710 - e 15177 14717 - l 15170 14710 15177 14717 100 -} -a { - s 0 - b 2972 14715 - e 2979 14722 - l 2972 14715 2979 14722 100 -} -a { - s 0 - b 6773 14730 - e 6780 14737 - l 6773 14730 6780 14737 100 -} -a { - s 0 - b 8338 14732 
- e 8345 14739 - l 8338 14732 8345 14739 100 -} -a { - s 0 - b 4947 14733 - e 4954 14740 - l 4947 14733 4954 14740 100 -} -a { - s 0 - b 9277 14737 - e 9284 14744 - l 9277 14737 9284 14744 100 -} -a { - s 0 - b 14250 14741 - e 14257 14748 - l 14250 14741 14257 14748 100 -} -a { - s 0 - b 14251 14742 - e 14258 14749 - l 14251 14742 14258 14749 100 -} -a { - s 0 - b 18761 14747 - e 18768 14754 - l 18761 14747 18768 14754 100 -} -a { - s 0 - b 14034 14750 - e 14041 14757 - l 14034 14750 14041 14757 100 -} -a { - s 0 - b 14035 14751 - e 14042 14758 - l 14035 14751 14042 14758 100 -} -a { - s 0 - b 14036 14752 - e 14043 14759 - l 14036 14752 14043 14759 100 -} -a { - s 0 - b 14037 14753 - e 14044 14760 - l 14037 14753 14044 14760 100 -} -a { - s 0 - b 7502 14761 - e 7509 14768 - l 7502 14761 7509 14768 100 -} -a { - s 0 - b 16008 14767 - e 16015 14774 - l 16008 14767 16015 14774 100 -} -a { - s 0 - b 8423 14768 - e 8430 14775 - l 8423 14768 8430 14775 100 -} -a { - s 0 - b 9425 14771 - e 9432 14778 - l 9425 14771 9432 14778 100 -} -a { - s 0 - b 71 14779 - e 78 14786 - l 71 14779 78 14786 100 -} -a { - s 0 - b 14476 14841 - e 14483 14848 - l 14476 14841 14483 14848 100 -} -a { - s 0 - b 14477 14842 - e 14484 14849 - l 14477 14842 14484 14849 100 -} -a { - s 0 - b 11371 14868 - e 11378 14875 - l 11371 14868 11378 14875 100 -} -a { - s 0 - b 11372 14869 - e 11379 14876 - l 11372 14869 11379 14876 100 -} -a { - s 0 - b 8791 14891 - e 8798 14898 - l 8791 14891 8798 14898 100 -} -a { - s 0 - b 8445 14891 - e 8452 14898 - l 8445 14891 8452 14898 100 -} -a { - s 0 - b 14019 14892 - e 14026 14899 - l 14019 14892 14026 14899 100 -} -a { - s 0 - b 5658 14893 - e 5665 14900 - l 5658 14893 5665 14900 100 -} -a { - s 0 - b 8642 14894 - e 8649 14901 - l 8642 14894 8649 14901 100 -} -a { - s 0 - b 8643 14895 - e 8650 14902 - l 8643 14895 8650 14902 100 -} -a { - s 0 - b 4401 14896 - e 4408 14903 - l 4401 14896 4408 14903 100 -} -a { - s 0 - b 6637 14905 - e 6644 14912 - l 6637 14905 
6644 14912 100 -} -a { - s 0 - b 6638 14906 - e 6645 14913 - l 6638 14906 6645 14913 100 -} -a { - s 0 - b 6639 14907 - e 6646 14914 - l 6639 14907 6646 14914 100 -} -a { - s 0 - b 9352 14908 - e 9359 14915 - l 9352 14908 9359 14915 100 -} -a { - s 0 - b 13950 14911 - e 13957 14918 - l 13950 14911 13957 14918 100 -} -a { - s 0 - b 6900 14913 - e 6907 14920 - l 6900 14913 6907 14920 100 -} -a { - s 0 - b 8619 14916 - e 8626 14923 - l 8619 14916 8626 14923 100 -} -a { - s 0 - b 8032 14916 - e 8039 14923 - l 8032 14916 8039 14923 100 -} -a { - s 0 - b 8033 14917 - e 8040 14924 - l 8033 14917 8040 14924 100 -} -a { - s 0 - b 9704 14918 - e 9711 14925 - l 9704 14918 9711 14925 100 -} -a { - s 0 - b 14679 14919 - e 14686 14926 - l 14679 14919 14686 14926 100 -} -a { - s 0 - b 9705 14919 - e 9712 14926 - l 9705 14919 9712 14926 100 -} -a { - s 0 - b 9706 14920 - e 9713 14927 - l 9706 14920 9713 14927 100 -} -a { - s 0 - b 8421 14932 - e 8428 14939 - l 8421 14932 8428 14939 100 -} -a { - s 0 - b 8422 14933 - e 8429 14940 - l 8422 14933 8429 14940 100 -} -a { - s 0 - b 16009 14934 - e 16016 14941 - l 16009 14934 16016 14941 100 -} -a { - s 0 - b 17496 14937 - e 17503 14944 - l 17496 14937 17503 14944 100 -} -a { - s 0 - b 9273 14941 - e 9280 14948 - l 9273 14941 9280 14948 100 -} -a { - s 0 - b 16063 14957 - e 16070 14964 - l 16063 14957 16070 14964 100 -} -a { - s 0 - b 15299 14962 - e 15306 14969 - l 15299 14962 15306 14969 100 -} -a { - s 0 - b 6722 14978 - e 6729 14985 - l 6722 14978 6729 14985 100 -} -a { - s 0 - b 6723 14979 - e 6730 14986 - l 6723 14979 6730 14986 100 -} -a { - s 0 - b 18545 14984 - e 18552 14991 - l 18545 14984 18552 14991 100 -} -a { - s 0 - b 11489 14984 - e 11496 14991 - l 11489 14984 11496 14991 100 -} -a { - s 0 - b 15368 14988 - e 15375 14995 - l 15368 14988 15375 14995 100 -} -a { - s 0 - b 3893 14992 - e 3900 14999 - l 3893 14992 3900 14999 100 -} -a { - s 0 - b 3894 14993 - e 3901 15000 - l 3894 14993 3901 15000 100 -} -a { - s 0 - b 12223 
14994 - e 12230 15001 - l 12223 14994 12230 15001 100 -} -a { - s 0 - b 8834 14996 - e 8841 15003 - l 8834 14996 8841 15003 100 -} -a { - s 0 - b 9432 15012 - e 9439 15019 - l 9432 15012 9439 15019 100 -} -a { - s 0 - b 5115 15021 - e 5122 15028 - l 5115 15021 5122 15028 100 -} -a { - s 0 - b 14356 15037 - e 14363 15044 - l 14356 15037 14363 15044 100 -} -a { - s 0 - b 14279 15063 - e 14286 15070 - l 14279 15063 14286 15070 100 -} -a { - s 0 - b 8275 15068 - e 8282 15075 - l 8275 15068 8282 15075 100 -} -a { - s 0 - b 14298 15071 - e 14305 15078 - l 14298 15071 14305 15078 100 -} -a { - s 0 - b 5024 15072 - e 5031 15079 - l 5024 15072 5031 15079 100 -} -a { - s 0 - b 5621 15080 - e 5628 15087 - l 5621 15080 5628 15087 100 -} -a { - s 0 - b 16041 15088 - e 16048 15095 - l 16041 15088 16048 15095 100 -} -a { - s 0 - b 18631 15093 - e 18638 15100 - l 18631 15093 18638 15100 100 -} -a { - s 0 - b 17550 15109 - e 17557 15116 - l 17550 15109 17557 15116 100 -} -a { - s 0 - b 7778 15109 - e 7785 15116 - l 7778 15109 7785 15116 100 -} -a { - s 0 - b 14893 15110 - e 14900 15117 - l 14893 15110 14900 15117 100 -} -a { - s 0 - b 9309 15126 - e 9316 15133 - l 9309 15126 9316 15133 100 -} -a { - s 0 - b 17385 15128 - e 17392 15135 - l 17385 15128 17392 15135 100 -} -a { - s 0 - b 15989 15145 - e 15996 15152 - l 15989 15145 15996 15152 100 -} -a { - s 0 - b 17540 15150 - e 17547 15157 - l 17540 15150 17547 15157 100 -} -a { - s 0 - b 3343 15163 - e 3350 15170 - l 3343 15163 3350 15170 100 -} -a { - s 0 - b 9058 15184 - e 9065 15191 - l 9058 15184 9065 15191 100 -} -a { - s 0 - b 14217 15189 - e 14224 15196 - l 14217 15189 14224 15196 100 -} -a { - s 0 - b 7572 15196 - e 7579 15203 - l 7572 15196 7579 15203 100 -} -a { - s 0 - b 8057 15199 - e 8064 15206 - l 8057 15199 8064 15206 100 -} -a { - s 0 - b 15970 15221 - e 15977 15228 - l 15970 15221 15977 15228 100 -} -a { - s 0 - b 15971 15222 - e 15978 15229 - l 15971 15222 15978 15229 100 -} -a { - s 0 - b 15172 15224 - e 15179 
15231 - l 15172 15224 15179 15231 100 -} -a { - s 0 - b 4923 15234 - e 4930 15241 - l 4923 15234 4930 15241 100 -} -a { - s 0 - b 4047 15234 - e 4054 15241 - l 4047 15234 4054 15241 100 -} -a { - s 0 - b 13433 15235 - e 13440 15242 - l 13433 15235 13440 15242 100 -} -a { - s 0 - b 96 15281 - e 103 15288 - l 96 15281 103 15288 100 -} -a { - s 0 - b 14313 15283 - e 14320 15290 - l 14313 15283 14320 15290 100 -} -a { - s 0 - b 13448 15283 - e 13455 15290 - l 13448 15283 13455 15290 100 -} -a { - s 0 - b 6605 15283 - e 6612 15290 - l 6605 15283 6612 15290 100 -} -a { - s 0 - b 13449 15284 - e 13456 15291 - l 13449 15284 13456 15291 100 -} -a { - s 0 - b 14214 15286 - e 14221 15293 - l 14214 15286 14221 15293 100 -} -a { - s 0 - b 13914 15290 - e 13921 15297 - l 13914 15290 13921 15297 100 -} -a { - s 0 - b 13915 15291 - e 13922 15298 - l 13915 15291 13922 15298 100 -} -a { - s 0 - b 11449 15296 - e 11456 15303 - l 11449 15296 11456 15303 100 -} -a { - s 0 - b 8195 15296 - e 8202 15303 - l 8195 15296 8202 15303 100 -} -a { - s 0 - b 8560 15301 - e 8567 15308 - l 8560 15301 8567 15308 100 -} -a { - s 0 - b 12209 15312 - e 12216 15319 - l 12209 15312 12216 15319 100 -} -a { - s 0 - b 12210 15313 - e 12217 15320 - l 12210 15313 12217 15320 100 -} -a { - s 0 - b 18419 15324 - e 18426 15331 - l 18419 15324 18426 15331 100 -} -a { - s 0 - b 7434 15328 - e 7441 15335 - l 7434 15328 7441 15335 100 -} -a { - s 0 - b 7987 15340 - e 7994 15347 - l 7987 15340 7994 15347 100 -} -a { - s 0 - b 15245 15362 - e 15252 15369 - l 15245 15362 15252 15369 100 -} -a { - s 0 - b 15246 15363 - e 15253 15370 - l 15246 15363 15253 15370 100 -} -a { - s 0 - b 15247 15364 - e 15254 15371 - l 15247 15364 15254 15371 100 -} -a { - s 0 - b 15248 15365 - e 15255 15372 - l 15248 15365 15255 15372 100 -} -a { - s 0 - b 15249 15366 - e 15256 15373 - l 15249 15366 15256 15373 100 -} -a { - s 0 - b 16026 15367 - e 16033 15374 - l 16026 15367 16033 15374 100 -} -a { - s 0 - b 8541 15368 - e 8548 15375 - l 
8541 15368 8548 15375 100 -} -a { - s 0 - b 14672 15374 - e 14679 15381 - l 14672 15374 14679 15381 100 -} -a { - s 0 - b 3984 15374 - e 3991 15381 - l 3984 15374 3991 15381 100 -} -a { - s 0 - b 14673 15375 - e 14680 15382 - l 14673 15375 14680 15382 100 -} -a { - s 0 - b 8551 15377 - e 8558 15384 - l 8551 15377 8558 15384 100 -} -a { - s 0 - b 14317 15381 - e 14324 15388 - l 14317 15381 14324 15388 100 -} -a { - s 0 - b 11475 15381 - e 11482 15388 - l 11475 15381 11482 15388 100 -} -a { - s 0 - b 11476 15382 - e 11483 15389 - l 11476 15382 11483 15389 100 -} -a { - s 0 - b 14369 15383 - e 14376 15390 - l 14369 15383 14376 15390 100 -} -a { - s 0 - b 16136 15386 - e 16143 15393 - l 16136 15386 16143 15393 100 -} -a { - s 0 - b 203 15387 - e 210 15394 - l 203 15387 210 15394 100 -} -a { - s 0 - b 6248 15401 - e 6255 15408 - l 6248 15401 6255 15408 100 -} -a { - s 0 - b 9455 15402 - e 9462 15409 - l 9455 15402 9462 15409 100 -} -a { - s 0 - b 6726 15402 - e 6733 15409 - l 6726 15402 6733 15409 100 -} -a { - s 0 - b 17452 15410 - e 17459 15417 - l 17452 15410 17459 15417 100 -} -a { - s 0 - b 13386 15410 - e 13393 15417 - l 13386 15410 13393 15417 100 -} -a { - s 0 - b 17567 15412 - e 17574 15419 - l 17567 15412 17574 15419 100 -} -a { - s 0 - b 17568 15413 - e 17575 15420 - l 17568 15413 17575 15420 100 -} -a { - s 0 - b 18426 15421 - e 18433 15428 - l 18426 15421 18433 15428 100 -} -a { - s 0 - b 18427 15422 - e 18434 15429 - l 18427 15422 18434 15429 100 -} -a { - s 0 - b 14180 15423 - e 14187 15430 - l 14180 15423 14187 15430 100 -} -a { - s 0 - b 14181 15424 - e 14188 15431 - l 14181 15424 14188 15431 100 -} -a { - s 0 - b 14182 15425 - e 14189 15432 - l 14182 15425 14189 15432 100 -} -a { - s 0 - b 3942 15426 - e 3949 15433 - l 3942 15426 3949 15433 100 -} -a { - s 0 - b 14071 15438 - e 14078 15445 - l 14071 15438 14078 15445 100 -} -a { - s 0 - b 3078 15443 - e 3085 15450 - l 3078 15443 3085 15450 100 -} -a { - s 0 - b 16070 15456 - e 16077 15463 - l 16070 
15456 16077 15463 100 -} -a { - s 0 - b 6802 15467 - e 6809 15474 - l 6802 15467 6809 15474 100 -} -a { - s 0 - b 6803 15468 - e 6810 15475 - l 6803 15468 6810 15475 100 -} -a { - s 0 - b 13449 15469 - e 13456 15476 - l 13449 15469 13456 15476 100 -} -a { - s 0 - b 6305 15479 - e 6312 15486 - l 6305 15479 6312 15486 100 -} -a { - s 0 - b 10837 15529 - e 10844 15536 - l 10837 15529 10844 15536 100 -} -a { - s 0 - b 10838 15530 - e 10845 15537 - l 10838 15530 10845 15537 100 -} -a { - s 0 - b 8356 15530 - e 8363 15537 - l 8356 15530 8363 15537 100 -} -a { - s 0 - b 3244 15532 - e 3251 15539 - l 3244 15532 3251 15539 100 -} -a { - s 0 - b 13951 15559 - e 13958 15566 - l 13951 15559 13958 15566 100 -} -a { - s 0 - b 10974 15560 - e 10981 15567 - l 10974 15560 10981 15567 100 -} -a { - s 0 - b 235 15574 - e 242 15581 - l 235 15574 242 15581 100 -} -a { - s 0 - b 8978 15575 - e 8985 15582 - l 8978 15575 8985 15582 100 -} -a { - s 0 - b 236 15575 - e 243 15582 - l 236 15575 243 15582 100 -} -a { - s 0 - b 197 15575 - e 204 15582 - l 197 15575 204 15582 100 -} -a { - s 0 - b 8979 15576 - e 8986 15583 - l 8979 15576 8986 15583 100 -} -a { - s 0 - b 7659 15576 - e 7666 15583 - l 7659 15576 7666 15583 100 -} -a { - s 0 - b 237 15576 - e 244 15583 - l 237 15576 244 15583 100 -} -a { - s 0 - b 238 15577 - e 245 15584 - l 238 15577 245 15584 100 -} -a { - s 0 - b 18589 15588 - e 18596 15595 - l 18589 15588 18596 15595 100 -} -a { - s 0 - b 3027 15588 - e 3034 15595 - l 3027 15588 3034 15595 100 -} -a { - s 0 - b 8310 15594 - e 8317 15601 - l 8310 15594 8317 15601 100 -} -a { - s 0 - b 3577 15594 - e 3584 15601 - l 3577 15594 3584 15601 100 -} -a { - s 0 - b 12610 15597 - e 12617 15604 - l 12610 15597 12617 15604 100 -} -a { - s 0 - b 12611 15598 - e 12618 15605 - l 12611 15598 12618 15605 100 -} -a { - s 0 - b 12612 15599 - e 12619 15606 - l 12612 15599 12619 15606 100 -} -a { - s 0 - b 3022 15604 - e 3029 15611 - l 3022 15604 3029 15611 100 -} -a { - s 0 - b 302 15604 - e 309 
15611 - l 302 15604 309 15611 100 -} -a { - s 0 - b 4929 15611 - e 4936 15618 - l 4929 15611 4936 15618 100 -} -a { - s 0 - b 12596 15623 - e 12603 15630 - l 12596 15623 12603 15630 100 -} -a { - s 0 - b 11274 15643 - e 11281 15650 - l 11274 15643 11281 15650 100 -} -a { - s 0 - b 135 15647 - e 142 15654 - l 135 15647 142 15654 100 -} -a { - s 0 - b 118 15649 - e 125 15656 - l 118 15649 125 15656 100 -} -a { - s 0 - b 6347 15651 - e 6354 15658 - l 6347 15651 6354 15658 100 -} -a { - s 0 - b 18710 15652 - e 18717 15659 - l 18710 15652 18717 15659 100 -} -a { - s 0 - b 10860 15674 - e 10867 15681 - l 10860 15674 10867 15681 100 -} -a { - s 0 - b 17225 16145 - e 17232 16152 - l 17225 16145 17232 16152 100 -} -a { - s 0 - b 14830 16145 - e 14837 16152 - l 14830 16145 14837 16152 100 -} -a { - s 0 - b 11065 16145 - e 11072 16152 - l 11065 16145 11072 16152 100 -} -a { - s 0 - b 14559 16160 - e 14566 16167 - l 14559 16160 14566 16167 100 -} -a { - s 0 - b 14134 16160 - e 14141 16167 - l 14134 16160 14141 16167 100 -} -a { - s 0 - b 9064 16165 - e 9071 16172 - l 9064 16165 9071 16172 100 -} -a { - s 0 - b 4432 16192 - e 4439 16199 - l 4432 16192 4439 16199 100 -} -a { - s 0 - b 4433 16193 - e 4440 16200 - l 4433 16193 4440 16200 100 -} -a { - s 0 - b 4434 16194 - e 4441 16201 - l 4434 16194 4441 16201 100 -} -a { - s 0 - b 4385 16214 - e 4392 16221 - l 4385 16214 4392 16221 100 -} -a { - s 0 - b 8507 16217 - e 8514 16224 - l 8507 16217 8514 16224 100 -} -a { - s 0 - b 5661 16217 - e 5668 16224 - l 5661 16217 5668 16224 100 -} -a { - s 0 - b 14506 16218 - e 14513 16225 - l 14506 16218 14513 16225 100 -} -a { - s 0 - b 8508 16218 - e 8515 16225 - l 8508 16218 8515 16225 100 -} -a { - s 0 - b 9605 16222 - e 9612 16229 - l 9605 16222 9612 16229 100 -} -a { - s 0 - b 5601 16222 - e 5608 16229 - l 5601 16222 5608 16229 100 -} -a { - s 0 - b 7683 16259 - e 7690 16266 - l 7683 16259 7690 16266 100 -} -a { - s 0 - b 12201 16264 - e 12208 16271 - l 12201 16264 12208 16271 100 -} -a 
{ - s 0 - b 8425 16264 - e 8432 16271 - l 8425 16264 8432 16271 100 -} -a { - s 0 - b 12202 16265 - e 12209 16272 - l 12202 16265 12209 16272 100 -} -a { - s 0 - b 14491 16268 - e 14498 16275 - l 14491 16268 14498 16275 100 -} -a { - s 0 - b 15107 16276 - e 15114 16283 - l 15107 16276 15114 16283 100 -} -a { - s 0 - b 2998 16276 - e 3005 16283 - l 2998 16276 3005 16283 100 -} -a { - s 0 - b 17438 16284 - e 17445 16291 - l 17438 16284 17445 16291 100 -} -a { - s 0 - b 7647 16292 - e 7654 16299 - l 7647 16292 7654 16299 100 -} -a { - s 0 - b 17983 17039 - e 17990 17046 - l 17983 17039 17990 17046 100 -} -a { - s 0 - b 8150 17039 - e 8157 17046 - l 8150 17039 8157 17046 100 -} -a { - s 0 - b 17221 17050 - e 17228 17057 - l 17221 17050 17228 17057 100 -} -a { - s 0 - b 16017 17051 - e 16024 17058 - l 16017 17051 16024 17058 100 -} -a { - s 0 - b 3236 17051 - e 3243 17058 - l 3236 17051 3243 17058 100 -} -a { - s 0 - b 7752 17055 - e 7759 17062 - l 7752 17055 7759 17062 100 -} -a { - s 0 - b 11059 17440 - e 11066 17447 - l 11059 17440 11066 17447 100 -} -a { - s 0 - b 4613 17440 - e 4620 17447 - l 4613 17440 4620 17447 100 -} -a { - s 0 - b 11060 17441 - e 11067 17448 - l 11060 17441 11067 17448 100 -} -a { - s 0 - b 11061 17442 - e 11068 17449 - l 11061 17442 11068 17449 100 -} -a { - s 0 - b 14025 17443 - e 14032 17450 - l 14025 17443 14032 17450 100 -} -a { - s 0 - b 16019 17445 - e 16026 17452 - l 16019 17445 16026 17452 100 -} -a { - s 0 - b 7727 17447 - e 7734 17454 - l 7727 17447 7734 17454 100 -} -a { - s 0 - b 7570 17483 - e 7577 17490 - l 7570 17483 7577 17490 100 -} -a { - s 0 - b 8692 17485 - e 8699 17492 - l 8692 17485 8699 17492 100 -} -a { - s 0 - b 14238 17486 - e 14245 17493 - l 14238 17486 14245 17493 100 -} -a { - s 0 - b 14239 17487 - e 14246 17494 - l 14239 17487 14246 17494 100 -} -a { - s 0 - b 14240 17488 - e 14247 17495 - l 14240 17488 14247 17495 100 -} -a { - s 0 - b 5563 17502 - e 5570 17509 - l 5563 17502 5570 17509 100 -} -a { - s 0 - b 
12580 17514 - e 12587 17521 - l 12580 17514 12587 17521 100 -} -a { - s 0 - b 12603 17516 - e 12610 17523 - l 12603 17516 12610 17523 100 -} -a { - s 0 - b 18550 17518 - e 18557 17525 - l 18550 17518 18557 17525 100 -} -a { - s 0 - b 5075 17522 - e 5082 17529 - l 5075 17522 5082 17529 100 -} -a { - s 0 - b 5076 17523 - e 5083 17530 - l 5076 17523 5083 17530 100 -} -a { - s 0 - b 17420 17530 - e 17427 17537 - l 17420 17530 17427 17537 100 -} -a { - s 0 - b 5681 17539 - e 5688 17546 - l 5681 17539 5688 17546 100 -} -a { - s 0 - b 5682 17540 - e 5689 17547 - l 5682 17540 5689 17547 100 -} -a { - s 0 - b 5683 17541 - e 5690 17548 - l 5683 17541 5690 17548 100 -} -a { - s 0 - b 5684 17542 - e 5691 17549 - l 5684 17542 5691 17549 100 -} -a { - s 0 - b 17833 17545 - e 17840 17552 - l 17833 17545 17840 17552 100 -} -a { - s 0 - b 4976 17545 - e 4983 17552 - l 4976 17545 4983 17552 100 -} -a { - s 0 - b 8645 17549 - e 8652 17556 - l 8645 17549 8652 17556 100 -} -a { - s 0 - b 8646 17550 - e 8653 17557 - l 8646 17550 8653 17557 100 -} -a { - s 0 - b 4054 17560 - e 4061 17567 - l 4054 17560 4061 17567 100 -} -a { - s 0 - b 18405 17562 - e 18412 17569 - l 18405 17562 18412 17569 100 -} -a { - s 0 - b 18406 17563 - e 18413 17570 - l 18406 17563 18413 17570 100 -} -a { - s 0 - b 3025 17563 - e 3032 17570 - l 3025 17563 3032 17570 100 -} -a { - s 0 - b 15263 17569 - e 15270 17576 - l 15263 17569 15270 17576 100 -} -a { - s 0 - b 4545 17577 - e 4552 17584 - l 4545 17577 4552 17584 100 -} -a { - s 0 - b 15998 17586 - e 16005 17593 - l 15998 17586 16005 17593 100 -} -a { - s 0 - b 14360 17587 - e 14367 17594 - l 14360 17587 14367 17594 100 -} -a { - s 0 - b 14361 17588 - e 14368 17595 - l 14361 17588 14368 17595 100 -} -a { - s 0 - b 8822 17589 - e 8829 17596 - l 8822 17589 8829 17596 100 -} -a { - s 0 - b 9446 17598 - e 9453 17605 - l 9446 17598 9453 17605 100 -} -a { - s 0 - b 11349 17602 - e 11356 17609 - l 11349 17602 11356 17609 100 -} -a { - s 0 - b 9686 17602 - e 9693 17609 - 
l 9686 17602 9693 17609 100 -} -a { - s 0 - b 14788 17604 - e 14795 17611 - l 14788 17604 14795 17611 100 -} -a { - s 0 - b 14789 17605 - e 14796 17612 - l 14789 17605 14796 17612 100 -} -a { - s 0 - b 5085 17607 - e 5092 17614 - l 5085 17607 5092 17614 100 -} -a { - s 0 - b 5086 17608 - e 5093 17615 - l 5086 17608 5093 17615 100 -} -a { - s 0 - b 8772 17626 - e 8779 17633 - l 8772 17626 8779 17633 100 -} -a { - s 0 - b 7396 17660 - e 7403 17667 - l 7396 17660 7403 17667 100 -} -a { - s 0 - b 6724 17660 - e 6731 17667 - l 6724 17660 6731 17667 100 -} -a { - s 0 - b 5132 17663 - e 5139 17670 - l 5132 17663 5139 17670 100 -} -a { - s 0 - b 3226 17664 - e 3233 17671 - l 3226 17664 3233 17671 100 -} -a { - s 0 - b 3227 17665 - e 3234 17672 - l 3227 17665 3234 17672 100 -} -a { - s 0 - b 14699 17670 - e 14706 17677 - l 14699 17670 14706 17677 100 -} -a { - s 0 - b 11097 17679 - e 11104 17686 - l 11097 17679 11104 17686 100 -} -a { - s 0 - b 3001 17700 - e 3008 17707 - l 3001 17700 3008 17707 100 -} -a { - s 0 - b 17959 17725 - e 17966 17732 - l 17959 17725 17966 17732 100 -} -a { - s 0 - b 3218 17739 - e 3225 17746 - l 3218 17739 3225 17746 100 -} -a { - s 0 - b 10860 17757 - e 10867 17764 - l 10860 17757 10867 17764 100 -} -a { - s 0 - b 5584 17921 - e 5591 17928 - l 5584 17921 5591 17928 100 -} -a { - s 0 - b 203 17940 - e 210 17947 - l 203 17940 210 17947 100 -} -a { - s 0 - b 204 17941 - e 211 17948 - l 204 17941 211 17948 100 -} -a { - s 0 - b 11561 17944 - e 11568 17951 - l 11561 17944 11568 17951 100 -} -a { - s 0 - b 11562 17945 - e 11569 17952 - l 11562 17945 11569 17952 100 -} -a { - s 0 - b 4855 17945 - e 4862 17952 - l 4855 17945 4862 17952 100 -} -a { - s 0 - b 11563 17946 - e 11570 17953 - l 11563 17946 11570 17953 100 -} -a { - s 0 - b 4856 17946 - e 4863 17953 - l 4856 17946 4863 17953 100 -} -a { - s 0 - b 11564 17947 - e 11571 17954 - l 11564 17947 11571 17954 100 -} -a { - s 0 - b 5083 17947 - e 5090 17954 - l 5083 17947 5090 17954 100 -} -a { - s 0 - 
b 11355 17954 - e 11362 17961 - l 11355 17954 11362 17961 100 -} -a { - s 0 - b 6563 17965 - e 6570 17972 - l 6563 17965 6570 17972 100 -} -a { - s 0 - b 11482 17975 - e 11489 17982 - l 11482 17975 11489 17982 100 -} -a { - s 0 - b 11483 17976 - e 11490 17983 - l 11483 17976 11490 17983 100 -} -a { - s 0 - b 15408 17977 - e 15415 17984 - l 15408 17977 15415 17984 100 -} -a { - s 0 - b 4572 17981 - e 4579 17988 - l 4572 17981 4579 17988 100 -} -a { - s 0 - b 4066 17993 - e 4073 18000 - l 4066 17993 4073 18000 100 -} -a { - s 0 - b 11134 18008 - e 11141 18015 - l 11134 18008 11141 18015 100 -} -a { - s 0 - b 14479 18014 - e 14486 18021 - l 14479 18014 14486 18021 100 -} -a { - s 0 - b 11246 18015 - e 11253 18022 - l 11246 18015 11253 18022 100 -} -a { - s 0 - b 3121 18016 - e 3128 18023 - l 3121 18016 3128 18023 100 -} -a { - s 0 - b 8608 18022 - e 8615 18029 - l 8608 18022 8615 18029 100 -} -a { - s 0 - b 10934 18046 - e 10941 18053 - l 10934 18046 10941 18053 100 -} -a { - s 0 - b 6504 18046 - e 6511 18053 - l 6504 18046 6511 18053 100 -} -a { - s 0 - b 4243 18058 - e 4250 18065 - l 4243 18058 4250 18065 100 -} -a { - s 0 - b 4578 18068 - e 4585 18075 - l 4578 18068 4585 18075 100 -} -a { - s 0 - b 3348 18068 - e 3355 18075 - l 3348 18068 3355 18075 100 -} -a { - s 0 - b 6521 18069 - e 6528 18076 - l 6521 18069 6528 18076 100 -} -a { - s 0 - b 3392 18069 - e 3399 18076 - l 3392 18069 3399 18076 100 -} -a { - s 0 - b 15377 18092 - e 15384 18099 - l 15377 18092 15384 18099 100 -} -a { - s 0 - b 14708 18093 - e 14715 18100 - l 14708 18093 14715 18100 100 -} -a { - s 0 - b 17465 18106 - e 17472 18113 - l 17465 18106 17472 18113 100 -} -a { - s 0 - b 8096 18122 - e 8103 18129 - l 8096 18122 8103 18129 100 -} -a { - s 0 - b 11484 18126 - e 11491 18133 - l 11484 18126 11491 18133 100 -} -a { - s 0 - b 11485 18127 - e 11492 18134 - l 11485 18127 11492 18134 100 -} -a { - s 0 - b 11486 18128 - e 11493 18135 - l 11486 18128 11493 18135 100 -} -a { - s 0 - b 6286 18130 - e 
6293 18137 - l 6286 18130 6293 18137 100 -} -a { - s 0 - b 6374 18134 - e 6381 18141 - l 6374 18134 6381 18141 100 -} -a { - s 0 - b 9511 18135 - e 9518 18142 - l 9511 18135 9518 18142 100 -} -a { - s 0 - b 6375 18135 - e 6382 18142 - l 6375 18135 6382 18142 100 -} -a { - s 0 - b 9512 18136 - e 9519 18143 - l 9512 18136 9519 18143 100 -} -a { - s 0 - b 6376 18136 - e 6383 18143 - l 6376 18136 6383 18143 100 -} -a { - s 0 - b 9513 18137 - e 9520 18144 - l 9513 18137 9520 18144 100 -} -a { - s 0 - b 9386 18153 - e 9393 18160 - l 9386 18153 9393 18160 100 -} -a { - s 0 - b 9387 18154 - e 9394 18161 - l 9387 18154 9394 18161 100 -} -a { - s 0 - b 17305 18155 - e 17312 18162 - l 17305 18155 17312 18162 100 -} -a { - s 0 - b 17306 18156 - e 17313 18163 - l 17306 18156 17313 18163 100 -} -a { - s 0 - b 7494 18163 - e 7501 18170 - l 7494 18163 7501 18170 100 -} -a { - s 0 - b 8025 18168 - e 8032 18175 - l 8025 18168 8032 18175 100 -} -a { - s 0 - b 8845 18196 - e 8852 18203 - l 8845 18196 8852 18203 100 -} -a { - s 0 - b 18471 18227 - e 18478 18234 - l 18471 18227 18478 18234 100 -} -a { - s 0 - b 92 18782 - e 99 18789 - l 92 18782 99 18789 100 -} -a { - s 0 - b 10829 18799 - e 10836 18806 - l 10829 18799 10836 18806 100 -} -a { - s 0 - b 10830 18800 - e 10837 18807 - l 10830 18800 10837 18807 100 -} -a { - s 0 - b 10831 18801 - e 10838 18808 - l 10831 18801 10838 18808 100 -} -a { - s 0 - b 10832 18802 - e 10839 18809 - l 10832 18802 10839 18809 100 -} -a { - s 0 - b 10833 18803 - e 10840 18810 - l 10833 18803 10840 18810 100 -} -a { - s 0 - b 15123 18805 - e 15130 18812 - l 15123 18805 15130 18812 100 -} -a { - s 0 - b 15124 18806 - e 15131 18813 - l 15124 18806 15131 18813 100 -} -a { - s 0 - b 5605 18833 - e 5612 18840 - l 5605 18833 5612 18840 100 -} -a { - s 0 - b 5606 18834 - e 5613 18841 - l 5606 18834 5613 18841 100 -} -a { - s 0 - b 18601 18836 - e 18608 18843 - l 18601 18836 18608 18843 100 -} -a { - s 0 - b 6531 18843 - e 6538 18850 - l 6531 18843 6538 18850 
100 -} -a { - s 0 - b 8696 18856 - e 8703 18863 - l 8696 18856 8703 18863 100 -} -a { - s 0 - b 8873 18858 - e 8880 18865 - l 8873 18858 8880 18865 100 -} -a { - s 0 - b 14297 18864 - e 14304 18871 - l 14297 18864 14304 18871 100 -} -a { - s 0 - b 8273 18864 - e 8280 18871 - l 8273 18864 8280 18871 100 -} -a { - s 0 - b 17836 18869 - e 17843 18876 - l 17836 18869 17843 18876 100 -} -a { - s 0 - b 3599 18870 - e 3606 18877 - l 3599 18870 3606 18877 100 -} -a { - s 0 - b 3306 18872 - e 3313 18879 - l 3306 18872 3313 18879 100 -} -a { - s 0 - b 17314 18889 - e 17321 18896 - l 17314 18889 17321 18896 100 -} -a { - s 0 - b 18538 18907 - e 18545 18914 - l 18538 18907 18545 18914 100 -} -a { - s 0 - b 17217 18922 - e 17224 18929 - l 17217 18922 17224 18929 100 -} -a { - s 0 - b 8515 18942 - e 8522 18949 - l 8515 18942 8522 18949 100 -} -a { - s 0 - b 203 18958 - e 210 18965 - l 203 18958 210 18965 100 -} -a { - s 0 - b 8659 18975 - e 8666 18982 - l 8659 18975 8666 18982 100 -} -a { - s 0 - b 7653 18981 - e 7660 18988 - l 7653 18981 7660 18988 100 -} -a { - s 0 - b 15384 18984 - e 15391 18991 - l 15384 18984 15391 18991 100 -} -a { - s 0 - b 3947 18985 - e 3954 18992 - l 3947 18985 3954 18992 100 -} -a { - s 0 - b 10922 18988 - e 10929 18995 - l 10922 18988 10929 18995 100 -} -a { - s 0 - b 8143 18993 - e 8150 19000 - l 8143 18993 8150 19000 100 -} -a { - s 0 - b 3168 19001 - e 3175 19008 - l 3168 19001 3175 19008 100 -} -a { - s 0 - b 17514 19004 - e 17521 19011 - l 17514 19004 17521 19011 100 -} -a { - s 0 - b 210 19014 - e 217 19021 - l 210 19014 217 19021 100 -} -a { - s 0 - b 3072 19019 - e 3079 19026 - l 3072 19019 3079 19026 100 -} -a { - s 0 - b 18776 19032 - e 18783 19039 - l 18776 19032 18783 19039 100 -} -a { - s 0 - b 14306 19032 - e 14313 19039 - l 14306 19032 14313 19039 100 -} -a { - s 0 - b 18777 19033 - e 18784 19040 - l 18777 19033 18784 19040 100 -} -a { - s 0 - b 14307 19033 - e 14314 19040 - l 14307 19033 14314 19040 100 -} -a { - s 0 - b 18778 19034 - 
e 18785 19041 - l 18778 19034 18785 19041 100 -} -a { - s 0 - b 14308 19034 - e 14315 19041 - l 14308 19034 14315 19041 100 -} -a { - s 0 - b 14309 19035 - e 14316 19042 - l 14309 19035 14316 19042 100 -} -a { - s 0 - b 14782 19056 - e 14789 19063 - l 14782 19056 14789 19063 100 -} -a { - s 0 - b 3400 19069 - e 3407 19076 - l 3400 19069 3407 19076 100 -} -a { - s 0 - b 4482 19082 - e 4489 19089 - l 4482 19082 4489 19089 100 -} -a { - s 0 - b 4449 19095 - e 4456 19102 - l 4449 19095 4456 19102 100 -} -a { - s 0 - b 4946 19109 - e 4953 19116 - l 4946 19109 4953 19116 100 -} -a { - s 0 - b 8339 19110 - e 8346 19117 - l 8339 19110 8346 19117 100 -} -a { - s 0 - b 3250 19272 - e 3257 19279 - l 3250 19272 3257 19279 100 -} -a { - s 0 - b 14876 19586 - e 14883 19593 - l 14876 19586 14883 19593 100 -} -a { - s 0 - b 5643 19586 - e 5650 19593 - l 5643 19586 5650 19593 100 -} -a { - s 0 - b 18719 19587 - e 18726 19594 - l 18719 19587 18726 19594 100 -} -a { - s 0 - b 14736 19589 - e 14743 19596 - l 14736 19589 14743 19596 100 -} -a { - s 0 - b 8019 19703 - e 8026 19710 - l 8019 19703 8026 19710 100 -} -a { - s 0 - b 8020 19704 - e 8027 19711 - l 8020 19704 8027 19711 100 -} -a { - s 0 - b 3079 19704 - e 3086 19711 - l 3079 19704 3086 19711 100 -} -a { - s 0 - b 3080 19705 - e 3087 19712 - l 3080 19705 3087 19712 100 -} -a { - s 0 - b 93 19705 - e 100 19712 - l 93 19705 100 19712 100 -} -a { - s 0 - b 3330 19715 - e 3337 19722 - l 3330 19715 3337 19722 100 -} -a { - s 0 - b 4468 19719 - e 4475 19726 - l 4468 19719 4475 19726 100 -} -a { - s 0 - b 4469 19720 - e 4476 19727 - l 4469 19720 4476 19727 100 -} -a { - s 0 - b 14158 19721 - e 14165 19728 - l 14158 19721 14165 19728 100 -} -a { - s 0 - b 17450 19743 - e 17457 19750 - l 17450 19743 17457 19750 100 -} -a { - s 0 - b 11411 19758 - e 11418 19765 - l 11411 19758 11418 19765 100 -} -a { - s 0 - b 9495 19758 - e 9502 19765 - l 9495 19758 9502 19765 100 -} -a { - s 0 - b 9496 19759 - e 9503 19766 - l 9496 19759 9503 19766 100 
-} -a { - s 0 - b 11306 19760 - e 11313 19767 - l 11306 19760 11313 19767 100 -} -a { - s 0 - b 11307 19761 - e 11314 19768 - l 11307 19761 11314 19768 100 -} -a { - s 0 - b 8471 20686 - e 8478 20693 - l 8471 20686 8478 20693 100 -} -a { - s 0 - b 11196 20689 - e 11203 20696 - l 11196 20689 11203 20696 100 -} -a { - s 0 - b 11197 20690 - e 11204 20697 - l 11197 20690 11204 20697 100 -} -a { - s 0 - b 15395 20691 - e 15402 20698 - l 15395 20691 15402 20698 100 -} -a { - s 0 - b 7706 20694 - e 7713 20701 - l 7706 20694 7713 20701 100 -} -a { - s 0 - b 3175 20694 - e 3182 20701 - l 3175 20694 3182 20701 100 -} -a { - s 0 - b 11236 20702 - e 11243 20709 - l 11236 20702 11243 20709 100 -} -a { - s 0 - b 4101 20723 - e 4108 20730 - l 4101 20723 4108 20730 100 -} -a { - s 0 - b 17371 20726 - e 17378 20733 - l 17371 20726 17378 20733 100 -} -a { - s 0 - b 8217 20727 - e 8224 20734 - l 8217 20727 8224 20734 100 -} -a { - s 0 - b 8585 20731 - e 8592 20738 - l 8585 20731 8592 20738 100 -} -a { - s 0 - b 17978 20733 - e 17985 20740 - l 17978 20733 17985 20740 100 -} -a { - s 0 - b 7604 20740 - e 7611 20747 - l 7604 20740 7611 20747 100 -} -a { - s 0 - b 14256 20751 - e 14263 20758 - l 14256 20751 14263 20758 100 -} -a { - s 0 - b 14257 20752 - e 14264 20759 - l 14257 20752 14264 20759 100 -} -a { - s 0 - b 3224 20767 - e 3231 20774 - l 3224 20767 3231 20774 100 -} -a { - s 0 - b 10958 20775 - e 10965 20782 - l 10958 20775 10965 20782 100 -} -a { - s 0 - b 11513 20796 - e 11520 20803 - l 11513 20796 11520 20803 100 -} -a { - s 0 - b 11514 20797 - e 11521 20804 - l 11514 20797 11521 20804 100 -} -a { - s 0 - b 15949 20798 - e 15956 20805 - l 15949 20798 15956 20805 100 -} -a { - s 0 - b 3928 20798 - e 3935 20805 - l 3928 20798 3935 20805 100 -} -a { - s 0 - b 15950 20799 - e 15957 20806 - l 15950 20799 15957 20806 100 -} -a { - s 0 - b 11452 20803 - e 11459 20810 - l 11452 20803 11459 20810 100 -} -a { - s 0 - b 14643 20828 - e 14650 20835 - l 14643 20828 14650 20835 100 -} -a { 
- s 0 - b 7722 20830 - e 7729 20837 - l 7722 20830 7729 20837 100 -} -a { - s 0 - b 8556 20834 - e 8563 20841 - l 8556 20834 8563 20841 100 -} -a { - s 0 - b 15185 20835 - e 15192 20842 - l 15185 20835 15192 20842 100 -} -a { - s 0 - b 99 20835 - e 106 20842 - l 99 20835 106 20842 100 -} -a { - s 0 - b 15186 20836 - e 15193 20843 - l 15186 20836 15193 20843 100 -} -a { - s 0 - b 21 20837 - e 28 20844 - l 21 20837 28 20844 100 -} -a { - s 0 - b 8512 20838 - e 8519 20845 - l 8512 20838 8519 20845 100 -} -a { - s 0 - b 13528 20855 - e 13535 20862 - l 13528 20855 13535 20862 100 -} -a { - s 0 - b 13529 20856 - e 13536 20863 - l 13529 20856 13536 20863 100 -} -a { - s 0 - b 10902 20856 - e 10909 20863 - l 10902 20856 10909 20863 100 -} -a { - s 0 - b 18022 20859 - e 18029 20866 - l 18022 20859 18029 20866 100 -} -a { - s 0 - b 6838 20860 - e 6845 20867 - l 6838 20860 6845 20867 100 -} -a { - s 0 - b 6839 20861 - e 6846 20868 - l 6839 20861 6846 20868 100 -} -a { - s 0 - b 17397 20873 - e 17404 20880 - l 17397 20873 17404 20880 100 -} -a { - s 0 - b 11538 20903 - e 11545 20910 - l 11538 20903 11545 20910 100 -} -a { - s 0 - b 14595 20906 - e 14602 20913 - l 14595 20906 14602 20913 100 -} -a { - s 0 - b 14596 20907 - e 14603 20914 - l 14596 20907 14603 20914 100 -} -a { - s 0 - b 14319 20909 - e 14326 20916 - l 14319 20909 14326 20916 100 -} -a { - s 0 - b 6588 20909 - e 6595 20916 - l 6588 20909 6595 20916 100 -} -a { - s 0 - b 14320 20910 - e 14327 20917 - l 14320 20910 14327 20917 100 -} -a { - s 0 - b 11097 20993 - e 11104 21000 - l 11097 20993 11104 21000 100 -} -a { - s 0 - b 11098 20994 - e 11105 21001 - l 11098 20994 11105 21001 100 -} -a { - s 0 - b 11099 20995 - e 11106 21002 - l 11099 20995 11106 21002 100 -} -a { - s 0 - b 4563 20996 - e 4570 21003 - l 4563 20996 4570 21003 100 -} -a { - s 0 - b 4555 21001 - e 4562 21008 - l 4555 21001 4562 21008 100 -} -a { - s 0 - b 15921 21019 - e 15928 21026 - l 15921 21019 15928 21026 100 -} -a { - s 0 - b 3579 21025 - e 
3586 21032 - l 3579 21025 3586 21032 100 -} -a { - s 0 - b 3167 21034 - e 3174 21041 - l 3167 21034 3174 21041 100 -} -a { - s 0 - b 15119 21075 - e 15126 21082 - l 15119 21075 15126 21082 100 -} -a { - s 0 - b 6508 21075 - e 6515 21082 - l 6508 21075 6515 21082 100 -} -a { - s 0 - b 15120 21076 - e 15127 21083 - l 15120 21076 15127 21083 100 -} -a { - s 0 - b 6356 21078 - e 6363 21085 - l 6356 21078 6363 21085 100 -} -a { - s 0 - b 18566 21080 - e 18573 21087 - l 18566 21080 18573 21087 100 -} -a { - s 0 - b 18768 21082 - e 18775 21089 - l 18768 21082 18775 21089 100 -} -a { - s 0 - b 18769 21083 - e 18776 21090 - l 18769 21083 18776 21090 100 -} -a { - s 0 - b 14838 21083 - e 14845 21090 - l 14838 21083 14845 21090 100 -} -a { - s 0 - b 14839 21084 - e 14846 21091 - l 14839 21084 14846 21091 100 -} -a { - s 0 - b 15907 21085 - e 15914 21092 - l 15907 21085 15914 21092 100 -} -a { - s 0 - b 15908 21086 - e 15915 21093 - l 15908 21086 15915 21093 100 -} -a { - s 0 - b 11310 21093 - e 11317 21100 - l 11310 21093 11317 21100 100 -} -a { - s 0 - b 6822 21095 - e 6829 21102 - l 6822 21095 6829 21102 100 -} -a { - s 0 - b 4380 21095 - e 4387 21102 - l 4380 21095 4387 21102 100 -} -a { - s 0 - b 6823 21096 - e 6830 21103 - l 6823 21096 6830 21103 100 -} -a { - s 0 - b 4381 21096 - e 4388 21103 - l 4381 21096 4388 21103 100 -} -a { - s 0 - b 4382 21097 - e 4389 21104 - l 4382 21097 4389 21104 100 -} -a { - s 0 - b 4383 21098 - e 4390 21105 - l 4383 21098 4390 21105 100 -} -a { - s 0 - b 14189 21099 - e 14196 21106 - l 14189 21099 14196 21106 100 -} -a { - s 0 - b 18777 21118 - e 18784 21125 - l 18777 21118 18784 21125 100 -} -a { - s 0 - b 14307 21118 - e 14314 21125 - l 14307 21118 14314 21125 100 -} -a { - s 0 - b 7449 21122 - e 7456 21129 - l 7449 21122 7456 21129 100 -} -a { - s 0 - b 40 21124 - e 47 21131 - l 40 21124 47 21131 100 -} -a { - s 0 - b 5176 21126 - e 5183 21133 - l 5176 21126 5183 21133 100 -} -a { - s 0 - b 5177 21127 - e 5184 21134 - l 5177 21127 5184 
21134 100 -} -a { - s 0 - b 5178 21128 - e 5185 21135 - l 5178 21128 5185 21135 100 -} -a { - s 0 - b 11340 21135 - e 11347 21142 - l 11340 21135 11347 21142 100 -} -a { - s 0 - b 15306 21140 - e 15313 21147 - l 15306 21140 15313 21147 100 -} -a { - s 0 - b 7742 21182 - e 7749 21189 - l 7742 21182 7749 21189 100 -} -a { - s 0 - b 17321 21183 - e 17328 21190 - l 17321 21183 17328 21190 100 -} -a { - s 0 - b 14935 21202 - e 14942 21209 - l 14935 21202 14942 21209 100 -} -a { - s 0 - b 10938 21215 - e 10945 21222 - l 10938 21215 10945 21222 100 -} -a { - s 0 - b 10939 21216 - e 10946 21223 - l 10939 21216 10946 21223 100 -} -a { - s 0 - b 14514 21220 - e 14521 21227 - l 14514 21220 14521 21227 100 -} -a { - s 0 - b 14515 21221 - e 14522 21228 - l 14515 21221 14522 21228 100 -} -a { - s 0 - b 11086 21243 - e 11093 21250 - l 11086 21243 11093 21250 100 -} -a { - s 0 - b 8309 21244 - e 8316 21251 - l 8309 21244 8316 21251 100 -} -a { - s 0 - b 3576 21244 - e 3583 21251 - l 3576 21244 3583 21251 100 -} -a { - s 0 - b 8843 21245 - e 8850 21252 - l 8843 21245 8850 21252 100 -} -a { - s 0 - b 8844 21246 - e 8851 21253 - l 8844 21246 8851 21253 100 -} -a { - s 0 - b 8845 21247 - e 8852 21254 - l 8845 21247 8852 21254 100 -} -a { - s 0 - b 9411 21248 - e 9418 21255 - l 9411 21248 9418 21255 100 -} -a { - s 0 - b 8846 21248 - e 8853 21255 - l 8846 21248 8853 21255 100 -} -a { - s 0 - b 15388 21249 - e 15395 21256 - l 15388 21249 15395 21256 100 -} -a { - s 0 - b 8797 21259 - e 8804 21266 - l 8797 21259 8804 21266 100 -} -a { - s 0 - b 4935 21273 - e 4942 21280 - l 4935 21273 4942 21280 100 -} -a { - s 0 - b 4936 21274 - e 4943 21281 - l 4936 21274 4943 21281 100 -} -a { - s 0 - b 14806 21275 - e 14813 21282 - l 14806 21275 14813 21282 100 -} -a { - s 0 - b 7601 21277 - e 7608 21284 - l 7601 21277 7608 21284 100 -} -a { - s 0 - b 3354 21299 - e 3361 21306 - l 3354 21299 3361 21306 100 -} -a { - s 0 - b 3355 21300 - e 3362 21307 - l 3355 21300 3362 21307 100 -} -a { - s 0 - b 
3356 21301 - e 3363 21308 - l 3356 21301 3363 21308 100 -} -a { - s 0 - b 3357 21302 - e 3364 21309 - l 3357 21302 3364 21309 100 -} -a { - s 0 - b 3942 21305 - e 3949 21312 - l 3942 21305 3949 21312 100 -} -a { - s 0 - b 9310 21309 - e 9317 21316 - l 9310 21309 9317 21316 100 -} -a { - s 0 - b 14575 21312 - e 14582 21319 - l 14575 21312 14582 21319 100 -} -a { - s 0 - b 4446 21315 - e 4453 21322 - l 4446 21315 4453 21322 100 -} -a { - s 0 - b 18608 21317 - e 18615 21324 - l 18608 21317 18615 21324 100 -} -a { - s 0 - b 11028 21318 - e 11035 21325 - l 11028 21318 11035 21325 100 -} -a { - s 0 - b 11029 21319 - e 11036 21326 - l 11029 21319 11036 21326 100 -} -a { - s 0 - b 11030 21320 - e 11037 21327 - l 11030 21320 11037 21327 100 -} -a { - s 0 - b 6440 21322 - e 6447 21329 - l 6440 21322 6447 21329 100 -} -a { - s 0 - b 6441 21323 - e 6448 21330 - l 6441 21323 6448 21330 100 -} -a { - s 0 - b 6442 21324 - e 6449 21331 - l 6442 21324 6449 21331 100 -} -a { - s 0 - b 6443 21325 - e 6450 21332 - l 6443 21325 6450 21332 100 -} -a { - s 0 - b 17585 21347 - e 17592 21354 - l 17585 21347 17592 21354 100 -} -a { - s 0 - b 18387 21350 - e 18394 21357 - l 18387 21350 18394 21357 100 -} -a { - s 0 - b 11357 21359 - e 11364 21366 - l 11357 21359 11364 21366 100 -} -a { - s 0 - b 7562 21370 - e 7569 21377 - l 7562 21370 7569 21377 100 -} -a { - s 0 - b 4560 21373 - e 4567 21380 - l 4560 21373 4567 21380 100 -} -a { - s 0 - b 3567 21391 - e 3574 21398 - l 3567 21391 3574 21398 100 -} -a { - s 0 - b 11195 21395 - e 11202 21402 - l 11195 21395 11202 21402 100 -} -a { - s 0 - b 11196 21396 - e 11203 21403 - l 11196 21396 11203 21403 100 -} -a { - s 0 - b 11197 21397 - e 11204 21404 - l 11197 21397 11204 21404 100 -} -a { - s 0 - b 15395 21398 - e 15402 21405 - l 15395 21398 15402 21405 100 -} -a { - s 0 - b 8998 21407 - e 9005 21414 - l 8998 21407 9005 21414 100 -} -a { - s 0 - b 8154 21411 - e 8161 21418 - l 8154 21411 8161 21418 100 -} -a { - s 0 - b 14227 21415 - e 14234 21422 
- l 14227 21415 14234 21422 100 -} -a { - s 0 - b 4087 21419 - e 4094 21426 - l 4087 21419 4094 21426 100 -} -a { - s 0 - b 4822 21450 - e 4829 21457 - l 4822 21450 4829 21457 100 -} -a { - s 0 - b 4823 21451 - e 4830 21458 - l 4823 21451 4830 21458 100 -} -a { - s 0 - b 8011 21462 - e 8018 21469 - l 8011 21462 8018 21469 100 -} -a { - s 0 - b 8012 21463 - e 8019 21470 - l 8012 21463 8019 21470 100 -} -a { - s 0 - b 15276 21464 - e 15283 21471 - l 15276 21464 15283 21471 100 -} -a { - s 0 - b 3672 21464 - e 3679 21471 - l 3672 21464 3679 21471 100 -} -a { - s 0 - b 15277 21465 - e 15284 21472 - l 15277 21465 15284 21472 100 -} -a { - s 0 - b 9051 21465 - e 9058 21472 - l 9051 21465 9058 21472 100 -} -a { - s 0 - b 17325 21466 - e 17332 21473 - l 17325 21466 17332 21473 100 -} -a { - s 0 - b 15278 21466 - e 15285 21473 - l 15278 21466 15285 21473 100 -} -a { - s 0 - b 8134 21479 - e 8141 21486 - l 8134 21479 8141 21486 100 -} -a { - s 0 - b 74 21505 - e 81 21512 - l 74 21505 81 21512 100 -} -a { - s 0 - b 17986 21527 - e 17993 21534 - l 17986 21527 17993 21534 100 -} -a { - s 0 - b 11089 21528 - e 11096 21535 - l 11089 21528 11096 21535 100 -} -a { - s 0 - b 16016 21529 - e 16023 21536 - l 16016 21529 16023 21536 100 -} -a { - s 0 - b 16017 21530 - e 16024 21537 - l 16017 21530 16024 21537 100 -} -a { - s 0 - b 3236 21530 - e 3243 21537 - l 3236 21530 3243 21537 100 -} -a { - s 0 - b 16018 21531 - e 16025 21538 - l 16018 21531 16025 21538 100 -} -a { - s 0 - b 8783 21541 - e 8790 21548 - l 8783 21541 8790 21548 100 -} -a { - s 0 - b 8561 21608 - e 8568 21615 - l 8561 21608 8568 21615 100 -} -a { - s 0 - b 6625 21610 - e 6632 21617 - l 6625 21610 6632 21617 100 -} -a { - s 0 - b 10998 21619 - e 11005 21626 - l 10998 21619 11005 21626 100 -} -a { - s 0 - b 10999 21620 - e 11006 21627 - l 10999 21620 11006 21627 100 -} -a { - s 0 - b 9511 21623 - e 9518 21630 - l 9511 21623 9518 21630 100 -} -a { - s 0 - b 6375 21623 - e 6382 21630 - l 6375 21623 6382 21630 100 -} -a { 
- s 0 - b 9512 21624 - e 9519 21631 - l 9512 21624 9519 21631 100 -} -a { - s 0 - b 6376 21624 - e 6383 21631 - l 6376 21624 6383 21631 100 -} -a { - s 0 - b 6377 21625 - e 6384 21632 - l 6377 21625 6384 21632 100 -} -a { - s 0 - b 6371 21652 - e 6378 21659 - l 6371 21652 6378 21659 100 -} -a { - s 0 - b 3199 21652 - e 3206 21659 - l 3199 21652 3206 21659 100 -} -a { - s 0 - b 3200 21653 - e 3207 21660 - l 3200 21653 3207 21660 100 -} -a { - s 0 - b 3201 21654 - e 3208 21661 - l 3201 21654 3208 21661 100 -} -a { - s 0 - b 3202 21655 - e 3209 21662 - l 3202 21655 3209 21662 100 -} -a { - s 0 - b 18631 21666 - e 18638 21673 - l 18631 21666 18638 21673 100 -} -a { - s 0 - b 11196 21691 - e 11203 21698 - l 11196 21691 11203 21698 100 -} -a { - s 0 - b 11461 21694 - e 11468 21701 - l 11461 21694 11468 21701 100 -} -a { - s 0 - b 6681 21704 - e 6688 21711 - l 6681 21704 6688 21711 100 -} -a { - s 0 - b 14881 21706 - e 14888 21713 - l 14881 21706 14888 21713 100 -} -a { - s 0 - b 6370 21706 - e 6377 21713 - l 6370 21706 6377 21713 100 -} -a { - s 0 - b 14720 21721 - e 14727 21728 - l 14720 21721 14727 21728 100 -} -a { - s 0 - b 10962 21762 - e 10969 21769 - l 10962 21762 10969 21769 100 -} -a { - s 0 - b 7994 21768 - e 8001 21775 - l 7994 21768 8001 21775 100 -} -a { - s 0 - b 7995 21769 - e 8002 21776 - l 7995 21769 8002 21776 100 -} -a { - s 0 - b 6829 21795 - e 6836 21802 - l 6829 21795 6836 21802 100 -} -a { - s 0 - b 5082 21795 - e 5089 21802 - l 5082 21795 5089 21802 100 -} -a { - s 0 - b 9502 21797 - e 9509 21804 - l 9502 21797 9509 21804 100 -} -a { - s 0 - b 8325 21817 - e 8332 21824 - l 8325 21817 8332 21824 100 -} -a { - s 0 - b 3666 21817 - e 3673 21824 - l 3666 21817 3673 21824 100 -} -a { - s 0 - b 13389 21825 - e 13396 21832 - l 13389 21825 13396 21832 100 -} -a { - s 0 - b 13390 21826 - e 13397 21833 - l 13390 21826 13397 21833 100 -} -a { - s 0 - b 8440 21830 - e 8447 21837 - l 8440 21830 8447 21837 100 -} -a { - s 0 - b 6355 21830 - e 6362 21837 - l 
6355 21830 6362 21837 100 -} -a { - s 0 - b 2968 21830 - e 2975 21837 - l 2968 21830 2975 21837 100 -} -a { - s 0 - b 8441 21831 - e 8448 21838 - l 8441 21831 8448 21838 100 -} -a { - s 0 - b 4998 21834 - e 5005 21841 - l 4998 21834 5005 21841 100 -} -a { - s 0 - b 11487 21846 - e 11494 21853 - l 11487 21846 11494 21853 100 -} -a { - s 0 - b 11482 21848 - e 11489 21855 - l 11482 21848 11489 21855 100 -} -a { - s 0 - b 6639 21851 - e 6646 21858 - l 6639 21851 6646 21858 100 -} -a { - s 0 - b 6640 21852 - e 6647 21859 - l 6640 21852 6647 21859 100 -} -a { - s 0 - b 15915 21857 - e 15922 21864 - l 15915 21857 15922 21864 100 -} -a { - s 0 - b 8513 21874 - e 8520 21881 - l 8513 21874 8520 21881 100 -} -a { - s 0 - b 11225 21878 - e 11232 21885 - l 11225 21878 11232 21885 100 -} -a { - s 0 - b 10865 21878 - e 10872 21885 - l 10865 21878 10872 21885 100 -} -a { - s 0 - b 10866 21879 - e 10873 21886 - l 10866 21879 10873 21886 100 -} -a { - s 0 - b 10867 21880 - e 10874 21887 - l 10867 21880 10874 21887 100 -} -a { - s 0 - b 167 21893 - e 174 21900 - l 167 21893 174 21900 100 -} -a { - s 0 - b 6428 21901 - e 6435 21908 - l 6428 21901 6435 21908 100 -} -a { - s 0 - b 9646 21902 - e 9653 21909 - l 9646 21902 9653 21909 100 -} -a { - s 0 - b 18641 21916 - e 18648 21923 - l 18641 21916 18648 21923 100 -} -a { - s 0 - b 14606 21916 - e 14613 21923 - l 14606 21916 14613 21923 100 -} -a { - s 0 - b 10852 21916 - e 10859 21923 - l 10852 21916 10859 21923 100 -} -a { - s 0 - b 9677 21922 - e 9684 21929 - l 9677 21922 9684 21929 100 -} -a { - s 0 - b 17544 21923 - e 17551 21930 - l 17544 21923 17551 21930 100 -} -a { - s 0 - b 3406 21923 - e 3413 21930 - l 3406 21923 3413 21930 100 -} -a { - s 0 - b 7974 21943 - e 7981 21950 - l 7974 21943 7981 21950 100 -} -a { - s 0 - b 74 21954 - e 81 21961 - l 74 21954 81 21961 100 -} -a { - s 0 - b 5132 21964 - e 5139 21971 - l 5132 21964 5139 21971 100 -} -a { - s 0 - b 6444 22013 - e 6451 22020 - l 6444 22013 6451 22020 100 -} -a { - s 0 - b 
3874 22013 - e 3881 22020 - l 3874 22013 3881 22020 100 -} -a { - s 0 - b 8638 22019 - e 8645 22026 - l 8638 22019 8645 22026 100 -} -a { - s 0 - b 18773 22024 - e 18780 22031 - l 18773 22024 18780 22031 100 -} -a { - s 0 - b 7452 22024 - e 7459 22031 - l 7452 22024 7459 22031 100 -} -a { - s 0 - b 3253 22024 - e 3260 22031 - l 3253 22024 3260 22031 100 -} -a { - s 0 - b 18774 22025 - e 18781 22032 - l 18774 22025 18781 22032 100 -} -a { - s 0 - b 7453 22025 - e 7460 22032 - l 7453 22025 7460 22032 100 -} -a { - s 0 - b 18775 22026 - e 18782 22033 - l 18775 22026 18782 22033 100 -} -a { - s 0 - b 11274 22030 - e 11281 22037 - l 11274 22030 11281 22037 100 -} -a { - s 0 - b 4444 22067 - e 4451 22074 - l 4444 22067 4451 22074 100 -} -a { - s 0 - b 15386 22068 - e 15393 22075 - l 15386 22068 15393 22075 100 -} -a { - s 0 - b 3091 22069 - e 3098 22076 - l 3091 22069 3098 22076 100 -} -a { - s 0 - b 3996 22071 - e 4003 22078 - l 3996 22071 4003 22078 100 -} -a { - s 0 - b 4845 22078 - e 4852 22085 - l 4845 22078 4852 22085 100 -} -a { - s 0 - b 169 22085 - e 176 22092 - l 169 22085 176 22092 100 -} -a { - s 0 - b 15983 22088 - e 15990 22095 - l 15983 22088 15990 22095 100 -} -a { - s 0 - b 15984 22089 - e 15991 22096 - l 15984 22089 15991 22096 100 -} -a { - s 0 - b 17181 22113 - e 17188 22120 - l 17181 22113 17188 22120 100 -} -a { - s 0 - b 4649 22114 - e 4656 22121 - l 4649 22114 4656 22121 100 -} -a { - s 0 - b 6434 22128 - e 6441 22135 - l 6434 22128 6441 22135 100 -} -a { - s 0 - b 17402 22136 - e 17409 22143 - l 17402 22136 17409 22143 100 -} -a { - s 0 - b 8003 22141 - e 8010 22148 - l 8003 22141 8010 22148 100 -} -a { - s 0 - b 5641 22148 - e 5648 22155 - l 5641 22148 5648 22155 100 -} -a { - s 0 - b 5642 22149 - e 5649 22156 - l 5642 22149 5649 22156 100 -} -a { - s 0 - b 3685 22149 - e 3692 22156 - l 3685 22149 3692 22156 100 -} -a { - s 0 - b 15 22149 - e 22 22156 - l 15 22149 22 22156 100 -} -a { - s 0 - b 14876 22150 - e 14883 22157 - l 14876 22150 14883 
22157 100 -} -a { - s 0 - b 5643 22150 - e 5650 22157 - l 5643 22150 5650 22157 100 -} -a { - s 0 - b 5644 22151 - e 5651 22158 - l 5644 22151 5651 22158 100 -} -a { - s 0 - b 4092 22165 - e 4099 22172 - l 4092 22165 4099 22172 100 -} -a { - s 0 - b 14523 22166 - e 14530 22173 - l 14523 22166 14530 22173 100 -} -a { - s 0 - b 14524 22167 - e 14531 22174 - l 14524 22167 14531 22174 100 -} -a { - s 0 - b 14525 22168 - e 14532 22175 - l 14525 22168 14532 22175 100 -} -a { - s 0 - b 3054 22170 - e 3061 22177 - l 3054 22170 3061 22177 100 -} -a { - s 0 - b 3055 22171 - e 3062 22178 - l 3055 22171 3062 22178 100 -} -a { - s 0 - b 3068 22175 - e 3075 22182 - l 3068 22175 3075 22182 100 -} -a { - s 0 - b 6776 22186 - e 6783 22193 - l 6776 22186 6783 22193 100 -} -a { - s 0 - b 9025 22199 - e 9032 22206 - l 9025 22199 9032 22206 100 -} -a { - s 0 - b 16061 22222 - e 16068 22229 - l 16061 22222 16068 22229 100 -} -a { - s 0 - b 16000 22234 - e 16007 22241 - l 16000 22234 16007 22241 100 -} -a { - s 0 - b 11272 22235 - e 11279 22242 - l 11272 22235 11279 22242 100 -} -a { - s 0 - b 12155 22240 - e 12162 22247 - l 12155 22240 12162 22247 100 -} -a { - s 0 - b 14276 22241 - e 14283 22248 - l 14276 22241 14283 22248 100 -} -a { - s 0 - b 6728 22241 - e 6735 22248 - l 6728 22241 6735 22248 100 -} -a { - s 0 - b 6729 22242 - e 6736 22249 - l 6729 22242 6736 22249 100 -} -a { - s 0 - b 4480 22805 - e 4487 22812 - l 4480 22805 4487 22812 100 -} -a { - s 0 - b 3075 22810 - e 3082 22817 - l 3075 22810 3082 22817 100 -} -a { - s 0 - b 10964 22820 - e 10971 22827 - l 10964 22820 10971 22827 100 -} -a { - s 0 - b 15103 22824 - e 15110 22831 - l 15103 22824 15110 22831 100 -} -a { - s 0 - b 17400 22828 - e 17407 22835 - l 17400 22828 17407 22835 100 -} -a { - s 0 - b 17401 22829 - e 17408 22836 - l 17401 22829 17408 22836 100 -} -a { - s 0 - b 13392 22842 - e 13399 22849 - l 13392 22842 13399 22849 100 -} -a { - s 0 - b 8183 22853 - e 8190 22860 - l 8183 22853 8190 22860 100 -} -a { - s 0 
- b 15341 22883 - e 15348 22890 - l 15341 22883 15348 22890 100 -} -a { - s 0 - b 15342 22884 - e 15349 22891 - l 15342 22884 15349 22891 100 -} -a { - s 0 - b 7681 22886 - e 7688 22893 - l 7681 22886 7688 22893 100 -} -a { - s 0 - b 4440 22890 - e 4447 22897 - l 4440 22890 4447 22897 100 -} -a { - s 0 - b 17251 22891 - e 17258 22898 - l 17251 22891 17258 22898 100 -} -a { - s 0 - b 4441 22891 - e 4448 22898 - l 4441 22891 4448 22898 100 -} -a { - s 0 - b 17252 22892 - e 17259 22899 - l 17252 22892 17259 22899 100 -} -a { - s 0 - b 8030 22911 - e 8037 22918 - l 8030 22911 8037 22918 100 -} -a { - s 0 - b 7650 22911 - e 7657 22918 - l 7650 22911 7657 22918 100 -} -a { - s 0 - b 132 22911 - e 139 22918 - l 132 22911 139 22918 100 -} -a { - s 0 - b 11096 22914 - e 11103 22921 - l 11096 22914 11103 22921 100 -} -#:lav -s { - "../test_data/pseudocat.fa" 1 18803 0 1 - "../test_data/pseudopig.fa" 1 22929 0 3 -} -h { - "> cat" - "> pig3" -} -a { - s 0 - b 8108 1 - e 8115 8 - l 8108 1 8115 8 100 -} -a { - s 0 - b 159 18 - e 166 25 - l 159 18 166 25 100 -} -a { - s 0 - b 160 19 - e 167 26 - l 160 19 167 26 100 -} -a { - s 0 - b 161 20 - e 168 27 - l 161 20 168 27 100 -} -a { - s 0 - b 4023 21 - e 4030 28 - l 4023 21 4030 28 100 -} -a { - s 0 - b 16146 22 - e 16153 29 - l 16146 22 16153 29 100 -} -a { - s 0 - b 4024 22 - e 4031 29 - l 4024 22 4031 29 100 -} -a { - s 0 - b 6337 28 - e 6344 35 - l 6337 28 6344 35 100 -} -a { - s 0 - b 3933 30 - e 3940 37 - l 3933 30 3940 37 100 -} -a { - s 0 - b 3136 67 - e 3143 74 - l 3136 67 3143 74 100 -} -a { - s 0 - b 3137 68 - e 3144 75 - l 3137 68 3144 75 100 -} -a { - s 0 - b 3084 68 - e 3091 75 - l 3084 68 3091 75 100 -} -a { - s 0 - b 3085 69 - e 3092 76 - l 3085 69 3092 76 100 -} -a { - s 0 - b 3086 70 - e 3093 77 - l 3086 70 3093 77 100 -} -a { - s 0 - b 3087 71 - e 3094 78 - l 3087 71 3094 78 100 -} -a { - s 0 - b 15316 72 - e 15323 79 - l 15316 72 15323 79 100 -} -a { - s 0 - b 17148 73 - e 17155 80 - l 17148 73 17155 80 100 -} -a 
{ - s 0 - b 9022 87 - e 9029 94 - l 9022 87 9029 94 100 -} -a { - s 0 - b 6292 112 - e 6299 119 - l 6292 112 6299 119 100 -} -a { - s 0 - b 18075 113 - e 18082 120 - l 18075 113 18082 120 100 -} -a { - s 0 - b 4349 118 - e 4356 125 - l 4349 118 4356 125 100 -} -a { - s 0 - b 18474 120 - e 18481 127 - l 18474 120 18481 127 100 -} -a { - s 0 - b 18572 121 - e 18579 128 - l 18572 121 18579 128 100 -} -a { - s 0 - b 12590 125 - e 12597 132 - l 12590 125 12597 132 100 -} -a { - s 0 - b 17482 128 - e 17489 135 - l 17482 128 17489 135 100 -} -a { - s 0 - b 4983 136 - e 4990 143 - l 4983 136 4990 143 100 -} -a { - s 0 - b 6293 146 - e 6300 153 - l 6293 146 6300 153 100 -} -a { - s 0 - b 6294 147 - e 6301 154 - l 6294 147 6301 154 100 -} -a { - s 0 - b 6295 148 - e 6302 155 - l 6295 148 6302 155 100 -} -a { - s 0 - b 18698 151 - e 18705 158 - l 18698 151 18705 158 100 -} -a { - s 0 - b 18699 152 - e 18706 159 - l 18699 152 18706 159 100 -} -a { - s 0 - b 48 153 - e 55 160 - l 48 153 55 160 100 -} -a { - s 0 - b 17525 154 - e 17532 161 - l 17525 154 17532 161 100 -} -a { - s 0 - b 14968 156 - e 14975 163 - l 14968 156 14975 163 100 -} -a { - s 0 - b 4393 156 - e 4400 163 - l 4393 156 4400 163 100 -} -a { - s 0 - b 14969 157 - e 14976 164 - l 14969 157 14976 164 100 -} -a { - s 0 - b 7562 163 - e 7569 170 - l 7562 163 7569 170 100 -} -a { - s 0 - b 7563 164 - e 7570 171 - l 7563 164 7570 171 100 -} -a { - s 0 - b 7564 165 - e 7571 172 - l 7564 165 7571 172 100 -} -a { - s 0 - b 7565 166 - e 7572 173 - l 7565 166 7572 173 100 -} -a { - s 0 - b 11479 171 - e 11486 178 - l 11479 171 11486 178 100 -} -a { - s 0 - b 11480 172 - e 11487 179 - l 11480 172 11487 179 100 -} -a { - s 0 - b 3246 184 - e 3253 191 - l 3246 184 3253 191 100 -} -a { - s 0 - b 18379 198 - e 18386 205 - l 18379 198 18386 205 100 -} -a { - s 0 - b 11448 201 - e 11455 208 - l 11448 201 11455 208 100 -} -a { - s 0 - b 4422 201 - e 4429 208 - l 4422 201 4429 208 100 -} -a { - s 0 - b 4423 202 - e 4430 209 - l 
4423 202 4430 209 100 -} -a { - s 0 - b 14735 214 - e 14742 221 - l 14735 214 14742 221 100 -} -a { - s 0 - b 15138 221 - e 15145 228 - l 15138 221 15145 228 100 -} -a { - s 0 - b 15139 222 - e 15146 229 - l 15139 222 15146 229 100 -} -a { - s 0 - b 12139 224 - e 12146 231 - l 12139 224 12146 231 100 -} -a { - s 0 - b 9635 225 - e 9642 232 - l 9635 225 9642 232 100 -} -a { - s 0 - b 8982 225 - e 8989 232 - l 8982 225 8989 232 100 -} -a { - s 0 - b 17523 238 - e 17530 245 - l 17523 238 17530 245 100 -} -a { - s 0 - b 17524 239 - e 17531 246 - l 17524 239 17531 246 100 -} -a { - s 0 - b 6431 256 - e 6438 263 - l 6431 256 6438 263 100 -} -a { - s 0 - b 6432 257 - e 6439 264 - l 6432 257 6439 264 100 -} -a { - s 0 - b 6433 258 - e 6440 265 - l 6433 258 6440 265 100 -} -a { - s 0 - b 15385 260 - e 15392 267 - l 15385 260 15392 267 100 -} -a { - s 0 - b 15386 261 - e 15393 268 - l 15386 261 15393 268 100 -} -a { - s 0 - b 17290 278 - e 17297 285 - l 17290 278 17297 285 100 -} -a { - s 0 - b 13404 281 - e 13411 288 - l 13404 281 13411 288 100 -} -a { - s 0 - b 13405 282 - e 13412 289 - l 13405 282 13412 289 100 -} -a { - s 0 - b 7615 282 - e 7622 289 - l 7615 282 7622 289 100 -} -a { - s 0 - b 5624 284 - e 5631 291 - l 5624 284 5631 291 100 -} -a { - s 0 - b 15197 307 - e 15204 314 - l 15197 307 15204 314 100 -} -a { - s 0 - b 15198 308 - e 15205 315 - l 15198 308 15205 315 100 -} -a { - s 0 - b 15199 309 - e 15206 316 - l 15199 309 15206 316 100 -} -a { - s 0 - b 4073 310 - e 4080 317 - l 4073 310 4080 317 100 -} -a { - s 0 - b 6354 317 - e 6361 324 - l 6354 317 6361 324 100 -} -a { - s 0 - b 2993 320 - e 3000 327 - l 2993 320 3000 327 100 -} -a { - s 0 - b 18717 322 - e 18724 329 - l 18717 322 18724 329 100 -} -a { - s 0 - b 18043 333 - e 18050 340 - l 18043 333 18050 340 100 -} -a { - s 0 - b 18044 334 - e 18051 341 - l 18044 334 18051 341 100 -} -a { - s 0 - b 9313 380 - e 9320 387 - l 9313 380 9320 387 100 -} -a { - s 0 - b 5090 380 - e 5097 387 - l 5090 380 5097 387 
100 -} -a { - s 0 - b 9314 381 - e 9321 388 - l 9314 381 9321 388 100 -} -a { - s 0 - b 5091 381 - e 5098 388 - l 5091 381 5098 388 100 -} -a { - s 0 - b 153 381 - e 160 388 - l 153 381 160 388 100 -} -a { - s 0 - b 4839 383 - e 4846 390 - l 4839 383 4846 390 100 -} -a { - s 0 - b 17468 384 - e 17475 391 - l 17468 384 17475 391 100 -} -a { - s 0 - b 4840 384 - e 4847 391 - l 4840 384 4847 391 100 -} -a { - s 0 - b 15260 394 - e 15267 401 - l 15260 394 15267 401 100 -} -a { - s 0 - b 11042 395 - e 11049 402 - l 11042 395 11049 402 100 -} -a { - s 0 - b 95 416 - e 102 423 - l 95 416 102 423 100 -} -a { - s 0 - b 96 417 - e 103 424 - l 96 417 103 424 100 -} -a { - s 0 - b 8965 427 - e 8972 434 - l 8965 427 8972 434 100 -} -a { - s 0 - b 11498 429 - e 11505 436 - l 11498 429 11505 436 100 -} -a { - s 0 - b 13893 435 - e 13900 442 - l 13893 435 13900 442 100 -} -a { - s 0 - b 13894 436 - e 13901 443 - l 13894 436 13901 443 100 -} -a { - s 0 - b 9362 440 - e 9369 447 - l 9362 440 9369 447 100 -} -a { - s 0 - b 9363 441 - e 9370 448 - l 9363 441 9370 448 100 -} -a { - s 0 - b 13472 456 - e 13479 463 - l 13472 456 13479 463 100 -} -a { - s 0 - b 8805 459 - e 8812 466 - l 8805 459 8812 466 100 -} -a { - s 0 - b 8806 460 - e 8813 467 - l 8806 460 8813 467 100 -} -a { - s 0 - b 8721 462 - e 8728 469 - l 8721 462 8728 469 100 -} -a { - s 0 - b 3670 478 - e 3677 485 - l 3670 478 3677 485 100 -} -a { - s 0 - b 8318 480 - e 8325 487 - l 8318 480 8325 487 100 -} -a { - s 0 - b 9355 481 - e 9362 488 - l 9355 481 9362 488 100 -} -a { - s 0 - b 8321 493 - e 8328 500 - l 8321 493 8328 500 100 -} -a { - s 0 - b 8322 494 - e 8329 501 - l 8322 494 8329 501 100 -} -a { - s 0 - b 8323 495 - e 8330 502 - l 8323 495 8330 502 100 -} -a { - s 0 - b 17269 500 - e 17276 507 - l 17269 500 17276 507 100 -} -a { - s 0 - b 17419 517 - e 17426 524 - l 17419 517 17426 524 100 -} -a { - s 0 - b 5645 551 - e 5652 558 - l 5645 551 5652 558 100 -} -a { - s 0 - b 4019 554 - e 4026 561 - l 4019 554 4026 561 
100 -} -a { - s 0 - b 15951 555 - e 15958 562 - l 15951 555 15958 562 100 -} -a { - s 0 - b 4020 555 - e 4027 562 - l 4020 555 4027 562 100 -} -a { - s 0 - b 15952 556 - e 15959 563 - l 15952 556 15959 563 100 -} -a { - s 0 - b 13440 568 - e 13447 575 - l 13440 568 13447 575 100 -} -a { - s 0 - b 18032 569 - e 18039 576 - l 18032 569 18039 576 100 -} -a { - s 0 - b 18556 588 - e 18563 595 - l 18556 588 18563 595 100 -} -a { - s 0 - b 18567 596 - e 18574 603 - l 18567 596 18574 603 100 -} -a { - s 0 - b 17544 604 - e 17551 611 - l 17544 604 17551 611 100 -} -a { - s 0 - b 3406 604 - e 3413 611 - l 3406 604 3413 611 100 -} -a { - s 0 - b 3704 610 - e 3711 617 - l 3704 610 3711 617 100 -} -a { - s 0 - b 17368 624 - e 17375 631 - l 17368 624 17375 631 100 -} -a { - s 0 - b 17369 625 - e 17376 632 - l 17369 625 17376 632 100 -} -a { - s 0 - b 17370 626 - e 17377 633 - l 17370 626 17377 633 100 -} -a { - s 0 - b 8124 626 - e 8131 633 - l 8124 626 8131 633 100 -} -a { - s 0 - b 13959 635 - e 13966 642 - l 13959 635 13966 642 100 -} -a { - s 0 - b 8672 635 - e 8679 642 - l 8672 635 8679 642 100 -} -a { - s 0 - b 8673 636 - e 8680 643 - l 8673 636 8680 643 100 -} -a { - s 0 - b 15910 638 - e 15917 645 - l 15910 638 15917 645 100 -} -a { - s 0 - b 15911 639 - e 15918 646 - l 15911 639 15918 646 100 -} -a { - s 0 - b 15912 640 - e 15919 647 - l 15912 640 15919 647 100 -} -a { - s 0 - b 8099 648 - e 8106 655 - l 8099 648 8106 655 100 -} -a { - s 0 - b 17280 649 - e 17287 656 - l 17280 649 17287 656 100 -} -a { - s 0 - b 18528 651 - e 18535 658 - l 18528 651 18535 658 100 -} -a { - s 0 - b 12592 653 - e 12599 660 - l 12592 653 12599 660 100 -} -a { - s 0 - b 3029 656 - e 3036 663 - l 3029 656 3036 663 100 -} -a { - s 0 - b 3165 660 - e 3172 667 - l 3165 660 3172 667 100 -} -a { - s 0 - b 3166 661 - e 3173 668 - l 3166 661 3173 668 100 -} -a { - s 0 - b 4978 873 - e 4985 880 - l 4978 873 4985 880 100 -} -a { - s 0 - b 4979 874 - e 4986 881 - l 4979 874 4986 881 100 -} -a { - s 0 
- b 189 876 - e 196 883 - l 189 876 196 883 100 -} -a { - s 0 - b 18350 898 - e 18357 905 - l 18350 898 18357 905 100 -} -a { - s 0 - b 18351 899 - e 18358 906 - l 18351 899 18358 906 100 -} -a { - s 0 - b 18352 900 - e 18359 907 - l 18352 900 18359 907 100 -} -a { - s 0 - b 3700 903 - e 3707 910 - l 3700 903 3707 910 100 -} -a { - s 0 - b 10842 907 - e 10849 914 - l 10842 907 10849 914 100 -} -a { - s 0 - b 6690 908 - e 6697 915 - l 6690 908 6697 915 100 -} -a { - s 0 - b 6691 909 - e 6698 916 - l 6691 909 6698 916 100 -} -a { - s 0 - b 6692 910 - e 6699 917 - l 6692 910 6699 917 100 -} -a { - s 0 - b 9681 919 - e 9688 926 - l 9681 919 9688 926 100 -} -a { - s 0 - b 15391 921 - e 15398 928 - l 15391 921 15398 928 100 -} -a { - s 0 - b 15392 922 - e 15399 929 - l 15392 922 15399 929 100 -} -a { - s 0 - b 8024 923 - e 8031 930 - l 8024 923 8031 930 100 -} -a { - s 0 - b 3590 930 - e 3597 937 - l 3590 930 3597 937 100 -} -a { - s 0 - b 3130 942 - e 3137 949 - l 3130 942 3137 949 100 -} -a { - s 0 - b 9458 957 - e 9465 964 - l 9458 957 9465 964 100 -} -a { - s 0 - b 6362 962 - e 6369 969 - l 6362 962 6369 969 100 -} -a { - s 0 - b 12135 964 - e 12142 971 - l 12135 964 12142 971 100 -} -a { - s 0 - b 3244 965 - e 3251 972 - l 3244 965 3251 972 100 -} -a { - s 0 - b 3688 979 - e 3695 986 - l 3688 979 3695 986 100 -} -a { - s 0 - b 6417 989 - e 6424 996 - l 6417 989 6424 996 100 -} -a { - s 0 - b 18743 993 - e 18750 1000 - l 18743 993 18750 1000 100 -} -a { - s 0 - b 18521 1025 - e 18528 1032 - l 18521 1025 18528 1032 100 -} -a { - s 0 - b 14667 1037 - e 14674 1044 - l 14667 1037 14674 1044 100 -} -a { - s 0 - b 15106 1054 - e 15113 1061 - l 15106 1054 15113 1061 100 -} -a { - s 0 - b 15341 1067 - e 15348 1074 - l 15341 1067 15348 1074 100 -} -a { - s 0 - b 10998 1070 - e 11005 1077 - l 10998 1070 11005 1077 100 -} -a { - s 0 - b 18047 1074 - e 18054 1081 - l 18047 1074 18054 1081 100 -} -a { - s 0 - b 4983 1079 - e 4990 1086 - l 4983 1079 4990 1086 100 -} -a { - s 0 - b 
4984 1080 - e 4991 1087 - l 4984 1080 4991 1087 100 -} -a { - s 0 - b 11183 1082 - e 11190 1089 - l 11183 1082 11190 1089 100 -} -a { - s 0 - b 4478 1083 - e 4485 1090 - l 4478 1083 4485 1090 100 -} -a { - s 0 - b 4479 1084 - e 4486 1091 - l 4479 1084 4486 1091 100 -} -a { - s 0 - b 3546 1088 - e 3553 1095 - l 3546 1088 3553 1095 100 -} -a { - s 0 - b 7574 1103 - e 7581 1110 - l 7574 1103 7581 1110 100 -} -a { - s 0 - b 15180 1117 - e 15187 1124 - l 15180 1117 15187 1124 100 -} -a { - s 0 - b 8823 1125 - e 8830 1132 - l 8823 1125 8830 1132 100 -} -a { - s 0 - b 2983 1128 - e 2990 1135 - l 2983 1128 2990 1135 100 -} -a { - s 0 - b 17570 1130 - e 17577 1137 - l 17570 1130 17577 1137 100 -} -a { - s 0 - b 17328 1135 - e 17335 1142 - l 17328 1135 17335 1142 100 -} -a { - s 0 - b 17329 1136 - e 17336 1143 - l 17329 1136 17336 1143 100 -} -a { - s 0 - b 8682 1136 - e 8689 1143 - l 8682 1136 8689 1143 100 -} -a { - s 0 - b 17330 1137 - e 17337 1144 - l 17330 1137 17337 1144 100 -} -a { - s 0 - b 17331 1138 - e 17338 1145 - l 17331 1138 17338 1145 100 -} -a { - s 0 - b 4087 1141 - e 4094 1148 - l 4087 1141 4094 1148 100 -} -a { - s 0 - b 8616 1146 - e 8623 1153 - l 8616 1146 8623 1153 100 -} -a { - s 0 - b 11289 1173 - e 11296 1180 - l 11289 1173 11296 1180 100 -} -a { - s 0 - b 8824 1176 - e 8831 1183 - l 8824 1176 8831 1183 100 -} -a { - s 0 - b 12594 1178 - e 12601 1185 - l 12594 1178 12601 1185 100 -} -a { - s 0 - b 7742 1191 - e 7749 1198 - l 7742 1191 7749 1198 100 -} -a { - s 0 - b 17321 1192 - e 17328 1199 - l 17321 1192 17328 1199 100 -} -a { - s 0 - b 16150 1194 - e 16157 1201 - l 16150 1194 16157 1201 100 -} -a { - s 0 - b 3973 1194 - e 3980 1201 - l 3973 1194 3980 1201 100 -} -a { - s 0 - b 17356 1196 - e 17363 1203 - l 17356 1196 17363 1203 100 -} -a { - s 0 - b 12240 1199 - e 12247 1206 - l 12240 1199 12247 1206 100 -} -a { - s 0 - b 16052 1203 - e 16059 1210 - l 16052 1203 16059 1210 100 -} -a { - s 0 - b 11472 1207 - e 11479 1214 - l 11472 1207 11479 1214 
100 -} -a { - s 0 - b 6806 1209 - e 6813 1216 - l 6806 1209 6813 1216 100 -} -a { - s 0 - b 13484 1222 - e 13491 1229 - l 13484 1222 13491 1229 100 -} -a { - s 0 - b 14161 1243 - e 14168 1250 - l 14161 1243 14168 1250 100 -} -a { - s 0 - b 14162 1244 - e 14169 1251 - l 14162 1244 14169 1251 100 -} -a { - s 0 - b 18482 1250 - e 18489 1257 - l 18482 1250 18489 1257 100 -} -a { - s 0 - b 14073 1266 - e 14080 1273 - l 14073 1266 14080 1273 100 -} -a { - s 0 - b 14294 1273 - e 14301 1280 - l 14294 1273 14301 1280 100 -} -a { - s 0 - b 7743 1287 - e 7750 1294 - l 7743 1287 7750 1294 100 -} -a { - s 0 - b 7744 1288 - e 7751 1295 - l 7744 1288 7751 1295 100 -} -a { - s 0 - b 6340 1288 - e 6347 1295 - l 6340 1288 6347 1295 100 -} -a { - s 0 - b 7745 1289 - e 7752 1296 - l 7745 1289 7752 1296 100 -} -a { - s 0 - b 7746 1290 - e 7753 1297 - l 7746 1290 7753 1297 100 -} -a { - s 0 - b 8111 1291 - e 8118 1298 - l 8111 1291 8118 1298 100 -} -a { - s 0 - b 14603 1293 - e 14610 1300 - l 14603 1293 14610 1300 100 -} -a { - s 0 - b 9050 1294 - e 9057 1301 - l 9050 1294 9057 1301 100 -} -a { - s 0 - b 8760 1295 - e 8767 1302 - l 8760 1295 8767 1302 100 -} -a { - s 0 - b 3346 1959 - e 3353 1966 - l 3346 1959 3353 1966 100 -} -a { - s 0 - b 133 1960 - e 140 1967 - l 133 1960 140 1967 100 -} -a { - s 0 - b 6328 1963 - e 6335 1970 - l 6328 1963 6335 1970 100 -} -a { - s 0 - b 6329 1964 - e 6336 1971 - l 6329 1964 6336 1971 100 -} -a { - s 0 - b 5128 1969 - e 5135 1976 - l 5128 1969 5135 1976 100 -} -a { - s 0 - b 5129 1970 - e 5136 1977 - l 5129 1970 5136 1977 100 -} -a { - s 0 - b 5130 1971 - e 5137 1978 - l 5130 1971 5137 1978 100 -} -a { - s 0 - b 11044 1972 - e 11051 1979 - l 11044 1972 11051 1979 100 -} -a { - s 0 - b 6346 1980 - e 6353 1987 - l 6346 1980 6353 1987 100 -} -a { - s 0 - b 8023 1983 - e 8030 1990 - l 8023 1983 8030 1990 100 -} -a { - s 0 - b 15393 1984 - e 15400 1991 - l 15393 1984 15400 1991 100 -} -a { - s 0 - b 15156 1990 - e 15163 1997 - l 15156 1990 15163 1997 100 
-} -a { - s 0 - b 18483 2000 - e 18490 2007 - l 18483 2000 18490 2007 100 -} -a { - s 0 - b 18484 2001 - e 18491 2008 - l 18484 2001 18491 2008 100 -} -a { - s 0 - b 11152 2009 - e 11159 2016 - l 11152 2009 11159 2016 100 -} -a { - s 0 - b 11153 2010 - e 11160 2017 - l 11153 2010 11160 2017 100 -} -a { - s 0 - b 14111 2015 - e 14118 2022 - l 14111 2015 14118 2022 100 -} -a { - s 0 - b 8167 2040 - e 8174 2047 - l 8167 2040 8174 2047 100 -} -a { - s 0 - b 13944 2054 - e 13951 2061 - l 13944 2054 13951 2061 100 -} -a { - s 0 - b 6789 2073 - e 6796 2080 - l 6789 2073 6796 2080 100 -} -a { - s 0 - b 8920 2074 - e 8927 2081 - l 8920 2074 8927 2081 100 -} -a { - s 0 - b 7588 2085 - e 7595 2092 - l 7588 2085 7595 2092 100 -} -a { - s 0 - b 14763 2086 - e 14770 2093 - l 14763 2086 14770 2093 100 -} -a { - s 0 - b 7589 2086 - e 7596 2093 - l 7589 2086 7596 2093 100 -} -a { - s 0 - b 14764 2087 - e 14771 2094 - l 14764 2087 14771 2094 100 -} -a { - s 0 - b 17482 2093 - e 17489 2100 - l 17482 2093 17489 2100 100 -} -a { - s 0 - b 17483 2094 - e 17490 2101 - l 17483 2094 17490 2101 100 -} -a { - s 0 - b 11283 2099 - e 11290 2106 - l 11283 2099 11290 2106 100 -} -a { - s 0 - b 8164 2099 - e 8171 2106 - l 8164 2099 8171 2106 100 -} -a { - s 0 - b 14693 2100 - e 14700 2107 - l 14693 2100 14700 2107 100 -} -a { - s 0 - b 11284 2100 - e 11291 2107 - l 11284 2100 11291 2107 100 -} -a { - s 0 - b 8894 2187 - e 8901 2194 - l 8894 2187 8901 2194 100 -} -a { - s 0 - b 9060 2188 - e 9067 2195 - l 9060 2188 9067 2195 100 -} -a { - s 0 - b 8872 2192 - e 8879 2199 - l 8872 2192 8879 2199 100 -} -a { - s 0 - b 15370 2203 - e 15377 2210 - l 15370 2203 15377 2210 100 -} -a { - s 0 - b 15371 2204 - e 15378 2211 - l 15371 2204 15378 2211 100 -} -a { - s 0 - b 4494 2242 - e 4501 2249 - l 4494 2242 4501 2249 100 -} -a { - s 0 - b 4495 2243 - e 4502 2250 - l 4495 2243 4502 2250 100 -} -a { - s 0 - b 18782 2260 - e 18789 2267 - l 18782 2260 18789 2267 100 -} -a { - s 0 - b 9356 2262 - e 9363 2269 - l 
9356 2262 9363 2269 100 -} -a { - s 0 - b 14602 2271 - e 14609 2278 - l 14602 2271 14609 2278 100 -} -a { - s 0 - b 6609 2271 - e 6616 2278 - l 6609 2271 6616 2278 100 -} -a { - s 0 - b 16104 2295 - e 16111 2302 - l 16104 2295 16111 2302 100 -} -a { - s 0 - b 14580 2313 - e 14587 2320 - l 14580 2313 14587 2320 100 -} -a { - s 0 - b 3020 2315 - e 3027 2322 - l 3020 2315 3027 2322 100 -} -a { - s 0 - b 167 2320 - e 174 2327 - l 167 2320 174 2327 100 -} -a { - s 0 - b 168 2321 - e 175 2328 - l 168 2321 175 2328 100 -} -a { - s 0 - b 169 2322 - e 176 2329 - l 169 2322 176 2329 100 -} -a { - s 0 - b 4572 2332 - e 4579 2339 - l 4572 2332 4579 2339 100 -} -a { - s 0 - b 4557 2333 - e 4564 2340 - l 4557 2333 4564 2340 100 -} -a { - s 0 - b 8770 2344 - e 8777 2351 - l 8770 2344 8777 2351 100 -} -a { - s 0 - b 3163 2367 - e 3170 2374 - l 3163 2367 3170 2374 100 -} -a { - s 0 - b 8172 2372 - e 8179 2379 - l 8172 2372 8179 2379 100 -} -a { - s 0 - b 14022 2375 - e 14029 2382 - l 14022 2375 14029 2382 100 -} -a { - s 0 - b 4499 2375 - e 4506 2382 - l 4499 2375 4506 2382 100 -} -a { - s 0 - b 8078 2381 - e 8085 2388 - l 8078 2381 8085 2388 100 -} -a { - s 0 - b 16038 2383 - e 16045 2390 - l 16038 2383 16045 2390 100 -} -a { - s 0 - b 16039 2384 - e 16046 2391 - l 16039 2384 16046 2391 100 -} -a { - s 0 - b 18702 2387 - e 18709 2394 - l 18702 2387 18709 2394 100 -} -a { - s 0 - b 9655 2390 - e 9662 2397 - l 9655 2390 9662 2397 100 -} -a { - s 0 - b 18548 2402 - e 18555 2409 - l 18548 2402 18555 2409 100 -} -a { - s 0 - b 17367 2405 - e 17374 2412 - l 17367 2405 17374 2412 100 -} -a { - s 0 - b 18039 2406 - e 18046 2413 - l 18039 2406 18046 2413 100 -} -a { - s 0 - b 18040 2407 - e 18047 2414 - l 18040 2407 18047 2414 100 -} -a { - s 0 - b 14525 2410 - e 14532 2417 - l 14525 2410 14532 2417 100 -} -a { - s 0 - b 3299 2424 - e 3306 2431 - l 3299 2424 3306 2431 100 -} -a { - s 0 - b 11092 2436 - e 11099 2443 - l 11092 2436 11099 2443 100 -} -a { - s 0 - b 8755 2436 - e 8762 2443 - l 
8755 2436 8762 2443 100 -} -a { - s 0 - b 11093 2437 - e 11100 2444 - l 11093 2437 11100 2444 100 -} -a { - s 0 - b 9450 2437 - e 9457 2444 - l 9450 2437 9457 2444 100 -} -a { - s 0 - b 8756 2437 - e 8763 2444 - l 8756 2437 8763 2444 100 -} -a { - s 0 - b 11094 2438 - e 11101 2445 - l 11094 2438 11101 2445 100 -} -a { - s 0 - b 9676 2439 - e 9683 2446 - l 9676 2439 9683 2446 100 -} -a { - s 0 - b 9677 2440 - e 9684 2447 - l 9677 2440 9684 2447 100 -} -a { - s 0 - b 9678 2441 - e 9685 2448 - l 9678 2441 9685 2448 100 -} -a { - s 0 - b 13493 2442 - e 13500 2449 - l 13493 2442 13500 2449 100 -} -a { - s 0 - b 9679 2442 - e 9686 2449 - l 9679 2442 9686 2449 100 -} -a { - s 0 - b 13494 2443 - e 13501 2450 - l 13494 2443 13501 2450 100 -} -a { - s 0 - b 13495 2444 - e 13502 2451 - l 13495 2444 13502 2451 100 -} -a { - s 0 - b 18372 2446 - e 18379 2453 - l 18372 2446 18379 2453 100 -} -a { - s 0 - b 12325 2447 - e 12332 2454 - l 12325 2447 12332 2454 100 -} -a { - s 0 - b 14337 2448 - e 14344 2455 - l 14337 2448 14344 2455 100 -} -a { - s 0 - b 12326 2448 - e 12333 2455 - l 12326 2448 12333 2455 100 -} -a { - s 0 - b 14338 2449 - e 14345 2456 - l 14338 2449 14345 2456 100 -} -a { - s 0 - b 12327 2449 - e 12334 2456 - l 12327 2449 12334 2456 100 -} -a { - s 0 - b 8484 2449 - e 8491 2456 - l 8484 2449 8491 2456 100 -} -a { - s 0 - b 17350 2450 - e 17357 2457 - l 17350 2450 17357 2457 100 -} -a { - s 0 - b 14377 2450 - e 14384 2457 - l 14377 2450 14384 2457 100 -} -a { - s 0 - b 3159 2450 - e 3166 2457 - l 3159 2450 3166 2457 100 -} -a { - s 0 - b 17351 2451 - e 17358 2458 - l 17351 2451 17358 2458 100 -} -a { - s 0 - b 17352 2452 - e 17359 2459 - l 17352 2452 17359 2459 100 -} -a { - s 0 - b 14498 2452 - e 14505 2459 - l 14498 2452 14505 2459 100 -} -a { - s 0 - b 3351 2456 - e 3358 2463 - l 3351 2456 3358 2463 100 -} -a { - s 0 - b 14287 2474 - e 14294 2481 - l 14287 2474 14294 2481 100 -} -a { - s 0 - b 14733 2476 - e 14740 2483 - l 14733 2476 14740 2483 100 -} -a { - s 0 
- b 3160 2484 - e 3167 2491 - l 3160 2484 3167 2491 100 -} -a { - s 0 - b 3161 2485 - e 3168 2492 - l 3161 2485 3168 2492 100 -} -a { - s 0 - b 9349 2486 - e 9356 2493 - l 9349 2486 9356 2493 100 -} -a { - s 0 - b 17419 2487 - e 17426 2494 - l 17419 2487 17426 2494 100 -} -a { - s 0 - b 8751 2490 - e 8758 2497 - l 8751 2490 8758 2497 100 -} -a { - s 0 - b 13452 2542 - e 13459 2549 - l 13452 2542 13459 2549 100 -} -a { - s 0 - b 13453 2543 - e 13460 2550 - l 13453 2543 13460 2550 100 -} -a { - s 0 - b 5591 2547 - e 5598 2554 - l 5591 2547 5598 2554 100 -} -a { - s 0 - b 3930 2547 - e 3937 2554 - l 3930 2547 3937 2554 100 -} -a { - s 0 - b 11425 2550 - e 11432 2557 - l 11425 2550 11432 2557 100 -} -a { - s 0 - b 18592 2554 - e 18599 2561 - l 18592 2554 18599 2561 100 -} -a { - s 0 - b 15930 2572 - e 15937 2579 - l 15930 2572 15937 2579 100 -} -a { - s 0 - b 3415 2573 - e 3422 2580 - l 3415 2573 3422 2580 100 -} -a { - s 0 - b 6269 2577 - e 6276 2584 - l 6269 2577 6276 2584 100 -} -a { - s 0 - b 6270 2578 - e 6277 2585 - l 6270 2578 6277 2585 100 -} -a { - s 0 - b 7766 2580 - e 7773 2587 - l 7766 2580 7773 2587 100 -} -a { - s 0 - b 17457 2581 - e 17464 2588 - l 17457 2581 17464 2588 100 -} -a { - s 0 - b 8828 2581 - e 8835 2588 - l 8828 2581 8835 2588 100 -} -a { - s 0 - b 12219 2583 - e 12226 2590 - l 12219 2583 12226 2590 100 -} -a { - s 0 - b 9487 2590 - e 9494 2597 - l 9487 2590 9494 2597 100 -} -a { - s 0 - b 3054 2626 - e 3061 2633 - l 3054 2626 3061 2633 100 -} -a { - s 0 - b 3112 2632 - e 3119 2639 - l 3112 2632 3119 2639 100 -} -a { - s 0 - b 10991 2637 - e 10998 2644 - l 10991 2637 10998 2644 100 -} -a { - s 0 - b 14140 2638 - e 14147 2645 - l 14140 2638 14147 2645 100 -} -a { - s 0 - b 14541 2659 - e 14548 2666 - l 14541 2659 14548 2666 100 -} -a { - s 0 - b 3532 2700 - e 3539 2707 - l 3532 2700 3539 2707 100 -} -a { - s 0 - b 3533 2701 - e 3540 2708 - l 3533 2701 3540 2708 100 -} -a { - s 0 - b 3109 2701 - e 3116 2708 - l 3109 2701 3116 2708 100 -} -a { - 
s 0 - b 3534 2702 - e 3541 2709 - l 3534 2702 3541 2709 100 -} -a { - s 0 - b 5104 2703 - e 5111 2710 - l 5104 2703 5111 2710 100 -} -a { - s 0 - b 7546 2704 - e 7553 2711 - l 7546 2704 7553 2711 100 -} -a { - s 0 - b 13920 2708 - e 13927 2715 - l 13920 2708 13927 2715 100 -} -a { - s 0 - b 8126 2718 - e 8133 2725 - l 8126 2718 8133 2725 100 -} -a { - s 0 - b 6649 2722 - e 6656 2729 - l 6649 2722 6656 2729 100 -} -a { - s 0 - b 15976 2724 - e 15983 2731 - l 15976 2724 15983 2731 100 -} -a { - s 0 - b 7521 2727 - e 7528 2734 - l 7521 2727 7528 2734 100 -} -a { - s 0 - b 4083 2728 - e 4090 2735 - l 4083 2728 4090 2735 100 -} -a { - s 0 - b 4084 2729 - e 4091 2736 - l 4084 2729 4091 2736 100 -} -a { - s 0 - b 4395 2733 - e 4402 2740 - l 4395 2733 4402 2740 100 -} -a { - s 0 - b 4396 2734 - e 4403 2741 - l 4396 2734 4403 2741 100 -} -a { - s 0 - b 4397 2735 - e 4404 2742 - l 4397 2735 4404 2742 100 -} -a { - s 0 - b 8754 2737 - e 8761 2744 - l 8754 2737 8761 2744 100 -} -a { - s 0 - b 8019 2738 - e 8026 2745 - l 8019 2738 8026 2745 100 -} -a { - s 0 - b 8020 2739 - e 8027 2746 - l 8020 2739 8027 2746 100 -} -a { - s 0 - b 3079 2739 - e 3086 2746 - l 3079 2739 3086 2746 100 -} -a { - s 0 - b 3080 2740 - e 3087 2747 - l 3080 2740 3087 2747 100 -} -a { - s 0 - b 93 2740 - e 100 2747 - l 93 2740 100 2747 100 -} -a { - s 0 - b 18045 2743 - e 18052 2750 - l 18045 2743 18052 2750 100 -} -a { - s 0 - b 11232 2758 - e 11239 2765 - l 11232 2758 11239 2765 100 -} -a { - s 0 - b 11233 2759 - e 11240 2766 - l 11233 2759 11240 2766 100 -} -a { - s 0 - b 59 2771 - e 66 2778 - l 59 2771 66 2778 100 -} -a { - s 0 - b 9603 2790 - e 9610 2797 - l 9603 2790 9610 2797 100 -} -a { - s 0 - b 9604 2791 - e 9611 2798 - l 9604 2791 9611 2798 100 -} -a { - s 0 - b 7541 2791 - e 7548 2798 - l 7541 2791 7548 2798 100 -} -a { - s 0 - b 7542 2792 - e 7549 2799 - l 7542 2792 7549 2799 100 -} -a { - s 0 - b 3156 2794 - e 3163 2801 - l 3156 2794 3163 2801 100 -} -a { - s 0 - b 18611 2813 - e 18618 2820 
- l 18611 2813 18618 2820 100 -} -a { - s 0 - b 4947 2820 - e 4954 2827 - l 4947 2820 4954 2827 100 -} -a { - s 0 - b 13920 2823 - e 13927 2830 - l 13920 2823 13927 2830 100 -} -a { - s 0 - b 13921 2824 - e 13928 2831 - l 13921 2824 13928 2831 100 -} -a { - s 0 - b 15372 2825 - e 15379 2832 - l 15372 2825 15379 2832 100 -} -a { - s 0 - b 11080 2828 - e 11087 2835 - l 11080 2828 11087 2835 100 -} -a { - s 0 - b 11081 2829 - e 11088 2836 - l 11081 2829 11088 2836 100 -} -a { - s 0 - b 13934 2840 - e 13941 2847 - l 13934 2840 13941 2847 100 -} -a { - s 0 - b 13935 2841 - e 13942 2848 - l 13935 2841 13942 2848 100 -} -a { - s 0 - b 15163 2861 - e 15170 2868 - l 15163 2861 15170 2868 100 -} -a { - s 0 - b 15164 2862 - e 15171 2869 - l 15164 2862 15171 2869 100 -} -a { - s 0 - b 15153 2862 - e 15160 2869 - l 15153 2862 15160 2869 100 -} -a { - s 0 - b 15165 2863 - e 15172 2870 - l 15165 2863 15172 2870 100 -} -a { - s 0 - b 15154 2863 - e 15161 2870 - l 15154 2863 15161 2870 100 -} -a { - s 0 - b 11050 2865 - e 11057 2872 - l 11050 2865 11057 2872 100 -} -a { - s 0 - b 4854 2869 - e 4861 2876 - l 4854 2869 4861 2876 100 -} -a { - s 0 - b 3546 2875 - e 3553 2882 - l 3546 2875 3553 2882 100 -} -a { - s 0 - b 17262 2876 - e 17269 2883 - l 17262 2876 17269 2883 100 -} -a { - s 0 - b 13897 2876 - e 13904 2883 - l 13897 2876 13904 2883 100 -} -a { - s 0 - b 4070 2876 - e 4077 2883 - l 4070 2876 4077 2883 100 -} -a { - s 0 - b 17263 2877 - e 17270 2884 - l 17263 2877 17270 2884 100 -} -a { - s 0 - b 3543 2882 - e 3550 2889 - l 3543 2882 3550 2889 100 -} -a { - s 0 - b 4012 2895 - e 4019 2902 - l 4012 2895 4019 2902 100 -} -a { - s 0 - b 3572 2943 - e 3579 2950 - l 3572 2943 3579 2950 100 -} -a { - s 0 - b 15196 2948 - e 15203 2955 - l 15196 2948 15203 2955 100 -} -a { - s 0 - b 11049 2963 - e 11056 2970 - l 11049 2963 11056 2970 100 -} -a { - s 0 - b 4012 2966 - e 4019 2973 - l 4012 2966 4019 2973 100 -} -a { - s 0 - b 4013 2967 - e 4020 2974 - l 4013 2967 4020 2974 100 -} -a { 
- s 0 - b 13445 2970 - e 13452 2977 - l 13445 2970 13452 2977 100 -} -a { - s 0 - b 14363 2971 - e 14370 2978 - l 14363 2971 14370 2978 100 -} -a { - s 0 - b 13446 2971 - e 13453 2978 - l 13446 2971 13453 2978 100 -} -a { - s 0 - b 8123 2973 - e 8130 2980 - l 8123 2973 8130 2980 100 -} -a { - s 0 - b 17370 2974 - e 17377 2981 - l 17370 2974 17377 2981 100 -} -a { - s 0 - b 8124 2974 - e 8131 2981 - l 8124 2974 8131 2981 100 -} -a { - s 0 - b 16051 2976 - e 16058 2983 - l 16051 2976 16058 2983 100 -} -a { - s 0 - b 15274 2987 - e 15281 2994 - l 15274 2987 15281 2994 100 -} -a { - s 0 - b 15275 2988 - e 15282 2995 - l 15275 2988 15282 2995 100 -} -a { - s 0 - b 3671 2988 - e 3678 2995 - l 3671 2988 3678 2995 100 -} -a { - s 0 - b 10850 2989 - e 10857 2996 - l 10850 2989 10857 2996 100 -} -a { - s 0 - b 13865 3001 - e 13872 3008 - l 13865 3001 13872 3008 100 -} -a { - s 0 - b 13866 3002 - e 13873 3009 - l 13866 3002 13873 3009 100 -} -a { - s 0 - b 7459 3011 - e 7466 3018 - l 7459 3011 7466 3018 100 -} -a { - s 0 - b 4924 3012 - e 4931 3019 - l 4924 3012 4931 3019 100 -} -a { - s 0 - b 3621 3016 - e 3628 3023 - l 3621 3016 3628 3023 100 -} -a { - s 0 - b 18711 3018 - e 18718 3025 - l 18711 3018 18718 3025 100 -} -a { - s 0 - b 4000 3018 - e 4007 3025 - l 4000 3018 4007 3025 100 -} -a { - s 0 - b 13442 3019 - e 13449 3026 - l 13442 3019 13449 3026 100 -} -a { - s 0 - b 5023 3020 - e 5030 3027 - l 5023 3020 5030 3027 100 -} -a { - s 0 - b 14299 3021 - e 14306 3028 - l 14299 3021 14306 3028 100 -} -a { - s 0 - b 17410 3022 - e 17417 3029 - l 17410 3022 17417 3029 100 -} -a { - s 0 - b 14300 3022 - e 14307 3029 - l 14300 3022 14307 3029 100 -} -a { - s 0 - b 11273 3023 - e 11280 3030 - l 11273 3023 11280 3030 100 -} -a { - s 0 - b 11274 3024 - e 11281 3031 - l 11274 3024 11281 3031 100 -} -a { - s 0 - b 7537 3027 - e 7544 3034 - l 7537 3027 7544 3034 100 -} -a { - s 0 - b 8112 3028 - e 8119 3035 - l 8112 3028 8119 3035 100 -} -a { - s 0 - b 7538 3028 - e 7545 3035 - l 
7538 3028 7545 3035 100 -} -a { - s 0 - b 8113 3029 - e 8120 3036 - l 8113 3029 8120 3036 100 -} -a { - s 0 - b 7539 3029 - e 7546 3036 - l 7539 3029 7546 3036 100 -} -a { - s 0 - b 4439 3031 - e 4446 3038 - l 4439 3031 4446 3038 100 -} -a { - s 0 - b 8251 3032 - e 8258 3039 - l 8251 3032 8258 3039 100 -} -a { - s 0 - b 4980 3037 - e 4987 3044 - l 4980 3037 4987 3044 100 -} -a { - s 0 - b 4981 3038 - e 4988 3045 - l 4981 3038 4988 3045 100 -} -a { - s 0 - b 7577 3044 - e 7584 3051 - l 7577 3044 7584 3051 100 -} -a { - s 0 - b 7578 3045 - e 7585 3052 - l 7578 3045 7585 3052 100 -} -a { - s 0 - b 7579 3046 - e 7586 3053 - l 7579 3046 7586 3053 100 -} -a { - s 0 - b 13933 3052 - e 13940 3059 - l 13933 3052 13940 3059 100 -} -a { - s 0 - b 13934 3053 - e 13941 3060 - l 13934 3053 13941 3060 100 -} -a { - s 0 - b 7636 3057 - e 7643 3064 - l 7636 3057 7643 3064 100 -} -a { - s 0 - b 7637 3058 - e 7644 3065 - l 7637 3058 7644 3065 100 -} -a { - s 0 - b 9394 3062 - e 9401 3069 - l 9394 3062 9401 3069 100 -} -a { - s 0 - b 9395 3063 - e 9402 3070 - l 9395 3063 9402 3070 100 -} -a { - s 0 - b 17526 3064 - e 17533 3071 - l 17526 3064 17533 3071 100 -} -a { - s 0 - b 18102 3070 - e 18109 3077 - l 18102 3070 18109 3077 100 -} -a { - s 0 - b 18103 3071 - e 18110 3078 - l 18103 3071 18110 3078 100 -} -a { - s 0 - b 15317 3077 - e 15324 3084 - l 15317 3077 15324 3084 100 -} -a { - s 0 - b 11412 3079 - e 11419 3086 - l 11412 3079 11419 3086 100 -} -a { - s 0 - b 5644 3088 - e 5651 3095 - l 5644 3088 5651 3095 100 -} -a { - s 0 - b 4033 3107 - e 4040 3114 - l 4033 3107 4040 3114 100 -} -a { - s 0 - b 8593 3113 - e 8600 3120 - l 8593 3113 8600 3120 100 -} -a { - s 0 - b 8384 3121 - e 8391 3128 - l 8384 3121 8391 3128 100 -} -a { - s 0 - b 4553 3405 - e 4560 3412 - l 4553 3405 4560 3412 100 -} -a { - s 0 - b 4554 3406 - e 4561 3413 - l 4554 3406 4561 3413 100 -} -a { - s 0 - b 17291 3407 - e 17298 3414 - l 17291 3407 17298 3414 100 -} -a { - s 0 - b 23 3410 - e 30 3417 - l 23 3410 30 
3417 100 -} -a { - s 0 - b 4970 3414 - e 4977 3421 - l 4970 3414 4977 3421 100 -} -a { - s 0 - b 12325 3418 - e 12332 3425 - l 12325 3418 12332 3425 100 -} -a { - s 0 - b 14337 3419 - e 14344 3426 - l 14337 3419 14344 3426 100 -} -a { - s 0 - b 12326 3419 - e 12333 3426 - l 12326 3419 12333 3426 100 -} -a { - s 0 - b 14338 3420 - e 14345 3427 - l 14338 3420 14345 3427 100 -} -a { - s 0 - b 12327 3420 - e 12334 3427 - l 12327 3420 12334 3427 100 -} -a { - s 0 - b 8484 3420 - e 8491 3427 - l 8484 3420 8491 3427 100 -} -a { - s 0 - b 14339 3421 - e 14346 3428 - l 14339 3421 14346 3428 100 -} -a { - s 0 - b 5096 3421 - e 5103 3428 - l 5096 3421 5103 3428 100 -} -a { - s 0 - b 18750 3422 - e 18757 3429 - l 18750 3422 18757 3429 100 -} -a { - s 0 - b 5097 3422 - e 5104 3429 - l 5097 3422 5104 3429 100 -} -a { - s 0 - b 4818 3423 - e 4825 3430 - l 4818 3423 4825 3430 100 -} -a { - s 0 - b 4819 3424 - e 4826 3431 - l 4819 3424 4826 3431 100 -} -a { - s 0 - b 4906 3425 - e 4913 3432 - l 4906 3425 4913 3432 100 -} -a { - s 0 - b 4820 3425 - e 4827 3432 - l 4820 3425 4827 3432 100 -} -a { - s 0 - b 18678 3436 - e 18685 3443 - l 18678 3436 18685 3443 100 -} -a { - s 0 - b 18679 3437 - e 18686 3444 - l 18679 3437 18686 3444 100 -} -a { - s 0 - b 4396 3446 - e 4403 3453 - l 4396 3446 4403 3453 100 -} -a { - s 0 - b 4076 3458 - e 4083 3465 - l 4076 3458 4083 3465 100 -} -a { - s 0 - b 4077 3459 - e 4084 3466 - l 4077 3459 4084 3466 100 -} -a { - s 0 - b 3020 3487 - e 3027 3494 - l 3020 3487 3027 3494 100 -} -a { - s 0 - b 3021 3488 - e 3028 3495 - l 3021 3488 3028 3495 100 -} -a { - s 0 - b 130 3489 - e 137 3496 - l 130 3489 137 3496 100 -} -a { - s 0 - b 15959 3490 - e 15966 3497 - l 15959 3490 15966 3497 100 -} -a { - s 0 - b 8790 3491 - e 8797 3498 - l 8790 3491 8797 3498 100 -} -a { - s 0 - b 8444 3491 - e 8451 3498 - l 8444 3491 8451 3498 100 -} -a { - s 0 - b 8791 3492 - e 8798 3499 - l 8791 3492 8798 3499 100 -} -a { - s 0 - b 8445 3492 - e 8452 3499 - l 8445 3492 8452 
3499 100 -} -a { - s 0 - b 8792 3493 - e 8799 3500 - l 8792 3493 8799 3500 100 -} -a { - s 0 - b 14664 3501 - e 14671 3508 - l 14664 3501 14671 3508 100 -} -a { - s 0 - b 14665 3502 - e 14672 3509 - l 14665 3502 14672 3509 100 -} -a { - s 0 - b 14666 3503 - e 14673 3510 - l 14666 3503 14673 3510 100 -} -a { - s 0 - b 9656 3506 - e 9663 3513 - l 9656 3506 9663 3513 100 -} -a { - s 0 - b 14047 3518 - e 14054 3525 - l 14047 3518 14054 3525 100 -} -a { - s 0 - b 6328 3528 - e 6335 3535 - l 6328 3528 6335 3535 100 -} -a { - s 0 - b 4052 3529 - e 4059 3536 - l 4052 3529 4059 3536 100 -} -a { - s 0 - b 14639 3563 - e 14646 3570 - l 14639 3563 14646 3570 100 -} -a { - s 0 - b 8246 3563 - e 8253 3570 - l 8246 3563 8253 3570 100 -} -a { - s 0 - b 8247 3564 - e 8254 3571 - l 8247 3564 8254 3571 100 -} -a { - s 0 - b 14103 3565 - e 14110 3572 - l 14103 3565 14110 3572 100 -} -a { - s 0 - b 17187 3567 - e 17194 3574 - l 17187 3567 17194 3574 100 -} -a { - s 0 - b 17188 3568 - e 17195 3575 - l 17188 3568 17195 3575 100 -} -a { - s 0 - b 9269 3568 - e 9276 3575 - l 9269 3568 9276 3575 100 -} -a { - s 0 - b 17189 3569 - e 17196 3576 - l 17189 3569 17196 3576 100 -} -a { - s 0 - b 18366 3575 - e 18373 3582 - l 18366 3575 18373 3582 100 -} -a { - s 0 - b 18367 3576 - e 18374 3583 - l 18367 3576 18374 3583 100 -} -a { - s 0 - b 17259 3600 - e 17266 3607 - l 17259 3600 17266 3607 100 -} -a { - s 0 - b 15257 3610 - e 15264 3617 - l 15257 3610 15264 3617 100 -} -a { - s 0 - b 7831 3616 - e 7838 3623 - l 7831 3616 7838 3623 100 -} -a { - s 0 - b 8785 3617 - e 8792 3624 - l 8785 3617 8792 3624 100 -} -a { - s 0 - b 4096 3631 - e 4103 3638 - l 4096 3631 4103 3638 100 -} -a { - s 0 - b 13903 3633 - e 13910 3640 - l 13903 3633 13910 3640 100 -} -a { - s 0 - b 10847 3646 - e 10854 3653 - l 10847 3646 10854 3653 100 -} -a { - s 0 - b 4584 3648 - e 4591 3655 - l 4584 3648 4591 3655 100 -} -a { - s 0 - b 4585 3649 - e 4592 3656 - l 4585 3649 4592 3656 100 -} -a { - s 0 - b 16023 3655 - e 16030 
3662 - l 16023 3655 16030 3662 100 -} -a { - s 0 - b 8382 3661 - e 8389 3668 - l 8382 3661 8389 3668 100 -} -a { - s 0 - b 8771 3662 - e 8778 3669 - l 8771 3662 8778 3669 100 -} -a { - s 0 - b 8772 3663 - e 8779 3670 - l 8772 3663 8779 3670 100 -} -a { - s 0 - b 7403 3670 - e 7410 3677 - l 7403 3670 7410 3677 100 -} -a { - s 0 - b 7404 3671 - e 7411 3678 - l 7404 3671 7411 3678 100 -} -a { - s 0 - b 3243 3673 - e 3250 3680 - l 3243 3673 3250 3680 100 -} -a { - s 0 - b 14285 3678 - e 14292 3685 - l 14285 3678 14292 3685 100 -} -a { - s 0 - b 10889 3698 - e 10896 3705 - l 10889 3698 10896 3705 100 -} -a { - s 0 - b 97 3698 - e 104 3705 - l 97 3698 104 3705 100 -} -a { - s 0 - b 17507 3699 - e 17514 3706 - l 17507 3699 17514 3706 100 -} -a { - s 0 - b 10890 3699 - e 10897 3706 - l 10890 3699 10897 3706 100 -} -a { - s 0 - b 4844 3708 - e 4851 3715 - l 4844 3708 4851 3715 100 -} -a { - s 0 - b 4845 3709 - e 4852 3716 - l 4845 3709 4852 3716 100 -} -a { - s 0 - b 4949 3721 - e 4956 3728 - l 4949 3721 4956 3728 100 -} -a { - s 0 - b 17281 3745 - e 17288 3752 - l 17281 3745 17288 3752 100 -} -a { - s 0 - b 6338 3745 - e 6345 3752 - l 6338 3745 6345 3752 100 -} -a { - s 0 - b 14619 3761 - e 14626 3768 - l 14619 3761 14626 3768 100 -} -a { - s 0 - b 4525 3761 - e 4532 3768 - l 4525 3761 4532 3768 100 -} -a { - s 0 - b 11164 3769 - e 11171 3776 - l 11164 3769 11171 3776 100 -} -a { - s 0 - b 11023 3769 - e 11030 3776 - l 11023 3769 11030 3776 100 -} -a { - s 0 - b 11165 3770 - e 11172 3777 - l 11165 3770 11172 3777 100 -} -a { - s 0 - b 6582 3770 - e 6589 3777 - l 6582 3770 6589 3777 100 -} -a { - s 0 - b 3366 3774 - e 3373 3781 - l 3366 3774 3373 3781 100 -} -a { - s 0 - b 14390 3775 - e 14397 3782 - l 14390 3775 14397 3782 100 -} -a { - s 0 - b 3661 3795 - e 3668 3802 - l 3661 3795 3668 3802 100 -} -a { - s 0 - b 3662 3796 - e 3669 3803 - l 3662 3796 3669 3803 100 -} -a { - s 0 - b 3663 3797 - e 3670 3804 - l 3663 3797 3670 3804 100 -} -a { - s 0 - b 7762 3798 - e 7769 
3805 - l 7762 3798 7769 3805 100 -} -a { - s 0 - b 219 3806 - e 226 3813 - l 219 3806 226 3813 100 -} -a { - s 0 - b 13885 3813 - e 13892 3820 - l 13885 3813 13892 3820 100 -} -a { - s 0 - b 12222 3815 - e 12229 3822 - l 12222 3815 12229 3822 100 -} -a { - s 0 - b 5120 3841 - e 5127 3848 - l 5120 3841 5127 3848 100 -} -a { - s 0 - b 14348 3845 - e 14355 3852 - l 14348 3845 14355 3852 100 -} -a { - s 0 - b 6312 3845 - e 6319 3852 - l 6312 3845 6319 3852 100 -} -a { - s 0 - b 8376 3869 - e 8383 3876 - l 8376 3869 8383 3876 100 -} -a { - s 0 - b 8377 3870 - e 8384 3877 - l 8377 3870 8384 3877 100 -} -a { - s 0 - b 18458 3873 - e 18465 3880 - l 18458 3873 18465 3880 100 -} -a { - s 0 - b 8821 3873 - e 8828 3880 - l 8821 3873 8828 3880 100 -} -a { - s 0 - b 14362 3874 - e 14369 3881 - l 14362 3874 14369 3881 100 -} -a { - s 0 - b 18042 3876 - e 18049 3883 - l 18042 3876 18049 3883 100 -} -a { - s 0 - b 11292 3876 - e 11299 3883 - l 11292 3876 11299 3883 100 -} -a { - s 0 - b 9684 3878 - e 9691 3885 - l 9684 3878 9691 3885 100 -} -a { - s 0 - b 8111 3902 - e 8118 3909 - l 8111 3902 8118 3909 100 -} -a { - s 0 - b 13498 3905 - e 13505 3912 - l 13498 3905 13505 3912 100 -} -a { - s 0 - b 8952 3905 - e 8959 3912 - l 8952 3905 8959 3912 100 -} -a { - s 0 - b 8953 3906 - e 8960 3913 - l 8953 3906 8960 3913 100 -} -a { - s 0 - b 16090 3918 - e 16097 3925 - l 16090 3918 16097 3925 100 -} -a { - s 0 - b 9600 3919 - e 9607 3926 - l 9600 3919 9607 3926 100 -} -a { - s 0 - b 9601 3920 - e 9608 3927 - l 9601 3920 9608 3927 100 -} -a { - s 0 - b 13479 3955 - e 13486 3962 - l 13479 3955 13486 3962 100 -} -a { - s 0 - b 6740 3957 - e 6747 3964 - l 6740 3957 6747 3964 100 -} -a { - s 0 - b 6741 3958 - e 6748 3965 - l 6741 3958 6748 3965 100 -} -a { - s 0 - b 4556 3967 - e 4563 3974 - l 4556 3967 4563 3974 100 -} -a { - s 0 - b 14613 3968 - e 14620 3975 - l 14613 3968 14620 3975 100 -} -a { - s 0 - b 15404 3969 - e 15411 3976 - l 15404 3969 15411 3976 100 -} -a { - s 0 - b 14614 3969 - e 
14621 3976 - l 14614 3969 14621 3976 100 -} -a { - s 0 - b 9395 3972 - e 9402 3979 - l 9395 3972 9402 3979 100 -} -a { - s 0 - b 9396 3973 - e 9403 3980 - l 9396 3973 9403 3980 100 -} -a { - s 0 - b 14318 3974 - e 14325 3981 - l 14318 3974 14325 3981 100 -} -a { - s 0 - b 4605 3976 - e 4612 3983 - l 4605 3976 4612 3983 100 -} -a { - s 0 - b 11088 3988 - e 11095 3995 - l 11088 3988 11095 3995 100 -} -a { - s 0 - b 17987 3989 - e 17994 3996 - l 17987 3989 17994 3996 100 -} -a { - s 0 - b 4984 3992 - e 4991 3999 - l 4984 3992 4991 3999 100 -} -a { - s 0 - b 14692 3993 - e 14699 4000 - l 14692 3993 14699 4000 100 -} -a { - s 0 - b 14693 3994 - e 14700 4001 - l 14693 3994 14700 4001 100 -} -a { - s 0 - b 11284 3994 - e 11291 4001 - l 11284 3994 11291 4001 100 -} -a { - s 0 - b 11285 3995 - e 11292 4002 - l 11285 3995 11292 4002 100 -} -a { - s 0 - b 15272 4010 - e 15279 4017 - l 15272 4010 15279 4017 100 -} -a { - s 0 - b 15273 4011 - e 15280 4018 - l 15273 4011 15280 4018 100 -} -a { - s 0 - b 9461 4016 - e 9468 4023 - l 9461 4016 9468 4023 100 -} -a { - s 0 - b 17978 4059 - e 17985 4066 - l 17978 4059 17985 4066 100 -} -a { - s 0 - b 2978 4061 - e 2985 4068 - l 2978 4061 2985 4068 100 -} -a { - s 0 - b 8520 4072 - e 8527 4079 - l 8520 4072 8527 4079 100 -} -a { - s 0 - b 14713 4074 - e 14720 4081 - l 14713 4074 14720 4081 100 -} -a { - s 0 - b 6363 4074 - e 6370 4081 - l 6363 4074 6370 4081 100 -} -a { - s 0 - b 18043 4089 - e 18050 4096 - l 18043 4089 18050 4096 100 -} -a { - s 0 - b 7642 4109 - e 7649 4116 - l 7642 4109 7649 4116 100 -} -a { - s 0 - b 7643 4110 - e 7650 4117 - l 7643 4110 7650 4117 100 -} -a { - s 0 - b 7644 4111 - e 7651 4118 - l 7644 4111 7651 4118 100 -} -a { - s 0 - b 18355 4115 - e 18362 4122 - l 18355 4115 18362 4122 100 -} -a { - s 0 - b 6707 4115 - e 6714 4122 - l 6707 4115 6714 4122 100 -} -a { - s 0 - b 18356 4116 - e 18363 4123 - l 18356 4116 18363 4123 100 -} -a { - s 0 - b 6708 4116 - e 6715 4123 - l 6708 4116 6715 4123 100 -} -a { - s 
0 - b 15970 4156 - e 15977 4163 - l 15970 4156 15977 4163 100 -} -a { - s 0 - b 17208 4162 - e 17215 4169 - l 17208 4162 17215 4169 100 -} -a { - s 0 - b 17209 4163 - e 17216 4170 - l 17209 4163 17216 4170 100 -} -a { - s 0 - b 17210 4164 - e 17217 4171 - l 17210 4164 17217 4171 100 -} -a { - s 0 - b 17211 4165 - e 17218 4172 - l 17211 4165 17218 4172 100 -} -a { - s 0 - b 5618 4165 - e 5625 4172 - l 5618 4165 5625 4172 100 -} -a { - s 0 - b 3561 4165 - e 3568 4172 - l 3561 4165 3568 4172 100 -} -a { - s 0 - b 17212 4166 - e 17219 4173 - l 17212 4166 17219 4173 100 -} -a { - s 0 - b 15384 4167 - e 15391 4174 - l 15384 4167 15391 4174 100 -} -a { - s 0 - b 5625 4170 - e 5632 4177 - l 5625 4170 5632 4177 100 -} -a { - s 0 - b 5626 4171 - e 5633 4178 - l 5626 4171 5633 4178 100 -} -a { - s 0 - b 17412 4176 - e 17419 4183 - l 17412 4176 17419 4183 100 -} -a { - s 0 - b 14302 4176 - e 14309 4183 - l 14302 4176 14309 4183 100 -} -a { - s 0 - b 7977 4178 - e 7984 4185 - l 7977 4178 7984 4185 100 -} -a { - s 0 - b 7978 4179 - e 7985 4186 - l 7978 4179 7985 4186 100 -} -a { - s 0 - b 7686 4180 - e 7693 4187 - l 7686 4180 7693 4187 100 -} -a { - s 0 - b 16101 4183 - e 16108 4190 - l 16101 4183 16108 4190 100 -} -a { - s 0 - b 8750 4185 - e 8757 4192 - l 8750 4185 8757 4192 100 -} -a { - s 0 - b 18751 4195 - e 18758 4202 - l 18751 4195 18758 4202 100 -} -a { - s 0 - b 12241 4201 - e 12248 4208 - l 12241 4201 12248 4208 100 -} -a { - s 0 - b 135 4202 - e 142 4209 - l 135 4202 142 4209 100 -} -a { - s 0 - b 14273 4205 - e 14280 4212 - l 14273 4205 14280 4212 100 -} -a { - s 0 - b 7654 4229 - e 7661 4236 - l 7654 4229 7661 4236 100 -} -a { - s 0 - b 10944 4250 - e 10951 4257 - l 10944 4250 10951 4257 100 -} -a { - s 0 - b 14501 4251 - e 14508 4258 - l 14501 4251 14508 4258 100 -} -a { - s 0 - b 3161 4253 - e 3168 4260 - l 3161 4253 3168 4260 100 -} -a { - s 0 - b 3298 4274 - e 3305 4281 - l 3298 4274 3305 4281 100 -} -a { - s 0 - b 4908 4293 - e 4915 4300 - l 4908 4293 4915 4300 
100 -} -a { - s 0 - b 15948 4294 - e 15955 4301 - l 15948 4294 15955 4301 100 -} -a { - s 0 - b 3927 4294 - e 3934 4301 - l 3927 4294 3934 4301 100 -} -a { - s 0 - b 11515 4295 - e 11522 4302 - l 11515 4295 11522 4302 100 -} -a { - s 0 - b 4968 4319 - e 4975 4326 - l 4968 4319 4975 4326 100 -} -a { - s 0 - b 11193 4323 - e 11200 4330 - l 11193 4323 11200 4330 100 -} -a { - s 0 - b 4888 4324 - e 4895 4331 - l 4888 4324 4895 4331 100 -} -a { - s 0 - b 7575 4329 - e 7582 4336 - l 7575 4329 7582 4336 100 -} -a { - s 0 - b 18663 4335 - e 18670 4342 - l 18663 4335 18670 4342 100 -} -a { - s 0 - b 15338 4336 - e 15345 4343 - l 15338 4336 15345 4343 100 -} -a { - s 0 - b 15339 4337 - e 15346 4344 - l 15339 4337 15346 4344 100 -} -a { - s 0 - b 14030 4342 - e 14037 4349 - l 14030 4342 14037 4349 100 -} -a { - s 0 - b 14031 4343 - e 14038 4350 - l 14031 4343 14038 4350 100 -} -a { - s 0 - b 3900 4348 - e 3907 4355 - l 3900 4348 3907 4355 100 -} -a { - s 0 - b 2985 4359 - e 2992 4366 - l 2985 4359 2992 4366 100 -} -a { - s 0 - b 11270 4364 - e 11277 4371 - l 11270 4364 11277 4371 100 -} -a { - s 0 - b 15920 4373 - e 15927 4380 - l 15920 4373 15927 4380 100 -} -a { - s 0 - b 8968 4379 - e 8975 4386 - l 8968 4379 8975 4386 100 -} -a { - s 0 - b 7527 4383 - e 7534 4390 - l 7527 4383 7534 4390 100 -} -a { - s 0 - b 15922 4397 - e 15929 4404 - l 15922 4397 15929 4404 100 -} -a { - s 0 - b 11310 4420 - e 11317 4427 - l 11310 4420 11317 4427 100 -} -a { - s 0 - b 6822 4422 - e 6829 4429 - l 6822 4422 6829 4429 100 -} -a { - s 0 - b 4380 4422 - e 4387 4429 - l 4380 4422 4387 4429 100 -} -a { - s 0 - b 6823 4423 - e 6830 4430 - l 6823 4423 6830 4430 100 -} -a { - s 0 - b 4381 4423 - e 4388 4430 - l 4381 4423 4388 4430 100 -} -a { - s 0 - b 14923 4431 - e 14930 4438 - l 14923 4431 14930 4438 100 -} -a { - s 0 - b 3185 4435 - e 3192 4442 - l 3185 4435 3192 4442 100 -} -a { - s 0 - b 10868 4441 - e 10875 4448 - l 10868 4441 10875 4448 100 -} -a { - s 0 - b 3972 4449 - e 3979 4456 - l 
3972 4449 3979 4456 100 -} -a { - s 0 - b 3251 4459 - e 3258 4466 - l 3251 4459 3258 4466 100 -} -a { - s 0 - b 18772 4460 - e 18779 4467 - l 18772 4460 18779 4467 100 -} -a { - s 0 - b 3252 4460 - e 3259 4467 - l 3252 4460 3259 4467 100 -} -a { - s 0 - b 4866 4462 - e 4873 4469 - l 4866 4462 4873 4469 100 -} -a { - s 0 - b 4867 4463 - e 4874 4470 - l 4867 4463 4874 4470 100 -} -a { - s 0 - b 5139 4464 - e 5146 4471 - l 5139 4464 5146 4471 100 -} -a { - s 0 - b 7570 4468 - e 7577 4475 - l 7570 4468 7577 4475 100 -} -a { - s 0 - b 2988 4478 - e 2995 4485 - l 2988 4478 2995 4485 100 -} -a { - s 0 - b 17435 4479 - e 17442 4486 - l 17435 4479 17442 4486 100 -} -a { - s 0 - b 17436 4480 - e 17443 4487 - l 17436 4480 17443 4487 100 -} -a { - s 0 - b 6279 4480 - e 6286 4487 - l 6279 4480 6286 4487 100 -} -a { - s 0 - b 11073 4501 - e 11080 4508 - l 11073 4501 11080 4508 100 -} -a { - s 0 - b 11247 4512 - e 11254 4519 - l 11247 4512 11254 4519 100 -} -a { - s 0 - b 13402 4520 - e 13409 4527 - l 13402 4520 13409 4527 100 -} -a { - s 0 - b 5088 4532 - e 5095 4539 - l 5088 4532 5095 4539 100 -} -a { - s 0 - b 5070 4755 - e 5077 4762 - l 5070 4755 5077 4762 100 -} -a { - s 0 - b 15215 4759 - e 15222 4766 - l 15215 4759 15222 4766 100 -} -a { - s 0 - b 4867 4760 - e 4874 4767 - l 4867 4760 4874 4767 100 -} -a { - s 0 - b 4868 4761 - e 4875 4768 - l 4868 4761 4875 4768 100 -} -a { - s 0 - b 8037 4777 - e 8044 4784 - l 8037 4777 8044 4784 100 -} -a { - s 0 - b 13906 4778 - e 13913 4785 - l 13906 4778 13913 4785 100 -} -a { - s 0 - b 8945 4787 - e 8952 4794 - l 8945 4787 8952 4794 100 -} -a { - s 0 - b 8946 4788 - e 8953 4795 - l 8946 4788 8953 4795 100 -} -a { - s 0 - b 82 4789 - e 89 4796 - l 82 4789 89 4796 100 -} -a { - s 0 - b 6247 4792 - e 6254 4799 - l 6247 4792 6254 4799 100 -} -a { - s 0 - b 8268 4815 - e 8275 4822 - l 8268 4815 8275 4822 100 -} -a { - s 0 - b 9281 4817 - e 9288 4824 - l 9281 4817 9288 4824 100 -} -a { - s 0 - b 7681 4824 - e 7688 4831 - l 7681 4824 7688 
4831 100 -} -a { - s 0 - b 7682 4825 - e 7689 4832 - l 7682 4825 7689 4832 100 -} -a { - s 0 - b 6732 4828 - e 6739 4835 - l 6732 4828 6739 4835 100 -} -a { - s 0 - b 6733 4829 - e 6740 4836 - l 6733 4829 6740 4836 100 -} -a { - s 0 - b 7594 4832 - e 7601 4839 - l 7594 4832 7601 4839 100 -} -a { - s 0 - b 7595 4833 - e 7602 4840 - l 7595 4833 7602 4840 100 -} -a { - s 0 - b 10975 4835 - e 10982 4842 - l 10975 4835 10982 4842 100 -} -a { - s 0 - b 14530 4858 - e 14537 4865 - l 14530 4858 14537 4865 100 -} -a { - s 0 - b 14531 4859 - e 14538 4866 - l 14531 4859 14538 4866 100 -} -a { - s 0 - b 16138 4875 - e 16145 4882 - l 16138 4875 16145 4882 100 -} -a { - s 0 - b 9441 4877 - e 9448 4884 - l 9441 4877 9448 4884 100 -} -a { - s 0 - b 9442 4878 - e 9449 4885 - l 9442 4878 9449 4885 100 -} -a { - s 0 - b 9443 4879 - e 9450 4886 - l 9443 4879 9450 4886 100 -} -a { - s 0 - b 7781 4881 - e 7788 4888 - l 7781 4881 7788 4888 100 -} -a { - s 0 - b 18514 4889 - e 18521 4896 - l 18514 4889 18521 4896 100 -} -a { - s 0 - b 18515 4890 - e 18522 4897 - l 18515 4890 18522 4897 100 -} -a { - s 0 - b 3259 4890 - e 3266 4897 - l 3259 4890 3266 4897 100 -} -a { - s 0 - b 17289 4896 - e 17296 4903 - l 17289 4896 17296 4903 100 -} -a { - s 0 - b 11330 4902 - e 11337 4909 - l 11330 4902 11337 4909 100 -} -a { - s 0 - b 6697 4906 - e 6704 4913 - l 6697 4906 6704 4913 100 -} -a { - s 0 - b 15273 4922 - e 15280 4929 - l 15273 4922 15280 4929 100 -} -a { - s 0 - b 9333 4927 - e 9340 4934 - l 9333 4927 9340 4934 100 -} -a { - s 0 - b 8866 4936 - e 8873 4943 - l 8866 4936 8873 4943 100 -} -a { - s 0 - b 17 4953 - e 24 4960 - l 17 4953 24 4960 100 -} -a { - s 0 - b 5049 4957 - e 5056 4964 - l 5049 4957 5056 4964 100 -} -a { - s 0 - b 5018 4961 - e 5025 4968 - l 5018 4961 5025 4968 100 -} -a { - s 0 - b 3145 4975 - e 3152 4982 - l 3145 4975 3152 4982 100 -} -a { - s 0 - b 11049 4988 - e 11056 4995 - l 11049 4988 11056 4995 100 -} -a { - s 0 - b 2972 4994 - e 2979 5001 - l 2972 4994 2979 5001 
100 -} -a { - s 0 - b 11358 5018 - e 11365 5025 - l 11358 5018 11365 5025 100 -} -a { - s 0 - b 6751 5018 - e 6758 5025 - l 6751 5018 6758 5025 100 -} -a { - s 0 - b 11359 5019 - e 11366 5026 - l 11359 5019 11366 5026 100 -} -a { - s 0 - b 7703 5037 - e 7710 5044 - l 7703 5037 7710 5044 100 -} -a { - s 0 - b 13452 5054 - e 13459 5061 - l 13452 5054 13459 5061 100 -} -a { - s 0 - b 14266 5055 - e 14273 5062 - l 14266 5055 14273 5062 100 -} -a { - s 0 - b 14267 5056 - e 14274 5063 - l 14267 5056 14274 5063 100 -} -a { - s 0 - b 17209 5059 - e 17216 5066 - l 17209 5059 17216 5066 100 -} -a { - s 0 - b 8274 5064 - e 8281 5071 - l 8274 5064 8281 5071 100 -} -a { - s 0 - b 8275 5065 - e 8282 5072 - l 8275 5065 8282 5072 100 -} -a { - s 0 - b 8276 5066 - e 8283 5073 - l 8276 5066 8283 5073 100 -} -a { - s 0 - b 146 5070 - e 153 5077 - l 146 5070 153 5077 100 -} -a { - s 0 - b 147 5071 - e 154 5078 - l 147 5071 154 5078 100 -} -a { - s 0 - b 148 5072 - e 155 5079 - l 148 5072 155 5079 100 -} -a { - s 0 - b 14923 5077 - e 14930 5084 - l 14923 5077 14930 5084 100 -} -a { - s 0 - b 13992 5078 - e 13999 5085 - l 13992 5078 13999 5085 100 -} -a { - s 0 - b 6246 5080 - e 6253 5087 - l 6246 5080 6253 5087 100 -} -a { - s 0 - b 9054 5119 - e 9061 5126 - l 9054 5119 9061 5126 100 -} -a { - s 0 - b 9055 5120 - e 9062 5127 - l 9055 5120 9062 5127 100 -} -a { - s 0 - b 9056 5121 - e 9063 5128 - l 9056 5121 9063 5128 100 -} -a { - s 0 - b 9057 5122 - e 9064 5129 - l 9057 5122 9064 5129 100 -} -a { - s 0 - b 9058 5123 - e 9065 5130 - l 9058 5123 9065 5130 100 -} -a { - s 0 - b 10856 5131 - e 10863 5138 - l 10856 5131 10863 5138 100 -} -a { - s 0 - b 157 5131 - e 164 5138 - l 157 5131 164 5138 100 -} -a { - s 0 - b 143 5597 - e 150 5604 - l 143 5597 150 5604 100 -} -a { - s 0 - b 144 5598 - e 151 5605 - l 144 5598 151 5605 100 -} -a { - s 0 - b 14039 5600 - e 14046 5607 - l 14039 5600 14046 5607 100 -} -a { - s 0 - b 8902 5654 - e 8909 5661 - l 8902 5654 8909 5661 100 -} -a { - s 0 - b 
2976 5697 - e 2983 5704 - l 2976 5697 2983 5704 100 -} -a { - s 0 - b 2977 5698 - e 2984 5705 - l 2977 5698 2984 5705 100 -} -a { - s 0 - b 2978 5699 - e 2985 5706 - l 2978 5699 2985 5706 100 -} -a { - s 0 - b 9496 5702 - e 9503 5709 - l 9496 5702 9503 5709 100 -} -a { - s 0 - b 11306 5703 - e 11313 5710 - l 11306 5703 11313 5710 100 -} -a { - s 0 - b 8784 5704 - e 8791 5711 - l 8784 5704 8791 5711 100 -} -a { - s 0 - b 8785 5705 - e 8792 5712 - l 8785 5705 8792 5712 100 -} -a { - s 0 - b 6850 5738 - e 6857 5745 - l 6850 5738 6857 5745 100 -} -a { - s 0 - b 8202 5742 - e 8209 5749 - l 8202 5742 8209 5749 100 -} -a { - s 0 - b 8203 5743 - e 8210 5750 - l 8203 5743 8210 5750 100 -} -a { - s 0 - b 3891 5743 - e 3898 5750 - l 3891 5743 3898 5750 100 -} -a { - s 0 - b 5605 5751 - e 5612 5758 - l 5605 5751 5612 5758 100 -} -a { - s 0 - b 7748 5752 - e 7755 5759 - l 7748 5752 7755 5759 100 -} -a { - s 0 - b 5166 5763 - e 5173 5770 - l 5166 5763 5173 5770 100 -} -a { - s 0 - b 16145 5770 - e 16152 5777 - l 16145 5770 16152 5777 100 -} -a { - s 0 - b 14338 5793 - e 14345 5800 - l 14338 5793 14345 5800 100 -} -a { - s 0 - b 12327 5793 - e 12334 5800 - l 12327 5793 12334 5800 100 -} -a { - s 0 - b 8484 5793 - e 8491 5800 - l 8484 5793 8491 5800 100 -} -a { - s 0 - b 14339 5794 - e 14346 5801 - l 14339 5794 14346 5801 100 -} -a { - s 0 - b 5096 5794 - e 5103 5801 - l 5096 5794 5103 5801 100 -} -a { - s 0 - b 18750 5795 - e 18757 5802 - l 18750 5795 18757 5802 100 -} -a { - s 0 - b 5097 5795 - e 5104 5802 - l 5097 5795 5104 5802 100 -} -a { - s 0 - b 18751 5796 - e 18758 5803 - l 18751 5796 18758 5803 100 -} -a { - s 0 - b 18752 5797 - e 18759 5804 - l 18752 5797 18759 5804 100 -} -a { - s 0 - b 17247 5798 - e 17254 5805 - l 17247 5798 17254 5805 100 -} -a { - s 0 - b 11202 5823 - e 11209 5830 - l 11202 5823 11209 5830 100 -} -a { - s 0 - b 18105 5825 - e 18112 5832 - l 18105 5825 18112 5832 100 -} -a { - s 0 - b 14782 5826 - e 14789 5833 - l 14782 5826 14789 5833 100 -} -a { - 
s 0 - b 8293 5837 - e 8300 5844 - l 8293 5837 8300 5844 100 -} -a { - s 0 - b 8294 5838 - e 8301 5845 - l 8294 5838 8301 5845 100 -} -a { - s 0 - b 3234 5845 - e 3241 5852 - l 3234 5845 3241 5852 100 -} -a { - s 0 - b 15189 5846 - e 15196 5853 - l 15189 5846 15196 5853 100 -} -a { - s 0 - b 3235 5846 - e 3242 5853 - l 3235 5846 3242 5853 100 -} -a { - s 0 - b 16017 5847 - e 16024 5854 - l 16017 5847 16024 5854 100 -} -a { - s 0 - b 3236 5847 - e 3243 5854 - l 3236 5847 3243 5854 100 -} -a { - s 0 - b 3237 5848 - e 3244 5855 - l 3237 5848 3244 5855 100 -} -a { - s 0 - b 6671 5849 - e 6678 5856 - l 6671 5849 6678 5856 100 -} -a { - s 0 - b 6672 5850 - e 6679 5857 - l 6672 5850 6679 5857 100 -} -a { - s 0 - b 6673 5851 - e 6680 5858 - l 6673 5851 6680 5858 100 -} -a { - s 0 - b 6674 5852 - e 6681 5859 - l 6674 5852 6681 5859 100 -} -a { - s 0 - b 6367 5853 - e 6374 5860 - l 6367 5853 6374 5860 100 -} -a { - s 0 - b 6368 5854 - e 6375 5861 - l 6368 5854 6375 5861 100 -} -a { - s 0 - b 11148 5855 - e 11155 5862 - l 11148 5855 11155 5862 100 -} -a { - s 0 - b 11149 5856 - e 11156 5863 - l 11149 5856 11156 5863 100 -} -a { - s 0 - b 11150 5857 - e 11157 5864 - l 11150 5857 11157 5864 100 -} -a { - s 0 - b 11151 5858 - e 11158 5865 - l 11151 5858 11158 5865 100 -} -a { - s 0 - b 6667 5859 - e 6674 5866 - l 6667 5859 6674 5866 100 -} -a { - s 0 - b 6668 5860 - e 6675 5867 - l 6668 5860 6675 5867 100 -} -a { - s 0 - b 3688 5861 - e 3695 5868 - l 3688 5861 3695 5868 100 -} -a { - s 0 - b 14886 5864 - e 14893 5871 - l 14886 5864 14893 5871 100 -} -a { - s 0 - b 8573 5866 - e 8580 5873 - l 8573 5866 8580 5873 100 -} -a { - s 0 - b 8574 5867 - e 8581 5874 - l 8574 5867 8581 5874 100 -} -a { - s 0 - b 4860 5879 - e 4867 5886 - l 4860 5879 4867 5886 100 -} -a { - s 0 - b 14468 5884 - e 14475 5891 - l 14468 5884 14475 5891 100 -} -a { - s 0 - b 7785 5885 - e 7792 5892 - l 7785 5885 7792 5892 100 -} -a { - s 0 - b 9447 5894 - e 9454 5901 - l 9447 5894 9454 5901 100 -} -a { - s 0 - b 
3178 5894 - e 3185 5901 - l 3178 5894 3185 5901 100 -} -a { - s 0 - b 4936 5906 - e 4943 5913 - l 4936 5906 4943 5913 100 -} -a { - s 0 - b 4937 5907 - e 4944 5914 - l 4937 5907 4944 5914 100 -} -a { - s 0 - b 4938 5908 - e 4945 5915 - l 4938 5908 4945 5915 100 -} -a { - s 0 - b 15271 5909 - e 15278 5916 - l 15271 5909 15278 5916 100 -} -a { - s 0 - b 2958 5912 - e 2965 5919 - l 2958 5912 2965 5919 100 -} -a { - s 0 - b 17163 5913 - e 17170 5920 - l 17163 5913 17170 5920 100 -} -a { - s 0 - b 2959 5913 - e 2966 5920 - l 2959 5913 2966 5920 100 -} -a { - s 0 - b 2960 5914 - e 2967 5921 - l 2960 5914 2967 5921 100 -} -a { - s 0 - b 2961 5915 - e 2968 5922 - l 2961 5915 2968 5922 100 -} -a { - s 0 - b 7465 5929 - e 7472 5936 - l 7465 5929 7472 5936 100 -} -a { - s 0 - b 14389 5932 - e 14396 5939 - l 14389 5932 14396 5939 100 -} -a { - s 0 - b 14390 5933 - e 14397 5940 - l 14390 5933 14397 5940 100 -} -a { - s 0 - b 11170 6174 - e 11177 6181 - l 11170 6174 11177 6181 100 -} -a { - s 0 - b 92 6195 - e 99 6202 - l 92 6195 99 6202 100 -} -a { - s 0 - b 8021 6196 - e 8028 6203 - l 8021 6196 8028 6203 100 -} -a { - s 0 - b 8184 6197 - e 8191 6204 - l 8184 6197 8191 6204 100 -} -a { - s 0 - b 4834 6200 - e 4841 6207 - l 4834 6200 4841 6207 100 -} -a { - s 0 - b 17255 6201 - e 17262 6208 - l 17255 6201 17262 6208 100 -} -a { - s 0 - b 6427 6206 - e 6434 6213 - l 6427 6206 6434 6213 100 -} -a { - s 0 - b 6428 6207 - e 6435 6214 - l 6428 6207 6435 6214 100 -} -a { - s 0 - b 14265 6211 - e 14272 6218 - l 14265 6211 14272 6218 100 -} -a { - s 0 - b 13453 6212 - e 13460 6219 - l 13453 6212 13460 6219 100 -} -a { - s 0 - b 8753 6230 - e 8760 6237 - l 8753 6230 8760 6237 100 -} -a { - s 0 - b 8754 6231 - e 8761 6238 - l 8754 6231 8761 6238 100 -} -a { - s 0 - b 8019 6232 - e 8026 6239 - l 8019 6232 8026 6239 100 -} -a { - s 0 - b 8020 6233 - e 8027 6240 - l 8020 6233 8027 6240 100 -} -a { - s 0 - b 3079 6233 - e 3086 6240 - l 3079 6233 3086 6240 100 -} -a { - s 0 - b 7477 6234 - e 
7484 6241 - l 7477 6234 7484 6241 100 -} -a { - s 0 - b 9448 6237 - e 9455 6244 - l 9448 6237 9455 6244 100 -} -a { - s 0 - b 14679 6241 - e 14686 6248 - l 14679 6241 14686 6248 100 -} -a { - s 0 - b 9705 6241 - e 9712 6248 - l 9705 6241 9712 6248 100 -} -a { - s 0 - b 9706 6242 - e 9713 6249 - l 9706 6242 9713 6249 100 -} -a { - s 0 - b 9707 6243 - e 9714 6250 - l 9707 6243 9714 6250 100 -} -a { - s 0 - b 11378 6251 - e 11385 6258 - l 11378 6251 11385 6258 100 -} -a { - s 0 - b 18091 6255 - e 18098 6262 - l 18091 6255 18098 6262 100 -} -a { - s 0 - b 18590 6257 - e 18597 6264 - l 18590 6257 18597 6264 100 -} -a { - s 0 - b 3028 6257 - e 3035 6264 - l 3028 6257 3035 6264 100 -} -a { - s 0 - b 232 6261 - e 239 6268 - l 232 6261 239 6268 100 -} -a { - s 0 - b 8772 6266 - e 8779 6273 - l 8772 6266 8779 6273 100 -} -a { - s 0 - b 8773 6267 - e 8780 6274 - l 8773 6267 8780 6274 100 -} -a { - s 0 - b 8774 6268 - e 8781 6275 - l 8774 6268 8781 6275 100 -} -a { - s 0 - b 17420 6280 - e 17427 6287 - l 17420 6280 17427 6287 100 -} -a { - s 0 - b 18157 6311 - e 18164 6318 - l 18157 6311 18164 6318 100 -} -a { - s 0 - b 3534 6321 - e 3541 6328 - l 3534 6321 3541 6328 100 -} -a { - s 0 - b 18407 6323 - e 18414 6330 - l 18407 6323 18414 6330 100 -} -a { - s 0 - b 18408 6324 - e 18415 6331 - l 18408 6324 18415 6331 100 -} -a { - s 0 - b 8345 6324 - e 8352 6331 - l 8345 6324 8352 6331 100 -} -a { - s 0 - b 8346 6325 - e 8353 6332 - l 8346 6325 8353 6332 100 -} -a { - s 0 - b 8347 6326 - e 8354 6333 - l 8347 6326 8354 6333 100 -} -a { - s 0 - b 8890 6450 - e 8897 6457 - l 8890 6450 8897 6457 100 -} -a { - s 0 - b 243 6450 - e 250 6457 - l 243 6450 250 6457 100 -} -a { - s 0 - b 6378 6865 - e 6385 6872 - l 6378 6865 6385 6872 100 -} -a { - s 0 - b 6379 6866 - e 6386 6873 - l 6379 6866 6386 6873 100 -} -a { - s 0 - b 6380 6867 - e 6387 6874 - l 6380 6867 6387 6874 100 -} -a { - s 0 - b 6381 6868 - e 6388 6875 - l 6381 6868 6388 6875 100 -} -a { - s 0 - b 15328 6870 - e 15335 6877 - l 
15328 6870 15335 6877 100 -} -a { - s 0 - b 15329 6871 - e 15336 6878 - l 15329 6871 15336 6878 100 -} -a { - s 0 - b 14892 6874 - e 14899 6881 - l 14892 6874 14899 6881 100 -} -a { - s 0 - b 17551 6875 - e 17558 6882 - l 17551 6875 17558 6882 100 -} -a { - s 0 - b 7536 6875 - e 7543 6882 - l 7536 6875 7543 6882 100 -} -a { - s 0 - b 3092 6903 - e 3099 6910 - l 3092 6903 3099 6910 100 -} -a { - s 0 - b 6246 6904 - e 6253 6911 - l 6246 6904 6253 6911 100 -} -a { - s 0 - b 14371 6913 - e 14378 6920 - l 14371 6913 14378 6920 100 -} -a { - s 0 - b 11529 6919 - e 11536 6926 - l 11529 6919 11536 6926 100 -} -a { - s 0 - b 9443 6920 - e 9450 6927 - l 9443 6920 9450 6927 100 -} -a { - s 0 - b 9444 6921 - e 9451 6928 - l 9444 6921 9451 6928 100 -} -a { - s 0 - b 11372 6923 - e 11379 6930 - l 11372 6923 11379 6930 100 -} -a { - s 0 - b 17995 6924 - e 18002 6931 - l 17995 6924 18002 6931 100 -} -a { - s 0 - b 17996 6925 - e 18003 6932 - l 17996 6925 18003 6932 100 -} -a { - s 0 - b 15900 6925 - e 15907 6932 - l 15900 6925 15907 6932 100 -} -a { - s 0 - b 15901 6926 - e 15908 6933 - l 15901 6926 15908 6933 100 -} -a { - s 0 - b 9598 6928 - e 9605 6935 - l 9598 6928 9605 6935 100 -} -a { - s 0 - b 15201 6951 - e 15208 6958 - l 15201 6951 15208 6958 100 -} -a { - s 0 - b 7415 9050 - e 7422 9057 - l 7415 9050 7422 9057 100 -} -a { - s 0 - b 6861 9053 - e 6868 9060 - l 6861 9053 6868 9060 100 -} -a { - s 0 - b 13418 9057 - e 13425 9064 - l 13418 9057 13425 9064 100 -} -a { - s 0 - b 7537 9061 - e 7544 9068 - l 7537 9061 7544 9068 100 -} -a { - s 0 - b 13498 9064 - e 13505 9071 - l 13498 9064 13505 9071 100 -} -a { - s 0 - b 8952 9064 - e 8959 9071 - l 8952 9064 8959 9071 100 -} -a { - s 0 - b 13975 9065 - e 13982 9072 - l 13975 9065 13982 9072 100 -} -a { - s 0 - b 15236 9067 - e 15243 9074 - l 15236 9067 15243 9074 100 -} -a { - s 0 - b 11550 9071 - e 11557 9078 - l 11550 9071 11557 9078 100 -} -a { - s 0 - b 14756 9072 - e 14763 9079 - l 14756 9072 14763 9079 100 -} -a { - s 0 - 
b 11551 9072 - e 11558 9079 - l 11551 9072 11558 9079 100 -} -a { - s 0 - b 8568 9080 - e 8575 9087 - l 8568 9080 8575 9087 100 -} -a { - s 0 - b 11202 9087 - e 11209 9094 - l 11202 9087 11209 9094 100 -} -a { - s 0 - b 18105 9089 - e 18112 9096 - l 18105 9089 18112 9096 100 -} -a { - s 0 - b 14782 9090 - e 14789 9097 - l 14782 9090 14789 9097 100 -} -a { - s 0 - b 8316 9092 - e 8323 9099 - l 8316 9092 8323 9099 100 -} -a { - s 0 - b 7509 9093 - e 7516 9100 - l 7509 9093 7516 9100 100 -} -a { - s 0 - b 6362 9104 - e 6369 9111 - l 6362 9104 6369 9111 100 -} -a { - s 0 - b 14713 9105 - e 14720 9112 - l 14713 9105 14720 9112 100 -} -a { - s 0 - b 6363 9105 - e 6370 9112 - l 6363 9105 6370 9112 100 -} -a { - s 0 - b 14816 9107 - e 14823 9114 - l 14816 9107 14823 9114 100 -} -a { - s 0 - b 8732 9107 - e 8739 9114 - l 8732 9107 8739 9114 100 -} -a { - s 0 - b 3566 9107 - e 3573 9114 - l 3566 9107 3573 9114 100 -} -a { - s 0 - b 3567 9108 - e 3574 9115 - l 3567 9108 3574 9115 100 -} -a { - s 0 - b 15354 9119 - e 15361 9126 - l 15354 9119 15361 9126 100 -} -a { - s 0 - b 9745 9145 - e 9752 9152 - l 9745 9145 9752 9152 100 -} -a { - s 0 - b 8629 9146 - e 8636 9153 - l 8629 9146 8636 9153 100 -} -a { - s 0 - b 8630 9147 - e 8637 9154 - l 8630 9147 8637 9154 100 -} -a { - s 0 - b 5593 9153 - e 5600 9160 - l 5593 9153 5600 9160 100 -} -a { - s 0 - b 7770 9154 - e 7777 9161 - l 7770 9154 7777 9161 100 -} -a { - s 0 - b 5594 9154 - e 5601 9161 - l 5594 9154 5601 9161 100 -} -a { - s 0 - b 14957 9157 - e 14964 9164 - l 14957 9157 14964 9164 100 -} -a { - s 0 - b 14958 9158 - e 14965 9165 - l 14958 9158 14965 9165 100 -} -a { - s 0 - b 13533 9158 - e 13540 9165 - l 13533 9158 13540 9165 100 -} -a { - s 0 - b 14959 9159 - e 14966 9166 - l 14959 9159 14966 9166 100 -} -a { - s 0 - b 14049 9159 - e 14056 9166 - l 14049 9159 14056 9166 100 -} -a { - s 0 - b 3380 9160 - e 3387 9167 - l 3380 9160 3387 9167 100 -} -a { - s 0 - b 15387 9161 - e 15394 9168 - l 15387 9161 15394 9168 100 -} 
-a { - s 0 - b 3381 9161 - e 3388 9168 - l 3381 9161 3388 9168 100 -} -a { - s 0 - b 15388 9162 - e 15395 9169 - l 15388 9162 15395 9169 100 -} -a { - s 0 - b 17982 9168 - e 17989 9175 - l 17982 9168 17989 9175 100 -} -a { - s 0 - b 14214 9184 - e 14221 9191 - l 14214 9184 14221 9191 100 -} -a { - s 0 - b 6638 9185 - e 6645 9192 - l 6638 9185 6645 9192 100 -} -a { - s 0 - b 8838 9189 - e 8845 9196 - l 8838 9189 8845 9196 100 -} -a { - s 0 - b 8839 9190 - e 8846 9197 - l 8839 9190 8846 9197 100 -} -a { - s 0 - b 7420 9194 - e 7427 9201 - l 7420 9194 7427 9201 100 -} -a { - s 0 - b 7421 9195 - e 7428 9202 - l 7421 9195 7428 9202 100 -} -a { - s 0 - b 17207 9197 - e 17214 9204 - l 17207 9197 17214 9204 100 -} -a { - s 0 - b 18747 9199 - e 18754 9206 - l 18747 9199 18754 9206 100 -} -a { - s 0 - b 17095 9202 - e 17102 9209 - l 17095 9202 17102 9209 100 -} -a { - s 0 - b 15231 9204 - e 15238 9211 - l 15231 9204 15238 9211 100 -} -a { - s 0 - b 6844 9204 - e 6851 9211 - l 6844 9204 6851 9211 100 -} -a { - s 0 - b 18374 9205 - e 18381 9212 - l 18374 9205 18381 9212 100 -} -a { - s 0 - b 17477 9220 - e 17484 9227 - l 17477 9220 17484 9227 100 -} -a { - s 0 - b 6298 9221 - e 6305 9228 - l 6298 9221 6305 9228 100 -} -a { - s 0 - b 16876 9225 - e 16883 9232 - l 16876 9225 16883 9232 100 -} -a { - s 0 - b 15329 9248 - e 15336 9255 - l 15329 9248 15336 9255 100 -} -a { - s 0 - b 6287 9259 - e 6294 9266 - l 6287 9259 6294 9266 100 -} -a { - s 0 - b 4421 9274 - e 4428 9281 - l 4421 9274 4428 9281 100 -} -a { - s 0 - b 10990 9309 - e 10997 9316 - l 10990 9309 10997 9316 100 -} -a { - s 0 - b 10991 9310 - e 10998 9317 - l 10991 9310 10998 9317 100 -} -a { - s 0 - b 14140 9311 - e 14147 9318 - l 14140 9311 14147 9318 100 -} -a { - s 0 - b 14141 9312 - e 14148 9319 - l 14141 9312 14148 9319 100 -} -a { - s 0 - b 8042 9320 - e 8049 9327 - l 8042 9320 8049 9327 100 -} -a { - s 0 - b 18575 9322 - e 18582 9329 - l 18575 9322 18582 9329 100 -} -a { - s 0 - b 3385 9331 - e 3392 9338 - l 
3385 9331 3392 9338 100 -} -a { - s 0 - b 11447 9332 - e 11454 9339 - l 11447 9332 11454 9339 100 -} -a { - s 0 - b 11448 9333 - e 11455 9340 - l 11448 9333 11455 9340 100 -} -a { - s 0 - b 4422 9333 - e 4429 9340 - l 4422 9333 4429 9340 100 -} -a { - s 0 - b 4423 9334 - e 4430 9341 - l 4423 9334 4430 9341 100 -} -a { - s 0 - b 18643 9342 - e 18650 9349 - l 18643 9342 18650 9349 100 -} -a { - s 0 - b 8944 9343 - e 8951 9350 - l 8944 9343 8951 9350 100 -} -a { - s 0 - b 8945 9344 - e 8952 9351 - l 8945 9344 8952 9351 100 -} -a { - s 0 - b 8946 9345 - e 8953 9352 - l 8946 9345 8953 9352 100 -} -a { - s 0 - b 11280 9351 - e 11287 9358 - l 11280 9351 11287 9358 100 -} -a { - s 0 - b 4379 9385 - e 4386 9392 - l 4379 9385 4386 9392 100 -} -a { - s 0 - b 6822 9386 - e 6829 9393 - l 6822 9386 6829 9393 100 -} -a { - s 0 - b 4380 9386 - e 4387 9393 - l 4380 9386 4387 9393 100 -} -a { - s 0 - b 17221 9395 - e 17228 9402 - l 17221 9395 17228 9402 100 -} -a { - s 0 - b 10958 9416 - e 10965 9423 - l 10958 9416 10965 9423 100 -} -a { - s 0 - b 5041 9421 - e 5048 9428 - l 5041 9421 5048 9428 100 -} -a { - s 0 - b 7597 9423 - e 7604 9430 - l 7597 9423 7604 9430 100 -} -a { - s 0 - b 13933 9426 - e 13940 9433 - l 13933 9426 13940 9433 100 -} -a { - s 0 - b 3387 9437 - e 3394 9444 - l 3387 9437 3394 9444 100 -} -a { - s 0 - b 10873 9456 - e 10880 9463 - l 10873 9456 10880 9463 100 -} -a { - s 0 - b 6820 9456 - e 6827 9463 - l 6820 9456 6827 9463 100 -} -a { - s 0 - b 2962 9470 - e 2969 9477 - l 2962 9470 2969 9477 100 -} -a { - s 0 - b 218 9477 - e 225 9484 - l 218 9477 225 9484 100 -} -a { - s 0 - b 219 9478 - e 226 9485 - l 219 9478 226 9485 100 -} -a { - s 0 - b 220 9479 - e 227 9486 - l 220 9479 227 9486 100 -} -a { - s 0 - b 4571 9480 - e 4578 9487 - l 4571 9480 4578 9487 100 -} -a { - s 0 - b 14014 9491 - e 14021 9498 - l 14014 9491 14021 9498 100 -} -a { - s 0 - b 17310 9497 - e 17317 9504 - l 17310 9497 17317 9504 100 -} -a { - s 0 - b 10982 9519 - e 10989 9526 - l 10982 
9519 10989 9526 100 -} -a { - s 0 - b 11503 10485 - e 11510 10492 - l 11503 10485 11510 10492 100 -} -a { - s 0 - b 14717 10495 - e 14724 10502 - l 14717 10495 14724 10502 100 -} -a { - s 0 - b 14718 10496 - e 14725 10503 - l 14718 10496 14725 10503 100 -} -a { - s 0 - b 7737 10496 - e 7744 10503 - l 7737 10496 7744 10503 100 -} -a { - s 0 - b 7738 10497 - e 7745 10504 - l 7738 10497 7745 10504 100 -} -a { - s 0 - b 7739 10498 - e 7746 10505 - l 7739 10498 7746 10505 100 -} -a { - s 0 - b 7740 10499 - e 7747 10506 - l 7740 10499 7747 10506 100 -} -a { - s 0 - b 11151 10501 - e 11158 10508 - l 11151 10501 11158 10508 100 -} -a { - s 0 - b 14065 10506 - e 14072 10513 - l 14065 10506 14072 10513 100 -} -a { - s 0 - b 14066 10507 - e 14073 10514 - l 14066 10507 14073 10514 100 -} -a { - s 0 - b 14596 10508 - e 14603 10515 - l 14596 10508 14603 10515 100 -} -a { - s 0 - b 8678 10528 - e 8685 10535 - l 8678 10528 8685 10535 100 -} -a { - s 0 - b 18733 10554 - e 18740 10561 - l 18733 10554 18740 10561 100 -} -a { - s 0 - b 17508 10554 - e 17515 10561 - l 17508 10554 17515 10561 100 -} -a { - s 0 - b 10891 10554 - e 10898 10561 - l 10891 10554 10898 10561 100 -} -a { - s 0 - b 17509 10555 - e 17516 10562 - l 17509 10555 17516 10562 100 -} -a { - s 0 - b 11055 10555 - e 11062 10562 - l 11055 10555 11062 10562 100 -} -a { - s 0 - b 17351 10558 - e 17358 10565 - l 17351 10558 17358 10565 100 -} -a { - s 0 - b 7474 10559 - e 7481 10566 - l 7474 10559 7481 10566 100 -} -a { - s 0 - b 5046 10559 - e 5053 10566 - l 5046 10559 5053 10566 100 -} -a { - s 0 - b 8933 10560 - e 8940 10567 - l 8933 10560 8940 10567 100 -} -a { - s 0 - b 17187 10569 - e 17194 10576 - l 17187 10569 17194 10576 100 -} -a { - s 0 - b 18626 10570 - e 18633 10577 - l 18626 10570 18633 10577 100 -} -a { - s 0 - b 14900 10612 - e 14907 10619 - l 14900 10612 14907 10619 100 -} -a { - s 0 - b 18596 10625 - e 18603 10632 - l 18596 10625 18603 10632 100 -} -a { - s 0 - b 11442 10625 - e 11449 10632 - l 11442 10625 
11449 10632 100 -} -a { - s 0 - b 8498 10625 - e 8505 10632 - l 8498 10625 8505 10632 100 -} -a { - s 0 - b 7670 10625 - e 7677 10632 - l 7670 10625 7677 10632 100 -} -a { - s 0 - b 11186 10629 - e 11193 10636 - l 11186 10629 11193 10636 100 -} -a { - s 0 - b 8028 10640 - e 8035 10647 - l 8028 10640 8035 10647 100 -} -a { - s 0 - b 15960 10642 - e 15967 10649 - l 15960 10642 15967 10649 100 -} -a { - s 0 - b 15224 10648 - e 15231 10655 - l 15224 10648 15231 10655 100 -} -a { - s 0 - b 17835 10649 - e 17842 10656 - l 17835 10649 17842 10656 100 -} -a { - s 0 - b 18690 10664 - e 18697 10671 - l 18690 10664 18697 10671 100 -} -a { - s 0 - b 7750 10670 - e 7757 10677 - l 7750 10670 7757 10677 100 -} -a { - s 0 - b 9007 10681 - e 9014 10688 - l 9007 10681 9014 10688 100 -} -a { - s 0 - b 9446 11225 - e 9453 11232 - l 9446 11225 9453 11232 100 -} -a { - s 0 - b 3136 11228 - e 3143 11235 - l 3136 11228 3143 11235 100 -} -a { - s 0 - b 3137 11229 - e 3144 11236 - l 3137 11229 3144 11236 100 -} -a { - s 0 - b 3084 11229 - e 3091 11236 - l 3084 11229 3091 11236 100 -} -a { - s 0 - b 18605 11230 - e 18612 11237 - l 18605 11230 18612 11237 100 -} -a { - s 0 - b 51 11232 - e 58 11239 - l 51 11232 58 11239 100 -} -a { - s 0 - b 14786 11243 - e 14793 11250 - l 14786 11243 14793 11250 100 -} -a { - s 0 - b 17995 11250 - e 18002 11257 - l 17995 11250 18002 11257 100 -} -a { - s 0 - b 17996 11251 - e 18003 11258 - l 17996 11251 18003 11258 100 -} -a { - s 0 - b 15900 11251 - e 15907 11258 - l 15900 11251 15907 11258 100 -} -a { - s 0 - b 17997 11252 - e 18004 11259 - l 17997 11252 18004 11259 100 -} -a { - s 0 - b 15950 11255 - e 15957 11262 - l 15950 11255 15957 11262 100 -} -a { - s 0 - b 8851 11259 - e 8858 11266 - l 8851 11259 8858 11266 100 -} -a { - s 0 - b 8885 11263 - e 8892 11270 - l 8885 11263 8892 11270 100 -} -a { - s 0 - b 8831 11267 - e 8838 11274 - l 8831 11267 8838 11274 100 -} -a { - s 0 - b 18101 11268 - e 18108 11275 - l 18101 11268 18108 11275 100 -} -a { - s 0 - 
b 9302 11268 - e 9309 11275 - l 9302 11268 9309 11275 100 -} -a { - s 0 - b 8832 11268 - e 8839 11275 - l 8832 11268 8839 11275 100 -} -a { - s 0 - b 8833 11269 - e 8840 11276 - l 8833 11269 8840 11276 100 -} -a { - s 0 - b 3670 11281 - e 3677 11288 - l 3670 11281 3677 11288 100 -} -a { - s 0 - b 8318 11283 - e 8325 11290 - l 8318 11283 8325 11290 100 -} -a { - s 0 - b 15335 11288 - e 15342 11295 - l 15335 11288 15342 11295 100 -} -a { - s 0 - b 76 11288 - e 83 11295 - l 76 11288 83 11295 100 -} -a { - s 0 - b 15336 11289 - e 15343 11296 - l 15336 11289 15343 11296 100 -} -a { - s 0 - b 15337 11290 - e 15344 11297 - l 15337 11290 15344 11297 100 -} -a { - s 0 - b 15338 11291 - e 15345 11298 - l 15338 11291 15345 11298 100 -} -a { - s 0 - b 13882 11315 - e 13889 11322 - l 13882 11315 13889 11322 100 -} -a { - s 0 - b 13883 11316 - e 13890 11323 - l 13883 11316 13890 11323 100 -} -a { - s 0 - b 14671 11324 - e 14678 11331 - l 14671 11324 14678 11331 100 -} -a { - s 0 - b 9644 11327 - e 9651 11334 - l 9644 11327 9651 11334 100 -} -a { - s 0 - b 11322 11332 - e 11329 11339 - l 11322 11332 11329 11339 100 -} -a { - s 0 - b 18409 11344 - e 18416 11351 - l 18409 11344 18416 11351 100 -} -a { - s 0 - b 15355 11348 - e 15362 11355 - l 15355 11348 15362 11355 100 -} -a { - s 0 - b 15356 11349 - e 15363 11356 - l 15356 11349 15363 11356 100 -} -a { - s 0 - b 8432 11350 - e 8439 11357 - l 8432 11350 8439 11357 100 -} -a { - s 0 - b 15330 11363 - e 15337 11370 - l 15330 11363 15337 11370 100 -} -a { - s 0 - b 12199 11365 - e 12206 11372 - l 12199 11365 12206 11372 100 -} -a { - s 0 - b 12200 11366 - e 12207 11373 - l 12200 11366 12207 11373 100 -} -a { - s 0 - b 18 11371 - e 25 11378 - l 18 11371 25 11378 100 -} -a { - s 0 - b 12579 11372 - e 12586 11379 - l 12579 11372 12586 11379 100 -} -a { - s 0 - b 15226 11380 - e 15233 11387 - l 15226 11380 15233 11387 100 -} -a { - s 0 - b 10836 11385 - e 10843 11392 - l 10836 11385 10843 11392 100 -} -a { - s 0 - b 4937 11398 - e 4944 
11405 - l 4937 11398 4944 11405 100 -} -a { - s 0 - b 18556 11400 - e 18563 11407 - l 18556 11400 18563 11407 100 -} -a { - s 0 - b 8469 11407 - e 8476 11414 - l 8469 11407 8476 11414 100 -} -a { - s 0 - b 8470 11408 - e 8477 11415 - l 8470 11408 8477 11415 100 -} -a { - s 0 - b 16077 11410 - e 16084 11417 - l 16077 11410 16084 11417 100 -} -a { - s 0 - b 14169 11410 - e 14176 11417 - l 14169 11410 14176 11417 100 -} -a { - s 0 - b 8974 11427 - e 8981 11434 - l 8974 11427 8981 11434 100 -} -a { - s 0 - b 8975 11428 - e 8982 11435 - l 8975 11428 8982 11435 100 -} -a { - s 0 - b 8976 11429 - e 8983 11436 - l 8976 11429 8983 11436 100 -} -a { - s 0 - b 8977 11430 - e 8984 11437 - l 8977 11430 8984 11437 100 -} -a { - s 0 - b 5188 11447 - e 5195 11454 - l 5188 11447 5195 11454 100 -} -a { - s 0 - b 5189 11448 - e 5196 11455 - l 5189 11448 5196 11455 100 -} -a { - s 0 - b 8298 11449 - e 8305 11456 - l 8298 11449 8305 11456 100 -} -a { - s 0 - b 5190 11449 - e 5197 11456 - l 5190 11449 5197 11456 100 -} -a { - s 0 - b 8299 11450 - e 8306 11457 - l 8299 11450 8306 11457 100 -} -a { - s 0 - b 8556 11465 - e 8563 11472 - l 8556 11465 8563 11472 100 -} -a { - s 0 - b 4104 11467 - e 4111 11474 - l 4104 11467 4111 11474 100 -} -a { - s 0 - b 4105 11468 - e 4112 11475 - l 4105 11468 4112 11475 100 -} -a { - s 0 - b 7436 11490 - e 7443 11497 - l 7436 11490 7443 11497 100 -} -a { - s 0 - b 7437 11491 - e 7444 11498 - l 7437 11491 7444 11498 100 -} -a { - s 0 - b 7438 11492 - e 7445 11499 - l 7438 11492 7445 11499 100 -} -a { - s 0 - b 18024 11629 - e 18031 11636 - l 18024 11629 18031 11636 100 -} -a { - s 0 - b 4874 11629 - e 4881 11636 - l 4874 11629 4881 11636 100 -} -a { - s 0 - b 18025 11630 - e 18032 11637 - l 18025 11630 18032 11637 100 -} -a { - s 0 - b 14632 11630 - e 14639 11637 - l 14632 11630 14639 11637 100 -} -a { - s 0 - b 18026 11631 - e 18033 11638 - l 18026 11631 18033 11638 100 -} -a { - s 0 - b 9668 11688 - e 9675 11695 - l 9668 11688 9675 11695 100 -} -a { - s 
0 - b 15149 11692 - e 15156 11699 - l 15149 11692 15156 11699 100 -} -a { - s 0 - b 15150 11693 - e 15157 11700 - l 15150 11693 15157 11700 100 -} -a { - s 0 - b 9341 11694 - e 9348 11701 - l 9341 11694 9348 11701 100 -} -a { - s 0 - b 13940 11696 - e 13947 11703 - l 13940 11696 13947 11703 100 -} -a { - s 0 - b 13941 11697 - e 13948 11704 - l 13941 11697 13948 11704 100 -} -a { - s 0 - b 11384 11698 - e 11391 11705 - l 11384 11698 11391 11705 100 -} -a { - s 0 - b 11385 11699 - e 11392 11706 - l 11385 11699 11392 11706 100 -} -a { - s 0 - b 11386 11700 - e 11393 11707 - l 11386 11700 11393 11707 100 -} -a { - s 0 - b 14785 11709 - e 14792 11716 - l 14785 11709 14792 11716 100 -} -a { - s 0 - b 11544 11712 - e 11551 11719 - l 11544 11712 11551 11719 100 -} -a { - s 0 - b 6 11712 - e 13 11719 - l 6 11712 13 11719 100 -} -a { - s 0 - b 7 11713 - e 14 11720 - l 7 11713 14 11720 100 -} -a { - s 0 - b 8 11714 - e 15 11721 - l 8 11714 15 11721 100 -} -a { - s 0 - b 9 11715 - e 16 11722 - l 9 11715 16 11722 100 -} -a { - s 0 - b 10 11716 - e 17 11723 - l 10 11716 17 11723 100 -} -a { - s 0 - b 11 11717 - e 18 11724 - l 11 11717 18 11724 100 -} -a { - s 0 - b 4054 11722 - e 4061 11729 - l 4054 11722 4061 11729 100 -} -a { - s 0 - b 3112 11726 - e 3119 11733 - l 3112 11726 3119 11733 100 -} -a { - s 0 - b 15175 11729 - e 15182 11736 - l 15175 11729 15182 11736 100 -} -a { - s 0 - b 15176 11730 - e 15183 11737 - l 15176 11730 15183 11737 100 -} -a { - s 0 - b 8692 11744 - e 8699 11751 - l 8692 11744 8699 11751 100 -} -a { - s 0 - b 8693 11745 - e 8700 11752 - l 8693 11745 8700 11752 100 -} -a { - s 0 - b 16078 11754 - e 16085 11761 - l 16078 11754 16085 11761 100 -} -a { - s 0 - b 16079 11755 - e 16086 11762 - l 16079 11755 16086 11762 100 -} -a { - s 0 - b 13999 11774 - e 14006 11781 - l 13999 11774 14006 11781 100 -} -a { - s 0 - b 8926 11777 - e 8933 11784 - l 8926 11777 8933 11784 100 -} -a { - s 0 - b 8927 11778 - e 8934 11785 - l 8927 11778 8934 11785 100 -} -a { - s 0 
- b 18073 11780 - e 18080 11787 - l 18073 11780 18080 11787 100 -} -a { - s 0 - b 5697 11783 - e 5704 11790 - l 5697 11783 5704 11790 100 -} -a { - s 0 - b 5698 11784 - e 5705 11791 - l 5698 11784 5705 11791 100 -} -a { - s 0 - b 6406 11792 - e 6413 11799 - l 6406 11792 6413 11799 100 -} -a { - s 0 - b 6706 11795 - e 6713 11802 - l 6706 11795 6713 11802 100 -} -a { - s 0 - b 6656 11795 - e 6663 11802 - l 6656 11795 6663 11802 100 -} -a { - s 0 - b 9645 11799 - e 9652 11806 - l 9645 11799 9652 11806 100 -} -a { - s 0 - b 9646 11800 - e 9653 11807 - l 9646 11800 9653 11807 100 -} -a { - s 0 - b 3094 11813 - e 3101 11820 - l 3094 11813 3101 11820 100 -} -a { - s 0 - b 17356 11815 - e 17363 11822 - l 17356 11815 17363 11822 100 -} -a { - s 0 - b 4037 11817 - e 4044 11824 - l 4037 11817 4044 11824 100 -} -a { - s 0 - b 3976 11817 - e 3983 11824 - l 3976 11817 3983 11824 100 -} -a { - s 0 - b 18376 11818 - e 18383 11825 - l 18376 11818 18383 11825 100 -} -a { - s 0 - b 4038 11818 - e 4045 11825 - l 4038 11818 4045 11825 100 -} -a { - s 0 - b 18377 11819 - e 18384 11826 - l 18377 11819 18384 11826 100 -} -a { - s 0 - b 4290 11819 - e 4297 11826 - l 4290 11819 4297 11826 100 -} -a { - s 0 - b 18378 11820 - e 18385 11827 - l 18378 11820 18385 11827 100 -} -a { - s 0 - b 14184 11820 - e 14191 11827 - l 14184 11820 14191 11827 100 -} -a { - s 0 - b 14185 11821 - e 14192 11828 - l 14185 11821 14192 11828 100 -} -a { - s 0 - b 9349 11853 - e 9356 11860 - l 9349 11853 9356 11860 100 -} -a { - s 0 - b 17419 11854 - e 17426 11861 - l 17419 11854 17426 11861 100 -} -a { - s 0 - b 17312 11865 - e 17319 11872 - l 17312 11865 17319 11872 100 -} -a { - s 0 - b 15310 11881 - e 15317 11888 - l 15310 11881 15317 11888 100 -} -a { - s 0 - b 4593 11890 - e 4600 11897 - l 4593 11890 4600 11897 100 -} -a { - s 0 - b 7763 11909 - e 7770 11916 - l 7763 11909 7770 11916 100 -} -a { - s 0 - b 15224 11913 - e 15231 11920 - l 15224 11913 15231 11920 100 -} -a { - s 0 - b 15225 11914 - e 15232 11921 
- l 15225 11914 15232 11921 100 -} -a { - s 0 - b 18408 11921 - e 18415 11928 - l 18408 11921 18415 11928 100 -} -a { - s 0 - b 8345 11921 - e 8352 11928 - l 8345 11921 8352 11928 100 -} -a { - s 0 - b 18409 11922 - e 18416 11929 - l 18409 11922 18416 11929 100 -} -a { - s 0 - b 18410 11923 - e 18417 11930 - l 18410 11923 18417 11930 100 -} -a { - s 0 - b 18411 11924 - e 18418 11931 - l 18411 11924 18418 11931 100 -} -a { - s 0 - b 13476 11944 - e 13483 11951 - l 13476 11944 13483 11951 100 -} -a { - s 0 - b 14842 11969 - e 14849 11976 - l 14842 11969 14849 11976 100 -} -a { - s 0 - b 14734 11971 - e 14741 11978 - l 14734 11971 14741 11978 100 -} -a { - s 0 - b 14715 11976 - e 14722 11983 - l 14715 11976 14722 11983 100 -} -a { - s 0 - b 6365 11976 - e 6372 11983 - l 6365 11976 6372 11983 100 -} -a { - s 0 - b 17990 11979 - e 17997 11986 - l 17990 11979 17997 11986 100 -} -a { - s 0 - b 10860 11983 - e 10867 11990 - l 10860 11983 10867 11990 100 -} -a { - s 0 - b 10861 11984 - e 10868 11991 - l 10861 11984 10868 11991 100 -} -a { - s 0 - b 7488 12001 - e 7495 12008 - l 7488 12001 7495 12008 100 -} -a { - s 0 - b 7489 12002 - e 7496 12009 - l 7489 12002 7496 12009 100 -} -a { - s 0 - b 7734 12004 - e 7741 12011 - l 7734 12004 7741 12011 100 -} -a { - s 0 - b 5647 12008 - e 5654 12015 - l 5647 12008 5654 12015 100 -} -a { - s 0 - b 5648 12009 - e 5655 12016 - l 5648 12009 5655 12016 100 -} -a { - s 0 - b 3415 12021 - e 3422 12028 - l 3415 12021 3422 12028 100 -} -a { - s 0 - b 3416 12022 - e 3423 12029 - l 3416 12022 3423 12029 100 -} -a { - s 0 - b 17256 12050 - e 17263 12057 - l 17256 12050 17263 12057 100 -} -a { - s 0 - b 3161 12066 - e 3168 12073 - l 3161 12066 3168 12073 100 -} -a { - s 0 - b 9349 12067 - e 9356 12074 - l 9349 12067 9356 12074 100 -} -a { - s 0 - b 9350 12068 - e 9357 12075 - l 9350 12068 9357 12075 100 -} -a { - s 0 - b 9351 12069 - e 9358 12076 - l 9351 12069 9358 12076 100 -} -a { - s 0 - b 15213 12071 - e 15220 12078 - l 15213 12071 15220 
12078 100 -} -a { - s 0 - b 15214 12072 - e 15221 12079 - l 15214 12072 15221 12079 100 -} -a { - s 0 - b 11330 12076 - e 11337 12083 - l 11330 12076 11337 12083 100 -} -a { - s 0 - b 14859 12086 - e 14866 12093 - l 14859 12086 14866 12093 100 -} -a { - s 0 - b 9382 12087 - e 9389 12094 - l 9382 12087 9389 12094 100 -} -a { - s 0 - b 14199 12110 - e 14206 12117 - l 14199 12110 14206 12117 100 -} -a { - s 0 - b 5567 12114 - e 5574 12121 - l 5567 12114 5574 12121 100 -} -a { - s 0 - b 15215 12140 - e 15222 12147 - l 15215 12140 15222 12147 100 -} -a { - s 0 - b 8533 12142 - e 8540 12149 - l 8533 12142 8540 12149 100 -} -a { - s 0 - b 9416 12200 - e 9423 12207 - l 9416 12200 9423 12207 100 -} -a { - s 0 - b 3265 12202 - e 3272 12209 - l 3265 12202 3272 12209 100 -} -a { - s 0 - b 2963 12209 - e 2970 12216 - l 2963 12209 2970 12216 100 -} -a { - s 0 - b 18723 12213 - e 18730 12220 - l 18723 12213 18730 12220 100 -} -a { - s 0 - b 11101 12215 - e 11108 12222 - l 11101 12215 11108 12222 100 -} -a { - s 0 - b 3130 12216 - e 3137 12223 - l 3130 12216 3137 12223 100 -} -a { - s 0 - b 3131 12217 - e 3138 12224 - l 3131 12217 3138 12224 100 -} -a { - s 0 - b 3132 12218 - e 3139 12225 - l 3132 12218 3139 12225 100 -} -a { - s 0 - b 9316 12225 - e 9323 12232 - l 9316 12225 9323 12232 100 -} -a { - s 0 - b 5655 12227 - e 5662 12234 - l 5655 12227 5662 12234 100 -} -a { - s 0 - b 17292 12251 - e 17299 12258 - l 17292 12251 17299 12258 100 -} -a { - s 0 - b 8016 12252 - e 8023 12259 - l 8016 12252 8023 12259 100 -} -a { - s 0 - b 4400 12255 - e 4407 12262 - l 4400 12255 4407 12262 100 -} -a { - s 0 - b 15366 12256 - e 15373 12263 - l 15366 12256 15373 12263 100 -} -a { - s 0 - b 15367 12257 - e 15374 12264 - l 15367 12257 15374 12264 100 -} -a { - s 0 - b 15368 12258 - e 15375 12265 - l 15368 12258 15375 12265 100 -} -a { - s 0 - b 3531 12259 - e 3538 12266 - l 3531 12259 3538 12266 100 -} -a { - s 0 - b 14124 12279 - e 14131 12286 - l 14124 12279 14131 12286 100 -} -a { - s 0 - b 
18632 12281 - e 18639 12288 - l 18632 12281 18639 12288 100 -} -a { - s 0 - b 15473 12282 - e 15480 12289 - l 15473 12282 15480 12289 100 -} -a { - s 0 - b 15145 12294 - e 15152 12301 - l 15145 12294 15152 12301 100 -} -a { - s 0 - b 13971 12311 - e 13978 12318 - l 13971 12311 13978 12318 100 -} -a { - s 0 - b 8405 12315 - e 8412 12322 - l 8405 12315 8412 12322 100 -} -a { - s 0 - b 8406 12316 - e 8413 12323 - l 8406 12316 8413 12323 100 -} -a { - s 0 - b 8407 12317 - e 8414 12324 - l 8407 12317 8414 12324 100 -} -a { - s 0 - b 18762 12336 - e 18769 12343 - l 18762 12336 18769 12343 100 -} -a { - s 0 - b 18763 12337 - e 18770 12344 - l 18763 12337 18770 12344 100 -} -a { - s 0 - b 4011 12345 - e 4018 12352 - l 4011 12345 4018 12352 100 -} -a { - s 0 - b 9031 12348 - e 9038 12355 - l 9031 12348 9038 12355 100 -} -a { - s 0 - b 4997 12348 - e 5004 12355 - l 4997 12348 5004 12355 100 -} -a { - s 0 - b 18464 12357 - e 18471 12364 - l 18464 12357 18471 12364 100 -} -a { - s 0 - b 6691 12358 - e 6698 12365 - l 6691 12358 6698 12365 100 -} -a { - s 0 - b 15345 12365 - e 15352 12372 - l 15345 12365 15352 12372 100 -} -a { - s 0 - b 15280 12394 - e 15287 12401 - l 15280 12394 15287 12401 100 -} -a { - s 0 - b 13933 12405 - e 13940 12412 - l 13933 12405 13940 12412 100 -} -a { - s 0 - b 11522 12415 - e 11529 12422 - l 11522 12415 11529 12422 100 -} -a { - s 0 - b 13501 12421 - e 13508 12428 - l 13501 12421 13508 12428 100 -} -a { - s 0 - b 193 12423 - e 200 12430 - l 193 12423 200 12430 100 -} -a { - s 0 - b 8057 12426 - e 8064 12433 - l 8057 12426 8064 12433 100 -} -a { - s 0 - b 18624 12436 - e 18631 12443 - l 18624 12436 18631 12443 100 -} -a { - s 0 - b 7485 12458 - e 7492 12465 - l 7485 12458 7492 12465 100 -} -a { - s 0 - b 7486 12459 - e 7493 12466 - l 7486 12459 7493 12466 100 -} -a { - s 0 - b 11464 12462 - e 11471 12469 - l 11464 12462 11471 12469 100 -} -a { - s 0 - b 4553 12474 - e 4560 12481 - l 4553 12474 4560 12481 100 -} -a { - s 0 - b 4554 12475 - e 4561 
12482 - l 4554 12475 4561 12482 100 -} -a { - s 0 - b 4555 12476 - e 4562 12483 - l 4555 12476 4562 12483 100 -} -a { - s 0 - b 14349 12484 - e 14356 12491 - l 14349 12484 14356 12491 100 -} -a { - s 0 - b 6313 12484 - e 6320 12491 - l 6313 12484 6320 12491 100 -} -a { - s 0 - b 13457 12490 - e 13464 12497 - l 13457 12490 13464 12497 100 -} -a { - s 0 - b 13458 12491 - e 13465 12498 - l 13458 12491 13465 12498 100 -} -a { - s 0 - b 13459 12492 - e 13466 12499 - l 13459 12492 13466 12499 100 -} -a { - s 0 - b 14965 12501 - e 14972 12508 - l 14965 12501 14972 12508 100 -} -a { - s 0 - b 3995 12504 - e 4002 12511 - l 3995 12504 4002 12511 100 -} -a { - s 0 - b 17471 12509 - e 17478 12516 - l 17471 12509 17478 12516 100 -} -a { - s 0 - b 8340 12509 - e 8347 12516 - l 8340 12509 8347 12516 100 -} -a { - s 0 - b 17094 12513 - e 17101 12520 - l 17094 12513 17101 12520 100 -} -a { - s 0 - b 12239 12513 - e 12246 12520 - l 12239 12513 12246 12520 100 -} -a { - s 0 - b 11528 12515 - e 11535 12522 - l 11528 12515 11535 12522 100 -} -a { - s 0 - b 8319 12520 - e 8326 12527 - l 8319 12520 8326 12527 100 -} -a { - s 0 - b 11077 12522 - e 11084 12529 - l 11077 12522 11084 12529 100 -} -a { - s 0 - b 11078 12523 - e 11085 12530 - l 11078 12523 11085 12530 100 -} -a { - s 0 - b 15264 12528 - e 15271 12535 - l 15264 12528 15271 12535 100 -} -a { - s 0 - b 17440 12529 - e 17447 12536 - l 17440 12529 17447 12536 100 -} -a { - s 0 - b 8940 12531 - e 8947 12538 - l 8940 12531 8947 12538 100 -} -a { - s 0 - b 8941 12532 - e 8948 12539 - l 8941 12532 8948 12539 100 -} -a { - s 0 - b 7687 12535 - e 7694 12542 - l 7687 12535 7694 12542 100 -} -a { - s 0 - b 7688 12536 - e 7695 12543 - l 7688 12536 7695 12543 100 -} -a { - s 0 - b 9376 12557 - e 9383 12564 - l 9376 12557 9383 12564 100 -} -a { - s 0 - b 16139 12563 - e 16146 12570 - l 16139 12563 16146 12570 100 -} -a { - s 0 - b 16140 12564 - e 16147 12571 - l 16140 12564 16147 12571 100 -} -a { - s 0 - b 16141 12565 - e 16148 12572 - l 
16141 12565 16148 12572 100 -} -a { - s 0 - b 4932 12572 - e 4939 12579 - l 4932 12572 4939 12579 100 -} -a { - s 0 - b 4933 12573 - e 4940 12580 - l 4933 12573 4940 12580 100 -} -a { - s 0 - b 15361 12600 - e 15368 12607 - l 15361 12600 15368 12607 100 -} -a { - s 0 - b 15362 12601 - e 15369 12608 - l 15362 12601 15369 12608 100 -} -a { - s 0 - b 14844 12602 - e 14851 12609 - l 14844 12602 14851 12609 100 -} -a { - s 0 - b 7496 12602 - e 7503 12609 - l 7496 12602 7503 12609 100 -} -a { - s 0 - b 14845 12603 - e 14852 12610 - l 14845 12603 14852 12610 100 -} -a { - s 0 - b 11118 12603 - e 11125 12610 - l 11118 12603 11125 12610 100 -} -a { - s 0 - b 14846 12604 - e 14853 12611 - l 14846 12604 14853 12611 100 -} -a { - s 0 - b 41 12605 - e 48 12612 - l 41 12605 48 12612 100 -} -a { - s 0 - b 15319 12645 - e 15326 12652 - l 15319 12645 15326 12652 100 -} -a { - s 0 - b 18372 12646 - e 18379 12653 - l 18372 12646 18379 12653 100 -} -a { - s 0 - b 7689 12650 - e 7696 12657 - l 7689 12650 7696 12657 100 -} -a { - s 0 - b 4406 12655 - e 4413 12662 - l 4406 12655 4413 12662 100 -} -a { - s 0 - b 7662 12659 - e 7669 12666 - l 7662 12659 7669 12666 100 -} -a { - s 0 - b 15392 12661 - e 15399 12668 - l 15392 12661 15399 12668 100 -} -a { - s 0 - b 8024 12662 - e 8031 12669 - l 8024 12662 8031 12669 100 -} -a { - s 0 - b 14343 12672 - e 14350 12679 - l 14343 12672 14350 12679 100 -} -a { - s 0 - b 17510 12684 - e 17517 12691 - l 17510 12684 17517 12691 100 -} -a { - s 0 - b 12238 12684 - e 12245 12691 - l 12238 12684 12245 12691 100 -} -a { - s 0 - b 11056 12684 - e 11063 12691 - l 11056 12684 11063 12691 100 -} -a { - s 0 - b 17094 12685 - e 17101 12692 - l 17094 12685 17101 12692 100 -} -a { - s 0 - b 12239 12685 - e 12246 12692 - l 12239 12685 12246 12692 100 -} -a { - s 0 - b 17095 12686 - e 17102 12693 - l 17095 12686 17102 12693 100 -} -a { - s 0 - b 7511 12689 - e 7518 12696 - l 7511 12689 7518 12696 100 -} -a { - s 0 - b 5563 12696 - e 5570 12703 - l 5563 12696 5570 
12703 100 -} -a { - s 0 - b 6308 12699 - e 6315 12706 - l 6308 12699 6315 12706 100 -} -a { - s 0 - b 7574 12721 - e 7581 12728 - l 7574 12721 7581 12728 100 -} -a { - s 0 - b 7575 12722 - e 7582 12729 - l 7575 12722 7582 12729 100 -} -a { - s 0 - b 7576 12723 - e 7583 12730 - l 7576 12723 7583 12730 100 -} -a { - s 0 - b 7577 12724 - e 7584 12731 - l 7577 12724 7584 12731 100 -} -a { - s 0 - b 11501 12727 - e 11508 12734 - l 11501 12727 11508 12734 100 -} -a { - s 0 - b 3329 12741 - e 3336 12748 - l 3329 12741 3336 12748 100 -} -a { - s 0 - b 3330 12742 - e 3337 12749 - l 3330 12742 3337 12749 100 -} -a { - s 0 - b 4455 12744 - e 4462 12751 - l 4455 12744 4462 12751 100 -} -a { - s 0 - b 7403 12747 - e 7410 12754 - l 7403 12747 7410 12754 100 -} -a { - s 0 - b 8787 12757 - e 8794 12764 - l 8787 12757 8794 12764 100 -} -a { - s 0 - b 304 12760 - e 311 12767 - l 304 12760 311 12767 100 -} -a { - s 0 - b 305 12761 - e 312 12768 - l 305 12761 312 12768 100 -} -a { - s 0 - b 6530 12766 - e 6537 12773 - l 6530 12766 6537 12773 100 -} -a { - s 0 - b 15201 12768 - e 15208 12775 - l 15201 12768 15208 12775 100 -} -a { - s 0 - b 6309 12775 - e 6316 12782 - l 6309 12775 6316 12782 100 -} -a { - s 0 - b 6310 12776 - e 6317 12783 - l 6310 12776 6317 12783 100 -} -a { - s 0 - b 9389 12783 - e 9396 12790 - l 9389 12783 9396 12790 100 -} -a { - s 0 - b 9390 12784 - e 9397 12791 - l 9390 12784 9397 12791 100 -} -a { - s 0 - b 9391 12785 - e 9398 12792 - l 9391 12785 9398 12792 100 -} -a { - s 0 - b 245 12785 - e 252 12792 - l 245 12785 252 12792 100 -} -a { - s 0 - b 8823 12790 - e 8830 12797 - l 8823 12790 8830 12797 100 -} -a { - s 0 - b 8824 12791 - e 8831 12798 - l 8824 12791 8831 12798 100 -} -a { - s 0 - b 6830 12795 - e 6837 12802 - l 6830 12795 6837 12802 100 -} -a { - s 0 - b 4857 12795 - e 4864 12802 - l 4857 12795 4864 12802 100 -} -a { - s 0 - b 4858 12796 - e 4865 12803 - l 4858 12796 4865 12803 100 -} -a { - s 0 - b 4859 12797 - e 4866 12804 - l 4859 12797 4866 12804 
100 -} -a { - s 0 - b 4503 12797 - e 4510 12804 - l 4503 12797 4510 12804 100 -} -a { - s 0 - b 4860 12798 - e 4867 12805 - l 4860 12798 4867 12805 100 -} -a { - s 0 - b 2962 12799 - e 2969 12806 - l 2962 12799 2969 12806 100 -} -a { - s 0 - b 3262 12807 - e 3269 12814 - l 3262 12807 3269 12814 100 -} -a { - s 0 - b 3263 12808 - e 3270 12815 - l 3263 12808 3270 12815 100 -} -a { - s 0 - b 11497 12809 - e 11504 12816 - l 11497 12809 11504 12816 100 -} -a { - s 0 - b 3264 12809 - e 3271 12816 - l 3264 12809 3271 12816 100 -} -a { - s 0 - b 3265 12810 - e 3272 12817 - l 3265 12810 3272 12817 100 -} -a { - s 0 - b 8101 12816 - e 8108 12823 - l 8101 12816 8108 12823 100 -} -a { - s 0 - b 8587 12817 - e 8594 12824 - l 8587 12817 8594 12824 100 -} -a { - s 0 - b 14636 12818 - e 14643 12825 - l 14636 12818 14643 12825 100 -} -a { - s 0 - b 8588 12818 - e 8595 12825 - l 8588 12818 8595 12825 100 -} -a { - s 0 - b 18518 12824 - e 18525 12831 - l 18518 12824 18525 12831 100 -} -a { - s 0 - b 11163 12833 - e 11170 12840 - l 11163 12833 11170 12840 100 -} -a { - s 0 - b 11022 12833 - e 11029 12840 - l 11022 12833 11029 12840 100 -} -a { - s 0 - b 5099 12841 - e 5106 12848 - l 5099 12841 5106 12848 100 -} -a { - s 0 - b 3913 12841 - e 3920 12848 - l 3913 12841 3920 12848 100 -} -a { - s 0 - b 5100 12842 - e 5107 12849 - l 5100 12842 5107 12849 100 -} -a { - s 0 - b 5101 12843 - e 5108 12850 - l 5101 12843 5108 12850 100 -} -a { - s 0 - b 18469 12908 - e 18476 12915 - l 18469 12908 18476 12915 100 -} -a { - s 0 - b 9458 12916 - e 9465 12923 - l 9458 12916 9465 12923 100 -} -a { - s 0 - b 7766 12935 - e 7773 12942 - l 7766 12935 7773 12942 100 -} -a { - s 0 - b 7767 12936 - e 7774 12943 - l 7767 12936 7774 12943 100 -} -a { - s 0 - b 8975 12942 - e 8982 12949 - l 8975 12942 8982 12949 100 -} -a { - s 0 - b 15966 12943 - e 15973 12950 - l 15966 12943 15973 12950 100 -} -a { - s 0 - b 15967 12944 - e 15974 12951 - l 15967 12944 15974 12951 100 -} -a { - s 0 - b 6900 12994 - e 6907 
13001 - l 6900 12994 6907 13001 100 -} -a { - s 0 - b 3369 12995 - e 3376 13002 - l 3369 12995 3376 13002 100 -} -a { - s 0 - b 5605 12997 - e 5612 13004 - l 5605 12997 5612 13004 100 -} -a { - s 0 - b 5606 12998 - e 5613 13005 - l 5606 12998 5613 13005 100 -} -a { - s 0 - b 13956 13009 - e 13963 13016 - l 13956 13009 13963 13016 100 -} -a { - s 0 - b 6743 13018 - e 6750 13025 - l 6743 13018 6750 13025 100 -} -a { - s 0 - b 17473 13021 - e 17480 13028 - l 17473 13021 17480 13028 100 -} -a { - s 0 - b 8342 13021 - e 8349 13028 - l 8342 13021 8349 13028 100 -} -a { - s 0 - b 3191 13022 - e 3198 13029 - l 3191 13022 3198 13029 100 -} -a { - s 0 - b 15274 13031 - e 15281 13038 - l 15274 13031 15281 13038 100 -} -a { - s 0 - b 16098 13033 - e 16105 13040 - l 16098 13033 16105 13040 100 -} -a { - s 0 - b 266 13033 - e 273 13040 - l 266 13033 273 13040 100 -} -a { - s 0 - b 267 13034 - e 274 13041 - l 267 13034 274 13041 100 -} -a { - s 0 - b 8787 13110 - e 8794 13117 - l 8787 13110 8794 13117 100 -} -a { - s 0 - b 3713 13111 - e 3720 13118 - l 3713 13111 3720 13118 100 -} -a { - s 0 - b 14530 13119 - e 14537 13126 - l 14530 13119 14537 13126 100 -} -a { - s 0 - b 6650 13134 - e 6657 13141 - l 6650 13134 6657 13141 100 -} -a { - s 0 - b 6651 13135 - e 6658 13142 - l 6651 13135 6658 13142 100 -} -a { - s 0 - b 6652 13136 - e 6659 13143 - l 6652 13136 6659 13143 100 -} -a { - s 0 - b 17521 13137 - e 17528 13144 - l 17521 13137 17528 13144 100 -} -a { - s 0 - b 13890 13137 - e 13897 13144 - l 13890 13137 13897 13144 100 -} -a { - s 0 - b 6653 13137 - e 6660 13144 - l 6653 13137 6660 13144 100 -} -a { - s 0 - b 9638 13138 - e 9645 13145 - l 9638 13138 9645 13145 100 -} -a { - s 0 - b 6654 13138 - e 6661 13145 - l 6654 13138 6661 13145 100 -} -a { - s 0 - b 9814 13142 - e 9821 13149 - l 9814 13142 9821 13149 100 -} -a { - s 0 - b 9815 13143 - e 9822 13150 - l 9815 13143 9822 13150 100 -} -a { - s 0 - b 12575 13147 - e 12582 13154 - l 12575 13147 12582 13154 100 -} -a { - s 0 - 
b 39 13151 - e 46 13158 - l 39 13151 46 13158 100 -} -a { - s 0 - b 4999 13167 - e 5006 13174 - l 4999 13167 5006 13174 100 -} -a { - s 0 - b 13952 13169 - e 13959 13176 - l 13952 13169 13959 13176 100 -} -a { - s 0 - b 15264 13173 - e 15271 13180 - l 15264 13173 15271 13180 100 -} -a { - s 0 - b 6705 13201 - e 6712 13208 - l 6705 13201 6712 13208 100 -} -a { - s 0 - b 6706 13202 - e 6713 13209 - l 6706 13202 6713 13209 100 -} -a { - s 0 - b 6656 13202 - e 6663 13209 - l 6656 13202 6663 13209 100 -} -a { - s 0 - b 14661 13226 - e 14668 13233 - l 14661 13226 14668 13233 100 -} -a { - s 0 - b 10843 13235 - e 10850 13242 - l 10843 13235 10850 13242 100 -} -a { - s 0 - b 9626 13267 - e 9633 13274 - l 9626 13267 9633 13274 100 -} -a { - s 0 - b 9049 13284 - e 9056 13291 - l 9049 13284 9056 13291 100 -} -a { - s 0 - b 5072 13284 - e 5079 13291 - l 5072 13284 5079 13291 100 -} -a { - s 0 - b 14604 13285 - e 14611 13292 - l 14604 13285 14611 13292 100 -} -a { - s 0 - b 12142 13285 - e 12149 13292 - l 12142 13285 12149 13292 100 -} -a { - s 0 - b 9743 13321 - e 9750 13328 - l 9743 13321 9750 13328 100 -} -a { - s 0 - b 9744 13322 - e 9751 13329 - l 9744 13322 9751 13329 100 -} -a { - s 0 - b 6670 13333 - e 6677 13340 - l 6670 13333 6677 13340 100 -} -a { - s 0 - b 6671 13334 - e 6678 13341 - l 6671 13334 6678 13341 100 -} -a { - s 0 - b 17351 13344 - e 17358 13351 - l 17351 13344 17358 13351 100 -} -a { - s 0 - b 8971 13350 - e 8978 13357 - l 8971 13350 8978 13357 100 -} -a { - s 0 - b 3670 13366 - e 3677 13373 - l 3670 13366 3677 13373 100 -} -a { - s 0 - b 14336 13368 - e 14343 13375 - l 14336 13368 14343 13375 100 -} -a { - s 0 - b 11074 13369 - e 11081 13376 - l 11074 13369 11081 13376 100 -} -a { - s 0 - b 16152 13370 - e 16159 13377 - l 16152 13370 16159 13377 100 -} -a { - s 0 - b 11075 13370 - e 11082 13377 - l 11075 13370 11082 13377 100 -} -a { - s 0 - b 11076 13371 - e 11083 13378 - l 11076 13371 11083 13378 100 -} -a { - s 0 - b 10857 13373 - e 10864 13380 - l 
10857 13373 10864 13380 100 -} -a { - s 0 - b 3395 13373 - e 3402 13380 - l 3395 13373 3402 13380 100 -} -a { - s 0 - b 158 13373 - e 165 13380 - l 158 13373 165 13380 100 -} -a { - s 0 - b 159 13374 - e 166 13381 - l 159 13374 166 13381 100 -} -a { - s 0 - b 160 13375 - e 167 13382 - l 160 13375 167 13382 100 -} -a { - s 0 - b 4960 13389 - e 4967 13396 - l 4960 13389 4967 13396 100 -} -a { - s 0 - b 13405 13400 - e 13412 13407 - l 13405 13400 13412 13407 100 -} -a { - s 0 - b 7615 13400 - e 7622 13407 - l 7615 13400 7622 13407 100 -} -a { - s 0 - b 3089 13401 - e 3096 13408 - l 3089 13401 3096 13408 100 -} -a { - s 0 - b 17319 13410 - e 17326 13417 - l 17319 13410 17326 13417 100 -} -a { - s 0 - b 17320 13411 - e 17327 13418 - l 17320 13411 17327 13418 100 -} -a { - s 0 - b 113 13411 - e 120 13418 - l 113 13411 120 13418 100 -} -a { - s 0 - b 17321 13412 - e 17328 13419 - l 17321 13412 17328 13419 100 -} -a { - s 0 - b 16150 13414 - e 16157 13421 - l 16150 13414 16157 13421 100 -} -a { - s 0 - b 3973 13414 - e 3980 13421 - l 3973 13414 3980 13421 100 -} -a { - s 0 - b 8483 13415 - e 8490 13422 - l 8483 13415 8490 13422 100 -} -a { - s 0 - b 6302 13415 - e 6309 13422 - l 6302 13415 6309 13422 100 -} -a { - s 0 - b 3974 13415 - e 3981 13422 - l 3974 13415 3981 13422 100 -} -a { - s 0 - b 3975 13416 - e 3982 13423 - l 3975 13416 3982 13423 100 -} -a { - s 0 - b 4037 13417 - e 4044 13424 - l 4037 13417 4044 13424 100 -} -a { - s 0 - b 3976 13417 - e 3983 13424 - l 3976 13417 3983 13424 100 -} -a { - s 0 - b 3977 13418 - e 3984 13425 - l 3977 13418 3984 13425 100 -} -a { - s 0 - b 5005 13420 - e 5012 13427 - l 5005 13420 5012 13427 100 -} -a { - s 0 - b 7434 13427 - e 7441 13434 - l 7434 13427 7441 13434 100 -} -a { - s 0 - b 11557 13429 - e 11564 13436 - l 11557 13429 11564 13436 100 -} -a { - s 0 - b 11558 13430 - e 11565 13437 - l 11558 13430 11565 13437 100 -} -a { - s 0 - b 5608 13452 - e 5615 13459 - l 5608 13452 5615 13459 100 -} -a { - s 0 - b 11372 13453 - e 
11379 13460 - l 11372 13453 11379 13460 100 -} -a { - s 0 - b 17995 13454 - e 18002 13461 - l 17995 13454 18002 13461 100 -} -a { - s 0 - b 14791 13455 - e 14798 13462 - l 14791 13455 14798 13462 100 -} -a { - s 0 - b 14740 13476 - e 14747 13483 - l 14740 13476 14747 13483 100 -} -a { - s 0 - b 6438 13476 - e 6445 13483 - l 6438 13476 6445 13483 100 -} -a { - s 0 - b 11190 13477 - e 11197 13484 - l 11190 13477 11197 13484 100 -} -a { - s 0 - b 12603 13501 - e 12610 13508 - l 12603 13501 12610 13508 100 -} -a { - s 0 - b 17518 13506 - e 17525 13513 - l 17518 13506 17525 13513 100 -} -a { - s 0 - b 17519 13507 - e 17526 13514 - l 17519 13507 17526 13514 100 -} -a { - s 0 - b 4439 13529 - e 4446 13536 - l 4439 13529 4446 13536 100 -} -a { - s 0 - b 8251 13530 - e 8258 13537 - l 8251 13530 8258 13537 100 -} -a { - s 0 - b 9716 13534 - e 9723 13541 - l 9716 13534 9723 13541 100 -} -a { - s 0 - b 5148 13536 - e 5155 13543 - l 5148 13536 5155 13543 100 -} -a { - s 0 - b 14841 13544 - e 14848 13551 - l 14841 13544 14848 13551 100 -} -a { - s 0 - b 11134 13552 - e 11141 13559 - l 11134 13552 11141 13559 100 -} -a { - s 0 - b 11558 13571 - e 11565 13578 - l 11558 13571 11565 13578 100 -} -a { - s 0 - b 5621 13575 - e 5628 13582 - l 5621 13575 5628 13582 100 -} -a { - s 0 - b 5622 13576 - e 5629 13583 - l 5622 13576 5629 13583 100 -} -a { - s 0 - b 5623 13577 - e 5630 13584 - l 5623 13577 5630 13584 100 -} -a { - s 0 - b 5624 13578 - e 5631 13585 - l 5624 13578 5631 13585 100 -} -a { - s 0 - b 5625 13579 - e 5632 13586 - l 5625 13579 5632 13586 100 -} -a { - s 0 - b 4418 13610 - e 4425 13617 - l 4418 13610 4425 13617 100 -} -a { - s 0 - b 4419 13611 - e 4426 13618 - l 4419 13611 4426 13618 100 -} -a { - s 0 - b 4420 13612 - e 4427 13619 - l 4420 13612 4427 13619 100 -} -a { - s 0 - b 11434 13614 - e 11441 13621 - l 11434 13614 11441 13621 100 -} -a { - s 0 - b 7534 13630 - e 7541 13637 - l 7534 13630 7541 13637 100 -} -a { - s 0 - b 3093 13636 - e 3100 13643 - l 3093 13636 
3100 13643 100 -} -a { - s 0 - b 9043 13647 - e 9050 13654 - l 9043 13647 9050 13654 100 -} -a { - s 0 - b 6909 13649 - e 6916 13656 - l 6909 13649 6916 13656 100 -} -a { - s 0 - b 9676 13663 - e 9683 13670 - l 9676 13663 9683 13670 100 -} -a { - s 0 - b 9744 13681 - e 9751 13688 - l 9744 13681 9751 13688 100 -} -a { - s 0 - b 5690 13699 - e 5697 13706 - l 5690 13699 5697 13706 100 -} -a { - s 0 - b 14776 13701 - e 14783 13708 - l 14776 13701 14783 13708 100 -} -a { - s 0 - b 9738 13709 - e 9745 13716 - l 9738 13709 9745 13716 100 -} -a { - s 0 - b 8453 13709 - e 8460 13716 - l 8453 13709 8460 13716 100 -} -a { - s 0 - b 5697 13710 - e 5704 13717 - l 5697 13710 5704 13717 100 -} -a { - s 0 - b 14809 13714 - e 14816 13721 - l 14809 13714 14816 13721 100 -} -a { - s 0 - b 3882 13721 - e 3889 13728 - l 3882 13721 3889 13728 100 -} -a { - s 0 - b 5184 13733 - e 5191 13740 - l 5184 13733 5191 13740 100 -} -a { - s 0 - b 14259 13737 - e 14266 13744 - l 14259 13737 14266 13744 100 -} -a { - s 0 - b 14260 13738 - e 14267 13745 - l 14260 13738 14267 13745 100 -} -a { - s 0 - b 14261 13739 - e 14268 13746 - l 14261 13739 14268 13746 100 -} -a { - s 0 - b 14262 13740 - e 14269 13747 - l 14262 13740 14269 13747 100 -} -a { - s 0 - b 8697 13740 - e 8704 13747 - l 8697 13740 8704 13747 100 -} -a { - s 0 - b 14263 13741 - e 14270 13748 - l 14263 13741 14270 13748 100 -} -a { - s 0 - b 6729 13746 - e 6736 13753 - l 6729 13746 6736 13753 100 -} -a { - s 0 - b 9373 13749 - e 9380 13756 - l 9373 13749 9380 13756 100 -} -a { - s 0 - b 18787 13750 - e 18794 13757 - l 18787 13750 18794 13757 100 -} -a { - s 0 - b 16879 13753 - e 16886 13760 - l 16879 13753 16886 13760 100 -} -a { - s 0 - b 6797 13761 - e 6804 13768 - l 6797 13761 6804 13768 100 -} -a { - s 0 - b 14109 13763 - e 14116 13770 - l 14109 13763 14116 13770 100 -} -a { - s 0 - b 14686 13765 - e 14693 13772 - l 14686 13765 14693 13772 100 -} -a { - s 0 - b 14687 13766 - e 14694 13773 - l 14687 13766 14694 13773 100 -} -a { - s 
0 - b 8136 13768 - e 8143 13775 - l 8136 13768 8143 13775 100 -} -a { - s 0 - b 18709 13779 - e 18716 13786 - l 18709 13779 18716 13786 100 -} -a { - s 0 - b 18710 13780 - e 18717 13787 - l 18710 13780 18717 13787 100 -} -a { - s 0 - b 14794 13793 - e 14801 13800 - l 14794 13793 14801 13800 100 -} -a { - s 0 - b 14866 13795 - e 14873 13802 - l 14866 13795 14873 13802 100 -} -a { - s 0 - b 14867 13796 - e 14874 13803 - l 14867 13796 14874 13803 100 -} -a { - s 0 - b 16022 13806 - e 16029 13813 - l 16022 13806 16029 13813 100 -} -a { - s 0 - b 8409 13814 - e 8416 13821 - l 8409 13814 8416 13821 100 -} -a { - s 0 - b 4854 13836 - e 4861 13843 - l 4854 13836 4861 13843 100 -} -a { - s 0 - b 11562 13837 - e 11569 13844 - l 11562 13837 11569 13844 100 -} -a { - s 0 - b 4855 13837 - e 4862 13844 - l 4855 13837 4862 13844 100 -} -a { - s 0 - b 3867 14404 - e 3874 14411 - l 3867 14404 3874 14411 100 -} -a { - s 0 - b 3868 14405 - e 3875 14412 - l 3868 14405 3875 14412 100 -} -a { - s 0 - b 7449 14406 - e 7456 14413 - l 7449 14406 7456 14413 100 -} -a { - s 0 - b 3538 14412 - e 3545 14419 - l 3538 14412 3545 14419 100 -} -a { - s 0 - b 3539 14413 - e 3546 14420 - l 3539 14413 3546 14420 100 -} -a { - s 0 - b 4443 14415 - e 4450 14422 - l 4443 14415 4450 14422 100 -} -a { - s 0 - b 10962 14420 - e 10969 14427 - l 10962 14420 10969 14427 100 -} -a { - s 0 - b 11332 14425 - e 11339 14432 - l 11332 14425 11339 14432 100 -} -a { - s 0 - b 17838 14428 - e 17845 14435 - l 17838 14428 17845 14435 100 -} -a { - s 0 - b 16069 14428 - e 16076 14435 - l 16069 14428 16076 14435 100 -} -a { - s 0 - b 11329 14428 - e 11336 14435 - l 11329 14428 11336 14435 100 -} -a { - s 0 - b 8502 14433 - e 8509 14440 - l 8502 14433 8509 14440 100 -} -a { - s 0 - b 11444 14439 - e 11451 14446 - l 11444 14439 11451 14446 100 -} -a { - s 0 - b 8050 14439 - e 8057 14446 - l 8050 14439 8057 14446 100 -} -a { - s 0 - b 15377 14449 - e 15384 14456 - l 15377 14449 15384 14456 100 -} -a { - s 0 - b 14708 14450 - 
e 14715 14457 - l 14708 14450 14715 14457 100 -} -a { - s 0 - b 14472 14466 - e 14479 14473 - l 14472 14466 14479 14473 100 -} -a { - s 0 - b 14254 14476 - e 14261 14483 - l 14254 14476 14261 14483 100 -} -a { - s 0 - b 15332 14477 - e 15339 14484 - l 15332 14477 15339 14484 100 -} -a { - s 0 - b 15333 14478 - e 15340 14485 - l 15333 14478 15340 14485 100 -} -a { - s 0 - b 4630 14495 - e 4637 14502 - l 4630 14495 4637 14502 100 -} -a { - s 0 - b 5126 14502 - e 5133 14509 - l 5126 14502 5133 14509 100 -} -a { - s 0 - b 4551 14502 - e 4558 14509 - l 4551 14502 4558 14509 100 -} -a { - s 0 - b 4966 14506 - e 4973 14513 - l 4966 14506 4973 14513 100 -} -a { - s 0 - b 15158 14511 - e 15165 14518 - l 15158 14511 15165 14518 100 -} -a { - s 0 - b 5646 14855 - e 5653 14862 - l 5646 14855 5653 14862 100 -} -a { - s 0 - b 5647 14856 - e 5654 14863 - l 5647 14856 5654 14863 100 -} -a { - s 0 - b 8133 14858 - e 8140 14865 - l 8133 14858 8140 14865 100 -} -a { - s 0 - b 14298 14863 - e 14305 14870 - l 14298 14863 14305 14870 100 -} -a { - s 0 - b 15180 14888 - e 15187 14895 - l 15180 14888 15187 14895 100 -} -a { - s 0 - b 8030 14894 - e 8037 14901 - l 8030 14894 8037 14901 100 -} -a { - s 0 - b 7650 14894 - e 7657 14901 - l 7650 14894 7657 14901 100 -} -a { - s 0 - b 132 14894 - e 139 14901 - l 132 14894 139 14901 100 -} -a { - s 0 - b 13998 14897 - e 14005 14904 - l 13998 14897 14005 14904 100 -} -a { - s 0 - b 18039 14905 - e 18046 14912 - l 18039 14905 18046 14912 100 -} -a { - s 0 - b 11035 14906 - e 11042 14913 - l 11035 14906 11042 14913 100 -} -a { - s 0 - b 6606 14907 - e 6613 14914 - l 6606 14907 6613 14914 100 -} -a { - s 0 - b 9717 14909 - e 9724 14916 - l 9717 14909 9724 14916 100 -} -a { - s 0 - b 9718 14910 - e 9725 14917 - l 9718 14910 9725 14917 100 -} -a { - s 0 - b 4395 14917 - e 4402 14924 - l 4395 14917 4402 14924 100 -} -a { - s 0 - b 13431 14934 - e 13438 14941 - l 13431 14934 13438 14941 100 -} -a { - s 0 - b 6594 14935 - e 6601 14942 - l 6594 14935 6601 
14942 100 -} -a { - s 0 - b 17520 14954 - e 17527 14961 - l 17520 14954 17527 14961 100 -} -a { - s 0 - b 13889 14954 - e 13896 14961 - l 13889 14954 13896 14961 100 -} -a { - s 0 - b 9407 14957 - e 9414 14964 - l 9407 14957 9414 14964 100 -} -a { - s 0 - b 81 14957 - e 88 14964 - l 81 14957 88 14964 100 -} -a { - s 0 - b 9408 14958 - e 9415 14965 - l 9408 14958 9415 14965 100 -} -a { - s 0 - b 9409 14959 - e 9416 14966 - l 9409 14959 9416 14966 100 -} -a { - s 0 - b 6584 14959 - e 6591 14966 - l 6584 14959 6591 14966 100 -} -a { - s 0 - b 6585 14960 - e 6592 14967 - l 6585 14960 6592 14967 100 -} -a { - s 0 - b 14052 14962 - e 14059 14969 - l 14052 14962 14059 14969 100 -} -a { - s 0 - b 17209 14967 - e 17216 14974 - l 17209 14967 17216 14974 100 -} -a { - s 0 - b 17210 14968 - e 17217 14975 - l 17210 14968 17217 14975 100 -} -a { - s 0 - b 7656 14971 - e 7663 14978 - l 7656 14971 7663 14978 100 -} -a { - s 0 - b 12226 14976 - e 12233 14983 - l 12226 14976 12233 14983 100 -} -a { - s 0 - b 4954 14978 - e 4961 14985 - l 4954 14978 4961 14985 100 -} -a { - s 0 - b 14767 14979 - e 14774 14986 - l 14767 14979 14774 14986 100 -} -a { - s 0 - b 14075 14983 - e 14082 14990 - l 14075 14983 14082 14990 100 -} -a { - s 0 - b 8122 14984 - e 8129 14991 - l 8122 14984 8129 14991 100 -} -a { - s 0 - b 7476 14987 - e 7483 14994 - l 7476 14987 7483 14994 100 -} -a { - s 0 - b 7477 14988 - e 7484 14995 - l 7477 14988 7484 14995 100 -} -a { - s 0 - b 4830 14989 - e 4837 14996 - l 4830 14989 4837 14996 100 -} -a { - s 0 - b 14248 14990 - e 14255 14997 - l 14248 14990 14255 14997 100 -} -a { - s 0 - b 14249 14991 - e 14256 14998 - l 14249 14991 14256 14998 100 -} -a { - s 0 - b 9816 14994 - e 9823 15001 - l 9816 14994 9823 15001 100 -} -a { - s 0 - b 9817 14995 - e 9824 15002 - l 9817 14995 9824 15002 100 -} -a { - s 0 - b 6571 15001 - e 6578 15008 - l 6571 15001 6578 15008 100 -} -a { - s 0 - b 4791 15006 - e 4798 15013 - l 4791 15006 4798 15013 100 -} -a { - s 0 - b 8841 15008 - e 
8848 15015 - l 8841 15008 8848 15015 100 -} -a { - s 0 - b 18504 15013 - e 18511 15020 - l 18504 15013 18511 15020 100 -} -a { - s 0 - b 3363 15015 - e 3370 15022 - l 3363 15015 3370 15022 100 -} -a { - s 0 - b 18694 15018 - e 18701 15025 - l 18694 15018 18701 15025 100 -} -a { - s 0 - b 14642 15018 - e 14649 15025 - l 14642 15018 14649 15025 100 -} -a { - s 0 - b 4447 15026 - e 4454 15033 - l 4447 15026 4454 15033 100 -} -a { - s 0 - b 11344 15037 - e 11351 15044 - l 11344 15037 11351 15044 100 -} -a { - s 0 - b 4498 15051 - e 4505 15058 - l 4498 15051 4505 15058 100 -} -a { - s 0 - b 15226 15056 - e 15233 15063 - l 15226 15056 15233 15063 100 -} -a { - s 0 - b 8222 15057 - e 8229 15064 - l 8222 15057 8229 15064 100 -} -a { - s 0 - b 7718 15057 - e 7725 15064 - l 7718 15057 7725 15064 100 -} -a { - s 0 - b 7719 15058 - e 7726 15065 - l 7719 15058 7726 15065 100 -} -a { - s 0 - b 11103 15084 - e 11110 15091 - l 11103 15084 11110 15091 100 -} -a { - s 0 - b 11104 15085 - e 11111 15092 - l 11104 15085 11111 15092 100 -} -a { - s 0 - b 11105 15086 - e 11112 15093 - l 11105 15086 11112 15093 100 -} -a { - s 0 - b 17447 15087 - e 17454 15094 - l 17447 15087 17454 15094 100 -} -a { - s 0 - b 9728 15089 - e 9735 15096 - l 9728 15089 9735 15096 100 -} -a { - s 0 - b 5134 15093 - e 5141 15100 - l 5134 15093 5141 15100 100 -} -a { - s 0 - b 159 15098 - e 166 15105 - l 159 15098 166 15105 100 -} -a { - s 0 - b 160 15099 - e 167 15106 - l 160 15099 167 15106 100 -} -a { - s 0 - b 9745 15101 - e 9752 15108 - l 9745 15101 9752 15108 100 -} -a { - s 0 - b 8629 15102 - e 8636 15109 - l 8629 15102 8636 15109 100 -} -a { - s 0 - b 8630 15103 - e 8637 15110 - l 8630 15103 8637 15110 100 -} -a { - s 0 - b 8631 15104 - e 8638 15111 - l 8631 15104 8638 15111 100 -} -a { - s 0 - b 17451 15105 - e 17458 15112 - l 17451 15105 17458 15112 100 -} -a { - s 0 - b 9316 15111 - e 9323 15118 - l 9316 15111 9323 15118 100 -} -a { - s 0 - b 5655 15113 - e 5662 15120 - l 5655 15113 5662 15120 100 -} 
-a { - s 0 - b 7665 15117 - e 7672 15124 - l 7665 15117 7672 15124 100 -} -a { - s 0 - b 5149 15128 - e 5156 15135 - l 5149 15128 5156 15135 100 -} -a { - s 0 - b 3890 15136 - e 3897 15143 - l 3890 15136 3897 15143 100 -} -a { - s 0 - b 18051 15155 - e 18058 15162 - l 18051 15155 18058 15162 100 -} -a { - s 0 - b 14683 15177 - e 14690 15184 - l 14683 15177 14690 15184 100 -} -a { - s 0 - b 14684 15178 - e 14691 15185 - l 14684 15178 14691 15185 100 -} -a { - s 0 - b 14685 15179 - e 14692 15186 - l 14685 15179 14692 15186 100 -} -a { - s 0 - b 13865 15188 - e 13872 15195 - l 13865 15188 13872 15195 100 -} -a { - s 0 - b 13866 15189 - e 13873 15196 - l 13866 15189 13873 15196 100 -} -a { - s 0 - b 7619 15191 - e 7626 15198 - l 7619 15191 7626 15198 100 -} -a { - s 0 - b 14459 15192 - e 14466 15199 - l 14459 15192 14466 15199 100 -} -a { - s 0 - b 10940 15195 - e 10947 15202 - l 10940 15195 10947 15202 100 -} -a { - s 0 - b 14818 15198 - e 14825 15205 - l 14818 15198 14825 15205 100 -} -a { - s 0 - b 3978 15279 - e 3985 15286 - l 3978 15279 3985 15286 100 -} -a { - s 0 - b 14680 15280 - e 14687 15287 - l 14680 15280 14687 15287 100 -} -a { - s 0 - b 11448 15296 - e 11455 15303 - l 11448 15296 11455 15303 100 -} -a { - s 0 - b 4422 15296 - e 4429 15303 - l 4422 15296 4429 15303 100 -} -a { - s 0 - b 11449 15297 - e 11456 15304 - l 11449 15297 11456 15304 100 -} -a { - s 0 - b 8195 15297 - e 8202 15304 - l 8195 15297 8202 15304 100 -} -a { - s 0 - b 14373 15302 - e 14380 15309 - l 14373 15302 14380 15309 100 -} -a { - s 0 - b 7685 15302 - e 7692 15309 - l 7685 15302 7692 15309 100 -} -a { - s 0 - b 7979 15303 - e 7986 15310 - l 7979 15303 7986 15310 100 -} -a { - s 0 - b 4597 15323 - e 4604 15330 - l 4597 15323 4604 15330 100 -} -a { - s 0 - b 9491 15326 - e 9498 15333 - l 9491 15326 9498 15333 100 -} -a { - s 0 - b 8283 15335 - e 8290 15342 - l 8283 15335 8290 15342 100 -} -a { - s 0 - b 12205 15340 - e 12212 15347 - l 12205 15340 12212 15347 100 -} -a { - s 0 - b 
11523 15346 - e 11530 15353 - l 11523 15346 11530 15353 100 -} -a { - s 0 - b 5614 15346 - e 5621 15353 - l 5614 15346 5621 15353 100 -} -a { - s 0 - b 11295 15357 - e 11302 15364 - l 11295 15357 11302 15364 100 -} -a { - s 0 - b 11296 15358 - e 11303 15365 - l 11296 15358 11303 15365 100 -} -a { - s 0 - b 4620 15358 - e 4627 15365 - l 4620 15358 4627 15365 100 -} -a { - s 0 - b 7597 15361 - e 7604 15368 - l 7597 15361 7604 15368 100 -} -a { - s 0 - b 7598 15362 - e 7605 15369 - l 7598 15362 7605 15369 100 -} -a { - s 0 - b 17476 15365 - e 17483 15372 - l 17476 15365 17483 15372 100 -} -a { - s 0 - b 12234 15368 - e 12241 15375 - l 12234 15368 12241 15375 100 -} -a { - s 0 - b 17578 15378 - e 17585 15385 - l 17578 15378 17585 15385 100 -} -a { - s 0 - b 6248 15396 - e 6255 15403 - l 6248 15396 6255 15403 100 -} -a { - s 0 - b 11422 15416 - e 11429 15423 - l 11422 15416 11429 15423 100 -} -a { - s 0 - b 8584 15416 - e 8591 15423 - l 8584 15416 8591 15423 100 -} -a { - s 0 - b 6722 15430 - e 6729 15437 - l 6722 15430 6729 15437 100 -} -a { - s 0 - b 14043 15444 - e 14050 15451 - l 14043 15444 14050 15451 100 -} -a { - s 0 - b 14044 15445 - e 14051 15452 - l 14044 15445 14051 15452 100 -} -a { - s 0 - b 5117 15461 - e 5124 15468 - l 5117 15461 5124 15468 100 -} -a { - s 0 - b 13511 15466 - e 13518 15473 - l 13511 15466 13518 15473 100 -} -a { - s 0 - b 8395 15467 - e 8402 15474 - l 8395 15467 8402 15474 100 -} -a { - s 0 - b 8396 15468 - e 8403 15475 - l 8396 15468 8403 15475 100 -} -a { - s 0 - b 3251 15470 - e 3258 15477 - l 3251 15470 3258 15477 100 -} -a { - s 0 - b 19 15471 - e 26 15478 - l 19 15471 26 15478 100 -} -a { - s 0 - b 20 15472 - e 27 15479 - l 20 15472 27 15479 100 -} -a { - s 0 - b 21 15473 - e 28 15480 - l 21 15473 28 15480 100 -} -a { - s 0 - b 4605 15476 - e 4612 15483 - l 4605 15476 4612 15483 100 -} -a { - s 0 - b 9352 15485 - e 9359 15492 - l 9352 15485 9359 15492 100 -} -a { - s 0 - b 9353 15486 - e 9360 15493 - l 9353 15486 9360 15493 100 -} 
-a { - s 0 - b 16099 15488 - e 16106 15495 - l 16099 15488 16106 15495 100 -} -a { - s 0 - b 16100 15489 - e 16107 15496 - l 16100 15489 16107 15496 100 -} -a { - s 0 - b 7695 15505 - e 7702 15512 - l 7695 15505 7702 15512 100 -} -a { - s 0 - b 3661 15512 - e 3668 15519 - l 3661 15512 3668 15519 100 -} -a { - s 0 - b 8575 15514 - e 8582 15521 - l 8575 15514 8582 15521 100 -} -a { - s 0 - b 15116 15516 - e 15123 15523 - l 15116 15516 15123 15523 100 -} -a { - s 0 - b 16010 15518 - e 16017 15525 - l 16010 15518 16017 15525 100 -} -a { - s 0 - b 16011 15519 - e 16018 15526 - l 16011 15519 16018 15526 100 -} -a { - s 0 - b 18164 15526 - e 18171 15533 - l 18164 15526 18171 15533 100 -} -a { - s 0 - b 9416 15535 - e 9423 15542 - l 9416 15535 9423 15542 100 -} -a { - s 0 - b 3121 15541 - e 3128 15548 - l 3121 15541 3128 15548 100 -} -a { - s 0 - b 3122 15542 - e 3129 15549 - l 3122 15542 3129 15549 100 -} -a { - s 0 - b 3186 15563 - e 3193 15570 - l 3186 15563 3193 15570 100 -} -a { - s 0 - b 4805 15564 - e 4812 15571 - l 4805 15564 4812 15571 100 -} -a { - s 0 - b 3187 15564 - e 3194 15571 - l 3187 15564 3194 15571 100 -} -a { - s 0 - b 4806 15565 - e 4813 15572 - l 4806 15565 4813 15572 100 -} -a { - s 0 - b 9620 15567 - e 9627 15574 - l 9620 15567 9627 15574 100 -} -a { - s 0 - b 15978 15587 - e 15985 15594 - l 15978 15587 15985 15594 100 -} -a { - s 0 - b 8779 15589 - e 8786 15596 - l 8779 15589 8786 15596 100 -} -a { - s 0 - b 9741 15590 - e 9748 15597 - l 9741 15590 9748 15597 100 -} -a { - s 0 - b 8780 15590 - e 8787 15597 - l 8780 15590 8787 15597 100 -} -a { - s 0 - b 7537 15591 - e 7544 15598 - l 7537 15591 7544 15598 100 -} -a { - s 0 - b 5076 15604 - e 5083 15611 - l 5076 15604 5083 15611 100 -} -a { - s 0 - b 18634 15606 - e 18641 15613 - l 18634 15606 18641 15613 100 -} -a { - s 0 - b 7623 15606 - e 7630 15613 - l 7623 15606 7630 15613 100 -} -a { - s 0 - b 7624 15607 - e 7631 15614 - l 7624 15607 7631 15614 100 -} -a { - s 0 - b 252 15611 - e 259 15618 - l 
252 15611 259 15618 100 -} -a { - s 0 - b 253 15612 - e 260 15619 - l 253 15612 260 15619 100 -} -a { - s 0 - b 254 15613 - e 261 15620 - l 254 15613 261 15620 100 -} -a { - s 0 - b 3105 15629 - e 3112 15636 - l 3105 15629 3112 15636 100 -} -a { - s 0 - b 17411 15643 - e 17418 15650 - l 17411 15643 17418 15650 100 -} -a { - s 0 - b 14301 15643 - e 14308 15650 - l 14301 15643 14308 15650 100 -} -a { - s 0 - b 17412 15644 - e 17419 15651 - l 17412 15644 17419 15651 100 -} -a { - s 0 - b 14302 15644 - e 14309 15651 - l 14302 15644 14309 15651 100 -} -a { - s 0 - b 17413 15645 - e 17420 15652 - l 17413 15645 17420 15652 100 -} -a { - s 0 - b 14303 15645 - e 14310 15652 - l 14303 15645 14310 15652 100 -} -a { - s 0 - b 3299 15655 - e 3306 15662 - l 3299 15655 3306 15662 100 -} -a { - s 0 - b 3300 15656 - e 3307 15663 - l 3300 15656 3307 15663 100 -} -a { - s 0 - b 3317 15660 - e 3324 15667 - l 3317 15660 3324 15667 100 -} -a { - s 0 - b 11126 15665 - e 11133 15672 - l 11126 15665 11133 15672 100 -} -a { - s 0 - b 11045 15675 - e 11052 15682 - l 11045 15675 11052 15682 100 -} -a { - s 0 - b 7417 15676 - e 7424 15683 - l 7417 15676 7424 15683 100 -} -a { - s 0 - b 15136 15680 - e 15143 15687 - l 15136 15680 15143 15687 100 -} -a { - s 0 - b 14163 15689 - e 14170 15696 - l 14163 15689 14170 15696 100 -} -a { - s 0 - b 3873 15690 - e 3880 15697 - l 3873 15690 3880 15697 100 -} -a { - s 0 - b 8606 15696 - e 8613 15703 - l 8606 15696 8613 15703 100 -} -a { - s 0 - b 15243 15697 - e 15250 15704 - l 15243 15697 15250 15704 100 -} -a { - s 0 - b 17299 15724 - e 17306 15731 - l 17299 15724 17306 15731 100 -} -a { - s 0 - b 11242 15728 - e 11249 15735 - l 11242 15728 11249 15735 100 -} -a { - s 0 - b 14928 15729 - e 14935 15736 - l 14928 15729 14935 15736 100 -} -a { - s 0 - b 9637 15730 - e 9644 15737 - l 9637 15730 9644 15737 100 -} -a { - s 0 - b 17522 15731 - e 17529 15738 - l 17522 15731 17529 15738 100 -} -a { - s 0 - b 18587 15736 - e 18594 15743 - l 18587 15736 18594 15743 
100 -} -a { - s 0 - b 17475 15737 - e 17482 15744 - l 17475 15737 17482 15744 100 -} -a { - s 0 - b 8344 15737 - e 8351 15744 - l 8344 15737 8351 15744 100 -} -a { - s 0 - b 17476 15738 - e 17483 15745 - l 17476 15738 17483 15745 100 -} -a { - s 0 - b 17477 15739 - e 17484 15746 - l 17477 15739 17484 15746 100 -} -a { - s 0 - b 6298 15740 - e 6305 15747 - l 6298 15740 6305 15747 100 -} -a { - s 0 - b 14815 15748 - e 14822 15755 - l 14815 15748 14822 15755 100 -} -a { - s 0 - b 15920 15761 - e 15927 15768 - l 15920 15761 15927 15768 100 -} -a { - s 0 - b 15921 15762 - e 15928 15769 - l 15921 15762 15928 15769 100 -} -a { - s 0 - b 11040 15764 - e 11047 15771 - l 11040 15764 11047 15771 100 -} -a { - s 0 - b 14071 15775 - e 14078 15782 - l 14071 15775 14078 15782 100 -} -a { - s 0 - b 14072 15776 - e 14079 15783 - l 14072 15776 14079 15783 100 -} -a { - s 0 - b 8139 15776 - e 8146 15783 - l 8139 15776 8146 15783 100 -} -a { - s 0 - b 14073 15777 - e 14080 15784 - l 14073 15777 14080 15784 100 -} -a { - s 0 - b 14074 15778 - e 14081 15785 - l 14074 15778 14081 15785 100 -} -a { - s 0 - b 6538 15778 - e 6545 15785 - l 6538 15778 6545 15785 100 -} -a { - s 0 - b 14075 15779 - e 14082 15786 - l 14075 15779 14082 15786 100 -} -a { - s 0 - b 3005 15782 - e 3012 15789 - l 3005 15782 3012 15789 100 -} -a { - s 0 - b 18671 15783 - e 18678 15790 - l 18671 15783 18678 15790 100 -} -a { - s 0 - b 14381 15783 - e 14388 15790 - l 14381 15783 14388 15790 100 -} -a { - s 0 - b 3006 15783 - e 3013 15790 - l 3006 15783 3013 15790 100 -} -a { - s 0 - b 18672 15784 - e 18679 15791 - l 18672 15784 18679 15791 100 -} -a { - s 0 - b 14957 15806 - e 14964 15813 - l 14957 15806 14964 15813 100 -} -a { - s 0 - b 14958 15807 - e 14965 15814 - l 14958 15807 14965 15814 100 -} -a { - s 0 - b 13533 15807 - e 13540 15814 - l 13533 15807 13540 15814 100 -} -a { - s 0 - b 14577 15809 - e 14584 15816 - l 14577 15809 14584 15816 100 -} -a { - s 0 - b 8385 15825 - e 8392 15832 - l 8385 15825 8392 15832 
100 -} -a { - s 0 - b 11397 15826 - e 11404 15833 - l 11397 15826 11404 15833 100 -} -a { - s 0 - b 11398 15827 - e 11405 15834 - l 11398 15827 11405 15834 100 -} -a { - s 0 - b 11336 15844 - e 11343 15851 - l 11336 15844 11343 15851 100 -} -a { - s 0 - b 11337 15845 - e 11344 15852 - l 11337 15845 11344 15852 100 -} -a { - s 0 - b 11338 15846 - e 11345 15853 - l 11338 15846 11345 15853 100 -} -a { - s 0 - b 3616 15861 - e 3623 15868 - l 3616 15861 3623 15868 100 -} -a { - s 0 - b 13996 15862 - e 14003 15869 - l 13996 15862 14003 15869 100 -} -a { - s 0 - b 9675 15862 - e 9682 15869 - l 9675 15862 9682 15869 100 -} -a { - s 0 - b 258 15866 - e 265 15873 - l 258 15866 265 15873 100 -} -a { - s 0 - b 3530 15885 - e 3537 15892 - l 3530 15885 3537 15892 100 -} -a { - s 0 - b 14815 16087 - e 14822 16094 - l 14815 16087 14822 16094 100 -} -a { - s 0 - b 15330 16100 - e 15337 16107 - l 15330 16100 15337 16107 100 -} -a { - s 0 - b 15109 16118 - e 15116 16125 - l 15109 16118 15116 16125 100 -} -a { - s 0 - b 6574 16128 - e 6581 16135 - l 6574 16128 6581 16135 100 -} -a { - s 0 - b 17383 16130 - e 17390 16137 - l 17383 16130 17390 16137 100 -} -a { - s 0 - b 226 16141 - e 233 16148 - l 226 16141 233 16148 100 -} -a { - s 0 - b 227 16142 - e 234 16149 - l 227 16142 234 16149 100 -} -a { - s 0 - b 3673 16147 - e 3680 16154 - l 3673 16147 3680 16154 100 -} -a { - s 0 - b 3674 16148 - e 3681 16155 - l 3674 16148 3681 16155 100 -} -a { - s 0 - b 3675 16149 - e 3682 16156 - l 3675 16149 3682 16156 100 -} -a { - s 0 - b 5067 16152 - e 5074 16159 - l 5067 16152 5074 16159 100 -} -a { - s 0 - b 6396 16154 - e 6403 16161 - l 6396 16154 6403 16161 100 -} -a { - s 0 - b 6397 16155 - e 6404 16162 - l 6397 16155 6404 16162 100 -} -a { - s 0 - b 14367 16156 - e 14374 16163 - l 14367 16156 14374 16163 100 -} -a { - s 0 - b 228 16167 - e 235 16174 - l 228 16167 235 16174 100 -} -a { - s 0 - b 11 16168 - e 18 16175 - l 11 16168 18 16175 100 -} -a { - s 0 - b 15360 16172 - e 15367 16179 - l 
15360 16172 15367 16179 100 -} -a { - s 0 - b 14753 16174 - e 14760 16181 - l 14753 16174 14760 16181 100 -} -a { - s 0 - b 17420 16178 - e 17427 16185 - l 17420 16178 17427 16185 100 -} -a { - s 0 - b 4005 16181 - e 4012 16188 - l 4005 16181 4012 16188 100 -} -a { - s 0 - b 5702 16182 - e 5709 16189 - l 5702 16182 5709 16189 100 -} -a { - s 0 - b 5703 16183 - e 5710 16190 - l 5703 16183 5710 16190 100 -} -a { - s 0 - b 3370 16185 - e 3377 16192 - l 3370 16185 3377 16192 100 -} -a { - s 0 - b 6673 16189 - e 6680 16196 - l 6673 16189 6680 16196 100 -} -a { - s 0 - b 6674 16190 - e 6681 16197 - l 6674 16190 6681 16197 100 -} -a { - s 0 - b 6675 16191 - e 6682 16198 - l 6675 16191 6682 16198 100 -} -a { - s 0 - b 8099 16202 - e 8106 16209 - l 8099 16202 8106 16209 100 -} -a { - s 0 - b 17280 16203 - e 17287 16210 - l 17280 16203 17287 16210 100 -} -a { - s 0 - b 68 16204 - e 75 16211 - l 68 16204 75 16211 100 -} -a { - s 0 - b 13896 16206 - e 13903 16213 - l 13896 16206 13903 16213 100 -} -a { - s 0 - b 7619 16212 - e 7626 16219 - l 7619 16212 7626 16219 100 -} -a { - s 0 - b 17838 16213 - e 17845 16220 - l 17838 16213 17845 16220 100 -} -a { - s 0 - b 16069 16213 - e 16076 16220 - l 16069 16213 16076 16220 100 -} -a { - s 0 - b 11329 16213 - e 11336 16220 - l 11329 16213 11336 16220 100 -} -a { - s 0 - b 14777 16219 - e 14784 16226 - l 14777 16219 14784 16226 100 -} -a { - s 0 - b 10831 16225 - e 10838 16232 - l 10831 16225 10838 16232 100 -} -a { - s 0 - b 14718 16239 - e 14725 16246 - l 14718 16239 14725 16246 100 -} -a { - s 0 - b 7737 16239 - e 7744 16246 - l 7737 16239 7744 16246 100 -} -a { - s 0 - b 7738 16240 - e 7745 16247 - l 7738 16240 7745 16247 100 -} -a { - s 0 - b 7739 16241 - e 7746 16248 - l 7739 16241 7746 16248 100 -} -a { - s 0 - b 7594 16252 - e 7601 16259 - l 7594 16252 7601 16259 100 -} -a { - s 0 - b 14271 16253 - e 14278 16260 - l 14271 16253 14278 16260 100 -} -a { - s 0 - b 3717 16254 - e 3724 16261 - l 3717 16254 3724 16261 100 -} -a { - s 
0 - b 3718 16255 - e 3725 16262 - l 3718 16255 3725 16262 100 -} -a { - s 0 - b 15268 16265 - e 15275 16272 - l 15268 16265 15275 16272 100 -} -a { - s 0 - b 5103 16265 - e 5110 16272 - l 5103 16265 5110 16272 100 -} -a { - s 0 - b 3026 16267 - e 3033 16274 - l 3026 16267 3033 16274 100 -} -a { - s 0 - b 18589 16268 - e 18596 16275 - l 18589 16268 18596 16275 100 -} -a { - s 0 - b 3027 16268 - e 3034 16275 - l 3027 16268 3034 16275 100 -} -a { - s 0 - b 18590 16269 - e 18597 16276 - l 18590 16269 18597 16276 100 -} -a { - s 0 - b 3028 16269 - e 3035 16276 - l 3028 16269 3035 16276 100 -} -a { - s 0 - b 15969 16271 - e 15976 16278 - l 15969 16271 15976 16278 100 -} -a { - s 0 - b 13989 16308 - e 13996 16315 - l 13989 16308 13996 16315 100 -} -a { - s 0 - b 13990 16309 - e 13997 16316 - l 13990 16309 13997 16316 100 -} -a { - s 0 - b 4542 16324 - e 4549 16331 - l 4542 16324 4549 16331 100 -} -a { - s 0 - b 3318 16332 - e 3325 16339 - l 3318 16332 3325 16339 100 -} -a { - s 0 - b 11390 16339 - e 11397 16346 - l 11390 16339 11397 16346 100 -} -a { - s 0 - b 4039 16341 - e 4046 16348 - l 4039 16341 4046 16348 100 -} -a { - s 0 - b 247 16342 - e 254 16349 - l 247 16342 254 16349 100 -} -a { - s 0 - b 137 16355 - e 144 16362 - l 137 16355 144 16362 100 -} -a { - s 0 - b 5616 16356 - e 5623 16363 - l 5616 16356 5623 16363 100 -} -a { - s 0 - b 5043 16356 - e 5050 16363 - l 5043 16356 5050 16363 100 -} -a { - s 0 - b 282 16373 - e 289 16380 - l 282 16373 289 16380 100 -} -a { - s 0 - b 13951 16406 - e 13958 16413 - l 13951 16406 13958 16413 100 -} -a { - s 0 - b 13952 16407 - e 13959 16414 - l 13952 16407 13959 16414 100 -} -a { - s 0 - b 18015 16408 - e 18022 16415 - l 18015 16408 18022 16415 100 -} -a { - s 0 - b 13953 16408 - e 13960 16415 - l 13953 16408 13960 16415 100 -} -a { - s 0 - b 10873 16409 - e 10880 16416 - l 10873 16409 10880 16416 100 -} -a { - s 0 - b 6820 16409 - e 6827 16416 - l 6820 16409 6827 16416 100 -} -a { - s 0 - b 108 16416 - e 115 16423 - l 108 
16416 115 16423 100 -} -a { - s 0 - b 6819 16418 - e 6826 16425 - l 6819 16418 6826 16425 100 -} -a { - s 0 - b 18016 16419 - e 18023 16426 - l 18016 16419 18023 16426 100 -} -a { - s 0 - b 7463 16433 - e 7470 16440 - l 7463 16433 7470 16440 100 -} -a { - s 0 - b 18755 16440 - e 18762 16447 - l 18755 16440 18762 16447 100 -} -a { - s 0 - b 17494 16440 - e 17501 16447 - l 17494 16440 17501 16447 100 -} -a { - s 0 - b 8229 16444 - e 8236 16451 - l 8229 16444 8236 16451 100 -} -a { - s 0 - b 8315 16459 - e 8322 16466 - l 8315 16459 8322 16466 100 -} -a { - s 0 - b 8316 16460 - e 8323 16467 - l 8316 16460 8323 16467 100 -} -a { - s 0 - b 10849 16461 - e 10856 16468 - l 10849 16461 10856 16468 100 -} -a { - s 0 - b 10850 16462 - e 10857 16469 - l 10850 16462 10857 16469 100 -} -a { - s 0 - b 15208 16464 - e 15215 16471 - l 15208 16464 15215 16471 100 -} -a { - s 0 - b 7534 16489 - e 7541 16496 - l 7534 16489 7541 16496 100 -} -a { - s 0 - b 7535 16490 - e 7542 16497 - l 7535 16490 7542 16497 100 -} -a { - s 0 - b 17551 16491 - e 17558 16498 - l 17551 16491 17558 16498 100 -} -a { - s 0 - b 7536 16491 - e 7543 16498 - l 7536 16491 7543 16498 100 -} -a { - s 0 - b 7537 16492 - e 7544 16499 - l 7537 16492 7544 16499 100 -} -a { - s 0 - b 6610 16494 - e 6617 16501 - l 6610 16494 6617 16501 100 -} -a { - s 0 - b 15321 16495 - e 15328 16502 - l 15321 16495 15328 16502 100 -} -a { - s 0 - b 15322 16496 - e 15329 16503 - l 15322 16496 15329 16503 100 -} -a { - s 0 - b 18377 16498 - e 18384 16505 - l 18377 16498 18384 16505 100 -} -a { - s 0 - b 4290 16498 - e 4297 16505 - l 4290 16498 4297 16505 100 -} -a { - s 0 - b 14478 16509 - e 14485 16516 - l 14478 16509 14485 16516 100 -} -a { - s 0 - b 16051 16510 - e 16058 16517 - l 16051 16510 16058 16517 100 -} -a { - s 0 - b 16052 16511 - e 16059 16518 - l 16052 16511 16059 16518 100 -} -a { - s 0 - b 10830 16523 - e 10837 16530 - l 10830 16523 10837 16530 100 -} -a { - s 0 - b 10831 16524 - e 10838 16531 - l 10831 16524 10838 16531 
100 -} -a { - s 0 - b 18582 16528 - e 18589 16535 - l 18582 16528 18589 16535 100 -} -a { - s 0 - b 18583 16529 - e 18590 16536 - l 18583 16529 18590 16536 100 -} -a { - s 0 - b 18432 16529 - e 18439 16536 - l 18432 16529 18439 16536 100 -} -a { - s 0 - b 12322 16529 - e 12329 16536 - l 12322 16529 12329 16536 100 -} -a { - s 0 - b 11272 16547 - e 11279 16554 - l 11272 16547 11279 16554 100 -} -a { - s 0 - b 11273 16548 - e 11280 16555 - l 11273 16548 11280 16555 100 -} -a { - s 0 - b 274 16551 - e 281 16558 - l 274 16551 281 16558 100 -} -a { - s 0 - b 275 16552 - e 282 16559 - l 275 16552 282 16559 100 -} -a { - s 0 - b 4843 16553 - e 4850 16560 - l 4843 16553 4850 16560 100 -} -a { - s 0 - b 6745 16560 - e 6752 16567 - l 6745 16560 6752 16567 100 -} -a { - s 0 - b 6746 16561 - e 6753 16568 - l 6746 16561 6753 16568 100 -} -a { - s 0 - b 23 16579 - e 30 16586 - l 23 16579 30 16586 100 -} -a { - s 0 - b 16018 16587 - e 16025 16594 - l 16018 16587 16025 16594 100 -} -a { - s 0 - b 10995 16588 - e 11002 16595 - l 10995 16588 11002 16595 100 -} -a { - s 0 - b 10996 16589 - e 11003 16596 - l 10996 16589 11003 16596 100 -} -a { - s 0 - b 17243 16607 - e 17250 16614 - l 17243 16607 17250 16614 100 -} -a { - s 0 - b 5 17045 - e 12 17052 - l 5 17045 12 17052 100 -} -a { - s 0 - b 18612 17052 - e 18619 17059 - l 18612 17052 18619 17059 100 -} -a { - s 0 - b 18613 17053 - e 18620 17060 - l 18613 17053 18620 17060 100 -} -a { - s 0 - b 3568 17071 - e 3575 17078 - l 3568 17071 3575 17078 100 -} -a { - s 0 - b 3569 17072 - e 3576 17079 - l 3569 17072 3576 17079 100 -} -a { - s 0 - b 3344 17086 - e 3351 17093 - l 3344 17086 3351 17093 100 -} -a { - s 0 - b 4101 17089 - e 4108 17096 - l 4101 17089 4108 17096 100 -} -a { - s 0 - b 4102 17090 - e 4109 17097 - l 4102 17090 4109 17097 100 -} -a { - s 0 - b 4103 17091 - e 4110 17098 - l 4103 17091 4110 17098 100 -} -a { - s 0 - b 16136 17093 - e 16143 17100 - l 16136 17093 16143 17100 100 -} -a { - s 0 - b 17309 17094 - e 17316 17101 
- l 17309 17094 17316 17101 100 -} -a { - s 0 - b 16137 17094 - e 16144 17101 - l 16137 17094 16144 17101 100 -} -a { - s 0 - b 3994 17098 - e 4001 17105 - l 3994 17098 4001 17105 100 -} -a { - s 0 - b 3995 17099 - e 4002 17106 - l 3995 17099 4002 17106 100 -} -a { - s 0 - b 13994 17100 - e 14001 17107 - l 13994 17100 14001 17107 100 -} -a { - s 0 - b 18652 17103 - e 18659 17110 - l 18652 17103 18659 17110 100 -} -a { - s 0 - b 18653 17104 - e 18660 17111 - l 18653 17104 18660 17111 100 -} -a { - s 0 - b 17160 17107 - e 17167 17114 - l 17160 17107 17167 17114 100 -} -a { - s 0 - b 216 17109 - e 223 17116 - l 216 17109 223 17116 100 -} -a { - s 0 - b 8274 17112 - e 8281 17119 - l 8274 17112 8281 17119 100 -} -a { - s 0 - b 8275 17113 - e 8282 17120 - l 8275 17113 8282 17120 100 -} -a { - s 0 - b 5100 17116 - e 5107 17123 - l 5100 17116 5107 17123 100 -} -a { - s 0 - b 5101 17117 - e 5108 17124 - l 5101 17117 5108 17124 100 -} -a { - s 0 - b 4930 17122 - e 4937 17129 - l 4930 17122 4937 17129 100 -} -a { - s 0 - b 7472 17140 - e 7479 17147 - l 7472 17140 7479 17147 100 -} -a { - s 0 - b 14379 17142 - e 14386 17149 - l 14379 17142 14386 17149 100 -} -a { - s 0 - b 7560 17155 - e 7567 17162 - l 7560 17155 7567 17162 100 -} -a { - s 0 - b 9013 17178 - e 9020 17185 - l 9013 17178 9020 17185 100 -} -a { - s 0 - b 9014 17179 - e 9021 17186 - l 9014 17179 9021 17186 100 -} -a { - s 0 - b 14386 17197 - e 14393 17204 - l 14386 17197 14393 17204 100 -} -a { - s 0 - b 5022 17905 - e 5029 17912 - l 5022 17905 5029 17912 100 -} -a { - s 0 - b 13443 17906 - e 13450 17913 - l 13443 17906 13450 17913 100 -} -a { - s 0 - b 17162 17916 - e 17169 17923 - l 17162 17916 17169 17923 100 -} -a { - s 0 - b 4943 17923 - e 4950 17930 - l 4943 17923 4950 17930 100 -} -a { - s 0 - b 6901 17925 - e 6908 17932 - l 6901 17925 6908 17932 100 -} -a { - s 0 - b 15979 17929 - e 15986 17936 - l 15979 17929 15986 17936 100 -} -a { - s 0 - b 11362 17948 - e 11369 17955 - l 11362 17948 11369 17955 100 -} 
-a { - s 0 - b 11363 17949 - e 11370 17956 - l 11363 17949 11370 17956 100 -} -a { - s 0 - b 14822 18342 - e 14829 18349 - l 14822 18342 14829 18349 100 -} -a { - s 0 - b 11410 18343 - e 11417 18350 - l 11410 18343 11417 18350 100 -} -a { - s 0 - b 9494 18343 - e 9501 18350 - l 9494 18343 9501 18350 100 -} -a { - s 0 - b 6258 18347 - e 6265 18354 - l 6258 18347 6265 18354 100 -} -a { - s 0 - b 6259 18348 - e 6266 18355 - l 6259 18348 6266 18355 100 -} -a { - s 0 - b 4934 18356 - e 4941 18363 - l 4934 18356 4941 18363 100 -} -a { - s 0 - b 4935 18357 - e 4942 18364 - l 4935 18357 4942 18364 100 -} -a { - s 0 - b 7967 18359 - e 7974 18366 - l 7967 18359 7974 18366 100 -} -a { - s 0 - b 4866 18365 - e 4873 18372 - l 4866 18365 4873 18372 100 -} -a { - s 0 - b 4901 18369 - e 4908 18376 - l 4901 18369 4908 18376 100 -} -a { - s 0 - b 4902 18370 - e 4909 18377 - l 4902 18370 4909 18377 100 -} -a { - s 0 - b 4903 18371 - e 4910 18378 - l 4903 18371 4910 18378 100 -} -a { - s 0 - b 4904 18372 - e 4911 18379 - l 4904 18372 4911 18379 100 -} -a { - s 0 - b 6557 18384 - e 6564 18391 - l 6557 18384 6564 18391 100 -} -a { - s 0 - b 7594 18393 - e 7601 18400 - l 7594 18393 7601 18400 100 -} -a { - s 0 - b 9013 18394 - e 9020 18401 - l 9013 18394 9020 18401 100 -} -a { - s 0 - b 9014 18395 - e 9021 18402 - l 9014 18395 9021 18402 100 -} -a { - s 0 - b 8768 18402 - e 8775 18409 - l 8768 18402 8775 18409 100 -} -a { - s 0 - b 11274 18410 - e 11281 18417 - l 11274 18410 11281 18417 100 -} -a { - s 0 - b 9677 18414 - e 9684 18421 - l 9677 18414 9684 18421 100 -} -a { - s 0 - b 9678 18415 - e 9685 18422 - l 9678 18415 9685 18422 100 -} -a { - s 0 - b 4951 18420 - e 4958 18427 - l 4951 18420 4958 18427 100 -} -a { - s 0 - b 4967 18436 - e 4974 18443 - l 4967 18436 4974 18443 100 -} -a { - s 0 - b 15471 18437 - e 15478 18444 - l 15471 18437 15478 18444 100 -} -a { - s 0 - b 15472 18438 - e 15479 18445 - l 15472 18438 15479 18445 100 -} -a { - s 0 - b 18633 18439 - e 18640 18446 - l 
18633 18439 18640 18446 100 -} -a { - s 0 - b 4991 18460 - e 4998 18467 - l 4991 18460 4998 18467 100 -} -a { - s 0 - b 4992 18461 - e 4999 18468 - l 4992 18461 4999 18468 100 -} -a { - s 0 - b 7657 18494 - e 7664 18501 - l 7657 18494 7664 18501 100 -} -a { - s 0 - b 7658 18495 - e 7665 18502 - l 7658 18495 7665 18502 100 -} -a { - s 0 - b 8979 18496 - e 8986 18503 - l 8979 18496 8986 18503 100 -} -a { - s 0 - b 7659 18496 - e 7666 18503 - l 7659 18496 7666 18503 100 -} -a { - s 0 - b 237 18496 - e 244 18503 - l 237 18496 244 18503 100 -} -a { - s 0 - b 238 18497 - e 245 18504 - l 238 18497 245 18504 100 -} -a { - s 0 - b 3095 18516 - e 3102 18523 - l 3095 18516 3102 18523 100 -} -a { - s 0 - b 16152 18517 - e 16159 18524 - l 16152 18517 16159 18524 100 -} -a { - s 0 - b 11075 18517 - e 11082 18524 - l 11075 18517 11082 18524 100 -} -a { - s 0 - b 14611 18518 - e 14618 18525 - l 14611 18518 14618 18525 100 -} -a { - s 0 - b 3141 18528 - e 3148 18535 - l 3141 18528 3148 18535 100 -} -a { - s 0 - b 16064 18529 - e 16071 18536 - l 16064 18529 16071 18536 100 -} -a { - s 0 - b 3142 18529 - e 3149 18536 - l 3142 18529 3149 18536 100 -} -a { - s 0 - b 14891 18534 - e 14898 18541 - l 14891 18534 14898 18541 100 -} -a { - s 0 - b 11277 18537 - e 11284 18544 - l 11277 18537 11284 18544 100 -} -a { - s 0 - b 11278 18538 - e 11285 18545 - l 11278 18538 11285 18545 100 -} -a { - s 0 - b 11279 18539 - e 11286 18546 - l 11279 18539 11286 18546 100 -} -a { - s 0 - b 9688 18540 - e 9695 18547 - l 9688 18540 9695 18547 100 -} -a { - s 0 - b 4416 18541 - e 4423 18548 - l 4416 18541 4423 18548 100 -} -a { - s 0 - b 5129 18555 - e 5136 18562 - l 5129 18555 5136 18562 100 -} -a { - s 0 - b 18772 18561 - e 18779 18568 - l 18772 18561 18779 18568 100 -} -a { - s 0 - b 3252 18561 - e 3259 18568 - l 3252 18561 3259 18568 100 -} -a { - s 0 - b 6403 18562 - e 6410 18569 - l 6403 18562 6410 18569 100 -} -a { - s 0 - b 3906 18562 - e 3913 18569 - l 3906 18562 3913 18569 100 -} -a { - s 0 - b 
14511 18568 - e 14518 18575 - l 14511 18568 14518 18575 100 -} -a { - s 0 - b 9627 18572 - e 9634 18579 - l 9627 18572 9634 18579 100 -} -a { - s 0 - b 14350 18579 - e 14357 18586 - l 14350 18579 14357 18586 100 -} -a { - s 0 - b 15962 18580 - e 15969 18587 - l 15962 18580 15969 18587 100 -} -a { - s 0 - b 14351 18580 - e 14358 18587 - l 14351 18580 14358 18587 100 -} -a { - s 0 - b 9724 18580 - e 9731 18587 - l 9724 18580 9731 18587 100 -} -a { - s 0 - b 9725 18581 - e 9732 18588 - l 9725 18581 9732 18588 100 -} -a { - s 0 - b 8476 18583 - e 8483 18590 - l 8476 18583 8483 18590 100 -} -a { - s 0 - b 8477 18584 - e 8484 18591 - l 8477 18584 8484 18591 100 -} -a { - s 0 - b 17974 18593 - e 17981 18600 - l 17974 18593 17981 18600 100 -} -a { - s 0 - b 17488 18593 - e 17495 18600 - l 17488 18593 17495 18600 100 -} -a { - s 0 - b 6646 18593 - e 6653 18600 - l 6646 18593 6653 18600 100 -} -a { - s 0 - b 17975 18594 - e 17982 18601 - l 17975 18594 17982 18601 100 -} -a { - s 0 - b 9628 18594 - e 9635 18601 - l 9628 18594 9635 18601 100 -} -a { - s 0 - b 240 18595 - e 247 18602 - l 240 18595 247 18602 100 -} -a { - s 0 - b 3932 18596 - e 3939 18603 - l 3932 18596 3939 18603 100 -} -a { - s 0 - b 18528 18597 - e 18535 18604 - l 18528 18597 18535 18604 100 -} -a { - s 0 - b 13441 18607 - e 13448 18614 - l 13441 18607 13448 18614 100 -} -a { - s 0 - b 12599 18610 - e 12606 18617 - l 12599 18610 12606 18617 100 -} -a { - s 0 - b 12600 18611 - e 12607 18618 - l 12600 18611 12607 18618 100 -} -a { - s 0 - b 9603 18612 - e 9610 18619 - l 9603 18612 9610 18619 100 -} -a { - s 0 - b 8115 18613 - e 8122 18620 - l 8115 18613 8122 18620 100 -} -a { - s 0 - b 3719 18622 - e 3726 18629 - l 3719 18622 3726 18629 100 -} -a { - s 0 - b 16103 18646 - e 16110 18653 - l 16103 18646 16110 18653 100 -} -a { - s 0 - b 11174 18654 - e 11181 18661 - l 11174 18654 11181 18661 100 -} -a { - s 0 - b 14602 18671 - e 14609 18678 - l 14602 18671 14609 18678 100 -} -a { - s 0 - b 6609 18671 - e 6616 
18678 - l 6609 18671 6616 18678 100 -} -a { - s 0 - b 4791 18680 - e 4798 18687 - l 4791 18680 4798 18687 100 -} -a { - s 0 - b 7619 18683 - e 7626 18690 - l 7619 18683 7626 18690 100 -} -a { - s 0 - b 6495 18686 - e 6502 18693 - l 6495 18686 6502 18693 100 -} -a { - s 0 - b 6496 18687 - e 6503 18694 - l 6496 18687 6503 18694 100 -} -a { - s 0 - b 4374 18687 - e 4381 18694 - l 4374 18687 4381 18694 100 -} -a { - s 0 - b 4375 18688 - e 4382 18695 - l 4375 18688 4382 18695 100 -} -a { - s 0 - b 4376 18689 - e 4383 18696 - l 4376 18689 4383 18696 100 -} -a { - s 0 - b 8531 18822 - e 8538 18829 - l 8531 18822 8538 18829 100 -} -a { - s 0 - b 8532 18823 - e 8539 18830 - l 8532 18823 8539 18830 100 -} -a { - s 0 - b 11550 18832 - e 11557 18839 - l 11550 18832 11557 18839 100 -} -a { - s 0 - b 8255 18833 - e 8262 18840 - l 8255 18833 8262 18840 100 -} -a { - s 0 - b 9612 18834 - e 9619 18841 - l 9612 18834 9619 18841 100 -} -a { - s 0 - b 7415 18839 - e 7422 18846 - l 7415 18839 7422 18846 100 -} -a { - s 0 - b 17384 18852 - e 17391 18859 - l 17384 18852 17391 18859 100 -} -a { - s 0 - b 173 18853 - e 180 18860 - l 173 18853 180 18860 100 -} -a { - s 0 - b 3262 18866 - e 3269 18873 - l 3262 18866 3269 18873 100 -} -a { - s 0 - b 18684 18872 - e 18691 18879 - l 18684 18872 18691 18879 100 -} -a { - s 0 - b 14289 18881 - e 14296 18888 - l 14289 18881 14296 18888 100 -} -a { - s 0 - b 4412 18896 - e 4419 18903 - l 4412 18896 4419 18903 100 -} -a { - s 0 - b 14887 18939 - e 14894 18946 - l 14887 18939 14894 18946 100 -} -a { - s 0 - b 13495 18944 - e 13502 18951 - l 13495 18944 13502 18951 100 -} -a { - s 0 - b 11242 18945 - e 11249 18952 - l 11242 18945 11249 18952 100 -} -a { - s 0 - b 11243 18946 - e 11250 18953 - l 11243 18946 11250 18953 100 -} -a { - s 0 - b 13957 18950 - e 13964 18957 - l 13957 18950 13964 18957 100 -} -a { - s 0 - b 8670 18950 - e 8677 18957 - l 8670 18950 8677 18957 100 -} -a { - s 0 - b 15183 18951 - e 15190 18958 - l 15183 18951 15190 18958 100 -} 
-a { - s 0 - b 18732 18952 - e 18739 18959 - l 18732 18952 18739 18959 100 -} -a { - s 0 - b 18733 18953 - e 18740 18960 - l 18733 18953 18740 18960 100 -} -a { - s 0 - b 17508 18953 - e 17515 18960 - l 17508 18953 17515 18960 100 -} -a { - s 0 - b 10891 18953 - e 10898 18960 - l 10891 18953 10898 18960 100 -} -a { - s 0 - b 17509 18954 - e 17516 18961 - l 17509 18954 17516 18961 100 -} -a { - s 0 - b 11055 18954 - e 11062 18961 - l 11055 18954 11062 18961 100 -} -a { - s 0 - b 18706 18964 - e 18713 18971 - l 18706 18964 18713 18971 100 -} -a { - s 0 - b 3169 18967 - e 3176 18974 - l 3169 18967 3176 18974 100 -} -a { - s 0 - b 11278 18969 - e 11285 18976 - l 11278 18969 11285 18976 100 -} -a { - s 0 - b 11279 18970 - e 11286 18977 - l 11279 18970 11286 18977 100 -} -a { - s 0 - b 11280 18971 - e 11287 18978 - l 11280 18971 11287 18978 100 -} -a { - s 0 - b 15405 18987 - e 15412 18994 - l 15405 18987 15412 18994 100 -} -a { - s 0 - b 14615 18987 - e 14622 18994 - l 14615 18987 14622 18994 100 -} -a { - s 0 - b 14572 18989 - e 14579 18996 - l 14572 18989 14579 18996 100 -} -a { - s 0 - b 14264 18989 - e 14271 18996 - l 14264 18989 14271 18996 100 -} -a { - s 0 - b 14265 18990 - e 14272 18997 - l 14265 18990 14272 18997 100 -} -a { - s 0 - b 13453 18991 - e 13460 18998 - l 13453 18991 13460 18998 100 -} -a { - s 0 - b 5686 18997 - e 5693 19004 - l 5686 18997 5693 19004 100 -} -a { - s 0 - b 12196 19005 - e 12203 19012 - l 12196 19005 12203 19012 100 -} -a { - s 0 - b 15247 19009 - e 15254 19016 - l 15247 19009 15254 19016 100 -} -a { - s 0 - b 3894 19012 - e 3901 19019 - l 3894 19012 3901 19019 100 -} -a { - s 0 - b 3895 19013 - e 3902 19020 - l 3895 19013 3902 19020 100 -} -a { - s 0 - b 3896 19014 - e 3903 19021 - l 3896 19014 3903 19021 100 -} -a { - s 0 - b 14749 19015 - e 14756 19022 - l 14749 19015 14756 19022 100 -} -a { - s 0 - b 14750 19016 - e 14757 19023 - l 14750 19016 14757 19023 100 -} -a { - s 0 - b 14751 19017 - e 14758 19024 - l 14751 19017 14758 
19024 100 -} -a { - s 0 - b 14731 19017 - e 14738 19024 - l 14731 19017 14738 19024 100 -} -a { - s 0 - b 14732 19018 - e 14739 19025 - l 14732 19018 14739 19025 100 -} -a { - s 0 - b 15362 19019 - e 15369 19026 - l 15362 19019 15369 19026 100 -} -a { - s 0 - b 9331 19026 - e 9338 19033 - l 9331 19026 9338 19033 100 -} -a { - s 0 - b 9332 19027 - e 9339 19034 - l 9332 19027 9339 19034 100 -} -a { - s 0 - b 18038 19030 - e 18045 19037 - l 18038 19030 18045 19037 100 -} -a { - s 0 - b 3004 19032 - e 3011 19039 - l 3004 19032 3011 19039 100 -} -a { - s 0 - b 11111 19033 - e 11118 19040 - l 11111 19033 11118 19040 100 -} -a { - s 0 - b 14224 19035 - e 14231 19042 - l 14224 19035 14231 19042 100 -} -a { - s 0 - b 18761 19038 - e 18768 19045 - l 18761 19038 18768 19045 100 -} -a { - s 0 - b 18762 19039 - e 18769 19046 - l 18762 19039 18769 19046 100 -} -a { - s 0 - b 3026 19050 - e 3033 19057 - l 3026 19050 3033 19057 100 -} -a { - s 0 - b 14919 19062 - e 14926 19069 - l 14919 19062 14926 19069 100 -} -a { - s 0 - b 4053 19065 - e 4060 19072 - l 4053 19065 4060 19072 100 -} -a { - s 0 - b 4054 19066 - e 4061 19073 - l 4054 19066 4061 19073 100 -} -a { - s 0 - b 220 19068 - e 227 19075 - l 220 19068 227 19075 100 -} -a { - s 0 - b 14850 19071 - e 14857 19078 - l 14850 19071 14857 19078 100 -} -a { - s 0 - b 14851 19072 - e 14858 19079 - l 14851 19072 14858 19079 100 -} -a { - s 0 - b 14852 19073 - e 14859 19080 - l 14852 19073 14859 19080 100 -} -a { - s 0 - b 14853 19074 - e 14860 19081 - l 14853 19074 14860 19081 100 -} -a { - s 0 - b 16023 19076 - e 16030 19083 - l 16023 19076 16030 19083 100 -} -a { - s 0 - b 16024 19077 - e 16031 19084 - l 16024 19077 16031 19084 100 -} -a { - s 0 - b 11164 19078 - e 11171 19085 - l 11164 19078 11171 19085 100 -} -a { - s 0 - b 11023 19078 - e 11030 19085 - l 11023 19078 11030 19085 100 -} -a { - s 0 - b 11165 19079 - e 11172 19086 - l 11165 19079 11172 19086 100 -} -a { - s 0 - b 6582 19079 - e 6589 19086 - l 6582 19079 6589 19086 
100 -} -a { - s 0 - b 2990 19126 - e 2997 19133 - l 2990 19126 2997 19133 100 -} -a { - s 0 - b 2991 19127 - e 2998 19134 - l 2991 19127 2998 19134 100 -} -a { - s 0 - b 3167 19660 - e 3174 19667 - l 3167 19660 3174 19667 100 -} -a { - s 0 - b 13478 19664 - e 13485 19671 - l 13478 19664 13485 19671 100 -} -a { - s 0 - b 13479 19665 - e 13486 19672 - l 13479 19665 13486 19672 100 -} -a { - s 0 - b 13480 19666 - e 13487 19673 - l 13480 19666 13487 19673 100 -} -a { - s 0 - b 13481 19667 - e 13488 19674 - l 13481 19667 13488 19674 100 -} -a { - s 0 - b 10898 19697 - e 10905 19704 - l 10898 19697 10905 19704 100 -} -a { - s 0 - b 10899 19698 - e 10906 19705 - l 10899 19698 10906 19705 100 -} -a { - s 0 - b 10900 19699 - e 10907 19706 - l 10900 19699 10907 19706 100 -} -a { - s 0 - b 3866 19699 - e 3873 19706 - l 3866 19699 3873 19706 100 -} -a { - s 0 - b 252 19700 - e 259 19707 - l 252 19700 259 19707 100 -} -a { - s 0 - b 253 19701 - e 260 19708 - l 253 19701 260 19708 100 -} -a { - s 0 - b 11492 19709 - e 11499 19716 - l 11492 19709 11499 19716 100 -} -a { - s 0 - b 8234 19712 - e 8241 19719 - l 8234 19712 8241 19719 100 -} -a { - s 0 - b 8862 19713 - e 8869 19720 - l 8862 19713 8869 19720 100 -} -a { - s 0 - b 8609 19722 - e 8616 19729 - l 8609 19722 8616 19729 100 -} -a { - s 0 - b 8610 19723 - e 8617 19730 - l 8610 19723 8617 19730 100 -} -a { - s 0 - b 4592 19723 - e 4599 19730 - l 4592 19723 4599 19730 100 -} -a { - s 0 - b 13495 19733 - e 13502 19740 - l 13495 19733 13502 19740 100 -} -a { - s 0 - b 18793 19743 - e 18800 19750 - l 18793 19743 18800 19750 100 -} -a { - s 0 - b 3066 19823 - e 3073 19830 - l 3066 19823 3073 19830 100 -} -a { - s 0 - b 11257 19825 - e 11264 19832 - l 11257 19825 11264 19832 100 -} -a { - s 0 - b 15984 19838 - e 15991 19845 - l 15984 19838 15991 19845 100 -} -a { - s 0 - b 15985 19839 - e 15992 19846 - l 15985 19839 15992 19846 100 -} -a { - s 0 - b 2988 19847 - e 2995 19854 - l 2988 19847 2995 19854 100 -} -a { - s 0 - b 17435 
19848 - e 17442 19855 - l 17435 19848 17442 19855 100 -} -a { - s 0 - b 7417 19853 - e 7424 19860 - l 7417 19853 7424 19860 100 -} -a { - s 0 - b 9020 19865 - e 9027 19872 - l 9020 19865 9027 19872 100 -} -a { - s 0 - b 6535 19865 - e 6542 19872 - l 6535 19865 6542 19872 100 -} -a { - s 0 - b 13472 19869 - e 13479 19876 - l 13472 19869 13479 19876 100 -} -a { - s 0 - b 10971 19872 - e 10978 19879 - l 10971 19872 10978 19879 100 -} -a { - s 0 - b 3034 19877 - e 3041 19884 - l 3034 19877 3041 19884 100 -} -a { - s 0 - b 3035 19878 - e 3042 19885 - l 3035 19878 3042 19885 100 -} -a { - s 0 - b 3036 19879 - e 3043 19886 - l 3036 19879 3043 19886 100 -} -a { - s 0 - b 14755 19892 - e 14762 19899 - l 14755 19892 14762 19899 100 -} -a { - s 0 - b 8255 19893 - e 8262 19900 - l 8255 19893 8262 19900 100 -} -a { - s 0 - b 9612 19894 - e 9619 19901 - l 9612 19894 9619 19901 100 -} -a { - s 0 - b 17568 19897 - e 17575 19904 - l 17568 19897 17575 19904 100 -} -a { - s 0 - b 18371 19912 - e 18378 19919 - l 18371 19912 18378 19919 100 -} -a { - s 0 - b 13956 19917 - e 13963 19924 - l 13956 19917 13963 19924 100 -} -a { - s 0 - b 9447 19921 - e 9454 19928 - l 9447 19921 9454 19928 100 -} -a { - s 0 - b 3178 19921 - e 3185 19928 - l 3178 19921 3185 19928 100 -} -a { - s 0 - b 3223 19954 - e 3230 19961 - l 3223 19954 3230 19961 100 -} -a { - s 0 - b 3878 19958 - e 3885 19965 - l 3878 19958 3885 19965 100 -} -a { - s 0 - b 8480 19962 - e 8487 19969 - l 8480 19962 8487 19969 100 -} -a { - s 0 - b 8368 19962 - e 8375 19969 - l 8368 19962 8375 19969 100 -} -a { - s 0 - b 8369 19963 - e 8376 19970 - l 8369 19963 8376 19970 100 -} -a { - s 0 - b 88 19968 - e 95 19975 - l 88 19968 95 19975 100 -} -a { - s 0 - b 14220 19969 - e 14227 19976 - l 14220 19969 14227 19976 100 -} -a { - s 0 - b 14221 19970 - e 14228 19977 - l 14221 19970 14228 19977 100 -} -a { - s 0 - b 9424 20606 - e 9431 20613 - l 9424 20606 9431 20613 100 -} -a { - s 0 - b 9425 20607 - e 9432 20614 - l 9425 20607 9432 20614 
100 -} -a { - s 0 - b 11092 20617 - e 11099 20624 - l 11092 20617 11099 20624 100 -} -a { - s 0 - b 8755 20617 - e 8762 20624 - l 8755 20617 8762 20624 100 -} -a { - s 0 - b 7699 20620 - e 7706 20627 - l 7699 20620 7706 20627 100 -} -a { - s 0 - b 14840 20626 - e 14847 20633 - l 14840 20626 14847 20633 100 -} -a { - s 0 - b 8468 20662 - e 8475 20669 - l 8468 20662 8475 20669 100 -} -a { - s 0 - b 3224 21557 - e 3231 21564 - l 3224 21557 3231 21564 100 -} -a { - s 0 - b 3225 21558 - e 3232 21565 - l 3225 21558 3232 21565 100 -} -a { - s 0 - b 3226 21559 - e 3233 21566 - l 3226 21559 3233 21566 100 -} -a { - s 0 - b 4875 21561 - e 4882 21568 - l 4875 21561 4882 21568 100 -} -a { - s 0 - b 15318 21576 - e 15325 21583 - l 15318 21576 15325 21583 100 -} -a { - s 0 - b 9483 21586 - e 9490 21593 - l 9483 21586 9490 21593 100 -} -a { - s 0 - b 17096 21589 - e 17103 21596 - l 17096 21589 17103 21596 100 -} -a { - s 0 - b 4524 21603 - e 4531 21610 - l 4524 21603 4531 21610 100 -} -a { - s 0 - b 3548 21608 - e 3555 21615 - l 3548 21608 3555 21615 100 -} -a { - s 0 - b 14617 21652 - e 14624 21659 - l 14617 21652 14624 21659 100 -} -a { - s 0 - b 14005 21693 - e 14012 21700 - l 14005 21693 14012 21700 100 -} -a { - s 0 - b 14655 21695 - e 14662 21702 - l 14655 21695 14662 21702 100 -} -a { - s 0 - b 13941 21707 - e 13948 21714 - l 13941 21707 13948 21714 100 -} -a { - s 0 - b 13942 21708 - e 13949 21715 - l 13942 21708 13949 21715 100 -} -a { - s 0 - b 17413 21728 - e 17420 21735 - l 17413 21728 17420 21735 100 -} -a { - s 0 - b 14303 21728 - e 14310 21735 - l 14303 21728 14310 21735 100 -} -a { - s 0 - b 14150 21729 - e 14157 21736 - l 14150 21729 14157 21736 100 -} -a { - s 0 - b 14159 21732 - e 14166 21739 - l 14159 21732 14166 21739 100 -} -a { - s 0 - b 3979 21746 - e 3986 21753 - l 3979 21746 3986 21753 100 -} -a { - s 0 - b 8961 21747 - e 8968 21754 - l 8961 21747 8968 21754 100 -} -a { - s 0 - b 11403 21748 - e 11410 21755 - l 11403 21748 11410 21755 100 -} -a { - s 0 - 
b 6377 21759 - e 6384 21766 - l 6377 21759 6384 21766 100 -} -a { - s 0 - b 10828 21762 - e 10835 21769 - l 10828 21762 10835 21769 100 -} -a { - s 0 - b 18355 21764 - e 18362 21771 - l 18355 21764 18362 21771 100 -} -a { - s 0 - b 6707 21764 - e 6714 21771 - l 6707 21764 6714 21771 100 -} -a { - s 0 - b 17402 21774 - e 17409 21781 - l 17402 21774 17409 21781 100 -} -a { - s 0 - b 8398 21784 - e 8405 21791 - l 8398 21784 8405 21791 100 -} -a { - s 0 - b 8176 21786 - e 8183 21793 - l 8176 21786 8183 21793 100 -} -a { - s 0 - b 9375 21793 - e 9382 21800 - l 9375 21793 9382 21800 100 -} -a { - s 0 - b 9376 21794 - e 9383 21801 - l 9376 21794 9383 21801 100 -} -a { - s 0 - b 17383 21891 - e 17390 21898 - l 17383 21891 17390 21898 100 -} -a { - s 0 - b 14843 21894 - e 14850 21901 - l 14843 21894 14850 21901 100 -} -a { - s 0 - b 7495 21894 - e 7502 21901 - l 7495 21894 7502 21901 100 -} -a { - s 0 - b 15363 21895 - e 15370 21902 - l 15363 21895 15370 21902 100 -} -a { - s 0 - b 7717 21898 - e 7724 21905 - l 7717 21898 7724 21905 100 -} -a { - s 0 - b 10928 21907 - e 10935 21914 - l 10928 21907 10935 21914 100 -} -a { - s 0 - b 3321 21907 - e 3328 21914 - l 3321 21907 3328 21914 100 -} -a { - s 0 - b 13883 21913 - e 13890 21920 - l 13883 21913 13890 21920 100 -} -a { - s 0 - b 9659 21916 - e 9666 21923 - l 9659 21916 9666 21923 100 -} -a { - s 0 - b 9660 21917 - e 9667 21924 - l 9660 21917 9667 21924 100 -} -a { - s 0 - b 10977 21929 - e 10984 21936 - l 10977 21929 10984 21936 100 -} -a { - s 0 - b 4833 21948 - e 4840 21955 - l 4833 21948 4840 21955 100 -} -a { - s 0 - b 7993 21982 - e 8000 21989 - l 7993 21982 8000 21989 100 -} -a { - s 0 - b 4846 21992 - e 4853 21999 - l 4846 21992 4853 21999 100 -} -a { - s 0 - b 14054 22002 - e 14061 22009 - l 14054 22002 14061 22009 100 -} -a { - s 0 - b 14055 22003 - e 14062 22010 - l 14055 22003 14062 22010 100 -} -a { - s 0 - b 5028 22004 - e 5035 22011 - l 5028 22004 5035 22011 100 -} -a { - s 0 - b 14914 22009 - e 14921 22016 - 
l 14914 22009 14921 22016 100 -} -a { - s 0 - b 9736 22025 - e 9743 22032 - l 9736 22025 9743 22032 100 -} -a { - s 0 - b 4042 22043 - e 4049 22050 - l 4042 22043 4049 22050 100 -} -a { - s 0 - b 18460 22047 - e 18467 22054 - l 18460 22047 18467 22054 100 -} -a { - s 0 - b 13450 22054 - e 13457 22061 - l 13450 22054 13457 22061 100 -} -a { - s 0 - b 6856 22054 - e 6863 22061 - l 6856 22054 6863 22061 100 -} -a { - s 0 - b 13451 22055 - e 13458 22062 - l 13451 22055 13458 22062 100 -} -a { - s 0 - b 14521 22059 - e 14528 22066 - l 14521 22059 14528 22066 100 -} -a { - s 0 - b 8888 22063 - e 8895 22070 - l 8888 22063 8895 22070 100 -} -a { - s 0 - b 17322 22065 - e 17329 22072 - l 17322 22065 17329 22072 100 -} -a { - s 0 - b 3351 22068 - e 3358 22075 - l 3351 22068 3358 22075 100 -} -a { - s 0 - b 7984 22074 - e 7991 22081 - l 7984 22074 7991 22081 100 -} -a { - s 0 - b 7766 22076 - e 7773 22083 - l 7766 22076 7773 22083 100 -} -a { - s 0 - b 14774 22077 - e 14781 22084 - l 14774 22077 14781 22084 100 -} -a { - s 0 - b 4353 22098 - e 4360 22105 - l 4353 22098 4360 22105 100 -} -a { - s 0 - b 6854 22101 - e 6861 22108 - l 6854 22101 6861 22108 100 -} -a { - s 0 - b 18583 22102 - e 18590 22109 - l 18583 22102 18590 22109 100 -} -a { - s 0 - b 18432 22102 - e 18439 22109 - l 18432 22102 18439 22109 100 -} -a { - s 0 - b 12322 22102 - e 12329 22109 - l 12322 22102 12329 22109 100 -} -a { - s 0 - b 18584 22103 - e 18591 22110 - l 18584 22103 18591 22110 100 -} -a { - s 0 - b 12323 22103 - e 12330 22110 - l 12323 22103 12330 22110 100 -} -a { - s 0 - b 5071 22103 - e 5078 22110 - l 5071 22103 5078 22110 100 -} -a { - s 0 - b 13948 22104 - e 13955 22111 - l 13948 22104 13955 22111 100 -} -a { - s 0 - b 7397 22106 - e 7404 22113 - l 7397 22106 7404 22113 100 -} -a { - s 0 - b 37 22117 - e 44 22124 - l 37 22117 44 22124 100 -} -a { - s 0 - b 38 22118 - e 45 22125 - l 38 22118 45 22125 100 -} -a { - s 0 - b 39 22119 - e 46 22126 - l 39 22119 46 22126 100 -} -a { - s 0 - b 
11564 22131 - e 11571 22138 - l 11564 22131 11571 22138 100 -} -a { - s 0 - b 5083 22131 - e 5090 22138 - l 5083 22131 5090 22138 100 -} -a { - s 0 - b 18722 22134 - e 18729 22141 - l 18722 22134 18729 22141 100 -} -a { - s 0 - b 215 22147 - e 222 22154 - l 215 22147 222 22154 100 -} -a { - s 0 - b 5628 22148 - e 5635 22155 - l 5628 22148 5635 22155 100 -} -a { - s 0 - b 5629 22149 - e 5636 22156 - l 5629 22149 5636 22156 100 -} -a { - s 0 - b 4538 22165 - e 4545 22172 - l 4538 22165 4545 22172 100 -} -a { - s 0 - b 9327 22167 - e 9334 22174 - l 9327 22167 9334 22174 100 -} -a { - s 0 - b 15181 22169 - e 15188 22176 - l 15181 22169 15188 22176 100 -} -a { - s 0 - b 11379 22173 - e 11386 22180 - l 11379 22173 11386 22180 100 -} -a { - s 0 - b 14092 22174 - e 14099 22181 - l 14092 22174 14099 22181 100 -} -a { - s 0 - b 11380 22174 - e 11387 22181 - l 11380 22174 11387 22181 100 -} -a { - s 0 - b 14093 22175 - e 14100 22182 - l 14093 22175 14100 22182 100 -} -a { - s 0 - b 11381 22175 - e 11388 22182 - l 11381 22175 11388 22182 100 -} -a { - s 0 - b 14259 22195 - e 14266 22202 - l 14259 22195 14266 22202 100 -} -a { - s 0 - b 14727 22204 - e 14734 22211 - l 14727 22204 14734 22211 100 -} -a { - s 0 - b 17486 22220 - e 17493 22227 - l 17486 22220 17493 22227 100 -} -a { - s 0 - b 17487 22221 - e 17494 22228 - l 17487 22221 17494 22228 100 -} -a { - s 0 - b 3935 22224 - e 3942 22231 - l 3935 22224 3942 22231 100 -} -a { - s 0 - b 13959 22225 - e 13966 22232 - l 13959 22225 13966 22232 100 -} -a { - s 0 - b 8672 22225 - e 8679 22232 - l 8672 22225 8679 22232 100 -} -a { - s 0 - b 13960 22226 - e 13967 22233 - l 13960 22226 13967 22233 100 -} -a { - s 0 - b 14820 22228 - e 14827 22235 - l 14820 22228 14827 22235 100 -} -a { - s 0 - b 9720 22239 - e 9727 22246 - l 9720 22239 9727 22246 100 -} -a { - s 0 - b 9721 22240 - e 9728 22247 - l 9721 22240 9728 22247 100 -} -a { - s 0 - b 4401 22246 - e 4408 22253 - l 4401 22246 4408 22253 100 -} -a { - s 0 - b 4402 22247 - e 4409 
22254 - l 4402 22247 4409 22254 100 -} -a { - s 0 - b 8731 22273 - e 8738 22280 - l 8731 22273 8738 22280 100 -} -a { - s 0 - b 4090 22273 - e 4097 22280 - l 4090 22273 4097 22280 100 -} -a { - s 0 - b 3565 22273 - e 3572 22280 - l 3565 22273 3572 22280 100 -} -a { - s 0 - b 4091 22274 - e 4098 22281 - l 4091 22274 4098 22281 100 -} -a { - s 0 - b 8052 22287 - e 8059 22294 - l 8052 22287 8059 22294 100 -} -a { - s 0 - b 8053 22288 - e 8060 22295 - l 8053 22288 8060 22295 100 -} -a { - s 0 - b 3240 22288 - e 3247 22295 - l 3240 22288 3247 22295 100 -} -a { - s 0 - b 3241 22289 - e 3248 22296 - l 3241 22289 3248 22296 100 -} -a { - s 0 - b 7721 22300 - e 7728 22307 - l 7721 22300 7728 22307 100 -} -a { - s 0 - b 18516 22309 - e 18523 22316 - l 18516 22309 18523 22316 100 -} -a { - s 0 - b 3260 22309 - e 3267 22316 - l 3260 22309 3267 22316 100 -} -a { - s 0 - b 15964 22320 - e 15971 22327 - l 15964 22320 15971 22327 100 -} -a { - s 0 - b 14353 22320 - e 14360 22327 - l 14353 22320 14360 22327 100 -} -a { - s 0 - b 14149 22324 - e 14156 22331 - l 14149 22324 14156 22331 100 -} -a { - s 0 - b 14396 22327 - e 14403 22334 - l 14396 22327 14403 22334 100 -} -a { - s 0 - b 301 22327 - e 308 22334 - l 301 22327 308 22334 100 -} -a { - s 0 - b 18464 22330 - e 18471 22337 - l 18464 22330 18471 22337 100 -} -a { - s 0 - b 2971 22348 - e 2978 22355 - l 2971 22348 2978 22355 100 -} -a { - s 0 - b 4457 22349 - e 4464 22356 - l 4457 22349 4464 22356 100 -} -a { - s 0 - b 17221 22351 - e 17228 22358 - l 17221 22351 17228 22358 100 -} -a { - s 0 - b 17222 22352 - e 17229 22359 - l 17222 22352 17229 22359 100 -} -a { - s 0 - b 15190 22352 - e 15197 22359 - l 15190 22352 15197 22359 100 -} -a { - s 0 - b 3608 22373 - e 3615 22380 - l 3608 22373 3615 22380 100 -} -a { - s 0 - b 3609 22374 - e 3616 22381 - l 3609 22374 3616 22381 100 -} -a { - s 0 - b 8422 22382 - e 8429 22389 - l 8422 22382 8429 22389 100 -} -a { - s 0 - b 16009 22383 - e 16016 22390 - l 16009 22383 16016 22390 100 -} 
-a { - s 0 - b 4884 22404 - e 4891 22411 - l 4884 22404 4891 22411 100 -} -a { - s 0 - b 4885 22405 - e 4892 22412 - l 4885 22405 4892 22412 100 -} -a { - s 0 - b 4886 22406 - e 4893 22413 - l 4886 22406 4893 22413 100 -} -a { - s 0 - b 15361 22407 - e 15368 22414 - l 15361 22407 15368 22414 100 -} -a { - s 0 - b 17172 22409 - e 17179 22416 - l 17172 22409 17179 22416 100 -} -a { - s 0 - b 17377 22413 - e 17384 22420 - l 17377 22413 17384 22420 100 -} -a { - s 0 - b 8601 22417 - e 8608 22424 - l 8601 22417 8608 22424 100 -} -a { - s 0 - b 8602 22418 - e 8609 22425 - l 8602 22418 8609 22425 100 -} -a { - s 0 - b 8603 22419 - e 8610 22426 - l 8603 22419 8610 22426 100 -} -a { - s 0 - b 17986 22427 - e 17993 22434 - l 17986 22427 17993 22434 100 -} -a { - s 0 - b 4563 22430 - e 4570 22437 - l 4563 22430 4570 22437 100 -} -a { - s 0 - b 8424 22435 - e 8431 22442 - l 8424 22435 8431 22442 100 -} -a { - s 0 - b 4385 22439 - e 4392 22446 - l 4385 22439 4392 22446 100 -} -a { - s 0 - b 4386 22440 - e 4393 22447 - l 4386 22440 4393 22447 100 -} -a { - s 0 - b 7717 22442 - e 7724 22449 - l 7717 22442 7724 22449 100 -} -a { - s 0 - b 14608 22503 - e 14615 22510 - l 14608 22503 14615 22510 100 -} -a { - s 0 - b 14374 22503 - e 14381 22510 - l 14374 22503 14381 22510 100 -} -a { - s 0 - b 7579 22504 - e 7586 22511 - l 7579 22504 7586 22511 100 -} -a { - s 0 - b 3097 22507 - e 3104 22514 - l 3097 22507 3104 22514 100 -} -a { - s 0 - b 3098 22508 - e 3105 22515 - l 3098 22508 3105 22515 100 -} -a { - s 0 - b 9740 22510 - e 9747 22517 - l 9740 22510 9747 22517 100 -} -a { - s 0 - b 11289 22513 - e 11296 22520 - l 11289 22513 11296 22520 100 -} -a { - s 0 - b 11431 22516 - e 11438 22523 - l 11431 22516 11438 22523 100 -} -a { - s 0 - b 8318 22527 - e 8325 22534 - l 8318 22527 8325 22534 100 -} -a { - s 0 - b 3398 22538 - e 3405 22545 - l 3398 22538 3405 22545 100 -} -a { - s 0 - b 4626 22542 - e 4633 22549 - l 4626 22542 4633 22549 100 -} -a { - s 0 - b 11555 22547 - e 11562 22554 
- l 11555 22547 11562 22554 100 -} -a { - s 0 - b 3707 22547 - e 3714 22554 - l 3707 22547 3714 22554 100 -} -a { - s 0 - b 11556 22548 - e 11563 22555 - l 11556 22548 11563 22555 100 -} -a { - s 0 - b 11557 22549 - e 11564 22556 - l 11557 22549 11564 22556 100 -} -a { - s 0 - b 4797 22553 - e 4804 22560 - l 4797 22553 4804 22560 100 -} -a { - s 0 - b 16067 22555 - e 16074 22562 - l 16067 22555 16074 22562 100 -} -a { - s 0 - b 14457 22555 - e 14464 22562 - l 14457 22555 14464 22562 100 -} -a { - s 0 - b 9680 22587 - e 9687 22594 - l 9680 22587 9687 22594 100 -} -a { - s 0 - b 9681 22588 - e 9688 22595 - l 9681 22588 9688 22595 100 -} -a { - s 0 - b 7708 22590 - e 7715 22597 - l 7708 22590 7715 22597 100 -} -a { - s 0 - b 5610 22591 - e 5617 22598 - l 5610 22591 5617 22598 100 -} -a { - s 0 - b 11148 22593 - e 11155 22600 - l 11148 22593 11155 22600 100 -} -a { - s 0 - b 18098 22595 - e 18105 22602 - l 18098 22595 18105 22602 100 -} -a { - s 0 - b 12600 22599 - e 12607 22606 - l 12600 22599 12607 22606 100 -} -a { - s 0 - b 9603 22600 - e 9610 22607 - l 9603 22600 9610 22607 100 -} -a { - s 0 - b 7977 22620 - e 7984 22627 - l 7977 22620 7984 22627 100 -} -a { - s 0 - b 15354 22635 - e 15361 22642 - l 15354 22635 15361 22642 100 -} -a { - s 0 - b 72 22640 - e 79 22647 - l 72 22640 79 22647 100 -} -a { - s 0 - b 3036 22642 - e 3043 22649 - l 3036 22642 3043 22649 100 -} -a { - s 0 - b 7421 22645 - e 7428 22652 - l 7421 22645 7428 22652 100 -} -a { - s 0 - b 11050 22665 - e 11057 22672 - l 11050 22665 11057 22672 100 -} -a { - s 0 - b 11051 22666 - e 11058 22673 - l 11051 22666 11058 22673 100 -} -a { - s 0 - b 11052 22667 - e 11059 22674 - l 11052 22667 11059 22674 100 -} -a { - s 0 - b 4244 22669 - e 4251 22676 - l 4244 22669 4251 22676 100 -} -a { - s 0 - b 7476 22670 - e 7483 22677 - l 7476 22670 7483 22677 100 -} -a { - s 0 - b 3080 22671 - e 3087 22678 - l 3080 22671 3087 22678 100 -} -a { - s 0 - b 93 22671 - e 100 22678 - l 93 22671 100 22678 100 -} -a { - s 0 
- b 7445 22676 - e 7452 22683 - l 7445 22676 7452 22683 100 -} -a { - s 0 - b 7446 22677 - e 7453 22684 - l 7446 22677 7453 22684 100 -} -a { - s 0 - b 3182 22722 - e 3189 22729 - l 3182 22722 3189 22729 100 -} -a { - s 0 - b 4793 22725 - e 4800 22732 - l 4793 22725 4800 22732 100 -} -a { - s 0 - b 8710 22726 - e 8717 22733 - l 8710 22726 8717 22733 100 -} -a { - s 0 - b 8711 22727 - e 8718 22734 - l 8711 22727 8718 22734 100 -} -a { - s 0 - b 8712 22728 - e 8719 22735 - l 8712 22728 8719 22735 100 -} -a { - s 0 - b 17384 22729 - e 17391 22736 - l 17384 22729 17391 22736 100 -} -a { - s 0 - b 6680 22747 - e 6687 22754 - l 6680 22747 6687 22754 100 -} -a { - s 0 - b 13473 22749 - e 13480 22756 - l 13473 22749 13480 22756 100 -} -a { - s 0 - b 11388 22749 - e 11395 22756 - l 11388 22749 11395 22756 100 -} -a { - s 0 - b 13474 22750 - e 13481 22757 - l 13474 22750 13481 22757 100 -} -a { - s 0 - b 12241 22753 - e 12248 22760 - l 12241 22753 12248 22760 100 -} -a { - s 0 - b 135 22754 - e 142 22761 - l 135 22754 142 22761 100 -} -a { - s 0 - b 11298 22756 - e 11305 22763 - l 11298 22756 11305 22763 100 -} -a { - s 0 - b 3567 22781 - e 3574 22788 - l 3567 22781 3574 22788 100 -} -a { - s 0 - b 4941 22799 - e 4948 22806 - l 4941 22799 4948 22806 100 -} -a { - s 0 - b 6674 22810 - e 6681 22817 - l 6674 22810 6681 22817 100 -} -a { - s 0 - b 6367 22811 - e 6374 22818 - l 6367 22811 6374 22818 100 -} -a { - s 0 - b 17095 22817 - e 17102 22824 - l 17095 22817 17102 22824 100 -} -a { - s 0 - b 11074 22820 - e 11081 22827 - l 11074 22820 11081 22827 100 -} -a { - s 0 - b 16042 22832 - e 16049 22839 - l 16042 22832 16049 22839 100 -} -a { - s 0 - b 16043 22833 - e 16050 22840 - l 16043 22833 16050 22840 100 -} -a { - s 0 - b 6670 22837 - e 6677 22844 - l 6670 22837 6677 22844 100 -} -a { - s 0 - b 6671 22838 - e 6678 22845 - l 6671 22838 6678 22845 100 -} -a { - s 0 - b 11432 22839 - e 11439 22846 - l 11432 22839 11439 22846 100 -} -a { - s 0 - b 3893 22845 - e 3900 22852 - l 
3893 22845 3900 22852 100 -} -a { - s 0 - b 8664 22864 - e 8671 22871 - l 8664 22864 8671 22871 100 -} -a { - s 0 - b 14481 22865 - e 14488 22872 - l 14481 22865 14488 22872 100 -} -a { - s 0 - b 14482 22866 - e 14489 22873 - l 14482 22866 14489 22873 100 -} -a { - s 0 - b 176 22866 - e 183 22873 - l 176 22866 183 22873 100 -} -a { - s 0 - b 8531 22906 - e 8538 22913 - l 8531 22906 8538 22913 100 -} -m { - n 0 -} -#:eof diff --git a/programs/lastz/tools/any_to_qdna.py b/programs/lastz/tools/any_to_qdna.py deleted file mode 100755 index 23acc4b..0000000 --- a/programs/lastz/tools/any_to_qdna.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -""" -Convert any file to a LASTZ quantum dna file, just by appending qdna headers - -Qdna file format is shown below (omitting "named properties", which we don't -use). We simply create all the headers and copy the file as the "data -sequence". - - offset 0x00: C4 B4 71 97 big endian magic number (97 71 B4 C4 => little endian) - offset 0x04: 00 00 02 00 version 2.0 (fourth byte is sub version) - offset 0x08: 00 00 00 14 header length (in bytes, including this field) - offset 0x0C: xx xx xx xx S, offset (from file start) to data sequence - offset 0x10: xx xx xx xx N, offset to name, 0 indicates no name - offset 0x14: xx xx xx xx length of data sequence (counted in 'items') - offset 0x18: 00 00 00 00 (offset to named properties, not used) - offset N: ... name (zero-terminated string) - offset S: ... data sequence - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -from sys import argv,stdin,stdout,exit - - -def usage(s=None): - message = """any_to_qdna [options] < any_file > qdna_file - Convert any file to a LASTZ quantum dna file. 
- - options: - --name= the name of the sequence - (by default, the sequence is unnamed) - --striplinebreaks strip line breaks from the file - (default is to include line breaks in the qdna file) - --simple create an "old-style" qdna file - (default is to create a version 2 qda file)""" - - if (s == None): exit (message) - else: exit ("%s\n%s" % (s,message)) - - -def main(): - - qdnaOldMagic = 0xF656659EL # big endian magic number for older qdna files - qdnaMagic = 0xC4B47197L # big endian magic number for qdna files - qdnaVersion = 0x00000200L - - # parse args - - name = None - strip = False - simple = False - - for arg in argv[1:]: - if (arg.startswith("--name=")): - name = arg.split("=",1)[1] - elif (arg == "--striplinebreaks") or (arg == "--strip"): - strip = True - elif (arg == "--simple") or (arg == "--old"): - simple = True - elif (arg.startswith("--")): - usage("can't understand %s" % arg) - else: - usage("can't understand %s" % arg) - - if (simple) and (name != None): - uaseg("simple qdna file cannot carry a sequence name") - - # === read the input file === - - seq = [] - for line in stdin: - if (strip): line = line.rstrip() - seq += [line] - seq = "".join(seq) - - # === write the qdna file === - - if (not simple): - headerLen = 20 - if (name == None): - nameOffset = 0 - seqOffset = headerLen + 8; - else: - nameOffset = headerLen + 8; - seqOffset = nameOffset + len(name) + 1 - - # prepend magic number - - if (simple): write_4(stdout,qdnaOldMagic) - else: write_4(stdout,qdnaMagic) - - # write the rest of the header - - if (not simple): - write_4(stdout,qdnaVersion) - write_4(stdout,headerLen) - write_4(stdout,seqOffset) - write_4(stdout,nameOffset) - write_4(stdout,len(seq)) - write_4(stdout,0) - - if (name != None): - stdout.write(name) - stdout.write(chr(0)) - - # write the sequence - - stdout.write(seq) - - -def write_4(f,val): - f.write (chr((val >> 24) & 0xFF)) - f.write (chr((val >> 16) & 0xFF)) - f.write (chr((val >> 8) & 0xFF)) - f.write (chr( val & 
0xFF)) - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/build_fasta_hsx.py b/programs/lastz/tools/build_fasta_hsx.py deleted file mode 100755 index 1ad9f4b..0000000 --- a/programs/lastz/tools/build_fasta_hsx.py +++ /dev/null @@ -1,501 +0,0 @@ -#!/usr/bin/env python -""" -Build a "hashed sequence index" (hsx) file for a fasta file ------------------------------------------------------------ - -(see the header of hsx_file.py for file format details) - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys -from math import ceil -from hsx_file import HsxFile - - -def usage(s=None): - message = """ -build_fasta_hsx [options] [fasta_file ...] > hsx_file - (if no fasta_files are present, we read fasta from stdin) - --bucketsize= set the average hash bucket size - --numbuckets= set the number of hash buckets (overrides avg size) - --anonymous don't copy fasta_file name into the index - --secondary use secondary hash in file instead of sequence names - --skipheader point to sequence data rather than header - --windows the fasta file has two-byte line feeds (this is what - microsoft windows uses) - --bigendian write fields as big endian (default is little endian) - --oddbuckets force the number of hash buckets to be odd - --keepempties don't discard empty sequences - --progress[=] print progress reports on stderr -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - global write4,write5,write6 - - ########## - # parse the command line - ########## - - avgBucket = 10 - numBuckets = None - anonymous = False - doSecondary = False - skipHeader = False - isWindows = False - fileNames = [] - bigEndian = False - oddBuckets = False - keepEmpties = False - screwup = [] # (so that we can verify that validation works!) 
- debug = [] - progress = None - - args = sys.argv[1:] - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--bucketsize") and (val != None): - try: - avgBucket = int(val) - if (avgBucket < 1): raise ValueError - except ValueError: - assert (False), "invalid bucket size: %s" % val - elif (arg == "--numbuckets") and (val != None): - try: - numBuckets = int(val) - if (numBuckets < 1): raise ValueError - except ValueError: - assert (False), "invalid number of buckets: %s" % val - elif (arg == "--secondary") and (val == None): - doSecondary = True - assert (False), "secondary hash is not implemented yet (sorry)" - elif (arg == "--anonymous") and (val == None): - anonymous = True - elif (arg == "--skipheader") and (val == None): - skipHeader = True - elif (arg == "--windows") and (val == None): - isWindows = True - elif (arg == "--bigendian") and (val == None): - bigEndian = True - elif (arg == "--oddbuckets") and (val == None): - oddBuckets = True - elif (arg == "--keepempties") and (val == None): - keepEmpties = True - elif (arg == "--screwup") and (val != None): - screwup += [val] - elif (arg == "--debug") and (val == None): - debug += ["debug"] - elif (arg == "--debug") and (val != None): - debug += [val] - elif (arg == "--progress") and (val == None): - debug += ["progress"] - progress = None - elif (arg == "--progress") and (val != None): - debug += ["progress"] - progress = int(val) - elif (arg.startswith("--")): - usage("unknown argument: %s" % arg) - elif (val == None): - fileNames += [arg] - else: - usage("unknown argument: %s" % arg) - - # sanity check on file name - - if (fileNames != []): - for fileName in fileNames: - try: - slash = fileName.rfind("/") - dot = fileName.rfind(".") - if (dot < 0): raise ValueError - if (dot < slash): raise ValueError - if (fileName[dot:] not in 
[".fa",".fasta"]): raise ValueError - except ValueError: - assert (False), \ - "bad fasta file name (it has to end with .fa or .fasta)" \ - % fileName - - if (anonymous) and (len(fileNames) > 1): - assert (False), "can't use anonymous when you have multiple fasta files" - - assert (len(fileNames) <= 255), "too many input files (max is 255)" - - # set up big- or little-endian - - if (bigEndian): - write4 = write4_big_endian - write5 = write5_big_endian - write6 = write6_big_endian - else: - write4 = write4_little_endian - write5 = write5_little_endian - write6 = write6_little_endian - - ########## - # read the fasta file(s) - ########## - - fileNameToNum = {} - - # read the fasta file(s), collecting names, etc. - - if (fileNames == []): - fileNames += [""] - - sequences = [] - nameSeen = {} - - for (fileNum,fileName) in enumerate(fileNames): - assert (fileName not in fileNameToNum), \ - "can't use the same file twice (%s)" % fileName - fileNameToNum[fileName] = fileNum - - if (fileName == ""): - f = sys.stdin - else: - try: - f = file(fileName,"rt") - except IOError: - assert (False), "unable to open %s" % fileName - - seqNum = 0 - for seqInfo in fasta_sequences(f,twoByteLFs=isWindows): - (name,length,lineNum,headerOffset,seqOffset) = seqInfo - seqNum += 1 - - assert (name not in nameSeen), \ - "%s is used for two sequences (at %s and %s)" \ - % (name, - line_reference(nameSeen[name]), - line_reference((fileName,lineNum))) - nameSeen[name] = (fileName,lineNum) - - if (length == 0): - if (keepEmpties): - print >>sys.stderr, "WARNING: keeping empty sequence %s (%s)" \ - % (name,line_reference((fileName,lineNum))) - else: - print >>sys.stderr, "WARNING: discarding empty sequence %s (%s)" \ - % (name,line_reference((fileName,lineNum))) - continue - - if (skipHeader): sequences += [(name,length,fileNum,seqOffset)] - else: sequences += [(name,length,fileNum,headerOffset)] - - if ("progress" in debug) and (progress != None) and (seqNum % progress == 0): - print 
>>sys.stderr, "read sequence %d (%s)" % (seqNum,name) - - if (fileName != ""): f.close() - - if ("progress" in debug): - if (fileName != ""): - print >>sys.stderr, "finished reading %s" % fileName - else: - print >>sys.stderr, "finished reading input file" - - # scan collected sequence info and assign hash values - - numSequences = len(sequences) - assert (numSequences > 0), "input file contains no sequences!" - if (numBuckets == None): - numBuckets = int(ceil(numSequences / avgBucket)) - if (oddBuckets) and (numBuckets % 1 == 0): - numBuckets += 1 - - sequences = [(HsxFile.hash(name) % numBuckets,name,length,fileNum,offset) \ - for (name,length,fileNum,offset) in sequences] - sequences.sort() - - if ("progress" in debug): - print >>sys.stderr, "finished computing hashes" - - if ("info" in debug): - for (hash,name,length,fileNum,offset) in sequences: - print >>sys.stderr, "%10d==%08X %2d:%08X %s %d" \ - % (HsxFile.hash(name),hash,fileNum,offset,name,length) - - ########## - # write the index - ########## - - # decide how we will write the file names - - fileNumToOffset = {} - fileNumToFastaName = {} - fileNumToFastaExt = {} - fileInfoLength = 0 - - for fileName in fileNames: - fileNum = fileNameToNum[fileName] - - fastaName = "" - fastaExt = "fa" - if (fileName != ""): - dot = fileName.rfind(".") - fastaExt = fileName[dot+1:] - if (not anonymous): - fastaName = fileName[:dot] - - fileNumToOffset [fileNum] = fileInfoLength - fileNumToFastaName[fileNum] = fastaName - fileNumToFastaExt [fileNum] = fastaExt - fileInfoLength += len(fastaExt)+1 + len(fastaName)+1 - - # determine header and table sizes - - headerLength = 0x1C - headerPad = pad_for_16(8+headerLength) - headerSize = headerLength + headerPad - - numFiles = len(fileNames) - fileTableOffset = 0x08 + headerSize - fileTableLength = numFiles * 4 - fileTablePad = pad_for_16(fileTableLength) - fileTableSize = fileTableLength + fileTablePad - - fileInfoOffset = fileTableOffset + fileTableSize - fileInfoPad = 
pad_for_16(fileInfoLength) - fileInfoSize = fileInfoLength + fileInfoPad - - hashTableOffset = fileInfoOffset + fileInfoSize - hashTableLength = (numBuckets+1) * 5 - hashTablePad = pad_for_16(hashTableLength) - if ("hashpad" in screwup): hashTablePad = -1 - hashTableSize = hashTableLength + hashTablePad - - seqTableOffset = hashTableOffset + hashTableSize - - if ("file" in debug): - print >>sys.stderr, "fileTableOffset = %08X (%08X)" % (fileTableOffset,fileTableSize) - print >>sys.stderr, "fileInfoOffset = %08X (%08X)" % (fileInfoOffset,fileInfoSize) - print >>sys.stderr, "hashTableOffset = %08X (%08X)" % (hashTableOffset,hashTableSize) - print >>sys.stderr, "seqTableOffset = %08X" % seqTableOffset - - # determine offsets into the sequence table - - nameToOffset = {} - - prevHash = None - for (hash,name,length,fileNum,offset) in sequences: - if (hash == prevHash): continue - nameToOffset[name] = True - - seqOffset = seqTableOffset - for (hash,name,length,fileNum,offset) in sequences: - if (name in nameToOffset): - nameToOffset[name] = seqOffset - seqOffset += 12 + len(name) + 1 - nameToOffset[""] = seqOffset - - # write header - - write4(HsxFile.magicBig) - write4(HsxFile.version) - - write4(headerLength) - write4(numFiles) - write4(fileTableOffset) - write4(numBuckets) - write4(hashTableOffset) - write4(numSequences) - write4(seqTableOffset) - writeZeros(headerPad) - - if ("progress" in debug): - print >>sys.stderr, "finished writing header" - - # write file table and file info - - for fileName in fileNames: - fileNum = fileNameToNum[fileName] - write4(fileInfoOffset + fileNumToOffset[fileNum]) - writeZeros(fileTablePad) - - for fileName in fileNames: - fileNum = fileNameToNum[fileName] - writeString(fileNumToFastaExt [fileNum]) - writeString(fileNumToFastaName[fileNum]) - writeZeros(fileInfoPad) - - if ("progress" in debug): - print >>sys.stderr, "finished writing file table" - - # write hash table - - msBit5 = 0x80 << (4*8) - - prevHash = None - - for 
(hash,name,length,fileNum,offset) in sequences: - if (hash == prevHash): - bucketSize += 1 - continue - - if (prevHash != None): - # output previous bucket - write5(seqOffset) - if ("progress" in debug) and (progress != None) and ((hash+1) % progress == 0): - print >>sys.stderr, "wrote hash bucket %d" % (hash+1) - # output intervening empty buckets - prevHash += 1 - while (prevHash < hash): - write5(msBit5 + nameToOffset[name]) - prevHash += 1 - if ("progress" in debug) and (progress != None) and (prevHash % progress == 0): - print >>sys.stderr, "wrote hash bucket %d" % (prevHash) - - bucketSize = 1 - seqOffset = nameToOffset[name] - prevHash = hash - - # output previous bucket - write5(seqOffset) - seqOffset = nameToOffset[""] # offset past end of sequence index table - # output intervening empty buckets - prevHash += 1 - while (prevHash < numBuckets): - write5(msBit5 + seqOffset) - prevHash += 1 - # output extra bucket - write5(msBit5 + seqOffset) - - writeZeros(hashTablePad) - - if ("progress" in debug): - print >>sys.stderr, "finished writing hash table" - - # write sequence table - - for (seqNum,(hash,name,length,fileNum,offset)) in enumerate(sequences): - write5(length) # length of the sequence - write1(fileNum) # file number (index into file table) - write6(offset) # offset to the sequence data - writeString(name) # name of sequence - if ("progress" in debug) and (progress != None) and ((seqNum+1) % progress == 0): - print >>sys.stderr, "wrote sequence entry %d" % (seqNum+1) - - if ("progress" in debug): - print >>sys.stderr, "finished writing index" - - -def pad_for_16(n): - return (16 - (n % 16)) % 16 - -def write1(val): - sys.stdout.write(chr(val & 0xFF)) - -def writeString(s): - assert (len(s) <= 255) - sys.stdout.write(chr(len(s))) - sys.stdout.write(s) - -def writeZeros(n): - for i in range(n): sys.stdout.write(chr(0)) - -def write4_little_endian(val): - sys.stdout.write(chr( val & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - 
sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 24) & 0xFF)) - -def write5_little_endian(val): - sys.stdout.write(chr( val & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 24) & 0xFF)) - sys.stdout.write(chr((val >> 32) & 0xFF)) - -def write6_little_endian(val): - sys.stdout.write(chr( val & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 24) & 0xFF)) - sys.stdout.write(chr((val >> 32) & 0xFF)) - sys.stdout.write(chr((val >> 40) & 0xFF)) - -def write4_big_endian(val): - sys.stdout.write(chr((val >> 24) & 0xFF)) - sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - sys.stdout.write(chr( val & 0xFF)) - -def write5_big_endian(val): - sys.stdout.write(chr((val >> 32) & 0xFF)) - sys.stdout.write(chr((val >> 24) & 0xFF)) - sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - sys.stdout.write(chr( val & 0xFF)) - -def write6_big_endian(val): - sys.stdout.write(chr((val >> 40) & 0xFF)) - sys.stdout.write(chr((val >> 32) & 0xFF)) - sys.stdout.write(chr((val >> 24) & 0xFF)) - sys.stdout.write(chr((val >> 16) & 0xFF)) - sys.stdout.write(chr((val >> 8) & 0xFF)) - sys.stdout.write(chr( val & 0xFF)) - -# fasta_sequences-- -# Read the fasta sequences from a file - -def fasta_sequences(f,nameParse=None,twoByteLFs=False): - - lineNum = 0 - fileOffset = 0 - seqName = None - seqLength = 0 - - for line in f: - lineNum += 1 - lineOffset = fileOffset - fileOffset += len(line) - if (twoByteLFs): fileOffset += 1 - line = line.strip() - - if (line.startswith(">")): - if (seqName != None): - if (seqOffset == None): seqOffset = lineOffset - yield (seqName,seqLength,seqLine,headerOffset,seqOffset) - seqLine = lineNum - headerOffset = lineOffset - seqName = sequence_name(line) - seqLength = 0 - seqOffset = None - elif (seqName == None): - assert 
(False), "first sequence has no header" - else: - if (seqOffset == None): seqOffset = lineOffset - seqLength += len(line) - - if (seqName != None): - if (seqOffset == None): seqOffset = fileOffset - yield (seqName,seqLength,seqLine,headerOffset,seqOffset) - - -# sequence_name-- -# Extract the sequence name from a fasta header. -# $$$ this needs to use nameParse - -def sequence_name(s,nameParse=None): - s = s[1:].strip() - if (s == ""): return "" - else: return s.split()[0] - - -def line_reference((fileName,lineNum)): - if (fileName == ""): return "line %d" % lineNum - else: return "line %s:%d" % (fileName,lineNum) - - -if __name__ == "__main__": main() - diff --git a/programs/lastz/tools/create_scores_file.control b/programs/lastz/tools/create_scores_file.control deleted file mode 100644 index 80d03df..0000000 --- a/programs/lastz/tools/create_scores_file.control +++ /dev/null @@ -1,14 +0,0 @@ -# base inference on alignments in the middle 50 percentile -# by percent-identity -min_identity = 25.0% # 25th percentile -max_identity = 75.0% # 75th percentile - -# scale scores so max substitution will be 100 and only use -# alignments scoring as well as 20 substitutions -inference_scale = 100 # score for max substitution -hsp_threshold = 20*inference_scale -gapped_threshold = hsp_threshold - -# allow substitution score inference to iterate at most -# 20 times; don't perform gap score inference -max_sub_iterations = 20 diff --git a/programs/lastz/tools/create_scores_file.sh b/programs/lastz/tools/create_scores_file.sh deleted file mode 100755 index 33f3c7e..0000000 --- a/programs/lastz/tools/create_scores_file.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# -# create_scores_ path_to_encode_directories comparison_species - -ENCODE="$1" -REGION="ENm010" -REFSPECIES="human" -SECSPECIES=$2 - -THISDIR=`dirname $0` - -lastz_D --inferonly=${THISDIR}/create_scores_file.control \ - ${ENCODE}/${REGION}/${REFSPECIES}.${REGION}.fa \ - ${ENCODE}/${REGION}/${SECSPECIES}.${REGION}.fa 
\ - | ${THISDIR}/expand_scores_file.py --overridegaps - diff --git a/programs/lastz/tools/expand_scores_file.py b/programs/lastz/tools/expand_scores_file.py deleted file mode 100755 index 427fe60..0000000 --- a/programs/lastz/tools/expand_scores_file.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python -""" -Add scoring-related parameters to a lastz scores file ------------------------------------------------------ - -:Author: Bob Harris (rsharris@bx.psu.edu) - -Typical input scores file: - - # (a LASTZ scoring set, created by "LASTZ --infer") - - bad_score = X:-1910 # used for sub[X][*] and sub[*][X] - fill_score = -191 # used when sub[*][*] not otherwise defined - gap_open_penalty = 400 - gap_extend_penalty = 30 - - A C G T - A 85 -164 -70 -191 - C -164 100 -151 -70 - G -70 -151 100 -164 - T -191 -70 -164 85 -""" - -import sys - -def usage(s=None): - message = """ -expand_scores_file [options]< scores_file > scores_file - --overridegaps ignore gap scores already set -""" - - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - overrideGaps = False - - for arg in sys.argv[1:]: - if (arg == "--overridegaps"): - overrideGaps = True - continue - raise "unrecognized argument: %s" % arg - - # read the scores file - - lines = [] - numValueLines = None - valuesFinished = False - nameToVal = {} - subs = subRows = subColumns = None - - lineNumber = 0 - for line in sys.stdin: - lineNumber += 1 - line = line.rstrip() - lines += [line] - if (line == ""): continue - if (line.startswith("#")): continue - if ("#" in line): line = line.split("#",1)[0].strip() - - if ("=" in line): - if (valuesFinished): - raise "in scores file, unexpected assignment (line %d): %s" \ - % (lineNumber,line) - fields = line.split("=",1) - name = fields[0].strip() - val = fields[1].strip() - if (name == "gap_open_penalty"): name = "O" - elif (name == "gap_extend_penalty"): name = "E" - if (name in nameToVal): - raise "in scores file, %s is assigned 
twice (line %d): %s" \ - % (name,lineNumber,line) - if (overrideGaps): - if (name in ["O","E"]): - lines.pop() - continue - try: - nameToVal[name] = int_or_float(fields[1]) - except: - if (name in ["O","E"]): - raise "in scores file, bad assignment value (line %d): %s" \ - % (lineNumber,line) - elif (not valuesFinished): - numValueLines = len(lines) - 1 - valuesFinished = True - subColumns = line.split() - subRows = [] - subs = {} - else: - fields = line.split() - rowCh = fields.pop(0) - subRows += [rowCh] - if (len(fields) != len(subColumns)): - raise "in scores file, inconsistent matrix (line %d): %s" \ - % (lineNumber,line) - for ix in range(len(fields)): - colCh = subColumns[ix] - subs[rowCh+colCh] = int_or_float(fields[ix]) - - if (subs == None): - raise "scores file is missing a matrix" - - if ("AA" not in subs): - raise "scores file lacks A-to-A score" - - # compute a few values from the scores matrix - - bestSub = float(max([subs[digram] for digram in subs])) - worstSub = float(min([subs[digram] for digram in subs])) - aaSub = float(subs["AA"]) - - # add expanded values - - knownVals = [name for name in nameToVal] - - if ("O" not in nameToVal): - nameToVal["O"] = -int(3.25 * worstSub) - - if ("E" not in nameToVal): - nameToVal["E"] = -int(0.25 * worstSub) - - if ("X" not in nameToVal): - nameToVal["X"] = int(10 * aaSub) - - if ("Y" not in nameToVal): - nameToVal["Y"] = int(nameToVal["O"] + 100*nameToVal["E"]) - - if ("K" not in nameToVal): - nameToVal["K"] = int(30 * bestSub) - - if ("L" not in nameToVal): - nameToVal["L"] = int(30 * bestSub) - - if ("T" not in nameToVal) and (worstSub/bestSub < -1.5): - nameToVal["T"] = "2" - - if ("Z" not in nameToVal) and (worstSub/bestSub < -3.0): - nameToVal["Z"] = "3" - - # figure out what values we've added, and in what order to print them - - addedNames = [name for name in ["T","Z","O","E","X","Y","K","L"] \ - if (name in nameToVal) \ - and (name not in knownVals)] - addedNames += [name for name in nameToVal \ - if 
(name not in addedNames) \ - and (name not in knownVals)] - - # print the new scores file - - blankLine = False - - for ix in range(numValueLines): - print lines[ix] - blankLine = (lines[ix] == "") - - if (addedNames != []): - if (not blankLine): print "" - print "# (score parameters added by expand_scores_file)" - print "" - - for name in addedNames: - print "%s=%s" % (name,nameToVal[name]) - - blankLine = (lines[numValueLines] == "") - if (not blankLine): print "" - - for ix in range(numValueLines,len(lines)): - print lines[ix] - - -def int_or_float(s): - try: return int(s) - except: return float(s) - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/fasta_fragments.py b/programs/lastz/tools/fasta_fragments.py deleted file mode 100755 index f1690f0..0000000 --- a/programs/lastz/tools/fasta_fragments.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python -""" -Break a fasta file into fragments. - -$$$ todo: spread out the fragment starts so that the last fragment ends at the -$$$ .. end of a sequence, if possible - -$$$ todo: find runs of N and reset the fragment start position to skip past -$$$ .. such runs - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -from sys import argv,stdin,stderr,exit -from random import seed as random_seed,shuffle - - -def usage(s=None): - message = """fasta_fragments [options] < fasta_file > fasta_file - Split a fasta file into overlapping fragments. 
- - options: - --fragment= length of each fragment - (default is 100) - --step= distance between the start of each fragment - (default is 50) - --shuffle[=] randomly shuffle the order that fragments are output; - this can be very memory intensive, as all fragments - are collected in a list before any are output - (by default, fragments are output in sequence order) - --origin=one output positions are origin-one - (surprisingly, this is the default) - --origin=zero output positions are origin-zero - --head= limit the number of fragments emitted""" - - if (s == None): exit (message) - else: exit ("%s\n%s" % (s,message)) - - -def main(): - - fragmentLength = 100 - stepLength = 50 - shuffleEm = False - origin = "one" - headLimit = None - - for arg in argv[1:]: - if ("=" in arg): - argVal = arg.split("=",1)[1] - - if (arg.startswith("--fragment=")): - fragmentLength = int_with_unit(argVal) - elif (arg.startswith("--step=")): - stepLength = int_with_unit(argVal) - elif (arg == "--shuffle"): - shuffleEm = True - elif (arg.startswith("--shuffle=")): - shuffleEm = True - random_seed(argVal) - elif (arg.startswith("--origin=")): - origin = argVal - if (origin == "0"): origin = "zero" - if (origin == "1"): origin = "one" - assert (origin in ["zero","one"]), "can't understand %s" % arg - elif (arg.startswith("--head=")): - headLimit = int_with_unit(argVal) - elif (arg.startswith("--")): - usage("can't understand %s" % arg) - else: - usage("can't understand %s" % arg) - - allN = "N" * fragmentLength - - # process the sequences - - if (shuffleEm): - fragments = [] - - fragNum = 0 - for (name,seq) in fasta_sequences(stdin): - if (headLimit != None) and (fragNum > headLimit): break - - seq = seq.upper() - for ix in xrange(0,len(seq)-fragmentLength,stepLength): - frag = seq[ix:ix+fragmentLength] - if (frag == allN): continue - - fragNum += 1 - if (headLimit != None) and (fragNum > headLimit): - print >>stderr, "limit of %d emitted fragments reached" % headLimit - break - - if 
(origin == "zero"): header = ">%s_%d" % (name,ix) - else: header = ">%s_%d" % (name,ix+1) - if (shuffleEm): - fragments += [(header,frag)] - else: - print header - print frag - - if (shuffleEm): - shuffle(fragments) - for (header,frag) in fragments: - print header - print frag - - -# fasta_sequences-- -# Read the fasta sequences from a file - -def fasta_sequences(f): - seqName = None - seqNucs = None - - for line in f: - line = line.strip() - - if (line.startswith(">")): - if (seqName != None): - yield (seqName,"".join(seqNucs)) - seqName = line[1:].strip().split()[0] - seqNucs = [] - elif (seqName == None): - assert (False), "first sequence has no header" - else: - seqNucs += [line] - - if (seqName != None): - yield (seqName,"".join(seqNucs)) - - -# int_with_unit-- -# Parse a string as an integer, allowing unit suffixes - -def int_with_unit(s): - if (s.endswith("K")): - multiplier = 1000 - s = s[:-1] - elif (s.endswith("M")): - multiplier = 1000 * 1000 - s = s[:-1] - elif (s.endswith("G")): - multiplier = 1000 * 1000 * 1000 - s = s[:-1] - else: - multiplier = 1 - - try: return int(s) * multiplier - except ValueError: return int(math.ceil(float(s) * multiplier)) - - -if __name__ == "__main__": main() - diff --git a/programs/lastz/tools/fasta_softmask_intervals.py b/programs/lastz/tools/fasta_softmask_intervals.py deleted file mode 100755 index 5042b0d..0000000 --- a/programs/lastz/tools/fasta_softmask_intervals.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -""" -Given a list of intervals, mask those bases in the fasta sequence(s). -""" - -from sys import argv,stdin,exit - - -def usage(s=None): - message = """fasta_softmask_intervals [options] < fasta_file > fasta_file - Apply masking intervals to create a soft-masked fasta file. 
- - options: - file containing a list of intervals to be masked, - in the form ; --origin - determines whether these are origin one or zero - --chrom= copy (and mask) only the specified sequence(s) - is a comma-separated list - (default is to copy and mask all sequences) - --origin=one intervals are origin-one, closed - (default is origin-zero, half-open) - --wrap= split each sequence into multiple lines if needed - (default is to write sequence on a single line) - --mask= mask with a particular character (usually X or N) - (default is to mask with lowercase)""" - - if (s == None): exit (message) - else: exit ("%s\n%s" % (s,message)) - - -def main(): - - # parse args - - chromsOfInterest = None - origin = "zero" - wrapLength = 100 - maskChar = None - intervalsFile = None - - for arg in argv[1:]: - if ("=" in arg): - argVal = arg.split("=",1)[1] - - if (arg.startswith("--chrom=")) or (arg.startswith("--chroms=")): - if (chromsOfInterest == None): - chromsOfInterest = [] - chromsOfInterest += argVal.split(",") - elif (arg.startswith("--origin=")): - origin = argVal - if (origin == "0"): origin = "zero" - if (origin == "1"): origin = "one" - if (origin not in ["zero","one"]): - usage("unknown argument: %s=%s" % (arg,val)) - elif (arg.startswith("--wrap=")): - wrapLength = int(argVal) - elif (arg.startswith("--mask=")): - maskChar = argVal - if (len(maskChar) != 1): usage("--mask requires a single character") - elif (arg.startswith("--")): - usage("can't understand %s" % arg) - elif (intervalsFile == None): - intervalsFile = arg - else: - usage("can't understand %s" % arg) - - if (intervalsFile == None): - usage("you have to tell me the intervals you're interested in") - - # read the intervals - - f = file(intervalsFile,"rt") - - chromToIntervals = {} - - lineNumber = 0 - for line in f: - lineNumber += 1 - line = line.strip() - if (line == "") or (line.startswith("#")): continue - - fields = line.split() - assert (len(fields) >= 3), \ - "not enough fields (line %s): 
%s" % (lineNumber,line) - - try: - chrom = fields[0] - start = int(fields[1]) - end = int(fields[2]) - if (origin == "one"): start -= 1 - if (start < 0): raise ValueError - if (start >= end): raise ValueError - except ValueError: - assert (False), \ - "bad line (line %s): %s" % (lineNumber,line) - - if (chromsOfInterest != None) and (chrom not in chromsOfInterest): - continue - - if (chrom not in chromToIntervals): chromToIntervals[chrom] = [] - chromToIntervals[chrom] += [(start,end)] - - f.close() - - for chrom in chromToIntervals: - chromToIntervals[chrom] = merge_and_sort(chromToIntervals[chrom]) - - # process the sequences - - chromSeen = {} - - for (chrom,seq) in fasta_sequences(stdin): - if (chromsOfInterest != None) and (chrom not in chromsOfInterest): - continue - - assert (chrom not in chromSeen), \ - "more than one sequence is named %s" % chrom - chromSeen[chrom] = True - - seq = seq.upper() - if (chrom not in chromToIntervals): chromToIntervals[chrom] = [] - - newSeq = [] - - prevEnd = 0 - for (start,end) in chromToIntervals[chrom]: - if (prevEnd < start): newSeq += [seq[prevEnd:start]] - if (maskChar == None): newSeq += [seq[start:end].lower()] - else: newSeq += [maskChar*(end-start)] - prevEnd = end - if (prevEnd < len(seq)): newSeq += [seq[prevEnd:]] - - print ">%s" % chrom - newSeq = "".join(newSeq) - assert (len(newSeq) == len(seq)), "internal error" - - for i in range(0,len(newSeq),wrapLength): - print "".join(newSeq[i:i+wrapLength]) - - # make sure all sequences were given - - missing = [chrom for chrom in chromToIntervals if (chrom not in chromSeen)] - assert (missing == []), "missing fasta sequence %s" % (", ".join(missing)) - - -# fasta_sequences-- -# Read the fasta sequences from a file - -def fasta_sequences(f): - seqName = None - seqNucs = None - - for line in f: - line = line.strip() - - if (line.startswith(">")): - if (seqName != None): - yield (seqName,"".join(seqNucs)) - seqName = line[1:].strip().split()[0] - seqNucs = [] - elif 
(seqName == None): - assert (False), "first sequence has no header" - else: - seqNucs += [line] - - if (seqName != None): - yield (seqName,"".join(seqNucs)) - - -# merge_and_sort-- -# Marge a set of intervals (union of sets) and sort them by increasing -# position - -def merge_and_sort(intervals): - intervals.sort() - - start = None - for (s,e) in intervals: - if (start == None): - (start,end) = (s,e) - elif (s > end): - yield (start,end) - (start,end) = (s,e) - continue - elif (e > end): - end = e - - if (start != None): - yield (start,end) - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/hassock_hash.py b/programs/lastz/tools/hassock_hash.py deleted file mode 100755 index 1d1c3d3..0000000 --- a/programs/lastz/tools/hassock_hash.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -""" -Python implementation of the hash used for "hashed sequence index" files. - -The "hassock" hash is a variant of Austin Appleby's MurmurHash2. The -latter is described (as of Apr/2009) at - murmurhash.googlepages.com -This variant is based on the endian-neutral version found at - murmurhash.googlepages.com/MurmurHashNeutral2.cpp -and differs in the following ways: - (a) The "seed" is hardwired. - (b) We parse the data block in reverse; this allows the caller to - prepend an additional seed pattern to his buffer, potentially - getting better mixing for the bits in the final incorporated - bytes. - (c) The last three bytes are incorporated in a different order than - they were in MurmurHash2, because the code just works out better - this way. 
- -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys - -seed = 0x5C3FC4D3 -mult = 0x87C10417 - -def hassock_hash(s): - ix = len(s) - h = seed ^ ix # h = seed ^ len; - while (ix >= 4): - k = ord(s[ix-1]) # k = *(--data); - k |= ord(s[ix-2]) << 8 # k |= *(--data) << 8; - k |= ord(s[ix-3]) << 16 # k |= *(--data) << 16; - k |= ord(s[ix-4]) << 24 # k |= *(--data) << 24; - - k = (k * mult) & 0xFFFFFFFF # k *= m; - k ^= k >> 24 # k ^= k >> r; - k = (k * mult) & 0xFFFFFFFF # k *= m; - - h = (h * mult) & 0xFFFFFFFF # h *= m; - h ^= k # h ^= k; - ix -= 4 - - if (ix >= 3): - h ^= ord(s[2]) << 16 # h ^= *(--data) << 16; - if (ix >= 2): - h ^= ord(s[1]) << 8 # h ^= *(--data) << 8; - if (ix >= 1): - h ^= ord(s[0]) # h ^= *(--data); - h = (h * mult) & 0xFFFFFFFF # h *= m; - - h ^= h >> 13 # h ^= h >> 13; - h = (h * mult) & 0xFFFFFFFF # h *= m; - h ^= h >> 15 # h ^= h >> 15; - - return h - - -# main program to test - -def main(): - m = None - - strings = [] - - for s in sys.argv[1:]: - if (s.startswith("--mod=")): m = int(s.split("=",1)[1]) - else: strings += [s] - - if (strings != []): - for s in strings: - demonstrate_hash(s,m) - else: - for line in sys.stdin: - line = line.rstrip() - demonstrate_hash(line,m) - -def demonstrate_hash(s,m): - if (m == None): print "%08X: %s" % (hassock_hash(s),s) - else: print "%d: %s" % (hassock_hash(s)%m,s) - - -if __name__ == "__main__": main() - diff --git a/programs/lastz/tools/hsx_file.py b/programs/lastz/tools/hsx_file.py deleted file mode 100755 index dbd1959..0000000 --- a/programs/lastz/tools/hsx_file.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python -""" -"Hashed sequence index" (hsx) file reader (for a fasta file) -------------------------------------------------------------------- - -offset 0x00: D2 52 70 95 big endian magic number - .. (95 70 52 D2 => little endian) -offset 0x04: 00 00 01 xx version 1.0 (see note 1) -offset 0x08: 00 00 00 1C header length (in bytes, including this - .. 
field) -offset 0x0C: xx xx xx xx FN, number of files (see note 2) -offset 0x10: xx xx xx xx FO, offset to file table -offset 0x14: xx xx xx xx HN, number of hash buckets (see notes 3 and 4) -offset 0x18: xx xx xx xx HO, offset to hash table -offset 0x1C: xx xx xx xx SN, number of sequences -offset 0x20: xx xx xx xx SO, offset to sequence index table (see - .. note 5) - -offset FO: xx xx xx xx FIO0, offset to file info for file 0 - ... (FN-1 more entries, at 4 bytes per) - -offset FIOn: LL xx .. type of file (ascii "fa", "2bit", etc., see - note 6) - LL xx .. name of file (see note 7) - ... (FN-1 more entries, variable length) - -offset HO: xx xx xx xx xx SIOn, offset into sequence index table (see - .. notes 8, 9 and 10) - ... (HN-1 more entries, at 5 bytes per) - xx xx xx xx xx offset past end of sequence index table - -offset SO: xx xx xx xx xx length of the sequence (see note 11) - xx file number (index into file table) - xx xx xx xx xx xx offset to the sequence data (see note 12) - LL xx .. name of sequence (see note 13) - ... (SN-1 more entries, variable length) - -Notes: - - (1) The least significant byte of the version is the "sub version". - For version 1, this is 00 (secondary hashes are not in use) or 01 - (secondary hashes are in use). - (2) The number of files is limited to 255. - (3) It is assumed that the number of buckets is set so that the average - number of sequences per bucket (SN/HN) is reasonably small (e.g. 10). - (4) The hash table actually includes HN+1 buckets. The extra bucket has - size zero and gives the offset to just past the end of the sequence - index table. - (5) Entries in the sequence index table are necessarily stored in hash - order. Entries with the same hash are stored in alphabetical order; - actually, in lexicographic order over the bytes of their names. - (6) Strings are stored as a length byte followed by ascii text. 
- (7) If a file info record contains an empty name, the name of the file is - the same as the index file itself, with the file type used as the - extension (e.g. "reads.hsx" becomes "reads.fa"). This allows files to - be renamed without rebuilding the index. - (8) SIOn is the file offset for the nth entry in the sequence index table. - When this is in a hash table entry, it is the index for the first - sequence in that hash's bucket. - (9) The most significant bit in a bucket's SIOn value is used to indicate - whether the bucket is empty or not. If a bucket is empty, this bit is - set (1), otherwise it is clear. - (10) The end of a bucket can be determined from the SIOn entry for the - start of the next bucket. - (11) A sequence may be empty, so zero is a legitimate value for the - sequence length. - (12) The offset to the sequence data is an offset into the sequence file. - For fasta it can point to the ">" at the start of the sequence's - header, or directly to the sequence data. - (13) When secondary hashes are in use, the sequence name (including the - terminating zero) is replaced by the four-byte secondary hash. 
- -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys,struct -import hassock_hash - - -class HsxFile(object): - - def __init__(self,fileName,debug=None): - self.fileName = fileName - self.file = None - self.numFiles = 0 - if (debug == None): self.debug = [] - else: self.debug = debug - self.open() - - magicBig = 0xD2527095L - magicLittle = 0x957052D2L - version = 0x00000100L - msBit5 = 0x80 << (4*8) - - def open(self): - self.file = file(self.fileName,"rb") - - self.magic = magic = struct.unpack(">L",self.file.read(4))[0] - if (magic == HsxFile.magicBig): self.byteOrder = ">" # (big endian) - elif (magic == HsxFile.magicLittle): self.byteOrder = "<" # (little endian) - else: - assert (False), \ - "%s is not an hsx file (magic = %08X)" \ - % (self.fileName,magic) - self.struct4 = "%sL" % self.byteOrder - - self.version = self.read4() - assert (self.version == HsxFile.version), \ - "%s is hsx version %08X, which is not supported" \ - % (self.fileName,self.version) - - self.read_header() - self.load_file_table() - - def close(self): - self.file.close() - for fileIx in range(self.numFiles): - (name,file) = self.fileTable[fileIx] - if (file != None): file.close() - - def read_header(self): - self.headerLength = self.read4() - assert (self.headerLength >= 0x1C), \ - "%s has unsupported header length (%08X)" \ - % (self.fileName,self.headerSize) - (self.numFiles, - self.fileTableOffset, - self.numBuckets, - self.hashTableOffset, - self.numSequences, - self.seqTableOffset) = struct.unpack("%sLLLLLL" % self.byteOrder,self.file.read(24)) - assert (self.numBuckets != 0), \ - "%s has corrupt header (numBuckets = 0)" % (self.fileName) - - def load_file_table(self): - self.file.seek(self.fileTableOffset) - offsetTable = self.file.read(4*self.numFiles) - offsetTable = struct.unpack("%s%s" % (self.byteOrder,"L"*self.numFiles),offsetTable) - self.fileTable = [None] * self.numFiles - - basePath = baseName = None - for fileIx in range(self.numFiles): - 
self.file.seek(offsetTable[fileIx]) - extension = self.readString() - name = self.readString() - if (name == ""): - if (baseName == None): - baseName = self.base_file_name() - name = baseName + "." + extension - else: - if (basePath == None): - basePath = self.base_file_path() - name = basePath + name + "." + extension - self.fileTable[fileIx] = (name,None) # (second field holds file when opened) - #.. print "fileTable[%d] = %s" % (fileIx,name) - - def base_file_name(self): - slash = self.fileName.rfind("/") - dot = self.fileName.rfind(".") - if (dot < 0): return self.fileName - if (dot < slash): return self.fileName - return self.fileName[:dot] - - def base_file_path(self): - slash = self.fileName.rfind("/") - if (slash < 0): return "" - return self.fileName[:slash+1] - - def get_sequence(self,name): - if ("fetch" in self.debug): - print >>sys.stderr, "[fetching %s]" % name - # read hash bucket for this name - bucket = HsxFile.hash(name) % self.numBuckets - if ("fetch" in self.debug): - print >>sys.stderr, "[ bucket = %d (file offset %08X)]" \ - % (bucket,self.hashTableOffset+5*bucket) - self.file.seek(self.hashTableOffset + 5*bucket) - bucketOffset = self.read5() - if (bucketOffset & HsxFile.msBit5 != 0): - if ("fetch" in self.debug): - print >>sys.stderr, "[ bucket is empty]" - return None - bucketEnd = self.read5() & ~HsxFile.msBit5 - if ("fetch" in self.debug): - print >>sys.stderr, "[ bucket offset = %010X..%010X ]" \ - % (bucketOffset,bucketEnd) - # scan the bucket until we find this sequence - self.file.seek(bucketOffset) - seqIx = 1 - seqName = None - while (bucketOffset < bucketEnd): - seqLength = self.read5() - fileIx = self.read1() - seqOffset = self.read6() - seqName = self.readString() - if ("fetch" in self.debug): - print >>sys.stderr, "[ (%010X) name %d = %s]" \ - % (bucketOffset,seqIx,seqName) - if (seqName == name): break - if (seqName > name): return None - bucketOffset += 1 + 6 + 5 + len(seqName) + 1 - seqIx += 1 - if (seqName != name): - if 
("fetch" in self.debug): - print >>sys.stderr, "[ %s not in bucket]" % name - return None - # open the sequence file (if it isn't already open) - assert (fileIx < len(self.fileTable)), \ - "file index for %s is out of bounds (%d > %d)" \ - % (name,fileIx,len(self.fileTable)) - (seqFileName,seqFile) = self.fileTable[fileIx] - if (seqFile == None): - if ("fetch" in self.debug): - print >>sys.stderr, "[ opening %s]" % seqFileName - seqFile = file(seqFileName,"rt") - self.fileTable[fileIx] = (seqFileName,seqFile) - if ("fetch" in self.debug): - print >>sys.stderr, "[ reading from %s:%012X]" \ - % (seqFileName,seqOffset) - # read the sequence - seqFile.seek(seqOffset) - seqLines = [] - seqRead = 0 - while (True): - line = seqFile.readline() - if (line == ""): break - line = line.strip() - if ("fetch" in self.debug): - print >>sys.stderr, "[ read %s]" % line - if (line.startswith(">")): - if (len(seqLines) != 0): break - seqLines += [line] - continue - seqRead += len(line) - if (seqRead > seqLength): - line = line[:-seqLength-seqRead] - seqRead = seqLength - seqLines += [line] - if (seqRead == seqLength): - break - assert (seqRead == seqLength), \ - "sequence for %s is short (%d < %d)" \ - % (name,seqRead,seqLength) - return "\n".join(seqLines) - - def read1(self): - return ord(self.file.read(1)) - - def read4(self): - return struct.unpack(self.struct4,self.file.read(4))[0] - - def read5(self): - return self.read_and_unpack(5) - - def read6(self): - return self.read_and_unpack(6) - - def readString(self): - ch = self.file.read(1) - s = self.file.read(ord(ch)) - return "".join(s) - - def read_and_unpack(self,bytes): - data = self.file.read(bytes) - if (self.byteOrder == "<"): # (make data big endian) - data = [ch for ch in data] - data.reverse() - val = 0 - for ch in data: val = (val << 8) + ord(ch) - return val - - # hash - - def hash(name): - return hassock_hash.hassock_hash(name) - hash = staticmethod(hash) - - -if __name__ == "__main__": main() - diff --git 
a/programs/lastz/tools/install_py.py b/programs/lastz/tools/install_py.py deleted file mode 100755 index 537839b..0000000 --- a/programs/lastz/tools/install_py.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -import sys -import string -import commands -true = 1 -false = 0 - -def do (command): - print command - output = commands.getoutput(command) - if (len(output) > 0): print output - -binDir = "~/py/bin" -currentDir = commands.getoutput("pwd").strip() - -args = sys.argv[1:] -remove = False -if (args[0] == "--remove"): - remove = True - args.pop(0) - -binDir = args.pop(0) - -for f in args: - - if f.endswith (".py"): - fPy = f - fNoPy = f[:-3] - else: - fPy = "%s.py" % f - fNoPy = f - - fileHere = "%s/%s" % (currentDir, fPy) - fileInBin = "%s/%s" % (binDir, fNoPy) - - if (remove): - do ("rm %s" % (fileInBin)) - else: - do ("ln -s %s %s" % (fileHere,fileInBin)) - do ("chmod +x %s" % (fileInBin)) diff --git a/programs/lastz/tools/lav_compare.py b/programs/lastz/tools/lav_compare.py deleted file mode 100755 index c6f41d7..0000000 --- a/programs/lastz/tools/lav_compare.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -""" -Compare two lav files, reporting differences but ignoring some trivial ones ---------------------------------------------------------------------------- - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys - -def usage(s=None): - message = """ -lav_diff lav_file1 lav_file2 -""" - - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # parse the command line - - if (len(sys.argv) < 3): - usage("you must specify two lav files") - elif (len(sys.argv) > 3): - usage("wrong number of arguments") - - lav1Filename = sys.argv[1] - lav2Filename = sys.argv[2] - - # compare the files - - lav1 = file(lav1Filename,"rt") - lav2 = file(lav2Filename,"rt") - - different = True - stanza = None - lineNum = 0 - - while (True): - lineNum += 1 - line1 = lav1.readline() - line2 = lav2.readline() - if 
(line1 == "") and (line2 == ""): - different = False - break - line1 = line1.rstrip() - line2 = line2.rstrip() - - if (stanza != None): - if (line1 == "}") != (line2 == "}"): break - if (line1 == "}") and (line2 == "}"): - stanza = None - continue - stanzaIx += 1 - - if (stanza == "d") and (stanzaIx == 1): - continue # ignore command line differences - - elif (stanza == "s") and (stanzaIx <= 2): - line1 = line1.strip() - line2 = line2.strip() - - elif (stanza == "h") and (stanzaIx <= 2): - line1 = header_strip(line1) - line2 = header_strip(line2) - - if (line1 != line2): - # print >>sys.stderr,"%s\n%s" % (line1,line2) - break - - if (stanza != None) and (line1 == "}"): - stanza = None - continue - - if (line1.endswith("{")): - stanza = line1[:-1].strip() - stanzaIx = 0 - - if (different): - print >>sys.stderr,"FAILURE: %s and %s are different (line %d)" \ - % (lav1Filename,lav2Filename,lineNum) - sys.exit(1) - - print >>sys.stderr,"SUCCESS: %s and %s are equivalent" \ - % (lav1Filename,lav2Filename) - - -def header_strip(s): - s = s.strip() - if (s.startswith("\"")) and (s.endswith("\"")): - s = s[1:-1].strip() - if (s.startswith(">")): - s = s[1:].strip() - return s - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/lav_sort.py b/programs/lastz/tools/lav_sort.py deleted file mode 100755 index 09383fa..0000000 --- a/programs/lastz/tools/lav_sort.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python -""" -Sort the a-stanzas in a lav file, according to the user's choice of key ------------------------------------------------------------------------ - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys - -validKeys = ["score","pos1","pos2","beg1","beg2","end1","end2"] - -def usage(s=None): - message = """ -lav_sort --key=[-] < lav_file > lav_file -""" - - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # parse the command line - - if (len(sys.argv) < 2): - usage("you must specify a 
key") - elif (len(sys.argv) > 2): - usage("wrong number of arguments") - - arg = sys.argv[1] - if (not arg.startswith("--key=")): - usage("unrecognized argument: \"%s\"" % arg) - - keyName = arg[arg.find("=")+1:] - keyReverse = False - if (keyName.startswith("-")): - keyName = keyName[1:] - keyReverse = True - if (keyName.startswith("+")): - keyName = keyName[1:] - keyReverse = False - if (keyName not in validKeys): - usage("unrecognized key: \"%s\"" % keyName) - - # process the stanzas - - blocks = [] - for (kind,stanza) in read_stanzas(sys.stdin): - if (kind == "a"): - key = get_key_value(keyName,stanza) - blocks += [(key,stanza)] - continue - if (len(blocks) > 0): - blocks.sort() - if (keyReverse): blocks.reverse() - for (key,s) in blocks: - print "\n".join(s) - blocks = [] - print "\n".join(stanza) - - if (len(blocks) > 0): - blocks.sort() - if (keyReverse): blocks.reverse() - for (key,s) in blocks: - print "\n".join(s) - -# read_stanzas-- -# Collect the lines that belong to the next stanza. A stanza has the form -# shown below. It consists of several lines bracketed by a pair of curlies, -# and has a type indicated by a single letter. -# -# x { -# ... -# } -# -# In this routine we generalize the stanza concept to include lines not -# strictly with a pair of curlies. First, lines beginning with a "#:" are -# considered to be single line stanzas with no type (e.g. the "#:lav" and -# "#:eof" lines). Second, any other blank lines are appended to whatever -# stanza preceeded them. This allows for lav+text and other debugging output -# from lastz to be carried around with the appropriate stanza. 
- -def read_stanzas(f): - kind = None - stanza = [] - inCurly = False - for line in f: - line = line.rstrip() - if (not inCurly): - isWaffle = line.startswith("#:") - inCurly = (len(line) == 3) and (line.endswith(" {")) - if (isWaffle) or (inCurly): - if (len(stanza) > 0): - yield (kind,stanza) - stanza = [] - if (isWaffle): - yield (line[2:],[line]) - kind = None - continue - kind = line[0] - stanza += [line] - else: # (inCurly) - stanza += [line] - if (line == "}"): inCurly = False - - assert (len(stanza) == 0), "premature end of file" - -# get_key_value-- -# Extract the specied key value from an a-stanza. A typical a-stanza looks -# like this one: -# -# a { -# s 14400 -# b 425 4438 -# e 697 4714 -# l 425 4438 448 4461 96 -# l 449 4464 579 4594 83 -# l 581 4595 604 4618 96 -# l 605 4627 609 4631 100 -# l 617 4632 648 4663 91 -# l 649 4666 697 4714 90 -# } - -def get_key_value(keyName,aStanza): - if (keyName == "score"): - assert (len(aStanza) >= 2) and (aStanza[1].startswith(" s")) - score = aStanza[1].split()[1] - try: - return int(score) - except: - try: - return float(score) - except: - pass - return score - - if (keyName in ["pos1","beg1"]): - assert (len(aStanza) >= 3) and (aStanza[2].startswith(" b")) - beg1 = aStanza[2].split()[1] - return int(beg1) - - if (keyName in ["pos2","beg2"]): - assert (len(aStanza) >= 3) and (aStanza[2].startswith(" b")) - beg2 = aStanza[2].split()[2] - return int(beg2) - - if (keyName in ["end1"]): - assert (len(aStanza) >= 4) and (aStanza[3].startswith(" e")) - end1 = aStanza[3].split()[1] - return int(end1) - - if (keyName in ["end2"]): - assert (len(aStanza) >= 4) and (aStanza[3].startswith(" e")) - end2 = aStanza[3].split()[2] - return int(end2) - - assert False - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/maf_sort.py b/programs/lastz/tools/maf_sort.py deleted file mode 100755 index de3a7dd..0000000 --- a/programs/lastz/tools/maf_sort.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -""" 
-Sort alignment blocks in a maf file, according to the user's choice of key --------------------------------------------------------------------------- - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys,re - -validKeys = ["score","pos1","pos2","beg1","beg2","end1","end2","diag","name1","name2"] - -def usage(s=None): - message = """ -maf_sort --key=[-] < maf_file > maf_file -""" - - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # parse the command line - - if (len(sys.argv) < 2): - usage("you must specify a key") - elif (len(sys.argv) > 2): - usage("wrong number of arguments") - - arg = sys.argv[1] - if (not arg.startswith("--key=")): - usage("unrecognized argument: \"%s\"" % arg) - - keyName = arg[arg.find("=")+1:] - keyReverse = False - if (keyName.startswith("-")): - keyName = keyName[1:] - keyReverse = True - if (keyName.startswith("+")): - keyName = keyName[1:] - keyReverse = False - if (keyName not in validKeys): - usage("unrecognized key: \"%s\"" % keyName) - - # process the blocks - - blocks = [] - for (block,comments) in read_blocks(sys.stdin): - key = get_key_value(keyName,block) - blocks += [(key,block,comments)] - - if (len(blocks) > 0): - blocks.sort() - if (keyReverse): blocks.reverse() - for (key,block,comments) in blocks: - if (comments != []): - print "\n".join([line for line in comments]) - print "\n".join([line for line in block]) - print - - -# read_blocks-- -# Collect the lines that belong to the next alignment block. A block has the -# form shown below. -# -# a score=19951 -# s apple 23871 367 + 70000 CCCCCGC... -# s orange 13 390 - 408 CTCCTGC... 
- -def read_blocks(f): - comments = [] - block = [] - lineNumber = 0 - for line in f: - lineNumber += 1 - line = line.rstrip() - if (line.startswith("#")): - comments += [line] - continue - if (line == ""): - if (len(block) == 3): - yield (block,comments) - comments = [] - block = [] - continue - elif (len(block) == 0): - continue - else: - assert (False), "premature end of block at line %d" % lineNumber - if (len(block) == 3): "long block at line %d" % lineNumber - block += [line] - - if (len(block) == 3): - yield (block,comments) - elif (len(block) != 0): - assert (False), "premature end of file" - - -# get_key_value-- -# Extract the specied key value from a maf block -# -# a score=19951 -# s apple 23871 367 + 70000 CCCCCGC... -# s orange 13 390 - 408 CTCCTGC... - -scoreRe = re.compile("^a score=(?P.+)$") -textRe = re.compile("^s" - + " +(?P[^ ]+)" - + " +(?P[0-9]+)" - + " +(?P[0-9]+)" - + " +(?P[-+])" - + " +[0-9]+" - + " +[-ACGTacgtNn]+$") - - -def get_key_value(keyName,block): - try: - line = block[0] - m = scoreRe.match(line) - if (m == None): raise ValueError - score = float(m.group("score")) - except ValueError: - assert (False), "bad score line: %s" % line - - try: - line = block[1] - m = textRe.match(line) - if (m == None): raise ValueError - name1 = m.group("name") - pos1 = int(m.group("pos")) - len1 = int(m.group("len")) - strand1 = m.group("strand") - except ValueError: - assert (False), "bad line: %s" % line - - try: - line = block[2] - m = textRe.match(line) - if (m == None): raise ValueError - name2 = m.group("name") - pos2 = int(m.group("pos")) - len2 = int(m.group("len")) - strand2 = m.group("strand") - except ValueError: - assert (False), "bad line: %s" % line - - if (keyName == "score"): - return (score,pos1,strand1,pos2,strand2,len1,len2,name1,name2) - - if (keyName in ["pos1","beg1"]): - return (pos1,strand1,pos2,strand2,len1,len2,score,name1,name2) - - if (keyName in ["pos2","beg2"]): - return 
(pos2,strand2,pos1,strand1,len2,len1,score,name1,name2) - - if (keyName in ["end1"]): - return (pos1+len1,strand1,pos2+len2,strand2,len1,len2,score,name1,name2) - - if (keyName in ["end2"]): - return (pos2+len2,strand2,pos1+len1,strand1,len2,len1,score,name1,name2) - - if (keyName in ["diag"]): - return (strand1,strand2,pos1-pos2,pos1,len1,len2,score,name1,name2) - - if (keyName in ["name1"]): - return (name1,score,len1,strand1,pos1,name2,len2,strand2,pos2) - - if (keyName in ["name2"]): - return (name2,score,len2,strand2,pos2,name1,len1,strand1,pos1) - - assert False - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/merge_masking_intervals.py b/programs/lastz/tools/merge_masking_intervals.py deleted file mode 100755 index a94a69a..0000000 --- a/programs/lastz/tools/merge_masking_intervals.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -""" -Given a file of masking intervals, combine overlapping intervals. - -Masking intervals are as would be used as lastz's softmask, xmask, or nmask -sequence specifier actions. They can be produced by lastz using the --masking -and --outputmasking options. - - input: output: - 555110 555310 555110 555310 - 555941 556479 555941 556663 - 555966 556402 - 555976 556402 - 555977 556402 - 556125 556479 - 556153 556663 - 557674 558206 557674 558278 - 557802 558278 - 559509 559769 559509 559769 - 798462 798922 798462 799008 - 798462 798963 - 798614 799008 - 799495 799603 799495 799603 - -Intervals are origin 1, closed. They needn't be sorted. - -We consider adjoining intervals to be overlapping. 
-""" - -__author__ = "Bob Harris (rsharris@bx.psu.edu)" - - -from sys import argv,stdin - - -def main(): - global origin,adjoining - - assert (len(argv) == 1), "give me no arguments" - - # collect the intervals - # nota bene: internally we work with them as origin-zero, half-open - - intervals = [] - - lineNumber = 0 - for line in stdin: - lineNumber += 1 - line = line.strip() - if (line == ""): continue - if (line.startswith("#")): continue - - fields = line.split() - try: - s = int(fields[0]) - 1 - e = int(fields[1]) - except ValueError: - assert (False), "bad line (%d): %s" % (lineNumber,line) - - intervals += [(s,e)] - - # merge 'em - - intervals.sort() - - start = None - for (s,e) in intervals: - if (start == None): - (start,end) = (s,e) - elif (s > end): - print "%d\t%d" % (start+1,end) - (start,end) = (s,e) - continue - elif (e > end): - end = e - - if (start != None): - print "%d\t%d" % (start+1,end) - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/pick_from_fasta_hsx.py b/programs/lastz/tools/pick_from_fasta_hsx.py deleted file mode 100755 index 18c82d6..0000000 --- a/programs/lastz/tools/pick_from_fasta_hsx.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -""" -Select a subset of sequences from a fasta file indexed by an hsx file ---------------------------------------------------------------------- - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys -from hsx_file import HsxFile - - -def usage(s=None): - message = """ -pick_from_fasta_hsx hsx_file [--names=] [name1 name2 ...] 
- --names= read sequence names from a file - --nowarn don't warn about sequences that aren't found -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - ########## - # parse the command line - ########## - - hsxFileName = None - seqNames = [] - warnOnMissing = True - showProgress = False - debug = [] - - args = sys.argv[1:] - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--names") and (val != None): - f = file(val) - seqNames += [line.strip() for line in f] - f.close() - elif (arg == "--nowarn") and (val == None): - warnOnMissing = False - elif (arg == "--progress") and (val == None): - showProgress = True - elif (arg == "--debug") and (val == None): - debug += ["debug"] - elif (arg == "--debug") and (val != None): - debug += [val] - elif (arg.startswith("--")): - usage("unknown argument: %s" % arg) - elif (hsxFileName == None) and (val == None): - hsxFileName = arg - elif (val == None): - seqNames += [arg] - else: - usage("unknown argument: %s" % arg) - - if (hsxFileName == None): usage("you must give me an hsx file!") - if (seqNames == []): usage("you must give me some sequence names!") - - ########## - # fetch the sequences - ########## - - hsx = HsxFile(hsxFileName,debug=debug) - for name in seqNames: - seq = hsx.get_sequence(name) - if (seq != None): - print seq - if (showProgress): - print >>sys.stderr, name - elif (warnOnMissing): - print >>sys.stderr, "WARNING: %s not found" % name - hsx.close() - - -if __name__ == "__main__": main() - diff --git a/programs/lastz/tools/probabilities_to_scores.py b/programs/lastz/tools/probabilities_to_scores.py deleted file mode 100755 index e5e69a9..0000000 --- a/programs/lastz/tools/probabilities_to_scores.py +++ /dev/null @@ -1,412 +0,0 @@ -#!/usr/bin/env python -""" -Convert probabilities to a LASTZ 
scores file (including quantum scores) ------------------------------------------------------------------------ - -Given background probabilities, probabilities of each DNA substitution event, -and an optional list of quantum symbols, we create a log-odds scoring matrix -suitable for LASTZ. - -Typical command line: - - probabilities_to_scores --scaleto=100 \ - A:.26585 C:.23415 G:.23415 T:.26585 \ <--- background probabilties - AA:.18204 AC:.01903 AG:.04510 AT:.01967 \ - CA:.01903 CC:.15508 CG:.01495 CT:.04510 \ <--- substitution probabilties - GA:.04510 GC:.01495 GG:.15508 GT:.01903 \ - TA:.01967 TC:.04510 TG:.01903 TT:.18204 \ - R=G:.5,A:.5 Y=T:.5,C:.5 <--- quantum symbols - -An equivalent command line that takes advantage of the usual symmetry: - - probabilities_to_scores --scaleto=100 \ - --symmetric \ - A:.26585 C:.23415 \ <--- background probabilties - AA:.18204 AC:.01903 AG:.04510 AT:.01967 \ <--- substitution probabilties - CC:.15508 CG:.01495 \ - R=G:.5,A:.5 Y=T:.5,C:.5 <--- quantum symbols - -The resulting scores file would look like this: - - A C G T R Y - A 91 -114 -31 -123 52 -119 - C -114 100 -125 -31 -119 52 - G -31 -125 100 -114 52 -119 - T -123 -31 -114 91 -119 52 - R 52 -119 52 -119 52 -119 - Y -119 52 -119 52 -119 52 - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys -from math import log - -def usage(s=None): - message = """ -probabilities_to_scores [options] > lastz_score_file - --scaleto= scale scores to give desired max - --symmetric map probabilities symmetrically - --nodna don't include A,G,C,T in the alphabets - --dnarows (target) row alphabet is A,C,G,T - --dnacol[umn]s (query) column alphabet is A,C,G,T - --hoxd70 use HOXD70 (lastz default scores) for probabilities - --iupac alphabets are IUPAC 15-letter code - --writecode= write quantum code to a file - --creator= set name of creator to write as a comment in output - --nocreator inhibit creator comment in output - = set background probability of a nucleotide - = set 
basepair substitution probability - = define the profile for a quantum symbol - .. e.g. Y=T:.5,C:.5 or 07=A:0.311,C:0.228,G:0.422,T:0.039 -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -bases = "ACGT" -basePairs = ["AA","AC","AG","AT", - "CA","CC","CG","CT", - "GA","GC","GG","GT", - "TA","TC","TG","TT"] - -symmetries = [["A","T"],["C","G"], - ["AA","TT"],["CC","GG"],["AT","TA"],["CG","GC"], - ["AC","CA","GT","TG"],["AG","GA","CT","TC"]] - -hoxd70 = [("A", .26585),("C", .23415), - ("AA",.18204),("AC",.01903),("AG",.04510),("AT",.01967), - ("CC",.15508),("CG",.01495)] - -iupac = [("R","G,A"), - ("Y","T,C"), - ("K","G,T"), - ("M","A,C"), - ("S","G,C"), - ("W","A,T"), - ("B","G,T,C"), - ("D","G,A,T"), - ("H","A,C,T"), - ("V","G,C,A"), - ("N","A,C,G,T")] - - -def main(): - - ########## - # parse the command line - ########## - - prob = {} - scaleTo = None - symmetric = False - dnaQuery = True - symbols = [] - symProb = {} - symGroup = {} - settings = [] - rowsAreDNA = False - colsAreDNA = False - creator = "probabilities_to_scores" - codeName = None - debug = [] - - args = sys.argv[1:] - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--scaleto") and (val != None): - try: scaleTo = int(val) - except ValueError: scaleTo = float(val) - elif (arg == "--symmetric") and (val == None): - symmetric = True - elif (arg == "--nodna") and (val == None): - dnaQuery = False - elif (arg == "--dnarows") and (val == None): - rowsAreDNA = True - elif (arg in ["--dnacols","--dnacolumns"]) and (val == None): - colsAreDNA = True - elif (arg in ["--hoxd70","--HOXD70"]) and (val == None): - symmetric = True - for (s,p) in hoxd70: - assert (s not in prob), "duplicate DNA event: %s" % s - prob[s] = p - elif (arg in ["--iupac","--IUPAC"]) and (val == None): - for (sym,val) in iupac: - 
assert (sym not in symProb), "duplicate quantum symbol: %s" % sym - symbols += [sym] - symProb[sym] = {} - symGroup[sym] = "" - vals = val.split(",") - for s in vals: - symProb[sym][s] = 1.0/len(vals) - symGroup[sym] += s - elif (arg == "--writecode") and (val != None): - codeName = val - elif (arg == "--nocreator") and (val == None): - creator = None - elif (arg == "--creator") and (val != None): - creator = val - elif (arg == "--debug") and (val != None): - debug.append(val) - elif (arg == "--debug") and (val == None): - debug.append("debug") - elif (arg.startswith("--")) and (val != None): - settings += [(arg[2:],val)] - elif (arg.startswith("--")): - usage("unknown argument: %s" % arg) - elif (val == None) and (":" in arg): - (s,p) = dna_event(arg) - assert (s not in prob), "duplicate DNA event: %s" % s - prob[s] = p - elif (valid_quantum_symbol(arg)) and (val != None): - sym = arg - assert (sym not in symProb), "duplicate quantum symbol: %s" % sym - symbols += [sym] - symProb[sym] = {} - symGroup[sym] = "" - vals = val.split(",") - haveProbs = False - for val in vals: - if (":" in val): - haveProbs = True - break - if (haveProbs): - for val in vals: - (s,p) = dna_event(val) - assert (len(s) == 1), \ - "invalid DNA event for %s: %s" % (sym,s) - assert (s not in symProb[sym]), \ - "duplicate DNA event for %s: %s" % (sym,s) - symProb[sym][s] = p - symGroup[sym] += s - else: - for s in vals: - assert (len(s) == 1) and (s in bases), \ - "invalid DNA event for %s: %s" % (sym,s) - assert (s not in symProb[sym]), \ - "duplicate DNA event for %s: %s" % (sym,s) - symProb[sym][s] = 1.0/len(vals) - symGroup[sym] += s - else: - usage("unknown argument: %s" % arg) - - ########## - # sanity check - ########## - - if (symmetric): - for group in symmetries: - present = len([x for x in group if (x in prob)]) - assert (present == 1), \ - "need a probability for exactly one of %s" \ - % (",".join(group)) - val = None - for x in group: - if (x in prob): - val = prob[x] - break - 
for x in group: - if (x not in prob): prob[x] = val - - for nuc in bases: - assert (nuc in prob), \ - "need a probability for %s" % nuc - - for xy in basePairs: - assert (xy in prob), \ - "need a probability for %s" % (xy) - - p = sum([prob[nuc] for nuc in bases]) - assert (abs(p-1) < .000001), \ - "base probabilities sum to %f" % p - - p = sum([prob[xy] for xy in basePairs]) - assert (abs(p-1) < .000001), \ - "base pair probabilities sum to %f" % p - - for sym in symProb: - p = sum([symProb[sym][nuc] for nuc in symProb[sym]]) - assert (abs(p-1) < .000001), \ - "probabilities for %s sum to %f" % (sym,p) - for nuc in bases: - if (nuc not in symProb[sym]): - symProb[sym][nuc] = 0 - - if (dnaQuery): - for sym in bases: - if (sym in symProb): continue - symbols += [sym] - symProb[sym] = {} - symGroup[sym] = sym - for nuc in bases: - if (nuc == sym): symProb[sym][nuc] = 1 - else: symProb[sym][nuc] = 0 - symbols = [sym for sym in bases] \ - + [sym for sym in symbols if (sym not in bases)] - - if (rowsAreDNA): rowSymbols = bases - else: rowSymbols = symbols - - if (colsAreDNA): colSymbols = bases - else: colSymbols = symbols - - ########## - # print what we got - ########## - - if ("debug" in debug): - print " ".join([" %s:%.5f" % (nuc,prob[nuc]) for nuc in bases]) - - for x in bases: - print " ".join(["%s:%.5f" % (x+y,prob[x+y]) for y in bases]) - - print - for sym in symbols: - p = symProb[sym] - print "%s -> %s" \ - % (sym," ".join([" %s:%.5f" % (nuc,p[nuc]) for nuc in bases])) - - ########## - # write quantum code file - ########## - - if (codeName != None): - codeF = file(codeName,"wt") - for sym in symbols: - p = symProb[sym] - print >>codeF, "%s\t%s" \ - % (sym,"\t".join(["%.6f" % p[nuc] for nuc in bases])) - codeF.close() - - ########## - # assign scores - ########## - - sub = {} - maxSub = None - - for row in rowSymbols: - u = symProb[row] - sub[row] = {} - for col in colSymbols: - v = symProb[col] - numer = sum([u[y]*v[x]*prob[y+x] for (y,x) in basePairs]) - 
denom = sum([u[y]*v[x]*prob[y]*prob[x] for (y,x) in basePairs]) - sub[row][col] = log (float(numer) / float(denom)) - if (maxSub == None) or (sub[row][col] > maxSub): - maxSub = sub[row][col] - - if (scaleTo != None): - scale = scaleTo / maxSub - for row in rowSymbols: - for col in colSymbols: - sub[row][col] *= scale - if (type(scaleTo) == int): - sub[row][col] = round(sub[row][col]) - - ########## - # print the settings, if there are any - ########## - - if (creator != None): - print "# created by %s" % creator - print - - if (settings != []): - sLen = max([len(s) for (s,val) in settings]) - for (s,val) in settings: - print "%-*s = %s" % (sLen,s,val) - print - - ########## - # print the substitution matrix - ########## - - if (scaleTo != None) and (type(scaleTo) == int): - wSub = 4 - for row in rowSymbols: - for col in colSymbols: - wSub = max(wSub,len("%d" % sub[row][col])) - - print "%s %s" \ - % ("#"," ".join(["%*s" % (wSub,non_single(symGroup[col])) for col in colSymbols])) - - print "%s %s" \ - % (" "," ".join(["%*s" % (wSub,col) for col in colSymbols])) - - for row in rowSymbols: - print "%s %s%s" \ - % (row, - " ".join(["%*d" % (wSub,sub[row][col]) for col in colSymbols]), - non_single_comment(symGroup[row])) - - else: - wSub = 4 - for row in rowSymbols: - for col in colSymbols: - wSub = max(wSub,len("%.6f" % sub[row][col])) - - print "%s %s" \ - % ("#"," ".join(["%*s" % (wSub,non_single(symGroup[col])) for col in colSymbols])) - - print "%s %s" \ - % (" "," ".join(["%*s" % (wSub,col) for col in colSymbols])) - - for row in rowSymbols: - print "%s %s%s" \ - % (row, - " ".join(["%*.6f" % (wSub,sub[row][col]) for col in colSymbols]), - non_single_comment(symGroup[row])) - - -def dna_event(s): - (s,p) = s.split(":",1) - assert (valid_dna_event(s)), "invalid DNA event: %s" % s - try: - p = float(p) - if (not (0 <= p <= 1)): raise ValueError - except ValueError: - assert (False), "invalid probability for %s: %s" % (s,p) - return (s,p) - - -def 
valid_dna_event(s): - if (len(s) == 0): - return False - if (len(s) == 1): - return (s in bases) - if (len(s) == 2): - return (s[0] in bases) and (s[1] in bases) - return False - - -def valid_quantum_symbol(s): - if (len(s) == 0): - return False - if (len(s) == 1): - return (s in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") - if (len(s) == 2): - if (s == "00"): return False - return (s[0] in "0123456789ABCDEF") and (s[1] in "0123456789ABCDEF") - return False - - -def non_single_comment(s): - if (len(s) == 1): return "" - else: return " # " + s - - -def non_single(s): - if (len(s) == 1): return "" - else: return s - - -if __name__ == "__main__": main() diff --git a/programs/lastz/tools/qcode_to_scores.py b/programs/lastz/tools/qcode_to_scores.py deleted file mode 100755 index b99cea4..0000000 --- a/programs/lastz/tools/qcode_to_scores.py +++ /dev/null @@ -1,497 +0,0 @@ -#!/usr/bin/env python -""" -Convert quantum-code files to a LASTZ scores file -------------------------------------------------- - -Given background probabilities, probabilities of each DNA substitution event, -and one (or two) quantum code files, we create a log-odds scoring matrix -suitable for LASTZ. - -Typical command line: - - qcode_to_scores --scaleto=100 \ - A:.26585 C:.23415 G:.23415 T:.26585 \ <--- background probabilties - AA:.18204 AC:.01903 AG:.04510 AT:.01967 \ - CA:.01903 CC:.15508 CG:.01495 CT:.04510 \ <--- substitution probabilties - GA:.04510 GC:.01495 GG:.15508 GT:.01903 \ - TA:.01967 TC:.04510 TG:.01903 TT:.18204 \ - --code.target= --code.query= - -An equivalent command line that takes advantage of the usual symmetry: - - qcode_to_scores --scaleto=100 \ - --symmetric \ - A:.26585 C:.23415 \ <--- background probabilties - AA:.18204 AC:.01903 AG:.04510 AT:.01967 \ <--- substitution probabilties - CC:.15508 CG:.01495 \ - --code.target= --code.query= - -Quantum code files look something like the one below. Each row represents a -quantum symbol. 
The first value is the code value, either a single ascii -character or a two character hex value. The remaining four values are the -probability of that symbol being A, C, G, or T. Lines beginning with a # are -comments, and anything other than five columns is an error. - - # p(A) p(C) p(G) p(T) - 01 0.125041 0.080147 0.100723 0.694088 - 02 0.111162 0.053299 0.025790 0.809749 - 03 0.065313 0.007030 0.004978 0.922679 - ... - -:Author: Bob Harris (rsharris@bx.psu.edu) -""" - -import sys -from math import log - -def usage(s=None): - message = """ -qcode_to_scores [options] > lastz_score_file - --scaleto= scale scores to give desired max - --symmetric map probabilities symmetrically - --hoxd70 use HOXD70 (lastz default scores) for probabilities - --code.target= specify the quantum code for rows (LASTZ target) - --code.query= specify the quantum code for columns (LASTZ query) - --code= specify the quantum code for both rows *and* columns - --creator= set name of creator to write as a comment in output - --nocreator inhibit creator comment in output - .target: set target background probability of a nucleotide - .query: set query background probability of a nucleotide - : set background probability of a nucleotide for *both* - target and query - : set basepair substitution probability; first base is - for target, second for query -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -bases = ["A","C","G","T"] -basePairs = ["AA","AC","AG","AT", - "CA","CC","CG","CT", - "GA","GC","GG","GT", - "TA","TC","TG","TT"] - -baseSymmetries = [["A","T"],["C","G"]] -pairSymmetries = [["AA","TT"],["CC","GG"],["AT","TA"],["CG","GC"], - ["AC","CA","GT","TG"],["AG","GA","CT","TC"]] - -hoxd70 = [("A", .26585),("C", .23415), - ("AA",.18204),("AC",.01903),("AG",.04510),("AT",.01967), - ("CC",.15508),("CG",.01495)] - - -def main(): - - ########## - # parse the command line - ########## - - rProb = {} - cProb = {} - rcProb = {} - scaleTo = None - symmetric = 
False - dnaQuery = True - symbols = [] - settings = [] - rowCodeName = None - colCodeName = None - creator = "qcode_to_scores" - debug = [] - - args = sys.argv[1:] - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--scaleto") and (val != None): - try: scaleTo = int(val) - except ValueError: scaleTo = float(val) - elif (arg == "--symmetric") and (val == None): - symmetric = True - elif (arg == "--nodna") and (val == None): - dnaQuery = False - elif (arg == "--dnarows") and (val == None): - rowsAreDNA = True - elif (arg in ["--dnacols","--dnacolumns"]) and (val == None): - colsAreDNA = True - elif (arg in ["--hoxd70","--HOXD70"]) and (val == None): - symmetric = True - for (s,p) in hoxd70: - assert (s not in rProb) and (s not in cProb), \ - "duplicate DNA event: %s" % s - rProb[s] = cProb[s] = p - elif (arg in ["--code.row","--code.target"]) and (val != None): - assert (rowCodeName == None), \ - "can't have more than one row/target code" - rowCodeName = val - elif (arg in ["--code.column","--code.col","--code.query"]) and (val != None): - assert (colCodeName == None), \ - "can't have more than one column/target code" - colCodeName = val - elif (arg == "--code") and (val != None): - assert (rowCodeName == None), \ - "can't have more than one row/target code" - assert (colCodeName == None), \ - "can't have more than one column/target code" - rowCodeName = colCodeName = val - elif (arg == "--nocreator") and (val == None): - creator = None - elif (arg == "--creator") and (val != None): - creator = val - elif (arg == "--debug") and (val != None): - debug.append(val) - elif (arg == "--debug") and (val == None): - debug.append("debug") - elif (arg.startswith("--")) and (val != None): - settings += [(arg[2:],val)] - elif (arg.startswith("--")): - usage("unknown argument: %s" % arg) - elif (val == None) and 
(":" in arg): - (s,which,p) = dna_event(arg) - if (which == "target"): w = "row" - elif (which == "query"): w = "col" - elif (which == "column"): w = "col" - else: w = which - assert (w in ["row","col",None]), \ - "can't decipher \"%s\" (in %s)" % (which,arg) - if (w == "row"): - assert (s in bases), \ - "can't specify %s for %s (in %s)" % (which,s,arg) - assert (s not in rProb), \ - "duplicate DNA event: %s.target" % s - rProb[s] = p - elif (w == "col"): - assert (s in bases), \ - "can't specify %s for %s (in %s)" % (which,s,arg) - assert (s not in cProb), \ - "duplicate DNA event: %s.query" % s - cProb[s] = p - elif (s in bases): - assert (s not in rProb) and (s not in cProb), \ - "duplicate DNA event: %s" % s - rProb[s] = cProb[s] = p - else: - assert (s not in rcProb), \ - "duplicate DNA pair event: %s" % s - rcProb[s] = p - else: - usage("unknown argument: %s" % arg) - - ########## - # sanity check - ########## - - if (symmetric): - conProb = {} - for nuc in bases: - if (nuc in rProb) and (nuc not in cProb): - conProb[nuc] = rProb[nuc] - elif (nuc in cProb) and (nuc not in rProb): - conProb[nuc] = cProb[nuc] - elif (nuc in cProb) and (nuc in rProb): - assert (rProb[nuc] == cProb[nuc]), \ - "can't use --symmetric with %s.target != %s.query" \ - % (nuc,nuc) - conProb[nuc] = rProb[nuc] - - for group in baseSymmetries: - present = len([x for x in group if (x in conProb)]) - assert (present == 1), \ - "need a probability for exactly one of %s" \ - % (",".join(group)) - val = None - for x in group: - if (x in conProb): - val = conProb[x] - break - for x in group: - if (x not in conProb): conProb[x] = val - rProb = cProb = conProb - - for group in pairSymmetries: - present = len([x for x in group if (x in rcProb)]) - assert (present == 1), \ - "need a probability for exactly one of %s" \ - % (",".join(group)) - val = None - for x in group: - if (x in rcProb): - val = rcProb[x] - break - for x in group: - if (x not in rcProb): rcProb[x] = val - - for nuc in bases: - 
assert (nuc in rProb), \ - "need a target probability for %s" % nuc - assert (nuc in cProb), \ - "need a query probability for %s" % nuc - - for xy in basePairs: - assert (xy in rcProb), \ - "need a probability for %s" % (xy) - - p = sum([rProb[nuc] for nuc in bases]) - assert (abs(p-1) < .00001), \ - "target base probabilities sum to %f" % p - - p = sum([cProb[nuc] for nuc in bases]) - assert (abs(p-1) < .00001), \ - "query base probabilities sum to %f" % p - - p = sum([rcProb[yx] for yx in basePairs]) - assert (abs(p-1) < .00001), \ - "base pair probabilities sum to %f" % p - - ########## - # read code files - ########## - - # read row code - - if (rowCodeName == None): - rowCode = simple_dna_quantum_code() - else: - rowCode = read_quantum_code(rowCodeName) - - if (".order" in rowCode): - rowSymbols = rowCode[".order"] - else: - rowSymbols = [sym for sym in rowCode] - rowSymbols.sort() - - # read column code - - if (colCodeName == None): - colCode = simple_dna_quantum_code() - elif (colCodeName == rowCodeName): - colCode = rowCode - else: - colCode = read_quantum_code(colCodeName) - - if (".order" in colCode): - colSymbols = colCode[".order"] - else: - colSymbols = [sym for sym in colCode] - colSymbols.sort() - - ########## - # print what we got - ########## - - if ("debug" in debug): - print "target" \ - + " ".join([" %s:%.5f" % (nuc,rProb[nuc]) for nuc in bases]) - print "query" \ - + " ".join([" %s:%.5f" % (nuc,cProb[nuc]) for nuc in bases]) - for y in bases: - print " ".join(["%s:%.5f" % (y+x,rcProb[y+x]) for x in bases]) - - ########## - # assign scores - ########## - - sub = {} - maxSub = None - - for row in rowSymbols: - u = rowCode[row] - sub[row] = {} - for col in colSymbols: - v = colCode[col] - numer = sum([u[y]*v[x]*rcProb[y+x] for (y,x) in basePairs]) - denom = sum([u[y]*v[x]*rProb[y]*cProb[x] for (y,x) in basePairs]) - sub[row][col] = log (float(numer) / float(denom)) - if (maxSub == None) or (sub[row][col] > maxSub): - maxSub = sub[row][col] - - if 
(scaleTo != None): - scale = scaleTo / maxSub - for row in rowSymbols: - for col in colSymbols: - sub[row][col] *= scale - if (type(scaleTo) == int): - sub[row][col] = round(sub[row][col]) - - ########## - # print the settings, if there are any - ########## - - if (creator != None): - print "# created by %s" % creator - print - - if (settings != []): - sLen = max([len(s) for (s,val) in settings]) - for (s,val) in settings: - print "%-*s = %s" % (sLen,s,val) - print - - ########## - # print the substitution matrix - ########## - - wRow = max([len(row) for row in rowSymbols]) - - if (scaleTo != None) and (type(scaleTo) == int): - wCol = 4 - for row in rowSymbols: - for col in colSymbols: - wCol = max(wCol,len("%d" % sub[row][col])) - - print "%-*s %s" \ - % (wRow," "," ".join(["%*s" % (wCol,col) for col in colSymbols])) - - for row in rowSymbols: - print "%-*s %s" \ - % (wRow,row, - " ".join(["%*d" % (wCol,sub[row][col]) for col in colSymbols])) - - else: - wCol = 4 - for row in rowSymbols: - for col in colSymbols: - wCol = max(wCol,len("%.6f" % sub[row][col])) - - print "%-*s %s" \ - % (wRow," "," ".join(["%*s" % (wCol,col) for col in colSymbols])) - - for row in rowSymbols: - print "%-*s %s" \ - % (wRow,row, - " ".join(["%*.6f" % (wCol,sub[row][col]) for col in colSymbols])) - - -def simple_dna_quantum_code(): - symToProfile = {} - for nuc1 in bases: - symToProfile[nuc1] = {} - for nuc2 in bases: - if (nuc2 == nuc1): symToProfile[nuc1][nuc2] = 1 - else: symToProfile[nuc1][nuc2] = 0 - return symToProfile - - -def read_quantum_code(codeName): - codeF = file (codeName, "rt") - - symToProfile = {} - codeNumUsed = {} - symOrder = [] - - lineNum = 0 - for line in codeF: - lineNum += 1 - line = line.strip() - if ("#" in line): - line = line.split("#",1)[0].strip() - if (line == ""): - continue - - fields = line.split() - - assert (len(fields) >= 5), \ - "fewer than four probabilities (%s line %d)" \ - % (codeName,lineNum) - assert (len(fields) <= 5), \ - "more than four 
probabilities (%s line %d)" \ - % (codeName,lineNum) - - try: - sym = fields[0] - codeNum = quantum_code_num(sym) - except ValueError: - assert (False), \ - "%s is not a valid quantum symbol (%s line %d)" \ - % (sym,codeName,lineNum) - - if (codeNum in codeNumUsed): - assert (False), \ - "%s (or equivalent) appears more than once (%s line %d)" \ - % (sym,codeName,lineNum) - - try: - profile = {} - for ix in range(4): - p = float_or_fraction(fields[ix+1]) - if (not (0 <= p <= 1)): raise ValueError - profile[bases[ix]] = p - except: - assert (False), \ - "%s is a bad probability value (%s line %d)" \ - % (fields[ix+1],codeName,lineNum) - - symToProfile[sym] = profile - codeNumUsed[codeNum] = True - symOrder += [sym] - - codeF.close () - - # sanity check - - assert (len(symToProfile) >= 1), \ - "%s contains no code vectors!" % codeName - - for sym in symToProfile: - p = sum([symToProfile[sym][nuc] for nuc in bases]) - assert (abs(p-1) < .00001), \ - "probabilities for %s sum to %f (in %s)" % (sym,p,codeName) - - symToProfile[".order"] = symOrder - - return symToProfile - - -def dna_event(s): - (s,p) = s.split(":",1) - if ("." 
in s): (s,which) = s.split(".",1) - else: which = None - assert (valid_dna_event(s)), "invalid DNA event: %s" % s - try: - p = float_or_fraction(p) - if (not (0 <= p <= 1)): raise ValueError - except ValueError: - assert (False), "invalid probability for %s: %s" % (s,p) - return (s,which,p) - - -def valid_dna_event(s): - if (len(s) == 0): - return False - if (len(s) == 1): - return (s in bases) - if (len(s) == 2): - return (s[0] in bases) and (s[1] in bases) - return False - - -def float_or_fraction(s): - if ("/" in s): - (n,d) = s.split("/",1) - return float(n)/float(d) - else: - return float(s) - - -def quantum_code_num(s): - if (len(s) == 0): - raise ValueError - if (len(s) == 1): - if (0x21 <= ord(s) <= 0x7E): return ord(s) - else: raise ValueError - if (len(s) == 2): - if (s == "00"): raise ValueError - try: return int(s,16) - except: raise ValueError - raise ValueError - - -if __name__ == "__main__": main() diff --git a/programs/parasight_v7.6/parasight.pl b/programs/parasight_v7.6/parasight.pl deleted file mode 100755 index fc56660..0000000 --- a/programs/parasight_v7.6/parasight.pl +++ /dev/null @@ -1,7379 +0,0 @@ -#!/usr/bin/perl - -use strict 'vars'; -#use bytes; - - -#USEFUL ENVIRONMENTAL VARIABLES# -#print "$^O\n"; -#print "$] \n"; -#print "$0 \n"; -#print "H$ENV{HOME} S\n"; -#print "P$ENV{PATH} S$ENV{SHELL}\n"; - -#LOAD MODULES ######################################################### -use Getopt::Long; -use Tk; -use Cwd ; -use Tk::Dialog; -use Tk::Balloon; -use Tk::BrowseEntry; - -####THINGS TO ADD/DO ############################################################# - -#1) A jump and mark function to search multiscreen views -#2) Change to drawing boxes instead of really thick lines for subjects and extras -# This should allow some cool stuff using Illustator effects such as lighting. 
- -############### PARASIGHT EVOLUTION (some of the puncuated events) ########################## -#030226 decomtaminate algorithm added to remove high copies over a certain range and all related pairwise. -#030122 added better help descriptions for options and more pod help documentation -#021207 fixed printing finally (how obtuse) -#021203 fixed SscaleC line issues (a little more intutitive now) -#021029 fixed precode issues -#021029 returns are usable now in text_text options -#020821 remove Storable module usage (versions incompatibities were the last straw) -#020411 add FILE option for precode so that it can work with Win32 limited command prompt -#020410 start version 7.2-- clean up for publication -#011026 added support for extracting sequences from files and by fastacmd -#011015 fixed subseq drawing error -#011014 added description array for every variable -#011014 added four execute commands that can be combined with columns -#011014 modified more variable names changing sub-> seq -#011002 added graph1 and graph2 option -#011002 fixed extra Left Click popup window -#011002 removed verbose output -#011002 removed defn from extra option--just seq begin end -#011002 added default template locations for unix -#011002 change sub_labelhit_col to sub_labelhit_col -#010822 added menu view of alignments -#010814 fix the option template--it didn't appear to be loading over the defaults -#010807 create my own pop up window--ballon help is slow and adding overhead -#010529 added extra level order changing and quick color -#010525 added quick color and level order changing for send to background and foreground -#010523 added template option to save menu -#010523 added large amounts of documenation and more popup help windows -#010523 change random subject color so that subject sequence will have consistant color -#010523 added searching through conditional coloring as requested by jules (better command processing) -#010522 added arrows to objects (technological 
breakthough!) -#010519 added color variation based on conditional pseudo-conditional statements calle hitcondition -#011002 changed save to be *.psa (alignment array) *.pse (extra table) and *.pso (options) -#010517 fixed sequence drawing so that numerical only sequence names can be used# -#011002 changed highlight color to yellow -#011002 added ability to specify arrangement using a file with seq and begin position -#010506 revamped option display to index cards -#010506 fixed seq1 and seq2 so that subject labeling is proper order -#010505 minor cosmetic changes and error fixes -#010505 add new options: -option, -die, and -precode -#010116 add program stats section for program_manager.pl -#001221 fixed -in when to stop alignments reloading on subsequent executions# -#001211 fixed a whole lot of little bugs# -#001011 added vertical scales -#001015 cleaned up interface and added new options - -####################################### -####################################### -#####hard-wired defaults############### -###################################### - - -#DECLARE GLOBAL VARIABLES####################################### -use vars qw($n1a $n2a $n1 $n2 $loaded $tmp $b1f $e1f $b2f $e2f); -use vars qw($x_max $x_min $opt_t $opt_m $margin $l1 $l2 %subscaleC @subscaleC); - -use vars qw($column_header_display); -use vars qw($first_pass $widest_line); -use vars qw($mw $balloon $ballooni $canvas $frame $fontsize $scale_x $scrolledcanvas $output $scale); -use vars qw($file %deleted_pairwise); -use vars qw(%acc %accsub @acc_order @acc_ordersub @l %lpos %msghash); -use vars qw(@m %mh @mheader $mstring); -use vars qw(@e %eh @eheader $estring); -use vars qw(@g1 @g2); -use vars qw(%pairwise2delete); -use vars qw(%opt %colheader %newopt $options %iinfo %optdesc); -use vars qw($filepath $optionpath ); -use vars qw($canvas_width $bp_per_pixel); - -########################################################### -############# PROGRAM DESCRIPTION ######################### 
-########################################################### - -use vars qw($program $pversion $pdescription $pgenerate $pusage); -$program = "$0"; -$program =~ s/^.*\///; -### program stats ### -$pversion='7.6'; -$pdescription = "$program (ver:$pversion) displays pairwise alignments and accompaning annontation in a wide variety of formats"; -$pgenerate= 'jeff: labmates: genetics: public:'; -$pusage="$program [-in file/-align file/-showseq file] [other data] [other options]"; -### program stats end ### - -########################################################### -############PRE-DEFAULTS################################### - -#template paths# -use vars qw($template_path $export_text_path); -$export_text_path='untitled.txt'; -$template_path='~/.PARASIGHT:/people/PARASIGHT'; -$template_path=~ s/~/$ENV{HOME}/g; - -#default show# -use vars qw($default_show); -$default_show='ALL'; - - - -########################################################## -#########################HELP############################# -########################################################## - -if (! defined $ARGV[0]) { -print "USAGE -$pusage -DESCRIPTION\n$pdescription - -MAIN DATA INPUT COMMANDS --in [filepath] load a saved parasight view (*.pso *.psa *.pse ...) --align [filepath1:filepath2:etc] load pairwise alignment table(s) --showseq [ALL | file | seqname(s):] names of sequences to display - no colon = load as file of names - colon(:) = parse as list of colon-delimited seq names -ADDITIONAL DATA INPUT COMMANDS --extra [filepath1:filepath2:etc] loads extra simple sequence feature(s) - (e.g. exons, introns, and repeats) --graph1 [filepath1:filepath2:etc] - graphs a set of values on a scale above the sequence at given positions - (e.g. 
moving windows such as %GC) --graph2 [filepath1:filepath2:etc] - adds another line to the graph scale -OPTIONAL COMMANDS --arrangeseq [oneperline | sameline | file:filepath] arranges sequences - file:filepath allows specific positions to be assigned to sequences --arrangesub [oneperline|stagger|subscaleS|subscaleV] (default stagger) - easier to manipulate from user interface then at command line --colorsub [NONE | RESET | seqrandom | hitrandom | hitcondition] --options ['opt1=>value1,opt2=>value2'] change any parasight option - (for on/off, yes/no or true/false options use 1 and 0 as input) - (e.g. 'canvas_width=>500,seq_tick_on=>1,-arrangeseq=>oneperline') --showsub [ALL|file|seqnames:] names of subjects to display - no colon = load as file of names - colon(:) = parse as list of colon-delimited seq names --template [filepath] load a template file containing options to apply --showseqqueryonly [switch] will only draw sequences in first (blast query) position --quiet [switch] decreases screen output -ADVANCED COMMANDS --minload [switch] loads only the pairwise relavant to current -showseq - (quicker when just certain sequences are needed from large files) --precode 'perl code commands to execute after first screen draw' --die parasight exits after executing precode -FULL DOCUMENTATION --help -HINT FOR BEGINNERS - GET YOUR DATA LOADED AND THEN MANIPULATE IT THROUGH THE GUI OPTIONS! 
-"; -exit; -} - - -###################################################################################################### -############################OPTION HANDLING ########################################################## -###################################################################################################### -if ( &GetOptions(\%newopt, "in=s", "align=s", "showseq=s","showsub=s","color=s",'colorsub=s', "extra=s","arrangeseq=s","arrangesub=s","options=s", - "template=s","graph1=s","graph2=s","showqueryonly","quiet","minload", "precode=s","die","help") ) { - print "Command line arguments parsed sucessfully\n" if !$newopt{'quiet'}; -} else { - die "\nCommand line arguments were not sucessfully parsed\n"; -} -###EXECUTE POD DOCMENTATION/LONG HELP #### -if ($newopt{'help'}) { - system "perldoc $0\n"; - exit; -} - - -if (!$newopt{'in'} && !$newopt{'align'} && !$newopt{'extra'} && !$newopt{'graph1'}) { - print "*******************************************************************************\n" if !$newopt{'quiet'}; - print "**WARNING: No major options (-in, -align,-graph or -extra) were provided! 
*****\n" if !$newopt{'quiet'}; - print "*******************************************************************************\n" if !$newopt{'quiet'}; - -} - -########################################################### -############ load options into from a template file ####### -############ loaded directly into %opt #################### -########################################################### - -$options.=''; -if ( $newopt{'template'} ) { - ###require file to end in .pst### - $newopt{'template'}=~s/\.pst//; - $newopt{'template'}.='.pst'; - my @dir=split /[:;] */,$template_path; - if (-e $newopt{'template'} ) { - &load_option_template($newopt{'template'}); - $newopt{'template'}=''; - } else { - ###check for and if found load from default directories# - foreach(@dir) { - my $p="$_/$newopt{'template'}"; - if ( -e $p) { - #load and clear template# - &load_option_template($p); - $newopt{'template'}=''; - last; - } - } - } - ##template should be blank if found# - if ($newopt{'template'} ne '') { - print "ERROR:Can't locate requested option template ($newopt{'template'})\n"; - foreach (@dir) { - opendir (DIR, "$_" ) || die "Can't read template directory ($_)!\n"; - my @templates=grep { /\.pst$/} readdir DIR; - print "***VALID templates to choose from in ($_) are: [", join (", ", @templates), "]\n"; - } - die "\n"; - } -} - -############################################# -######## parse command line options ######### -############################################# - -if ( $newopt{'options'} ) { - if ($newopt{'options'} !~ /=>/ ) { - ####add file loading with format option=>value### - open (OPTIONS, $newopt{'options'}) || die "Can't read ($newopt{'options'}) ($!)!\n"; - ###this had yet to be implemented### - $newopt{"options"}=''; - while () { - next if /^#/; - chomp; - s/,$//; - $newopt{'options'}.= "$_ ,"; - } - - } - $options.=$newopt{'options'}; - delete $newopt{'options'}; -} - -########################################## -###########parse other major options ##### - 
-$newopt{'showseq'}=$default_show if !defined $newopt{'showseq'} && !defined $newopt{'in'}; -$newopt{'showsub'}='ALL' if !defined $newopt{'showsub'} && !defined $newopt{'in'}; -if ($newopt{'arrangeseq'}=~/file:(.+)/ ) { - $newopt{'arrange_file'}=$1; - if (open (TEST, $newopt{'arrange_file'} ) ) { - close TEST; - $newopt{'arrangeseq'}='file'; - } else { - warn "-arrangeseq $newopt{'arrangeseq'} could not be opened\n"; - $newopt{'arrangeseq'}=''; - } -} -$newopt{'arrangeseq'}='oneperline' if !defined $newopt{'showsub'} && !defined $newopt{'in'} &&!defined $newopt{'template'}; -$newopt{'arrangesub'}='stagger' if !defined $newopt{'arrangesub'} && !defined $newopt{'in'}&&!defined $newopt{'template'}; -$newopt{'color'}='None' if !defined $newopt{'color'} && !defined $newopt{'in'}&&!defined $newopt{'template'}; -$newopt{'colorsub'}='None' if !defined $newopt{'colorsub'} && !defined $newopt{'in'}&&!defined $newopt{'template'}; - -################################################################ -#######PARSE NEW OPTIONS OUT INTO PROPER FORMAT################# -if ( $newopt{'arrangesub'} ) { - if ( $newopt{'arrangesub'} =~/(subscale[NC]+):([A-Za-z_0-9#]+):([\-0-9.:]+)$/ ) { - $newopt{'arrangesub'}=$1; - ($newopt{'sub_scale_col'},$newopt{'sub_scale_col2'})=split '#',$2; - #print "$1|$2|$3|$4|$5\n"; - ($newopt{'sub_scale_lines'},$newopt{'sub_scale_min'},$newopt{'sub_scale_max'},$newopt{'sub_scale_step'})= split ':',$3; - } - #print "$newopt{'sub_scale_min'},$newopt{'sub_scale_max'},$newopt{'sub_scale_lines'}\n" -} - - -############################################################# -####################LOAD OLD OPTIONS OR GET DEFAULTS ######## -if ($newopt{'in'}) { - $newopt{'in'}=~s/\.ps[aeo]?$//; - $newopt{'in'}.='.psa'; - &load_parasight_table( $newopt{'in'} ); - $opt{'align'}= $newopt{'align'}; - $opt{'extra'} = $newopt{'extra'}; - $opt{'graph1'} = $newopt{'graph1'}; - $opt{'graph2'} = $newopt{'graph2'}; -} - -################################################################ 
-#####general option built-in defaults and descriptions ######### -################################################################ -# any of these options will be overridden with -option command # -################################################################ -#### 1 begin is just a place holder to check for to make sure array is synced -#### 2 is name of the option for %opt array -#### 3 is the default value of the option -#### 4 is the description of the option for %optdesc array - -my @todefine=( -#'just_pairwise_regions' , 0 , 'THIS OPTION IS INACTIVATED AND NONFUNCTIONAL' , - - -'begin', 'alignment_col' , 0, '[integer] column for the first query (first sequence) in a parsed pairwise alignment. Blank/zero hides option from popup menu. The sequence will contain dashes for gaps.' , -'begin', 'alignment_col2' , 0, '[integer] column for subject (second pairwise position) sequence alignment. Blank/zero hides option from popup menu. The sequence will contain dashes for gaps.' , -'begin', 'alignment_wrap' , 50, '[integer] line width in aligned characters (bases/amino acids/dashes) for displaying any alignments' , -'begin', 'arrangeseq' , 'oneperline' , '[oneperline|sameline|file] determines the arrangement of sequences that are currently being shown with -showseq. Choices: oneperline = each sequence placed on a separate line; sameline = sequences are place one after the other on the same line; file = load a file with exact positions in terms of line number and base position within the colorsub_hitcond_tests variable' , -'begin', 'arrangesub' , 'stagger' , '[stagger|oneperline|subscaleC|subscaleN] basic' , -'begin', 'arrangesub_stagger_spacing' , 40000 , '[integer] bases of spacing between for sequences placed on the same sub line. Sequences separated by less than this distance from each other will be placed on separate sub lines). This option is useful for providing space for a label.' 
, -'begin', 'canvas_bpwidth' , 250000 , '[integer] number of bases that the width of screen represents (not including indentations). This is the number of bases per line across' , -'begin', 'canvas_indent_left' , 60 , '[integer] pixels to indent from the left-side of screen window image (the drawing areas is the canvas in Tk)' , -'begin', 'canvas_indent_right' , 30 , '[integer] pixels to indent from the right-side of screen window image (the drawing areas is the canvas in Tk)' , -'begin', 'canvas_indent_top' , 40 , '[integer] pixels to indent from top of screen before drawing sequence lines (it does not take into account graphs or extras) (the drawing areas is the canvas in Tk)' , -'begin', 'color' , 'None' , '(not implemented yet)' , -'begin', 'colorsub' , 'None' , '[NONE|RESET|hitrandom|seqrandom|hitconditional] Choices for coloring subs: NONE=no coloring routines; RESET=clear all assigned colors to pairwise; hitrandom=randomly color each hit/pairwise a different color; seqrandom=randomly color each defined seequence; hitconditional=color each hit based on pseudo-perl if than statements found in the varaible' , -'begin', 'colorsub_hitcond_col' , 34 , '[integer] column against which to test conditional statements in pairwise data (does not work on extra items or graphs)' , -'begin', 'colorsub_hitcond_tests' , 'red if <2; orange if <0.99; yellow if <0.98; green if <0.97; blue if <0.96; purple if <0.95; brown if <0.94; grey if <0.93; black if <0.92; pink if <0.91' , '[fake code] conditional statements to color pairwise hits based on the values in the column colorsub_hitcond_col (format for tests: color [= or < or >] value; )' , -'begin', 'execute' , '', '[external system command] to execute on Control-Shift-Click Left Button' , -'begin', 'execute2' , '', '[external system command] to execute on Control-Shift-Click Middle Button' , -'begin', 'execute2_array' , 'm', '[e|m] extra or pairwise array to use in execute2 command' , -'begin', 'execute2_desc', '','[text] 
description to display in right-click menu for execute2 command', -'begin', 'execute3' , '', '[external system command] to execute on Control-Shift-Click Right Button' , -'begin', 'execute3_array' , 'm', '[e|m] extra or pairwise array to use in execute3 command' , -'begin', 'execute3_desc', 'widget','[text] description to display in right-click menu for execute3 command', -'begin', 'execute4' , '', '[external system command] to execute from within right-click menu only' , -'begin', 'execute4_array' , 'm', '[e|m] extra or pairwise array to use in execute4 command' , -'begin', 'execute4_desc', '','[text] description to display in right-click menu for execute command', -'begin', 'execute_array' , 'e', '[e|m] extra or pairwise array to use in execute command' , -'begin', 'execute_desc', '','[text] description to display in right-click menu for execute command', -'begin', 'extra_arrow_diag' , 5 , '[integer] distance from point of arrow to wing/elbow of arrow' , -'begin', 'extra_arrow_on' , 1 , '[0|1] toggles arrows for extras off and on' , -'begin', 'extra_arrow_para' , 5 , '[integer] pixel distance from point of arrow along the line' , -'begin', 'extra_arrow_perp' , 4 , '[integer] pixel distance from base on line to wing of arrow' , -'begin', 'extra_color' , 'purple' , '[color] default for extra object' , -'begin', 'extra_label_col' , 10 , '[integer] column to take values to use for extra labels' , -'begin', 'extra_label_col_pattern' , '' , '[regular expression] pattern to match (and extract via parentheses) replacing current value. Allows display of only part of the data found in a column.' 
, -'begin', 'extra_label_color' , 'purple' , '[color] default for the labels of extra objects' , -'begin', 'extra_label_fontsize' , 6 , '[integer] font size (in points) the labels of extra objects' , -'begin', 'extra_label_offset' , 2 , '[integer] horizontal offset for extra labels (left is negative, right is positive)', -'begin', 'extra_label_on' , 1 , '[0|1] toggles the text label for extra objects off and on' , -'begin', 'extra_label_test_col' , '' , '[integer] column to test for a pattern--if pattern matched then extra not drawn' , -'begin', 'extra_label_test_pattern' , '' , '[regular expression] pattern to match in order to NOT draw the matching extra object' , -'begin', 'extra_offset' , -4 , '[integer] default vertical offset of extra object (negative = up; positive = down)' , -'begin', 'extra_on' , 1 , '[0|1] toggles off and on the display of all extras' , -'begin', 'extra_width' , 6 , '[integer] default width (horizontal thickness of extra object' , -'begin', 'fasta_blastdb' , 'htg:nt', '[database names] for sequence fastacmd lookups ', -'begin', 'fasta_directory' , '.:fastax', '[directories] to search for fasta files corresponding to sequence names in order to extract subsequences on command (names of files must be same as names of sequences)' , -'begin', 'fasta_fragsize' , 400000, '[integer] fragment size for sequences in fasta directory. Useful for quick lookups in long sequences like chromosomes. If this is non-zero than fragments of files are searched for in the fasta_directory (nomenclature of fragmented files end with _###, e.g. 
chr1_000, chr1_001, etc.)' , -'begin', 'fasta_on', 1, '[0|1] off|on turns fasta extraction on and off', -'begin', 'fasta_wrap', 50, '[integer] line width in characters for fasta files created', -'begin', 'filename_color' , 'grey' , '[color] of text label for the filename' , -'begin', 'filename_offset' , -10 , '[integer] vertical offset of text label for filename (up is negative, down is positive)' , -'begin', 'filename_offset_h' , 0 , '[integer] horizontal offset of text label for filename (left is negative, right is positive)' , -'begin', 'filename_on' , 1 , '[0|1] toggle off and on display of designated filename/parasight name (initially defined by -in if empty)' , -'begin', 'filename_pattern' , '' , '[regular expression] pattern to match in the filename. Useful for removing the path. (Although if using graphical interface, it is easier to change the filename.) ' , -'begin', 'filename_size' , 10 , '[integer] point size of text label shown for the filename' , -'begin', 'filter1_col' , '', '[integer] column that contains data with which to filter pairwise' , -'begin', 'filter1_max' , '' , '[float] limit for value in filter1_col above which pairwise are NOT drawn' , -'begin', 'filter1_min' , '' , '[float] limit for value in filter1_col below which pairwise are NOT drawn' , -'begin', 'filter2_col' , '', '[integer] column that contains data with which to filter pairwise' , -'begin', 'filter2_max' , '' , '[float] limit for value in filter2_col above which pairwise are NOT drawn' , -'begin', 'filter2_min' , '' , '[float] limit for value in filter2_col below which pairwise are NOT drawn' , -'begin', 'filterextra1_col' , '', '[integer] column number that contains data with which to filter extra objects--sequences are removed before arrange functions are executed' , -'begin', 'filterextra1_max' , '' , '[float] limit for value in filterextra1_col above which extras are NOT drawn' , -'begin', 'filterextra1_min' , '' , '[float] limit for value in filterextra1_col below which 
extras are NOT drawn' , -'begin', 'filterextra2_col' , '', '[integer] column number that contains data with which to filter extras' , -'begin', 'filterextra2_max' , '' , '[float] limit for value in filterextra2_col above which extras are NOT drawn' , -'begin', 'filterextra2_min' , '' , '[float] limit for value in filterextra2_col below which extras are NOT drawn' , -'begin', 'filterpre1_col' , '' , '[integer] column that contains data with which to prefilter pairwise--prefiltering removes pairwise before any arranging (normal filtering removes pairwise after filtering)' , -'begin', 'filterpre1_max' , '' , '[float] limit for value in filterpre1_col above which pairwise are NOT drawn or arranged' , -'begin', 'filterpre1_min' , '' , '[float] limit for value in filterpre1_col below which pairwise are NOT drawn or arranged' , -'begin', 'filterpre2_col' , '' , '[integer] column that contains data with which to prefilter pairwise—prefilter removes pairwise before any arranging' , -'begin', 'filterpre2_max' , '' , '[float] limit for value in filterpre2_col above which pairwise are NOT drawn or arranged' , -'begin', 'filterpre2_min' , '' , '[float] limit for value in filterpre2_col below which pairwise are NOT drawn or arranged' , -'begin', 'gif_anchor' , 'center' , '[center|nw|ne|sw|se|e|w|n] positioning of background gif relative to draw point gif_x and gif_y' , -'begin', 'gif_on' , 0 , 'displays a gif image in background (the image will not print out in postscript)' , -'begin', 'gif_path' , '' , '[file path] of gif image to display in background--image does not make it into the Postscript file file' , -'begin', 'gif_x' , int($opt{'window_width'}/2) , '[integer] background picture pixel x coordinate position (top of image is zero)' , -'begin', 'gif_y' , 0 , '[integer] background gif y coordinate position (0 is top of screen)' , -'begin', 'graph1_label_color' , 'blue' , '[color] for graph1 labels (left side axis)' , -'begin', 'graph1_label_decimal' , 2 , '[integer] number 
of decimal points to round graph1 labels (left side axis)' , -'begin', 'graph1_label_fontsize' , 10 , '[integer] point size of graph1 labels (left side axis)' , -'begin', 'graph1_label_multiplier' , 1 , '[float] multiplier for graph1 labels (left side axis)' , -'begin', 'graph1_label_offset' , 1 , '[integer] horizontal offset for graph1 labels (left side axis)' , -'begin', 'graph1_label_on' , 1 , '[0|1] toggles on labels for graph1 scale (left side axis)' , -'begin', 'graph1_line_color' , 'blue' , '[color] for graph1 connecting lines' , -'begin', 'graph1_line_on' , 1 , '[0|1] toggles graph1 connecting line off and on' , -'begin', 'graph1_line_smooth' , 0 , '[0|1] toggles on and off smoothing function for connecting line', -'begin', 'graph1_line_width' , 1 , '[integer] width for graph1 connecting line' , -'begin', 'graph1_max' , 100 , '[integer] maximum value of graph1 scale' , -'begin', 'graph1_min' , -5 , '[integer] minimum value of graph1 scale' , -'begin', 'graph1_on' , 0 , '[0|1] toggles off and on graph1' , -'begin', 'graph1_point_fill_color' , 'blue' , '[color] to fill points with for graph1' , -'begin', 'graph1_point_on' , 1 , '[0|1] toggles point drawing on and off for graph1' , -'begin', 'graph1_point_outline_color' , 'blue' , '[color] to outline point with for graph1' , -'begin', 'graph1_point_outline_width' , 1 , '[integer] thickness of point outline for graph1' , -'begin', 'graph1_point_size' , 2 , '[integer] pixel radius size for drawing graph1 points' , -'begin', 'graph1_tick_color' , 'black' , '[color] of tick marks for graph1 scale' , -'begin', 'graph1_tick_length' , 6 , '[integer] length of tick marks for graph1 scale' , -'begin', 'graph1_tick_offset' , 1 , '[integer] horizontal offset of tick marks for graph1 scale' , -'begin', 'graph1_tick_on' , 1 , '[0|1] toggles tick marks for graph1 scale off and on' , -'begin', 'graph1_tick_width' , 3 , '[integer] thickness of tick marks for graph1 scale' , -'begin', 'graph1_vline_color' , 'black' , '[color] 
of vertical line for graph1 scale on left' , -'begin', 'graph1_vline_on' , 1 , '[0|1} toggles on and off vertical line for graph1 scale on left' , -'begin', 'graph1_vline_width' , 2 , '[integer] vertical line width for graph1 scale on left' , -'begin', 'graph2_label_color' , 'red' , '[color] of graph2 scale labels' , -'begin', 'graph2_label_decimal' , 2 , '[integer] number of decimal point to round graph2 scale label' , -'begin', 'graph2_label_fontsize' , 10 , '[integer] point size of graph2 scale labels' , -'begin', 'graph2_label_multiplier' , 1 , '[float] graph2 scale label multiplier' , -'begin', 'graph2_label_offset' , 8 , '[integer] horizontal offset of graph2 scale labels' , -'begin', 'graph2_label_on' , 1 , '[0|1] toggles graph2 scale labels off and n' , -'begin', 'graph2_line_color' , 'red' , '[color] of graph2 connecting lines' , -'begin', 'graph2_line_on' , 1 , '[0|1] toggles graph2 connecting lines off and on' , -'begin', 'graph2_line_smooth' , 0 , '[0|1] toggles graph2 connecting line smoothing off and on', -'begin', 'graph2_line_width' , 1 , '[integer] thickness of graph2 connecting lines' , -'begin', 'graph2_max' , 1000 , '[integer] maximum value for graph2 scale' , -'begin', 'graph2_min' , -1000 , '[integer] minimum value for graph2 scale' , -'begin', 'graph2_on' , 0 , '[0|1] toggles graph2_on' , -'begin', 'graph2_point_fill_color' , 'red' , '[color] of interior of graph2 points' , -'begin', 'graph2_point_on' , 1 , '[0|1] toggles graph2 point drawing on and off' , -'begin', 'graph2_point_outline_color' , 'red' , '[color] of graph2 point outline' , -'begin', 'graph2_point_outline_width' , 1 , '[integer] thickness of graph2 point outline' , -'begin', 'graph2_point_size' , 2 , '[integer] radius size of graph 2 points' , -'begin', 'graph2_tick_color' , 'black' , '[color] of graph2 vertical scale ticks' , -'begin', 'graph2_tick_length' , 6 , '[integer] length of graph2 vertical scale ticks' , -'begin', 'graph2_tick_offset' , 5 , '[integer] horizontal 
offset of graph2 vertical scale ticks' , -'begin', 'graph2_tick_on' , 1 , '[0|1] toggles graph2 vertical scale ticks on and off' , -'begin', 'graph2_tick_width' , 3 , '[integer] thickness of graph2 vertical scale ticks' , -'begin', 'graph2_vline_color' , 'black' , '[color] of graph2 vertical scale line' , -'begin', 'graph2_vline_on' , 1 , '[0|1] toggles graph2 vertical scale line off and on' , -'begin', 'graph2_vline_width' , 2 , '[integer] thickness of graph2 vertical scale line' , -'begin', 'graph_scale_height' , 80 , '[integer] pixel height of shared graph scale' , -'begin', 'graph_scale_hline_color', 'black' , '[color] of horizontal shared graph scale lines' , -'begin', 'graph_scale_hline_on' , 1 , '[0|1] toggles off and on the shared horizontal interval lines of the graph scales' , -'begin', 'graph_scale_hline_width' , 1 , '[integer] width of shared horizontal shared graph scale lines' , -'begin', 'graph_scale_indent' , -20 , '[integer] indentation for placing gscale above (or even below) the sequence line' , -'begin', 'graph_scale_interval' , 4 , '[integer] number of intervals' , -'begin', 'graph_scale_on' , 0 , '[0|1] toggles off and on the graph scales' , -'begin', 'help_on', 1, '[0|1] toggles off and on the popup help messages', -'begin', 'help_wrap', 50, '[integer] line width in characters for popup help menus', -'begin', 'mark_advanced' , '' , "code for an advanced marking algorithm. Allowing for more complex searches. Data foreach pair or extra is accessed using an array reference \$c. Therefore to access column 4 \$\$c[4] would work." 
, -'begin', 'mark_array' , 'm', '[e|m] default array to search (m is alignment/e is extra)(m is historical)' , -'begin', 'mark_col' , '' , '[integer] column to search for given pattern in order to mark matches with a color' , -'begin', 'mark_col2' , '' , '[integer] second column to search for pattern in order to mark matches with a color' , -'begin', 'mark_color' , 'red' , '[color] to mark objects with' , -'begin', 'mark_pairs' , 0 , '[0|1] toggles the coloring/marking of sub(jects) off and on' , -'begin', 'mark_pattern' , 'AC002038' , '[regular expression] pattern to search for with mark/find button' , -'begin', 'mark_permanent' , 0 , '[0|1] toggles on and off changing the color of objects permanently (if not permanent then on redraw colors will be erased' , -'begin', 'mark_subs' , 1 , '[0|1] toggles the coloring/marking of sub(jects) off and on' , -'begin', 'pair_inter_color' , 'red' , '[color] default of inter pairwise and connecting lines' , -'begin', 'pair_inter_line_on' , 0 , '[0|1] toggles off and on the connecting lines between inter pairwise alignments' , -'begin', 'pair_inter_offset' , 0 , '[integer] default offset from sequence line of inter pairwise (up is negative, down is positive)' , -'begin', 'pair_inter_on' , 1 , '[0|1] toggles off and on the inter pairwise alignments normally drawn on top of sequence line' , -'begin', 'pair_inter_width' , 13 , '[integer] width of inter pairwise' , -'begin', 'pair_intra_color' , 'blue' , '[color] default of intra pairwise and connecting lines' , -'begin', 'pair_intra_line_on' , 0 , '[0|1] toggles connecting lines between intra pairwise off and on' , -'begin', 'pair_intra_offset' , 0 , '[integer] default offset from seuqence' , -'begin', 'pair_intra_on' , 1 , '[0|1] toggles off and on the intra pairwise' , -'begin', 'pair_intra_width' , 9 , '[integer] width of intra pairwise' , -'begin', 'pair_level' , 'NONE' , '[NONE|inter_over_intra|intra_over_inter] determines which pairwise type appears above the other--NONE 
leaves the appearance to the order of the pairwise in the inputted alignment or parasight.psa table' , -'begin', 'pair_type_col' , '' , '[integer] column number to determine pairwise type for sequence 1, which is checked against sequence 2. If match then intra if no match then inter. (Useful on sequence names that contain chromosome assignment.)' , -'begin', 'pair_type_col2' , '' , '[integer] column to determine pairwise type for sequence 2 in row ' , -'begin', 'pair_type_col2_pattern' , '' , '[regular expression] to extract pairwise type determing value with parentheses' , -'begin', 'pair_type_col_pattern' , '' , '[regular expression] to extract pairwise type determining value with parentheses' , -'begin', 'popup_format' , 'text' , '[text|number] determines whether column numbers or text headers are shown in popup window' , -'begin', 'popup_max_len' , 300 , '[integer] character length for fields in the popup menu (allows long definitions or sequences be excluded)' , -'begin', 'print_command' , 'lpr -P Rainbow {}' , '[string] print command with brackets {} representing file name. This is a system command executed to drive a printer. I have never been able to get DOS to work. This is setup for Unix on our system. Rainbow is our color printer name. 
It will fail in MSWin' , -'begin', 'print_multipages_high' , 1 , '[integer] height in number of pages for the print/postscript all command' , -'begin', 'print_multipages_wide' , 1 , '[integer] width in number of pages for print/postscript all command' , -'begin', 'printer_page_length' , '11i' , '[special] physical page length (longest dimension of paper) in inches for printer (requires number followed by units with i=inches or c=cm)' , -'begin', 'printer_page_orientation' , 1 , '[0|1] toggles printer page orientation (1=landscape 0=portrait)' , -'begin', 'printer_page_width' , '8i' , '[special] physical page width in inches for printer (requires number followed by units i=inches or c=cm)' , -'begin', 'quick_color' , 'purple' , '[color] for the quick color function Shift-Button3 and Shift-Double Click Button3' , -'begin', 'seq_color' , 'black' , '[color] of sequence (All sequences take this color. There is currently no way to color sequences individually.)' , -'begin', 'seq_label_color' , 'black' , '[color] of sequence name text' , -'begin', 'seq_label_fontsize' , 12 , '[integer] font size (in points) for all sequence names' , -'begin', 'seq_label_offset' , -4 , '[integer] vertical offset of sequence names (up is negative, down is positive)' , -'begin', 'seq_label_offset_h' , 0 , '[integer] horizontal offset of sequence names' , -'begin', 'seq_label_on' , 1 , '[0|1] toggles off and on the display of sequence name labels' , -'begin', 'seq_label_pattern' , '' , '[regular expression] to match in sequence name for display purposes--parentheses must be used to denote the part of match to display' , -'begin', 'seq_line_spacing_btwn' , 250 , '[integer] pixels to separate sequence lines from each other (roughly equivalent to spacing between text paragraphs if you consider a wrapping line of sequences to be a paragraph)' , -'begin', 'seq_line_spacing_wrap' , 200 , '[integer] pixels to space between a wrapping line of sequences (roughly equivaelent to spacing between the 
lines within a text paragraph)' , -'begin', 'seq_spacing_btwn_sequences' , 10000 , '[integer] bases to separate sequences drawn within the same line (roughly equivalent to spacing between words of a text paragraph)' , -'begin', 'seq_tick_b_color' , 'black' , '[color] for begin tick marks' , -'begin', 'seq_tick_b_label_anchor' , 'ne' , '[center|n|w|s|e|nw|ne|sw|se] anchor point for begin tick mark labels' , -'begin', 'seq_tick_b_label_color' , 'black' , '[valid color] of tick mark label at the beginning of sequence' , -'begin', 'seq_tick_b_label_fontsize' , 9 , '[integer] font size (in points) for label at beginning of sequence' , -'begin', 'seq_tick_b_label_multiplier' , 0.001 , '[float] scaling factor for begin tick mark labels' , -'begin', 'seq_tick_b_label_offset' , 2 , '[integer] vertical offset for begin tick mark label' , -'begin', 'seq_tick_b_label_offset_h' , 0 , '[integer] horizontal offset for begin tick mark labels' , -'begin', 'seq_tick_b_label_on' , 1 , '[0|1] toggles off and on the beginning tick mark labels' , -'begin', 'seq_tick_b_length' , 10 , '[integer] length of begin tick marks' , -'begin', 'seq_tick_b_offset' , 0 , '[integer] vertical offset for begin tick marks' , -'begin', 'seq_tick_b_on' , 1 , '[0|1] toggles off and on the begin tick marks' , -'begin', 'seq_tick_b_width' , 2 , '[integer] width of begin tick marks' , -'begin', 'seq_tick_bp' , 20000 , '[integer] tick mark interval' , -'begin', 'seq_tick_color' , 'black' , '[color] of interval tick marks' , -'begin', 'seq_tick_e_color' , 'black' , '[valid color] for end tick marks' , -'begin', 'seq_tick_e_label_anchor' , 'nw' , '[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels' , -'begin', 'seq_tick_e_label_color' , 'black' , '[valid color] for end tick mark labels' , -'begin', 'seq_tick_e_label_fontsize' , 9 , '[integer] font size (in points) for end tick mark labels' , -'begin', 'seq_tick_e_label_multiplier' , 0.001 , '[float] scaling factor for end tick mark labels' , 
-'begin', 'seq_tick_e_label_offset' , 2 , '[integer] vertical offset for end tick mark labels' , -'begin', 'seq_tick_e_label_offset_h' , 0 , '[integer] horizontal offset for end tick mark labels' , -'begin', 'seq_tick_e_label_on' , 1 , '[0|1] toggles end tick labels off and on' , -'begin', 'seq_tick_e_length' , 10 , '[integer] length of end tick marks' , -'begin', 'seq_tick_e_offset' , 0 , '[integer] vertical offset for ending tick marks' , -'begin', 'seq_tick_e_on' , 1 , '[0|1] toggles off and on the ending tick marks' , -'begin', 'seq_tick_e_width' , 2 , '[integer] width of end tick marks' , -'begin', 'seq_tick_label_anchor' , 'n' , '[center|n|s|w|e|nw|sw|ne|se] anchor of text from tick mark draw point' , -'begin', 'seq_tick_label_color' , 'black' , '[color] for interval tick mark' , -'begin', 'seq_tick_label_fontsize' , 9 , '[integer] font size (in points) for interval tick mark label' , -'begin', 'seq_tick_label_multiplier' , 0.001 , '[float] scaling factor for the interval tick label' , -'begin', 'seq_tick_label_offset' , 2 , '[integer] vertical offset of sequence interval tick mark labels' , -'begin', 'seq_tick_label_on' , 1 , '[0|1] toggles off and on the interval tick labels' , -'begin', 'seq_tick_length' , 10 , '[integer] length of interval tick marks' , -'begin', 'seq_tick_offset' , 0 , '[integer] vertical offset for interval tick marks' , -'begin', 'seq_tick_on' , 1 , '[0|1] toggles off and on the interval sequence tick marks' , -'begin', 'seq_tick_whole', 0, '[0|1] toggles whether numbering is for each individual sequence (0) or continious across multiple accession on same line (useful when analyzing chromosomes in multiple fragments)' , -'begin', 'seq_tick_width' , 2 , '[integer] width of interval tick marks' , -'begin', 'seq_width' , 3 , '[integer] width of sequence line' , -'begin', 'showqueryonly' , 0 , '[0|1] toggles the display of just the first sequence in a pairwise data (i.e.first column in an alignment file). 
For most parsing this is equivalent to the Blast query position' , -'begin', 'sub_arrow_diag' , 5 , '[integer] distance between arrow point to wing/edge of arrow' , -'begin', 'sub_arrow_on' , 0 , '[0|1] toggles off and on the directional/orientation arrows for subjects' , -'begin', 'sub_arrow_paral' , 5 , '[integer] distance between arrow point to base of arrow' , -'begin', 'sub_arrow_perp' , 4 , '[integer] distance from base end to wing tip of arrow ' , -'begin', 'sub_color' , 'lightgreen' , '[color] default of sub(ject) objects (all other coloring schemes over ride default)' , -'begin', 'sub_initoffset' , 30 , '[integer] pixel indent from top of subscales to associated sequence line (increasing pushes scales further below associated sequence)' , -'begin', 'sub_labelhit_col' , 13 , '[integer] column to use for labeling each hit/pairwise (label will be drawn at beginning of each hit sub)' , -'begin', 'sub_labelhit_color' , 'black' , 'color of pairwise hit label text' , -'begin', 'sub_labelhit_offset' , 0 , '[integer] horizontal offset for hit label' , -'begin', 'sub_labelhit_on' , 0 , '[0|1] turns on individual labeling of each pairwise hit' , -'begin', 'sub_labelhit_pattern' , '0?([0-9.]{4})' , '[regular expression] to match in data from column' , -'begin', 'sub_labelhit_size' , 9 , '[integer] font size (in points) for hit label' , -'begin', 'sub_labelseq_col' , 0 , '[integer] column to use for the beginning sub label' , -'begin', 'sub_labelseq_col2' , 4 , '[integer] column for second position sequence in alignment table pairwise row' , -'begin', 'sub_labelseq_col2_pattern' , '' , '[regular expression] pattern to match in data from sub label sequence column 2' , -'begin', 'sub_labelseq_col_pattern' ,'', '[regular expression] pattern to match in data from sub label sequence column (use parenthesis to denote data within the match to display)' , -'begin', 'sub_labelseq_color' , 'black' , '[color] of text label for sub objects' , -'begin', 'sub_labelseq_offset' , 0 , 
'[integer] horizontal offset label' , -'begin', 'sub_labelseq_on' , 1 , '[0|1] toggles overall begin sequence label for sub(ject) label off and on' , -'begin', 'sub_labelseq_size' , 6 , '[integer] font size (in points) for begin label sequence' , -'begin', 'sub_labelseqe_col' , 4 , '[integer] column to use for the end subject label' , -'begin', 'sub_labelseqe_col2' , 0 , '[integer] column for second position in alignment table pairwise row' , -'begin', 'sub_labelseqe_col2_pattern' , '' , '[regular expression] pattern to match in data from column' , -'begin', 'sub_labelseqe_col_pattern' , '' , '[regular expression] pattern to match in data from column' , -'begin', 'sub_labelseqe_color' , 'black' , '[valid color] of label text' , -'begin', 'sub_labelseqe_offset' , 0 , '[integer] horizontal offset for label' , -'begin', 'sub_labelseqe_on' , 0 , '[0|1] toggles off and on the overall sub(ject) label at end of last hit/pairwise' , -'begin', 'sub_labelseqe_size' , 6 , '[integer] font size (in points) for end subject label' , -'begin', 'sub_line_spacing' , 9 , '[integer] pixels per line determining the spacing between subs placed on different lines' , -'begin', 'sub_on' , 1 , '[0|1] toggles sub(ject) display off and on (these are the pairwise representations drawn below the sequence line) For BLAST searches these traditionally represent the subject sequences found in a database search.' 
, -'begin', 'sub_scale_categoric_string' , '' , '[string] list of comma delimited category names' , -'begin', 'sub_scale_col' , '' , '[integer] column for value to arrange pairwise hit on sub scale (subscale)' , -'begin', 'sub_scale_col2' , '' , '[integer] column for second position sequence in alignment pairwise (only used if defined)' , -'begin', 'sub_scale_col2_pattern' , '' , '[regular expression] pattern to match in column 2' , -'begin', 'sub_scale_col_pattern' , '' , '[regular expression] pattern to match in column' , -'begin', 'sub_scale_hline_color' , 'grey' , '[valid color] for horizontal sub scale lines' , -'begin', 'sub_scale_hline_on' , 1 , '[0|1] toggles off and on the horizontal scale lines for sub scale' , -'begin', 'sub_scale_hline_width' , 1 , '[integer] width of horizontal sub scale lines' , -'begin', 'sub_scale_label_color' , 'black' , '[color] for sub scale axis label' , -'begin', 'sub_scale_label_fontsize' , 12 , '[integer] font size (in points) for sub scale axis label' , -'begin', 'sub_scale_label_multiplier' , 100 , '[integer] multiplication factor for sub scale label' , -'begin', 'sub_scale_label_offset' , 1 , '[integer] horizontal offset for sub scale axis tick marks' , -'begin', 'sub_scale_label_on' , 1 , '[0|1] toggles off and on sub scale axis tick mark labels' , -'begin', 'sub_scale_label_pattern' , '' , '[regular expression] pattern to match in sub scale label' , -'begin', 'sub_scale_lines' , 10 , '[integer] number of lines (or interval steps) to plot for stagger or cscale (automatically set for subscaleC)' , -'begin', 'sub_scale_max' , 1.00 , '[float] maximum value to place on the sub scale (automatically set for subscaleC)' , -'begin', 'sub_scale_min' , 0.80 , '[float] minimum value to place on the sub scale (automatically set for subscaleC)' , -'begin', 'sub_scale_on' , 0 , '[0|1] toggles sub scale on and off' , -'begin', 'sub_scale_step' , 0.01 , '[float] value to increment between each step (automatically set to -1 for subscaleC, 
1 reverses subscaleC)' , -'begin', 'sub_scale_tick_color' , 'black' , '[color] for sub scale axis tick marks' , -'begin', 'sub_scale_tick_length' , 9 , '[integer] length of sub axis tick marks' , -'begin', 'sub_scale_tick_offset' , 4 , '[integer] offset of sub scale axis tick marks' , -'begin', 'sub_scale_tick_on' , 1 , '[0|1] toggles off and on the sub scale axis at horizontal tick positions' , -'begin', 'sub_scale_tick_width' , 3 , '[integer] width of sub scale axis tick marks' , -'begin', 'sub_scale_vline_color' , 'black' , '[color] for vertical axis line of sub scale' , -'begin', 'sub_scale_vline_offset' , -5 , '[integer] horizontal offset for subject axis line' , -'begin', 'sub_scale_vline_on' , 1 , '[0|1] toggles off and on the vertical axis line for sub scale' , -'begin', 'sub_scale_vline_width' , 2 , '[integer] width of sub scale axis line' , -'begin', 'sub_width' , 8 , '[integer] default width (thickness) of sub objects' , -'begin', 'template_desc_on' , 1 , '[0|1] toggles off and on wether descriptions, such as this one, are saved in a template file with each option variable' , -'begin', 'text2_anchor' , 'nw' , '[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels' , -'begin', 'text2_color' , 'red' , '[color] for end tick mark labels' , -'begin', 'text2_offset' , 0 , '[integer] vertical offset for end tick mark labels' , -'begin', 'text2_offset_h' , 0 , '[integer] horizontal offset for end tick mark labels' , -'begin', 'text2_on' , 1 , '[0|1] toggles end tick labels off and on' , -'begin', 'text2_size' , 20 , '[integer] font size (in points) for end tick mark labels' , -'begin', 'text2_text' , '' , '[text] to display within a parasight view (useful for automation)' , -'begin', 'text_anchor' , 'nw' , '[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels' , -'begin', 'text_color' , 'red' , '[color] for end tick mark labels' , -'begin', 'text_fontsize' , 20 , '[integer] font size (in points) for end tick mark labels' , -'begin', 
'text_offset' , 0 , '[integer] vertical offset for end tick mark labels' , -'begin', 'text_offset_h' , 0 , '[integer] horizontal offset for end tick mark labels' , -'begin', 'text_on' , 1 , '[0|1] toggles end tick labels off and on' , -'begin', 'text_text' , '' , '[text] to display within a parasight view (useful for automation)' , -'begin', 'window_font_size' , 9 , '[integer] font size for parasight in general (not implemented)' , -'begin', 'window_height', 550 , '[integer] pixel height of main window on the initial start up' , -'begin', 'window_width' , 800 , '[integer] pixel width of the main window on the initial start up' , -# -); - -for(my $i=0; $i<@todefine; $i+=4) { - my ($b,$o,$v,$d)=@todefine[$i..$i+3]; - die "Error in Option Array ($o,$v,$d,$b) $b ne begin !\n" if $b ne 'begin'; - $opt{$o} = $v if !defined $opt{$o}; - $optdesc{$o}=$d; -} - - -if ($options) { - #print "$options\n"; - my @tmparray= split / *[=>,]+ */,$options; - my %moreoptions=@tmparray; - #print keys %moreoptions, "\n"; - foreach my $k (keys %moreoptions) { - my $clean = substr($k,1); - if (!defined $opt{$clean} ) { - print "VALID OPTIONS are: "; - foreach (sort keys %opt) { print "-$_ ";} - print "\nBAD OPTION: -$clean is an invalid option for parasight.\nValid options are listed above.\n"; - exit; - } - #print "OPTIONS:$k\n"; - $opt{$clean}=$moreoptions{$k}; - } - - -} - -%opt = (%opt,%newopt); - -warn "Newline character not tolerated in -text_text.\nUse \\\\n if entering to get \\n n the input!\n" if $opt{'text_text'} =~/\n/mg; - -################################################################### -################################################################### -############################ GUI CREATION ######################### -################################################################### - -###########################MAIN WINDOW ############################ -################################################################### - $mw=MainWindow->new; - my $cwd=cwd(); - 
$optionpath="$cwd/"; - $filepath=''; - $filepath=$opt{'in'}; - $filepath="$cwd/" if !$filepath; - if ($opt{'in'} ) { - $mw->title("PARASIGHT: $opt{'in'}"); - } else { - $mw->title("PARASIGHT: New"); - } - $mw->setPalette('lightgrey'); - $mw->configure(-background=>'darkgrey'); - $balloon=$mw->Balloon(-initwait =>600, -background => '#ffff9d', -font=>"Courier 10" ); - - #####frame ####### - $frame = $mw->Frame(-relief => 'groove', -bd => 2); - - - $tmp=$frame->Menubutton(-text => "File", - -menuitems=> [ - #['command' => "Load Parasight", - # -command => sub{ - # my ($dir,$name); - # $name=$filepath; - # ($dir,$name)= ($1,$2) if $filepath=~ /^(.*)\/(.*)$/; - # my @filetypes= (['parasight', '.psa'],['All Files', '*']); - # my $file = $mw->getOpenFile(-title=> 'LOAD PARASIGHT FILES', -filetypes => \@filetypes, - # -initialdir=>$dir, -initialfile=>$name); - # if ($file eq "") { return; } - # $filepath=$file; - # &load_parasight_table($filepath); - # $opt{'in'}=$filepath; - ##} - #], - ['command' => "Saving Parasight", - -command => sub{ - my ($dir,$name); - $name=$filepath; - print "$filepath"; - ($dir,$name)= ($1,$2) if $filepath=~ /^(.*)\/(.*)$/; - if ($^O=~/MSWin/) {$dir =~ s/\//\\/mg;} - #print "POSITION FP:$filepath D($dir) N($name)\n"; - my @filetypes= (['parasight', '.psa'],['All Files', '*']); - my $file = $mw->getSaveFile( -title=>'SAVE PARASIGHT FILES', - -filetypes => \@filetypes, - -initialdir=> $dir, - -initialfile=>$name - ); - if ($file eq "") { print "CANCELLED!\n" if $opt{'quiet'}; return; } - $filepath=$file; - print "SAVING: $filepath\n" if !$opt{'quiet'}; - &save_parasight_table($filepath); - $opt{'in'}=$filepath; - $mw->title("PARASIGHT: $filepath"); - } - ], - ['command' => "Load Option Template", - -command => sub{ - my ($dir,$name); - $name=$optionpath; - ($dir,$name)= ($1,$2) if $optionpath=~ /^(.*)\/(.*)$/; - if ($^O=~/MSWin/) {$dir =~ s/\//\\/mg;} - print "POSITION FP:$filepath D($dir) N($name)\n"; - my @filetypes= (['option template', 
['.pst']],['All Files', '*']); - my $file = $mw->getOpenFile( -title=>'LOAD OPTION TEMPLATE', - -filetypes => \@filetypes, - -initialdir=>$dir, -initialfile=>$name); - if ($file eq "") { return; } - $optionpath=$file; - &load_option_template($optionpath); - } - ], - ['command' => "Save Option Template", - -command => sub{ - my ($dir,$name); - $name=$optionpath; - ($dir,$name)= ($1,$2) if $optionpath=~ /^(.*)\/(.*)$/; - if ($^O=~/MSWin/) {$dir =~ s/\//\\/mg;} - my @filetypes= (['option template', ['.pst']],[ 'All Files', '*']); - my $file = $mw->getSaveFile( -title=>'SAVE OPTION TEMPLATE', - # -filetypes => \@filetypes, - -initialdir=>$dir, -initialfile=>$name); - if ($file eq "") { return; } - $optionpath=$file; - &save_option_template($optionpath); - } - ]])->pack(-side => 'left'); - - $balloon->attach($tmp,-justify => 'left',-msg=>"Click for menu to save and load\nparasight files\n" - . " or option template files."); - - $tmp=$frame->Menubutton(-text => "Print", - -menuitems=> [ - ['command' => "Print Screen", -command => [\&print_screen,1]], - ['command' => "Postscript Screen", -command => [\&print_screen,0]], - ['command' => "Print All",-command => [\&print_all,1]], - ['command' => "Postscript All",-command => [\&print_all,0]] - - ])->pack(-side => 'left'); - $tmp->separator; - $tmp->checkbutton(-label=> 'landscape',, -variable => \$opt{'printer_page_orientation'}); - - $balloon->attach($tmp,-justify => 'left',-msg=>"Click to see menu of output choices.\nOn Windows print may not work\n"); - $tmp=$frame->Menubutton(-text => "Order", - -menuitems=> [ - ['command' => "pair inter => raise", -command => sub{$canvas->raise('inter');}], - ['command' => "pair inter => lower", -command => sub{$canvas->lower('inter');}], - ['command' => "pair intra => raise", -command => sub{$canvas->raise('intra');}], - ['command' => "pair intra => lower", -command => sub{$canvas->lower('intra');}], - ['command' => "sub => raise", -command => sub{$canvas->raise('sub');}], - ['command' 
=> "sub => lower", -command => sub{$canvas->lower('sub');}], - ['command' => "sub label => raise", -command => sub{$canvas->raise('subl');}], - ['command' => "sub label => lower", -command => sub{$canvas->lower('subl');}], - ['command' => "extra => raise", -command => sub{$canvas->raise('ex');}], - ['command' => "extra => lower", -command => sub{$canvas->lower('ex');}], - ['command' => "extra label => raise", -command => sub{$canvas->raise('exl');}], - ['command' => "extra label => lower", -command => sub{$canvas->lower('exl');}], - ['command' => "graph line => raise", -command => sub{$canvas->raise('gl');}], - ['command' => "graph line => lower", -command => sub{$canvas->lower('gl');}], - - - ['command' => "subscale => raise", -command => sub{$canvas->raise('ss');}], - ['command' => "subscale => lower", -command => sub{$canvas->lower('ss');}], - ['command' => "subscale label => raise", -command => sub{$canvas->raise('ssl');}], - ['command' => "subscale label => lower", -command => sub{$canvas->lower('ssl');}], - ['command' => "gscale => raise", -command => sub{$canvas->raise('gs');}], - ['command' => "gscale => lower", -command => sub{$canvas->lower('gs');}], - ['command' => "gscale label => raise", -command => sub{$canvas->raise('gsl');}], - ['command' => "gscale label => lower", -command => sub{$canvas->lower('gsl');}], - ['command' => "sequence => raise", -command => sub{$canvas->raise('seq');}], - ['command' => "sequence => lower", -command => sub{$canvas->lower('seq');}], - ['command' => "sequence name => raise", -command => sub{$canvas->raise('seqn');}], - ['command' => "sequence name => lower", -command => sub{$canvas->lower('seqn');}], - ['command' => "tick => raise", -command => sub{$canvas->raise('tick');}], - ['command' => "tick => lower", -command => sub{$canvas->lower('tick');}], - ['command' => "tick label => raise", -command => sub{$canvas->raise('tickl');}], - ['command' => "tick label => lower", -command => sub{$canvas->lower('tickl');}] - 
])->pack(-side => 'left'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Allows the drawn objects to be raised\nand lowered relative to each other."); - $tmp=$frame->Menubutton(-text => "Misc", - -menuitems=> [ - ['command' => "color transfer sub -> pair", -command => sub{ - my $c=$mh{'color'}; - my $s=$mh{'scolor'}; - for(my $i=0;$i<@m;$i++) { $m[$i][$c]=$m[$i][$s] } - }], - ['command' => "color transfer pair -> sub", -command => sub{ - my $c=$mh{'color'}; - my $s=$mh{'scolor'}; - for(my $i=0;$i<@m;$i++) { $m[$i][$s]=$m[$i][$c] } - }], - ['command' => "color transfer pair -> sub", -command => sub{$canvas->raise('inter');}], - ])->pack(-side => 'left'); - $balloon->attach($tmp,-justify => 'left',-msg=>"A Hodge-Podge of Misc functions including\nraising and lowering objects relative to each other."); - $tmp=$frame->Button(-text => 'Options', - -command => [\&indexcard_options])->pack(-side=>'left',-anchor => 'e'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to Display PopUp Window of All Options"); - - - $tmp=$frame->Button(-text => 'PrintScreen', - -command => [\&print_screen,1])->pack(-side=>'left',-anchor => 'e'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to Print Current Visible Screen View\n" - . "(Options->Misc to change printing function)"); - - $tmp=$frame->Button(-text=>'DeZoom',-command=> sub{$canvas->scale("all",0,0,1/$scale,1/$scale); - $canvas->configure(-scrollregion=>[$canvas->bbox("all")]); - #print $canvas->cget(-height)," ",$canvas->cget(-width),"\n"; - $scale=1;}) - ->pack(-side=>'left',-anchor=>'e'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to reset to normal scale after zooming.\n" - . "to Zoom In use Control-Button1 (Left-Click).\n" - . 
"to Zoom Out use Control-Button3 (Right-Click)."); - - $tmp=$frame->Button(-text=>'FitLongLine', -command => \&fitlongestline - )->pack(-side=>'left',-anchor=>'e'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to set bp width\nto longest sequence line."); - - - $tmp=$frame->Button(-text => 'Find', -command=>\&mark_window) - ->pack(-side=>'left',-anchor => 'e'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to bring up window\nfor searching and color-marking results."); - $tmp=$frame->Button(-text => 'Quick Color', -foreground=>$opt{'quick_color'},-background=>'white', - -activebackground=>$opt{'quick_color'}, -activeforeground=>'white') - ->pack(-side=>'left',-anchor => 'e'); - $tmp->configure(-command=> - [sub{ my $b= $_[0]; - $mw->grabRelease(); - my $color = $b->chooseColor(-title=>'Choose New Quick Color', - -initialcolor=> $opt{'quick_color'}); - if (defined $color) { - $b->configure(-foreground=> $color,-activebackground=>$color); - $opt{'quick_color'}=$color; - } - $mw->grab(); - $mw->raise(); - }, $tmp] ); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to set a quick color.\n" - ."To quickly color a seq, pair, sub or extra object\n use Shift-Right-Click\n" - ."To remove color (return to default) ===> Shift-Double-Right-Click \n" - ."(NOTE:for pairs and labels color where information is unknown, removing color leaves it black until a redraw!"); -# $tmp=$frame->Button(-text => 'Help') -# ->pack(-side=>'left',-anchor => 'e'); -# $tmp->configure(-command=> - # sub {my $text="This is the simple text version of the internal #documentation\nBetter foramts can be accessed with parsight -h, perldoc, or pod2html\n"; - # $text.=`pod2text $0`; - # print "$0\n"; - # &export_text(\$text,"Internal Help Documenation ($0)"); - # } ); - #$#balloon->attach($tmp,-justify => 'left',-msg=>"Press to set quick color.\n" - # ."To quickly color a seq, pair, sub or extra object\n ==>Shift-Right-Click\n" - # ."To remove color (return to default) ===> 
Shift-Double-Right-Click \n" - # ."(NOTE:for pairs and labels color were information is unknown, removing color leaves black!"); - $tmp=$frame->Button(-borderwidth => 3,-activebackground=>'black',-activeforeground=>'white', - -background=>'white',-command=>\&redraw,-text => "Redraw")->pack(-side=>'right'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to redraw the entire screen.\n" - ."This is quicker then the blue button,\nbut any blue options may not be changed." - ); - $tmp=$frame->Button(-borderwidth => 3,-activebackground=>'blue',-background=>'#bbe8ff', - -command=>\&reshowNredraw,-text => "R R & R")->pack(-side=>'right'); - $balloon->attach($tmp,-justify => 'left',-msg=>"Press to Reshow, Rearrange and Redraw\n" - ."This is slower than the just Redraw\nas many initial calculations are redone." - ); - - $frame->pack(-side => 'top', -expand=> 0,-fill=>'x', -anchor=>'w'); - ########################################################## - ########################################################## - ############## create a canvas ########################### - $scrolledcanvas = $mw->Scrolled('Canvas',-height=>$opt{'window_height'}, -width=>$opt{'window_width'}, -background => 'white') - ->pack(-side=>'top',-fill => 'both', -expand => 1, -anchor =>'n'); - ##could add - ##$scrolledcanvas->Tk::bind("", [sub { print "SIZE CHANGED $_[1], $_[2]\n"; }, Ev{'h'}, Ev{'w'} ]); - $canvas= $scrolledcanvas->Subwidget('canvas'); - -############################################################ -############# highlighting on mouse over ############# -########################################################## - my ($oldhighlightcolor,$oldhighlightid); - my @highlightcolors=('yellow', 'orange','yellow','pink','yellow','lightblue'); - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - - #my @close=$canv->find("closest",$cx,$cy) ; - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - 
next if $tag !~/^[MSE]/; #currently sequence gets sS not S - $id=$tag; - } - #print "$id\n"; - my $color=$canvas->itemcget($id,-fill); - if ($oldhighlightid ne $id && $oldhighlightcolor ne '') { - #print "FIX $oldhighlightid ($id) $oldhighlightcolor=>$color\n"; - $canvas->itemconfigure($oldhighlightid ,-fill=>$oldhighlightcolor); - $oldhighlightid = ''; - } - if ($id =~ /^[MSE]/ && $color ne '' ) { - #print "$oldhighlightid ($id) $color -> oldcolor\n"; - $oldhighlightcolor=$color if $oldhighlightid ne $id; - $oldhighlightid=$id; - #print "ENTER:$id:$c\n"; - push @highlightcolors, shift @highlightcolors; - $canvas->itemconfigure( $id,-fill=>$highlightcolors[0]); - } - } - ,Ev('x'),Ev('y')]); - - - - ############################################# - ########### popup window #################### - - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - #my @close=$canv->find("closest",$cx,$cy) ; - #my @tags = $canv->gettags( "$close[0]"); - my @tags=$canv->gettags("current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - my ($type, $numb) = $id=~/([A-Za-z]+)(\d+)/; - my $text.="($id) No Associated Data\n"; - if ($type =~ /^M|S/) { - my @r=@{ $m[$numb] }; #find the row of the data assocated with this item# - my $span1=$r[2]-$r[1]+1; - my $span2=$r[6]-$r[5]+1; - $span2=-($r[5]-$r[6]+1) if ($r[5]>$r[6]); - - for (1..3,5..7) {next if !/[0-9.]+$/; $r[$_]=&commify($r[$_]);} - $text="$id\n"; - $text.= "Sa: $r[0] $r[1] - $r[2] ($span1) len:$r[3]\n"; - $text.= "Sb: $r[4] $r[5] - $r[6] ($span2) len:$r[7]\n"; - for (my $j=8; $j<@r; $j++) { - next if $r[$j] eq ''; - my $t=$r[$j]; - if (length($t)> $opt{'popup_max_len'} ) { - #print "LEN:,",length($t),"\n"; - $t='(too long)'; - } - if ($opt{'popup_format'} eq 'text') { - $text.="\[$mheader[$j]\]$t " ; - } else { - $text.="\[$j\]$t " ; - } - } - } elsif ($type =~/^E/ ) { - my @r=@{ $e[$numb] }; - for (my $i=1;$i<3;$i++) 
{$r[$i]=&commify($r[$i]);} - $text="$id\n"; - $text.="$r[0] $r[1]-$r[2]\n"; - for (my $j=3; $j<@r; $j++) { - next if $r[$j] eq ''; - my $t=$r[$j]; - if (length($t)>$opt{'popup_max_len'}) { - $t='(too long)'; - } - if ($opt{'popup_format'} eq 'text') { - $text.="\[$eheader[$j]\]$t " ; - } else { - $text.="\[$j\]$t " ; - } - } - } - next if $text=~/No Associated Data/m; - my $displayedcolor=$canvas->itemcget($id, -fill); - my $poptext=$canvas->createText($cx, $cy,,-anchor=>'nw',-justify=>'left',-width=>400,-fill=>'black',-text=>$text); - my ($l,$r,$t,$b)=$canvas->bbox($poptext); - my $poprect=$canvas->createRectangle($l-2, $r-2, $t+2, $b+2,,-fill=>'#ffff9d'); - $canvas->addtag("POP$poprect",'withtag',$poprect); - $canvas->addtag("POP$poprect",'withtag',$poptext); - $canvas->lower($poprect,$poptext); - $canvas->bind("POP$poprect","", sub {$canvas->delete("POP$poprect"); } ); - $canvas->bind("POP$poprect","", - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canvas->move( "POP$poprect", $cx - $iinfo{'lastX'}, $cy-$iinfo{'lastY'} ); - $iinfo{'lastX'}=$cx; - $iinfo{'lastY'}=$cy; - } - ,Ev('x'),Ev('y')]); - - } - ,Ev('x'),Ev('y')]); - #######################################################33 - ################# popup menu ############################ - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - #my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - #print "$cx $cy =>"; - #$canv->delete ("2"); - #my @close=$canv->find("closest",$cx,$cy) ; - - #my @close=$canv->find("all") ; - #print join ("X",@close),"X\n"; - #$canv->itemconfigure( $close[0],-fill=>"yellow"); - - my @tags = $canv->gettags( "current"); - #print join ("X",@tags),"X\n"; - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - my ($type, $numb) = $id=~/([A-Za-z]+)(\d+)/; - my ($apnt,$hhashp, $hpnt); - if ( $type=~/M|S/) { - $apnt = \@m; $hhashp =\%mh; $hpnt=\@mheader; - } elsif ($type =~/E/) { - $apnt =\@e; $hhashp 
=\%eh; ; $hpnt =\@eheader; - } else { - return; - } - my $menu = $canvas->Menu(-relief => 'groove', -tearoff => 0, - -menuitems => [ ['command' => "**$id**"], - ['command' => 'color', -command => [\&color_change, $id, $apnt, $hhashp] ], - ['command' => 'edit', -command => [\&edit, $id,$apnt,$hpnt] ], - #['command' => 'quick_capture', -command => [\&quick_capture, $id, $apnt, $hhashp] ], - #['command' => 'quick_edit',-command => [\&quick_edit, $id, $apnt, $hhashp] ] - # - ] ); - if ( $type=~/^M|S/ ) { - $menu->command ( -label => "decontaminate via seq 1 ($m[$numb][0])", - -command=> [\&decontaminate_high_copy_repeat,$m[$numb][0],$m[$numb][1],$m[$numb][2] ]); - $menu->command ( -label => "decontaminate vua seq 2 ($m[$numb][4])", - -command=> [\&decontaminate_high_copy_repeat,$m[$numb][4],$m[$numb][5],$m[$numb][6] ]); - if ($opt{'alignment_col'} !=0 && $opt{'alignment_col2'} !=0) { - $menu->command ( -label => 'show alignment', -command=> [\&show_alignment, $numb] ); - $menu->command ( -label => ' aln seq 1 w -',-command=> [\&alignment_internal, $numb,0,'with'] ); - $menu->command ( -label => ' w/o -',-command=> [\&alignment_internal, $numb,0,'without'] ); - $menu->command ( -label => ' aln seq 2 w -',-command=> [\&alignment_internal, $numb,4,'with'] ); - $menu->command ( -label => ' w/o -' ,-command=> [\&alignment_internal, $numb,4,'without'] ); - } - if ($opt{'fasta_on'}==1) { - $menu->command ( -label => "seq 1 external ($m[$numb][0])", - -command=> [\&extract_sequence,$m[$numb][0],$m[$numb][1],$m[$numb][2] ]); - $menu->command ( -label => ' modify +/- F/R', - -command=> [\&extract_sequence,$m[$numb][0],$m[$numb][1],$m[$numb][2],'modify' ]); - $menu->command ( -label => "seq 2 external ($m[$numb][4])", - -command=> [\&extract_sequence,$m[$numb][4],$m[$numb][5],$m[$numb][6] ]); - $menu->command ( -label => ' modify +/- F/R' , - -command=> [\&extract_sequence,$m[$numb][4],$m[$numb][5],$m[$numb][6],'modify']); - } - } - - if ( $type=~/^E/ ) { - $menu->command ( 
-label => "seq 1 external ($m[$numb][0])", - -command=> [\&extract_sequence,$e[$numb][0],$e[$numb][1],$e[$numb][2] ]); - $menu->command ( -label => ' modify +/- F/R', - -command=> [\&extract_sequence,$e[$numb][0],$e[$numb][1],$e[$numb][2],'modify' ]); - } - - my $cnt=1; - foreach my $n (('',2,3,4)) { - my $command=$opt{"execute$n"}; - my $desc=$opt{"execute$n"."_desc"}; - my $array=$opt{"execute$n"."_array"}; - my $col=$opt{"execute$n"."_col"}; - #print "x$n xx$type xxx$array\n"; - next if $type eq 'E' && $array ne 'e'; - next if $type ne 'E' && $array ne 'm'; - next if $command eq ''; - $menu->command ( -label => "CMD$cnt)$desc" , - -command=> sub { &execute_execute($id,$n)} - ); - $cnt++; - } - - - $menu-> Popup(-popover => 'cursor', -popanchor => 'nw'); - - break; } - ,Ev('x'),Ev('y')]); - - -sub alignment_internal { - my $numb=shift; - my $column=shift; - my $type=shift; - my $seq_col=''; - my $header=''; - my $title=''; - if ($column==0) { - $seq_col=$opt{'alignment_col'}; - $title.= "First (query) alignment sequence"; - $header= ">$m[$numb][0].$m[$numb][1]-$m[$numb][2]\n"; - } elsif ( $column==4) { - $title.= "Second (sujbect) alignment sequence"; - $header= ">$m[$numb][4].$m[$numb][5]-$m[$numb][6]\n"; - $seq_col=$opt{'alignment_col2'}; - } else { - warn "BAD USE OF alignment_internal subroutine\n"; - } - my $seq=$m[$numb][$seq_col]; - #print "($seq)\n"; - if ($type =~ /without/ ) { - $seq=~ tr/-//d; - $title .= ' without dashes'; - } else { - $title .= ' with dashes'; - } - #print "$header($seq)\n"; - $title .= " for M$numb"; - $seq=&fasta_format_wrap($seq,50); - $seq = $header. $seq; - if ($seq_col==0) { - $seq= "Prealigned sequences must be included as columns\nin the align file for this option to work!\nThese sequences must contain indel dashes\nas the alignment is not recalculated." 
- } - - &export_text( \$seq, $title); - -} - - -sub extract_sequence { - my $seqname=shift; - my $begin=shift; - my $end=shift; - my $modify=shift;# modify=modify - my $orient='F'; - if ($begin>$end) { - $orient ='R'; - ($end,$begin)=($begin,$end); - } - if ($modify) { - my ($b,$e) = ($begin,$end); - my $mwx = new MainWindow; - $mwx->title("MODIFY SEQUENCE EXTRACTION"); - $mwx->setPalette('lightgrey'); - $mwx->configure(-background=>'darkgrey'); - my $f = $mwx->Frame( - -borderwidth => 4, - -relief => 'raised', - )->pack(-expand=>1,-fill=>'both'); - $f->Label(-text => "$seqname")->pack(-side=> 'left',-anchor => 'e'); - $f->Label(-text => " begin")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$b ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f->Label(-text => " end")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$e ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f->Optionmenu(-textvariable=>\$orient, -options => ['F','R'] )->pack(-side => 'left',-anchor=>'e'); - $f->Button(-borderwidth => 3,-activebackground=>'white',-background=>'#bbe8ff', - -command=>sub { $mwx->destroy;},-text => "Extract")->pack(-side=>'right'); - $f->Button(-borderwidth => 3,-activebackground=>'white',-background=>'#bbe8ff', - -command=>sub { $b=0; $e=0; $mwx->destroy;},-text => "Cancel")->pack(-side=>'right'); - $mwx->waitWindow; - $begin = $b if $b; - $end = $e if $e; - - } - print "$seqname ($begin-$end) ($orient)\n" if !$opt{'quiet'}; - my $seqpath=''; - my $fragged=0; - foreach my $fdir ( (split /:/,$opt{'fasta_directory'}) ) { - #print "DIRECTORY:($fdir)\n"; - if (-d $fdir && opendir (DIR, $fdir) ) { - my @matches=grep { /^$seqname/ } readdir DIR; - my @single= grep {/^$seqname$/ } @matches; - if (@single ==1 ) { - #print "Perfect file ($single[0]) \n"; - $seqpath="$fdir/$single[0] "; - } elsif ( $matches[0]=~ /^$seqname.?_\d+$/ ) { - #print "FRACTIONATED $matches[0]\n"; - $seqpath="$fdir/$matches[0]"; - $fragged=1; - } - } - 
last if $seqpath; - - } - #search in blastdbs with fastacmd# - if (! $seqpath ) { - my $db = $opt{'fasta_blastdb'}; - $db =~ s/:/ /mg; - #hunt for it in a fastacmd# - if (open (FASTA, "fastacmd -d '$db' -s $seqname |") ) { - my $head=; - - if ($head =~ />/ && $head =~/$seqname/) { - #print "EXTRACTING SEQUENCE FROM BLASTDB\n"; - open (OUTFASTA, ">tmpfasta") || die "Can't create tmpfasta\n"; - print OUTFASTA $head; - while () { - print OUTFASTA; - } - $seqpath='tmpfasta'; - } - } - } - print "SEARCH RESULT:$seqname ($seqpath) $begin-$end FRAG($fragged)\n" if !$opt{'quiet'}; - my $seq=''; - return if $seqpath eq ''; - if ($fragged == 0 ) { - print "$seqpath $begin $end\n" if !$opt{'quiet'};; - $seq=&fasta_getsubseq_whole($seqpath,$begin,$end); - } else { - #get fragged sequence# - $seqpath=~s/_\d+$//; - $seq=&fasta_getsubseq_frac($seqpath,$begin,$end,$opt{'fasta_fragsize'}); - } - #print "($seq)\n"; - if ($seq eq '') { - print "ERROR:No sequence extracted"; - return; - } - $seq=&fasta_format_wrap($seq,50); - $seq = ">$seqname.$begin.$end.$orient\n$seq"; - - &export_text( \$seq, "sequence extracted", "$seqname.$begin.$end.$orient",); #print error message otherwise# - - print "DONE\n"; - ############# - #export it### - - -} - - -{ -my $removals_count=0; - -my @ocolor =qw(#ffffd4fdd4fd #beb700000000 #ffff00000000 #ffff63d60000 #ffff8f1846e9 #f4bb0000ffff #fffffa5d0000 #c941cb01c10e #764547e80ec8 #12f1ffff0000 #d2cef78ca2e3 #b0a39aa7855c #e2d0ff1effff #778cfbe7ffff #000000000000 #9374d1c1ffff #8f5bab84ffff #00000000ffff #be518cf0beb7 #d2f00000ffff #22d000007fff #d915d915d915 #b374b374b374 #70a370a370a3 #dfffb9488869); -#my @ocolor=qw(red yellow green pink blue purple orange #cd0d32083208 cyan #86e4ce2ceb01 #0068cd0d0000 #cd0daa2f7d15 -# #a207cd0d5a04 #9b1ecd0d9b05 brown #9fb7b6dccdd2 honeydew tan); -sub decontaminate_high_copy_repeat { - my $seqname=shift; - my $begin=shift; - my $end=shift; - my $flank=500; - my $max_bpalign=$end-$begin+2*$flank +1; - my 
$min_fraction_pair=0.7; - my $min_fraction_region=0; - my $join_distance=300; - - my $modify=shift;# modify=modify - - - if ($begin>$end) { - ($end,$begin)=($begin,$end); - } - #flip the initial search around so that it will be in relation to this sequence region - $modify=1; - if ($modify) { - my ($b,$e, $fl, $ml) = ($begin,$end, $flank, $max_bpalign); - my $mwx = new MainWindow; - $mwx->title("DELETE HIGH COPY REPEAT"); - $mwx->setPalette('lightgrey'); - $mwx->configure(-background=>'darkgrey'); - my $f = $mwx->Frame( -borderwidth => 4,-relief => 'raised' )->pack(-expand=>1,-fill=>'both'); - $f->Label(-text => "This will irrevocably delete the pairwise from parasight table!\nIt currently errors on the side of removing too much sequence\nand thus pairwise segmental dups may be lost\nThus, this is a tool for initially identifying unknown high copy repeats for better masking!")->pack(-side=> 'left',-anchor => 'e'); - $f = $mwx->Frame( -borderwidth => 4, -relief => 'raised', )->pack(-expand=>1,-fill=>'both'); - $f->Label(-text => "$seqname")->pack(-side=> 'left',-anchor => 'e'); - $f->Label(-text => " begin")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$b ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f->Label(-text => " end")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$e ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f->Label(-text => " flank")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$fl ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f->Label(-text => " max bp align")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$ml ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - $f = $mwx->Frame( -borderwidth => 4, -relief => 'raised' )->pack(-expand=>1,-fill=>'both'); - $f->Label(-text => " min fraction of region")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$min_fraction_region ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); 
- $f->Label(-text => " min fraction of pairwise piece")->pack(-side=> 'left',-anchor => 'e'); - $f->Entry(-textvariable => \$min_fraction_pair ,-width=> 10)->pack(-side=> 'left', -anchor=> 'e'); - - $f->Button(-borderwidth => 3,-activebackground=>'white',-background=>'red', - -command=>sub { $mwx->destroy;},-text => "Remove")->pack(-side=>'right'); - $f->Button(-borderwidth => 3,-activebackground=>'white',-background=>'#bbe8ff', - -command=>sub { $b=0; $e=0; $fl=0; $mwx->destroy;},-text => "Cancel")->pack(-side=>'right'); - $mwx->waitWindow; - $begin = $b; - $end = $e; - $flank=$fl; - $max_bpalign=$ml; - - } - return if $begin ==0; - $removals_count++; - my $fremoval_count=substr ("00000".$removals_count, -4); - print "REMOVAL#$fremoval_count\nCOLLECTING ALL PAIRWISE FOR $seqname ($begin-$end) ($flank) ($max_bpalign)\n" if !$opt{'quiet'}; - #set up regions array with initial region in the + orientation - my @regions= ( {'seq'=>$seqname,'b'=>$begin,'e'=>$end, 'orient'=>+1,'first'=>1 } ); - - ##################################### - ##trace out all of the connections### - ##mark all for deletion############## - my %deleted=(); - my $one_pass_only=0; - my @processed_regions=(); - my $first_pass=1; - REGIONPASS: while (@regions > 0) { - my @new_regions=(); - my $mlen=@m; - foreach my $r (@regions) { - my $begin=$$r{'b'}-$flank; - $begin=1 if $begin < 1; - my $end= $$r{'e'}+$flank; - print "EXAMINING FOR PAIRS MATCHING: $$r{'seq'} $$r{'b'}-$$r{'e'}\n"; - for (my $i=0; $i< $mlen; $i++) { - my $refi=$m[$i]; - next if exists $pairwise2delete{$i}; - my @c=(); - next if $$r{'seq'} ne $$refi[0] && $$r{'seq'} ne $$refi[4]; - # print "SEQUENCE MATCH $$r{'seq'} equals $$refi[0] and/or $$refi[4]\n"; - ###adjust c so that seq1 is region and seq2 is match in positive orientation - ###orientation is in integer for so multiplication can take place. 
- if ( $$r{'seq'} eq $$refi[0] ) { - next if $end < $$refi[1] || $begin > $$refi[2]; - #we have overlap of seq1# - @c=@$refi; - if ($c[5]>$c[6]) { - ($c[5],$c[6])=($c[6],$c[5]); - $c[8]=-1; - } else { - $c[8]=+1; - } - } - if ($$r{'seq'} eq $$refi[4] ) { - next if ($end < $$refi[5] && $end < $$refi[6]) || ($begin > $$refi[5] && $begin > $$refi[6] ); - ##we have ovlerlapof seq2# - @c=@$refi; - ($c[0],$c[1],$c[2],$c[3],$c[4],$c[5],$c[6],$c[7])=($c[4],$c[5],$c[6],$c[7],$c[0],$c[1],$c[2],$c[3]); - if ($c[1]>$c[2]) { - ($c[1],$c[2])=($c[2],$c[1]); - $c[8]=-1; - } else { - $c[8]=+1; - } - } - next if @c==0; - ################### - ###is within my region### - # print " region contained $begin-$end contains: $c[0] $c[1]-$c[2] $c[8]\n"; - my $span=$c[2]-$c[1]+1; - next if $span > $max_bpalign; - #print " not to big\n"; - my $fraction_pair=0; - my $fraction_region=0; - if ($c[1] <= $begin && $c[2] >= $end ) { - #print " region surround by pairwise\n"; - #print "R $begin - $end \n"; - #print "P $c[1] $c[2]\n"; - $fraction_pair=($end-$begin+1)/$span; - $fraction_region=1; - } elsif ($begin <= $c[1] && $end >= $c[2]) { - #print " pairwise completely contained\n"; - #print "R $begin $end\n"; - #print "P $c[1] $c[2] \n"; - $fraction_pair=1; - $fraction_region=($c[2]-$c[1]+1)/($end-$begin+1); - } elsif ( $begin <= $c[2] && $c[2] <= $end ) { - #print " pairwise stop contained--start outside\n"; - #print "R $begin $end\n"; - #print "P $c[1] $c[2]\n"; - my $overlap=$c[2]-$begin+1; - $fraction_pair=$overlap/$span; - $fraction_region=$overlap/($end-$begin+1); - } elsif ( $begin <= $c[1] && $c[1] <= $end ) { - #print " pairwise start contained--top outside\n"; - #print "R $begin $end\n"; - #print "P $c[1] $c[2]\n"; - my $overlap=$end-$c[1]+1; - $fraction_pair=$overlap/$span; - $fraction_region=$overlap/($end-$begin+1); - - } else { - print "($c[0],$c[1],$c[2],$c[3],$c[4],$c[5],$c[6],$c[7])\n"; - &warnNpause("Your algorithm is screwy! 
I shouldn't be here!\n"); - } - #print "Rfrac:$fraction_region Pfrac:$fraction_pair\n"; - next if $fraction_pair < $min_fraction_pair; - next if $fraction_region < $min_fraction_region; #this should stay low except for specialized purposes - my $relative_orient=$$r{'orient'} * $c[8]; - #print "Orient relative to first is $relative_orient\n"; - print "Delete pairwise $i\n"; - $deleted{$i}=$refi; - $pairwise2delete{$i}=$refi; - push @new_regions, {'seq'=>$c[4],'b'=>$c[5],'e'=>$c[6], 'orient'=>$relative_orient ,"first"=>$first_pass}; - #recolor Sa Sb and M# - $$refi[$mh{'color'}]='black'; - my $owidth=3; - $owidth=20 if $one_pass_only==1; - $canvas->itemconfigure("Sa$i", -fill=>'black',-outline=>$ocolor[0],-width=>$owidth ); - $canvas->itemconfigure("Sb$i", -fill=>'black' ,-outline=>$ocolor[0],-width=>$owidth ); - $canvas->itemconfigure("Ma$i", -fill=>'black',-outline=>$ocolor[0],-width=>$owidth ); - - } #@m loop - push @processed_regions, $r; - if ($one_pass_only) { - push @processed_regions, @new_regions; - last REGIONPASS; - } - } #region #loop - $first_pass=0; - @regions=@new_regions;# - #should have same orientations to merge - #merge any close together regions for subsequent searches - - #my $pause=; - } #while region looop - print "REMOVING (", scalar keys %deleted, ") PAIRWISE\n"; - my $fname="repeat$fremoval_count.$seqname.b$begin.e$end.f$flank.m$max_bpalign.pf$min_fraction_pair.rf$min_fraction_region"; - #my $fname="repeat$fremoval_count"; - print "$fname\n"; - mkdir "decontamination" if !-d 'decontamination'; - open (OUT, ">decontamination/$fname.pairs") || die "Can't create decontamination/$fname.pairs!\n"; - foreach (reverse sort {$a <=> $b} keys %deleted) { - print OUT join("\t", @{$m[$_]}), "\n"; - } - close OUT; - open (OUT, ">decontamination/$fname.coordinates") || die "Can't create decontamination/$fname.coordinates!\n"; - print OUT "seq\tbegin\tend\tname\torient\n"; - ###### catalog ###### - @processed_regions= sort {$$a{'seq'} cmp $$b{'seq'} || 
$$a{'b'} <=> $$b{'b'} || $$a{'e'} <=> $$b{'e'} } @processed_regions ; - for (my $i=0; $i < @processed_regions-1; $i++ ) { - my $ci=$processed_regions[$i]; - my $cj=$processed_regions[$i+1]; - next if $$ci{'seq'} ne $$cj{'seq'}; - if ($$ci{'e'} + $join_distance > $$cj{'b'} ) { - $$ci{'e'} = $$cj{'e'} if $$cj{'e'} > $$ci{'e'}; - splice(@processed_regions,$i+1,1); - $i--; - next; - } - } - - foreach (@processed_regions) { - my $b=$$_{'b'}; - my $e=$$_{'e'}; - ($b,$e)=($e,$b) if ($$_{'orient'}<0) ; - print OUT "$$_{'seq'}\t$b\t$e\t$$_{'seq'}.$b.$e\t$$_{'orient'}\t$$_{'first'}\n"; - } - close OUT; - push @ocolor, (shift @ocolor); -} - - -} - - - -######################################################## -######Zoom in ####################################### - - $canvas->Tk::bind('', - [sub{ my ($canv, $x, $y)=@_; $scale*=2; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canvas->scale("all",$cx,$cy,2,2); - my @box=$canvas->bbox("all"); - #print "$box[0] $box[1] $box[2] $box[3]\n"; - $canvas->configure(-scrollregion=>\@box); - $canvas->configure(-height=>$box[3], -width=>$box[2]); - #print $canvas->cget(-height)," ",$canvas->cget(-width),"\n"; - } - ,Ev('x'),Ev('y')]); - #######Zoom out #### - $canvas->Tk::bind('', - [sub { my ($canv, $x, $y)=@_; $scale*=0.5; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canvas->scale("all",$cx,$cy,0.5,0.5); - my @box=$canvas->bbox("all"); - #print "$box[0] $box[1] $box[2] $box[3]\n"; - $canvas->configure(-scrollregion=>\@box); - $canvas->configure(-height=>$box[3], -width=>$box[2]); - #print $canvas->cget(-height)," ",$canvas->cget(-width),"\n"; - } - ,Ev('x'),Ev('y')]); - - -############################################# -####################3 move any object ############# - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $iinfo{'lastX'}=$cx; - $iinfo{'lastY'}=$cy; - #print "START move\n"; - - } - ,Ev('x'),Ev('y')]); - - $canvas->Tk::bind('', - 
[sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canvas->move( 'current', $cx - $iinfo{'lastX'}, $cy-$iinfo{'lastY'} ); - $iinfo{'lastX'}=$cx; - $iinfo{'lastY'}=$cy; - } - ,Ev('x'),Ev('y')]); - -############################################################ -############# any delete a tag ################################## - - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - #my @close=$canv->find("closest",$cx,$cy) ; - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - my ($type, $numb) = $id=~/([A-Za-z]+)(\d+)/; - if ($type =~ /^[MS]/) { - $canvas->delete("M$numb"); - $canvas->delete("Sa$numb"); - $canvas->delete("Sb$numb"); - $m[$numb][$mh{'hide'}]=1; - - - } else { - $canvas->delete('current'); - } - } - ,Ev('x'),Ev('y')]); - - ##################################################### - ########################################3 - ########## QUICK COLOR OBJECTS############### - - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @close=$canv->find("closest",$cx,$cy) ; - my @tags = $canv->gettags( "$close[0]"); - my $id=$close[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - my ($type, $numb) = $id=~/([A-Za-z]+)(\d+)/; - my $text.="Nothing of interest\n"; - my $color=$opt{'quick_color'}; - if ($type =~ /M/) { - $m[$numb][$mh{'color'}]=$color; - $canvas->itemconfigure("M$numb", -fill=>$color ); - $oldhighlightcolor=$color if $oldhighlightid eq "M$numb" ; - } elsif ($type=~/S/) { - $oldhighlightcolor=$color if $oldhighlightid =~ /^S[ab]$numb/ ; - $m[$numb][$mh{'scolor'}]=$color; - $canvas->itemconfigure("Sa$numb", -fill=>$color); - $canvas->itemconfigure("Sb$numb", -fill=>$color); - } elsif ($type=~/E/) { - $oldhighlightcolor=$color if $oldhighlightid eq "E$numb" ; - 
$e[$numb][$eh{'color'}]=$color; - $canvas->itemconfigure("E$numb", -fill=>$color); - #$oldhighlightcolor=$color if $oldhighlightid eq "E$numb" ; - - } else { - $canvas->itemconfigure("$id",-fill,$color); - } - } - ,Ev('x'),Ev('y')]); - - - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @close=$canv->find("closest",$cx,$cy) ; - my @tags = $canv->gettags( "$close[0]"); - my $id=$close[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - my ($type, $numb) = $id=~/([A-Za-z]+)(\d+)/; - if ($type =~ /M/) { - my $color='black'; # no way to know if inter or intra - $oldhighlightcolor=$color if $oldhighlightid eq "M$numb" ; - $m[$numb][$mh{'color'}]=''; - $canvas->itemconfigure("M$numb", -fill=>$color ); - } elsif ($type=~/S/) { - - my $color=$opt{'sub_color'}; - $color=$accsub{$m[$numb][4]}{'color'} if defined $accsub{$m[$numb][4]}{'color'}; - $m[$numb][$mh{'scolor'}]=''; - $oldhighlightcolor=$color if $oldhighlightid =~ /^S[ab]$numb/ ; - $canvas->itemconfigure("Sa$numb", -fill=>$color ); - $color=$accsub{$m[$numb][0]}{'color'} if defined $accsub{$m[$numb][4]}{'color'}; - $canvas->itemconfigure("Sb$numb", -fill=>$color); - - - } elsif ($type=~/E/) { - my $color=$opt{'extra_color'}; # no way to know if inter or intra - $oldhighlightcolor=$color if $oldhighlightid eq "E$numb" ; - $e[$numb][$eh{'color'}]=''; - $canvas->itemconfigure("E$numb", -fill=>$color ); - - } else { - $canvas->itemconfigure ("$id", -fill => black); - } - } - ,Ev('x'),Ev('y')]); - -############################################################# -########## raise and lower tags ############################ - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canv->lower('current'); - } - ,Ev('x'),Ev('y')]); - - $canvas->Tk::bind('', - [sub{ - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - $canv->raise('current'); - } 
- ,Ev('x'),Ev('y')]); - - -############################################## -############################################# -################ user defined ############### - - $canvas->Tk::bind('', - [sub{ - - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - &execute_execute($id,''); - } - ,Ev('x'),Ev('y') - ]); - $canvas->Tk::bind('', - [sub{ - - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - &execute_execute($id,'2'); - } - ,Ev('x'),Ev('y') - ]); - $canvas->Tk::bind('', - [sub{ - - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - &execute_execute($id,''); - } - ,Ev('x'),Ev('y') - ]); - $canvas->Tk::bind('', - [sub{ - - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - &execute_execute($id,'2'); - } - ,Ev('x'),Ev('y') - ]); - $canvas->Tk::bind('', - [sub{ - - my ($canv, $x, $y)=@_; - my ($cx,$cy)=($canv->canvasx($x),$canv->canvasy($y) ); - my @tags = $canv->gettags( "current"); - my $id=$tags[0]; - foreach my $tag (@tags) { - next if $tag !~/^[MSE]/; - $id=$tag; - } - &execute_execute($id,'3'); - } - ,Ev('x'),Ev('y') - ]); - -sub execute_execute { - #this subroutine executes a predefined command substituting - #data from a column via replacing occurence of {} - my $type=shift; - my $n=shift; - my $command=$opt{"execute$n"}; - my $desc=$opt{"execute$n"."_desc"}; - my $array=$opt{"execute$n"."_array"}; - my ($numb) = $type =~ 
/(\d+)/; - - print "row$n pairORextra$type arrayname$array\n"; - next if $type =~ /E/ && $array ne 'e'; - next if $type !~ /E/ && $array ne 'm'; - my @c=@{$$array[$numb]}; - print "ARRAY", join ("==>",@c), "\n"; - ###evaluate and execute command while trapping runtime errors### - print "PRE ($command)\n" if !$opt{'quiet'}; - $command = '$command = " ' . $command . ' "'; - print "PREEVAL ($command)\n" if !$opt{'quiet'}; - eval ($command); - if ($@) { - warn $@ ; - } else { - print "SYSTEM CALL($command)\n" if !$opt{'quiet'}; - system "$command"; - } -} - - ################################################ - ########### FIRST DRAW ######################### - ################################################ - $first_pass=1; - &reshowNredraw; - $mw->deiconify(); - $mw->raise(); - $mw->waitVisibility(); - - ################################################# - ######### EXECUTE SUPPLIED PRECODE ############## - my $to_die=$opt{'die'}; - $mw->after(50, sub { - if ($opt{'precode'}) { - print "PRECODE\n" if !$opt{'quiet'}; - if ($opt{'precode'} =~ /^file:(\S+)/ ) { - my $prefile=$1; - print "PRECODE IS FILE. LOADING NOW ...\n" ; - open (PRECODE, $prefile) || die "Can't read precode file ($prefile)!"; - $opt{'precode'}=''; - while () { - s/\r\n/\n/; - $opt{'precode'}.=$_; - } - } - my $precodetext=$opt{'precode'}; - ###scan precode for valid $opt{'variables'}### - while ( $precodetext =~ /opt\{["']([a-z_]+)["']\}/mg ) { - #my $value=$1; - #print "$value\n"; - warn "\$opt{\"$1\"} is not a valid option!\n" if ! 
defined $opt{$1}; - #my $pause=; - - } - ###run the precode and watch for errors### - eval $precodetext; - if ($@) { - warn "PRECODE REPORTED A SYNTAX ERROR!\n$@\n" ; - my @precode_array =split /\n/ , $precodetext; - if ( $@ =~ /line (\d+)/ ) { - warn "LINE $1 WAS: $precode_array[$1]\n"; - } - } - - } - - if ($to_die ) { - print "\n-die has been envoked--quitting now\n" if !$opt{'quiet'};; - exit; - } - } ); - ################################################ - ########## MAIN LOOP ########################### - MainLoop; - - - -#################################################################################33 -####################### SUBROUTINES ############################################# -################################################################################# - -sub find_column_options { - ####################### - ####figures out name of column given a number and vis-versa - #####m only ####### - my @farray=qw(sub_scale_col sub_scale_col2 sub_labelseq_col sub_labelseq_col2 - sub_labelseqe_col sub_labelseqe_col2 alignment_col alignment_col2 - sub_labelhit_col sub_labelhit_col2 filter1_col filter2_col - filterpre1_col - filterpre2_col pair_type_col colorsub_hitcond_col pair_type_col2); - for my $n (@farray) { - if ($opt{$n} =~/[a-zA-z]/) { - if ( defined $mh{$opt{$n}} ) { - $opt{$n}=$mh{$opt{$n}}; - } else { - $opt{$n}.='??' if $opt{$n} !~/\?/; - } - } - $colheader{("$n"."_header")}='( )'; - $colheader{("$n"."_header")}="($mheader[$opt{$n}])" if $opt{$n} =~/^\d+$/; - } - ################################## - #### e only ##################### - @farray=qw(filterextra1_col filterextra2_col); - for my $n (@farray) { - if ($opt{$n} =~/[a-zA-z]/) { - if ( defined $eh{$opt{$n}} ) { - $opt{$n}=$eh{$opt{$n}}; - } else { - $opt{$n}.='??' 
if $opt{$n} !~/\?/; - } - } - $colheader{("$n"."_header")}='( )'; - $colheader{("$n"."_header")}="($eheader[$opt{$n}])" if $opt{$n} =~/^\d+$/; - } - ##################################### - #### m and e ######################## - my %fhash = ( mark_col=> mark_array, mark_col2 => mark_array - ) ; - - for my $n (keys %fhash) { - if ($opt{$fhash{$n}}=~/[psm]/ ) { - ###change text to number### - if ($opt{$n} =~/[a-zA-z]/) { - if ( defined $mh{ $opt{$n} }) { - $opt{$n}=$mh{$opt{$n}}; - } else { - $opt{$n}.='??' if $opt{$n} !~/\?/; - } - } - $colheader{("$n"."_header")}='( )'; - $colheader{("$n"."_header")}="($mheader[$opt{$n}])" if $opt{$n} =~/^\d+$/; - } elsif ($opt{$fhash{$n}}=~/e/ ) { - ###extras### - if ($opt{$n} =~/[a-zA-z]/) { - if ( defined $eh{$opt{$n}}) { - $opt{$n}=$eh{$opt{$n}}; - }else { - $opt{$n}.='??' if $opt{$n} !~/\?/; - } - } - $colheader{("$n"."_header")}='( )'; - $colheader{("$n"."_header")}="($eheader[$opt{$n}])" if $opt{$n} =~/^\d+$/; - } else { - $colheader{("$n"."_header")}='( )'; - } - } -} - - -sub mark_window { - #,'#bbe8ff' light blue - # '#ffff9b' light yellow - #$ballooni->attach($tmp, -msg => &balloon_format_var('sub_arrow_on') ) if $opt{'help_on'}; -#some code from gbarr# - my $mw = new MainWindow; - $ballooni = $mw->Balloon(-initwait =>600, -background => '#ffff9d' ,-font=>'Courier 8'); - $mw->title("MARK / FIND OBJECTS"); - my $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - my ($tl,$te); #tmp varaibles to toss - ($tl,$te) = &fast_lentry($frame,$tmp,"pattern", 'mark_pattern', 50 , \&mark); - $te->pack(-fill=>'x',-expand=>1); - #$te->configure(-background=>'lightgray'); - $frame->Button(-borderwidth => 4,-command=>[\&mark,'markall'],-text => "Mark All")->pack(-side=>'right'); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - $frame->Button(-borderwidth => 4,-command=>[\&mark, 'findnext'],-text => "Find Next")->pack(-side=>'right'); - &fast_lentry($frame,$tmp,"col", 'mark_col', 5 , 
\&find_column_options); - $frame->Label(-textvariable =>\$colheader{'mark_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col2", 'mark_col2', 5 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'mark_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"color", 'mark_color', 15 , \&mark); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - $tmp=$frame->Checkbutton(-text=>'Permanent Color',-variable => \$opt{'mark_permanent'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('mark_permanent') ) if $opt{'help_on'}; - $frame->Button(-borderwidth => 4,-command=>\&mark_remove,-text => "Clear")->pack(-side=>'right'); - my $text_variable='pairs'; - - my $opt_menu=$frame->Optionmenu(-textvariable=>\$text_variable, - -variable=>\$opt{'mark_array'}, -options => [['pairs','m'], ['extras','e']], - )->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('mark_array') ) if $opt{'help_on'}; - my $lab=$frame->Label(-text=>'Mark/Find:')->pack(-side=> 'left',-anchor => 'e'); - my $mark_pairs=$frame->Checkbutton(-text=>'pairs',-variable => \$opt{'mark_pairs'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('mark_pairs') ) if $opt{'help_on'}; - my $mark_subs=$frame->Checkbutton(-text=>'subs',-variable => \$opt{'mark_subs'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('mark_subs') ) if $opt{'help_on'}; - $opt_menu->configure(-command=> - sub { - if ($opt{'mark_array'} eq 'e') { - #$lab->configure(-background=>'darkgray'); - $mark_pairs->configure(-state=>'disabled'); - $mark_subs->configure(-state=>'disabled'); - } else { - #$lab->configure(-background=>'lightgray'); - $mark_pairs->configure(-state=>'normal'); - $mark_subs->configure(-state=>'normal'); - } - } - ); - $frame= 
$mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - ($tl,$te)=&fast_lentry($frame,$tmp,"advanced", 'mark_advanced', 50 , \&mark,); - $te->pack(-fill=>'x',-expand=>1); - -} - - -sub mark_windowx { - #,'#bbe8ff' light blue - # '#ffff9b' light yellow - #some code from gbarr# - my $mw = new MainWindow; - $mw->title("MARK / FIND OBJECTS"); - my $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - my ($tl,$te); #tmp varaibles to toss - ($tl,$te) = &fast_lentry($frame,$tmp,"pattern", 'mark_pattern', 50 , \&mark); - $te->pack(-fill=>'x',-expand=>1); - #$te->configure(-background=>'lightgray'); - $frame->Button(-borderwidth => 4,-command=>[\&mark,'markall'],-text => "Mark All")->pack(-side=>'right'); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - $frame->Button(-borderwidth => 4,-command=>[\&mark, 'findnext'],-text => "Find Next")->pack(-side=>'right'); - &fast_lentry($frame,$tmp,"col", 'mark_col', 5 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'mark_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'mark_col2', 5 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'mark_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"color", 'mark_color', 15 , \&mark); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - $tmp=$frame->Checkbutton(-text=>'Permanent Color',-variable => \$opt{'mark_permanent'})->pack(-side=>'left',-anchor=>'e'); - #$ballooni->attach($tmp, -msg => &balloon_format_var('mark_permanent') ) if $opt{'help_on'}; - $frame->Button(-borderwidth => 4,-command=>\&mark_remove,-text => "Clear")->pack(-side=>'right'); - my $text_variable='pairs'; - - my $opt_menu=$frame->Optionmenu(-textvariable=>\$text_variable, - -variable=>\$opt{'mark_array'}, -options => [['pairs','m'], ['extras','e']], - )->pack(-side=> 'left',-anchor => 'e'); - #$ballooni->attach($opt_menu, -msg => 
&balloon_format_var('mark_array') ) if $opt{'help_on'}; - my $lab=$frame->Label(-text=>'Mark/Find:')->pack(-side=> 'left',-anchor => 'e'); - my $mark_pairs=$frame->Checkbutton(-text=>'pairs',-variable => \$opt{'mark_pairs'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($mark_pairs, -msg => &balloon_format_var('mark_pairs') ) if $opt{'help_on'}; - - my $mark_subs=$frame->Checkbutton(-text=>'subs',-variable => \$opt{'mark_subs'})->pack(-side=>'left',-anchor=>'e'); - $opt_menu->configure(-command=> - sub { - if ($opt{'mark_array'} eq 'e') { - #$lab->configure(-background=>'darkgray'); - $mark_pairs->configure(-state=>'disabled'); - $mark_subs->configure(-state=>'disabled'); - } else { - #$lab->configure(-background=>'lightgray'); - $mark_pairs->configure(-state=>'normal'); - $mark_subs->configure(-state=>'normal'); - } - } - ); - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-fill=>'x'); - ($tl,$te)=&fast_lentry($frame,$tmp,"advanced", 'mark_advanced', 50 , \&mark,); - $te->pack(-fill=>'x',-expand=>1); - -} - -{ -my %tmp_marks=(); -my $array_position_next=0; -my $query='naNa'; - -sub mark { - my $command=shift; - #print "$command\n"; - if ( $command eq 'markall' ) { - #reset start position - $array_position_next=0; - $query='naNa'; - } - if ( $command eq 'findnext') { - if ($query ne $opt{'mark_pattern'} ) { - print "RESETING QUERY\n" if !$opt{'quiet'};; - $array_position_next=0; - $query=$opt{'mark_pattern'}; - } - } - my $a = eval("\\\@$opt{'mark_array'}"); - #print "$opt{'mark_array'}=>$a=>",scalar @$a," (\@m)\n"; - my $color=$opt{'mark_color'}; - my $total_matches=0; - for (my $i=$array_position_next; $i < @$a; $i++) { - my $numb=$i; - my $match=0; - $match=1 if $opt{'mark_col'} =~/^\d+$/ && $$a[$i][$opt{'mark_col'}]=~/$opt{'mark_pattern'}/; - #print "$i col:$$a[$i][$opt{'mark_col'}] (",$$a[$i][$opt{'mark_col'}]=~/$opt{'mark_pattern'}/,")match\n"; - $match=1 if $opt{'mark_col2'} =~/^\d+$/ && $$a[$i][$opt{'mark_col2'}] =~ /$opt{'mark_pattern'}/ ; - 
#print "$i col:$$a[$i][$opt{'mark_col2'}] $match\n"; - if ($opt{'mark_advanced'} ne '' ) { - my $c=$$a[$i]; - my $result=eval($opt{'mark_advanced'}); - $match=1 if $result !=0; - } - next if $match==0; - - if ($opt{'mark_array'} eq 'm') { - if ($opt{'mark_pairs'} ==1) { - if ($opt{'mark_permanent'}) { - $m[$numb][$mh{'color'}]=$color; - } else { - $tmp_marks{"M$numb"} = $canvas->itemcget("M$numb",-fill) if !defined $tmp_marks{"M$numb"}; - } - $canvas->itemconfigure("M$numb", -fill=>$color ); - } - if ($opt{'mark_subs'} ==1) { - if ($opt{'mark_permanent'}) { - $m[$numb][$mh{'scolor'}]=$color; - } else { - $tmp_marks{"Sa$numb"} = $canvas->itemcget("Sa$numb",-fill)if !defined $tmp_marks{"Sa$numb"}; - $tmp_marks{"Sb$numb"} = $canvas->itemcget("Sb$numb",-fill)if !defined $tmp_marks{"Sb$numb"}; - } - $canvas->itemconfigure("Sa$numb", -fill=>$color); - $canvas->itemconfigure("Sb$numb", -fill=>$color); - } - } elsif ($opt{'mark_array'} eq 'e') { - if ($opt{'mark_permanent'} ==1) { - $e[$numb][$eh{'color'}]=$color; - } else { - $tmp_marks{"E$numb"} = $canvas->itemcget("E$numb",-fill)if !defined $tmp_marks{"E$numb"}; - } - $canvas->itemconfigure("E$numb", -fill=>$color); - } - $total_matches++; - if ($command eq 'findnext') { - #i found one so I am done# - #can i center on the object# - #print "FIND $i\n"; - $array_position_next=$i+1; - return; - } - - } - if ($command eq 'markall') { - print "TOTAL /$opt{'mark_pattern'}/ found was $total_matches\n" if !$opt{'quiet'};; - } - if ($command eq 'findnext') { - print "End of objects without match! Postion reset to beginning!\n" if !$newopt{'quiet'}; - $array_position_next=0; - } -} #close sub - - -sub mark_remove { - foreach my $o (keys %tmp_marks) { - $canvas->itemconfigure($o,-fill=> $tmp_marks{$o}); - } - %tmp_marks=(); -} - -} #close private variables - - -sub indexcard_options { - #,'#bbe8ff' light blue # '#ffff9b' light yellow - #the index cards were based off of some code from gbarr. 
thanks!# - my $mw = new MainWindow; - $mw->title("PARASIGHT OPTIONS"); - $ballooni = $mw->Balloon(-initwait =>600, -background => '#ffff9d' ,-font=>'Courier 8'); - - my $current; - $mw->setPalette('lightgrey'); - - $mw->configure(-background=>'darkgrey'); - my $bf= $mw->Frame()->pack(-side=>'top',-anchor=>'w'); - $bf->Button(-borderwidth => 4,-background=>'#bbe8ff',-command=>\&reshowNredraw,-text => "Reshow, Rearrange & Redraw")->pack(-side=>'left'); - $bf->Button(-borderwidth => 4,-background=>'white',-command=>\&redraw,-text => "Redraw Only")->pack(-side=>'left'); - - my $f = $mw->Frame( - -borderwidth => 4, - -relief => 'raised', - )->pack(-expand=>1,-fill=>'both'); - my %br; - my @l=(); - my $depth=0; - my $tf = $f->Frame( - -borderwidth => 0, - -relief => 'flat' - )->pack(-side => 'top',-fill=>'x'); - - #print (pop @{$tf->configure(-background)}), "\n"; - - foreach ( "MAIN\n","SEQ\nPAIRS","SUB\n","EXTRA\n","GRAPH\n","FILTER","MISC\n") { - my $label= $tf->Label( - -text => $_, - -borderwidth => 0, - -relief=>'sunken', - -background=>'grey', - -padx => 5, - -anchor => 'w', - )->pack(-side => 'left'); - $br{$label}=$depth; - $depth+=2; - push @l, $label; - } - $depth+=2; - #print "DEPTH:$depth\n"; - foreach (@l) { $_->configure(-pady=>$depth-$br{$_}, -borderwidth=>$br{$_},-padx=>$depth-$br{$_}); } - - my $minimize = $tf->Label( - -text => " \n ", - -borderwidth => $depth, - -background=>'darkgrey', - - -relief => 'sunken', - -padx =>2, -pady => 0, - -anchor => 'w' - )->pack(-side => 'right',-fill=> 'x',-expand=>1); - - my %c; - foreach my $i (@l) { - $i->bind('<1>', [ sub { - #print "CUR$current ==== $i\n"; - my $i=shift; - return if $current eq $i; - $br{$i}=-2; - foreach (@l) { - $br{$_}+=2 if ($br{$_} < $depth) ; - $_->configure(-borderwidth=>$br{$_} - , -pady=>$depth-$br{$_},-padx=>$depth-$br{$_}, - ); - if ( $br{$_}==0 ) { - $_->configure(-background=>'lightgrey'); - } else { - $_->configure(-background=>'grey'); - } - - } - $c{$current}->packForget; - 
$current = $i; - $c{$current}->pack(-side=>'bottom',-fill=>'x',-expand=>1); - }, $i]); - - $c{$i}= $f->Frame(-borderwidth => 0,-relief => 'raised'); - - } - - $current=$l[0]; - $current->configure(-background=>'lightgrey'); - #print "COLOR",join (" ",$current->configure(-background)),"\n"; - #print "CURRENT$current\n"; - $c{$current}->pack(-side=>'bottom',-fill=>'x',-expand=>1); - &card_main($c{$l[0]}); - &card_seq($c{$l[1]}); - &card_sub_subscale($c{$l[2]});; - &card_extra($c{$l[3]}); - &card_graph($c{$l[4]}); - &card_filter($c{$l[5]}); - &card_misc($c{$l[6]}); -} - -sub card_main { -# $tmp->configure(-background=>'#ffff9d'); - my $mw=shift; - my ($frame, $tmp); - ######################################################################## - $frame = $mw->Frame(-borderwidth=>1); - $frame->Label(-text=> "MAIN DATA INPUT",-borderwidth=>1)->pack(-side=>'left',-anchor=>'w'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(-borderwidth=>1); - &fast_lentry($frame,$tmp,"Filename", 'filename', 25 , \&redraw); - $tmp=&doublelabel($frame,$tmp," -in", \$opt{'in'}, 20 , \&reshowNredraw,); - $ballooni->attach($tmp, -justify => 'left', - -msg=>( &help_format( "-in [ parsight path] will display the last loaded or last saved parasight file. This field can not be directly altered by the user." 
))) if $opt{'help_on'}; - - - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(-borderwidth=>1); - $tmp=$frame->Checkbutton(-text=>' on',-variable => \$opt{'filename_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('filename_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"size", 'filename_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'filename_color', 7 , \&redraw); - &fast_lentry($frame,$tmp,"offset v:",'filename_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"h:", 'filename_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"pattern",'filename_pattern',8, \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - - $frame = $mw->Frame(-borderwidth=>1); - $tmp=&fast_lentry($frame,$tmp,"-align", 'align', 65 , \&reshowNredraw,'#bbe8ff'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("-align [filepath1:filepath2:etc] \n" - ." *more files containing alignments may be added \n" - ." *must be in miropeats format tab-delimited format \n" - ." (name1 b1 e1 seqlen1 name2 b2 e2 seqlen2 [optcolumns\n" - ." (the text will disappear on sucessful load) \n" )) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(-borderwidth=>1); - $tmp=&fast_lentry($frame,$tmp,"-extra", 'extra', 65 , \&reshowNredraw,'#bbe8ff'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("-extra [filepath1:filepath2:etc] \n" - ." *more files containing extra sequence annotation \n" - ." *the first 3 columns must be (seqname begin end) \n" - ." (the text will disappear on sucessful load) \n" )) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(-borderwidth=>1); - $tmp=&fast_lentry($frame,$tmp,"-showseq", 'showseq', 65 , \&reshowNredraw,'#bbe8ff'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("Designates sequences to draw: \n" - ." colon-delimited seq name(s) (e.g. 
name1:name2:name3) \n" - ." ALL (no-colon) will draw all sequences \n" - ." if just one sequence name needs ending-colon (e.g. name:) \n" - ." no-colon assumes that it is a file to open \n" - ." (seq lengths can be specified in 2nd column) \n" )) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(-borderwidth=>1); - $frame->Label(-text=> " show query (1st) sequence only ")->pack(-side=>'left',-anchor=>'w'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'showqueryonly'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('shoqueryonly') ) if $opt{'help_on'}; - $frame->Label(-text=> "(works with ALL only)")->pack(-side=>'left',-anchor=>'w'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - $frame = $mw->Frame(-borderwidth=>1); - $tmp=&fast_lentry($frame,$tmp,"-showsub", 'showsub', 60 , \&reshowNredraw,'#bbe8ff'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("Chooses the subject sequences to be drawn: \n" - ." colon-delimited seq name(s) (e.g. name1:name2:name3)\n" - ." ALL (no-colon) will draw all sequences \n" - ." for just one sequence name need colon (e.g. name:) \n" - ." no-colon assumes that it is a file to open \n" - ." 
(seq lengths can be specified in 2nd column) \n" ))if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(-borderwidth=>1); - - - - $frame = $mw->Frame(-borderwidth=>1); - &fast_lentry($frame,$tmp,"SCREEN Indent: left", 'canvas_indent_left', 5 , \&redraw); - &fast_lentry($frame,$tmp,"right", 'canvas_indent_right', 5 , \&redraw); - &fast_lentry($frame,$tmp,"top", 'canvas_indent_top', 5 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(-borderwidth=>1); - &fast_lentry($frame,$tmp,"Window Pixel Width", 'window_width', 6 , \&redraw); - &fast_lentry($frame,$tmp,"Screen bp Width:", 'canvas_bpwidth', 10 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - $frame = $mw->Frame(-relief => 'groove', -bd => 6 ); - $frame->Label(-text=> "ALIGN ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$mstring,-wraplength=>450,-justify=>'left')->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); - $frame->Label(-text=> "EXTRA ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$estring, -wraplength=>450,-justify=>'left' )->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); -} - -sub card_seq { - my $mw=shift; - my ($frame, $tmp); - ######################## - ############################# - ######frame for sequence options######### - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "SEQUENCE", -background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("Sequence options that need explainations: \n" - ." *spacing is somewhat confusing a line of sequence \n" - ." can wrap to form the equivalent of a paragraph \n" - ." spacing between lines (between paragraphs) and \n" - ." line wrapping spacing can be set to different \n" - ." values \n" - ." \n" - ." \n" - ." 
\n")) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - - $tmp=$frame->Label(-text => " -arrangeseq",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("Arranges sequences to draw: \n" - ." onerperline = each sequence is drawn on a separate line (paragraph) \n" - ." sameline = sequences are drawn on same line with set spacing between\n" - ." file = exact line and begin positions are designated in a file \n")) if $opt{'help_on'}; - - $tmp=$frame->Optionmenu(-background=> '#bbe8ff',-textvariable=>\$opt{'arrangeseq'}, -options => ['oneperline','sameline', - 'file'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('arrangeseq') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"arrange file", 'arrange_file', 30 , \&redraw,'#bbe8ff'); - - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); -# $frame = $mw->Frame(); - -# $tmp=$frame->Label(-text => " -color",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); -# $ballooni->attach($tmp,-justify => 'left', -# -msg=>("Choose color schemes for sequence and pairwise: \n" -# ."--not yet implemented!!!! 
\n")); -# $frame->Optionmenu(-background=> '#bbe8ff',-textvariable=>\$opt{'color'}, -# -options => ['NONE','???'] )->pack(-side => 'left',-anchor=>'e'); -# $ballooni->attach($tmp, -msg => &balloon_format_var('mark_pairs') ) if $opt{'help_on'}; - - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," Color", 'seq_color', 10 , \&redraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"Width", 'seq_width', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " Names")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_label_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'seq_label_color', 7 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'seq_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset v:", 'seq_label_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"h:", 'seq_label_offset_h', 3 , \&redraw); - &fast_lentry($frame,$tmp,"pattern", 'seq_label_pattern', 10 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," Spacing: btwn lines (pixels)", 'seq_line_spacing_btwn', 4 , \&redraw); - &fast_lentry($frame,$tmp,"wrap within line (pixels)", 'seq_line_spacing_wrap', 4 , \&redraw); - &fast_lentry($frame,$tmp,"btwn sequences (bp)", 'seq_spacing_btwn_sequences', 8 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - ####frame ticks ####### - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "SEQUENCE TICK MARKS")->pack(-side=> 'left',-anchor => 'e'); - $tmp->configure(-background=>'#ffff9d'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("TICK MARKS (self explainatory) \n" - ." interval tick are equally spaced \n" - ." 
begin ticks appear at beginning of sequence \n" - ." end ticks appear at end of sequence \n" )) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $tmp=$frame->Checkbutton(-text=>'whole/continuous line numbering',-variable => \$opt{'seq_tick_whole'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_whole') ) if $opt{'help_on'}; - - $frame = $mw->Frame(); - $frame->Label(-text => " Interval Tick:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"width", 'seq_tick_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'seq_tick_length', 3 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'seq_tick_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'seq_tick_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"bp inteval", 'seq_tick_bp', 8 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " label:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"size", 'seq_tick_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'seq_tick_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"offset",'seq_tick_label_offset', 3 , \&redraw); - $frame->Label(-text => "anchor") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'seq_tick_label_anchor'}, -options => ['n','e','w','s','ne','nw','se','sw'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_label_anchor') ) if 
$opt{'help_on'}; - &fast_lentry($frame,$tmp,"scaling",'seq_tick_label_multiplier', 8 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " Begin Tick:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_b_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_b_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"width", 'seq_tick_b_width', 4 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'seq_tick_b_length', 4 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'seq_tick_b_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'seq_tick_b_offset', 4 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " label:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_b_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_b_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"size", 'seq_tick_b_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'seq_tick_b_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"offset v:",'seq_tick_b_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"h:",'seq_tick_b_label_offset_h', 3 , \&redraw); - $frame->Label(-text => "anchor") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'seq_tick_b_label_anchor'}, -options => ['n','e','w','s','ne','nw','se','sw'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_b_label_anchor') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"scaling",'seq_tick_b_label_multiplier', 8 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " End Tick:")->pack(-side=> 
'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_e_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_e_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"width", 'seq_tick_e_width', 4 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'seq_tick_e_length', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'seq_tick_e_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " label:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'seq_tick_e_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_e_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"size", 'seq_tick_e_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'seq_tick_e_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"offset v:",'seq_tick_e_label_offset', 3 , \&redraw); - &fast_lentry($frame,$tmp,"h:",'seq_tick_e_label_offset_h', 3 , \&redraw); - $frame->Label(-text => "anchor") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'seq_tick_e_label_anchor'}, -options => ['n','e','w','s','ne','nw','se','sw'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('seq_tick_e_label_anchor') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"scaling",'seq_tick_e_label_multiplier', 8 , \&redraw); - - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - ######frame for pair options######### - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "PAIRS")->pack(-side=> 'left',-anchor => 'e'); - $tmp->configure(-background=>'#ffff9d'); - $ballooni->attach($tmp, -justify => 'left', - -msg=>("PAIR(WISE) \n" - ." *pairwise determination default is intra-screen \n" - ." else define equality using 2 columns here \n" - ." 
*draw level determines who is on top in picture \n" - )) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp,"DEFINING col", 'pair_type_col', 8 ,\&find_column_options); - $frame->Label(-textvariable =>\$colheader{'pair_type_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"pattern", 'pair_type_col_pattern', 10 , \&redraw,); - &fast_lentry($frame,$tmp,"col2", 'pair_type_col2', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'pair_type_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"pattern", 'pair_type_col2_pattern', 10 , \&redraw,); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "DRAW LEVEL:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'pair_level'}, - -options => ['NONE','inter_over_intra','intra_over_inter'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pair_level') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "INTRA-PAIRS: ")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'pair_intra_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pair_intra_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'pair_intra_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'pair_intra_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'pair_intra_offset', 3 , \&redraw); - $tmp=$frame->Checkbutton(-text=>'lines on',-variable => \$opt{'pair_intra_line_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pair_intra_line_on') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - - 
#####frame for inter ############# - $frame = $mw->Frame(); - $frame->Label(-text => "INTER-PAIRS: ")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'pair_inter_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pair_inter_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'pair_inter_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'pair_inter_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'pair_inter_offset', 3 , \&redraw); - $tmp=$frame->Checkbutton(-text=>'lines on',-variable => \$opt{'pair_inter_line_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pair_inter_line_on') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "ALIGNMENT TEXT: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"file name col", 'pair_align_path_col', 5 , \&redraw); - &fast_lentry($frame,$tmp,"base path", 'pair_align_path_col', 30 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); -} - - -sub card_sub_subscale { - my $mw=shift; - my ($frame,$tmp); - #####SUBJECT BELOW ##################### - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "-arrangesub",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("Arranges subject hits below sequence \n" - ." oneperline - each subject accession is placed on a separate line \n" - ." stagger - subjects are staggered(placed on same line if nonoverlapping\n" - ." (can use arrangesub to choose column to sort the arrange \n" - ." subscaleN - plots subjects on a continuous numerical scale \n" - ." subscaleC - plots subjects on a categorical (noncontinous) scale \n" - ." *subscaleC#CHR_oo21 plots subjects on basis of chromosome assignment \n" - ." 
for Jim Kent assembly but now just good example \n" - ." *subscaleN#ident?? graphs each hit on basis of percent similarity \n")); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'arrangesub'},-background=>'#bbe8ff' , - -options => ['oneperline','stagger','subscaleN','subscaleC', - '*subscaleN#ident90', '*subscaleN#ident85', '*subscaleN#ident80', - '*subscaleC#CHR_oo21'] - )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('arrangesub') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Arrangesub Column", 'arrangesub_col', 10 , \&reshowNredraw,'#bbe8ff'); - $tmp=$frame->Checkbutton(-text=>'reverse sort',-variable => \$opt{'arrangesub_rev_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('arrangesub_rev_on') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - - $frame = $mw->Frame(); - $tmp = $frame->Label(-text => "-colorsub",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("Choose coloring scheme for subjects \n" - ." NONE = no color scheme \n" - ." seqrandom = randomly assign a color to all pairwise for a sequence \n" - ." hitrandom = randomly assign a color to each indvidual pairwise \n" - ." hitconditional = assigns color based on conditional tests of a column \n" - ." 
RESET = removes color of hits (which overrides subject colors) ")); - - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'colorsub'}, -background=>'#bbe8ff' , - -options => ['NONE','seqrandom','hitrandom','hitconditional','RESET'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('colorsub') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Default Color", 'sub_color', 10 , \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp,"Condition Column", 'colorsub_hitcond_col', 10 , \&find_column_options,'#bbe8ff'); - $frame->Label(-textvariable =>\$colheader{'colorsub_hitcond_col_header'})->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp,"Condition Tests", 'colorsub_hitcond_tests', 70 , \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "SUBJECTS:",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("Subjects are the matching regions drawn below the sequence \n" - ." Subject comes from blast (query hitting subjects) additionally \n" - ." they are all drawn below the sequence sub-sequence. \n" - ." col will be used for seq1 and seq2 if col2 is empty. If col2 is not \n" - ." empty then col2 will be used for seq2 data. \n" - ." Patterns must be regular expressions with () enclosing part of match \n" - ." to extract from the string. 
")); - - - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'sub_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Width", 'sub_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"Line: space:", 'sub_line_spacing', 3 , \&redraw); - &fast_lentry($frame,$tmp,"init indent", 'sub_initoffset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => " Arrow")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable=> \$opt{'sub_arrow_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_arrow_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"line paral (-)", 'sub_arrow_paral', 7 , \&redraw); - &fast_lentry($frame,$tmp,"line diag (/)", 'sub_arrow_diag', 7 , \&redraw); - &fast_lentry($frame,$tmp,"line perp (|)", 'sub_arrow_perp', 7 , \&redraw); - $frame->pack(-side => 'top', -anchor => 'w'); - - #########SUBJECT LABELS ################## - $frame = $mw->Frame(); - $frame->Label(-text => "Label Sequence Begin:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_labelseq_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_labelseq_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"col", 'sub_labelseq_col', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelseq_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col2", 'sub_labelseq_col2', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelseq_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - ### - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," pattern", 'sub_labelseq_col_pattern', 14 , \&redraw); - 
&fast_lentry($frame,$tmp,"pattern2", 'sub_labelseq_col2_pattern', 14 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'sub_labelseq_color', 10 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'sub_labelseq_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_labelseq_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "Label Sequence End:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_labelseqe_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_labelseqe_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"col", 'sub_labelseqe_col', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelseqe_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col2", 'sub_labelseqe_col2', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelseqe_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - ### - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," pattern", 'sub_labelseqe_col_pattern', 14 , \&redraw); - &fast_lentry($frame,$tmp,"pattern2", 'sub_labelseqe_col2_pattern', 14 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'sub_labelseqe_color', 10 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'sub_labelseqe_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_labelseqe_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1, -anchor=>'w'); - - - ##### HIT LABELS ####################################### - $frame = $mw->Frame(); - $frame->Label(-text => "Label Each Hits: ")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_labelhit_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_labelhit_on') ) if $opt{'help_on'}; - 
&fast_lentry($frame,$tmp,"col", 'sub_labelhit_col', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelhit_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col2", 'sub_labelhit_col2', 8 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'sub_labelhit_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," pattern", 'sub_labelhit_col_pattern', 14 , \&redraw); - &fast_lentry($frame,$tmp,"pattern2", 'sub_labelhit_col2_pattern', 14 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'sub_labelhit_color', 10 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'sub_labelhit_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_labelhit_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "subscale ", -background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("subscale creates a numeric or categorical plot of sub-subjects below sequence:\n" - ."(N)umeric scale plots numeric values in a continous fashion \n" - ."(C)ategoric scale overlays a name for each step of the numeric scale \n" - ." categorical names are separated by commas (e.g. X,Y,1,2,3,4) \n" - ."col and col2 choose columns containing y values to plot \n" - ." if col2 is empty col will be used for seq1 and seq2 of pairwise \n" - ." if col2 filled then seq2 will use col2 while col will be for seq1 \n" - ." 
")); - &fast_lentry($frame,$tmp,"col", 'sub_scale_col', 8 , \&find_column_options,'#bbe8ff'); - $frame->Label(-textvariable=> \$colheader{'sub_scale_col_header'})->pack(-side=>'left',-anchor=>'e'); - &fast_lentry($frame,$tmp,"pattern", 'sub_scale_col_pattern', 12 , \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp,"col2", 'sub_scale_col2', 8 , \&find_column_options,'#bbe8ff'); - $frame->Label(-textvariable=> \$colheader{'sub_scale_col2_header'})->pack(-side=>'left',-anchor=>'e'); - &fast_lentry($frame,$tmp,"pattern2", 'sub_scale_col2_pattern', 12, \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - $frame = $mw->Frame(); - $frame->Label(-text => " (N)umeric ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"step", 'sub_scale_step', 5 , \&reshowNredraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"min", 'sub_scale_min', 8 , \&reshowNredraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"max", 'sub_scale_max', 8 , \&reshowNredraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"lines", 'sub_scale_lines', 5 , \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => "(C)ategoric ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"names", 'sub_scale_categoric_string', 60 , \&reshowNredraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - - #######################SCALES ##############################3 - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "DRAW SCALES", -background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("subscale creates a numeric or categorical plot of sub-subjects below sequence:\n" - ."(N)umeric scale plots numeric values in a continous fashion \n" - ."(C)ategoric scale overlays a name for each step of the numeric scale \n" - ."col and col2 choose 
columns containing y values to plot \n" - ." if col2 is empty col will be used for seq1 and seq2 of pairwise \n" - ." if col2 filled then seq2 will use col2 while col will be for seq1 \n" - ." ")); - - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_scale_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_scale_on') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "Vertical Line")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_scale_vline_on'}, -command => \&redraw)->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_scale_vline_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'sub_scale_vline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'sub_scale_vline_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_scale_vline_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - ####################### - $frame = $mw->Frame(); - $frame->Label(-text => "Horizontal Lines: ")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_scale_hline_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_scale_hline_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'sub_scale_hline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'sub_scale_hline_width', 8 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "Tick Marks")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_scale_tick_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_scale_tick_on') ) if $opt{'help_on'}; - 
&fast_lentry($frame,$tmp,"color", 'sub_scale_tick_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'sub_scale_tick_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'sub_scale_tick_length', 4 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_scale_tick_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "Tick Labels")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'sub_scale_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('sub_scale_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'sub_scale_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'sub_scale_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'sub_scale_label_offset', 4 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," value_scaling", 'sub_scale_label_multiplier', 8 , \&redraw); - &fast_lentry($frame,$tmp,"pattern", 'sub_scale_label_pattern', 14 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - ################################# - $frame = $mw->Frame(-relief => 'groove', -bd => 6 ); - $frame->Label(-text=> "ALIGN ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$mstring,-wraplength=>450,-justify=>'left')->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); -} - - -sub card_extra { - my $mw=shift; - my ($frame,$tmp); - $frame = $mw->Frame(); - $frame->Label(-text => "EXTRA")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'extra_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('extra_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"default color", 'extra_color', 10 , \&redraw); - 
&fast_lentry($frame,$tmp,"width", 'extra_width', 4 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'extra_offset', 4 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => " Label:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable=> \$opt{'extra_label_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('extra_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"col", 'extra_label_col', 7 , \&redraw); - &fast_lentry($frame,$tmp,"pattern", 'extra_label_col_pattern', 15 , \&redraw); - - &fast_lentry($frame,$tmp,"color", 'extra_label_color', 7 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'extra_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'extra_label_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - &fast_lentry($frame,$tmp," test before label:col", 'extra_label_test_col', 3 , \&redraw); - &fast_lentry($frame,$tmp,"pattern", 'extra_label_test_pattern', 15 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => " Arrow")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable=> \$opt{'extra_arrow_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('extra_arrow_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"line paral (-)", 'extra_arrow_para', 7 , \&redraw); - &fast_lentry($frame,$tmp,"line diag (/)", 'extra_arrow_diag', 7 , \&redraw); - &fast_lentry($frame,$tmp,"line perp (|)", 'extra_arrow_perp', 7 , \&redraw); - $frame->pack(-side => 'top', -anchor => 'w'); - - - - $frame = $mw->Frame(-relief => 'groove', -bd => 6 ); - $frame->Label(-text=> "EXTRA ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$estring, -wraplength=>450,-justify=>'left' 
)->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); - -} - -sub card_graph { - my $mw=shift; - my ($frame,$tmp); - #####SUBJECT BELOW ##################### - $frame = $mw->Frame(); - $tmp=$frame->Label(-text => "GRAPH:",-background => '#ffff9d')->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("These are general options controlling the graph scale \n" - ." pixel height of scale determines the breath of the common graph scale \n" - ." # of intervals determines the number of horizontal lines drawn \n" - ." inital indent determines how far above the line bottom of scale is drawn\n" - ." empty then col2 will be used for seq2 data. \n" - ." Horizontal line is common to both graph1 and graph 2 \n" - )); - - - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph_scale_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph_scale_on') ) if $opt{'help_on'}; - - &fast_lentry($frame,$tmp,"pixel height:", 'graph_scale_height', 5 , \&redraw); - &fast_lentry($frame,$tmp,"# of intervals", 'graph_scale_interval', 5 , \&redraw); - &fast_lentry($frame,$tmp,"init indent", 'graph_scale_indent', 5 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - ### - $frame = $mw->Frame()->pack(-side => 'top', -anchor => 'w'); - $frame->Label(-text => " Common Horz Line:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable=> \$opt{'graph_scale_hline_on'}) - ->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph_scale_hline_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"width", 'graph_scale_hline_width', 7 , \&redraw); - &fast_lentry($frame,$tmp,"color", 'graph_scale_hline_color', 7 , \&redraw); - - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $tmp=$frame->Label(-text => "GRAPH1", -background => '#ffff9d')->pack(-side=> 
'left',-anchor => 'e'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("GRAPH1 reates a numeric or categorical plot of sub-subjects below sequence:\n" - ."(N)umeric scale plots numeric values in a continous fashion \n" - ."(C)ategoric scale overlays a name for each step of the numeric scale \n" - ." categorical names are separated by commas (e.g. X,Y,1,2,3,4) \n" - ."col and col2 choose columns containing y values to plot \n" - ." if col2 is empty col will be used for seq1 and seq2 of pairwise \n" - ." if col2 filled then seq2 will use col2 while col will be for seq1 \n" - ." ")); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph1_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Scale Values: min", 'graph1_min', 8 , \&redraw); - &fast_lentry($frame,$tmp,"Scale Values: min", 'graph1_max', 8 , \&redraw); - - #######################SCALES ##############################3 - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Point:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'graph1_point_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_point_on') ) if $opt{'help_on'}; - #not a supported choice - #$frame->Optionmenu(-textvariable=>\$opt{'graph1_point_shape'},-background=>'#bbe8ff' , - # -options => ['Circle'] - # )->pack(-side => 'left',-anchor=>'e'); - &fast_lentry($frame,$tmp,"point size", 'graph1_point_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"fill color", 'graph1_point_fill_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"outline: color", 'graph1_point_outline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph1_point_outline_width', 3 , \&redraw); - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Connecting 
Line:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'graph1_line_on'},)->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_line_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph1_line_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph1_line_width', 3 , \&redraw); - $frame->Label(-text => "smoothing")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph1_line_smooth'}, )->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_line_smooth') ) if $opt{'help_on'}; - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Vertical Line")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph1_vline_on'},)->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_vline_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph1_vline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph1_vline_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph1_vline_offset', 3 , \&redraw); - - ####################### - $frame = $mw->Frame(); - $frame->Label(-text => "Tick Marks")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph1_tick_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_tick_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph1_tick_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'graph1_tick_length', 4 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph1_tick_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph1_tick_offset', 3 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame()->pack(-side 
=> 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Tick Labels")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph1_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph1_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph1_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'graph1_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph1_label_offset', 4 , \&redraw); - - $frame = $mw->Frame()->pack(-side=> 'top',-anchor => 'w'); - &fast_lentry($frame,$tmp," value_scaling", 'graph1_label_multiplier', 8 , \&redraw); - &fast_lentry($frame,$tmp,"decimal points", 'graph1_label_decimal', 14 , \&redraw); - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $tmp=$frame->Label(-text => "GRAPH2", -background => '#ffff9d')->pack(-side=> 'left',-anchor => 'w'); - $ballooni->attach($tmp,-justify => 'left', - -msg=>("GRAPH2 reates a numeric or categorical plot of sub-subjects below sequence:\n" - ."(N)umeric scale plots numeric values in a continous fashion \n" - ."(C)ategoric scale overlays a name for each step of the numeric scale \n" - ." categorical names are separated by commas (e.g. X,Y,1,2,3,4) \n" - ."col and col2 choose columns containing y values to plot \n" - ." if col2 is empty col will be used for seq1 and seq2 of pairwise \n" - ." if col2 filled then seq2 will use col2 while col will be for seq1 \n" - ." 
")); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph2_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Scale Values: min", 'graph2_min', 8 , \&redraw); - &fast_lentry($frame,$tmp,"Scale Values: min", 'graph2_max', 8 , \&redraw); - - #######################SCALES ##############################3 - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Point:")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'graph2_point_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_point_on') ) if $opt{'help_on'}; - #not a supported choice - #$frame->Optionmenu(-textvariable=>\$opt{'graph2_point_shape'},-background=>'#bbe8ff' , - # -options => ['Circle'] - # )->pack(-side => 'left',-anchor=>'e'); - &fast_lentry($frame,$tmp,"point size", 'graph2_point_size', 3 , \&redraw); - &fast_lentry($frame,$tmp,"fill color", 'graph2_point_fill_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"outline: color", 'graph2_point_outline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph2_point_outline_width', 3 , \&redraw); - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Connecting Line:")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'graph2_line_on'},)->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_line_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph2_line_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph2_line_width', 3 , \&redraw); - $frame->Label(-text => "smoothing")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph2_line_smooth'}, )->pack(-side=>'left',-anchor=>'e'); - 
$ballooni->attach($tmp, -msg => &balloon_format_var('graph2_line_smooth') ) if $opt{'help_on'}; - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Vertical Line")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph2_vline_on'},)->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_vline_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph2_vline_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph2_vline_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph2_vline_offset', 3 , \&redraw); - - ####################### - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Tick Marks")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph2_tick_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_tick_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph2_tick_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"length", 'graph2_tick_length', 4 , \&redraw); - &fast_lentry($frame,$tmp,"width", 'graph2_tick_width', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph2_tick_offset', 3 , \&redraw); - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "Tick Labels")->pack(-side=> 'left',-anchor => 'w'); - $tmp=$frame->Checkbutton(-text=>'ON',-variable => \$opt{'graph2_label_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('graph2_label_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"color", 'graph2_label_color', 8 , \&redraw); - &fast_lentry($frame,$tmp,"size", 'graph2_label_fontsize', 3 , \&redraw); - &fast_lentry($frame,$tmp,"offset", 'graph2_label_offset', 4 , \&redraw); - - $frame = $mw->Frame()->pack(-side=> 'top',-anchor => 
'w'); - &fast_lentry($frame,$tmp," value_scaling", 'graph2_label_multiplier', 8 , \&redraw); - &fast_lentry($frame,$tmp,"decimal points", 'graph2_label_decimal', 14 , \&redraw); - - ################################# - #$frame = $mw->Frame(-relief => 'groove', -bd => 6 )->pack(-side=> 'top',-anchor => 'e'); - #$frame->Label(-text=> "GRAPH input files are fixed:sequence position value")->pack(-side=>'top',-anchor=>'w'); -} - - - -sub card_filter { - my $mw=shift; - my ($frame,$tmp); - $frame = $mw->Frame(); - $frame->Label(-text => "ALIGN FILTERS")->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $tmp=$frame->Checkbutton(-text=>'reset to show all',-variable => \$opt{'pfilter_reset'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('pfilter_reset') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "PRE-ARRANGE-1 ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filterpre1_col', 5 ,\&find_column_options,'#bbe8ff'); - $frame->Label(-textvariable => \$colheader{'filterpre1_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filterpre1_min', 6 , \&redraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"max", 'filterpre1_max', 6 , \&redraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => "PRE-ARRANGE-2: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filterpre2_col', 5 , \&find_column_options,'#bbe8ff'); - $frame->Label(-textvariable => \$colheader{'filterpre2_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filterpre2_min', 6 , \&redraw,'#bbe8ff'); - &fast_lentry($frame,$tmp,"max", 'filterpre2_max', 6 , \&redraw,'#bbe8ff'); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame 
= $mw->Frame(); - $frame->Label(-text => "POST-ARRANGE-1: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filter1_col', 5 , \&find_column_options); - $frame->Label(-textvariable => \$colheader{'filter1_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filter1_min', 6 , \&redraw); - &fast_lentry($frame,$tmp,"max", 'filter1_max', 6 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => "POST-ARRANGE-2: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filter2_col', 5 , \&find_column_options); - $frame->Label(-textvariable => \$colheader{'filter2_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filter2_min', 6 , \&redraw); - &fast_lentry($frame,$tmp,"max", 'filter2_max', 6 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(); - $frame->Label(-text => "EXTRA POST-ARRANGE-1: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filterextra1_col', 5 , \&find_column_options); - $frame->Label(-textvariable => \$colheader{'filterextra1_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filterextra1_min', 6 , \&redraw); - &fast_lentry($frame,$tmp,"max", 'filterextra1_max', 6 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame = $mw->Frame(); - $frame->Label(-text => "EXTRA POST-ARRANGE-2: ")->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"col", 'filterextra2_col', 5 , \&find_column_options); - $frame->Label(-textvariable => \$colheader{'filterextra2_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"min", 'filterextra2_min', 6 , \&redraw); - &fast_lentry($frame,$tmp,"max", 'filterextra2_max', 6 , \&redraw); - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame = $mw->Frame(-relief => 'groove', 
-bd => 6 ); - $frame->Label(-text=> "ALIGN ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$mstring,-wraplength=>450,-justify=>'left')->pack(-side=> 'top',-anchor => 'w'); - $frame->Label(-text=> "EXTRA ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$estring, -wraplength=>450,-justify=>'left' )->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); - -} - -sub card_misc { - my $mw=shift; - my ($frame, $tmp); - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w');; - $frame->Label(-text => "PRINTING") -> pack(-side=>'left',-anchor=>'e'); - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w');; - &fast_lentry($frame,$tmp,"Printer:", 'print_command', 12, \&focus_ok ); - &fast_lentry($frame,$tmp,"Multipage # wide:", 'print_multipages_wide', 12, \&focus_ok ); - &fast_lentry($frame,$tmp,"# high:", 'print_multipages_high', 12, \&focus_ok ); - $tmp=$frame->Checkbutton(-text=>'landscape',-variable => \$opt{'printer_page_orientation'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('printer_page_orientation') ) if $opt{'help_on'}; - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); ; - &fast_lentry($frame,$tmp,"Page width:", 'printer_page_width', 12, \&focus_ok ); - &fast_lentry($frame,$tmp,"height:", 'printer_page_length', 12, \&focus_ok ); - - $frame = $mw->Frame(); - $frame->Label(-text => "BACKGROUND GIF") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'gif_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('gif_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"path", 'gif_path', 35, \&redraw ); - &fast_lentry($frame,$tmp,"x", 'gif_x', 6, \&redraw ); - &fast_lentry($frame,$tmp,"y", 'gif_y', 6, \&redraw ); - $frame->Label(-text => "anchor") -> pack(-side=>'left',-anchor=>'e'); - 
$tmp=$frame->Optionmenu(-textvariable=>\$opt{'gif_anchor'}, -options => ['center','n','e','w','s','ne','nw','se','sw'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('gif_anchor') ) if $opt{'help_on'}; - $frame->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - $frame->Label(-text =>"LEFT-CLICK MENU OPTIONS")->pack(-side=> 'left',-anchor => 'e'); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"ALIGNMENTS Bases: query col", 'alignment_col', 5 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'alignment_col_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"subject col", 'alignment_col2', 5 , \&find_column_options); - $frame->Label(-textvariable =>\$colheader{'alignment_col2_header'})->pack(-side=> 'left',-anchor => 'e'); - &fast_lentry($frame,$tmp,"line wrap width", 'alignment_wrap', 5 ); - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - $frame->Label(-text =>"FASTA EXTRACT:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'fasta_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('fasta_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"Fasta directory: ", 'fasta_directory', 40 , \&find_column_options); - &fast_lentry($frame,$tmp,"frag size", 'fasta_fragsize', 7 , \&find_column_options); - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"Fasta blast db(s): ", 'fasta_blastdb', 40 , \&find_column_options); - &fast_lentry($frame,$tmp,"fasta wrap width: ", 'fasta_wrap', 4 , \&find_column_options); - - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"Execute 1 command", 'execute', 35 , 
\&find_column_options); - &fast_lentry($frame,$tmp,"desc", 'execute_desc', 15 , \&find_column_options); - my $tv='pairs'; - $tv='extras' if $opt{'execute_array'} eq 'e'; - my $opt_menu=$frame->Optionmenu(-textvariable=>\$tv, - -variable=>\$opt{'execute_array'}, -options => [['pairs','m'], ['extras','e']], - -command => \&find_column_options - )->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('execute_array') ) if $opt{'help_on'}; - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"Execute 2", 'execute2', 35 , \&find_column_options); - &fast_lentry($frame,$tmp,"desc", 'execute2_desc', 15 , \&find_column_options); - my $tv2='pairs'; - $tv2='extras' if $opt{'execute2_array'} eq 'e'; - my $opt_menu=$frame->Optionmenu(-textvariable=>\$tv2, - -variable=>\$opt{'execute2_array'}, -options => [['pairs','m'], ['extras','e']], - -command => \&find_column_options - )->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('execute2_array') ) if $opt{'help_on'}; - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"Execute 3", 'execute3', 35 , \&find_column_options); - &fast_lentry($frame,$tmp,"desc", 'execute3_desc', 15 , \&find_column_options); - my $tv3='pairs'; - $tv3='extras' if $opt{'execute3_array'} eq 'e'; - my $opt_menu=$frame->Optionmenu(-textvariable=>\$tv3, - -variable=>\$opt{'execute3_array'}, -options => [['pairs','m'], ['extras','e']], - -command => \&find_column_options - )->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('execute3_array') ) if $opt{'help_on'}; - - $frame= $mw->Frame()->pack(-side=>'top',-anchor=>'w',-expand=>1,-fill=>'x'); - &fast_lentry($frame,$tmp,"Execute 4", 'execute4', 35 , \&find_column_options); - &fast_lentry($frame,$tmp,"desc", 'execute4_desc', 15 , \&find_column_options); - my 
$tv4='pairs'; - $tv4='extras' if $opt{'execute4_array'} eq 'e'; - my $opt_menu=$frame->Optionmenu(-textvariable=>\$tv4, - -variable=>\$opt{'execute4_array'}, -options => [['pairs','m'], ['extras','e']], - -command => \&find_column_options - )->pack(-side=> 'left',-anchor => 'e'); - $ballooni->attach($opt_menu, -msg => &balloon_format_var('execute4_array') ) if $opt{'help_on'}; - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "POPUP Window Desc: header format") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Optionmenu(-textvariable=>\$opt{'popup_format'}, -options => ['number','text'] )->pack(-side => 'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('popup_format') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"max entry length", 'popup_max_len', 5 ); - - $frame = $mw->Frame()->pack(-side => 'top', -expand=> 1,-anchor=>'w'); - $frame->Label(-text => "HELP POPUP: ") -> pack(-side=>'left',-anchor=>'e'); - $tmp=$frame->Checkbutton(-text=>'on',-variable => \$opt{'help_on'})->pack(-side=>'left',-anchor=>'e'); - $ballooni->attach($tmp, -msg => &balloon_format_var('help_on') ) if $opt{'help_on'}; - &fast_lentry($frame,$tmp,"line wrap width", 'help_wrap', 5 ); - - $frame = $mw->Frame(-relief => 'groove', -bd => 6 ); - $frame->Label(-text=> "ALIGN ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$mstring,-wraplength=>450,-justify=>'left')->pack(-side=> 'top',-anchor => 'w'); - $frame->Label(-text=> "EXTRA ARRAY ")->pack(-side=>'top',-anchor=>'w'); - $frame->Label(-textvariable => \$estring, -wraplength=>450,-justify=>'left' )->pack(-side=> 'top',-anchor => 'w'); - $frame->pack(-side => 'top', -anchor => 'w'); - -} - - - - - -sub global_edit { - my $aname=$opt{'edit_arrayname'}; - return if !($aname eq 'm' || $aname eq 'e'); - my $ap = eval ( '\@'.$aname); - my @a = @{$ap}; - for (my $i=0;$i<@a;$i++) { - my $do_it=1; - my @c = @{ $a[$i] }; - foreach my $v ( qw(A B 
C D E F) ) { - #print "$v: $opt{'edit'}{$v}{'col'}\n"; - next if $opt{'edit'}{$v}{'col'} eq ''; - my $variable = '$c['. $opt{'edit'}{$v}{'col'} . "]"; - my $operation = $opt{'edit'}{$v}{'op'}; - #print "===>$variable $operation\n"; - my $e = eval ( ($variable . ' '.$operation) ); - if ($e eq '') { - $do_it = 0; - last; - } - } - # if modify @c; - #if delete remove this bugger - # if - # if del - # if modify return - # - - - } -} - -sub edit_options { - my $mw=MainWindow->new; - ### frame spacing #### - $frame = $mw->Frame(-relief => 'groove', -bd => 4 ); - $frame->Label(-text => "ARRAY")->pack(-side=> 'left',-anchor => 'e'); - my $button=$frame->Button(-text => "COLOR:$opt{'edit_color'}",-background=>$opt{'edit_color'}) - ->pack(-side=>'right',-anchor => 'e'); - $button->configure(-command=>[sub { my $b= $_[0]; my $mw=$_[1]; - my $color = $b->chooseColor(-title=>'Choose New Color', - -initialcolor=> $opt{'edit_color'} ); - $mw->raise(); - if (defined $color) { - $b->configure(-background=> $color, -text => "COLOR:$color"); - $opt{'edit_color'}=$color; - - } - - }, $button, $mw] ); - - - - $tmp=$frame->Entry(-textvariable => \$opt{'edit_arrayname'} ,-width=>3)->pack(-side=> 'left', -anchor=> 'e'); - $tmp->bind("", sub{ my $a= $opt{'edit_arrayname'}; - #print "XXXX",$a,"YYYYY\n"; - my $ah= '\@'."$a"."header"; - #print "XXXX",$ah,"YYYYY"; - my $ahp= eval( $ah); - next if ! 
defined $ahp; - $column_header_display=''; - for (my $i=0; $i< @{$ahp}; $i++) { - $column_header_display.= "$i)$$ahp[$i] "; - } - #print "\n"; - }); - #sub edit_print_array { - - $frame->Button(-text=>'Delete',-command=> [\&global_edit, "Delete"])->pack(-side=>'right',-anchor=>'e'); - - $frame->Button(-text=>'Hide',-command=> [\&global_edit, "Hide"])->pack(-side=>'right',-anchor=>'e'); - - $frame->Button(-text=>'Test and Modify',-command=> [\&global_edit, 'Test&Mod'])->pack(-side=>'right',-anchor=>'e'); - $frame->pack(-side => 'top', -expand=> 1); - - #########TEST AND OPERATION FRAME ################## - $frame = $mw->Frame(-relief => 'groove', -bd => 4 ); - $frame->Label(-text => "colA:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit_A_col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opA:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit_A_op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - $frame->pack(-side => 'top', -expand=> 1); - - $frame->Label(-text => "colB:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit'}{'B'}{'col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opB:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit'}{'B'}{'op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "colC:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit'}{'C'}{'col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opC:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit'}{'C'}{'op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->pack(-side => 'top', -expand=> 1); - - - ####TEST AND OPERATION FRAME 2 ################### - $frame = $mw->Frame(-relief => 'groove', -bd => 4 ); - 
$frame->Label(-text => "colD:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit'}{'D'}{'col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opD:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit'}{'D'}{'op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - $frame->pack(-side => 'top', -expand=> 1); - - $frame->Label(-text => "colE:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit'}{'E'}{'col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opE:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit'}{'E'}{'op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "colF:")->pack(-side=> 'left',-anchor => 'e'); - $tmp=$frame->Entry(-textvariable => \$opt{'edit'}{'F'}{'col'},-width=>10)->pack(-side=> 'left', -anchor=> 'e'); - - $frame->Label(-text => "opF:")->pack(-side=> 'left',-anchor => 'e'); - $tmp = $frame->Entry(-textvariable => \$opt{'edit'}{'F'}{'op'},-width=>20)->pack(-side=> 'left', -anchor=> 'e'); - $frame->pack(-side => 'top', -expand=> 1); - - $frame = $mw->Frame(-relief => 'groove', -bd => 4 ); - $frame->Label(-textvariable => \$column_header_display, -wraplength=>800 )->pack(-side=> 'left',-anchor => 'e'); - $frame->pack(-side => 'top', -expand=> 1); -} - -############################################################ -##############################DRAWING ###################### -sub update { - $mw->update; - $canvas->update; - print "VIEW UPDATE\n" if !$newopt{'quiet'}; -} - -sub reshowNredraw { - if ( (keys %pairwise2delete) > 0) { - foreach (reverse sort {$a <=> $b} keys %pairwise2delete) { - print "$_ "; - splice(@m,$_,1); - } - %pairwise2delete=(); - - } - #print "ALIGN_UPDATE\n" ; - &align_update; - $opt{'align'}=''; - #print "show_UPDATE\n"; - &show_update; - &show_update_subject; - #print "extra 
UPDATE\n"; - &extra_update; - $opt{'extra'}=''; - #print "graph UPDATE\n"; - &graph_update; - $opt{'graph'}=''; - &find_column_options; - print "DISPLAY CALCULATIONS\n" if !$newopt{'quiet'}; - &show_calculations; - - print "**** REDRAWING ****\n" if !$newopt{'quiet'}; - foreach my $ac (@acc_order) { - foreach my $s ( keys % {$acc{$ac}{'sub'}} ) { - $accsub{$s}{'acc'}{$ac}{'qmin_f'}=99999999999999; - $accsub{$s}{'acc'}{$ac}{'qmax_f'}=-10; - } - } - &accession_begin_end; - &arrange_seq_lines; - &arrange_subjects if $opt{'sub_on'}; - &colorsub; - &generate_column_header_strings; - - &redraw; - $first_pass=0; -} - -sub redraw { - ####RESET CERTAIN VALUES### - &find_column_options; - %msghash=(); - $scale=1; - - $canvas->delete("all"); - if ($opt{'filename_on'}) { - $opt{'filename'}=$filepath if $filepath && !$opt{'filename'}; - my $fn=$opt{'filename'}; - $fn=$filepath if $filepath && !$opt{'filename'}; - $fn=$1 if $fn =~ /$opt{'filename_pattern'}/ && $opt{'filename_pattern'}; - #print "FILENAME $fn\n"; - $canvas->createText(1+$opt{'filename_offset_h'},1+$opt{'filename_offset'}, -text=>$fn, -fill=>$opt{'filename_color'},-font=>"Courier $opt{'filename_size'}", -anchor=>"nw"); - } - if ($opt{'gif_on'}) { - my $photo=$canvas->createImage($opt{'gif_x'},$opt{'gif_y'},-image=> $mw->Photo(-file => $opt{'gif_path'}), - -anchor => $opt{'gif_anchor'}); - } - $canvas_width = $opt{'window_width'} -$opt{'canvas_indent_left'} -$opt{'canvas_indent_right'}; - $bp_per_pixel = $opt{'canvas_bpwidth'} / $canvas_width; - - &draw_sequences; #draw sequences also draws graph scales - &draw_scales; - &draw_pairwise; #draw pairwise also draws subs# - &draw_extra; - &draw_subject_labels if$opt{'sub_on'} && ( $opt{'sub_labelseq_on'} || $opt{'sub_labelseqe_on'} ) && $opt{'arrangesub'} !~/statistic/; - - &draw_graph; - my $drawtext=eval( '"'. $opt{'text_text'} . 
'"' ); - if ($opt{'text_on'} ) { - $canvas->createText($opt{'text_offset_h'},$opt{'text_offset'}, - -text=> $drawtext, - -fill=>$opt{'text_color'}, - -font=>"Courier $opt{'text_fontsize'}", - -anchor=>$opt{'text_anchor'} - ); - } - $drawtext=eval( '"'. $opt{'text2_text'} . '"' ); - if ($opt{'text2_on'} ) { - $canvas->createText($opt{'text2_offset_h'},$opt{'text2_offset'}, - -text=> $drawtext, - -fill=>$opt{'text2_color'}, - -font=>"Courier $opt{'text2_fontsize'}", - -anchor=>$opt{'text2_anchor'} - ); - } - - - - - $canvas->configure(-scrollregion=>[$canvas->bbox("all")]); - #$canvas->delete ("2"); -} - -sub accession_begin_end { -# if ($opt{'just_pairwise'} == 0 ) { - #print "ALL OF SEQUENCE PLOTTED\n"; - foreach ( keys %acc) { - $acc{$_}{'b'}=1 if !defined $acc{$_}{'b'}; - $acc{$_}{'e'}=$acc{$_}{'len'} if !defined $acc{$_}{'e'}; - #print "$_ $acc{$_}{'b'} $acc{$_}{'e'}\n"; - } -} - - -sub arrange_seq_lines { - my $i=0; - @l=(); - %lpos=(); - if ($opt{'arrangeseq'} eq 'oneperline') { - foreach ( @acc_order ) { - @{$l[$i]{'acc'}}=($_); - $i++; - } - } elsif ($opt{'arrangeseq'} eq 'sameline') { - @{$l[0]{'acc'}}=(@acc_order ); - } elsif ($opt{'arrangeseq'} eq 'file' ) { - print "ARRANGING USING FILE ($opt{'arrange_file'}\n"; - open (ARRANGE, $opt{'arrange_file'} ) || die "Can't read ($opt{'arrange_file'})!\n"; - #file that is inputted contains seqname begin# - #newlines are represented NEWLINE# - my $head=; - print "HEAD$head"; - while () { - s/\r\n/\n/;chomp; chomp; - my @c=split /\t/; - print "$_\n"; - if ($c[0] =~ /NEWLINE/) {$i++;next;} - next if $c[0] !~ /\S+/; - die "$opt{'arrange_file'} contains $_ which has no digit data for position\n" if $c[1] !~ /^\d+$/; - push @{$l[$i]{'acc'}},$c[0]; - $lpos{$c[0]}=$c[1]; - } - close ARRANGE; - ##################################### - my $count=0; - foreach (keys %acc) { - if (! 
defined $lpos{$_}) { - #print "WARNING\n"; - $count++; - &warnNpause( "ERROR:$_ does not have line and base start position in ($opt{'arrange_file'})!\n"); - } - } - if ($count > 0) { - print uc "WARNING missing position data for show seq in ($opt{'arrange_file'})\n"; - print uc "Display Results will be unpredicable!!!!!!!\n"; - print "PRESS RETURN AND CONTINUE AT OWN RISK. (Cntl-C to Quit) "; - my $pause=; - } - } else { - die "Can not arrange with -arrangeseq $opt{'arrangeseq'}!\n"; - } -} - - -sub draw_sequences { - $widest_line=0; - my %sub_scale_hash=(); - my %graph_scale_hash=(); - print "DRAWING SEQUENCES...\n" if !$newopt{'quiet'}; - #variables for changing globally eventually - my $wrap_on=1; - my $color=$opt{'seq_color'}; - my $thickness=$opt{'seq_width'}; - ###the subroutine itself - my $liney=$opt{'canvas_indent_top'}; - foreach (my $l=0; $l<@l; $l++) { - $l[$l]{'liney'} = $liney; - $l[$l]{'maxbp'}=0; - my $bp_x=1; - foreach my $a( @{ $l[$l]{'acc'} }) { - #####initial calc of line and bp - $acc{$a}{'l'} = $l; - if ( defined $lpos{$a} ) { - #print "LPOS$a => $lpos{$a}\n"; - $bp_x=$lpos{$a}; - } - $acc{$a}{'xb'}=$bp_x; - $bp_x+=($acc{$a}{'e'}-$acc{$a}{'b'}); - $acc{$a}{'xe'} = $bp_x; - $l[$l]{'max'}=$bp_x if $bp_x > $l[$l]{'max'}; - $bp_x+= $opt{'seq_spacing_btwn_sequences'}+1 if !defined $lpos{$a}; - - - my $xb=$acc{$a}{'xb'}; - my $xe=$acc{$a}{'xe'}; - $widest_line=$xe if $xe>$widest_line; - my $begin=$acc{$a}{'b'}; - &draw_line_horz_pieces($l,$xb,$l,$xe, "s$a", $color, $thickness); - my $tagname = "s$a"; #can't go inside a sub!!! 
- $canvas->addtag('seq','withtag',$tagname); - - my ($x1,$y1)=&linexbp2xy($l,$xb); - my ($x2,$y2)=&linexbp2xy($l,$xe); - - if ($opt{'seq_tick_whole'} == 0) { - if ($opt{'seq_tick_b_on'}) { - my $tag=$canvas->createLine($x1,$y1+$opt{'seq_tick_b_offset'}, $x1,$y1+$opt{'seq_tick_b_length'}+$opt{'seq_tick_b_offset'}, - -width => $opt{'seq_tick_b_width'}, -fill => $opt{'seq_tick_b_color'}); - $canvas->addtag("tick","withtag",$tag); - if ($opt{'seq_tick_b_label_on'} ) { - $tag=$canvas->createText($x1+$opt{'seq_tick_b_label_offset_h'}, $y1+$opt{'seq_tick_b_length'} + $opt{'seq_tick_b_label_offset'}, - -text=>int($begin*$opt{'seq_tick_b_label_multiplier'}), - -fill=>$opt{'seq_tick_b_label_color'}, - -font=>"Courier $opt{'seq_tick_b_label_fontsize'}", - -anchor=>$opt{'seq_tick_b_label_anchor'} - ); - $canvas->addtag("tickl","withtag",$tag); - } - } - if ($opt{'seq_tick_e_on'}) { - my $tag=$canvas->createLine($x2,$y2+$opt{'seq_tick_e_offset'}, $x2,$y2+$opt{'seq_tick_e_length'}+$opt{'seq_tick_e_offset'}, - -width => $opt{'seq_tick_e_width'}, -fill => $opt{'seq_tick_e_color'}); - $canvas->addtag("tick","withtag",$tag); - if ($opt{'seq_tick_e_label_on'} ) { - $tag=$canvas->createText($x2+$opt{'seq_tick_e_label_offset_h'}, $y2+$opt{'seq_tick_e_length'} + $opt{'seq_tick_e_label_offset'}, - -text=>int(($begin+$xe-$xb)*$opt{'seq_tick_e_label_multiplier'} ), - -fill=>$opt{'seq_tick_e_label_color'}, - -font=>"Courier $opt{'seq_tick_e_label_fontsize'}", - -anchor=>$opt{'seq_tick_e_label_anchor'} - ) ; - $canvas->addtag("tickl","withtag",$tag); - } - } - - #print "=======> ($x1,$y1) ($x2,$y2)\n"; - if ($opt{'seq_tick_on'} ) { - for (my $t=$xb-1+$opt{'seq_tick_bp'}-(($begin) % $opt{'seq_tick_bp'}); $t<= $xe; $t+=$opt{'seq_tick_bp'}) { - my ($xt, $yt)=&linexbp2xy($l, $t); - my $tag=$canvas->createLine($xt,$yt+$opt{'seq_tick_offset'}, $xt,$yt+$opt{'seq_tick_length'}+$opt{'seq_tick_offset'}, - -width => $opt{'seq_tick_width'}, -fill => $opt{'seq_tick_color'}); - 
$canvas->addtag('tick',"withtag",$tag); - if ($opt{'seq_tick_label_on'} ) { - $tag=$canvas->createText($xt, $yt+$opt{'seq_tick_length'} + $opt{'seq_tick_label_offset'}, - -text=> ($t-$xb+1+$begin)*$opt{'seq_tick_label_multiplier'}, - -fill=> $opt{'seq_tick_label_color'}, - -font=>"Courier $opt{'seq_tick_label_fontsize'}", - -anchor=>$opt{'seq_tick_label_anchor'} - ); - $canvas->addtag('tickl','withtag',$tag); - } - } - } - } - # foreach (qw(seq_label_on seq_label_pattern)) { - # print "$_=>$opt{$_}####\n"; - # } - if ($opt{'seq_label_on'}) { - # print "A:$a\n"; - my $name=$a; - $name=$1 if $opt{'seq_label_pattern'} ne '' && $name=~/$opt{'seq_label_pattern'}/ ; - my $tag=$canvas->createText($x1+$opt{'seq_label_offset_h'}, $y1-6+$opt{'seq_label_offset'}, - -text=>$name, - -font=>"Courier $opt{'seq_label_fontsize'}", - -fill=> $opt{'seq_label_color'}, -anchor=>"sw"); - $canvas->addtag("seqn","withtag",$tag); - # print "SEQN:$name\n"; - } - - } - $liney+= int(($bp_x-1)/$opt{'canvas_bpwidth'}) * $opt{'seq_line_spacing_wrap'} + $opt{'seq_line_spacing_btwn'}; - } - #my $pause=; -} - -sub draw_scales { - print "DRAWING SCALES FOR GRAPHS AND SUBJECTS...\n" if !$newopt{'quiet'}; - foreach (my $l=0; $l<@l; $l++) { - my ($x1,$y1)=&linexbp2xy($l,1); - my ($x2,$y2)=&linexbp2xy($l,$l[$l]{'max'}); - my $xb=1; - my $xe=$l[$l]{'max'}; - #print "$l $y1 $y2\n"; - if ($opt{'seq_tick_whole'} == 1) { - if ($opt{'seq_tick_b_on'}) { - my $tag=$canvas->createLine($x1,$y1+$opt{'seq_tick_b_offset'}, $x1,$y1+$opt{'seq_tick_b_length'}+$opt{'seq_tick_b_offset'}, - -width => $opt{'seq_tick_b_width'}, -fill => $opt{'seq_tick_b_color'}); - $canvas->addtag("tick","withtag",$tag); - if ($opt{'seq_tick_b_label_on'} ) { - $tag=$canvas->createText($x1+$opt{'seq_tick_b_label_offset_h'}, $y1+$opt{'seq_tick_b_length'} + $opt{'seq_tick_b_label_offset'}, - -text=>int($xb*$opt{'seq_tick_b_label_multiplier'}), - -fill=>$opt{'seq_tick_b_label_color'}, - -font=>"Courier $opt{'seq_tick_b_label_fontsize'}", - 
-anchor=>$opt{'seq_tick_b_label_anchor'} - ); - $canvas->addtag("tickl","withtag",$tag); - } - } - if ($opt{'seq_tick_e_on'}) { - my $tag=$canvas->createLine($x2,$y2+$opt{'seq_tick_e_offset'}, $x2,$y2+$opt{'seq_tick_e_length'}+$opt{'seq_tick_e_offset'}, - -width => $opt{'seq_tick_e_width'}, -fill => $opt{'seq_tick_e_color'}); - $canvas->addtag("tick","withtag",$tag); - if ($opt{'seq_tick_e_label_on'} ) { - $tag=$canvas->createText($x2+$opt{'seq_tick_e_label_offset_h'}, $y2+$opt{'seq_tick_e_length'} + $opt{'seq_tick_e_label_offset'}, - -text=>int(($xe +0.000001)*$opt{'seq_tick_e_label_multiplier'}) , - -fill=>$opt{'seq_tick_e_label_color'}, - -font=>"Courier $opt{'seq_tick_e_label_fontsize'}", - -anchor=>$opt{'seq_tick_e_label_anchor'} - ) ; - $canvas->addtag("tickl","withtag",$tag); - } - } - - #print "=======> ($x1,$y1) ($x2,$y2)\n"; - if ($opt{'seq_tick_on'} ) { - for (my $t=$xb-1+$opt{'seq_tick_bp'}; $t<= $xe; $t+=$opt{'seq_tick_bp'}) { - my ($xt, $yt)=&linexbp2xy($l, $t); - my $tag=$canvas->createLine($xt,$yt+$opt{'seq_tick_offset'}, $xt,$yt+$opt{'seq_tick_length'}+$opt{'seq_tick_offset'}, - -width => $opt{'seq_tick_width'}, -fill => $opt{'seq_tick_color'}); - $canvas->addtag('tick',"withtag",$tag); - if ($opt{'seq_tick_label_on'} ) { - $tag=$canvas->createText($xt, $yt+$opt{'seq_tick_length'} + $opt{'seq_tick_label_offset'}, - -text=> ($t-$xb+1)*$opt{'seq_tick_label_multiplier'}, - -fill=> $opt{'seq_tick_label_color'}, - -font=>"Courier $opt{'seq_tick_label_fontsize'}", - -anchor=>$opt{'seq_tick_label_anchor'} - ); - $canvas->addtag('tickl','withtag',$tag); - } - } - } - - #if ($opt{'seq_label_on'}) { - # my $name=$_; - # $name=$1 if $opt{'seq_label_pattern'} ne '' && $name=~/$opt{'seq_label_pattern'}/ ; - # my $tag=$canvas->createText($x1+$opt{'seq_label_offset_h'}, $y1-6+$opt{'seq_label_offset'}, - # -text=>$name, - # -font=>"Courier $opt{'seq_label_fontsize'}", - # -fill=> $opt{'seq_label_color'}, -anchor=>"sw"); - # $canvas->addtag("seqn","withtag",$tag); 
- #} - - } - if ( $opt {'graph_scale_on'} ) { - my $intercept1=$opt{'graph1_min'}; - my $slope1=($opt{'graph1_max'}-$opt{'graph1_min'})/$opt{'graph_scale_height'}; - my $scale1= $opt{'graph_scale_height'} / ( $opt{'graph1_max'}-$opt{'graph1_min'} ); - - my $intercept2=$opt{'graph2_min'}; - my $slope2=($opt{'graph2_max'}-$opt{'graph2_min'})/$opt{'graph_scale_height'}; - my $scale2= $opt{'graph_scale_height'} / ( $opt{'graph2_max'}-$opt{'graph2_min'} ); - - my $step = $opt{'graph_scale_height'}/$opt{'graph_scale_interval'}; - - for(my $i=$y1; $i<=$y2; $i+=$opt{'seq_line_spacing_wrap'}) { - #next if defined $graph_scale_hash{$i}; - #$graph_scale_hash{$i}='done'; - #print "GSCALE $i\n"; - my $xbegin= $opt{'canvas_indent_left'}+$opt{'sub_scale_vline_offset'}; - my $xend = $opt{'window_width'}-$opt{'canvas_indent_right'}; - if ($i == $y2) { $xend=$x2}; - #print "DRAWING VLINE\n"; - if ($opt{'graph1_on'} && $opt{'graph1_vline_on'}) { - my $tag=$canvas->createLine ( $xbegin,$i+$opt{'graph_scale_indent'}, - $xbegin,$i-$opt{'graph_scale_height'}+$opt{'graph_scale_indent'}, - -width => $opt{'graph1_vline_width'}, -fill => $opt{'graph1_vline_color'} - ); - $canvas->addtag('gs','withtag',$tag); - } - if ($opt{'graph2_on'} && $opt{'graph2_vline_on'}) { - my $tag=$canvas->createLine ( $xend,$i+$opt{'graph_scale_indent'}, - $xend,$i-$opt{'graph_scale_height'}+$opt{'graph_scale_indent'}, - -width => $opt{'graph2_vline_width'}, -fill => $opt{'graph2_vline_color'} - ); - $canvas->addtag('gs','withtag',$tag); - } - for (my $j=0; $j<=$opt{'graph_scale_height'};$j+=$step) { - my $y=$i+$opt{'graph_scale_indent'} -$j; - # print "$j => $l => $y\n"; - if ($opt{'graph_scale_hline_on'}) { - my $tag=$canvas->createLine($xbegin, $y,$xend,$y, - -width => $opt{'graph_scale_hline_width'}, -fill => $opt{'graph_scale_hline_color'}); - $canvas->addtag('gs','withtag',$tag); - } - if ($opt{'graph1_on'} && $opt{'graph1_tick_on'}) { - #print "DRAWING GSCALE1 TICK\n"; - my 
$tag=$canvas->createLine($xbegin-$opt{'graph1_tick_length'}+$opt{'graph1_tick_offset'}, $y - ,$xbegin+$opt{'graph1_tick_offset'},$y, - -width => $opt{'graph1_tick_width'}, -fill => $opt{'graph1_tick_color'} - ); - $canvas->addtag('gs','withtag',$tag); - } - if ($opt{'graph2_on'} && $opt{'graph2_tick_on'}) { - #print "DRAWING GSCALE1 TICK\n"; - my $tag=$canvas->createLine($xend-$opt{'graph2_tick_length'}+$opt{'graph2_tick_offset'}, $y - ,$xend+$opt{'graph2_tick_offset'},$y, - -width => $opt{'graph2_tick_width'}, -fill => $opt{'graph2_tick_color'} - ); - $canvas->addtag('gs','withtag',$tag); - } - if ($opt{'graph1_on'} && $opt{'graph1_label_on'}) { - my $label=$j*$slope1+$intercept1; ; - $label=int($label* $opt{'graph1_label_multiplier'}*10**$opt{'graph1_label_decimal'}+0.00000000000001 )/10**$opt{'graph1_label_decimal'}; - my $tag=$canvas->createText( - $xbegin-$opt{'graph1_tick_length'}+$opt{'graph1_tick_offset'}-2+$opt{'graph1_label_offset'}, $y, - -text=>$label, - -font=>"Courier $opt{'graph1_label_fontsize'}", - -fill=> $opt{'graph1_label_color'},-anchor=>"e"); - $canvas->addtag('gsl','withtag',$tag); - } - if ($opt{'graph2_on'} && $opt{'graph2_label_on'}) { - my $label=$j*$slope2+$intercept2; ; - $label=int($label* $opt{'graph2_label_multiplier'}*10**$opt{'graph2_label_decimal'}+0.00000000000001 )/10**$opt{'graph2_label_decimal'}; - my $tag=$canvas->createText( - $xend-$opt{'graph2_tick_length'}+$opt{'graph2_tick_offset'}-2+$opt{'graph2_label_offset'}, $y, - -text=>$label, - -font=>"Courier $opt{'graph2_label_fontsize'}", - -fill=> $opt{'graph2_label_color'},-anchor=>"w"); - $canvas->addtag('gsl','withtag',$tag); - } - } - } - } - if ( $opt{'sub_scale_on'} ) { - for(my $i=$y1; $i<=$y2; $i+=$opt{'seq_line_spacing_wrap'}) { - ###################################################################### - my $scale= $opt{'sub_scale_lines'} / ( $opt{'sub_scale_max'}-$opt{'sub_scale_min'} ); - my $step = $opt{'sub_scale_step'}; - my $xbegin= 
$opt{'canvas_indent_left'}+$opt{'sub_scale_vline_offset'}; - my $xend = $opt{'window_width'}-$opt{'canvas_indent_right'}; - if ($i == $y2) { $xend=$x2}; - - #########draw vertical lines of scale ##################### - #top is $opt{'sub_intoffset'} - #bottom is $opt{'sub_scale_lines'} - if ($opt{'sub_scale_vline_on'}) { - my $tag=$canvas->createLine ( $xbegin,$i+$opt{'sub_initoffset'}, - $xbegin,$i+$opt{'sub_scale_lines'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}, - -width => $opt{'sub_scale_vline_width'}, -fill => $opt{'sub_scale_vline_color'} - ); - $canvas->addtag('ss','withtag',$tag); - } - #########draw horizontal scale lines and ticks and labels####################### - my ($min,$max)=($opt{'sub_scale_min'},$opt{'sub_scale_max'}); - ($min,$max)=($max,$min) if $min > $max; - for (my $j=$opt{'sub_scale_min'}; $j<=$max && $j>=$min;$j+=$step) { - #need max to min step1# - my $l = ($opt{'sub_scale_max'} - $j)*$scale; - my $y= $l*$opt{"sub_line_spacing"}+$opt{'sub_initoffset'} +$i; - #print "$j => $l => $y\n"; - if ($opt{'sub_scale_hline_on'}) { - my $tag=$canvas->createLine($xbegin, $y,$xend,$y, - -width => $opt{'seq_tick_width'}, -fill => $opt{'sub_scale_hline_color'}); - $canvas->addtag('ss','withtag',$tag); - } - if ($opt{'sub_scale_tick_on'}) { - my $tag=$canvas->createLine($xbegin-$opt{'sub_scale_tick_length'}+$opt{'sub_scale_tick_offset'}, $y - ,$xbegin+$opt{'sub_scale_tick_offset'},$y, - -width => $opt{'sub_scale_tick_width'}, -fill => $opt{'sub_scale_tick_color'}); - $canvas->addtag('ss','withtag',$tag); - - } - if ($opt{'sub_scale_label_on'}) { - my $label; - if ($opt{'arrangesub'}=~/subscaleN/) { - $label=int(($j+0.000000000001)*1000000)/1000000*$opt{'sub_scale_label_multiplier'}; - } else { - $label=$subscaleC[$j]; - } - my $tag=$canvas->createText($xbegin-$opt{'sub_scale_tick_length'}+$opt{'sub_scale_tick_offset'}-2+$opt{'sub_scale_label_offset'}, $y, - -text=>$label, - -font=>"Courier $opt{'sub_scale_label_fontsize'}", - -fill=> 
$opt{'sub_scale_label_color'},-anchor=>"e"); - $canvas->addtag('ssl','withtag',$tag); - } - } - #########draw vertical lines of scale ##################### - #top is $opt{'sub_intoffset'} - #bottom is $opt{'sub_scale_lines'} - if ($opt{'sub_scale_vline_on'}) { - my $tag=$canvas->createLine ( $xbegin,$i+$opt{'sub_initoffset'}, - $xbegin,$i+$opt{'sub_scale_lines'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}, - -width => $opt{'sub_scale_vline_width'}, -fill => $opt{'sub_scale_vline_color'} - ) ; - $canvas->addtag('ss','withtag',$tag); - - } - } - } - } -} - - -sub draw_pairwise { - print "DRAWING PAIRWISE AND SUBJECTS...\n" if !$newopt{'quiet'}; - my $size=scalar(@m); - #my $m_pointer=\@m; - #my $m_point_point= \$m_pointer; - #print "ARRAYPOINTER M:$m_pointer\n"; - my ($n1,$b1,$e1,$l1,$n2,$b2,$e2,$l2,$skip_this ) ; - for (my $i=0; $i < $size;$i++) { - ($n1,$b1,$e1,$l1,$n2,$b2,$e2,$l2 ) = @{$m[$i]}; - # print "$i) $n1 $b1 $e1 $l1 $n2 $b2 $e2 $l2\n"; - print "..$i\n" if $i % 1000 ==0 && $i !=0 && !$newopt{'quiet'}; - next if $m[$i][$mh{'hide'}]; - my $defined1=(defined $acc{$n1}); - my $defined2=(defined $acc{$n2}); - ###calculate whether sequence is present### - if ($defined1) { - if ( ($b1 < $acc{$n1}{'b'} && $e1 < $acc{$n1}{'b'} ) - || ($b1 > $acc{$n1}{'e'} && $e1 > $acc{$n1}{'e'} ) ) { - $defined1=0 ; - } else { - $b1=$acc{$n1}{'b'} if $b1 < $acc{$n1}{'b'} ; - $e1=$acc{$n1}{'e'} if $e1 > $acc{$n1}{'e'} ; - } - } - if ($defined2) { - #print "$b2-$e2\n"; - if ( ($b2 < $acc{$n2}{'b'} && $e2 < $acc{$n2}{'b'} ) - || ($b2 > $acc{$n2}{'e'} && $e2 > $acc{$n2}{'e'} ) ) { - $defined2=0 ; - } else { - if ($b2 < $e2 ) { - $b2=$acc{$n2}{'b'} if $b2 < $acc{$n2}{'b'} ; - $e2=$acc{$n2}{'e'} if $e2 > $acc{$n2}{'e'} ; - } else { - #print "else $b2-$e2\n"; - $b2=$acc{$n2}{'e'} if $b2 > $acc{$n2}{'e'} ; - $e2=$acc{$n2}{'b'} if $e2 < $acc{$n2}{'b'} ; - - - } - } - - } - next if !( ($defined1 && defined $accsub{$n2} ) || ($defined2 && defined $accsub{$n1}) ); - next if ( !$defined1 
&& !$defined2 ); - ###convert to proper xposition #### - #print "$n1=> $acc{$n1}{'xb'} $acc{$n1}{'b'}-$acc{$n1}{'e'} ($b1-$e1)\n"; - #print "$n2=> $acc{$n2}{'xb'} $acc{$n2}{'b'}-$acc{$n2}{'e'} ($b2-$e2)\n"; - if (defined $acc{$n1}) { - $b1= $acc{$n1}{'xb'}+$b1 -$acc{$n1}{'b'}; - $e1= $acc{$n1}{'xb'}+$e1 -$acc{$n1}{'b'}; - } - if (defined $acc{$n2}) { - $b2= $acc{$n2}{'xb'}+$b2 -$acc{$n2}{'b'}; - $e2= $acc{$n2}{'xb'}+$e2 -$acc{$n2}{'b'}; - } - #print "ACC ($n1) $acc{$n1}{'b'}-$acc{$n1}{'e'} $b1-$e1\n"; - ############################################################# - ##########FILTERING TO OCCUR################################# - ############################################################# - $skip_this=0; - if ($opt{'filter1_col'} =~/^\d+$/) { - $skip_this=1 if $m[$i][$opt{'filter1_col'}]< $opt{'filter1_min'} && $opt{'filter1_min'} ne ''; - $skip_this=1 if $m[$i][$opt{'filter1_col'}]> $opt{'filter1_max'} && $opt{'filter1_max'} ne ''; - } - if ($opt{'filter2_col'} =~/^\d+$/) { - $skip_this=1 if $m[$i][$opt{'filter2_col'}]< $opt{'filter2_min'} && $opt{'filter2_min'} ne ''; - $skip_this=1 if $m[$i][$opt{'filter2_col'}]> $opt{'filter2_max'} && $opt{'filter2_max'} ne ''; - } - next if $skip_this; - ############################################################## - #################DRAW SUBJECTS################################ - ############################################################## - #print "$n1 => $acc{$n1}{'xb'} $b1 $acc{$n1}{'b'} ($b1-$e1)\n"; - #print "$n2=> $acc{$n2}{'xb'} $b1 $acc{$n2}{'b'} ($b2-$e2)\n"; - - if ($opt{'sub_on'}==1) { - my $color=$opt{'sub_color'}; - if ($defined1 && defined $accsub{$n2} ) { - ########N2 is the SUB##################### - #next if !$defined1; #don't draw if if $n1 isn't being drawn - if ($m[$i][$mh{'scolor'}] ) { - $color=$m[$i][$mh{'scolor'}]; - } else { - $color=$accsub{$n2}{'color'} if $accsub{$n2}{'color'}; - } - my $offset= $accsub{$n2}{'acc'}{$n1}{'line'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}; - $offset = 
$m[$i][$mh{'sline'}] * $opt{'sub_line_spacing'}+$opt{'sub_initoffset'}if $m[$i][$mh{'sline'}] ne ''; - my $width= $opt{'sub_width'}; - my ($start,$stop)=($b1,$e1); - ($stop,$start)=($start,$stop) if $start> $stop; - #print "$n1:$n2 C$color W$width O$offset\n"; - ($stop,$start)=($start,$stop) if $start> $stop; - $accsub{$n2}{'acc'}{$n1}{'qmin_f'}=$start if $start < $accsub{$n2}{'acc'}{$n1}{'qmin_f'} ; - $accsub{$n2}{'acc'}{$n1}{'qmax_f'}=$stop if $stop > $accsub{$n2}{'acc'}{$n1}{'qmax_f'}; - #print "$start $stop\n"; - if ( $opt{'sub_arrow_on'} ) { - my $arrow='first'; - $arrow='last' if $b2<$e2; - #print "$acc{$n1}{'l'} $start $acc{$n1}{'l'} $stop\n"; - &draw_line_horz_pieces($acc{$n1}{'l'},$start,$acc{$n1}{'l'},$stop, "Sa$i" - ,$color, $width, $offset - ,$arrow,$opt{'sub_arrow_paral'},$opt{'sub_arrow_diag'},$opt{'sub_arrow_perp'}); - - } else { - &draw_line_horz_pieces($acc{$n1}{'l'},$start,$acc{$n1}{'l'},$stop, "Sa$i", - $color, $width, $offset); - } - if ($opt{'sub_labelhit_on'}==1) { - #print "WRITING LABEL FOR N2:$n2\n"; - my $label=''; - my $xblabel=$acc{$n1}{'xb'}+$accsub{$n2}{'acc'}{$n1}{'qmin'}; - if ( $opt{'sub_labelhit_col2'} ne '') { - $label = $m[$i][ $opt{'sub_labelhit_col2'} ]; - $label=$1 if $opt{'sub_labelhit_col2_pattern'} && $label =~ /$opt{'sub_labelhit_col2_pattern'}/; - } elsif ($opt{'sub_labelhit_col'} ne '') { - - $label = $m[$i][ $opt{'sub_labelhit_col'} ]; - $label=$1 if $opt{'sub_labelhit_col_pattern'} && $label =~ /$opt{'sub_labelhit_col_pattern'}/; - #print " DRAW $label\n"; - } - my ($xl, $yl)=&linexbp2xy($acc{$n1}{'l'},$start); - my $tag=$canvas->createText($xl, $yl+$offset, - -text=>$label, -fill => $opt{'sub_labelhit_color'}, - -font=>"Courier $opt{'sub_labelhit_size'}", -anchor=>"e"); - $canvas->addtag('subl','withtag',$tag); - } - my $tagname = "Sa$i"; #can't go inside a sub!!! 
- $canvas->addtag("sub","withtag",$tagname); - } - #print "ACCN2:(",defined $acc{$n2},")(", defined $accsub{$n1}, ")\n"; - if ($defined2 && defined $accsub{$n1} ) { - #next if !defined2; - ############N1 is the SUB######################## - $color=$opt{'sub_color'}; - if ($m[$i][$mh{'scolor'}] ){ - $color=$m[$i][$mh{'scolor'}]; - } else { - $color=$accsub{$n1}{'color'} if $accsub{$n1}{'color'}; - } - my $offset= $accsub{$n1}{'acc'}{$n2}{'line'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}; - $offset = $m[$i][$mh{'sline'}] * $opt{'sub_line_spacing'}+$opt{'sub_initoffset'}if $m[$i][$mh{'sline'}] ne ''; - my $width= $opt{'sub_width'}; - my ($start,$stop)=($b2,$e2); - #print "$n1:$n2 C$color W$width O$offset\n"; - ($stop,$start)=($start,$stop) if $start> $stop; - $accsub{$n1}{'acc'}{$n2}{'qmin_f'}=$start if $start < $accsub{$n1}{'acc'}{$n2}{'qmin_f'} ; - $accsub{$n1}{'acc'}{$n2}{'qmax_f'}=$stop if $stop > $accsub{$n1}{'acc'}{$n2}{'qmax_f'}; - if ($opt{'sub_arrow_on'} ) { - my $arrow='first'; - $arrow='last' if $b2<$e2; - &draw_line_horz_pieces($acc{$n2}{'l'},$start,$acc{$n2}{'l'},$stop, "Sb$i", - $color, $width, $offset, - ,$arrow,$opt{'sub_arrow_paral'},$opt{'sub_arrow_diag'},$opt{'sub_arrow_perp'}); - - } else { - &draw_line_horz_pieces($acc{$n2}{'l'},$start,$acc{$n2}{'l'},$stop, "Sb$i", - "$color", $width, $offset); - } - - #print "Zoom...\n"; - if ( $opt{'sub_labelhit_on'}==1) { - #print "WRITING LABEL FOR N1:$n1\n"; - my $label=''; - my $xblabel=$acc{$n2}{'xb'}+$accsub{$n1}{'acc'}{$n2}{'qmin'}; - if ( $opt{'sub_labelhit_col'} ne '' ) { - $label = $m[$i][ $opt{'sub_labelhit_col'} ]; - $label=$1 if $opt{'sub_labelhit_col_pattern'} && $label=~/$opt{'sub_labelhit_col_pattern'}/; - } - my ($xl, $yl)=&linexbp2xy($acc{$n2}{'l'},$start); - my $tag=$canvas->createText($xl, $yl+$offset, - -text=>$label, - -font=>"Courier $opt{'sub_labelhit_size'}", -anchor=>"e"); - $canvas->addtag('subl','withtag',$tag); - $acc{$n2}{'labeldrawn'}=1; - } - my $tagname = "Sb$i"; #can't go 
inside a sub!!! - $canvas->addtag("sub","withtag",$tagname); - } - - } - - ############################################## - ####DETERMINE IF INTER OR INTRA PICTURE ###### - my $pairtype = 'inter'; - #print "$n1:($acc{$n1}{'e'})$n2:($acc{$n2}{'e'})"; - $pairtype='intra' if (defined $acc{$n1} && defined $acc{$n2}); - if ($opt{'pair_type_col'} ne '') { - if ($opt{'pair_type_col2'} ne '') { - #print "PAIR TYPE CHECK\n"; - my $text1=$m[$i][$opt{'pair_type_col'}]; - $text1=$1 if $opt{'pair_type_col_pattern'} && $text1=~/$opt{'pair_type_col_pattern'}/; - my $text2=$m[$i][$opt{'pair_type_col2'}]; - $text2=$1 if $opt{'pair_type_col2_pattern'} && $text2=~/$opt{'pair_type_col2_pattern'}/; - if ($text1 eq $text2) { $pairtype='intra'} else {$pairtype='inter'} - - } else { - my $text1=$m[$i][$opt{'pair_type_col'}]; - $text1=$1 if $opt{'pair_type_col_pattern'} && $text1=~/$opt{'pair_type_col_pattern'}/; - $pairtype='intra' if $text1=~/intra/i; - $pairtype='inter' if $text1=~/inter/i; - } - - } - ######FILTER THE PAIRWISE AND HIDE IF NECESSARY ###### - ##################################################### - #########determine other characteristics ########### - my $color = $opt{"pair_$pairtype".'_color'}; - $color = $m[$i][$mh{'color'}] if $m[$i][$mh{'color'}]; - my $width = $opt{"pair_$pairtype".'_width'}; - $width = $m[$i][$mh{'width'}] if $m[$i][$mh{'width'}]; - my $offset = $opt{"pair_$pairtype".'_offset'}; - $offset =$m[$i][$mh{'offset'}] if $m[$i][$mh{'offset'}]; - - #### SKIP IF PAIRTYPE NOT TO BE DISPLAYED ###### - next if !$opt{'pair_intra_on'} && $pairtype eq 'intra'; - next if !$opt{'pair_inter_on'} && $pairtype eq 'inter'; - ####draw the lines ####### - my ($start,$stop)=($b1,$e1); - ($stop,$start)=($start,$stop) if $start> $stop; - &draw_line_horz_pieces($acc{$n1}{'l'},$start,$acc{$n1}{'l'},$stop, "M$i", - $color, $width, $offset) - if $defined1; - ($start,$stop)=($b2,$e2); - ($stop,$start)=($start,$stop) if $start> $stop; - 
&draw_line_horz_pieces($acc{$n2}{'l'},$start,$acc{$n2}{'l'},$stop, "M$i", - $color, $width, $offset) - if $defined2; - if ( ($pairtype eq 'intra' && $opt{'pair_intra_line_on'}) - || ($pairtype eq 'inter' && $opt{'pair_inter_line_on'} && $defined1 && $defined2 ) - ) { - my $acc_e1=$acc{$n1}{'e'}+$acc{$n1}{'xb'}-1; - my $acc_e2=$acc{$n2}{'e'}+$acc{$n2}{'xb'}-1; - - - #print " $acc{$n1}{'xb'} <= $b1 && $b1 <= $acc_e1 && $acc{$n2}{'xb'} <= $b2 && $b2 <= $acc_e2\n"; - #print " $acc{$n1}{'xb'} <= $e1 && $e1 <= $acc_e1 && $acc{$n2}{'xb'} <= $e2 && $e2 <= $acc_e2\n"; - if ($acc{$n1}{'xb'} <= $b1 && $b1 <= $acc_e1 && $acc{$n2}{'xb'} <= $b2 && $b2 <= $acc_e2) { - my ($b1x, $b1y)=&linexbp2xy($acc{$n1}{'l'},$b1); - my ($b2x, $b2y)=&linexbp2xy($acc{$n2}{'l'},$b2); - my $line; - if ($b1y==$b2y) { - #print "draw begin on same\n"; - $line= $canvas->createLine($b1x,$b1y, ($b1x+$b2x)/2, $b1y-0.66*$opt{'seq_line_spacing_wrap'}, - $b2x,$b2y,-width => 1, -fill => $color); - } else { - #print " begin drawing different\n"; - $line= $canvas->createLine($b1x,$b1y,$b2x,$b2y,-width => 1, -fill => $color); - } - $canvas->addtag("M$i", 'withtag',$line); - } - if ( $acc{$n1}{'xb'} <= $e1 && $e1 <= $acc_e1 && $acc{$n2}{'xb'} <= $e2 && $e2 <= $acc_e2) { - my ($e1x, $e1y)=&linexbp2xy($acc{$n1}{'l'},$e1); - my ($e2x, $e2y)=&linexbp2xy($acc{$n2}{'l'},$e2); - my $line; - if ($e1y==$e2y) { - $line= $canvas->createLine($e1x,$e1y, ($e1x+$e2x)/2, $e1y-0.66*$opt{'seq_line_spacing_wrap'}, - $e2x,$e2y,-width => 1, -fill => $color); - } else { - $line= $canvas->createLine($e1x,$e1y,$e2x,$e2y,-width => 1, -fill => $color); - - } - $canvas->addtag("M$i", 'withtag',$line); - } - } elsif ($pairtype eq 'inter' && $opt{'pair_inter_line_on'} ) { - my ($b1x,$b1y,$b2x,$b2y); - if ($defined1 ) { - #draw n1# - ($b1x, $b1y)=&linexbp2xy($acc{$n1}{'l'},$b1); - ($b2x, $b2y)=&linexbp2xy($acc{$n1}{'l'},$e1); - } else { - #draw n2# - ($b1x, $b1y)=&linexbp2xy($acc{$n2}{'l'},$b2); - ($b2x, $b2y)=&linexbp2xy($acc{$n2}{'l'},$e2); 
- } - my $line= $canvas->createLine($b1x,$b1y, $b1x, $b1y-0.90*$opt{'seq_line_spacing_wrap'}, - $b2x,$b2y,-width => 1, -fill => $color); - $canvas->addtag("M$i", 'withtag',$line); - } - - my $tagname = "M$i"; #can't go inside a sub!!! - if ($opt{'pair_level'} eq 'inter_over_intra' ) { - if ($pairtype eq 'inter') {$canvas->raise($tagname);} else { $canvas->lower($tagname); } - } elsif ($opt{'pair_level'} eq 'intra_over_inter' ) { - if ($pairtype eq 'inter') {$canvas->lower($tagname);} else { $canvas->raise($tagname); } - - } - $canvas->addtag("$pairtype","withtag",$tagname); - } - print "..$size\n" if !$newopt{'quiet'}; -} - -sub draw_subject_labels { - foreach my $ac (@acc_order) { - foreach my $s ( keys % {$acc{$ac}{'sub'}} ) { - #print "$ac $s\n"; - my $ref=$m[$accsub{$s}{'acc'}{$ac}{'eghit'}]; - my $label=''; - #print " $s $a $accsub{$s}{'acc'}{$ac}{'qmin'} $accsub{$s}{'acc'}{$ac}{'qmax'}\n "; - next if $accsub{$s}{'acc'}{$ac}{'qmin'} > $acc{$ac}{'e'} - || $accsub{$s}{'acc'}{$ac}{'qmax'} < $acc{$ac}{'b'}; - if($opt{'sub_labelseq_on'} && $opt{'sub_labelseq_col'} ne '') { - #print "TRYING TO LABEL SUBJECTS $$ref[4]\n"; - if ( $$ref[4] eq $s && $opt{'sub_labelseq_col2'} ne '' ) { - $label = $$ref[ $opt{'sub_labelseq_col2'} ]; - $label=$1 if $opt{'sub_labelseq_col2_pattern'} && $label =~ /$opt{'sub_labelseq_col2_pattern'}/; - } elsif ($opt{'sub_labelseq_col'} ne '') { - - $label = $$ref[ $opt{'sub_labelseq_col'} ]; - $label=$1 if $opt{'sub_labelseq_col_pattern'} && $label =~ /$opt{'sub_labelseq_col_pattern'}/; - #print " DRAW $label\n"; - } - my $xblabel= $accsub{$s}{'acc'}{$ac}{'qmin'}+ $acc{$ac}{'xb'}-$acc{$ac}{'b'}; - my $offset= $accsub{$s}{'acc'}{$ac}{'line'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}; - my ($xl, $yl)=&linexbp2xy($acc{$ac}{'l'},$xblabel); - my $tag=$canvas->createText($xl- $opt{'sub_labelseq_offset'}, $yl+$offset, - -text=>$label, -fill => $opt{'sub_labelseq_color'}, - -font=>"Courier $opt{'sub_labelseq_size'}", -anchor=>"e"); - 
$canvas->addtag('subl','withtag',$tag); - - } - if($opt{'sub_labelseqe_on'} && $opt{'sub_labelseqe_col'} ne '') { - #print "TRYING TO LABEL SUBJECTS $$ref[4]\n"; - if ( $$ref[4] eq $s && $opt{'sub_labelseqe_col2'} ne '' ) { - $label = $$ref[ $opt{'sub_labelseqe_col2'} ]; - $label=$1 if $opt{'sub_labelseqe_col2_pattern'} && $label =~ /$opt{'sub_labelseqe_col2_pattern'}/; - } elsif ($opt{'sub_labelseqe_col'} ne '') { - - $label = $$ref[ $opt{'sub_labelseqe_col'} ]; - $label=$1 if $opt{'sub_labelseqe_col_pattern'} && $label =~ /$opt{'sub_labelseqe_col_pattern'}/; - #print " DRAW $label\n"; - } - my $xblabel= $accsub{$s}{'acc'}{$ac}{'qmax'}+ $acc{$ac}{'xb'}-$acc{$ac}{'b'}; - my $offset= $accsub{$s}{'acc'}{$ac}{'line'}*$opt{'sub_line_spacing'}+$opt{'sub_initoffset'}; - my ($xl, $yl)=&linexbp2xy($acc{$ac}{'l'},$xblabel); - my $tag=$canvas->createText($xl- $opt{'sub_labelseqe_offset'}, $yl+$offset, - -text=>$label, -fill => $opt{'sub_labelseqe_color'}, - -font=>"Courier $opt{'sub_labelseqe_size'}", -anchor=>"w"); - $canvas->addtag('subl','withtag',$tag); - - } - - } - } -} - -sub draw_extra { - print "DRAWING EXTRA...\n" if !$newopt{'quiet'}; - my $size=scalar(@e); - my $skip_this=0; - for (my $i=0; $i < $size;$i++) { - print "..$i\n" if $i % 1000 ==0 && $i !=0 && !$newopt{'quiet'}; - my $ep=$e[$i]; - my ($n1,$b1,$e1,$color,$offset,$width,$arrow)=@{$ep}[0..6]; - next if !defined($acc{$n1}); - $skip_this=0; - if ($opt{'filter1_col'} =~/^\d+$/) { - $skip_this=1 if $$ep[$opt{'filterextra1_col'}]< $opt{'filterextra1_min'} && $opt{'filterextra1_min'} ne ''; - $skip_this=1 if $$ep[$opt{'filterextra1_col'}]> $opt{'filterextra1_max'} && $opt{'filterextra1_max'} ne ''; - } - if ($opt{'filter2_col'} =~/^\d+$/) { - $skip_this=1 if $$ep[$opt{'filterextra2_col'}]< $opt{'filterextra2_min'} && $opt{'filterextra2_min'} ne ''; - $skip_this=1 if $$ep[$opt{'filterextra2_col'}]> $opt{'filterextra2_max'} && $opt{'filterextra2_max'} ne ''; - } - next if $skip_this; - if ( ($b1 < $acc{$n1}{'b'} 
&& $e1 < $acc{$n1}{'b'} ) - || ($b1 > $acc{$n1}{'e'} && $e1 > $acc{$n1}{'e'} ) ) { - next; - } else { - $b1=$acc{$n1}{'b'} if $b1 < $acc{$n1}{'b'} ; - $e1=$acc{$n1}{'e'} if $e1 > $acc{$n1}{'e'} ; - } - #print "$n1 $b1 $e1 C$color W$width O$offset ..../n"; - $color=$opt{'extra_color'} if ! $color; - $width=$opt{'extra_width'} if !$width; - $offset=$opt{'extra_offset'} if $offset eq ''; - - $b1= $acc{$n1}{'xb'}+$b1 -$acc{$n1}{'b'}; - $e1= $acc{$n1}{'xb'}+$e1 -$acc{$n1}{'b'}; - my $tag=''; - if (defined $acc{$n1} ) { - my ($start,$stop)=($b1,$e1); - ($stop,$start)=($start,$stop) if $start> $stop; - if ($opt{'extra_arrow_on'} ) { - if ($arrow eq 'F') { $arrow = 'last' - } elsif ($arrow eq 'R') { $arrow = 'first' - } else { $arrow = 'none'} -# #print "DRAWING ARROW:$arrow\n"; - &draw_line_horz_pieces($acc{$n1}{'l'},$start,$acc{$n1}{'l'},$stop, "E$i", $color,$width,$offset - ,$arrow,$opt{'extra_arrow_para'},$opt{'extra_arrow_diag'},$opt{'extra_arrow_perp'}); #$thick *$opt{'pair_intra_width'}); - } else { - &draw_line_horz_pieces($acc{$n1}{'l'},$start,$acc{$n1}{'l'},$stop, "E$i", $color,$width,$offset); #$thick *$opt{'pair_intra_width'}); - - } - - if ( $opt{'extra_label_on'}==1) { - #print "DRAW LABEL\n"; - if ( $opt{'extra_label_test_col'} && $opt{'extra_label_test_pattern'} ) { - #print "SKIPPING\n"; - next if $e[$i][ $opt{'extra_label_test_col'}] !~ /$opt{'extra_label_test_pattern'}/; - } - - my $label=''; - my $xblabel=$start; - if ( $opt{'extra_label_col'} ne '' ) { - #print "FIGURE OUT LABEL $opt{'extra_label_col'}\n"; - $label = $e[$i][ $opt{'extra_label_col'} ]; - $label=$1 if $opt{'extra_label_col_pattern'} && $label=~/$opt{'extra_label_col_pattern'}/; - } - #print "DRAWING LABEL $label\n"; - my ($xl, $yl)=&linexbp2xy($acc{$n1}{'l'},$start); - $tag=$canvas->createText($xl+$opt{'extra_label_offset'}, $yl+$offset, - -text=>$label, - -font=>"Courier $opt{'extra_label_fontsize'}", -anchor=>"e",-fill=>$opt{'extra_label_color'}); - 
$canvas->addtag('exl','withtag',$tag); - - } - my $tagname = "E$i"; #can't go inside a sub!!! - $canvas->addtag("ex","withtag",$tagname); - } - - } - print "..$size\n" if !$newopt{'quiet'}; - -} - -sub draw_graph { - foreach my $numb (1,2) { - my $array = "g$numb"; - my $size= scalar(@$array); - next if $size==0; - print "GRAPHING $numb..\n" if !$newopt{'quiet'}; - my $p = \@$array; - my $bpos; - my $offset_slope= -$opt{'graph_scale_height'}/($opt{"graph$numb"."_max"} - $opt{"graph$numb"."_min"}); - my $offset_intercept=$opt{'graph_scale_indent'} - $offset_slope*$opt{"graph$numb"."_min"}; - my @line; - my $line_on=$opt{"graph$numb"."_line_on"}; - my $line_color=$opt{"graph$numb"."_line_color"}; - my $line_width=$opt{"graph$numb"."_line_width"}; - my $line_smooth=$opt{"graph$numb"."_line_smooth"}; - my $point_on=$opt{"graph$numb"."_point_on"}; - my $point_shape=$opt{"graph$numb"."_point_shape"}; - my $point_size=$opt{"graph$numb"."_point_size"}; - my $point_fill_color=$opt{"graph$numb"."_point_fill_color"}; - my $point_outline_color=$opt{"graph$numb"."_point_outline_color"}; - my $point_outline_width=$opt{"graph$numb"."_point_outline_width"}; - #print "$point_on $point_shape $point_size $point_fill_color\n"; - my ($seq,$position,$val); - my ($x1,$y1,$offset); - for (my $i=0; $i < $size;$i++ ) { - print "..$i\n" if $i % 1000 ==0 && $i !=0 && !$newopt{'quiet'}; - $seq=$$p[$i][0]; - $position=$$p[$i][1]; - $val=$$p[$i][2]; - next if !defined($acc{$seq}); - next if $position < $acc{$seq}{'b'}; - next if $position > $acc{$seq}{'e'}; - ### calculate the graph positions### - $bpos = $acc{$seq}{'xb'} + $position - $acc{$seq}{'b'}; - ($x1,$y1) = &linexbp2xy($acc{$seq}{'l'},$bpos); - $offset=$val*$offset_slope + $offset_intercept; - $offset+=$y1; - if ($line_on) { - push @line, $x1,$offset if $val ne '' ; - my ($x2,$y2); - if ($i< $size-1) { - ($x2,$y2)=&linexbp2xy($acc{$seq}{'l'},$acc{$seq}{'xb'} + $$p[$i+1][1] - $acc{$seq}{'b'}) - } - #print "$y1 ne $y2 $seq ne $$p[$i+1][0] 
$$p[$i+1][1] > $acc{$seq}{'e'}\n"; - if ( $i == $size-1 || $y1 ne $y2 || $seq ne $$p[$i+1][0] - || $$p[$i+1][1] > $acc{$seq}{'e'} || $val eq '') { - #draw lines - #print "DRAWING LINES\n"; - if (@line!=0) { - my $tag=$canvas->createLine( @line,-fill=>$line_color,-width=>$line_width,-smooth=>$line_smooth,); - $canvas->addtag('gl','withtag',$tag); - } - @line=(); - } - } - #print "$x1-$point_size,$offset-$point_size,$x1+$point_size, $offset+$point_size\n"; - #print " -fill => $point_fill_color,-outline=>$point_outline_color,-width=>$point_outline_width \n"; - if ($point_on && $val ne '') { - $canvas->createOval($x1-$point_size, $offset-$point_size, - $x1+$point_size, $offset+$point_size, - -fill => $point_fill_color, - -outline=>$point_outline_color, - -width=>$point_outline_width ); - } - } - print "..$size\n" if !$newopt{'quiet'}; - - } -} - - -sub linexbp2xy { - my $line=shift; - my $xbp= (shift) -1; - my $x = ( $xbp % $opt{'canvas_bpwidth'})/$bp_per_pixel +$opt{'canvas_indent_left'}; - my $y = $l[$line]{'liney'} + int($xbp/$opt{'canvas_bpwidth'}) * $opt{'seq_line_spacing_wrap'}; - return $x, $y; - -} - -#the main line/rectangle drawing routine for seqs, pairs, subs, and extras -sub draw_line_horz_pieces { - my ($l1,$xbp1,$l2,$xbp2, $tagname, $color, $width,$offset,$arrow,$a1,$a2,$a3) = @_; - my ($x1,$y1)=&linexbp2xy($l1,$xbp1); - my ($x2,$y2)=&linexbp2xy($l2,$xbp2); - #print "=======> ($x1,$y1) ($x2,$y2)\n"; - my $line; - for(my $i=$y1; $i<=$y2; $i+=$opt{'seq_line_spacing_wrap'}) { - my ($xb,$xe)=($x1,$x2); - $xb=$opt{'canvas_indent_left'} if $i > $y1; - $xe=$opt{'canvas_indent_left'}+$canvas_width if $i< $y2; - if (defined $arrow && $arrow ne 'none' ) { - $line= $canvas->createLine($xb,$i+$offset,$xe,$i+$offset,-width => $width, - -arrow=>$arrow, -arrowshape=>[$a1,$a2,$a3], - -fill => $color); - } else { - #$line= $canvas->createLine($xb,$i+$offset,$xe,$i+$offset,-width => $width, -fill => $color); - $line= $canvas->createRectangle($xb,$i+$offset-$width/2, $xe, 
$i+$offset+$width/2 ,-fill=>$color, -outline => undef); - } - $canvas->addtag($tagname, 'withtag',$line); - } -} - -sub arrange_subjects { - ########################################################################### - ############### sub categories ############################################ - if ($opt{'arrangesub'} =~/subscaleC#CHR_oo21/ ) { - $opt{'arrangesub'}='subscaleC'; - $opt{'sub_scale_categoric_string'}='UK,NA,UL,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y'; - $opt{'sub_scale_max'}=0; - $opt{'sub_scale_min'}=26; - $opt{'sub_scale_lines'}=26; - $opt{'sub_scale_step'}=-1; - $opt{'sub_scale_col'}='0'; - $opt{'sub_scale_col_pattern'}='^([A-Z0-9]+)'; - $opt{'sub_scale_col2'}='4'; - $opt{'sub_scale_col2_pattern'}='^([A-Z0-9]+)'; - $opt{'sub_on'}=1; - &find_column_options; - } elsif ($opt{'arrangesub'} =~/subscaleN#ident(\d+)/ ) { - $opt{'arrangesub'}='subscaleN'; - $opt{'sub_scale_categoric_string'}=''; - $opt{'sub_scale_max'}=1; - $opt{'sub_scale_min'}=$1/100; - $opt{'sub_scale_lines'}=20; - $opt{'sub_scale_step'}=0.01; - $opt{'sub_scale_col'}='FRACBPMATCH'; - $opt{'sub_scale_col_pattern'}=''; - $opt{'sub_scale_col2'}=''; - $opt{'sub_scale_col2_pattern'}=''; - &find_column_options; - } - ########################################################################### - ############### main categories ########################################### - if ($opt{'arrangesub'}=~ /oneperline/ ) { - print "ARRANGING SUBJECTS ONE PER LINE\n" if !$newopt{'quiet'}; - for (my $i=0; $i< @m; $i++) { $m[$i][$mh{'sline'}] =''; } - my $line=0; - foreach my $ac (@acc_order) { - #print "$ac\n"; - my @subnames = keys %{ $acc{$ac}{'sub'} } ; - if ($opt{'arrangesub_col'}) { - print "SORTING ONEPERLINE ARRANGE USING COLUMN $opt{'arrangesub_col'}\n" if !$newopt{'quiet'}; - foreach (@subnames) { - #print "($accsub{$_}{'acc'}{$ac}{'eghit'}) $m[$acc{$ac}{'sub'}{$_}{'eghit'}][$opt{'arrangesub_col'}],$_] \n"; - - } - - @subnames= map { $_->[1] } - sort {$a->[0] <=> $b->[0] } - 
map { [ $m[$accsub{$_}{'acc'}{$ac}{'eghit'}][$opt{'arrangesub_col'}],$_ ] } - @subnames; - - - } - @subnames=reverse @subnames if ($opt{'arrangesub_rev_on'}==1) ; - $acc{$ac}{'subnum'}=0; - foreach my $s (@subnames) { - #print - #foreach my $a (keys %{$accsub{$s}{'acc'}}) { - $accsub{$s}{'acc'}{$ac}{'line'}=$acc{$ac}{'subnum'}++; - $accsub{$s}{'acc'}{$ac}{'color'}=$opt{'sub_color'}; - #} - } - } - } elsif ($opt{'arrangesub'} =~/stagger/ ) { - print "ARRANGING SUBJECTS IN A STAGGER \n" if !$newopt{'quiet'}; - my $spacer=$opt{'arrangesub_stagger_spacing'}; - for (my $i=0; $i< @m; $i++) { $m[$i][$mh{'sline'}] =''; } - - foreach my $ac (@acc_order) { - #print "*staggering:$ac\n"; - my @tmpsub; - my @subnames= keys % {$acc{$ac}{'sub'}}; - foreach my $s (@subnames) { - my %tmp= ( 'name'=>$s, 'qmin'=>$accsub{$s}{'acc'}{$ac}{'qmin'}-$spacer, - 'qmax'=>$accsub{$s}{'acc'}{$ac}{'qmax'}+$spacer ); - push @tmpsub , \%tmp; - } - @tmpsub = sort { $$a{'qmin'} <=> $$b{'qmin'} } @tmpsub; - my @endp=(); - #print "ACCS=>",scalar(@tmpsub),"\n"; - for (my $i=0; $i<@tmpsub; $i++) { - $accsub{$tmpsub[$i]{'name'}}{'acc'}{$ac}{'color'}=$opt{'sub_color'}; - my $placed=0; - for (my $j=0; $j<=@endp;$j++) { - #print "$i:SEARCHING FOR EMPTY SLOT ",scalar(@endp),"\n"; - if ($endp[$j] < $tmpsub[$i]{'qmin'} ) { - $accsub{$tmpsub[$i]{'name'}}{'acc'}{$ac}{'line'}=$j+1; - $endp[$j]=$tmpsub[$i]{'qmax'}; - $placed=1; - last; - } - } - if ($placed==0) { - #print "PUSHING $i\n"; - push @endp,$tmpsub[$i]{'qmax'}; - $accsub{$tmpsub[$i]{'name'}}{'acc'}{$ac}{'line'}=scalar(@endp); - } - } - #my $pause=; - } - } - - if ($opt{'arrangesub'} =~/subscaleC/ ) { - - %subscaleC=(); - @subscaleC=split /[:, ]+/, $opt{'sub_scale_categoric_string'}; - for(my $i=0;$i<@subscaleC;$i++) { $subscaleC{$subscaleC[$i]}=$i; } - $opt{'sub_scale_lines'}=scalar @subscaleC; - if ($opt{'sub_scale_step'} ==1 ) { - $opt{'sub_scale_max'}=$opt{'sub_scale_lines'}; - $opt{'sub_scale_min'}=0; - } else { - $opt{'sub_scale_step'}=-1 ; - 
$opt{'sub_scale_min'}=$opt{'sub_scale_lines'}; - $opt{'sub_scale_max'}=0; - } - - my $upper_bound=$opt{'sub_scale_max'}; - my $lower_bound=$opt{'sub_scale_min'}; - my $scale= $opt{'sub_scale_lines'} / ($upper_bound-$lower_bound); - my $col = $opt{'sub_scale_col'}; - my $col2 = $opt{'sub_scale_col2'}; - #print "U:$upper_bound L:$lower_bound LN:$opt{'sub_scale_lines'} S:$scale F:$field\n"; - for (my $i=0; $i< @m; $i++) { $m[$i][$mh{'sline'}] =''; } - foreach my $s (@acc_ordersub) { - my $line=0; - my $cval=''; - foreach my $a (keys %{$accsub{$s}{'acc'}}) { - my $eg=$accsub{$s}{'acc'}{$a}{'eghit'}; - if ( $col2 eq '' || $m[$eg][0] eq $s ) { - $cval=$m[$eg][$col] ; - ##add if minside### - $cval=$1 if $cval=~/$opt{'sub_scale_col_pattern'}/; - } else { - $cval=$m[$eg][$col2] ; - $cval=$1 if $cval=~/$opt{'sub_scale_col2_pattern'}/; - } - $line=$subscaleC{$cval}; - $accsub{$s}{'acc'}{$a}{'line'}= ($upper_bound - $line)*$scale; - if ($line) { - $accsub{$s}{'acc'}{$a}{'color'}='purple'; #$opt{'sub_color'}; - } else { - $accsub{$s}{'acc'}{$a}{'color'}='darkgrey'; - } - } - } - } elsif ($opt{'arrangesub'} =~/subscaleN/ ) { - my $upper_bound=$opt{'sub_scale_max'}; - my $lower_bound=$opt{'sub_scale_min'}; - my $scale= $opt{'sub_scale_lines'} / ($upper_bound-$lower_bound); - my $col = $opt{'sub_scale_col'}; - #print "U:$upper_bound L:$lower_bound LN:$opt{'sub_scale_lines'} S:$scale F:$field\n"; - for (my $i=0; $i< @m; $i++) { - #print "hello"; - $m[$i][$mh{'sline'}] = ($upper_bound - $m[$i][$col])*$scale; - } - } - -} - -sub colorsub { - #$opt{'colorsub'}='hitrandom'; - if ($opt{'colorsub'}=~/NO CHANGE/) { - - } elsif ($opt{'colorsub'}=~/RESET/) { - my $col=$mh{'scolor'}; - foreach (@m) { $$_[$col]='';} - } elsif ($opt{'colorsub'}=~/hitrandom/ ) { - my @colors = qw(#cd0d32083208 #86e4ce2ceb01 orange #0068cd0d0000 purple #cd0daa2f7d15 #a207cd0d5a04 #9b1ecd0d9b05 brown); - my $col=$mh{'scolor'}; - foreach (@m) { - $$_[$col]=$colors[0]; - push(@colors,shift(@colors)); #first shall 
be last - } - } elsif ($opt{'colorsub'}=~/hitconditional/ ) { - print "COLORING SUBJECTS CONDITIONALLY...\n" if !$newopt{'quiet'}; - my $col=$mh{'scolor'}; - my @colors=split / *[,;] */, $opt{'colorsub_hitcond_tests'}; - my @tests=(); - foreach (@colors) { - my @c=split / *if */; - $c[0] =~s/ +//mg; - if (@c !=2) { - warn "$c[0] $c[1] bad color extract from ($_)!\n"; - return; - } - next if $c[0] !~/^[A-Za-z0-9]+$/; - push @tests , \@c; - } - foreach my $row (@m) { - foreach my $c (@tests) { - if ( eval( '$$row[$opt{"colorsub_hitcond_col"}]' ." $$c[1]" ) ) { - $$row[$col]=$$c[0]; - } - } - } - } elsif ( $opt{'colorsub'}=~/seqrandom/ ) { - print "COLORING...\n" if !$newopt{'quiet'}; - my @colors = qw(#cd0d32083208 orange #0068cd0d0000 #86e4ce2ceb01 purple #cd0daa2f7d15 #a207cd0d5a04 #9b1ecd0d9b05 brown); - foreach my $ac (@acc_order) { - my @subnames= keys % { $acc{$ac}{'sub'} }; - foreach my $s (@subnames) { - $accsub{$s}{'color'}=$colors[0]; - push(@colors,shift(@colors)); #first shall be last - } - } - } -} - - - - -sub show_calculations { - #print "showKEYS: ",join(" ",keys %acc),"\n"; - - for (my $i=0; $i<@m; $i++) { - ##########FILTERING TO OCCUR################################# - ############################################################# - $m[$i][$mh{'hide'}]=0 if $opt{'pfilter_reset'} == 1; #checkbox - if ($opt{'filterpre1_col'} =~/^\d+$/) { - $m[$i][$mh{'hide'}]=1 if $m[$i][$opt{'filterpre1_col'}]< $opt{'filterpre1_min'} && $opt{'filterpre1_min'} ne ''; - $m[$i][$mh{'hide'}]=1 if $m[$i][$opt{'filterpre1_col'}]> $opt{'filterpre1_max'} && $opt{'filterpre1_max'} ne ''; - } - if ($opt{'filterpre2_col'} =~/^\d+$/) { - $m[$i][$mh{'hide'}]=1 if $m[$i][$opt{'filterpre2_col'}]< $opt{'filterpre2_min'} && $opt{'filterpre2_min'} ne ''; - $m[$i][$mh{'hide'}]=1 if $m[$i][$opt{'filterpre2_col'}]> $opt{'filterpre2_max'} && $opt{'filterpre2_max'} ne ''; - } - next if $m[$i][$mh{'hide'}]==1; - ############################################################ - my $n1= 
$m[$i][0]; - my $n2= $m[$i][4]; - ##############QUERY######################## - #print "$m[$i][0] $m[$i][4]\n"; - if ( defined $acc{ $n1 } ) { - $acc{$n1}{'len'}=$m[$i][3]; - $acc{$n1}{'sub'}{$n2}++ if defined $accsub{ $n2}; - - #print "show:seq1 $m[$i][0] => $acc{$m[$i][0]}{'len'}\n"; - } - if ( defined $acc{$n2} ) { - #print "show:seq2 $m[$i][4] => $acc{$m[$i][4]}{'len'}\n"; - $acc{$n2}{'len'}=$m[$i][7]; - $acc{$n2}{'sub'}{$n1}++ if defined $accsub{ $n1}; - } - ###########SUBJECT ########################### - if ( defined $accsub{ $n1 } && defined $acc{$n2} ) { - $accsub{$n1}{'len'}=$m[$i][3]; - my ($b,$e)=($m[$i][5],$m[$i][6] ); - ($b,$e)=($e,$b) if $b> $e; - if (! defined $accsub{$n1}{'acc'}{$n2}{'qmin'}) { - $accsub{$n1}{'acc'}{$n2}{'qmin_f'} = 999999999; - $accsub{$n1}{'acc'}{$n2}{'qmax_f'} = -10; - $accsub{$n1}{'acc'}{$n2}{'qmin'} = 999999999 ; - $accsub{$n1}{'acc'}{$n2}{'qmax'} =-10 ; - $accsub{$n1}{'acc'}{$n2}{'eghit'} = $i; - } - if ( $b<$accsub{$n1}{'acc'}{$n2}{'qmin'} ) { $accsub{$n1}{'acc'}{$n2}{'qmin'}= $b ;} - if ($e>$accsub{$n1}{'acc'}{$n2}{'qmax'} ) { $accsub{$n1}{'acc'}{$n2}{'qmax'}= $e ; } - - #print "show:seq1 $m[$i][0] => $acc{$m[$i][0]}{'len'}\n"; - } - - if ( defined $accsub{ $n2 } && defined $acc{$n1} ) { - my ($b,$e) = ( $m[$i][1],$m[$i][2] ); - $accsub{$n2}{'len'}=$m[$i][7]; - $accsub{$n2}{'desc'}=$m[$i][$mh{'DEFN2'}]; - if (! 
defined $accsub{$n2}{'acc'}{$n1}{'qmin'} ) { - $accsub{$n2}{'acc'}{$n1}{'qmin_f'}=99999999999999; - $accsub{$n2}{'acc'}{$n1}{'qmax_f'}=-10; - $accsub{$n2}{'acc'}{$n1}{'qmin'}= 99999999999999 ; - $accsub{$n2}{'acc'}{$n1}{'qmax'}=-10; - $accsub{$n2}{'acc'}{$n1}{'eghit'} = $i; - } - if ( $b<$accsub{$n2}{'acc'}{$n1}{'qmin'} ) { $accsub{$n2}{'acc'}{$n1}{'qmin'}= $b ; } - if ( $e>$accsub{$n2}{'acc'}{$n1}{'qmax'} ) { $accsub{$n2}{'acc'}{$n1}{'qmax'}= $e ; } - } - - } - #print "showKEYS: ",join(" ",keys %acc),"\n"; - foreach my $s (keys %accsub) { - foreach my $a (keys %{ $accsub{$s}{'acc'} } ) { - #print "$s => $a \n"; - #print "$s:$a $accsub{$s}{'acc'}{$a}{'qmin'} = $acc{$a}{'b'}\n"; - if ( defined $acc{$a}{'b'} && $accsub{$s}{'acc'}{$a}{'qmin'} < $acc{$a}{'b'} - && $accsub{$s}{'acc'}{$a}{'qmax'} >= $acc{$a}{'b'} ) { - $accsub{$s}{'acc'}{$a}{'qmin'} = $acc{$a}{'b'}; - #print "$s:$a $accsub{$s}{'acc'}{$a}{'qmin'} = $acc{$a}{'b'}\n"; - } - #print "$s:$a $accsub{$s}{'acc'}{$a}{'qmax'} = $acc{$a}{'e'}\n"; - if (defined $acc{$a}{'e'} && $accsub{$s}{'acc'}{$a}{'qmax'} > $acc{$a}{'e'} - && $accsub{$s}{'acc'}{$a}{'qmin'} <= $acc{$a}{'e'} ) { - $accsub{$s}{'acc'}{$a}{'qmax'} = $acc{$a}{'e'}; - #print "$s:$a $accsub{$s}{'acc'}{$a}{'qmax'} = $acc{$a}{'e'}\n"; - } - } - } - #my $pause=; - -} - - -sub fitlongestline { - $opt{'canvas_bpwidth'}=$widest_line+100; - &redraw; -} -################################################################################# -################ FILE HANDLING AND DATA PARSING ################################# -################################################################################# - - -sub align_update { - if (!defined $mh{'name1'}) { - ###INITALIZE IF $MH NOT DEFINED - %mh=(name1=>0,begin1=>1,end1=>2,len1=>3,name2=>4,begin2=>5,end2=>6,len2=>7) ; - @mheader=qw(name1 begin1 end1 len1 name2 begin2 end2 len2); - } - my $col=scalar(@mheader); - ######THIS IS WHERE TO ADD ADDITIONAL REQUIRED FIELDS# - $mh{'color'}=$col++ if !defined $mh{'color'}; - 
$mheader[$mh{'color'}]='color'; - $mh{'offset'}=$col++ if !defined $mh{'offset'}; - $mheader[$mh{'offset'}]='offset'; - $mh{'width'}=$col++ if !defined $mh{'width'}; - $mheader[$mh{'width'}]='width'; - $mh{'display'}=$col++ if !defined $mh{'display'}; - $mheader[$mh{'display'}]='display'; - $mh{'sline'}=$col++ if !defined $mh{'sline'}; - $mheader[$mh{'sline'}] = 'sline'; - $mh{'scolor'}=$col++ if !defined $mh{'scolor'}; - $mheader[$mh{'scolor'}] = 'scolor'; - $mh{'hide'}=$col++ if !defined $mh{'hide'}; - $mheader[$mh{'hide'}] = 'hide'; - - - return if $opt{'align'} eq ''; - my @files; - (@files=split":",$opt{'align'}) || ( $files[0]=$opt{'align'} ); - foreach my $f (@files) { - print "LOADING ALIGN $f...\n" if !$newopt{'quiet'}; - if (!open (ALIGN, $f) ) { - $opt{'align'}.='(bad)'; - die "Can't open $f!\n"; - } - my $line=; - $line=~s/\r\n/\n/; - close ALIGN; - my ($tmp, @head_file); - if ($line=~ /\t/) { - #some form of tab delimited# - chomp $line; - @head_file=split "\t", $line; - $tmp = &import_blastout($f); - - } else { - #traditional mirrorpeat output# - @head_file=(); #no extra columns; - $tmp = &import_miropeat($f); - } - for (my $i=8; $i<@head_file; $i++) { - if (!defined $mh{$head_file[$i]} ) { - my $l=scalar(@mheader); - $mheader[$l]=$head_file[$i]; - $mh{$head_file[$i]}=$l; - } - } - ################################# - ###this is really slow I think### - foreach (@{$tmp}) { - my @c=@{$_}; - next if $c[0] eq ''; - my @cc; - $cc[0]=$c[0]; $cc[1]=$c[1]; $cc[2]=$c[2]; $cc[3]=$c[3]; - $cc[4]=$c[4]; $cc[5]=$c[5]; $cc[6]=$c[6]; $cc[7]=$c[7]; - for (my $i=8; $i<@head_file; $i++) { - $cc[$mh{$head_file[$i]}]=$c[$i]; - } - - push @m , \@cc; - } - } #files loop - print " ===> ", scalar(@m)," total pairwise comparisons to display\n" if !$newopt{'quiet'}; - -} - -sub show_update { - #####show (NONE, blank current, file - #print "FORCING...($opt{'showseq'})\n"; - #assume a sequence name if file path doesn't exist or if input contains a colon or comma# - if ( 
$opt{'showseq'} eq 'ALL') { - print "BUILDING show data for ALL\n" if !$newopt{'quiet'}; - %acc=(); - @acc_order=(); - foreach ( @m ) { - $acc{$$_[0]}{'len'}="0"; - $acc{$$_[4]}{'len'}="0" if !$opt{'showqueryonly'}; - } - @acc_order= sort keys %acc; - } elsif ($opt{'showseq'} =~ /[:,]/ || ! -e $opt{'showseq'} ) { - warn "ASSUMMING: $opt{'showseq'} is a sequence name rather than file\n" if $opt{'showseq'} !~ /[:,]/; - my $show = ":$opt{'showseq'}"; - $show=~s/\s$//; - $show.=':' if $show !~ /:$/; - %acc=(); - @acc_order=(); - foreach (split /:/ ,$opt{'showseq'}) { - print "ACC_ORDER ($_)\n" if !$newopt{'quiet'}; - if (/,/) { - warn "ERROR: Bad format for -showseq file commas ($_)!\n" if !/^\S+,\d*,\d+,\d+$/ && !/^\S+,\d+$/; - my @c=split /,/; - if ($c[1] eq '') { $acc{$c[0]}{'len'}=0 } else { $acc{$c[0]}{'len'}= $c[1] } - $acc{$c[0]}{'b'}=$c[2]; - $acc{$c[0]}{'e'}=$c[3]; - push @acc_order,$c[0]; - - } else { - $acc{$_}{'len'}='0'; - push @acc_order,$_; - } - } - - } elsif ($opt{'showseq'} ne '') { - %acc=(); - @acc_order=(); - open(IN,$opt{'showseq'} )|| die "Can't open assummed showseq file --showseq ($opt{'showseq'})\n"; - print "LOADING show $opt{'showseq'}...\n" if !$newopt{'quiet'}; - my $header=; - while () { - s/\r\n/\n/; - chomp; chomp; - my @c = split "\t"; - next if $c[0] eq ''; - $acc{$c[0]}{'len'}="0"; - if ($c[1] =~/^\d+$/ ) { - $acc{$c[0]}{'len'}=$c[1] ; - } elsif ($c[2] =~ /\w/ ) { - die "SHOWSEQ FILE ($opt{'showseq'}) ERROR!: sequence ($c[0]) subseq begin ($c[2]) is not an integer position\n"; - } - if ($c[2] =~/^\d+$/ ) { - $acc{$c[0]}{'b'}=$c[2] ; - &warnNpause("SHOWSEQ FILE ($opt{'showseq'}) WARNING!: sequence ($c[0]) subseq begin ($c[2]) more than the length ($c[1])!") if $c[2] > $c[1]; - } elsif ($c[2] =~ /\w/ ) { - die "SHOWSEQ FILE ($opt{'showseq'}) ERROR!: sequence ($c[0]) subseq begin ($c[2]) is not an integer position\n"; - } - if ($c[2] =~/^\d+$/ ) { - $acc{$c[0]}{'e'}=$c[3] ; - &warnNpause("SHOWSEQ FILE ($opt{'showseq'}) WARNING!: 
sequence ($c[0]) subseq end($c[3]) more than the length ($c[1])!\n") if $c[3] > $c[1]; - } elsif ($c[2] =~ /\w/ ) { - die "SHOWSEQ FILE ($opt{'showseq'}) ERROR!: sequence ($c[0]) subseq end ($c[3]) is not an integer position!\n"; - } - - push @acc_order, $c[0]; - } - close IN; - } else { - #return if keys (%acc); - ###This is the first time!!! - die "IMPROPER show $opt{'showseq'}...\n"; - } - print " ===> ",scalar (@acc_order), " total sequences to display\n" if !$newopt{'quiet'}; -} - -sub show_update_subject { - #####show (NONE, blank current, file - print "FORCINGSUB...$opt{'showsub'} \n" if !$newopt{'quiet'}; - if ($opt{'showsub'} =~ /\:/ ) { - my $show = $opt{'showsub'}; - %accsub=(); - @acc_ordersub=(); - for (my $i=0; $i < @m; $i++) { - if ( defined $acc{$m[$i][4]} && $show =~/$m[$i][0]/ ) { - $accsub{$m[$i][0]}{'len'}="0"; - } - if ( defined $acc{$m[$i][0]} && $show =~ /$m[$i][4]/ ) { - $accsub{$m[$i][4]}{'len'}="0"; - } - @acc_ordersub=keys %accsub; - if ( $show =~/^SORT/ ) { - @acc_ordersub=sort @acc_order; - } - } - } elsif ( $opt{'showsub'} eq 'ALL') { - print "BUILDING show FILE BY ALL\n"; - %accsub=(); - @acc_ordersub=(); - for (my $i=0; $i < @m; $i++) { - #print "$m[$i][0]:::$m[$i][3]\n"; - $accsub{$m[$i][0]}{'len'}="0" if defined $acc{$m[$i][4]}; - $accsub{$m[$i][4]}{'len'}="0" if defined $acc{$m[$i][0]}; - } - @acc_ordersub= sort keys %accsub; - - } elsif ($opt{'showsub'} ne '') { - %accsub=(); - @acc_order=(); - open(IN,$opt{'showsub'} )|| die "Can't open accession show display --showseq ($opt{'showseq'}\n"; - print " ===>loading file ($opt{'showsub'})...\n" if !$newopt{'quiet'}; - my $header=; - while () { - s/\r\n/\n/; - chomp; chomp; - my @c = split "\t"; - next if $c[0] eq ''; - #print "$c[0] => $c[1]\n"; - $accsub{$c[0]}{'len'}="0"; - push @acc_ordersub, $c[0]; - } - close IN; - } else { - return if keys (%accsub); - die "Unable to choose subjects to choose ALL use -showsub ALL\n"; - } - print " ===> ",scalar(@acc_ordersub)," total sub 
sequences to display\n" if !$newopt{'quiet'}; -} -sub extra_update { - if (!keys %eh) { - #must have these seq, begin end# - %eh=(seq=>0,begin=>1,end=>2,color=>3,offset=>4,width=>5, orient=>6); - @eheader=qw(seq begin end color offset width orient); - } - #ADD NEW CALCULATED VARIABLES AS BELOW WITH ORIENT# - my $col = scalar(@eheader); - $eh{'orient'}=$col++ if !defined $eh{'orient'}; - $eheader[$eh{'orient'}]='orient'; - return if $opt{'extra'} eq ''; - my @files; - @files=split ":",$opt{'extra'} || ($files[0]=$opt{'extra'}); - foreach my $f (@files) { - print "LOADING EXTRA FILE ($f)\n" if !$newopt{'quiet'}; - if (! open (IN,"$f") ) { - $opt{'extra'}.="(bad name)"; - die "Can't open extra file ($f) [$!]\n"; - } - my $header=; - $header=~s/\r\n/\n/; - chomp $header; - my @head = split "\t",$header; - for (my $i=3; $i<@head; $i++) { - if (!defined $eh{$head[$i]}) { - my $l= scalar(@eheader); - $eheader[$l]=$head[$i]; - $eh{$head[$i]}=$l; - } - #print "$head[$i] ===> " - - } - #Sprint "PAUSE\n"; my $pause=; - my $test_show=''; - $test_show=":$opt{'showseq'}" if $opt{'minload'} && $opt{'showseq'}=~/:$/; - print "$test_show\n"; - while () { - s/\r\n/\n/; - chomp; chomp; - my @c = split "\t"; - next if $c[0] eq '' || $c[1] eq ''; - if ($test_show) { - next if $test_show !~ /\:$c[0]\:/; - - } - #### - my @cc=(); - $cc[0]=$c[0];$cc[1]=$c[1];$cc[2]=$c[2]; - for (my $i=3; $i<@head; $i++) { - if ($head[$i] eq 'orient' && $c[$i] ne '') { - $c[$i]=uc$c[$i]; - if ($c[$i] eq 'R' || $c[$i] eq 'F') { - } elsif ($c[$i]=~/^PLUS|^POSITIVE|^FORWARD|^1|^\+/ ) { #gets 1 or +1 - $c[$i]='F'; - } elsif ($c[$i]=~/^MINUS|NEGATIVE|REVERSE|^\-/ ) { #this gets -1 too - $c[$i]='R'; - } else { - &warnNpause("Unknown designation for orientation ($c[$i])!\n" ); - } - } - $cc[$eh{$head[$i]}]=$c[$i]; - } - push @e, \@cc; - } - close IN; - } - print " ===> ", scalar(@e)," total extra sequence features to display\n" if !$newopt{'quiet'}; - -} -sub graph_update { - #ADD NEW DATA TO GRAPH# - # @g1 and 
@g2 # - print "UPDATING GRAPH DATA\n" if !$newopt{'quiet'}; - foreach my $g ( 1, 2 ) { - next if $opt{"graph$g"} eq ''; - my @files; - @files=split /:/, $opt{"graph$g"}; - my $array = "g$g"; - my $p = \@$array; - foreach my $f (@files) { - print "LOADING GRAPH $f\n" if !$newopt{'quiet'}; - if (! open (IN,"$f") ) { - $opt{"graph$g"}.="(bad)"; - die "Can't read graph file ($f) [$!]\n"; - } - my $header=; - my $last_pos=0; - my $last_seq=''; - while () { - s/\r\n/\n/; - chomp; - my @c = split /\t/; - next if $c[0] eq '' || $c[1] eq ''; - my @cc=($c[0],$c[1],$c[2]); - warn "Bad non-numerical format in ($f) for $cc[1]\n" if $cc[1] !~/^[-0-9.]+$/; - warn "Bad non-numerical format in ($f) for $cc[2]\n" if $cc[2] !~/^[-0-9.]+$/ && $cc[2] ne ''; - if ($cc[1] < $last_pos && $last_seq eq $cc[0] ) { - warn "graph data not in sequential order\n"; - } - push @$p, \@cc; - $last_pos=$cc[1]; - $last_seq=$cc[0]; - } - } - print " ===> ",scalar(@$p)," total points to display for graph$g\n" if !$newopt{'quiet'}; - $opt{"graph$g"}=''; - } - -} - - -sub import_blastout { - my $file = shift; - my $showr=":$opt{'showseq'}"; - if ($opt{'showseq'} !~/\:/) { - if ($opt{'showseq'} ne 'ALL') { - $showr=':'; - print " PREPROCESSING show\n" if !$newopt{'quiet'}; - open (show, "$opt{'showseq'}") || die "Can't open show file ($opt{'showseq'}\n"; - my $header=; - while () { - s/\r\n/\n/; - chomp; - my @c=split "\t"; - #print "$c[0]\n"; - $showr.="$c[0]:"; - } - } - } - print "$showr\n"; - open (IN, "$file") || die "Can't open blast $file \n"; - my $header=; - my @array; - my $i; - while () { - s/\r\n/\n/; - chomp; - my @c=split "\t"; - if ($opt{'showseq'} ne 'ALL') { - next if $c[1]!~/^\d+$/; #check for proper format - next if $opt{'minload'} && $showr!~/$c[0]:/ && $showr !~ /$c[4]:/; - } - @{$array[$i] }= @c; - #print $array[$i][0], " "; - $i++; - } - close IN; - print "DONE\n"; - return \@array; -} - - -sub import_miropeat { - my $file = $_[0]; - open (IN, "$file") || die "Can't open mirorepeat 
output $file \n"; - my $line =""; - $line = until ( $line=~/^\./ );; - $line=~s/^.//; - return undef if $line=~/not find/; - my $i =0; - my @array=(); - while ($line !~ /^Graphic/ ) { - $line=~s/\r\n/\n/; - chomp $line; - @{$array[$i] }= split " ",$line; - $line=; - $i++; - } - close IN; - return \@array; -} - - -sub show_alignment { - my $i=shift; - my $orient = 'F'; - if ($m[$i][5]>$m[$i][6]) { $orient ='R'} - #print "$i)))$m[$i][$opt{'alignment_col'}]\n"; - my $text; - $$text="Prealigned sequences must be included as columns\nin the align file for this option to work!\nThese sequences must contain indel dashes\nas the alignment is not recalculated."; - if ( $opt{'alignment_col'}!=0 && $opt{'alignment_col2'} !=0 ) { - $text=&alignment_format( -seq1=>$m[$i][$opt{'alignment_col'}], -seq2=>$m[$i][$opt{'alignment_col2'}], - -name1=>$m[$i][0], -begin1=>$m[$i][1], -end1=>$m[$i][2], - -name2=>$m[$i][4], -begin2=>$m[$i][5],-end2=>$m[$i][6], - -orient2=>'F', -width=>$opt{'alignment_wrap'}); - } - #print "$$text\n"; - &export_text( $text, "Formatted Alignment M$i" ); -} - - -sub save_parasight_table { - my $name = $_[0]; - $name=~ s/\.ps[a-z]?$//; - print "SAVING BASENAME ($name)\n"; - - #####SAVE .psa #### - if ( !open(OUT, ">$name.psa") ) { - print "WARNING: Can't Save file ($name.psa)\n"; - return; - } - print OUT join("\t",@mheader),"\n"; - for (my $i=0; $i< @m; $i++) { - print OUT join ( "\t",@{$m[$i]} ),"\n"; - } - close OUT; - #####SAVE .pse #### - if ( !open(OUT, ">$name.pse") ) { - print "WARNING: Can't Save file ($name.pse)\n"; - return; - } - print OUT join("\t",@eheader),"\n"; - for (my $i=0; $i< @e; $i++) { - print OUT join ( "\t",@{$e[$i]} ),"\n"; - } - close OUT; - ####SAVE .psg if it exists#### - my $max=@g1; - $max=@g2 if @g2 > $max; - if ($max > 0 ) { - if ( !open (OUT, ">$name.psg") ) { - print "WARNING: Can't Save file ($name.psg)\n"; - return; - } - print OUT "g1seq\tg1point\tg1value\tg2seq\tg2point\tg2value\n"; - for (my $i=0; $i < $max; $i++) { - if 
(defined $g1[$i]) { - print OUT "$g1[$i][0]\t$g1[$i][1]\t$g1[$i][2]"; - } else { - print OUT "\t\t\t"; - } - if (defined $g2[$i]) { - print OUT "\t$g2[$i][0]\t$g2[$i][1]\t$g2[$i][2]\n"; - } else { - print OUT "\t\t\t\n"; - } - } - close OUT; - } - ##### SAVE.pso ###### - ###removing nstore### - if ( !open (OUT, ">$name.pso") ) { - print "WARNING: Can't save parsight option file ($name.pso)\n"; - return; - } - foreach (keys %acc) { - print OUT "#ACC||||$_||||$acc{$_}\n"; - } - foreach (@acc_order) { - print OUT "#ORDER||||$_\n"; - } - foreach (sort keys %opt) { - print OUT "#OPT||||$_||||$opt{$_}\n"; - } - close OUT; -} - - - - - -sub load_option_template { - my $f=shift; - if (!open (OPTION, "$f")) { - print "WARNING: Can't read option file ($f)\n"; - return; - } - my $line=

S< >[center|nw|ne|sw|se|e|w|n] positioning of background gif relative to draw point gif_x and gif_y - -=item * C=EC<0>S< >displays a gif image in background (the image will not print out in postscript) - -=item * C=EC<>S< >[file path] of gif image to display in background--image does not make it into the Postscript file file - -=item * C=ECS< >[integer] background picture pixel x coordinate position (top of image is zero) - -=item * C=EC<0>S< >[integer] background gif y coordinate position (0 is top of screen) - -=item * C=ECS< >[color] for graph1 labels (left side axis) - -=item * C=EC<2>S< >[integer] number of decimal points to round graph1 labels (left side axis) - -=item * C=EC<10>S< >[integer] point size of graph1 labels (left side axis) - -=item * C=EC<1>S< >[float] multiplier for graph1 labels (left side axis) - -=item * C=EC<1>S< >[integer] horizontal offset for graph1 labels (left side axis) - -=item * C=EC<1>S< >[0|1] toggles on labels for graph1 scale (left side axis) - -=item * C=ECS< >[color] for graph1 connecting lines - -=item * C=EC<1>S< >[0|1] toggles graph1 connecting line off and on - -=item * C=EC<0>S< >[0|1] toggles on and off smoothing function for connecting line - -=item * C=EC<1>S< >[integer] width for graph1 connecting line - -=item * C=EC<100>S< >[integer] maximum value of graph1 scale - -=item * C=EC<-5>S< >[integer] minimum value of graph1 scale - -=item * C=EC<0>S< >[0|1] toggles off and on graph1 - -=item * C=ECS< >[color] to fill points with for graph1 - -=item * C=EC<1>S< >[0|1] toggles point drawing on and off for graph1 - -=item * C=ECS< >[color] to outline point with for graph1 - -=item * C=EC<1>S< >[integer] thickness of point outline for graph1 - -=item * C=EC<2>S< >[integer] pixel radius size for drawing graph1 points - -=item * C=ECS< >[color] of tick marks for graph1 scale - -=item * C=EC<6>S< >[integer] length of tick marks for graph1 scale - -=item * C=EC<1>S< >[integer] horizontal offset of tick marks for graph1 scale - 
-=item * C=EC<1>S< >[0|1] toggles tick marks for graph1 scale off and on - -=item * C=EC<3>S< >[integer] thickness of tick marks for graph1 scale - -=item * C=ECS< >[color] of vertical line for graph1 scale on left - -=item * C=EC<1>S< >[0|1} toggles on and off vertical line for graph1 scale on left - -=item * C=EC<2>S< >[integer] vertical line width for graph1 scale on left - -=item * C=ECS< >[color] of graph2 scale labels - -=item * C=EC<2>S< >[integer] number of decimal point to round graph2 scale label - -=item * C=EC<10>S< >[integer] point size of graph2 scale labels - -=item * C=EC<1>S< >[float] graph2 scale label multiplier - -=item * C=EC<8>S< >[integer] horizontal offset of graph2 scale labels - -=item * C=EC<1>S< >[0|1] toggles graph2 scale labels off and n - -=item * C=ECS< >[color] of graph2 connecting lines - -=item * C=EC<1>S< >[0|1] toggles graph2 connecting lines off and on - -=item * C=EC<0>S< >[0|1] toggles graph2 connecting line smoothing off and on - -=item * C=EC<1>S< >[integer] thickness of graph2 connecting lines - -=item * C=EC<1000>S< >[integer] maximum value for graph2 scale - -=item * C=EC<-1000>S< >[integer] minimum value for graph2 scale - -=item * C=EC<0>S< >[0|1] toggles graph2_on - -=item * C=ECS< >[color] of interior of graph2 points - -=item * C=EC<1>S< >[0|1] toggles graph2 point drawing on and off - -=item * C=ECS< >[color] of graph2 point outline - -=item * C=EC<1>S< >[integer] thickness of graph2 point outline - -=item * C=EC<2>S< >[integer] radius size of graph 2 points - -=item * C=ECS< >[color] of graph2 vertical scale ticks - -=item * C=EC<6>S< >[integer] length of graph2 vertical scale ticks - -=item * C=EC<5>S< >[integer] horizontal offset of graph2 vertical scale ticks - -=item * C=EC<1>S< >[0|1] toggles graph2 vertical scale ticks on and off - -=item * C=EC<3>S< >[integer] thickness of graph2 vertical scale ticks - -=item * C=ECS< >[color] of graph2 vertical scale line - -=item * C=EC<1>S< >[0|1] toggles graph2 vertical 
scale line off and on - -=item * C=EC<2>S< >[integer] thickness of graph2 vertical scale line - -=item * C=EC<80>S< >[integer] pixel height of shared graph scale - -=item * C=ECS< >[color] of horizontal shared graph scale lines - -=item * C=EC<1>S< >[0|1] toggles off and on the shared horizontal interval lines of the graph scales - -=item * C=EC<1>S< >[integer] width of shared horizontal shared graph scale lines - -=item * C=EC<-20>S< >[integer] indentation for placing gscale above (or even below) the sequence line - -=item * C=EC<4>S< >[integer] number of intervals - -=item * C=EC<0>S< >[0|1] toggles off and on the graph scales - -=item * C=EC<1>S< >[0|1] toggles off and on the popup help messages - -=item * C=EC<50>S< >[integer] line width in characters for popup help menus - -=item * C=EC< >code for an advanced marking algorithm. Allowing for more complex searches. Data foreach pair or extra is accessed using an array reference \$c. Therefore to access column 4 \$\$c[4] would work. - -=item * C=ECS< >[e|m] default array to search (m is alignment/e is extra)(m is historical) - -=item * C=EC<>S< >[integer] column to search for given pattern in order to mark matches with a color - -=item * C=EC<>S< >[integer] second column to search for pattern in order to mark matches with a color - -=item * C=ECS< >[color] to mark objects with - -=item * C=EC<0>S< >[0|1] toggles the coloring/marking of sub(jects) off and on - -=item * C=ECS< >[regular expression] pattern to search for with mark/find button - -=item * C=EC<0>S< >[0|1] toggles on and off changing the color of objects permanently (if not permanent then on redraw colors will be erased - -=item * C=EC<1>S< >[0|1] toggles the coloring/marking of sub(jects) off and on - -=item * C=ECS< >[color] default of inter pairwise and connecting lines - -=item * C=EC<0>S< >[0|1] toggles off and on the connecting lines between inter pairwise alignments - -=item * C=EC<0>S< >[integer] default offset from sequence line of inter 
pairwise (up is negative, down is positive) - -=item * C=EC<1>S< >[0|1] toggles off and on the inter pairwise alignments normally drawn on top of sequence line - -=item * C=EC<13>S< >[integer] width of inter pairwise - -=item * C=ECS< >[color] default of intra pairwise and connecting lines - -=item * C=EC<0>S< >[0|1] toggles connecting lines between intra pairwise off and on - -=item * C=EC<0>S< >[integer] default offset from seuqence - -=item * C=EC<1>S< >[0|1] toggles off and on the intra pairwise - -=item * C=EC<9>S< >[integer] width of intra pairwise - -=item * C=ECS< >[NONE|inter_over_intra|intra_over_inter] determines which pairwise type appears above the other--NONE leaves the appearance to the order of the pairwise in the inputted alignment or parasight.psa table - -=item * C=EC<>S< >[integer] column number to determine pairwise type for sequence 1, which is checked against sequence 2. If match then intra if no match then inter. (Useful on sequence names that contain chromosome assignment.) - -=item * C=EC<>S< >[integer] column to determine pairwise type for sequence 2 in row - -=item * C=EC<>S< >[regular expression] to extract pairwise type determing value with parentheses - -=item * C=EC<>S< >[regular expression] to extract pairwise type determining value with parentheses - -=item * C=ECS< >[text|number] determines whether column numbers or text headers are shown in popup window - -=item * C=EC<300>S< >[integer] character length for fields in the popup menu (allows long definitions or sequences be excluded) - -=item * C=ECS< >[string] print command with brackets {} representing file name. This is a system command executed to drive a printer. I have never been able to get DOS to work. This is setup for Unix on our system. Rainbow is our color printer name. 
It will fail in MSWin - -=item * C=EC<1>S< >[integer] height in number of pages for the print/postscript all command - -=item * C=EC<1>S< >[integer] width in number of pages for print/postscript all command - -=item * C=EC<11i>S< >[special] physical page length (longest dimension of paper) in inches for printer (requires number followed by units with i=inches or c=cm) - -=item * C=EC<1>S< >[0|1] toggles printer page orientation (1=landscape 0=portrait) - -=item * C=EC<8i>S< >[special] physical page width in inches for printer (requires number followed by units i=inches or c=cm) - -=item * C=ECS< >[color] for the quick color function Shift-Button3 and Shift-Double Click Button3 - -=item * C=ECS< >[color] of sequence (All sequences take this color. There is currently no way to color sequences individually.) - -=item * C=ECS< >[color] of sequence name text - -=item * C=EC<12>S< >[integer] font size (in points) for all sequence names - -=item * C=EC<-4>S< >[integer] vertical offset of sequence names (up is negative, down is positive) - -=item * C=EC<0>S< >[integer] horizontal offset of sequence names - -=item * C=EC<1>S< >[0|1] toggles off and on the display of sequence name labels - -=item * C=EC<>S< >[regular expression] to match in sequence name for display purposes--parentheses must be used to denote the part of match to display - -=item * C=EC<250>S< >[integer] pixels to separate sequence lines from each other (roughly equivalent to spacing between text paragraphs if you consider a wrapping line of sequences to be a paragraph) - -=item * C=EC<200>S< >[integer] pixels to space between a wrapping line of sequences (roughly equivaelent to spacing between the lines within a text paragraph) - -=item * C=EC<10000>S< >[integer] bases to separate sequences drawn within the same line (roughly equivalent to spacing between words of a text paragraph) - -=item * C=ECS< >[color] for begin tick marks - -=item * C=ECS< >[center|n|w|s|e|nw|ne|sw|se] anchor point for begin tick 
mark labels - -=item * C=ECS< >[valid color] of tick mark label at the beginning of sequence - -=item * C=EC<9>S< >[integer] font size (in points) for label at beginning of sequence - -=item * C=EC<0.001>S< >[float] scaling factor for begin tick mark labels - -=item * C=EC<2>S< >[integer] vertical offset for begin tick mark label - -=item * C=EC<0>S< >[integer] horizontal offset for begin tick mark labels - -=item * C=EC<1>S< >[0|1] toggles off and on the beginning tick mark labels - -=item * C=EC<10>S< >[integer] length of begin tick marks - -=item * C=EC<0>S< >[integer] vertical offset for begin tick marks - -=item * C=EC<1>S< >[0|1] toggles off and on the begin tick marks - -=item * C=EC<2>S< >[integer] width of begin tick marks - -=item * C=EC<20000>S< >[integer] tick mark interval - -=item * C=ECS< >[color] of interval tick marks - -=item * C=ECS< >[valid color] for end tick marks - -=item * C=ECS< >[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels - -=item * C=ECS< >[valid color] for end tick mark labels - -=item * C=EC<9>S< >[integer] font size (in points) for end tick mark labels - -=item * C=EC<0.001>S< >[float] scaling factor for end tick mark labels - -=item * C=EC<2>S< >[integer] vertical offset for end tick mark labels - -=item * C=EC<0>S< >[integer] horizontal offset for end tick mark labels - -=item * C=EC<1>S< >[0|1] toggles end tick labels off and on - -=item * C=EC<10>S< >[integer] length of end tick marks - -=item * C=EC<0>S< >[integer] vertical offset for ending tick marks - -=item * C=EC<1>S< >[0|1] toggles off and on the ending tick marks - -=item * C=EC<2>S< >[integer] width of end tick marks - -=item * C=ECS< >[center|n|s|w|e|nw|sw|ne|se] anchor of text from tick mark draw point - -=item * C=ECS< >[color] for interval tick mark - -=item * C=EC<9>S< >[integer] font size (in points) for interval tick mark label - -=item * C=EC<0.001>S< >[float] scaling factor for the interval tick label - -=item * C=EC<2>S< >[integer] vertical 
offset of sequence interval tick mark labels - -=item * C=EC<1>S< >[0|1] toggles off and on the interval tick labels - -=item * C=EC<10>S< >[integer] length of interval tick marks - -=item * C=EC<0>S< >[integer] vertical offset for interval tick marks - -=item * C=EC<1>S< >[0|1] toggles off and on the interval sequence tick marks - -=item * C=EC<0>S< >[0|1] toggles whether numbering is for each individual sequence (0) or continious across multiple accession on same line (useful when analyzing chromosomes in multiple fragments) - -=item * C=EC<2>S< >[integer] width of interval tick marks - -=item * C=EC<3>S< >[integer] width of sequence line - -=item * C=EC<0>S< >[0|1] toggles the display of just the first sequence in a pairwise data (i.e.first column in an alignment file). For most parsing this is equivalent to the Blast query position - -=item * C=EC<5>S< >[integer] distance between arrow point to wing/edge of arrow - -=item * C=EC<0>S< >[0|1] toggles off and on the directional/orientation arrows for subjects - -=item * C=EC<5>S< >[integer] distance between arrow point to base of arrow - -=item * C=EC<4>S< >[integer] distance from base end to wing tip of arrow - -=item * C=ECS< >[color] default of sub(ject) objects (all other coloring schemes over ride default) - -=item * C=EC<30>S< >[integer] pixel indent from top of subscales to associated sequence line (increasing pushes scales further below associated sequence) - -=item * C=EC<13>S< >[integer] column to use for labeling each hit/pairwise (label will be drawn at beginning of each hit sub) - -=item * C=ECS< >color of pairwise hit label text - -=item * C=EC<0>S< >[integer] horizontal offset for hit label - -=item * C=EC<0>S< >[0|1] turns on individual labeling of each pairwise hit - -=item * C=EC<0?([0-9.]{4})>S< >[regular expression] to match in data from column - -=item * C=EC<9>S< >[integer] font size (in points) for hit label - -=item * C=EC<0>S< >[integer] column to use for the beginning sub label - -=item * 
C=EC<4>S< >[integer] column for second position sequence in alignment table pairwise row - -=item * C=EC<>S< >[regular expression] pattern to match in data from sub label sequence column 2 - -=item * C=EC<>S< >[regular expression] pattern to match in data from sub label sequence column (use parenthesis to denote data within the match to display) - -=item * C=ECS< >[color] of text label for sub objects - -=item * C=EC<0>S< >[integer] horizontal offset label - -=item * C=EC<1>S< >[0|1] toggles overall begin sequence label for sub(ject) label off and on - -=item * C=EC<6>S< >[integer] font size (in points) for begin label sequence - -=item * C=EC<4>S< >[integer] column to use for the end subject label - -=item * C=EC<0>S< >[integer] column for second position in alignment table pairwise row - -=item * C=EC<>S< >[regular expression] pattern to match in data from column - -=item * C=EC<>S< >[regular expression] pattern to match in data from column - -=item * C=ECS< >[valid color] of label text - -=item * C=EC<0>S< >[integer] horizontal offset for label - -=item * C=EC<0>S< >[0|1] toggles off and on the overall sub(ject) label at end of last hit/pairwise - -=item * C=EC<6>S< >[integer] font size (in points) for end subject label - -=item * C=EC<9>S< >[integer] pixels per line determining the spacing between subs placed on different lines - -=item * C=EC<1>S< >[0|1] toggles sub(ject) display off and on (these are the pairwise representations drawn below the sequence line) For BLAST searches these traditionally represent the subject sequences found in a database search. 
- -=item * C=EC<>S< >[string] list of comma delimited category names - -=item * C=EC<>S< >[integer] column for value to arrange pairwise hit on sub scale (subscale) - -=item * C=EC<>S< >[integer] column for second position sequence in alignment pairwise (only used if defined) - -=item * C=EC<>S< >[regular expression] pattern to match in column 2 - -=item * C=EC<>S< >[regular expression] pattern to match in column - -=item * C=ECS< >[valid color] for horizontal sub scale lines - -=item * C=EC<1>S< >[0|1] toggles off and on the horizontal scale lines for sub scale - -=item * C=EC<1>S< >[integer] width of horizontal sub scale lines - -=item * C=ECS< >[color] for sub scale axis label - -=item * C=EC<12>S< >[integer] font size (in points) for sub scale axis label - -=item * C=EC<100>S< >[integer] multiplication factor for sub scale label - -=item * C=EC<1>S< >[integer] horizontal offset for sub scale axis tick marks - -=item * C=EC<1>S< >[0|1] toggles off and on sub scale axis tick mark labels - -=item * C=EC<>S< >[regular expression] pattern to match in sub scale label - -=item * C=EC<10>S< >[integer] number of lines (or interval steps) to plot for stagger or cscale (automatically set for subscaleC) - -=item * C=EC<1.00>S< >[float] maximum value to place on the sub scale (automatically set for subscaleC) - -=item * C=EC<0.80>S< >[float] minimum value to place on the sub scale (automatically set for subscaleC) - -=item * C=EC<0>S< >[0|1] toggles sub scale on and off - -=item * C=EC<0.01>S< >[float] value to increment between each step (automatically set to -1 for subscaleC, 1 reverses subscaleC) - -=item * C=ECS< >[color] for sub scale axis tick marks - -=item * C=EC<9>S< >[integer] length of sub axis tick marks - -=item * C=EC<4>S< >[integer] offset of sub scale axis tick marks - -=item * C=EC<1>S< >[0|1] toggles off and on the sub scale axis at horizontal tick positions - -=item * C=EC<3>S< >[integer] width of sub scale axis tick marks - -=item * C=ECS< >[color] for 
vertical axis line of sub scale - -=item * C=EC<-5>S< >[integer] horizontal offset for subject axis line - -=item * C=EC<1>S< >[0|1] toggles off and on the vertical axis line for sub scale - -=item * C=EC<2>S< >[integer] width of sub scale axis line - -=item * C=EC<8>S< >[integer] default width (thickness) of sub objects - -=item * C=EC<1>S< >[0|1] toggles off and on wether descriptions, such as this one, are saved in a template file with each option variable - -=item * C=ECS< >[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels - -=item * C=ECS< >[color] for end tick mark labels - -=item * C=EC<0>S< >[integer] vertical offset for end tick mark labels - -=item * C=EC<0>S< >[integer] horizontal offset for end tick mark labels - -=item * C=EC<1>S< >[0|1] toggles end tick labels off and on - -=item * C=EC<20>S< >[integer] font size (in points) for end tick mark labels - -=item * C=EC<>S< >[text] to display within a parasight view (useful for automation) - -=item * C=ECS< >[center|n|w|s|e|nw|ne|se|sw] anchor point for end tick mark labels - -=item * C=ECS< >[color] for end tick mark labels - -=item * C=EC<20>S< >[integer] font size (in points) for end tick mark labels - -=item * C=EC<0>S< >[integer] vertical offset for end tick mark labels - -=item * C=EC<0>S< >[integer] horizontal offset for end tick mark labels - -=item * C=EC<1>S< >[0|1] toggles end tick labels off and on - -=item * C=EC<>S< >[text] to display within a parasight view (useful for automation) - -=item * C=EC<9>S< >[integer] font size for parasight in general (not implemented) - -=item * C=EC<550>S< >[integer] pixel height of main window on the initial start up - -=item * C=EC<800>S< >[integer] pixel width of the main window on the initial start up , - -=back - -=head1 APPENDIX B: QUICK REFERENCE - -=head2 COMMAND LINE SUMMARY - -B<-align [I]> load pairwise alignment table(s) (table must be miropeats format) - -B<-arrangeseq [I]> (default is oneperline) - - *oneperline = each sequence is 
placed on a separate wrapping line - *sameline = the sequences are placed in alphabetical - order on the same line - *file:filepath = arrange file that allows specification - of line/paragraph and position - -B<-arrangesub [I]> (default stagger) -Arrange subs below the sequence. - - *oneperline = each sequence is placed on its own line - underneath sequence - *stagger = multiple subjects are placed on same line - only when non-overlapping - *subscaleN = pairwise hits are placed on a numerical scale - based on values in chosen column(s) - *subscaleC = pairwise hits are placed on categorical - scale based on hash(s) - --color [I] B<***not implemented yet, no demand yet***> Use other options for determining inter vs intrachromosal*** - -B<-colorsub [I]> - - *NONE = does not add a colorsub and does not remove colors - for pairwise hits - *RESET = removes colors for pairwise hits - colors for pairwise hits override colors for sequence hits - *seqrandom = color all pairwise comparisons for a subject the same - *hitrandom = randomly independently color each pairwise comparison - *hitconditional = allows coloring based on a conditional statement - -B<-extra [I]> loads extra sequence feature table(s) Sequence features are annotation that have single begin and end points (e.g. exons, introns, and repeats). The rows must consist of seqname[tab]begin[tab]end. Further columns may contain optional data. Columns named C, C, and C provide extra formatting information. - -B<-graph1 [I]> Graphs a data set of values above the sequence line. such as %GC. The data scale is found on the left. The data row format is simply seqname[TAB]begin[TAB]value. No more, no less. For regions with out a value a blank will cause the graph line to be disrupted. - -B<-graph2 [I]> Creates another graph using the scale on the right axis. Same parameters as -graph1 - -B<-in [I]> load a previously saved parasight view. 
Three files required are *.psa, *.pse and *.psm (*.psg needed only if a graph has been used) - -B<-options [I<'opt1=Evalue1,opt2=Evalue2'>]> *** Allows all of the parasight options to be changed directly ***. One and zero are used for on/off, yes/no and true/false. Complete access for the programmer using parasight as a displayer (e.g. 'canvas_width=E500,seq_tick_on=E1,graph_scale_on=E1') - -B<-showseq [I]> names of sequences to display - - *ALL = show all files (default) - *no colon = load as file of names - format each line ( seqname[TAB]length[TAB]begin[TAB]end ) - only sequence name is required other info optional - *colon(:) = parse as list of colon-delimited seq names - format: (seqname,length,begin,end:seqname2,length2,begin2,end2) - -B<-showseqqueryonly> This toggles the display of only the first sequence in a given row. This is the usually position for a blast query (hence the name of the option). - -B<-showsub [I]> names of subjects to display - - *ALL: displays all subject sequences (default) - *no colon = load file containing names (one seqname per line) - *colon(:) = parse input as list of colon-delimited sequence names - -B<-template [filepath]> loads a saved option template file. Template files can be stored in default directories for easy loading. 
- -B - -B<-minload> - *loads only the relevant pairwise that will be displayed - (quicker when just certain sequences are needed from large files) - -B<-precode I<'perl code commands to execute after first screen draw'>> - *an advanced option useful for automating initial tasks - -B<-die> parasight ends after executing precode - *an advanced option useful in automating tasks - - -=head2 OPTION PRECEDENCE - -B -in ---E -template ---E -option ---E commandline> - - -=head2 MOUSE FUNCTIONS - - [DBL]=double click [DRAG]=button hold down and move mouse - EXECUTE # = Execute Command (User Defined under MISC options) - - KEY LEFT-BUTTON MIDDLE BUTTON RIGHT-CLICK - --------- ----------- ------------- -------------------- - NONE Popup Desc Menu - CONTROL Zoom in Zoom out - SHIFT Move Object[DRAG] Quick color; Uncolor[DBL] - ALTERNATE Del Object [DBL] Lower Object; Raise Object[DBL] - CONTROL-SHIFT Execute 1 Execute 2 Execute 3 - - -=head2 COMPACT ALPHABETICAL LIST OF -OPTIONS WITH DEFAULTS - -B=EC<0>S< > -B=EC<0>S< > -B=EC<50>S< > -B=ECS< > -B=ECS< > -B=EC<40000>S< > -B=EC<250000>S< > -B=EC<60>S< > -B=EC<30>S< > -B=EC<40>S< > -B=EC< None>S< > -B=EC< None>S< > -B=EC<34>S< > -B=EC2; orange if E0.99; yellow if E0.98; green if E0.97; blue if E0.96; purple if E0.95; brown if E0.94; grey if E0.93; black if E0.92; pink if E0.91>S< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=ECS< > -B=EC<>S< > -B=ECS< > -B=EC<>S< > -B=ECS< > -B=EC<>S< > -B=EC<5>S< > -B=EC<1>S< > -B=EC<5>S< > -B=EC<4>S< > -B=ECS< > -B=EC<10>S< > -B=EC<>S< > -B=ECS< > -B=EC<6>S< > -B=EC<2>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<-4>S< > -B=EC<1>S< > -B=EC<6>S< > -B=ECS< > -B=EC<.:fastax>S< > -B=EC<400000>S< > -B=EC<1>S< > -B=EC<50>S< > -B=ECS< > -B=EC<-10>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<10>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > 
-B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC
S< > -B=EC<0>S< > -B=EC<>S< > -B=EC< int($opt{window_width}/2)>S< > -B=EC<0>S< > -B=ECS< > -B=EC<2>S< > -B=EC<10>S< > -B=EC<1>S< > -B=EC<1>S< > -B=EC<1>S< > -B=ECS< > -B=EC<1>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<100>S< > -B=EC<-5>S< > -B=EC<0>S< > -B=ECS< > -B=EC<1>S< > -B=ECS< > -B=EC<1>S< > -B=EC<2>S< > -B=ECS< > -B=EC<6>S< > -B=EC<1>S< > -B=EC<1>S< > -B=EC<3>S< > -B=ECS< > -B=EC<1>S< > -B=EC<2>S< > -B=ECS< > -B=EC<2>S< > -B=EC<10>S< > -B=EC<1>S< > -B=EC<8>S< > -B=EC<1>S< > -B=ECS< > -B=EC<1>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<1000>S< > -B=EC<-1000>S< > -B=EC<0>S< > -B=ECS< > -B=EC<1>S< > -B=ECS< > -B=EC<1>S< > -B=EC<2>S< > -B=ECS< > -B=EC<6>S< > -B=EC<5>S< > -B=EC<1>S< > -B=EC<3>S< > -B=ECS< > -B=EC<1>S< > -B=EC<2>S< > -B=EC<80>S< > -B=ECS< > -B=EC<1>S< > -B=EC<1>S< > -B=EC<-20>S< > -B=EC<4>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<50>S< > -B=EC<>S< > -B=ECS< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<0>S< > -B=ECS< > -B=EC<0>S< > -B=EC<1>S< > -B=ECS< > -B=EC<0>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<13>S< > -B=ECS< > -B=EC<0>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<9>S< > -B=ECS< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<300>S< > -B=ECS< > -B=EC<1>S< > -B=EC<1>S< > -B=EC<11i>S< > -B=EC<1>S< > -B=EC<8i>S< > -B=ECS< > -B=ECS< > -B=ECS< > -B=EC<12>S< > -B=EC<-4>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<250>S< > -B=EC<200>S< > -B=EC<10000>S< > -B=ECS< > -B=ECS< > -B=ECS< > -B=EC<9>S< > -B=EC<0.001>S< > -B=EC<2>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<10>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<2>S< > -B=EC<20000>S< > -B=ECS< > -B=ECS< > -B=ECS< > -B=ECS< > -B=EC<9>S< > -B=EC<0.001>S< > -B=EC<2>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<10>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<2>S< > -B=ECS< > -B=ECS< > -B=EC<9>S< > -B=EC<0.001>S< > -B=EC<2>S< > -B=EC<1>S< > -B=EC<10>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<0>S< > -B=EC<2>S< > -B=EC<3>S< > -B=EC<0>S< > -B=EC<5>S< > -B=EC<0>S< > -B=EC<5>S< > -B=EC<4>S< > -B=ECS< > -B=EC<30>S< > -B=EC<13>S< > -B=ECS< > -B=EC<0>S< > 
-B=EC<0>S< > -B=EC<0?([0-9.]{4})>S< > -B=EC<9>S< > -B=EC<0>S< > -B=EC<4>S< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<6>S< > -B=EC<4>S< > -B=EC<0>S< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<0>S< > -B=EC<0>S< > -B=EC<6>S< > -B=EC<9>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=EC<>S< > -B=ECS< > -B=EC<1>S< > -B=EC<1>S< > -B=ECS< > -B=EC<12>S< > -B=EC<100>S< > -B=EC<1>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<10>S< > -B=EC<1.00>S< > -B=EC<0.80>S< > -B=EC<0>S< > -B=EC<0.01>S< > -B=ECS< > -B=EC<9>S< > -B=EC<4>S< > -B=EC<1>S< > -B=EC<3>S< > -B=ECS< > -B=EC<-5>S< > -B=EC<1>S< > -B=EC<2>S< > -B=EC<8>S< > -B=EC<1>S< > -B=ECS< > -B=ECS< > -B=EC<0>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<20>S< > -B=EC<>S< > -B=ECS< > -B=ECS< > -B=EC<20>S< > -B=EC<0>S< > -B=EC<0>S< > -B=EC<1>S< > -B=EC<>S< > -B=EC<9>S< > -B=EC<550>S< > -B=EC<800> - - -=head1 APPENDIX C: INSTALLATION (WINDOWS OR UNIX) - -Parasight has been tested extensively on Solaris, Linux, and MsWindows. Perl is available from www.perl.org. ActiveState (www.activestate.com) has binary versions available for many platforms--particularly useful for Windows installs. Follow instructions on the choosen sites for installing Perl. Unix installs should be easier simply because you probably have more experience with Perl or you have a network administrator. Windows installs are quite easy--just like installing any other program. Once the install is done put parasight program in the Perl bin directory (usually C:\Perl\bin). If you need to install any Perl modules such as Tk consult the individual OS. For Windows Active State binary the B provides easy searches and installations of modules. UNIX environments can utilize the B module.. - -If there is a strong need a standalone versionsof the program that are package together with all need Perl functions could be generated using ActiveState's PerlApp program. 
All needed components are contained within the "packed up" executable for Linux, Solaris, and Windows. No installation of Perl is needed. Note this is not a compiled version, so the run speed will be the same as the non-PerlApp-packaged program. It is actually just an executable that has collected all of the Perl components required for Parasight to run. - -=head1 APPENDIX D: PRECODE HINTS - -Precode affords the ability to add additional code to further manipulate parasight. Extensive use of precode is found in the F file. The best way to figure out how to manipulate parasight is to study all of the parasight code. Of course even I am trying to forget most of the code so the following are useful subroutines to abuse: - -First the hash variable containing all of the command line options is %opt. So, if you want to change arrangesub you have to use the code $opt{'arrangesub'}; - -=over 5 - -Useful commands to use when scripting: - - $opt{'x'} - -Any normal option can be accessed within the hash %opt. - - &reshowNredraw; &update; - -These two subroutines will cause any changes in options to be redrawn and updated on the screen. While update is not normally used in the internal code (as it is called automatically whenever control is returned to the GUI), it is necessary when a script has control of parasight. - - &print_screen(0, "fileoutpath"); - -This will print a postscript of the visible screen to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the printer. If zero is used only the file is created. - - &print_all (1, "fileoutpath"); - -This will print a postscript of the entire parasight area to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the printer. If zero is used only the file is created. Depending upon the multipage options, multiple files may be created. 
- - &save_parasight_table("basefileoutpath"); - -To save as parasight formatted files which can be reloaded with the -in "basefileoutpath" name. - - &fitlongestline; - -This will force the length of the screen in bases to the length of the longest sequence. This is most useful for BLAST views. - - $opt{'die'}=0; - -This is useful to turn off the die option if you are subsequently saving the parasight files. Otherwise when you load the saved parasight it will "die" before you get to see it. - - &reshowNredraw; &update; print "PAUSED\n"; my $pause=<STDIN>; - -A useful sequence of commands if you want to pause for the user. - - $opt{"text_text"}="This is displayed text."; $opt{"text_fontsize"}=16; $opt{"text_offset_h"}=10; - -Allows for a line of text to be printed within the image. text2_text allows for a second line. - -=back - - -=head1 APPENDIX E: ADDITIONAL EXAMPLES - - parasight -showseq show.file -extra repeat.file:exon.file - -=over 5 - -This draws the sequences specified in C<show.file> decorated with the repeats and exons specified in C<repeat.file> and C<exon.file>. Note: this example does not contain any alignments so C<-showseq> is required in order to specify the lengths of the sequences to be displayed. - -=back - - parasight -in saved -extra exons:introns - -arrangeseq oneperline - -=over 5 - -This loads a saved parasight, adds extra annotation from the files C<exons> and C<introns>. It arranges subjects one per line below the sequence. - -=back - - -=head1 AUTHOR - -Jeff Bailey (jab@cwru.edu) - -=head1 ACKNOWLEDGEMENTS - -This software was developed in the laboratory of Evan Eichler, Department of Genetics, Case Western Reserve University and University Hospitals, Cleveland. - -=head1 COPYRIGHT - -Copyright (C) 2001-3 Jeff Bailey. Distribute and modify freely as defined by the GNU General Public License. - -=head1 DISCLAIMER - -This software is provided "as is" without warranty of any kind. 
- -=cut - - diff --git a/programs/parasight_v7.6/pdocumentation.htm b/programs/parasight_v7.6/pdocumentation.htm deleted file mode 100644 index 084048e..0000000 --- a/programs/parasight_v7.6/pdocumentation.htm +++ /dev/null @@ -1,1521 +0,0 @@ - - - -c:/perl/bin/parasight74.pl - - - - - -

- - - - - -
-

-

-
-

NAME

-

parasight (version 7.4)

-

-

-
-

SYNOPSIS

-
- parasight -align alignment.table
-

This simply loads the file alignment.table containing either a table of tab-delimited alignments or miropeats standard output (see below).

-
- parasight -align AC002038.blast.parse -showseqqueryonly
-

This simply loads the file AC002038.blast.parse containing parsed BLAST data and displays the hits relative to the query sequence. This is the number one use in our lab for parasight.

-
- parasight -align AC002038.blast.parse -showseq AC002038.1: 
-     -extra repeats -template bacblastview.pst 
-     -options 'seq_color=>red, canvas_width=>1000'
-

This draws the blast output from a search with AC002038.1 formatted with the options contained in the template file bacblastview.pst. It uses -options to modify the screen width and sequence color.

-
- parasight -in saved.parasight -showseq AC002304:AC002035: 
-     -arrangeseq sameline -template template.file 
-     -options 'seq_color=>red,extra_arrow_on=>1'
-

This loads a previously saved parasight view (the files saved.parasight.psa, saved.parasight.pse and saved.parasight.pso), shows only 2 of the sequences (AC002034 and AC002035), arranges or places these two sequences on the same line, loads a template file of options to reformat the view, and modifies two options directly (sequence color and turns on arrows for annotation extra).

-

-

-
-

DESCRIPTION

-

Parasight is a generalized pairwise alignment viewer originally developed for analyzing segmental duplications (or paralogy) within the human genome. It is designed to display the positions and relationships of pairwise alignments within sequence(s). It provides both interactive analysis and publication quality postscript output. Parasight can arrange and color alignments on the basis of any other included data such as size, percent similarity or even species designation. It can also display the position of any type of simple sequence annotation such as repeats and exons. Finally, it can graph numerical data in relation to the sequence such as windows of percentage GC content. Parasight has been used to analyze output from programs such as BLAST, miropeats, and pip maker from the scale of whole genomes (such as segmental duplications in the human genome) to the analysis of a single protein searched against a database of interest. If it is pairwise data, parasight can display the output.

-

Parasight functions on both Unix and Windows platforms. The program is written in Perl using the graphical Perl/Tk module. It was designed to be extremely flexible and thus a price is paid in terms of speed as well as the complexity (i.e. the large number of options). However, the numerous options make it more likely that parasight can do what you want it to do. Although not necessary for basic interactive use, an understanding of regular expressions and a familiarity with Perl is helpful in order to utilize the program more fully. Parasight and its options are fully accessible through the GUI interface, the command line, or via loadable templates making analyses flexible and automatable. Most users of parasight load their data into the program and then format the view interactively using the extensive options menu (and now templates). Programmers will be the most likely to use command line manipulation of internal options. Parasight has been used and tested extensively on both Linux and MS Windows. The Unix version is the most extensively tested and all options should be available. Windows lacks some of the more advanced options due to incompatibilities/inflexibilities of Bill Gates' operating system.

-

To extol parasight's strengths:

-
    -
  1. Flexible
    -
  2. -The RAM is the limit when it comes to loading data. Other than the most basic description of a pairwise alignment or extra sequence feature Parasight makes absolutely no assumptions about your data allowing the user to analyze what they are interested in analyzing. Technically parasight can't tell a bp apart from an inch or DNA from Protein. -

    -
  3. Formatible
    -
  4. -Parasight has a plethora of options; if I have ever needed it, parasight has it. Every parasight option is available from the command line, the GUI interface or from a saved template file of options. Thus, the basic user and the programmer can completely tailor their parasight views to their exact needs. -

    -
  5. Interactive
    -
  6. -Parasight can interact with the user and with other programs. The user can interactively format the parasight image via the GUI option menu as well as edit the data. Parasight allows the user to print screen shots or dump a postscript of the entire image. Popup windows over alignments and extra sequence features display the object's data. In addition to scaling with the options menu, users can zoom in and out to gain an appreciation of the detail. Parasight has the ability to link to (or execute) other programs allowing the viewing of web pages or associated sequence alignments at the bp level. -

    -
  7. Programmable
    -
  8. -Parasight has the ability to accept additional Perl code from the command line or via a file, which allows for more complex formatting or for the execution of commands such as searching or printing. Combined with the -die option this allows for powerful batch processes such as generating PostScript images of 30,000 BACs (if you are so inclined). -

-

-

-
-

UNDERSTANDING DATA CATEGORIZATION/CLASSIFICATION

-

A basic understanding of the logic behind parasight is useful in understanding data input and manipulation. Data falls into three categories: pairwise alignments, extra sequence annotation and graph data. The graphical option menu is organized on the basis of which data is being manipulated. Alignments, the core of parasight, have two forms of display: pairs and subs. Pairs are representations of the pairwise alignments that are normally drawn atop the sequence. For each alignment the pairs representing it can be connected by lines to show their relationship. Thus, for pairs, relationships are only visible if both sequences containing the pairwise are drawn. Subs are representations of pairwise in relation to only one of the sequences, mimicking blast type output. Sub is for sub-sequence (as they are drawn below the sequence) or subjects (if you are examining BLAST results). Extras are simple sequence annotations that have one beginning and one end such as introns, LINEs, SINES, motifs, etc. In the case of a gene the intron exon structure can not be drawn as one object, but only as individual exons and individual introns. (A -gene data structure is planned for the distant future.) The last data type is graph data. Graph data is a plot of sequence positions (x-axis) versus a numerical value (y-axis).

-

Below is a crude schematic (the best I could do in POD) of a typical display with Sequence(-), Pairs(P), Subs(S), Extras(E), and Graph(G) data.

-
-                                                                 G
-                   G                   G         G                   G 
-          G               G     G                               G           
-               G                                          G                 
-               
-           EEE   EEEE          EEEE         EEEE      EEE     EEEEEEE
-    S0001--------PPPPPPPPPPPPP-----------PPPPPPPPPPPPPP--PPPPPPPP-----
-    SEQ04        SSSSSSSSSSSSS                           SSSSSSSS
-    SEQ02        SSSSSS   SSSS           SSSSSSSSS
-    SEQ03           SSSSSSSSSSS              SSSSSSSSSSS  SSSSS
-

-

-
-

COMMAND LINE OPTIONS

-

The main command line arguments available when parasight is executed can be divided into three main headings: data input, reloading a saved parasight view, and changing view options.

-

-

-

DATA INPUT

-

While many types of data can be loaded and displayed, the absolute minimum input is simply the length of a sequence to be drawn. The length of a sequence can be provided in -showseq file (see below). Usually the lengths are supplied as part of the pairwise alignment file. If no alignments are being drawn then the user must supply the lengths with the -showseq option. Of course examining a line representing a sequence is pretty boring--even if it is decorated with tick marks--so parasight has a few other data input options:

-

-

-

Option: -align

-

-align [filepath1:filepath2:filepath3:etc] loads files containing pairwise alignments. The files must be either the saved standard output for miropeats (Jeremy Parsons) or a tab-delimited format akin to miropeats standard output. The tab-delimited format is simply a file where the first 8 columns contain the pairwise coordinates and lengths of the two similar sequences. The align file is assumed to have a descriptive header in the first row. Hence, the first alignment will be lost (and loaded as the header) if no header actually is present.

-

-

-

Miropeats standard output

-

An example of Jeremy Parsons' Miropeats standard output is:

-
- ## Minimum repeat length set to 300.
- 
- 
-         ICAass  Version 2.1
-         =======
- 
- 
- Indexing all the sequences now. This may take a few minutes.
- 
- Total of 1 sequences indexed
- The sorted index is being saved to the file cluster.index.7507 
- .AC002038 118 731 161973 AC002038 44681 44068 161973
- AC002038 1299 1788 161973 AC002038 47175 47664 161973
- AC002038 22870 23591 161973 AC002038 39920 40641 161973
- AC002038 46067 46524 161973 AC002038 26363 26820 161973
- AC002038 46067 47435 161973 AC002038 26363 27731 161973
- AC002038 46699 47435 161973 AC002038 26995 27731 161973
- AC002038 47175 47664 161973 AC002038 1299 1788 161973
- Graphic ready for printing - type the command shown below to print:
- lp threshold300
-

-

-

Example tab-delimited alignment file

-

An example of a tab-delimited align file consisting of parsed BLAST output with additional data columns is shown below:

-
- name1 begin1 end1  len1  name2 begin2 end2  len2  similarity  transversions
- S001  1322   20001 20001 S002  1      18064 18064 0.945632    125
- S001  1322   20001 20001 S003  1      21010 21010 0.980581    143
- S002  1      18064 18064 S003  100    21010 21010 0.999587    7
- S002  1      18064 18064 S004  1      19041 19041 0.989587    43
- S002  1      12191 18064 S005  1      12141 18073 0.997548    17
- S002  12799  18064 18064 S005  12809  18073 18073 0.998548    3
-

The format consists of 2 sequence names, the coordinates of the pairwise similarity, and the overall lengths of the named sequences (name1 begin1 end1 len1 name2 begin2 end2 len2), which must always be in this given order. The first row of the alignment file contains column header names. For the first 8 columns the header names are ignored and thus it is necessary to place these eight columns in the exact order given above. The only data from these first 8 columns that may be omitted is the overall lengths of the sequence. However, these columns must still be present and contain no value (empty cell in Excel). Also, the lengths of the sequence must then be provided via -showseq. Any additional columns (such as the similarity and transversions columns above) are kept within the internal alignment data table. This additional data can be used to format and filter the parasight views generated. Additional columns that are created if not present in the alignment file are color, width, offset, sline, scolor, and hide. These are case sensitive and all lower case. color contains the color of the pairwise. width is the width or thickness for the bar representing the pairwise. offset is the offset of the subject object. scolor is the color of a subject object. hide does not display a pairwise if it is equal to 1. If the values for these columns are not input or are blank then the default values for the options are used. (NOTE: It is usually simpler to modify these formatting columns in saved parasight tables using programs such as Excel.)

-

-

-

Option: -showseq

-

-showseq [filepath | seqname1[,length,begin,end]:seqname2[,length,begin,end]:etc] displays only the designated sequences. With no colon a filename is assumed and the program attempts to load it. If a colon is found in the option then it is assumed that the input is a colon separated group of sequence names. Optional length and begin and end positions can be designated as well. This information may be given on the command line using commas after the sequence name or be contained within a file (tab-delimited). For analysis such as BLAST searches where you just want to display the query sequence it is easier to use the short cut option -showseqqueryonly in combination with -showseq ALL.

-

-

-

Example showseq file

-

The format of the tab-delimited show file is shown below:

-
- seqname    length   begin   end
- S001       10000    50      1000
- S002       15432    1000    15432
- 
-An example of the data above as a command line entry is:
-
- -showseq S001,10000,50,1000:S002,15432,1,15432:
-

Or with just the lengths to display the entire sequence:

-
- -showseq S001,10000:S002,15432:
- 
-Or if the lengths are in the alignment file, just the begin and end positions can be designated (note the double comma to skip the sequence length):
-
- -showseq S001,,50,1000:S002,,1000,15432
-

The only required column is the sequence name. The names must be exactly the same as in the alignment file and extra file. The lengths, begins, and ends, are optional for the sequences to be drawn. However, if length, begin or end are used, they must appear in the proper columns. Begin and end must always be found in the 3rd and 4th columns and thus a blank sequence length must be provided for column 2 when sequence length is not designated but begin and end are. If lengths are not supplied in the alignment files or the show file then errors will occur. Lengths designated by -showseq always supersede lengths found in alignments.

-

-extra [filepath1:filepath2:etc] loads any 'extra' sequence annotation/feature that can be expressed as a continuous block of the displayed sequence (i.e. a simple begin and end position). This can include features such as high copy repeats, introns, exons, and genes (if you don't care about intron/exon structure). The simplest extra file contains 3 columns in the given order seqname begin end.

-

-

-

Example of a tab-delimited extra file

-

An example of a tab-delimited extra file is shown below:

-
- seqname   begin   end    name   color  offset
- S001      50      1000   exon1  blue   -10
- S001      5000    5500   exon2  blue   -10
- S002      5000    9000   LINE1  red    -20
-

Columns added if not present are color, offset, width, and orient. -(Again it is usually simpler to modify formatting data after the fact unless it is generated beforehand (e.g. orient).) In terms of formatting, simple color names should work: black, red, green. Orientation should be either 'F' or 'R' (capitalized). Plus and Minus will NOT work. However, during the initial loading of the extras they as well as other common designations for orientation are automatically changed to 'F' and 'R'. Other columns may be added that give additional information such as names and descriptions.

-

-

-

Option: -graph1 or -graph2

-

-graph1 file(:s) and -graph2 file(:s) loads simple graphing data in the form of seqname position value. The position within the sequence in bases and the value must be numerical (floating point). It plots the points and/or a connecting line. The graph is numerical values of sequence position on the x-axis versus y-axis numerical values. -graph1 is used to generate one plot. -graph2 is used to plot another line or set of data points on the same scales. The left and right axis can be scaled for different ranges. Thus, Alu content and GC content can be graphed at the same time. The left axis shows the scale for -graph1 and the right axis the scale for -graph2. No header is required for the input file. An unknown value can be designated with an empty value position. An empty value position causes a discontinuous line to be drawn. Only the first 3 columns are loaded; all additional columns are ignored. Graphing is built for speed, not flexibility. The only flexibility is in scaling and formatting the axes.

-

-

-

Example of a graph file

-
- seqname  position  value
- chr1      5000      0.43
- chr1     10000      0.65
- chr1     15000      0.73
- chr1     20000      0.65
-

-

-

RELOADING A SAVED PARASIGHT VIEW

-

Data can be saved directly as parasight formatted files.

-

-

-

Option: -in

-

-in [base filepath] loads a previously saved parasight dataset. Data is saved in 4 separate files (basefilename.psa, basefilename.pse, basefilename.pso, and basefilename.psg). Each file is editable text. The .psa, .pse, and .pso are required even if there are no alignments and/or extras. These extensions to the basefilename are automatically searched for in the given path.

-

The .psa and .pse are tab-delimited tables containing the alignment and extra data, respectively. These tables are easily edited with any text editor; spreadsheets such as Excel are particularly useful in modifying these tables since the data is separated into columns and calculations can easily be done to modify the data as necessary.

-

The .pso file contains all of the current option information. It is saved as text which can be modified by the end user. It has a similar format to the template files.

-

The .psg file contains all of the current data to graph. The -graph1 data is stored in the first 3 columns and the -graph2 data is stored in the next 3 columns (4 to 6). The graph file, unlike the alignment and extra files, is only created if graph data has been loaded. Thus, a missing *.psg will not generate an error. For each set of 3 columns, column 1 is the sequence, column 2 is the position on the sequence, and column 3 is the value to plot on the y-axis.

-

IMPORTANT: All data necessary for a parasight view is contained in these files EXCEPT for any -showseq files or -arrangeseq files. These files must still be accessible in the same relative path positions in order for the saved file to be loaded properly. In other words, only the file names of the show and arrange files are saved and that data must be reloaded. If the files get moved then the link will be broken and their paths will need to be altered.

-

-

-

CHANGING DISPLAY OPTIONS

-

Option arguments modify and format the parasight view. All of these options may be changed from the command line, a template file, or interactively within the program OPTION menu. The interactive menu is the easiest way to learn and template files the easiest way to apply a set of options again and again. Changing options at the command line follows a set order of precedence--whereby old options loaded from a previous parasight view (-in) are overridden by an option template file (-template), both of which are overridden by any options specified in the -options command. All of these are overridden by direct command line options such as -arrangeseq, -colorsub, and -showsub.

-

PRECEDENCE: internal default ---> -in ---> -template ---> -option ---> commandline

-

-

-

Option: -template

-

-template [filepath] loads an option template file. This allows a user to quickly format future parasight views so that they are just like the saved one. It is created using the save option template in the file menu. When loading a template, if the file is not found in the current or specified path, hard-wired default template directories are searched. For our lab, one directory contains templates shared among multiple users, and a user-specific directory holds an individual's PARASIGHT files. The $template_path variable contains the paths. To modify them you must modify the code. The current setting is '~/.PARASIGHT:/people/PARASIGHT'. The search is left to right and first one found is first one used. Template directory as it is currently set does not work for WINDOWS. The tilde must be removed as it only works on Unix where the HOME directory (~) is designated by the environment variables.

-

The template is a standard text file so a user can modify the values easily. It is created using the save option template in the file menu. 0 and 1 are used for on and off as well as yes and no values. An empty string is simply a line return right after the (=>). A line beginning with ### is ignored and is used to give descriptions of the values. Be careful about adding blank space; it is a good idea to edit with normally unseen characters such as spaces and line breaks visualized.

-

-

-

Option: -options

-

-options ['opt1=>value1,opt2=>value2'] is a list of options to modify. All of the underlying options are available; however, there are probably many that you will never have reason to modify, but they are all listed in appendix A for completeness.

-

-

-

Option: -showsub

-

-showsub [filepath | seqname1:seqname2:etc] This option shows only the designated subs to be drawn under sequences. Multiple sequence names can be directly entered with colon delimitation. If no colon is present then the input will be treated as a file containing a list of subs and will be loaded. Default is ALL, which displays all possible subs.

-

-

-

Option: -arrangeseq

-

-arrangeseq [oneperline | sameline |file:filename] This option arranges the sequences in a specified manner.

-

oneperline draws each sequence on a separate line that may wrap if needed.

-

sameline draws all of the sequences on the same line with a given amount of spacing between them.

-

file:filename uses the data in the file to arrange the sequences in user defined pattern. The file consists of two columns seqname and position in current line. To start a new line NEWLINE is typed alone. The example below places the chromosomes on 3 lines.

-

-

-

Example Arrange File

-
-        acc     start
-   chr1 400000000       
-   chr6 1668388704      
-   chr7 1870803946      
-   chr8 2057427852      
-   chr9 2230204273      
-   NEWLINE              
-   chr22        1       
-   NEWLINE              
-   chr10        400000000       
-   chr11        565589288       
-   chr12        736372841       
-   chr13        900655330       
-   chr14        1040400228      
-   chr15        1167353549
-

-arrangesub [oneperline|stagger|subscale|cscale] This option arranges subs below the drawn sequences. The name came from blast subjects, but you can also think of them in terms of sub (beneath) the sequence.

-

oneperline = each sub sequence is placed on its own line underneath the drawn sequence. The ordering of sequences can be altered by choosing a column to sort on (arrangesub_col)

-

stagger = multiple subjects are placed on same line only when non-overlapping. The spacing required between the beginning and end of two subs can be varied. This spacing gives room for labels. The ordering starts in terms of other sequences with hits closest to the beginning of the sequence of interest under which the subs are being drawn

-

subscaleN = subjects are placed on a numerical scale based on given column values. Tricky so avoid setting up from command line--use the GUI and then save a template from that.

-

subscaleC = subjects are placed on categorical scale based on column values. Tricky so avoid setting up from command line. Use a template or the GUI.

-

Note: the best way to figure the scales out is to experiment with them interactively in the options menu. There are specific modifications of subscaleN and subscaleC that are included as choices. They are denoted by a preceding asterisk and were developed to display breakdowns of percent similarity and chromosome position (for mostly outdated draft versions of the genome). However, they may be instructive to the new user. New views are now simply done via a template rather than adding even more choices.

-

-color ***not implemented*** When implemented it will color the pairwise sequences and connecting lines. Currently, coloring is only based on inter and intrachromosomal designation. (As of yet the need hasn't really arisen.) For consistency this should be called colorseq.

-

-colorsub [NONE|RESET|seqrandom|hitrandom|hitconditional] This option provides color schemes for the subs drawn below the sequence.

-

NONE does not change the color and leaves hit colors intact. Hit colors are stored within each pairwise in the table. Subject colors are stored transiently. Hit colors over-ride subject colors. To remove hit colors use RESET.

-

RESET removes hit (individual pairwise) colors, which override any assigned subject colors. For example, if you use hitrandom and then try to switch to seqrandom, nothing will change. This is because hitrandom colors are still stored in the internal alignment table and they take precedence over the subject color scheme. Thus, this intermediate RESET step is required to clear the hit colors. CAUTION: if you use RESET all of your manual coloring will be wiped out. (NOTE: This is because hit colors reside in the same column scolor as manually modified sub colors. The column color defines the pairwise color--overriding inter and intra colors.) Sorry, this is part of the program that could be simplified if I ever have a chance to gut it.

-

seqrandom randomly assigns colors to the various sequences that are displayed as subs. (There is a random set of 20 odd colors that are cycled through.)

-

hitrandom randomly assigns colors to each individual hit or pairwise alignment. (There is a random set of 20 odd colors that are cycled through.)

-

hitconditional allows for each pairwise to be assigned a color based on pseudo-Perl code by using a series of conditional statements that test a single alignment column. Basic syntax is [color] [test] [value];, where color= color to set, test is =, >, or <, and value is some numerical value.

-

-minload is a switch to load only the alignments and extras for the sequences that will be drawn as designated by -showseq. It is very useful for increasing the speed of the program when there are a large number of alignments that will not be drawn in the current view. Why load the genome if you only want to look at chromosome 22?

-

-precode ['Perl code'] This code is executed after the initial drawing of objects. It allows automation for batch processes when combined with die option. (See Advanced option section below for details.)

-

-die parasight quits after executing the precode option (See Advanced option section below for details.)

-

-

-
-

INTERACTIVE MENUS

-

-

-

RESHOW, REARRANGE, REDRAW

-

This part is to answer why there is a blue and white button for updating the drawing. For beginners, I simply suggest using the blue R,R&R (Reshow, Rearrange, and Redraw) button. For extremely large data sets, however, the Reshow and Rearrange calculations can take a significant amount of time. Thus, if you are just changing the spacing of tick marks it is handy to skip the sequence and arrangement calculations. However, for simple views of BAC BLAST output stick with the blue button.

-

-

-

OPTION MENU

-

The option menu has popup help (over yellow text) and most options are self-explanatory. If in doubt try changing an option and see what happens. I have tried to adhere to a semi-logical naming convention whenever possible. Blue color coding is to show whether a variable will require reshow and rearrangement before taking effect. The menu is subdivided into 7 main parts: MAIN, SEQ/PAIRS, SUBS, EXTRA, GRAPH, FILTER, and MISC. The organization tries to follow the organization of the data in parasight.

-

The MAIN menu allows access to important command line options like -showseq and -showsub. Also, basic screen properties such as size of the window and the number of bases for the width of the screen.

-

The SEQ/PAIRS portion allows manipulation of the sequence and associated tick marks. Pairs and their designation as inter and intrachromosomal as well as connecting lines are controlled from this part of the menu as well.

-

The SUB portion of course is all about the manipulation of subs. This is some of the more complex data manipulation.

-

The EXTRA portion is about the options relating to the extra data.

-

The GRAPH portion is for the graph data. Try turning everything on when you first test out this feature.

-

The FILTER portion allows for the filtering/removal of pairwise and extras based on data in a given column of numerical data.

-

The MISC portion allows for the setting of options controlling printing, the display of alignments, the extraction of sequence, and the execution of other programs.

-

-

-

FILE DROP-DOWN MENU

-

This is the only place where the save parasight command is found. All data and options are saved. A few files are not saved--see information about -in. Loading must be done at the command line. Additionally, template files (*.pst) may be saved and loaded through this menu. After loading a template file the screen must be R,R,& R.

-

-

-

PRINT DROP-DOWN MENU

-

The print menu allows for the generation of a postscript file and its subsequent transmission to a printer if the option print_command is properly set. The postscript file can consist of the visible screen (screen) or the entire parasight drawing (all). If the all option is chosen then the number of pages (vertically and horizontally) across which to print the image is set with the option print_multipages_wide and print_multipages_high. The postscript files are encapsulated and can be easily turned into PDF files with software such as Adobe Distiller or imported into Adobe Illustrator. Also, Word has a special eps import option which was handy when writing my dissertation.

-

-

-

ORDER DROP-DOWN MENU

-

The order menu on the main drop down menu bar allows the order or level of objects in the display to be changed. You can either send objects all the way to the background or the foreground.

-

-

-

MISC DROP-DOWN MENU

-

Currently it contains the ability to transfer colors for alignments in order to allow syncing of colors between the pairs and the subs. It requires a redraw to see the effect after choosing one of these options. This is really the only way to currently go outside of the inter vs intra coloring schemes for pairs.

-

-

-
-

SCREEN MANIPULATION

-

In addition to gazing lovingly at the pretty images after formatting them using the option menu, direct manipulation of the display once drawn can be accomplished with various commands.

-

-

-

MOUSE BUTTON FUNCTIONS

-

(see APPENDIX B: for table of mouse functions)

-

First, when you mouse over an object it will shimmer with a number of bright colors. The shimmering object represents the object you will select if you click on it. Most of the mouse commands work on sequence, pairwise, extra, and subjects. Tick Marks and Labels are immune except for the ALT buttons. The middle mouse button is not used since some systems like my home PC lack one (and I don’t have the dexterity to precisely click both Left and Right at the exact same time which is the usual substitute).

-

DATA POPUP WINDOW (Left-Click) This pops up a simple window displaying all data for an alignment or an extra object. Use Shift-Drag to move the popup window if it is obscured or obscuring data. Formatting options for this popup window are found under MISC tab of the OPTIONS menu.

-

OPTIONS POPUP (Right-Click) Brings up a popup menu of options, which includes a variety of commands such as choosing colors and editing the underlying data. If the actual alignments are present in the alignment table, the alignments can be viewed. If the underlying sequence files are available, subsequences representing objects can be extracted.

-

ZOOM IN AND OUT (Control-Left-Click and Control-Right-Click) -Zooming can be accomplished with Control held down at the same time as a mouse click. The left mouse clicked in conjunction with the control key will zoom in two fold centered at the point of the click. The right mouse has the opposite effect and zooms out. The DeZoom button on the main window returns the scaling to normal.

-

MOVE OBJECT TO FOREGROUND OR BACKGROUND (Alt-Left-Click and Alt-Right-Click) This causes the object clicked on to move all the way to the foreground or the background. The left mouse button moves it to the foreground. The right mouse button moves the object to the background.

-

MOVE (nonpermanent) ANY OBJECT (Shift-Left Drag) -Allows for the movement of object in the drawing--even tick marks and sequence lines. It is non-permanent but it is useful for removing tick marks or names before you print or create a PostScript file.

-

QUICK COLOR (Shift-Right-Click to color and Shift-Right-Double-Click to uncolor) -Allows for rapid coloring of objects. Shift-Right-Click causes the object's color to change to that of the Quick Color Button on the Main Window. Shift-Right-Double-Click attempts to remove the color and leave the default color. In the case of Pairs, black is assigned to the object as inter and intra chromosomal colors can not be reassigned until a Redraw. Coloring of all other objects (i.e. not extras and not alignments) is not saved or stored and consequently reverts to normal as soon as the image is redrawn.

-

HIDE SEQUENCE OR EXTRA (Alt-Right-Double-Click) This will hide sequences from view (i.e. they will disappear from view). To unhide sequences you must use the pre-filter in the filter options. (For which I should add a command line!).

-

-

-
-

APPENDIX A: LIST OF VALID OPTIONS WITH INTERNAL DEFAULTS

- -

-

-
-

APPENDIX B: QUICK REFERENCE

-

-

-

COMMAND LINE SUMMARY

-

-align [filepath1:filepath2:etc] load pairwise alignment table(s) (table must be miropeats format)

-

-arrangeseq [oneperline/sameline/file] (default is oneperline)

-
- *oneperline = each sequence is placed on a separate wrapping line
- *sameline = the sequences are placed in alphabetical 
-    order on the same line
- *file:filepath = arrange file that allows specification
-    of line/paragraph and position
-

-arrangesub [oneperline/stagger/subscale/cscale] (default stagger) -Arrange subs below the sequence.

-
- *oneperline = each sequence is placed on its own line
-    underneath sequence
- *stagger = multiple subjects are placed on same line 
-    only when non-overlapping
- *subscaleN =  pairwise hits are placed on a numerical scale
-    based on values in chosen column(s)
- *subscaleC =  pairwise hits are placed on categorical 
-    scale based on hash(s)
-

-color [scheme] ***not implemented yet, no demand yet*** Use other options for determining inter vs intrachromosomal***

-

-colorsub [NONE/RESET/seqrandom/hitrandom/hitconditional]

-
- *NONE = does not add a colorsub and does not remove colors 
-    for pairwise hits
- *RESET = removes colors for pairwise hits 
-    colors for pairwise hits override colors for sequence hits
- *seqrandom = color all pairwise comparisons for a subject the same
- *hitrandom = randomly independently color each pairwise comparison
- *hitconditional = allows coloring based on a conditional statement
-

-extra [filepath1:filepath2:etc] loads extra sequence feature table(s) Sequence features are annotation that have single begin and end points (e.g. exons, introns, and repeats). The rows must consist of seqname[tab]begin[tab]end. Further columns may contain optional data. Columns named offset, width, and color provide extra formatting information.

-

-graph1 [filepath1:filepath2:etc] Graphs a data set of values above the sequence line, such as %GC. The data scale is found on the left. The data row format is simply seqname[TAB]begin[TAB]value. No more, no less. For regions without a value a blank will cause the graph line to be disrupted.

-

-graph2 [filepath1:filepath2:etc] Creates another graph using the scale on the right axis. Same parameters as -graph1

-

-in [filepath] load a previously saved parasight view. Three files required are *.psa, *.pse and *.psm (*.psg needed only if a graph has been used)

-

-options ['opt1=>value1,opt2=>value2'] *** Allows all of the parasight options to be changed directly ***. One and zero are used for on/off, yes/no and true/false. Complete access for the programmer using parasight as a displayer (e.g. 'canvas_width=>500,seq_tick_on=>1,graph_scale_on=>1')

-

-showseq [a file or seqname(s):] names of sequences to display

-
-   *ALL = show all files (default) 
-   *no colon = load as file of names 
-     format each line ( seqname[TAB]length[TAB]begin[TAB]end )
-     only sequence name is required other info optional
-   *colon(:) = parse as list of colon-delimited seq names
-     format: (seqname,length,begin,end:seqname2,length2,begin2,end2)
-

-showseqqueryonly This toggles the display of only the first sequence in a given row. This is the usual position for a blast query (hence the name of the option).

-

-showsub [file | seqnames: | ALL] names of subjects to display

-
-   *ALL: displays all subject sequences (default)
-   *no colon = load file containing names (one seqname per line)
-   *colon(:) = parse input as list of colon-delimited sequence names
-

-template [filepath] loads a saved option template file. Template files can be stored in default directories for easy loading.

-

ADVANCED OPTIONS

-

-minload - *loads only the relevant pairwise that will be displayed - (quicker when just certain sequences are needed from large files)

-

-precode 'perl code commands to execute after first screen draw' - *an advanced option useful for automating initial tasks

-

-die parasight ends after executing precode - *an advanced option useful in automating tasks

-

-

-

OPTION PRECEDENCE

-

internal default ---> -in ---> -template ---> -options ---> commandline - -

-

-

-

MOUSE FUNCTIONS

-
- [DBL]=double click  [DRAG]=button hold down and move mouse 
- EXECUTE # = Execute Command (User Defined under MISC options)
- 
- KEY            LEFT-BUTTON       MIDDLE BUTTON  RIGHT-CLICK                             
- ---------      -----------       -------------  --------------------
- NONE           Popup Desc                       Menu
- CONTROL        Zoom in                          Zoom out
- SHIFT          Move Object[DRAG]                Quick color; Uncolor[DBL]
- ALTERNATE      Del  Object [DBL]                Lower Object; Raise Object[DBL]
- CONTROL-SHIFT  Execute 1         Execute 2      Execute 3
-
-
-

-

-

COMPACT ALPHABETICAL LIST OF -OPTIONS WITH DEFAULTS

-

alignment_col=>0   -alignment_col2=>0   -alignment_wrap=>50   -arrangeseq=>oneperline   -arrangesub=>stagger   -arrangesub_stagger_spacing=>40000   -canvas_bpwidth=>250000   -canvas_indent_left=>60   -canvas_indent_right=>30   -canvas_indent_top=>40   -color=> None   -colorsub=> None   -colorsub_hitcond_col=>34   -colorsub_hitcond_tests=>red if <2; orange if <0.99; yellow if <0.98; green if <0.97; blue if <0.96; purple if <0.95; brown if <0.94; grey if <0.93; black if <0.92; pink if <0.91   -execute=>   -execute2=>   -execute2_array=>m   -execute2_desc=>   -execute3=>   -execute3_array=>m   -execute3_desc=>widget   -execute4=>   -execute4_array=>m   -execute4_desc=>   -execute_array=>e   -execute_desc=>   -extra_arrow_diag=>5   -extra_arrow_on=>1   -extra_arrow_para=>5   -extra_arrow_perp=>4   -extra_color=>purple   -extra_label_col=>10   -extra_label_col_pattern=>   -extra_label_color=>purple   -extra_label_fontsize=>6   -extra_label_offset=>2   -extra_label_on=>1   -extra_label_test_col=>   -extra_label_test_pattern=>   -extra_offset=>-4   -extra_on=>1   -extra_width=>6   -fasta_blastdb=>htg:nt   -fasta_directory=>.:fastax   -fasta_fragsize=>400000   -fasta_on=>1   -fasta_wrap=>50   -filename_color=>grey   -filename_offset=>-10   -filename_offset_h=>0   -filename_on=>1   -filename_pattern=>   -filename_size=>10   -filter1_col=>   -filter1_max=>   -filter1_min=>   -filter2_col=>   -filter2_max=>   -filter2_min=>   -filterextra1_col=>   -filterextra1_max=>   -filterextra1_min=>   -filterextra2_col=>   -filterextra2_max=>   -filterextra2_min=>   -filterpre1_col=>   -filterpre1_max=>   -filterpre1_min=>   -filterpre2_col=>   -filterpre2_max=>   -filterpre2_min=>   -gif_anchor=>center   -gif_on=>0   -gif_path=>   -gif_x=> int($opt{window_width}/2)   -gif_y=>0   -graph1_label_color=>blue   -graph1_label_decimal=>2   -graph1_label_fontsize=>10   -graph1_label_multiplier=>1   -graph1_label_offset=>1   -graph1_label_on=>1   -graph1_line_color=>blue   -graph1_line_on=>1   
-graph1_line_smooth=>0   -graph1_line_width=>1   -graph1_max=>100   -graph1_min=>-5   -graph1_on=>0   -graph1_point_fill_color=>blue   -graph1_point_on=>1   -graph1_point_outline_color=>blue   -graph1_point_outline_width=>1   -graph1_point_size=>2   -graph1_tick_color=>black   -graph1_tick_length=>6   -graph1_tick_offset=>1   -graph1_tick_on=>1   -graph1_tick_width=>3   -graph1_vline_color=>black   -graph1_vline_on=>1   -graph1_vline_width=>2   -graph2_label_color=>red   -graph2_label_decimal=>2   -graph2_label_fontsize=>10   -graph2_label_multiplier=>1   -graph2_label_offset=>8   -graph2_label_on=>1   -graph2_line_color=>red   -graph2_line_on=>1   -graph2_line_smooth=>0   -graph2_line_width=>1   -graph2_max=>1000   -graph2_min=>-1000   -graph2_on=>0   -graph2_point_fill_color=>red   -graph2_point_on=>1   -graph2_point_outline_color=>red   -graph2_point_outline_width=>1   -graph2_point_size=>2   -graph2_tick_color=>black   -graph2_tick_length=>6   -graph2_tick_offset=>5   -graph2_tick_on=>1   -graph2_tick_width=>3   -graph2_vline_color=>black   -graph2_vline_on=>1   -graph2_vline_width=>2   -graph_scale_height=>80   -graph_scale_hline_color=>black   -graph_scale_hline_on=>1   -graph_scale_hline_width=>1   -graph_scale_indent=>-20   -graph_scale_interval=>4   -graph_scale_on=>0   -help_on=>1   -help_wrap=>50   -mark_advanced=>   -mark_array=>m   -mark_col=>   -mark_col2=>   -mark_color=>red   -mark_pairs=>0   -mark_pattern=>AC002038   -mark_permanent=>0   -mark_subs=>1   -pair_inter_color=>red   -pair_inter_line_on=>0   -pair_inter_offset=>0   -pair_inter_on=>1   -pair_inter_width=>13   -pair_intra_color=>blue   -pair_intra_line_on=>0   -pair_intra_offset=>0   -pair_intra_on=>1   -pair_intra_width=>9   -pair_level=>NONE   -pair_type_col=>   -pair_type_col2=>   -pair_type_col2_pattern=>   -pair_type_col_pattern=>   -popup_format=>text   -popup_max_len=>300   -print_command=>lpr -P Rainbow {}   -print_multipages_high=>1   -print_multipages_wide=>1   
-printer_page_length=>11i   -printer_page_orientation=>1   -printer_page_width=>8i   -quick_color=>purple   -seq_color=>black   -seq_label_color=>black   -seq_label_fontsize=>12   -seq_label_offset=>-4   -seq_label_offset_h=>0   -seq_label_on=>1   -seq_label_pattern=>   -seq_line_spacing_btwn=>250   -seq_line_spacing_wrap=>200   -seq_spacing_btwn_sequences=>10000   -seq_tick_b_color=>black   -seq_tick_b_label_anchor=>ne   -seq_tick_b_label_color=>black   -seq_tick_b_label_fontsize=>9   -seq_tick_b_label_multiplier=>0.001   -seq_tick_b_label_offset=>2   -seq_tick_b_label_offset_h=>0   -seq_tick_b_label_on=>1   -seq_tick_b_length=>10   -seq_tick_b_offset=>0   -seq_tick_b_on=>1   -seq_tick_b_width=>2   -seq_tick_bp=>20000   -seq_tick_color=>black   -seq_tick_e_color=>black   -seq_tick_e_label_anchor=>nw   -seq_tick_e_label_color=>black   -seq_tick_e_label_fontsize=>9   -seq_tick_e_label_multiplier=>0.001   -seq_tick_e_label_offset=>2   -seq_tick_e_label_offset_h=>0   -seq_tick_e_label_on=>1   -seq_tick_e_length=>10   -seq_tick_e_offset=>0   -seq_tick_e_on=>1   -seq_tick_e_width=>2   -seq_tick_label_anchor=>n   -seq_tick_label_color=>black   -seq_tick_label_fontsize=>9   -seq_tick_label_multiplier=>0.001   -seq_tick_label_offset=>2   -seq_tick_label_on=>1   -seq_tick_length=>10   -seq_tick_offset=>0   -seq_tick_on=>1   -seq_tick_whole=>0   -seq_tick_width=>2   -seq_width=>3   -showqueryonly=>0   -sub_arrow_diag=>5   -sub_arrow_on=>0   -sub_arrow_paral=>5   -sub_arrow_perp=>4   -sub_color=>lightgreen   -sub_initoffset=>30   -sub_labelhit_col=>13   -sub_labelhit_color=>black   -sub_labelhit_offset=>0   -sub_labelhit_on=>0   -sub_labelhit_pattern=>0?([0-9.]{4})   -sub_labelhit_size=>9   -sub_labelseq_col=>0   -sub_labelseq_col2=>4   -sub_labelseq_col2_pattern=>   -sub_labelseq_col_pattern=>   -sub_labelseq_color=>black   -sub_labelseq_offset=>0   -sub_labelseq_on=>1   -sub_labelseq_size=>6   -sub_labelseqe_col=>4   -sub_labelseqe_col2=>0   -sub_labelseqe_col2_pattern=>   
-sub_labelseqe_col_pattern=>   -sub_labelseqe_color=>black   -sub_labelseqe_offset=>0   -sub_labelseqe_on=>0   -sub_labelseqe_size=>6   -sub_line_spacing=>9   -sub_on=>1   -sub_scale_categoric_string=>   -sub_scale_col=>   -sub_scale_col2=>   -sub_scale_col2_pattern=>   -sub_scale_col_pattern=>   -sub_scale_hline_color=>grey   -sub_scale_hline_on=>1   -sub_scale_hline_width=>1   -sub_scale_label_color=>black   -sub_scale_label_fontsize=>12   -sub_scale_label_multiplier=>100   -sub_scale_label_offset=>1   -sub_scale_label_on=>1   -sub_scale_label_pattern=>   -sub_scale_lines=>10   -sub_scale_max=>1.00   -sub_scale_min=>0.80   -sub_scale_on=>0   -sub_scale_step=>0.01   -sub_scale_tick_color=>black   -sub_scale_tick_length=>9   -sub_scale_tick_offset=>4   -sub_scale_tick_on=>1   -sub_scale_tick_width=>3   -sub_scale_vline_color=>black   -sub_scale_vline_offset=>-5   -sub_scale_vline_on=>1   -sub_scale_vline_width=>2   -sub_width=>8   -template_desc_on=>1   -text2_anchor=>nw   -text2_color=>red   -text2_offset=>0   -text2_offset_h=>0   -text2_on=>1   -text2_size=>20   -text2_text=>   -text_anchor=>nw   -text_color=>red   -text_fontsize=>20   -text_offset=>0   -text_offset_h=>0   -text_on=>1   -text_text=>   -window_font_size=>9   -window_height=>550   -window_width=>800 - -

-

-

-
-

APPENDIX C: INSTALLATION (WINDOWS OR UNIX)

-

Parasight has been tested extensively on Solaris, Linux, and MS Windows. Perl is available from www.perl.org. ActiveState (www.activestate.com) has binary versions available for many platforms--particularly useful for Windows installs. Follow instructions on the chosen sites for installing Perl. Unix installs should be easier simply because you probably have more experience with Perl or you have a network administrator. Windows installs are quite easy--just like installing any other program. Once the install is done put the parasight program in the Perl bin directory (usually C:\Perl\bin). If you need to install any Perl modules such as Tk consult the individual OS. For the Windows ActiveState binary, the PPM provides easy searches and installations of modules. UNIX environments can utilize the CPAN module. - -

-

If there is a strong need, standalone versions of the program that are packaged together with all needed Perl functions could be generated using ActiveState's PerlApp program. All needed components are contained within the ``packed up'' executable for Linux, Solaris, and Windows. No installation of Perl is needed. Note this is not a compiled version, so the run speed will be the same as the non-PerlApp-packaged program. It is actually just an executable that has collected all of the Perl components required for Parasight to run. - -

-

-

-
-

APPENDIX D: PRECODE HINTS

-

Precode affords the ability to add additional code to further manipulate parasight. Extensive use of precode is found in the parasight.examples file. The best way to figure out how to manipulate parasight is to study all of the parasight code. Of course even I am trying to forget most of the code so the following are useful subroutines to abuse: - -

-

First, the hash variable containing all of the command line options is %opt. So, if you want to change arrangesub you have to use the code $opt{'arrangesub'}; - -

-

Useful commands to use when scripting: - -

-
- $opt{'x'}
-
-
-

Any normal option can be accessed within the hash %opt. - -

-
- &reshowNredraw; &update;
- 
-These two subroutines will cause any changes in options to be redrawn and updated on the screen.  While update is not normally used in the internal code (as it is called automatically whenever control is returned to the GUI), it is necessary when a script has control of parasight.
-
-
-
- &print_screen(0, "fileoutpath");
-
-
-

This will print a postscript of the visible screen to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the printer. If zero is used only the file is created. - -

-
- &print_all (1, "fileoutpath");
-
-
-

This will print a postscript of the entire parasight area to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the printer. If zero is used only the file is created. Depending upon the multipage options, multiple files may be created. - -

-
- &save_parasight_table("basefileoutpath");
-
-
-

To save as parasight formatted files which can be reloaded with the -in ``basefileoutpath'' name. - -

-
- &fitlongestline;
-
-
-

This will force the length of the screen in bases to the length of the longest sequence. This is most useful for BLAST views. - -

-
- $opt{'die'}=0;
-
-
-

This is useful to turn off the die option if you are subsequently saving the parasight files. Otherwise when you load the saved parasight it will ``die'' before you get to see it. - -

-
- &reshowNredraw; &update; print "PAUSED\n"; my $pause=<STDIN>; 
-
-
-

A useful sequence of commands if you want to pause for the user. - -

-
- $opt{"text_text"}="This is displayed text."; $opt{"text_fontsize"}=16;         $opt{"text_offset_h"}=10;
-
-
-

Allows for a line of text to be printed within the image. text2_text allows for a second line. - -

-

-

-
-

APPENDIX E: ADDITIONAL EXAMPLES

-
- parasight -showseq show.file -extra repeat.file:exon.file
-
-
-

This draws the sequences specified in show.file decorated with the repeats and exons specified in repeat.file and exon.file. Note: this example does not contain any alignments so show.file is required in order to specify the lengths of the sequences to be displayed. - -

-
- parasight -in saved  -extra exons:introns 
-     -arrangeseq oneperline
-
-
-

This loads a saved parasight and adds extra annotation from the files exons and introns. It arranges the sequences one per line. - -

-

-

-
-

AUTHOR

-

Jeff Bailey (jab@cwru.edu) - -

-

-

-
-

ACKNOWLEDGEMENTS

-

This software was developed in the laboratory of Evan Eichler, Department of Genetics, Case Western Reserve University and University Hospitals, Cleveland. - -

-

-

-
-

COPYRIGHT

-

Copyright (C) 2001-3 Jeff Bailey. Distribute and modify freely as defined by the GNU General Public License. - -

-

-

-
-

DISCLAIMER

-

This software is provided ``as is'' without warranty of any kind. - -

- - - - diff --git a/programs/parasight_v7.6/readme.doc b/programs/parasight_v7.6/readme.doc deleted file mode 100644 index a5d4150..0000000 Binary files a/programs/parasight_v7.6/readme.doc and /dev/null differ diff --git a/programs/parasight_v7.6/readme.txt b/programs/parasight_v7.6/readme.txt deleted file mode 100644 index f9e1247..0000000 --- a/programs/parasight_v7.6/readme.txt +++ /dev/null @@ -1,22 +0,0 @@ - - -Installation should be fairly easy--the html help document has general install instructions towards the end. I definitely recommend UNIX over running it on Windows. If you have no choice however, I have managed to iron out most of the Windows inadequacies/bugs. - - -To install the program and the example script: - -1) Install Perl if not installed (most standard unix installs have all that is required). -a. For unix see www.perl.org, although for most unix computers Perl is part of the standard install. -b. For MS Windows, try ActiveState Perl (http://www.activestate.com/Products/ActivePerl/ ). They have already compiled a binary so you don’t have to. -2) Uncompress the parasight_v7.4.zip file creating the directory parasight_v7.4 . -3) Directly within the parasight_v7.4 directory is the parasight executable, it should either be moved or linked into your bin path for ease of use. On windows machines that have installed ActiveState the simplest solution is to place it in the Perl bin directory (usually C:\Perl\bin). Parasight code no longer requires additional modules except for Tk. -4) Check to see if parasight runs by typing parasight at a command line. You should get a summary of options. -5) If it doesn’t work you may need to fix the path or install any modules such as Tk but only if it complains that they are not found. -6) Once you get parasight to run (i.e. 
list its main options when run without any arguments), try changing to the examples directory and running parasight_examples1.pl -example 1. This is a cheesy scripted tutorial that will demonstrate some of the things parasight can do. This program won’t run unless it can find parasight (i.e. after you put it in a bin directory). You need to execute parasight_examples1.pl in its directory so it can find the example data. -7) The other examples aren’t really scripted but they are examples to give you ideas of what parasight can do and how to go about getting parasight doing it. - - -Enjoy, Jeff - -jab@cwru.edu - diff --git a/programs/parasight_v7.6/test.htm b/programs/parasight_v7.6/test.htm deleted file mode 100644 index 0b8fba5..0000000 --- a/programs/parasight_v7.6/test.htm +++ /dev/null @@ -1,1555 +0,0 @@ - - - -C:/perl/bin/parasight76.pl - - - - - -

- - - - - -
-

-

-
-

NAME

-

parasight (version 7.6)

-

-

-
-

SYNOPSIS

-
- parasight -align alignment.table
-

This loads the file alignment.table containing either a table of tab-delimited alignments or miropeats standard output (see below).

-
- parasight -align AC002038.blast.parse -showseqqueryonly
-

This loads the file AC002038.blast.parse containing parsed tab-delimited blast data. The display is set relative to the query sequence only. This is the number one use in the Eichler lab for parasight.

-
- parasight -align AC002038.blast.parse -showseq AC002038.1: 
-     -extra repeats -template bacblastview.pst 
-     -options 'seq_color=>red, canvas_width=>1000'
-

This draws the blast output from a search with AC002038.1 formatted with the options contained in a previously saved template file bacblastview.pst. It uses -options to modify the screen width and sequence color.

-
- parasight -in saved.parasight -showseq AC002304:AC002035: 
-     -arrangeseq sameline -template template.file 
-     -options 'seq_color=>red,extra_arrow_on=>1'
-

This loads a previously saved parasight view (the files saved.parasight.psa, saved.parasight.pse and saved.parasight.pso, and sometimes saved.parasight.psg). The display is limited to show only 2 of the sequences (AC002034 and AC002035), which are arranged/placed on the same line. It applies a previously saved template file of options to reformat the view, and modifies two options directly (setting the sequence color and turning on arrows for extra annotation).

-

-

-
-

DESCRIPTION

-

Parasight is a generalized pairwise alignment viewer for biological sequences, which was originally developed for analyzing segmental duplications (i.e. paralogy) within the human genome. It is designed to display the positions and relationships of pairwise alignments within sequence(s). It provides both interactive analysis as well as publication quality postscript output. Parasight can arrange and color alignments on the basis of any other included data such as size, percent similarity or even species designation. It can also display the position of any type of simple sequence annotation (a single begin and end point) such as repeats and exons. Finally, it can graph numerical data in relation to the sequence such as windows of percentage GC content. Parasight has been used to analyze output, from programs such as BLAST, Megablast, PipMaker, blastz and miropeats, from the scale of whole human genome to the analysis of a single protein searched against a database of interest. If it is pairwise data, parasight can display the relationships and aid in their analysis.

-

Parasight functions on both UNIX and Windows platforms. It is written in Perl using the graphical Perl/Tk module. It was designed to be extremely flexible and thus a price is paid in terms of speed (Perl) as well as the complexity (the large number of options). However, the plethora of options increases the odds that parasight can do what you want it to do. Although not necessary for basic interactive use, an understanding of regular expressions and a familiarity with Perl is helpful in order to fully utilize the program. Parasight and all of its options are accessible through the GUI interface, the command line, or via loadable templates making analyses flexible and automatable. Most users of parasight load their data into the program and then format the view interactively using the extensive options menu (and now templates). Programmers will be the most likely to use command line manipulation of internal options, for extensive formatting and batch processing. Parasight has been used and tested extensively on both Linux and MS Windows. The UNIX version is the most extensively tested and all options should be available. Windows lacks some of the more advanced options due to incompatibilities/inflexibilities of Bill Gates' more limited (albeit extensively borrowed) command line operating system.

-

To extol parasight's strengths:

-
    -
  1. Flexible
    -
  2. -The RAM is the limit when it comes to loading data. Other than the most basic description of a pairwise alignment or extra sequence feature Parasight makes absolutely no assumptions about your data--allowing the user to analyze what they are interested in analyzing. Technically parasight can't tell a bp apart from an inch or DNA from Protein. -

    -
  3. Formatible
    -
  4. -Parasight has a plethora of options; if I have ever needed it, parasight has it. Every parasight option is available from the command line, the GUI interface or from a saved template file of options. Thus, the basic user and the programmer can completely tailor their parasight views to their exact needs. -

    -
  5. Interactive
    -
  6. -Parasight can interact with the user and with other programs. The user can format the parasight image on the fly via the GUI option menu as well as edit the data by right clicking. Parasight allows the user to print screen shots or dump a postscript of the entire image. Popup windows over alignments and extra sequence features display the objects data. In addition to scaling with the options menu, users can zoom in and out to gain an appreciation of the detail. Parasight has the ability to link to (or execute) other programs allowing the viewing of web pages or associated sequence alignments at the bp level. -

    -
  7. Programmable
    -
  8. -Parasight can accept additional Perl code from the command line or via a file. This allows for even more complex formatting or for the execution of commands such as searching or printing. This is how the tutorial example was automated. Also, combined with the -die option, batch processing, such as generating PostScript images of 30,000 BACs (if you are so inclined), can easily be accomplished. -=back -

-

-

-
-

UNDERSTANDING DATA CATEGORIZATION/CLASSIFICATION

-

A basic understanding of the parasight data model eases learning to use the program. Basically, data falls into three main categories (pairwise alignments, extra simple sequence annotation and numerical graph data). The GUI option menu is organized on the basis of which data is being manipulated.

-
    -
  • -Alignments, for which parasight was developed, have two forms of display (pairs and subs). -
      -
    • -Pairs, which are normally drawn as blocks atop the lines representing the sequence, were the first representation of the pairwise alignments. For each alignment the pairs representing it can be connected by lines to show their relationship. Thus, pairs relationships are only visible if both sequences containing the pairwise are drawn. -

      -
    • -Subs are representations of pairwise in relation to only one of the sequences (essentially mimicking blast type output). Sub is for sub-sequence (as they are drawn below the sequence) or subjects (if you are examining BLAST results). -

    -
  • -Extras are simple sequence annotations that have one beginning and one end such as introns, LINEs, SINEs, motifs, etc. In the case of a gene the intron exon structure can not be drawn as one object, but only as individual exons and individual introns. (A -gene data structure is planned for the distant future.) -

    -
  • -Graph data is a plot of sequence positions (x-axis) versus a numerical value (y-axis). -

-

Below is a crude schematic (the best I could do in POD) of a typical BLAST-like display with data represented by letters -DATAKEY: Sequence(-), Pairs(P), Subs(S), Extras(E), and Graph(G).

-
-                                                                 G
-                   G                   G         G                   G 
-          G               G     G                               G           
-               G                                          G                 
-               
-           EEE   EEEE          EEEE         EEEE      EEE     EEEEEEE
-    S0001--------PPPPPPPPPPPPP-----------PPPPPPPPPPPPPP--PPPPPPPP-----
-    SEQ04        SSSSSSSSSSSSS                           SSSSSSSS
-    SEQ02        SSSSSS   SSSS           SSSSSSSSS
-    SEQ03           SSSSSSSSSSS              SSSSSSSSSSS  SSSSS    
-                                                                                                                                                       ,
-

-

-
-

COMMAND LINE OPTIONS

-

The key command line arguments available can be divided into three main headings: data input, reloading a saved parasight view, and changing view options.

-

-

-

DATA INPUT

-

While many types of data can be loaded and displayed, the absolute minimum input is simply the length of a sequence to be drawn. The length of a sequence can be provided in -showseq file (e.g. parasight -showseq SEQ1,23000:). Usually the lengths are supplied as part of the pairwise alignment file. If no alignments are being drawn then the user must supply the lengths with the -showseq option. Of course examining a line representing a sequence is pretty boring--even if it is decorated with tick marks--so most people use additional data input options. In fact, if you don't use any of the main options parasight will spit out an initial warning asking if you forgot them.

-

-

-

Option: -align

-

-align [filepath1:filepath2:filepath3:etc] loads files containing pairwise alignments. The files must be either the saved standard output for miropeats (Jeremy Parsons) or a tab-delimited format akin to miropeats standard output. The tab-delimited format is simply a table where the first 8 columns contain the pairwise coordinates and lengths of the two similar sequences and each row represents an alignment. The align file is assumed to have a descriptive header in the first row. Hence, the first alignment will be lost (and loaded as the header) if no header actually is present.

-

-

-

Miropeats standard output

-

An example of Jeremy Parson's Miropeats standard output is:

-
- ## Minimum repeat length set to 300.
- 
- 
-         ICAass  Version 2.1
-         =======
- 
- 
- Indexing all the sequences now. This may take a few minutes.
- 
- Total of 1 sequences indexed
- The sorted index is being saved to the file cluster.index.7507 
- .AC002038 118 731 161973 AC002038 44681 44068 161973
- AC002038 1299 1788 161973 AC002038 47175 47664 161973
- AC002038 22870 23591 161973 AC002038 39920 40641 161973
- AC002038 46067 46524 161973 AC002038 26363 26820 161973
- AC002038 46067 47435 161973 AC002038 26363 27731 161973
- AC002038 46699 47435 161973 AC002038 26995 27731 161973
- AC002038 47175 47664 161973 AC002038 1299 1788 161973
- Graphic ready for printing - type the command shown below to print:
- lp threshold300
-

Parasight will recognize this format and parse it automatically.

-

-

-

Example tab-delimited alignment file

-

An example of a tab-delimited align file consisting of parsed BLAST output with additional data columns is shown below:

-
- name1 begin1 end1  len1  name2 begin2 end2  len2  similarity  transversions
- S001  1322   20001 20001 S002  1      18064 18064 0.945632    125
- S001  1322   20001 20001 S003  1      21010 21010 0.980581    143
- S002  1      18064 18064 S003  100    21010 21010 0.999587    7
- S002  1      18064 18064 S004  1      19041 19041 0.989587    43
- S002  1      12191 18064 S005  1      12141 18073 0.997548    17
- S002  12799  18064 18064 S005  12809  18073 18073 0.998548    3
-

The pairwise format consists of the 2 sequence names and their lengths, and the coordinates of the pairwise similarity (name1 begin1 end1 len1 name2 begin2 end2 len2). The first row of the alignment file contains column header names. For the first 8 columns the header names are ignored and fixed to those given. Thus, it is necessary to place these required eight columns in the exact order given above. The only data from these first 8 columns that may be omitted is the overall lengths of the sequences (len1 and len2). While the data may be absent, blank entries for these columns must still be present. Also, the lengths of the sequence must of course be provided elsewhere via the -showseq option. Any additional columns (such as the similarity and transversions columns in the above example) are kept within the internal alignment data table. Any such additional data can be used to format and filter and analyze the parasight views generated. Additional columns used for specific formatting that are created if not present in the alignment file are color, width, offset, sline, scolor, and hide. These are case sensitive headers and must be all lower case if supplied by the user. (color contains the color of the pairwise. width is the width or thickness for the bar representing the pairwise. offset is the offset of the subject object. scolor is the color of a subject object. hide does not display a pairwise if it is equal to 1.) If the values for these columns are not input or are blank then the default values for the options are used. (NOTE: It is usually simpler to modify these formatting columns after initially saving in parasight format and then editing the *.psa file using programs such as Excel.)

-

-

-

Option: -showseq

-

-showseq [filepath | seqname1[,length,begin,end]:seqname2[,length,begin,end]:etc] displays only the designated sequences. With no colon a filename is assumed and the program attempts to load it. If a colon is found in the option then it is assumed that the input is a colon separated group of sequence names (e.g. -showseq AC002038 or -showseq AC004232:AC042438). Optional length and begin and end positions can be designated as well (see below). This information may be given on the command line using commas after the sequence name or be contained within a file (tab-delimited). For analysis such as BLAST searches where you just want to display the query sequence it is easier to use the short cut option -showseqqueryonly in combination with -showseq ALL.

-

-

-

Example showseq file

-

The format of the tab-delimited show file is shown below:

-
- seqname    length   begin   end
- S001       10000    50      1000
- S002       15432    1000    15432
- 
-An example of the data above as a command line entry is:
-
- -showseq S001,10000,50,1000:S002,15432,1,15432:
-

Or with just the lengths to display the entire sequence:

-
- -showseq S001,10000:S002,15432:
-

Or if the lengths are in the alignment file, just the begin and end positions can be designated (note the double comma to skip the sequence length):

-
- -showseq S001,,50,1000:S002,,1000,15432
-

The only required column is the sequence name. The names must be exactly the same as in the alignment file and extra file. The lengths, begins, and ends, are optional for the sequences to be drawn. However, if length, begin or end are used, they must appear in the proper columns. Begin and end must always be found in the 3rd and 4th columns and thus a blank sequence length must be provided for column 2 when sequence length is not designated but begin and end are. If lengths are not supplied in the alignment files or the show file then errors will occur. Lengths designated by -showseq always supersede lengths found in alignments.

-

-extra [filepath1:filepath2:etc] loads any 'extra' sequence annotation/feature that can be expressed as a continuous block of the displayed sequence (i.e. a simple begin and end position). This can include features such as high copy repeats, introns, exons, and genes (if you don’t care about introns/exon structure). The simplest extra file contains 3 columns in the given order seqname begin end.

-

-

-

Example of a tab-delimited extra file

-

An example of a tab-delimited extra file is shown below:

-
- seqname   begin   end    name   color  offset
- S001      50      1000   exon1  blue   -10
- S001      5000    5500   exon2  blue   -10
- S002      5000    9000   LINE1  red    -20
-

Columns added if not present are color, offset, width, and orient. -(Again it is usually simpler to modify formatting data after the fact unless it is generated beforehand (e.g. orient).) In terms of formatting simple color names should work: black, red, green. Orientation should be either 'F' or 'R' (capitalized). Case insensitive PLUS/MINUS, POSITIVE/NEGATIVE, +/- and FORWARD/REVERSE should also work but these will be changed to 'F' and 'R'. Again, any other columns may be added that give additional information such as names and descriptions.

-

-

-

Option: -graph1 or -graph2

-

-graph1 file(:s) and -graph2 file(:s) loads simple graphing data in the form of seqname position value. The position within the sequence in base position and the value must be numerical (floating point). Graph plots the points and/or a connecting line. The graph appears as numerical values of sequence position on the x-axis versus y-axis numerical values. -graph1 is used to generate one plot with scale shown on the left. -graph2 is used to plot another line or set of data points on the same x y axis although the scale is shown on the right. The left and right axis can be scaled for different ranges. Thus, Alu content and GC content can be graphed at the same time. As mentioned the left axis shows the scale for -graph1 and the right side the scale for -graph2. No header is required for the input file. An unknown value can be designated with an empty value position. An empty value position causes a discontinuity in the line. Only the first 3 columns are loaded; all additional columns are ignored. Graphing is built for speed-not flexibility. The only flexibility is in scaling and formatting the axes.

-

-

-

Example of a graph file

-
- seqname  position  value
- chr1      5000      0.43
- chr1     10000      0.65
- chr1     15000      0.73
- chr1     20000      0.65
-

-

-

RELOADING A SAVED PARASIGHT VIEW

-

After data is loaded into parasight it can be saved as ``native'' parasight files, which are simply tab delimited files for *.psa, *.pse, and *.psg, and a text list of all the option settings. These files can be reloaded, using the -in command; however, if some text files were used for formatting or arranging sequences, these files still need to be located in the same relative filepaths. Below are more specifics regarding these options:

-

-

-

Option: -in

-

-in [base filepath] loads a previously saved parasight dataset. Data is saved in 4 separate files (basefilename.psa, basefilename.pse, basefilename.pso, and basefilename.psg). Each file is editable text. The .psa, .pse, and .pso are required even if there are no alignments and/or extras. These extensions to the basefilename are automatically searched for in the given path.

-

The .psa and .pse are tab-delimited tables containing the alignment and extra data, respectively. These tables are easily edited with any text editor; spreadsheets such as Excel are particularly useful in modifying these tables since the data is separated into columns and calculations can easily be done to modify the data as necessary. The first line/row contains the names of all of the columns.

-

The .pso file contains all of the current option information. It is saved as text which can be modified by the end user. It has a similar format to the template files.

-

The .psg file contains all of the current data to graph. The -graph1 data is stored in the first 3 columns and the -graph2 data is stored in the next 3 columns (4 to 6). The graph file, unlike the alignment and extra files, is only created if graph data has been loaded. This was done to provide for backward compatibility before the graph option was added. Thus, a missing *.psg will not generate an error. For each set of 3 columns, column one is the sequence, column 2 is the position on the sequence, and column 3 is the value to plot on the y-axis.

-

IMPORTANT REITERATION: All data necessary for a parasight view is contained in these files EXCEPT for any -showseq files or -arrangeseq files. These files must still be accessible in the same relative path positions in order for the saved file to be loaded properly. In other words, only the file names of the show and arrange files are saved and that data must be reloaded. If the files get moved then the link will be broken and their paths will need to be altered as the paths are usually relative.

-

-

-

CHANGING DISPLAY OPTIONS

-

Option arguments modify and format the parasight view. All of these options may be changed from the command line, a template file, or interactively within the program OPTION menu. The interactive menu is the easiest way to learn and template files the easiest way to apply a set of options again and again. Changing options at the command line follows a set order of precedence--whereby old options loaded from a previous parasight view (-in) are overridden by an option template file (-template), both of which are overridden by any options specified with the (-option) command. These in turn are overridden by direct command line options such as -arrangeseq, -colorsub, and -showsub.

-

PRECEDENCE SUMMARY: internal default ---> -in ---> -template ---> -option ---> commandline

-

-

-

Option: -template

-

-template [filepath] loads an option template file. This allows a user to quickly format future parasight views so that they are just like the saved one. It is created using the save option template in the file menu. When loading a template, if the file is not found in the current or specified path, hard-wired default template directories are searched. For our lab one directory contains templates shared among multiple users, and there is a user specific directory for an individual's PARASIGHT files. The $template_path variable contains the paths. To modify them you must modify the code. The current setting is '~/.PARASIGHT:/people/PARASIGHT'. The search is left to right and first one found is first one used. Template directory as it is currently set does not work for WINDOWS. The tilde must be removed as it only works on Unix where the HOME directory (~) is designated by the environmental variables.

-

The template is a standard text file so a user can modify the values easily. It is created using the save option template in the file menu. 0 and 1 are used for on and off as well as yes and no values. An empty string is simply a line return right after the (=>). A line beginning with ### is ignored as comments and is used to give descriptions of the values.

-

IMPORTANT: Be careful about adding blank space, it is a good idea to edit with normally unseen characters such as spaces and line breaks visualized. This can turn blank lines into spaces and numbers into text.

-

-

-

Option: -options

-

-options ['opt1=>value1,opt2=>value2' | filename ] is a list of options to modify. All of the underlying options are available; however, there are probably many that you will never have reason to modify, but they are all listed in appendix A for completeness. A file containing the option information (same format as above) can be given. Multiple options can be on the same line. As options are loaded they are checked against valid ones. If the option is incorrect (often due to a simple typo), parasight will halt and using some crude pattern matching attempt to provide the most likely name/names of the option one was trying to enter.

-
- Example Options File
- canvas_width=>500, seq_tick_on=>1,
- arrangeseq=>oneperline, 
- showseq=>AC004328:AC0042328
-

-

-

Option: -showsub

-

-showsub [filepath | seqname1:seqname2:etc] This option shows only the designated subs to be drawn under sequences. Multiple sequence names can be directly entered with colon delimitation. If no colon is present then the input will be treated as a file containing a list of subs and will be loaded. Default is ALL, which displays all possible subs. Unlike -showseq the subsequences can not be designated.

-

-

-

Option: -arrangeseq

-

-arrangeseq [oneperline | sameline | file:filename] This option arranges the sequences in a specified manner.

-

oneperline draws each sequence designated by -showseq on a separate line that may wrap if needed.

-

sameline draws all of the sequences designated by -showseq on the same line with a given amount of spacing between them.

-

file:filename uses the data in the file to arrange the sequences in user defined pattern. The file consists of two columns seqname and position in current line. To start a new line NEWLINE is typed alone. The example below places the chromosomes on 3 lines.

-

-

-

Example Arrange File

-
-   acc  start
-   chr1 400000000       
-   chr6 1668388704      
-   chr7 1870803946      
-   chr8 2057427852      
-   chr9 2230204273      
-   NEWLINE              
-   chr22        1       
-   NEWLINE              
-   chr10        400000000       
-   chr11        565589288       
-   chr12        736372841       
-   chr13        900655330       
-   chr14        1040400228      
-   chr15        1167353549
-

Base positions in the 2nd column are optional. Positions will be assigned based on their order. Spacing between sequences on same line will be separted based on seq_spacing_btwn_sequences. (Behavior is akin to -arrangeseq sameline.)

-

-

-

Another Example Arrange File

-
-   acc  
-   chr1 
-   chr6
-   chr7
-   chr8
-   chr9
-   NEWLINE      
-   chr22
-   NEWLINE      
-   chr10
-

-arrangesub [oneperline | stagger | subscale | cscale] This option arranges subs below the drawn sequences. The name came from blast subjects, but you can also think of them in terms of sub (under the) sequence.

-

oneperline means each sub sequence is placed on its own line beneath the drawn sequence. The ordering of sequences can be altered by choosing a column to sort on (arrangesub_col). Adding the number sign character (#) will cause the sort function to be numerical rather than the default alphanumeric.

-

stagger causes multiple subjects to be placed on same line only when they are non-overlapping. The spacing required between the beginning and end of two subs can be varied. This spacing gives room for labels. The ordering starts in terms of other sequences with hits closest to the beginning of the sequence of interest under which the subs are being drawn

-

subscaleN = subjects are placed on a numerical scale based on given column values. Tricky so avoid setting up from command line--use the GUI and then save a template from that.

-

subscaleC = subjects are placed on categorical scale based on column values. Tricky so avoid setting up from command line. Use a template or the GUI.

-

NOTE: the best way to figure the scales out is to experiment with them interactively in the options menu. There are specific modifications of subscaleN and subscaleC that are included as choices. They are denoted by a preceding asterisk and were developed to display breakdowns of percent similarity and chromosome position (for mostly outdated draft versions of the genome). However, they may be instructive to the new user. New views are now simply done via a template rather than adding even more choices.

-

-color ***not implemented*** When implemented it will color the pairwise sequences and connecting lines. Currently, coloring is only based inter and intrachromosomal designation. (As of yet the need hasn't really arisen.) For consistency this should be called colorseq.

-

-colorsub [NONE|RESET|seqrandom|hitrandom|hitconditional] This option provides color schemes for the subs drawn below the sequence.

-

NONE does not change the color and leaves hit colors intact. Hit colors are stored within each pairwise in the table. Subject colors are stored transiently. Hit colors over-ride subject colors. To remove hit colors use RESET.

-

RESET removes hit (individual pairwise) colors, which override any assigned subject colors. For example, if you use hitrandom and then try to switch to seqrandom, nothing will change. This is because hitrandom colors are still stored in the internal alignment table and they take precedence over the subject color scheme. Thus, this intermediate RESET step is required to clear the hit colors. CAUTION: if you use RESET all of your manual coloring will be wiped out. (NOTE: This is because hit colors reside in the same column scolor as manually modified sub colors. The column color defines the pairwise color--overriding inter and intra colors.) Sorry, this is part of the program that could be simplified if I ever have a chance to gut it.

-

seqrandom randomly assigns colors to the various sequences that are displayed as subs. (There is a random set of 20 odd colors that are cycled through.)

-

hitrandom randomly assigns colors to each individual hit or pairwise alignment. (There is a random set of 20 odd colors that are cycled through.)

-

hitconditional allows for each pairwise to be assigned a color based on pseudo-Perl code by using a series of conditional statements that test a single alignment column. Basic syntax is [color] [test] [value];, where color= color to set, test is =, >, or <, and value is some numerical value.

-

-minload is a switch to load only the alignments and extras for the sequences that will be drawn as designated by -showseq. It is very useful for increasing the speed of the program when there are a large number of alignments that will not be drawn in the current view. Why load the genome if you only want to look at chromosome 22?

-

-precode ['Perl code'] This code is executed after the initial drawing of objects. It allows automation for batch processes when combined with die option. (See Advanced option section below for details.)

-

-die parasight quits after executing the precode option (See Advanced option section below for details.)

-

-

-
-

INTERACTIVE MENUS

-

-

-

RESHOW, REARRANGE, REDRAW

-

This part is to answer why there is a blue and white button for updating the drawing. For beginners, I simply suggest using the blue R,R&R (Reshow, Rearrange, and Redraw) button. For extremely large data sets; however, the Reshow, and Rearrange calculations can take a significant amount of time. Thus, if you are just changing the spacing of tick marks it is handy to skip the sequence and arrangement calculations. However, for simple views of BAC BLAST output stick with the blue button.

-

SPEED TIP: Another important tip when working with large data sets is to limit the data shown by using the filter command. By skipping most of the data (not drawing it) you can get all of your formatting options just right. Then once all of the formatting is just right, you can remove the filtering and allow all the objects (alignments and extras) to be drawn.

-

-

-

OPTION MENU

-

The option menu has popup help (over yellow text) and most options are self-explanatory. If in doubt try changing an option and see what happens. I have tried to adhere to a semi-logical naming convention whenever possible. Blue color coding is to show whether a variable will require reshow and rearrangement before taking effect. The menu is subdivided into 7 main parts: MAIN, SEQ/PAIRS, SUBS, EXTRA, GRAPH, FILTER, and MISC. The organization tries to follow the organization of the data in parasight.

-

The MAIN menu allows access to important command line options like -showseq and -showsub. Also, basic screen properties such as size of the window and the number of bases for the width of the screen.

-

The SEQ/PAIRS portion allows manipulation of the sequence and associated tick marks. Pairs and their designation as inter and intrachromosomal as well as connecting lines are controlled from this part of the menu as well.

-

The SUB portion of course is all about the manipulation of subs. This is some of the more complex data manipulation.

-

The EXTRA portion is about the options relating to the extra data.

-

The GRAPH portion is for the graph data. Try turning everything on when you first test out this feature.

-

The FILTER portion allows for the filtering/removal of pairwise and extras based on data in a given column of numerical data.

-

The MISC portion allows for the setting of options controlling printing, the display of alignments, the extraction of sequence, and the execution of other programs.

-

-

-

FILE DROP-DOWN MENU

-

This is the only place where the save parasight command is found. All data and options are saved. A few files are not saved--see information about -in. Loading must be done at the command line. Additionally, template files (*.pst) may be saved and loaded through this menu. After loading a template file the screen must be R,R,& R.

-

-

-

PRINT DROP-DOWN MENU

-

The print menu allows for the generation of a postscript file and its subsequent transmission to a printer if the option print_command is properly set. The postscript file can consist of the visible screen (screen) or the entire parasight drawing (all). If the all option is chosen then the number of pages (vertically and horizontally) across which to print the image is set with the option print_multipages_wide and print_multipages_high. The postscript files are encapsulated and can be easily turned into PDF files with software such as Adobe Distiller or imported into Adobe Illustrator. Also, Word has a special eps import option which was handy when writing my dissertation.

-

ADD MORE DESCRIPTION HERE FOR THE USERS!!!!! -ADD MORE DESCRIPTION HERE FOR THE USERS!!!!! -ADD MORE DESCRIPTION HERE FOR THE USERS!!!!! -ADD MORE DESCRIPTION HERE FOR THE USERS!!!!!

-

-

-

ORDER DROP-DOWN MENU

-

The order menu on the main drop down menu bar allows the order or display level of objects to be changed. You can either send objects all the way to the background or bring them to the foreground. This occurs instantaneously without the need for pressing Redraw.

-

-

-

MISC DROP-DOWN MENU

-

Currently it contains the ability to transfer colors between the two different representations of alignments (pairs and subs). This allows syncing of colors between the pairs and the subs. The changes will not take effect visually until redraw is done. This is really the only way to currently go outside of the inter vs intra coloring schemes for pairs. This option has not been tested extensively.

-

-

-
-

SCREEN MANIPULATION

-

In addition to gazing lovingly at the pretty images after formatting them using the option menu, direct manipulation of the display once drawn can be accomplished with various commands. This allows for interactive analysis to occur!

-

-

-

MOUSE BUTTON FUNCTIONS

-

(see APPENDIX B: for table of mouse functions)

-

First when the mouse is moving over an object it will shimmer with a number of different bright colors. The shimmering object represents an object that you can select if you click on it. Most of the mouse commands work on sequence, pairwise, extra, and subjects. Tick Marks and Labels are immune to most manipulations except for the ALT buttons. The middle mouse button is not used since some systems like my home PC lack them (and I don’t have the dexterity to precisely click both Left and Right at the exact same time which is the usual substitute).

-

DATA POPUP WINDOW (Left-Click) This pops up a simple window displaying all data for an alignment or an extra object. Use Shift-Drag to move the popup window if it is obscured or obscuring data. Formatting options for this popup window are found under MISC tab of the OPTIONS menu.

-

OPTIONS POPUP (Right-Click) Brings up a popup menu of options, which includes a variety of commands such as choosing colors and editing the underlying data. If the actual alignments are present in the alignment table, the alignments can be viewed. If the underlying sequence files are available, subsequences representing objects can be extracted.

-

ZOOM IN AND OUT (Control-Left-Click and Control-Right-Click) -Zooming can be accomplished with the Control key being held down concurrent with a mouse click. The Control left click zooms in two fold (centered at the point of the click). The Control right click has the opposite effect and zooms out. The DeZoom button on the main window returns the scaling to normal.

-

MOVE OBJECT TO FOREGROUND OR BACKGROUND (Alt-Left-Click and Alt-Right-Click) This causes the object clicked on to move all the way to the foreground or the background. The left mouse button moves it to the foreground. The right mouse button moves the object to the background. This is useful to examine multiple overlapping pairwise.

-

MOVE (nonpermanent) ANY OBJECT (Shift-Left Drag) -Allows for the movement of object in the drawing--even tick marks and sequence lines. It is non-permanent but it is useful for removing certain tick marks or names before you print or create a PostScript file. (Altering the data is strongly discouraged!)

-

QUICK COLOR (Shift-Right-Click to color and Shift-Right-Double-Click to uncolor) -Allow for rapid coloring of objects. Shift-Right Click causes the object's color to change to that of the Quick Color Button on the Main Window. Shift-Double-Click-Button attempts to remove the color and leave the default color. In the case of Pairs, black is assigned to the object as inter and intra chromosomal colors can not be reassigned until a Redraw. WARNING: the coloring of all other objects (i.e. not extras and not alignments) are not saved or stored and consequently revert to normal as soon as the image is redrawn.

-

HIDE SEQUENCE OR EXTRA (Alt-Right-Double-Click) This will hide sequences from view (i.e it will disappear from view). To unhide sequences you must use the pre-filter reset in the filter options. (For which I should add a command line!).

-

-

-
-

APPENDIX A: LIST OF VALID OPTIONS WITH INTERNAL DEFAULTS

- -

-

-
-

APPENDIX B: QUICK REFERENCE

-

-

-

COMMAND LINE SUMMARY

-

-align [filepath1:filepath2:etc] load pairwise alignment table(s) (table must be miropeats format)

-

-arrangeseq [oneperline/sameline/file] (default is oneperline)

-
- *oneperline = each sequence is placed on a separate wrapping line
- *sameline = the sequences are placed in alphabetical 
-    order on the same line
- *file:filepath = arrange file that allows specification
-    of line/paragraph and position
-

-arrangesub [oneperline/stagger/subscale/cscale] (default stagger) -Arrange subs below the sequence.

-
- *oneperline = each sequence is placed on its own line
-    underneath sequence
- *stagger = multiple subjects are placed on same line 
-    only when non-overlapping
- *subscaleN =  pairwise hits are placed on a numerical scale
-    based on values in chosen column(s)
- *subscaleC =  pairwise hits are placed on categorical 
-    scale based on hash(s)
-

-color [scheme] ***not implemented yet, no demand yet*** ***Use other options for determining inter vs intrachromosomal***

-

-colorsub [NONE/RESET/seqrandom/hitrandom/hitconditional]

-
- *NONE = does not add a colorsub and does not remove colors 
-    for pairwise hits
- *RESET = removes colors for pairwise hits 
-    colors for pairwise hits override colors for sequence hits
- *seqrandom = color all pairwise comparisons for a subject the same
- *hitrandom = randomly independently color each pairwise comparison
- *hitconditional = allows coloring based on a conditional statement
-

-extra [filepath1:filepath2:etc] loads extra sequence feature table(s). Sequence features are annotations that have single begin and end points (e.g. exons, introns, and repeats). The rows must consist of seqname[tab]begin[tab]end. Further columns may contain optional data. Columns named offset, width, and color provide extra formatting information.

-

-graph1 [filepath1:filepath2:etc] Graphs a data set of values above the sequence line, such as %GC. The data scale is found on the left. The data row format is simply seqname[TAB]begin[TAB]value. No more, no less. For regions without a value a blank will cause the graph line to be disrupted.

-

-graph2 [filepath1:filepath2:etc] Creates another graph using the scale on the right axis. Same parameters as -graph1

-

-in [filepath] load a previously saved parasight view. Three files required are *.psa, *.pse and *.psm (*.psg needed only if a graph has been used)

-

-options ['opt1=>value1,opt2=>value2'] *** Allows all of the parasight options to be changed directly ***. One and zero are used for on/off, yes/no and true/false. Complete access for the programmer using parasight as a displayer (e.g. 'canvas_width=>500,seq_tick_on=>1,graph_scale_on=>1')

-

-showseq [a file or seqname(s):] names of sequences to display

-
-   *ALL = show all files (default) 
-   *no colon = load as file of names 
-     format each line ( seqname[TAB]length[TAB]begin[TAB]end )
-     only sequence name is required other info optional
-   *colon(:) = parse as list of colon-delimited seq names
-     format: (seqname,length,begin,end:seqname2,length2,begin2,end2)
-

-showseqqueryonly This toggles the display of only the first sequence in a given row. This is the usual position for a blast query (hence the name of the option).

-

-showsub [file | seqnames: | ALL] names of subjects to display

-
-   *ALL: displays all subject sequences (default)
-   *no colon = load file containing names (one seqname per line)
-   *colon(:) = parse input as list of colon-delimited sequence names
-

-template [filepath] loads a saved option template file. Template files can be stored in default directories for easy loading.

-

ADVANCED OPTIONS

-

-minload - *loads only the relevant pairwise that will be displayed - (quicker when just certain sequences are needed from large files)

-

-precode 'perl code commands to execute after first screen draw' - *an advanced option useful for automating initial tasks

-

-die parasight ends after executing precode - *an advanced option useful in automating tasks

-

-

-

OPTION PRECEDENCE

-

internal default ---> -in ---> -template ---> -option ---> commandline

-

-

-

MOUSE FUNCTIONS

-
- [DBL]=double click  [DRAG]=button hold down and move mouse 
- EXECUTE # = Execute Command (User Defined under MISC options)
- 
- KEY            LEFT-BUTTON       MIDDLE BUTTON  RIGHT-CLICK                             
- ---------      -----------       -------------  --------------------
- NONE           Popup Desc                       Menu
- CONTROL        Zoom in                          Zoom out
- SHIFT          Move Object[DRAG]                Quick color; Uncolor[DBL]
- ALTERNATE      Del  Object [DBL]                Lower Object; Raise Object[DBL]
- CONTROL-SHIFT  Execute 1         Execute 2      Execute 3
-

-

-

COMPACT ALPHABETICAL LIST OF -OPTIONS WITH DEFAULTS

-

alignment_col=>0   -alignment_col2=>0   -alignment_wrap=>50   -arrangeseq=>oneperline   -arrangesub=>stagger   -arrangesub_stagger_spacing=>40000   -canvas_bpwidth=>250000   -canvas_indent_left=>60   -canvas_indent_right=>30   -canvas_indent_top=>40   -color=> None   -colorsub=> None   -colorsub_hitcond_col=>34   -colorsub_hitcond_tests=>red if <2; orange if <0.99; yellow if <0.98; green if <0.97; blue if <0.96; purple if <0.95; brown if <0.94; grey if <0.93; black if <0.92; pink if <0.91   -execute=>   -execute2=>   -execute2_array=>m   -execute2_desc=>   -execute3=>   -execute3_array=>m   -execute3_desc=>widget   -execute4=>   -execute4_array=>m   -execute4_desc=>   -execute_array=>e   -execute_desc=>   -extra_arrow_diag=>5   -extra_arrow_on=>1   -extra_arrow_para=>5   -extra_arrow_perp=>4   -extra_color=>purple   -extra_label_col=>10   -extra_label_col_pattern=>   -extra_label_color=>purple   -extra_label_fontsize=>6   -extra_label_offset=>2   -extra_label_on=>1   -extra_label_test_col=>   -extra_label_test_pattern=>   -extra_offset=>-4   -extra_on=>1   -extra_width=>6   -fasta_blastdb=>htg:nt   -fasta_directory=>.:fastax   -fasta_fragsize=>400000   -fasta_on=>1   -fasta_wrap=>50   -filename_color=>grey   -filename_offset=>-10   -filename_offset_h=>0   -filename_on=>1   -filename_pattern=>   -filename_size=>10   -filter1_col=>   -filter1_max=>   -filter1_min=>   -filter2_col=>   -filter2_max=>   -filter2_min=>   -filterextra1_col=>   -filterextra1_max=>   -filterextra1_min=>   -filterextra2_col=>   -filterextra2_max=>   -filterextra2_min=>   -filterpre1_col=>   -filterpre1_max=>   -filterpre1_min=>   -filterpre2_col=>   -filterpre2_max=>   -filterpre2_min=>   -gif_anchor=>center   -gif_on=>0   -gif_path=>   -gif_x=> int($opt{window_width}/2)   -gif_y=>0   -graph1_label_color=>blue   -graph1_label_decimal=>2   -graph1_label_fontsize=>10   -graph1_label_multiplier=>1   -graph1_label_offset=>1   -graph1_label_on=>1   -graph1_line_color=>blue   -graph1_line_on=>1   
-graph1_line_smooth=>0   -graph1_line_width=>1   -graph1_max=>100   -graph1_min=>-5   -graph1_on=>0   -graph1_point_fill_color=>blue   -graph1_point_on=>1   -graph1_point_outline_color=>blue   -graph1_point_outline_width=>1   -graph1_point_size=>2   -graph1_tick_color=>black   -graph1_tick_length=>6   -graph1_tick_offset=>1   -graph1_tick_on=>1   -graph1_tick_width=>3   -graph1_vline_color=>black   -graph1_vline_on=>1   -graph1_vline_width=>2   -graph2_label_color=>red   -graph2_label_decimal=>2   -graph2_label_fontsize=>10   -graph2_label_multiplier=>1   -graph2_label_offset=>8   -graph2_label_on=>1   -graph2_line_color=>red   -graph2_line_on=>1   -graph2_line_smooth=>0   -graph2_line_width=>1   -graph2_max=>1000   -graph2_min=>-1000   -graph2_on=>0   -graph2_point_fill_color=>red   -graph2_point_on=>1   -graph2_point_outline_color=>red   -graph2_point_outline_width=>1   -graph2_point_size=>2   -graph2_tick_color=>black   -graph2_tick_length=>6   -graph2_tick_offset=>5   -graph2_tick_on=>1   -graph2_tick_width=>3   -graph2_vline_color=>black   -graph2_vline_on=>1   -graph2_vline_width=>2   -graph_scale_height=>80   -graph_scale_hline_color=>black   -graph_scale_hline_on=>1   -graph_scale_hline_width=>1   -graph_scale_indent=>-20   -graph_scale_interval=>4   -graph_scale_on=>0   -help_on=>1   -help_wrap=>50   -mark_advanced=>   -mark_array=>m   -mark_col=>   -mark_col2=>   -mark_color=>red   -mark_pairs=>0   -mark_pattern=>AC002038   -mark_permanent=>0   -mark_subs=>1   -pair_inter_color=>red   -pair_inter_line_on=>0   -pair_inter_offset=>0   -pair_inter_on=>1   -pair_inter_width=>13   -pair_intra_color=>blue   -pair_intra_line_on=>0   -pair_intra_offset=>0   -pair_intra_on=>1   -pair_intra_width=>9   -pair_level=>NONE   -pair_type_col=>   -pair_type_col2=>   -pair_type_col2_pattern=>   -pair_type_col_pattern=>   -popup_format=>text   -popup_max_len=>300   -print_command=>lpr -P Rainbow {}   -print_multipages_high=>1   -print_multipages_wide=>1   
-printer_page_length=>11i   -printer_page_orientation=>1   -printer_page_width=>8i   -quick_color=>purple   -seq_color=>black   -seq_label_color=>black   -seq_label_fontsize=>12   -seq_label_offset=>-4   -seq_label_offset_h=>0   -seq_label_on=>1   -seq_label_pattern=>   -seq_line_spacing_btwn=>250   -seq_line_spacing_wrap=>200   -seq_spacing_btwn_sequences=>10000   -seq_tick_b_color=>black   -seq_tick_b_label_anchor=>ne   -seq_tick_b_label_color=>black   -seq_tick_b_label_fontsize=>9   -seq_tick_b_label_multiplier=>0.001   -seq_tick_b_label_offset=>2   -seq_tick_b_label_offset_h=>0   -seq_tick_b_label_on=>1   -seq_tick_b_length=>10   -seq_tick_b_offset=>0   -seq_tick_b_on=>1   -seq_tick_b_width=>2   -seq_tick_bp=>20000   -seq_tick_color=>black   -seq_tick_e_color=>black   -seq_tick_e_label_anchor=>nw   -seq_tick_e_label_color=>black   -seq_tick_e_label_fontsize=>9   -seq_tick_e_label_multiplier=>0.001   -seq_tick_e_label_offset=>2   -seq_tick_e_label_offset_h=>0   -seq_tick_e_label_on=>1   -seq_tick_e_length=>10   -seq_tick_e_offset=>0   -seq_tick_e_on=>1   -seq_tick_e_width=>2   -seq_tick_label_anchor=>n   -seq_tick_label_color=>black   -seq_tick_label_fontsize=>9   -seq_tick_label_multiplier=>0.001   -seq_tick_label_offset=>2   -seq_tick_label_on=>1   -seq_tick_length=>10   -seq_tick_offset=>0   -seq_tick_on=>1   -seq_tick_whole=>0   -seq_tick_width=>2   -seq_width=>3   -showqueryonly=>0   -sub_arrow_diag=>5   -sub_arrow_on=>0   -sub_arrow_paral=>5   -sub_arrow_perp=>4   -sub_color=>lightgreen   -sub_initoffset=>30   -sub_labelhit_col=>13   -sub_labelhit_color=>black   -sub_labelhit_offset=>0   -sub_labelhit_on=>0   -sub_labelhit_pattern=>0?([0-9.]{4})   -sub_labelhit_size=>9   -sub_labelseq_col=>0   -sub_labelseq_col2=>4   -sub_labelseq_col2_pattern=>   -sub_labelseq_col_pattern=>   -sub_labelseq_color=>black   -sub_labelseq_offset=>0   -sub_labelseq_on=>1   -sub_labelseq_size=>6   -sub_labelseqe_col=>4   -sub_labelseqe_col2=>0   -sub_labelseqe_col2_pattern=>   
-sub_labelseqe_col_pattern=>   -sub_labelseqe_color=>black   -sub_labelseqe_offset=>0   -sub_labelseqe_on=>0   -sub_labelseqe_size=>6   -sub_line_spacing=>9   -sub_on=>1   -sub_scale_categoric_string=>   -sub_scale_col=>   -sub_scale_col2=>   -sub_scale_col2_pattern=>   -sub_scale_col_pattern=>   -sub_scale_hline_color=>grey   -sub_scale_hline_on=>1   -sub_scale_hline_width=>1   -sub_scale_label_color=>black   -sub_scale_label_fontsize=>12   -sub_scale_label_multiplier=>100   -sub_scale_label_offset=>1   -sub_scale_label_on=>1   -sub_scale_label_pattern=>   -sub_scale_lines=>10   -sub_scale_max=>1.00   -sub_scale_min=>0.80   -sub_scale_on=>0   -sub_scale_step=>0.01   -sub_scale_tick_color=>black   -sub_scale_tick_length=>9   -sub_scale_tick_offset=>4   -sub_scale_tick_on=>1   -sub_scale_tick_width=>3   -sub_scale_vline_color=>black   -sub_scale_vline_offset=>-5   -sub_scale_vline_on=>1   -sub_scale_vline_width=>2   -sub_width=>8   -template_desc_on=>1   -text2_anchor=>nw   -text2_color=>red   -text2_offset=>0   -text2_offset_h=>0   -text2_on=>1   -text2_size=>20   -text2_text=>   -text_anchor=>nw   -text_color=>red   -text_fontsize=>20   -text_offset=>0   -text_offset_h=>0   -text_on=>1   -text_text=>   -window_font_size=>9   -window_height=>550   -window_width=>800

-

-

-
-

APPENDIX C: INSTALLATION (WINDOWS OR UNIX)

-

Parasight has been tested extensively on Solaris, Linux, and MsWindows. Perl is available from www.perl.org. ActiveState (www.activestate.com) has binary versions available for many platforms--particularly useful for Windows installs. Follow instructions on the chosen sites for installing Perl. Unix installs should be easier simply because you probably have more experience with Perl or you have a network administrator. Windows installs are quite easy--just like installing any other program. Once the install is done put the parasight program in the Perl bin directory (usually C:\Perl\bin). If you need to install any Perl modules such as Tk consult the individual OS. For the Windows ActiveState binary, PPM provides easy searches and installations of modules. UNIX environments can utilize the CPAN module.

-

If there is a strong need, standalone versions of the program that are packaged together with all needed Perl functions could be generated using ActiveState's PerlApp program. All needed components are contained within the ``packed up'' executable for Linux, Solaris, and Windows. No installation of Perl is needed. Note this is not a compiled version, so the run speed will be the same as the non-PerlApp-packaged program. It is actually just an executable that has collected all of the Perl components required for Parasight to run.

-

-

-
-

APPENDIX D: PRECODE HINTS

-

Precode affords the ability to execute additional code as parasight executes. Extensive use of precode is found in the parasight.examples file. The best way to figure out how to manipulate parasight is to study all of the parasight code. Of course even I am trying to forget most of the code so here is a list of useful subroutines to use for automation and batch processing.

-

First the hash variable containing all of the command line options is %opt. So, if you want to change arrangesub you have to use the code $opt{'arrangesub'}; - -

-

Useful commands to use when scripting: - -

-
- $opt{'x'}
-
-
-

Any normal option can be accessed within the hash %opt. - -

-
- &reshowNredraw; &update;
- 
-These two subroutines used in tandem will cause any changes in options to take effect, with the view being redrawn and updated on the screen.  While update is not normally used in the internal code (as it is called automatically whenever control is returned to the GUI), it is necessary when a script has control of parasight.
-
-
-
- &print_screen(0, "fileoutpath");
-
-
-

This will print a postscript of the visible screen to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the default printer. If zero is used only the file is created. - -

-
- &print_all (1, "fileoutpath");
-
-
-

This will print a postscript of the entire parasight area to the designated file. If 1 is used for the initial print variable then the postscript will be sent to the printer. If zero is used only the file is created. Depending upon the multipage options, multiple files may be created. - -

-
- &save_parasight_table("basefileoutpath");
-
-
-

To save as parasight formatted files which can be reloaded with the -in ``basefileoutpath'' name. - -

-
- &fitlongestline;
-
-
-

This will force the length of the screen in bases to the length of the longest sequence. This is most useful for BLAST views. - -

-
- $opt{'die'}=0;
-
-
-

This is useful to turn off the die option if you are subsequently saving the parasight files. Otherwise when you load the saved parasight it will ``die'' before you get to see it. - -

-
- &reshowNredraw; &update; print "PAUSED\n"; my $pause=<STDIN>; 
-
-
-

A useful sequence of commands if you want to pause for the user. It requires the user to hit enter in the COMMAND window. See example one of the parasight examples. - -

-
- $opt{"text_text"}="This is displayed text."; $opt{"text_fontsize"}=16;         $opt{"text_offset_h"}=10;
-
-
-

Allows for a line of text to be printed within the image. text2_text allows for a second line to be positioned elsewhere. - -

-

-

-
-

APPENDIX E: ADDITIONAL EXAMPLES

-
- parasight -showseq show.file -extra repeat.file:exon.file
-
-
-

This draws the sequences specified in show.file decorated with the repeats and exons specified in repeat.file and exon.file. Note: this example does not contain any alignments so show.file is required in order to specify the lengths of the sequences to be displayed. - -

-
- parasight -in saved  -extra exons:introns 
-     -arrangeseq oneperline
-
-
-

This loads saved parasight files, adds extra annotation from the files exons and introns. It arranges subjects one per line below the sequence. - -

-

-

-
-

AUTHOR

-

Jeff Bailey (jab@case.edu) - -

-

-

-
-

ACKNOWLEDGEMENTS

-

This software was developed in the laboratory of Evan Eichler, Department of Genetics, Case Western Reserve University and University Hospitals, Cleveland. - -

-

-

-
-

COPYRIGHT

-

Copyright (C) 2001-4 Jeff Bailey. Distribute and modify freely as defined by the GNU General Public License. - -

-

-

-
-

DISCLAIMER

-

This software is provided ``as is'' without warranty of any kind. - -

- - - - diff --git a/snakemake/02_check_run_stats.smk b/snakemake/02_check_run_stats.smk new file mode 100644 index 0000000..27520bb --- /dev/null +++ b/snakemake/02_check_run_stats.smk @@ -0,0 +1,150 @@ +configfile: 'variant_calling.yaml' +output_folder='/opt/analysis' +log_folder=output_folder+'/run_settings' +import subprocess +subprocess.call(f'mkdir -p {log_folder}', shell=True) +rule all: + input: + snakefile=log_folder+'/02_check_run_stats.smk', + repool_csv=output_folder+'/repool.csv', + barcode_counts=output_folder+'/barcode_counts.csv', + output_graph=output_folder+'/umi_heatmap.html' + +rule copy_params: + ''' + copies snakemake file, config file, profile, and python scripts to output + folder + ''' + input: + snakefile='/opt/snakemake/02_check_run_stats.smk', + configfile='variant_calling.yaml', + scripts='/opt/snakemake/scripts' + output: + snakefile=log_folder+'/02_check_run_stats.smk', + configfile=log_folder+'/variant_calling.yaml', + scripts=directory(log_folder+'/scripts') + resources: + log_dir=log_folder + shell: + ''' + cp {input.snakefile} {output.snakefile} + cp {input.configfile} {output.configfile} + cp -r {input.scripts} {output.scripts} + ''' + +rule modify_ozkan_settings: + ''' + copies Ozkan's default settings, plus any user updated settings, to an + output folder alongside the data for later reference. 
+ ''' + params: + template_settings='/opt/resources/templates/analysis_settings_templates/settings.txt', + processor_number=config['processor_number'], + bwa_extra=config['bwa_extra'], + species=config['species'], + probe_sets_used=config['probe_sets_used'], + freebayes_threads=config['freebayes_threads'], + min_haplotype_barcodes=config['min_haplotype_barcodes'], + min_haplotype_samples=config['min_haplotype_samples'], + min_haplotype_sample_fraction=config['min_haplotype_sample_fraction'], + wdir='/opt/analysis' + output: + user_settings=output_folder+'/settings.txt' + resources: + log_dir=log_folder + script: + 'scripts/modify_ozkan_settings.py' + +rule parse_info_file: + ''' + parses the original info file into multiple sub-files + ''' + input: + user_settings=output_folder+'/settings.txt' + output: + data=output_folder+'/data.tsv', + samples=output_folder+'/samples.tsv', + unique_haplotypes=output_folder+'/unique_haplotypes.csv' + params: + wdir='/opt/analysis', + settings_file='settings.txt', + info_files=['/opt/data/'+config['wrangled_file']], + sample_sheets='/opt/data/sample_sheet.tsv', + sample_groups=config['sample_groups'] + resources: + log_dir=log_folder + script: + 'scripts/parse_info_file.py' + +rule map_haplotypes: + ''' + maps haplotypes against the reference genome and outputs several tables + showing these mappings and whether they are on target. 
+ ''' + input: + data=output_folder+'/data.tsv', + samples=output_folder+'/samples.tsv', + unique_haplotypes=output_folder+'/unique_haplotypes.csv' + params: + wdir='/opt/analysis', + settings_file='settings.txt', + output: + fastq_haps=output_folder+'/haplotypes.fq', + haps_sam=output_folder+'/haplotypes_bwa.sam', + aligned_haps=output_folder+'/aligned_haplotypes.csv', + all_haps=output_folder+'/all_haplotypes.csv', + mapped_haps=output_folder+'/mapped_haplotypes.csv', + offtarget_haps=output_folder+'/offtarget_haplotypes.csv', + metadata=output_folder+'/run_meta.csv', + barcode_counts=output_folder+'/barcode_counts.csv', + haplotype_counts=output_folder+'/haplotype_counts.csv', + sample_summary=output_folder+'/sample_summary.csv' + #resources below are currently not utilized - haven't figured out a way to + #get singularity profile, slurm profile, and high ulimits all at once. + resources: + mem_mb=200000, + time_min=4320, + nodes=20, + log_dir=log_folder + script: + 'scripts/map_haplotypes.py' + +rule graph_barcodes: + ''' + graphs the barcodes that worked and the barcodes that failed + ''' + input: + barcode_counts=output_folder+'/barcode_counts.csv', + params: + wdir='/opt/analysis', + output: + output_graph=output_folder+'/umi_heatmap.html' + resources: + log_dir=log_folder + script: + 'scripts/graph_barcodes.py' + +rule make_repool_table: + ''' + creates a table that recommends (for each sample) whether it needs to be + repooled or recaptured based on some user-defined thresholds + ''' + input: + output_folder+'/run_meta.csv' + params: + high_barcode_threshold=config['high_barcode_threshold'], + low_coverage_action=config['low_coverage_action'], + target_coverage_count=config['target_coverage_count'], + target_coverage_fraction=config['target_coverage_fraction'], + target_coverage_key=config['target_coverage_key'], + barcode_coverage_threshold=config['barcode_coverage_threshold'], + barcode_count_threshold=config['barcode_count_threshold'], + 
assessment_key=config['assessment_key'], + good_coverage_quantile=config['good_coverage_quantile'], + repool_csv='/opt/analysis/repool.csv' + resources: + log_dir=log_folder + output: + repool_csv=output_folder+'/repool.csv' + script: + 'scripts/make_repool_table.py' diff --git a/snakemake/03_generate_contigs.smk b/snakemake/03_generate_contigs.smk new file mode 100644 index 0000000..dd92aa6 --- /dev/null +++ b/snakemake/03_generate_contigs.smk @@ -0,0 +1,66 @@ +configfile: 'variant_calling.yaml' +output_folder='/opt/analysis' +log_folder='/opt/analysis/run_settings' +import subprocess +subprocess.call(f'mkdir {log_folder}', shell=True) + +rule all: + input: + freebayes_command_dict=output_folder+'/freebayes_command_dict.yaml', + snakefile=log_folder+'/03_generate_contigs.smk' + +rule copy_params: + ''' + copies snakemake file, config file, profile, and python scripts to output + folder + ''' + input: + generate_contigs_snakefile='/opt/snakemake/03_generate_contigs.smk', + run_freebayes_snakefile = '/opt/snakemake/04_run_freebayes.smk', + scripts='/opt/snakemake/scripts' + output: + generate_contigs_snakefile=log_folder+'/03_generate_contigs.smk', + run_freebayes_snakefile = log_folder+'/04_run_freebayes.smk', + scripts=directory(log_folder+'/scripts') + resources: + log_dir=log_folder + shell: + ''' + cp {input.generate_contigs_snakefile} {output.generate_contigs_snakefile} + cp {input.run_freebayes_snakefile} {output.run_freebayes_snakefile} + cp -r {input.scripts} {output.scripts} + ''' + +rule generate_contigs: + ''' + generates padded bams and padded fastqs and a list of commands to run freebayes + ''' + input: + output_folder+'/aligned_haplotypes.csv' + output: + #contig_vcfs=directory(output_folder+'/contig_vcfs'), + padded_bams=directory(output_folder+'/padded_bams'), + padded_fastqs=directory(output_folder+'/padded_fastqs'), + freebayes_command_dict=output_folder+'/freebayes_command_dict.yaml' + #variants_index=output_folder+'/variants.vcf.gz.csi', + 
# Preamble of 04_run_freebayes.smk.
configfile: 'variant_calling.yaml'
#singularity: config['sif_file']
output_folder='/opt/analysis'

import yaml
import subprocess

# Load the mapping written by scripts/generate_contigs.py. Its keys are used
# below as the {contig} wildcard values and each value is the command that
# scripts/run_freebayes.py executes for that contig. The file is read inside a
# `with` block so the handle is closed as soon as parsing is done.
with open(output_folder+'/freebayes_command_dict.yaml','r') as freebayes_command_dict_yaml:
    freebayes_command_dict = yaml.safe_load(freebayes_command_dict_yaml)
+ resources: + mem_mb=200000, + nodes=16, + time_min=5760, + #log_dir=log_folder + script: + 'scripts/run_freebayes.py' + + +rule concatenate_and_fix_vcf_headers: + input: + contig_vcf = expand(output_folder+"/contig_vcfs/{contig}.vcf.gz", + contig = freebayes_command_dict.keys()) + output: + variants=output_folder+'/variants.vcf.gz', + params: + freebayes_settings=config['freebayes_settings'], + wdir='/opt/analysis', + settings_file='settings.txt' + #resources below are currently not utilized - haven't figured out a way to + #get singularity profile, slurm profile, and high ulimits all at once. + resources: + mem_mb=200000, + nodes=16, + time_min=5760, + #log_dir=log_folder + script: + 'scripts/concatenate_headers.py' + +rule generate_tables: + input: + variants=output_folder+'/variants.vcf.gz', + output: + ref_table=output_folder+'/reference_table.csv', + cov_table=output_folder+'/coverage_table.csv', + alt_table=output_folder+'/alternate_table.csv' + params: + wdir='/opt/analysis', + settings_file='settings.txt', + geneid_to_genename=config['geneid_to_genename'], + target_aa_annotation=config['target_aa_annotation'], + aggregate_nucleotides=config['aggregate_nucleotides'], + aggregate_aminoacids=config['aggregate_aminoacids'], + target_nt_annotation=config['target_nt_annotation'], + annotate=config['annotate'], + decompose_options=config['decompose_options'], + annotated_vcf=config['annotated_vcf'], + aggregate_none=config['aggregate_none'], + output_prefix=config['output_prefix'] + script: + 'scripts/generate_tables.py' diff --git a/snakemake/README.md b/snakemake/README.md new file mode 100644 index 0000000..f1bc178 --- /dev/null +++ b/snakemake/README.md @@ -0,0 +1,51 @@ +# miptools_analysis_no_jupyter + +This is a snakemake pipeline for running a modified analysis_template_with_qual +notebook (from miptools analysis). 
It consists of three parts: + + - setup_run: creates a singularity profile for running remaining steps + - check_run_stats: checks which samples and mips worked and at what levels + - variant_calling: calls variants and generates output tables + +Currently, each step requires that the previous steps have run - you can't check +run stats without setting up the singularity profile, and you can't run the +variant calling step without setting up the run and checking run stats. + +## Installation + + - Install conda: https://github.com/conda-forge/miniforge#unix-like-platforms-mac-os--linux. +You'll need to follow the instructions to 'initialize' the conda environment at the end of the +installer, then sign out and back in again. + - Create a conda environment and install snakemake there: +```bash +mamba create -c conda-forge -c bioconda -n snakemake snakemake +conda activate snakemake +``` + +### Set up your environment: + - Change directory to a folder where you want to run the analysis + - Download the files of this repository into that folder + +## Usage: + - Edit the miptools_analysis_no_jupyter.yaml file to point to your files. +Use a text editor that outputs Unix line endings (e.g. vscode, notepad++, gedit, micro, emacs, vim, vi, etc.)
'''
Concatenates the per-contig VCFs into /opt/analysis/variants.vcf.gz with
bcftools, indexes the result and, when freebayes was run with --gvcf, replaces
the file with a re-headered copy (the original is kept as unfixed.vcf.gz).
Finally writes a small marker file, freebayes_reheader_check.txt.
'''
import sys
import subprocess
import gzip
import os
import yaml
sys.path.append("/opt/src")
import mip_functions as mip

# paths of the per-contig vcf.gz files, as written by generate_contigs.py
with open("/opt/analysis/contig_vcf_gz_paths.yaml",'r') as contig_vcf_gz_paths_yaml:
    contig_vcf_gz_paths = yaml.safe_load(contig_vcf_gz_paths_yaml)

vcf_file="/opt/analysis/variants.vcf.gz"

wdir=snakemake.params['wdir']
settings_file=snakemake.params['settings_file']
options=snakemake.params['freebayes_settings']
settings = mip.get_analysis_settings(wdir+'/'+settings_file)

# concatenate contig vcfs. The number of contigs may be high, so we'll
# write the vcf paths to a file and bcftools will read from that file
cvcf_paths_file = os.path.join(wdir, "contig_vcfs", "vcf_file_list.txt")
with open(cvcf_paths_file, "w") as outfile:
    outfile.write("\n".join(contig_vcf_gz_paths) + "\n")
subprocess.run(["bcftools", "concat", "-f", cvcf_paths_file, "-Oz",
                "-o", vcf_file], check=True)
subprocess.run(["bcftools", "index", "-f", vcf_file], check=True)

# fix vcf header if --gvcf option has been used
if "--gvcf" in options:
    temp_vcf_path = os.path.join(wdir, "temp.vcf.gz")
    mip.vcf_reheader(os.path.basename(vcf_file), temp_vcf_path, wdir=wdir)
    old_vcf_path = os.path.join(wdir, "unfixed.vcf.gz")
    # check=True: a failed move must stop the script instead of letting the
    # following steps run against a missing or stale variants file
    subprocess.run(["mv", vcf_file, old_vcf_path], check=True)
    subprocess.run(["mv", temp_vcf_path, vcf_file], check=True)
    subprocess.run(["bcftools", "index", "-f", vcf_file], check=True)
    print('did a reheader')

with open('/opt/analysis/freebayes_reheader_check.txt','w') as file:
    file.write('reheader done')
+output_profile.write('printshellcmds: True\n') +output_profile.write('cores: 16\n') +output_profile.write('keep-going: True\n') +output_profile.write('rerun-incomplete: True\n') +output_profile.write('use-conda: True\n') +output_profile.write('latency-wait: 60\n') +output_profile.write('#keep-incomplete: True\n') +output_profile.write('#restart-times: 3\n') diff --git a/snakemake/scripts/generate_contigs.py b/snakemake/scripts/generate_contigs.py new file mode 100644 index 0000000..2f31726 --- /dev/null +++ b/snakemake/scripts/generate_contigs.py @@ -0,0 +1,34 @@ +import sys +import yaml +sys.path.append("/opt/src") +import mip_functions as mip + + +wdir=snakemake.params['wdir'] +settings_file=snakemake.params['settings_file'] +options=snakemake.params['freebayes_settings'] +targets_file=snakemake.params.targets_file + +settings = mip.get_analysis_settings(wdir+'/'+settings_file) +freebayes_command_dict_yaml = open('/opt/analysis/freebayes_command_dict.yaml','w') +contig_vcf_gz_paths_yaml = open('/opt/analysis/contig_vcf_gz_paths.yaml','w') + +verbose=True +fastq_dir="/opt/analysis/padded_fastqs" +bam_dir="/opt/analysis/padded_bams" +vcf_file="/opt/analysis/variants.vcf.gz" +errors_file="/opt/analysis/freebayes_errors.txt" +warnings_file="/opt/analysis/freebayes_warnings.txt" + +#what is the purpose of the variable 'r'? I don't think it ever gets used again. 
'''
Prepares the MIPWrangler input files from an arms file and a sample sheet:
 - copies the sample sheet to the output location
 - writes an arms file restricted to the columns listed below
 - writes a two column table ('mips', 'samples') listing every mip family and
   every sample that (a) has a fastq entry in the fastq folder and (b) matches
   the requested sample set and at least one requested probe set. The two
   columns are independent lists; the shorter one is padded with empty cells.
'''
import pandas as pd
import subprocess
import os
import shutil
from itertools import zip_longest

arms_file=snakemake.input.arms_file
input_fastq_folder=snakemake.input.fastq_folder
input_sample_sheet=snakemake.input.sample_sheet
desired_sample_set=snakemake.params.sample_set
desired_probe_sets=snakemake.params.probe_sets.replace(' ', '').strip().split(',')
mip_arms=snakemake.output.mip_arms
output_sample_sheet=snakemake.output.sample_sheet

# shutil.copy instead of an unquoted `cp` through the shell: paths with spaces
# work, and a failed copy raises instead of being silently ignored
shutil.copy(input_sample_sheet, output_sample_sheet)

#grab only selected columns from original arms file and output them to new arms file
arms_df=pd.read_table(arms_file)
arms_df=arms_df[['mip_id', 'mip_family', 'extension_arm', 'ligation_arm', 'extension_barcode_length', 'ligation_barcode_length', 'gene_name', 'mipset']]
arms_df.to_csv(mip_arms, index=False, sep='\t')

# sample names are the part of each fastq folder entry before the first '_'
sequenced_samples=[sample.split('_')[0] for sample in os.listdir(input_fastq_folder)]
print('sequenced samples are', sequenced_samples)
sequenced_lookup=set(sequenced_samples)

wanted_probe_sets=[probe_set.upper() for probe_set in desired_probe_sets]
wanted_sample_set=desired_sample_set.upper()

samples_used=set()
with open(input_sample_sheet) as sheet:
    for line_number, line in enumerate(sheet):
        line=line.strip().split('\t')
        if line_number==0:
            # header row: remember where the columns of interest are
            sample_name_c, sample_set_c, replicate_c, probe_set_c=line.index('sample_name'), line.index('sample_set'), line.index('replicate'), line.index('probe_set')
            continue
        probe_sets={entry.upper() for entry in line[probe_set_c].replace(' ', '').strip().split(',')}
        sample_set=line[sample_set_c].replace(' ', '').strip()
        new_sample_name=f'{line[sample_name_c]}-{line[sample_set_c]}-{line[replicate_c]}'
        if (new_sample_name in sequenced_lookup
                and sample_set.upper()==wanted_sample_set
                and any(probe_set in probe_sets for probe_set in wanted_probe_sets)):
            samples_used.add(new_sample_name)

family_list=sorted(arms_df['mip_family'].tolist())
sample_list=sorted(samples_used)

# zip_longest pads the shorter column with empty strings
with open(snakemake.output.sample_file, 'w') as sample_file:
    sample_file.write('mips\tsamples\n')
    for family, sample in zip_longest(family_list, sample_list, fillvalue=''):
        sample_file.write(f'{family}\t{sample}\n')
'''
Lists the samples whose MIP extraction produced at least one good read.

Reads sample names from the second column of the mips/samples table, looks up
each sample's extractInfoSummary.txt and writes the names with a positive
read count to the good_samples output. The output is only written when a
summary value was found for every sample.
'''
sample_file=snakemake.input.sample_file
good_samples=snakemake.output.good_samples
nested_output=snakemake.params.nested_output

samples=[]
with open(sample_file) as sample_handle:
    next(sample_handle, None)  # skip the header row
    for line in sample_handle:
        # Only strip the line ending: a plain strip() would also remove a
        # leading tab, so a row with an empty mips column (more samples than
        # mips) would collapse to one field and its sample would be lost.
        fields=line.rstrip('\r\n').split('\t')
        if len(fields)>1 and fields[1].strip():
            samples.append(fields[1].strip())

summary_dict={}
for sample in samples:
    summary_file=nested_output+f'/analysis/{sample}/{sample}_mipExtraction/extractInfoSummary.txt'
    with open(summary_file) as summary_handle:
        for line_number, line in enumerate(summary_handle):
            if line_number>0:
                # column 7 looks like '<count>(...)'; keep the leading count.
                # If there are several data rows the last one wins.
                good_read_count=int(line.strip().split('\t')[6].split('(')[0])
                summary_dict[sample]=good_read_count

if len(summary_dict)==len(samples):
    #only make output file if all extractions gave data - double checks that all
    #extractions really did complete for all samples
    with open(good_samples, 'w') as good_samples_file:
        for sample in summary_dict:
            if summary_dict[sample]>0:
                good_samples_file.write(sample+'\n')
def make_graphing_list(barcode_file):
    '''
    Parses a comma separated barcode count table into heatmap inputs.

    The first line supplies the column labels (every field after the first).
    Lines 2 and 3 are skipped. For every later line the first field is the row
    label and the remaining fields are counts, which are truncated to integers
    and converted to log2(count+1).

    Returns (graphing_list, columns, rows): the list of log2 rows, the column
    labels and the row labels. An empty file yields three empty lists.
    '''
    import math
    graphing_list, columns, rows=[], [], []
    # the with block closes the file even if a count fails to parse
    with open(barcode_file) as handle:
        for line_number, line in enumerate(handle):
            fields=line.strip().split(',')
            if line_number==0:
                columns=fields[1:]
            elif line_number>2:
                rows.append(fields[0])
                graphing_list.append([math.log(int(float(value))+1, 2) for value in fields[1:]])
    return graphing_list, columns, rows
+barcode_coverage_threshold=snakemake.params.barcode_coverage_threshold +barcode_count_threshold=snakemake.params.barcode_count_threshold +assessment_key=snakemake.params.assessment_key +good_coverage_quantile=snakemake.params.good_coverage_quantile +repool_csv=snakemake.params.repool_csv + +sample_summary = pd.read_csv(wdir+'sample_summary.csv') +meta = pd.read_csv(wdir+'run_meta.csv') +data_summary = pd.merge(sample_summary, meta) + +mip.repool(wdir, data_summary, high_barcode_threshold, +target_coverage_count=target_coverage_count, +target_coverage_fraction=target_coverage_fraction, +target_coverage_key=target_coverage_key, +barcode_coverage_threshold=barcode_coverage_threshold, +barcode_count_threshold=barcode_count_threshold, +low_coverage_action=low_coverage_action, +assesment_key=assessment_key, +good_coverage_quantile=good_coverage_quantile, +output_file=repool_csv) diff --git a/snakemake/scripts/map_haplotypes.py b/snakemake/scripts/map_haplotypes.py new file mode 100644 index 0000000..427f738 --- /dev/null +++ b/snakemake/scripts/map_haplotypes.py @@ -0,0 +1,10 @@ +import sys +sys.path.append("/opt/src") +import mip_functions as mip + +wdir=snakemake.params['wdir'] +settings_file=snakemake.params['settings_file'] +settings = mip.get_analysis_settings(wdir+'/'+settings_file) + +mip.map_haplotypes(settings) +mip.get_haplotype_counts(settings) diff --git a/snakemake/scripts/mip_barcode_correction.py b/snakemake/scripts/mip_barcode_correction.py new file mode 100644 index 0000000..e0ceadc --- /dev/null +++ b/snakemake/scripts/mip_barcode_correction.py @@ -0,0 +1,19 @@ +import tqdm_thing + +sample_fastq=snakemake.input.sample_fastq +sample_output=snakemake.output.sample_output +sample=snakemake.wildcards.sample + + +#see /nfs/jbailey5/baileyweb/asimkin/miptools/miptools_by_sample_prototyping/output/analysis/analysis/D10-JJJ-28/D10-JJJ-28_mipExtraction/k13_S0_Sub0_mip3_ref/k13_S0_Sub0_mip3_ref.fastq.gz for example UMIs 
'''
Runs MIPWrangler mipClustering for a single sample and then touches a marker
file so snakemake can tell that clustering finished for that sample.

The previous version contained the shell command as bare text (not valid
Python), never imported subprocess and passed the literal word "sample" to
--sample instead of the wildcard value.
NOTE(review): assumes the calling rule defines params.output_dir and threads,
as the original command line implied - confirm against the rule.
'''
import subprocess
from pathlib import Path

corrected_barcode_marker=snakemake.input.corrected_barcode_marker
mip_cluster_finished=snakemake.output.mip_cluster_finished
sample=snakemake.wildcards.sample

# argument list (no shell) so sample names and paths need no quoting;
# check=True stops before the marker is written if MIPWrangler fails
subprocess.run(
    ['MIPWrangler', 'mipClustering',
     '--masterDir', str(snakemake.params.output_dir),
     '--numThreads', str(snakemake.threads),
     '--overWriteDirs',
     '--par', '/opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt',
     '--countEndGaps',
     '--sample', sample],
    check=True)
Path(mip_cluster_finished).touch()
+''' +import sys +sys.path.append("/opt/src") +import subprocess +import mip_functions as mip + +temp_settings_file = snakemake.params['template_settings'] +processor_number=snakemake.params['processor_number'] +bwa_extra=snakemake.params['bwa_extra'] +species=snakemake.params['species'] +probe_sets_used=snakemake.params['probe_sets_used'] +wdir=snakemake.params['wdir'] +min_haplotype_barcodes=snakemake.params['min_haplotype_barcodes'] +min_haplotype_samples=snakemake.params['min_haplotype_samples'] +min_haplotype_sample_fraction=snakemake.params['min_haplotype_sample_fraction'] +freebayes_threads=snakemake.params['freebayes_threads'] + +# extract the settings template +settings = mip.get_analysis_settings(temp_settings_file) +settings["bwaOptions"]=[settings['bwaOptions']]+bwa_extra +settings['species']=species +settings['freebayes_threads']=freebayes_threads +settings['processorNumber']=processor_number +settings['mipSetKey'] = probe_sets_used + [''] #this feels weird, why are we adding an extra element? 
'''
Merges the per-target gzipped tables (prefix + target + suffix) into a single
gzipped, tab separated table. The header is taken from the first table that
exists, data rows from all tables are sorted together, and targets without a
table are skipped.
'''
import os
import gzip
#from natsort import natsorted
all_targets=snakemake.params.all_targets
prefix=snakemake.params.prefix
suffix=snakemake.params.suffix

full_list=[]
header=''
for target in all_targets:
    file_path=prefix+target+suffix
    if os.path.exists(file_path):
        with gzip.open(file_path, mode='rt') as target_table:
            for line_number, line in enumerate(target_table):
                if line_number>0:
                    line=line.strip().split('\t')
#                    line[18]='not_shown'
                    full_list.append(line)
                elif len(header)==0:
                    # keep the raw header line (newline included)
                    header=line
#sorted_list=natsorted(full_list)
full_list.sort()

# The output is opened only after all inputs were read and is closed by the
# with block, so the gzip stream is always finalized.
with gzip.open(snakemake.output.final_table, mode='wt') as final_table:
    final_table.write(header)
    for line in full_list:
        final_table.write('\t'.join(line)+'\n')
'''
Runs the freebayes command stored for one contig, then compresses and indexes
the resulting /opt/analysis/contig_vcfs/<contig>.vcf file.
'''
import subprocess
import sys  # was missing: sys.path.append below raised NameError
sys.path.append("/opt/src")
#import mip_functions_freebayes_call_edit as mip

freebayes_command_dict = snakemake.params.freebayes_command_dict

vcf_file = snakemake.wildcards.contig
#vcf_file = snakemake.params.contig_name

command = freebayes_command_dict[vcf_file]
contig_vcf = f"/opt/analysis/contig_vcfs/{vcf_file}.vcf"

# check=True makes a failing step stop the job right away with its own exit
# status, instead of finishing "successfully" without producing the output.
# The freebayes command is a ready-made shell string, hence shell=True.
subprocess.run(command, shell=True, check=True)
subprocess.run(["bgzip", "-f", contig_vcf], check=True)
subprocess.run(["bcftools", "index", "-f", contig_vcf + ".gz"], check=True)
# Preamble of wrangler_by_sample_finish.smk: reads the sample and mip names
# produced by the setup snakefile and picks the target file(s) for `rule all`.
configfile: 'wrangler_by_sample.yaml'
output='/opt/analysis'

all_samples, all_targets=[],[]

# allMipsSamplesNames.tab.txt has a header row followed by two tab separated
# columns, mip names (column 0) and sample names (column 1); either cell may be
# empty because the shorter column is padded.
with open(output+'/mip_ids/allMipsSamplesNames.tab.txt') as names_file:
    for line_number, line in enumerate(names_file):
        if line_number>0:
            line=line.rstrip().split('\t')
            if len(line)>1 and len(line[1])>0:
                all_samples.append(line[1])
            if len(line[0])>0:
                all_targets.append(line[0])

# output_choice selects how far the pipeline runs: each key maps to the
# file(s) that mark the end of one stage, 6 being the final table.
final_dict={1: expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples),
            2: expand(output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt', sample=all_samples),
            3: output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json',
            4: expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples),
            5: expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets),
            6: output+'/allInfo.tsv.gz'}
output_choice=config['output_choice']
if output_choice not in final_dict:
    # fail with a readable message instead of a bare KeyError
    raise ValueError(f'output_choice must be one of {sorted(final_dict)}, got {output_choice!r}')
final_out=final_dict[output_choice]
# The rule used to be defined twice, in the two branches of an if/else that
# differed only in one command line flag. The flag is now chosen once here and
# the rule is defined a single time.
if config['downsample_umi_count']<2**32:
    downsample_option=f"--downSampleAmount {config['downsample_umi_count']}"
else:
    # counts of 2**32 or more switch downsampling off
    downsample_option='--doNotDownSample'

rule mip_barcode_correction:
    '''
    runs MIPWrangler mipBarcodeCorrection for one sample, once the extraction
    logs of all samples exist
    '''
    input:
        good_samples=expand(output+'/analysis/{sample}/{sample}_mipExtraction/log.txt', sample=all_samples)
    params:
        output_dir='/opt/analysis/analysis',
#        wrangler_dir=output,
#        sif_file=config['miptools_sif'],
        downsample_seed=config['downsample_seed'],
        downsample_option=downsample_option
    resources:
        mem_mb=config['memory_mb_per_step'],
        time_min=20
    output:
        barcode_corrections_finished=output+'/analysis/{sample}/{sample}_mipBarcodeCorrection/barcodeFilterStats.tab.txt'
    shell:
        '''
        MIPWrangler mipBarcodeCorrection --masterDir {params.output_dir} \
        {params.downsample_option} --downSampleSeed \
        {params.downsample_seed} --overWriteDirs --sample {wildcards.sample}
        '''
sif_file=config['miptools_sif'], + resources: + mem_mb=40000, + time_min=1440, + nodes=20 + threads: 20 + output: + #name is controlled by --logFile + corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json' + shell: + ''' + MIPWrangler mipCorrectForContamWithSameBarcodesMultiple --masterDir {params.output_dir} --numThreads {threads} --overWriteDirs --overWriteLog --logFile mipCorrectForContamWithSameBarcodes_run1 + ''' + +rule mip_clustering: + input: + corrected_barcode_marker=output+'/analysis/logs/mipCorrectForContamWithSameBarcodes_run1.json', + #sample_dir=output+'/analysis/{sample}' + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'] + resources: + mem_mb=config['memory_mb_per_step'], + time_min=60, + output: + mip_clustering=output+'/clustering_status/{sample}_mip_clustering_finished.txt' + shell: + ''' + MIPWrangler mipClustering --masterDir {params.output_dir} --overWriteDirs --par /opt/resources/clustering_pars/illumina_collapseHomoploymers.pars.txt --countEndGaps --sample {wildcards.sample} + touch {output.mip_clustering} + ''' + +rule pop_cluster_target: + input: + mip_cluster_files=expand(output+'/clustering_status/{sample}_mip_clustering_finished.txt', sample=all_samples) + params: + output_dir='/opt/analysis/analysis', +# wrangler_dir=output, +# sif_file=config['miptools_sif'] + resources: + mem_mb=config['memory_mb_per_step'], + time_min=60, + output: + pop_clustering=output+'/analysis/populationClustering/{target}/analysis/log.txt' + shell: + ''' + MIPWrangler mipPopulationClustering --keepIntermediateFiles --masterDir {params.output_dir} --overWriteDirs --cutoff 0 --countEndGaps --fraccutoff 0.005 --mipName {wildcards.target} + touch {output.pop_clustering} + ''' + +rule output_final_table: + ''' + cat together output files of previous step into a final file, do a "natural + sort" to sort things similar to how Nick's are output. 
gzip it + ''' + input: + pop_clustering=expand(output+'/analysis/populationClustering/{target}/analysis/log.txt', target=all_targets) +# final_sample_outputs=expand('/path/to/sample/outputs/{sample}.something', sample=sample_list) + params: + all_targets=all_targets, + prefix=output+'/analysis/populationClustering/', + suffix='/analysis/selectedClustersInfo.tab.txt.gz' + resources: + mem_mb=20000, + time_min=480 + output: + final_table=output+'/allInfo.tsv.gz' + script: + 'scripts/output_final_table.py' diff --git a/snakemake/wrangler_by_sample_setup.smk b/snakemake/wrangler_by_sample_setup.smk new file mode 100644 index 0000000..3e1c729 --- /dev/null +++ b/snakemake/wrangler_by_sample_setup.smk @@ -0,0 +1,76 @@ +''' +creates a mip_ids folder and an allMipsSamplesNames.tab.txt file. extracts mips, +corrects mips, and generates files that can be used to determine sample names as +well as sample names that had extractable data. +''' + +configfile: 'wrangler_by_sample.yaml' +output='/opt/analysis' + +rule all: + input: + setup_finished=output+'/setup_finished.txt', +# good_samples=output+'/successfully_extracted_samples.txt', + output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml' + +rule copy_files: + input: + setup_snakefile='/opt/snakemake/wrangler_by_sample_setup.smk', + finish_snakefile='/opt/snakemake/wrangler_by_sample_finish.smk', + input_configfile='wrangler_by_sample.yaml', + in_scripts='/opt/snakemake/scripts' + output: + setup_snakefile=output+'/snakemake_params/setup_run.smk', + finish_snakefile=output+'/snakemake_params/finish_run.smk', + output_configfile=output+'/snakemake_params/wrangler_by_sample.yaml', + out_scripts=directory(output+'/snakemake_params/scripts') + shell: + ''' + cp {input.setup_snakefile} {output.setup_snakefile} + cp {input.finish_snakefile} {output.finish_snakefile} + cp {input.input_configfile} {output.output_configfile} + cp -r {input.in_scripts} {output.out_scripts} + ''' + +rule generate_mip_files: + ''' + given 
that I'm repackaging miptools wrangler (so wrangler.sh is not needed) + and that the existing generate_wrangler_scripts.py seems unnecessarily + convoluted and that only two files are needed by subsequent steps + (mipArms.txt and allMipsSamplesNames.tab.txt) I wrote my own + script for this. Input is an arms file and a sample sheet. Output is an arms + file with rearranged columns and a two column file with names of all mips + and names of all samples (with no pairing between columns of any given row). + ''' + input: + arms_file='/opt/project_resources/mip_ids/mip_arms.txt', + sample_sheet='/opt/input_sample_sheet_directory/'+config['input_sample_sheet'].split('/')[-1], + fastq_folder='/opt/data' + params: + sample_set=config['sample_set_used'], + probe_sets=config['probe_sets_used'] + output: + mip_arms=output+'/mip_ids/mipArms.txt', + sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt', + sample_sheet=output+'/sample_sheet.tsv' + script: + 'scripts/generate_mip_files.py' + +rule setup: + input: + mip_arms=output+'/mip_ids/mipArms.txt', + sample_file=output+'/mip_ids/allMipsSamplesNames.tab.txt' + params: + output_dir='/opt/analysis/analysis', + project_resources='/opt/project_resources', +# wrangler_dir=output, +# sif_file=config['miptools_sif'], + fastq_dir='/opt/data' + output: + setup_finished=output+'/setup_finished.txt' + threads: config['cpu_count'] + shell: + ''' + MIPWrangler mipSetup --mipArmsFilename /opt/analysis/mip_ids/mipArms.txt --mipSampleFile /opt/analysis/mip_ids/allMipsSamplesNames.tab.txt --numThreads {threads} --masterDir {params.output_dir} --dir /opt/data --mipServerNumber 1 + touch {output.setup_finished} + ''' diff --git a/src/align.R b/src/align.R deleted file mode 100644 index f5193bd..0000000 --- a/src/align.R +++ /dev/null @@ -1,17 +0,0 @@ -# read arguments from command line -args <- commandArgs(TRUE) -fas = args[1] -output_file = args[2] - -# load the DECIPHER library in R -library(DECIPHER) - -# load the sequences from the 
file -# change "DNA" to "RNA" or "AA" if necessary -seqs <- readDNAStringSet(fas) - -# perform the alignment -aligned <- AlignSeqs(seqs) - -# write the alignment to a new FASTA file -writeXStringSet(aligned, file=output_file) diff --git a/src/demux_qc.py b/src/demux_qc.py index c988ab1..bd6f35f 100644 --- a/src/demux_qc.py +++ b/src/demux_qc.py @@ -1,4 +1,3 @@ -"""Generate demultiplexing statistics after a sequencine run.""" import mip_functions as mip import pickle import os @@ -7,17 +6,21 @@ def main(platform, stats_dir): - """Generate demultiplexing statistics after a sequencine run.""" - bc_dict = "/opt/resources/barcode_dict.json" + """Generate demultiplexing statistics after a sequencing run.""" + bc_dict = "/opt/resources/sample_prep/barcode_dict.pickle" + # load barcode dict to be passed to the header-primer conversion function with open(bc_dict, "rb") as infile: bc_dict = pickle.load(infile) - # fastq summary fies created for each lane contains raw read numbers - # and reads passing filter for tile and sample + + # fastq summary files created for each lane contains raw read numbers and + # reads passing filter for tile and sample fsums = [] + # demultiplexing summary files have the information with the most popular # unindexed sample barcodes. 
dfiles = [] + # scan the stats dir and extract information from the relevant files for f in os.scandir(stats_dir): if f.name.startswith("FastqSummary"): @@ -28,6 +31,7 @@ def main(platform, stats_dir): elif f.name.startswith("DemuxSummary"): lane = "Lane" + f.name.split(".")[0][-1] dfiles.append([f.path, lane]) + dsums = [] for d, l in dfiles: with open(d) as infile: @@ -39,7 +43,8 @@ def main(platform, stats_dir): dsums.append(counts) elif line.startswith("### Columns: Index_Sequence Hit_Count"): start = True - # create summary dataframe for read counts + + # create summary data frame for read counts fsums = pd.concat(fsums) # create per sample read count summary if sample sheet is provided @@ -53,31 +58,41 @@ def main(platform, stats_dir): parse_line = True if parse_line: sample_sheet_list.append(line.split(",")) - sample_sheet = pd.DataFrame(sample_sheet_list[1:], - columns=sample_sheet_list[0]) + sample_sheet = pd.DataFrame(sample_sheet_list[1:], columns=sample_sheet_list[0]) sample_sheet["Sample_ID"] = sample_sheet["Sample_ID"].astype(int) sample_sheet = sample_sheet.rename( - columns = {"Sample_ID": "SampleNumber", "Sample_Name": "Sample ID"}) + columns={"Sample_ID": "SampleNumber", "Sample_Name": "Sample ID"} + ) sample_sums = fsums.groupby(["SampleNumber", "Lane"], as_index=False)[ - ["NumberOfReadsRaw", "NumberOfReadsPF"]].sum() + ["NumberOfReadsRaw", "NumberOfReadsPF"] + ].sum() sample_sums = sample_sums.merge(sample_sheet) sample_sums.to_csv( - os.path.join(stats_dir, "PerSampleReadCounts.csv"), - index = False) + os.path.join(stats_dir, "PerSampleReadCounts.csv"), index=False + ) except IOError: pass + # Print out read number summary. 
- fsums = fsums[["NumberOfReadsRaw", "NumberOfReadsPF", "Lane"]].groupby( - "Lane").sum().reset_index() - print(("Total number of raw reads and reads passing filter were " - "{0[NumberOfReadsRaw]:,} and {0[NumberOfReadsPF]:,}, " - "respectively.").format( - fsums.sum() - )) + fsums = ( + fsums[["NumberOfReadsRaw", "NumberOfReadsPF", "Lane"]] + .groupby("Lane") + .sum() + .reset_index() + ) + print( + ( + "Total number of raw reads and reads passing filter were " + "{0[NumberOfReadsRaw]:,} and {0[NumberOfReadsPF]:,}, " + "respectively." + ).format(fsums.sum()) + ) fsums.to_csv(os.path.join(stats_dir, "ReadSummary.csv")) + # create a dataframe with unindexed read information dsums = pd.DataFrame(dsums, columns=["Header", "Read Count", "Lane"]) dsums["Read Count"] = dsums["Read Count"].astype(int) + # get primer indexes for corresponding headers dsums["Fw,Rev"] = dsums["Header"].apply( lambda a: mip.header_to_primer(bc_dict, a, platform) @@ -85,12 +100,15 @@ def main(platform, stats_dir): dsums["Fw"] = dsums["Fw,Rev"].apply(lambda a: a[0]) dsums["Rev"] = dsums["Fw,Rev"].apply(lambda a: a[1]) dsums["Index Difference"] = dsums["Rev"] - dsums["Fw"] + # separate 999 values which do not correspond to our indexes - caught = dsums.loc[(dsums["Fw"] != 999) - & (dsums["Rev"] != 999)] - print(("There were {:,} undetermined reads. {:,} of these belong to " - "possible primer pairs.").format(dsums["Read Count"].sum(), - caught["Read Count"].sum())) + caught = dsums.loc[(dsums["Fw"] != 999) & (dsums["Rev"] != 999)] + print( + ( + "There were {:,} undetermined reads. {:,} of these belong to " + "possible primer pairs." 
+ ).format(dsums["Read Count"].sum(), caught["Read Count"].sum()) + ) dsums.to_csv(os.path.join(stats_dir, "UndeterminedIndexSummary.csv")) caught.to_csv(os.path.join(stats_dir, "UndeterminedPrimerSummary.csv")) @@ -98,18 +116,20 @@ def main(platform, stats_dir): if __name__ == "__main__": # Read input arguments parser = argparse.ArgumentParser( - description=""" Create a QC report following a bcl demultiplex operateon. - """) + description="Create a QC report following demultiplexing of data." + ) parser.add_argument( - "-p", "--platform", + "-p", + "--platform", help=("Sequencing platform."), required=True, - choices=["nextseq", "miseq"] + choices=["nextseq", "miseq"], ) parser.add_argument( - "-d", "--stats-dir", + "-d", + "--stats-dir", help=("Path to directory where demultiplexing stats are saved."), - default="/opt/analysis/Stats" + default="/opt/analysis/Stats", ) # parse arguments from command line diff --git a/src/freebayes_caller.py b/src/freebayes_caller.py index 45db950..9654000 100644 --- a/src/freebayes_caller.py +++ b/src/freebayes_caller.py @@ -1,89 +1,145 @@ import mip_functions as mip import argparse -# Read input arguments +# Read command line arguments parser = argparse.ArgumentParser( - description="""Call variants using freebayes.""") -parser.add_argument("-k", "--skip-fastq", - help=("Set this flag to skip creating fastq files " - "from MIP data."), - action="store_false") -parser.add_argument("-d", "--fastq-dir", - help="Fastq directory to create or use fastqs in.", - default="/opt/analysis/padded_fastqs") -parser.add_argument("-p", "--skip-align", - help="Set this flag to skip bwa alignment to genome.", - action="store_false") -parser.add_argument("-b", "--bam-dir", - help="bam directory to create or use bam files in.", - default="/opt/analysis/padded_bams") -parser.add_argument("-o", "--output-vcf", - help="vcf.gz file name to create.", - default="/opt/analysis/variants.vcf.gz") -parser.add_argument("-s", "--settings-file", - 
help="MIPTools analysis settings file to use.", - default="/opt/analysis/settings.txt") -parser.add_argument("-g", "--targets-file", - help="Tab separated targets file for variant calling.") -parser.add_argument("-z", "--bam-files", - nargs="*", - help=("Bam file to use for variant calling." - "Use multiple times for more than one file.")) -parser.add_argument("-l", "--bam-list", - help="File containing absolute paths to bam files to use.") -parser.add_argument("-e", "--extra-freebayes-options", - help=("Additional freebayes options to pass directly " - "to freebayes. Options must have + in place of -. " - "For example, ++pooled+continuous if you want to " - "pass --pooled-continuous"), - nargs="*") -parser.add_argument("-w", "--extra-bwa-options", - help=("Additional bwa options to pass directly " - "to bwa during alignments. Options must have " - "'+'' in place of '-'."), - nargs="*") -parser.add_argument("-t", "--threads", - help="Number of CPU threads to use.", - type=int, - default=1) -parser.add_argument("-f", "--fastq-padding", - help=("Number of reference genome bases to flank " - "haplotypes."), - type=int, - default=20) -parser.add_argument("-q", "--min-base-quality", - help=("Minimum base qual to consider an allele"), - type=int, - default=1) -# parse arguments from command line + description="""Call variants using Freebayes.""" +) +parser.add_argument( + "-b", + "--bam-dir", + help="bam directory to create or use bam files in.", + default="/opt/analysis/padded_bams", +) +parser.add_argument( + "-d", + "--fastq-dir", + help="fastq directory to create or use fastqs in.", + default="/opt/analysis/padded_fastqs", +) +parser.add_argument( + "-o", + "--output-vcf", + help="vcf.gz file name to create.", + default="/opt/analysis/variants.vcf.gz", +) +parser.add_argument( + "-g", + "--targets-file", + help="tab separated targets file for variant calling.", +) +parser.add_argument( + "-k", + "--skip-fastq", + help=("set this flag to skip creating fastq files from 
MIP data."), + action="store_false", +) +parser.add_argument( + "-p", + "--skip-align", + help="set this flag to skip bwa alignment to genome.", + action="store_false", +) +parser.add_argument( + "-z", + "--bam-files", + nargs="*", + help=( + "bam file to use for variant calling." + "Use multiple times for more than one file." + ), +) +parser.add_argument( + "-l", + "--bam-list", + help="file containing absolute paths to bam files to use.", +) + + +parser.add_argument( + "-s", + "--settings-file", + help="MIPTools analysis settings file to use.", + default="/opt/analysis/settings.txt", +) +parser.add_argument( + "-f", + "--fastq-padding", + help="number of reference genome bases to flank " "haplotypes.", + type=int, + default=20, +) +parser.add_argument( + "-q", + "--min-base-quality", + help="minimum base qual to consider an allele", + type=int, + default=1, +) +parser.add_argument( + "-t", + "--threads", + help="number of CPU threads to use.", + type=int, + default=20, +) +parser.add_argument( + "-e", + "--extra-freebayes-options", + nargs="*", + help=( + "additional Freebayes options to pass directly to Freebayes. Options " + "must have + in place of -. For example, ++pooled+continuous if you " + "want to pass --pooled-continuous." + ), +) +parser.add_argument( + "-w", + "--extra-bwa-options", + nargs="*", + help=( + "additional bwa options to pass directly to bwa during alignments. " + "Options must have '+'' in place of '-'." 
+ ), +) + +# Parse command line arguments args = vars(parser.parse_args()) +# Process extra Freebayes options extra_freebayes_options = args["extra_freebayes_options"] if extra_freebayes_options is not None: - extra_freebayes_options = [e.replace("+", "-") - for e in extra_freebayes_options] + extra_freebayes_options = [ + e.replace("+", "-") for e in extra_freebayes_options + ] else: extra_freebayes_options = [] +# Read settings from settings file settings_file = args["settings_file"] settings = mip.get_analysis_settings(settings_file) +# Parse and set extra bwa options ebo = args["extra_bwa_options"] if ebo is not None: ebo = [[e.replace("+", "-") for e in ebo]] settings["bwaOptions"].extend(ebo) +# Set processor number setting settings["processorNumber"] = args["threads"] -mip.freebayes_call(bam_dir=args["bam_dir"], - fastq_dir=args["fastq_dir"], - options=extra_freebayes_options, - vcf_file=args["output_vcf"], - targets_file=args["targets_file"], - make_fastq=args["skip_fastq"], - align=args["skip_align"], - settings=settings, - bam_files=args["bam_files"], - bam_list=args["bam_list"], - fastq_padding=args["fastq_padding"], - min_base_quality=args["min_base_quality"]) +# Call Freebayes +mip.freebayes_call( + bam_dir=args["bam_dir"], + fastq_dir=args["fastq_dir"], + options=extra_freebayes_options, + vcf_file=args["output_vcf"], + targets_file=args["targets_file"], + make_fastq=args["skip_fastq"], + align=args["skip_align"], + settings=settings, + bam_files=args["bam_files"], + bam_list=args["bam_list"], + fastq_padding=args["fastq_padding"], + min_base_quality=args["min_base_quality"], +) diff --git a/src/generate_wrangler_scripts.py b/src/generate_wrangler_scripts.py index c1ed574..a9dfb84 100644 --- a/src/generate_wrangler_scripts.py +++ b/src/generate_wrangler_scripts.py @@ -11,60 +11,123 @@ parser = argparse.ArgumentParser( description=""" Generate bash scripts to be used for processing after a MIP sequencing run. 
- """) -parser.add_argument("-e", "--experiment-id", - help=("A Unique id given to each sequencing " - "run by the user."), - required=True) -parser.add_argument("-c", "--cpu-count", - type=int, - help="Number of available processors to use.", - default=1) -parser.add_argument("-n", "--server-num", - type=int, - help="Starting number for MIP server.", - default=1) -parser.add_argument("-d", "--data-dir", - help=("Absolute path to the directory where sequencing " - " (.fastq/.fastq.gz) files are located."), - default="/opt/data") -parser.add_argument("-a", "--analysis-dir", - help=("Absolute path to base directory for " - "MIPWrangler working directory."), - default="/opt/analysis") -parser.add_argument("-w", "--cluster-script", - help="Absolute path to MIPWrangler run script.", - default="/opt/bin/runMIPWranglerCurrent.sh") -parser.add_argument("-r", "--project-resource-dir", - help=("Path to directory where project specific resources " - "such as probe sets used, mip arm info etc. are"), - default="/opt/project_resources") -parser.add_argument("-b", "--base-resource-dir", - help=("Path to directory where general resources such as " - "barcode dictionary, sample sheet " - "templates etc. 
are."), - default="/opt/resources") -parser.add_argument("-l", "--sample-list", - help=("File providing a list of samples with associated " - "information."), - required=True) -parser.add_argument("-s", "--sample-sets", - help=("Sample sets to be processed."), - required=True) -parser.add_argument("-p", "--probe-sets", - help=("Probe sets to be processed."), - required=True) -parser.add_argument("-k", "--keep-files", - help=("Keep intermediate files generated by MIPWrangler."), - action="store_true") -parser.add_argument("-x", "--stitch-options", - help=("Probe set to be processed."), - required=True) -parser.add_argument("-m", "--min-capture-length", - help=("Minimum capture length for stitching, " - "excluding probe arms."), - type=int) -# parse arguments from command line + """ +) +parser.add_argument( + "-e", + "--experiment-id", + help="A Unique id given to each sequencing run by the user.", + required=True, +) +parser.add_argument( + "-c", + "--cpu-count", + type=int, + help="Number of available processors to use.", + default=1, +) +parser.add_argument( + "-n", + "--server-num", + type=int, + help="Starting number for MIP server.", + default=1, +) +parser.add_argument( + "-d", + "--data-dir", + help=( + "Absolute path to the directory where sequencing (.fastq/.fastq.gz)" + "files are located." + ), + default="/opt/data", +) +parser.add_argument( + "-a", + "--analysis-dir", + help="Absolute path to base directory for MIPWrangler working directory.", + default="/opt/analysis", +) +parser.add_argument( + "-o", + "--cluster-script", + help="Absolute path to MIPWrangler run script.", + default="/opt/bin/runMIPWranglerCurrent.sh", +) +parser.add_argument( + "-r", + "--project-resource-dir", + help=( + "Path to directory where project specific resources such as probe sets" + "used, MIP arm info etc. are." 
+ ), + default="/opt/project_resources", +) +parser.add_argument( + "-b", + "--base-resource-dir", + help=( + "Path to directory where general resources such as barcode dictionary," + "sample sheet templates etc. are." + ), + default="/opt/resources", +) +parser.add_argument( + "-l", + "--sample-list", + help="File providing a list of samples with associated information.", + required=True, +) +parser.add_argument( + "-s", "--sample-sets", help=("Sample sets to be processed."), required=True +) +parser.add_argument( + "-p", "--probe-sets", help=("Probe sets to be processed."), required=True +) +parser.add_argument( + "-k", + "--keep-files", + help="Keep intermediate files generated by MIPWrangler.", + action="store_true", +) +parser.add_argument( + "-x", + "--stitch-options", + help=( + "Additional arguments to pass to MIPWrangler mipSetupAndExtractByArm." + "This command extracts sequences and stitches paired end reads to" + "single sequences." + ), + required=True, +) +parser.add_argument( + "-m", + "--min-capture-length", + help="Minimum capture length for stitching, excluding probe arms.", + type=int, +) +parser.add_argument( + "-f", + "--population-fraction-cutoff", + help="Population clustering fraction cutoff.", + default=0.005, + type=float, +) +parser.add_argument( + "-t", + "--downsample-threshold", + help="The threshold at which UMIs will be downsampled.", + default=2000, + type=int, +) +parser.add_argument( + "-w", + "--weighted", + action="store_true", + help="Whether to apply a weight when randomly sampling UMIs.", +) + +# Parse arguments from command line args = vars(parser.parse_args()) experiment_id = args["experiment_id"] cluster_script = args["cluster_script"] @@ -76,25 +139,25 @@ base_resource_dir = os.path.abspath(args["base_resource_dir"]) sample_list_file = os.path.join(analysis_dir, args["sample_list"]) raw_mip_ids_dir = os.path.join(analysis_dir, "mip_ids") -# get sample sets and probe sets to be processed. 
-# sample sets should be provided as a comma separated text -# sometimes semicolon is used and spaces may be included by mistake -# we'll check for these potential mistakes and create a list of sample sets -# to process. + +# Get sample sets and probe sets to be processed. +# These sets should be provided as comma separated text, however, sometimes a +# semicolon is used by mistake. We split on commas and semicolons. sam_set = args["sample_sets"] sam_set = sam_set.split(",") sample_sets = [] for s in sam_set: - sample_sets.extend(s.strip().split(";")) -sample_sets = set([s.strip() for s in sample_sets]) -# probe sets are provided and will be processed similarly to sample sets + sample_sets.extend(s.split(";")) +sample_sets = set(sample_sets) + pr_set = args["probe_sets"] pr_set = pr_set.split(",") probe_sets = [] for p in pr_set: - probe_sets.extend(p.strip().split(";")) -probe_sets = set([p.strip() for p in probe_sets]) -# get mipwrangler options + probe_sets.extend(p.split(";")) +probe_sets = set(probe_sets) + +# Get MIPWrangler mipSetupAndExtractByArm options keep_files = args["keep_files"] stitch_options = args["stitch_options"] min_capture_length = args["min_capture_length"] @@ -108,18 +171,24 @@ break else: stitch_options.append("--minCaptureLength=" + str(min_capture_length)) -# create dirs if they do not exist + +# Create dirs if they do not exist if not os.path.exists(raw_mip_ids_dir): os.makedirs(raw_mip_ids_dir) + # First part of the MIPWrangler process is to extract the sequences and -# stitch forward and reverse reads. This is done with mipSetupAndExtractByArm -# read in sample information +# stitch forward and reverse reads. This is done with mipSetupAndExtractByArm. 
+# We first read in sample information sample_info = pd.read_table(sample_list_file) sample_info["Sample ID"] = sample_info.apply( - lambda a: "-".join(a[["sample_name", "sample_set", - "replicate"]].astype(str)), axis=1) + lambda a: "-".join( + a[["sample_name", "sample_set", "replicate"]].astype(str) + ), + axis=1, +) sample_info_dict = (sample_info.set_index("Sample ID")).to_dict(orient="index") -# select the libraries belonging to sample and probe sets specified + +# Next, we select the libraries belonging to sample and probe sets specified selected_samples = set() for sid in sample_info_dict: if sample_info_dict[sid]["sample_set"] in sample_sets: @@ -136,143 +205,194 @@ mipset_table = os.path.join(project_resource_dir, "mip_ids", "mipsets.csv") mipsets = pd.read_csv(mipset_table) mipset_list = mipsets.to_dict(orient="list") -# convert the mip sets dataframe to dict for easy access + +# Convert the mip sets dataframe to dict for easy access and keep mip arm files +# for each mip set in a dictionary all_probes = {} -# keep mip arm files for each mip set in a dictionary mip_arms_dict = {} for mipset in mipset_list: list_m = mipset_list[mipset] - # the file name should be the second line in the mipsets.csv + + # The file name should be the second line in the mipsets.csv mip_arms_dict[mipset] = list_m[0] - # rest of the lines have probe names in the set + + # The rest of the lines have probe names in the set set_m = set(list_m[1:]) set_m.discard(np.nan) all_probes[mipset] = set_m -# For the sample and probe set create +# For the sample and probe set create: # 1) MIPWrangler input files (samples etc.) # 2) Scripts for MIPWrangler Part I (extract + stitch) -# 3) Scripts for MIPWrangler Part II (clustering) +# 3) Scripts for MIPWrangler Part II (barcode correction + clustering) probes = set() mip_arms_list = [] for p_name in probe_sets: try: temp_probes = all_probes[p_name] except KeyError: - print(("Probe set name {} is not present in the mipsets " - "file. 
This probe set will be ignored.").format(p_name)) + print( + ( + "Probe set name {} is not present in the mipsets " + "file. This probe set will be ignored." + ).format(p_name) + ) continue - arm_file = os.path.join(project_resource_dir, - "mip_ids", - mip_arms_dict[p_name]) + arm_file = os.path.join( + project_resource_dir, "mip_ids", mip_arms_dict[p_name] + ) try: with open(arm_file) as infile: mip_arms_list.append(pd.read_table(infile)) probes.update(temp_probes) except IOError: - print(("MIP arm file {} is required but missing for " - "the probe set {}. It will be generated if the " - "mip_info.json resource file is available at " - "/opt/project_resources/mip_ids/mip_info.json").format( - arm_file, p_name)) + print( + ( + "MIP arm file {} is required but missing for " + "the probe set {}. It will be generated if the " + "mip_info.json resource file is available at " + "/opt/project_resources/mip_ids/mip_info.json" + ).format(arm_file, p_name) + ) try: + # Generate the probe arm file from the mip info file probe_summary_generator.generate_mip_arms_file( - p_name, probe_sets_file=mipset_table) + p_name, probe_sets_file=mipset_table + ) + # Load the arm file generated + with open(arm_file) as infile: + mip_arms_list.append(pd.read_table(infile)) + probes.update(temp_probes) except Exception as e: - print(("MIP arm file generation for probe set {} " - "failed due to {}.")).format(p_name, e) + print( + ( + "MIP arm file generation for probe set {} " + "failed due to {}." + ) + ).format(p_name, e) if len(mip_arms_list) == 0: - print(("No MIP arms file were found for the probe sets {}" - " scripts will not be generated for them. Make sure " - "relevant files are present in the {} directory").format( - probe_sets, project_resource_dir)) + print( + ( + "No MIP arms file were found for the probe sets {}" + " scripts will not be generated for them. 
Make sure " + "relevant files are present in the {} directory" + ).format(probe_sets, project_resource_dir) + ) sys.exit(1) -mip_arms_table = pd.concat(mip_arms_list, - ignore_index=True).drop_duplicates() -mip_arms_table = mip_arms_table.loc[ - mip_arms_table["mip_family"].isin(probes) -] +mip_arms_table = pd.concat(mip_arms_list, ignore_index=True).drop_duplicates() +mip_arms_table = mip_arms_table.loc[mip_arms_table["mip_family"].isin(probes)] mip_family_names = probes + # Create MIPWrangler Input files sample_subset = list(selected_samples) with open( - os.path.join( - raw_mip_ids_dir, - "allMipsSamplesNames.tab.txt" - ), "w" + os.path.join(raw_mip_ids_dir, "allMipsSamplesNames.tab.txt"), "w" ) as outfile: outfile_list = ["\t".join(["mips", "samples"])] - mips_samples = zip_longest( - mip_family_names, sample_subset, fillvalue="" - ) + mips_samples = zip_longest(mip_family_names, sample_subset, fillvalue="") for ms in mips_samples: outfile_list.append("\t".join(ms)) outfile.write("\n".join(outfile_list)) - pd.DataFrame(mip_arms_table).groupby( - "mip_id").first().reset_index().dropna( - how="all", axis=1 - ).to_csv( - os.path.join( - raw_mip_ids_dir, - "mipArms.txt" - ), sep="\t", - index=False - ) -# Create MIPWrangler part I script commands + pd.DataFrame(mip_arms_table).groupby("mip_id").first().reset_index().dropna( + how="all", axis=1 + ).to_csv( + os.path.join(raw_mip_ids_dir, "mipArms.txt"), sep="\t", index=False + ) + +# Create MIPWrangler part I script commands (extract + stitch) stitch_commands = [ ["cd", analysis_dir], - ["nohup MIPWrangler mipSetupAndExtractByArm", "--mipArmsFilename", - os.path.join(raw_mip_ids_dir, "mipArms.txt"), - "--mipSampleFile", os.path.join( - raw_mip_ids_dir, - "allMipsSamplesNames.tab.txt" - ), "--numThreads", str(cpu_count), "--masterDir analysis", - "--dir", fastq_dir, "--mipServerNumber", str(server_num)] + [ + "nohup MIPWrangler mipSetupAndExtractByArm", + "--mipArmsFilename", + os.path.join(raw_mip_ids_dir, 
"mipArms.txt"), + "--mipSampleFile", + os.path.join(raw_mip_ids_dir, "allMipsSamplesNames.tab.txt"), + "--numThreads", + str(cpu_count), + "--masterDir analysis", + "--dir", + fastq_dir, + "--mipServerNumber", + str(server_num), + ], ] stitch_commands[-1].extend(stitch_options) if keep_files: stitch_commands[-1].append("--keepIntermediateFiles") -# Create MIPWrangler part II script commands + +# Create MIPWrangler part II script commands (barcode correction + clustering) now = datetime.datetime.now() run_date = now.strftime("%Y%m%d") -info_file = os.path.join(analysis_dir, - "analysis/populationClustering/allInfo.tab.txt") -renamed_info = os.path.join(analysis_dir, "run_" + experiment_id + "_wrangled_" - + run_date + ".txt") +info_file = os.path.join( + analysis_dir, "analysis/populationClustering/allInfo.tab.txt.gz" +) +renamed_info = os.path.join( + analysis_dir, "run_" + experiment_id + "_wrangled_" + run_date + ".txt.gz" +) +# Setup downsample weighing by read counts. If this is false, we need to feed in +# the empty string to the bash script as an argument. Otherwise, we feed in the +# flag as a string. 
+if args["weighted"]: + weighted = "-w" +else: + weighted = "''" wrangler_commands = [ ["cd", "analysis"], - ["nohup", "bash", cluster_script, str(server_num), str(cpu_count)], - ["mv", os.path.join(analysis_dir, "analysis/logs"), analysis_dir], - ["mv", os.path.join(analysis_dir, "analysis/scripts"), analysis_dir], - ["mv", os.path.join(analysis_dir, "analysis/resources"), analysis_dir], - ["mv", os.path.join(analysis_dir, "analysis/nohup.out"), - os.path.join(analysis_dir, "nohup2.out"), "2>/dev/null ||true"], - ["mv", info_file, renamed_info], - ["pigz", "-9", "-p", str(cpu_count), renamed_info] + [ + "nohup", + "bash", + cluster_script, + str(server_num), + str(cpu_count), + str(args["population_fraction_cutoff"]), + str(args["downsample_threshold"]), + weighted, + ">>", + os.path.join(analysis_dir, "nohup.out"), + ], + ["mv -f", os.path.join(analysis_dir, "analysis/logs"), analysis_dir], + ["mv -f", os.path.join(analysis_dir, "analysis/scripts"), analysis_dir], + ["mv -f", os.path.join(analysis_dir, "analysis/resources"), analysis_dir], + ["mv -f", info_file, renamed_info], ] extraction_summary_file = "extractInfoSummary.txt" extraction_per_target_file = "extractInfoByTarget.txt" stitching_per_target_file = "stitchInfoByTarget.txt" -for filename in [extraction_summary_file, - extraction_per_target_file, - stitching_per_target_file]: - stat_command = ["find", os.path.join(analysis_dir, "analysis"), - "-name", filename, "-exec", "cat", - "{}", "+", ">", os.path.join(analysis_dir, filename)] +for filename in [ + extraction_summary_file, + extraction_per_target_file, + stitching_per_target_file, +]: + stat_command = [ + "find", + os.path.join(analysis_dir, "analysis"), + "-name", + filename, + "-exec", + "cat", + "{}", + "+", + ">", + os.path.join(analysis_dir, filename), + ] wrangler_commands.append(stat_command) wrangler_commands.append(["cd", "/opt/analysis"]) -for filename in [extraction_summary_file, - extraction_per_target_file, - stitching_per_target_file]: 
+for filename in [ + extraction_summary_file, + extraction_per_target_file, + stitching_per_target_file, +]: zip_command = ["pigz", "-9", "-p", str(cpu_count), filename] wrangler_commands.append(zip_command) - mv_command = ["mv", filename + ".gz", experiment_id + "_" - + run_date + "_" + filename + ".gz"] + mv_command = [ + "mv", + filename + ".gz", + experiment_id + "_" + run_date + "_" + filename + ".gz", + ] server_num += 1 -# Save all scripts to files. -with open(os.path.join(analysis_dir, "wrangle.sh"), "w") as outfile: - outfile.write("\n".join( - [" ".join(c) for c in stitch_commands]) + "\n") - outfile.write("\n".join( - [" ".join(c) for c in wrangler_commands]) + "\n") +# Save the final script to a file to run +with open(os.path.join(analysis_dir, "wrangle.sh"), "w") as outfile: + outfile.write("\n".join([" ".join(c) for c in stitch_commands]) + "\n") + outfile.write("\n".join([" ".join(c) for c in wrangler_commands]) + "\n") diff --git a/src/mip_functions.py b/src/mip_functions.py index 08148b6..1f079b7 100644 --- a/src/mip_functions.py +++ b/src/mip_functions.py @@ -22,7 +22,6 @@ from primer3 import calcHeterodimerTm import primer3 import traceback -from msa_to_vcf import msa_to_vcf as msa_to_vcf import itertools import sys import allel @@ -2397,6 +2396,15 @@ def bwa(fastq_file, output_file, output_type, input_dir, def bwa_multi(fastq_files, output_type, fastq_dir, bam_dir, options, species, processor_number, parallel_processes): """Align fastq files to species genome using bwa in parallel.""" + #if a person doesn't add any extra bwa options, bwa options will be a + #string 'mem' - needs to be a list so it can be concatenated to -t options + #below + if type(options)==str: + options=[options] + #remove threads argument if present, so it doesn't get added twice below + if '-t' in options: + thread_location=options.index('-t') + options=options[:thread_location]+options[thread_location+2:] if len(fastq_files) == 0: fastq_files = [f.name for f in 
os.scandir(fastq_dir)] if output_type == "sam": @@ -2410,19 +2418,29 @@ def bwa_multi(fastq_files, output_type, fastq_dir, bam_dir, options, species, if not os.path.exists(bam_dir): os.makedirs(bam_dir) if parallel_processes == 1: + # Set number of processors + options = options + ["-t " + str(processor_number)] + + # Run bwa on each fastq file for f in fastq_files: # get base file name base_name = f.split(".")[0] bam_name = base_name + extension - options.extend("-t" + str(processor_number)) bwa(f, bam_name, output_type, fastq_dir, bam_dir, options, species, base_name) else: + # Determine number of processors per parallel process processor_per_process = processor_number // parallel_processes p = NoDaemonProcessPool(parallel_processes) + + # Set number of processors options = options + ["-t " + str(processor_per_process)] + + # Initialize lists results = [] errors = [] + + # Run bwa on each fastq file for f in fastq_files: base_name = f.split(".")[0] bam_name = base_name + extension @@ -4280,7 +4298,7 @@ def parasight(resource_dir, gs_list.append(gs_command) pdf_list.append("cp " + basename + ".pdf " + os.path.join(pdf_dir, t + ".pdf")) - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", basename + extra_extension, "-template", "/opt/resources/nolabel.pst", @@ -4334,7 +4352,7 @@ def parasight_print(resource_dir, design_dir, design_info_file, if (designed_gene_list is None) or (g in designed_gene_list): show_file = os.path.join(design_dir, g, g + ".show") extras_file = os.path.join(design_dir, g, g + extra_extension) - line = ["parasight76.pl", "-showseq", show_file, + line = ["parasight.pl", "-showseq", show_file, "-extra ", extras_file] if print_out: print(" ".join(line)) @@ -4835,6 +4853,7 @@ def get_haplotype_counts(settings): # barcode count data is only available for samples with data # so if a sample has not produced any data, it will be missing # these samples should be added with 0 values for each probe + 
print('run meta is', run_meta) all_barcode_counts = pd.merge( run_meta[["Sample ID", "replicate"]].set_index("Sample ID"), barcode_counts, left_index=True, right_index=True, how="left") @@ -5131,6 +5150,7 @@ def get_contig(g): if not os.path.exists(cvcfs_dir): os.makedirs(cvcfs_dir) # update contig_dict with contig specific options + freebayes_command_dict = {} for chrom in chrom_dict: for contig_name in chrom_dict[chrom]: contig_dict = chrom_dict[chrom][contig_name] @@ -5159,9 +5179,12 @@ def get_contig(g): # the options list in case bam files were added to the options # and they must stay at the end because they are positional args. contig_dict["options"] = contig_options + options + #contig_name = contig_dict['options'].split(' ')[3].split('/')[-1] # add the contig dict to contig dict list - contig_dict_list.append(contig_dict) - + contig_value = ('freebayes '+' '.join(contig_dict['options'])) + freebayes_command_dict[contig_name]=contig_value + return(freebayes_command_dict,contig_vcf_gz_paths) +''' # create a processor pool for parallel processing pool = Pool(int(settings["processorNumber"])) # create a results container for the return values from the worker function @@ -5235,30 +5258,30 @@ def get_contig(g): subprocess.run(["mv", temp_vcf_path, vcf_file]) subprocess.run(["bcftools", "index", "-f", vcf_file], check=True) return (contig_dict_list, results, errors) +''' + -def freebayes_worker(contig_dict): +def freebayes_worker(command): """Run freebayes program with the specified options. Run freebayes program with the specified options and return a subprocess.CompletedProcess object. """ - options = contig_dict["options"] - command = ["freebayes"] - command.extend(options) + command=command.split(' ') # run freebayes command piping the output fres = subprocess.run(command, stderr=subprocess.PIPE) # check the return code of the freebayes run. 
if succesfull continue if fres.returncode == 0: # bgzip the vcf output, using the freebayes output as bgzip input - vcf_path = contig_dict["vcf_path"] + vcf_index=command.index('-v')+1 + vcf_path=command[vcf_index] gres = subprocess.run(["bgzip", "-f", vcf_path], stderr=subprocess.PIPE) # make sure bugzip process completed successfully if gres.returncode == 0: # index the vcf.gz file - vcf_gz_path = contig_dict["vcf_gz_path"] - ires = subprocess.run(["bcftools", "index", "-f", vcf_gz_path], + ires = subprocess.run(["bcftools", "index", "-f", vcf_path+'.gz'], stderr=subprocess.PIPE) # return the CompletedProcess objects return (fres, gres, ires) @@ -5307,6 +5330,25 @@ def vcf_reheader(vcf_file, fixed_vcf_file, wdir="/opt/analysis/"): vcf_path, "-o", fixed_vcf_path], check=True) return +def concatenate_headers(settings=None, wdir='/opt/analysis', freebayes_settings=None, vcf_paths=None): + vcf_file="/opt/analysis/variants.vcf.gz" + # concatanate contig vcfs. The number of contigs may be high, so we'll + # write the vcf paths to a file and bcftools will read from that file + cvcf_paths_file = os.path.join(wdir, "contig_vcfs", "vcf_file_list.txt") + with open(cvcf_paths_file, "w") as outfile: + outfile.write("\n".join(vcf_paths) + "\n") + subprocess.run(["bcftools", "concat", "-f", cvcf_paths_file, "-Oz", + "-o", vcf_file], check=True) + subprocess.run(["bcftools", "index", "-f", vcf_file], check=True) + # fix vcf header if --gvcf option has been used + if "--gvcf" in freebayes_settings: + temp_vcf_path = os.path.join(wdir, "temp.vcf.gz") + vcf_reheader(os.path.basename(vcf_file), temp_vcf_path, wdir=wdir) + old_vcf_path = os.path.join(wdir, "unfixed.vcf.gz") + subprocess.run(["mv", vcf_file, old_vcf_path]) + subprocess.run(["mv", temp_vcf_path, vcf_file]) + subprocess.run(["bcftools", "index", "-f", vcf_file], check=True) + print('did a reheader') def gatk(options): """GATK wrapper function. 
@@ -6980,475 +7022,6 @@ def annotate_vcf_file(settings, vcf_file, annotated_vcf_file, options=[]): return res.returncode return 0 - -def process_contig(contig_dict): - try: - chrom = contig_dict["chrom"] - contig_start = contig_dict["contig_start"] - contig_end = contig_dict["contig_end"] - species = contig_dict["species"] - contig_ref_seq = get_sequence(create_region( - chrom, contig_start, contig_end), species) - contig_haplotypes_file = contig_dict["contig_haplotypes_file"] - contig_haps = pd.read_csv(contig_haplotypes_file) - nastring = ".:.:.:.:.:.:." - # Create a contig sequence for each haplotype. - # This will be done by gettig the forward strand sequence for each - # haplotype and padding it on both flanks with the reference sequence - # up to the contig start/end. - # - # get forward strand sequence for all haplotypes - contig_haps["forward_sequence"] = contig_haps["haplotype_sequence"] - reverse_index = contig_haps["orientation"] == "reverse" - contig_haps.loc[reverse_index, "forward_sequence"] = ( - contig_haps.loc[reverse_index, "forward_sequence"].apply( - reverse_complement)) - - def get_padded_sequence(row): - chrom = row["Chrom"] - contig_start = int(row["contig_start"]) - contig_end = int(row["contig_end"]) - capture_start = int(row["capture_start"]) - capture_end = int(row["capture_end"]) - left_key = create_region(chrom, contig_start, capture_start - 1) - right_key = create_region(chrom, capture_end + 1, contig_end) - left_pad = get_sequence(left_key, species) - right_pad = get_sequence(right_key, species) - return left_pad + str(row["forward_sequence"]) + right_pad - - contig_haps["padded_sequence"] = contig_haps.apply( - get_padded_sequence, axis=1) - g_dict = contig_haps.set_index( - ["MIP", "Copy", "haplotype_ID"]).to_dict(orient="index") - sequences = {"ref": contig_ref_seq} - contig_targets = contig_dict["contig_targets"] - if contig_targets is not None: - contig_targets["padded_sequence"] = contig_targets.apply( - get_padded_sequence, 
axis=1) - target_pos = contig_targets[ - ["Pos", "End", "Mutation Name"]].to_dict(orient="records") - targets_dict = contig_targets.to_dict(orient="index") - for t in targets_dict: - sequences[t] = targets_dict[t]["padded_sequence"] - else: - targets_dict = {} - target_pos = [] - for k in g_dict.keys(): - sequences[":".join(k)] = g_dict[k]["padded_sequence"] - wdir = contig_dict["contigs_dir"] - contig_name = contig_dict["contig_name"] - fasta_file = os.path.join(wdir, contig_name + ".fa") - alignment_file = os.path.join(wdir, contig_name + ".aln") - save_fasta_dict(sequences, fasta_file) - if contig_dict["aligner"] == "muscle": - mh = contig_dict["max_hours"] - subprocess.call(["muscle", "-in", fasta_file, "-out", - alignment_file, "-maxhours", mh]) - elif contig_dict["aligner"] == "decipher": - subprocess.call(["Rscript", "/opt/src/align.R", fasta_file, - alignment_file]) - alignments = fasta_parser(alignment_file) - ref_seq = alignments["ref"] - alignment_to_genomic = {0: contig_start - 1} - insertion_count = 0 - for i in range(len(ref_seq)): - if ref_seq[i] != "-": - alignment_to_genomic[i+1] = i + contig_start - insertion_count - else: - insertion_count += 1 - genomic_to_alignment = {} - for alignment_position in alignment_to_genomic: - genomic_to_alignment[alignment_to_genomic[ - alignment_position]] = alignment_position - - def get_hap_start_index(row): - hid = row["haplotype_ID"] - cop = row["Copy"] - hap_start = row["capture_start"] - 1 - hap_start_index = genomic_to_alignment[hap_start] - hap_mip = row["MIP"] - alignment_header = ":".join([hap_mip, cop, hid]) - hap_al = alignments[alignment_header][:hap_start_index] - ref_al = alignments["ref"][:hap_start_index] - diff = ref_al.count("-") - hap_al.count("-") - return hap_start_index - diff - - contig_haps["haplotype_start_index"] = contig_haps.apply( - get_hap_start_index, axis=1) - - raw_vcf_file = os.path.join(wdir, contig_name + ".raw.vcf") - if contig_dict["msa_to_vcf"] == "miptools": - 
msa_to_vcf(alignment_file, raw_vcf_file, ref="ref", - snp_only=contig_dict["snp_only"]) - else: - subprocess.call( - ["java", "-jar", "/opt/programs/jvarkit/dist/msa2vcf.jar", - "-m", "-c", "ref", "-o", raw_vcf_file, alignment_file]) - contig_dict["raw_vcf_file"] = raw_vcf_file - # find comment line number - with open(raw_vcf_file) as infile: - line_count = 0 - for line in infile: - if line.startswith("##"): - line_count += 1 - else: - break - vcf = pd.read_table(raw_vcf_file, skiprows=line_count) - if vcf.empty: - return contig_name + "_empty" - vcf = vcf.drop(["ID", "QUAL", "FILTER", "INFO", "FORMAT"], - axis=1).set_index(["#CHROM", "POS", "REF", "ALT"]) - vcf = vcf.applymap(lambda a: 0 if a == "." else int(a.split(":")[0])) - vcf = vcf.reset_index() - vcf["alignment_position"] = vcf["POS"] - vcf["POS"] = vcf["alignment_position"].map(alignment_to_genomic) - vcf["CHROM"] = chrom - vcf.drop("#CHROM", inplace=True, axis=1) - vcf = vcf.set_index(["CHROM", "POS", "REF", "ALT", - "alignment_position"]) - drop_seqs = ["ref"] + list(map(str, targets_dict.keys())) - vcf.drop(drop_seqs, axis=1, inplace=True) - vcf_stack = pd.DataFrame(vcf.stack()).reset_index() - vcf_stack.rename( - columns={"level_5": "alignment_header", 0: "genotype"}, - inplace=True) - vcf_stack[["MIP", "Copy", "haplotype_ID"]] = vcf_stack[ - "alignment_header"].apply(lambda a: pd.Series(a.split(":"))) - vcf_merge = vcf_stack.merge( - contig_haps[["MIP", "Copy", "haplotype_ID", - "capture_start", "capture_end", - "haplotype_start_index"]]) - vcf_merge["END"] = vcf_merge["REF"].apply(len) + vcf_merge["POS"] - 1 - vcf_merge["covered"] = ( - (vcf_merge["capture_start"] - 30 <= vcf_merge["END"]) - & (vcf_merge["capture_end"] + 30 >= vcf_merge["POS"])) - vcf_merge.loc[~vcf_merge["covered"], "genotype"] = np.nan - vcf_clean = vcf_merge.loc[~vcf_merge["genotype"].isnull()] - if vcf_clean.empty: - return contig_name + "_empty" - contig_seq = pd.DataFrame(contig_haps.groupby("haplotype_ID")[ - 
"forward_sequence"].first()).to_dict(orient="index") - - def get_variant_index(row): - pos_index = row["alignment_position"] - hap_start_index = row["haplotype_start_index"] - hap_copy = row["Copy"] - hid = row["haplotype_ID"] - hap_mip = row["MIP"] - alignment_header = ":".join([hap_mip, hap_copy, hid]) - hap_al = alignments[alignment_header] - hap_al = hap_al[hap_start_index:pos_index] - variant_index = len(hap_al) - hap_al.count("-") - 1 - alts = [row["REF"]] - alts.extend(row["ALT"].split(",")) - gen = int(row["genotype"]) - alt = alts[gen] - variant_end_index = variant_index + len(alt) - if variant_index < 0: - variant_index = 0 - if variant_end_index < 1: - variant_end_index = 1 - seq = contig_seq[hid]["forward_sequence"] - var_seq = seq[variant_index:variant_end_index] - return pd.Series([variant_index, variant_end_index, alt, var_seq]) - - vcf_clean[ - ["variant_index", "variant_end_index", "allele", "variant"] - ] = vcf_clean.apply(get_variant_index, axis=1) - - contig_counts_file = contig_dict["contig_counts_file"] - contig_counts = pd.read_csv(contig_counts_file) - contig_counts["forward_sequence_quality"] = contig_counts[ - "sequence_quality"] - reverse_index = contig_counts["orientation"] == "reverse" - contig_counts.loc[reverse_index, "forward_sequence_quality"] = ( - contig_counts.loc[reverse_index, "forward_sequence_quality"].apply( - lambda a: a[::-1])) - combined_vcf = vcf_clean[ - ["CHROM", "POS", "REF", "ALT", "genotype", - "MIP", "Copy", "haplotype_ID", "variant_index", - "variant_end_index"]].merge(contig_counts[ - ["Sample ID", "haplotype_ID", "MIP", "Copy", - "Barcode Count", "forward_sequence_quality"]]) - - def get_variant_quality(row): - start_index = row["variant_index"] - end_index = row["variant_end_index"] - qual = row["forward_sequence_quality"] - if end_index > len(qual) - 1: - end_index = len(qual) - 1 - qual_scores = [ord(qual[i]) - 33 for i in - range(start_index, end_index)] - return np.mean(qual_scores) - - 
combined_vcf["variant_quality"] = combined_vcf.apply( - get_variant_quality, axis=1) - - min_count = contig_dict["min_count"] - if min_count < 1: - min_count = 1 - min_depth = contig_dict["min_coverage"] - if min_depth < 1: - min_depth = 1 - min_wsaf = contig_dict["min_wsaf"] - if min_wsaf == 0: - min_wsaf = 0.0001 - - def collapse_vcf(group): - key = group.iloc[0][["CHROM", "POS", "REF", "ALT"]].values - alts = key[3].split(",") - allele_count = len(alts) + 1 - allele_depths = [] - for i in range(allele_count): - allele_depths.append(group.loc[group["genotype"] == i, - "Barcode Count"].sum().round(0)) - total_depth = int(round(np.sum(allele_depths), 0)) - wsaf = np.array(allele_depths)/total_depth - if total_depth < min_depth: - return nastring - genotypes = [] - for i in range(allele_count): - if (allele_depths[i] >= min_count) and (wsaf[i] >= min_wsaf): - genotypes.append(i) - if len(genotypes) == 0: - return nastring - else: - alleles = list(range(allele_count)) - geno = sorted(zip(alleles, allele_depths), - key=itemgetter(1, 0), reverse=True)[:2] - if len(genotypes) == 1: - gt = str(geno[0][0]) - gt = gt + "/" + gt - else: - gt1 = geno[0][0] - gt2 = geno[1][0] - gt = sorted(map(str, [gt1, gt2])) - gt = "/".join(gt) - allele_depths = [str(int(a)) for a in allele_depths] - variant_quals = [] - for i in range(allele_count): - variant_quals.append(group.loc[group["genotype"] == i, - "variant_quality"].max()) - variant_quals = ["." 
if np.isnan(v) else str(int(round(v, 0))) - for v in variant_quals] - mip_count = [] - for i in range(allele_count): - mip_count.append(len(set(group.loc[group["genotype"] == i, - "MIP"]))) - hap_count = [] - for i in range(allele_count): - hap_count.append(len(set(group.loc[group["genotype"] == i, - "haplotype_ID"]))) - return ":".join([gt, ",".join(allele_depths), - str(total_depth), - ",".join(variant_quals), - ",".join(map(str, mip_count)), - ",".join(map(str, hap_count)), - ",".join(map(str, wsaf.round(3))), - ]) - - collapsed_vcf = pd.DataFrame(combined_vcf.groupby( - ["CHROM", "POS", "REF", "ALT", "Sample ID"]).apply(collapse_vcf) - ).reset_index() - vcf_table = collapsed_vcf.pivot_table( - index=["CHROM", "POS", "REF", "ALT"], - columns="Sample ID", aggfunc="first") - vcf_table.fillna(nastring, inplace=True) - - def get_var_summary(row): - val = row.values - ad = [] - quals = [] - wsafs = [] - mip_counts = [] - hap_counts = [] - genotypes = [] - for v in val: - if v != nastring: - genotypes.append(v.split(":")[0]) - ad.append(list(map(int, v.split(":")[1].split(",")))) - quals.append(v.split(":")[3].split(",")) - mip_counts.append(list(map( - int, v.split(":")[4].split(",")))) - hap_counts.append(list(map( - int, v.split(":")[5].split(",")))) - wsafs.append(list(map(float, v.split(":")[6].split(",")))) - if len(ad) == 0: - return "." 
- geno_dict = {} - an_count = 0 - for geno in genotypes: - try: - geno_list = list(map(int, geno.split("/"))) - for gt in geno_list: - try: - geno_dict[gt] += 1 - except KeyError: - geno_dict[gt] = 1 - an_count += 1 - except ValueError: - continue - number_of_alleles = len(ad[0]) - ac_list = [] - for i in range(number_of_alleles): - try: - ac_list.append(geno_dict[i]) - except KeyError: - ac_list.append(0) - - quality = [] - for q in quals: - nq = [] - for q_val in q: - if q_val == ".": - nq.append(np.nan) - else: - nq.append(int(q_val)) - quality.append(nq) - quals = np.nanmean(quality, axis=0) - quality = [] - for q in quals: - if np.isnan(q): - quality.append(".") - else: - quality.append(str(round(q, 1))) - - wsafs = pd.DataFrame(wsafs) - wsafs = wsafs.applymap( - lambda a: a if a >= min_wsaf else np.nan).mean().round(4) - wsafs = wsafs.fillna(0).astype(str) - - mip_counts = pd.DataFrame(mip_counts) - mip_counts = mip_counts.applymap( - lambda a: a if a > 0 else np.nan).mean().round(2) - mip_frac = (mip_counts / (mip_counts.max())).round(2) - mip_frac = mip_frac.fillna(0).astype(str) - mip_counts = mip_counts.fillna(0).astype(str) - - hap_counts = pd.DataFrame(hap_counts) - hap_counts = hap_counts.applymap( - lambda a: a if a > 0 else np.nan).mean().round(2) - hap_counts = hap_counts.fillna(0).astype(str) - - info_cols = [ - "DP=" + str(np.sum(ad)), - "AD=" + ",".join(map(str, np.sum(ad, axis=0))), - "AC=" + ",".join(map(str, ac_list[1:])), - "AN=" + str(an_count), - "AF=" + ",".join(map(str, ( - np.array(ac_list)/an_count)[1:].round(4))), - "RAF=" + ",".join(map(str, ( - np.array(ac_list)/an_count).round(4))), - "RAC=" + ",".join(map(str, ac_list)), - "NS=" + str(len(ad)), - "SC=" + ",".join(map(str, (np.array(ad) >= min_count).sum( - axis=0))), - "SF=" + ",".join(map(str, ((np.array(ad) >= min_count).sum( - axis=0)/len(ad)).round(5))), - "QS=" + ",".join(quality), - "WSAF=" + ",".join(wsafs), - "MC=" + ",".join(mip_counts), - "MCF=" + ",".join(mip_frac), - 
"HC=" + ",".join(hap_counts)] - - variant_pos = row.name[1] - ref_len = len(row.name[2]) - variant_end = variant_pos + ref_len - 1 - overlapping_targets = set() - for p in target_pos: - ol = overlap([variant_pos, variant_end], - [p["Pos"], p["End"]]) - if len(ol) > 0: - overlapping_targets.add(p["Mutation Name"]) - if len(overlapping_targets) > 0: - ot_field = ",".join(sorted(overlapping_targets)) - info_cols.append("OT=" + ot_field) - - return ";".join(info_cols) - - var_summary = pd.DataFrame(vcf_table.apply( - get_var_summary, axis=1)).rename(columns={0: "INFO"}) - var_summary["FORMAT"] = "GT:AD:DP:QS:MC:HC:WSAF" - var_summary["ID"] = "." - var_summary["QUAL"] = "." - var_summary["FILTER"] = "." - samples = vcf_table.columns.droplevel(0).tolist() - vcf_table.columns = samples - samples = contig_dict["sample_ids"] - vcf_table = vcf_table.loc[:, samples].fillna(nastring) - vcf_table = vcf_table.merge(var_summary, left_index=True, - right_index=True) - vcf_table = vcf_table.reset_index()[ - ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", - "FORMAT"] + samples] - vcf_table.rename(columns={"CHROM": "#CHROM"}, inplace=True) - vcf_table = vcf_table.sort_values("POS") - vcf_header = [ - "##fileformat=VCFv4.2", - '##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - "##INFO=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT='] - # save vcf file - contig_vcf_file = os.path.join(wdir, contig_name + ".vcf") - with open(contig_vcf_file, "w") as outfile: - outfile.write("\n".join(vcf_header) + "\n") - vcf_table.to_csv(outfile, index=False, sep="\t") - contig_variants_file = os.path.join(wdir, - contig_name + "_variants.csv") - combined_vcf.to_csv(contig_variants_file) - collapsed_variants_file = os.path.join(wdir, contig_name - + "_collapsed_variants.csv") - 
collapsed_vcf.to_csv(collapsed_variants_file) - contig_haps.to_csv(contig_haplotypes_file) - contig_counts.to_csv(contig_counts_file) - return contig_name - except Exception as e: - return ExceptionWrapper(e) - - ############################################################################### # general use functions. ############################################################################### @@ -9695,105 +9268,6 @@ def save_fasta_dict(fasta_dict, fasta_file, linewidth=60): outfile.write(fasta_seq[i: i + linewidth] + "\n") -def generate_sample_sheet(sample_list_file, - barcode_dict_file, - sample_sheet_template, - platform, - output_dir, - warnings=False): - """Create a sample sheet to be used by bcl2fasq file from sample list.""" - with open(barcode_dict_file, "rb") as in1: - barcode_dic = pickle.load(in1) - # read in sample information - sample_names = [] - sample_info = {} - with open(sample_list_file) as infile: - linenum = 0 - for line in infile: - newline = line.strip().split("\t") - # first line is the header with column names - if linenum == 0: - colnames = newline - linenum += 1 - else: - sample_dict = {colname: colvalue for colname, colvalue - in zip(colnames, newline)} - sample_set = sample_dict["sample_set"] - sample_name = sample_dict["sample_name"] - replicate_number = sample_dict["replicate"] - forward_index = sample_dict["fw"] - reverse_index = sample_dict["rev"] - sample_id = "-".join([sample_name, - sample_set, - replicate_number]) - if sample_id in sample_info: - print("Repeating sample name ", sample_id) - if not sample_id.replace("-", "").isalnum(): - print(("Sample IDs can only contain " - "alphanumeric characters and '-'. 
" - "{} has invalid characters.").format(sample_id)) - continue - # nextseq and miseq barcodes are handled differently - if platform == "nextseq": - sample_dict.update( - {"i7": barcode_dic[reverse_index]["index_sequence"], - "i5": barcode_dic[forward_index]["index_sequence"]}) - elif platform == "miseq": - sample_dict.update( - {"i7": barcode_dic[reverse_index]["index_sequence"], - "i5": barcode_dic[forward_index]["sequence"]}) - sample_dict["sample_index"] = linenum - linenum += 1 - sample_info[sample_id] = sample_dict - sample_names.append(sample_id) - # Check for samples sharing one or both barcodes. One barcode sharing is - # allowed but a warning can be printed if desired by setting the warning - # to True. If both barcodes are shared among two samples, those samples - # will be ignored and a message will be broadcast. - samples_sharing = [] - for s1 in sample_info: - for s2 in sample_info: - if s1 != s2: - if ((sample_info[s1]["fw"] == sample_info[s2]["fw"]) - and (sample_info[s1]["rev"] == sample_info[s2]["rev"])): - samples_sharing.append([s1, s2]) - elif warnings and ( - (sample_info[s1]["fw"] == sample_info[s2]["fw"]) - or (sample_info[s1]["rev"] == sample_info[s2]["rev"]) - ): - print("Samples %s and %s share a barcode" % (s1, s2)) - samples_sharing_set = [] - if len(samples_sharing) > 0: - for s in samples_sharing: - samples_sharing_set.extend(s) - samples_sharing_set = set(samples_sharing_set) - print("There are %d samples sharing the same barcode pair" - % len(samples_sharing_set)) - pd.DataFrame(samples_sharing).to_csv( - os.path.join(output_dir, "samples_sharing_barcodes.tsv"), - sep="\t" - ) - # create sample sheet - sample_sheet = os.path.join(output_dir, "SampleSheet.csv") - with open(sample_sheet_template) as infile, \ - open(sample_sheet, "w") as outfile: - outfile_list = infile.readlines() - outfile_list = [o.strip() for o in outfile_list] - for sample_id in sample_names: - if sample_id in samples_sharing_set: - continue - reverse_index = 
sample_info[sample_id]["rev"] - forward_index = sample_info[sample_id]["fw"] - sample_index = str(sample_info[sample_id]["sample_index"]) - outlist = [sample_index, sample_id, "", "", - "S" + reverse_index, - sample_info[sample_id]["i7"], - "N" + forward_index, - sample_info[sample_id]["i5"], "", ""] - outfile_list.append(",".join(outlist)) - outfile.write("\n".join(outfile_list)) - - def chromosome_converter(chrom, from_malariagen): """ Convert plasmodium chromosome names from standard (chr1, etc) to malariagen names (Pf3d7...) and vice versa. diff --git a/src/mip_functions_testing.py b/src/mip_functions_testing.py index 2a886b0..f9224d3 100644 --- a/src/mip_functions_testing.py +++ b/src/mip_functions_testing.py @@ -10546,7 +10546,7 @@ def parasight(resource_dir, gs_list.append(gs_command) pdf_list.append("cp " + basename + ".pdf " + os.path.join(pdf_dir, t + ".pdf")) - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", basename + extra_extension, "-template", "/opt/resources/nolabel.pst", @@ -10682,7 +10682,7 @@ def parasight_mod(resource_dir, design_info_file, species, pdf_convert_list.append(t + ".mod.pdf") gs_list.append(gs_command) pdf_list.append("cp " + basename + ".mod.pdf " + pdf_dir + t + ".mod.pdf") - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", @@ -10775,7 +10775,7 @@ def parasight_shift(resource_dir, design_info_file, species, gs_list.append(gs_command) pdf_list.append("cp " + basename + ".mod.pdf " + pdf_dir + t + ".mod.pdf") - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", basename + extra_extension + ".mod", "-template", "/opt/resources/nolabel.pst", @@ -10815,7 +10815,7 @@ def parasight_shift(resource_dir, design_info_file, species, def parasight_print(gene_list, extra_suffix=".extra"): for g in gene_list: print(("cd ../" + g)) - print(("parasight76.pl -showseq " + g + ".show " + 
print(("parasight.pl -showseq " + g + ".show " + "-extra " + g + extra_suffix)) diff --git a/src/msa_to_vcf.py b/src/msa_to_vcf.py deleted file mode 100644 index ee79e5e..0000000 --- a/src/msa_to_vcf.py +++ /dev/null @@ -1,99 +0,0 @@ -import mip_functions as mip -import pandas as pd - - -def msa_to_vcf(alignment_file, vcf_file, ref=None, snp_only=False): - """ Take a multiple sequence alignment file and create a vcf file - containing all variants. - """ - # read in the alignment file - aln = mip.fasta_parser(alignment_file) - - # convert each alignment string to list - aln = {k: list(aln[k]) for k in aln} - # create alignment dataframe - aln_df = pd.DataFrame(aln) - - if not snp_only: - # find indexes of all indels - indel_index = aln_df.loc[aln_df.apply( - lambda a: "-" in a.values, axis=1)].index - if len(indel_index) == 0: - variant_df = aln_df - else: - # merge all neighboring indel indexes - indel_index = [[i, i] for i in indel_index] - indel_index = mip.merge_overlap(indel_index, spacer=1) - # include the prior base for indel calls - indel_index = [[i[0] - 1, i[1]] if i[0] > 0 else i - for i in indel_index] - # get indexes for snps (non-indel changes) - snp_sets = [set(range(indel_index[i][1] + 1, indel_index[i+1][0])) - for i in range(len(indel_index) - 1)] - # include the positions from last indel to the end of alignment - snp_sets.append(set(range(indel_index[-1][1] + 1, len(aln_df)))) - # include the positions from the beginning of the alignment to the - # first indel - snp_sets.append(set(range(0, indel_index[0][0]))) - # create a set of snp indexes - snp_index = set() - for s in snp_sets: - snp_index.update(s) - - # go through each indel and create a dataframe for each where the - # each haplotype has the allele for the indel at the beginning - # index position - indel_list = [] - for ind in indel_index: - indel_list.append( - # slice the alignment df for the indel location - aln_df.loc[ind[0]: ind[1]].apply( - # join the nucleotides (remove "-") for 
each haplotype - lambda a: "".join([v for v in a.values if v != "-"]))) - # concatanate the indel dataframes - indel_df = pd.concat(indel_list, axis=1).T - # set index using the first position for each indel - indel_df.index = [ind[0] for ind in indel_index] - - # remove snp indexes where all haplotypes are same as reference - snp_index = sorted([ind for ind in snp_index - if len(set(aln_df.loc[ind].values)) > 1]) - - # merge indel and snp dfs - variant_df = pd.concat([indel_df, aln_df.loc[snp_index]]) - else: - snp_index = sorted([ind for ind in aln_df.index if len(set( - aln_df.loc[ind].values).difference(["-"])) > 1]) - snp_df = aln_df.loc[snp_index] - variant_df = snp_df.loc[snp_df["ref"] != "-"].replace("-", "*") - variant_df.sort_index(inplace=True) - - # if reference sequence name is provided, sort alleles by that - if ref is not None: - allele_df = variant_df.apply( - lambda a: [a["ref"]] + list(set(a).difference([a["ref"]])), axis=1) - else: - allele_df = variant_df.apply( - lambda a: list(set(a)), axis=1) - allele_dict = allele_df.to_dict() - - def get_genotype(row): - allele_list = allele_dict[row.name] - return pd.Series([allele_list.index(v) for v in row.values]) - - genotypes = variant_df.apply(get_genotype, axis=1) - genotypes.columns = variant_df.columns - allele_df.name = "alleles" - vcf = pd.concat([genotypes, allele_df], axis=1) - vcf.index.name = "POS" - vcf = vcf.reset_index() - vcf["POS"] = vcf["POS"] + 1 - vcf["REF"] = vcf["alleles"].apply(lambda a: a[0]) - vcf["ALT"] = vcf["alleles"].apply(lambda a: ",".join(a[1:])) - fields = ["#CHROM", "ID", "QUAL", "FILTER", "INFO", "FORMAT"] - for f in fields: - vcf[f] = "." 
- fields = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", - "FORMAT"] + list(genotypes.columns) - vcf = vcf[fields] - vcf.to_csv(vcf_file, sep="\t", index=False) diff --git a/src/reference_code.py b/src/reference_code.py index 1fece9d..f215639 100644 --- a/src/reference_code.py +++ b/src/reference_code.py @@ -96,7 +96,7 @@ def parasight_mod(resource_dir, design_info_file, species, pdf_convert_list.append(t + ".mod.pdf") gs_list.append(gs_command) pdf_list.append("cp " + basename + ".mod.pdf " + pdf_dir + t + ".mod.pdf") - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", @@ -189,7 +189,7 @@ def parasight_shift(resource_dir, design_info_file, species, gs_list.append(gs_command) pdf_list.append("cp " + basename + ".mod.pdf " + pdf_dir + t + ".mod.pdf") - outlist = ["parasight76.pl", + outlist = ["parasight.pl", "-showseq", basename + ".show", "-extra", basename + extra_extension + ".mod", "-template", "/opt/resources/nolabel.pst", diff --git a/src/sample_sheet_prep.py b/src/sample_sheet_prep.py index cef8bc4..92f0040 100644 --- a/src/sample_sheet_prep.py +++ b/src/sample_sheet_prep.py @@ -2,17 +2,23 @@ import argparse import os import numpy as np +import subprocess +import pickle def sample_sheet_prep( capture_plates, sample_plates, legacy_sheets, output_file, - wdir="/opt/analysis", + working_directory="/opt/analysis", quadrants="/opt/resources/sample_prep/quadrants.csv", forward_plates="/opt/resources/sample_prep/forward_plates.csv", - reverse_plates="/opt/resources/sample_prep/reverse_plates.csv"): + reverse_plates="/opt/resources/sample_prep/reverse_plates.csv", + barcode_dictionary="/opt/resources/sample_prep/barcode_dict.pickle", + platform="nextseq", + template_dir="/opt/resources/templates/sample_sheet_templates/"): quad = pd.read_csv(quadrants) forward_plates = pd.read_csv(forward_plates) reverse_plates = pd.read_csv(reverse_plates) + wdir = working_directory if capture_plates is not None: 
capture_paths = [os.path.join(wdir, p) for p in capture_plates] capture_plates = [] @@ -21,9 +27,8 @@ def sample_sheet_prep( capture_plates.append(pd.read_table(p).rename( columns={"Library Prep": "library_prep"})) except IOError: - print(("Warning: Capture plate file {} does not exist in the " - "run directory {} and will not be used.").format( - p, wdir)) + raise Exception(("Error: Capture plate file {} does not exist " + "in the run directory {}").format(p, wdir)) if len(capture_plates) > 0: capture_plates = pd.concat(capture_plates, ignore_index=True, axis=0) @@ -33,9 +38,8 @@ def sample_sheet_prep( try: sample_plates.append(pd.read_table(p)) except IOError: - print(("Warning:Sample plate file {} does not exist in the" - " run directory {} and will not be used.").format( - p, wdir)) + raise Exception(("Error: Sample plate file {} does not " + "exist in the run directory {}").format(p, wdir)) if len(sample_plates) > 0: sample_plates = pd.concat(sample_plates, ignore_index=True, axis=0) @@ -45,12 +49,16 @@ def sample_sheet_prep( lambda a: int(a[1:])) plating_cols = ["sample_name", "sample_plate", "row", "column"] sample_plates = sample_plates.loc[:, plating_cols] + # if a sample plate is referenced in capture plates but + # sample plate information is not provided, raise an error. plates_without_samples = set( capture_plates["sample_plate"]).difference( sample_plates["sample_plate"]) if len(plates_without_samples) > 0: - print(("Warning: {} does not have corresponding sample " - "plates.").format(plates_without_samples)) + raise Exception(("Error: Sample plate(s) {} do not " + "exist.").format(plates_without_samples)) + # if a sample plate is provided but not referenced in the + # captures, print a warning. 
samples_without_plates = set( sample_plates["sample_plate"]).difference( capture_plates["sample_plate"]) @@ -64,9 +72,11 @@ def sample_sheet_prep( captures = captures.drop_duplicates() captures["replicate"] = np.nan else: - print("Warning: No valid sample plate was found.") + raise Exception("Error: sample plates {} were not found." + ).format(sample_plates) else: - print("Warning: No valid capture plate was found.") + raise Exception("Error: capture plates {} were not found." + ).format(capture_plates) else: capture_plates = [] @@ -78,31 +88,28 @@ def sample_sheet_prep( legacy_sheets.append(pd.read_table(p).rename( columns={"Library Prep": "library_prep"})) except IOError: - print(("Warning:Sample sheet file {} does not exist in the" - " run directory {} and will not be used.").format( - p, wdir)) - if len(legacy_sheets) > 0: - legacy_sheets = pd.concat(legacy_sheets, ignore_index=True, axis=0) - legacy_sheets = legacy_sheets.drop_duplicates() + raise Exception(("Error: Sample sheet file {} does not exist " + "in the run directory {}").format(p, wdir)) + legacy_sheets = pd.concat(legacy_sheets, ignore_index=True, axis=0) + legacy_sheets = legacy_sheets.drop_duplicates() else: legacy_sheets = [] if (len(capture_plates) > 0) and (len(legacy_sheets) > 0): - com = pd.concat([captures, legacy_sheets], axis=0, ignore_index=True, - sort=False) + sample_sheet = pd.concat([captures, legacy_sheets], axis=0, + ignore_index=True, sort=False) elif len(capture_plates) > 0: # "replicate" column is only available in 96 well format # this will need to be set to NaN value if no 96 well sample sheet # was used. 
captures["replicate"] = np.nan - com = captures + sample_sheet = captures elif len(legacy_sheets) > 0: - com = legacy_sheets + sample_sheet = legacy_sheets else: - print("At least one sample sheet must be provided.") - return + raise Exception("At least one sample sheet must be provided.") - com = com.drop_duplicates() + sample_sheet = sample_sheet.drop_duplicates() def assign_replicate(replicates): replicates = list(replicates) @@ -123,58 +130,137 @@ def assign_replicate(replicates): reps_used.add(rep) return pd.Series(replicates) try: - com["Replicate"] = com.groupby(["sample_name", "sample_set"])[ - "replicate"].transform(assign_replicate).astype(int) - com.drop("replicate", inplace=True, axis=1) - com.rename(columns={"Replicate": "replicate"}, inplace=True) + sample_sheet["Replicate"] = sample_sheet.groupby( + ["sample_name", "sample_set"])["replicate"].transform( + assign_replicate).astype(int) + sample_sheet.drop("replicate", inplace=True, axis=1) + sample_sheet.rename(columns={"Replicate": "replicate"}, inplace=True) except ValueError: - print("Error in assigning replicates. Please make sure " + raise Exception("Error in assigning replicates. Please make sure " "the 'sample_name' and 'sample_set' fields have " "valid, non-empty values in all provided files.") + + # load barcode dictionary file to add sample barcode sequences + with open(barcode_dictionary, "rb") as bc_file: + barcodes = pickle.load(bc_file) + + # create the sample id as the combination of sample name, sample set + # and replicate number + sample_sheet["sample_id"] = sample_sheet[ + ["sample_name", "sample_set", "replicate"]].apply( + lambda a: "-".join(map(str, a)), axis=1) + + # check sample ids for formatting. 
+ sample_sheet["valid_sample_id"] = sample_sheet["sample_id"].map( + lambda a: a.replace("-", "").isalnum()) + invalid_samples = sample_sheet.loc[~sample_sheet["valid_sample_id"]] + if invalid_samples.shape[0] > 0: + invalid_samples_file = os.path.join(wdir, "invalid_samples.csv") + invalid_samples.to_csv(invalid_samples_file) + raise Exception(("Sample IDs can only contain alphanumeric characters" + " and '-'. There are samples with invalid characters. " + "Please correct the sample ids saved in {}")).format( + invalid_samples_file) + + # get i5 and i7 index (sample barcode) sequences for each sample. + # orientation of i5 sequences are different for miseq and nextseq + sample_sheet["i7"] = sample_sheet["rev"].map( + lambda a: barcodes[a]["index_sequence"]) + if platform == "nextseq": + sample_sheet["i5"] = sample_sheet["fw"].map( + lambda a: barcodes[a]["index_sequence"]) + elif platform == "miseq": + sample_sheet["i5"] = sample_sheet["fw"].map( + lambda a: barcodes[a]["sequence"]) + + # reset the index to make sure index starts from zero + sample_sheet.reset_index(inplace=True) + # Create barcode names. This naming scheme is following earlier notation + # used but probably any unique name would work. 
+ sample_sheet["reverse_index"] = "S" + sample_sheet["rev"].astype(str) + sample_sheet["forward_index"] = "N" + sample_sheet["fw"].astype(str) + # there are 4 fields that can be left empty in the sample sheet but + # we still need to have those columns as empty strings + empty_cols = ["empty" + str(i) for i in range(4)] + for c in empty_cols: + sample_sheet[c] = "" + # check whether all required columns have valid values required_columns = ["sample_name", "sample_set", "probe_set", - "replicate", "fw", "rev", "library_prep"] - missing_values = com[required_columns].isnull().any() + "replicate", "fw", "rev", "library_prep", + "sample_id", "i5", "i7", "reverse_index", + "forward_index"] + missing_values = sample_sheet[required_columns].isnull().any() missing_values = missing_values.loc[missing_values].index.to_list() if len(missing_values) > 0: - print(("Error: Required column(s) {} cannot have missing values." - ).format(", ".join(missing_values))) + bad_sample_sheet = os.path.join(wdir, "bad_sample_sheet.csv") + sample_sheet.to_csv(bad_sample_sheet) + raise Exception(("Error: Required column(s) {} cannot have missing " + "values. Please inspect the file {}").format( + ", ".join(missing_values), bad_sample_sheet)) + + # create a small function to save files + def make_sample_sheet(s_sheet, pfix): + """Save sample sheet to temporary file and concatenate to template.""" + # select the columns needed in the final sample sheet + cols = ["sample_id", "empty0", "empty1", "reverse_index", "i7", + "forward_index", "i5", "empty2", "empty3"] + # save sample sheet to a temporary file + sample_sheet_tail = os.path.join(wdir, "temp_samples.csv") + s_sheet.loc[:, cols].to_csv(sample_sheet_tail, index=True, + header=False) + # the sample sheet we generated needs some lines from the template file + # we'll cat that file before the sample sheet tail just saved. 
+ sample_sheet_head = os.path.join( + template_dir, platform + "_sample_sheet_template.csv") + + sample_sheet_file = os.path.join(wdir, pfix + "SampleSheet.csv") + with open(sample_sheet_file, "w") as final_sample_sheet: + res = subprocess.run(["cat", sample_sheet_head, sample_sheet_tail], + stdout=final_sample_sheet, + stderr=subprocess.PIPE) + if res.stderr == b"": + subprocess.run(["rm", sample_sheet_tail]) + else: + raise Exception(( + "Error creating final sample sheet file: {}").format( + res.stderr)) + # save the entire dataframe as _samples.tsv file + s_sheet.to_csv(os.path.join(wdir, pfix + output_file), + index=False, sep="\t") return - if com.shape[0] != (com.groupby(["fw", "rev"]).first().shape[0]): + + # check if there are non-unique primer pairs + if sample_sheet.shape[0] != ( + sample_sheet.groupby(["fw", "rev"]).first().shape[0]): size_file = os.path.join(wdir, "repeating_primers.csv") + # nonunique primer pairs may be allowed if they belong to + # different probe sets because the data can be separated based + # on the probe sequences. 
print(("There are repeating forward/reverse primer pairs.\n" "Sample sheet will be split based on the probe sets used.\n" "Inspect {} for repeating primer information.").format( size_file)) - c_size = com.groupby(["fw", "rev"]).size().sort_values( + c_size = sample_sheet.groupby(["fw", "rev"]).size().sort_values( ascending=False) c_size = c_size.loc[c_size > 1].reset_index() - com.merge(c_size).to_csv(size_file, index=False) - gb = com.groupby("probe_set") + sample_sheet.merge(c_size).to_csv(size_file, index=False) + + # create a separate sample sheet file for each probe set + gb = sample_sheet.groupby("probe_set") for group_key in gb.groups: g = gb.get_group(group_key) + # raise error if non-unique primers exist within probesets if g.shape[0] != (g.groupby(["fw", "rev"]).first().shape[0]): size_file = os.path.join(wdir, group_key + "_repeating.csv") - print(("There are repeating forward/reverse primer pairs " - "within probe set {}. Inspect {} and correct the " - "sample sheet before proceeding with demultiplexing." - ).format(group_key, size_file)) - g_size = g.groupby(["fw", "rev"]).size().sort_values( - ascending=False) - g_size = g_size.loc[g_size > 1].reset_index() - g.merge(g_size).to_csv(size_file, index=False) - g.to_csv(os.path.join(wdir, group_key + "_" + output_file), - index=False, sep="\t") + # save the repeating pairs first so that the file named in the + # error message below actually exists + g_size = g.groupby(["fw", "rev"]).size().sort_values( + ascending=False) + g_size = g_size.loc[g_size > 1].reset_index() + g.merge(g_size).to_csv(size_file, index=False) + raise Exception(("There are repeating forward/reverse primer " + "pairs within probe set {}. Inspect {} and correct the " + "sample sheet.").format(group_key, size_file)) + make_sample_sheet(g, group_key + "_") else: - com.to_csv(os.path.join(wdir, output_file), index=False, sep="\t") - - for sample_id in com["sample_name"]: - if not sample_id.replace("-", "").isalnum(): - print(("Sample names can only contain " - "alphanumeric characters and '-'. " - "{} has invalid characters. 
" - "This sample will not be processed.").format(sample_id)) + # save a single sample sheet when all primer pairs are unique + make_sample_sheet(sample_sheet, "") if __name__ == "__main__": @@ -189,8 +275,8 @@ def assign_replicate(replicates): parser.add_argument("-s", "--sample-plates", help=("Sample plate file(s)."), nargs="*") - parser.add_argument("-t", "--sample-sheets", - help=("Finished sample sheet file(s)."), + parser.add_argument("-t", "--legacy-sheets", + help=("Legacy sample sheet file(s)."), nargs="*") parser.add_argument("-o", "--output-file", help=("Output file name."), @@ -210,9 +296,19 @@ def assign_replicate(replicates): help=("Reverse primer plate file."), default=("/opt/resources/sample_prep/" "reverse_plates.csv")) + parser.add_argument("-b", "--barcode-dictionary", + help="Path to sample barcode dictionary.", + default=("/opt/resources/sample_prep/" + "barcode_dict.pickle")) + parser.add_argument("-p", "--platform", + help="Sequencing platform", + required=True, + choices=["nextseq", "miseq"]) + parser.add_argument("-d", "--template-dir", + help="Directory containing sample sheet headers.", + default=("/opt/resources/templates/" + "sample_sheet_templates/")) + args = vars(parser.parse_args()) - sample_sheet_prep(args["capture_plates"], args["sample_plates"], - args["sample_sheets"], args["output_file"], - args["working_directory"], args["quadrants"], - args["forward_plates"], args["reverse_plates"]) + sample_sheet_prep(**args) diff --git a/src/wrangler_downsample_umi.py b/src/wrangler_downsample_umi.py new file mode 100644 index 0000000..7dd460e --- /dev/null +++ b/src/wrangler_downsample_umi.py @@ -0,0 +1,107 @@ +import argparse +from Bio import SeqIO +from itertools import chain +import os +import numpy as np +from random import sample +import re +import subprocess +from multiprocessing import Pool + +# Parse input arguments +parser = argparse.ArgumentParser( + description="""Downsample the number of UMIs sequenced per MIP.""" +) 
+parser.add_argument(
+    "-c",
+    "--cpu-count",
+    help="The number of available processors to use.",
+    default=1,
+    type=int,
+)
+parser.add_argument(
+    "-t",
+    "--downsample-threshold",
+    help="The threshold at which UMIs will be downsampled.",
+    default=2000,
+    type=int,
+)
+parser.add_argument(
+    "-w",
+    "--weighted",
+    action="store_true",
+    help="Whether to apply a weight when randomly sampling UMIs.",
+)
+parser.add_argument(
+    "file",
+    nargs="+",
+    help="The files on which to downsample the UMIs.",
+)
+args = vars(parser.parse_args())
+cpu_count = args["cpu_count"]
+downsample_threshold = int(args["downsample_threshold"])
+weighted = args["weighted"]
+
+# Remove empty first element from list
+if args["file"][0] == "":
+    args["file"] = args["file"][1:]
+
+
+def downsammple_fastq(file, downsample_threshold, weighted):
+    """Downsamples a gzip-compressed FASTQ file by removing UMIs.
+
+    Args:
+        file (str): The path of the gzip-compressed FASTQ file.
+        downsample_threshold (int): The number of UMIs kept. Files with no
+            more UMIs than this are only decompressed and recompressed.
+        weighted (bool): Whether to downsample, weighing by the read count of
+            each UMI, taken from the ``readCnt=N`` field of the record id.
+    """
+    # Unzip the file; check=True stops early if gzip itself fails
+    subprocess.run(["gzip", "-df", file], check=True)
+    unzipped = os.path.splitext(file)[0]
+
+    # Read the file
+    records = list(SeqIO.parse(unzipped, "fastq"))
+
+    # Randomly select a certain number of records. Either weigh by the read
+    # count, or just randomly select a sample.
+    if len(records) > downsample_threshold:
+        if weighted:
+            # Find the read count of each UMI; every id must carry exactly one
+            read_cnts = [re.findall("readCnt=(\\d+)", r.id) for r in records]
+            if any(len(cnt) != 1 for cnt in read_cnts):
+                raise ValueError("Expected one readCnt per UMI in " + file)
+
+            # Flatten, convert to int, and normalize once so weights sum to one
+            read_cnts = [int(x) for x in chain.from_iterable(read_cnts)]
+            total = sum(read_cnts)
+            weights = [x / total for x in read_cnts]
+
+            # Subset the list using weights
+            subset = [
+                records[i]
+                for i in np.random.choice(
+                    len(records), downsample_threshold, False, weights
+                )
+            ]
+        else:
+            subset = sample(records, downsample_threshold)
+
+        # Write the subsetted file (nothing to rewrite below the threshold)
+        SeqIO.write(subset, unzipped, "fastq")
+
+    # Zip the file
+    subprocess.run(["gzip", "-f", unzipped], check=True)
+
+
+# Create a pool of workers to parallelize process
+p = Pool(cpu_count)
+
+# Iterate over all the files input
+for file in args["file"]:
+    p.apply_async(downsammple_fastq, [file, downsample_threshold, weighted])
+
+# Close pool and wait for all child processes to terminate
+p.close()
+p.join()
diff --git a/unused_unversioned_environment.yaml b/unused_unversioned_environment.yaml new file mode 100644 index 0000000..00cae9f --- /dev/null +++ b/unused_unversioned_environment.yaml @@ -0,0 +1,59 @@ +name: base +channels: + - conda-forge + - bioconda + - r + - nodefaults +dependencies: + - basemap-data-hires + - bcftools + - bioconductor-dnacopy + - biopython + - bowtie2 + - bwa + - freebayes + - gatk4 + - htslib + - lastz + - matplotlib + - matplotlib-venn + - nbconvert=7.6.0 + - notebook + - numpy + - openpyxl + - pandas=1.2.3 + - parallel + - pip + - pip: + - mipscripts + - plotly + - plotnine + - primer3 + - primer3-py + - pysam + - python=3.9 + - r-base + - r-devtools + - r-dplyr + - r-dt + - r-epitools + - r-ggplot2 + - r-irkernel + - r-knitr + - r-pkgbuild + - r-plotly + - r-shiny + - rpy2 + - samtools + - scandir + - scikit-allel + - scikit-learn + - scipy + - seaborn + - seqtk=1.3 + - simplegeneric + - tblib + - texlive-core + - 
vcftools + - xlrd +prefix: /opt/conda diff --git a/user_scripts/README.md b/user_scripts/README.md new file mode 100644 index 0000000..1cf01db --- /dev/null +++ b/user_scripts/README.md @@ -0,0 +1,7 @@ +To run a pipeline, edit the corresponding yaml file in this directory, then open a terminal, navigate to this directory, and run the matching shell script with the command below. + +**Note that variant_calling.sh and check_run_stats.sh both read their parameters from the variant_calling.yaml file.** + +``` +bash <name_of_script>.sh +``` diff --git a/user_scripts/check_run_stats.sh b/user_scripts/check_run_stats.sh new file mode 100755 index 0000000..38550eb --- /dev/null +++ b/user_scripts/check_run_stats.sh @@ -0,0 +1,67 @@ +######################################################## +# README +# this file uses variant_calling.yaml for its parameters +######################################################### + + +################################################# +# set the ulimit high (necessary for big datasets) +################################################# +ulimit -n $(ulimit -Hn) + +################################################# +# set the home directory as the current working directory +################################################# +newhome=$(pwd -P) + +############################################### +# function to parse the yaml file edited by the user +# pulls out the location of the sif file, output directory, etc. 
+############################################ +function parse_yaml { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i