From 433c701fb5fa3130270ff1524af3c3de04b55e84 Mon Sep 17 00:00:00 2001 From: Daniel Dale Date: Thu, 10 Sep 2020 20:40:06 -0700 Subject: [PATCH] added ipfs-based current-statement explorer --- .ipynb_checkpoints/README-checkpoint.md | 149 +- README.md | 162 +- analysis/inference.py | 38 +- analysis/inference_utils.py | 19 + analysis/model_analysis_rpt.py | 20 +- configs/config_defaults.yaml | 12 +- configs/config_defaults_sql.yaml | 32 +- configs/dataprep_only.yaml | 6 +- configs/gen_report.yaml | 4 +- configs/gen_swa_ckpt.yaml | 4 +- configs/infsvc.yaml | 13 + configs/test_only.yaml | 9 + configs/test_report.yaml | 21 - configs/train_albertbase.yaml | 3 +- deep_classiflie.py | 22 +- deep_classiflie_infsvc.sh | 26 + docs/.ipynb_checkpoints/about-checkpoint.md | 133 +- docs/_config.yml | 2 +- docs/_layouts/default.html | 32 +- docs/about.md | 129 +- docs/assets/css/datatables.min.css | 18 + docs/assets/css/loading_bar.css | 18 + docs/assets/css/style.scss | 160 +- docs/assets/js/current_pred_nav.js | 152 + docs/assets/js/datatables.js | 17047 ++++++++++++++++++ docs/assets/js/datatables.min.js | 246 + docs/assets/js/loading_bar.js | 811 + docs/assets/sort_asc.png | Bin 0 -> 160 bytes docs/assets/sort_asc_disabled.png | Bin 0 -> 148 bytes docs/assets/sort_both.png | Bin 0 -> 201 bytes docs/assets/sort_desc.png | Bin 0 -> 158 bytes docs/assets/sort_desc_disabled.png | Bin 0 -> 146 bytes docs/assets/test_recs.json | 502 + docs/current_explorer.html | 39 + docs/perf_explorer.html | 1 + docs/pred_explorer.html | 1 + models/deep_classiflie_module.py | 7 +- requirements.txt | 8 +- training/trainer.py | 78 +- training/training_utils.py | 51 +- utils/constants.py | 10 + utils/core_utils.py | 19 +- utils/dc_infsvc.py | 241 + 43 files changed, 19778 insertions(+), 467 deletions(-) create mode 100644 configs/infsvc.yaml create mode 100644 configs/test_only.yaml delete mode 100644 configs/test_report.yaml create mode 100755 deep_classiflie_infsvc.sh create mode 100644 docs/assets/css/datatables.min.css create mode 100644 docs/assets/css/loading_bar.css create mode 100644 docs/assets/js/current_pred_nav.js create mode 100644 docs/assets/js/datatables.js create mode 100644 docs/assets/js/datatables.min.js create mode 100644 docs/assets/js/loading_bar.js create mode 100644 docs/assets/sort_asc.png create mode 100644 docs/assets/sort_asc_disabled.png create mode 100644 docs/assets/sort_both.png create mode 100644 docs/assets/sort_desc.png create mode 100644 docs/assets/sort_desc_disabled.png create mode 100644 docs/assets/test_recs.json create mode 100644 docs/current_explorer.html create mode 100644 utils/dc_infsvc.py diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md index 75eef12..e228096 100644 --- a/.ipynb_checkpoints/README-checkpoint.md +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -1,7 +1,9 @@ [![python badge](https://img.shields.io/badge/python->=3.7-brightgreen.svg)](https://shields.io/) [![pytorch badge](https://img.shields.io/badge/pytorch->=1.6.0-blue.svg)](https://shields.io/) +[![DOI](https://zenodo.org/badge/254935540.svg)](https://zenodo.org/badge/latestdoi/254935540) Deep Classiflie Logo -
Table of Contents + +
Table of Contents - [What is Deep Classiflie?](#what-is-deep-classiflie) @@ -23,9 +25,7 @@ --- ### What is Deep Classiflie? - Deep Classiflie is a framework for developing ML models that bolster fact-checking efficiency. Predominantly a research project[e](#ce), I plan to extend and maintain this framework in pursuing my own research interests so am sharing it in case it's of any utility to the broader community. -- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). For statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail), the [@DeepClassiflie](https://twitter.com/DeepClassiflie) twitter bot tweets out a statement analysis and model interpretation "report" such as the one below: - - Example tweet report +- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). - The Deep Classiflie POC model's predictions and performance on the most recent test set can be [explored](#model-exploration) and better understood using the [prediction explorer](pred_explorer.html): prediction explorer - and the [performance explorer](perf_explorer.html): @@ -52,20 +52,29 @@ --- ### Model Exploration + The best way to start understanding/exploring the current model is to use the explorers on deepclassiflie.org: -#### [Prediction Explorer](pred_explorer.html): -Explore randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about explorer [below.](##current-performance) +
Prediction Explorer + + + +[Explore](pred_explorer.html) randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about the explorer [below](#current-performance). prediction explorer +
-#### [Performance Explorer](perf_explorer.html): -Explore the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. +
Performance Explorer + + +[Explore](perf_explorer.html) the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. temporal performance explorer confidence bucket performance explorer +
--- + ### Core Components The entire initial Deep Classiflie system (raw dataset, model, analytics modules, twitter bot etc.) can be built from scratch using the publicly available code here.[2](#f2) @@ -75,31 +84,42 @@ The entire initial Deep Classiflie system (raw dataset, model, analytics modules | Component | Description | | ---- | --- | | [**deep_classiflie**](https://github.com/speediedan/deep_classiflie) | Core framework for building, training and analyzing fact-check facilitating ML models. Can operate independently from deep_classiflie_db when training a model using existing dataset collections or when performing inference. Depends on deep_classiflie_db for certain functions such as creating new dataset collections, running the tweetbot, running the analytics modules etc. [3](#f3) | -| [**deep_classiflie_db**](https://github.com/speediedan/deep_classiflie_db) | Backend data system for managing Deep Classiflie metadata, analyzing Deep Classiflie intermediate datasets and orchestrating Deep Classiflie model training pipelines. Includes data scraping modules for the initial model data sources (twitter, factba.se, washington post -- politifact and the toronto star were removed from an earlier version and may be re-added among others as models for other prominent politicians are explored) | +| [**deep_classiflie_db**](https://github.com/speediedan/deep_classiflie_db) | Backend datastore for managing Deep Classiflie metadata, analyzing Deep Classiflie intermediate datasets and orchestrating Deep Classiflie model training pipelines. Includes data scraping modules for the initial model data sources (twitter, factba.se, washington post -- politifact and the toronto star were removed from an earlier version and may be re-added among others as models for other prominent politicians are explored) | -[Dataset Generation](#data-pipeline) +
Dataset Generation + + - For simplicity, scrape "ground truth" falsehood labels from a single source ([Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) - Scrape a substantial fraction of public statements from multiple sources. ([Factba.se](https://factba.se), [Twitter](https://twitter.com)) - Use statement hashes and subword representations from a base model (ALBERT[8](#f8)) to remove "false" statements present in the larger "truths" corpus. - Prepare chronologically disjoint train/dev/test sets (to avoid data leakage) and attempt to reduce undesired superficial class-aligned distributional artifacts that could be leveraged during model training. NNs are lazy, they'll cheat if we let them. - -**Model Training** + +
+
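To make the chronologically disjoint split concrete, a minimal sketch follows (the record structure and field names are illustrative assumptions, not the dataprep module's API):

```python
# Illustrative sketch only: split statements chronologically so train/dev/test never
# overlap in time (avoiding leakage). The record layout here is a hypothetical stand-in.
from datetime import date
from typing import Dict, List, Tuple

def chrono_split(stmts: List[Dict], dev_start: date,
                 test_start: date) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    stmts = sorted(stmts, key=lambda s: s["sdate"])  # each record carries its statement date
    train = [s for s in stmts if s["sdate"] < dev_start]
    dev = [s for s in stmts if dev_start <= s["sdate"] < test_start]
    test = [s for s in stmts if s["sdate"] >= test_start]
    return train, dev, test
```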
Model Training + + - Fine-tune a base model (currently HuggingFace's [ALBERT implementation](https://huggingface.co/transformers/model_doc/albert.html) with some minor customizations) in tandem with a simple embedding reflecting the semantic shift associated with the medium via which the statement was conveyed (i.e., for the POC, just learn the tweet vs non-tweet transformation) (using [Pytorch](https://pytorch.org/)) -- Explore the latest model's training session on tensorboard.dev. +- Explore the latest model's training session on [tensorboard.dev](https://tensorboard.dev/experiment/rGNQpYnYSOaHb2A84xRAzw). - N.B. neuro-symbolic methods[6](#f6) that leverage knowledge bases and integrate symbolic reasoning with connectionist methods are not used in this model. Use of these approaches may be explored in [future research](#further-research) using this framework. - -**Analysis & Reporting** +
+
Analysis & Reporting + + - Interpret statement-level predictions using [captum's](https://captum.ai/) implementation of integrated gradients to visualize attributions of statement predictions to tokens in each statement. - Prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) -- Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold are published on twitter via the [@DeepClassiflie](https://twitter.com/DeepClassiflie) bot, which leverages [Tweepy](https://www.tweepy.org/) -- XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) +- Two inference daemons poll, analyze and classify new statements: + 1. (still in development) A daemon that publishes via IPFS pubsub, all new statement classifications and inference output. + 2. (currently available) Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold can be published on twitter via a twitter bot, which leverages [Tweepy](https://www.tweepy.org/). The bot [h](#ch) tweets out a statement analysis and model interpretation "report" such as the one below for statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail): + Example tweet report +- XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) +
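To make the threshold-gated publishing decision concrete, a minimal sketch follows (the threshold values mirror the statement-type publish thresholds estimated from the most recent test set, but the function and field names are illustrative assumptions rather than the tweetbot's actual implementation):

```python
# Illustrative sketch: publish a report only when the model's falsehood confidence for the
# statement's type (tweet vs non-tweet) meets that type's PPV-derived publish threshold.
# Threshold values are placeholders based on the most recent test set estimates.
PUBLISH_THRESHOLDS = {"tweet": 0.77, "nontweet": 0.96}

def should_publish(raw_confidence: float, stmt_type: str) -> bool:
    return raw_confidence >= PUBLISH_THRESHOLDS[stmt_type]
```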
--- ### Current Performance -
Global +
Global Global metrics[9](#f9) summarized in the table below relate to the current model's performance on a test set comprised of ~12K statements made between 2020-04-03 and 2020-07-08:
@@ -107,7 +127,7 @@ Global metrics[9](#f9) summarized in the table below relate t
-
Local +
Local To minimize false positives and maximize the model's utility, the following approach is used to issue high-confidence predictions: @@ -117,95 +137,64 @@ To minimize false positives and maximize the model's utility, the following appr * [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) * Positive prediction ratio: (bucket true positives + bucket false positives)/#statements in bucket * Bucket-level accuracy -3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket +3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket. See [caveats](#caveats) regarding recognized performance biases[a](#ca) * In the prediction explorer, randomly sample 100 statements (including all confusion matrix classes) from each of four confidence buckets: the maximum and minimum accuracy buckets for each statement type. Max Accuracy Non-Tweets
Max Accuracy Tweets -4. Use statement-type aligned (tweet vs non-tweet) PPV thresholds to estimate @DeepClassiflie's statement report publishing accuracy using performance from the most recent test set. See [caveats](#caveats) regarding recognized performance biases[a](#ca).
-
@DeepClassiflie Report Performance - - -Had @DeepClassiflie been publishing statement reports over the period comprising its test set (see above) using the current model, the publishing performance is expected to have been approximately characterized by the statistics below. See [caveats](#caveats) regarding the modest recognized performance biases[a](#ca). Now that report publishing has begun, once additional labeled data are available, the realized performance of the model will be similarly reported here.
- -
- -| Model Version | Period Days | Start Date | End Date | -| :--- | :---: | :---: | :---: | -| 20200816115426 | 96 | 04/03/2020 | 07/08/2020 | - -
-
- -| Statement Type | Publish Threshold | Stmts/max bucket | Bucket ppv | Bucket ppr| Est Reports Published | Estimated TP | Estimated FP | -| :--- | :---: | :---: | :---: | :---:| :---: | :---: | :---: | -Non-Tweets | 0.96 | 430 | 0.965 |1 | 430 |415 | 15 | -Tweets | 0.77 | 109 | 0.786 | 0.257 |28 | 22 | 6 | - -
-
- -| Period Estimate | Period total | Per day | -| :--- | :---: | :---: | -Non-tweet reports published | 430 | 4.48 | -Tweet reports published | 28 | 0.29 | -TP non-tweet reports published | 415 | 4.32 | -FP non-tweet reports published | 15 | 0.16 | -TP tweet reports published | 22 | 0.23 | -FP tweet reports published | 6 | 0.06 | -Projected report period non-tweet accuracy | 96.5% | -Projected report period tweet accuracy | 78.6% | -Projected report period global accuracy | 95.4% | - -
-
- - --- ### Noteworthy Features -#### Dataset generation: +
Dataset generation + + - Easily and extensively configurable using yaml [configuration files](#configuration). - Multiple different class balancing strategies available (oversampling, class ratios etc.) - "Weakly converge" superficially divergent class distributions using UnivariateDistReplicator abstraction - Easily experiment with different train/dev/test splits/configurations via declarative DatasetCollection definitions. +
-#### Model training: +
Model training + + - Automated recursive fine-tuning of the base model with a FineTuningScheduler abstraction - Configurable label-smoothing[4](#f4) - Generate and configure thawing schedules for models. - EarlyStopping easily configurable with multiple non-standard monitor metrics (e.g. mcc) - Both automatic and manually-specified [stochastic weight averaging](https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/) of model checkpoints[f](#cf) - mixed-precision training via [apex](https://github.com/NVIDIA/apex)[g](#cg) +
+
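The gradual unfreezing performed by the FineTuningScheduler can be sketched roughly as below (a simplified illustration under assumed parameter-group prefixes; the real schedule is declared in a thaw schedule yaml such as DeepClassiflie_thaw_schedule.yaml):

```python
# Simplified illustration of applying a thaw schedule: freeze everything, then unfreeze the
# parameter groups scheduled up to the current phase. Not the actual FineTuningScheduler.
import torch

def apply_thaw_phase(model: torch.nn.Module, thaw_schedule: list, phase: int) -> None:
    for p in model.parameters():
        p.requires_grad = False
    for group_prefix in thaw_schedule[: phase + 1]:
        for name, p in model.named_parameters():
            if name.startswith(group_prefix):
                p.requires_grad = True

# hypothetical schedule: thaw the classifier first, then the pooler, then deeper layers
example_schedule = ["classifier", "albert.pooler", "albert.encoder"]
```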
Analysis & reporting + -#### Analysis & reporting: - Extensive suite of reporting views for analyzing model performance at global and local levels - Statement and performance exploration dashboards for interpreting model predictions and understanding the model's performance - xkcd-themed visualization of UMAP-transformed statement embeddings +
--- ### Data Pipeline To conserve resources and for POC research expediency, the current pipeline uses a local relational DB (MariaDB). Ultimately, a distributed data store would be preferable and warranted if this project merits sufficient interest from the community or a POC involving a distributed network of models is initiated. -
Deep Classiflie Data Pipeline +
Deep Classiflie Data Pipeline ![Deep Classiflie Data Pipeline](docs/assets/deep_classiflie_data_pipeline.svg)
-
False Statement Filter Processes +
False Statement Filter Processes ![False Statement Filter Processes](docs/assets/False%20Statement%20Filter%20Processes.svg)
-
Distribution Convergence Process +
Distribution Convergence Process ![Distribution Convergence Process](docs/assets/Distribution%20Convergence%20Process.svg)
-
Dataset Generation Process +
Dataset Generation Process ![Dataset Generation Process](docs/assets/Dataset%20Generation%20Process.svg) @@ -243,7 +232,7 @@ The parameters used in all Deep Classiflie job executions related to the develop --- ### Model Replication -
Instructions (click to expand) +
Instructions N.B. before you begin, the core external dependency is admin access to a mariadb or mysql DB @@ -427,6 +416,7 @@ N.B. before you begin, the core external dependency is admin access to a mariadb --- ### Caveats +
  • [a] The distance threshold for filtering out "false truths" using base model embeddings matches falsehoods to their corresponding truths with high but imperfect accuracy. This fuzzy matching process will result in a modest upward performance bias in the test results. Model performance on datasets built using the noisy matching process (vs exclusively hash-based) improved by only ~2% globally, with gains going slightly disproportionately to the more confident buckets. This places a relatively low ceiling on the magnitude of the performance bias introduced through this filtering. The precise magnitude of this bias will be quantified in the future via one or both of the following methods:
    • @@ -434,26 +424,27 @@ N.B. before you begin, the core external dependency is admin access to a mariadb
    • when the next set of ground truth label data are released by the Washington Post, an estimated vs actual performance comparison can be performed
  • [b] The module used to bootstrap the POC model's tweet history by crawling factba.se needs to be refactored and added into the initial dataset bootstrap process. This is presently one of many issues in the backlog.
  • -
  • [c] Deep Classiflie depends upon deep_classiflie_db (initially released as a separate repository) for much of its analytical and dataset generation functionality but deep_classiflie_db is currently maintained as a separate repository here to maximize architectural flexibility. Depending on how Deep Classiflie evolves (e.g. as it supports distributed data stores etc.), it may make more sense to integrate deep_classiflie_db back into deep_classiflie.
  • +
  • [c] Deep Classiflie depends upon deep_classiflie_db (initially released as a separate repository) for much of its analytical and dataset generation functionality. Depending on how Deep Classiflie evolves (e.g. as it supports distributed data stores etc.), it may make more sense to integrate deep_classiflie_db back into deep_classiflie.
  • [d] Notably, the model suffers a much higher FP ratio on tweets than on non-tweets. Exploring tweet FPs suggests a number of plausible explanations for this discrepancy, which could be investigated in future research.
  • [e] Still in early development, there are significant outstanding issues (e.g. no tests yet!) and code quality shortcomings galore, but any constructive thoughts or contributions are welcome. I'm interested in using ML to curtail disinformation, not promulgate it, so I want to be clear -- this is essentially a fancy sentence similarity system with a lot of work put into building the dataset generation and model analysis data pipelines (I have a data engineering background, not a software engineering one).
  • [f] Current model release built/tested before swa graduated from torchcontrib to core pytorch. Next release of Deep Classiflie will use the integrated swa api.
  • [g] Current model release built/tested before AMP was integrated into core pytorch. Next release of Deep Classiflie will use the integrated AMP api.
  • +
  • [h] N.B. This daemon may violate Twitter's policy w.r.t. tweeting sensitive content if the subject's statements contain such content (no content-based filtering is included in the daemon). @DeepClassiflie initially tested the Deep Classiflie twitter daemon but will post only framework-related announcements moving forward.
--- ### Citing Deep Classiflie Please cite: ```tex -@misc{Dan_Dale_2020_tbd, - author = {Dan Dale}, - title = {{Deep Classiflie: Shallow fact-checking with deep neural networks}}, - month = aug, - year = 2020, - doi = {tbd/zenodo.tbd}, - version = {1.0}, - publisher = {Zenodo}, - url = {https://doi.org/tbd/zenodo.tbd} +@misc{Dan_Dale_2020_3995079, + author = {Dan Dale}, + title = {{Deep Classiflie: Shallow fact-checking with deep neural networks}}, + month = aug, + year = 2020, + doi = {10.5281/zenodo.3995079}, + version = {v0.1.0-alpha}, + publisher = {Zenodo}, + url = {https://zenodo.org/record/3995079} } ``` Feel free to star the repo as well if you find it useful or interesting. Thanks! @@ -475,6 +466,4 @@ Feel free to star the repo as well if you find it useful or interesting. Thanks! --- ### License -[![License](http://img.shields.io/:license-mit-blue.svg?style=flat-square)](http://badges.mit-license.org) -- **[MIT license](http://opensource.org/licenses/mit-license.php)** - +[![License](https://img.shields.io/:license-mit-blue.svg?style=flat-square)](https://badges.mit-license.org) \ No newline at end of file diff --git a/README.md b/README.md index 7b62998..7801797 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![pytorch badge](https://img.shields.io/badge/pytorch->=1.6.0-blue.svg)](https://shields.io/) [![DOI](https://zenodo.org/badge/254935540.svg)](https://zenodo.org/badge/latestdoi/254935540) Deep Classiflie Logo +
Table of Contents @@ -24,10 +25,12 @@ --- ### What is Deep Classiflie? - Deep Classiflie is a framework for developing ML models that bolster fact-checking efficiency. Predominantly a research project[e](#ce), I plan to extend and maintain this framework in pursuing my own research interests so am sharing it in case it's of any utility to the broader community. -- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). For statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail), the [@DeepClassiflie](https://twitter.com/DeepClassiflie) twitter bot tweets out a statement analysis and model interpretation "report" such as the one below: +- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). +- The Deep Classiflie POC model's [current predictions](current_explorer.html) and performance on the most recent test set can be [explored](#model-exploration) and better understood using +the [current prediction explorer](current_explorer.html): - Example tweet report -- The Deep Classiflie POC model's predictions and performance on the most recent test set can be [explored](#model-exploration) and better understood using the [prediction explorer](pred_explorer.html): + current prediction explorer +- the [prediction explorer](pred_explorer.html): prediction explorer - and the [performance explorer](perf_explorer.html): @@ -53,20 +56,40 @@ --- ### Model Exploration + The best way to start understanding/exploring the current model is to use the explorers on deepclassiflie.org: -#### [Prediction Explorer](pred_explorer.html): -Explore randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about explorer [below.](##current-performance) +
Prediction Explorer + + + +[Explore](pred_explorer.html) randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about the explorer [below](#current-performance). prediction explorer +
+ +
Performance Explorer + -#### [Performance Explorer](perf_explorer.html): -Explore the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. +[Explore](perf_explorer.html) the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. temporal performance explorer confidence bucket performance explorer +
+ +
[Current Predictions Explorer](current_explorer.html) + + +Explore the current (unlabeled) predictions generated by the latest model incarnation. All statements yet to be labeled by current fact-checking sources (currently, only [Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) are available. +Live predictions are continuously added via [ipfs](https://ipfs.io). Twitter statements will be delayed by ~15 minutes to allow thread-based scoring. [Factba.se](https://factba.se) is polled for new statements every 10 minutes. +This explorer provides fact-checkers with a means (one of many possible) of using current model predictions and may also help those building fact-checking systems evaluate the potential utility of integrating similar models into their systems. + + +current predictions explorer +
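For readers who want to consume the published predictions outside the explorer UI, a minimal sketch of fetching the latest pinned prediction set from a public IPFS gateway follows (the gateway URL, CID handling and record fields are illustrative assumptions, not a documented API):

```python
# Illustrative sketch: fetch a pinned prediction set from a public IPFS gateway.
# The gateway choice and record layout are hypothetical.
import requests

GATEWAY = "https://gateway.pinata.cloud/ipfs"

def fetch_current_predictions(cid: str) -> list:
    resp = requests.get(f"{GATEWAY}/{cid}", timeout=30)
    resp.raise_for_status()
    return resp.json()  # e.g. a list of statement records with prediction + confidence fields
```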
--- + ### Core Components The entire initial Deep Classiflie system (raw dataset, model, analytics modules, twitter bot etc.) can be built from scratch using the publicly available code here.[2](#f2) @@ -76,31 +99,45 @@ The entire initial Deep Classiflie system (raw dataset, model, analytics modules | Component | Description | | ---- | --- | | [**deep_classiflie**](https://github.com/speediedan/deep_classiflie) | Core framework for building, training and analyzing fact-check facilitating ML models. Can operate independently from deep_classiflie_db when training a model using existing dataset collections or when performing inference. Depends on deep_classiflie_db for certain functions such as creating new dataset collections, running the tweetbot, running the analytics modules etc. [3](#f3) | -| [**deep_classiflie_db**](https://github.com/speediedan/deep_classiflie_db) | Backend data system for managing Deep Classiflie metadata, analyzing Deep Classiflie intermediate datasets and orchestrating Deep Classiflie model training pipelines. Includes data scraping modules for the initial model data sources (twitter, factba.se, washington post -- politifact and the toronto star were removed from an earlier version and may be re-added among others as models for other prominent politicians are explored) | +| [**deep_classiflie_db**](https://github.com/speediedan/deep_classiflie_db) | Backend datastore for managing Deep Classiflie metadata, analyzing Deep Classiflie intermediate datasets and orchestrating Deep Classiflie model training pipelines. Includes data scraping modules for the initial model data sources (twitter, factba.se, washington post -- politifact and the toronto star were removed from an earlier version and may be re-added among others as models for other prominent politicians are explored) | -[Dataset Generation](#data-pipeline) +
Dataset Generation + + - For simplicity, scrape "ground truth" falsehood labels from a single source ([Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) - Scrape a substantial fraction of public statements from multiple sources. ([Factba.se](https://factba.se), [Twitter](https://twitter.com)) - Use statement hashes and subword representations from a base model (ALBERT[8](#f8)) to remove "false" statements present in the larger "truths" corpus. - Prepare chronologically disjoint train/dev/test sets (to avoid data leakage) and attempt to reduce undesired superficial class-aligned distributional artifacts that could be leveraged during model training. NNs are lazy, they'll cheat if we let them. - -**Model Training** + +
+
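The "false truth" filtering step can be sketched roughly as follows (a simplified illustration under assumed data structures and an assumed distance threshold; the real pipeline uses the base model's subword representations and a tuned threshold):

```python
# Rough sketch of the two-stage "false truth" filter: drop truths whose hash matches a known
# falsehood, then drop truths whose embedding lies within a cosine-distance threshold of any
# falsehood embedding. Data structures and the threshold value are illustrative only.
import hashlib
import numpy as np

def stmt_hash(text: str) -> str:
    return hashlib.sha256(text.strip().lower().encode()).hexdigest()

def filter_false_truths(truths, falsehoods, truth_embeds, false_embeds, dist_threshold=0.15):
    false_hashes = {stmt_hash(f) for f in falsehoods}
    kept = []
    for text, emb in zip(truths, truth_embeds):
        if stmt_hash(text) in false_hashes:
            continue  # exact (hash) duplicate of a labeled falsehood
        sims = false_embeds @ emb / (np.linalg.norm(false_embeds, axis=1) * np.linalg.norm(emb) + 1e-12)
        if 1.0 - sims.max() < dist_threshold:
            continue  # too close to a known falsehood; treat as a probable "false truth"
        kept.append(text)
    return kept
```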
Model Training + + - Fine-tune a base model (currently HuggingFace's [ALBERT implementation](https://huggingface.co/transformers/model_doc/albert.html) with some minor customizations) in tandem with a simple embedding reflecting the semantic shift associated with the medium via which the statement was conveyed (i.e., for the POC, just learn the tweet vs non-tweet transformation) (using [Pytorch](https://pytorch.org/)) - Explore the latest model's training session on [tensorboard.dev](https://tensorboard.dev/experiment/rGNQpYnYSOaHb2A84xRAzw). - N.B. neuro-symbolic methods[6](#f6) that leverage knowledge bases and integrate symbolic reasoning with connectionist methods are not used in this model. Use of these approaches may be explored in [future research](#further-research) using this framework. - -**Analysis & Reporting** +
+
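The medium-aware fine-tuning idea can be sketched as follows (a minimal illustration of combining ALBERT's pooled output with a learned tweet/non-tweet embedding; this is not the actual deep_classiflie module, and the dimensions and combination strategy are assumptions):

```python
# Minimal sketch: ALBERT pooled output concatenated with a learned "context type"
# (tweet vs non-tweet) embedding, feeding a single-logit falsehood classifier.
# Not the actual deep_classiflie module; dimensions are illustrative.
import torch
from torch import nn
from transformers import AlbertModel

class MediumAwareClassifier(nn.Module):
    def __init__(self, ctxt_embed_dim: int = 32):
        super().__init__()
        self.albert = AlbertModel.from_pretrained("albert-base-v2")
        self.ctxt_embed = nn.Embedding(2, ctxt_embed_dim)  # 0 = non-tweet, 1 = tweet
        self.classifier = nn.Linear(self.albert.config.hidden_size + ctxt_embed_dim, 1)

    def forward(self, input_ids, attention_mask, ctxt_type):
        pooled = self.albert(input_ids=input_ids, attention_mask=attention_mask,
                             return_dict=True).pooler_output
        ctxt = self.ctxt_embed(ctxt_type)
        return torch.sigmoid(self.classifier(torch.cat([pooled, ctxt], dim=-1))).squeeze(-1)
```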
Analysis & Reporting + + - Interpret statement-level predictions using [captum's](https://captum.ai/) implementation of integrated gradients to visualize attributions of statement predictions to tokens in each statement. -- Prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) -- Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold are published on twitter via the [@DeepClassiflie](https://twitter.com/DeepClassiflie) bot, which leverages [Tweepy](https://www.tweepy.org/) +- Test set prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) +- The [current prediction explorer](current_explorer.html) was built using [datatables](https://datatables.net/) and [ipfs](https://ipfs.io) with pinning provided by [pinata](https://pinata.cloud/) +- Two inference daemons poll, analyze and classify new statements: + 1. A daemon that publishes via [IPFS](https://ipfs.io) all new statement classifications and inference output. + + current predictions explorer + 2. Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold can be published on twitter via a twitter bot, which leverages [Tweepy](https://www.tweepy.org/). The bot [h](#ch) tweets out a statement analysis and model interpretation "report" such as the one below for statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail): + + Example tweet report - XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) - +
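A rough sketch of the IPFS publishing step follows (it assumes Pinata's pinJSONToIPFS endpoint with header-based key auth; credential handling, retries, DB bookkeeping and the record schema are simplified relative to the actual inference service):

```python
# Rough sketch: pin the latest batch of statement classifications to IPFS via Pinata's
# pinJSONToIPFS endpoint and return the resulting CID. Simplified relative to the
# deployed daemon (no retries, credential lookup, or pinned-CID bookkeeping).
import requests

PINATA_PIN_JSON_URL = "https://api.pinata.cloud/pinning/pinJSONToIPFS"

def publish_predictions(records: list, api_key: str, api_secret: str) -> str:
    headers = {"pinata_api_key": api_key, "pinata_secret_api_key": api_secret}
    resp = requests.post(PINATA_PIN_JSON_URL, json={"pinataContent": records},
                         headers=headers, timeout=60)
    resp.raise_for_status()
    return resp.json()["IpfsHash"]  # content identifier (CID) of the pinned prediction set
```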
--- ### Current Performance -
Global +
Global Global metrics[9](#f9) summarized in the table below relate to the current model's performance on a test set comprised of ~12K statements made between 2020-04-03 and 2020-07-08:
@@ -108,7 +145,7 @@ Global metrics[9](#f9) summarized in the table below relate t
-
Local +
Local To minimize false positives and maximize the model's utility, the following approach is used to issue high-confidence predictions: @@ -118,95 +155,65 @@ To minimize false positives and maximize the model's utility, the following appr * [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) * Positive prediction ratio: (bucket true positives + bucket false positives)/#statements in bucket * Bucket-level accuracy -3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket +3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket. See [caveats](#caveats) regarding recognized performance biases[a](#ca) * In the prediction explorer, randomly sample 100 statements (including all confusion matrix classes) from each of four confidence buckets: the maximum and minimum accuracy buckets for each statement type. Max Accuracy Non-Tweets
Max Accuracy Tweets -4. Use statement-type aligned (tweet vs non-tweet) PPV thresholds to estimate @DeepClassiflie's statement report publishing accuracy using performance from the most recent test set. See [caveats](#caveats) regarding recognized performance biases[a](#ca).
-
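A compact sketch of the bucket-level metric computation described above (illustrative only; the equal-sized bucketing and record layout are assumptions, not the project's reporting views or SQL):

```python
# Illustrative sketch of the confidence-bucket metrics: sort test set predictions by
# confidence, split into 20 buckets, and compute PPV, positive prediction ratio and
# accuracy per bucket.
from typing import Dict, List, Tuple

def bucket_metrics(preds: List[Tuple[float, int, int]], n_buckets: int = 20) -> List[Dict]:
    # each tuple: (confidence, predicted_label, true_label), with 1 == falsehood
    ranked = sorted(preds, key=lambda p: p[0])
    size = max(1, len(ranked) // n_buckets)
    out = []
    for b in range(n_buckets):
        lo = b * size
        hi = (b + 1) * size if b < n_buckets - 1 else len(ranked)
        bucket = ranked[lo:hi]
        if not bucket:
            continue
        tp = sum(1 for _, yhat, y in bucket if yhat == 1 and y == 1)
        fp = sum(1 for _, yhat, y in bucket if yhat == 1 and y == 0)
        correct = sum(1 for _, yhat, y in bucket if yhat == y)
        out.append({"ppv": tp / (tp + fp) if (tp + fp) else None,
                    "positive_prediction_ratio": (tp + fp) / len(bucket),
                    "accuracy": correct / len(bucket)})
    return out
```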
@DeepClassiflie Report Performance - - -Had @DeepClassiflie been publishing statement reports over the period comprising its test set (see above) using the current model, the publishing performance is expected to have been approximately characterized by the statistics below. See [caveats](#caveats) regarding the modest recognized performance biases[a](#ca). Now that report publishing has begun, once additional labeled data are available, the realized performance of the model will be similarly reported here.
- -
- -| Model Version | Period Days | Start Date | End Date | -| :--- | :---: | :---: | :---: | -| 20200816115426 | 96 | 04/03/2020 | 07/08/2020 | - -
-
- -| Statement Type | Publish Threshold | Stmts/max bucket | Bucket ppv | Bucket ppr| Est Reports Published | Estimated TP | Estimated FP | -| :--- | :---: | :---: | :---: | :---:| :---: | :---: | :---: | -Non-Tweets | 0.96 | 430 | 0.965 |1 | 430 |415 | 15 | -Tweets | 0.77 | 109 | 0.786 | 0.257 |28 | 22 | 6 | - -
-
- -| Period Estimate | Period total | Per day | -| :--- | :---: | :---: | -Non-tweet reports published | 430 | 4.48 | -Tweet reports published | 28 | 0.29 | -TP non-tweet reports published | 415 | 4.32 | -FP non-tweet reports published | 15 | 0.16 | -TP tweet reports published | 22 | 0.23 | -FP tweet reports published | 6 | 0.06 | -Projected report period non-tweet accuracy | 96.5% | -Projected report period tweet accuracy | 78.6% | -Projected report period global accuracy | 95.4% | - -
-
- - --- ### Noteworthy Features -#### Dataset generation: +
Dataset generation + + - Easily and extensively configurable using yaml [configuration files](#configuration). - Multiple different class balancing strategies available (oversampling, class ratios etc.) - "Weakly converge" superficially divergent class distributions using UnivariateDistReplicator abstraction - Easily experiment with different train/dev/test splits/configurations via declarative DatasetCollection definitions. +
-#### Model training: +
Model training + + - Automated recursive fine-tuning of the base model with a FineTuningScheduler abstraction - Configurable label-smoothing[4](#f4) - Generate and configure thawing schedules for models. - EarlyStopping easily configurable with multiple non-standard monitor metrics (e.g. mcc) - Both automatic and manually-specified [stochastic weight averaging](https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/) of model checkpoints[f](#cf) - mixed-precision training via [apex](https://github.com/NVIDIA/apex)[g](#cg) +
+
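The manually-specified checkpoint averaging (see build_swa_from_ckpts in gen_swa_ckpt.yaml) amounts to a simple element-wise parameter average; a minimal sketch under an assumed checkpoint layout:

```python
# Minimal sketch of stochastic weight averaging over saved checkpoints: load each state
# dict and average parameters element-wise. Assumes each checkpoint stores a "state_dict"
# key; the project's actual (torchcontrib-era) SWA handling differs in detail.
import torch

def average_checkpoints(ckpt_paths: list) -> dict:
    avg_state = None
    for path in ckpt_paths:
        state = torch.load(path, map_location="cpu")["state_dict"]
        if avg_state is None:
            avg_state = {k: v.clone().float() for k, v in state.items()}
        else:
            for k in avg_state:
                avg_state[k] += state[k].float()
    return {k: v / len(ckpt_paths) for k, v in avg_state.items()}
```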
Analysis & reporting + -#### Analysis & reporting: - Extensive suite of reporting views for analyzing model performance at global and local levels +- A [current prediction explorer](current_explorer.html) that provides fact-checkers with a means (one of many possible) of using current model predictions. This dashboard may also help those building fact-checking systems evaluate the potential utility of integrating similar models into their systems. - Statement and performance exploration dashboards for interpreting model predictions and understanding the model's performance - xkcd-themed visualization of UMAP-transformed statement embeddings +
--- ### Data Pipeline To conserve resources and for POC research expediency, the current pipeline uses a local relational DB (MariaDB). Ultimately, a distributed data store would be preferable and warranted if this project merits sufficient interest from the community or a POC involving a distributed network of models is initiated. -
Deep Classiflie Data Pipeline +
Deep Classiflie Data Pipeline ![Deep Classiflie Data Pipeline](docs/assets/deep_classiflie_data_pipeline.svg)
-
False Statement Filter Processes +
False Statement Filter Processes ![False Statement Filter Processes](docs/assets/False%20Statement%20Filter%20Processes.svg)
-
Distribution Convergence Process +
Distribution Convergence Process ![Distribution Convergence Process](docs/assets/Distribution%20Convergence%20Process.svg)
-
Dataset Generation Process +
Dataset Generation Process ![Dataset Generation Process](docs/assets/Dataset%20Generation%20Process.svg) @@ -230,6 +237,7 @@ The parameters used in all Deep Classiflie job executions related to the develop | **gen_dashboards.yaml** | parameters used to generate model analysis dashboards | | **cust_predict.yaml** | parameters used to perform model inference on arbitrary input statements | | **tweetbot.yaml** | parameters used to run the tweetbot behind @DeepClassiflie | +| **infsvc.yaml** | parameters used to run the inference service behind the current prediction explorer | @@ -244,7 +252,7 @@ The parameters used in all Deep Classiflie job executions related to the develop --- ### Model Replication -
Instructions (click to expand) +
Instructions N.B. before you begin, the core external dependency is admin access to a mariadb or mysql DB @@ -428,6 +436,7 @@ N.B. before you begin, the core external dependency is admin access to a mariadb --- ### Caveats +
  • [a] The distance threshold for filtering out "false truths" using base model embeddings matches falsehoods to their corresponding truths with high but imperfect accuracy. This fuzzy matching process will result in a modest upward performance bias in the test results. Model performance on datasets built using the noisy matching process (vs exclusively hash-based) improved by only ~2% globally, with gains going slightly disproportionately to the more confident buckets. This places a relatively low ceiling on the magnitude of the performance bias introduced through this filtering. The precise magnitude of this bias will be quantified in the future via one or both of the following methods:
    • @@ -435,11 +444,12 @@ N.B. before you begin, the core external dependency is admin access to a mariadb
    • when the next set of ground truth label data are released by the Washington Post, an estimated vs actual performance comparison can be performed
  • [b] The module used to bootstrap the POC model's tweet history by crawling factba.se needs to be refactored and added into the initial dataset bootstrap process. This is presently one of many issues in the backlog.
  • -
  • [c] Deep Classiflie depends upon deep_classiflie_db (initially released as a separate repository) for much of its analytical and dataset generation functionality but deep_classiflie_db is currently maintained as a separate repository here to maximize architectural flexibility. Depending on how Deep Classiflie evolves (e.g. as it supports distributed data stores etc.), it may make more sense to integrate deep_classiflie_db back into deep_classiflie.
  • +
  • [c] Deep Classiflie depends upon deep_classiflie_db (initially released as a separate repository) for much of its analytical and dataset generation functionality. Depending on how Deep Classiflie evolves (e.g. as it supports distributed data stores etc.), it may make more sense to integrate deep_classiflie_db back into deep_classiflie.
  • [d] Notably, the model suffers a much higher FP ratio on tweets than on non-tweets. Exploring tweet FPs suggests a number of plausible explanations for this discrepancy, which could be investigated in future research.
  • [e] Still in early development, there are significant outstanding issues (e.g. no tests yet!) and code quality shortcomings galore, but any constructive thoughts or contributions are welcome. I'm interested in using ML to curtail disinformation, not promulgate it, so I want to be clear -- this is essentially a fancy sentence similarity system with a lot of work put into building the dataset generation and model analysis data pipelines (I have a data engineering background, not a software engineering one).
  • [f] Current model release built/tested before swa graduated from torchcontrib to core pytorch. Next release of Deep Classiflie will use the integrated swa api.
  • [g] Current model release built/tested before AMP was integrated into core pytorch. Next release of Deep Classiflie will use the integrated AMP api.
  • +
  • [h] N.B. This daemon may violate Twitter's policy w.r.t. tweeting sensitive content if the subject's statements contain such content (no content-based filtering is included in the daemon). @DeepClassiflie initially tested the Deep Classiflie twitter daemon but will post only framework-related announcements moving forward.
--- @@ -447,14 +457,14 @@ N.B. before you begin, the core external dependency is admin access to a mariadb Please cite: ```tex @misc{Dan_Dale_2020_3995079, - author = {Dan Dale}, - title = {{Deep Classiflie: Shallow fact-checking with deep neural networks}}, - month = aug, - year = 2020, - doi = {10.5281/zenodo.3995079}, - version = {v0.1.0-alpha}, - publisher = {Zenodo}, - url = {https://zenodo.org/record/3995079} + author = {Dan Dale}, + title = {{Deep Classiflie: Shallow fact-checking with deep neural networks}}, + month = aug, + year = 2020, + doi = {10.5281/zenodo.3995079}, + version = {v0.1.0-alpha}, + publisher = {Zenodo}, + url = {https://zenodo.org/record/3995079} } ``` Feel free to star the repo as well if you find it useful or interesting. Thanks! @@ -476,4 +486,4 @@ Feel free to star the repo as well if you find it useful or interesting. Thanks! --- ### License -[![License](https://img.shields.io/:license-mit-blue.svg?style=flat-square)](https://badges.mit-license.org) +[![License](https://img.shields.io/:license-mit-blue.svg?style=flat-square)](https://badges.mit-license.org) \ No newline at end of file diff --git a/analysis/inference.py b/analysis/inference.py index 85cd4d7..6b1be24 100644 --- a/analysis/inference.py +++ b/analysis/inference.py @@ -7,20 +7,16 @@ import utils.constants as constants from models.deep_classiflie_module import DeepClassiflie -from analysis.inference_utils import tokens_to_sentence, gen_embed_mappings, prep_mapping_tups, \ - prep_base_mapping_tups, pred_inputs_from_config, pred_inputs_from_test, prep_rpt_tups, prep_pred_exp_tups +from analysis.inference_utils import tokens_to_sentence, gen_embed_mappings, prep_mapping_tups, prep_base_mapping_tups,\ + pred_inputs_from_config, pred_inputs_from_test, prep_rpt_tups, prep_pred_exp_tups, prep_batchpred_tups from utils.core_utils import log_config +from torch.utils.data import DataLoader from training.training_utils import load_ckpt from analysis.inference_utils import InferenceSession from analysis.interpretation import InterpretTransformer logger = logging.getLogger(constants.APP_NAME) -try: - from apex import amp -except ImportError as error: - logger.debug(f"{error.__class__.__name__}: No apex module found, fp16 will not be available.") - class Inference(object): def __init__(self, config: MutableMapping, mapping_set: List[Tuple] = None, @@ -30,7 +26,7 @@ def __init__(self, config: MutableMapping, mapping_set: List[Tuple] = None, self.inf_session = InferenceSession(config, mapping_set, analysis_set, pred_exp_set, rpt_type, base_mode) def init_predict(self, model: torch.nn.Module = None, ckpt: str = None, tokenizer: PreTrainedTokenizer = None, - eval_tuple: Tuple = None) -> Union[Tuple[List[Tuple], Optional[Dict]], Dict, List[Tuple]]: + eval_tuple: Tuple = None) -> Union[Tuple[List[Tuple], Optional[Dict]], Dict, List[Tuple], List]: ckpt = self.init_predict_model(model, ckpt) self.init_predict_tokenizer(tokenizer, ckpt) self.config_interpretation() @@ -41,6 +37,9 @@ def init_predict(self, model: torch.nn.Module = None, ckpt: str = None, tokenize return self.pred_exp_viz(pred_inputs) elif self.inf_session.mapping_set: return gen_embed_mappings(self.inf_session, pred_inputs) + elif self.inf_session.config.experiment.infsvc.enabled: + inf_outputs = self.batch_predict(pred_inputs) + return inf_outputs elif self.inf_session.config.inference.interpret_preds and self.inf_session.config.experiment.tweetbot.enabled: unpublished_reports = self.predict_viz(pred_inputs) return unpublished_reports @@ -86,7 +85,7 @@ 
def config_interpretation(self) -> None: self.inf_session.model, self.inf_session.tokenizer, self.inf_session.device, pred_report_path) - def prep_pred_inputs(self, eval_tuple: Tuple) -> List[Dict]: + def prep_pred_inputs(self, eval_tuple: Tuple) -> Union[List[Dict], DataLoader]: if not (eval_tuple or self.inf_session.config.inference.pred_inputs or self.inf_session.analysis_set or self.inf_session.mapping_set or self.inf_session.pred_exp_set): raise ValueError("init_predict must be provided inputs via either test set samples," @@ -99,6 +98,8 @@ def prep_pred_inputs(self, eval_tuple: Tuple) -> List[Dict]: pred_inputs = prep_base_mapping_tups(self.inf_session) elif self.inf_session.mapping_set: pred_inputs = prep_mapping_tups(self.inf_session) + elif self.inf_session.config.experiment.infsvc.enabled: + pred_inputs = prep_batchpred_tups(self.inf_session) elif eval_tuple: num_samples = self.inf_session.config.inference.sample_predictions pred_inputs = pred_inputs_from_test(self.inf_session, eval_tuple, num_samples) @@ -127,6 +128,23 @@ def predict(self, pred_inputs: List[Dict]) -> None: f"PREDICTION: {prob} ({round(prob)}), actual label: {round(label)}" f" INPUT: {parsed_sent} ") + def batch_predict(self, pred_inputs: DataLoader) -> List: + self.inf_session.model.set_interpret_mode() + batch_inf_outputs = [] + pred_batch_iterator = tqdm.tqdm(pred_inputs, desc="Batch") + for i, batch in enumerate(pred_batch_iterator): + batch = tuple(t.to(self.inf_session.device) for t in batch) + with torch.no_grad(): + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'token_type_ids': batch[2], + 'position_ids': batch[3], + 'ctxt_type': batch[4], + 'labels': None} + probs = (self.inf_session.model(**inputs)) + batch_inf_outputs.extend([round(p.squeeze(0).item(), 4) for p in probs]) + return batch_inf_outputs + def predict_viz(self, pred_inputs: List[Dict]) -> List[Tuple]: for sample in tqdm.tqdm(pred_inputs, desc=f'Interpreting {len(pred_inputs)} ' f'predictions and generating per-prediction reports'): @@ -179,8 +197,6 @@ def gen_model_rpt(self, pred_inputs: List[Dict]) -> Tuple[List[Tuple], Dict]: for sample in tqdm.tqdm(pred_inputs, desc=f"Generating report using {len(pred_inputs)} samples"): input_embedding, inputs, probs, token_list, prob = self.pass_interpretable_inputs(sample) token_list = list(filter(lambda l: l not in self.inf_session.special_token_mask, token_list)) - # all records should have a label ("True" unless explicitly labeled false by wapo) unless - # using "gt, ground truth" version of scoring (model_rpt_all_tweet_data_gt) label = sample['labels'].item() if sample['labels'] in [0, 1] else None parsed_sent = tokens_to_sentence(self.inf_session, token_list) # include only training data in the statement embedding diff --git a/analysis/inference_utils.py b/analysis/inference_utils.py index 30d07e6..024cd41 100644 --- a/analysis/inference_utils.py +++ b/analysis/inference_utils.py @@ -6,6 +6,7 @@ import torch from transformers import AlbertTokenizer import tqdm +from torch.utils.data import DataLoader, SequentialSampler, TensorDataset import utils.constants as constants from analysis.interpretation import InterpretTransformer @@ -155,6 +156,24 @@ def pred_inputs_from_config(inf_session: InferenceSession) -> List[Dict]: return pred_inputs +def prep_batchpred_tups(inf_session: InferenceSession) -> DataLoader: + pred_inputs = {'all_input_ids': [], 'all_attention_masks': [], 'all_token_type_ids': [], + 'all_position_ids': [], 'all_ctxt_types': []} + for i, (parentid, childid, ex, 
ctxt_type, _) in enumerate(inf_session.config.inference.pred_inputs): + input_ids, attention_mask, token_type_ids, position_ids = prep_model_inputs(inf_session, ex) + for k, v in zip(pred_inputs.keys(), [input_ids, attention_mask, token_type_ids, position_ids, ctxt_type]): + # noinspection PyUnresolvedReferences + pred_inputs[k].append(v) + for k, v in pred_inputs.items(): + pred_inputs[k] = torch.tensor([f for f in v], dtype=torch.float) if k == 'all_ctxt_types' else \ + torch.tensor([f for f in v], dtype=torch.long) + pred_dataset = TensorDataset(*list(pred_inputs.values())) + pred_sampler = SequentialSampler(pred_dataset) + pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, + batch_size=inf_session.config.experiment.infsvc.batch_size) + return pred_dataloader + + def prep_base_mapping_tups(inf_session: InferenceSession) -> List[Dict]: pred_inputs = [] for i, (tup_id, ex, _) in tqdm.tqdm(enumerate(inf_session.mapping_set), diff --git a/analysis/model_analysis_rpt.py b/analysis/model_analysis_rpt.py index 4666b13..02db524 100644 --- a/analysis/model_analysis_rpt.py +++ b/analysis/model_analysis_rpt.py @@ -10,7 +10,7 @@ import constants as db_constants from db_ingest import get_cnxp_handle from analysis.inference import Inference -from db_utils import fetchallwrapper, batch_execute_many +from db_utils import fetchallwrapper, batch_execute_many, single_execute from analysis.interpretation_utils import load_cache from analysis.gen_pred_explorer import build_pred_exp_doc from analysis.gen_perf_explorer import build_perf_exp_doc @@ -102,17 +102,25 @@ def gen_report(self, rpt_type: str) -> None: self.config.data_source.train_start_date = datetime.datetime.combine(ds_meta[1], datetime.time()) self.config.data_source.train_end_date = datetime.datetime.combine(ds_meta[2], datetime.time()) rpt_tups, stmt_embed_dict = Inference(self.config, analysis_set=analysis_set, rpt_type=rpt_type).init_predict() - inserted_rowcnt, error_rows = batch_execute_many(self.cnxp.get_connection(), - self.config.inference.sql.save_model_sql, rpt_tups) - logger.info(f"Generated {inserted_rowcnt} inference records for analysis of " - f"model version {constants.APP_INSTANCE}") + self.persist_rpt_data(rpt_tups) self.maybe_build_cache(stmt_embed_dict) + def persist_rpt_data(self, rpt_tups): + inserted_model_rowcnt, _ = batch_execute_many(self.cnxp.get_connection(), + self.config.inference.sql.save_model_rpt_sql, rpt_tups) + logger.info(f"Generated {inserted_model_rowcnt} inference records for analysis of " + f"model version {constants.APP_INSTANCE}") + inserted_model_rowcnt, _ = single_execute(self.cnxp.get_connection(), self.config.inference.sql.save_model_sql) + logger.info(f"Generated {inserted_model_rowcnt} global model performance summary for " + f"model version {constants.APP_INSTANCE}") + inserted_perf_rowcnt, _ = single_execute(self.cnxp.get_connection(), self.config.inference.sql.save_perf_sql) + logger.info(f"Generated {inserted_perf_rowcnt} local performance summary records for " + f"model version {constants.APP_INSTANCE}") + def gen_analysis_set(self) -> List[Tuple]: # current use case involves relatively small analysis set that fits in memory and should only be used once # so wasteful to persist. if later use cases necessitate, will pickle or persist for larger datasets report_sql = f"select * from {self.report_view}" - # TODO: remove this unnecessary transformation? should be able to directly return report_sql tuple list now... 
analysis_set = ModelAnalysisRpt.prep_model_analysis_ds(fetchallwrapper(self.cnxp.get_connection(), report_sql)) return analysis_set diff --git a/configs/config_defaults.yaml b/configs/config_defaults.yaml index 624d839..962d4c2 100644 --- a/configs/config_defaults.yaml +++ b/configs/config_defaults.yaml @@ -10,6 +10,16 @@ experiment: purge_intermediate_reports: False non_twitter_update_freq_multiple: 5 dcbot_poll_interval: 180 + infsvc: + enabled: False + batch_mode: True + batch_size: 16 + thread_latency: 900 + publish: False + skip_db_refresh: False + # purge_intermediate_reports: False + non_twitter_update_freq_multiple: 5 + dcbot_poll_interval: 180 debug: debug_enabled: False use_debug_dataset: False @@ -112,7 +122,7 @@ trainer: max_grad_norm: 1.0 amsgrad: False swa_mode: "best" - last_swa_snaps: 5 + last_swa_snaps: 10 warmup_epochs: 1 inference: report_mode: False diff --git a/configs/config_defaults_sql.yaml b/configs/config_defaults_sql.yaml index 4121b1d..66df436 100644 --- a/configs/config_defaults_sql.yaml +++ b/configs/config_defaults_sql.yaml @@ -1,8 +1,8 @@ experiment: tweetbot: sql: - stmts_to_analyze_sql: "select * from stmts_to_analyze" - tweets_to_analyze_sql: "select * from tweets_to_analyze" + stmts_to_analyze_sql: "select * from tweetbot_stmts_to_analyze" + tweets_to_analyze_sql: "select * from tweetbot_tweets_to_analyze" get_bot_creds_sql: >- select consumer_key, consumer_secret, access_token, access_secret from dcbot_creds tweets_analyzed_pub_sql: >- @@ -21,6 +21,28 @@ experiment: insert ignore into stmts_analyzed_notpublished (tid, sid, arc_report_name, t_date) VALUES (%s, %s, %s, CURRENT_TIMESTAMP()) + infsvc: + sql: + stmts_to_analyze_sql: "select * from stmts_to_analyze" + tweets_to_analyze_sql: "select * from tweets_to_analyze" + get_bot_creds_sql: >- + select consumer_key, consumer_secret, access_token, access_secret from dcbot_creds + get_pinata_creds_sql: >- + select api_key, api_secret from pinata_creds + get_cloudflare_creds_sql: >- + select * from cloudflare_creds + save_pinned_cid_sql: >- + insert into pinned_cids (pin_svc_id, cid, pinsize, p_date) values (%s, %s, %s, CURRENT_TIMESTAMP()) + fetch_current_pinned_cid_sql: >- + select cid from pinned_cids where p_date=(select max(p_date) from pinned_cids) limit 1 + tweets_pub_sql: >- + insert ignore into infsvc_tweets_published + (model_version, iid, thread_id, prediction, raw_pred, raw_confidence, p_date) + values (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP()); + stmts_pub_sql: >- + insert ignore into infsvc_stmts_published + (model_version, iid, tid, sid, prediction, raw_pred, raw_confidence, p_date) + values (%s, %s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP()); data_source: sql: debug: @@ -77,9 +99,11 @@ inference: nontweet_model_perf_cache_sql: "select * from nontweet_model_accuracy_lookup_cache" global_model_perf_cache_sql: "select * from global_model_accuracy_lookup_cache" pred_exp_sql: "select * from pred_explr_stmts" + save_model_sql: "insert into model_metadata select * from latest_global_model_perf_summary" + save_perf_sql: "insert into local_model_perf_summary_hist select * from latest_local_model_perf_summary" ds_md_sql: >- select dsid, train_start_date, train_end_date from ds_metadata where ds_type='converged_filtered' order by dsid desc limit 1 - save_model_sql: >- + save_model_rpt_sql: >- insert ignore into model_analysis_rpts (model_version, dsid, report_type, statement_id, statement_text, stype, sdate, label, prediction, raw_pred) - values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + values (%s, 
%s, %s, %s, %s, %s, %s, %s, %s, %s) \ No newline at end of file diff --git a/configs/dataprep_only.yaml b/configs/dataprep_only.yaml index bbe14d9..9a6dbfd 100644 --- a/configs/dataprep_only.yaml +++ b/configs/dataprep_only.yaml @@ -5,15 +5,15 @@ experiment: dataprep_only: True debug: debug_enabled: True - use_debug_dataset: False + use_debug_dataset: True data_source: # db_conf must be explictly specified only in dev mode or if db_conf is in a non-default location - # db_conf: "/home/speediedan/repos/edification/deep_classiflie_db_feat/deep_classiflie_db.yaml" + db_conf: "/home/speediedan/repos/edification/deep_classiflie_db_feat/deep_classiflie_db.yaml" model_filter_topk: 20 filter_w_embed_cache: False # safest way to build a new dataset is to verify backup of the previous one and remove the relevant cache softlink # (rather than override with "rebuild_dataset: True") - skip_db_refresh: False + skip_db_refresh: True # to update metadata only, will not load datasets (depends on db_functionality_enabled=True) update_ds_db_metadata_only: False diff --git a/configs/gen_report.yaml b/configs/gen_report.yaml index c555000..ec8a7b6 100644 --- a/configs/gen_report.yaml +++ b/configs/gen_report.yaml @@ -1,11 +1,13 @@ experiment: db_functionality_enabled: True # must set to True to generate reports, run dctweetbot, among other functions # provide the generated swa checkpoint below - inference_ckpt: "/home/speediedan/experiments/deep_classiflie/checkpoints/20200816114940/checkpoint-0.0000-swa_best_2_ckpts--1-0.pt" # note build_swa_from_ckpts will be ignored if inference_ckpt is present + inference_ckpt: "/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200901084410/checkpoint-0.0000-swa_best_2_ckpts--1-0.pt" # note build_swa_from_ckpts will be ignored if inference_ckpt is present debug: debug_enabled: False data_source: skip_db_refresh: True + # db_conf must be explictly specified only in dev mode or if db_conf is in a non-default location + db_conf: "/home/speediedan/repos/edification/deep_classiflie_db/deep_classiflie_db.yaml" inference: report_mode: True # set to true to enable report generation rebuild_perf_cache: True # set True to (re)build perf cache (report_mode must also be True) diff --git a/configs/gen_swa_ckpt.yaml b/configs/gen_swa_ckpt.yaml index 37d9a1e..49798c6 100644 --- a/configs/gen_swa_ckpt.yaml +++ b/configs/gen_swa_ckpt.yaml @@ -6,7 +6,7 @@ data_source: skip_db_refresh: True trainer: # replace checkpoints below with the desired checkpoints from your locally trained model (two checkpoints with lowest loss would be a good choice) - build_swa_from_ckpts: ['/home/speediedan/experiments/deep_classiflie/checkpoints/20200814165928/checkpoint-0.5533-20-104055.pt', - '/home/speediedan/experiments/deep_classiflie/checkpoints/20200814165928/checkpoint-0.5642-26-133785.pt'] + build_swa_from_ckpts: ['/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200831123242/checkpoint-0.5526-37-188214.pt', + '/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200831123242/checkpoint-0.5527-32-163449.pt'] inference: report_mode: False # set to true to enable report generation \ No newline at end of file diff --git a/configs/infsvc.yaml b/configs/infsvc.yaml new file mode 100644 index 0000000..12f1739 --- /dev/null +++ b/configs/infsvc.yaml @@ -0,0 +1,13 @@ +experiment: + inference_ckpt: "/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200901084410/checkpoint-0.0000-swa_best_2_ckpts--1-0.pt" + db_functionality_enabled: True + debug: + 
debug_enabled: True + infsvc: + enabled: True + publish: True + batch_mode: False + skip_db_refresh: False +data_source: + # db_conf must be explictly specified only in dev mode or if db_conf is in a non-default location + db_conf: "/home/speediedan/repos/edification/deep_classiflie_db_feat/deep_classiflie_db.yaml" \ No newline at end of file diff --git a/configs/test_only.yaml b/configs/test_only.yaml new file mode 100644 index 0000000..1225fee --- /dev/null +++ b/configs/test_only.yaml @@ -0,0 +1,9 @@ +experiment: + db_functionality_enabled: True # must set to True to generate reports, run dctweetbot, among other functions + inference_ckpt: "/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200828093250/checkpoint-0.0000-swa_best_2_ckpts--1-0.pt" # note build_swa_from_ckpts will be ignored if inference_ckpt is present + debug: + debug_enabled: False +data_source: + # db_conf must be explictly specified only in dev mode or if db_conf is in a non-default location + db_conf: "/home/speediedan/repos/edification/deep_classiflie_db/deep_classiflie_db.yaml" + skip_db_refresh: True \ No newline at end of file diff --git a/configs/test_report.yaml b/configs/test_report.yaml deleted file mode 100644 index 2acdfe1..0000000 --- a/configs/test_report.yaml +++ /dev/null @@ -1,21 +0,0 @@ -experiment: - db_functionality_enabled: True # must set to True to generate reports, run dctweetbot, among other functions - inference_ckpt: "/home/speediedan/experiments/deep_classiflie/checkpoints/20200816114940/checkpoint-0.0000-swa_best_2_ckpts--1-0.pt" # note build_swa_from_ckpts will be ignored if inference_ckpt is present - debug: - debug_enabled: False -data_source: - # db_conf must be explictly specified only in dev mode or if db_conf is in a non-default location - # db_conf: "/home/speediedan/repos/edification/deep_classiflie_db/deep_classiflie_db.yaml" - skip_db_refresh: True -#trainer: - #build_swa_from_ckpts: ['/home/speediedan/experiments/deep_classiflie/checkpoints/20200814165928/checkpoint-0.5533-20-104055.pt', - # '/home/speediedan/experiments/deep_classiflie/checkpoints/20200814165928/checkpoint-0.5642-26-133785.pt'] -inference: - report_mode: True # set to true to enable report generation - update_perf_caches_only: False # updates perf caches only using latest report, doesn't create a new report - rebuild_perf_cache: False # set True to (re)build perf cache (report_mode must also be True) - rebuild_stmt_cache: False # set True to (re)build statement cache (report_mode must also be True) - rebuild_pred_explorer: True # set True to (re)build pred_explorer widget (instead of running reports) (report_mode must also be True) - rebuild_pred_exp_stmt_cache: False # rebuild_pred_explorer must also be true - rebuild_perf_explorer: True # set True to (re)build perf_explorer widget (instead of running reports) (report_mode must also be True) - rebuild_perf_exp_cache: False # rebuild_perf_explorer must also be true \ No newline at end of file diff --git a/configs/train_albertbase.yaml b/configs/train_albertbase.yaml index d4dd77a..09c7e78 100644 --- a/configs/train_albertbase.yaml +++ b/configs/train_albertbase.yaml @@ -6,8 +6,9 @@ experiment: data_source: skip_db_refresh: True trainer: - # restart_training_ckpt: "/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200629114238/checkpoint-0.6178-0-103.pt" + # restart_training_ckpt: "/home/speediedan/experiments/deep_classiflie_feat/checkpoints/20200826121309/checkpoint-0.6039-11-1236.pt" dump_model_thaw_sched_only: False + 
label_smoothing_enabled: True # histogram_vars: ['classifier.weight', 'ctxt_embed.weight', 'albert.pooler.weight'] fine_tune_scheduler: thaw_schedule: "DeepClassiflie_thaw_schedule.yaml" diff --git a/deep_classiflie.py b/deep_classiflie.py index 0d3abf7..b4b1693 100644 --- a/deep_classiflie.py +++ b/deep_classiflie.py @@ -3,9 +3,7 @@ The initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source -(The Washington Post). For statements the model deems most likely to be labeled falsehoods. - (see deepclassiflie.org for more detail), the [@DeepClassiflie](https://twitter.com/DeepClassiflie) twitter bot tweets - out a statement analysis and model interpretation "report" +(The Washington Post). See deepclassiflie.org for current predictions and to explore the model and its performance. @author: Dan Dale, @speediedan """ import logging @@ -18,6 +16,7 @@ from dataprep.dataprep import DatasetCollection from utils.core_utils import create_lock_file from utils.dc_tweetbot import DCTweetBot +from utils.dc_infsvc import DCInfSvc from utils.envconfig import EnvConfig from analysis.inference import Inference from analysis.model_analysis_rpt import ModelAnalysisRpt @@ -33,8 +32,10 @@ def main() -> Optional[NoReturn]: _ = DatasetCollection(config) elif config.experiment.predict_only and config.inference.pred_inputs: Inference(config).init_predict() + elif config.experiment.infsvc.enabled: + init_dc_service(config, 'infsvc') elif config.experiment.tweetbot.enabled: - init_tweetbot(config) + init_dc_service(config, 'tweetbot') elif config.inference.report_mode: if not config.experiment.db_functionality_enabled: logger.error(f"{constants.DB_WARNING_START} Model analysis reports {constants.DB_WARNING_END}") @@ -44,14 +45,21 @@ def main() -> Optional[NoReturn]: core_flow(config) -def init_tweetbot(config: MutableMapping) -> NoReturn: +def init_dc_service(config:MutableMapping, service_type: str) -> NoReturn: + if service_type == 'infsvc': + svc_name = 'inference service' + svc_module = DCInfSvc + else: + svc_name = 'tweetbot' + svc_module = DCTweetBot lock_file = None try: if not config.experiment.db_functionality_enabled: - logger.error(f"{constants.DB_WARNING_START} The tweetbot {constants.DB_WARNING_END}") + logger.error(f"{constants.DB_WARNING_START} The {svc_name} {constants.DB_WARNING_END}") sys.exit(0) lock_file = create_lock_file() - DCTweetBot(config) + svc_module(config) + os.remove(lock_file) except KeyboardInterrupt: logger.warning('Interrupted bot, removing lock file and exiting...') os.remove(lock_file) diff --git a/deep_classiflie_infsvc.sh b/deep_classiflie_infsvc.sh new file mode 100755 index 0000000..7bba079 --- /dev/null +++ b/deep_classiflie_infsvc.sh @@ -0,0 +1,26 @@ +source ${HOME}/.bashrc +cd `dirname "$(readlink -f "$0")"` +curr_branch=`git branch | grep '* ' | awk '{print $2}'` +# N.B. 
if working DEV mode, the branch name should equal the corresponding working tree name +if [ "${curr_branch}" == "master" ] +then + echo "Starting the deep classiflie inference service in non-dev mode" + source "${HOME}/.dc_config" + lock_file=${HOME}/dcbot.lock + bot_log_name="deep_classiflie_infsvc" +else + echo "Starting the deep classiflie inference service in dev mode" + source "${HOME}/.${curr_branch}_config" + lock_file=${HOME}/${curr_branch}_dcbot.lock + bot_log_name="${curr_branch}_infsvc" +fi +[ -f $lock_file ] && echo "Lock file ${lock_file} exists, abandoning daemon startup" && exit 0 +target_env=$1 +conda activate $target_env +prev_log=$HOME/$bot_log_name.out +if test -f $prev_log +then + d=`date +%Y%m%d%H%M%S` + mv $prev_log ${prev_log}_${d}.bkp +fi +nohup /opt/anaconda/envs/${target_env}/bin/python ${DC_BASE}/deep_classiflie.py --config "${DC_BASE}/configs/infsvc.yaml" 1>"${HOME}/${bot_log_name}.out" 2>&1 & \ No newline at end of file diff --git a/docs/.ipynb_checkpoints/about-checkpoint.md b/docs/.ipynb_checkpoints/about-checkpoint.md index 38d7546..e27bb20 100644 --- a/docs/.ipynb_checkpoints/about-checkpoint.md +++ b/docs/.ipynb_checkpoints/about-checkpoint.md @@ -1,6 +1,4 @@ -[![python badge](https://img.shields.io/badge/python->=3.7-brightgreen.svg)](https://shields.io/) -[![pytorch badge](https://img.shields.io/badge/pytorch->=4.8-blue.svg)](https://shields.io/) -
Table of Contents +
Table of Contents - [What is Deep Classiflie?](#what-is-deep-classiflie) @@ -17,14 +15,13 @@ - [Citing Deep Classiflie](#citing-deep-classiflie) - [References and Notes](#references-and-notes) - [License](#license) +- [View on GitHub]({{ site.github.repository_url }})
--- ### What is Deep Classiflie? - Deep Classiflie is a framework for developing ML models that bolster fact-checking efficiency. Predominantly a research project[e](#ce), I plan to extend and maintain this framework in pursuing my own research interests so am sharing it in case it's of any utility to the broader community. -- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). For statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail), the [@DeepClassiflie](https://twitter.com/DeepClassiflie) twitter bot tweets out a statement analysis and model interpretation "report" such as the one below: - - Example tweet report +- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). - The Deep Classiflie POC model's predictions and performance on the most recent test set can be [explored](#model-exploration) and better understood using the [prediction explorer](pred_explorer.html): prediction explorer - and the [performance explorer](perf_explorer.html): @@ -53,17 +50,22 @@ ### Model Exploration The best way to start understanding/exploring the current model is to use the explorers on deepclassiflie.org: -#### [Prediction Explorer](pred_explorer.html): +
[Prediction Explorer](pred_explorer.html) + + Explore randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about the explorer [below](#current-performance). prediction explorer +
+ +
[Performance Explorer](perf_explorer.html) + -#### [Performance Explorer](perf_explorer.html): Explore the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. temporal performance explorer confidence bucket performance explorer - +
--- ### Core Components @@ -78,27 +80,38 @@ The entire initial Deep Classiflie system (raw dataset, model, analytics modules -[Dataset Generation](#data-pipeline) +
[Dataset Generation](#data-pipeline) + + - For simplicity, scrape "ground truth" falsehood labels from a single source ([Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) - Scrape a substantial fraction of public statements from multiple sources. ([Factba.se](https://factba.se), [Twitter](https://twitter.com)) - Use statement hashes and subword representations from a base model (ALBERT[8](#f8)) to remove "false" statements present in the larger "truths" corpus. - Prepare chronologically disjoint train/dev/test sets (to avoid data leakage) and attempt to reduce undesired superficial class-aligned distributional artifacts that could be leveraged during model training. NNs are lazy, they'll cheat if we let them. - -**Model Training** + +
+
**Model Training** + + - Fine-tune a base model (currently HuggingFace's [ALBERT implementation](https://huggingface.co/transformers/model_doc/albert.html) with some minor customizations) in tandem with a simple embedding reflecting the semantic shift associated with the medium via which the statement was conveyed (i.e., for the POC, just learn the tweet vs non-tweet transformation) (using [Pytorch](https://pytorch.org/)) -- Explore the latest model's training session on tensorboard.dev. +- Explore the latest model's training session on [tensorboard.dev](https://tensorboard.dev/experiment/rGNQpYnYSOaHb2A84xRAzw). - N.B. neuro-symbolic methods[6](#f6) that leverage knowledge bases and integrate symbolic reasoning with connectionist methods are not used in this model. Use of these approaches may be explored in [future research](#further-research) using this framework. - -**Analysis & Reporting** +
+
**Analysis & Reporting** + + - Interpret statement-level predictions using [captum's](https://captum.ai/) implementation of integrated gradients to visualize attributions of statement predictions to tokens in each statement. - Prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) -- Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold are published on twitter via the [@DeepClassiflie](https://twitter.com/DeepClassiflie) bot, which leverages [Tweepy](https://www.tweepy.org/) -- XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) +- Two inference daemons poll, analyze and classify new statements: + 1. (still in development) A daemon that publishes via IPFS pubsub, all new statement classifications and inference output. + 2. (currently available) Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold can be published on twitter via a twitter bot, which leverages [Tweepy](https://www.tweepy.org/). The bot [h](#ch) tweets out a statement analysis and model interpretation "report" such as the one below for statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail): + Example tweet report +- XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) +
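As a rough, heavily hedged sketch of what the "(still in development)" pubsub publishing step might look like: the snippet below assumes a local go-ipfs (~0.6) daemon exposing the v0 HTTP API on port 5001 (newer daemons pass the message as a multipart body rather than a query argument), and the topic name and record fields are illustrative only. The repo's actual daemon lives in utils/dc_infsvc.py and is not reproduced here.

```python
# Hedged sketch: publish one classification record to an IPFS pubsub topic via
# a local go-ipfs (~0.6) daemon's v0 HTTP API. Topic/fields are assumptions.
import json
import requests

def publish_record(record: dict, topic: str = "deepclassiflie-current-preds",
                   api: str = "http://127.0.0.1:5001") -> None:
    # repeated "arg" params: first the topic, then the serialized message
    requests.post(
        f"{api}/api/v0/pubsub/pub",
        params=[("arg", topic), ("arg", json.dumps(record))],
        timeout=10,
    ).raise_for_status()

publish_record({"claim_text": "...", "prediction": 1, "raw_confidence": 0.97})
```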
--- ### Current Performance -
Global +
Global Global metrics[9](#f9) summarized in the table below relate to the current model's performance on a test set comprised of ~12K statements made between 2020-04-03 and 2020-07-08:
@@ -106,7 +119,7 @@ Global metrics[9](#f9) summarized in the table below relate t
-
Local +
Local To minimize false positives and maximize the model's utility, the following approach is used to issue high-confidence predictions: @@ -116,95 +129,63 @@ To minimize false positives and maximize the model's utility, the following appr * [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) * Positive prediction ratio: (bucket true positives + bucket false positives)/#statements in bucket * Bucket-level accuracy -3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket +3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket. See [caveats](#caveats) regarding recognized performance biases[a](#ca) * In the prediction explorer, randomly sample 100 statements (including all confusion matrix classes) from each of four confidence buckets: the maximum and minimum accuracy buckets for each statement type. Max Accuracy Non-Tweets
Max Accuracy Tweets -4. Use statement-type aligned (tweet vs non-tweet) PPV thresholds to estimate @DeepClassiflie's statement report publishing accuracy using performance from the most recent test set. See [caveats](#caveats) regarding recognized performance biases[a](#ca). -
- -
@DeepClassiflie Report Performance - - -Had @DeepClassiflie been publishing statement reports over the period comprising its test set (see above) using the current model, the publishing performance is expected to have been approximately characterized by the statistics below. See [caveats](#caveats) regarding the modest recognized performance biases[a](#ca). Now that report publishing has begun, once additional labeled data are available, the realized performance of the model will be similarly reported here.
- -
- -| Model Version | Period Days | Start Date | End Date | -| :--- | :---: | :---: | :---: | -| 20200816115426 | 96 | 04/03/2020 | 07/08/2020 | - -
-
- -| Statement Type | Publish Threshold | Stmts/max bucket | Bucket ppv | Bucket ppr| Est Reports Published | Estimated TP | Estimated FP | -| :--- | :---: | :---: | :---: | :---:| :---: | :---: | :---: | -Non-Tweets | 0.96 | 430 | 0.965 |1 | 430 |415 | 15 | -Tweets | 0.78 | 109 | 0.786 | 0.257 |28 | 22 | 6 | - -
-
- -| Period Estimate | Period total | Per day | -| :--- | :---: | :---: | -Non-tweet reports published | 430 | 4.48 | -Tweet reports published | 28 | 0.29 | -TP non-tweet reports published | 415 | 4.32 | -FP non-tweet reports published | 15 | 0.16 | -TP tweet reports published | 22 | 0.23 | -FP tweet reports published | 6 | 0.06 | -Projected report period non-tweet accuracy | 96.5% | -Projected report period tweet accuracy | 78.6% | -Projected report period global accuracy | 95.4% | - -
- --- ### Noteworthy Features -#### Dataset generation: +
Dataset generation + + - Easily and extensively configurable using yaml [configuration files](#configuration). - Multiple different class balancing strategies available (oversampling, class ratios etc.) - "Weakly converge" superficially divergent class distributions using UnivariateDistReplicator abstraction - Easily experiment with different train/dev/test splits/configurations via declarative DatasetCollection definitions. +
-#### Model training: +
Model training + + - Automated recursive fine-tuning of the base model with a FineTuningScheduler abstraction - Configurable label-smoothing[4](#f4) - Generate and configure thawing schedules for models. - EarlyStopping easily configurable with multiple non-standard monitor metrics (e.g. mcc) - Both automatic and manually-specified [stochastic weight averaging](https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/) of model checkpoints[f](#cf) - mixed-precision training via [apex](https://github.com/NVIDIA/apex)[g](#cg) +
+
Analysis & reporting + -#### Analysis & reporting: - Extensive suite of reporting views for analyzing model performance at global and local levels - Statement and performance exploration dashboards for interpreting model predictions and understanding its performance - xkcd-themed visualization of UMAP-transformed statement embeddings - +
--- ### Data Pipeline To conserve resources and for POC research expediency, the current pipeline uses a local relational DB (MariaDB). Ultimately, a distributed data store would be preferable and warranted if this project merits sufficient interest from the community or a POC involving a distributed network of models is initiated. -
Deep Classiflie Data Pipeline +
Deep Classiflie Data Pipeline ![Deep Classiflie Data Pipeline](/assets/deep_classiflie_data_pipeline.svg)
-
False Statement Filter Processes +
False Statement Filter Processes ![False Statement Filter Processes](/assets/False%20Statement%20Filter%20Processes.svg)
-
Distribution Convergence Process +
Distribution Convergence Process ![Distribution Convergence Process](/assets/Distribution%20Convergence%20Process.svg)
-
Dataset Generation Process +
Dataset Generation Process ![Dataset Generation Process](/assets/Dataset%20Generation%20Process.svg) @@ -242,7 +223,7 @@ The parameters used in all Deep Classiflie job executions related to the develop --- ### Model Replication -
Instructions (click to expand) +
Instructions N.B. before you begin, the core external dependency is admin access to a mariadb or mysql DB @@ -438,24 +419,25 @@ N.B. before you begin, the core external dependency is admin access to a mariadb
  • [e] Still in early development, there are significant outstanding issues (e.g. no tests yet!) and code quality shortcomings galore, but any constructive thoughts or contributions are welcome. I'm interested in using ML to curtail disinformation, not promulgate it, so I want to be clear -- this is essentially a fancy sentence similarity system with a lot of work put into building the dataset generation and model analysis data pipelines (I have a data engineering background, not a software engineering one).
  • [f] Current model release built/tested before swa graduated from torchcontrib to core pytorch. Next release of Deep Classiflie will use the integrated swa api.
  • [g] Current model release built/tested before AMP was integrated into core pytorch. Next release of Deep Classiflie will use the integrated AMP api.
  • +
&nbsp;&nbsp;• [h] N.B. This daemon may violate Twitter's [policy](https://help.twitter.com/en/rules-and-policies/twitter-automation) w.r.t. tweeting sensitive content if the subject's statements contain such content (no content-based filtering is included in the daemon). [@DeepClassiflie](https://twitter.com/DeepClassiflie) initially tested the Deep Classiflie twitter daemon but will post only framework-related announcements moving forward.
  • --- ### Citing Deep Classiflie Please cite: ```tex -@misc{Dan_Dale_2020_tbd, +@misc{Dan_Dale_2020_3995079, author = {Dan Dale}, title = {% raw %}{{Deep Classiflie: Shallow fact-checking with deep neural networks}}{% endraw %}, month = aug, year = 2020, - doi = {tbd/zenodo.tbd}, - version = {1.0}, + doi = {10.5281/zenodo.3995079}, + version = {v0.1.0-alpha}, publisher = {Zenodo}, - url = {https://doi.org/tbd/zenodo.tbd} + url = {https://zenodo.org/record/3995079} } ``` -Feel free to star the repo as well if you find it useful or interesting. Thanks! +Feel free to star the [repo]({{ site.github.repository_url }}) as well if you find it useful or interesting. Thanks! --- ### References and Notes @@ -474,6 +456,7 @@ Feel free to star the repo as well if you find it useful or interesting. Thanks! --- ### License -[![License](http://img.shields.io/:license-mit-blue.svg?style=flat-square)](http://badges.mit-license.org) -- **[MIT license](http://opensource.org/licenses/mit-license.php)** +[![License](https://img.shields.io/:license-mit-blue.svg?style=flat-square)](https://badges.mit-license.org) + + [View on GitHub]({{ site.github.repository_url }}) diff --git a/docs/_config.yml b/docs/_config.yml index 8e80a8b..fa59eb7 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -12,7 +12,7 @@ show_downloads: false about_url: about.html pred_explorer_url: pred_explorer.html perf_explorer_url: perf_explorer.html -needs_bokeh: true +current_explorer_url: current_explorer.html url: https://deepclassiflie.org plugins: - jekyll-feed diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index f6fd371..dd623c2 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -1,6 +1,7 @@ - + + {{ page.title }} {% if site.google_analytics %} + + + + + + + {% endif %} - {% if site.needs_bokeh %} + {% if page.needs_bokeh %} @@ -38,13 +48,17 @@ Deep Classiflie Logo
    {{ site.description | default: site.github.project_tagline }}
    - About - Prediction Explorer - Performance Explorer - {% if site.show_downloads %} - Download .zip - Download .tar.gz - {% endif %} + +
    diff --git a/docs/about.md b/docs/about.md index 91224e9..cc8d2a6 100644 --- a/docs/about.md +++ b/docs/about.md @@ -21,10 +21,12 @@ --- ### What is Deep Classiflie? - Deep Classiflie is a framework for developing ML models that bolster fact-checking efficiency. Predominantly a research project[e](#ce), I plan to extend and maintain this framework in pursuing my own research interests so am sharing it in case it's of any utility to the broader community. -- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). For statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail), the [@DeepClassiflie](https://twitter.com/DeepClassiflie) twitter bot tweets out a statement analysis and model interpretation "report" such as the one below: +- As a POC, the initial alpha release of Deep Classiflie generates/analyzes a model that continuously classifies a single individual's statements (Donald Trump)[1](#f1) using a single ground truth labeling source (The Washington Post). +- The Deep Classiflie POC model's [current predictions](current_explorer.html) and performance on the most recent test set can be [explored](#model-exploration) and better understood using +the [current prediction explorer](current_explorer.html): - Example tweet report -- The Deep Classiflie POC model's predictions and performance on the most recent test set can be [explored](#model-exploration) and better understood using the [prediction explorer](pred_explorer.html): + current prediction explorer +- the [prediction explorer](pred_explorer.html): prediction explorer - and the [performance explorer](perf_explorer.html): @@ -52,17 +54,32 @@ ### Model Exploration The best way to start understanding/exploring the current model is to use the explorers on deepclassiflie.org: -#### [Prediction Explorer](pred_explorer.html): +
    [Prediction Explorer](pred_explorer.html) + + Explore randomly sampled predictions from the test set of the latest model incarnation. The explorer uses [captum's](https://captum.ai/) implementation of integrated gradients[7](#f7) to visualize attributions of statement predictions to tokens in each statement. Read more about the explorer [below](#current-performance). prediction explorer +
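As a rough illustration of the integrated-gradients step described above, the sketch below uses captum's LayerIntegratedGradients against a stock ALBERT sequence classifier. It is not the repo's attribution code (that lives in analysis/inference.py); the checkpoint, baseline construction and target class index here are assumptions.

```python
# Minimal sketch (assumed model/tokenizer; not the repo's attribution pipeline):
# attribute a statement-level prediction back to its input tokens.
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from captum.attr import LayerIntegratedGradients

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")
model.eval()

def forward_logits(input_ids, attention_mask):
    # captum calls this forward function and attributes against its output
    return model(input_ids=input_ids, attention_mask=attention_mask)[0]

statement = "We have the best economy in history."
enc = tokenizer(statement, return_tensors="pt")
baseline_ids = torch.full_like(enc["input_ids"], tokenizer.pad_token_id)

lig = LayerIntegratedGradients(forward_logits, model.albert.embeddings)
attributions = lig.attribute(inputs=enc["input_ids"],
                             baselines=baseline_ids,
                             additional_forward_args=(enc["attention_mask"],),
                             target=1)  # assumed "falsehood" class index
token_scores = attributions.sum(dim=-1).squeeze(0)
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
for tok, score in zip(tokens, token_scores.tolist()):
    print(f"{tok}\t{score:+.4f}")
```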
    + +
    [Performance Explorer](perf_explorer.html) + -#### [Performance Explorer](perf_explorer.html): Explore the performance of the current model incarnation using confusion matrices oriented along temporal and confidence-based axes. temporal performance explorer confidence bucket performance explorer +
    +
    [Current Predictions Explorer](current_explorer.html) + + +Explore the current (unlabeled) predictions generated by the latest model incarnation. All statements yet to be labeled by current fact-checking sources (currently, only [Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) are available. +Live predictions are continuously added via [ipfs](https://ipfs.io). Twitter statements will be delayed by ~15 minutes to allow thread-based scoring. [Factba.se](https://factba.se) is polled for new statements every 10 minutes. +This explorer provides fact-checkers a means (one of many possible) of using current model predictions and may also help those building fact-checking systems evaluate the potential utility of integrating similar models into their systems. + +current predictions explorer +
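For anyone who prefers to consume the same feed outside the browser, a small Python sketch mirroring the explorer's gateway-fallback fetch is below. The IPNS name and gateway URLs are the ones referenced in docs/assets/js/current_pred_nav.js; any record fields beyond claim_text/prediction are assumptions.

```python
# Sketch of pulling the current predictions feed rendered by the explorer
# (gateways/IPNS name taken from current_pred_nav.js; schema partially assumed).
import requests

GATEWAYS = [
    "https://gateway.pinata.cloud/ipns/predictions-dev.deepclassiflie.org",
    "https://cloudflare-ipfs.com/ipns/predictions-dev.deepclassiflie.org",
]

def fetch_current_predictions(timeout: int = 10) -> list:
    last_err = None
    for url in GATEWAYS:
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException as err:
            last_err = err  # fall back to the next gateway
    raise RuntimeError(f"All gateways failed: {last_err}")

if __name__ == "__main__":
    preds = fetch_current_predictions()
    flagged = [p for p in preds if p.get("prediction") == 1]
    print(f"{len(flagged)} of {len(preds)} current statements flagged as likely falsehoods")
```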
    --- ### Core Components @@ -77,27 +94,41 @@ The entire initial Deep Classiflie system (raw dataset, model, analytics modules -[Dataset Generation](#data-pipeline) +
    [Dataset Generation](#data-pipeline) + + - For simplicity, scrape "ground truth" falsehood labels from a single source ([Washington Post Factchecker](https://www.washingtonpost.com/graphics/politics/trump-claims-database)) - Scrape a substantial fraction of public statements from multiple sources. ([Factba.se](https://factba.se), [Twitter](https://twitter.com)) - Use statement hashes and subword representations from a base model (ALBERT[8](#f8)) to remove "false" statements present in the larger "truths" corpus. - Prepare chronologically disjoint train/dev/test sets (to avoid data leakage) and attempt to reduce undesired superficial class-aligned distributional artifacts that could be leveraged during model training. NNs are lazy, they'll cheat if we let them. - -**Model Training** + +
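The "remove false statements present in the larger truths corpus" step could, in its simplest form, combine exact-hash matching with embedding similarity. The sketch below is only a schematic of that idea; the repo's actual filter lives in the dataprep module, uses ALBERT subword representations, and is governed by settings such as model_filter_topk. The embed callable and similarity threshold here are assumed stand-ins.

```python
# Schematic of a two-stage falsehood filter: exact hash match, then cosine
# similarity against labeled falsehoods (embedding function/threshold assumed).
import hashlib
import numpy as np

def stmt_hash(text: str) -> str:
    return hashlib.sha1(text.strip().lower().encode("utf-8")).hexdigest()

def filter_truths(truths, falsehoods, embed, sim_threshold=0.9):
    """Drop 'truth' statements that are exact or near duplicates of labeled falsehoods."""
    false_hashes = {stmt_hash(f) for f in falsehoods}
    false_vecs = np.stack([embed(f) for f in falsehoods])
    false_vecs /= np.linalg.norm(false_vecs, axis=1, keepdims=True)
    kept = []
    for t in truths:
        if stmt_hash(t) in false_hashes:
            continue  # exact duplicate of a labeled falsehood
        v = embed(t)
        v = v / np.linalg.norm(v)
        if (false_vecs @ v).max() >= sim_threshold:
            continue  # near-duplicate by cosine similarity
        kept.append(t)
    return kept
```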
    +
    **Model Training** + + - Fine-tune a base model (currently HuggingFace's [ALBERT implementation](https://huggingface.co/transformers/model_doc/albert.html) with some minor customizations) in tandem with a simple embedding reflecting the semantic shift associated with the medium via which the statement was conveyed (i.e., for the POC, just learn the tweet vs non-tweet transformation) (using [Pytorch](https://pytorch.org/)) - Explore the latest model's training session on [tensorboard.dev](https://tensorboard.dev/experiment/rGNQpYnYSOaHb2A84xRAzw). - N.B. neuro-symbolic methods[6](#f6) that leverage knowledge bases and integrate symbolic reasoning with connectionist methods are not used in this model. Use of these approaches may be explored in [future research](#further-research) using this framework. - -**Analysis & Reporting** +
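A stripped-down version of the "base model plus medium embedding" idea might look like the sketch below: the pooled ALBERT representation is concatenated with a small learned embedding for the statement's medium (tweet vs non-tweet) before a classification head. This is an illustration of the approach described above, not the repo's models/deep_classiflie_module.py; the ctxt_embed name echoes the histogram_vars comment in train_albertbase.yaml, but the dimensions and head are assumptions.

```python
# Illustrative sketch: ALBERT pooled output + learned tweet/non-tweet embedding
# feeding a binary classifier head (dimensions and head are assumptions).
import torch
import torch.nn as nn
from transformers import AlbertModel

class StatementClassifier(nn.Module):
    def __init__(self, ctxt_dim: int = 32):
        super().__init__()
        self.albert = AlbertModel.from_pretrained("albert-base-v2")
        self.ctxt_embed = nn.Embedding(2, ctxt_dim)  # 0 = non-tweet, 1 = tweet
        hidden = self.albert.config.hidden_size
        self.classifier = nn.Linear(hidden + ctxt_dim, 2)

    def forward(self, input_ids, attention_mask, medium_ids):
        pooled = self.albert(input_ids=input_ids, attention_mask=attention_mask)[1]
        ctxt = self.ctxt_embed(medium_ids)
        return self.classifier(torch.cat([pooled, ctxt], dim=-1))
```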
    +
    **Analysis & Reporting** + + - Interpret statement-level predictions using [captum's](https://captum.ai/) implementation of integrated gradients to visualize attributions of statement predictions to tokens in each statement. -- Prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) -- Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold are published on twitter via the [@DeepClassiflie](https://twitter.com/DeepClassiflie) bot, which leverages [Tweepy](https://www.tweepy.org/) +- Test set prediction and model performance exploration dashboards were built using [bokeh](https://docs.bokeh.org/en/latest/index.html) and [Jekyll](https://github.com/jekyll/jekyll) +- The [current prediction explorer](current_explorer.html) was built using [datatables](https://datatables.net/) and [ipfs](https://ipfs.io) with pinning provided by [pinata](https://pinata.cloud/) +- Two inference daemons poll, analyze and classify new statements: + 1. A daemon that publishes via [IPFS](https://ipfs.io) all new statement classifications and inference output. + + current predictions explorer + 2. Automated false statement reports for predictions meeting the desired [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) confidence threshold can be published on twitter via a twitter bot, which leverages [Tweepy](https://www.tweepy.org/). The bot [h](#ch) tweets out a statement analysis and model interpretation "report" such as the one below for statements the model deems most likely to be labeled falsehoods (see [current performance](#current-performance) for more detail): + + Example tweet report - XKCD fans may notice the style of the dashboard explorers and statement reports are XKCD-inspired using the Humor Sans font created by [@ch00ftech](https://twitter.com/ch00ftech). Thanks to him (and [@xkcd](https://twitter.com/xkcd) of course!) - +
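The publishing half of the IPFS daemon is not shown in this hunk (it lives in utils/dc_infsvc.py), but the related SQL added to configs/config_defaults_sql.yaml (pinata/cloudflare credentials, save_pinned_cid_sql) suggests a flow roughly like the assumption-laden outline below: pin the latest predictions JSON via Pinata's pinJSONToIPFS endpoint, then hand the returned CID to whatever updates the IPNS/DNSLink record. Credentials and the record payload are placeholders.

```python
# Assumed outline of the pin-and-record step; not the repo's dc_infsvc code.
import requests

PINATA_PIN_URL = "https://api.pinata.cloud/pinning/pinJSONToIPFS"

def pin_predictions(records: list, api_key: str, api_secret: str) -> tuple:
    resp = requests.post(
        PINATA_PIN_URL,
        json={"pinataContent": records},
        headers={"pinata_api_key": api_key, "pinata_secret_api_key": api_secret},
        timeout=30,
    )
    resp.raise_for_status()
    body = resp.json()
    # analogous to the (cid, pinsize) columns written by save_pinned_cid_sql
    return body["IpfsHash"], body["PinSize"]

if __name__ == "__main__":
    cid, size = pin_predictions([{"claim_text": "...", "prediction": 1}], "KEY", "SECRET")
    print(f"pinned {size} bytes at {cid}")
```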
    --- ### Current Performance -
    Global +
    Global Global metrics[9](#f9) summarized in the table below relate to the current model's performance on a test set comprised of ~12K statements made between 2020-04-03 and 2020-07-08:
    @@ -105,7 +136,7 @@ Global metrics[9](#f9) summarized in the table below relate t
    -
    Local +
    Local To minimize false positives and maximize the model's utility, the following approach is used to issue high-confidence predictions: @@ -115,95 +146,64 @@ To minimize false positives and maximize the model's utility, the following appr * [PPV](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values) * Positive prediction ratio: (bucket true positives + bucket false positives)/#statements in bucket * Bucket-level accuracy -3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket +3. Report estimated local accuracy metrics of given prediction by associating it with its corresponding confidence bucket. See [caveats](#caveats) regarding recognized performance biases[a](#ca) * In the prediction explorer, randomly sample 100 statements (including all confusion matrix classes) from each of four confidence buckets: the maximum and minimum accuracy buckets for each statement type. Max Accuracy Non-Tweets
    Max Accuracy Tweets -4. Use statement-type aligned (tweet vs non-tweet) PPV thresholds to estimate @DeepClassiflie's statement report publishing accuracy using performance from the most recent test set. See [caveats](#caveats) regarding recognized performance biases[a](#ca).
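A compact restatement of the bucketing procedure above, as a sketch: sort dev-set predictions by confidence, split them into 20 equal-count buckets, and compute per-bucket accuracy, PPV and positive prediction ratio. Column names and the falsehood label value (1) are assumptions; the repo derives these statistics through its SQL reporting views rather than the pandas below.

```python
# Sketch of the local-metrics procedure: 20 equal-count confidence buckets
# with per-bucket accuracy, PPV and positive prediction ratio (ppr).
import pandas as pd

def bucket_metrics(df: pd.DataFrame, n_buckets: int = 20) -> pd.DataFrame:
    """df columns (assumed): raw_confidence, prediction (0/1), label (0/1)."""
    df = df.sort_values("raw_confidence").reset_index(drop=True)
    df["bucket"] = pd.qcut(df.index, q=n_buckets, labels=False)
    rows = []
    for b, g in df.groupby("bucket"):
        tp = ((g.prediction == 1) & (g.label == 1)).sum()
        fp = ((g.prediction == 1) & (g.label == 0)).sum()
        rows.append({
            "bucket": b,
            "accuracy": (g.prediction == g.label).mean(),
            "ppv": tp / (tp + fp) if (tp + fp) else float("nan"),
            "ppr": (tp + fp) / len(g),  # (bucket TP + bucket FP) / statements in bucket
        })
    return pd.DataFrame(rows)
```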
    -
    @DeepClassiflie Report Performance - - -Had @DeepClassiflie been publishing statement reports over the period comprising its test set (see above) using the current model, the publishing performance is expected to have been approximately characterized by the statistics below. See [caveats](#caveats) regarding the modest recognized performance biases[a](#ca). Now that report publishing has begun, once additional labeled data are available, the realized performance of the model will be similarly reported here.
    - -
    - -| Model Version | Period Days | Start Date | End Date | -| :--- | :---: | :---: | :---: | -| 20200816115426 | 96 | 04/03/2020 | 07/08/2020 | - -
    -
    - -| Statement Type | Publish Threshold | Stmts/max bucket | Bucket ppv | Bucket ppr| Est Reports Published | Estimated TP | Estimated FP | -| :--- | :---: | :---: | :---: | :---:| :---: | :---: | :---: | -Non-Tweets | 0.96 | 430 | 0.965 |1 | 430 |415 | 15 | -Tweets | 0.77 | 109 | 0.786 | 0.257 |28 | 22 | 6 | - -
    -
    - -| Period Estimate | Period total | Per day | -| :--- | :---: | :---: | -Non-tweet reports published | 430 | 4.48 | -Tweet reports published | 28 | 0.29 | -TP non-tweet reports published | 415 | 4.32 | -FP non-tweet reports published | 15 | 0.16 | -TP tweet reports published | 22 | 0.23 | -FP tweet reports published | 6 | 0.06 | -Projected report period non-tweet accuracy | 96.5% | -Projected report period tweet accuracy | 78.6% | -Projected report period global accuracy | 95.4% | - -
    -
    - - --- ### Noteworthy Features -#### Dataset generation: +
    Dataset generation + + - Easily and extensively configurable using yaml [configuration files](#configuration). - Multiple different class balancing strategies available (oversampling, class ratios etc.) - "Weakly converge" superficially divergent class distributions using UnivariateDistReplicator abstraction - Easily experiment with different train/dev/test splits/configurations via declarative DatasetCollection definitions. +
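As a point of reference for the class-balancing options above, minority-class oversampling to a target ratio can be as simple as the sketch below; the framework's actual strategies are configured declaratively in the dataset yaml, and the label key, minority value and ratio here are assumptions.

```python
# Minimal illustration of oversampling a minority class toward a target ratio.
import random

def oversample(records, label_key="label", minority=1, target_ratio=1.0, seed=42):
    rng = random.Random(seed)
    minority_recs = [r for r in records if r[label_key] == minority]
    majority_recs = [r for r in records if r[label_key] != minority]
    # target minority count = majority count * target_ratio
    needed = int(len(majority_recs) * target_ratio) - len(minority_recs)
    extra = [rng.choice(minority_recs) for _ in range(max(needed, 0))]
    return majority_recs + minority_recs + extra
```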
    -#### Model training: +
    Model training + + - Automated recursive fine-tuning of the base model with a FineTuningScheduler abstraction - Configurable label-smoothing[4](#f4) - Generate and configure thawing schedules for models. - EarlyStopping easily configurable with multiple non-standard monitor metrics (e.g. mcc) - Both automatic and manually-specified [stochastic weight averaging](https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/) of model checkpoints[f](#cf) - mixed-precision training via [apex](https://github.com/NVIDIA/apex)[g](#cg) +
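The manually-specified SWA option above (see also build_swa_from_ckpts in configs/gen_swa_ckpt.yaml) amounts to averaging parameter tensors across a set of saved checkpoints. A minimal sketch follows, assuming the checkpoints store their weights under a "state_dict" key; the repo's actual checkpoint layout and SWA handling may differ, and integer buffers would need special handling.

```python
# Sketch: average the parameters of N checkpoints into a single SWA state dict.
import torch

def average_checkpoints(ckpt_paths, state_key="state_dict"):
    avg = None
    for path in ckpt_paths:
        state = torch.load(path, map_location="cpu")[state_key]
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k, v in state.items():
                avg[k] += v.float()  # note: integer buffers would need special handling
    return {k: v / len(ckpt_paths) for k, v in avg.items()}

# usage sketch: the two basenames below are the checkpoints named in gen_swa_ckpt.yaml
swa_state = average_checkpoints([
    "checkpoint-0.5526-37-188214.pt",
    "checkpoint-0.5527-32-163449.pt",
])
torch.save({"state_dict": swa_state}, "checkpoint-swa_best_2_ckpts.pt")
```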
    +
    Analysis & reporting + -#### Analysis & reporting: - Extensive suite of reporting views for analyzing model performance at global and local levels +- A [current prediction explorer](current_explorer.html) that provides fact-checkers a means (one of many possible) of using current model predictions. This dashboard may also help those building fact-checking systems evaluate the potential utility of integrating similar models into their systems. - Statement and performance exploration dashboards for interpreting model predictions and understanding its performance - xkcd-themed visualization of UMAP-transformed statement embeddings - +
    --- ### Data Pipeline To conserve resources and for POC research expediency, the current pipeline uses a local relational DB (MariaDB). Ultimately, a distributed data store would be preferable and warranted if this project merits sufficient interest from the community or a POC involving a distributed network of models is initiated. -
    Deep Classiflie Data Pipeline +
    Deep Classiflie Data Pipeline ![Deep Classiflie Data Pipeline](/assets/deep_classiflie_data_pipeline.svg)
    -
    False Statement Filter Processes +
    False Statement Filter Processes ![False Statement Filter Processes](/assets/False%20Statement%20Filter%20Processes.svg)
    -
    Distribution Convergence Process +
    Distribution Convergence Process ![Distribution Convergence Process](/assets/Distribution%20Convergence%20Process.svg)
    -
    Dataset Generation Process +
    Dataset Generation Process ![Dataset Generation Process](/assets/Dataset%20Generation%20Process.svg) @@ -227,6 +227,7 @@ The parameters used in all Deep Classiflie job executions related to the develop | **gen_dashboards.yaml** | parameters used to generate model analysis dashboards | | **cust_predict.yaml** | parameters used to perform model inference on arbitrary input statements | | **tweetbot.yaml** | parameters used to run the tweetbot behind @DeepClassiflie | +| **infsvc.yaml** | parameters used to run the inference service behind the current prediction explorer | @@ -437,6 +438,7 @@ N.B. before you begin, the core external dependency is admin access to a mariadb
  • [e] Still in early development, there are significant outstanding issues (e.g. no tests yet!) and code quality shortcomings galore, but any constructive thoughts or contributions are welcome. I'm interested in using ML to curtail disinformation, not promulgate it, so I want to be clear -- this is essentially a fancy sentence similarity system with a lot of work put into building the dataset generation and model analysis data pipelines (I have a data engineering background, not a software engineering one).
  • [f] Current model release built/tested before swa graduated from torchcontrib to core pytorch. Next release of Deep Classiflie will use the integrated swa api.
  • [g] Current model release built/tested before AMP was integrated into core pytorch. Next release of Deep Classiflie will use the integrated AMP api.
  • +
&nbsp;&nbsp;• [h] N.B. This daemon may violate Twitter's [policy](https://help.twitter.com/en/rules-and-policies/twitter-automation) w.r.t. tweeting sensitive content if the subject's statements contain such content (no content-based filtering is included in the daemon). [@DeepClassiflie](https://twitter.com/DeepClassiflie) initially tested the Deep Classiflie twitter daemon but will post only framework-related announcements moving forward.
  • --- @@ -477,4 +479,3 @@ Feel free to star the [repo]({{ site.github.repository_url }}) as well if you [View on GitHub]({{ site.github.repository_url }}) - diff --git a/docs/assets/css/datatables.min.css b/docs/assets/css/datatables.min.css new file mode 100644 index 0000000..636685e --- /dev/null +++ b/docs/assets/css/datatables.min.css @@ -0,0 +1,18 @@ +/* + * This combined file was created by the DataTables downloader builder: + * https://datatables.net/download + * + * To rebuild or modify this file with the latest versions of the included + * software please visit: + * https://datatables.net/download/#dt/dt-1.10.21/fc-3.3.1 + * + * Included libraries: + * DataTables 1.10.21, FixedColumns 3.3.1 + */ + +table.dataTable{width:100%;margin:0 auto;clear:both;border-collapse:separate;border-spacing:0}table.dataTable thead th,table.dataTable tfoot th{font-weight:bold}table.dataTable thead th,table.dataTable thead td{padding:10px 18px;border-bottom:1px solid #111}table.dataTable thead th:active,table.dataTable thead td:active{outline:none}table.dataTable tfoot th,table.dataTable tfoot td{padding:10px 18px 6px 18px;border-top:1px solid #111}table.dataTable thead .sorting,table.dataTable thead .sorting_asc,table.dataTable thead .sorting_desc,table.dataTable thead .sorting_asc_disabled,table.dataTable thead .sorting_desc_disabled{cursor:pointer;*cursor:hand;background-repeat:no-repeat;background-position:center right}table.dataTable thead .sorting{background-image:url("/assets/sort_both.png")}table.dataTable thead .sorting_asc{background-image:url("/assets/sort_asc.png")}table.dataTable thead .sorting_desc{background-image:url("/assets/sort_desc.png")}table.dataTable thead .sorting_asc_disabled{background-image:url("/assets/sort_asc_disabled.png")}table.dataTable thead .sorting_desc_disabled{background-image:url("DataTables-1.10.21/images/sort_desc_disabled.png")}table.dataTable tbody tr{background-color:#ffffff}table.dataTable tbody tr.selected{background-color:#B0BED9}table.dataTable tbody th,table.dataTable tbody td{padding:8px 10px}table.dataTable.row-border tbody th,table.dataTable.row-border tbody td,table.dataTable.display tbody th,table.dataTable.display tbody td{border-top:1px solid #ddd}table.dataTable.row-border tbody tr:first-child th,table.dataTable.row-border tbody tr:first-child td,table.dataTable.display tbody tr:first-child th,table.dataTable.display tbody tr:first-child td{border-top:none}table.dataTable.cell-border tbody th,table.dataTable.cell-border tbody td{border-top:1px solid #ddd;border-right:1px solid #ddd}table.dataTable.cell-border tbody tr th:first-child,table.dataTable.cell-border tbody tr td:first-child{border-left:1px solid #ddd}table.dataTable.cell-border tbody tr:first-child th,table.dataTable.cell-border tbody tr:first-child td{border-top:none}table.dataTable.stripe tbody tr.odd,table.dataTable.display tbody tr.odd{background-color:#f9f9f9}table.dataTable.stripe tbody tr.odd.selected,table.dataTable.display tbody tr.odd.selected{background-color:#acbad4}table.dataTable.hover tbody tr:hover,table.dataTable.display tbody tr:hover{background-color:#f6f6f6}table.dataTable.hover tbody tr:hover.selected,table.dataTable.display tbody tr:hover.selected{background-color:#aab7d1}table.dataTable.order-column tbody tr>.sorting_1,table.dataTable.order-column tbody tr>.sorting_2,table.dataTable.order-column tbody tr>.sorting_3,table.dataTable.display tbody tr>.sorting_1,table.dataTable.display tbody tr>.sorting_2,table.dataTable.display tbody 
tr>.sorting_3{background-color:#fafafa}table.dataTable.order-column tbody tr.selected>.sorting_1,table.dataTable.order-column tbody tr.selected>.sorting_2,table.dataTable.order-column tbody tr.selected>.sorting_3,table.dataTable.display tbody tr.selected>.sorting_1,table.dataTable.display tbody tr.selected>.sorting_2,table.dataTable.display tbody tr.selected>.sorting_3{background-color:#acbad5}table.dataTable.display tbody tr.odd>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd>.sorting_1{background-color:#f1f1f1}table.dataTable.display tbody tr.odd>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd>.sorting_2{background-color:#f3f3f3}table.dataTable.display tbody tr.odd>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd>.sorting_3{background-color:whitesmoke}table.dataTable.display tbody tr.odd.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_1{background-color:#a6b4cd}table.dataTable.display tbody tr.odd.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_2{background-color:#a8b5cf}table.dataTable.display tbody tr.odd.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_3{background-color:#a9b7d1}table.dataTable.display tbody tr.even>.sorting_1,table.dataTable.order-column.stripe tbody tr.even>.sorting_1{background-color:#fafafa}table.dataTable.display tbody tr.even>.sorting_2,table.dataTable.order-column.stripe tbody tr.even>.sorting_2{background-color:#fcfcfc}table.dataTable.display tbody tr.even>.sorting_3,table.dataTable.order-column.stripe tbody tr.even>.sorting_3{background-color:#fefefe}table.dataTable.display tbody tr.even.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_1{background-color:#acbad5}table.dataTable.display tbody tr.even.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_2{background-color:#aebcd6}table.dataTable.display tbody tr.even.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_3{background-color:#afbdd8}table.dataTable.display tbody tr:hover>.sorting_1,table.dataTable.order-column.hover tbody tr:hover>.sorting_1{background-color:#eaeaea}table.dataTable.display tbody tr:hover>.sorting_2,table.dataTable.order-column.hover tbody tr:hover>.sorting_2{background-color:#ececec}table.dataTable.display tbody tr:hover>.sorting_3,table.dataTable.order-column.hover tbody tr:hover>.sorting_3{background-color:#efefef}table.dataTable.display tbody tr:hover.selected>.sorting_1,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_1{background-color:#a2aec7}table.dataTable.display tbody tr:hover.selected>.sorting_2,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_2{background-color:#a3b0c9}table.dataTable.display tbody tr:hover.selected>.sorting_3,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_3{background-color:#a5b2cb}table.dataTable.no-footer{border-bottom:1px solid #111}table.dataTable.nowrap th,table.dataTable.nowrap td{white-space:nowrap}table.dataTable.compact thead th,table.dataTable.compact thead td{padding:4px 17px}table.dataTable.compact tfoot th,table.dataTable.compact tfoot td{padding:4px}table.dataTable.compact tbody th,table.dataTable.compact tbody td{padding:4px}table.dataTable th.dt-left,table.dataTable td.dt-left{text-align:left}table.dataTable th.dt-center,table.dataTable td.dt-center,table.dataTable td.dataTables_empty{text-align:center}table.dataTable 
th.dt-right,table.dataTable td.dt-right{text-align:right}table.dataTable th.dt-justify,table.dataTable td.dt-justify{text-align:justify}table.dataTable th.dt-nowrap,table.dataTable td.dt-nowrap{white-space:nowrap}table.dataTable thead th.dt-head-left,table.dataTable thead td.dt-head-left,table.dataTable tfoot th.dt-head-left,table.dataTable tfoot td.dt-head-left{text-align:left}table.dataTable thead th.dt-head-center,table.dataTable thead td.dt-head-center,table.dataTable tfoot th.dt-head-center,table.dataTable tfoot td.dt-head-center{text-align:center}table.dataTable thead th.dt-head-right,table.dataTable thead td.dt-head-right,table.dataTable tfoot th.dt-head-right,table.dataTable tfoot td.dt-head-right{text-align:right}table.dataTable thead th.dt-head-justify,table.dataTable thead td.dt-head-justify,table.dataTable tfoot th.dt-head-justify,table.dataTable tfoot td.dt-head-justify{text-align:justify}table.dataTable thead th.dt-head-nowrap,table.dataTable thead td.dt-head-nowrap,table.dataTable tfoot th.dt-head-nowrap,table.dataTable tfoot td.dt-head-nowrap{white-space:nowrap}table.dataTable tbody th.dt-body-left,table.dataTable tbody td.dt-body-left{text-align:left}table.dataTable tbody th.dt-body-center,table.dataTable tbody td.dt-body-center{text-align:center}table.dataTable tbody th.dt-body-right,table.dataTable tbody td.dt-body-right{text-align:right}table.dataTable tbody th.dt-body-justify,table.dataTable tbody td.dt-body-justify{text-align:justify}table.dataTable tbody th.dt-body-nowrap,table.dataTable tbody td.dt-body-nowrap{white-space:nowrap}table.dataTable,table.dataTable th,table.dataTable td{box-sizing:content-box}.dataTables_wrapper{position:relative;clear:both;*zoom:1;zoom:1}.dataTables_wrapper .dataTables_length{float:left}.dataTables_wrapper .dataTables_filter{float:right;text-align:right}.dataTables_wrapper .dataTables_filter input{margin-left:0.5em}.dataTables_wrapper .dataTables_info{clear:both;float:left;padding-top:0.755em}.dataTables_wrapper .dataTables_paginate{float:right;text-align:right;padding-top:0.25em}.dataTables_wrapper .dataTables_paginate .paginate_button{box-sizing:border-box;display:inline-block;min-width:1.5em;padding:0.5em 1em;margin-left:2px;text-align:center;text-decoration:none !important;cursor:pointer;*cursor:hand;color:#333 !important;border:1px solid transparent;border-radius:2px}.dataTables_wrapper .dataTables_paginate .paginate_button.current,.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover{color:#333 !important;border:1px solid #979797;background-color:white;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #fff), color-stop(100%, #dcdcdc));background:-webkit-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-moz-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-ms-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-o-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:linear-gradient(to bottom, #fff 0%, #dcdcdc 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button.disabled,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:active{cursor:default;color:#666 !important;border:1px solid transparent;background:transparent;box-shadow:none}.dataTables_wrapper .dataTables_paginate .paginate_button:hover{color:white !important;border:1px solid #111;background-color:#585858;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #585858), color-stop(100%, 
#111));background:-webkit-linear-gradient(top, #585858 0%, #111 100%);background:-moz-linear-gradient(top, #585858 0%, #111 100%);background:-ms-linear-gradient(top, #585858 0%, #111 100%);background:-o-linear-gradient(top, #585858 0%, #111 100%);background:linear-gradient(to bottom, #585858 0%, #111 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button:active{outline:none;background-color:#2b2b2b;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #2b2b2b), color-stop(100%, #0c0c0c));background:-webkit-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-moz-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-ms-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-o-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:linear-gradient(to bottom, #2b2b2b 0%, #0c0c0c 100%);box-shadow:inset 0 0 3px #111}.dataTables_wrapper .dataTables_paginate .ellipsis{padding:0 1em}.dataTables_wrapper .dataTables_processing{position:absolute;top:50%;left:50%;width:100%;height:40px;margin-left:-50%;margin-top:-25px;padding-top:20px;text-align:center;font-size:1.2em;background-color:white;background:-webkit-gradient(linear, left top, right top, color-stop(0%, rgba(255,255,255,0)), color-stop(25%, rgba(255,255,255,0.9)), color-stop(75%, rgba(255,255,255,0.9)), color-stop(100%, rgba(255,255,255,0)));background:-webkit-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-moz-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-ms-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-o-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:linear-gradient(to right, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%)}.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter,.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_processing,.dataTables_wrapper .dataTables_paginate{color:#333}.dataTables_wrapper .dataTables_scroll{clear:both}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody{*margin-top:-1px;-webkit-overflow-scrolling:touch}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>td{vertical-align:middle}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>td>div.dataTables_sizing{height:0;overflow:hidden;margin:0 !important;padding:0 !important}.dataTables_wrapper.no-footer .dataTables_scrollBody{border-bottom:1px solid #111}.dataTables_wrapper.no-footer div.dataTables_scrollHead table.dataTable,.dataTables_wrapper.no-footer 
div.dataTables_scrollBody>table{border-bottom:none}.dataTables_wrapper:after{visibility:hidden;display:block;content:"";clear:both;height:0}@media screen and (max-width: 767px){.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_paginate{float:none;text-align:center}.dataTables_wrapper .dataTables_paginate{margin-top:0.5em}}@media screen and (max-width: 640px){.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter{float:none;text-align:center}.dataTables_wrapper .dataTables_filter{margin-top:0.5em}} + + +table.DTFC_Cloned thead,table.DTFC_Cloned tfoot{background-color:white}div.DTFC_Blocker{background-color:white}div.DTFC_LeftWrapper table.dataTable,div.DTFC_RightWrapper table.dataTable{margin-bottom:0;z-index:2}div.DTFC_LeftWrapper table.dataTable.no-footer,div.DTFC_RightWrapper table.dataTable.no-footer{border-bottom:none}table.dataTable.display tbody tr.DTFC_NoData{background-color:transparent} + + diff --git a/docs/assets/css/loading_bar.css b/docs/assets/css/loading_bar.css new file mode 100644 index 0000000..8bcd3bd --- /dev/null +++ b/docs/assets/css/loading_bar.css @@ -0,0 +1,18 @@ +.ldBar { + position: relative; +} +.ldBar.label-center > .ldBar-label { + position: absolute; + top: 50%; + left: 50%; + -webkit-transform: translate(-50%, -50%); + transform: translate(-50%, -50%); + text-shadow: 0 0 3px #fff; +} +.ldBar-label:after { + content: "%"; + display: inline; +} +.ldBar.no-percent .ldBar-label:after { + content: ""; +} \ No newline at end of file diff --git a/docs/assets/css/style.scss b/docs/assets/css/style.scss index 59da101..b895b07 100644 --- a/docs/assets/css/style.scss +++ b/docs/assets/css/style.scss @@ -113,6 +113,65 @@ blockquote ul li strong { font-size: 0.5rem; } +table#curr_preds, table.dataTable { + width: 100%; + font-size: 0.8rem; +} + +table#curr_preds a { + text-decoration: none; +} + +div.claim { + max-height: 100px; + overflow:hidden; + min-width: 13rem; +} + +table#curr_preds .falsehood { + color: darkred; + padding: 0rem; +} + +table#curr_preds .nofalsehood { + color: darkgreen; + padding: 0rem; +} + +table#curr_preds tr, table#curr_preds td { + background: transparent; + border: outset; + border-width:1px; + text-align:center; +} + +.dataTables_length, .dataTables_info, .dt_instructions, .loading_msg { + color: #003566; +} + +#progress_bar { + left: 35%; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button { + padding: 0.2em 0.4em; +} + +#curr_preds_length.dataTables_length, #curr_preds_filter.dataTables_filter, .dataTables_info { + text-align:left; + font-size: 0.8rem; + padding-top:0rem; +} +#curr_preds_filter.dataTables_filter { + padding-bottom: 0.5rem; +} +table.curr_preds th { + background-color: #003566; + color: #ffffff; + border: outset; + border-width:1px; +} + .site-footer { visibility: hidden; } @@ -230,10 +289,25 @@ body { color: #000; padding-top: 5px; padding-bottom:0px; - max-width: min-content; + max-width: max-content; display: inline-flex; float: left; - margin-bottom: 0.5rem; + margin-bottom: 0px; +} + +.cust-nav-wrap ul { + display: flex; + flex-wrap: wrap; + justify-content: center; + margin-left: 0px; + padding-left: 0px; + padding-bottom:0px; +} +.cust-nav-wrap li { + display: block; + list-style-type: none; + padding-top: 0.3rem; + margin-left: 0.2rem; } details strong { @@ -241,7 +315,7 @@ font-weight: bold; color: #003566; } .cust-nav-wrap :first-child { - margin-top: 4px; + margin-top: 0px; } .main-content { @@ -437,6 +511,47 @@ a:hover, a:active { font-weight: bold; } 
+.dt_instructions, .loading_msg { + margin-left: 0rem; + font-size: 1.0rem; + font-weight: bold; + font-family: 'Humor Sans'; +} + +.paginate_button { + font-size: 0.8rem; +} + +.outer_container .curr_preds, #curr_preds_wrapper { + font-family: 'Humor Sans'; +} + +.ldBar-label { + color: green; + font-family: 'varela round'; + font-size: 1.2em; + //font-weight: 900; + } + +.ldBar.label-center > .ldBar-label { + position: absolute; + top: 50%; + left: 50%; + font-family: 'humor sans'; + } + +.ldBar path.mainline { + stroke-width: 10; + stroke: #000356; + stroke-linecap: round; +} +.ldBar path.baseline { + stroke-width: 14; + stroke: #f1f2f3; + stroke-linecap: round; + filter:url(#custom-shadow); +} + .stmt_title { margin-left: 0rem; font-size: 0.8rem; @@ -612,6 +727,17 @@ details[open] summary:after { margin-left: 0rem; font-size: 1.1rem; } + .dt_instructions, .loading_msg { + margin-left: 0rem; + font-size: 1.2rem; + } + #progress_bar { + left: unset; + margin-left: 2rem; + } + .dataTables_wrapper .dataTables_paginate .paginate_button { + padding: 0.2em 0.4em; + } .stmt_title { margin-left: 0rem; font-size: 1.1rem; @@ -675,7 +801,7 @@ details[open] summary:after { .cust-nav-wrap { float: unset; display: flex; -} + } .footnotes { font-size: 0.8rem; } @@ -701,6 +827,27 @@ details[open] summary:after { float: left; margin-right: 1.0rem; } + div.claim { + height: unset; + overflow:hidden; + text-align: left; + } + table.dataTable tbody td { + padding: 0rem 0.2rem; + margin: 0rem 0rem; + } + table#curr_preds, table.dataTable { + width: 100%; + font-size: 1.0rem; + } + #curr_preds_length.dataTables_length, #curr_preds_filter.dataTables_filter, .dataTables_info { + text-align:left; + font-size: 1.0rem; + padding-bottom: 0.5rem; + } + .paginate_button { + font-size: 1.0rem; + } } .radio_wrapper .bk-btn-default { @@ -729,6 +876,7 @@ details[open] summary:after { /* background:linear-gradient(to bottom, #e4685d 5%, #cc5500 100%); #e4685d*/ } + .outer_container .bk-root .bk-tabs-header.bk-above .bk-tab { color: #003566; background-color:transparent; @@ -910,10 +1058,6 @@ ol { overflow: auto; } - - - - .highlight .nt { color: #82B1FF; } diff --git a/docs/assets/js/current_pred_nav.js b/docs/assets/js/current_pred_nav.js new file mode 100644 index 0000000..44300f7 --- /dev/null +++ b/docs/assets/js/current_pred_nav.js @@ -0,0 +1,152 @@ +async function checkNew(latest_recs){ + let headers = {}; + const response = await fetch(latest_recs); + for(const header of response.headers){ + console.log(`Name: ${header[0]}, Value:${header[1]}`); + headers[header[0]] = header[1]; + } + return headers; + } + +async function wait(ms) { + return new Promise(resolve => { + setTimeout(resolve, ms); + }); +} + +function config_progbar(){ + let options = { + "stroke": '#003566', + "stroke-width": 20, + "preset": "circle", + "value": 0 + }; + let bar = new ldBar("#progress_bar", options); + return bar; +} + +async function readStreamResponse(response) { + let pct_progress = 0; + let bar = await config_progbar(); + const reader = await response.body.getReader(); + const contentLength = +response.headers.get('Content-Length'); + let receivedLength = 0; + let chunks = []; + while(true) { + const {done, value} = await reader.read(); + if (done) { + break; + } + chunks.push(value); + receivedLength += value.length; + pct_progress = Math.round((receivedLength/contentLength)*100); + await bar.set(pct_progress); + await wait(20); + } + let chunksAll = new Uint8Array(receivedLength); + let position = 0; + for(let chunk of chunks) { 
+ chunksAll.set(chunk, position); + position += chunk.length; + } + return new TextDecoder("utf-8").decode(chunksAll); + +} + +function fetchTimeout(url, ms){ + const controller = new AbortController(); + var signal = controller.signal; + const promise = fetch(url, { signal }); + const timeout = setTimeout(() => controller.abort(), ms); + return promise.finally(() => clearTimeout(timeout)); +} + + +async function loadJSON(latest_recs) { + let response = {}; + try { + response = await fetchTimeout(latest_recs[0], 10000); + } catch(err) { + console.log("Failed loading from primary gateway "+latest_recs[0]+" attempting to fetch from backup "+latest_recs[1]); + await $("#progress_bar").toggle(); + $( ".loading_msg" ).html( "Failed loading from primary gateway:
    "+latest_recs[0]+ + ".
Attempting to fetch from backup<br> "+
+ latest_recs[1]+"...<br>
    " ); + response = await fetchTimeout(latest_recs[1], 10000); + } + let result = await readStreamResponse(response); + let curr_json = await JSON.parse(result); + return curr_json +} + +function dec_fmt( data, type, row ) { + return data.toFixed(2); +} + +async function add_table_func() { + let curr_json = {}; + var num_fmt = ['', '.', 2, '']; + const latest_recs = ["https://gateway.pinata.cloud/ipns/predictions-dev.deepclassiflie.org", + "https://cloudflare-ipfs.com/ipns/predictions-dev.deepclassiflie.org"]; + curr_json = await loadJSON(latest_recs); + var datatab = $('#curr_preds').DataTable( { + data: curr_json, + columns: [ + { data: 'claim_text', + render: function ( data, type, row ) { + return ''; + } + }, + { data: 'prediction', + render: function ( data, type, row ) { + let claim_class = '', + dtext = ''; + if (data == 0) { + claim_class='nofalsehood'; + dtext = 'No Falsehood Label'; + }else{ + claim_class='falsehood'; + dtext = 'Falsehood Label'; + } + return ''+dtext+''; + } + }, + { data: 'bucket_acc', + render: $.fn.dataTable.render.number(...num_fmt), + type: 'num'}, + { data: 'ppv', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'npv', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'ppr', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'npr', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'tp_ratio', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'tn_ratio', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'fp_ratio', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 'fn_ratio', render: $.fn.dataTable.render.number(...num_fmt)}, + { data: 't_date', + render: function ( data, type, row ) { + if ( type === 'display' || type === 'filter' ) { + var d = new Date( data ); + return (d.getMonth()+1) + '/' + (d.getDate()+1) +'/'+ d.getFullYear(); + } + return data; + } + } + ], + order: [[ 11, 'desc' ]], + scrollY: 400, + scrollX: 300 + } ); + return datatab; +} + +async function load_latest(){ + await $(".dt_instructions").toggle(); + var datatab = await add_table_func(); + await $(".loading_msg").toggle(); + await $(".dt_instructions").toggle(); + await $("#progress_bar").hide(); + window.dispatchEvent(new Event('resize')); + +} + +$(document).ready( load_latest ); \ No newline at end of file diff --git a/docs/assets/js/datatables.js b/docs/assets/js/datatables.js new file mode 100644 index 0000000..20fbdf8 --- /dev/null +++ b/docs/assets/js/datatables.js @@ -0,0 +1,17047 @@ +/* + * This combined file was created by the DataTables downloader builder: + * https://datatables.net/download + * + * To rebuild or modify this file with the latest versions of the included + * software please visit: + * https://datatables.net/download/#dt/dt-1.10.21/fc-3.3.1 + * + * Included libraries: + * DataTables 1.10.21, FixedColumns 3.3.1 + */ + +/*! DataTables 1.10.21 + * ©2008-2020 SpryMedia Ltd - datatables.net/license + */ + +/** + * @summary DataTables + * @description Paginate, search and order HTML tables + * @version 1.10.21 + * @file jquery.dataTables.js + * @author SpryMedia Ltd + * @contact www.datatables.net + * @copyright Copyright 2008-2020 SpryMedia Ltd. + * + * This source file is free software, available under the following license: + * MIT license - http://datatables.net/license + * + * This source file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. 
See the license files for details. + * + * For details please refer to: http://www.datatables.net + */ + +/*jslint evil: true, undef: true, browser: true */ +/*globals $,require,jQuery,define,_selector_run,_selector_opts,_selector_first,_selector_row_indexes,_ext,_Api,_api_register,_api_registerPlural,_re_new_lines,_re_html,_re_formatted_numeric,_re_escape_regex,_empty,_intVal,_numToDecimal,_isNumber,_isHtml,_htmlNumeric,_pluck,_pluck_order,_range,_stripHtml,_unique,_fnBuildAjax,_fnAjaxUpdate,_fnAjaxParameters,_fnAjaxUpdateDraw,_fnAjaxDataSrc,_fnAddColumn,_fnColumnOptions,_fnAdjustColumnSizing,_fnVisibleToColumnIndex,_fnColumnIndexToVisible,_fnVisbleColumns,_fnGetColumns,_fnColumnTypes,_fnApplyColumnDefs,_fnHungarianMap,_fnCamelToHungarian,_fnLanguageCompat,_fnBrowserDetect,_fnAddData,_fnAddTr,_fnNodeToDataIndex,_fnNodeToColumnIndex,_fnGetCellData,_fnSetCellData,_fnSplitObjNotation,_fnGetObjectDataFn,_fnSetObjectDataFn,_fnGetDataMaster,_fnClearTable,_fnDeleteIndex,_fnInvalidate,_fnGetRowElements,_fnCreateTr,_fnBuildHead,_fnDrawHead,_fnDraw,_fnReDraw,_fnAddOptionsHtml,_fnDetectHeader,_fnGetUniqueThs,_fnFeatureHtmlFilter,_fnFilterComplete,_fnFilterCustom,_fnFilterColumn,_fnFilter,_fnFilterCreateSearch,_fnEscapeRegex,_fnFilterData,_fnFeatureHtmlInfo,_fnUpdateInfo,_fnInfoMacros,_fnInitialise,_fnInitComplete,_fnLengthChange,_fnFeatureHtmlLength,_fnFeatureHtmlPaginate,_fnPageChange,_fnFeatureHtmlProcessing,_fnProcessingDisplay,_fnFeatureHtmlTable,_fnScrollDraw,_fnApplyToChildren,_fnCalculateColumnWidths,_fnThrottle,_fnConvertToWidth,_fnGetWidestNode,_fnGetMaxLenString,_fnStringToCss,_fnSortFlatten,_fnSort,_fnSortAria,_fnSortListener,_fnSortAttachListener,_fnSortingClasses,_fnSortData,_fnSaveState,_fnLoadState,_fnSettingsFromNode,_fnLog,_fnMap,_fnBindAction,_fnCallbackReg,_fnCallbackFire,_fnLengthOverflow,_fnRenderer,_fnDataSource,_fnRowAttributes*/ + +(function( factory ) { + "use strict"; + + if ( typeof define === 'function' && define.amd ) { + // AMD + define( ['jquery'], function ( $ ) { + return factory( $, window, document ); + } ); + } + else if ( typeof exports === 'object' ) { + // CommonJS + module.exports = function (root, $) { + if ( ! root ) { + // CommonJS environments without a window global must pass a + // root. This will give an error otherwise + root = window; + } + + if ( ! $ ) { + $ = typeof window !== 'undefined' ? // jQuery's factory checks for a global window + require('jquery') : + require('jquery')( root ); + } + + return factory( $, root, root.document ); + }; + } + else { + // Browser + factory( jQuery, window, document ); + } +} +(function( $, window, document, undefined ) { + "use strict"; + + /** + * DataTables is a plug-in for the jQuery Javascript library. It is a highly + * flexible tool, based upon the foundations of progressive enhancement, + * which will add advanced interaction controls to any HTML table. For a + * full list of features please refer to + * [DataTables.net](href="http://datatables.net). + * + * Note that the `DataTable` object is not a global variable but is aliased + * to `jQuery.fn.DataTable` and `jQuery.fn.dataTable` through which it may + * be accessed. + * + * @class + * @param {object} [init={}] Configuration object for DataTables. 
Options + * are defined by {@link DataTable.defaults} + * @requires jQuery 1.7+ + * + * @example + * // Basic initialisation + * $(document).ready( function { + * $('#example').dataTable(); + * } ); + * + * @example + * // Initialisation with configuration options - in this case, disable + * // pagination and sorting. + * $(document).ready( function { + * $('#example').dataTable( { + * "paginate": false, + * "sort": false + * } ); + * } ); + */ + var DataTable = function ( options ) + { + /** + * Perform a jQuery selector action on the table's TR elements (from the tbody) and + * return the resulting jQuery object. + * @param {string|node|jQuery} sSelector jQuery selector or node collection to act on + * @param {object} [oOpts] Optional parameters for modifying the rows to be included + * @param {string} [oOpts.filter=none] Select TR elements that meet the current filter + * criterion ("applied") or all TR elements (i.e. no filter). + * @param {string} [oOpts.order=current] Order of the TR elements in the processed array. + * Can be either 'current', whereby the current sorting of the table is used, or + * 'original' whereby the original order the data was read into the table is used. + * @param {string} [oOpts.page=all] Limit the selection to the currently displayed page + * ("current") or not ("all"). If 'current' is given, then order is assumed to be + * 'current' and filter is 'applied', regardless of what they might be given as. + * @returns {object} jQuery object, filtered by the given selector. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Highlight every second row + * oTable.$('tr:odd').css('backgroundColor', 'blue'); + * } ); + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Filter to rows with 'Webkit' in them, add a background colour and then + * // remove the filter, thus highlighting the 'Webkit' rows only. + * oTable.fnFilter('Webkit'); + * oTable.$('tr', {"search": "applied"}).css('backgroundColor', 'blue'); + * oTable.fnFilter(''); + * } ); + */ + this.$ = function ( sSelector, oOpts ) + { + return this.api(true).$( sSelector, oOpts ); + }; + + + /** + * Almost identical to $ in operation, but in this case returns the data for the matched + * rows - as such, the jQuery selector used should match TR row nodes or TD/TH cell nodes + * rather than any descendants, so the data can be obtained for the row/cell. If matching + * rows are found, the data returned is the original data array/object that was used to + * create the row (or a generated array if from a DOM source). + * + * This method is often useful in-combination with $ where both functions are given the + * same parameters and the array indexes will match identically. + * @param {string|node|jQuery} sSelector jQuery selector or node collection to act on + * @param {object} [oOpts] Optional parameters for modifying the rows to be included + * @param {string} [oOpts.filter=none] Select elements that meet the current filter + * criterion ("applied") or all elements (i.e. no filter). + * @param {string} [oOpts.order=current] Order of the data in the processed array. + * Can be either 'current', whereby the current sorting of the table is used, or + * 'original' whereby the original order the data was read into the table is used. + * @param {string} [oOpts.page=all] Limit the selection to the currently displayed page + * ("current") or not ("all"). 
If 'current' is given, then order is assumed to be + * 'current' and filter is 'applied', regardless of what they might be given as. + * @returns {array} Data for the matched elements. If any elements, as a result of the + * selector, were not TR, TD or TH elements in the DataTable, they will have a null + * entry in the array. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Get the data from the first row in the table + * var data = oTable._('tr:first'); + * + * // Do something useful with the data + * alert( "First cell is: "+data[0] ); + * } ); + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Filter to 'Webkit' and get all data for + * oTable.fnFilter('Webkit'); + * var data = oTable._('tr', {"search": "applied"}); + * + * // Do something with the data + * alert( data.length+" rows matched the search" ); + * } ); + */ + this._ = function ( sSelector, oOpts ) + { + return this.api(true).rows( sSelector, oOpts ).data(); + }; + + + /** + * Create a DataTables Api instance, with the currently selected tables for + * the Api's context. + * @param {boolean} [traditional=false] Set the API instance's context to be + * only the table referred to by the `DataTable.ext.iApiIndex` option, as was + * used in the API presented by DataTables 1.9- (i.e. the traditional mode), + * or if all tables captured in the jQuery object should be used. + * @return {DataTables.Api} + */ + this.api = function ( traditional ) + { + return traditional ? + new _Api( + _fnSettingsFromNode( this[ _ext.iApiIndex ] ) + ) : + new _Api( this ); + }; + + + /** + * Add a single new row or multiple rows of data to the table. Please note + * that this is suitable for client-side processing only - if you are using + * server-side processing (i.e. "bServerSide": true), then to add data, you + * must add it to the data source, i.e. the server-side, through an Ajax call. + * @param {array|object} data The data to be added to the table. This can be: + *
  <ul>
+ *     <li>1D array of data - add a single row with the data provided</li>
+ *     <li>2D array of arrays - add multiple rows in a single call</li>
+ *     <li>object - data object when using mData</li>
+ *     <li>array of objects - multiple data objects when using mData</li>
+ *   </ul>
    + * @param {bool} [redraw=true] redraw the table or not + * @returns {array} An array of integers, representing the list of indexes in + * aoData ({@link DataTable.models.oSettings}) that have been added to + * the table. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * // Global var for counter + * var giCount = 2; + * + * $(document).ready(function() { + * $('#example').dataTable(); + * } ); + * + * function fnClickAddRow() { + * $('#example').dataTable().fnAddData( [ + * giCount+".1", + * giCount+".2", + * giCount+".3", + * giCount+".4" ] + * ); + * + * giCount++; + * } + */ + this.fnAddData = function( data, redraw ) + { + var api = this.api( true ); + + /* Check if we want to add multiple rows or not */ + var rows = $.isArray(data) && ( $.isArray(data[0]) || $.isPlainObject(data[0]) ) ? + api.rows.add( data ) : + api.row.add( data ); + + if ( redraw === undefined || redraw ) { + api.draw(); + } + + return rows.flatten().toArray(); + }; + + + /** + * This function will make DataTables recalculate the column sizes, based on the data + * contained in the table and the sizes applied to the columns (in the DOM, CSS or + * through the sWidth parameter). This can be useful when the width of the table's + * parent element changes (for example a window resize). + * @param {boolean} [bRedraw=true] Redraw the table or not, you will typically want to + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable( { + * "sScrollY": "200px", + * "bPaginate": false + * } ); + * + * $(window).on('resize', function () { + * oTable.fnAdjustColumnSizing(); + * } ); + * } ); + */ + this.fnAdjustColumnSizing = function ( bRedraw ) + { + var api = this.api( true ).columns.adjust(); + var settings = api.settings()[0]; + var scroll = settings.oScroll; + + if ( bRedraw === undefined || bRedraw ) { + api.draw( false ); + } + else if ( scroll.sX !== "" || scroll.sY !== "" ) { + /* If not redrawing, but scrolling, we want to apply the new column sizes anyway */ + _fnScrollDraw( settings ); + } + }; + + + /** + * Quickly and simply clear a table + * @param {bool} [bRedraw=true] redraw the table or not + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Immediately 'nuke' the current rows (perhaps waiting for an Ajax callback...) + * oTable.fnClearTable(); + * } ); + */ + this.fnClearTable = function( bRedraw ) + { + var api = this.api( true ).clear(); + + if ( bRedraw === undefined || bRedraw ) { + api.draw(); + } + }; + + + /** + * The exact opposite of 'opening' a row, this function will close any rows which + * are currently 'open'. 
+ * @param {node} nTr the table row to 'close' + * @returns {int} 0 on success, or 1 if failed (can't find the row) + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable; + * + * // 'open' an information row when a row is clicked on + * $('#example tbody tr').click( function () { + * if ( oTable.fnIsOpen(this) ) { + * oTable.fnClose( this ); + * } else { + * oTable.fnOpen( this, "Temporary row opened", "info_row" ); + * } + * } ); + * + * oTable = $('#example').dataTable(); + * } ); + */ + this.fnClose = function( nTr ) + { + this.api( true ).row( nTr ).child.hide(); + }; + + + /** + * Remove a row for the table + * @param {mixed} target The index of the row from aoData to be deleted, or + * the TR element you want to delete + * @param {function|null} [callBack] Callback function + * @param {bool} [redraw=true] Redraw the table or not + * @returns {array} The row that was deleted + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Immediately remove the first row + * oTable.fnDeleteRow( 0 ); + * } ); + */ + this.fnDeleteRow = function( target, callback, redraw ) + { + var api = this.api( true ); + var rows = api.rows( target ); + var settings = rows.settings()[0]; + var data = settings.aoData[ rows[0][0] ]; + + rows.remove(); + + if ( callback ) { + callback.call( this, settings, data ); + } + + if ( redraw === undefined || redraw ) { + api.draw(); + } + + return data; + }; + + + /** + * Restore the table to it's original state in the DOM by removing all of DataTables + * enhancements, alterations to the DOM structure of the table and event listeners. + * @param {boolean} [remove=false] Completely remove the table from the DOM + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * // This example is fairly pointless in reality, but shows how fnDestroy can be used + * var oTable = $('#example').dataTable(); + * oTable.fnDestroy(); + * } ); + */ + this.fnDestroy = function ( remove ) + { + this.api( true ).destroy( remove ); + }; + + + /** + * Redraw the table + * @param {bool} [complete=true] Re-filter and resort (if enabled) the table before the draw. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Re-draw the table - you wouldn't want to do it here, but it's an example :-) + * oTable.fnDraw(); + * } ); + */ + this.fnDraw = function( complete ) + { + // Note that this isn't an exact match to the old call to _fnDraw - it takes + // into account the new data, but can hold position. + this.api( true ).draw( complete ); + }; + + + /** + * Filter the input based on data + * @param {string} sInput String to filter the table on + * @param {int|null} [iColumn] Column to limit filtering to + * @param {bool} [bRegex=false] Treat as regular expression or not + * @param {bool} [bSmart=true] Perform smart filtering or not + * @param {bool} [bShowGlobal=true] Show the input global filter in it's input box(es) + * @param {bool} [bCaseInsensitive=true] Do case-insensitive matching (true) or not (false) + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Sometime later - filter... 
+ * oTable.fnFilter( 'test string' ); + * } ); + */ + this.fnFilter = function( sInput, iColumn, bRegex, bSmart, bShowGlobal, bCaseInsensitive ) + { + var api = this.api( true ); + + if ( iColumn === null || iColumn === undefined ) { + api.search( sInput, bRegex, bSmart, bCaseInsensitive ); + } + else { + api.column( iColumn ).search( sInput, bRegex, bSmart, bCaseInsensitive ); + } + + api.draw(); + }; + + + /** + * Get the data for the whole table, an individual row or an individual cell based on the + * provided parameters. + * @param {int|node} [src] A TR row node, TD/TH cell node or an integer. If given as + * a TR node then the data source for the whole row will be returned. If given as a + * TD/TH cell node then iCol will be automatically calculated and the data for the + * cell returned. If given as an integer, then this is treated as the aoData internal + * data index for the row (see fnGetPosition) and the data for that row used. + * @param {int} [col] Optional column index that you want the data of. + * @returns {array|object|string} If mRow is undefined, then the data for all rows is + * returned. If mRow is defined, just data for that row, and is iCol is + * defined, only data for the designated cell is returned. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * // Row data + * $(document).ready(function() { + * oTable = $('#example').dataTable(); + * + * oTable.$('tr').click( function () { + * var data = oTable.fnGetData( this ); + * // ... do something with the array / object of data for the row + * } ); + * } ); + * + * @example + * // Individual cell data + * $(document).ready(function() { + * oTable = $('#example').dataTable(); + * + * oTable.$('td').click( function () { + * var sData = oTable.fnGetData( this ); + * alert( 'The cell clicked on had the value of '+sData ); + * } ); + * } ); + */ + this.fnGetData = function( src, col ) + { + var api = this.api( true ); + + if ( src !== undefined ) { + var type = src.nodeName ? src.nodeName.toLowerCase() : ''; + + return col !== undefined || type == 'td' || type == 'th' ? + api.cell( src, col ).data() : + api.row( src ).data() || null; + } + + return api.data().toArray(); + }; + + + /** + * Get an array of the TR nodes that are used in the table's body. Note that you will + * typically want to use the '$' API method in preference to this as it is more + * flexible. + * @param {int} [iRow] Optional row index for the TR element you want + * @returns {array|node} If iRow is undefined, returns an array of all TR elements + * in the table's body, or iRow is defined, just the TR element requested. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Get the nodes from the table + * var nNodes = oTable.fnGetNodes( ); + * } ); + */ + this.fnGetNodes = function( iRow ) + { + var api = this.api( true ); + + return iRow !== undefined ? + api.row( iRow ).node() : + api.rows().nodes().flatten().toArray(); + }; + + + /** + * Get the array indexes of a particular cell from it's DOM element + * and column index including hidden columns + * @param {node} node this can either be a TR, TD or TH in the table's body + * @returns {int} If nNode is given as a TR, then a single index is returned, or + * if given as a cell, an array of [row index, column index (visible), + * column index (all)] is given. 
+ * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * $('#example tbody td').click( function () { + * // Get the position of the current data from the node + * var aPos = oTable.fnGetPosition( this ); + * + * // Get the data array for this row + * var aData = oTable.fnGetData( aPos[0] ); + * + * // Update the data array and return the value + * aData[ aPos[1] ] = 'clicked'; + * this.innerHTML = 'clicked'; + * } ); + * + * // Init DataTables + * oTable = $('#example').dataTable(); + * } ); + */ + this.fnGetPosition = function( node ) + { + var api = this.api( true ); + var nodeName = node.nodeName.toUpperCase(); + + if ( nodeName == 'TR' ) { + return api.row( node ).index(); + } + else if ( nodeName == 'TD' || nodeName == 'TH' ) { + var cell = api.cell( node ).index(); + + return [ + cell.row, + cell.columnVisible, + cell.column + ]; + } + return null; + }; + + + /** + * Check to see if a row is 'open' or not. + * @param {node} nTr the table row to check + * @returns {boolean} true if the row is currently open, false otherwise + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable; + * + * // 'open' an information row when a row is clicked on + * $('#example tbody tr').click( function () { + * if ( oTable.fnIsOpen(this) ) { + * oTable.fnClose( this ); + * } else { + * oTable.fnOpen( this, "Temporary row opened", "info_row" ); + * } + * } ); + * + * oTable = $('#example').dataTable(); + * } ); + */ + this.fnIsOpen = function( nTr ) + { + return this.api( true ).row( nTr ).child.isShown(); + }; + + + /** + * This function will place a new row directly after a row which is currently + * on display on the page, with the HTML contents that is passed into the + * function. This can be used, for example, to ask for confirmation that a + * particular record should be deleted. + * @param {node} nTr The table row to 'open' + * @param {string|node|jQuery} mHtml The HTML to put into the row + * @param {string} sClass Class to give the new TD cell + * @returns {node} The row opened. Note that if the table row passed in as the + * first parameter, is not found in the table, this method will silently + * return. + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable; + * + * // 'open' an information row when a row is clicked on + * $('#example tbody tr').click( function () { + * if ( oTable.fnIsOpen(this) ) { + * oTable.fnClose( this ); + * } else { + * oTable.fnOpen( this, "Temporary row opened", "info_row" ); + * } + * } ); + * + * oTable = $('#example').dataTable(); + * } ); + */ + this.fnOpen = function( nTr, mHtml, sClass ) + { + return this.api( true ) + .row( nTr ) + .child( mHtml, sClass ) + .show() + .child()[0]; + }; + + + /** + * Change the pagination - provides the internal logic for pagination in a simple API + * function. With this function you can have a DataTables table go to the next, + * previous, first or last pages. + * @param {string|int} mAction Paging action to take: "first", "previous", "next" or "last" + * or page number to jump to (integer), note that page 0 is the first page. 
+ * @param {bool} [bRedraw=true] Redraw the table or not + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * oTable.fnPageChange( 'next' ); + * } ); + */ + this.fnPageChange = function ( mAction, bRedraw ) + { + var api = this.api( true ).page( mAction ); + + if ( bRedraw === undefined || bRedraw ) { + api.draw(false); + } + }; + + + /** + * Show a particular column + * @param {int} iCol The column whose display should be changed + * @param {bool} bShow Show (true) or hide (false) the column + * @param {bool} [bRedraw=true] Redraw the table or not + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Hide the second column after initialisation + * oTable.fnSetColumnVis( 1, false ); + * } ); + */ + this.fnSetColumnVis = function ( iCol, bShow, bRedraw ) + { + var api = this.api( true ).column( iCol ).visible( bShow ); + + if ( bRedraw === undefined || bRedraw ) { + api.columns.adjust().draw(); + } + }; + + + /** + * Get the settings for a particular table for external manipulation + * @returns {object} DataTables settings object. See + * {@link DataTable.models.oSettings} + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * var oSettings = oTable.fnSettings(); + * + * // Show an example parameter from the settings + * alert( oSettings._iDisplayStart ); + * } ); + */ + this.fnSettings = function() + { + return _fnSettingsFromNode( this[_ext.iApiIndex] ); + }; + + + /** + * Sort the table by a particular column + * @param {int} iCol the data index to sort on. Note that this will not match the + * 'display index' if you have hidden data entries + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Sort immediately with columns 0 and 1 + * oTable.fnSort( [ [0,'asc'], [1,'asc'] ] ); + * } ); + */ + this.fnSort = function( aaSort ) + { + this.api( true ).order( aaSort ).draw(); + }; + + + /** + * Attach a sort listener to an element for a given column + * @param {node} nNode the element to attach the sort listener to + * @param {int} iColumn the column that a click on this node will sort on + * @param {function} [fnCallback] callback function when sort is run + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * + * // Sort on column 1, when 'sorter' is clicked on + * oTable.fnSortListener( document.getElementById('sorter'), 1 ); + * } ); + */ + this.fnSortListener = function( nNode, iColumn, fnCallback ) + { + this.api( true ).order.listener( nNode, iColumn, fnCallback ); + }; + + + /** + * Update a table cell or row - this method will accept either a single value to + * update the cell with, an array of values with one element for each column or + * an object in the same format as the original data source. The function is + * self-referencing in order to make the multi column updates easier. + * @param {object|array|string} mData Data to update the cell/row with + * @param {node|int} mRow TR element you want to update or the aoData index + * @param {int} [iColumn] The column to update, give as null or undefined to + * update a whole row. 
+ * @param {bool} [bRedraw=true] Redraw the table or not + * @param {bool} [bAction=true] Perform pre-draw actions or not + * @returns {int} 0 on success, 1 on error + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * oTable.fnUpdate( 'Example update', 0, 0 ); // Single cell + * oTable.fnUpdate( ['a', 'b', 'c', 'd', 'e'], $('tbody tr')[0] ); // Row + * } ); + */ + this.fnUpdate = function( mData, mRow, iColumn, bRedraw, bAction ) + { + var api = this.api( true ); + + if ( iColumn === undefined || iColumn === null ) { + api.row( mRow ).data( mData ); + } + else { + api.cell( mRow, iColumn ).data( mData ); + } + + if ( bAction === undefined || bAction ) { + api.columns.adjust(); + } + + if ( bRedraw === undefined || bRedraw ) { + api.draw(); + } + return 0; + }; + + + /** + * Provide a common method for plug-ins to check the version of DataTables being used, in order + * to ensure compatibility. + * @param {string} sVersion Version string to check for, in the format "X.Y.Z". Note that the + * formats "X" and "X.Y" are also acceptable. + * @returns {boolean} true if this version of DataTables is greater or equal to the required + * version, or false if this version of DataTales is not suitable + * @method + * @dtopt API + * @deprecated Since v1.10 + * + * @example + * $(document).ready(function() { + * var oTable = $('#example').dataTable(); + * alert( oTable.fnVersionCheck( '1.9.0' ) ); + * } ); + */ + this.fnVersionCheck = _ext.fnVersionCheck; + + + var _that = this; + var emptyInit = options === undefined; + var len = this.length; + + if ( emptyInit ) { + options = {}; + } + + this.oApi = this.internal = _ext.internal; + + // Extend with old style plug-in API methods + for ( var fn in DataTable.ext.internal ) { + if ( fn ) { + this[fn] = _fnExternApiFunc(fn); + } + } + + this.each(function() { + // For each initialisation we want to give it a clean initialisation + // object that can be bashed around + var o = {}; + var oInit = len > 1 ? 
// optimisation for single table case + _fnExtend( o, options, true ) : + options; + + /*global oInit,_that,emptyInit*/ + var i=0, iLen, j, jLen, k, kLen; + var sId = this.getAttribute( 'id' ); + var bInitHandedOff = false; + var defaults = DataTable.defaults; + var $this = $(this); + + + /* Sanity check */ + if ( this.nodeName.toLowerCase() != 'table' ) + { + _fnLog( null, 0, 'Non-table node initialisation ('+this.nodeName+')', 2 ); + return; + } + + /* Backwards compatibility for the defaults */ + _fnCompatOpts( defaults ); + _fnCompatCols( defaults.column ); + + /* Convert the camel-case defaults to Hungarian */ + _fnCamelToHungarian( defaults, defaults, true ); + _fnCamelToHungarian( defaults.column, defaults.column, true ); + + /* Setting up the initialisation object */ + _fnCamelToHungarian( defaults, $.extend( oInit, $this.data() ), true ); + + + + /* Check to see if we are re-initialising a table */ + var allSettings = DataTable.settings; + for ( i=0, iLen=allSettings.length ; i').appendTo($this); + } + oSettings.nTHead = thead[0]; + + var tbody = $this.children('tbody'); + if ( tbody.length === 0 ) { + tbody = $('').appendTo($this); + } + oSettings.nTBody = tbody[0]; + + var tfoot = $this.children('tfoot'); + if ( tfoot.length === 0 && captions.length > 0 && (oSettings.oScroll.sX !== "" || oSettings.oScroll.sY !== "") ) { + // If we are a scrolling table, and no footer has been given, then we need to create + // a tfoot element for the caption element to be appended to + tfoot = $('').appendTo($this); + } + + if ( tfoot.length === 0 || tfoot.children().length === 0 ) { + $this.addClass( oClasses.sNoFooter ); + } + else if ( tfoot.length > 0 ) { + oSettings.nTFoot = tfoot[0]; + _fnDetectHeader( oSettings.aoFooter, oSettings.nTFoot ); + } + + /* Check if there is data passing into the constructor */ + if ( oInit.aaData ) { + for ( i=0 ; i/g; + + // This is not strict ISO8601 - Date.parse() is quite lax, although + // implementations differ between browsers. + var _re_date = /^\d{2,4}[\.\/\-]\d{1,2}[\.\/\-]\d{1,2}([T ]{1}\d{1,2}[:\.]\d{2}([\.:]\d{2})?)?$/; + + // Escape regular expression special characters + var _re_escape_regex = new RegExp( '(\\' + [ '/', '.', '*', '+', '?', '|', '(', ')', '[', ']', '{', '}', '\\', '$', '^', '-' ].join('|\\') + ')', 'g' ); + + // http://en.wikipedia.org/wiki/Foreign_exchange_market + // - \u20BD - Russian ruble. + // - \u20a9 - South Korean Won + // - \u20BA - Turkish Lira + // - \u20B9 - Indian Rupee + // - R - Brazil (R$) and South Africa + // - fr - Swiss Franc + // - kr - Swedish krona, Norwegian krone and Danish krone + // - \u2009 is thin space and \u202F is narrow no-break space, both used in many + // - Ƀ - Bitcoin + // - Ξ - Ethereum + // standards as thousands separators. + var _re_formatted_numeric = /[',$£€¥%\u2009\u202F\u20BD\u20a9\u20BArfkɃΞ]/gi; + + + var _empty = function ( d ) { + return !d || d === true || d === '-' ? true : false; + }; + + + var _intVal = function ( s ) { + var integer = parseInt( s, 10 ); + return !isNaN(integer) && isFinite(s) ? integer : null; + }; + + // Convert from a formatted number with characters other than `.` as the + // decimal place, to a Javascript number + var _numToDecimal = function ( num, decimalPoint ) { + // Cache created regular expressions for speed as this function is called often + if ( ! _re_dic[ decimalPoint ] ) { + _re_dic[ decimalPoint ] = new RegExp( _fnEscapeRegex( decimalPoint ), 'g' ); + } + return typeof num === 'string' && decimalPoint !== '.' ? 
+ num.replace( /\./g, '' ).replace( _re_dic[ decimalPoint ], '.' ) : + num; + }; + + + var _isNumber = function ( d, decimalPoint, formatted ) { + var strType = typeof d === 'string'; + + // If empty return immediately so there must be a number if it is a + // formatted string (this stops the string "k", or "kr", etc being detected + // as a formatted number for currency + if ( _empty( d ) ) { + return true; + } + + if ( decimalPoint && strType ) { + d = _numToDecimal( d, decimalPoint ); + } + + if ( formatted && strType ) { + d = d.replace( _re_formatted_numeric, '' ); + } + + return !isNaN( parseFloat(d) ) && isFinite( d ); + }; + + + // A string without HTML in it can be considered to be HTML still + var _isHtml = function ( d ) { + return _empty( d ) || typeof d === 'string'; + }; + + + var _htmlNumeric = function ( d, decimalPoint, formatted ) { + if ( _empty( d ) ) { + return true; + } + + var html = _isHtml( d ); + return ! html ? + null : + _isNumber( _stripHtml( d ), decimalPoint, formatted ) ? + true : + null; + }; + + + var _pluck = function ( a, prop, prop2 ) { + var out = []; + var i=0, ien=a.length; + + // Could have the test in the loop for slightly smaller code, but speed + // is essential here + if ( prop2 !== undefined ) { + for ( ; i') + .css( { + position: 'fixed', + top: 0, + left: $(window).scrollLeft()*-1, // allow for scrolling + height: 1, + width: 1, + overflow: 'hidden' + } ) + .append( + $('
<div/>') + .css( { + position: 'absolute', + top: 1, + left: 1, + width: 100, + overflow: 'scroll' + } ) + .append( + $('<div/>
    ') + .css( { + width: '100%', + height: 10 + } ) + ) + ) + .appendTo( 'body' ); + + var outer = n.children(); + var inner = outer.children(); + + // Numbers below, in order, are: + // inner.offsetWidth, inner.clientWidth, outer.offsetWidth, outer.clientWidth + // + // IE6 XP: 100 100 100 83 + // IE7 Vista: 100 100 100 83 + // IE 8+ Windows: 83 83 100 83 + // Evergreen Windows: 83 83 100 83 + // Evergreen Mac with scrollbars: 85 85 100 85 + // Evergreen Mac without scrollbars: 100 100 100 100 + + // Get scrollbar width + browser.barWidth = outer[0].offsetWidth - outer[0].clientWidth; + + // IE6/7 will oversize a width 100% element inside a scrolling element, to + // include the width of the scrollbar, while other browsers ensure the inner + // element is contained without forcing scrolling + browser.bScrollOversize = inner[0].offsetWidth === 100 && outer[0].clientWidth !== 100; + + // In rtl text layout, some browsers (most, but not all) will place the + // scrollbar on the left, rather than the right. + browser.bScrollbarLeft = Math.round( inner.offset().left ) !== 1; + + // IE8- don't provide height and width for getBoundingClientRect + browser.bBounding = n[0].getBoundingClientRect().width ? true : false; + + n.remove(); + } + + $.extend( settings.oBrowser, DataTable.__browser ); + settings.oScroll.iBarWidth = DataTable.__browser.barWidth; + } + + + /** + * Array.prototype reduce[Right] method, used for browsers which don't support + * JS 1.6. Done this way to reduce code size, since we iterate either way + * @param {object} settings dataTables settings object + * @memberof DataTable#oApi + */ + function _fnReduce ( that, fn, init, start, end, inc ) + { + var + i = start, + value, + isSet = false; + + if ( init !== undefined ) { + value = init; + isSet = true; + } + + while ( i !== end ) { + if ( ! that.hasOwnProperty(i) ) { + continue; + } + + value = isSet ? + fn( value, that[i], i, that ) : + that[i]; + + isSet = true; + i += inc; + } + + return value; + } + + /** + * Add a column to the list used for the table with default values + * @param {object} oSettings dataTables settings object + * @param {node} nTh The th element for this column + * @memberof DataTable#oApi + */ + function _fnAddColumn( oSettings, nTh ) + { + // Add column to aoColumns array + var oDefaults = DataTable.defaults.column; + var iCol = oSettings.aoColumns.length; + var oCol = $.extend( {}, DataTable.models.oColumn, oDefaults, { + "nTh": nTh ? nTh : document.createElement('th'), + "sTitle": oDefaults.sTitle ? oDefaults.sTitle : nTh ? nTh.innerHTML : '', + "aDataSort": oDefaults.aDataSort ? oDefaults.aDataSort : [iCol], + "mData": oDefaults.mData ? oDefaults.mData : iCol, + idx: iCol + } ); + oSettings.aoColumns.push( oCol ); + + // Add search object for column specific search. Note that the `searchCols[ iCol ]` + // passed into extend can be undefined. 
This allows the user to give a default + // with only some of the parameters defined, and also not give a default + var searchCols = oSettings.aoPreSearchCols; + searchCols[ iCol ] = $.extend( {}, DataTable.models.oSearch, searchCols[ iCol ] ); + + // Use the default column options function to initialise classes etc + _fnColumnOptions( oSettings, iCol, $(nTh).data() ); + } + + + /** + * Apply options for a column + * @param {object} oSettings dataTables settings object + * @param {int} iCol column index to consider + * @param {object} oOptions object with sType, bVisible and bSearchable etc + * @memberof DataTable#oApi + */ + function _fnColumnOptions( oSettings, iCol, oOptions ) + { + var oCol = oSettings.aoColumns[ iCol ]; + var oClasses = oSettings.oClasses; + var th = $(oCol.nTh); + + // Try to get width information from the DOM. We can't get it from CSS + // as we'd need to parse the CSS stylesheet. `width` option can override + if ( ! oCol.sWidthOrig ) { + // Width attribute + oCol.sWidthOrig = th.attr('width') || null; + + // Style attribute + var t = (th.attr('style') || '').match(/width:\s*(\d+[pxem%]+)/); + if ( t ) { + oCol.sWidthOrig = t[1]; + } + } + + /* User specified column options */ + if ( oOptions !== undefined && oOptions !== null ) + { + // Backwards compatibility + _fnCompatCols( oOptions ); + + // Map camel case parameters to their Hungarian counterparts + _fnCamelToHungarian( DataTable.defaults.column, oOptions, true ); + + /* Backwards compatibility for mDataProp */ + if ( oOptions.mDataProp !== undefined && !oOptions.mData ) + { + oOptions.mData = oOptions.mDataProp; + } + + if ( oOptions.sType ) + { + oCol._sManualType = oOptions.sType; + } + + // `class` is a reserved word in Javascript, so we need to provide + // the ability to use a valid name for the camel case input + if ( oOptions.className && ! oOptions.sClass ) + { + oOptions.sClass = oOptions.className; + } + if ( oOptions.sClass ) { + th.addClass( oOptions.sClass ); + } + + $.extend( oCol, oOptions ); + _fnMap( oCol, oOptions, "sWidth", "sWidthOrig" ); + + /* iDataSort to be applied (backwards compatibility), but aDataSort will take + * priority if defined + */ + if ( oOptions.iDataSort !== undefined ) + { + oCol.aDataSort = [ oOptions.iDataSort ]; + } + _fnMap( oCol, oOptions, "aDataSort" ); + } + + /* Cache the data get and set functions for speed */ + var mDataSrc = oCol.mData; + var mData = _fnGetObjectDataFn( mDataSrc ); + var mRender = oCol.mRender ? _fnGetObjectDataFn( oCol.mRender ) : null; + + var attrTest = function( src ) { + return typeof src === 'string' && src.indexOf('@') !== -1; + }; + oCol._bAttrSrc = $.isPlainObject( mDataSrc ) && ( + attrTest(mDataSrc.sort) || attrTest(mDataSrc.type) || attrTest(mDataSrc.filter) + ); + oCol._setter = null; + + oCol.fnGetData = function (rowData, type, meta) { + var innerData = mData( rowData, type, undefined, meta ); + + return mRender && type ? 
+ mRender( innerData, type, rowData, meta ) : + innerData; + }; + oCol.fnSetData = function ( rowData, val, meta ) { + return _fnSetObjectDataFn( mDataSrc )( rowData, val, meta ); + }; + + // Indicate if DataTables should read DOM data as an object or array + // Used in _fnGetRowElements + if ( typeof mDataSrc !== 'number' ) { + oSettings._rowReadObject = true; + } + + /* Feature sorting overrides column specific when off */ + if ( !oSettings.oFeatures.bSort ) + { + oCol.bSortable = false; + th.addClass( oClasses.sSortableNone ); // Have to add class here as order event isn't called + } + + /* Check that the class assignment is correct for sorting */ + var bAsc = $.inArray('asc', oCol.asSorting) !== -1; + var bDesc = $.inArray('desc', oCol.asSorting) !== -1; + if ( !oCol.bSortable || (!bAsc && !bDesc) ) + { + oCol.sSortingClass = oClasses.sSortableNone; + oCol.sSortingClassJUI = ""; + } + else if ( bAsc && !bDesc ) + { + oCol.sSortingClass = oClasses.sSortableAsc; + oCol.sSortingClassJUI = oClasses.sSortJUIAscAllowed; + } + else if ( !bAsc && bDesc ) + { + oCol.sSortingClass = oClasses.sSortableDesc; + oCol.sSortingClassJUI = oClasses.sSortJUIDescAllowed; + } + else + { + oCol.sSortingClass = oClasses.sSortable; + oCol.sSortingClassJUI = oClasses.sSortJUI; + } + } + + + /** + * Adjust the table column widths for new data. Note: you would probably want to + * do a redraw after calling this function! + * @param {object} settings dataTables settings object + * @memberof DataTable#oApi + */ + function _fnAdjustColumnSizing ( settings ) + { + /* Not interested in doing column width calculation if auto-width is disabled */ + if ( settings.oFeatures.bAutoWidth !== false ) + { + var columns = settings.aoColumns; + + _fnCalculateColumnWidths( settings ); + for ( var i=0 , iLen=columns.length ; i