swerik-project · BobBorges · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/.github/workflows/qe_ocr-estimate.yml b/.github/workflows/qe_ocr-estimate.yml
@@ -63,6 +63,10 @@ jobs:
         pip install torchmetrics
         pip install nltk
 
+    - name: Estimate OCR Quality  # NOTE(review): runs with `-D 1880` but the next step is named "Cache result 1870s" — confirm the intended decade
+      run: |
+        python quality/qe_ocr-estimation.py -D 1880 --lev-only
+
     - name: Cache result 1870s
       uses: actions/upload-artifact@v4
       with:

diff --git a/.github/workflows/qe_speaker-mapping.yml b/.github/workflows/qe_speaker-mapping.yml
@@ -0,0 +1,48 @@
+name: "Riksdagen Records: Speaker Mapping Accuracy Estimate"
+
+on:  # run for pull requests targeting dev that change anything under data/
+  pull_request:
+    branches:
+      - 'dev'
+    paths:
+      - 'data/**'  # a bare `data/` is a literal pattern and matches no changed file; `**` covers the whole tree
+
+jobs:
+  Estimate-speaker-accuracy:  # runs quality/qe_speaker-mapping.py, then commits and pushes the estimates to the PR branch
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8"]  # quoted so versions stay strings (an unquoted 3.10 would load as 3.1)
+    steps:
+    - name: Checkout PR source branch
+      uses: actions/checkout@v4
+      with:
+        ref: ${{ github.head_ref }}  # the PR's source branch, which the push step below also targets
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pyriksdagen
+        pip install scipy
+
+    - name: Estimate Mapping Accuracy
+      run: |
+        python quality/qe_speaker-mapping.py
+
+    - name: Add and commit changes
+      run: |  # commit only when something is staged; a bare `git commit` exits non-zero on a clean tree and would fail the job
+        git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+        git config --local user.name "github-actions[bot]"
+        git add quality/estimates/speaker-mapping
+        git diff --cached --quiet || git commit -m "chore (workflow): run speaker mapping estimation"
+
+    - name: Push changes  # NOTE(review): pushing to head_ref presumably fails for PRs from forks (read-only token) — confirm forks are out of scope
+      uses: ad-m/github-push-action@master
+      with:
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+        branch: ${{ github.head_ref }}
diff --git a/quality/data/speaker-mapping/speaker-mapping-gold-standard.csv b/quality/data/speaker-mapping/speaker-mapping-gold-standard.csv
diff --git a/quality/docs/qe-speaker_mapping.md → quality/docs/qe_speaker-mapping.md b/quality/docs/qe-speaker_mapping.md → quality/docs/qe_speaker-mapping.md
@@ -1,31 +1,36 @@
 # Speaker mapping error
 
 ## Summary
+
 Each speech in the parliament is mapped to a member of parliament, and each speech is introduced in an introduction segment. We are interested in knowing the quality of the mapping between the speech and the true member of parliament.
 
+
 ## What is the problem
-In this quality dimension we want to estimate the proportion of the maps to speakers that are correct. The quality of the mapping is important when there is an interest in the analysis of speeches by member of parliament.
+
+In this quality dimension, we want to estimate the proportion of the maps to speakers that are correct. The quality of the mapping is important when there is an interest in the analysis of speeches by member of parliament.
+
 
 ## Estimation procedure
+
 This is a stratified simple random sample, where each introduction is the sampled unit.
 
 ### Sampling plan 
+
 To estimate the MP mapping errors, we take a stratified sample of three introductions per year and document type (i.e. three introductions per chamber) and annotate them by the true ID.
 
-### Annotation guidelines 
-You will get a CSV with the introduction text, a link to the PDF/image of the page, and the paragraph id (that is used later on). 
+### Annotation guidelines
+
+Annotators get a CSV with the introduction text, a link to the PDF/image of the page, and the paragraph id (that is used later on).
 
 The introduction might look something like this
 
->Herr Anderson i Stockholm (s):
+```
+Herr Anderson i Stockholm (s):
+```
 
-You should try and find a matching person on Wikidata (name, party, i-ort, time period, gender, chamber, etc.). Then you add the Wikidata identifier in the 'speaker' column.
+The annotator should try to find a matching person in the SWERIK person catalog or on Wikidata (name, party, i-ort, time period, gender, chamber, etc.) and then add that person's Wikidata identifier in the 'speaker' column.
 
 If it is impossible to find the person, add 'unknown' in the speaker column for that introduction.
 
 
-## Other comments
-
-
-## References
 
diff --git a/quality/estimates/speaker-mapping/mapping-accuracy-estimate-byyear-sum.csv b/quality/estimates/speaker-mapping/mapping-accuracy-estimate-byyear-sum.csv
@@ -0,0 +1,18 @@
+decade,correct,incorrect,lower,upper,accuracy
+1860,9,9,0.3200865295887242,0.6799134704112757,0.5
+1870,48,12,0.7008414375371663,0.8690842599477397,0.8070175438596491
+1880,42,18,0.5944602159813313,0.7854812594241839,0.7
+1890,47,12,0.6961902699846472,0.8668121624565331,0.7982456140350878
+1900,56,4,0.8561950404588264,0.9671466720920268,0.9322033898305084
+1910,48,12,0.7008414375371663,0.8690842599477397,0.7894736842105263
+1920,51,8,0.772260054335675,0.9194545119227059,0.8620689655172413
+1930,50,8,0.7686204788586048,0.9180472284987325,0.8571428571428571
+1940,55,3,0.8737934432691498,0.976512448112573,0.9464285714285714
+1950,54,5,0.8327369235749102,0.9555470322310538,0.9074074074074074
+1960,57,2,0.8987641050758384,0.986234842431717,0.9655172413793104
+1970,28,0,0.9018553723227043,0.9982328289994504,1.0
+1980,22,2,0.7689600660418558,0.9664804050104953,0.9130434782608695
+1990,28,0,0.9018553723227043,0.9982328289994504,1.0
+2000,30,0,0.9078859400526763,0.998346745733359,1.0
+2010,21,2,0.7601989838711982,0.9650466651662819,0.9090909090909091
+2020,3,0,0.4728708045015879,0.9872585449014338,1.0