init

cldf-datasets · Sep 25, 2024 · f0d23c8 · f0d23c8
commit f0d23c8
Show file tree

Hide file tree

Showing 36 changed files with 185,476 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,102 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.idea/
diff --git a/README.md b/README.md
@@ -0,0 +1,68 @@
+# CLDF dataset with data and supplements for Barlow 2024 ...
+
+[![CLDF validation](https://github.com/cldf-datasets/barlowhandandfive/workflows/CLDF-validation/badge.svg)](https://github.com/cldf-datasets/barlowhandandfive/actions?query=workflow%3ACLDF-validation)
+
+## How to cite
+
+If you use these data, please cite
+this dataset using the DOI of the [particular released version](../../releases/) you were using.
+
+## Description
+
+
+This dataset is licensed under a CC-BY-4.0 license
+
+
+
+### Replacement events
+
+Replacement events, i.e. rows in the [replacements table](cldf/replacements.csv), represent a probable loss of the 
+inherited form ‘hand’ or ‘five’, whether in the individual history of a single language or in a protolanguage ancestral
+to multiple languages, with Glottolog languoids (i.e. language subgroups or individual languages in Glottolog 5.0's 
+classification of Austronesian) serving as proxies. While the replacements table lists name and Glottocode of this
+languoid, the individual languages in our sample which fall into this subgroup are linked via the Glottocodes in the
+`Language_IDs` column.
+
+Looking up related data from different tables of the dataset is best done by exploiting the fact that 
+[any CLDF dataset can be converted to a SQLite database](https://github.com/cldf/pycldf?tab=readme-ov-file#converting-a-cldf-dataset-to-an-sqlite-database).
+The schema of this database is described below. So if we wanted to see whether the language Lenkau appears in any
+replacement events, we could run the following query:
+```sql
+sqlite> select distinct r.subgroup from languagetable as l, "replacements.csv_languagetable" as rl, "replacements.csv" as r where l.cldf_id == rl.languagetable_cldf_id and rl."replacements.csv_cldf_id" = r.cldf_id and l.cldf_name = 'Lenkau';
+South-East Admiralty
+```
+and if we wanted to see which other languages are subsumed under "South-East Admiralty", we could run
+```sql
+sqlite> select distinct l.cldf_name from languagetable as l, "replacements.csv_languagetable" as rl, "replacements.csv" as r where l.cldf_id == rl.languagetable_cldf_id and rl."replacements.csv_cldf_id" = r.cldf_id and r.subgroup = 'South-East Admiralty';
+Lenkau
+Nauna
+Penchal
+Lou
+Paluai
+```
+
+As explained in the [cldf/README](cldf/README.md), replacement events can be reconstructed using a more conservative or
+a more liberal approach. As an example of a discrepancy between the two approaches, consider the replacement of
+*qalima ‘hand’ for [Bugawac](https://glottolog.org/resource/languoid/id/buga1250) and 
+[Kela (Papua New Guinea)](https://glottolog.org/resource/languoid/id/kela1255). In this case, both Bugawac and Kela
+exhibit the replacement of *qalima ‘hand’; however, their sister language (in Glottolog 5.0's classification) 
+[Yabem](https://glottolog.org/resource/languoid/id/yabe1254) does not, so such a replacement cannot be reconstructed 
+to the immediate ancestor of the three languages, [North Huon Gulf linkage](https://glottolog.org/resource/languoid/id/nort2858). 
+However, even though they are not subgrouped together within North Huon Gulf linkage (which has a flat—i.e., 
+ternary-branching—structure in Glottolog 5.0) excluding Yabem, the two languages may nevertheless have shared 
+in the replacement of *qalima ‘hand’. This view is reflected in the replacements table by assigning the same `Replacement_Group`
+value `hand-43` to the two (conservative) replacement events:
+```shell
+$ csvgrep -c Replacement_Group -m"hand-43" cldf/replacements.csv | csvcut -c Subgroup,Comment
+Subgroup,Comment
+Bugawac,possibly shared change between Bugawac/Kela
+Kela (Papua New Guinea),possibly shared change between Bugawac/Kela
+```
+
+
+## CLDF Datasets
+
+The following CLDF datasets are available in [cldf](cldf):
+
+- CLDF [StructureDataset](https://github.com/cldf/cldf/tree/master/modules/StructureDataset) at [cldf/StructureDataset-metadata.json](cldf/StructureDataset-metadata.json)
diff --git a/barlowhandandfivecommands/__init__.py b/barlowhandandfivecommands/__init__.py
diff --git a/barlowhandandfivecommands/maps.py b/barlowhandandfivecommands/maps.py
@@ -0,0 +1,81 @@
+"""
+Plot parameter maps for the paper.
+"""
+import json
+import itertools
+import subprocess
+
+from cldfbench_barlowhandandfive import Dataset
+
+
+def register(parser):
+    """
+    Add the command-line options of this command to `parser`.
+
+    :param parser: Object providing `add_argument`; presumably the `argparse` parser that \
+cldfbench hands to custom commands - TODO confirm.
+    """
+    # `--format` accepts `html` or `svg` and defaults to `svg`.
+    parser.add_argument('--format', choices=['html', 'svg'], default='svg')
+
+
+def plot(format, pid, colors, mdpath, mapdir):
+    o = mapdir / '{}.{}'.format(pid, format)
+    cmd = [
+        'cldfbench',
+        'cldfviz.map',
+        '--parameter', pid,
+        '--colormaps',
+        json.dumps(colors),
+        '--pacific-centered',
+    ]
+    if format == 'html':
+        cmd.append('--with-layers')
+    else:  # format == 'svg'
+        cmd.extend([
+            '--format', 'svg',
+            '--padding-top', '5',
+            '--padding-bottom', '5',
+            '--projection', 'Mollweide',
+            '--width', '10',
+            '--markersize', '4',
+            '--with-ocean',
+            '--no-legend',
+        ])
+    cmd.extend(['--output', str(o), str(mdpath)])
+    subprocess.check_call(cmd)
+    assert o.exists()
+    return o
+
+
+def run(args):
+    cldf = Dataset().cldf_reader()
+    mapdir = cldf.directory.parent / 'maps'
+    pids = [r['ID'] for r in cldf.iter_rows('ParameterTable')]
+    parameters = {
+        pid: list(rows) for pid, rows in itertools.groupby(
+            sorted(cldf.iter_rows('CodeTable'), key=lambda r: r['Parameter_ID']),
+            lambda r: r['Parameter_ID'])}
+    value_count = {
+        cid: len(list(rows)) for cid, rows in itertools.groupby(
+            sorted(cldf.iter_rows('ValueTable'), key=lambda r: r['Code_ID'] or 'xxxx'),
+            lambda r: r['Code_ID'])}
+    readme = ['# Maps\n']
+    for pid, codes in sorted(parameters.items(), key=lambda t: pids.index(t[0])):
+        readme.append('## {}\n'.format(cldf.get_row('ParameterTable', pid)['Name'].replace('_', ' ')))
+        readme.append('&nbsp; | Description | Count')
+        readme.append('--- | --- | ---:')
+        for c in codes:
+            readme.append('$${{\color{{{}}}⏺}}$$ | {} | {}'.format(
+                c['color'], c['Name'], value_count[c['ID']]))
+        readme.append('&nbsp; | &nbsp; | **{}**'.format(sum(value_count[c['ID']] for c in codes)))
+
+        p = plot(
+            args.format,
+            pid,
+            {c['ID']: c['color'] for c in codes},
+            cldf.directory / cldf.filename,
+            mapdir)
+        p = plot(
+            'html',
+            pid,
+            {c['ID']: c['color'] for c in codes},
+            cldf.directory / cldf.filename,
+            mapdir)
+        readme.append('\n![{}]({})\n'.format(pid, p.name))
+        readme.append(
+            'View [interactive map](https://raw.githubusercontent.com/cldf-datasets/barlowhandandfive/refs/heads/main/maps/{}.html).\n'.format(pid))
+    mapdir.joinpath('README.md').write_text('\n'.join(readme))
diff --git a/cldf/.gitattributes b/cldf/.gitattributes
@@ -0,0 +1 @@
+*.csv text eol=crlf