Merge pull request #278 from kobotoolbox/277-additional-fields-exports

MVP: Handle extra fields to export
kobotoolbox · Nov 11, 2022 · 43daf1d · 43daf1d
2 parents abcdced + 62df5d9
commit 43daf1d
Show file tree

Hide file tree

Showing 20 changed files with 1,691 additions and 47 deletions.
diff --git a/src/formpack/constants.py b/src/formpack/constants.py
@@ -154,3 +154,13 @@
     'form_appearance',
     'form_meta_edit',
 ]
+
+# Analysis types: the values a field definition may carry under its
+# `analysis_type` key.
+ANALYSIS_TYPE_CODING = 'coding'
+ANALYSIS_TYPE_TRANSCRIPT = 'transcript'
+ANALYSIS_TYPE_TRANSLATION = 'translation'
+# Every recognised analysis type, gathered for membership checks.
+ANALYSIS_TYPES = [
+    ANALYSIS_TYPE_CODING,
+    ANALYSIS_TYPE_TRANSCRIPT,
+    ANALYSIS_TYPE_TRANSLATION,
+]
diff --git a/src/formpack/pack.py b/src/formpack/pack.py
@@ -3,9 +3,10 @@
 import json
 from collections import OrderedDict
 from copy import deepcopy
+from typing import Dict
 
 from formpack.schema.fields import CopyField
-from .version import FormVersion
+from .version import FormVersion, AnalysisForm
 from .reporting import Export, AutoReport
 from .utils.expand_content import expand_content
 from .utils.replace_aliases import replace_aliases
@@ -52,6 +53,8 @@ def __init__(
 
         self.asset_type = asset_type
 
+        self.analysis_form = None
+
         self.load_all_versions(versions)
 
     # FIXME: Find a safe way to use this. Wrapping with try/except isn't enough
@@ -176,6 +179,9 @@ def load_version(self, schema):
 
         self.versions[form_version.id] = form_version
 
+    def extend_survey(self, analysis_form: Dict) -> None:
+        """
+        Attach an analysis form to this pack.
+
+        Builds an `AnalysisForm` from this pack and `analysis_form` and stores
+        it in `self.analysis_form`, replacing any previously stored value.
+
+        :param analysis_form: dict passed as-is to `AnalysisForm`
+        """
+        self.analysis_form = AnalysisForm(self, analysis_form)
+
     def version_diff(self, vn1, vn2):
         v1 = self.versions[vn1]
         v2 = self.versions[vn2]

diff --git a/src/formpack/reporting/export.py b/src/formpack/reporting/export.py
@@ -4,16 +4,23 @@
 import zipfile
 from collections import defaultdict, OrderedDict
 from inspect import isclass
-from typing import Iterator, Generator, Optional
+from typing import (
+    Dict,
+    Generator,
+    Iterator,
+    Optional,
+)
 
 import xlsxwriter
 
 from ..constants import (
+    ANALYSIS_TYPE_TRANSCRIPT,
+    ANALYSIS_TYPE_TRANSLATION,
     GEO_QUESTION_TYPES,
     TAG_COLUMNS_AND_SEPARATORS,
     UNSPECIFIED_TRANSLATION,
 )
-from ..schema import CopyField
+from ..schema import CopyField, FormField
 from ..submission import FormSubmission
 from ..utils.exceptions import FormPackGeoJsonError
 from ..utils.flatten_content import flatten_tag_list
@@ -60,9 +67,11 @@ def __init__(
         :param tag_cols_for_header: list
         :param filter_fields: list
         :param xls_types_as_text: bool
+        :param include_media_url: bool
         """
 
         self.formpack = formpack
+        self.analysis_form = formpack.analysis_form
         self.lang = lang
         self.group_sep = group_sep
         self.title = title
@@ -81,6 +90,12 @@ def __init__(
             tag_cols_for_header = []
         self.tag_cols_for_header = tag_cols_for_header
 
+        _filter_fields = []
+        for item in self.filter_fields:
+            item = re.sub(r'^_supplementalDetails/', '', item)
+            _filter_fields.append(item)
+        self.filter_fields = _filter_fields
+
         # If some fields need to be arbitrarily copied, add them
         # to the first section
         if copy_fields:
@@ -224,6 +239,9 @@ def get_fields_labels_tags_for_all_versions(
 
         # Ensure that fields are filtered if they've been specified, otherwise
         # carry on as usual
+        if self.analysis_form:
+            all_fields = self.analysis_form.insert_analysis_fields(all_fields)
+
         if self.filter_fields:
             all_fields = [
                 field
@@ -320,6 +338,7 @@ def format_one_submission(
         submission,
         current_section,
         attachments=None,
+        supplemental_details=None,
     ):
 
         # 'current_section' is the name of what will become sheets in xls.
@@ -382,17 +401,46 @@ def _get_attachment(val, field, attachments):
                 if re.match(fr'^.*/{_val}$', f['filename']) is not None
             ]
 
-        def _get_value_from_entry(entry, field):
+        def _get_value_from_supplemental_details(
+            field: FormField, supplemental_details: Dict
+        ) -> Optional[str]:
+            """
+            Look up the raw value of an analysis field in the submission's
+            supplemental details.
+
+            `field.analysis_path` is unpacked as `(source, name)`: `source`
+            selects the entry of `supplemental_details` and `name` the key
+            inside that entry.
+
+            Returns `None` when nothing is stored for `source`, `''` when
+            the entry holds no value under `name`, and whatever is stored
+            under `name` otherwise (formatting is left to the field).
+            """
+            source, name = field.analysis_path
+            source_details = supplemental_details.get(source, {})
+
+            if not source_details:
+                return None
+
+            # Translation and transcript field names look like
+            # `translation_<language code>` / `transcript_<language code>`,
+            # but the value is looked up under the bare `translation` /
+            # `transcript` key, so keep only that prefix.
+            if prefix_match := re.match(r'^(translation|transcript)_', name):
+                name = prefix_match.group(1)
+
+            val = source_details.get(name)
+            return '' if val is None else val
+
+        def _get_value_from_entry(
+            entry: Dict, field: FormField, supplemental_details: Dict
+        ) -> Optional[str]:
+            """
+            Return the raw submission value for `field`.
+
+            Analysis fields (`field.analysis_question`) are read from
+            `supplemental_details` whenever it is non-empty; every other
+            field is read from `entry` under the field's path.
+            """
+            if field.analysis_question and supplemental_details:
+                return _get_value_from_supplemental_details(
+                    field, supplemental_details
+                )
+
+            # NOTE: despite its name, `suffix` is prepended to the path:
+            # values of `audit` fields are looked up under `meta/<path>`.
             suffix = 'meta/' if field.data_type == 'audit' else ''
             return entry.get(f'{suffix}{field.path}')
 
+        if self.analysis_form:
+            _fields = self.analysis_form.insert_analysis_fields(_fields)
+
         # Ensure that fields are filtered if they've been specified, otherwise
         # carry on as usual
         if self.filter_fields:
             _fields = tuple(
-                field
-                for field in current_section.fields.values()
-                if field.path in self.filter_fields
+                field for field in _fields if field.path in self.filter_fields
             )
 
         # 'rows' will contain all the formatted entries for the current
@@ -423,13 +471,17 @@ def _get_value_from_entry(entry, field):
             row.update(_empty_row)
 
             attachments = entry.get('_attachments') or attachments
+            supplemental_details = (
+                entry.get('_supplementalDetails') or supplemental_details
+            )
 
             for field in _fields:
                 # TODO: pass a context to fields so they can all format ?
                 if field.can_format:
-
                     # get submission value for this field
-                    val = _get_value_from_entry(entry, field)
+                    val = _get_value_from_entry(
+                        entry, field, supplemental_details
+                    )
                     # get the attachment for this field
                     attachment = _get_attachment(val, field, attachments)
                     # get a mapping of {"col_name": "val", ...}
@@ -493,7 +545,8 @@ def _get_value_from_entry(entry, field):
                     chunk = self.format_one_submission(
                         entry[child_section.path],
                         child_section,
-                        attachments,
+                        attachments=attachments,
+                        supplemental_details=supplemental_details,
                     )
                     for key, value in iter(chunk.items()):
                         if key in chunks:

diff --git a/src/formpack/schema/fields.py b/src/formpack/schema/fields.py
@@ -8,7 +8,13 @@
 import statistics
 
 from .datadef import FormDataDef, FormChoice
-from ..constants import UNSPECIFIED_TRANSLATION
+from ..constants import (
+    ANALYSIS_TYPES,
+    ANALYSIS_TYPE_CODING,
+    ANALYSIS_TYPE_TRANSCRIPT,
+    ANALYSIS_TYPE_TRANSLATION,
+    UNSPECIFIED_TRANSLATION,
+)
 from ..utils import singlemode
 from ..utils.ordered_collection import OrderedDefaultdict
 
@@ -35,6 +41,20 @@ def __init__(
         self.section = section
         self.can_format = can_format
         self.tags = kwargs.get('tags', [])
+        self.analysis_question = False
+
+        source = kwargs.get('source')
+        if source is not None:
+            self.source = source
+            self.analysis_question = True
+            self.analysis_type = kwargs.get('analysis_type')
+            self.analysis_path = kwargs.get('analysis_path')
+            self.settings = kwargs.get('settings')
+            if self.analysis_type in [
+                ANALYSIS_TYPE_TRANSCRIPT,
+                ANALYSIS_TYPE_TRANSLATION,
+            ]:
+                self.language = kwargs['language']
 
         hierarchy = list(hierarchy) if hierarchy is not None else [None]
         self.hierarchy = hierarchy + [self]
@@ -45,11 +65,15 @@ def __init__(
         if has_stats is not None:
             self.has_stats = has_stats
         else:
-            self.has_stats = data_type != 'note'
+            self.has_stats = data_type != 'note' and not self.analysis_question
 
         # do not include the root section in the path
         self.path = '/'.join(info.name for info in self.hierarchy[1:])
 
+    @property
+    def qpath(self):
+        """Return `self.path` with every `/` separator replaced by `-`."""
+        return '-'.join(self.path.split('/'))
+
     def get_labels(
         self,
         lang=UNSPECIFIED_TRANSLATION,
@@ -139,7 +163,7 @@ def _get_label(
         # even if `lang` can be None, we don't want the `label` to be None.
         label = self.labels.get(lang, self.name)
         # If `label` is None, no matches are found, so return `field` name.
-        return self.name if label is None else label
+        return label or self.name
 
     def __repr__(self):
         args = (self.__class__.__name__, self.name, self.data_type)
@@ -178,13 +202,22 @@ def from_json_definition(
         labels = cls._extract_json_labels(definition, translations)
         appearance = definition.get('appearance')
         or_other = definition.get('_or_other', False)
+        source = definition.get('source')
+        analysis_type = definition.get('analysis_type', ANALYSIS_TYPE_CODING)
+        settings = definition.get('settings', {})
+        analysis_path = definition.get('path')
+        languages = definition.get('languages')
+        language = definition.get('language')
 
         # normalize spaces
         data_type = definition['type']
 
         if ' ' in data_type:
             raise ValueError('invalid data_type: %s' % data_type)
 
+        if analysis_type not in ANALYSIS_TYPES:
+            raise ValueError(f'Invalid analysis data type: {analysis_type}')
+
         if data_type in ('select_one', 'select_multiple'):
             choice_id = definition['select_from_list_name']
             # pyxform#472 introduced dynamic list_names for select_one with the
@@ -246,6 +279,12 @@ def from_json_definition(
             'section': section,
             'choice': choice,
             'or_other': or_other,
+            'source': source,
+            'analysis_type': analysis_type,
+            'settings': settings,
+            'analysis_path': analysis_path,
+            'language': language,
+            'languages': languages,
         }
 
         if data_type == 'select_multiple' and appearance == 'literacy':
@@ -424,21 +463,6 @@ def get_substats(
 
 
 class TextField(ExtendedFormField):
-    def get_stats(self, metrics, lang=UNSPECIFIED_TRANSLATION, limit=100):
-
-        stats = super().get_stats(metrics, lang, limit)
-
-        top = metrics.most_common(limit)
-        total = stats['total_count']
-
-        percentage = []
-        for key, val in top:
-            percentage.append((key, self._get_percentage(val, total)))
-
-        stats.update({'frequency': top, 'percentage': percentage})
-
-        return stats
-
     def get_disaggregated_stats(
         self, metrics, top_splitters, lang=UNSPECIFIED_TRANSLATION, limit=100
     ):
@@ -459,6 +483,75 @@ def sum_frequencies(element):
 
         return stats
 
+    def get_labels(
+        self,
+        lang=UNSPECIFIED_TRANSLATION,
+        group_sep='/',
+        hierarchy_in_labels=False,
+        multiple_select='both',
+        *args,
+        **kwargs,
+    ):
+        """
+        Return the label of this field as a one-item list.
+
+        Transcript and translation fields are labelled after their source
+        field: `<source label> - <transcript|translation> (<language>)`.
+        Any other text field uses its own label. Extra positional and
+        keyword arguments are accepted but ignored.
+        """
+        label_args = (lang, group_sep, hierarchy_in_labels, multiple_select)
+
+        if self._is_translation:
+            kind = 'translation'
+        elif self._is_transcript:
+            kind = 'transcript'
+        else:
+            return [self._get_label(*label_args)]
+
+        source_label = self.source_field._get_label(*label_args)
+        return [f'{source_label} - {kind} ({self.language})']
+
+    def get_stats(self, metrics, lang=UNSPECIFIED_TRANSLATION, limit=100):
+        """
+        Extend the parent class statistics with the most common answers.
+
+        Adds two keys to the stats returned by the parent class:
+        `frequency`, the `limit` most common entries of `metrics`, and
+        `percentage`, the same entries passed through `_get_percentage`
+        together with `stats['total_count']`.
+        """
+        stats = super().get_stats(metrics, lang, limit)
+
+        most_common = metrics.most_common(limit)
+        total_count = stats['total_count']
+
+        stats.update(
+            {
+                'frequency': most_common,
+                'percentage': [
+                    (answer, self._get_percentage(count, total_count))
+                    for answer, count in most_common
+                ],
+            }
+        )
+        return stats
+
+    @property
+    def _is_transcript(self):
+        """Whether `analysis_type` is set and marks this field a transcript."""
+        analysis_type = getattr(self, 'analysis_type', '')
+        return analysis_type == ANALYSIS_TYPE_TRANSCRIPT
+
+    @property
+    def _is_translation(self):
+        """Whether `analysis_type` is set and marks this field a translation."""
+        analysis_type = getattr(self, 'analysis_type', '')
+        return analysis_type == ANALYSIS_TYPE_TRANSLATION
+
+    def format(
+        self,
+        val,
+        lang=UNSPECIFIED_TRANSLATION,
+        group_sep='/',
+        hierarchy_in_labels=False,
+        multiple_select='both',
+        xls_types_as_text=True,
+        *args,
+        **kwargs,
+    ):
+        """
+        Return `{self.name: <cell value>}` for the submitted value `val`.
+
+        `None` becomes `''`. When `val` is a dict, translation fields read
+        `val[self.language]['value']` and transcript fields read
+        `val['value']` if `val['languageCode']` equals `self.language`;
+        a language mismatch or a missing key yields `''`. Any other value
+        is returned unchanged. The remaining parameters are accepted but
+        not used here.
+        """
+        if val is None:
+            val = ''
+
+        if isinstance(val, dict):
+            if self._is_translation:
+                try:
+                    val = val[self.language]['value']
+                except KeyError:
+                    val = ''
+            elif self._is_transcript:
+                # Mirror the translation branch: an entry missing
+                # `languageCode` or `value` gives an empty cell instead of
+                # raising `KeyError`.
+                try:
+                    val = (
+                        val['value']
+                        if val['languageCode'] == self.language
+                        else ''
+                    )
+                except KeyError:
+                    val = ''
+
+        return {self.name: val}
+
 
 class MediaField(TextField):
     def get_labels(self, include_media_url=False, *args, **kwargs):