Skip to content

Commit

Permalink
Merge pull request #278 from kobotoolbox/277-additional-fields-exports
Browse files Browse the repository at this point in the history
MVP: Handle extra fields to export
  • Loading branch information
jnm authored Nov 11, 2022
2 parents abcdced + 62df5d9 commit 43daf1d
Show file tree
Hide file tree
Showing 20 changed files with 1,691 additions and 47 deletions.
10 changes: 10 additions & 0 deletions src/formpack/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,13 @@
'form_appearance',
'form_meta_edit',
]

# Kinds of analysis questions that can be attached to a form.
ANALYSIS_TYPE_CODING = 'coding'
ANALYSIS_TYPE_TRANSCRIPT = 'transcript'
ANALYSIS_TYPE_TRANSLATION = 'translation'

# Every analysis type declared above, gathered in a single list.
ANALYSIS_TYPES = [
    ANALYSIS_TYPE_CODING,
    ANALYSIS_TYPE_TRANSCRIPT,
    ANALYSIS_TYPE_TRANSLATION,
]
8 changes: 7 additions & 1 deletion src/formpack/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import json
from collections import OrderedDict
from copy import deepcopy
from typing import Dict

from formpack.schema.fields import CopyField
from .version import FormVersion
from .version import FormVersion, AnalysisForm
from .reporting import Export, AutoReport
from .utils.expand_content import expand_content
from .utils.replace_aliases import replace_aliases
Expand Down Expand Up @@ -52,6 +53,8 @@ def __init__(

self.asset_type = asset_type

self.analysis_form = None

self.load_all_versions(versions)

# FIXME: Find a safe way to use this. Wrapping with try/except isn't enough
Expand Down Expand Up @@ -176,6 +179,9 @@ def load_version(self, schema):

self.versions[form_version.id] = form_version

def extend_survey(self, analysis_form: Dict) -> None:
    """
    Attach an analysis form to this instance.

    The `analysis_form` dict is wrapped in an `AnalysisForm` bound to
    `self` and stored on `self.analysis_form`, replacing whatever was
    stored there before.
    """
    wrapped_form = AnalysisForm(self, analysis_form)
    self.analysis_form = wrapped_form

def version_diff(self, vn1, vn2):
v1 = self.versions[vn1]
v2 = self.versions[vn2]
Expand Down
71 changes: 62 additions & 9 deletions src/formpack/reporting/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,23 @@
import zipfile
from collections import defaultdict, OrderedDict
from inspect import isclass
from typing import Iterator, Generator, Optional
from typing import (
Dict,
Generator,
Iterator,
Optional,
)

import xlsxwriter

from ..constants import (
ANALYSIS_TYPE_TRANSCRIPT,
ANALYSIS_TYPE_TRANSLATION,
GEO_QUESTION_TYPES,
TAG_COLUMNS_AND_SEPARATORS,
UNSPECIFIED_TRANSLATION,
)
from ..schema import CopyField
from ..schema import CopyField, FormField
from ..submission import FormSubmission
from ..utils.exceptions import FormPackGeoJsonError
from ..utils.flatten_content import flatten_tag_list
Expand Down Expand Up @@ -60,9 +67,11 @@ def __init__(
:param tag_cols_for_header: list
:param filter_fields: list
:param xls_types_as_text: bool
:param include_media_url: bool
"""

self.formpack = formpack
self.analysis_form = formpack.analysis_form
self.lang = lang
self.group_sep = group_sep
self.title = title
Expand All @@ -81,6 +90,12 @@ def __init__(
tag_cols_for_header = []
self.tag_cols_for_header = tag_cols_for_header

_filter_fields = []
for item in self.filter_fields:
item = re.sub(r'^_supplementalDetails/', '', item)
_filter_fields.append(item)
self.filter_fields = _filter_fields

# If some fields need to be arbitrarily copied, add them
# to the first section
if copy_fields:
Expand Down Expand Up @@ -224,6 +239,9 @@ def get_fields_labels_tags_for_all_versions(

# Ensure that fields are filtered if they've been specified, otherwise
# carry on as usual
if self.analysis_form:
all_fields = self.analysis_form.insert_analysis_fields(all_fields)

if self.filter_fields:
all_fields = [
field
Expand Down Expand Up @@ -320,6 +338,7 @@ def format_one_submission(
submission,
current_section,
attachments=None,
supplemental_details=None,
):

# 'current_section' is the name of what will become sheets in xls.
Expand Down Expand Up @@ -382,17 +401,46 @@ def _get_attachment(val, field, attachments):
if re.match(fr'^.*/{_val}$', f['filename']) is not None
]

def _get_value_from_entry(entry, field):
def _get_value_from_supplemental_details(
    field: FormField, supplemental_details: Dict
) -> Optional[str]:
    """
    Look up the stored analysis value of `field`.

    `field.analysis_path` is unpacked as `(source, name)`: `source` selects
    the entry of `supplemental_details` and `name` the key inside it.

    Returns `None` when nothing (or an empty value) is stored for `source`,
    an empty string when the entry exists but has no value under `name`,
    and the stored value unchanged otherwise.
    """
    source, key = field.analysis_path
    details = supplemental_details.get(source, {})
    if not details:
        return None

    # Translation and transcript fields are named
    # `translation_<language code>` / `transcript_<language code>`, but
    # their values live under the bare `translation` / `transcript` key,
    # so the language part has to be dropped before the lookup.
    prefix_match = re.match(r'^(translation|transcript)_', key)
    if prefix_match:
        key = prefix_match.group(1)

    value = details.get(key)
    return '' if value is None else value

def _get_value_from_entry(
    entry: Dict, field: FormField, supplemental_details: Dict
) -> Optional[str]:
    """
    Return the raw value of `field` for one submission entry.

    Analysis questions are read from `supplemental_details` whenever it is
    non-empty. Every other case (including an analysis question with no
    supplemental details) is read from `entry` by the field path, with
    audit fields looked up under the `meta/` prefix.
    """
    use_supplemental = field.analysis_question and supplemental_details
    if use_supplemental:
        return _get_value_from_supplemental_details(
            field, supplemental_details
        )

    prefix = ''
    if field.data_type == 'audit':
        prefix = 'meta/'
    return entry.get(f'{prefix}{field.path}')

if self.analysis_form:
_fields = self.analysis_form.insert_analysis_fields(_fields)

# Ensure that fields are filtered if they've been specified, otherwise
# carry on as usual
if self.filter_fields:
_fields = tuple(
field
for field in current_section.fields.values()
if field.path in self.filter_fields
field for field in _fields if field.path in self.filter_fields
)

# 'rows' will contain all the formatted entries for the current
Expand Down Expand Up @@ -423,13 +471,17 @@ def _get_value_from_entry(entry, field):
row.update(_empty_row)

attachments = entry.get('_attachments') or attachments
supplemental_details = (
entry.get('_supplementalDetails') or supplemental_details
)

for field in _fields:
# TODO: pass a context to fields so they can all format ?
if field.can_format:

# get submission value for this field
val = _get_value_from_entry(entry, field)
val = _get_value_from_entry(
entry, field, supplemental_details
)
# get the attachment for this field
attachment = _get_attachment(val, field, attachments)
# get a mapping of {"col_name": "val", ...}
Expand Down Expand Up @@ -493,7 +545,8 @@ def _get_value_from_entry(entry, field):
chunk = self.format_one_submission(
entry[child_section.path],
child_section,
attachments,
attachments=attachments,
supplemental_details=supplemental_details,
)
for key, value in iter(chunk.items()):
if key in chunks:
Expand Down
129 changes: 111 additions & 18 deletions src/formpack/schema/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
import statistics

from .datadef import FormDataDef, FormChoice
from ..constants import UNSPECIFIED_TRANSLATION
from ..constants import (
ANALYSIS_TYPES,
ANALYSIS_TYPE_CODING,
ANALYSIS_TYPE_TRANSCRIPT,
ANALYSIS_TYPE_TRANSLATION,
UNSPECIFIED_TRANSLATION,
)
from ..utils import singlemode
from ..utils.ordered_collection import OrderedDefaultdict

Expand All @@ -35,6 +41,20 @@ def __init__(
self.section = section
self.can_format = can_format
self.tags = kwargs.get('tags', [])
self.analysis_question = False

source = kwargs.get('source')
if source is not None:
self.source = source
self.analysis_question = True
self.analysis_type = kwargs.get('analysis_type')
self.analysis_path = kwargs.get('analysis_path')
self.settings = kwargs.get('settings')
if self.analysis_type in [
ANALYSIS_TYPE_TRANSCRIPT,
ANALYSIS_TYPE_TRANSLATION,
]:
self.language = kwargs['language']

hierarchy = list(hierarchy) if hierarchy is not None else [None]
self.hierarchy = hierarchy + [self]
Expand All @@ -45,11 +65,15 @@ def __init__(
if has_stats is not None:
self.has_stats = has_stats
else:
self.has_stats = data_type != 'note'
self.has_stats = data_type != 'note' and not self.analysis_question

# do not include the root section in the path
self.path = '/'.join(info.name for info in self.hierarchy[1:])

@property
def qpath(self):
    """
    `self.path` with every `/` separator replaced by `-`.
    """
    return '-'.join(self.path.split('/'))

def get_labels(
self,
lang=UNSPECIFIED_TRANSLATION,
Expand Down Expand Up @@ -139,7 +163,7 @@ def _get_label(
# even if `lang` can be None, we don't want the `label` to be None.
label = self.labels.get(lang, self.name)
# If `label` is None, no matches are found, so return `field` name.
return self.name if label is None else label
return label or self.name

def __repr__(self):
args = (self.__class__.__name__, self.name, self.data_type)
Expand Down Expand Up @@ -178,13 +202,22 @@ def from_json_definition(
labels = cls._extract_json_labels(definition, translations)
appearance = definition.get('appearance')
or_other = definition.get('_or_other', False)
source = definition.get('source')
analysis_type = definition.get('analysis_type', ANALYSIS_TYPE_CODING)
settings = definition.get('settings', {})
analysis_path = definition.get('path')
languages = definition.get('languages')
language = definition.get('language')

# normalize spaces
data_type = definition['type']

if ' ' in data_type:
raise ValueError('invalid data_type: %s' % data_type)

if analysis_type not in ANALYSIS_TYPES:
raise ValueError(f'Invalid analysis data type: {analysis_type}')

if data_type in ('select_one', 'select_multiple'):
choice_id = definition['select_from_list_name']
# pyxform#472 introduced dynamic list_names for select_one with the
Expand Down Expand Up @@ -246,6 +279,12 @@ def from_json_definition(
'section': section,
'choice': choice,
'or_other': or_other,
'source': source,
'analysis_type': analysis_type,
'settings': settings,
'analysis_path': analysis_path,
'language': language,
'languages': languages,
}

if data_type == 'select_multiple' and appearance == 'literacy':
Expand Down Expand Up @@ -424,21 +463,6 @@ def get_substats(


class TextField(ExtendedFormField):
def get_stats(self, metrics, lang=UNSPECIFIED_TRANSLATION, limit=100):
    """
    Extend the parent class statistics with the most frequent answers.

    Adds two entries to the stats returned by the parent implementation:
    `frequency`, the `limit` most common `(value, count)` pairs of
    `metrics`, and `percentage`, the same values paired with the result
    of `self._get_percentage(count, total_count)`.
    """
    stats = super().get_stats(metrics, lang, limit)

    frequency = metrics.most_common(limit)
    total = stats['total_count']
    percentage = [
        (value, self._get_percentage(count, total))
        for value, count in frequency
    ]

    stats.update(frequency=frequency, percentage=percentage)
    return stats

def get_disaggregated_stats(
self, metrics, top_splitters, lang=UNSPECIFIED_TRANSLATION, limit=100
):
Expand All @@ -459,6 +483,75 @@ def sum_frequencies(element):

return stats

def get_labels(
    self,
    lang=UNSPECIFIED_TRANSLATION,
    group_sep='/',
    hierarchy_in_labels=False,
    multiple_select='both',
    *args,
    **kwargs,
):
    """
    Return the column labels of this field as a one-item list.

    Transcript and translation analysis fields are labelled after their
    source field, as `<source label> - <transcript|translation>
    (<language>)`. Any other field uses its own label. Extra positional
    and keyword arguments are accepted but ignored.
    """
    label_args = (lang, group_sep, hierarchy_in_labels, multiple_select)

    analysis_type = getattr(self, 'analysis_type', None)
    if analysis_type not in (
        ANALYSIS_TYPE_TRANSCRIPT,
        ANALYSIS_TYPE_TRANSLATION,
    ):
        return [self._get_label(*label_args)]

    source_label = self.source_field._get_label(*label_args)
    kind = 'translation' if self._is_translation else 'transcript'
    return [f'{source_label} - {kind} ({self.language})']

def get_stats(self, metrics, lang=UNSPECIFIED_TRANSLATION, limit=100):
    """
    Extend the parent class statistics with the most frequent answers.

    Adds two entries to the stats returned by the parent implementation:
    `frequency`, the `limit` most common `(value, count)` pairs of
    `metrics`, and `percentage`, the same values paired with the result
    of `self._get_percentage(count, total_count)`.
    """
    stats = super().get_stats(metrics, lang, limit)

    frequency = metrics.most_common(limit)
    total = stats['total_count']
    percentage = [
        (value, self._get_percentage(count, total))
        for value, count in frequency
    ]

    stats.update(frequency=frequency, percentage=percentage)
    return stats

@property
def _is_transcript(self):
    """
    Whether this field's `analysis_type` is the transcript type.

    A field without an `analysis_type` attribute is not a transcript.
    """
    analysis_type = getattr(self, 'analysis_type', '')
    return analysis_type == ANALYSIS_TYPE_TRANSCRIPT

@property
def _is_translation(self):
    """
    Whether this field's `analysis_type` is the translation type.

    A field without an `analysis_type` attribute is not a translation.
    """
    analysis_type = getattr(self, 'analysis_type', '')
    return analysis_type == ANALYSIS_TYPE_TRANSLATION

def format(
    self,
    val,
    lang=UNSPECIFIED_TRANSLATION,
    group_sep='/',
    hierarchy_in_labels=False,
    multiple_select='both',
    xls_types_as_text=True,
    *args,
    **kwargs,
):
    """
    Format `val` as a `{self.name: value}` mapping for export.

    `None` is exported as an empty string. When `val` is a dict:

    - for a translation field, the exported value is
      `val[self.language]['value']`, or `''` when that entry is missing
      or is not a mapping;
    - for a transcript field, the exported value is `val['value']` only
      when `val['languageCode']` equals `self.language`; otherwise (or
      when either key is missing) it is `''`.

    Any other value, including a dict on a field that is neither a
    translation nor a transcript, is exported unchanged. The remaining
    parameters are accepted for signature compatibility and are not used
    here.
    """
    if val is None:
        val = ''

    if isinstance(val, dict):
        if self._is_translation:
            try:
                val = val[self.language]['value']
            except (KeyError, TypeError):
                # Missing language / missing `value` / entry that is not
                # a mapping (e.g. `None`): nothing to export.
                val = ''
        elif self._is_transcript:
            # Degrade to an empty cell instead of raising `KeyError` on a
            # partial transcript dict, like the translation branch does.
            matches_language = (
                'languageCode' in val
                and val['languageCode'] == self.language
            )
            val = val.get('value', '') if matches_language else ''

    return {self.name: val}


class MediaField(TextField):
def get_labels(self, include_media_url=False, *args, **kwargs):
Expand Down
Loading

0 comments on commit 43daf1d

Please sign in to comment.