diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..3a8bfaf
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,70 @@
+name: Build and test
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  package:
+    name: Build package
+    runs-on: "ubuntu-latest"
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniconda-version: "latest"
+          activate-environment: build-intake-ga
+          environment-file: etc/build-environment.yml
+          python-version: "3.8"
+          auto-activate-base: false
+      - name: Conda Build
+        shell: bash -l {0}
+        run: |
+          conda build conda.recipe --no-test
+          mv $CONDA_PREFIX/conda-bld .
+      - name: Upload conda-bld directory
+        uses: actions/upload-artifact@v2
+        with:
+          name: package-${{ github.sha }}
+          path: ./conda-bld
+  test:
+    name: Test (${{ matrix.python-version }}, ${{ matrix.os }})
+    # The artifact downloaded below is uploaded by the package job, so wait for it.
+    needs: package
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest", "macos-latest", "windows-latest"]
+        python-version: ["3.7", "3.8", "3.9"]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniconda-version: "latest"
+          python-version: ${{ matrix.python-version }}
+          activate-environment: test-intake-ga
+          environment-file: etc/test-environment.yml
+      - name: Download the build artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: package-${{ github.sha }}
+          path: ~/.conda/conda-bld
+      - name: py.test
+        shell: bash -l {0}
+        run: |
+          conda install -n test-intake-ga --use-local ~/.conda/conda-bld/noarch/intake-google-analytics-*.tar.bz2
+          py.test -xv
+      - name: Codecov
+        uses: codecov/codecov-action@v1
+        # Define the variables named in env_vars; scoped to this step only.
+        env:
+          OS: ${{ matrix.os }}
+          PYTHON: ${{ matrix.python-version }}
+        with:
+          file: ./cov.xml
+          env_vars: OS,PYTHON
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 71e6852..2c423e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,8 @@ nosetests.xml
 coverage.xml
 *,cover
 .hypothesis/
+junit.xml
+cov.xml
 
 # Translations
 *.mo
diff --git a/environment.yml b/environment.yml
index 0ca9cf9..688fc2e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,6 +7,8 @@ dependencies:
   - pandas
   - intake
   - flake8
+  - pytest
+  - pytest-cov
 channels:
   - defaults
-  - conda-forge
\ No newline at end of file
+  - conda-forge
diff --git a/etc/build-environment.yml b/etc/build-environment.yml
new file mode 100644
index 0000000..e682081
--- /dev/null
+++ b/etc/build-environment.yml
@@ -0,0 +1,5 @@
+name: build-intake-ga
+dependencies:
+  - conda
+  - conda-build
+  - conda-verify
diff --git a/etc/test-environment.yml b/etc/test-environment.yml
new file mode 100644
index 0000000..fb92fd8
--- /dev/null
+++ b/etc/test-environment.yml
@@ -0,0 +1,12 @@
+name: test-intake-ga
+dependencies:
+  - google-api-python-client
+  - google-auth-oauthlib
+  - pandas
+  - intake
+  - flake8
+  - pytest
+  - pytest-cov
+channels:
+  - defaults
+  - conda-forge
diff --git a/intake_google_analytics/source.py b/intake_google_analytics/source.py
index 492b3ad..608a8db 100644
--- a/intake_google_analytics/source.py
+++ b/intake_google_analytics/source.py
@@ -17,7 +17,7 @@
     "INTEGER": int,
     "TIME": float,
     "PERCENT": float,
-    "STRING": str,
+    "FLOAT": float,
     "CURRENCY": float
 }
 
@@ -90,51 +90,41 @@ def to_dask(self):
         raise NotImplementedError()
 
     def _close(self):
-        self._dataframe = None
+        self._df = None
 
 
 class GoogleAnalyticsAPI(object):
-    def __init__(self, credentials_path=None):
-        credentials = None
-
-        if credentials_path:
-            credentials = Credentials.from_service_account_file(credentials_path)
+    def __init__(self, credentials_path):
+        self._credentials_path = credentials_path
+        self.client = self.create_client()
 
-        self.client = discovery.build('analyticsreporting', 'v4',
-                                      credentials=credentials,
-                                      cache_discovery=False).reports()
+    def create_client(self):
+        """Return the ``reports()`` resource of the Analytics Reporting v4 API."""
+        credentials = Credentials.from_service_account_file(self._credentials_path)
+        c = discovery.build('analyticsreporting', 'v4',
+                            credentials=credentials,
+                            cache_discovery=False).reports()
+        return c
 
-    def query(self, view_id: str, start_date: DateTypes, end_date: DateTypes, metrics: list,
-              dimensions: list = None, filters: list = None):
+    def query(self, view_id: str, start_date: DateTypes, end_date: DateTypes,
+              metrics: list, dimensions: list = None, filters: list = None):
         result = self._query(
             view_id=view_id, start_date=start_date, end_date=end_date,
             metrics=metrics, dimensions=dimensions, filters=filters
         )
 
         df = self._to_dataframe(result)
-
         return df
 
-    def _query(self, view_id: str, start_date: DateTypes, end_date: DateTypes, metrics: list,
+    def _build_body(self, view_id: str, start_date: DateTypes, end_date: DateTypes, metrics: list,
                dimensions: list = None, filters: list = None):
+        """Assemble the ``batchGet`` request body holding a single report request."""
 
-        date_range = {'startDate': start_date, 'endDate': end_date}
-        for key, value in date_range.items():
-            if is_dt(value):
-                date_range[key] = as_day(value)
-            elif value.lower() in ['yesterday', 'today']:
-                date_range[key] = value.lower()
-            elif re.match(YYYY_MM_DD, value):
-                pass
-            elif re.match(r'\d+DaysAgo', value):
-                pass
-            else:
-                raise ValueError(f'{key}={value} is not a supported date.\n'
-                                 f'Please use a date/datetime object.')
-
-        body = {
-            'reportRequests': []
+        date_range = {
+            'startDate': self._parse_date(start_date),
+            'endDate': self._parse_date(end_date)
         }
+
         request = {
             'viewId': view_id,
             'dateRanges': [date_range],
@@ -151,11 +141,23 @@ def _query(self, view_id: str, start_date: DateTypes, end_date: DateTypes, metri
         if filters:
             request['filtersExpression'] = filters
 
-        body['reportRequests'].append(request)
+        body = {'reportRequests': [request]}
+        return body
+
+    def _query(self, view_id: str, start_date: DateTypes, end_date: DateTypes, metrics: list,
+               dimensions: list = None, filters: list = None):
+
+        body = self._build_body(
+            view_id=view_id, start_date=start_date, end_date=end_date,
+            metrics=metrics, dimensions=dimensions, filters=filters
+        )
 
         result = self.client.batchGet(body=body).execute()
+
         report = result['reports'][0]
-        expected_rows = report['data']['rowCount']
+        expected_rows = report['data'].get('rowCount', 0)
+        if expected_rows == 0:
+            return report
 
         while result['reports'][0].get('nextPageToken'):
             body['reportRequests'][0]['pageToken'] = result['reports'][0].get('nextPageToken')
@@ -238,3 +240,26 @@ def _parse_fields(fields, style):
             raise ValueError('\n'.join(errors))
 
         return parsed
+
+    @staticmethod
+    def _parse_date(value):
+        """Return *value* as a date string for a report date range.
+
+        Date/datetime objects (as judged by ``is_dt``) are formatted with
+        ``as_day``. The strings "yesterday" and "today", "NDaysAgo" strings
+        and strings matching ``YYYY_MM_DD`` are returned unchanged.
+
+        Raises ``ValueError`` for any other string.
+        """
+        if is_dt(value):
+            return as_day(value)
+        elif value in ['yesterday', 'today']:
+            return value
+        elif re.match(YYYY_MM_DD, value):
+            return value
+        elif re.fullmatch(r'[0-9]+DaysAgo', value):
+            return value
+        else:
+            raise ValueError(f'{value} is not a supported date.\n'
+                             f'Please use a date/datetime object or string of the following formats:\n'
+                             f'"yesterday", "today", "NDaysAgo", "YYYY-MM-DD"')
diff --git a/setup.cfg b/setup.cfg
index 9ca8086..53df316 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,6 +9,8 @@ addopts =
     --junitxml=junit.xml
     --ignore setup.py
     --ignore run_test.py
+    --cov=intake_google_analytics
+    --cov-report=xml:cov.xml
     --cov-report term-missing
     --tb native
     --strict-markers
diff --git a/tests/test_source.py b/tests/test_source.py
new file mode 100644
index 0000000..d3cae52
--- /dev/null
+++ b/tests/test_source.py
@@ -0,0 +1,384 @@
+import datetime as dt
+
+import pandas as pd
+import pytest
+import intake
+from intake_google_analytics.source import GoogleAnalyticsAPI
+from pandas.api.types import (is_datetime64_any_dtype, is_float_dtype,
+                              is_integer_dtype)
+from pandas.testing import assert_frame_equal
+
+
+def test_parse_fields_wrong_style():
+    with pytest.raises(ValueError):
+        GoogleAnalyticsAPI._parse_fields(['ga:users'], style='nope')
+
+
+def test_parse_metrics():
+    metrics = ['ga:users']
+    parsed = GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+    assert parsed == [{'expression': 'ga:users'}]
+
+    metrics = ['ga:users', 'ga:session']
+    parsed = GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+    assert parsed == [{'expression': 'ga:users'}, {'expression': 'ga:session'}]
+
+    metrics = ['ga:users', {"expression": 'ga:session', 'alias': 'Session'}]
+    parsed = GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+    assert parsed == [
+        {'expression': 'ga:users'},
+        {"expression": 'ga:session', 'alias': 'Session'}
+    ]
+
+    metrics = [{"expression": 'ga:session'}]
+    parsed = GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+    assert parsed == metrics
+
+    metrics = [{"expression": 'ga:session', 'alias': 'Session'}]
+    parsed = GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+    assert parsed == metrics
+
+    with pytest.raises(ValueError):
+        metrics = [{"espresso": 'ga:session', 'alias': 'Session'}]
+        GoogleAnalyticsAPI._parse_fields(metrics, style='metrics')
+
+    with pytest.raises(ValueError):
+        GoogleAnalyticsAPI._parse_fields([1], style='metrics')
+
+
+def test_parse_dimensions():
+    dimensions = ['ga:userType']
+    parsed = GoogleAnalyticsAPI._parse_fields(dimensions, style='dimensions')
+    assert parsed == [{'name': 'ga:userType'}]
+
+    dimensions = ['ga:userType', 'ga:date']
+    parsed = GoogleAnalyticsAPI._parse_fields(dimensions, style='dimensions')
+    assert parsed == [{'name': 'ga:userType'}, {'name': 'ga:date'}]
+
+    dimensions = ['ga:userType', {'name': 'ga:date'}]
+    parsed = GoogleAnalyticsAPI._parse_fields(dimensions, style='dimensions')
+    assert parsed == [{'name': 'ga:userType'}, {'name': 'ga:date'}]
+
+    dimensions = [{'name': 'ga:date'}]
+    parsed = GoogleAnalyticsAPI._parse_fields(dimensions, style='dimensions')
+    assert parsed == dimensions
+
+    with pytest.raises(ValueError):
+        dimensions = [{"nom": 'ga:date'}]
+        GoogleAnalyticsAPI._parse_fields(dimensions, style='dimensions')
+
+    with pytest.raises(ValueError):
+        GoogleAnalyticsAPI._parse_fields([1], style='dimensions')
+
+
+def test_parse_date_objects():
+    assert GoogleAnalyticsAPI._parse_date('2020-03-19') == '2020-03-19'
+    assert GoogleAnalyticsAPI._parse_date(dt.date(2020, 3, 19)) == '2020-03-19'
+    assert GoogleAnalyticsAPI._parse_date(dt.datetime(2020, 3, 19, 16, 20, 0)) == '2020-03-19'
+    assert GoogleAnalyticsAPI._parse_date(pd.to_datetime('2020-03-19 16:20:00')) == '2020-03-19'
+    assert GoogleAnalyticsAPI._parse_date(pd.Timestamp(2020, 3, 19, 16, 20, 0)) == '2020-03-19'
+
+    with pytest.raises(TypeError):
+        GoogleAnalyticsAPI._parse_date(dt.timedelta(days=2))
+
+
+def test_parse_date_strings():
+    assert GoogleAnalyticsAPI._parse_date('yesterday') == 'yesterday'
+    assert GoogleAnalyticsAPI._parse_date('today') == 'today'
+    assert GoogleAnalyticsAPI._parse_date('1000DaysAgo') == '1000DaysAgo'
+
+    with pytest.raises(ValueError):
+        GoogleAnalyticsAPI._parse_date('tomorrow')
+
+    with pytest.raises(ValueError):
+        GoogleAnalyticsAPI._parse_date('πDaysAgo')
+
+
+def test_query_body(monkeypatch):
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: None)
+
+    inputs = {
+        'view_id': 'VIEWID',
+        'start_date': '5DaysAgo', 'end_date': 'yesterday',
+        'metrics': ['ga:users']
+    }
+    expected_body = {'reportRequests': [
+        {'dateRanges': [{'endDate': 'yesterday', 'startDate': '5DaysAgo'}],
+         'hideTotals': True,
+         'hideValueRanges': True,
+         'includeEmptyRows': True,
+         'metrics': [{'expression': 'ga:users'}],
+         'viewId': 'VIEWID'}
+    ]}
+
+    client = GoogleAnalyticsAPI(None)
+    body = client._build_body(**inputs)
+    assert body == expected_body
+
+
+def test_query_body_with_dimensions(monkeypatch):
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: None)
+
+    inputs = {
+        'view_id': 'VIEWID',
+        'start_date': '5DaysAgo', 'end_date': 'yesterday',
+        'metrics': ['ga:users'],
+        'dimensions': ['ga:userType']
+    }
+    expected_body = {'reportRequests': [
+        {'dateRanges': [{'endDate': 'yesterday', 'startDate': '5DaysAgo'}],
+         'hideTotals': True,
+         'hideValueRanges': True,
+         'includeEmptyRows': True,
+         'metrics': [{'expression': 'ga:users'}],
+         'dimensions': [{'name': 'ga:userType'}],
+         'viewId': 'VIEWID'}
+    ]}
+
+    client = GoogleAnalyticsAPI(None)
+    body = client._build_body(**inputs)
+    assert body == expected_body
+
+
+def test_dataframe_empty_report():
+    report = {
+        'columnHeader':
+            {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users', 'type': 'INTEGER'}]}},
+        'data': {}
+    }
+    df = GoogleAnalyticsAPI._to_dataframe(report)
+    assert df.empty
+
+
+datetime_dimensions = [
+    ('ga:yearMonth', '202003'),
+    ('ga:date', '20200319'),
+    ('ga:dateHour', '2020031916'),
+    ('ga:dateHourMinute', '202003191620'),
+]
+
+
+@pytest.mark.parametrize('dimension', datetime_dimensions, ids=[p[0] for p in datetime_dimensions])
+def test_dataframe_datetime_dimensions(dimension):
+    dim, value = dimension
+
+    report = {
+        'columnHeader':
+            {'dimensions': [dim],
+             'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users', 'type': 'INTEGER'}]}},
+        'data': {
+            'rowCount': 1,
+            'rows': [{'dimensions': [value],
+                      'metrics': [{'values': ['1']}]}]
+        }
+    }
+    df = GoogleAnalyticsAPI._to_dataframe(report)
+    assert is_datetime64_any_dtype(df[dim])
+
+
+def test_dataframe_multiple_datetime_dimensions():
+    multi_column = {
+        'columnHeader':
+            {'dimensions': ['ga:date', 'ga:dateHourMinute'],
+             'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users', 'type': 'INTEGER'}]}},
+        'data': {
+            'rowCount': 1,
+            'rows': [{'dimensions': ['20200319', '202003191620'],
+                      'metrics': [{'values': ['1']}]}]
+        }
+    }
+    df = GoogleAnalyticsAPI._to_dataframe(multi_column)
+    assert is_datetime64_any_dtype(df['ga:dateHourMinute'])
+    assert is_datetime64_any_dtype(df['ga:date'])
+
+
+metric_dtypes = [
+    ('INTEGER', "ga:users", '1', is_integer_dtype),
+    ('TIME', 'ga:sessionDuration', '1.1', is_float_dtype),
+    ('PERCENT', 'ga:percentNewSessions', '1.1', is_float_dtype),
+    ('CURRENCY', 'ga:goalValueAll', '1.1', is_float_dtype),
+    ('FLOAT', 'ga:pageviewsPerSession', '1.1', is_float_dtype)
+]
+
+
+@pytest.mark.parametrize('metric', metric_dtypes, ids=[p[0] for p in metric_dtypes])
+def test_dataframe_metric_dtype(metric):
+    ga_type, column, value, test_func = metric
+
+    report = {
+        'columnHeader':
+            {'metricHeader': {'metricHeaderEntries':
+                              [{'name': column, 'type': ga_type}]}},
+        'data': {
+            'rowCount': 1,
+            'rows': [{'metrics': [{'values': [value]}]}]
+        }
+    }
+    df = GoogleAnalyticsAPI._to_dataframe(report)
+    assert test_func(df[column])
+
+
+class MockGAClient():
+    def __init__(self, credentials_path):
+        pass
+
+    def batchGet(self, body):
+        return MockGABatch(body)
+
+
+class MockGABatch():
+    def __init__(self, body):
+        self.body = body
+
+    def execute(self):
+        pass
+
+
+def test_query_to_dataframe(monkeypatch):
+    monkeypatch.setattr(MockGABatch, 'execute', lambda body: {
+        'reports': [
+            {'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                        'type': 'INTEGER'}]}},
+             'data': {'rowCount': 1, 'rows': [{'metrics': [{'values': ['1']}]}]}}
+        ]
+    }
+    )
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: MockGAClient(x))
+
+    ga_api = GoogleAnalyticsAPI(None)
+    df = ga_api.query(
+        'VIEWID',
+        start_date='5DaysAgo', end_date='yesterday',
+        metrics=['ga:user']
+    )
+    assert_frame_equal(df, pd.DataFrame([{'ga:users': 1}]), check_dtype=False)
+
+
+def test_query_wrong_row_count(monkeypatch):
+    monkeypatch.setattr(MockGABatch, 'execute', lambda body: {
+        'reports': [
+            {'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                        'type': 'INTEGER'}]}},
+             'data': {'rowCount': 1, 'rows': [
+                 {'metrics': [{'values': ['1']}]},
+                 {'metrics': [{'values': ['2']}]}
+             ]}}
+        ]
+    }
+    )
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: MockGAClient(x))
+
+    ga_api = GoogleAnalyticsAPI(None)
+    with pytest.raises(RuntimeError):
+        _ = ga_api._query(
+            'VIEWID',
+            start_date='5DaysAgo', end_date='yesterday',
+            metrics=['ga:user']
+        )
+
+
+def test_query_empty_result(monkeypatch):
+    monkeypatch.setattr(MockGABatch, 'execute', lambda body: {
+        'reports': [
+            {'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                        'type': 'INTEGER'}]}},
+             'data': {}}
+        ]
+    }
+    )
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: MockGAClient(x))
+
+    ga_api = GoogleAnalyticsAPI(None)
+    df = ga_api.query(
+        'VIEWID',
+        start_date='5DaysAgo', end_date='yesterday',
+        metrics=['ga:user']
+    )
+    assert df.empty
+
+
+def test_paginated_result(monkeypatch):
+    def execute(self):
+        paginated = [
+            {'reports': [
+                {'nextPageToken': 1,
+                 'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                            'type': 'INTEGER'}]}},
+                 'data': {'rowCount': 6, 'rows': [
+                     {'metrics': [{'values': ['1']}]}, {'metrics': [{'values': ['2']}]}
+                 ]}}
+            ]
+            },
+            {'reports': [
+                {'nextPageToken': 2,
+                 'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                            'type': 'INTEGER'}]}},
+                 'data': {'rowCount': 6, 'rows': [
+                     {'metrics': [{'values': ['3']}]}, {'metrics': [{'values': ['4']}]}
+                 ]}}
+            ]
+            },
+            {'reports': [
+                {'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                            'type': 'INTEGER'}]}},
+                 'data': {'rowCount': 6, 'rows': [
+                     {'metrics': [{'values': ['5']}]}, {'metrics': [{'values': ['6']}]}
+                 ]}}
+            ]
+            },
+        ]
+
+        page_token = self.body['reportRequests'][0].get('pageToken', 0)
+        return paginated[page_token]
+
+    monkeypatch.setattr(MockGABatch, 'execute', execute)
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: MockGAClient(x))
+
+    ga_api = GoogleAnalyticsAPI(None)
+    df = ga_api.query(
+        'VIEWID',
+        start_date='5DaysAgo', end_date='yesterday',
+        metrics=['ga:user']
+    )
+    assert len(df) == 6
+
+
+def test_load_dataset(monkeypatch):
+    monkeypatch.setattr(MockGABatch, 'execute', lambda body: {
+        'reports': [
+            {'columnHeader': {'metricHeader': {'metricHeaderEntries': [{'name': 'ga:users',
+                                                                        'type': 'INTEGER'}]}},
+             'data': {'rowCount': 1, 'rows': [{'metrics': [{'values': ['1']}]}]}}
+        ]
+    }
+    )
+    monkeypatch.setattr(GoogleAnalyticsAPI, 'create_client', lambda x: MockGAClient(x))
+
+    ds = intake.open_google_analytics_query(
+        'VIEWID',
+        start_date='5DaysAgo', end_date='yesterday',
+        metrics=['ga:user'],
+        credentials_path=None
+    )
+
+    assert ds.name == 'google_analytics_query'
+    assert ds.container == 'dataframe'
+
+    yaml = """sources:
+  google_analytics_query:
+    args:
+      credentials_path: null
+      end_date: yesterday
+      metrics:
+      - ga:user
+      start_date: 5DaysAgo
+      view_id: VIEWID
+    description: ''
+    driver: intake_google_analytics.source.GoogleAnalyticsQuerySource
+    metadata: {}
+"""
+
+    assert ds.yaml() == yaml
+
+    df = ds.read()
+    assert_frame_equal(df, pd.DataFrame([{'ga:users': 1}]), check_dtype=False)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..f2a1d2a
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,30 @@
+import datetime as dt
+
+import pandas as pd
+import pytest
+from intake_google_analytics.utils import as_day, is_dt
+
+
+def test_is_dt():
+    assert is_dt(dt.date(2020, 3, 19))
+    assert is_dt(dt.datetime(2020, 3, 19, 16, 20, 0))
+    assert is_dt(pd.to_datetime('2020-03-19'))
+    assert is_dt(pd.Timestamp(2020, 3, 19))
+
+    assert not is_dt('2020-03-19')
+    assert not is_dt(dt.timedelta(days=1))
+    assert not is_dt(pd.DateOffset(months=2))
+
+
+def test_as_day():
+    assert as_day(dt.date(2020, 3, 19)) == '2020-03-19'
+    assert as_day(dt.datetime(2020, 3, 19, 16, 20, 0)) == '2020-03-19'
+    assert as_day(pd.to_datetime('2020-03-19')) == '2020-03-19'
+    assert as_day(pd.to_datetime('2020-03-19 16:20:00')) == '2020-03-19'
+    assert as_day(pd.Timestamp(2020, 3, 19)) == '2020-03-19'
+
+    # One ``raises`` block per input: a shared block exits at the first
+    # exception, so the later calls would never be executed.
+    for not_a_date in (dt.timedelta(days=1), pd.DateOffset(days=1), '2020-03-19'):
+        with pytest.raises(AttributeError):
+            as_day(not_a_date)