From c063637ad9173554b22c503c3200e81455002ef2 Mon Sep 17 00:00:00 2001
From: David Caro
Date: Thu, 10 Aug 2017 14:52:16 +0200
Subject: [PATCH 01/11] spiders: add desy spider

Signed-off-by: David Caro
---
 docs/spiders.rst                              |   6 +
 hepcrawl/pipelines.py                         | 145 +++---
 hepcrawl/settings.py                          |   2 +-
 hepcrawl/spiders/desy_spider.py               | 215 ++++++++++
 .../desy/fixtures/desy_ftp_records.json       | 308 ++++++++++++++
 .../desy/fixtures/desy_local_records.json     | 308 ++++++++++++++
 .../desy/fixtures/ftp_server/.netrc           |   3 +
 .../ftp_server/DESY/FFT/test_fft_1.txt        |   1 +
 .../ftp_server/DESY/FFT/test_fft_2.txt        |   1 +
 .../DESY/desy_collection_records.xml          | 149 +++++++
 .../desy_no_namespace_collection_records.xml  | 149 +++++++
 .../ftp_server/DESY/file_not_for_download.txt |   1 +
 .../desy/fixtures/ftp_server/pureftpd.passwd  |   1 +
 tests/functional/desy/test_desy.py            | 211 ++++++++++
 .../desy/desy_collection_records.xml          | 149 +++++++
 tests/unit/responses/desy/desy_record.xml     |  76 ++++
 tests/unit/test_desy.py                       | 381 ++++++++++++++++++
 17 files changed, 2025 insertions(+), 81 deletions(-)
 create mode 100644 hepcrawl/spiders/desy_spider.py
 create mode 100644 tests/functional/desy/fixtures/desy_ftp_records.json
 create mode 100644 tests/functional/desy/fixtures/desy_local_records.json
 create mode 100644 tests/functional/desy/fixtures/ftp_server/.netrc
 create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt
 create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt
 create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml
 create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml
 create mode 100644 tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt
 create mode 100644 tests/functional/desy/fixtures/ftp_server/pureftpd.passwd
 create mode 100644 tests/functional/desy/test_desy.py
 create mode 100644 tests/unit/responses/desy/desy_collection_records.xml
 create mode 100644 tests/unit/responses/desy/desy_record.xml
 create mode 100644 tests/unit/test_desy.py

diff --git a/docs/spiders.rst b/docs/spiders.rst
index aab52646..d016e521 100644
--- a/docs/spiders.rst
+++ b/docs/spiders.rst
@@ -50,6 +50,12 @@ Brown
 .. automodule:: hepcrawl.spiders.brown_spider
    :members:
 
+Desy
+~~~~
+
+.. automodule:: hepcrawl.spiders.desy_spider
+   :members:
+
 DNB
 ~~~
 
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 62ba867c..c464c7a0 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -15,30 +15,63 @@
 
 from __future__ import absolute_import, division, print_function
 
-import datetime
 import os
 
 import requests
 
-from .crawler2hep import crawler2hep
+from scrapy import Request
+from scrapy.pipelines.files import FilesPipeline
 
+from inspire_schemas.utils import validate
 
-def has_publication_info(item):
-    """If any publication info."""
-    return item.get('pubinfo_freetext') or item.get('journal_volume') or \
-        item.get('journal_title') or \
-        item.get('journal_year') or \
-        item.get('journal_issue') or \
-        item.get('journal_fpage') or \
-        item.get('journal_lpage') or \
-        item.get('journal_artid') or \
-        item.get('journal_doctype')
+from hepcrawl.crawler2hep import item_to_hep
+from hepcrawl.settings import FILES_STORE
+from hepcrawl.utils import RecordFile
 
 
-def filter_fields(item, keys):
-    """Filter away keys."""
-    for key in keys:
-        item.pop(key, None)
+class FftFilesPipeline(FilesPipeline):
+    """Download all the FFT files provided by a record.
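+
+    Example:
+        For this pipeline to act on a record, the spider has to yield an
+        item along these lines (a minimal sketch; the URL is illustrative
+        and the fields mirror ``hepcrawl.utils.ParsedItem`` as used by the
+        Desy spider below)::
+
+            parsed_item = ParsedItem(
+                record=hep_record,
+                file_urls=['ftp://ftp.example.com/DESY/FFT/some_figure.txt'],
+                ftp_params=ftp_params,
+                record_format='hep',
+            )
+            yield parsed_item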
+ + Note: + + This pipeline only runs if the spider returns a ``ParsedItem`` that has a ``file_urls`` + property. + """ + + def __init__(self, store_uri, *args, **kwargs): + store_uri = store_uri or FILES_STORE + super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs) + + def get_media_requests(self, item, info): + """Download FFT files using FTP.""" + if item.get('file_urls'): + for fft_url in item.file_urls: + yield Request( + url=fft_url, + meta=item.ftp_params, + ) + + def get_absolute_file_path(self, path): + return os.path.abspath( + os.path.join( + self.store.basedir, + path + ) + ) + + def item_completed(self, results, item, info): + """Create a map that connects file names with downloaded files.""" + record_files = [ + RecordFile( + path=self.get_absolute_file_path(result_data['path']), + name=os.path.basename(result_data['url']), + ) + for ok, result_data in results + if ok + ] + item.record_files = record_files + + return item class InspireAPIPushPipeline(object): @@ -50,74 +83,26 @@ def __init__(self): def open_spider(self, spider): self.results_data = [] + def _post_enhance_item(self, item, spider): + source = spider.name + + return item_to_hep( + item=item, + source=source, + ) + def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" self.count += 1 - if 'related_article_doi' in item: - item['dois'] += item.pop('related_article_doi', []) - source = spider.name - item['acquisition_source'] = { - 'source': source, - 'method': 'hepcrawl', - 'date': datetime.datetime.now().isoformat(), - 'submission_number': os.environ.get('SCRAPY_JOB', ''), - } - - item['titles'] = [{ - 'title': item.pop('title', ''), - 'subtitle': item.pop('subtitle', ''), - 'source': source, - }] - item['abstracts'] = [{ - 'value': item.pop('abstract', ''), - 'source': source, - }] - item['imprints'] = [{ - 'date': item.pop('date_published', ''), - }] - item['copyright'] = [{ - 'holder': item.pop('copyright_holder', ''), - 'year': item.pop('copyright_year', ''), - 'statement': item.pop('copyright_statement', ''), - 'material': item.pop('copyright_material', ''), - }] - if not item.get('publication_info'): - if has_publication_info(item): - item['publication_info'] = [{ - 'journal_title': item.pop('journal_title', ''), - 'journal_volume': item.pop('journal_volume', ''), - 'journal_issue': item.pop('journal_issue', ''), - 'artid': item.pop('journal_artid', ''), - 'page_start': item.pop('journal_fpage', ''), - 'page_end': item.pop('journal_lpage', ''), - 'note': item.pop('journal_doctype', ''), - 'pubinfo_freetext': item.pop('pubinfo_freetext', ''), - 'pubinfo_material': item.pop('pubinfo_material', ''), - }] - if item.get('journal_year'): - item['publication_info'][0]['year'] = int( - item.pop('journal_year') - ) - - # Remove any fields - filter_fields(item, [ - 'journal_title', - 'journal_volume', - 'journal_year', - 'journal_issue', - 'journal_fpage', - 'journal_lpage', - 'journal_doctype', - 'journal_artid', - 'pubinfo_freetext', - 'pubinfo_material', - ]) - - item = crawler2hep(dict(item)) - spider.logger.debug('Validated item.') - self.results_data.append(item) - return item + hep_record = self._post_enhance_item(item, spider) + + validate(hep_record, 'hep') + spider.logger.debug('Validated item by Inspire Schemas.') + + self.results_data.append(hep_record) + + return hep_record def _prepare_payload(self, spider): """Return payload for push.""" diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 71dcfc75..bd16d8cd 100644 --- 
a/hepcrawl/settings.py
+++ b/hepcrawl/settings.py
@@ -85,7 +85,7 @@
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'scrapy.pipelines.files.FilesPipeline': 1,
+    'hepcrawl.pipelines.FftFilesPipeline': 1,
     'hepcrawl.pipelines.InspireCeleryPushPipeline': 300,
 }
 
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
new file mode 100644
index 00000000..4f90d6e9
--- /dev/null
+++ b/hepcrawl/spiders/desy_spider.py
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of hepcrawl.
+# Copyright (C) 2017 CERN.
+#
+# hepcrawl is a free software; you can redistribute it and/or modify it
+# under the terms of the Revised BSD License; see LICENSE file for
+# more details.
+
+"""Spider for DESY."""
+
+from __future__ import absolute_import, division, print_function
+
+import os
+
+from lxml import etree
+from dojson.contrib.marc21.utils import create_record
+from six.moves import urllib
+
+from scrapy import Request
+from scrapy.spiders import Spider
+
+from inspire_dojson.hep import hep
+
+from hepcrawl.utils import (
+    ftp_list_files,
+    ftp_connection_info,
+    ParsedItem,
+)
+
+
+class DesySpider(Spider):
+    """Desy spider.
+
+    This spider connects to a given FTP host and downloads XML files
+    for extraction into HEP records.
+
+    Examples:
+        To run a crawl, you need to pass FTP connection information via
+        ``ftp_host`` and ``ftp_netrc``; if ``ftp_folder`` is not passed, it
+        falls back to ``DESY``::
+
+            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'
+
+        To run a crawl on a local folder, you need to pass the absolute
+        ``source_folder``::
+
+            $ scrapy crawl desy -a 'source_folder=/path/to/package_dir'
+    """
+    name = 'desy'
+    custom_settings = {}
+    start_urls = []
+
+    def __init__(
+        self,
+        source_folder=None,
+        ftp_folder='/DESY',
+        ftp_host=None,
+        ftp_netrc=None,
+        destination_folder='/tmp/DESY',
+        *args,
+        **kwargs
+    ):
+        super(DesySpider, self).__init__(*args, **kwargs)
+        self.ftp_folder = ftp_folder
+        self.ftp_host = ftp_host
+        self.ftp_netrc = ftp_netrc
+        self.source_folder = source_folder
+        self.destination_folder = destination_folder
+        self.ftp_enabled = bool(self.ftp_host)
+        if not os.path.exists(self.destination_folder):
+            os.makedirs(self.destination_folder)
+
+    @staticmethod
+    def _list_xml_files_paths(list_files_paths):
+        return [
+            xml_file
+            for xml_file in list_files_paths
+            if xml_file.endswith('.xml')
+        ]
+
+    def crawl_local_directory(self):
+        file_names = os.listdir(self.source_folder)
+        xml_file_names = self._list_xml_files_paths(file_names)
+
+        for file_name in xml_file_names:
+            file_path = os.path.join(self.source_folder, file_name)
+            self.log('Local: Trying to crawl local file: {0}'.format(file_path))
+            yield Request(
+                'file://{0}'.format(file_path),
+                callback=self.parse,
+            )
+
+    def crawl_ftp_directory(self):
+        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+
+        remote_files_paths = ftp_list_files(
+            self.ftp_folder,
+            destination_folder=self.destination_folder,
+            ftp_host=ftp_host,
+            user=ftp_params['ftp_user'],
+            password=ftp_params['ftp_password'],
+            only_missing_files=False,
+        )
+
+        xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+
+        for remote_file in xml_remote_files_paths:
+            self.log('Remote: Trying to crawl file from FTP: {0}'.format(remote_file))
+            remote_file = str(remote_file)
+            ftp_params['ftp_local_filename'] = os.path.join(
+                self.destination_folder,
os.path.basename(remote_file), + ) + remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp, + ) + + def handle_package_ftp(self, response): + """Yield every XML file found. + + This is an intermediate step before calling ``DesySpider.parse`` to handle ftp downloaded + "record collections". + """ + self.log('Visited url {}'.format(response.url)) + file_path = response.body + yield Request( + 'file://{0}'.format(file_path), + meta={'source_folder': file_path}, + callback=self.parse, + ) + + def start_requests(self): + """List selected folder on remote FTP and yield files.""" + + if self.source_folder: + requests = self.crawl_local_directory() + else: + requests = self.crawl_ftp_directory() + + for request in requests: + yield request + + @staticmethod + def _get_full_uri(current_path, base_url, schema, hostname=''): + if os.path.isabs(current_path): + full_path = current_path + else: + full_path = os.path.join(base_url, current_path) + + return '{schema}://{hostname}{full_path}'.format(**vars()) + + def parse(self, response): + """Parse a ``Desy`` XML file into a ``hepcrawl.utils.ParsedItem``.""" + + self.log('Got record from url/path: {0}'.format(response.url)) + self.log('FTP enabled: {0}'.format(self.ftp_enabled)) + ftp_params = None + + if self.ftp_enabled: + hostname, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) + base_url = self.ftp_folder + url_schema = 'ftp' + else: + base_url = os.path.dirname(urllib.parse.urlparse(response.url).path) + url_schema = 'file' + hostname = None + + marcxml_records = self._get_marcxml_records(response.body) + hep_records = self._hep_records_from_marcxml(marcxml_records) + + for hep_record in hep_records: + list_file_urls = [ + self._get_full_uri( + current_path=fft_path['path'], + base_url=base_url, + schema=url_schema, + hostname=hostname, + ) + for fft_path in hep_record['_fft'] + ] + + parsed_item = ParsedItem( + record=hep_record, + file_urls=list_file_urls, + ftp_params=ftp_params, + record_format='hep', + ) + + yield parsed_item + + @staticmethod + def _get_marcxml_records(response_body): + root = etree.fromstring(response_body) + list_items = root.findall('.//{http://www.loc.gov/MARC21/slim}record') + if not list_items: + list_items = root.findall('.//record') + + return [etree.tostring(item) for item in list_items] + + @staticmethod + def _hep_records_from_marcxml(marcxml_records): + def _create_json_record(xml_record): + object_record = create_record(etree.XML(xml_record)) + dojson_record = hep.do(object_record) + return dojson_record + + hep_records = [] + for xml_record in marcxml_records: + json_record = _create_json_record(xml_record) + hep_records.append(json_record) + + return hep_records diff --git a/tests/functional/desy/fixtures/desy_ftp_records.json b/tests/functional/desy/fixtures/desy_ftp_records.json new file mode 100644 index 00000000..0ffb18d1 --- /dev/null +++ b/tests/functional/desy/fixtures/desy_ftp_records.json @@ -0,0 +1,308 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 333333, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/333333" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/809d9d2bebcea6eee5e400e3c49b31795a3acc3d.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/1d4b0a4eebdd03b95f882fa7feb9d3f06681ec50.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 444444, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/444444" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] diff --git a/tests/functional/desy/fixtures/desy_local_records.json b/tests/functional/desy/fixtures/desy_local_records.json new file mode 100644 index 00000000..4197a456 --- /dev/null +++ b/tests/functional/desy/fixtures/desy_local_records.json @@ -0,0 +1,308 @@ +[{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 333333, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/333333" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}, +{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-04-03T10:26:40.365216" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. 
Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", + "format": ".txt", + "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "type": "Main", + "filename": "cNFW_rogue_curves" + }, + { + "version": 1, + "creation_datetime": "2017-06-27T09:43:16", + "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", + "format": ".txt", + "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "type": "Main", + "filename": "scalingRelations_DutBeh_DC14_all_Oh" + } + ], + "control_number": 444444, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/444444" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] +}] diff --git a/tests/functional/desy/fixtures/ftp_server/.netrc b/tests/functional/desy/fixtures/ftp_server/.netrc new file mode 100644 index 00000000..59a152f7 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/.netrc @@ -0,0 +1,3 @@ +machine ftp_server +login bob +password bob diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt new file mode 100644 index 00000000..bb8e8348 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_1.txt @@ -0,0 +1 @@ +sample file fft 1. 
\ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt new file mode 100644 index 00000000..e1b54448 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/FFT/test_fft_2.txt @@ -0,0 +1 @@ +sample file fft 2. \ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml new file mode 100644 index 00000000..359bb570 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_collection_records.xml @@ -0,0 +1,149 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. 
In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml new file mode 100644 index 00000000..1f9c57a9 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/desy_no_namespace_collection_records.xml @@ -0,0 +1,149 @@ + + + + 333333 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. 
+ .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 444444 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + diff --git a/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt b/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt new file mode 100644 index 00000000..5254be38 --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/DESY/file_not_for_download.txt @@ -0,0 +1 @@ +This is a file not to download the Desy spider! 
\ No newline at end of file diff --git a/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd new file mode 100644 index 00000000..275a727c --- /dev/null +++ b/tests/functional/desy/fixtures/ftp_server/pureftpd.passwd @@ -0,0 +1 @@ +bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py new file mode 100644 index 00000000..5c3f4929 --- /dev/null +++ b/tests/functional/desy/test_desy.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +"""Functional tests for Desy spider""" + +from __future__ import absolute_import, division, print_function + +import pytest + +from time import sleep +import hashlib + +from hepcrawl.testlib.celery_monitor import CeleryMonitor +from hepcrawl.testlib.fixtures import ( + get_test_suite_path, + expected_json_results_from_file, + clean_dir, +) +from hepcrawl.testlib.tasks import app as celery_app +from hepcrawl.testlib.utils import get_crawler_instance + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' + record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + + return record + + +def compare_two_files_using_md5(file_1, file_2): + """Compares two files calculating the md5 hash.""" + def _generate_md5_hash(file_path): + hasher = hashlib.md5() + with open(str(file_path), 'rb') as fd: + buf = fd.read() + hasher.update(buf) + return hasher.hexdigest() + + return _generate_md5_hash(file_1) == _generate_md5_hash(file_2) + + +@pytest.fixture(scope="function") +def get_fft_1_path(): + return get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + 'test_fft_1.txt', + test_suite='functional', + ) + + +@pytest.fixture(scope="function") +def get_fft_2_path(): + return get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + 'test_fft_2.txt', + test_suite='functional', + ) + + +@pytest.fixture(scope="function") +def set_up_ftp_environment(): + netrc_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + '.netrc', + test_suite='functional', + ) + + # The test must wait until the docker environment is up (takes about 10 seconds). 
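+    # A hypothetical, less brittle alternative (not part of this patch) would
+    # be to poll the FTP port until the container accepts connections instead
+    # of sleeping for a fixed time, e.g.:
+    #
+    #     import socket
+    #     for _ in range(30):
+    #         try:
+    #             socket.create_connection(('ftp_server', 21), timeout=1).close()
+    #             break
+    #         except (socket.error, socket.timeout):
+    #             sleep(1)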
+ sleep(10) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'ftp_host': 'ftp_server', + 'ftp_netrc': netrc_location, + } + } + + clean_dir('/tmp/file_urls') + clean_dir('/tmp/DESY') + + +@pytest.fixture(scope="function") +def set_up_local_environment(): + package_location = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + test_suite='functional', + ) + + yield { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'source_folder': package_location, + } + } + + clean_dir('/tmp/file_urls') + clean_dir('/tmp/DESY') + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_ftp_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_ftp( + set_up_ftp_environment, + expected_results, + get_fft_1_path, + get_fft_2_path, +): + crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + # Check using MD5 Hash if downloaded files are there. + for record in expected_results: + fft_file_paths = sorted(record['_fft']) + + assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) + assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'desy', + 'fixtures', + 'desy_local_records.json', + ), + ], + ids=[ + 'smoke', + ] +) +def test_desy_local_package_path( + set_up_local_environment, + expected_results, + get_fft_1_path, + get_fft_2_path, +): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=100, + events_limit=2, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='desy', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert sorted(gotten_results) == expected_results + + # Check using MD5 Hash if downloaded files are there. 
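+    # Note: with these fixtures, sorting the ``_fft`` entries puts the one
+    # with the earlier ``creation_datetime`` (the ``test_fft_2.txt`` entry)
+    # first, which is why index 0 is compared against fft_2 and index 1
+    # against fft_1 below.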
+ for record in expected_results: + fft_file_paths = sorted(record['_fft']) + + assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) + assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) diff --git a/tests/unit/responses/desy/desy_collection_records.xml b/tests/unit/responses/desy/desy_collection_records.xml new file mode 100644 index 00000000..93ede820 --- /dev/null +++ b/tests/unit/responses/desy/desy_collection_records.xml @@ -0,0 +1,149 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + + 222222 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. 
With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + diff --git a/tests/unit/responses/desy/desy_record.xml b/tests/unit/responses/desy/desy_record.xml new file mode 100644 index 00000000..9e20e8d0 --- /dev/null +++ b/tests/unit/responses/desy/desy_record.xml @@ -0,0 +1,76 @@ + + + + 111111 + 20170705125610.0 + + DOI + 10.18429/JACoW-IPAC2017-WEYB1 + + + 9783954501823 + + + Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser + Acceleration (DLA) From the Source to Relativistic Electrons + + JACoW + + + Dielectric laser acceleration of electrons has recently been + demonstrated with significantly higher accelerating gradients than other + structure-based linear accelerators. Towards the development of an integrated 1 MeV + electron accelerator based on dielectric laser accelerator technologies, + development in several relevant technologies is needed. In this work, recent + developments on electron sources, bunching, accelerating, focussing, deflecting and + laser coupling structures are reported. With an eye to the near future, components + required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond + electron bunches are outlined. + + Deutsches Elektronen-Synchrotron + + + 6 + + + *Brief entry* + + + 2017 + 2520-2525 + + + 100176 + http://inspirehep.net/record/1608652/files/Towards a fully + integrated acc on a chip.pdf + + Fulltext + + + oai:inspirehep.net:1608652 + INSPIRE:HEP + + + FFT/test_fft_1.txt + 00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \textsc{core}NFW models. Colors and symbols are as in Figure \ref{fig:dc14_fits}. + .txt + cNFW_rogue_curves + + 2017-06-27 09:43:17 + Main + 1 + + + + FFT/test_fft_2.txt + 00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \cite{dutton14} (left) and the stellar mass-halo mass relation from \cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\% confidence region for each fit. 
The theoretical relations are shown as red lines and their 1$\sigma$ and 2$\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \cite{maccio08} and the stellar mass-halo mass relation from \cite{behroozi13} are also shown as the black dashed lines. + .txt + scalingRelations_DutBeh_DC14_all_Oh + + 2017-06-27 09:43:16 + Main + 1 + + + + diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py new file mode 100644 index 00000000..35ed24e2 --- /dev/null +++ b/tests/unit/test_desy.py @@ -0,0 +1,381 @@ +# -*- coding: utf-8 -*- +# +# This file is part of hepcrawl. +# Copyright (C) 2017 CERN. +# +# hepcrawl is a free software; you can redistribute it and/or modify it +# under the terms of the Revised BSD License; see LICENSE file for +# more details. + +from __future__ import absolute_import, division, print_function + +import pytest +import os + +from scrapy.crawler import Crawler +from scrapy.http import TextResponse + +from hepcrawl.pipelines import InspireCeleryPushPipeline +from hepcrawl.spiders import desy_spider + +from hepcrawl.testlib.fixtures import fake_response_from_file + + +def create_spider(): + crawler = Crawler(spidercls=desy_spider.DesySpider) + return desy_spider.DesySpider.from_crawler(crawler) + + +def get_records(response_file_name): + """Return all results generator from the ``Desy`` spider via pipelines.""" + # environmental variables needed for the pipelines payload + os.environ['SCRAPY_JOB'] = 'scrapy_job' + os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri' + os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file' + + spider = create_spider() + records = spider.parse( + fake_response_from_file( + file_name=response_file_name, + response_type=TextResponse + ) + ) + + pipeline = InspireCeleryPushPipeline() + pipeline.open_spider(spider) + + return ( + pipeline.process_item( + record, + spider + ) for record in records + ) + + +def get_one_record(response_file_name): + parsed_items = get_records(response_file_name) + record = parsed_items.next() + return record + + +def override_generated_fields(record): + record['acquisition_source']['datetime'] = '2017-05-04T17:49:07.975168' + record['acquisition_source']['submission_number'] = '5652c7f6190f11e79e8000224dabeaad' + + return record + + +@pytest.mark.parametrize( + 'generated_record', + [ + get_one_record('desy/desy_record.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_record(generated_record): + expected = { + '$schema': 'hep.json', + '_collections': [ + 'Literature' + ], + '_fft': [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'FFT/test_fft_1.txt', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. 
The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'FFT/test_fft_2.txt', + 'type': 'Main', + 'version': 1 + } + ], + 'abstracts': [ + { + 'source': 'Deutsches Elektronen-Synchrotron', + 'value': 'Dielectric laser acceleration of electrons has recently been\n' + ' demonstrated with significantly higher accelerating ' + 'gradients than other\n structure-based linear ' + 'accelerators. Towards the development of an integrated 1 MeV\n ' + ' electron accelerator based on dielectric laser accelerator ' + 'technologies,\n development in several relevant ' + 'technologies is needed. In this work, recent\n ' + 'developments on electron sources, bunching, accelerating, focussing, ' + 'deflecting and\n laser coupling structures are reported. ' + 'With an eye to the near future, components\n ' + 'required for a 1 MeV kinetic energy tabletop accelerator producing ' + 'sub-femtosecond\n electron bunches are outlined.\n ' + ' ' + } + ], + 'acquisition_source': { + 'datetime': '2017-05-04T17:49:07.975168', + 'method': 'hepcrawl', + 'source': 'desy', + 'submission_number': '5652c7f6190f11e79e8000224dabeaad' + }, + 'control_number': 111111, + 'document_type': [ + 'article' + ], + 'dois': [ + { + 'value': '10.18429/JACoW-IPAC2017-WEYB1' + } + ], + 'number_of_pages': 6, + 'public_notes': [ + { + 'value': '*Brief entry*' + } + ], + 'publication_info': [ + { + 'parent_isbn': '9783954501823' + }, + { + 'page_end': '2525', + 'page_start': '2520', + 'year': 2017 + } + ], + 'self': { + '$ref': 'http://inspirehep.net/api/literature/111111' + }, + 'titles': [ + { + 'source': 'JACoW', + 'title': 'Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n ' + ' Acceleration (DLA) From the Source to Relativistic ' + 'Electrons\n ' + } + ], + 'urls': [ + { + 'description': 'Fulltext', + 'value': 'http://inspirehep.net/record/1608652/files/Towards a fully\n ' + ' integrated acc on a chip.pdf\n ' + } + ] + } + + assert override_generated_fields(generated_record) == expected + + +@pytest.mark.parametrize( + 'generated_records', + [ + get_records('desy/desy_collection_records.xml'), + ], + ids=[ + 'smoke', + ] +) +def test_pipeline_collection_records(generated_records): + expected = [{ + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'FFT/test_fft_1.txt', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. 
The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'FFT/test_fft_2.txt', + 'type': 'Main', + 'version': 1 + } + ], + "control_number": 111111, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/111111" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + }, + { + "acquisition_source": { + "source": "desy", + "method": "hepcrawl", + "submission_number": "5652c7f6190f11e79e8000224dabeaad", + "datetime": "2017-05-04T17:49:07.975168" + }, + "_collections": [ + "Literature" + ], + "_fft": [ + { + 'creation_datetime': '2017-06-27T09:43:17', + 'description': '00013 Decomposition of the problematic rotation curves in our ' + 'sample according to the best-fit \\textsc{core}NFW models. ' + 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'filename': 'cNFW_rogue_curves', + 'format': '.txt', + 'path': 'FFT/test_fft_1.txt', + 'type': 'Main', + 'version': 1, + }, + { + 'creation_datetime': '2017-06-27T09:43:16', + 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' + 'the cosmological halo mass-concentration relation from \\' + 'cite{dutton14} (left) and the stellar mass-halo mass relation ' + 'from \\cite{behroozi13} (right). The error bars correspond to the ' + 'extremal values of the multidimensional 68\\% confidence region ' + 'for each fit. The theoretical relations are shown as red lines ' + 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' + 'the dark and light grey bands, respectively. 
The ' + 'mass-concentration relation from \\cite{maccio08} and the stellar' + ' mass-halo mass relation from \\cite{behroozi13} are also shown ' + 'as the black dashed lines.', + 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', + 'format': '.txt', + 'path': 'FFT/test_fft_2.txt', + 'type': 'Main', + 'version': 1 + } + ], + "control_number": 222222, + "public_notes": [ + { + "value": "*Brief entry*" + } + ], + "self": { + "$ref": "http://inspirehep.net/api/literature/222222" + }, + "number_of_pages": 6, + "titles": [ + { + "source": "JACoW", + "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " + } + ], + "urls": [ + { + "description": "Fulltext", + "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " + } + ], + "dois": [ + { + "value": "10.18429/JACoW-IPAC2017-WEYB1" + } + ], + "publication_info": [ + { + "parent_isbn": "9783954501823" + }, + { + "page_start": "2520", + "page_end": "2525", + "year": 2017 + } + ], + "$schema": "hep.json", + "document_type": [ + "article" + ], + "abstracts": [ + { + "source": "Deutsches Elektronen-Synchrotron", + "value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " + } + ] + } + ] + + generated_results = [override_generated_fields(rec) for rec in generated_records] + + assert generated_results == expected From a6effce2bc68fc5d5e4f8d992c21cc1b284473b8 Mon Sep 17 00:00:00 2001 From: David Caro Date: Thu, 17 Aug 2017 14:43:50 +0200 Subject: [PATCH 02/11] hepcrawl.utils: add ProcessedItem and RecordFile Signed-off-by: David Caro --- hepcrawl/utils.py | 100 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 4ad9db3c..4d4a28db 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -31,6 +31,10 @@ INST_PHRASES = ['for the development', ] +class PathDoesNotExist(IOError): + pass + + def unzip_xml_files(filename, target_folder): """Unzip files (XML only) into target folder.""" z = ZipFile(filename) @@ -57,17 +61,38 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False): return ftp_host, connection_params -def ftp_list_files(server_folder, target_folder, server, user, password, passive_mode=False): - """List files from given FTP's server folder to target folder.""" +def ftp_list_files( + server_folder, + ftp_host, + user, + password, + destination_folder=None, + passive_mode=False, + only_missing_files=True, +): + """List files from given FTP's ftp_host folder to target folder. 
+ + Params: + + """ session_factory = ftputil.session.session_factory( base_class=ftplib.FTP, port=21, use_passive_mode=passive_mode, - encrypt_data_channel=True) + encrypt_data_channel=True, + ) - with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host: - file_names = host.listdir(os.path.join(host.curdir, '/', server_folder)) - return list_missing_files(server_folder, target_folder, file_names) + with ftputil.FTPHost(ftp_host, user, password, session_factory=session_factory) as host: + file_names = host.listdir(os.path.join(host.curdir, server_folder)) + if only_missing_files: + return list_missing_files(server_folder, destination_folder, file_names) + else: + return [ + os.path.join( + server_folder, + file_name + ) for file_name in file_names + ] def local_list_files(local_folder, target_folder): @@ -321,3 +346,66 @@ def get_license_by_text(license_text): license = get_license_by_url(license_url=LICENSE_TEXTS[key]) return license + + +class RecordFile(object): + """Metadata of a file needed for a record. + + Args: + path(str): local path to the file. + name(str): Optional, name of the file, if not passed, will use the name in the path. + + Rises: + PathDoesNotExist: + """ + def __init__(self, path, name=None): + self.path = path + if not os.path.exists(self.path): + raise PathDoesNotExist("The given record file path '%s' does not exist." % self.path) + + if name is None: + name = os.path.basename(path) + + self.name = name + + +class ParsedItem(dict): + """Each of the individual items returned by the spider to the pipeline. + + Args: + record(dict): Information about the crawled record, might be in different formats. + record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``. + file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``. + ftp_params(dict): Parameter for the ``FftFilesPipeline`` to be able to connect to the + ftp server, if any. + record_files(list(RecordFile)): files attached to the record, usually populated by + ``FftFilesPipeline`` from the ``file_urls`` parameter. + """ + def __init__( + self, + record, + record_format, + file_urls=None, + ftp_params=None, + record_files=None, + **kwargs + ): + super(ParsedItem, self).__init__( + record=record, + record_format=record_format, + file_urls=file_urls, + ftp_params=ftp_params, + record_files=record_files, + **kwargs + ) + + def __getattr__(self, key): + if key not in self: + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__.__name__, key) + ) + + return self[key] + + def __setattr__(self, key, value): + self[key] = value From 677a2a60acbc91c7bb9a2d2ca2bce2891034db84 Mon Sep 17 00:00:00 2001 From: David Caro Date: Thu, 17 Aug 2017 14:45:27 +0200 Subject: [PATCH 03/11] crawler2hep: add other format-to-format functions Signed-off-by: David Caro --- hepcrawl/crawler2hep.py | 149 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index d6898022..cacc5590 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -15,10 +15,155 @@ from __future__ import absolute_import, division, print_function +import os +import datetime + from inspire_schemas.api import LiteratureBuilder -def crawler2hep(crawler_record): +def _get_updated_fft_fields(current_fft_fields, record_files): + """ + + Params: + current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. 
We + expect each of then to have, at least, a key named ``path``. + record_files(list(RecordFile)): files attached to the record as populated by + ``FftFilesPipeline``. + """ + record_files_index = { + record_file.name: record_file.path + for record_file in record_files + } + new_fft_fields = [] + for fft_field in current_fft_fields: + file_name = os.path.basename(fft_field['path']) + if file_name in record_files_index: + fft_field['path'] = record_files_index[file_name] + new_fft_fields.append(fft_field) + + return new_fft_fields + + +def _has_publication_info(item): + """If any publication info.""" + return item.get('pubinfo_freetext') or item.get('journal_volume') or \ + item.get('journal_title') or \ + item.get('journal_year') or \ + item.get('journal_issue') or \ + item.get('journal_fpage') or \ + item.get('journal_lpage') or \ + item.get('journal_artid') or \ + item.get('journal_doctype') + + +def _filter_fields(item, keys): + """Filter away keys.""" + for key in keys: + item.pop(key, None) + + +def _normalize_hepcrawl_record(item, source): + if 'related_article_doi' in item: + item['dois'] += item.pop('related_article_doi', []) + + item['titles'] = [{ + 'title': item.pop('title', ''), + 'subtitle': item.pop('subtitle', ''), + 'source': source, + }] + + item['abstracts'] = [{ + 'value': item.pop('abstract', ''), + 'source': source, + }] + + item['imprints'] = [{ + 'date': item.pop('date_published', ''), + }] + + item['copyright'] = [{ + 'holder': item.pop('copyright_holder', ''), + 'year': item.pop('copyright_year', ''), + 'statement': item.pop('copyright_statement', ''), + 'material': item.pop('copyright_material', ''), + }] + + if _has_publication_info(item): + item['publication_info'] = [{ + 'journal_title': item.pop('journal_title', ''), + 'journal_volume': item.pop('journal_volume', ''), + 'journal_issue': item.pop('journal_issue', ''), + 'artid': item.pop('journal_artid', ''), + 'page_start': item.pop('journal_fpage', ''), + 'page_end': item.pop('journal_lpage', ''), + 'note': item.pop('journal_doctype', ''), + 'pubinfo_freetext': item.pop('pubinfo_freetext', ''), + 'pubinfo_material': item.pop('pubinfo_material', ''), + }] + if item.get('journal_year'): + item['publication_info'][0]['year'] = int( + item.pop('journal_year') + ) + + # Remove any fields + _filter_fields(item, [ + 'journal_title', + 'journal_volume', + 'journal_year', + 'journal_issue', + 'journal_fpage', + 'journal_lpage', + 'journal_doctype', + 'journal_artid', + 'pubinfo_freetext', + 'pubinfo_material', + ]) + + return item + + +def _generate_acquisition_source(source): + acquisition_source = { + 'source': source, + 'method': 'hepcrawl', + 'datetime': datetime.datetime.now().isoformat(), + 'submission_number': os.environ.get('SCRAPY_JOB', ''), + } + return acquisition_source + + +def item_to_hep( + item, + source, +): + item.record['acquisition_source'] = _generate_acquisition_source(source=source) + + if item.record_format == 'hep': + return hep_to_hep( + hep_record=item.record, + record_files=item.record_files, + ) + elif item.record_format == 'hepcrawl': + record = _normalize_hepcrawl_record( + item=item.record, + source=source, + ) + return hepcrawl_to_hep(dict(record)) + else: + raise Exception('Unknown item_format::{}'.format(item.record_format)) + + +def hep_to_hep(hep_record, record_files): + if record_files: + hep_record['_fft'] = _get_updated_fft_fields( + current_fft_fields=hep_record['_fft'], + record_files=record_files, + ) + + return hep_record + + +def hepcrawl_to_hep(crawler_record): def 
_filter_affiliation(affiliations): return [ @@ -98,7 +243,7 @@ def _filter_affiliation(affiliations): acquisition_source = crawler_record.get('acquisition_source', {}) builder.add_acquisition_source( method=acquisition_source['method'], - date=acquisition_source['date'], + date=acquisition_source['datetime'], source=acquisition_source['source'], submission_number=acquisition_source['submission_number'], ) From f04db22fc9bbbaff89c3862278fb6c6f992affc1 Mon Sep 17 00:00:00 2001 From: David Caro Date: Thu, 17 Aug 2017 14:49:10 +0200 Subject: [PATCH 04/11] tohep: rename crawler2hep module to tohep Now it contains more than just crawler format to hep format converters. Signed-off-by: David Caro --- hepcrawl/pipelines.py | 2 +- hepcrawl/{crawler2hep.py => tohep.py} | 0 .../{crawler2hep => tohep}/in_generic_crawler_record.yaml | 2 +- .../{crawler2hep => tohep}/in_no_document_type.yaml | 2 +- .../out_generic_crawler_record.yaml | 0 .../{crawler2hep => tohep}/out_no_document_type.yaml | 0 tests/unit/{test_crawler2hep.py => test_tohep.py} | 8 ++++---- 7 files changed, 7 insertions(+), 7 deletions(-) rename hepcrawl/{crawler2hep.py => tohep.py} (100%) rename tests/unit/responses/{crawler2hep => tohep}/in_generic_crawler_record.yaml (98%) rename tests/unit/responses/{crawler2hep => tohep}/in_no_document_type.yaml (98%) rename tests/unit/responses/{crawler2hep => tohep}/out_generic_crawler_record.yaml (100%) rename tests/unit/responses/{crawler2hep => tohep}/out_no_document_type.yaml (100%) rename tests/unit/{test_crawler2hep.py => test_tohep.py} (87%) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index c464c7a0..9e3d84e8 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -24,7 +24,7 @@ from inspire_schemas.utils import validate -from hepcrawl.crawler2hep import item_to_hep +from hepcrawl.tohep import item_to_hep from hepcrawl.settings import FILES_STORE from hepcrawl.utils import RecordFile diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/tohep.py similarity index 100% rename from hepcrawl/crawler2hep.py rename to hepcrawl/tohep.py diff --git a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml b/tests/unit/responses/tohep/in_generic_crawler_record.yaml similarity index 98% rename from tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml rename to tests/unit/responses/tohep/in_generic_crawler_record.yaml index 4e80ba6b..1ade2b4b 100644 --- a/tests/unit/responses/crawler2hep/in_generic_crawler_record.yaml +++ b/tests/unit/responses/tohep/in_generic_crawler_record.yaml @@ -3,7 +3,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/crawler2hep/in_no_document_type.yaml b/tests/unit/responses/tohep/in_no_document_type.yaml similarity index 98% rename from tests/unit/responses/crawler2hep/in_no_document_type.yaml rename to tests/unit/responses/tohep/in_no_document_type.yaml index 22b93fd0..21543c36 100644 --- a/tests/unit/responses/crawler2hep/in_no_document_type.yaml +++ b/tests/unit/responses/tohep/in_no_document_type.yaml @@ -5,7 +5,7 @@ "11" ], "acquisition_source": { - "date": "2017-02-21T18:03:40.858985", + "datetime": "2017-02-21T18:03:40.858985", "source": "arXiv", "method": "hepcrawl", "submission_number": "scrapy_job" diff --git a/tests/unit/responses/crawler2hep/out_generic_crawler_record.yaml b/tests/unit/responses/tohep/out_generic_crawler_record.yaml similarity index 
100% rename from tests/unit/responses/crawler2hep/out_generic_crawler_record.yaml rename to tests/unit/responses/tohep/out_generic_crawler_record.yaml diff --git a/tests/unit/responses/crawler2hep/out_no_document_type.yaml b/tests/unit/responses/tohep/out_no_document_type.yaml similarity index 100% rename from tests/unit/responses/crawler2hep/out_no_document_type.yaml rename to tests/unit/responses/tohep/out_no_document_type.yaml diff --git a/tests/unit/test_crawler2hep.py b/tests/unit/test_tohep.py similarity index 87% rename from tests/unit/test_crawler2hep.py rename to tests/unit/test_tohep.py index 95375ebf..547bc67e 100644 --- a/tests/unit/test_crawler2hep.py +++ b/tests/unit/test_tohep.py @@ -12,14 +12,14 @@ import pytest import yaml -from hepcrawl.crawler2hep import crawler2hep +from hepcrawl.tohep import hepcrawl_to_hep from hepcrawl.testlib.fixtures import get_test_suite_path def load_file(file_name): path = get_test_suite_path( 'responses', - 'crawler2hep', + 'tohep', file_name, ) with open(path) as input_data: @@ -52,7 +52,7 @@ def test_generic_crawler_record( input_generic_crawler_record, expected_generic_crawler_record ): - produced_record = crawler2hep(input_generic_crawler_record) + produced_record = hepcrawl_to_hep(input_generic_crawler_record) assert produced_record == expected_generic_crawler_record @@ -60,5 +60,5 @@ def test_no_document_type( input_no_document_type_record, expected_no_document_type_record ): - produced_record = crawler2hep(input_no_document_type_record) + produced_record = hepcrawl_to_hep(input_no_document_type_record) assert produced_record == expected_no_document_type_record From 01ff0d8023c34457b2d7191098a7b99929dcde2f Mon Sep 17 00:00:00 2001 From: David Caro Date: Thu, 17 Aug 2017 14:51:59 +0200 Subject: [PATCH 05/11] travis: add desy functional tests Signed-off-by: David Caro --- .travis.yml | 1 + docker-compose.test.yml | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index f05e2d22..91407e6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ env: - SUITE=unit - SUITE=functional_wsp - SUITE=functional_arxiv + - SUITE=functional_desy matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 7ffe0122..d14d7c87 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -17,6 +17,7 @@ services: - APP_CELERY_RESULT_BACKEND=amqp://guest:guest@rabbitmq:5672// - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results + - APP_FILES_STORE=/tmp/file_urls - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -26,6 +27,7 @@ services: - ${PWD}:/code/ - ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf - /tmp/WSP:/tmp/WSP + - /tmp/file_urls:/tmp/file_urls functional_wsp: <<: *service_base @@ -34,6 +36,13 @@ services: - scrapyd - ftp_server + functional_desy: + <<: *service_base + command: py.test -vv tests/functional/desy + links: + - scrapyd + - ftp_server + functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv @@ -68,6 +77,8 @@ services: environment: - PUBLICHOST=localhost volumes: + - ${PWD}/tests/functional/desy/fixtures/ftp_server/FFT:/home/ftpusers/bob/FFT + - ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - 
${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd From e4b9f4301e03db7c7dc8125b19415825443f78a3 Mon Sep 17 00:00:00 2001 From: David Caro Date: Tue, 22 Aug 2017 16:24:53 +0200 Subject: [PATCH 06/11] global: use standard ParsedResponse in the spiders Signed-off-by: David Caro --- hepcrawl/pipelines.py | 6 +++--- hepcrawl/spiders/alpha_spider.py | 12 ++++++++++-- hepcrawl/spiders/aps_spider.py | 15 +++++++++++++-- hepcrawl/spiders/arxiv_spider.py | 21 ++++++++++++++++----- hepcrawl/spiders/base_spider.py | 15 +++++++++++++-- hepcrawl/spiders/brown_spider.py | 14 ++++++++++++-- hepcrawl/spiders/desy_spider.py | 2 +- hepcrawl/spiders/dnb_spider.py | 15 +++++++++++++-- hepcrawl/spiders/edp_spider.py | 29 ++++++++++++++++++++--------- hepcrawl/spiders/elsevier_spider.py | 10 ++++++++-- hepcrawl/spiders/hindawi_spider.py | 12 ++++++++++-- hepcrawl/spiders/infn_spider.py | 13 ++++++++++--- hepcrawl/spiders/iop_spider.py | 8 +++++++- hepcrawl/spiders/magic_spider.py | 12 ++++++++++-- hepcrawl/spiders/mit_spider.py | 13 +++++++++++-- hepcrawl/spiders/phenix_spider.py | 8 +++++++- hepcrawl/spiders/phil_spider.py | 13 +++++++++++-- hepcrawl/spiders/pos_spider.py | 18 +++++++++++++++--- hepcrawl/spiders/t2k_spider.py | 12 ++++++++++-- hepcrawl/spiders/wsp_spider.py | 29 +++++++++++++++++++++-------- hepcrawl/testlib/celery_monitor.py | 28 +++++++++++++++++++++++----- hepcrawl/testlib/fixtures.py | 16 ++++++++++++++++ 22 files changed, 260 insertions(+), 61 deletions(-) diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 9e3d84e8..e583dc2a 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -24,9 +24,9 @@ from inspire_schemas.utils import validate -from hepcrawl.tohep import item_to_hep -from hepcrawl.settings import FILES_STORE -from hepcrawl.utils import RecordFile +from .tohep import item_to_hep +from .settings import FILES_STORE +from .utils import RecordFile class FftFilesPipeline(FilesPipeline): diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index 2ab883f3..c791546e 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -20,7 +20,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import has_numbers +from ..utils import ( + has_numbers, + ParsedItem, +) class AlphaSpider(CrawlSpider): @@ -145,4 +148,9 @@ def parse(self, response): record.add_value('source', 'Alpha experiment') record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 496e2e8e..1cda5bea 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -20,7 +20,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_licenses, get_nested, build_dict +from ..utils import ( + get_licenses, + get_nested, + build_dict, + ParsedItem, +) class APSSpider(Spider): @@ -110,7 +115,13 @@ def parse(self, response): record.add_value('license', license) record.add_value('collections', ['HEP', 'Citeable', 'Published']) - yield record.load_item() + + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + yield parsed_item # Pagination support. 
Will yield until no more "next" pages are found if 'Link' in response.headers: diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index d82c8318..59a01295 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -16,10 +16,15 @@ from scrapy import Request, Selector from scrapy.spiders import XMLFeedSpider -from ..mappings import CONFERENCE_WORDS, THESIS_WORDS -from ..utils import coll_cleanforthe, get_licenses, split_fullname from ..items import HEPRecord from ..loaders import HEPLoader +from ..mappings import CONFERENCE_WORDS, THESIS_WORDS +from ..utils import ( + coll_cleanforthe, + get_licenses, + split_fullname, + ParsedItem, +) RE_CONFERENCE = re.compile(r'\b(%s)\b' % '|'.join( [re.escape(word) for word in CONFERENCE_WORDS]), re.I | re.U) @@ -33,7 +38,9 @@ class ArxivSpider(XMLFeedSpider): Example: Using OAI-PMH XML files:: - $ scrapy crawl arXiv -a source_file=file://`pwd`/tests/responses/arxiv/sample_arxiv_record.xml + $ scrapy crawl \\ + arXiv \\ + -a "source_file=file://$PWD/tests/responses/arxiv/sample_arxiv_record.xml" """ @@ -110,8 +117,12 @@ def parse_node(self, response, node): ) record.add_value('license', license) - parsed_record = dict(record.load_item()) - return parsed_record + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def _get_authors_or_collaboration(self, node): """Parse authors, affiliations; extract collaboration""" diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 5eb22eb7..d7c2d06d 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -18,7 +18,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_mime_type, parse_domain, get_node +from ..utils import ( + get_mime_type, + parse_domain, + get_node, + ParsedItem, +) class BaseSpider(XMLFeedSpider): @@ -192,7 +197,13 @@ def build_item(self, response): record.add_value("authors", self.get_authors(node)) record.add_value('thesis', {'degree_type': 'PhD'}) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def scrape_for_pdf(self, response): """Scrape splash page for any links to PDFs. 
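The same mechanical change repeats in every spider diff that follows: each
``return record.load_item()`` (or ``yield``) is wrapped before being handed
to the pipelines. A minimal sketch of the pattern, assuming the
``ParsedItem`` dict subclass added to ``hepcrawl/utils.py`` in PATCH 02::

    from ..utils import ParsedItem

    parsed_item = ParsedItem(
        record=record.load_item(),  # the scraped record, unchanged
        record_format='hepcrawl',   # tells item_to_hep() which converter to run
    )
    return parsed_item

Since ``ParsedItem`` exposes its keys as attributes, pipelines read
``item.record`` and ``item.record_format`` directly, and spiders that
download files can additionally set ``file_urls`` and ``ftp_params``.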
diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index 6c881252..f17dd197 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -21,7 +21,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname, parse_domain, get_mime_type +from ..utils import ( + split_fullname, + parse_domain, + get_mime_type, + ParsedItem, +) class BrownSpider(CrawlSpider): @@ -219,4 +224,9 @@ def build_item(self, response): record.add_value('thesis', response.meta.get("thesis")) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index 4f90d6e9..69d40619 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -22,7 +22,7 @@ from inspire_dojson.hep import hep -from hepcrawl.utils import ( +from ..utils import ( ftp_list_files, ftp_connection_info, ParsedItem, diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py index 3ac8b901..f350cf8f 100644 --- a/hepcrawl/spiders/dnb_spider.py +++ b/hepcrawl/spiders/dnb_spider.py @@ -16,7 +16,12 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_mime_type, parse_domain, get_node +from ..utils import ( + get_mime_type, + parse_domain, + get_node, + ParsedItem, +) class DNBSpider(XMLFeedSpider): @@ -219,4 +224,10 @@ def build_item(self, response): record.add_value('thesis', {'degree_type': 'PhD'}) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index beea699d..eb6f4cf3 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -30,6 +30,7 @@ get_licenses, get_node, parse_domain, + ParsedItem, ) @@ -65,11 +66,11 @@ class EDPSpider(Jats, XMLFeedSpider): To run an ``EDPSpider`` using ``rich`` format:: - $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_rich.tar.bz2 + $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_rich.tar.bz2 To run an ``EDPSpider`` using ``gz`` format:: - $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_gz.tar.gz + $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_gz.tar.gz Todo: @@ -144,9 +145,9 @@ def start_requests(self): ftp_host, ftp_params = ftp_connection_info( self.ftp_host, self.ftp_netrc) _, new_files = ftp_list_files( - self.ftp_folder, - self.target_folder, - server=ftp_host, + server_folder=self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] ) @@ -175,7 +176,7 @@ def handle_package_ftp(self, response): for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), - meta={"package_path": zip_filepath} + meta={"source_folder": zip_filepath} ) def handle_package_file(self, response): @@ -188,7 +189,7 @@ def handle_package_file(self, response): for xml_file in xml_files: request = Request( "file://{0}".format(xml_file), - meta={"package_path": zip_filepath} + meta={"source_folder": zip_filepath} ) if "xml_rich" in xml_file: request.meta["rich"] = True @@ -318,7 +319,12 @@ def build_item_rich(self, 
response): ) record.add_value("urls", response.meta.get("urls")) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def build_item_jats(self, response): """Build the final HEPRecord with JATS-format XML ('jp').""" @@ -388,7 +394,12 @@ def build_item_jats(self, response): references = self._get_references(node) record.add_value("references", references) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def _get_references(self, node): """Get the references.""" diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index c9aacc00..b91e8372 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -31,6 +31,7 @@ has_numbers, range_as_string, unzip_xml_files, + ParsedItem, ) from ..dateutils import format_year @@ -180,7 +181,7 @@ def handle_package(self, response): xml_url = u"file://{0}".format(os.path.abspath(xml_file)) yield Request( xml_url, - meta={"package_path": zip_filepath, + meta={"source_folder": zip_filepath, "xml_url": xml_url}, ) @@ -1034,4 +1035,9 @@ def build_item(self, response): record.add_value('collections', self.get_collections(doctype)) record.add_value('references', self.get_references(node)) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 941a3674..cce5e8eb 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -16,7 +16,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_licenses +from ..utils import ( + get_licenses, + ParsedItem, +) class HindawiSpider(XMLFeedSpider): @@ -222,4 +225,9 @@ def parse_node(self, response, node): record.add_xpath('source', "./datafield[@tag='260']/subfield[@code='b']/text()") - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e970c1c..a3457a21 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -21,8 +21,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_temporary_file - +from ..utils import ( + get_temporary_file, + ParsedItem, +) from ..dateutils import format_date @@ -240,4 +242,9 @@ def build_item(self, response): record.add_value('source', 'INFN') record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index 0e3bae65..288bd205 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -23,6 +23,7 @@ from ..items import HEPRecord from ..loaders import HEPLoader +from ..utils import ParsedItem class IOPSpider(XMLFeedSpider, NLM): @@ -222,4 +223,9 @@ def parse_node(self, response, node): record.add_value("additional_files", self.add_fft_file(pdf_file_path, file_access, file_type)) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git 
a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 77bf7948..03d54618 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -18,7 +18,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname +from ..utils import ( + split_fullname, + ParsedItem, +) class MagicSpider(XMLFeedSpider): @@ -176,4 +179,9 @@ def build_item(self, response): record.add_value("additional_files", response.meta.get("files")) record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index c71234f9..5387042d 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -23,7 +23,11 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import get_temporary_file, split_fullname +from ..utils import ( + get_temporary_file, + split_fullname, + ParsedItem, +) class MITSpider(XMLFeedSpider): @@ -223,4 +227,9 @@ def build_item(self, response): record.add_value('page_nr', self.get_page_nr(node)) record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index 7200664e..9eaa9da0 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -18,6 +18,7 @@ from ..items import HEPRecord from ..loaders import HEPLoader +from ..utils import ParsedItem class PhenixSpider(XMLFeedSpider): @@ -128,4 +129,9 @@ def parse_node(self, response, node): record.add_value('source', 'PHENIX') record.add_value('collections', ['HEP', 'THESIS']) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 101b1163..d0cddaea 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -19,7 +19,11 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import parse_domain, get_mime_type +from ..utils import ( + parse_domain, + get_mime_type, + ParsedItem, +) class PhilSpider(CrawlSpider): @@ -160,4 +164,9 @@ def build_item(self, response): if not jsonrecord.get('year') == "forthcoming": record.add_value('journal_year', int(jsonrecord['year'])) - return record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 7d3fb87d..152d6688 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -13,13 +13,19 @@ import re +from urlparse import urljoin + from scrapy import Request, Selector from scrapy.spiders import Spider -from urlparse import urljoin -from ..utils import get_licenses, get_first + from ..dateutils import create_valid_date from ..items import HEPRecord from ..loaders import HEPLoader +from ..utils import ( + get_licenses, + get_first, + ParsedItem, +) class POSSpider(Spider): @@ -128,7 +134,13 @@ def build_item(self, response): record.add_value('extra_data', extra_data) record.add_value('collections', ['HEP', 'ConferencePaper']) - return record.load_item() + + 
parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item def _get_ext_systems_number(self, node): return [ diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index 661f0bec..86076e46 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -18,7 +18,10 @@ from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import split_fullname +from ..utils import ( + split_fullname, + ParsedItem, +) class T2kSpider(XMLFeedSpider): @@ -164,4 +167,9 @@ def build_item(self, response): record.add_value("additional_files", response.meta.get("additional_files")) record.add_value('collections', ['HEP', 'THESIS']) - yield record.load_item() + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + yield parsed_item diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 3f68131f..058e6cc0 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -26,6 +26,7 @@ local_list_files, get_licenses, unzip_xml_files, + ParsedItem, ) @@ -71,7 +72,15 @@ class WorldScientificSpider(Jats, XMLFeedSpider): 'rapid-communications' ] - def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs): + def __init__( + self, + package_path=None, + ftp_folder="/WSP", + ftp_host=None, + ftp_netrc=None, + *args, + **kwargs + ): """Construct WSP spider.""" super(WorldScientificSpider, self).__init__(*args, **kwargs) self.ftp_folder = ftp_folder @@ -97,8 +106,8 @@ def start_requests(self): new_files_paths = ftp_list_files( self.ftp_folder, - self.target_folder, - server=ftp_host, + destination_folder=self.target_folder, + ftp_host=ftp_host, user=ftp_params['ftp_user'], password=ftp_params['ftp_password'] ) @@ -126,7 +135,7 @@ def handle_package_ftp(self, response): for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), - meta={"package_path": zip_filepath} + meta={"source_folder": zip_filepath} ) def handle_package_file(self, response): @@ -138,7 +147,7 @@ def handle_package_file(self, response): for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), - meta={"package_path": zip_filepath} + meta={"source_folder": zip_filepath} ) def parse_node(self, response, node): @@ -148,7 +157,7 @@ def parse_node(self, response, node): self.log("Got article_type {0}".format(article_type)) if article_type is None or article_type[0] not in self.allowed_article_types: # Filter out non-interesting article types - return None + return record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', @@ -203,9 +212,13 @@ def parse_node(self, response, node): record.add_value('license', license) record.add_value('collections', self._get_collections(node, article_type, journal_title)) - parsed_record = dict(record.load_item()) - return parsed_record + parsed_item = ParsedItem( + record=dict(record.load_item()), + record_format='hepcrawl', + ) + + return parsed_item def _get_collections(self, node, article_type, current_journal_title): """Return this articles' collection.""" diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 6c720550..1347ab22 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -9,7 +9,12 @@ """Celery monitor dealing with celery tasks for functional tests.""" -from __future__ import absolute_import, division, print_function, 
unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) from itertools import islice @@ -19,13 +24,14 @@ class CeleryMonitor(object): - def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100): + def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2): self.results = [] self.recv = None self.app = app self.connection = None self.monitor_timeout = monitor_timeout self.monitor_iter_limit = monitor_iter_limit + self.events_limit = events_limit def __enter__(self): state = self.app.events.State() @@ -61,10 +67,16 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.connection.__exit__() def _wait_for_results(self, events_iter): - any(islice( + generator_events = islice( events_iter, # iterable self.monitor_iter_limit # stop - )) + ) + counter = 0 + for dummy in generator_events: + if dummy: + counter += 1 + if counter == self.events_limit: + break @classmethod def do_crawl( @@ -72,6 +84,7 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, + events_limit, crawler_instance, project='hepcrawl', spider='WSP', @@ -80,7 +93,12 @@ def do_crawl( ): settings = settings or {} - with cls(app, monitor_timeout=monitor_timeout, monitor_iter_limit=monitor_iter_limit) as my_monitor: + with cls( + app, + monitor_timeout=monitor_timeout, + monitor_iter_limit=monitor_iter_limit, + events_limit=events_limit + ) as my_monitor: crawler_instance.schedule( project=project, spider=spider, diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 513b0395..73f28f96 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -11,6 +11,7 @@ import os import json +import shutil from scrapy.http import Request, TextResponse from scrapy.selector import Selector @@ -131,3 +132,18 @@ def expected_json_results_from_file(*path_chunks, **kwargs): expected_data = json.load(fd) return expected_data + + +def clean_dir(path): + """ + Deletes all contained files of given target directory path. + + Args: + path: Absolute path of target directory to be cleaned. 
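+
+        Note that the whole directory tree is removed, not just its
+        contents, and a missing path is ignored (``shutil.rmtree`` runs
+        with ``ignore_errors=True``).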
+ + Example: + + >>> clean_dir('/dir_1/dir_11/') + + """ + shutil.rmtree(path, ignore_errors=True) From ce8ccc024f531c69f261dcc517634255705560a6 Mon Sep 17 00:00:00 2001 From: David Caro Date: Tue, 22 Aug 2017 16:25:21 +0200 Subject: [PATCH 07/11] tests: adapt tests for all previous changes Signed-off-by: David Caro --- setup.py | 1 + tests/functional/arxiv/test_arxiv.py | 1 + tests/functional/wsp/test_wsp.py | 8 ++-- tests/unit/test_alpha.py | 4 +- tests/unit/test_aps.py | 4 +- tests/unit/test_arxiv_all.py | 35 ++++---------- tests/unit/test_arxiv_single.py | 15 +++--- tests/unit/test_base.py | 23 ++++++++-- tests/unit/test_brown.py | 14 ++++-- tests/unit/test_dnb.py | 14 +++++- tests/unit/test_edp.py | 68 ++++++++++++++++++++++------ tests/unit/test_elsevier.py | 45 ++++++++++++++---- tests/unit/test_hindawi.py | 8 ++-- tests/unit/test_infn.py | 16 +++++-- tests/unit/test_iop.py | 17 ++++--- tests/unit/test_magic.py | 26 ++++++++--- tests/unit/test_mit.py | 22 +++++++-- tests/unit/test_phenix.py | 13 ++++-- tests/unit/test_phil.py | 22 +++++++-- tests/unit/test_pos.py | 7 ++- tests/unit/test_t2k.py | 16 ++++--- tests/unit/test_world_scientific.py | 7 ++- 22 files changed, 264 insertions(+), 122 deletions(-) diff --git a/setup.py b/setup.py index b19e5f14..a98aeb88 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ install_requires = [ 'autosemver~=0.2', 'inspire-schemas~=42.0', + 'inspire-dojson~=41.0', 'Scrapy>=1.1.0', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index a9677b89..0f58b17d 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -72,6 +72,7 @@ def test_arxiv(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='arXiv', diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 70996466..a0411b8e 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -13,7 +13,6 @@ import pytest import os -import shutil from time import sleep @@ -21,6 +20,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) from hepcrawl.testlib.tasks import app as celery_app from hepcrawl.testlib.utils import get_crawler_instance @@ -90,10 +90,6 @@ def remove_generated_files(package_location): os.unlink(os.path.join(package_location, file_name)) -def clean_dir(path='/tmp/WSP/'): - shutil.rmtree(path, ignore_errors=True) - - @pytest.mark.parametrize( 'expected_results', [ @@ -114,6 +110,7 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', @@ -147,6 +144,7 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): app=celery_app, monitor_timeout=5, monitor_iter_limit=100, + events_limit=1, crawler_instance=crawler, project=set_up_local_environment.get('CRAWLER_PROJECT'), spider='WSP', diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index eef140b1..96bf9af1 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -20,13 +20,15 @@ def results(): """Return results generator from the Alpha spider.""" spider = alpha_spider.AlphaSpider() - records = 
list( + parsed_items = list( spider.parse( fake_response_from_file('alpha/test_1.htm') ) ) + records = [parsed_item.record for parsed_item in parsed_items] assert records + return records diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index eb53269d..3bb3698c 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -21,7 +21,7 @@ def results(): from scrapy.http import TextResponse spider = aps_spider.APSSpider() - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'aps/aps_single_response.json', @@ -30,6 +30,8 @@ def results(): ) ) + records = [parsed_item.record for parsed_item in parsed_items] + assert records return records diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index bd75e5a4..1f4155c9 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -11,7 +11,8 @@ import pytest -from scrapy.crawler import Crawler +from scrapy.crawler import Crawler +from scrapy.http import TextResponse from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider @@ -25,36 +26,16 @@ def spider(): return spider -@pytest.fixture -def one_result(spider): - """Return results generator from the arxiv spider. Tricky fields, one - record. - """ - from scrapy.http import TextResponse - - records = list( - spider.parse( - fake_response_from_file( - 'arxiv/sample_arxiv_record0.xml', - response_type=TextResponse, - ) - ) - ) - - assert records - pipeline = InspireCeleryPushPipeline() - pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] - - @pytest.fixture def many_results(spider): """Return results generator from the arxiv spider. Tricky fields, many records. """ - from scrapy.http import TextResponse + def _get_processed_record(item, spider): + record = pipeline.process_item(item, spider) + return record - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record.xml', @@ -63,10 +44,10 @@ def many_results(spider): ) ) - assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [pipeline.process_item(record, spider) for record in records] + + return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] def test_page_nr(many_results): diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index a6ed66d6..329a2a49 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -24,10 +24,15 @@ def results(): """Return results generator from the arxiv spider. All fields, one record. 
""" + def _get_processed_item(item, spider): + record = pipeline.process_item(item, spider) + validate(record, 'hep') + assert record + return record crawler = Crawler(spidercls=arxiv_spider.ArxivSpider) spider = arxiv_spider.ArxivSpider.from_crawler(crawler) - records = list( + parsed_items = list( spider.parse( fake_response_from_file( 'arxiv/sample_arxiv_record0.xml', @@ -36,16 +41,10 @@ def results(): ) ) - assert records pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - processed_records = [] - for record in records: - processed_record = pipeline.process_item(record, spider) - validate(processed_record, 'hep') - processed_records.append(processed_record) - return processed_records + return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py index cc6ef093..b8ec5b8a 100644 --- a/tests/unit/test_base.py +++ b/tests/unit/test_base.py @@ -38,9 +38,12 @@ def record(): nodes = selector.xpath('.//%s' % spider.itertag) response.meta["record"] = nodes[0].extract() response.meta["urls"] = ["http://hdl.handle.net/1885/10005"] - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -169,7 +172,12 @@ def splash(): 'Content-Type': 'text/html', }, ) - return spider.scrape_for_pdf(splash_response) + + parsed_item = spider.scrape_for_pdf(splash_response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_splash(splash): @@ -201,7 +209,12 @@ def parsed_node(): response = fake_response_from_string(text=body) node = get_node(spider, 'OAI-PMH:record', text=body) response.meta["record"] = node[0].extract() - return spider.parse_node(response, node[0]) + + parsed_item = spider.parse_node(response, node[0]) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_parsed_node(parsed_node): diff --git a/tests/unit/test_brown.py b/tests/unit/test_brown.py index 0b42b4df..8d0f20de 100644 --- a/tests/unit/test_brown.py +++ b/tests/unit/test_brown.py @@ -41,10 +41,12 @@ def record(): splash_response = fake_response_from_file('brown/test_splash.html') splash_response.meta["jsonrecord"] = jsonrecord - parsed_record = spider.scrape_splash(splash_response) - assert parsed_record - return parsed_record + parsed_item = spider.scrape_splash(splash_response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -200,7 +202,11 @@ def parsed_node_no_splash(): jsonrecord = jsonresponse["items"]["docs"][0] response.meta["jsonrecord"] = jsonrecord - return spider.parse(response).next() + parsed_item = spider.parse(response).next() + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_no_splash(parsed_node_no_splash): diff --git a/tests/unit/test_dnb.py b/tests/unit/test_dnb.py index b00aff3d..5aa05a64 100644 --- a/tests/unit/test_dnb.py +++ b/tests/unit/test_dnb.py @@ -72,7 +72,12 @@ def record(scrape_pos_page_body): body=scrape_pos_page_body, **{'encoding': 'utf-8'} ) - return request.callback(response) + + parsed_item = request.callback(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_title(record): @@ -241,7 +246,12 @@ def parse_without_splash(): 'Content-Type': 'application/pdf;charset=base64', } ) - return spider.parse_node(response, nodes[0]) + + 
parsed_item = spider.parse_node(response, nodes[0]) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_parse_without_splash(parse_without_splash): diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py index cc7885bd..115abda6 100644 --- a/tests/unit/test_edp.py +++ b/tests/unit/test_edp.py @@ -40,6 +40,7 @@ def scrape_pos_page_body(): ) ) + @pytest.fixture def targzfile(): """Path to test tar.gz file with JATS XML file.""" @@ -50,6 +51,7 @@ def targzfile(): 'test_gz.tar.gz' ) + @pytest.fixture def package_jats(targzfile): """Extract tar.gz package with JATS XML file.""" @@ -75,7 +77,12 @@ def record_jats(package_jats, scrape_pos_page_body): body=scrape_pos_page_body, **{'encoding': 'utf-8'} ) - return request.callback(response) + + parsed_item = request.callback(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -107,7 +114,11 @@ def record_rich(package_rich): fake_resp.meta["rich"] = True node = get_node(spider, "//EDPSArticle", fake_resp)[0] - return spider.parse_node(fake_resp, node) + parsed_item = spider.parse_node(fake_resp, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_title(record_jats): @@ -145,6 +156,7 @@ def test_abstract(record_jats): assert 'abstract' in record_jats assert record_jats['abstract'] == abstract + def test_date_published(record_jats): """Test extracting date_published.""" date_published = "2015-01-01" @@ -179,6 +191,7 @@ def test_doi(record_jats): assert 'dois' in record_jats assert record_jats['dois'][0]['value'] == doi + def test_publication_info(record_jats): """Test extracting publication info.""" assert 'journal_title' in record_jats @@ -206,7 +219,6 @@ def test_keywords(record_jats): assert keyw["value"] in keywords - def test_authors(record_jats): """Test authors.""" authors = ["Arasoglu, Ali", "Ozdemir, Omer Faruk"] @@ -326,7 +338,6 @@ def test_authors_rich(record_rich): assert astr[index]["affiliations"][0]["value"] == affiliations[index] - def test_tarfile(tarbzfile, tmpdir): """Test untarring a tar.bz package with a test XML file. @@ -343,7 +354,6 @@ def test_tarfile(tarbzfile, tmpdir): assert "aas/xml_rich/2000/01" not in xml_files_flat[0] - def test_handle_package_ftp(tarbzfile): """Test getting the target folder name for xml files.""" spider = edp_spider.EDPSpider() @@ -351,7 +361,8 @@ def test_handle_package_ftp(tarbzfile): request = spider.handle_package_ftp(response).next() assert isinstance(request, Request) - assert request.meta["package_path"] == tarbzfile + assert request.meta["source_folder"] == tarbzfile + def test_no_dois_jats(): """Test parsing when no DOI in record. 
JATS format.""" @@ -370,7 +381,11 @@ def test_no_dois_jats(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert "dois" not in record assert "additional_files" not in record @@ -390,7 +405,11 @@ def test_no_dois_rich(): response = fake_response_from_string(body) response.meta["rich"] = True node = get_node(spider, "//EDPSArticle", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert "dois" not in record assert "additional_files" not in record @@ -416,7 +435,11 @@ def test_addendum_jats(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert "related_article_doi" in record assert record["related_article_doi"][0][ @@ -439,7 +462,11 @@ def test_author_with_email(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert 'email' in record['authors'][0] assert record['authors'][0]['email'] == "Fname.Sname@university.org" @@ -472,7 +499,11 @@ def test_aff_with_email(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA" assert 'affiliations' in record['authors'][0] @@ -481,8 +512,6 @@ def test_aff_with_email(): assert record['authors'][0]['email'] is None - - def test_no_valid_article(): """Test parsing when filtering out non-interesting article types.""" spider = edp_spider.EDPSpider() @@ -506,7 +535,11 @@ def test_collections_review(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - record = spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert "collections" in record assert record["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}] @@ -533,7 +566,12 @@ def record_references_only(): """ response = fake_response_from_string(body) node = get_node(spider, "//article", response)[0] - return spider.parse_node(response, node) + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_references(record_references_only): diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py index ca023122..3d5fb3f5 100644 --- a/tests/unit/test_elsevier.py +++ b/tests/unit/test_elsevier.py @@ -41,9 +41,12 @@ def record(): response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml' tag = '//%s' % spider.itertag nodes = get_node(spider, tag, response) - parsed_record = spider.parse_node(response, nodes) - assert parsed_record - 
return parsed_record + + parsed_item = spider.parse_node(response, nodes) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture(scope="module") @@ -97,7 +100,12 @@ def parsed_node(): response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml' parse_response = spider.parse_node(response, node) parse_response.status = 404 - return spider.scrape_sciencedirect(parse_response) + + parsed_item = spider.scrape_sciencedirect(parse_response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_collection(parsed_node): @@ -164,7 +172,11 @@ def cover_display_date(): node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_cover_display_date(cover_display_date): @@ -187,7 +199,11 @@ def cover_display_date_y_m(): """ node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_cover_display_date_y_m(cover_display_date_y_m): @@ -210,7 +226,11 @@ def cover_display_date_y(): """ node = get_node(spider, '/doc', text=body) response = fake_response_from_string(body) - return spider.parse_node(response, node) + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_cover_display_date_y(cover_display_date_y): @@ -1579,11 +1599,11 @@ def test_handle_package(handled_package): for astro, nima in zip(astropart, nima): assert nima assert astro - assert astro.meta["package_path"] == "tests/unit/responses/elsevier/fake_astropart.zip" + assert astro.meta["source_folder"] == "tests/unit/responses/elsevier/fake_astropart.zip" url_to_match = u'file:///tmp/elsevier_fake_astropart_*/0927-6505/aip/S0927650515001656/S0927650515001656.xml' assert astro.meta["xml_url"] == fnmatch.filter([astro.meta["xml_url"]], url_to_match)[0] - assert nima.meta["package_path"] == "tests/unit/responses/elsevier/fake_nima.zip" + assert nima.meta["source_folder"] == "tests/unit/responses/elsevier/fake_nima.zip" url_to_match = u'file:///tmp/elsevier_fake_nima_*/0168-9002/S0168900215X00398/S0168900215015636/S0168900215015636.xml' assert nima.meta["xml_url"] == fnmatch.filter([nima.meta["xml_url"]], url_to_match)[0] @@ -1644,7 +1664,12 @@ def sciencedirect(): ]) response.meta["info"] = {} response.meta["node"] = get_node(spider, '/head', text=body) - return spider.scrape_sciencedirect(response) + + parsed_item = spider.scrape_sciencedirect(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_sciencedirect(sciencedirect): diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py index 37e5e183..3af8ba3a 100644 --- a/tests/unit/test_hindawi.py +++ b/tests/unit/test_hindawi.py @@ -26,9 +26,11 @@ def record(): response = fake_response_from_file("hindawi/test_1.xml") nodes = get_node(spider, "//marc:record", response) - parsed_record = spider.parse_node(response, nodes[0]) - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, nodes[0]) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_title(record): diff --git a/tests/unit/test_infn.py 
b/tests/unit/test_infn.py index 0c60799a..c15ef727 100644 --- a/tests/unit/test_infn.py +++ b/tests/unit/test_infn.py @@ -28,9 +28,12 @@ def record(): """Return scraping results from the INFN spider.""" spider = infn_spider.InfnSpider() response = fake_response_from_file('infn/test_splash.html') - parsed_record = spider.scrape_splash(response) - assert parsed_record - return parsed_record + + parsed_item = spider.scrape_splash(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_title(record): @@ -121,6 +124,7 @@ def test_non_thesis(): assert record is None + def test_parse_node(): """Test parse_node function. This should be a scrapy Request object. @@ -148,6 +152,8 @@ def test_parse_node_nolink(): response = fake_response_from_file('infn/test_1_nolink.html') selector = Selector(response, type='html') node = selector.xpath('//%s' % spider.itertag)[0] - record = spider.parse_node(response, node).next() + parsed_item = spider.parse_node(response, node).next() + assert parsed_item + assert parsed_item.record - assert isinstance(record, hepcrawl.items.HEPRecord) + assert isinstance(parsed_item.record, hepcrawl.items.HEPRecord) diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index b776adfa..bb01766c 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -38,9 +38,12 @@ def record(): response = fake_response_from_file('iop/xml/test_standard.xml') node = get_node(spider, "Article", response) spider.pdf_files = TEST_PDF_DIR - parsed_record = spider.parse_node(response, node) - assert parsed_record - return parsed_record + + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_abstract(record): @@ -182,10 +185,12 @@ def erratum_open_access_record(): 'iop', 'pdf', ) - parsed_record = spider.parse_node(response, node) - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, node) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_files_erratum_open_access_record(erratum_open_access_record): diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py index eeb574fe..f3c0f355 100644 --- a/tests/unit/test_magic.py +++ b/tests/unit/test_magic.py @@ -23,6 +23,7 @@ get_node, ) + @pytest.fixture def record(): """Return results from the MAGIC spider. 
First parse node, then scrape, @@ -39,9 +40,11 @@ def record(): splash_response.meta["date"] = parsed_node.meta["date"] splash_response.meta["urls"] = parsed_node.meta["urls"] - parsed_record = spider.scrape_for_pdf(splash_response).next() - assert parsed_record - return parsed_record + parsed_item = spider.scrape_for_pdf(splash_response).next() + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_abstract(record): @@ -102,7 +105,6 @@ def test_abstract(record): assert record["abstract"] == abstract - def test_title(record): """Test extracting title.""" title = "Limits to the violation of Lorentz invariance using the emission of the CRAB pulsar at TeV energies, discovered with archival data from the MAGIC telescopes" @@ -139,6 +141,7 @@ def test_url(record): assert 'urls' in record assert record['urls'][0]['value'] == url + def test_pdf_link(record): """Test pdf link(s)""" files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf" @@ -164,7 +167,10 @@ def test_no_author_no_date_no_url(): """ response = fake_response_from_string(body) node = get_node(spider, spider.itertag, text=body) - record = spider.parse_node(response, node).next() + parsed_item = spider.parse_node(response, node).next() + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert isinstance(record, hepcrawl.items.HEPRecord) assert "date" not in record @@ -184,7 +190,10 @@ def test_no_aff(): """ response = fake_response_from_string(body) - record = spider.scrape_for_pdf(response).next() + parsed_item = spider.scrape_for_pdf(response).next() + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert isinstance(record, hepcrawl.items.HEPRecord) assert "date" not in record @@ -216,7 +225,10 @@ def test_no_spash_page(): response.status = 404 response.meta["title"] = parsed_node.meta["title"] response.meta["urls"] = parsed_node.meta["urls"] - record = spider.scrape_for_pdf(response).next() + parsed_item = spider.scrape_for_pdf(response).next() + assert parsed_item + assert parsed_item.record + record = parsed_item.record assert isinstance(record, hepcrawl.items.HEPRecord) assert "urls" in record diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py index 0253d91f..8a185cef 100644 --- a/tests/unit/test_mit.py +++ b/tests/unit/test_mit.py @@ -25,9 +25,12 @@ def record(): """Return scraping results from the MIT spider.""" spider = mit_spider.MITSpider() response = fake_response_from_file('mit/test_splash.html') - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -37,7 +40,11 @@ def parsed_node(): response = fake_response_from_file('mit/test_list.html') tag = spider.itertag node = get_node(spider, tag, response, rtype="html") - return spider.parse_node(response, node).next() + + parsed_item = spider.parse_node(response, node).next() + assert parsed_item + + return parsed_item def test_url(parsed_node): @@ -159,7 +166,12 @@ def supervisors(): """ response = fake_response_from_string(body) - return spider.build_item(response) + + parsed_item = spider.build_item(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_two_supervisors(supervisors): diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py index 75384350..c272683f 100644 --- a/tests/unit/test_phenix.py +++ b/tests/unit/test_phenix.py @@ -29,9 
+29,13 @@ def record(): response = fake_response_from_file('phenix/test_1.html') selector = Selector(response, type='html') nodes = selector.xpath('//%s' % spider.itertag) - parsed_record = spider.parse_node(response, nodes[0]) - assert parsed_record - return parsed_record + + parsed_item = spider.parse_node(response, nodes[0]) + assert parsed_item + assert parsed_item.record + + return parsed_item.record + @pytest.fixture def non_thesis(): @@ -49,10 +53,12 @@ def non_thesis(): node = get_node(spider, '//li', text=body) return spider.parse_node(response, node) + def test_non_thesis(non_thesis): """Test MSc thesis skipping.""" assert non_thesis is None + def test_title(record): """Test extracting title.""" title = "MEASUREMENT OF THE DOUBLE HELICITY ASYMMETRY IN INCLUSIVE $\pi^{0}$ PRODUCTION IN POLARIZED PROTON-PROTON COLLISIONS AT $\sqrt{s}$ = 510 GeV" @@ -82,6 +88,7 @@ def test_authors(record): aff['value'] for aff in record['authors'][index]['affiliations'] ] + def test_pdf_link(record): """Test pdf link(s)""" files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf" diff --git a/tests/unit/test_phil.py b/tests/unit/test_phil.py index e99064b2..6db536ef 100644 --- a/tests/unit/test_phil.py +++ b/tests/unit/test_phil.py @@ -33,9 +33,12 @@ def record(): "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fanalysis.oxfordjournals.org%2Fcontent%2F66%2F3%2F194.full.pdf%2Bhtml%3Fframe%3Dsidebar", "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fbrogaardb.googlepages.com%2Ftensedrelationsoffprint.pdf" ] - parsed_record = spider.build_item(response) - assert parsed_record - return parsed_record + + parsed_item = spider.build_item(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -48,7 +51,12 @@ def journal(): response = fake_response_from_file('phil/test_journal.json') jsonrecord = json.loads(response.body_as_unicode()) response.meta["jsonrecord"] = jsonrecord[0] - return spider.build_item(response) + + parsed_item = spider.build_item(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record @pytest.fixture @@ -223,7 +231,11 @@ def splash(): ] } - return spider.scrape_for_pdf(response) + parsed_item = spider.scrape_for_pdf(response) + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_scrape(splash): diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 20c872f4..bea29b34 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -51,8 +51,11 @@ def record(scrape_pos_page_body): assert response pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - record = request.callback(response) - return pipeline.process_item(record, spider) + parsed_item = request.callback(response) + parsed_record = pipeline.process_item(parsed_item, spider) + assert parsed_record + + return parsed_record def test_titles(record): diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py index 283a02e5..d9395aa2 100644 --- a/tests/unit/test_t2k.py +++ b/tests/unit/test_t2k.py @@ -36,9 +36,11 @@ def record(): splash_response.meta["urls"] = parsed_node.meta["urls"] splash_response.meta["authors"] = parsed_node.meta["authors"] - parsed_record = spider.scrape_for_pdf(splash_response).next() - assert parsed_record - return parsed_record + parsed_item = spider.scrape_for_pdf(splash_response).next() + assert parsed_item + assert parsed_item.record + + return parsed_item.record def 
test_abstact(record): @@ -125,9 +127,11 @@ def non_url(): selector = Selector(response, type='html') nodes = selector.xpath('//%s' % spider.itertag) - parsed_record = spider.parse_node(response, nodes[0]).next() - assert parsed_record - return parsed_record + parsed_item = spider.parse_node(response, nodes[0]).next() + assert parsed_item + assert parsed_item.record + + return parsed_item.record def test_non_url(non_url): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 36438ab4..291d00d0 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -48,8 +48,11 @@ def get_records(response_file_name): def get_one_record(response_file_name): - results = get_records(response_file_name) - return results.next() + records = get_records(response_file_name) + record = records.next() + assert record + + return record def override_generated_fields(record): From c8840d23cdf6464a88b9f164950c02ae5d54d691 Mon Sep 17 00:00:00 2001 From: Spyridon Delviniotis Date: Tue, 22 Aug 2017 22:12:11 +0200 Subject: [PATCH 08/11] tests: added path argument to `clean_dir` Signed-off-by: Spyridon Delviniotis --- tests/functional/wsp/test_wsp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index a0411b8e..42f691c9 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -55,7 +55,7 @@ def set_up_ftp_environment(): } } - clean_dir() + clean_dir(path='/tmp/WSP/') @pytest.fixture(scope="function") @@ -80,7 +80,7 @@ def set_up_local_environment(): def remove_generated_files(package_location): - clean_dir() + clean_dir(path='/tmp/WSP/') _, dirs, files = next(os.walk(package_location)) for dir_name in dirs: From 955e38f9bf1329c6b47f653b41a1d66f588a60d3 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 9 Aug 2017 20:40:34 +0200 Subject: [PATCH 09/11] Minor renaming and docs fixing Signed-off-by: David Caro --- docs/Makefile | 2 +- docs/conf.py | 7 ++ hepcrawl/pipelines.py | 12 +++- hepcrawl/spiders/desy_spider.py | 82 +++++++++++++++++------ hepcrawl/tohep.py | 108 ++++++++++++++++++++++-------- hepcrawl/utils.py | 114 ++++++++++++++++++++++++-------- 6 files changed, 245 insertions(+), 80 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 28fb79f1..7d24ea24 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -24,7 +24,7 @@ PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -SPHINXAPIDOC = sphinx-apidoc -M -P -f -o $(SOURCEDIR) $(CODEDIR) -E $(CODEDIR)/spiders +SPHINXAPIDOC = sphinx-apidoc --module-first --private --force --separate --output-dir $(SOURCEDIR) $(CODEDIR) $(CODEDIR)/spiders .PHONY: help help: diff --git a/docs/conf.py b/docs/conf.py index 7fb14e5e..2e12401b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -85,6 +85,13 @@ def _warn_node(self, msg, *args, **kwargs): 'sphinx.ext.todo', ] + +autodoc_default_flags = [ + 'members', + 'private-members', + 'show-inheritance', +] + # Add any paths that contain templates here, relative to this directory. 
 templates_path = ['_templates']
diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index e583dc2a..8cd31c0e 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -34,13 +34,19 @@ class FftFilesPipeline(FilesPipeline):
 
     Note:
 
-        This pipeline only runs if the spider returns a ``ParsedItem`` that has a ``file_urls``
-        property.
+        This pipeline only runs if the spider returns a
+        :class:`hepcrawl.utils.ParsedItem` that has a ``file_urls`` property.
+        See the scrapy docs on it for more details:
+        https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=file_urls#using-the-files-pipeline
     """
 
     def __init__(self, store_uri, *args, **kwargs):
         store_uri = store_uri or FILES_STORE
-        super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs)
+        super(FftFilesPipeline, self).__init__(
+            *args,
+            store_uri=store_uri,
+            **kwargs
+        )
 
     def get_media_requests(self, item, info):
         """Download FFT files using FTP."""
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 69d40619..117bede1 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -7,8 +7,6 @@
 # under the terms of the Revised BSD License; see LICENSE file for
 # more details.
 
-"""Spider for DESY."""
-
 from __future__ import absolute_import, division, print_function
 
 import os
@@ -30,19 +28,45 @@ class DesySpider(Spider):
-    """Desy spider.
+    """This spider parses files in XML MARC format (collections or single
+    records).
+
+    It can retrieve the files from a remote FTP server or from a local
+    directory; they must have the extension ``.xml``.
+
+    Args:
+        source_folder(str): Path to the folder with the MARC files to ingest,
+            which might be collections or single records. Will be ignored if
+            ``ftp_host`` is passed.
+
+        ftp_folder(str): Remote folder where to look for the XML files.
+
+        ftp_host(str): Hostname of the FTP server to crawl.
+
+        ftp_netrc(str): Path to the ``.netrc`` file with the authentication
+            details for the ftp connection. For more details see:
+            https://linux.die.net/man/5/netrc
+
+        destination_folder(str): Path to put the crawl results into. Will be
+            created if it does not exist.
+
+        *args: will be passed to the constructor of
+            :class:`scrapy.spiders.Spider`.
 
-    This spider connects to a given FTP hosts and downloads XML files
-    for extraction into HEP records.
+        **kwargs: will be passed to the constructor of
+            :class:`scrapy.spiders.Spider`.
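+
+        As a quick sketch of programmatic use (the folder path here is only
+        illustrative), the same arguments can be passed directly to the
+        constructor::
+
+            spider = DesySpider(source_folder='/tmp/desy_marc_files')
+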
     Examples:
         To run a crawl, you need to pass FTP connection information via
-        ``ftp_host`` and ``ftp_netrc``, if ``ftp_folder`` is not passed, it will fallback to
-        ``DESY``::
+        ``ftp_host`` and ``ftp_netrc``; if ``ftp_folder`` is not passed, it
+        will fall back to ``DESY``::
 
-            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'
+            $ scrapy crawl desy \\
+                -a 'ftp_host=ftp.example.com' \\
+                -a 'ftp_netrc=/path/to/netrc'
 
-        To run a crawl on local folder, you need to pass the absolute ``source_folder``::
+        To run a crawl on a local folder, you need to pass the absolute
+        ``source_folder``::
 
             $ scrapy crawl desy -a 'source_folder=/path/to/package_dir'
     """
     name = 'desy'
@@ -67,20 +91,21 @@ def __init__(
         self.source_folder = source_folder
         self.destination_folder = destination_folder
         self.ftp_enabled = True if self.ftp_host else False
+
         if not os.path.exists(self.destination_folder):
             os.makedirs(self.destination_folder)
 
     @staticmethod
-    def _list_xml_files_paths(list_files_paths):
-        return [
+    def _filter_xml_files(list_files_paths):
+        return (
             xml_file
             for xml_file in list_files_paths
             if xml_file.endswith('.xml')
-        ]
+        )
 
     def crawl_local_directory(self):
         file_names = os.listdir(self.source_folder)
 
-        xml_file_names = self._list_xml_files_paths(file_names)
+        xml_file_names = self._filter_xml_files(file_names)
 
         for file_name in xml_file_names:
             file_path = os.path.join(self.source_folder, file_name)
@@ -91,7 +116,10 @@ def crawl_local_directory(self):
         )
 
     def crawl_ftp_directory(self):
-        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+        ftp_host, ftp_params = ftp_connection_info(
+            self.ftp_host,
+            self.ftp_netrc,
+        )
 
         remote_files_paths = ftp_list_files(
             self.ftp_folder,
@@ -102,10 +130,12 @@ def crawl_ftp_directory(self):
             only_missing_files=False,
         )
 
-        xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+        xml_remote_files_paths = self._filter_xml_files(remote_files_paths)
 
         for remote_file in xml_remote_files_paths:
-            self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
+            self.log(
+                'Remote: Try to crawl file from FTP: {0}'.format(remote_file),
+            )
             remote_file = str(remote_file)
             ftp_params['ftp_local_filename'] = os.path.join(
                 self.destination_folder,
@@ -121,8 +151,12 @@ def crawl_ftp_directory(self):
     def handle_package_ftp(self, response):
         """Yield every XML file found.
 
-        This is an intermediate step before calling ``DesySpider.parse`` to handle ftp downloaded
-        "record collections".
+        This is an intermediate step before calling :func:`DesySpider.parse`
+        to handle FTP-downloaded "record collections".
+
+        Args:
+            response(hepcrawl.http.response.Response): response containing the
+                information about the ftp file download.
         """
         self.log('Visited url {}'.format(response.url))
         file_path = response.body
@@ -153,18 +187,24 @@ def start_requests(self):
     @staticmethod
     def _get_full_uri(current_path, base_url, schema, hostname=''):
         return '{schema}://{hostname}{full_path}'.format(**vars())
 
     def parse(self, response):
-        """Parse a ``Desy`` XML file into a ``hepcrawl.utils.ParsedItem``."""
+        """Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`.
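+
+        Args:
+            response(scrapy.http.Response): response holding the contents of
+                the XML file to parse, crawled either from the local
+                filesystem or downloaded over FTP.
+
+        Yields:
+            hepcrawl.utils.ParsedItem: one item per MARC record found in the
+                file.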
+        """
         self.log('Got record from url/path: {0}'.format(response.url))
         self.log('FTP enabled: {0}'.format(self.ftp_enabled))
 
         ftp_params = None
 
         if self.ftp_enabled:
-            hostname, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)
+            hostname, ftp_params = ftp_connection_info(
+                self.ftp_host,
+                self.ftp_netrc,
+            )
             base_url = self.ftp_folder
             url_schema = 'ftp'
         else:
-            base_url = os.path.dirname(urllib.parse.urlparse(response.url).path)
+            base_url = os.path.dirname(
+                urllib.parse.urlparse(response.url).path
+            )
             url_schema = 'file'
             hostname = None
 
diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py
index cacc5590..9dca5433 100644
--- a/hepcrawl/tohep.py
+++ b/hepcrawl/tohep.py
@@ -7,10 +7,18 @@
 # under the terms of the Revised BSD License; see LICENSE file for
 # more details.
 
-"""Convert a crawler record to a valid HEP record.
+"""Functions used to convert records and items from one format to another.
+
+Currently there are only two formats for records that we consider:
+
+    * Hepcrawl format: internal format used by the spiders as an intermediate
+      step before the pipeline; it's a generic, wider format that should have
+      at least the same info as the HEP format used by Inspire.
+
+    * HEP format: Inspire-compatible format; it's the format that you get as
+      a result of the crawl.
 
-Don't forget to add pipelines to the ITEM_PIPELINES setting
-See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 """
 
 from __future__ import absolute_import, division, print_function
@@ -21,14 +29,20 @@
 from inspire_schemas.api import LiteratureBuilder
 
 
+class UnknownItemFormat(Exception):
+    pass
+
+
 def _get_updated_fft_fields(current_fft_fields, record_files):
     """
-    Params:
-        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
-        expect each of then to have, at least, a key named ``path``.
-        record_files(list(RecordFile)): files attached to the record as populated by
-        ``FftFilesPipeline``.
+    Args:
+        current_fft_fields(list(dict)): record current fft fields as generated
+            by ``dojson``. We expect each of them to have, at least, a key
+            named ``path``.
+
+        record_files(list(RecordFile)): files attached to the record as
+            populated by :class:`hepcrawl.pipelines.FftFilesPipeline`.
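+
+    Returns:
+        list(dict): the same fft fields, with each ``path`` replaced by the
+            local path of the matching downloaded file.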
""" record_files_index = { record_file.name: record_file.path @@ -45,8 +59,8 @@ def _get_updated_fft_fields(current_fft_fields, record_files): def _has_publication_info(item): - """If any publication info.""" - return item.get('pubinfo_freetext') or item.get('journal_volume') or \ + return item.get('pubinfo_freetext') or \ + item.get('journal_volume') or \ item.get('journal_title') or \ item.get('journal_year') or \ item.get('journal_issue') or \ @@ -56,9 +70,10 @@ def _has_publication_info(item): item.get('journal_doctype') -def _filter_fields(item, keys): - """Filter away keys.""" +def _remove_fields(item, keys): + """Remove the given keys from the dict.""" for key in keys: + # remove the key if there, no error if not there item.pop(key, None) @@ -105,19 +120,21 @@ def _normalize_hepcrawl_record(item, source): item.pop('journal_year') ) - # Remove any fields - _filter_fields(item, [ - 'journal_title', - 'journal_volume', - 'journal_year', - 'journal_issue', - 'journal_fpage', - 'journal_lpage', - 'journal_doctype', - 'journal_artid', - 'pubinfo_freetext', - 'pubinfo_material', - ]) + _remove_fields( + item, + [ + 'journal_title', + 'journal_volume', + 'journal_year', + 'journal_issue', + 'journal_fpage', + 'journal_lpage', + 'journal_doctype', + 'journal_artid', + 'pubinfo_freetext', + 'pubinfo_material', + ] + ) return item @@ -136,7 +153,23 @@ def item_to_hep( item, source, ): - item.record['acquisition_source'] = _generate_acquisition_source(source=source) + """Get an output ready hep formatted record from the given + :class:`hepcrawl.utils.ParsedItem`, whatever format it's record might be. + + Args: + item(hepcrawl.utils.ParsedItem): item to convert. + source(str): string identifying the source for this item (ex. 'arXiv'). + + Returns: + hepcrawl.utils.ParsedItem: the new item, with the internal record + formated as hep record. + + Raises: + UnknownItemFormat: if the source item format is unknown. + """ + item.record['acquisition_source'] = _generate_acquisition_source( + source=source + ) if item.record_format == 'hep': return hep_to_hep( @@ -150,10 +183,18 @@ def item_to_hep( ) return hepcrawl_to_hep(dict(record)) else: - raise Exception('Unknown item_format::{}'.format(item.record_format)) + raise UnknownItemFormat( + 'Unknown ParsedItem::{}'.format(item.record_format) + ) def hep_to_hep(hep_record, record_files): + """This is needed to be able to patch the ``_fft`` field in the record. + + As earlier in the process we don't really have all the files yet. It should + be used by any spiders that generate hep format instead of the internal + hepcrawl one (normally, marc-ingesting spiders). + """ if record_files: hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], @@ -164,6 +205,19 @@ def hep_to_hep(hep_record, record_files): def hepcrawl_to_hep(crawler_record): + """ + Args: + crawler_record(dict): dictionary representing the hepcrawl formatted + record. + + + Returns: + dict: The hep formatted (and validated) record. + + Raises: + Exception: if there was a validation error (the exact class depends on + :class:`inspire_schemas.api.validate`). 
+    """
 
    def _filter_affiliation(affiliations):
        return [
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 4d4a28db..256dd508 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -27,7 +27,10 @@
 from .mappings import LICENSES, LICENSE_TEXTS
 
 
-RE_FOR_THE = re.compile(r'\b(?:for|on behalf of|representing)\b', re.IGNORECASE)
+RE_FOR_THE = re.compile(
+    r'\b(?:for|on behalf of|representing)\b',
+    re.IGNORECASE,
+)
 INST_PHRASES = ['for the development', ]
 
 
@@ -70,10 +73,24 @@ def ftp_list_files(
    passive_mode=False,
    only_missing_files=True,
 ):
-    """List files from given FTP's ftp_host folder to target folder.
+    """List the files in the given FTP server folder.
+
+    Args:
+        server_folder(str): remote folder to list.
+
+        ftp_host(str): name of the host. Example: 'ftp.cern.ch'
+
+        user(str): For authentication.
 
-    Params:
+        password(str): For authentication.
 
+        destination_folder(str): local folder to compare with.
+
+        passive_mode(bool): True if it should use the firewall-friendly FTP
+            passive mode.
+
+        only_missing_files(bool): If True, will only list the files that are
+            not already in the ``destination_folder``.
    """
    session_factory = ftputil.session.session_factory(
        base_class=ftplib.FTP,
@@ -82,10 +99,19 @@ def ftp_list_files(
        encrypt_data_channel=True,
    )
 
-    with ftputil.FTPHost(ftp_host, user, password, session_factory=session_factory) as host:
+    with ftputil.FTPHost(
+        ftp_host,
+        user,
+        password,
+        session_factory=session_factory,
+    ) as host:
        file_names = host.listdir(os.path.join(host.curdir, server_folder))
        if only_missing_files:
-            return list_missing_files(server_folder, destination_folder, file_names)
+            return list_missing_files(
+                server_folder,
+                destination_folder,
+                file_names,
+            )
        else:
            return [
                os.path.join(
@@ -131,7 +157,8 @@ def split_fullname(author, switch_name_order=False):
 
    It accepts author strings with and without comma separation.
    As default surname is first in case of comma separation, otherwise last.
-    Multi-part surnames are incorrectly detected in strings without comma separation.
+    Multi-part surnames are incorrectly detected in strings without comma
+    separation.
    """
    if not author:
        return "", ""
@@ -191,7 +218,8 @@ def build_dict(seq, key):
    """
    Creates a dictionary from a list, using the specified key.
 
-    Used to make searching in a list of objects faster (get operations are O(1)).
+    Used to make searching in a list of objects faster (get operations are
+    O(1)).
    """
    return dict((d[key], dict(d, index=i)) for (i, d) in enumerate(seq))
@@ -225,7 +253,10 @@ def range_as_string(data):
    """
    data = [int(i) for i in data]
    ranges = []
-    for key, group in groupby(enumerate(data), lambda (index, item): index - item):
+    for key, group in groupby(
+        enumerate(data),
+        lambda (index, item): index - item
+    ):
        group = map(itemgetter(1), group)
        if len(group) > 1:
            rangestring = "{}-{}".format(str(group[0]), str(group[-1]))
@@ -279,7 +310,12 @@ def get_journal_and_section(publication):
        if split_pub[-1] in possible_sections:
            section = split_pub.pop(-1)
            journal_title = "".join(
-                [word for word in split_pub if "section" not in word.lower()]).strip(", ")
+                [
+                    word
+                    for word in split_pub
+                    if "section" not in word.lower()
+                ]
+            ).strip(", ")
    except IndexError:
        pass
 
@@ -295,8 +331,10 @@ def get_licenses(
    Args:
        license_url(str): Url of the license to generate.
-        license_text(str): Text with the description of the license (sometimes is
-        all we got...).
+
+        license_text(str): Text with the description of the license (sometimes
+            is all we got...).
+
        license_material(str): Material of the license.
    Returns:
 
@@ -353,7 +391,9 @@ class RecordFile(object):
    Args:
        path(str): local path to the file.
-        name(str): Optional, name of the file, if not passed, will use the name in the path.
+
+        name(str): Optional, name of the file; if not passed, will use the
+            name in the ``path``.
 
    Raises:
        PathDoesNotExist:
    """
    def __init__(self, path, name=None):
        self.path = path
        if not os.path.exists(self.path):
-            raise PathDoesNotExist("The given record file path '%s' does not exist." % self.path)
+            raise PathDoesNotExist(
+                "The given record file path '%s' does not exist." % self.path
+            )
 
        if name is None:
            name = os.path.basename(path)
@@ -373,22 +415,35 @@ class ParsedItem(dict):
    """Each of the individual items returned by the spider to the pipeline.
 
    Args:
-        record(dict): Information about the crawled record, might be in different formats.
-        record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
-        file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
-        ftp_params(dict): Parameter for the ``FftFilesPipeline`` to be able to connect to the
-        ftp server, if any.
-        record_files(list(RecordFile)): files attached to the record, usually populated by
-        ``FftFilesPipeline`` from the ``file_urls`` parameter.
+        record(dict): Information about the crawled record; might be in
+            different formats.
+
+        record_format(str): Format of the above record, for example ``"hep"``
+            or ``"hepcrawl"``.
+
+        file_urls(list(str)): URLs to the files to be downloaded by
+            ``FftFilesPipeline``.
+
+        ftp_params(dict): Parameters for the
+            :class:`hepcrawl.pipelines.FftFilesPipeline` to be able to connect
+            to the ftp server, if any.
+
+        record_files(list(RecordFile)): files attached to the record, usually
+            populated by :class:`hepcrawl.pipelines.FftFilesPipeline` from the
+            ``file_urls`` parameter.
+
+    Attributes:
+        *: this class implements ``__getattr__``, allowing any of its
+            elements to be accessed as attributes.
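+
+    Example:
+        A quick sketch of the attribute access (the record content is only
+        illustrative)::
+
+            >>> item = ParsedItem(record={'titles': []}, record_format='hep')
+            >>> item.record_format
+            'hep'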
""" def __init__( - self, - record, - record_format, - file_urls=None, - ftp_params=None, - record_files=None, - **kwargs + self, + record, + record_format, + file_urls=None, + ftp_params=None, + record_files=None, + **kwargs ): super(ParsedItem, self).__init__( record=record, @@ -402,7 +457,10 @@ def __init__( def __getattr__(self, key): if key not in self: raise AttributeError( - "'%s' object has no attribute '%s'" % (self.__class__.__name__, key) + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, + key, + ) ) return self[key] From 82fe872f3add8f285fa7bef919f56fb7a1f55616 Mon Sep 17 00:00:00 2001 From: David Caro Date: Wed, 23 Aug 2017 19:12:39 +0200 Subject: [PATCH 10/11] desy: add some missing config and minor fixes Signed-off-by: David Caro --- hepcrawl/settings.py | 5 +++++ hepcrawl/spiders/desy_spider.py | 33 +++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index bd16d8cd..31d608bf 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -123,6 +123,11 @@ # ==== JOBDIR = "jobs" +# Marc to HEP conversion settings (Desy) +MARC_TO_HEP_SETTINGS = { + 'LEGACY_BASE_URL': 'https://inspirehep.net', + 'SERVER_NAME': 'https://labs.inspirehep.net', +} # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index 117bede1..ec70ec39 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -11,14 +11,13 @@ import os -from lxml import etree from dojson.contrib.marc21.utils import create_record -from six.moves import urllib - +from flask.app import Flask +from inspire_dojson.hep import hep +from lxml import etree from scrapy import Request from scrapy.spiders import Spider - -from inspire_dojson.hep import hep +from six.moves import urllib from ..utils import ( ftp_list_files, @@ -71,8 +70,6 @@ class DesySpider(Spider): $ scrapy crawl desy -a 'source_folder=/path/to/package_dir' """ name = 'desy' - custom_settings = {} - start_urls = [] def __init__( self, @@ -90,7 +87,10 @@ def __init__( self.ftp_netrc = ftp_netrc self.source_folder = source_folder self.destination_folder = destination_folder - self.ftp_enabled = True if self.ftp_host else False + self.ftp_enabled = False if self.source_folder else True + + if self.ftp_enabled and not self.ftp_host: + raise Exception('You need to pass source_folder or ftp_host.') if not os.path.exists(self.destination_folder): os.makedirs(self.destination_folder) @@ -169,10 +169,10 @@ def handle_package_ftp(self, response): def start_requests(self): """List selected folder on remote FTP and yield files.""" - if self.source_folder: - requests = self.crawl_local_directory() - else: + if self.ftp_enabled: requests = self.crawl_ftp_directory() + else: + requests = self.crawl_local_directory() for request in requests: yield request @@ -240,11 +240,16 @@ def _get_marcxml_records(response_body): return [etree.tostring(item) for item in list_items] - @staticmethod - def _hep_records_from_marcxml(marcxml_records): + def _hep_records_from_marcxml(self, marcxml_records): def _create_json_record(xml_record): object_record = create_record(etree.XML(xml_record)) - dojson_record = hep.do(object_record) + app = Flask('hepcrawl') + app.config.update( + self.settings.getdict('MARC_TO_HEP_SETTINGS', {}) + ) + with app.app_context(): + dojson_record = hep.do(object_record) + return 
dojson_record hep_records = [] From ec4bb9aa0e6a8bc9255745e68434409c12649b39 Mon Sep 17 00:00:00 2001 From: David Caro Date: Thu, 24 Aug 2017 14:21:09 +0200 Subject: [PATCH 11/11] test.desy: fix urls and fixtures Signed-off-by: David Caro --- .../desy/fixtures/desy_ftp_records.json | 8 +- .../desy/fixtures/desy_local_records.json | 8 +- tests/functional/desy/test_desy.py | 65 +++-- tests/unit/test_desy.py | 251 ++++++++++++------ 4 files changed, 223 insertions(+), 109 deletions(-) diff --git a/tests/functional/desy/fixtures/desy_ftp_records.json b/tests/functional/desy/fixtures/desy_ftp_records.json index 0ffb18d1..8f7302c4 100644 --- a/tests/functional/desy/fixtures/desy_ftp_records.json +++ b/tests/functional/desy/fixtures/desy_ftp_records.json @@ -34,7 +34,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/111111" + "$ref": "https://labs.inspirehep.net/api/literature/111111" }, "number_of_pages": 6, "titles": [ @@ -111,7 +111,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/222222" + "$ref": "https://labs.inspirehep.net/api/literature/222222" }, "number_of_pages": 6, "titles": [ @@ -188,7 +188,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/333333" + "$ref": "https://labs.inspirehep.net/api/literature/333333" }, "number_of_pages": 6, "titles": [ @@ -265,7 +265,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/444444" + "$ref": "https://labs.inspirehep.net/api/literature/444444" }, "number_of_pages": 6, "titles": [ diff --git a/tests/functional/desy/fixtures/desy_local_records.json b/tests/functional/desy/fixtures/desy_local_records.json index 4197a456..02f8fad2 100644 --- a/tests/functional/desy/fixtures/desy_local_records.json +++ b/tests/functional/desy/fixtures/desy_local_records.json @@ -34,7 +34,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/111111" + "$ref": "https://labs.inspirehep.net/api/literature/111111" }, "number_of_pages": 6, "titles": [ @@ -111,7 +111,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/222222" + "$ref": "https://labs.inspirehep.net/api/literature/222222" }, "number_of_pages": 6, "titles": [ @@ -188,7 +188,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/333333" + "$ref": "https://labs.inspirehep.net/api/literature/333333" }, "number_of_pages": 6, "titles": [ @@ -265,7 +265,7 @@ } ], "self": { - "$ref": "http://inspirehep.net/api/literature/444444" + "$ref": "https://labs.inspirehep.net/api/literature/444444" }, "number_of_pages": 6, "titles": [ diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index 5c3f4929..464907de 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -28,12 +28,14 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record -def compare_two_files_using_md5(file_1, file_2): +def assert_files_equal(file_1, file_2): """Compares two files calculating the md5 hash.""" def _generate_md5_hash(file_path): hasher = hashlib.md5() @@ -42,11 +44,13 @@ def _generate_md5_hash(file_path): hasher.update(buf) return hasher.hexdigest() - return _generate_md5_hash(file_1) == _generate_md5_hash(file_2) + file_1_hash = _generate_md5_hash(file_1) + file_2_hash = _generate_md5_hash(file_2) + assert 
file_1_hash == file_2_hash @pytest.fixture(scope="function") -def get_fft_1_path(): +def fft_1_path(): return get_test_suite_path( 'desy', 'fixtures', @@ -59,7 +63,7 @@ def get_fft_1_path(): @pytest.fixture(scope="function") -def get_fft_2_path(): +def fft_2_path(): return get_test_suite_path( 'desy', 'fixtures', @@ -72,7 +76,7 @@ def get_fft_2_path(): @pytest.fixture(scope="function") -def set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'desy', 'fixtures', @@ -81,7 +85,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). sleep(10) yield { @@ -98,7 +103,7 @@ def set_up_ftp_environment(): @pytest.fixture(scope="function") -def set_up_local_environment(): +def local_environment(): package_location = get_test_suite_path( 'desy', 'fixtures', @@ -133,12 +138,14 @@ def set_up_local_environment(): ] ) def test_desy_ftp( - set_up_ftp_environment, + ftp_environment, expected_results, - get_fft_1_path, - get_fft_2_path, + fft_1_path, + fft_2_path, ): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -146,14 +153,17 @@ def test_desy_ftp( monitor_iter_limit=100, events_limit=2, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='desy', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) + for expected in expected_results + ] assert sorted(gotten_results) == expected_results @@ -161,8 +171,8 @@ def test_desy_ftp( for record in expected_results: fft_file_paths = sorted(record['_fft']) - assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) - assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) + assert_files_equal(fft_file_paths[0]['path'], fft_2_path) + assert_files_equal(fft_file_paths[1]['path'], fft_1_path) @pytest.mark.parametrize( @@ -179,12 +189,12 @@ def test_desy_ftp( ] ) def test_desy_local_package_path( - set_up_local_environment, + local_environment, expected_results, - get_fft_1_path, - get_fft_2_path, + fft_1_path, + fft_2_path, ): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance(local_environment.get('CRAWLER_HOST_URL')) results = CeleryMonitor.do_crawl( app=celery_app, @@ -192,14 +202,17 @@ def test_desy_local_package_path( monitor_iter_limit=100, events_limit=2, crawler_instance=crawler, - project=set_up_local_environment.get('CRAWLER_PROJECT'), + project=local_environment.get('CRAWLER_PROJECT'), spider='desy', settings={}, - **set_up_local_environment.get('CRAWLER_ARGUMENTS') + **local_environment.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) + for expected in expected_results + ] assert sorted(gotten_results) == 
expected_results @@ -207,5 +220,5 @@ def test_desy_local_package_path( for record in expected_results: fft_file_paths = sorted(record['_fft']) - assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_2_path) - assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_1_path) + assert_files_equal(fft_file_paths[0]['path'], fft_2_path) + assert_files_equal(fft_file_paths[1]['path'], fft_1_path) diff --git a/tests/unit/test_desy.py b/tests/unit/test_desy.py index 35ed24e2..2b567191 100644 --- a/tests/unit/test_desy.py +++ b/tests/unit/test_desy.py @@ -9,21 +9,30 @@ from __future__ import absolute_import, division, print_function -import pytest import os +import pytest from scrapy.crawler import Crawler from scrapy.http import TextResponse +from scrapy.settings import Settings +from hepcrawl import settings from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import desy_spider - from hepcrawl.testlib.fixtures import fake_response_from_file def create_spider(): - crawler = Crawler(spidercls=desy_spider.DesySpider) - return desy_spider.DesySpider.from_crawler(crawler) + custom_settings = Settings() + custom_settings.setmodule(settings) + crawler = Crawler( + spidercls=desy_spider.DesySpider, + settings=custom_settings, + ) + return desy_spider.DesySpider.from_crawler( + crawler, + source_folder='idontexist_but_it_does_not_matter', + ) def get_records(response_file_name): @@ -60,7 +69,9 @@ def get_one_record(response_file_name): def override_generated_fields(record): record['acquisition_source']['datetime'] = '2017-05-04T17:49:07.975168' - record['acquisition_source']['submission_number'] = '5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + '5652c7f6190f11e79e8000224dabeaad' + ) return record @@ -83,9 +94,12 @@ def test_pipeline_record(generated_record): '_fft': [ { 'creation_datetime': '2017-06-27T09:43:17', - 'description': '00013 Decomposition of the problematic rotation curves in our ' - 'sample according to the best-fit \\textsc{core}NFW models. ' - 'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.', + 'description': ( + '00013 Decomposition of the problematic rotation curves ' + 'in our sample according to the best-fit ' + '\\textsc{core}NFW models. Colors and symbols are as in ' + 'Figure \\ref{fig:dc14_fits}.' + ), 'filename': 'cNFW_rogue_curves', 'format': '.txt', 'path': 'FFT/test_fft_1.txt', @@ -94,17 +108,21 @@ def test_pipeline_record(generated_record): }, { 'creation_datetime': '2017-06-27T09:43:16', - 'description': '00005 Comparison of the parameters of the best-fit DC14 models to ' - 'the cosmological halo mass-concentration relation from \\' - 'cite{dutton14} (left) and the stellar mass-halo mass relation ' - 'from \\cite{behroozi13} (right). The error bars correspond to the ' - 'extremal values of the multidimensional 68\\% confidence region ' - 'for each fit. The theoretical relations are shown as red lines ' - 'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by ' - 'the dark and light grey bands, respectively. The ' - 'mass-concentration relation from \\cite{maccio08} and the stellar' - ' mass-halo mass relation from \\cite{behroozi13} are also shown ' - 'as the black dashed lines.', + 'description': ( + '00005 Comparison of the parameters of the best-fit DC14 ' + 'models to the cosmological halo mass-concentration ' + 'relation from \\cite{dutton14} (left) and the stellar ' + 'mass-halo mass relation from \\cite{behroozi13} (right). 
' + 'The error bars correspond to the extremal values of the ' + 'multidimensional 68\\% confidence region for each fit. ' + 'The theoretical relations are shown as red lines and ' + 'their 1$\\sigma$ and 2$\\sigma$ scatter are represented ' + 'by the dark and light grey bands, respectively. The ' + 'mass-concentration relation from \\cite{maccio08} and ' + 'the stellar mass-halo mass relation from ' + '\\cite{behroozi13} are also shown as the black dashed ' + 'lines.' + ), 'filename': 'scalingRelations_DutBeh_DC14_all_Oh', 'format': '.txt', 'path': 'FFT/test_fft_2.txt', @@ -115,19 +133,24 @@ def test_pipeline_record(generated_record): 'abstracts': [ { 'source': 'Deutsches Elektronen-Synchrotron', - 'value': 'Dielectric laser acceleration of electrons has recently been\n' - ' demonstrated with significantly higher accelerating ' - 'gradients than other\n structure-based linear ' - 'accelerators. Towards the development of an integrated 1 MeV\n ' - ' electron accelerator based on dielectric laser accelerator ' - 'technologies,\n development in several relevant ' - 'technologies is needed. In this work, recent\n ' - 'developments on electron sources, bunching, accelerating, focussing, ' - 'deflecting and\n laser coupling structures are reported. ' - 'With an eye to the near future, components\n ' - 'required for a 1 MeV kinetic energy tabletop accelerator producing ' - 'sub-femtosecond\n electron bunches are outlined.\n ' - ' ' + 'value': ( + 'Dielectric laser acceleration of electrons has recently ' + 'been\n demonstrated with significantly ' + 'higher accelerating gradients than other\n ' + ' structure-based linear accelerators. Towards the ' + 'development of an integrated 1 MeV\n ' + 'electron accelerator based on dielectric laser ' + 'accelerator technologies,\n development ' + 'in several relevant technologies is needed. In this ' + 'work, recent\n developments on electron ' + 'sources, bunching, accelerating, focussing, deflecting ' + 'and\n laser coupling structures are ' + 'reported. With an eye to the near future, ' + 'components\n required for a 1 MeV kinetic ' + 'energy tabletop accelerator producing ' + 'sub-femtosecond\n electron bunches are ' + 'outlined.\n ' + ) } ], 'acquisition_source': { @@ -162,21 +185,26 @@ def test_pipeline_record(generated_record): } ], 'self': { - '$ref': 'http://inspirehep.net/api/literature/111111' + '$ref': 'https://labs.inspirehep.net/api/literature/111111' }, 'titles': [ { 'source': 'JACoW', - 'title': 'Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n ' - ' Acceleration (DLA) From the Source to Relativistic ' - 'Electrons\n ' + 'title': ( + 'Towards a Fully Integrated Accelerator on a Chip: ' + 'Dielectric Laser\n Acceleration (DLA) ' + 'From the Source to Relativistic Electrons\n ' + ) } ], 'urls': [ { 'description': 'Fulltext', - 'value': 'http://inspirehep.net/record/1608652/files/Towards a fully\n ' - ' integrated acc on a chip.pdf\n ' + 'value': ( + 'http://inspirehep.net/record/1608652/files/Towards ' + 'a fully\n integrated acc on a chip.pdf' + '\n ' + ) } ] } @@ -207,9 +235,12 @@ def test_pipeline_collection_records(generated_records): "_fft": [ { 'creation_datetime': '2017-06-27T09:43:17', - 'description': '00013 Decomposition of the problematic rotation curves in our ' - 'sample according to the best-fit \\textsc{core}NFW models. 
-                               'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+                'description': (
+                    '00013 Decomposition of the problematic rotation '
+                    'curves in our sample according to the best-fit '
+                    '\\textsc{core}NFW models. Colors and symbols are as '
+                    'in Figure \\ref{fig:dc14_fits}.'
+                ),
                 'filename': 'cNFW_rogue_curves',
                 'format': '.txt',
                 'path': 'FFT/test_fft_1.txt',
@@ -218,17 +249,22 @@
             },
             {
                 'creation_datetime': '2017-06-27T09:43:16',
-                'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
-                               'the cosmological halo mass-concentration relation from \\'
-                               'cite{dutton14} (left) and the stellar mass-halo mass relation '
-                               'from \\cite{behroozi13} (right). The error bars correspond to the '
-                               'extremal values of the multidimensional 68\\% confidence region '
-                               'for each fit. The theoretical relations are shown as red lines '
-                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
-                               'the dark and light grey bands, respectively. The '
-                               'mass-concentration relation from \\cite{maccio08} and the stellar'
-                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
-                               'as the black dashed lines.',
+                'description': (
+                    '00005 Comparison of the parameters of the best-fit '
+                    'DC14 models to the cosmological halo '
+                    'mass-concentration relation from \\cite{dutton14} '
+                    '(left) and the stellar mass-halo mass relation from '
+                    '\\cite{behroozi13} (right). The error bars correspond'
+                    ' to the extremal values of the multidimensional 68\\%'
+                    ' confidence region for each fit. The theoretical '
+                    'relations are shown as red lines and their '
+                    '1$\\sigma$ and 2$\\sigma$ scatter are represented '
+                    'by the dark and light grey bands, respectively. The '
+                    'mass-concentration relation from \\cite{maccio08} '
+                    'and the stellar mass-halo mass relation from '
+                    '\\cite{behroozi13} are also shown as the black '
+                    'dashed lines.'
+                ),
                 'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
                 'format': '.txt',
                 'path': 'FFT/test_fft_2.txt',
@@ -243,19 +279,28 @@
             }
         ],
         "self": {
-            "$ref": "http://inspirehep.net/api/literature/111111"
+            "$ref": "https://labs.inspirehep.net/api/literature/111111"
        },
         "number_of_pages": 6,
         "titles": [
             {
                 "source": "JACoW",
-                "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n                    Acceleration (DLA) From the Source to Relativistic Electrons\n                    "
+                "title": (
+                    'Towards a Fully Integrated Accelerator on a Chip: '
+                    'Dielectric Laser\n                    Acceleration (DLA) '
+                    'From the Source to Relativistic Electrons'
+                    '\n                    '
+                )
             }
         ],
         "urls": [
             {
                 "description": "Fulltext",
-                "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n                    integrated acc on a chip.pdf\n                    "
+                "value": (
+                    'http://inspirehep.net/record/1608652/files/'
+                    'Towards a fully\n                    integrated acc on a '
+                    'chip.pdf\n                    '
+                )
             }
         ],
         "dois": [
@@ -280,7 +325,25 @@
         "abstracts": [
             {
                 "source": "Deutsches Elektronen-Synchrotron",
-                "value": "Dielectric laser acceleration of electrons has recently been\n                    demonstrated with significantly higher accelerating gradients than other\n                    structure-based linear accelerators. Towards the development of an integrated 1 MeV\n                    electron accelerator based on dielectric laser accelerator technologies,\n                    development in several relevant technologies is needed. In this work, recent\n                    developments on electron sources, bunching, accelerating, focussing, deflecting and\n                    laser coupling structures are reported. With an eye to the near future, components\n                    required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n                    electron bunches are outlined.\n                "
+                "value": (
+                    "Dielectric laser acceleration of electrons has "
+                    "recently been\n                    demonstrated with "
+                    "significantly higher accelerating gradients than "
+                    "other\n                    structure-based linear "
+                    "accelerators. Towards the development of an "
+                    "integrated 1 MeV\n                    electron "
+                    "accelerator based on dielectric laser accelerator "
+                    "technologies,\n                    development in "
+                    "several relevant technologies is needed. In this work"
+                    ", recent\n                    developments on electron "
+                    "sources, bunching, accelerating, focussing, "
+                    "deflecting and\n                    laser coupling "
+                    "structures are reported. With an eye to the near "
+                    "future, components\n                    required for a 1 "
+                    "MeV kinetic energy tabletop accelerator producing sub"
+                    "-femtosecond\n                    electron bunches are "
+                    "outlined.\n                "
+                )
             }
         ]
     },
@@ -297,9 +360,12 @@
         "_fft": [
             {
                 'creation_datetime': '2017-06-27T09:43:17',
-                'description': '00013 Decomposition of the problematic rotation curves in our '
-                               'sample according to the best-fit \\textsc{core}NFW models. '
-                               'Colors and symbols are as in Figure \\ref{fig:dc14_fits}.',
+                'description': (
+                    "00013 Decomposition of the problematic rotation "
+                    "curves in our sample according to the best-fit "
+                    "\\textsc{core}NFW models. Colors and symbols are as "
+                    "in Figure \\ref{fig:dc14_fits}."
+                ),
                 'filename': 'cNFW_rogue_curves',
                 'format': '.txt',
                 'path': 'FFT/test_fft_1.txt',
@@ -308,17 +374,22 @@
             },
             {
                 'creation_datetime': '2017-06-27T09:43:16',
-                'description': '00005 Comparison of the parameters of the best-fit DC14 models to '
-                               'the cosmological halo mass-concentration relation from \\'
-                               'cite{dutton14} (left) and the stellar mass-halo mass relation '
-                               'from \\cite{behroozi13} (right). The error bars correspond to the '
-                               'extremal values of the multidimensional 68\\% confidence region '
-                               'for each fit. The theoretical relations are shown as red lines '
-                               'and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by '
-                               'the dark and light grey bands, respectively. The '
-                               'mass-concentration relation from \\cite{maccio08} and the stellar'
-                               ' mass-halo mass relation from \\cite{behroozi13} are also shown '
-                               'as the black dashed lines.',
+                'description': (
+                    '00005 Comparison of the parameters of the best-fit '
+                    'DC14 models to the cosmological halo '
+                    'mass-concentration relation from \\cite{dutton14} '
+                    '(left) and the stellar mass-halo mass relation '
+                    'from \\cite{behroozi13} (right). The error bars '
+                    'correspond to the extremal values of the '
+                    'multidimensional 68\\% confidence region for each '
+                    'fit. The theoretical relations are shown as red '
+                    'lines and their 1$\\sigma$ and 2$\\sigma$ scatter '
+                    'are represented by the dark and light grey bands, '
+                    'respectively. The mass-concentration relation '
+                    'from \\cite{maccio08} and the stellar mass-halo '
+                    'mass relation from \\cite{behroozi13} are also '
+                    'shown as the black dashed lines.'
+                ),
                 'filename': 'scalingRelations_DutBeh_DC14_all_Oh',
                 'format': '.txt',
                 'path': 'FFT/test_fft_2.txt',
@@ -333,19 +404,28 @@
             }
         ],
         "self": {
-            "$ref": "http://inspirehep.net/api/literature/222222"
+            "$ref": "https://labs.inspirehep.net/api/literature/222222"
         },
         "number_of_pages": 6,
         "titles": [
             {
                 "source": "JACoW",
-                "title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n                    Acceleration (DLA) From the Source to Relativistic Electrons\n                    "
+                "title": (
+                    "Towards a Fully Integrated Accelerator on a Chip: "
+                    "Dielectric Laser\n                    Acceleration "
+                    "(DLA) From the Source to Relativistic Electrons"
+                    "\n                    "
+                )
             }
         ],
         "urls": [
             {
                 "description": "Fulltext",
-                "value": "http://inspirehep.net/record/1608652/files/Towards a fully\n                    integrated acc on a chip.pdf\n                    "
+                "value": (
+                    "http://inspirehep.net/record/1608652/files/"
+                    "Towards a fully\n                    integrated acc on a "
+                    "chip.pdf\n                    "
+                )
             }
         ],
         "dois": [
@@ -370,12 +450,33 @@
         "abstracts": [
             {
                 "source": "Deutsches Elektronen-Synchrotron",
-                "value": "Dielectric laser acceleration of electrons has recently been\n                    demonstrated with significantly higher accelerating gradients than other\n                    structure-based linear accelerators. Towards the development of an integrated 1 MeV\n                    electron accelerator based on dielectric laser accelerator technologies,\n                    development in several relevant technologies is needed. In this work, recent\n                    developments on electron sources, bunching, accelerating, focussing, deflecting and\n                    laser coupling structures are reported. With an eye to the near future, components\n                    required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n                    electron bunches are outlined.\n                "
+                "value": (
+                    "Dielectric laser acceleration of electrons has "
+                    "recently been\n                    demonstrated with "
+                    "significantly higher accelerating gradients than "
+                    "other\n                    structure-based linear "
+                    "accelerators. Towards the development of an "
+                    "integrated 1 MeV\n                    electron "
+                    "accelerator based on dielectric laser accelerator "
+                    "technologies,\n                    development in "
+                    "several relevant technologies is needed. In this "
+                    "work, recent\n                    developments on "
+                    "electron sources, bunching, accelerating, "
+                    "focussing, deflecting and\n                    laser "
+                    "coupling structures are reported. With an eye to "
+                    "the near future, components\n                    "
+                    "required for a 1 MeV kinetic energy tabletop "
+                    "accelerator producing sub-femtosecond"
+                    "\n                    electron bunches are outlined."
+                    "\n                "
+                )
             }
         ]
     }
 ]
 
-    generated_results = [override_generated_fields(rec) for rec in generated_records]
+    generated_results = [
+        override_generated_fields(rec) for rec in generated_records
+    ]
 
     assert generated_results == expected