-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from nueces/master
some bits for the xml importer
- Loading branch information
Showing
6 changed files
with
199 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
|
||
from zope.interface import classProvides, implements | ||
from collective.transmogrifier.interfaces import ISectionBlueprint | ||
from collective.transmogrifier.interfaces import ISection | ||
from collective.transmogrifier.utils import resolvePackageReferenceOrFile | ||
|
||
|
||
class DirectorySource(object):
    """Source section that yields the contents of the files in a directory.

    Items produced by the previous section are passed through unchanged
    first. Afterwards the whole content of every file in the configured
    directory whose name ends with the configured suffix is yielded as a
    string, one file at a time, in sorted filename order.
    """
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        """ Takes two options:
        directory: A full path to a directory or a relative path inside a
        package in the form collective.example:datadir.
        suffix: The extension of files that should be processed, with or
        without the leading dot. Optional; when it is missing or empty
        every file in the directory is processed.
        """
        self.previous = previous
        self.directory = resolvePackageReferenceOrFile(options['directory'])

        # strip(), not split(): split() returns a list, and formatting a
        # list into the suffix produced ".['xml']", which never matches.
        # NOTE(review): assumes ``options`` supports dict-style ``get``.
        suffix = (options.get('suffix') or '').strip().lstrip('.')
        # An empty suffix matches every filename in str.endswith().
        self.suffix = ".{0}".format(suffix) if suffix else ''

    def __iter__(self):
        for item in self.previous:
            yield item

        # os.listdir() returns names in arbitrary order; sort them so the
        # import order is reproducible between runs.
        for filename in sorted(os.listdir(self.directory)):
            if not filename.endswith(self.suffix):
                continue
            filepath = os.path.join(self.directory, filename)
            # Read and close the file before yielding so no handle stays
            # open while the generator is suspended.
            with open(filepath, 'r') as source_file:
                content = source_file.read()
            yield content
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
[transmogrifier] | ||
pipeline = | ||
sourcedirectory | ||
xmlprocessor | ||
constructor | ||
|
||
[sourcedirectory] | ||
blueprint = collective.nitf.import.sourcedirectory | ||
directory = collective.nitf:data | ||
|
||
[xmlprocessor] | ||
blueprint = collective.transmogrifier.constructor | ||
|
||
[folders] | ||
blueprint = collective.transmogrifier.sections.folders | ||
|
||
[constructor] | ||
blueprint = collective.transmogrifier.constructor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import xml.etree.ElementTree as etree | ||
|
||
from zope.interface import classProvides, implements | ||
from collective.transmogrifier.interfaces import ISection | ||
from collective.transmogrifier.interfaces import ISectionBlueprint | ||
|
||
#from collective.nitf.content import genre_default_value | ||
#from collective.nitf.content import section_default_value | ||
#from collective.nitf.content import urgency_default_value | ||
|
||
|
||
def get_text(dom, subelemet, attribute=None):
    """ Return the text of a sub-element, or the value of one of its
    attributes.

    dom: element to search from; None is accepted and yields ''.
    subelemet: path passed to ``dom.find`` to locate the sub-element.
    attribute: when given, the value of that attribute is returned instead
    of the element text.

    Always returns a string: '' when the sub-element is missing, when it
    has no text, or when it does not carry the requested attribute.
    """
    if dom is None:
        return ''

    elem = dom.find(subelemet)
    # Compare with None explicitly: an element without children is falsy.
    if elem is None:
        return ''

    if attribute is None:
        # elem.text is None for an element with no text content.
        return elem.text or ''

    return elem.get(attribute, '')
|
||
|
||
def get_date_path(dom, subelemet, attribute):
    """ Return a path 'YYYY/MM/DD' based on a date value normalized into
    ISO-8601.

    The date is read with ``get_text(dom, subelemet, attribute)``. Both the
    basic form (YYYYMMDD...) and the extended form (YYYY-MM-DD...) are
    accepted; anything after the day is ignored.

    Returns '' when the value is missing or does not start with a full
    eight-digit date, instead of a path made only of slashes.
    """
    text = get_text(dom, subelemet, attribute) or ''
    # Dropping the hyphens lets the extended form share the slicing below.
    # We only need the YYYYMMDD part from the string.
    digits = text.strip().replace('-', '')[:8]
    if len(digits) < 8 or not digits.isdigit():
        return ''

    return "/".join([digits[:4], digits[4:6], digits[6:8]])
|
||
|
||
class XMLSource(object):
    """ Process a string containing an XML representation of a nitf object.

    Every value coming from the previous section is parsed as XML and
    turned into a dict with the keys 'id', 'path', 'title', 'subtitle',
    'description', 'byline', 'text', 'genre', 'section', 'urgency',
    'location' and 'media'. 'media' holds two lists, 'image' and 'video',
    of dicts built from each media-reference's attributes plus a
    'media-caption' entry.
    """
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        self.previous = previous

    def __iter__(self):
        for data in self.previous:
            item = {'id': '',
                    'path': '',
                    'title': '',
                    'subtitle': '',
                    'description': '',
                    'byline': '',
                    'text': '',
                    'genre': '',
                    'section': '',
                    'urgency': '',
                    'location': '',
                    'media': {'image': [],
                              'video': []}
                    }

            dom = etree.fromstring(data)
            head = dom.find('head')
            body = dom.find('body')

            item['id'] = get_text(head, 'docdata/doc-id', 'id-string').lower()
            item['path'] = get_date_path(head, 'docdata/date.release', 'norm')
            item['title'] = get_text(head, 'title')
            item['genre'] = get_text(head, 'tobject/tobject.property',
                                     'tobject.property.type')
            item['section'] = get_text(head, 'pubdata', 'position.section')
            item['urgency'] = get_text(head, 'docdata/urgency', 'ed-urg')

            item['location'] = get_text(body, 'body.head/dateline/location')
            item['subtitle'] = get_text(body, 'body.head/hedline/hl2')
            item['description'] = get_text(body, 'body.head/abstract')
            item['byline'] = get_text(body, 'body.head/byline/person')

            # A document without body.content simply has no text or media.
            # Compare with None: an element without children is falsy.
            content = body.find('body.content')
            children = list(content) if content is not None else []

            text_parts = []
            for elem in children:
                media_type = elem.get('media-type')
                # 'image' and 'video' are the keys of item['media']; any
                # other media type falls through to the body text.
                if elem.tag == 'media' and media_type in item['media']:
                    reference = elem.find('media-reference')
                    # Copy the attributes of the reference; dict() over the
                    # element itself would iterate its child elements.
                    if reference is not None:
                        media = dict(reference.attrib)
                    else:
                        media = {}
                    media['media-caption'] = get_text(elem, 'media-caption')
                    item['media'][media_type].append(media)

                else:  # other tags are considered part of the body text
                       # and should be preserved.
                    text_parts.append(etree.tostring(elem))

            item['text'] = ''.join(text_parts)

            yield item