Skip to content

Commit

Permalink
Merge pull request #3 from nueces/master
Browse files Browse the repository at this point in the history
some bits for the xml importer
  • Loading branch information
hvelarde committed Apr 13, 2012
2 parents 0c10da7 + fbae0bc commit ddb0610
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 13 deletions.
30 changes: 29 additions & 1 deletion buildout.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[buildout]
extensions =
buildout.dumppickedversions
buildout.threatlevel

develop = .
unzip = true
Expand All @@ -11,6 +10,9 @@ parts =
omelette
test
zopepy
pylint
pyflakes
pep8

extends =
http://dist.plone.org/release/4.1.2/versions.cfg
Expand Down Expand Up @@ -55,3 +57,29 @@ recipe = zc.recipe.egg
eggs = ${instance:eggs}
interpreter = zopepy
scripts = zopepy

[pylint]
recipe = zc.recipe.egg
eggs =
pylint
${instance:eggs}
entry-points = pylint=pylint.lint:Run
scripts = pylint
arguments = [
'--output-format=colorized',
'--zope=y',
'--reports=no',
#Suppress certain errors (interfaces missing __init__, invalid imports etc)
'--disable=E0611,F0401,W0232',
] + sys.argv[1:]

[pyflakes]
recipe = zc.recipe.egg
scripts = pyflakes
eggs = pyflakes
#pyflakes reads sys.argv directly
entry-points = pyflakes=pyflakes.scripts.pyflakes:main

[pep8]
recipe = zc.recipe.egg
eggs = pep8
27 changes: 17 additions & 10 deletions src/transmogrify/nitf/configure.zcml
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,36 @@
configuration="migrator.cfg"
/>

<transmogrifier:registerConfig
name="nitfxmlimport"
title="Import XML files into NITF objects"
description="Create NITF objects from XML files"
configuration="xmlimport.cfg"
/>

<utility
component="collective.nitf.migrator.NewsItemSource"
name="collective.nitf.migrator.newsitemsource"
component="transmogrify.nitf.migrator.NewsItemSource"
name="transmogrify.nitf.migrator.newsitemsource"
/>

<utility
component="collective.nitf.migrator.SchemaUpdater"
name="collective.nitf.migrator.schemaupdater"
component="transmogrify.nitf.migrator.SchemaUpdater"
name="transmogrify.nitf.migrator.schemaupdater"
/>

<utility
component="collective.nitf.migrator.ImageMigrator"
name="collective.nitf.migrator.imagemigrator"
component="transmogrify.nitf.migrator.ImageMigrator"
name="transmogrify.nitf.migrator.imagemigrator"
/>

<utility
component="collective.nitf.migrator.ReplaceObject"
name="collective.nitf.migrator.replaceobject"
component="transmogrify.nitf.migrator.ReplaceObject"
name="transmogrify.nitf.migrator.replaceobject"
/>

<utility
component="collective.nitf.migrator.PrettyPrinter"
name="collective.nitf.migrator.pprinter"
component="transmogrify.nitf.migrator.PrettyPrinter"
name="transmogrify.nitf.migrator.pprinter"
/>

</configure>
37 changes: 37 additions & 0 deletions src/transmogrify/nitf/import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-

import os

from zope.interface import classProvides, implements
from collective.transmogrifier.interfaces import ISectionBlueprint
from collective.transmogrifier.interfaces import ISection
from collective.transmogrifier.utils import resolvePackageReferenceOrFile


class DirectorySource(object):
    """ Source section that yields the content of the files in a directory.

    Items produced by the previous section are passed through unchanged;
    afterwards the whole content of every matching file is yielded as a
    single string (one string per file).
    """
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        """ Takes two options:
            directory: A full path to a directory or a relative path inside a
                       package in the form collective.example:datadir.
            suffix: The extension of files that should be processed, with or
                    without the leading dot (e.g. ``xml`` or ``.xml``).
        """
        self.previous = previous
        self.directory = resolvePackageReferenceOrFile(options['directory'])
        # strip(), not split(): split() returns a list, which would be
        # formatted as ".['xml']" and never match any file name.
        extension = options['suffix'].strip().lstrip('.')
        self.suffix = ".{0}".format(extension)

    def __iter__(self):
        """ Yield the previous section's items, then each file's content.
        """
        for item in self.previous:
            yield item

        # os.listdir() returns names in arbitrary order; sort them so that
        # successive runs process the files in the same order.
        for filename in sorted(os.listdir(self.directory)):
            if not filename.endswith(self.suffix):
                continue
            filepath = os.path.join(self.directory, filename)
            with open(filepath, 'r') as source:
                yield source.read()
4 changes: 2 additions & 2 deletions src/transmogrify/nitf/migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from collective.transmogrifier.transmogrifier import Transmogrifier

from collective.nitf.content import INITF
from collective.nitf.content import kind_default_value
from collective.nitf.content import genre_default_value
from collective.nitf.content import section_default_value
from collective.nitf.content import urgency_default_value

Expand Down Expand Up @@ -111,7 +111,7 @@ def __iter__(self):
#obj.abstract = item['description']
obj.byline = ''
obj.text = RichTextValue(item['text'], 'text/html', 'text/x-html-safe')
obj.kind = kind_default_value(None)
obj.genre = genre_default_value(None)
obj.section = section_default_value(None)
obj.urgency = urgency_default_value(None)

Expand Down
18 changes: 18 additions & 0 deletions src/transmogrify/nitf/xmlimport.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[transmogrifier]
pipeline =
sourcedirectory
xmlprocessor
constructor

[sourcedirectory]
blueprint = collective.nitf.import.sourcedirectory
directory = collective.nitf:data

[xmlprocessor]
blueprint = collective.transmogrifier.constructor

[folders]
blueprint = collective.transmogrifier.sections.folders

[constructor]
blueprint = collective.transmogrifier.constructor
96 changes: 96 additions & 0 deletions src/transmogrify/nitf/xmlsource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as etree

from zope.interface import classProvides, implements
from collective.transmogrifier.interfaces import ISection
from collective.transmogrifier.interfaces import ISectionBlueprint

#from collective.nitf.content import genre_default_value
#from collective.nitf.content import section_default_value
#from collective.nitf.content import urgency_default_value


def get_text(dom, subelemet, attribute=None):
    """ Return the text of a sub-element, or the value of one of its
    attributes.

    dom: the element to search in; may be None, in which case '' is
         returned.
    subelemet: the path handed to ``dom.find()`` to locate the sub-element.
    attribute: when given, the value of this attribute of the sub-element is
               returned instead of the sub-element's text.

    Always returns a string: '' is returned when the sub-element is missing,
    when it has no text, or when the requested attribute is not set, so
    callers can safely chain string methods on the result.
    """
    if dom is None:
        return ''

    elem = dom.find(subelemet)
    if elem is None:
        return ''

    if attribute is None:
        # elem.text is None for an empty element such as <title/>
        return elem.text or ''

    return elem.get(attribute, '')


def get_date_path(dom, subelemet, attribute):
    """ Return a path 'YYYY/MM/DD' based on a date value normalized into
    ISO-8601.

    The value is read with ``get_text(dom, subelemet, attribute)``.  Both the
    basic (YYYYMMDD...) and the extended (YYYY-MM-DD...) date formats are
    accepted; anything after the date part is ignored.

    Returns '' when no date value is available, instead of the meaningless
    path '//'.
    """
    text = get_text(dom, subelemet, attribute)
    if not text:
        return ''

    # Keep only the date part and drop the hyphens of the extended format so
    # that the YYYYMMDD digits can be sliced by position.
    date_part = text.split('T', 1)[0].replace('-', '')
    return "/".join([date_part[:4], date_part[4:6], date_part[6:8]])


class XMLSource(object):
    """ Process a string containing an XML representation of a NITF object.

    Every item received from the previous section is parsed as XML and
    replaced by a dictionary with the keys 'id', 'path', 'title', 'subtitle',
    'description', 'byline', 'text', 'genre', 'section', 'urgency',
    'location' and 'media'.
    """
    classProvides(ISectionBlueprint)
    implements(ISection)

    def __init__(self, transmogrifier, name, options, previous):
        self.previous = previous

    def __iter__(self):
        """ Yield one dictionary per XML document received.
        """
        for data in self.previous:
            dom = etree.fromstring(data)
            head = dom.find('head')
            body = dom.find('body')

            # 'or ""' guards against a lookup that yields no value, so the
            # id can always be lower-cased.
            doc_id = get_text(head, 'docdata/doc-id', 'id-string') or ''

            item = {
                'id': doc_id.lower(),
                'path': get_date_path(head, 'docdata/date.release', 'norm'),
                'title': get_text(head, 'title'),
                'subtitle': get_text(body, 'body.head/hedline/hl2'),
                'description': get_text(body, 'body.head/abstract'),
                'byline': get_text(body, 'body.head/byline/person'),
                'text': '',
                'genre': get_text(head, 'tobject/tobject.property',
                                  'tobject.property.type'),
                'section': get_text(head, 'pubdata', 'position.section'),
                'urgency': get_text(head, 'docdata/urgency', 'ed-urg'),
                'location': get_text(body, 'body.head/dateline/location'),
                'media': {'image': [],
                          'video': []},
            }

            content = body.find('body.content')
            # A document without <body.content> simply has no text or media.
            children = list(content) if content is not None else []

            text_parts = []
            for elem in children:
                media_type = elem.get('media-type')
                if elem.tag == 'media' and media_type in item['media']:
                    # The attributes of <media-reference> describe the media;
                    # an Element itself cannot be converted with dict().
                    reference = elem.find('media-reference')
                    media = {}
                    if reference is not None:
                        media.update(reference.attrib)
                    media['media-caption'] = get_text(elem, 'media-caption')
                    item['media'][media_type].append(media)
                else:
                    # Other tags are considered part of the body text and
                    # should be preserved.
                    text_parts.append(etree.tostring(elem))

            item['text'] = ''.join(text_parts)

            yield item

0 comments on commit ddb0610

Please sign in to comment.