Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Mab Xml handler #2

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/oaiPmh/openOaiPmh-zdbIsil.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Example flow: feeds the OAI-PMH repository URL below into open-oaipmh
// (metadataPrefix "PicaPlus-xml", setSpec "bib", 2013-08-11 until 2013-08-12),
// passes the result through decode-xml and handle-picaxml, and writes it to
// stdout as multiline formeta.
default files = FLUX_DIR;

// beware: to use this URL your IP has to be allowed by registration
"http://services.d-nb.de/oai/repository" |
open-oaipmh(dateFrom="2013-08-11",dateUntil="2013-08-12",metadataPrefix="PicaPlus-xml",setSpec="bib") |
decode-xml |
handle-picaxml |
encode-formeta(style="multiline")|
write("stdout");
Binary file added examples/read/mab2/HT010726584.xml.bz2
Binary file not shown.
9 changes: 9 additions & 0 deletions examples/read/mab2/mabXml.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Example flow: opens HT010726584.xml.bz2 (name prefixed with FLUX_DIR) as a
// BZIP2-compressed file, passes it through decode-xml and handle-mabxml, and
// writes the result to stdout as multiline formeta.
default files = FLUX_DIR;

files+"HT010726584.xml.bz2"|
open-file(compression="BZIP2") |
decode-xml |
handle-mabxml |
encode-formeta(style="multiline")|
write("stdout");
// NOTE(review): the "};" below has no matching opening brace in this script -
// confirm that it is intended and accepted by the Flux parser.
};
Binary file added examples/read/pica/pica.xml.bz2
Binary file not shown.
9 changes: 9 additions & 0 deletions examples/read/pica/picaXml.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Example flow: opens pica.xml.bz2 (name prefixed with FLUX_DIR) as a
// BZIP2-compressed file, passes it through decode-xml and handle-picaxml, and
// writes the result to stdout as multiline formeta.
default files = FLUX_DIR;

files+"pica.xml.bz2"|
open-file(compression="BZIP2") |
decode-xml |
handle-picaxml |
encode-formeta(style="multiline")|
write("stdout");
// NOTE(review): the "};" below has no matching opening brace in this script -
// confirm that it is intended and accepted by the Flux parser.
};
Binary file added examples/read/xmlSplitter/gndRdf.xml.bz2
Binary file not shown.
9 changes: 9 additions & 0 deletions examples/read/xmlSplitter/xmlEntitySplitting.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Example flow: opens gndRdf.xml.bz2 (name prefixed with FLUX_DIR), passes it
// through decode-xml and split-xml (entityName "Description", top-level
// element "rdf:RDF"), then through extract-literals, and writes to stdout.
default files = FLUX_DIR;

files + "gndRdf.xml.bz2" |
open-file |
decode-xml|
split-xml(entityName="Description",toplevelelement="rdf:RDF")|
extract-literals|
write("stdout")
// NOTE(review): write("stdout") above carries no ";" of its own and the "};"
// below has no matching opening brace - presumably the ";" on that line ends
// the flow; confirm against the Flux grammar.
};
15 changes: 12 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,19 @@
</exclusion>
</exclusions>
</dependency>
</dependencies>



<dependency>
<groupId>org.dspace</groupId>
<artifactId>oclc-harvester2</artifactId>
<version>0.1.12</version>
</dependency>
<dependency>
<groupId>xalan</groupId>
<artifactId>xalan</artifactId>
<version>2.7.1</version>
</dependency>

</dependencies>

<profiles>
<profile>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/** Copyright 2013, 2014 hbz, Pascal Christoph.
* Licensed under the Eclipse Public License 1.0
**/

package org.culturegraph.mf.stream.converter.xml;

import org.culturegraph.mf.framework.DefaultXmlPipe;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.XmlReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Translates the SAX events of a MAB XML document into stream events.
 *
 * <ul>
 * <li>{@code ListRecords} opens and closes a record; the record id is always
 * the empty string.</li>
 * <li>{@code datafield} opens an entity whose name is the concatenation of
 * its {@code tag}, {@code ind1} and {@code ind2} attribute values.</li>
 * <li>{@code controlfield} opens an entity named after its {@code tag}
 * attribute and resets the literal name to the empty string; its trimmed
 * character data is emitted as one literal when the element closes.</li>
 * <li>{@code subfield} emits its trimmed character data as a literal named
 * after its {@code code} attribute.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): an attribute that is missing on a {@code datafield} ends up as
 * the text {@code "null"} in the entity name, because the attribute values are
 * joined by plain string concatenation. Confirm whether consumers rely on it.
 * </p>
 * <p>
 * NOTE(review): character data of a {@code leader} element is buffered, but no
 * literal is emitted for it when the element closes.
 * </p>
 *
 * @author Pascal Christoph (dr0i)
 *
 */
@Description("A MAB XML reader")
@In(XmlReceiver.class)
@Out(StreamReceiver.class)
public final class MabXmlHandler extends DefaultXmlPipe<StreamReceiver> {

    private static final String SUBFIELD = "subfield";
    private static final String DATAFIELD = "datafield";
    private static final String CONTROLLFIELD = "controlfield";
    private static final String RECORD = "ListRecords";
    private static final String LEADER = "leader";
    private static final String DATAFIELD_ATTRIBUTE = "tag";
    private static final String SUBFIELD_ATTRIBUTE = "code";
    private static final String INDICATOR1 = "ind1";
    private static final String INDICATOR2 = "ind2";

    /** Name used for the next literal. */
    private String currentTag = "";
    /** Character data collected since the last text-bearing element started. */
    private StringBuilder builder = new StringBuilder();

    @Override
    public void startElement(final String uri, final String localName, final String qName,
            final Attributes attributes) throws SAXException {
        if (CONTROLLFIELD.equals(localName)) {
            beginText("");
            getReceiver().startEntity(attributes.getValue(DATAFIELD_ATTRIBUTE));
            return;
        }
        if (SUBFIELD.equals(localName)) {
            beginText(attributes.getValue(SUBFIELD_ATTRIBUTE));
            return;
        }
        if (DATAFIELD.equals(localName)) {
            final String tag = attributes.getValue(DATAFIELD_ATTRIBUTE);
            final String firstIndicator = attributes.getValue(INDICATOR1);
            final String secondIndicator = attributes.getValue(INDICATOR2);
            getReceiver().startEntity(tag + firstIndicator + secondIndicator);
            return;
        }
        if (RECORD.equals(localName)) {
            getReceiver().startRecord("");
            return;
        }
        if (LEADER.equals(localName)) {
            beginText(LEADER);
        }
    }

    @Override
    public void characters(final char[] chars, final int start, final int length)
            throws SAXException {
        builder.append(chars, start, length);
    }

    @Override
    public void endElement(final String uri, final String localName, final String qName)
            throws SAXException {
        if (CONTROLLFIELD.equals(localName)) {
            emitLiteral();
            getReceiver().endEntity();
            return;
        }
        if (SUBFIELD.equals(localName)) {
            emitLiteral();
            return;
        }
        if (DATAFIELD.equals(localName)) {
            getReceiver().endEntity();
            return;
        }
        if (RECORD.equals(localName)) {
            getReceiver().endRecord();
        }
    }

    /** Starts a fresh text buffer and remembers the name of the next literal. */
    private void beginText(final String literalName) {
        builder = new StringBuilder();
        currentTag = literalName;
    }

    /** Sends the trimmed buffer content as a literal named by the current tag. */
    private void emitLiteral() {
        getReceiver().literal(currentTag, builder.toString().trim());
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/** Copyright 2013 hbz, Pascal Christoph.
* Licensed under the Eclipse Public License 1.0
**/

package org.culturegraph.mf.stream.converter.xml;

import java.text.Normalizer;

import org.culturegraph.mf.framework.DefaultXmlPipe;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.XmlReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Translates the SAX events of a PICA XML document into stream events.
 *
 * <ul>
 * <li>{@code record} elements in the namespace
 * {@code http://www.oclcpica.org/xmlns/ppxml-1.0} open and close a record; the
 * record id is always the empty string.</li>
 * <li>{@code tag} elements open an entity whose name is the concatenation of
 * the {@code id} and {@code occ} attribute values.</li>
 * <li>{@code subf} elements emit their trimmed, NFC-normalized character data
 * as a literal named after the {@code id} attribute.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): a {@code tag} element without an {@code occ} attribute yields
 * an entity name ending in the text {@code "null"}, because the values are
 * joined by plain string concatenation. Confirm whether consumers rely on it.
 * </p>
 * <p>
 * NOTE(review): a {@code global} element resets the text buffer, but no
 * literal is emitted for it when the element closes.
 * </p>
 *
 * @author Pascal Christoph (dr0i)
 *
 */
@Description("A pica xml reader")
@In(XmlReceiver.class)
@Out(StreamReceiver.class)
public final class PicaXmlHandler extends DefaultXmlPipe<StreamReceiver> {

    private static final String SUBFIELD = "subf";
    private static final String DATAFIELD = "tag";
    private static final String RECORD = "record";
    private static final String NAMESPACE =
            "http://www.oclcpica.org/xmlns/ppxml-1.0";
    private static final String LEADER = "global";

    /** Name used for the next literal. */
    private String currentTag = "";
    /** Character data collected since the last text-bearing element started. */
    private StringBuilder builder = new StringBuilder();

    @Override
    public void startElement(final String uri, final String localName,
            final String qName, final Attributes attributes) throws SAXException {
        if (SUBFIELD.equals(localName)) {
            beginText(attributes.getValue("id"));
            return;
        }
        if (DATAFIELD.equals(localName)) {
            final String id = attributes.getValue("id");
            final String occurrence = attributes.getValue("occ");
            getReceiver().startEntity(id + occurrence);
            return;
        }
        if (isRecord(uri, localName)) {
            getReceiver().startRecord("");
            return;
        }
        if (LEADER.equals(localName)) {
            beginText(LEADER);
        }
    }

    @Override
    public void characters(final char[] chars, final int start, final int length)
            throws SAXException {
        builder.append(chars, start, length);
    }

    @Override
    public void endElement(final String uri, final String localName,
            final String qName) throws SAXException {
        if (SUBFIELD.equals(localName)) {
            final String trimmed = builder.toString().trim();
            getReceiver().literal(currentTag,
                    Normalizer.normalize(trimmed, Normalizer.Form.NFC));
            return;
        }
        if (DATAFIELD.equals(localName)) {
            getReceiver().endEntity();
            return;
        }
        if (isRecord(uri, localName)) {
            getReceiver().endRecord();
        }
    }

    /** Starts a fresh text buffer and remembers the name of the next literal. */
    private void beginText(final String literalName) {
        builder = new StringBuilder();
        currentTag = literalName;
    }

    /** Tells whether the element is a {@code record} in the PICA namespace. */
    private static boolean isRecord(final String uri, final String localName) {
        return RECORD.equals(localName) && NAMESPACE.equals(uri);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/** Copyright 2013, 2014 hbz, Pascal Christoph.
* Licensed under the Eclipse Public License 1.0
**/
package org.culturegraph.mf.stream.converter.xml;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringEscapeUtils;
import org.culturegraph.mf.framework.DefaultXmlPipe;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.XmlReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * An XML entity splitter.
 *
 * <p>
 * Every element whose local name equals the configured entity name (see
 * {@link #setEntityName(String)}) is re-serialized, wrapped into the top-level
 * element and prefixed with the XML declaration. Each such document is emitted
 * as one record containing a single literal named {@code entity}. Records are
 * numbered consecutively, starting at 0. Entities nested inside an entity of
 * the same name stay part of the outer entity.
 * </p>
 * <p>
 * All prefixed namespace declarations seen so far are copied onto the
 * top-level element, in the order in which they were first reported.
 * </p>
 * <p>
 * NOTE(review): declarations of the default namespace (empty prefix) are not
 * copied, and a prefix that is bound to several URIs is declared once per URI.
 * Confirm whether the input documents can contain either case.
 * </p>
 * <p>
 * NOTE(review): the entity name has to be set before parsing starts; without
 * it the element callbacks fail with a {@link NullPointerException}.
 * </p>
 *
 * @author Pascal Christoph (dr0i)
 *
 */
@Description("Splits all entities (aka records) residing in one XML document into multiple single XML documents.")
@In(XmlReceiver.class)
@Out(StreamReceiver.class)
public final class XmlEntitySplitter extends DefaultXmlPipe<StreamReceiver> {

    private static final String XML_DECLARATION = "<?xml version = \"1.0\" encoding = \"UTF-8\"?>";

    /** Ready-to-append {@code xmlns:prefix="uri"} attributes, in first-seen order. */
    private final Set<String> namespaces = new LinkedHashSet<String>();
    private String entity;
    private String root;
    private StringBuilder builder = new StringBuilder();
    private boolean inEntity = false;
    /** Nesting level of elements named like the entity; 0 outside an entity. */
    private int entityDepth = 0;
    private int recordCnt = 0;

    /**
     * Returns the XML declaration that is put in front of every emitted
     * document. The declaration is hard coded.
     *
     * @return the XML declaration
     */
    public static String getXmlDeclaration() {
        return XML_DECLARATION;
    }

    /**
     * Sets the name of the entity. All these entities in the XML stream will be
     * XML documents on their own.
     *
     * @param name
     *            Identifies the entities
     */
    public void setEntityName(final String name) {
        this.entity = name;
    }

    /**
     * Sets the top-level XML document element. If it is not set, the qualified
     * name of the first element that is seen outside an entity and is not the
     * entity itself is used.
     *
     * @param name
     *            the element
     */
    public void setTopLevelElement(final String name) {
        this.root = name;
    }

    @Override
    public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
        super.startPrefixMapping(prefix, uri);
        if (!prefix.isEmpty() && uri != null) {
            // Escape the URI: it is written into an attribute value.
            namespaces.add(" xmlns:" + prefix + "=\"" + StringEscapeUtils.escapeXml(uri) + "\"");
        }
    }

    @Override
    public void startElement(final String uri, final String localName, final String qName,
            final Attributes attributes) throws SAXException {
        final boolean isEntity = entity.equals(localName);
        if (inEntity) {
            if (isEntity) {
                entityDepth++;
            }
            appendStartTag(qName, attributes);
            return;
        }
        if (isEntity) {
            builder = new StringBuilder();
            getReceiver().startRecord(String.valueOf(recordCnt++));
            inEntity = true;
            entityDepth++;
            appendStartTag(qName, attributes);
        } else if (root == null) {
            root = qName;
        }
    }

    @Override
    public void characters(final char[] chars, final int start, final int length)
            throws SAXException {
        // Text outside an entity never reaches the output, so it is not buffered.
        if (inEntity) {
            builder.append(StringEscapeUtils.escapeXml(new String(chars, start, length)));
        }
    }

    @Override
    public void endElement(final String uri, final String localName, final String qName)
            throws SAXException {
        if (!inEntity) {
            return;
        }
        builder.append("</").append(qName).append('>');
        if (!entity.equals(localName)) {
            return;
        }
        if (entityDepth > 1) {
            // A nested element of the same name closed; the outer entity is still open.
            entityDepth--;
            return;
        }
        emitRecord();
    }

    @Override
    public void onResetStream() {
        reset();
    }

    /** Serializes a start tag including its attributes into the buffer. */
    private void appendStartTag(final String qName, final Attributes attributes) {
        builder.append('<').append(qName);
        for (int i = 0; i < attributes.getLength(); i++) {
            builder.append(' ').append(attributes.getQName(i)).append("=\"")
                    .append(StringEscapeUtils.escapeXml(attributes.getValue(i))).append('"');
        }
        builder.append('>');
    }

    /** Wraps the buffered entity into the top-level element and emits it as a record. */
    private void emitRecord() {
        final StringBuilder document = new StringBuilder(XML_DECLARATION);
        document.append('<').append(root);
        for (final String namespace : namespaces) {
            document.append(namespace);
        }
        document.append('>').append(builder).append("</").append(root).append('>');
        getReceiver().literal("entity", document.toString());
        getReceiver().endRecord();
        reset();
    }

    /** Drops the buffered entity and leaves the in-entity state. */
    private void reset() {
        inEntity = false;
        builder = new StringBuilder();
        entityDepth = 0;
    }
}
Loading