diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 0f569b6..b39da75 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -27,5 +27,6 @@ Contributors - `Ben Swanson `__ - Jenine Turner-Trauring - `Jim White `__ - +- `Patrick Claus `__ + (and many others with helpful bug reports and questions!) diff --git a/python/CHANGES FOR PYTHON3.txt b/python/CHANGES FOR PYTHON3.txt new file mode 100644 index 0000000..fc4bd3d --- /dev/null +++ b/python/CHANGES FOR PYTHON3.txt @@ -0,0 +1,32 @@ +#AN EXPLANATION OF HOW THE CONVERSION OF BLLIPPARSER TO PYTHON3 WAS DONE# + +Downloaded the source containing setup.py for bllipparser +Converted all *.py files to python3 with 2to3 +Ran setup -> only with the extensions for Python +Copied the generated .so files (Charniak and Johnson) to the new python files +Checked that everything is converted to python3 +Downloaded a fitting model +Successfully tried 1-best parsing + +In RerankingParser.py, the following change was made to support 1-best parsing: + try: + parses = parser.parse(sentence.sentrep) + except RuntimeError: + parses = [] + #nbest_list = NBestList(sentence, parses, sentence_id) + #if rerank: + # nbest_list.rerank(self) + #return nbest_list + return parses + +Function call for bllipparser python3: +>>> from bllipparser_python3 import * +>>> rrp = RerankingParser.from_unified_model_dir('/somedirectory/models/WSJ/') +>>> sentence = rrp.parse('To Sherlock Holmes she is always THE woman.')[0][1] +>>> print(sentence) +(S1 (S (NP (NP (NNP To) (NNP Sherlock) (NNP Holmes)) (NP (PRP she))) (VP (VBZ is) (ADVP (RB always)) (NP (DT THE) (NN woman))) (. .))) +>>> + +Fiddled with the return states and checked that you get a string or list object for iteration over the top parsed sentences.
+ +#QUICK AND DIRTY!!# diff --git a/python/bllipparser_python3/CharniakParser.py b/python/bllipparser_python3/CharniakParser.py new file mode 100644 index 0000000..48c14ab --- /dev/null +++ b/python/bllipparser_python3/CharniakParser.py @@ -0,0 +1,657 @@ +# This file was automatically generated by SWIG (http://www.swig.org). +# Version 2.0.11 +# +# Do not make changes to this file unless you know what you are doing--modify +# the SWIG interface file instead. + + + + + +from sys import version_info +if version_info >= (2,6,0): + def swig_import_helper(): + from os.path import dirname + import imp + fp = None + try: + fp, pathname, description = imp.find_module('_CharniakParser', [dirname(__file__)]) + except ImportError: + import _CharniakParser + return _CharniakParser + if fp is not None: + try: + _mod = imp.load_module('_CharniakParser', fp, pathname, description) + finally: + fp.close() + return _mod + _CharniakParser = swig_import_helper() + del swig_import_helper +else: + import _CharniakParser +del version_info +def _swig_setattr_nondynamic(self,class_type,name,value,static=1): + if (name == "thisown"): return self.this.own(value) + if (name == "this"): + if type(value).__name__ == 'SwigPyObject': + self.__dict__[name] = value + return + method = class_type.__swig_setmethods__.get(name,None) + if method: return method(self,value) + if (not static): + self.__dict__[name] = value + else: + raise AttributeError("You cannot add attributes to %s" % self) + +def _swig_setattr(self,class_type,name,value): + return _swig_setattr_nondynamic(self,class_type,name,value,0) + +def _swig_getattr(self,class_type,name): + if (name == "thisown"): return self.this.own() + method = class_type.__swig_getmethods__.get(name,None) + if method: return method(self) + raise AttributeError(name) + +def _swig_repr(self): + try: strthis = "proxy of " + self.this.__repr__() + except: strthis = "" + return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) + 
+class SwigPyIterator: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, SwigPyIterator, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, SwigPyIterator, name) + def __init__(self, *args, **kwargs): raise AttributeError("No constructor defined - class is abstract") + __repr__ = _swig_repr + __swig_destroy__ = _CharniakParser.delete_SwigPyIterator + __del__ = lambda self : None; + def value(self): return _CharniakParser.SwigPyIterator_value(self) + def incr(self, n=1): return _CharniakParser.SwigPyIterator_incr(self, n) + def decr(self, n=1): return _CharniakParser.SwigPyIterator_decr(self, n) + def distance(self, *args): return _CharniakParser.SwigPyIterator_distance(self, *args) + def equal(self, *args): return _CharniakParser.SwigPyIterator_equal(self, *args) + def copy(self): return _CharniakParser.SwigPyIterator_copy(self) + def __next__(self): return _CharniakParser.SwigPyIterator_next(self) + def __next__(self): return _CharniakParser.SwigPyIterator___next__(self) + def previous(self): return _CharniakParser.SwigPyIterator_previous(self) + def advance(self, *args): return _CharniakParser.SwigPyIterator_advance(self, *args) + def __eq__(self, *args): return _CharniakParser.SwigPyIterator___eq__(self, *args) + def __ne__(self, *args): return _CharniakParser.SwigPyIterator___ne__(self, *args) + def __iadd__(self, *args): return _CharniakParser.SwigPyIterator___iadd__(self, *args) + def __isub__(self, *args): return _CharniakParser.SwigPyIterator___isub__(self, *args) + def __add__(self, *args): return _CharniakParser.SwigPyIterator___add__(self, *args) + def __sub__(self, *args): return _CharniakParser.SwigPyIterator___sub__(self, *args) + def __iter__(self): return self +SwigPyIterator_swigregister = _CharniakParser.SwigPyIterator_swigregister +SwigPyIterator_swigregister(SwigPyIterator) + + +def parse(*args): + return _CharniakParser.parse(*args) +parse = _CharniakParser.parse 
+ +def setOptions(*args): + return _CharniakParser.setOptions(*args) +setOptions = _CharniakParser.setOptions + +def tokenize(*args): + return _CharniakParser.tokenize(*args) +tokenize = _CharniakParser.tokenize + +def inputTreeFromString(*args): + return _CharniakParser.inputTreeFromString(*args) +inputTreeFromString = _CharniakParser.inputTreeFromString + +def inputTreesFromString(*args): + return _CharniakParser.inputTreesFromString(*args) +inputTreesFromString = _CharniakParser.inputTreesFromString + +def inputTreesFromFile(*args): + return _CharniakParser.inputTreesFromFile(*args) +inputTreesFromFile = _CharniakParser.inputTreesFromFile + +def sentRepsFromString(*args): + return _CharniakParser.sentRepsFromString(*args) +sentRepsFromString = _CharniakParser.sentRepsFromString + +def sentRepsFromFile(*args): + return _CharniakParser.sentRepsFromFile(*args) +sentRepsFromFile = _CharniakParser.sentRepsFromFile + +def asNBestList(*args): + return _CharniakParser.asNBestList(*args) +asNBestList = _CharniakParser.asNBestList + +def error(*args): + return _CharniakParser.error(*args) +error = _CharniakParser.error + +def ptbEscape(*args): + return _CharniakParser.ptbEscape(*args) +ptbEscape = _CharniakParser.ptbEscape + +def ptbUnescape(*args): + return _CharniakParser.ptbUnescape(*args) +ptbUnescape = _CharniakParser.ptbUnescape +class StringList: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, StringList, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, StringList, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.StringList_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.StringList___nonzero__(self) + def __bool__(self): return _CharniakParser.StringList___bool__(self) + def __len__(self): return _CharniakParser.StringList___len__(self) + def pop(self): return _CharniakParser.StringList_pop(self) + 
def __getslice__(self, *args): return _CharniakParser.StringList___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.StringList___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.StringList___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.StringList___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.StringList___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.StringList___setitem__(self, *args) + def append(self, *args): return _CharniakParser.StringList_append(self, *args) + def empty(self): return _CharniakParser.StringList_empty(self) + def size(self): return _CharniakParser.StringList_size(self) + def clear(self): return _CharniakParser.StringList_clear(self) + def swap(self, *args): return _CharniakParser.StringList_swap(self, *args) + def get_allocator(self): return _CharniakParser.StringList_get_allocator(self) + def begin(self): return _CharniakParser.StringList_begin(self) + def end(self): return _CharniakParser.StringList_end(self) + def rbegin(self): return _CharniakParser.StringList_rbegin(self) + def rend(self): return _CharniakParser.StringList_rend(self) + def pop_back(self): return _CharniakParser.StringList_pop_back(self) + def erase(self, *args): return _CharniakParser.StringList_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_StringList(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.StringList_push_back(self, *args) + def front(self): return _CharniakParser.StringList_front(self) + def back(self): return _CharniakParser.StringList_back(self) + def assign(self, *args): return _CharniakParser.StringList_assign(self, *args) + def resize(self, *args): return _CharniakParser.StringList_resize(self, *args) + def insert(self, *args): return _CharniakParser.StringList_insert(self, *args) + def 
pop_front(self): return _CharniakParser.StringList_pop_front(self) + def push_front(self, *args): return _CharniakParser.StringList_push_front(self, *args) + def reverse(self): return _CharniakParser.StringList_reverse(self) + __swig_destroy__ = _CharniakParser.delete_StringList + __del__ = lambda self : None; +StringList_swigregister = _CharniakParser.StringList_swigregister +StringList_swigregister(StringList) +cvar = _CharniakParser.cvar +max_sentence_length = cvar.max_sentence_length + +class SentRepList: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, SentRepList, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, SentRepList, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.SentRepList_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.SentRepList___nonzero__(self) + def __bool__(self): return _CharniakParser.SentRepList___bool__(self) + def __len__(self): return _CharniakParser.SentRepList___len__(self) + def pop(self): return _CharniakParser.SentRepList_pop(self) + def __getslice__(self, *args): return _CharniakParser.SentRepList___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.SentRepList___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.SentRepList___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.SentRepList___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.SentRepList___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.SentRepList___setitem__(self, *args) + def append(self, *args): return _CharniakParser.SentRepList_append(self, *args) + def empty(self): return _CharniakParser.SentRepList_empty(self) + def size(self): return _CharniakParser.SentRepList_size(self) + def clear(self): return _CharniakParser.SentRepList_clear(self) + def 
swap(self, *args): return _CharniakParser.SentRepList_swap(self, *args) + def get_allocator(self): return _CharniakParser.SentRepList_get_allocator(self) + def begin(self): return _CharniakParser.SentRepList_begin(self) + def end(self): return _CharniakParser.SentRepList_end(self) + def rbegin(self): return _CharniakParser.SentRepList_rbegin(self) + def rend(self): return _CharniakParser.SentRepList_rend(self) + def pop_back(self): return _CharniakParser.SentRepList_pop_back(self) + def erase(self, *args): return _CharniakParser.SentRepList_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_SentRepList(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.SentRepList_push_back(self, *args) + def front(self): return _CharniakParser.SentRepList_front(self) + def back(self): return _CharniakParser.SentRepList_back(self) + def assign(self, *args): return _CharniakParser.SentRepList_assign(self, *args) + def resize(self, *args): return _CharniakParser.SentRepList_resize(self, *args) + def insert(self, *args): return _CharniakParser.SentRepList_insert(self, *args) + def pop_front(self): return _CharniakParser.SentRepList_pop_front(self) + def push_front(self, *args): return _CharniakParser.SentRepList_push_front(self, *args) + def remove(self, *args): return _CharniakParser.SentRepList_remove(self, *args) + def unique(self): return _CharniakParser.SentRepList_unique(self) + def reverse(self): return _CharniakParser.SentRepList_reverse(self) + def sort(self): return _CharniakParser.SentRepList_sort(self) + def merge(self, *args): return _CharniakParser.SentRepList_merge(self, *args) + __swig_destroy__ = _CharniakParser.delete_SentRepList + __del__ = lambda self : None; +SentRepList_swigregister = _CharniakParser.SentRepList_swigregister +SentRepList_swigregister(SentRepList) + +class ScoredTreePair: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: 
_swig_setattr(self, ScoredTreePair, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, ScoredTreePair, name) + __repr__ = _swig_repr + def __init__(self, *args): + this = _CharniakParser.new_ScoredTreePair(*args) + try: self.this.append(this) + except: self.this = this + __swig_setmethods__["first"] = _CharniakParser.ScoredTreePair_first_set + __swig_getmethods__["first"] = _CharniakParser.ScoredTreePair_first_get + __swig_setmethods__["second"] = _CharniakParser.ScoredTreePair_second_set + __swig_getmethods__["second"] = _CharniakParser.ScoredTreePair_second_get + def __len__(self): return 2 + def __repr__(self): return str((self.first, self.second)) + def __getitem__(self, index): + if not (index % 2): + return self.first + else: + return self.second + def __setitem__(self, index, val): + if not (index % 2): + self.first = val + else: + self.second = val + __swig_destroy__ = _CharniakParser.delete_ScoredTreePair + __del__ = lambda self : None; +ScoredTreePair_swigregister = _CharniakParser.ScoredTreePair_swigregister +ScoredTreePair_swigregister(ScoredTreePair) + +class InputTrees: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, InputTrees, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, InputTrees, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.InputTrees_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.InputTrees___nonzero__(self) + def __bool__(self): return _CharniakParser.InputTrees___bool__(self) + def __len__(self): return _CharniakParser.InputTrees___len__(self) + def pop(self): return _CharniakParser.InputTrees_pop(self) + def __getslice__(self, *args): return _CharniakParser.InputTrees___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.InputTrees___setslice__(self, *args) + def __delslice__(self, *args): return 
_CharniakParser.InputTrees___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.InputTrees___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.InputTrees___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.InputTrees___setitem__(self, *args) + def append(self, *args): return _CharniakParser.InputTrees_append(self, *args) + def empty(self): return _CharniakParser.InputTrees_empty(self) + def size(self): return _CharniakParser.InputTrees_size(self) + def clear(self): return _CharniakParser.InputTrees_clear(self) + def swap(self, *args): return _CharniakParser.InputTrees_swap(self, *args) + def get_allocator(self): return _CharniakParser.InputTrees_get_allocator(self) + def begin(self): return _CharniakParser.InputTrees_begin(self) + def end(self): return _CharniakParser.InputTrees_end(self) + def rbegin(self): return _CharniakParser.InputTrees_rbegin(self) + def rend(self): return _CharniakParser.InputTrees_rend(self) + def pop_back(self): return _CharniakParser.InputTrees_pop_back(self) + def erase(self, *args): return _CharniakParser.InputTrees_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_InputTrees(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.InputTrees_push_back(self, *args) + def front(self): return _CharniakParser.InputTrees_front(self) + def back(self): return _CharniakParser.InputTrees_back(self) + def assign(self, *args): return _CharniakParser.InputTrees_assign(self, *args) + def resize(self, *args): return _CharniakParser.InputTrees_resize(self, *args) + def insert(self, *args): return _CharniakParser.InputTrees_insert(self, *args) + def pop_front(self): return _CharniakParser.InputTrees_pop_front(self) + def push_front(self, *args): return _CharniakParser.InputTrees_push_front(self, *args) + def remove(self, *args): return _CharniakParser.InputTrees_remove(self, *args) + 
def unique(self): return _CharniakParser.InputTrees_unique(self) + def reverse(self): return _CharniakParser.InputTrees_reverse(self) + def sort(self): return _CharniakParser.InputTrees_sort(self) + def merge(self, *args): return _CharniakParser.InputTrees_merge(self, *args) + __swig_destroy__ = _CharniakParser.delete_InputTrees + __del__ = lambda self : None; +InputTrees_swigregister = _CharniakParser.InputTrees_swigregister +InputTrees_swigregister(InputTrees) + +class VectorScoredTree: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, VectorScoredTree, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, VectorScoredTree, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.VectorScoredTree_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.VectorScoredTree___nonzero__(self) + def __bool__(self): return _CharniakParser.VectorScoredTree___bool__(self) + def __len__(self): return _CharniakParser.VectorScoredTree___len__(self) + def pop(self): return _CharniakParser.VectorScoredTree_pop(self) + def __getslice__(self, *args): return _CharniakParser.VectorScoredTree___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.VectorScoredTree___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.VectorScoredTree___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.VectorScoredTree___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.VectorScoredTree___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.VectorScoredTree___setitem__(self, *args) + def append(self, *args): return _CharniakParser.VectorScoredTree_append(self, *args) + def empty(self): return _CharniakParser.VectorScoredTree_empty(self) + def size(self): return _CharniakParser.VectorScoredTree_size(self) + def clear(self): 
return _CharniakParser.VectorScoredTree_clear(self) + def swap(self, *args): return _CharniakParser.VectorScoredTree_swap(self, *args) + def get_allocator(self): return _CharniakParser.VectorScoredTree_get_allocator(self) + def begin(self): return _CharniakParser.VectorScoredTree_begin(self) + def end(self): return _CharniakParser.VectorScoredTree_end(self) + def rbegin(self): return _CharniakParser.VectorScoredTree_rbegin(self) + def rend(self): return _CharniakParser.VectorScoredTree_rend(self) + def pop_back(self): return _CharniakParser.VectorScoredTree_pop_back(self) + def erase(self, *args): return _CharniakParser.VectorScoredTree_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_VectorScoredTree(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.VectorScoredTree_push_back(self, *args) + def front(self): return _CharniakParser.VectorScoredTree_front(self) + def back(self): return _CharniakParser.VectorScoredTree_back(self) + def assign(self, *args): return _CharniakParser.VectorScoredTree_assign(self, *args) + def resize(self, *args): return _CharniakParser.VectorScoredTree_resize(self, *args) + def insert(self, *args): return _CharniakParser.VectorScoredTree_insert(self, *args) + def reserve(self, *args): return _CharniakParser.VectorScoredTree_reserve(self, *args) + def capacity(self): return _CharniakParser.VectorScoredTree_capacity(self) + __swig_destroy__ = _CharniakParser.delete_VectorScoredTree + __del__ = lambda self : None; +VectorScoredTree_swigregister = _CharniakParser.VectorScoredTree_swigregister +VectorScoredTree_swigregister(VectorScoredTree) + +class StringVector: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, StringVector, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, StringVector, name) + __repr__ = _swig_repr + def iterator(self): return 
_CharniakParser.StringVector_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.StringVector___nonzero__(self) + def __bool__(self): return _CharniakParser.StringVector___bool__(self) + def __len__(self): return _CharniakParser.StringVector___len__(self) + def pop(self): return _CharniakParser.StringVector_pop(self) + def __getslice__(self, *args): return _CharniakParser.StringVector___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.StringVector___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.StringVector___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.StringVector___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.StringVector___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.StringVector___setitem__(self, *args) + def append(self, *args): return _CharniakParser.StringVector_append(self, *args) + def empty(self): return _CharniakParser.StringVector_empty(self) + def size(self): return _CharniakParser.StringVector_size(self) + def clear(self): return _CharniakParser.StringVector_clear(self) + def swap(self, *args): return _CharniakParser.StringVector_swap(self, *args) + def get_allocator(self): return _CharniakParser.StringVector_get_allocator(self) + def begin(self): return _CharniakParser.StringVector_begin(self) + def end(self): return _CharniakParser.StringVector_end(self) + def rbegin(self): return _CharniakParser.StringVector_rbegin(self) + def rend(self): return _CharniakParser.StringVector_rend(self) + def pop_back(self): return _CharniakParser.StringVector_pop_back(self) + def erase(self, *args): return _CharniakParser.StringVector_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_StringVector(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return 
_CharniakParser.StringVector_push_back(self, *args) + def front(self): return _CharniakParser.StringVector_front(self) + def back(self): return _CharniakParser.StringVector_back(self) + def assign(self, *args): return _CharniakParser.StringVector_assign(self, *args) + def resize(self, *args): return _CharniakParser.StringVector_resize(self, *args) + def insert(self, *args): return _CharniakParser.StringVector_insert(self, *args) + def reserve(self, *args): return _CharniakParser.StringVector_reserve(self, *args) + def capacity(self): return _CharniakParser.StringVector_capacity(self) + __swig_destroy__ = _CharniakParser.delete_StringVector + __del__ = lambda self : None; +StringVector_swigregister = _CharniakParser.StringVector_swigregister +StringVector_swigregister(StringVector) + +class TermVector: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, TermVector, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, TermVector, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.TermVector_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.TermVector___nonzero__(self) + def __bool__(self): return _CharniakParser.TermVector___bool__(self) + def __len__(self): return _CharniakParser.TermVector___len__(self) + def pop(self): return _CharniakParser.TermVector_pop(self) + def __getslice__(self, *args): return _CharniakParser.TermVector___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.TermVector___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.TermVector___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.TermVector___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.TermVector___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.TermVector___setitem__(self, *args) + def 
append(self, *args): return _CharniakParser.TermVector_append(self, *args) + def empty(self): return _CharniakParser.TermVector_empty(self) + def size(self): return _CharniakParser.TermVector_size(self) + def clear(self): return _CharniakParser.TermVector_clear(self) + def swap(self, *args): return _CharniakParser.TermVector_swap(self, *args) + def get_allocator(self): return _CharniakParser.TermVector_get_allocator(self) + def begin(self): return _CharniakParser.TermVector_begin(self) + def end(self): return _CharniakParser.TermVector_end(self) + def rbegin(self): return _CharniakParser.TermVector_rbegin(self) + def rend(self): return _CharniakParser.TermVector_rend(self) + def pop_back(self): return _CharniakParser.TermVector_pop_back(self) + def erase(self, *args): return _CharniakParser.TermVector_erase(self, *args) + def __init__(self, *args): + this = _CharniakParser.new_TermVector(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.TermVector_push_back(self, *args) + def front(self): return _CharniakParser.TermVector_front(self) + def back(self): return _CharniakParser.TermVector_back(self) + def assign(self, *args): return _CharniakParser.TermVector_assign(self, *args) + def resize(self, *args): return _CharniakParser.TermVector_resize(self, *args) + def insert(self, *args): return _CharniakParser.TermVector_insert(self, *args) + def reserve(self, *args): return _CharniakParser.TermVector_reserve(self, *args) + def capacity(self): return _CharniakParser.TermVector_capacity(self) + __swig_destroy__ = _CharniakParser.delete_TermVector + __del__ = lambda self : None; +TermVector_swigregister = _CharniakParser.TermVector_swigregister +TermVector_swigregister(TermVector) + +class TermVectorVector: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, TermVectorVector, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, 
TermVectorVector, name) + __repr__ = _swig_repr + def iterator(self): return _CharniakParser.TermVectorVector_iterator(self) + def __iter__(self): return self.iterator() + def __bool__(self): return _CharniakParser.TermVectorVector___nonzero__(self) + def __bool__(self): return _CharniakParser.TermVectorVector___bool__(self) + def __len__(self): return _CharniakParser.TermVectorVector___len__(self) + def pop(self): return _CharniakParser.TermVectorVector_pop(self) + def __getslice__(self, *args): return _CharniakParser.TermVectorVector___getslice__(self, *args) + def __setslice__(self, *args): return _CharniakParser.TermVectorVector___setslice__(self, *args) + def __delslice__(self, *args): return _CharniakParser.TermVectorVector___delslice__(self, *args) + def __delitem__(self, *args): return _CharniakParser.TermVectorVector___delitem__(self, *args) + def __getitem__(self, *args): return _CharniakParser.TermVectorVector___getitem__(self, *args) + def __setitem__(self, *args): return _CharniakParser.TermVectorVector___setitem__(self, *args) + def append(self, *args): return _CharniakParser.TermVectorVector_append(self, *args) + def empty(self): return _CharniakParser.TermVectorVector_empty(self) + def size(self): return _CharniakParser.TermVectorVector_size(self) + def clear(self): return _CharniakParser.TermVectorVector_clear(self) + def swap(self, *args): return _CharniakParser.TermVectorVector_swap(self, *args) + def get_allocator(self): return _CharniakParser.TermVectorVector_get_allocator(self) + def begin(self): return _CharniakParser.TermVectorVector_begin(self) + def end(self): return _CharniakParser.TermVectorVector_end(self) + def rbegin(self): return _CharniakParser.TermVectorVector_rbegin(self) + def rend(self): return _CharniakParser.TermVectorVector_rend(self) + def pop_back(self): return _CharniakParser.TermVectorVector_pop_back(self) + def erase(self, *args): return _CharniakParser.TermVectorVector_erase(self, *args) + def __init__(self, *args): + 
this = _CharniakParser.new_TermVectorVector(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _CharniakParser.TermVectorVector_push_back(self, *args) + def front(self): return _CharniakParser.TermVectorVector_front(self) + def back(self): return _CharniakParser.TermVectorVector_back(self) + def assign(self, *args): return _CharniakParser.TermVectorVector_assign(self, *args) + def resize(self, *args): return _CharniakParser.TermVectorVector_resize(self, *args) + def insert(self, *args): return _CharniakParser.TermVectorVector_insert(self, *args) + def reserve(self, *args): return _CharniakParser.TermVectorVector_reserve(self, *args) + def capacity(self): return _CharniakParser.TermVectorVector_capacity(self) + __swig_destroy__ = _CharniakParser.delete_TermVectorVector + __del__ = lambda self : None; +TermVectorVector_swigregister = _CharniakParser.TermVectorVector_swigregister +TermVectorVector_swigregister(TermVectorVector) + + +def loadModel(*args): + return _CharniakParser.loadModel(*args) +loadModel = _CharniakParser.loadModel +class SentRep: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, SentRep, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, SentRep, name) + __repr__ = _swig_repr + def __init__(self, *args): + this = _CharniakParser.new_SentRep(*args) + try: self.this.append(this) + except: self.this = this + def __len__(self): return _CharniakParser.SentRep___len__(self) + def getWord(self, *args): return _CharniakParser.SentRep_getWord(self, *args) + def getName(self): return _CharniakParser.SentRep_getName(self) + def __str__(self): return _CharniakParser.SentRep___str__(self) + def makeFailureTree(self, *args): return _CharniakParser.SentRep_makeFailureTree(self, *args) + __swig_destroy__ = _CharniakParser.delete_SentRep + __del__ = lambda self : None; +SentRep_swigregister = _CharniakParser.SentRep_swigregister 
+SentRep_swigregister(SentRep) + +class InputTree: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, InputTree, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, InputTree, name) + __repr__ = _swig_repr + def num(self): return _CharniakParser.InputTree_num(self) + def start(self): return _CharniakParser.InputTree_start(self) + def __len__(self): return _CharniakParser.InputTree___len__(self) + def finish(self): return _CharniakParser.InputTree_finish(self) + def word(self): return _CharniakParser.InputTree_word(self) + def term(self): return _CharniakParser.InputTree_term(self) + def ntInfo(self): return _CharniakParser.InputTree_ntInfo(self) + def head(self): return _CharniakParser.InputTree_head(self) + def hTag(self): return _CharniakParser.InputTree_hTag(self) + def subTrees(self): return _CharniakParser.InputTree_subTrees(self) + def headTree(self): return _CharniakParser.InputTree_headTree(self) + def parent(self): return _CharniakParser.InputTree_parent(self) + def parentSet(self): return _CharniakParser.InputTree_parentSet(self) + __swig_destroy__ = _CharniakParser.delete_InputTree + __del__ = lambda self : None; + def make(self, *args): return _CharniakParser.InputTree_make(self, *args) + def makePosList(self, *args): return _CharniakParser.InputTree_makePosList(self, *args) + __swig_setmethods__["pageWidth"] = _CharniakParser.InputTree_pageWidth_set + __swig_getmethods__["pageWidth"] = _CharniakParser.InputTree_pageWidth_get + def __str__(self): return _CharniakParser.InputTree___str__(self) + def toStringPrettyPrint(self): return _CharniakParser.InputTree_toStringPrettyPrint(self) + def toSentRep(self): return _CharniakParser.InputTree_toSentRep(self) + def getTags(self): return _CharniakParser.InputTree_getTags(self) + def getWords(self): return _CharniakParser.InputTree_getWords(self) + def setTerm(self, *args): return _CharniakParser.InputTree_setTerm(self, *args) + 
def setNtInfo(self, *args): return _CharniakParser.InputTree_setNtInfo(self, *args) + def setWord(self, *args): return _CharniakParser.InputTree_setWord(self, *args) + def __init__(self): + this = _CharniakParser.new_InputTree() + try: self.this.append(this) + except: self.this = this +InputTree_swigregister = _CharniakParser.InputTree_swigregister +InputTree_swigregister(InputTree) + +class ewDciTokStrm: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, ewDciTokStrm, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, ewDciTokStrm, name) + __repr__ = _swig_repr + def __init__(self, *args): + this = _CharniakParser.new_ewDciTokStrm(*args) + try: self.this.append(this) + except: self.this = this + def read(self): return _CharniakParser.ewDciTokStrm_read(self) + __swig_destroy__ = _CharniakParser.delete_ewDciTokStrm + __del__ = lambda self : None; +ewDciTokStrm_swigregister = _CharniakParser.ewDciTokStrm_swigregister +ewDciTokStrm_swigregister(ewDciTokStrm) + +class Wrd: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, Wrd, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, Wrd, name) + __repr__ = _swig_repr + def lexeme(self): return _CharniakParser.Wrd_lexeme(self) + def __init__(self): + this = _CharniakParser.new_Wrd() + try: self.this.append(this) + except: self.this = this + __swig_destroy__ = _CharniakParser.delete_Wrd + __del__ = lambda self : None; +Wrd_swigregister = _CharniakParser.Wrd_swigregister +Wrd_swigregister(Wrd) + +class Term: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, Term, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, Term, name) + __repr__ = _swig_repr + def __init__(self, *args): + this = _CharniakParser.new_Term(*args) + try: self.this.append(this) + except: self.this = this + def 
toInt(self): return _CharniakParser.Term_toInt(self) + def terminal_p(self): return _CharniakParser.Term_terminal_p(self) + def isPunc(self): return _CharniakParser.Term_isPunc(self) + def openClass(self): return _CharniakParser.Term_openClass(self) + def isColon(self): return _CharniakParser.Term_isColon(self) + def isFinal(self): return _CharniakParser.Term_isFinal(self) + def isComma(self): return _CharniakParser.Term_isComma(self) + def isCC(self): return _CharniakParser.Term_isCC(self) + def isRoot(self): return _CharniakParser.Term_isRoot(self) + def isS(self): return _CharniakParser.Term_isS(self) + def isParen(self): return _CharniakParser.Term_isParen(self) + def isNP(self): return _CharniakParser.Term_isNP(self) + def isVP(self): return _CharniakParser.Term_isVP(self) + def isOpen(self): return _CharniakParser.Term_isOpen(self) + def isClosed(self): return _CharniakParser.Term_isClosed(self) + __swig_destroy__ = _CharniakParser.delete_Term + __del__ = lambda self : None; +Term_swigregister = _CharniakParser.Term_swigregister +Term_swigregister(Term) + +class ExtPos: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, ExtPos, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, ExtPos, name) + __repr__ = _swig_repr + def hasExtPos(self): return _CharniakParser.ExtPos_hasExtPos(self) + def addTagConstraints(self, *args): return _CharniakParser.ExtPos_addTagConstraints(self, *args) + def getTerms(self, *args): return _CharniakParser.ExtPos_getTerms(self, *args) + def size(self): return _CharniakParser.ExtPos_size(self) + def __init__(self): + this = _CharniakParser.new_ExtPos() + try: self.this.append(this) + except: self.this = this + __swig_destroy__ = _CharniakParser.delete_ExtPos + __del__ = lambda self : None; +ExtPos_swigregister = _CharniakParser.ExtPos_swigregister +ExtPos_swigregister(ExtPos) + +# This file is compatible with both classic and new-style classes. 
+ + diff --git a/python/bllipparser_python3/JohnsonReranker.py b/python/bllipparser_python3/JohnsonReranker.py new file mode 100644 index 0000000..1449684 --- /dev/null +++ b/python/bllipparser_python3/JohnsonReranker.py @@ -0,0 +1,196 @@ +# This file was automatically generated by SWIG (http://www.swig.org). +# Version 2.0.11 +# +# Do not make changes to this file unless you know what you are doing--modify +# the SWIG interface file instead. + + + + + +from sys import version_info +if version_info >= (2,6,0): + def swig_import_helper(): + from os.path import dirname + import imp + fp = None + try: + fp, pathname, description = imp.find_module('_JohnsonReranker', [dirname(__file__)]) + except ImportError: + import _JohnsonReranker + return _JohnsonReranker + if fp is not None: + try: + _mod = imp.load_module('_JohnsonReranker', fp, pathname, description) + finally: + fp.close() + return _mod + _JohnsonReranker = swig_import_helper() + del swig_import_helper +else: + import _JohnsonReranker +del version_info +def _swig_setattr_nondynamic(self,class_type,name,value,static=1): + if (name == "thisown"): return self.this.own(value) + if (name == "this"): + if type(value).__name__ == 'SwigPyObject': + self.__dict__[name] = value + return + method = class_type.__swig_setmethods__.get(name,None) + if method: return method(self,value) + if (not static): + self.__dict__[name] = value + else: + raise AttributeError("You cannot add attributes to %s" % self) + +def _swig_setattr(self,class_type,name,value): + return _swig_setattr_nondynamic(self,class_type,name,value,0) + +def _swig_getattr(self,class_type,name): + if (name == "thisown"): return self.this.own() + method = class_type.__swig_getmethods__.get(name,None) + if method: return method(self) + raise AttributeError(name) + +def _swig_repr(self): + try: strthis = "proxy of " + self.this.__repr__() + except: strthis = "" + return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) + +class 
SwigPyIterator: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, SwigPyIterator, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, SwigPyIterator, name) + def __init__(self, *args, **kwargs): raise AttributeError("No constructor defined - class is abstract") + __repr__ = _swig_repr + __swig_destroy__ = _JohnsonReranker.delete_SwigPyIterator + __del__ = lambda self : None; + def value(self): return _JohnsonReranker.SwigPyIterator_value(self) + def incr(self, n=1): return _JohnsonReranker.SwigPyIterator_incr(self, n) + def decr(self, n=1): return _JohnsonReranker.SwigPyIterator_decr(self, n) + def distance(self, *args): return _JohnsonReranker.SwigPyIterator_distance(self, *args) + def equal(self, *args): return _JohnsonReranker.SwigPyIterator_equal(self, *args) + def copy(self): return _JohnsonReranker.SwigPyIterator_copy(self) + def next(self): return _JohnsonReranker.SwigPyIterator_next(self) + def __next__(self): return _JohnsonReranker.SwigPyIterator___next__(self) + def previous(self): return _JohnsonReranker.SwigPyIterator_previous(self) + def advance(self, *args): return _JohnsonReranker.SwigPyIterator_advance(self, *args) + def __eq__(self, *args): return _JohnsonReranker.SwigPyIterator___eq__(self, *args) + def __ne__(self, *args): return _JohnsonReranker.SwigPyIterator___ne__(self, *args) + def __iadd__(self, *args): return _JohnsonReranker.SwigPyIterator___iadd__(self, *args) + def __isub__(self, *args): return _JohnsonReranker.SwigPyIterator___isub__(self, *args) + def __add__(self, *args): return _JohnsonReranker.SwigPyIterator___add__(self, *args) + def __sub__(self, *args): return _JohnsonReranker.SwigPyIterator___sub__(self, *args) + def __iter__(self): return self +SwigPyIterator_swigregister = _JohnsonReranker.SwigPyIterator_swigregister +SwigPyIterator_swigregister(SwigPyIterator) + +class RerankerError: + __swig_setmethods__ = {} + __setattr__ = lambda self,
name, value: _swig_setattr(self, RerankerError, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, RerankerError, name) + __repr__ = _swig_repr + __swig_getmethods__["description"] = _JohnsonReranker.RerankerError_description_get + def __init__(self, *args): + this = _JohnsonReranker.new_RerankerError(*args) + try: self.this.append(this) + except: self.this = this + __swig_destroy__ = _JohnsonReranker.delete_RerankerError + __del__ = lambda self : None; +RerankerError_swigregister = _JohnsonReranker.RerankerError_swigregister +RerankerError_swigregister(RerankerError) + +class NBestList: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, NBestList, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, NBestList, name) + __repr__ = _swig_repr + def __len__(self): return _JohnsonReranker.NBestList___len__(self) + def __init__(self): + this = _JohnsonReranker.new_NBestList() + try: self.this.append(this) + except: self.this = this + __swig_destroy__ = _JohnsonReranker.delete_NBestList + __del__ = lambda self : None; +NBestList_swigregister = _JohnsonReranker.NBestList_swigregister +NBestList_swigregister(NBestList) + + +def readNBestList(*args): + return _JohnsonReranker.readNBestList(*args) +readNBestList = _JohnsonReranker.readNBestList +class RerankerModel: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, RerankerModel, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, RerankerModel, name) + __repr__ = _swig_repr + __swig_setmethods__["maxid"] = _JohnsonReranker.RerankerModel_maxid_set + __swig_getmethods__["maxid"] = _JohnsonReranker.RerankerModel_maxid_get + def __init__(self, *args): + this = _JohnsonReranker.new_RerankerModel(*args) + try: self.this.append(this) + except: self.this = this + def scoreNBestList(self, *args): return 
_JohnsonReranker.RerankerModel_scoreNBestList(self, *args) + __swig_destroy__ = _JohnsonReranker.delete_RerankerModel + __del__ = lambda self : None; +RerankerModel_swigregister = _JohnsonReranker.RerankerModel_swigregister +RerankerModel_swigregister(RerankerModel) + + +def setOptions(*args): + return _JohnsonReranker.setOptions(*args) +setOptions = _JohnsonReranker.setOptions +class Weights: + __swig_setmethods__ = {} + __setattr__ = lambda self, name, value: _swig_setattr(self, Weights, name, value) + __swig_getmethods__ = {} + __getattr__ = lambda self, name: _swig_getattr(self, Weights, name) + __repr__ = _swig_repr + def iterator(self): return _JohnsonReranker.Weights_iterator(self) + def __iter__(self): return self.iterator() + def __nonzero__(self): return _JohnsonReranker.Weights___nonzero__(self) + def __bool__(self): return _JohnsonReranker.Weights___bool__(self) + def __len__(self): return _JohnsonReranker.Weights___len__(self) + def pop(self): return _JohnsonReranker.Weights_pop(self) + def __getslice__(self, *args): return _JohnsonReranker.Weights___getslice__(self, *args) + def __setslice__(self, *args): return _JohnsonReranker.Weights___setslice__(self, *args) + def __delslice__(self, *args): return _JohnsonReranker.Weights___delslice__(self, *args) + def __delitem__(self, *args): return _JohnsonReranker.Weights___delitem__(self, *args) + def __getitem__(self, *args): return _JohnsonReranker.Weights___getitem__(self, *args) + def __setitem__(self, *args): return _JohnsonReranker.Weights___setitem__(self, *args) + def append(self, *args): return _JohnsonReranker.Weights_append(self, *args) + def empty(self): return _JohnsonReranker.Weights_empty(self) + def size(self): return _JohnsonReranker.Weights_size(self) + def clear(self): return _JohnsonReranker.Weights_clear(self) + def swap(self, *args): return _JohnsonReranker.Weights_swap(self, *args) + def get_allocator(self): return _JohnsonReranker.Weights_get_allocator(self) + def begin(self): return
_JohnsonReranker.Weights_begin(self) + def end(self): return _JohnsonReranker.Weights_end(self) + def rbegin(self): return _JohnsonReranker.Weights_rbegin(self) + def rend(self): return _JohnsonReranker.Weights_rend(self) + def pop_back(self): return _JohnsonReranker.Weights_pop_back(self) + def erase(self, *args): return _JohnsonReranker.Weights_erase(self, *args) + def __init__(self, *args): + this = _JohnsonReranker.new_Weights(*args) + try: self.this.append(this) + except: self.this = this + def push_back(self, *args): return _JohnsonReranker.Weights_push_back(self, *args) + def front(self): return _JohnsonReranker.Weights_front(self) + def back(self): return _JohnsonReranker.Weights_back(self) + def assign(self, *args): return _JohnsonReranker.Weights_assign(self, *args) + def resize(self, *args): return _JohnsonReranker.Weights_resize(self, *args) + def insert(self, *args): return _JohnsonReranker.Weights_insert(self, *args) + def reserve(self, *args): return _JohnsonReranker.Weights_reserve(self, *args) + def capacity(self): return _JohnsonReranker.Weights_capacity(self) + __swig_destroy__ = _JohnsonReranker.delete_Weights + __del__ = lambda self : None; +Weights_swigregister = _JohnsonReranker.Weights_swigregister +Weights_swigregister(Weights) + +# This file is compatible with both classic and new-style classes. + + diff --git a/python/bllipparser_python3/ModelFetcher.py b/python/bllipparser_python3/ModelFetcher.py new file mode 100644 index 0000000..f3d4124 --- /dev/null +++ b/python/bllipparser_python3/ModelFetcher.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Simple BLLIP Parser unified parsing model repository and installer.""" + +import sys +import urllib.parse +import urllib.request, urllib.parse, urllib.error +from os import makedirs, system, chdir, getcwd +from os.path import basename, exists, join + +class ModelInfo: + def __init__(self, model_desc, url, uncompressed_size='unknown'): + """uncompressed_size is approximate size in megabytes.""" + self.model_desc = model_desc + self.url = url + self.uncompressed_size = uncompressed_size + def __str__(self): + return "%s [%sMB]" % (self.model_desc, self.uncompressed_size) + +# should this grow large enough, we'll find a better place to store it +models = { + 'OntoNotes-WSJ': ModelInfo('OntoNotes portion of WSJ', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-OntoNotes-WSJ.tar.bz2', 61), + 'SANCL2012-Uniform': ModelInfo('Self-trained model on OntoNotes-WSJ and the Google Web Treebank', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-SANCL2012-Uniform.tar.bz2', 890), + 'WSJ+Gigaword': ModelInfo('Self-trained model on PTB2-WSJ and approx. two million sentences from Gigaword', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-WSJ-Gigaword2000.tar.bz2', 473), + 'WSJ+PubMed': ModelInfo('Self-trained model on PTB2-WSJ and approx. 
200k sentences from PubMed', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-WSJ-PubMed.tar.bz2', 152), + 'WSJ': ModelInfo('Wall Street Journal corpus from Penn Treebank, version 2', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-WSJ-no-AUX.tar.bz2', 52), + 'WSJ-with-AUX': ModelInfo('Wall Street Journal corpus from Penn Treebank, version 2 (AUXified version, deprecated)', + 'http://nlp.stanford.edu/~mcclosky/models/BLLIP-WSJ-with-AUX.tar.bz2', 55), +} + +class UnknownParserModel(ValueError): + def __str__(self): + return "Unknown parser model name: " + self.args[0] + +def download_and_install_model(model_name, target_directory, verbose=False): + """Downloads and installs models to a specific directory. Models + can be specified by simple names (use list_models() for a list + of known models) or a URL. If the model is already installed in + target_directory, it won't download it again. Returns the path to + the new model.""" + + if model_name.lower().startswith('http'): + parsed_url = urllib.parse.urlparse(model_name) + model_url = model_name + model_name = basename(parsed_url.path).split('.')[0] + elif model_name in models: + model_url = models[model_name].url + else: + raise UnknownParserModel(model_name) + + output_path = join(target_directory, model_name) + if verbose: + print("Fetching model:", model_name, "from", model_url) + print("Model directory:", output_path) + + if exists(output_path): + if verbose: + print("Model directory already exists, not reinstalling") + return output_path + + if verbose: + def status_func(blocks, block_size, total_size): + amount_downloaded = blocks * block_size + if total_size == -1: + sys.stdout.write('Downloaded %s\r' % amount_downloaded) + else: + percent_downloaded = 100 * amount_downloaded / total_size + size = amount_downloaded / (1024 ** 2) + sys.stdout.write('Downloaded %.1f%% (%.1f MB)\r' % + (percent_downloaded, size)) + else: + status_func = None + + # needed since 404s, etc.
aren't handled otherwise + class ErrorAwareOpener(urllib.request.FancyURLopener): + def http_error_default(self, url, fp, errcode, errmsg, headers): + print("Error downloading model (%s %s)" % (errcode, errmsg)) + raise SystemExit + + opener = ErrorAwareOpener() + downloaded_filename, headers = opener.retrieve(model_url, + reporthook=status_func) + if verbose: + sys.stdout.write('\rDownload complete' + (' ' * 20) + '\n') + print('Downloaded to temporary file', downloaded_filename) + + try: + makedirs(output_path) + except OSError as ose: + if ose.errno != 17: + raise + + orig_path = getcwd() + chdir(output_path) + # by convention, all models are currently in tar.bz2 format + # we may want to generalize this code later + assert downloaded_filename.lower().endswith('.bz2') + command = 'tar xvjf %s' % downloaded_filename + if verbose: + print("Extracting with %r to %s" % (command, output_path)) + system(command) + chdir(orig_path) + + return output_path + +def list_models(): + print(len(models), "known unified parsing models: [uncompressed size]") + for key, model_info in sorted(models.items()): + print('\t%-20s\t%s' % (key, model_info)) + +def main(): + from optparse import OptionParser + parser = OptionParser(usage="""%prog [options] + +Tool to help you download and install BLLIP Parser models.""") + parser.add_option("-l", "--list", action='store_true', + help="List known parsing models.") + parser.add_option("-i", "--install", metavar="NAME", action='append', + help="Install a unified parser model.") + parser.add_option("-d", "--directory", default='./models', metavar="PATH", + help="Directory to install parsing models in (will be " + "created if it doesn't exist). 
Default: %default") + + (options, args) = parser.parse_args() + + if not (options.list or options.install): + parser.print_help() + # flip this on to make 'list' the default action + options.list = True + print() + if options.list: + list_models() + if options.install: + for i, model in enumerate(options.install): + if i: + print() + try: + download_and_install_model(model, options.directory, + verbose=True) + except UnknownParserModel as u: + print(u) + list_models() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/python/bllipparser_python3/ParsingShell.py b/python/bllipparser_python3/ParsingShell.py new file mode 100644 index 0000000..37a3a3f --- /dev/null +++ b/python/bllipparser_python3/ParsingShell.py @@ -0,0 +1,301 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Simple interactive shell for viewing parses. To run: + + python -mbllipparser.ParsingShell /path/to/model/ + +Optional dependencies: +If you have NLTK installed, you'll be able to use the 'visual' command +which shows constituency trees. +If you have PyStanfordDependencies installed, you'll be able to use the +'sdparse' command. Dependencies are shown in CoNLL-X format, though ASCII +trees will be shown if you have the asciitree package.""" +import sys +from cmd import Cmd +import importlib +def import_maybe(module_name): + "Import a module and return it if available, otherwise returns None." 
+ try: + return importlib.import_module(module_name) + except ImportError: + return None + +try: + import nltk.tree + import nltk.draw.tree + read_nltk_tree = nltk.tree.Tree.parse + have_nltk_tree_drawing = True +except ImportError: + have_nltk_tree_drawing = False +except AttributeError: # handle NLTK API changes + try: + read_nltk_tree = nltk.tree.Tree.fromstring + have_nltk_tree_drawing = True + except AttributeError: + have_nltk_tree_drawing = False + +StanfordDependencies = import_maybe('StanfordDependencies') +asciitree = import_maybe('asciitree') + +from bllipparser_python3.RerankingParser import RerankingParser + +# TODO should integrate with bllipparser.ModelFetcher + +class ParsingShell(Cmd): + def __init__(self, model): + Cmd.__init__(self) + self.prompt = 'bllip> ' + if model is None: + print("Warning: no parsing model to load.") + print("Specify with: python -mbllipparser.ParsingShell " + \ + "/path/to/model/") + self.rrp = None + else: + sys.stdout.write("Loading models... ") + sys.stdout.flush() + self.rrp = RerankingParser.from_unified_model_dir(model) + print("done!") + print("Enter a sentence to see its parse or 'help' for more options.") + self.last_nbest_list = [] + self.options = {} + self.sd = None + + def do_visual(self, text): + """Use reranking parser to parse text.
Visualize top parses from + parser and reranker.""" + if not have_nltk_tree_drawing: + print("Can't visualize without NLTK installation.") + return + + nbest_list = self.parse(text) + parser_top_parse = str(nbest_list.get_parser_best().ptb_parse) + parser_top_parse = parser_top_parse.replace('S1', 'parser') + reranker_top_parse = str(nbest_list[0].ptb_parse) + reranker_top_parse = reranker_top_parse.replace('S1', 'reranker') + + nltk_trees = [read_nltk_tree(parser_top_parse)] + if nbest_list[0].parser_rank != 0: + print("Parser:") + print(parser_top_parse) + print() + print("Reranker's parse: (parser index %d)" % \ + nbest_list[0].parser_rank) + print(reranker_top_parse) + nltk_trees.insert(0, read_nltk_tree(reranker_top_parse)) + + nltk.draw.tree.draw_trees(*nltk_trees) + + def do_parse(self, text): + """Use reranking parser to parse text. Show top parses from + parser and reranker.""" + self.parse(text) + self.print_parses() + + def do_nbest(self, text): + """Use reranking parser to parse text. Show complete n-best list.""" + nbest_list = self.parse(text) + for i, item in enumerate(nbest_list): + print('reranker rank: ', i) + print('reranker score:', item.reranker_score) + print('parser rank: ', item.parser_rank) + print('parser score: ', item.parser_score) + print(item.ptb_parse.pretty_string()) + print() + print() + + def do_visualnbest(self, text): + """Usage: visualnbest [start] stop + Visualizes all parses from start-stop in the n-best list. 
+ Sentence must already be parsed.""" + if not have_nltk_tree_drawing: + print("Can't visualize without NLTK installation.") + return + + pieces = list(map(int, text.split())) + start = 0 + if len(pieces) == 2: + start = pieces[0] + end = pieces[1] + elif len(pieces) == 1: + end = pieces[0] + else: + print("Should only have 1 or 2 arguments.") + return + end += 1 # to make this inclusive of both end points + + nbest_list = self.last_nbest_list + nltk_trees = [] + for item in nbest_list[start:end]: + i = item.reranker_rank + print('reranker rank: ', i) + print('reranker score:', item.reranker_score) + print('parser rank: ', item.parser_rank) + print('parser score: ', item.parser_score) + print(item.ptb_parse.pretty_string()) + tree = str(item.ptb_parse) + tree = tree.replace('S1', 'S1-r%d-p%d' % (i, item.parser_rank)) + nltk_trees.append(read_nltk_tree(tree)) + print() + print() + nltk.draw.tree.draw_trees(*nltk_trees) + + def do_tagged(self, text): + """Use reranking parser to parse pre-tagged, pre-tokenized text. + Show top parses from parser and reranker. Example usage: + + rrp> tagged word1 word2:TAG1 word3:TAG2 word4:TAG2|TAG3 + + will require word2 to be tagged with TAG1, word3 to be tagged + with TAG2 and word4 to be tagged with TAG2 or TAG3.""" + tokens_and_tags = text.split() + tokens = [] + possible_tags = {} + for index, token_and_tag in enumerate(tokens_and_tags): + if ':' in token_and_tag and len(token_and_tag) > 3: + token, tags = token_and_tag.split(':') + tokens.append(token) + possible_tags[index] = tags.split('|') + else: + tokens.append(token_and_tag) + + nbest_list = self.rrp.parse_tagged(tokens, possible_tags) + self.got_nbest_list(nbest_list) + self.print_parses() + + def do_set(self, text): + """Set an option. 
Syntax: set option-name option-value + Current options: + + sdversion - version of Stanford CoreNLP to use for Stanford + Dependencies + sdbackend - backend to use for Stanford Dependencies conversion + sdvis - how to draw Stanford Dependencies (can be conll or + asciitree). Default is to use asciitree if available.""" + pieces = text.split() + if len(pieces) != 2: + print("Syntax: set option-name option-value") + return + key, value = pieces + self.options[key] = value + print("set %r = %r" % (key, value)) + if key.startswith('sd'): + # reset it so it can be reloaded + self.sd = None + + def do_sdparse(self, text): + """Use reranking parser to parse text, then show the + output as Stanford Dependencies in CoNLL format. Requires + PyStanfordDependencies. You may want to use the 'set' command + to set the sdversion, sdbackend, and sdvis options + (see 'help set').""" + if not StanfordDependencies: + print("Can't show dependencies without " \ + "PyStanfordDependencies installation.") + return + + try: + # load SD on demand so user can set version options before + self.load_stanford_dependencies() + nbest_list = self.parse(text) + + parser_tree = nbest_list.get_parser_best().ptb_parse + reranker_tree = nbest_list.get_reranker_best().ptb_parse + + parser_tokens = parser_tree.sd_tokens(sd_converter=self.sd) + reranker_tokens = reranker_tree.sd_tokens(sd_converter=self.sd) + except StanfordDependencies.JavaRuntimeVersionError as jrve: + # load_stanford_dependencies and sd_tokens potentially throw + # this + print('JavaRuntimeVersionError:', jrve) + print() + print("Try running: 'set sdversion 3.4.1'") + if self.options.get('sdbackend', 'jpype') == 'jpype': + print("Also, since you're using the jpype backend, " + \ + "you'll need to restart first.") + return + + if parser_tokens == reranker_tokens: + print('Parser and reranker:') + else: + print('Parser:') + + self.visualize_sd_tokens(parser_tokens) + + if parser_tokens != reranker_tokens: + print() + print('Reranker:') + 
self.visualize_sd_tokens(reranker_tokens) + + def default(self, text): + if text == 'EOF': + raise SystemExit + else: + return self.do_parse(text) + + def print_parses(self): + nbest_list = self.last_nbest_list + parser_top_parse = nbest_list.get_parser_best() + reranker_top_parse = nbest_list[0] + + if reranker_top_parse.parser_rank == 0: + print(parser_top_parse.ptb_parse.pretty_string()) + else: + print("Parser's parse:") + print(parser_top_parse.ptb_parse.pretty_string()) + print() + print("Reranker's parse: (parser index %d)" % \ + reranker_top_parse.parser_rank) + print(reranker_top_parse.ptb_parse.pretty_string()) + print() + + def visualize_sd_tokens(self, tokens): + # eventually, we may add dot/xdot support + sdvis = self.options.get('sdvis', 'asciitree').lower() + use_asciitree = sdvis == 'asciitree' + if use_asciitree and asciitree: + print(tokens.as_asciitree()) + else: + for token in tokens: + print(token.as_conll()) + + def got_nbest_list(self, nbest_list): + nbest_list.sort_by_reranker_scores() + self.last_nbest_list = nbest_list + + def parse(self, text): + if text.strip(): # if no text, return the last nbest list + nbest_list = self.rrp.parse(text) + print('Tokens:', ' '.join(nbest_list.tokens())) + print() + self.got_nbest_list(nbest_list) + + return self.last_nbest_list + + def load_stanford_dependencies(self): + if self.sd: + return + kwargs = dict(version=self.options.get('sdversion')) + if 'sdbackend' in self.options: + kwargs['backend'] = self.options['sdbackend'] + self.sd = StanfordDependencies.get_instance(**kwargs) + +def main(shell_class=ParsingShell): + if len(sys.argv) > 1: + model = sys.argv[-1] + else: + model = None + shell = shell_class(model) + shell.cmdloop() + +if __name__ == "__main__": + main() diff --git a/python/bllipparser_python3/RerankerFeatureCorpus.py b/python/bllipparser_python3/RerankerFeatureCorpus.py new file mode 100644 index 0000000..8ed6625 --- /dev/null +++ b/python/bllipparser_python3/RerankerFeatureCorpus.py 
@@ -0,0 +1,130 @@ +""" +cvlm corpus read/transform support. + +Data structures and utilities to read the format that cvlm (reranker +optimizer) takes as input (sparse feature values associated with each +candidate). + +This needs the waterworks utility library. + +Example: + +>>> corpus = RerankerFeatureCorpus('path/to/filename.gz') +>>> for sentence in corpus: +... print('index', sentence.index, 'num parses', len(sentence.parses)) +... print('num parse0 features', len(sentence.parses[0].features)) + +""" +from waterworks.Strings import try_parse_float, try_parse_int +from waterworks.Files import possibly_compressed_file +from waterworks.Tools import initialize, generic_repr + +def parse_kv_list(text): + """Parse cvlm key-value pairs from text. Returns a dictionary.""" + pieces = text.split() + results = {} + for piece in pieces: + if '=' in piece: + key, value = piece.split('=') + if value.endswith(','): + value = value[:-1] + value = try_parse_float(value, value) + else: + key = piece + value = 1 + if key.endswith(','): + key = key[:-1] + key = try_parse_int(key, key) + results[key] = value + return results + +def generate_kv_list(features): + """Render cvlm key-value pairs to text from a dictionary.""" + pieces = [] + for k, v in sorted(features.items()): + if v == 1: + pieces.append(str(k)) + else: + pieces.append('%s=%s' % (k, v)) + return ' '.join(pieces) + +class RerankerParse: + """A single parse of a RerankerSentence.
Each parse includes + the number of proposed and matched brackets (which, combined with + gold_brackets will tell you its f-score) and a dictionary of features + to values.""" + def __init__(self, proposed_brackets, matched_brackets, features): + initialize(self, locals()) + __repr__ = generic_repr + def cvlm_format(self): + """Render this parse in cvlm's sparse feature vector format.""" + meta = 'P=%s W=%s' % (self.proposed_brackets, self.matched_brackets) + if self.features: + return '%s %s' % (meta, generate_kv_list(self.features)) + else: + return meta + +class RerankerSentence: + """A single sentence for input to cvlm. Each sentence includes the + number of gold brackets, its index in the corpus, and a list of all + candidate parses (RerankerParse objects).""" + def __init__(self, gold_brackets, parses, index): + initialize(self, locals()) + __repr__ = generic_repr + + def cvlm_format(self): + """Render this sentence in cvlm's sparse feature vector format.""" + return 'G=%s N=%s %s' % (self.gold_brackets, len(self.parses), + ', '.join(parse.cvlm_format() + for parse in self.parses)) + ',' + def __iter__(self): + return iter(self.parses) + + @classmethod + def from_string(this_class, text, index): + parses_text = text.split(', ') + gold_brackets = None + parses = [] + for parse_index, parse_text in enumerate(parses_text): + features = parse_kv_list(parse_text) + if parse_index == 0: + gold_brackets = features.pop('G') + features.pop('N') + proposed_brackets = features.pop('P') + matched_brackets = features.pop('W') + + parses.append(RerankerParse(proposed_brackets, matched_brackets, + features)) + assert gold_brackets is not None + return this_class(gold_brackets, parses, index) + +class RerankerFeatureCorpus: + """Made up of a series of sentences. Because these files are huge + and the Python wrappers around these structures cannot typically be + stored in memory, this only lets you iterate over the corpus. 
Note + that if you're generating a new reranker input file, you'll need + to create the S= header as well. The number + of sentences in the RerankerFeatureCorpus corpus is available under + the num_sentences property.""" + def __init__(self, filename): + initialize(self, locals()) + + self.reader = iter(possibly_compressed_file(filename)) + self.header = parse_kv_list(next(self.reader)) + assert 'S' in self.header + self.num_sentences = self.header['S'] + + __repr__ = generic_repr + + def __iter__(self): + for i, line in enumerate(self.reader): + sentence = RerankerSentence.from_string(line, i) + sentence.header = self.header + yield sentence + + def transform(self, transformer): + """Iterate over every sentence in this corpus, applying a + transformation function to each. The transformer will be called + on each RerankerSentence instance in order.""" + for sentence in self: + yield transformer(sentence) diff --git a/python/bllipparser_python3/RerankingParser.py b/python/bllipparser_python3/RerankingParser.py new file mode 100644 index 0000000..ad46496 --- /dev/null +++ b/python/bllipparser_python3/RerankingParser.py @@ -0,0 +1,692 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Higher-level python frontend to the BLLIP reranking parser. Wraps the +lower-level (SWIG-generated) CharniakParser and JohnsonReranker modules +so you don't need to interact with them directly.""" + +from os.path import exists, join +from . import CharniakParser as parser +from . 
class DeprecatedGetter:
    """Wraps the value of a former getter method that became a property.

    Attribute access is forwarded to the wrapped value; calling the
    wrapper still works (for old call sites) but emits a
    DeprecationWarning. This doesn't work if the property being
    deprecated has its own __call__ method, since that will be
    unreachable as a DeprecatedGetter."""
    def __init__(self, name, value):
        """name is the attribute name of the property. value is its
        value."""
        self.__name = name
        self.__value = value
    def __getattr__(self, attr):
        """Forward every attribute (other than __call__) to the wrapped
        value."""
        return getattr(self.__value, attr)
    def __call__(self, *args, **kwargs):
        """Support deprecated call sites: warn about the deprecation,
        then hand back the wrapped value so old code keeps working."""
        from warnings import warn
        message = ("%r is no longer a method. It's now a property."
                   % self.__name)
        warn(message, DeprecationWarning, stacklevel=2)
        return self.__value
Slices are supported.""" + # list is necessary since otherwise it doesn't support all the + # slice stuff + subtrees = list(self._tree.subTrees()) + try: + subtree = subtrees[index] + except IndexError: + if self.is_preterminal(): + message = 'node is a preterminal' + else: + message = 'only %s children for this node' % len(self) + raise IndexError("list index %r out of range (%s)" % + (index, message)) + if isinstance(index, slice): + return [self.__class__(s) for s in subtree] + else: # single InputTree object + return self.__class__(subtree) + def __iter__(self): + """Provides an iterator over immediate subtrees in this Tree. + Each item yielded will be a Tree object rooted at one of the + children of this tree.""" + for tree in self._tree.subTrees(): + yield self.__class__(tree) + def all_subtrees(self): + """Iterates over all nodes in this tree in preorder (this node, + followed by its first child, etc.)""" + yield self + for subtree in self: + for subsubtree in subtree.all_subtrees(): + yield subsubtree + def subtrees(self): + """Returns a list of direct subtrees.""" + return list(iter(self)) + def __len__(self): + """Returns the number of direct subtrees.""" + return len(self.subtrees()) + def __repr__(self): + """Provides a representation of this tree which can be used to + reconstruct it.""" + return '%s(%r)' % (self.__class__.__name__, str(self)) + def __str__(self): + """Represent the tree in Penn Treebank format on one line.""" + return str(self._tree) + def pretty_string(self): + """Represent the tree in Penn Treebank format with line wrapping.""" + return self._tree.toStringPrettyPrint() + def tokens(self): + """Return a tuple of the word tokens in this tree.""" + return tuple(self._tree.getWords()) + def tags(self): + """Return a tuple of the part-of-speech tags in this tree.""" + return tuple(self._tree.getTags()) + def tokens_and_tags(self): + """Return a list of (word, tag) pairs.""" + return list(zip(self.tokens(), self.tags())) + def span(self): 
+ """Returns indices of the span for this tree: (start, end)""" + return (self._tree.start(), self._tree.finish()) + def is_preterminal(self): + """Returns True iff this node is a preterminal (that is, its + label is a part of speech tag, it has a non-empty token, and it + has no child nodes).""" + return len(self) == 0 + + # + # properties + # + + def token(): + doc = """The word for the top node in this subtree. If this + node is not a preterminal, this will be return None. Setting + the token on a non-preterminal to anything other than None will + cause a ValueError. The same goes for setting a preterminal's + token to None.""" + def fget(self): + return self._tree.word() or None + def fset(self, new_word): + new_word = new_word or None + if self.is_preterminal(): + if new_word is None: + raise ValueError("Can't set a null token on a " + "preterminal Tree.") + else: + # not a preterminal + if new_word is not None: + raise ValueError("Can't set the token on a " + "non-preterminal Tree.") + self._tree.setWord(new_word) + return locals() + token = property(**token()) + + def label(): + doc = """The label at the top of this subtree as a string. If + this tree is a preterminal, this will be its part of speech, + otherwise it will be the phrasal category. This property was + previously a method. It now returns a DeprecatedGetter as an + intermediate solution to help you find deprecated calls.""" + def fget(self): + return DeprecatedGetter('label', self._tree.term()) + def fset(self, new_label): + self._tree.setTerm(new_label) + return locals() + label = property(**label()) + + def label_suffix(): + doc = """Suffix for the label at the top node of this subtree + (including the hyphen). These include function tags (e.g., + "-SBJ" for subject") and coindexing ("-2"). 
In general, this + will be the empty string for any trees produced by BLLIP parser + but this property may be set if you read in gold trees.""" + def fget(self): + return self._tree.ntInfo() + def fset(self, new_tag): + self._tree.setNtInfo(new_tag) + return locals() + label_suffix = property(**label_suffix()) + + def sd_tokens(self, sd_converter=None, conversion_kwargs=None): + """Convert this Tree to Stanford Dependencies + (requires PyStanfordDependencies). Returns a list of + StanfordDependencies.Token objects. This method caches + the converted tokens. You may optionally specify a + StanfordDependencies instance in sd_converter and keyword + arguments to StanfordDependencies.convert_tree as a dictionary + in conversion_kwargs.""" + if not self._sd_tokens: + try: + import StanfordDependencies + except ImportError: + raise ImportError("For sd_tokens(), you need to install" + "PyStanfordDependencies from PyPI") + sd_converter = sd_converter or StanfordDependencies.get_instance() + conversion_kwargs = conversion_kwargs or {} + self._sd_tokens = sd_converter.convert_tree(str(self), + **conversion_kwargs) + return self._sd_tokens + + @classmethod + def trees_from_string(this_class, text): + """Given text containing multiple Penn Treebank trees, returns + a list of Tree objects (one for each tree in the text).""" + # Note: the native method below gives us memory ownership of + # the InputTree objects in the vector. We acquire their pointers + # and store them in a Python list (the vector won't stick + # around). InputTree objects typically contain other InputTree + # objects and the outer tree will free the inner trees when it is + # deleted. So, we only need (and want) to acquire the pointer of + # the outermost InputTree tree. 
class ScoredParse:
    """Bundles one parse together with the scores and ranks assigned
    to it.

    Properties:
    - ptb_parse: a Tree object representing the parse (str() it to get the
      actual PTB formatted parse)
    - parser_score: The log probability of the parse according to the parser
    - parser_rank: The rank of the parse according to the parser
    - reranker_score: The log probability of the parse according to
      the reranker
    - reranker_rank: The rank of the parse according to the reranker

    The latter two will be None if the reranker isn't being used."""
    def __init__(self, ptb_parse, parser_score=None, reranker_score=None,
                 parser_rank=None, reranker_rank=None):
        self.ptb_parse = ptb_parse
        self.parser_score = parser_score
        self.parser_rank = parser_rank
        self.reranker_score = reranker_score
        self.reranker_rank = reranker_rank
    def __str__(self):
        pieces = (self.parser_score, self.reranker_score, self.ptb_parse)
        return "%s %s %s" % pieces
    def __repr__(self):
        template = "%s(%r, parser_score=%r, reranker_score=%r)"
        return template % (self.__class__.__name__, str(self.ptb_parse),
                           self.parser_score, self.reranker_score)
This wraps the + SentRep structure in the Charniak parser.""" + def __init__(self, text_or_tokens): + if isinstance(text_or_tokens, parser.SentRep): + # ensure that Python owns the pointer + text_or_tokens.this.acquire() + self.sentrep = text_or_tokens + elif isinstance(text_or_tokens, Sentence): + self.sentrep = text_or_tokens.sentrep + elif isinstance(text_or_tokens, str): + self.sentrep = parser.tokenize(' ' + text_or_tokens + ' ') + else: + # text_or_tokens is a sequence -- need to make sure that each + # element is a string to avoid crashing + text_or_tokens = [parser.ptbEscape(str(token)) + for token in text_or_tokens] + self.sentrep = parser.SentRep(text_or_tokens) + def __repr__(self): + """Represent the Sentence as a string.""" + return "%s(%s)" % (self.__class__, self.tokens()) + def __len__(self): + """Returns the number of tokens in this sentence.""" + return len(self.sentrep) + def tokens(self): + """Returns a list of tokens in this sentence.""" + tokens = [] + for index in range(len(self.sentrep)): + tokens.append(self.sentrep.getWord(index).lexeme()) + return tokens + + @classmethod + def sentences_from_string(this_class, text): + """Given text containing SGML(-ish) lines (typical input to + the command line parser), returns a list of Sentence objects + (one for each tree in the text). Example usage: + + >>> Sentence.sentences_from_string(' Test ') + [bllipparser.RerankingParser.Sentence(['Test'])] + """ + # Note that the native method below leaks. We work around this + # by acquiring its pointer in __init__ + sentReps = parser.sentRepsFromString(text) + return list(map(this_class, sentReps)) + + @classmethod + def sentences_from_file(this_class, filename): + """Given the path to a filename containing multiple SGML(-ish) + lines (typical input to the command line parser), returns a list + of Sentence objects (one for each tree in the text).""" + # Note that the native method below leaks. 
class NBestList:
    """Represents an n-best list of parses of the same sentence."""
    def __init__(self, sentrep, parses, sentence_id=None):
        """sentrep is the parsed Sentence, parses is a sequence of
        (parser score, InputTree) pairs from the parser, and sentence_id
        is an optional identifier used in reranker I/O."""
        # we keep this around since it's our key to converting our input
        # to the reranker's format (see __str__())
        self._parses = parses
        self._sentrep = sentrep
        self.parses = []
        for index, (score, parse) in enumerate(parses):
            # acquire the InputTree pointers or they'll never be freed
            parse.this.acquire()
            scored_parse = ScoredParse(Tree(parse), score, parser_rank=index)
            self.parses.append(scored_parse)
        self.sentence_id = sentence_id
        # True if we've added reranker scores to our parses
        # (but doesn't necessarily imply that we're sorted by them)
        self._reranked = False

    def __getattr__(self, key):
        """Delegate everything else to our list of ScoredParse objects."""
        return getattr(self.parses, key)

    def sort_by_reranker_scores(self):
        """Sort the parses by the reranker's score (highest to lowest).
        If the reranker scores tie or there are no reranker scores, parser
        probabilities are used as a secondary key."""
        def sort_key(parse):
            # Unreranked parses have reranker_score=None; under Python 3
            # comparing None against a float raises TypeError, so map
            # None to -inf, which makes parser_score the effective key.
            score = parse.reranker_score
            if score is None:
                score = float('-inf')
            return (score, parse.parser_score)
        self.parses.sort(key=sort_key, reverse=True)

    def sort_by_parser_scores(self):
        """Sort the parses by the parser's probability (most likely to least
        likely)."""
        self.parses.sort(key=lambda parse: -parse.parser_score)

    def get_parser_best(self):
        """Get the best parse in this n-best list according to the parser.
        Returns None if the list is empty."""
        # Iterate self.parses directly: implicit special-method lookup
        # in Python 3 bypasses __getattr__, so min(self, ...) would fail
        # with "NBestList is not iterable".
        if self.parses:
            return min(self.parses, key=lambda parse: parse.parser_rank)
        else:
            return None

    def get_reranker_best(self):
        """Get the best parse in this n-best list according to the reranker.
        Returns None if the list is empty (mirroring get_parser_best)."""
        if self.parses:
            return min(self.parses, key=lambda parse: parse.reranker_rank)
        else:
            return None

    def tokens(self):
        """Get the tokens of this sentence as a sequence of strings."""
        return self._sentrep.tokens()

    def rerank(self, reranker_instance, lowercase=True):
        """Rerank this n-best list according to a reranker model.
        reranker_instance can be a RerankingParser or RerankerModel."""
        assert reranker_instance
        if not self.parses:
            self._reranked = True
            return
        if isinstance(reranker_instance, RerankingParser):
            reranker_instance = reranker_instance.reranker_model
        reranker_input = self.as_reranker_input()
        scores = reranker_instance.scoreNBestList(reranker_input)
        # this could be more efficient if needed
        for (score, nbest_list_item) in zip(scores, self.parses):
            nbest_list_item.reranker_score = score
        self.sort_by_reranker_scores()
        for index, nbest_list_item in enumerate(self.parses):
            nbest_list_item.reranker_rank = index
        self._reranked = True

    def __str__(self):
        """Represent the n-best list in a similar output format to the
        command-line parser and reranker."""
        sentence_id = self.sentence_id or 'x'
        if self._reranked:
            from io import StringIO
            combined = StringIO()
            combined.write('%d %s\n' % (len(self.parses), sentence_id))
            for parse in self.parses:
                combined.write('%s %s\n%s\n' % (parse.reranker_score,
                                                parse.parser_score,
                                                parse.ptb_parse))
            combined.seek(0)
            return combined.read()
        else:
            return parser.asNBestList(self._parses, str(sentence_id))

    def as_reranker_input(self, lowercase=True):
        """Convert the n-best list to an internal structure used as input
        to the reranker. You shouldn't typically need to call this."""
        return reranker.readNBestList(str(self), lowercase)
% + model_dir) + self._parser_model_loaded = True + self.parser_model_dir = model_dir + parser.loadModel(model_dir) + self.set_parser_options(**parser_options) + + def load_reranker_model(self, features_filename, weights_filename, + feature_class=None): + """Load the reranker model from its feature and weights files. A + feature class may optionally be specified.""" + if not exists(features_filename): + raise ValueError('Reranker features filename %r does not exist.' % + features_filename) + if not exists(weights_filename): + raise ValueError('Reranker weights filename %r does not exist.' % + weights_filename) + self.reranker_model = reranker.RerankerModel(feature_class, + features_filename, + weights_filename) + + def parse(self, sentence, rerank='auto', sentence_id=None): + """Parse some text or tokens and return an NBestList with the + results. sentence can be a string or a sequence. If it is a + string, it will be tokenized. If rerank is True, we will rerank + the n-best list, if False the reranker will not be used. rerank + can also be set to 'auto' which will only rerank if a reranker + model is loaded. If there are no parses or an error occurs, + this will return an empty NBestList.""" + rerank = self._check_loaded_models(rerank) + + sentence = Sentence(sentence) + if len(sentence) > parser.max_sentence_length: + raise ValueError("Sentence is too long (%s tokens, maximum " + "supported: %s)" % + (len(sentence), parser.max_sentence_length)) + + try: + parses = parser.parse(sentence.sentrep) + except RuntimeError: + parses = [] + #nbest_list = NBestList(sentence, parses, sentence_id) + #if rerank: + # nbest_list.rerank(self) + #return nbest_list + return parses + + def parse_tagged(self, tokens, possible_tags, rerank='auto', + sentence_id=None): + """Parse some pre-tagged, pre-tokenized text. tokens must be a + sequence of strings. possible_tags is map from token indices + to possible POS tags (strings). 
Tokens without an entry in + possible_tags will be unconstrained by POS. POS tags must be + in the terms.txt file in the parsing model or else you will get + a ValueError. If rerank is True, we will rerank the n-best list, + if False the reranker will not be used. rerank can also be set to + 'auto' which will only rerank if a reranker model is loaded.""" + rerank = self._check_loaded_models(rerank) + if isinstance(tokens, str): + raise ValueError("tokens must be a sequence, not a string.") + + ext_pos = parser.ExtPos() + for index in range(len(tokens)): + tags = possible_tags.get(index, []) + if isinstance(tags, str): + tags = [tags] + tags = list(map(str, tags)) + valid_tags = ext_pos.addTagConstraints(parser.StringVector(tags)) + if not valid_tags: + # at least one of the tags is bad -- find out which ones + # and throw a ValueError + self._find_bad_tag_and_raise_error(tags) + + sentence = Sentence(tokens) + parses = parser.parse(sentence.sentrep, ext_pos) + nbest_list = NBestList(sentence, parses, sentence_id) + if rerank: + nbest_list.rerank(self) + return nbest_list + + def simple_parse(self, text_or_tokens): + """Helper method for just parsing a single sentence and getting + its Penn Treebank tree. If you want anything more complicated + (e.g., the Tree objects, n-best lists, parser or reranker scores, + etc.), you'll want the parse() or parse_tagged() interfaces. + + >>> rrp.simple_parse('Parse this.') + '(S1 (S (VP (VB Parse) (NP (DT this))) (. .)))' + + text_or_tokens can be either a string or a sequence of tokens.""" + parses = self.parse(text_or_tokens) + return str(parses[0].ptb_parse) + + def tag(self, text_or_tokens): + """Helper method for just getting the part-of-speech tags of + a single sentence. This will parse the sentence and then read + part-of-speech tags off the tree, so it's not recommended if + all you need is a fast tagger. Returns a list of (token, tag) + using Penn Treebank part-of-speech tags. 
+ + >>> rrp.tag('Tag this.') + [('Tag', 'VB'), ('this', 'DT'), ('.', '.')] + + text_or_tokens can be either a string or a sequence of tokens.""" + parses = self.parse(text_or_tokens) + return parses[0].ptb_parse.tokens_and_tags() + + def _find_bad_tag_and_raise_error(self, tags): + ext_pos = parser.ExtPos() + bad_tags = set() + for tag in set(tags): + good_tag = ext_pos.addTagConstraints(parser.StringVector([tag])) + if not good_tag: + bad_tags.add(tag) + + raise ValueError("Invalid POS tags (not present in the parser's " + "terms.txt file): %s" % ', '.join(sorted(bad_tags))) + + def _check_loaded_models(self, rerank): + """Given a reranking mode (True, False, 'auto') determines + whether we have the appropriately loaded models. Also returns + whether the reranker should be used (essentially resolves the + value of rerank if rerank='auto').""" + if not self._parser_model_loaded: + raise ValueError("Parser model has not been loaded.") + if rerank is True and not self.reranker_model: + raise ValueError("Reranker model has not been loaded.") + if rerank == 'auto': + return bool(self.reranker_model) + else: + return rerank + + def set_parser_options(self, language='En', case_insensitive=False, + nbest=50, small_corpus=True, overparsing=21, + debug=0, smooth_pos=0): + """Set options for the parser. Note that this is called + automatically by load_parser_model() so you should only need to + call this to update the parsing options. The method returns a + dictionary of the new options. + + The options are as follows: language is a string describing + the language. Currently, it can be one of En (English), Ch + (Chinese), or Ar (Arabic). case_insensitive will make the parser + ignore capitalization. nbest is the maximum size of the n-best + list. small_corpus=True enables additional smoothing (originally + intended for training from small corpora, but helpful in many + situations). 
overparsing determines how much more time the parser + will spend on a sentence relative to the time it took to find the + first possible complete parse. This affects the speed/accuracy + tradeoff. debug takes a non-negative integer. Setting it higher + than 0 will cause the parser to print debug messages (surprising, + no?). Setting smooth_pos to a number higher than 0 will cause the + parser to assign that value as the probability of seeing a known + word in a new part-of-speech (one never seen in training).""" + if not self._parser_model_loaded: + raise RuntimeError('Parser must already be loaded (call ' + 'load_parser_model() first)') + + parser.setOptions(language, case_insensitive, nbest, small_corpus, + overparsing, debug, smooth_pos) + self.parser_options = { + 'language': language, + 'case_insensitive': case_insensitive, + 'nbest': nbest, + 'small_corpus': small_corpus, + 'overparsing': overparsing, + 'debug': debug, + 'smooth_pos': smooth_pos + } + return self.parser_options + + @classmethod + def from_unified_model_dir(this_class, model_dir, parsing_options=None, + reranker_options=None, parser_only=False): + """Create a RerankingParser from a unified parsing model on disk. + A unified parsing model should have the following filesystem + structure: + + parser/ + Charniak parser model: should contain pSgT.txt, *.g files + among others + reranker/ + features.gz or features.bz2 -- features for reranker + weights.gz or weights.bz2 -- corresponding weights of those + features + + If one of these subdirectories is missing, the corresponding + component will not be loaded. 
The parser_only flag can be used + to skip loading the reranker even if it available on disk.""" + parsing_options = parsing_options or {} + reranker_options = reranker_options or {} + (parser_model_dir, reranker_features_filename, + reranker_weights_filename) = get_unified_model_parameters(model_dir) + if parser_only and reranker_options: + raise ValueError("Can't set reranker_options if " + "parser_only is on.") + + rrp = this_class() + if parser_model_dir: + rrp.load_parser_model(parser_model_dir, **parsing_options) + if reranker_features_filename and reranker_weights_filename and \ + not parser_only: + rrp.load_reranker_model(reranker_features_filename, + reranker_weights_filename, + **reranker_options) + + rrp.unified_model_dir = model_dir + return rrp + +def tokenize(text): + """Helper method to tokenize a string. Note that most methods accept + untokenized text so you shouldn't need to run this if you intend + to parse this text. Returns a list of string tokens.""" + sentence = Sentence(text) + return sentence.tokens() + +def get_unified_model_parameters(model_dir): + """Determine the actual parser and reranker model filesystem entries + for a unified parsing model. Returns a triple: + + (parser_model_dir, reranker_features_filename, + reranker_weights_filename) + + Any of these can be None if that part of the model is not present + on disk (though, if you have only one of the reranker model files, + the reranker will not be loaded). 
+ + A unified parsing model should have the following filesystem structure: + + parser/ + Charniak parser model: should contain pSgT.txt, *.g files + among others + reranker/ + features.gz or features.bz2 -- features for reranker + weights.gz or weights.bz2 -- corresponding weights of those + features + """ + if not exists(model_dir): + raise IOError("Model directory %r does not exist" % model_dir) + + parser_model_dir = join(model_dir, 'parser') + if not exists(parser_model_dir): + parser_model_dir = None + reranker_model_dir = join(model_dir, 'reranker') + + def get_reranker_model_filename(name): + filename = join(reranker_model_dir, '%s.gz' % name) + if not exists(filename): + # try bz2 version + filename = join(reranker_model_dir, '%s.bz2' % name) + if not exists(filename): + filename = None + return filename + + features_filename = get_reranker_model_filename('features') + weights_filename = get_reranker_model_filename('weights') + return (parser_model_dir, features_filename, weights_filename) diff --git a/python/bllipparser_python3/Utility.py b/python/bllipparser_python3/Utility.py new file mode 100644 index 0000000..a10692f --- /dev/null +++ b/python/bllipparser_python3/Utility.py @@ -0,0 +1,75 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import math +import importlib + +# now part of waterworks.Tools +class DeprecatedGetter: + """Used when a getter method has been converted to a property. 
All + attributes will be dispatched to the property's value and a warning + will be raised if this is called. This doesn't work if the property + being deprecated has its own __call__ method, since that will be + unreachable as a DeprecatedGetter.""" + def __init__(self, name, value): + """name is the attribute name of the property. value is its + value.""" + self.__name = name + self.__value = value + def __getattr__(self, attr): + """All attributes except __call__ are dispatched to the property's + value.""" + return getattr(self.__value, attr) + def __call__(self, *args, **kwargs): + """Shouldn't be called except by deprecated code. Issues a warning + about the deprecation then returns the value so that deprecated + code will continue to work.""" + from warnings import warn + warn("%r is no longer a method. It's now a property." % self.__name, + DeprecationWarning, stacklevel=2) + return self.__value + +def import_maybe(module_name): + "Import a module and return it if available, otherwise returns None." + try: + return importlib.import_module(module_name) + except ImportError: + return None + +def get_nltk_tree_reader_maybe(): + """Attempts to find the NLTK tree reader for various versions of NLTK. 
def normalize_logprobs(logprobs, exponent=1):
    """Sum probs stored as log probs in a (more) numerically stable
    fashion, see:

    blog.smola.org/post/987977550/log-probabilities-semirings-and-floating-point

    We optionally raise all log probs to an exponent."""
    # Shift by the largest scaled log prob before exponentiating so the
    # exp() calls can't overflow; the shift cancels in the normalization.
    offset = max(logprobs) * exponent
    weights = [math.exp((logprob * exponent) - offset)
               for logprob in logprobs]
    total = sum(weights)
    return [weight / total for weight in weights]
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +""" +Python frontend to the BLLIP natural language parser. + +The BLLIP parser (also known as the Charniak-Johnson parser or +Brown Reranking Parser) is described in the paper `Charniak +and Johnson (Association of Computational Linguistics, 2005) +`_. This package +provides the BLLIP parser runtime along with a Python interface. Note +that it does not come with any parsing models but includes a model +downloader. The primary maintenance for the parser takes place at +`GitHub `_. + +We request acknowledgement in any publications that make use of this +software and any code derived from this software. Please report the +release date of the software that you are using, as this will enable +others to compare their results to yours. + +References: + +* Eugene Charniak and Mark Johnson. "`Coarse-to-fine n-best parsing and + MaxEnt discriminative reranking + `_." Proceedings of + the 43rd Annual Meeting on Association for Computational Linguistics. + `Association for Computational Linguistics, 2005 + `_. + +* Eugene Charniak. "`A maximum-entropy-inspired parser + `_." Proceedings of + the 1st North American chapter of the Association for Computational + Linguistics conference. `Association for Computational Linguistics, 2000 + `_. + +Fetching parsing models +----------------------- + +Before you can parse, you'll need some parsing models. ``ModelFetcher`` +will help you download and install parsing models. It can be invoked +from the command line. 
For example, this will download and install the +standard WSJ model:: + + shell% python -mbllipparser.ModelFetcher -i WSJ + +Run ``python -mbllipparser.ModelFetcher`` with no arguments for a full +listing of options and available parsing models. It can also be invoked +as a Python library:: + + >>> from bllipparser.ModelFetcher import download_and_install_model + >>> download_and_install_model('WSJ', '/tmp/models') + '/tmp/models/WSJ' + +In this case, it would download WSJ and install it to +``/tmp/models/WSJ``. Note that it returns the path to the downloaded +model. + +Basic usage +----------- + +The easiest way to construct a parser is with the +``from_unified_model_dir`` class method. A unified model is a directory +that contains two subdirectories: ``parser/`` and ``reranker/``, each +with the respective model files:: + + >>> from bllipparser import RerankingParser, tokenize + >>> rrp = RerankingParser.from_unified_model_dir('/path/to/model/') + +This can be integrated with ModelFetcher (if the model is already +installed, ``download_and_install_model`` is a no-op):: + + >>> model_dir = download_and_install_model('WSJ', '/tmp/models') + >>> rrp = RerankingParser.from_unified_model_dir(model_dir) + +You can also load parser and reranker models manually:: + + >>> rrp = RerankingParser() + >>> rrp.load_parser_model('/tmp/models/WSJ/parser') + >>> rrp.load_reranker_model('/tmp/models/WSJ/reranker/features.gz', '/tmp/models/WSJ/reranker/weights.gz') + +If you only want the top parse of a sentence in Penn Treebank format, use +the ``simple_parse()`` method:: + + >>> rrp.simple_parse('This is simple.') + '(S1 (S (NP (DT This)) (VP (VBZ is) (ADJP (JJ simple))) (. .)))' + +If you want more information about the parse, you'll want to use the +``parse()`` method which returns an ``NBestList`` object. The parser +produces an *n-best list* of the *n* most likely parses of the sentence +(default: *n=50*). 
Typically you only want the top parse, but the others +are available as well:: + + >>> nbest_list = rrp.parse('This is a sentence.') + +To get information about the top parse (note that the ``ptb_parse`` +property is a ``Tree`` object, described in more detail later):: + + >>> print repr(nbest_list[0]) + ScoredParse('(S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))', parser_score=-29.621201629004183, reranker_score=-7.9273829816098731) + >>> print nbest_list[0].ptb_parse + (S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .))) + >>> print nbest_list[0].parser_score + -29.621201629 + >>> print nbest_list[0].reranker_score + -7.92738298161 + >>> print len(nbest_list) + 50 + +If you have the `PyStanfordDependencies +`_ package, +you can parse straight to `Stanford Dependencies +`_:: + + >>> tokens = nbest_list[0].ptb_parse.sd_tokens() + >>> for token in tokens: + ... print token + ... + Token(index=1, form=u'This', cpos=u'DT', pos=u'DT', head=4, deprel=u'nsubj') + Token(index=2, form=u'is', cpos=u'VBZ', pos=u'VBZ', head=4, deprel=u'cop') + Token(index=3, form=u'a', cpos=u'DT', pos=u'DT', head=4, deprel=u'det') + Token(index=4, form=u'sentence', cpos=u'NN', pos=u'NN', head=0, deprel=u'root') + Token(index=5, form=u'.', cpos=u'.', pos=u'.', head=4, deprel=u'punct') + +This will attempt to use a default converter but see docs for how to +customize dependency conversion (or if you run into Java version issues). + +If you have an existing tokenizer, tokenization can also be specified +by passing a list of strings:: + + >>> nbest_list = rrp.parse(['This', 'is', 'a', 'pretokenized', 'sentence', '.']) + +The reranker can be disabled by setting ``rerank=False``:: + + >>> nbest_list = rrp.parse('Parser only!', rerank=False) + +You can also parse text with existing POS tags (these act as soft +constraints). 
In this example, token 0 ('Time') should have tag VB and +token 1 ('flies') should have tag NNS:: + + >>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : 'VB', 1 : 'NNS'})[0] + ScoredParse('(S1 (NP (VB Time) (NNS flies)))', parser_score=-53.94938875760073, reranker_score=-15.841407102717749) + +You don't need to specify a tag for all words: Here, token 0 ('Time') should +have tag VB and token 1 ('flies') is unconstrained:: + + >>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : 'VB'})[0] + ScoredParse('(S1 (S (VP (VB Time) (NP (VBZ flies)))))', parser_score=-54.390430751112156, reranker_score=-17.290145080887005) + +You can specify multiple tags for each token. When you do this, the +tags for a token will be used in decreasing priority. token 0 ('Time') +should have tag VB, JJ, or NN and token 1 ('flies') is unconstrained:: + + >>> rrp.parse_tagged(['Time', 'flies'], possible_tags={0 : ['VB', 'JJ', 'NN']})[0] + ScoredParse('(S1 (NP (NN Time) (VBZ flies)))', parser_score=-42.82904107213723, reranker_score=-12.865900776775314) + +There are many parser options which can be adjusted (though the defaults +should work well for most cases) with ``set_parser_options``. This +will change the size of the n-best list and pick the defaults for all +other options. 
It returns a dictionary of the current options:: + + >>> rrp.set_parser_options(nbest=10) + {'language': 'En', 'case_insensitive': False, 'debug': 0, 'small_corpus': True, 'overparsing': 21, 'smooth_pos': 0, 'nbest': 10} + >>> nbest_list = rrp.parse('The list is smaller now.', rerank=False) + >>> len(nbest_list) + 10 + +The parser can also be used as a tagger:: + + >>> rrp.tag("Time flies while you're having fun.") + [('Time', 'NNP'), ('flies', 'VBZ'), ('while', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('having', 'VBG'), ('fun', 'NN'), ('.', '.')] + +Use this if all you want is a tokenizer:: + + >>> tokenize("Tokenize this sentence, please.") + ['Tokenize', 'this', 'sentence', ',', 'please', '.'] + +Parsing shell +------------- + +There is an interactive shell which can help visualize a parse:: + + shell% python -mbllipparser.ParsingShell /path/to/model + +Once in the shell, type a sentence to have the parser parse it:: + + bllip> I saw the astronomer with the telescope. + Tokens: I saw the astronomer with the telescope . + + Parser's parse: + (S1 (S (NP (PRP I)) + (VP (VBD saw) + (NP (NP (DT the) (NN astronomer)) + (PP (IN with) (NP (DT the) (NN telescope))))) + (. .))) + + Reranker's parse: (parser index 2) + (S1 (S (NP (PRP I)) + (VP (VBD saw) + (NP (DT the) (NN astronomer)) + (PP (IN with) (NP (DT the) (NN telescope)))) + (. .))) + +If you have ``nltk`` installed, you can use its tree visualization to +see the output:: + + bllip> visual Show me this parse. + Tokens: Show me this parse . + + [graphical display of the parse appears] + +If you have ``PyStanfordDependencies`` installed, you can parse straight +to Stanford Dependencies:: + + bllip> sdparse Now with Stanford Dependencies integration! + Tokens: Now with Stanford Dependencies integration ! + + Parser and reranker: + Now [root] + +-- with [prep] + | +-- integration [pobj] + | +-- Stanford [nn] + | +-- Dependencies [nn] + +-- ! 
[punct] + +The ``asciitree`` package is required to visualize Stanford Dependencies +as a tree. If it is not available, the dependencies will be shown in +CoNLL-X format. + +There is more detailed help inside the shell under the ``help`` command. + +The Tree class +-------------- + +The parser provides a simple (immutable) Tree class which provides +information about Penn Treebank-style trees:: + + >>> tree = bllipparser.Tree('(S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)))') + >>> print tree + (S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .))) + +``pretty_string()`` provides a line-wrapped stringification:: + + >>> print tree.pretty_string() + (S1 (S (NP (DT This)) + (VP (VBZ is) + (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) + (. .))) + +You can obtain the tokens and tags of the tree:: + + >>> print tree.tokens() + ('This', 'is', 'a', 'fairly', 'simple', 'parse', 'tree', '.') + >>> print tree.tags() + ('DT', 'VBZ', 'DT', 'RB', 'JJ', 'NN', 'NN', '.') + >>> print tree.tokens_and_tags() + [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('fairly', 'RB'), ('simple', 'JJ'), ('parse', 'NN'), ('tree', 'NN'), ('.', '.')] + +Or get information about the labeled spans in the tree:: + + >>> print tree.span() + (0, 8) + >>> print tree.label + S1 + +You can navigate within the trees and more:: + + >>> tree.subtrees() + [Tree('(S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .))')] + >>> tree[0] # first subtree + Tree('(S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. 
.))') + >>> tree[0].label + 'S' + >>> tree[0][0] # first subtree of first subtree + Tree('(NP (DT This))') + >>> tree[0][0].label + 'NP' + >>> tree[0][0].span() + (0, 1) + >>> tree[0][0].tags() + ('DT',) + >>> tree[0][0].tokens() # tuple of all tokens in this span + ('This',) + >>> tree[0][0][0] + Tree('(DT This)') + >>> tree[0][0][0].token + 'This' + >>> tree[0][0][0].label + 'DT' + >>> tree[0][0][0].is_preterminal() + True + >>> len(tree[0]) # number of subtrees + 3 + >>> for subtree in tree[0]: + ... print subtree + ... + (NP (DT This)) + (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) + (. .) + >>> for subtree in tree.all_subtrees(): # all subtrees (recursive) + ... print subtree.is_preterminal(), subtree + ... + False (S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .))) + False (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)) + False (NP (DT This)) + True (DT This) + False (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) + True (VBZ is) + False (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree)) + True (DT a) + False (ADJP (RB fairly) (JJ simple)) + True (RB fairly) + True (JJ simple) + True (NN parse) + True (NN tree) + True (. .) +""" + +from .RerankingParser import RerankingParser, Tree, Sentence, tokenize + +__authors__ = 'Eugene Charniak, Mark Johnson, David McClosky, many others' +__license__ = 'Apache 2.0' +__version__ = '2015.01.11' +__maintainer__ = 'David McClosky' +__email__ = 'notsoweird+pybllipparser@gmail.com' diff --git a/python/bllipparser_python3/__main__.py b/python/bllipparser_python3/__main__.py new file mode 100644 index 0000000..b1151e5 --- /dev/null +++ b/python/bllipparser_python3/__main__.py @@ -0,0 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +if __name__ == "__main__": + from .ParsingShell import main + main()