Skip to content

Commit

Permalink
feat: optimize memory footprint
Browse files Browse the repository at this point in the history
Signed-off-by: James Yean <[email protected]>
  • Loading branch information
ifplusor committed Oct 29, 2024
1 parent 4e9890e commit 977f002
Show file tree
Hide file tree
Showing 23 changed files with 241 additions and 63 deletions.
10 changes: 6 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8)
cmake_minimum_required(VERSION 3.5)
project(actrie)

set(CMAKE_EXECUTABLE_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
Expand Down Expand Up @@ -40,9 +40,10 @@ if(GENERATE_LRTABLE)
endif()

set(actrie_HEADER_FILES
include/matcher.h
include/utf8ctx.h
include/utf8helper.h
include/actrie/config.h
include/actrie/matcher.h
include/actrie/utf8ctx.h
include/actrie/utf8helper.h
src/vocab.h
src/pattern.h
src/parser/lr_reduce.h
Expand Down Expand Up @@ -74,6 +75,7 @@ set(actrie_SOURCE_FILES
src/reglet/expr/dist.c
src/trie/actrie.c
src/trie/acdat.c
src/config.c
src/matcher.c
src/utf8ctx.c
src/utf8helper.c)
Expand Down
68 changes: 56 additions & 12 deletions actrie/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,49 +16,83 @@ def __del__(self):
_actrie.Destruct(self._matcher)

def load_from_file(
self, path, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
self,
path,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
if self._matcher:
raise MatcherError("Matcher is already initialized.")
if not os.path.isfile(path):
return False
self._matcher = _actrie.ConstructByFile(
convert2pass(path), all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra
convert2pass(path), all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint
)
return self._matcher != 0

@classmethod
def create_by_file(
cls, path, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
cls,
path,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
matcher = cls()
if matcher.load_from_file(path, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra):
if matcher.load_from_file(
path, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint=extra_hint
):
return matcher
return None

def load_from_string(
self, keywords, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
self,
keywords,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
if self._matcher:
raise MatcherError("Matcher is already initialized.")
if keywords is None:
return False
self._matcher = _actrie.ConstructByString(
convert2pass(keywords), all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra
convert2pass(keywords), all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint
)
return self._matcher != 0

@classmethod
def create_by_string(
cls, keywords, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
cls,
keywords,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
matcher = cls()
if matcher.load_from_string(keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra):
if matcher.load_from_string(
keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint=extra_hint
):
return matcher
return None

def load_from_collection(
self, keywords, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
self,
keywords,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
if self._matcher:
raise MatcherError("Matcher is already initialized.")
Expand All @@ -68,14 +102,24 @@ def load_from_collection(
keywords = "\n".join([convert2pass(keyword) for keyword in keywords if convert2pass(keyword)])
else:
raise MatcherError("Keywords should be list or set.")
return self.load_from_string(keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra)
return self.load_from_string(
keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint=extra_hint
)

@classmethod
def create_by_collection(
cls, keywords, all_as_plain=False, ignore_bad_pattern=False, bad_as_plain=True, deduplicate_extra=True
cls,
keywords,
all_as_plain=False,
ignore_bad_pattern=False,
bad_as_plain=True,
deduplicate_extra=True,
extra_hint=None,
):
matcher = cls()
if matcher.load_from_collection(keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra):
if matcher.load_from_collection(
keywords, all_as_plain, ignore_bad_pattern, bad_as_plain, deduplicate_extra, extra_hint=extra_hint
):
return matcher
return None

Expand Down
33 changes: 25 additions & 8 deletions actrie/src/wrap.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include "utf8ctx.h"
#include <actrie/utf8ctx.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
Expand All @@ -32,11 +32,20 @@ PyObject* wrap_construct_by_file(PyObject* dummy, PyObject* args) {
PyObject* ignore_bad_pattern;
PyObject* bad_as_plain;
PyObject* deduplicate_extra;
PyObject* extra_hint;
matcher_t matcher = NULL;

if (PyArg_ParseTuple(args, "sOOOO", &path, &all_as_plain, &ignore_bad_pattern, &bad_as_plain, &deduplicate_extra)) {
matcher = matcher_construct_by_file(path, PyObject_IsTrue(all_as_plain), PyObject_IsTrue(ignore_bad_pattern),
PyObject_IsTrue(bad_as_plain), PyObject_IsTrue(deduplicate_extra));
if (PyArg_ParseTuple(args, "sOOOOO", &path, &all_as_plain, &ignore_bad_pattern, &bad_as_plain, &deduplicate_extra,
&extra_hint)) {
if (extra_hint != Py_None) {
segarray_config_s extra_store_config = hint_segarray(PyLong_AsSize_t(extra_hint));
matcher = matcher_construct_by_file_ext(path, PyObject_IsTrue(all_as_plain), PyObject_IsTrue(ignore_bad_pattern),
PyObject_IsTrue(bad_as_plain), PyObject_IsTrue(deduplicate_extra),
&extra_store_config);
} else {
matcher = matcher_construct_by_file(path, PyObject_IsTrue(all_as_plain), PyObject_IsTrue(ignore_bad_pattern),
PyObject_IsTrue(bad_as_plain), PyObject_IsTrue(deduplicate_extra));
}
}

return Py_BuildValue("K", matcher);
Expand All @@ -49,13 +58,21 @@ PyObject* wrap_construct_by_string(PyObject* dummy, PyObject* args) {
PyObject* ignore_bad_pattern;
PyObject* bad_as_plain;
PyObject* deduplicate_extra;
PyObject* extra_hint;
matcher_t matcher = NULL;

if (PyArg_ParseTuple(args, "s#OOOO", &string, &length, &all_as_plain, &ignore_bad_pattern, &bad_as_plain,
&deduplicate_extra)) {
if (PyArg_ParseTuple(args, "s#OOOOO", &string, &length, &all_as_plain, &ignore_bad_pattern, &bad_as_plain,
&deduplicate_extra, &extra_hint)) {
strlen_s vocab = {.ptr = (char*)string, .len = (size_t)length};
matcher = matcher_construct_by_string(&vocab, PyObject_IsTrue(all_as_plain), PyObject_IsTrue(ignore_bad_pattern),
PyObject_IsTrue(bad_as_plain), PyObject_IsTrue(deduplicate_extra));
if (extra_hint != Py_None) {
segarray_config_s extra_store_config = hint_segarray(PyLong_AsSize_t(extra_hint));
matcher = matcher_construct_by_string_ext(&vocab, PyObject_IsTrue(all_as_plain),
PyObject_IsTrue(ignore_bad_pattern), PyObject_IsTrue(bad_as_plain),
PyObject_IsTrue(deduplicate_extra), &extra_store_config);
} else {
matcher = matcher_construct_by_string(&vocab, PyObject_IsTrue(all_as_plain), PyObject_IsTrue(ignore_bad_pattern),
PyObject_IsTrue(bad_as_plain), PyObject_IsTrue(deduplicate_extra));
}
}

return Py_BuildValue("K", matcher);
Expand Down
2 changes: 1 addition & 1 deletion deps/alib
26 changes: 26 additions & 0 deletions include/actrie/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* config.h
*
* @author James Yin <[email protected]>
*/
#ifndef __ACTRIE_CONFIG_H__
#define __ACTRIE_CONFIG_H__

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/**
 * Sizing parameters handed to the matcher_construct_by_*_ext() constructors
 * as `extra_store_config`. A ready-made value can be obtained from
 * hint_segarray().
 */
typedef struct _segarray_config_ {
/* Bit count; hint_segarray() never produces a value below 10.
 * NOTE(review): presumably log2 of the per-segment capacity - confirm against the segarray implementation. */
size_t seg_blen;
/* hint_segarray() sets this to (len >> seg_blen) + 1, never below 8.
 * NOTE(review): presumably the number of segment slots reserved up front - confirm against the segarray implementation. */
size_t region_size;
} segarray_config_s, *segarray_config_t;

/**
 * Compute a segarray configuration from a size hint.
 *
 * @param len size hint used to derive both fields.
 * @return the configuration, returned by value.
 */
segarray_config_s hint_segarray(size_t len);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif // __ACTRIE_CONFIG_H__
14 changes: 14 additions & 0 deletions include/matcher.h → include/actrie/matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#include <alib/string/astr.h>

#include <actrie/config.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
Expand All @@ -26,11 +28,23 @@ matcher_t matcher_construct_by_file(const char* path,
bool ignore_bad_pattern,
bool bad_as_plain,
bool deduplicate_extra);
matcher_t matcher_construct_by_file_ext(const char* path,
bool all_as_plain,
bool ignore_bad_pattern,
bool bad_as_plain,
bool deduplicate_extra,
segarray_config_t extra_store_config);
matcher_t matcher_construct_by_string(strlen_t string,
bool all_as_plain,
bool ignore_bad_pattern,
bool bad_as_plain,
bool deduplicate_extra);
matcher_t matcher_construct_by_string_ext(strlen_t string,
bool all_as_plain,
bool ignore_bad_pattern,
bool bad_as_plain,
bool deduplicate_extra,
segarray_config_t extra_store_config);
void matcher_destruct(matcher_t matcher);

context_t matcher_alloc_context(matcher_t matcher);
Expand Down
4 changes: 2 additions & 2 deletions include/utf8ctx.h → include/actrie/utf8ctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
#ifndef __ACTRIE_UTF8POS_H__
#define __ACTRIE_UTF8POS_H__

#include <matcher.h>
#include <utf8helper.h>
#include <actrie/matcher.h>
#include <actrie/utf8helper.h>

#ifdef __cplusplus
extern "C" {
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions jni/src/main/cpp/wrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
#include <psn_ifplusor_actrie_Context.h>
#include <psn_ifplusor_actrie_Matcher.h>

#include <matcher.h>
#include <utf8ctx.h>
#include <actrie/matcher.h>
#include <actrie/utf8ctx.h>

/*
* Class: psn_ifplusor_actrie_Matcher
Expand Down
1 change: 1 addition & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
line-length = 120
30 changes: 30 additions & 0 deletions src/config.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* config.c
*
* @author James Yin <[email protected]>
*/
#include "actrie/config.h"

#include <stdio.h>

/**
 * Compute a segarray configuration from a size hint.
 *
 * seg_blen is the bit length of (len / 8), raised to 10 when smaller.
 * region_size is (len >> seg_blen) + 1, raised to 8 when smaller.
 *
 * This is a pure computation: it performs no I/O, so callers embedding the
 * library keep full control of their stderr.
 *
 * @param len size hint. NOTE(review): presumably the expected number of
 *            stored entries - confirm against the callers.
 * @return the derived configuration, returned by value.
 */
segarray_config_s hint_segarray(size_t len) {
  segarray_config_s config = {.seg_blen = 0, .region_size = 0};

  /* Count the significant bits of len / 8. */
  for (size_t hint = len >> 3; hint != 0; hint >>= 1) {
    config.seg_blen++;
  }
  if (config.seg_blen < 10) {
    config.seg_blen = 10; /* lower bound on the bit count */
  }

  config.region_size = (len >> config.seg_blen) + 1;
  if (config.region_size < 8) {
    config.region_size = 8; /* lower bound on the region size */
  }

  return config;
}
Loading

0 comments on commit 977f002

Please sign in to comment.