From 859a623327ea9ef5e6dd7b2304a04319e207784a Mon Sep 17 00:00:00 2001 From: microhuang Date: Mon, 29 Aug 2016 11:27:48 +0800 Subject: [PATCH 1/3] support PHP7 --- actypes.h | 105 +++++----- ahocorasick.c | 533 +++++++++++++++++++++++++++++--------------------- ahocorasick.h | 96 +++++---- mss.c | 300 ++++++++++++++++++++++++++-- node.c | 225 +++++++++++---------- node.h | 43 ++-- php_mss.h | 1 + sample.php | 30 ++- 8 files changed, 864 insertions(+), 469 deletions(-) diff --git a/actypes.h b/actypes.h index ac75ae2..1cbea9c 100644 --- a/actypes.h +++ b/actypes.h @@ -2,18 +2,15 @@ * actypes.h: Includes basic data types of ahocorasick library * This file is part of multifast. * - Copyright 2010-2012 Kamiar Kanani - + Copyright 2010-2013 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . */ @@ -21,14 +18,18 @@ #ifndef _AC_TYPES_H_ #define _AC_TYPES_H_ +#ifdef __cplusplus +extern "C" { +#endif + /* AC_ALPHABET_t: * defines the alphabet type. - * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal - * with streams of other (bigger) types e.g. integers, specific enum, objects. + * Actually defining AC_ALPHABET_t as a char work as well, but sometimes we deal + * with streams of other basic types e.g. integers or enumerators. * Although they consists of string of bytes (chars), but using their specific - * types for AC_ALPHABET_t will lead to a better performance. so instead of - * dealing with strings of chars, we assume dealing with strings of - * AC_ALPHABET_t and leave it optional for other developers to define their + * types as AC_ALPHABET_t will lead to a better performance. so instead of + * working with strings of chars, we assume that we are working with strings of + * AC_ALPHABET_t and leave it optional for other users to define their * own alphabets. **/ typedef char AC_ALPHABET_t; @@ -36,34 +37,35 @@ typedef char AC_ALPHABET_t; /* AC_REP_t: * Provides a more readable representative for a pattern. * because patterns themselves are not always suitable for displaying - * (e.g. for hex patterns), we offer this type to improve intelligibility - * of output. furthermore, sometimes it is useful, for example while + * (e.g. hex patterns), we offer this type to improve intelligibility + * of output. Sometimes it can be also useful, when you are * retrieving patterns from a database, to maintain their identifiers in the * automata for further reference. we provisioned two possible types as a - * union for this purpose. you can add your desired type in it. + * union. you can add your desired type in it. **/ -typedef union { - char * stringy; /* null-terminated string */ - unsigned long number; +typedef union AC_REP +{ + const char * stringy; /* null-terminated string */ + unsigned long number; } AC_REP_t; /* AC_PATTERN_t: * This is the pattern type that must be fed into AC automata. - * the 'astring' field is not null-terminated, due to it can contain zero + * the 'astring' field is not null-terminated, because it can contain zero * value bytes. the 'length' field determines the number of AC_ALPHABET_t it - * carries. the 'representative' field is described in AC_REP_t. despite - * 'astring', 'representative' can have duplicate values for different given + * carries. the 'rep' field is described in AC_REP_t. despite + * 'astring', 'rep' can have duplicate values for different given * AC_PATTERN_t. it is an optional field and you can just fill it with 0. * CAUTION: * Not always the 'astring' points to the correct position in memory. * it is the responsibility of your program to maintain a permanent allocation - * for astring field of the added pattern to automata. + * for astring field. **/ -typedef struct +typedef struct AC_PATTERN { - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of pattern */ - AC_REP_t rep; /* Representative string (optional) */ + const AC_ALPHABET_t * astring; /* String of alphabets */ + unsigned int length; /* Length of pattern */ + AC_REP_t rep; /* Representative string (optional) */ } AC_PATTERN_t; /* AC_TEXT_t: @@ -71,15 +73,15 @@ typedef struct * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input * text, but for the purpose of being more readable, we defined this new type. **/ -typedef struct +typedef struct AC_TEXT { - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of string */ + const AC_ALPHABET_t * astring; /* String of alphabets */ + unsigned int length; /* Length of string */ } AC_TEXT_t; /* AC_MATCH_t: - * Provides the structure for reporting a match event. - * a match event occurs when the automata reaches a final node. any final + * Provides the structure for reporting a match in the text. + * a match occurs when the automata reaches a final node. any final * node can match one or more pattern at a position in a text. the * 'patterns' field holds these matched patterns. obviously these * matched patterns have same end-position in the text. there is a relationship @@ -91,45 +93,48 @@ typedef struct * respectively. finally the field 'match_num' maintains the number of * matched patterns. **/ -typedef struct +typedef struct AC_MATCH { - AC_PATTERN_t * patterns; /* Array of matched pattern */ - long position; /* The end position of matching pattern(s) in the text */ - unsigned int match_num; /* Number of matched patterns */ + AC_PATTERN_t * patterns; /* Array of matched pattern */ + long position; /* The end position of matching pattern(s) in the text */ + unsigned int match_num; /* Number of matched patterns */ } AC_MATCH_t; -/* AC_ERROR_t: - * Error that may occur while adding a pattern to the automata. - * it is returned by ac_automata_add(). +/* AC_STATUS_t: + * Return status of an AC function **/ -typedef enum +typedef enum AC_STATUS { - ACERR_SUCCESS = 0, /* No error occurred */ - ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ - ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ - ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ - ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling - ac_automata_finalize() you can not add new patterns to the automata. */ -} AC_ERROR_t; + ACERR_SUCCESS = 0, /* No error occurred */ + ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ + ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ + ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ + ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling + * ac_automata_finalize() you can not add new + * patterns to the automata. */ +} AC_STATUS_t; -/* MATCH_CALBACK_t: - * This is the call-back function type that must be given to automata at - * initialization to report match occurrence to the caller. - * at a match event, the automata will reach you using this function and sends +/* AC_MATCH_CALBACK_t: + * This is the call-back function to report match back to the caller. + * when a match is find, the automata will reach you using this function and sends * you a pointer to AC_MATCH_t. using that pointer you can handle * matches. you can send parameters to the call-back function when you call * ac_automata_search(). at call-back, the automata will sent you those - * parameters as the second parameter (void *) of MATCH_CALBACK_t. inside + * parameters as the second parameter (void *) of AC_MATCH_CALBACK_t. inside * the call-back function you can cast it to whatever you want. - * If you return 0 from MATCH_CALBACK_t function to the automata, it will + * If you return 0 from AC_MATCH_CALBACK_t function to the automata, it will * continue searching, otherwise it will return from ac_automata_search() * to your calling function. **/ -typedef int (*MATCH_CALBACK_f)(AC_MATCH_t *, void *); +typedef int (*AC_MATCH_CALBACK_f)(AC_MATCH_t *, void *); /* AC_PATTRN_MAX_LENGTH: * Maximum acceptable pattern length in AC_PATTERN_t.length **/ #define AC_PATTRN_MAX_LENGTH 1024 +#ifdef __cplusplus +} #endif + +#endif \ No newline at end of file diff --git a/ahocorasick.c b/ahocorasick.c index 536f788..4568a0a 100644 --- a/ahocorasick.c +++ b/ahocorasick.c @@ -2,18 +2,15 @@ * ahocorasick.c: implementation of ahocorasick library's functions * This file is part of multifast. * - Copyright 2010-2012 Kamiar Kanani - + Copyright 2010-2013 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . */ @@ -23,6 +20,7 @@ #include #include +#include "node.h" #include "ahocorasick.h" /* Allocation step for automata.all_nodes */ @@ -30,35 +28,33 @@ /* Private function prototype */ static void ac_automata_register_nodeptr - (AC_AUTOMATA_t * thiz, AC_NODE_t * node); + (AC_AUTOMATA_t * thiz, AC_NODE_t * node); static void ac_automata_union_matchstrs - (AC_NODE_t * node); + (AC_NODE_t * node); static void ac_automata_set_failure - (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); + (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); static void ac_automata_traverse_setfailure - (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); + (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); +static void ac_automata_reset (AC_AUTOMATA_t * thiz); /****************************************************************************** * FUNCTION: ac_automata_init * Initialize automata; allocate memories and set initial values * PARAMS: - * MATCH_CALBACK mc: call-back function - * the call-back function will be used to reach the caller on match occurrence ******************************************************************************/ -AC_AUTOMATA_t * ac_automata_init (MATCH_CALBACK_f mc) +AC_AUTOMATA_t * ac_automata_init () { - AC_AUTOMATA_t * thiz = (AC_AUTOMATA_t *)malloc(sizeof(AC_AUTOMATA_t)); - memset (thiz, 0, sizeof(AC_AUTOMATA_t)); - thiz->root = node_create (); - thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES; - thiz->all_nodes = (AC_NODE_t **) malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *)); - thiz->match_callback = mc; - ac_automata_register_nodeptr (thiz, thiz->root); - ac_automata_reset (thiz); - thiz->total_patterns = 0; - thiz->automata_open = 1; - return thiz; + AC_AUTOMATA_t * thiz = (AC_AUTOMATA_t *)malloc(sizeof(AC_AUTOMATA_t)); + memset (thiz, 0, sizeof(AC_AUTOMATA_t)); + thiz->root = node_create (); + thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES; + thiz->all_nodes = (AC_NODE_t **) malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *)); + ac_automata_register_nodeptr (thiz, thiz->root); + ac_automata_reset (thiz); + thiz->total_patterns = 0; + thiz->automata_open = 1; + return thiz; } /****************************************************************************** @@ -70,47 +66,47 @@ AC_AUTOMATA_t * ac_automata_init (MATCH_CALBACK_f mc) * RETUERN VALUE: AC_ERROR_t * the return value indicates the success or failure of adding action ******************************************************************************/ -AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt) +AC_STATUS_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt) { - unsigned int i; - AC_NODE_t * n = thiz->root; - AC_NODE_t * next; - AC_ALPHABET_t alpha; - - if(!thiz->automata_open) - return ACERR_AUTOMATA_CLOSED; - - if (!patt->length) - return ACERR_ZERO_PATTERN; - - if (patt->length > AC_PATTRN_MAX_LENGTH) - return ACERR_LONG_PATTERN; - - for (i=0; ilength; i++) - { - alpha = patt->astring[i]; - if ((next = node_find_next(n, alpha))) - { - n = next; - continue; - } - else - { - next = node_create_next(n, alpha); - next->depth = n->depth + 1; - n = next; - ac_automata_register_nodeptr(thiz, n); - } - } - - if(n->final) - return ACERR_DUPLICATE_PATTERN; - - n->final = 1; - node_register_matchstr(n, patt); - thiz->total_patterns++; - - return ACERR_SUCCESS; + unsigned int i; + AC_NODE_t * n = thiz->root; + AC_NODE_t * next; + AC_ALPHABET_t alpha; + + if(!thiz->automata_open) + return ACERR_AUTOMATA_CLOSED; + + if (!patt->length) + return ACERR_ZERO_PATTERN; + + if (patt->length > AC_PATTRN_MAX_LENGTH) + return ACERR_LONG_PATTERN; + + for (i=0; ilength; i++) + { + alpha = patt->astring[i]; + if ((next = node_find_next(n, alpha))) + { + n = next; + continue; + } + else + { + next = node_create_next(n, alpha); + next->depth = n->depth + 1; + n = next; + ac_automata_register_nodeptr(thiz, n); + } + } + + if(n->final) + return ACERR_DUPLICATE_PATTERN; + + n->final = 1; + node_register_matchstr(n, patt); + thiz->total_patterns++; + + return ACERR_SUCCESS; } /****************************************************************************** @@ -124,19 +120,19 @@ AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt) ******************************************************************************/ void ac_automata_finalize (AC_AUTOMATA_t * thiz) { - unsigned int i; - AC_ALPHABET_t alphas[AC_PATTRN_MAX_LENGTH]; - AC_NODE_t * node; - - ac_automata_traverse_setfailure (thiz, thiz->root, alphas); - - for (i=0; i < thiz->all_nodes_num; i++) - { - node = thiz->all_nodes[i]; - ac_automata_union_matchstrs (node); - node_sort_edges (node); - } - thiz->automata_open = 0; /* do not accept patterns any more */ + unsigned int i; + AC_ALPHABET_t alphas[AC_PATTRN_MAX_LENGTH]; + AC_NODE_t * node; + + ac_automata_traverse_setfailure (thiz, thiz->root, alphas); + + for (i=0; i < thiz->all_nodes_num; i++) + { + node = thiz->all_nodes[i]; + ac_automata_union_matchstrs (node); + node_sort_edges (node); + } + thiz->automata_open = 0; /* do not accept patterns any more */ } /****************************************************************************** @@ -148,61 +144,142 @@ void ac_automata_finalize (AC_AUTOMATA_t * thiz) * PARAMS: * AC_AUTOMATA_t * thiz: the pointer to the automata * AC_TEXT_t * txt: the input text that must be searched + * int keep: is the input text the successive chunk of the previous given text * void * param: this parameter will be send to call-back function. it is * useful for sending parameter to call-back function from caller function. * RETURN VALUE: - * -1: failed call; automata is not finalized - * 0: success; continue searching; call-back sent me a 0 value - * 1: success; stop searching; call-back sent me a non-0 value + * -1: failed; automata is not finalized + * 0: success; input text was searched to the end + * 1: success; input text was searched partially. (callback broke the loop) +******************************************************************************/ +int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep, + AC_MATCH_CALBACK_f callback, void * param) +{ + unsigned long position; + AC_NODE_t * current; + AC_NODE_t * next; + AC_MATCH_t match; + + if (thiz->automata_open) + /* you must call ac_automata_locate_failure() first */ + return -1; + + thiz->text = 0; + + if (!keep) + ac_automata_reset(thiz); + + position = 0; + current = thiz->current_node; + + /* This is the main search loop. + * it must be as lightweight as possible. */ + while (position < text->length) + { + if (!(next = node_findbs_next(current, text->astring[position]))) + { + if(current->failure_node /* we are not in the root node */) + current = current->failure_node; + else + position++; + } + else + { + current = next; + position++; + } + + if (current->final && next) + /* We check 'next' to find out if we came here after a alphabet + * transition or due to a fail. in second case we should not report + * matching because it was reported in previous node */ + { + match.position = position + thiz->base_position; + match.match_num = current->matched_patterns_num; + match.patterns = current->matched_patterns; + /* we found a match! do call-back */ + if (callback(&match, param)) + return 1; + } + } + + /* save status variables */ + thiz->current_node = current; + thiz->base_position += position; + return 0; +} + +/****************************************************************************** + * FUNCTION: ac_automata_settext ******************************************************************************/ -int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, void * param) +void ac_automata_settext (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep) { - unsigned long position; - AC_NODE_t * current; - AC_NODE_t * next; - - if(thiz->automata_open) - /* you must call ac_automata_locate_failure() first */ - return -1; - - position = 0; - current = thiz->current_node; - - /* This is the main search loop. - * it must be keep as lightweight as possible. */ - while (position < txt->length) - { - if(!(next = node_findbs_next(current, txt->astring[position]))) - { - if(current->failure_node /* we are not in the root node */) - current = current->failure_node; - else - position++; - } - else - { - current = next; - position++; - } - - if(current->final && next) - /* We check 'next' to find out if we came here after a alphabet - * transition or due to a fail. in second case we should not report - * matching because it was reported in previous node */ - { - thiz->match.position = position + thiz->base_position; - thiz->match.match_num = current->matched_patterns_num; - thiz->match.patterns = current->matched_patterns; - /* we found a match! do call-back */ - if (thiz->match_callback(&thiz->match, param)) - return 1; - } - } - - /* save status variables */ - thiz->current_node = current; - thiz->base_position += position; - return 0; + thiz->text = text; + if (!keep) + ac_automata_reset(thiz); + thiz->position = 0; +} + +/****************************************************************************** + * FUNCTION: ac_automata_findnext +******************************************************************************/ +AC_MATCH_t * ac_automata_findnext (AC_AUTOMATA_t * thiz) +{ + unsigned long position; + AC_NODE_t * current; + AC_NODE_t * next; + static AC_MATCH_t match; + + if (thiz->automata_open) + return 0; + + if (!thiz->text) + return 0; + + position = thiz->position; + current = thiz->current_node; + match.match_num = 0; + + /* This is the main search loop. + * it must be as lightweight as possible. */ + while (position < thiz->text->length) + { + if (!(next = node_findbs_next(current, thiz->text->astring[position]))) + { + if (current->failure_node /* we are not in the root node */) + current = current->failure_node; + else + position++; + } + else + { + current = next; + position++; + } + + if (current->final && next) + /* We check 'next' to find out if we came here after a alphabet + * transition or due to a fail. in second case we should not report + * matching because it was reported in previous node */ + { + match.position = position + thiz->base_position; + match.match_num = current->matched_patterns_num; + match.patterns = current->matched_patterns; + break; + } + } + + /* save status variables */ + thiz->current_node = current; + thiz->position = position; + + if (!match.match_num) + /* if we came here due to reaching to the end of input text + * not a loop break + */ + thiz->base_position += position; + + return match.match_num?&match:0; } /****************************************************************************** @@ -215,8 +292,8 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, void * param) ******************************************************************************/ void ac_automata_reset (AC_AUTOMATA_t * thiz) { - thiz->current_node = thiz->root; - thiz->base_position = 0; + thiz->current_node = thiz->root; + thiz->base_position = 0; } /****************************************************************************** @@ -227,16 +304,16 @@ void ac_automata_reset (AC_AUTOMATA_t * thiz) ******************************************************************************/ void ac_automata_release (AC_AUTOMATA_t * thiz) { - unsigned int i; - AC_NODE_t * n; - - for (i=0; i < thiz->all_nodes_num; i++) - { - n = thiz->all_nodes[i]; - node_release(n); - } - free(thiz->all_nodes); - free(thiz); + unsigned int i; + AC_NODE_t * n; + + for (i=0; i < thiz->all_nodes_num; i++) + { + n = thiz->all_nodes[i]; + node_release(n); + } + free(thiz->all_nodes); + free(thiz); } /****************************************************************************** @@ -249,48 +326,48 @@ void ac_automata_release (AC_AUTOMATA_t * thiz) ******************************************************************************/ void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast) { - unsigned int i, j; - AC_NODE_t * n; - struct edge * e; - AC_PATTERN_t sid; - - printf("---------------------------------\n"); - - for (i=0; iall_nodes_num; i++) - { - n = thiz->all_nodes[i]; - printf("NODE(%3d)/----fail----> NODE(%3d)\n", - n->id, (n->failure_node)?n->failure_node->id:1); - for (j=0; joutgoing_degree; j++) - { - e = &n->outgoing[j]; - printf(" |----("); - if(isgraph(e->alpha)) - printf("%c)---", e->alpha); - else - printf("0x%x)", e->alpha); - printf("--> NODE(%3d)\n", e->next->id); - } - if (n->matched_patterns_num) { - printf("Accepted patterns: {"); - for (j=0; jmatched_patterns_num; j++) - { - sid = n->matched_patterns[j]; - if(j) printf(", "); - switch (repcast) - { - case 'n': - printf("%ld", sid.rep.number); - break; - case 's': - printf("%s", sid.rep.stringy); - break; - } - } - printf("}\n"); - } - printf("---------------------------------\n"); - } + unsigned int i, j; + AC_NODE_t * n; + struct edge * e; + AC_PATTERN_t sid; + + printf("---------------------------------\n"); + + for (i=0; iall_nodes_num; i++) + { + n = thiz->all_nodes[i]; + printf("NODE(%3d)/----fail----> NODE(%3d)\n", + n->id, (n->failure_node)?n->failure_node->id:1); + for (j=0; joutgoing_degree; j++) + { + e = &n->outgoing[j]; + printf(" |----("); + if(isgraph(e->alpha)) + printf("%c)---", e->alpha); + else + printf("0x%x)", e->alpha); + printf("--> NODE(%3d)\n", e->next->id); + } + if (n->matched_patterns_num) { + printf("Accepted patterns: {"); + for (j=0; jmatched_patterns_num; j++) + { + sid = n->matched_patterns[j]; + if(j) printf(", "); + switch (repcast) + { + case 'n': + printf("%ld", sid.rep.number); + break; + case 's': + printf("%s", sid.rep.stringy); + break; + } + } + printf("}\n"); + } + printf("---------------------------------\n"); + } } /****************************************************************************** @@ -299,13 +376,13 @@ void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast) ******************************************************************************/ static void ac_automata_register_nodeptr (AC_AUTOMATA_t * thiz, AC_NODE_t * node) { - if(thiz->all_nodes_num >= thiz->all_nodes_max) - { - thiz->all_nodes_max += REALLOC_CHUNK_ALLNODES; - thiz->all_nodes = realloc - (thiz->all_nodes, thiz->all_nodes_max*sizeof(AC_NODE_t *)); - } - thiz->all_nodes[thiz->all_nodes_num++] = node; + if(thiz->all_nodes_num >= thiz->all_nodes_max) + { + thiz->all_nodes_max += REALLOC_CHUNK_ALLNODES; + thiz->all_nodes = realloc + (thiz->all_nodes, thiz->all_nodes_max*sizeof(AC_NODE_t *)); + } + thiz->all_nodes[thiz->all_nodes_num++] = node; } /****************************************************************************** @@ -315,18 +392,18 @@ static void ac_automata_register_nodeptr (AC_AUTOMATA_t * thiz, AC_NODE_t * node ******************************************************************************/ static void ac_automata_union_matchstrs (AC_NODE_t * node) { - unsigned int i; - AC_NODE_t * m = node; - - while ((m = m->failure_node)) - { - for (i=0; i < m->matched_patterns_num; i++) - node_register_matchstr(node, &(m->matched_patterns[i])); - - if (m->final) - node->final = 1; - } - // TODO : sort matched_patterns? is that necessary? I don't think so. + unsigned int i; + AC_NODE_t * m = node; + + while ((m = m->failure_node)) + { + for (i=0; i < m->matched_patterns_num; i++) + node_register_matchstr(node, &(m->matched_patterns[i])); + + if (m->final) + node->final = 1; + } + // TODO : sort matched_patterns? is that necessary? I don't think so. } /****************************************************************************** @@ -334,24 +411,24 @@ static void ac_automata_union_matchstrs (AC_NODE_t * node) * find failure node for the given node. ******************************************************************************/ static void ac_automata_set_failure - (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) + (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) { - unsigned int i, j; - AC_NODE_t * m; - - for (i=1; i < node->depth; i++) - { - m = thiz->root; - for (j=i; j < node->depth && m; j++) - m = node_find_next (m, alphas[j]); - if (m) - { - node->failure_node = m; - break; - } - } - if (!node->failure_node) - node->failure_node = thiz->root; + unsigned int i, j; + AC_NODE_t * m; + + for (i=1; i < node->depth; i++) + { + m = thiz->root; + for (j=i; j < node->depth && m; j++) + m = node_find_next (m, alphas[j]); + if (m) + { + node->failure_node = m; + break; + } + } + if (!node->failure_node) + node->failure_node = thiz->root; } /****************************************************************************** @@ -362,20 +439,20 @@ static void ac_automata_set_failure * can not add further pattern to automata. ******************************************************************************/ static void ac_automata_traverse_setfailure - (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) + (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) { - unsigned int i; - AC_NODE_t * next; + unsigned int i; + AC_NODE_t * next; - for (i=0; i < node->outgoing_degree; i++) - { - alphas[node->depth] = node->outgoing[i].alpha; - next = node->outgoing[i].next; + for (i=0; i < node->outgoing_degree; i++) + { + alphas[node->depth] = node->outgoing[i].alpha; + next = node->outgoing[i].next; - /* At every node look for its failure node */ - ac_automata_set_failure (thiz, next, alphas); + /* At every node look for its failure node */ + ac_automata_set_failure (thiz, next, alphas); - /* Recursively call itself to traverse all nodes */ - ac_automata_traverse_setfailure (thiz, next, alphas); - } -} + /* Recursively call itself to traverse all nodes */ + ac_automata_traverse_setfailure (thiz, next, alphas); + } +} \ No newline at end of file diff --git a/ahocorasick.h b/ahocorasick.h index fff0c1a..6af6d1a 100644 --- a/ahocorasick.h +++ b/ahocorasick.h @@ -2,18 +2,15 @@ * ahocorasick.h: the main ahocorasick header file. * This file is part of multifast. * - Copyright 2010-2012 Kamiar Kanani - + Copyright 2010-2013 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . */ @@ -21,49 +18,70 @@ #ifndef _AUTOMATA_H_ #define _AUTOMATA_H_ -#include "node.h" - -typedef struct -{ - /* The root of the Aho-Corasick trie */ - AC_NODE_t * root; - - /* maintain all nodes pointers. it will be used to access or release - * all nodes. */ - AC_NODE_t ** all_nodes; - - unsigned int all_nodes_num; /* Number of all nodes in the automata */ - unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */ - - AC_MATCH_t match; /* Any match is reported with this */ - MATCH_CALBACK_f match_callback; /* Match call-back function */ +#include "actypes.h" - /* this flag indicates that if automata is finalized by - * ac_automata_finalize() or not. 1 means finalized and 0 - * means not finalized (is open). after finalizing automata you can not - * add pattern to automata anymore. */ - unsigned short automata_open; - - /* It is possible to feed a large input to the automata chunk by chunk to - * be searched using ac_automata_search(). in fact by default automata - * thinks that all chunks are related unless you do ac_automata_reset(). - * followings are variables that keep track of searching state. */ - AC_NODE_t * current_node; /* Pointer to current node while searching */ - unsigned long base_position; /* Represents the position of current chunk - related to whole input text */ +#ifdef __cplusplus +extern "C" { +#endif - /* Statistic Variables */ - unsigned long total_patterns; /* Total patterns in the automata */ +struct AC_NODE; +typedef struct AC_AUTOMATA +{ + /* The root of the Aho-Corasick trie */ + struct AC_NODE * root; + + /* maintain all nodes pointers. it will be used to access or release + * all nodes. */ + struct AC_NODE ** all_nodes; + + unsigned int all_nodes_num; /* Number of all nodes in the automata */ + unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */ + + /* this flag indicates that if automata is finalized by + * ac_automata_finalize() or not. 1 means finalized and 0 + * means not finalized (is open). after finalizing automata you can not + * add pattern to automata anymore. */ + unsigned short automata_open; + + /* It is possible to feed a large input to the automata chunk by chunk to + * be searched using ac_automata_search(). in fact by default automata + * thinks that all chunks are related unless you do ac_automata_reset(). + * followings are variables that keep track of searching state. */ + struct AC_NODE * current_node; /* Pointer to current node while searching */ + unsigned long base_position; /* Represents the position of current chunk + * related to whole input text */ + + /* The input text. + * used only when it is working in settext/findnext mode */ + AC_TEXT_t * text; + + /* The lase searched position in the chunk. + * used only when it is working in settext/findnext mode */ + unsigned long position; + + /* Statistic Variables */ + + /* Total patterns in the automata */ + unsigned long total_patterns; + } AC_AUTOMATA_t; -AC_AUTOMATA_t * ac_automata_init (MATCH_CALBACK_f mc); -AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); +AC_AUTOMATA_t * ac_automata_init (void); +AC_STATUS_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); void ac_automata_finalize (AC_AUTOMATA_t * thiz); -int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * str, void * param); -void ac_automata_reset (AC_AUTOMATA_t * thiz); +int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep, AC_MATCH_CALBACK_f callback, void * param); + +void ac_automata_settext (AC_AUTOMATA_t * thiz, AC_TEXT_t * text, int keep); +AC_MATCH_t * ac_automata_findnext (AC_AUTOMATA_t * thiz); + void ac_automata_release (AC_AUTOMATA_t * thiz); void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast); + +#ifdef __cplusplus +} #endif + +#endif \ No newline at end of file diff --git a/mss.c b/mss.c index 1400772..5baffc4 100644 --- a/mss.c +++ b/mss.c @@ -42,19 +42,36 @@ typedef struct { zval *ext; } user_callback_param_t; -static int match_callback_closure(AC_MATCH_t *m, user_callback_param_t *ucp - TSRMLS_DC) { - zval *retval; +static int match_callback_closure(AC_MATCH_t *m, user_callback_param_t *ucp TSRMLS_DC) { zval invoke; +#if PHP_MAJOR_VERSION < 7 + zval *retval; +#else + zval retval; +#endif +//Closure::__invoke() +#if PHP_MAJOR_VERSION < 7 ZVAL_STRING(&invoke, "__invoke", 8); +#else + ZVAL_STRING(&invoke, "__invoke"); + //ZVAL_STRINGL(&invoke, "__invoke", 8); +#endif +#if PHP_MAJOR_VERSION < 7 zval **args[4]; +#else + zval args[4]; +#endif int argv; if (ucp->ext) { +#if PHP_MAJOR_VERSION < 7 args[3] = &(ucp->ext); +#else + args[3] = *(ucp->ext); //$ext => function(.....,$p4){} +#endif argv = 4; } else { - argv = 3; + argv = 3; //function($p1,$p2,$p3){} } int i; @@ -65,35 +82,76 @@ static int match_callback_closure(AC_MATCH_t *m, user_callback_param_t *ucp zval *idx; zval *type; +#if PHP_MAJOR_VERSION < 7 ALLOC_INIT_ZVAL(kw); ALLOC_INIT_ZVAL(idx); ALLOC_INIT_ZVAL(type); +#else + kw = ecalloc(sizeof(zval), 1); + idx = ecalloc(sizeof(zval), 1); + type = ecalloc(sizeof(zval), 1); +#endif +#if PHP_MAJOR_VERSION < 7 ZVAL_STRING(kw, pattern->astring, pattern->length); +#else + ZVAL_STRING(kw, pattern->astring); +#endif ZVAL_LONG(idx, m->position - pattern->length); if (pattern->rep.stringy) { +#if PHP_MAJOR_VERSION < 7 ZVAL_STRING(type, pattern->rep.stringy, strlen(pattern->rep.stringy)); +#else + ZVAL_STRING(type, pattern->rep.stringy); +#endif } else { ZVAL_NULL(type); } +#if PHP_MAJOR_VERSION < 7 args[0] = &kw; args[1] = &idx; args[2] = &type; +#else + args[0] = *kw; + args[1] = *idx; + args[2] = *type; +#endif + int call_result; +#if PHP_MAJOR_VERSION < 7 if (call_user_function_ex(NULL, &(ucp->callback), &invoke, &retval, argv, args, 0, NULL TSRMLS_CC) != SUCCESS) { +#else + if ((call_result = call_user_function_ex(NULL, ucp->callback, &invoke, &retval, + argv, args, 0, NULL TSRMLS_CC)) != SUCCESS) { +#endif zend_error(E_ERROR, "invoke callback failed"); } +#if PHP_MAJOR_VERSION < 7 zval_ptr_dtor(&type); zval_ptr_dtor(&idx); zval_ptr_dtor(&kw); +#else + zval_ptr_dtor(type); + zval_ptr_dtor(idx); + zval_ptr_dtor(kw); +#endif +#if PHP_MAJOR_VERSION < 7 if (Z_LVAL_P(retval)) { +#else + if (call_result == SUCCESS && Z_TYPE(retval) != IS_UNDEF && zval_is_true(&retval)) { +#endif return 1; } +#if PHP_MAJOR_VERSION < 7 +#else + zval_ptr_dtor(&retval); +#endif } + return 0; } @@ -106,8 +164,10 @@ static int match_callback(AC_MATCH_t *m, void *param TSRMLS_DC) { } if (mcp->type == MCP_TYPE_CLOSURE) { + return match_callback_closure(m, (user_callback_param_t *)(mcp->value) TSRMLS_CC); + } // MCP_TYPE_ARRAY @@ -115,22 +175,41 @@ static int match_callback(AC_MATCH_t *m, void *param TSRMLS_DC) { int i; for (i=0; i < m->match_num; i++) { AC_PATTERN_t *pattern = &(m->patterns[i]); +#if PHP_MAJOR_VERSION < 7 zval *match; ALLOC_INIT_ZVAL(match); array_init(match); +#else + zval match; + array_init(&match); +#endif + +#if PHP_MAJOR_VERSION < 7 add_index_string(match, 0, pattern->astring, 1); add_index_long(match, 1, m->position - pattern->length); +#else + add_index_string(&match, 0, pattern->astring); + add_index_long(&match, 1, m->position - pattern->length); +#endif if (pattern->rep.stringy) { +#if PHP_MAJOR_VERSION < 7 add_index_string(match, 2, pattern->rep.stringy, 1); +#else + add_index_string(&match, 2, pattern->rep.stringy); +#endif } + +#if PHP_MAJOR_VERSION < 7 add_next_index_zval(matches, match); +#else + add_next_index_zval(matches, &match); +#endif } return 0; } // - int le_mss, le_mss_persist; #define PHP_MSS_RES_NAME "MSS resource" @@ -176,12 +255,20 @@ static void mss_free(mss_t *mss TSRMLS_DC) { pefree(mss, mss->persist); } +#if PHP_MAJOR_VERSION < 7 static void mss_dtor(zend_rsrc_list_entry *rsrc TSRMLS_DC) { +#else +static void mss_dtor(zend_resource *rsrc TSRMLS_DC) { +#endif mss_t *mss = (mss_t *)rsrc->ptr; mss_free(mss TSRMLS_CC); } +#if PHP_MAJOR_VERSION < 7 static void mss_persist_dtor(zend_rsrc_list_entry *rsrc TSRMLS_DC) { +#else +static void mss_persist_dtor(zend_resource *rsrc TSRMLS_DC) { +#endif mss_t *mss = (mss_t *)rsrc->ptr; mss_free(mss TSRMLS_CC); } @@ -194,6 +281,7 @@ static zend_function_entry mss_functions[] = { PHP_FE(mss_add, NULL) PHP_FE(mss_search, NULL) PHP_FE(mss_match, NULL) + PHP_FE(mss_display, NULL) {NULL, NULL, NULL} }; @@ -228,39 +316,67 @@ PHP_MINIT_FUNCTION(mss) { } PHP_FUNCTION(mss_create) { +#if PHP_MAJOR_VERSION < 7 char *name = NULL; - int name_len; + int name_len = 0; +#else + char* name = NULL; + size_t name_len = 0; +#endif long expiry = -1; zend_bool persist; mss_t *mss = NULL; +#if PHP_MAJOR_VERSION < 7 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|sl", &name, &name_len, &expiry) == FAILURE) { +#else + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|sl", + &name, &name_len, &expiry) == FAILURE) { +#endif RETURN_FALSE; } persist = name ? 1 : 0; if (persist) { +#if PHP_MAJOR_VERSION < 7 zend_rsrc_list_entry *le; +#else + zend_resource *le; +#endif +#if PHP_MAJOR_VERSION < 7 if (zend_hash_find(&EG(persistent_list), name, name_len + 1, (void **)&le) == SUCCESS) { +#else + if ((le = zend_hash_str_find_ptr(&EG(persistent_list), name, name_len + 1))) { +#endif mss = le->ptr; struct timeval tv; gettimeofday(&tv, NULL); if (expiry < 0 || tv.tv_sec - mss->timestamp < expiry) { +#if PHP_MAJOR_VERSION < 7 ZEND_REGISTER_RESOURCE(return_value, mss, le_mss_persist); return; +#else + RETURN_RES(zend_register_resource(mss, le_mss_persist)); +#endif } +#if PHP_MAJOR_VERSION < 7 zend_hash_del(&EG(persistent_list), name, name_len + 1); +#else + //zend_hash_del(&EG(persistent_list), name); + zend_hash_str_del(&EG(persistent_list), name, name_len + 1); +#endif } } mss = pemalloc(sizeof(mss_t), persist); - mss->ac = ac_automata_init(match_callback); + //mss->ac = ac_automata_init(match_callback);//1.0.0 + mss->ac = ac_automata_init();//1.3.1 struct timeval tv; gettimeofday(&tv, NULL); @@ -273,32 +389,67 @@ PHP_FUNCTION(mss_create) { if (persist) { mss->name = pestrndup(name, name_len + 1, persist); +#if PHP_MAJOR_VERSION < 7 ZEND_REGISTER_RESOURCE(return_value, mss, le_mss_persist); +#else + zend_resource* new_le = zend_register_resource(mss, le_mss_persist); + RETVAL_RES(new_le); +#endif +#if PHP_MAJOR_VERSION < 7 zend_rsrc_list_entry new_le; new_le.ptr = mss; new_le.type = le_mss_persist; +#else + new_le->ptr = mss; + new_le->type = le_mss_persist; +#endif +#if PHP_MAJOR_VERSION < 7 zend_hash_add(&EG(persistent_list), name, name_len + 1, &new_le, sizeof(zend_rsrc_list_entry), NULL); +#else + zend_hash_str_update_mem(&EG(persistent_list), name, name_len + 1, new_le, sizeof(zend_resource)); +#endif } else { mss->name = NULL; +#if PHP_MAJOR_VERSION < 7 ZEND_REGISTER_RESOURCE(return_value, mss, le_mss); +#else + RETURN_RES(zend_register_resource(mss, le_mss)); +#endif } + } PHP_FUNCTION(mss_destroy) { mss_t *mss; zval *zmss; +#if PHP_MAJOR_VERSION < 7 + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) + == FAILURE) { +#else if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) == FAILURE) { +#endif RETURN_FALSE; } +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif if (mss && mss->persist) { +#if PHP_MAJOR_VERSION < 7 zend_hash_del(&EG(persistent_list), mss->name, strlen(mss->name) + 1); +#else + zend_hash_str_del(&EG(persistent_list), mss->name, strlen(mss->name) + 1); +#endif RETURN_TRUE; } RETURN_FALSE; @@ -308,13 +459,25 @@ PHP_FUNCTION(mss_timestamp) { mss_t *mss; zval *zmss; +#if PHP_MAJOR_VERSION < 7 + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) + == FAILURE) { +#else if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) == FAILURE) { +#endif RETURN_FALSE; } +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif RETURN_LONG(mss->timestamp); } @@ -323,13 +486,25 @@ PHP_FUNCTION(mss_is_ready) { mss_t *mss; zval *zmss; +#if PHP_MAJOR_VERSION < 7 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) == FAILURE) { +#else + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) + == FAILURE) { +#endif RETURN_FALSE; } +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif RETURN_BOOL(!mss->ac->automata_open) } @@ -338,14 +513,27 @@ PHP_FUNCTION(mss_add) { mss_t *mss; zval *zmss; +#if PHP_MAJOR_VERSION < 7 char *kw; int kw_len; char *type = NULL; - int type_len; + int type_len = 0; +#else + char *kw; + size_t kw_len; + + char *type = NULL; + size_t type_len = 0; +#endif +#if PHP_MAJOR_VERSION < 7 + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs|s", &zmss, + &kw, &kw_len, &type, &type_len) == FAILURE) { +#else if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs|s", &zmss, &kw, &kw_len, &type, &type_len) == FAILURE) { +#endif RETURN_FALSE; } @@ -355,14 +543,22 @@ PHP_FUNCTION(mss_add) { RETURN_FALSE; } +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif AC_PATTERN_t pattern; - pattern.astring = pestrdup(kw, mss->persist); + + pattern.astring = pestrndup(kw, kw_len, mss->persist); pattern.length = kw_len; pattern.rep.stringy = type - ? pestrdup(type, mss->persist) + ? pestrndup(type, type_len, mss->persist) : NULL; list_item_t *item = pemalloc(sizeof(list_item_t), mss->persist); @@ -391,13 +587,27 @@ PHP_FUNCTION(mss_search) { zval *callback = NULL; zval *ext = NULL; +#if PHP_MAJOR_VERSION < 7 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs|oz", &zmss, &text.astring, &text.length, &callback, &ext) == FAILURE) { RETURN_FALSE; } +#else + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs|oz", &zmss, + &text.astring, &text.length, &callback, &ext) == FAILURE) { + RETURN_FALSE; + } +#endif +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif if (mss->ac->automata_open) { ac_automata_finalize(mss->ac); @@ -405,18 +615,27 @@ PHP_FUNCTION(mss_search) { AC_AUTOMATA_t ac; memcpy(&ac, mss->ac, sizeof(AC_AUTOMATA_t)); - - ac_automata_reset(&ac); + //ac_automata_reset(&ac);//1.0.0 match_callback_param_t mcp; + zend_bool matched = 0; + if (callback) { user_callback_param_t ucp; ucp.callback = callback; ucp.ext = ext; mcp.type = MCP_TYPE_CLOSURE; mcp.value = &ucp; + +#if PHP_MAJOR_VERSION < 7 RETVAL_TRUE; +#else + //ac_automata_search(&ac, &text, &mcp);//1.0.0 + ac_automata_search(&ac, &text, 0, match_callback, &mcp);//1.3.1 + RETURN_TRUE;//应立即返回 +#endif + } else { zval *matches = return_value; array_init(matches); @@ -424,7 +643,9 @@ PHP_FUNCTION(mss_search) { mcp.value = matches; } - ac_automata_search(&ac, &text, &mcp); + ac_automata_search(&ac, &text, 0, match_callback, &mcp); + + //efree(&ac); } PHP_FUNCTION(mss_match) { @@ -432,13 +653,25 @@ PHP_FUNCTION(mss_match) { zval *zmss; AC_TEXT_t text; +#if PHP_MAJOR_VERSION < 7 + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", &zmss, + &text.astring, &text.length) == FAILURE) { +#else if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "rs", &zmss, &text.astring, &text.length) == FAILURE) { +#endif RETURN_FALSE; } +#if PHP_MAJOR_VERSION < 7 ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif if (mss->ac->automata_open) { ac_automata_finalize(mss->ac); @@ -447,7 +680,7 @@ PHP_FUNCTION(mss_match) { AC_AUTOMATA_t *rac = emalloc(sizeof(AC_AUTOMATA_t)); memcpy(rac, mss->ac, sizeof(AC_AUTOMATA_t)); - ac_automata_reset(rac); + //ac_automata_reset(rac);//1.0.0 match_callback_param_t mcp; zend_bool matched = 0; @@ -455,9 +688,46 @@ PHP_FUNCTION(mss_match) { mcp.type = MCP_TYPE_MATCH; mcp.value = &matched; - ac_automata_search(rac, &text, &mcp); + ac_automata_search(rac, &text, 0, match_callback, &mcp); efree(rac); RETURN_BOOL(matched); } + +PHP_FUNCTION(mss_display) { + mss_t *mss; + zval *zmss; + +#if PHP_MAJOR_VERSION < 7 + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) + == FAILURE) { +#else + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zmss) + == FAILURE) { +#endif + RETURN_FALSE; + } + +#if PHP_MAJOR_VERSION < 7 + ZEND_FETCH_RESOURCE2(mss, mss_t*, &zmss, -1, PHP_MSS_RES_NAME, le_mss, + le_mss_persist); +#else + if((mss = (mss_t *)zend_fetch_resource2(Z_RES_P(zmss), PHP_MSS_RES_NAME, le_mss, le_mss_persist)) == NULL) + { + RETURN_FALSE; + } +#endif + + if (mss->ac->automata_open) { + ac_automata_finalize(mss->ac); + } + + AC_AUTOMATA_t *rac = emalloc(sizeof(AC_AUTOMATA_t)); + memcpy(rac, mss->ac, sizeof(AC_AUTOMATA_t)); + + //ac_automata_reset(rac);//1.0.0 + + ac_automata_display(rac, 's'); + +} diff --git a/node.c b/node.c index 10e4b84..83377cb 100644 --- a/node.c +++ b/node.c @@ -2,18 +2,15 @@ * node.c: implementation of automata node * This file is part of multifast. * - Copyright 2010-2012 Kamiar Kanani - + Copyright 2010-2013 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . */ @@ -42,13 +39,13 @@ int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr); * FUNCTION: node_create * Create the node ******************************************************************************/ -struct node * node_create(void) +struct AC_NODE * node_create(void) { - AC_NODE_t * thiz; - thiz = (AC_NODE_t *) malloc (sizeof(AC_NODE_t)); - node_init(thiz); - node_assign_id(thiz); - return thiz; + AC_NODE_t * thiz; + thiz = (AC_NODE_t *) malloc (sizeof(AC_NODE_t)); + node_init(thiz); + node_assign_id(thiz); + return thiz; } /****************************************************************************** @@ -57,15 +54,15 @@ struct node * node_create(void) ******************************************************************************/ void node_init(AC_NODE_t * thiz) { - memset(thiz, 0, sizeof(AC_NODE_t)); + memset(thiz, 0, sizeof(AC_NODE_t)); - thiz->outgoing_max = REALLOC_CHUNK_OUTGOING; - thiz->outgoing = (struct edge *) malloc - (thiz->outgoing_max*sizeof(struct edge)); + thiz->outgoing_max = REALLOC_CHUNK_OUTGOING; + thiz->outgoing = (struct edge *) malloc + (thiz->outgoing_max*sizeof(struct edge)); - thiz->matched_patterns_max = REALLOC_CHUNK_MATCHSTR; - thiz->matched_patterns = (AC_PATTERN_t *) malloc - (thiz->matched_patterns_max*sizeof(AC_PATTERN_t)); + thiz->matched_patterns_max = REALLOC_CHUNK_MATCHSTR; + thiz->matched_patterns = (AC_PATTERN_t *) malloc + (thiz->matched_patterns_max*sizeof(AC_PATTERN_t)); } /****************************************************************************** @@ -74,9 +71,9 @@ void node_init(AC_NODE_t * thiz) ******************************************************************************/ void node_release(AC_NODE_t * thiz) { - free(thiz->matched_patterns); - free(thiz->outgoing); - free(thiz); + free(thiz->matched_patterns); + free(thiz->outgoing); + free(thiz); } /****************************************************************************** @@ -87,14 +84,14 @@ void node_release(AC_NODE_t * thiz) ******************************************************************************/ AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha) { - int i; - - for (i=0; i < thiz->outgoing_degree; i++) - { - if(thiz->outgoing[i].alpha == alpha) - return (thiz->outgoing[i].next); - } - return NULL; + int i; + + for (i=0; i < thiz->outgoing_degree; i++) + { + if(thiz->outgoing[i].alpha == alpha) + return (thiz->outgoing[i].next); + } + return NULL; } /****************************************************************************** @@ -104,24 +101,24 @@ AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha) ******************************************************************************/ AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) { - int min, max, mid; - AC_ALPHABET_t amid; - - min = 0; - max = thiz->outgoing_degree - 1; - - while (min <= max) - { - mid = (min+max) >> 1; - amid = thiz->outgoing[mid].alpha; - if (alpha > amid) - min = mid + 1; - else if (alpha < amid) - max = mid - 1; - else - return (thiz->outgoing[mid].next); - } - return NULL; + int min, max, mid; + AC_ALPHABET_t amid; + + min = 0; + max = thiz->outgoing_degree - 1; + + while (min <= max) + { + mid = (min+max) >> 1; + amid = thiz->outgoing[mid].alpha; + if (alpha > amid) + min = mid + 1; + else if (alpha < amid) + max = mid - 1; + else + return (thiz->outgoing[mid].next); + } + return NULL; } /****************************************************************************** @@ -131,24 +128,24 @@ AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) ******************************************************************************/ int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr) { - int i, j; - AC_PATTERN_t * str; + int i, j; + AC_PATTERN_t * str; - for (i=0; i < thiz->matched_patterns_num; i++) - { - str = &thiz->matched_patterns[i]; + for (i=0; i < thiz->matched_patterns_num; i++) + { + str = &thiz->matched_patterns[i]; - if (str->length != newstr->length) - continue; + if (str->length != newstr->length) + continue; - for (j=0; jlength; j++) - if(str->astring[j] != newstr->astring[j]) - continue; + for (j=0; jlength; j++) + if(str->astring[j] != newstr->astring[j]) + continue; - if (j == str->length) - return 1; - } - return 0; + if (j == str->length) + return 1; + } + return 0; } /****************************************************************************** @@ -157,16 +154,16 @@ int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr) ******************************************************************************/ AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) { - AC_NODE_t * next; - next = node_find_next (thiz, alpha); - if (next) - /* The edge already exists */ - return NULL; - /* Otherwise register new edge */ - next = node_create (); - node_register_outgoing(thiz, next, alpha); - - return next; + AC_NODE_t * next; + next = node_find_next (thiz, alpha); + if (next) + /* The edge already exists */ + return NULL; + /* Otherwise register new edge */ + next = node_create (); + node_register_outgoing(thiz, next, alpha); + + return next; } /****************************************************************************** @@ -175,22 +172,22 @@ AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) ******************************************************************************/ void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str) { - /* Check if the new pattern already exists in the node list */ - if (node_has_matchstr(thiz, str)) - return; - - /* Manage memory */ - if (thiz->matched_patterns_num >= thiz->matched_patterns_max) - { - thiz->matched_patterns_max += REALLOC_CHUNK_MATCHSTR; - thiz->matched_patterns = (AC_PATTERN_t *) realloc - (thiz->matched_patterns, thiz->matched_patterns_max*sizeof(AC_PATTERN_t)); - } - - thiz->matched_patterns[thiz->matched_patterns_num].astring = str->astring; - thiz->matched_patterns[thiz->matched_patterns_num].length = str->length; - thiz->matched_patterns[thiz->matched_patterns_num].rep = str->rep; - thiz->matched_patterns_num++; + /* Check if the new pattern already exists in the node list */ + if (node_has_matchstr(thiz, str)) + return; + + /* Manage memory */ + if (thiz->matched_patterns_num >= thiz->matched_patterns_max) + { + thiz->matched_patterns_max += REALLOC_CHUNK_MATCHSTR; + thiz->matched_patterns = (AC_PATTERN_t *) realloc + (thiz->matched_patterns, thiz->matched_patterns_max*sizeof(AC_PATTERN_t)); + } + + thiz->matched_patterns[thiz->matched_patterns_num].astring = str->astring; + thiz->matched_patterns[thiz->matched_patterns_num].length = str->length; + thiz->matched_patterns[thiz->matched_patterns_num].rep = str->rep; + thiz->matched_patterns_num++; } /****************************************************************************** @@ -198,17 +195,17 @@ void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str) * Establish an edge between two nodes ******************************************************************************/ void node_register_outgoing - (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha) + (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha) { - if(thiz->outgoing_degree >= thiz->outgoing_max) - { - thiz->outgoing_max += REALLOC_CHUNK_OUTGOING; - thiz->outgoing = (struct edge *) realloc - (thiz->outgoing, thiz->outgoing_max*sizeof(struct edge)); - } - - thiz->outgoing[thiz->outgoing_degree].alpha = alpha; - thiz->outgoing[thiz->outgoing_degree++].next = next; + if(thiz->outgoing_degree >= thiz->outgoing_max) + { + thiz->outgoing_max += REALLOC_CHUNK_OUTGOING; + thiz->outgoing = (struct edge *) realloc + (thiz->outgoing, thiz->outgoing_max*sizeof(struct edge)); + } + + thiz->outgoing[thiz->outgoing_degree].alpha = alpha; + thiz->outgoing[thiz->outgoing_degree++].next = next; } /****************************************************************************** @@ -217,8 +214,8 @@ void node_register_outgoing ******************************************************************************/ void node_assign_id (AC_NODE_t * thiz) { - static int unique_id = 1; - thiz->id = unique_id ++; + static int unique_id = 1; + thiz->id = unique_id ++; } /****************************************************************************** @@ -227,19 +224,19 @@ void node_assign_id (AC_NODE_t * thiz) ******************************************************************************/ int node_edge_compare (const void * l, const void * r) { - /* According to man page: - * The comparison function must return an integer less than, equal to, or - * greater than zero if the first argument is considered to be - * respectively less than, equal to, or greater than the second. if two - * members compare as equal, their order in the sorted array is undefined. - * - * NOTE: Because edge alphabets are unique in every node we ignore - * equivalence case. - **/ - if ( ((struct edge *)l)->alpha >= ((struct edge *)r)->alpha ) - return 1; - else - return -1; + /* According to man page: + * The comparison function must return an integer less than, equal to, or + * greater than zero if the first argument is considered to be + * respectively less than, equal to, or greater than the second. if two + * members compare as equal, their order in the sorted array is undefined. + * + * NOTE: Because edge alphabets are unique in every node we ignore + * equivalence case. + **/ + if ( ((struct edge *)l)->alpha >= ((struct edge *)r)->alpha ) + return 1; + else + return -1; } /****************************************************************************** @@ -248,6 +245,6 @@ int node_edge_compare (const void * l, const void * r) ******************************************************************************/ void node_sort_edges (AC_NODE_t * thiz) { - qsort ((void *)thiz->outgoing, thiz->outgoing_degree, sizeof(struct edge), - node_edge_compare); -} + qsort ((void *)thiz->outgoing, thiz->outgoing_degree, sizeof(struct edge), + node_edge_compare); +} \ No newline at end of file diff --git a/node.h b/node.h index 3ff2c45..bed0449 100644 --- a/node.h +++ b/node.h @@ -2,18 +2,15 @@ * node.h: automata node header file * This file is part of multifast. * - Copyright 2010-2012 Kamiar Kanani - + Copyright 2010-2013 Kamiar Kanani multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - multifast is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - You should have received a copy of the GNU Lesser General Public License along with multifast. If not, see . */ @@ -23,33 +20,37 @@ #include "actypes.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Forward Declaration */ struct edge; /* automata node */ -typedef struct node +typedef struct AC_NODE { - int id; /* Node ID : for debugging purpose */ - short int final; /* 0: no ; 1: yes, it is a final node */ - struct node * failure_node; /* The failure node of this node */ - unsigned short depth; /* depth: distance between this node and the root */ + int id; /* Node ID : for debugging purpose */ + short int final; /* 0: no ; 1: yes, it is a final node */ + struct AC_NODE * failure_node; /* The failure node of this node */ + unsigned short depth; /* depth: distance between this node and the root */ - /* Matched patterns */ - AC_PATTERN_t * matched_patterns; /* Array of matched patterns */ - unsigned short matched_patterns_num; /* Number of matched patterns at this node */ - unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */ + /* Matched patterns */ + AC_PATTERN_t * matched_patterns; /* Array of matched patterns */ + unsigned short matched_patterns_num; /* Number of matched patterns at this node */ + unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */ - /* Outgoing Edges */ - struct edge * outgoing; /* Array of outgoing edges */ - unsigned short outgoing_degree; /* Number of outgoing edges */ - unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */ + /* Outgoing Edges */ + struct edge * outgoing; /* Array of outgoing edges */ + unsigned short outgoing_degree; /* Number of outgoing edges */ + unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */ } AC_NODE_t; /* The Edge of the Node */ struct edge { - AC_ALPHABET_t alpha; /* Edge alpha */ - struct node * next; /* Target of the edge */ + AC_ALPHABET_t alpha; /* Edge alpha */ + AC_NODE_t * next; /* Target of the edge */ }; @@ -63,4 +64,8 @@ void node_release (AC_NODE_t * thiz); void node_assign_id (AC_NODE_t * thiz); void node_sort_edges (AC_NODE_t * thiz); +#ifdef __cplusplus +} #endif + +#endif \ No newline at end of file diff --git a/php_mss.h b/php_mss.h index 0bd465f..fc533b1 100644 --- a/php_mss.h +++ b/php_mss.h @@ -25,6 +25,7 @@ PHP_FUNCTION(mss_create); PHP_FUNCTION(mss_destroy); PHP_FUNCTION(mss_timestamp); PHP_FUNCTION(mss_is_ready); +PHP_FUNCTION(mss_display); PHP_FUNCTION(mss_add); PHP_FUNCTION(mss_search); PHP_FUNCTION(mss_match); diff --git a/sample.php b/sample.php index 42d9f84..fc844de 100644 --- a/sample.php +++ b/sample.php @@ -1,10 +1,14 @@ = 0; $i--) { + $ext[1][$idx + $i] = '*'; + } +},array(&$count, &$text)); +echo "]\n"; + +echo " ", "count: ", $count, "\n"; + +echo $text, "\n"; + mss_search($mss, $text, function($kw, $idx, $type) { echo " ($kw, $idx, $type)\n"; }); -echo "]\n"; echo "\n"; $matched = mss_match($mss, $text); -echo $matched ? "matched" : "not matched", "\n"; \ No newline at end of file +echo $matched ? "matched" : "not matched", "\n"; + +var_dump(mss_is_ready($mss)); +var_dump(mss_timestamp($mss)); + +var_dump(mss_destroy($mss)); From 03f6e5f479458a7ab5bd65c0b5b4fe11e000e1b3 Mon Sep 17 00:00:00 2001 From: microhuang Date: Mon, 29 Aug 2016 11:28:47 +0800 Subject: [PATCH 2/3] prepared for v1.3 --- CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 CHANGELOG diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..ad65a70 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,2 @@ +VERSION: 1.3.1 +-------------- From 206835bfa189a601d776cbc8afda3d830a1f0946 Mon Sep 17 00:00:00 2001 From: think Date: Sun, 14 May 2017 14:45:31 +0800 Subject: [PATCH 3/3] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ccb7912..b0b3cd6 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ C library of Aho-Corasick Algorithm by [multifast][2] (LGPLv3). [1]: https://gist.github.com/1399772 [2]: http://sourceforge.net/projects/multifast/ - [3]: http://php.net/manual/en/functions.anonymous.php - [4]: http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm + [3]: http://multifast.sourceforge.net/library.php + [4]: http://php.net/manual/en/functions.anonymous.php + [5]: http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm