/* lrexlib_glib.c - Lua binding of GLib Regex library */ /* This is similar to Lrexlib's PCRE implementation, but has been changed * for GLib's pcre implementation, which is different. * * The changes made by me, Hadriel Kaplan, are in the Public Domain, or * under the MIT license if your country does not allow Public Domain. Copyright (c) 2014 Hadriel Kaplan SPDX-License-Identifier: MIT * Changes relative to Lrelxib-PCRE: * - No chartables or locale handling * - dfa_exec doesn't take 'ovecsize' nor 'wscount' args * - dfa_exec returns boolean true for partial match, without subcapture info * - named subgroups do not return a table of name-keyed entries, because * GLib doesn't provide a way to learn that information * - there is no 'config()' function, since GLib doesn't offer such info * - the 'flags()' function still works, returning all flags, but two new * functions 'compile_flags()' and 'match_flags()' return just their respective * flags, since GLib has a different and smaller set of such flags, for * regex compile vs. match functions * - Using POSIX character classes against strings with non-ASCII characters * might match high-order characters, because glib always sets PCRE_UCP * even if G_REGEX_RAW is set. For example, '[:alpha;]' and '\w' match certain * non-ASCII bytes. * - obviously quite a bit else is changed to interface to GLib's regex instead * of PCRE, but hopefully those changes aren't visible to user/caller */ #include #include #include #include #include "lua.h" #include "lauxlib.h" #include "lrexlib.h" extern flag_pair gregex_error_flags[]; /* These 2 settings may be redefined from the command-line or the makefile. * They should be kept in sync between themselves and with the target name. */ #ifndef REX_LIBNAME # ifdef LREXLIB_WIRESHARK # define REX_LIBNAME "GRegex" # else # define REX_LIBNAME "rex_glib" # endif #endif #define REX_TYPENAME REX_LIBNAME"_regex" #define ALG_CFLAGS_DFLT G_REGEX_RAW #define ALG_EFLAGS_DFLT 0 static int getcflags (lua_State *L, int pos); #define ALG_GETCFLAGS(L,pos) getcflags(L, pos) #define ALG_NOMATCH(res) ((res) == FALSE) #define ALG_ISMATCH(res) ((res) == TRUE) #define ALG_SUBBEG(ud,n) getSubStartPos(ud,n) #define ALG_SUBEND(ud,n) getSubEndPos(ud,n) #define ALG_SUBLEN(ud,n) (ALG_SUBEND(ud,n) - ALG_SUBBEG(ud,n)) #define ALG_SUBVALID(ud,n) (ALG_SUBBEG(ud,n) >= 0) #define ALG_NSUB(ud) ((int) g_regex_get_capture_count(ud->pr)) #define ALG_PUSHSUB(L,ud,text,n) \ lua_pushlstring (L, (text) + ALG_SUBBEG(ud,n), ALG_SUBLEN(ud,n)) #define ALG_PUSHSUB_OR_FALSE(L,ud,text,n) \ { if ( ALG_SUBVALID(ud,n) ) { ALG_PUSHSUB (L,ud,text,n); } else { lua_pushboolean (L,0); } } #define ALG_PUSHSTART(L,ud,offs,n) lua_pushinteger(L, (offs) + ALG_SUBBEG(ud,n) + 1) #define ALG_PUSHEND(L,ud,offs,n) lua_pushinteger(L, (offs) + ALG_SUBEND(ud,n)) #define ALG_PUSHOFFSETS(L,ud,offs,n) \ (ALG_PUSHSTART(L,ud,offs,n), ALG_PUSHEND(L,ud,offs,n)) #define ALG_BASE(st) 0 #define ALG_PULL /* we define ALG_USERETRY because GLib does expose PCRE's NOTEMPTY and ANCHORED flags */ #define ALG_USERETRY #define VERSION_GLIB (GLIB_MAJOR_VERSION*100 + GLIB_MINOR_VERSION) /* unfortunately GLib doesn't expose cerrtain macros it would be nice to have */ #if VERSION_GLIB >= 234 # define G_REGEX_COMPILE_MASK_234 (G_REGEX_FIRSTLINE | \ G_REGEX_NEWLINE_ANYCRLF | \ G_REGEX_BSR_ANYCRLF | \ G_REGEX_JAVASCRIPT_COMPAT) #else # define G_REGEX_COMPILE_MASK_234 0 #endif /* Mask of all the possible values for GRegexCompileFlags. */ #define G_REGEX_COMPILE_MASK (G_REGEX_CASELESS | \ G_REGEX_MULTILINE | \ G_REGEX_DOTALL | \ G_REGEX_EXTENDED | \ G_REGEX_ANCHORED | \ G_REGEX_DOLLAR_ENDONLY | \ G_REGEX_UNGREEDY | \ G_REGEX_RAW | \ G_REGEX_NO_AUTO_CAPTURE | \ G_REGEX_OPTIMIZE | \ G_REGEX_DUPNAMES | \ G_REGEX_NEWLINE_CR | \ G_REGEX_NEWLINE_LF | \ G_REGEX_NEWLINE_CRLF | \ G_REGEX_COMPILE_MASK_234) #if VERSION_GLIB >= 234 # define G_REGEX_MATCH_MASK_234 (G_REGEX_MATCH_NEWLINE_ANYCRLF | \ G_REGEX_MATCH_BSR_ANYCRLF | \ G_REGEX_MATCH_BSR_ANY | \ G_REGEX_MATCH_PARTIAL_SOFT | \ G_REGEX_MATCH_PARTIAL_HARD | \ G_REGEX_MATCH_NOTEMPTY_ATSTART) #else # define G_REGEX_MATCH_MASK_234 0 #endif /* Mask of all the possible values for GRegexMatchFlags. */ #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ G_REGEX_MATCH_NOTBOL | \ G_REGEX_MATCH_NOTEOL | \ G_REGEX_MATCH_NOTEMPTY | \ G_REGEX_MATCH_PARTIAL | \ G_REGEX_MATCH_NEWLINE_CR | \ G_REGEX_MATCH_NEWLINE_LF | \ G_REGEX_MATCH_NEWLINE_CRLF | \ G_REGEX_MATCH_NEWLINE_ANY) static int check_eflags(lua_State *L, const int idx, const int def); #define ALG_GETEFLAGS(L,idx) check_eflags(L, idx, ALG_EFLAGS_DFLT) typedef struct { GRegex * pr; GMatchInfo * match_info; GError * error; /* didn't want to put this here, but can't free it otherwise */ int freed; } TGrgx; static void minfo_free(TGrgx* ud) { g_match_info_free (ud->match_info); ud->match_info = NULL; } static void gerror_free(TGrgx* ud) { if (ud->error) g_error_free (ud->error); ud->error = NULL; } static int getSubStartPos(TGrgx* ud, int n) { int start_pos = -1; g_match_info_fetch_pos (ud->match_info, n, &start_pos, NULL); return start_pos; } static int getSubEndPos(TGrgx* ud, int n) { int end_pos = -1; g_match_info_fetch_pos (ud->match_info, n, NULL, &end_pos); return end_pos; } #define TUserdata TGrgx /* TODO: handle named subpatterns somehow */ #if 0 static void do_named_subpatterns (lua_State *L, TGrgx *ud, const char *text); # define DO_NAMED_SUBPATTERNS do_named_subpatterns #endif #include "lrexlib_algo.h" /* Functions ****************************************************************************** */ static int getcflags (lua_State *L, int pos) { switch (lua_type (L, pos)) { case LUA_TNONE: case LUA_TNIL: return ALG_CFLAGS_DFLT; case LUA_TNUMBER: { int res = (int) lua_tointeger (L, pos); if ((res & ~G_REGEX_COMPILE_MASK) != 0) { return luaL_error (L, "GLib Regex compile flag is invalid"); } return res; } case LUA_TSTRING: { const char *s = lua_tostring (L, pos); int res = 0, ch; while ((ch = *s++) != '\0') { if (ch == 'i') res |= G_REGEX_CASELESS; else if (ch == 'm') res |= G_REGEX_MULTILINE; else if (ch == 's') res |= G_REGEX_DOTALL; else if (ch == 'x') res |= G_REGEX_EXTENDED; else if (ch == 'U') res |= G_REGEX_UNGREEDY; } return (int)res; } default: return luaL_typerror (L, pos, "number or string"); } } static int check_eflags(lua_State *L, const int idx, const int def) { int eflags = (int) luaL_optinteger (L, idx, def); if ((eflags & ~G_REGEX_MATCH_MASK) != 0) { return luaL_error (L, "GLib Regex match flag is invalid"); } return eflags; } /* this function is used in algo.h as well */ static int generate_error (lua_State *L, const TGrgx *ud, int errcode) { const char *key = get_flag_key (gregex_error_flags, ud->error->code); (void) errcode; if (key) return luaL_error (L, "error G_REGEX_%s (%s)", key, ud->error->message); else return luaL_error (L, "GLib Regex error: %s (code %d)", ud->error->message, ud->error->code); } static int compile_regex (lua_State *L, const TArgComp *argC, TGrgx **pud) { TGrgx *ud; ud = (TGrgx*)lua_newuserdata (L, sizeof (TGrgx)); memset (ud, 0, sizeof (TGrgx)); /* initialize all members to 0 */ lua_pushvalue (L, ALG_ENVIRONINDEX); lua_setmetatable (L, -2); ud->pr = g_regex_new (argC->pattern, (GRegexCompileFlags)(argC->cflags | G_REGEX_RAW), (GRegexMatchFlags)0, &ud->error); if (pud) *pud = ud; if (!ud->pr) return luaL_error (L, "%s (code: %d)", ud->error->message, ud->error->code); return 1; } /* method r:dfa_exec (s, [st], [ef]) */ static void checkarg_dfa_exec (lua_State *L, TArgExec *argE, TGrgx **ud) { *ud = check_ud (L); argE->text = luaL_checklstring (L, 2, &argE->textlen); argE->startoffset = get_startoffset (L, 3, argE->textlen); argE->eflags = ALG_GETEFLAGS (L, 4); } /* unlike PCRE, partial matching won't return the actual substrings/matches */ static int Gregex_dfa_exec (lua_State *L) { TArgExec argE; TGrgx *ud; gboolean res; checkarg_dfa_exec (L, &argE, &ud); gerror_free (ud); res = g_regex_match_all_full (ud->pr, argE.text, (int)argE.textlen, argE.startoffset, (GRegexMatchFlags)argE.eflags, &ud->match_info, &ud->error); if (ALG_ISMATCH (res)) { int i, start_pos, end_pos; int max = g_match_info_get_match_count (ud->match_info); g_match_info_fetch_pos (ud->match_info, 0, &start_pos, NULL); lua_pushinteger (L, start_pos + 1); /* 1-st return value */ lua_newtable (L); /* 2-nd return value */ for (i=0; imatch_info, i, NULL, &end_pos); /* I don't know why these offsets aren't incremented by 1 to match Lua indexing? */ lua_pushinteger (L, end_pos); lua_rawseti (L, -2, i+1); } lua_pushinteger (L, max); /* 3-rd return value */ minfo_free (ud); return 3; } else if (g_match_info_is_partial_match(ud->match_info)) { lua_pushboolean(L,1); minfo_free (ud); return 1; } else { minfo_free (ud); if (ALG_NOMATCH (res)) { lua_pushnil (L); return 1; } else return generate_error (L, ud, 0); } } #ifdef ALG_USERETRY static int gmatch_exec (TUserdata *ud, TArgExec *argE, int retry) { int eflags = retry ? (argE->eflags|G_REGEX_MATCH_NOTEMPTY|G_REGEX_MATCH_ANCHORED) : argE->eflags; minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, argE->startoffset, (GRegexMatchFlags)eflags, &ud->match_info, &ud->error); } #else static int gmatch_exec (TUserdata *ud, TArgExec *argE) { minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, argE->startoffset, (GRegexMatchFlags)argE->eflags, &ud->match_info, &ud->error); } #endif static void gmatch_pushsubject (lua_State *L, TArgExec *argE) { lua_pushlstring (L, argE->text, argE->textlen); } static int findmatch_exec (TGrgx *ud, TArgExec *argE) { minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, argE->startoffset, (GRegexMatchFlags)argE->eflags, &ud->match_info, &ud->error); } #ifdef ALG_USERETRY static int gsub_exec (TGrgx *ud, TArgExec *argE, int st, int retry) { int eflags = retry ? (argE->eflags|G_REGEX_MATCH_NOTEMPTY|G_REGEX_MATCH_ANCHORED) : argE->eflags; minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, st, (GRegexMatchFlags)eflags, &ud->match_info, &ud->error); } #else static int gsub_exec (TGrgx *ud, TArgExec *argE, int st) { minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, st, (GRegexMatchFlags)argE->eflags, &ud->match_info, &ud->error); } #endif static int split_exec (TGrgx *ud, TArgExec *argE, int offset) { minfo_free (ud); gerror_free (ud); return g_regex_match_full (ud->pr, argE->text, argE->textlen, offset, (GRegexMatchFlags)argE->eflags, &ud->match_info, &ud->error); } static int Gregex_gc (lua_State *L) { TGrgx *ud = check_ud (L); if (ud->freed == 0) { /* precaution against "manual" __gc calling */ ud->freed = 1; if (ud->pr) g_regex_unref (ud->pr); minfo_free (ud); gerror_free (ud); } return 0; } static int Gregex_tostring (lua_State *L) { TGrgx *ud = check_ud (L); if (ud->freed == 0) lua_pushfstring (L, "%s (%p)", REX_TYPENAME, (void*)ud); else lua_pushfstring (L, "%s (deleted)", REX_TYPENAME); return 1; } static int Gregex_version (lua_State *L) { lua_pushfstring (L, "%d.%d.%d", GLIB_MAJOR_VERSION, GLIB_MINOR_VERSION, GLIB_MICRO_VERSION); return 1; } static const luaL_Reg r_methods[] = { { "exec", algm_exec }, { "tfind", algm_tfind }, /* old name: match */ { "find", algm_find }, { "match", algm_match }, { "dfa_exec", Gregex_dfa_exec }, { "__gc", Gregex_gc }, { "__tostring", Gregex_tostring }, { NULL, NULL } }; static const luaL_Reg r_functions[] = { { "match", algf_match }, { "find", algf_find }, { "gmatch", algf_gmatch }, { "gsub", algf_gsub }, { "split", algf_split }, { "new", algf_new }, { "flags", Gregex_get_flags }, { "compile_flags", Gregex_get_compile_flags }, { "match_flags", Gregex_get_match_flags }, { "version", Gregex_version }, { NULL, NULL } }; /* Open the library */ REX_API int REX_OPENLIB (lua_State *L) { alg_register(L, r_methods, r_functions, "GLib Regex"); return 1; }