Symbolic Regular Expression Groups (long, patches)

Tracy Tims (tracy@gold.sni.ca)
Fri, 19 Nov 1993 09:59:40 -0500

I have finally gotten around to adding symbolic group names to Python
regular expressions. Several months ago, after some mailing-list
discussion of regular-expression improvements, I wrote a Python module
that supported symbolic groups. I have now implemented it in C.

Example:

This is a bunch of regular expression fragments from a module
that contains a library of regular expressions for parsing
various kinds of command output. These particular expressions
are for parsing 'ls -l' output. I can maintain these
expressions independently from the programs that use them.

re_diff = '\( \|< \| > \|! \| ! \|- \| + \)'
re_type = '\(<type>[dbcps-]\)'
re_mode = '[r-][w-][xsS-][r-][w-][xsSl-][r-][w-][xtT-] *'
re_lcnt = '[0-9]+ '
re_user = '[^ ]+ +'
re_group = '\([^ ]+ +\)?'
re_size = '\(<size>[0-9]+\) '
re_date = '\(<date>[A-Z][a-z][a-z] +[0-9]+ +[0-9]+:?[0-9]*\) +'
re_file = '\(<file>.*\)$'
re_link = '\(<file>.*\) -> \(<link>.*\)$'
parse_leaf = regex.symcomp( '\(<status>' + re_type + re_mode
+ re_lcnt + re_user + re_group + re_size + re_date
+ '\)' + re_file)

Group names are shown in angle-brackets, after the open paren
that starts a group. A python program can pull out substrings
using named groups, rather than group indexes:

name, date, size = parse_leaf.group( 'file', 'date', 'size')

Following this note are the context diffs, suitable for 'patch', for
the C version (this is for Python 0.9.9). It is fast enough to use in
most cases instead of indexed groups. Note that the only change I
made to the regular expression code itself (regexpr.c and regexpr.h)
was to export the syntax variable so that regexmodule.c can read it.
I could have avoided any changes to regexpr by having regexmodule.c
maintain its own copy of regexp_syntax. This patch affects regexpr.h,
regexpr.c and regexmodule.c.

This code works by preprocessing the regular expression string before
handing it off to regexpr.

It adds the following function to the regex module:

symcomp( pattern)
Compiles a regular expression with named groups. If
the \( or ( that begins a group (whichever the current
syntax uses) is immediately followed by an identifier
matching [0-9a-zA-Z_]+ surrounded by angle brackets,
the identifier is taken as the name of the group. The
name is stripped before the pattern is compiled, so it
otherwise has no effect on how the pattern will match
strings.

It adds the following attributes to regular expression objects:

givenpat
The actual string passed as an argument to symcomp()
or compile().
realpat
The regular expression with the symbolic group names
removed. If the regex was compiled with compile, then
givenpat is realpat.
groupindex
If the regular expression was compiled with symcomp,
this is a dictionary that maps group names to group
index, otherwise this is None.

I also modified the group() method so that if one of its arguments is
a string, it is taken as a symbolic group name, which is used to look
up the group index in regexobj.groupindex.

I love these features, but what do you think?

Tracy Tims
tracy@sni.ca

====================cut here==========================================
*** oregexmodule.c Thu Nov 18 13:48:39 1993
--- regexmodule.c Thu Nov 18 13:43:45 1993
***************
*** 35,40 ****
--- 35,41 ----
#include "modsupport.h"

#include "regexpr.h"
+ #include "ctype.h"

static object *RegexError; /* Exception */

***************
*** 45,50 ****
--- 46,54 ----
char re_fastmap[256]; /* Storage for fastmap */
object *re_translate; /* String object for translate table */
object *re_lastok; /* String object last matched/searched */
+ object *re_groupindex; /* Group name to index dictionary */
+ object *re_givenpat; /* Pattern with symbolic groups */
+ object *re_realpat; /* Pattern without symbolic groups */
} regexobject;

/* Regex object methods */
***************
*** 55,60 ****
--- 59,67 ----
{
XDECREF(re->re_translate);
XDECREF(re->re_lastok);
+ XDECREF(re->re_groupindex);
+ XDECREF(re->re_givenpat);
+ XDECREF(re->re_realpat);
XDEL(re->re_patbuf.buffer);
XDEL(re->re_patbuf.translate);
DEL(re);
***************
*** 188,195 ****
}
return res;
}
! if (!getargs(args, "i", &i))
! return NULL;
if (i < 0 || i >= RE_NREGS) {
err_setstr(RegexError, "group() index out of range");
return NULL;
--- 195,215 ----
}
return res;
}
! if (!getargs(args, "i", &i)) {
! object *n;
! err_clear();
! if (!getargs(args, "S", &n))
! return NULL;
! else {
! object *index;
! index = mappinglookup(re->re_groupindex, n);
! if (index == NULL) {
! err_setstr(RegexError, "group() group name doesn't exist");
! return NULL;
! }
! i = getintvalue(index);
! }
! }
if (i < 0 || i >= RE_NREGS) {
err_setstr(RegexError, "group() index out of range");
return NULL;
***************
*** 243,254 ****
INCREF(re->re_translate);
return re->re_translate;
}
if (strcmp(name, "__members__") == 0) {
! object *list = newlistobject(3);
if (list) {
setlistitem(list, 0, newstringobject("last"));
setlistitem(list, 1, newstringobject("regs"));
setlistitem(list, 2, newstringobject("translate"));
if (err_occurred()) {
DECREF(list);
list = NULL;
--- 263,301 ----
INCREF(re->re_translate);
return re->re_translate;
}
+ if (strcmp(name, "groupindex") == 0) {
+ if (re->re_groupindex == NULL) {
+ INCREF(None);
+ return None;
+ }
+ INCREF(re->re_groupindex);
+ return re->re_groupindex;
+ }
+ if (strcmp(name, "realpat") == 0) {
+ if (re->re_realpat == NULL) {
+ INCREF(None);
+ return None;
+ }
+ INCREF(re->re_realpat);
+ return re->re_realpat;
+ }
+ if (strcmp(name, "givenpat") == 0) {
+ if (re->re_givenpat == NULL) {
+ INCREF(None);
+ return None;
+ }
+ INCREF(re->re_givenpat);
+ return re->re_givenpat;
+ }
if (strcmp(name, "__members__") == 0) {
! object *list = newlistobject(6);
if (list) {
setlistitem(list, 0, newstringobject("last"));
setlistitem(list, 1, newstringobject("regs"));
setlistitem(list, 2, newstringobject("translate"));
+ setlistitem(list, 3, newstringobject("groupindex"));
+ setlistitem(list, 4, newstringobject("realpat"));
+ setlistitem(list, 5, newstringobject("givenpat"));
if (err_occurred()) {
DECREF(list);
list = NULL;
***************
*** 275,286 ****
};

static object *
! newregexobject(pat, size, translate)
! char *pat;
! int size;
object *translate;
{
regexobject *re;
if (translate != NULL && getstringsize(translate) != 256) {
err_setstr(RegexError,
"translation table must be 256 bytes");
--- 322,337 ----
};

static object *
! newregexobject(pattern, translate, givenpat, groupindex)
! object *pattern;
object *translate;
+ object *givenpat;
+ object *groupindex;
{
regexobject *re;
+ char *pat = getstringvalue(pattern);
+ int size = getstringsize(pattern);
+
if (translate != NULL && getstringsize(translate) != 256) {
err_setstr(RegexError,
"translation table must be 256 bytes");
***************
*** 299,304 ****
--- 350,360 ----
XINCREF(translate);
re->re_translate = translate;
re->re_lastok = NULL;
+ re->re_groupindex = groupindex;
+ INCREF(pattern);
+ re->re_realpat = pattern;
+ INCREF(givenpat);
+ re->re_givenpat = givenpat;
error = re_compile_pattern(pat, size, &re->re_patbuf);
if (error != NULL) {
err_setstr(RegexError, error);
***************
*** 314,330 ****
object *self;
object *args;
{
! char *pat;
! int size;
object *tran = NULL;
! if (!getargs(args, "s#", &pat, &size)) {
err_clear();
! if (!getargs(args, "(s#S)", &pat, &size, &tran))
return NULL;
}
! return newregexobject(pat, size, tran);
}

static object *cache_pat;
static object *cache_prog;

--- 370,500 ----
object *self;
object *args;
{
! object *pat = NULL;
object *tran = NULL;
! if (!getargs(args, "S", &pat)) {
err_clear();
! if (!getargs(args, "(SS)", &pat, &tran))
return NULL;
}
! return newregexobject(pat, tran, pat, NULL);
}

+ /* Strip "<name>" tags that directly follow a group-opening paren from
+    pattern, storing name -> group number in gdict.  Whether the opener is
+    "(" or "\(" depends on re_syntax.  Returns a new string object holding
+    the plain pattern, or NULL on failure. */
+ static object *
+ symcomp(pattern, gdict)
+ 	object *pattern, *gdict;
+ {
+ 	char *opat = getstringvalue(pattern);
+ 	char *oend = opat + getstringsize(pattern);
+ 	int group_count = 0;
+ 	int escaped = 0;
+ 	char *o = opat;
+ 	char *n;
+ 	char name_buf[128];
+ 	object *npattern;
+ 	int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
+
+ 	/* The output is never longer than the input: names are only removed. */
+ 	npattern = newsizedstringobject((char*)NULL, getstringsize(pattern));
+ 	if (npattern == NULL)
+ 		return NULL;
+ 	n = getstringvalue(npattern);
+
+ 	while (o < oend) {
+ 		if (*o == '(' && escaped == require_escape) {
+ 			char *g = name_buf;
+ 			char *p;
+ 			escaped = 0;
+ 			++group_count;
+ 			*n++ = *o++;
+ 			if (o >= oend || *o != '<')
+ 				continue;
+ 			/* Collect [0-9a-zA-Z_]* after the '<', never past the end
+ 			   of name_buf; the cast keeps isalnum() well defined for
+ 			   chars with the high bit set. */
+ 			for (p = o + 1; p < oend
+ 			     && g < name_buf + sizeof(name_buf) - 1
+ 			     && (isalnum((unsigned char)*p) || *p == '_'); ++p)
+ 				*g++ = *p;
+ 			if (g > name_buf && p < oend && *p == '>') {
+ 				object *group_name, *group_index;
+ 				int failed;
+ 				*g = '\0';
+ 				group_name = newstringobject(name_buf);
+ 				group_index = newintobject(group_count);
+ 				failed = group_name == NULL || group_index == NULL
+ 				    || mappinginsert(gdict, group_name, group_index) != 0;
+ 				/* NOTE(review): assumes mappinginsert() takes its own
+ 				   references, so ours are dropped on success too. */
+ 				XDECREF(group_name);
+ 				XDECREF(group_index);
+ 				if (failed) {
+ 					DECREF(npattern);
+ 					return NULL;
+ 				}
+ 				o = p + 1;	/* skip the whole "<name>" */
+ 			}
+ 			/* Otherwise '<' is ordinary text, copied on the next pass. */
+ 			continue;
+ 		}
+ 		if (*o == '[' && !escaped) {
+ 			/* Copy a character set verbatim so parens inside it are
+ 			   not counted; its first member is taken literally. */
+ 			*n++ = *o++;
+ 			if (o < oend)
+ 				*n++ = *o++;
+ 			while (o < oend && *o != ']')
+ 				*n++ = *o++;
+ 			if (o < oend)
+ 				*n++ = *o++;	/* the closing ']' */
+ 		}
+ 		else if (*o == '\\') {
+ 			escaped = !escaped;	/* a doubled backslash escapes nothing */
+ 			*n++ = *o++;
+ 		}
+ 		else {
+ 			escaped = 0;
+ 			*n++ = *o++;
+ 		}
+ 	}
+
+ 	/* NOTE(review): on failure resizestring() may already have released
+ 	   npattern and set it to NULL (TODO confirm), hence XDECREF. */
+ 	if (resizestring(&npattern, n - getstringvalue(npattern)) == 0)
+ 		return npattern;
+ 	XDECREF(npattern);
+ 	return NULL;
+ }
+
+ /* regex.symcomp(pattern [, translate]): compile a pattern that may carry
+    "<name>" group tags; returns a new regex object, or NULL on error. */
+ static object *
+ regex_symcomp(self, args)
+ 	object *self, *args;
+ {
+ 	object *pattern;	/* borrowed from args, so never DECREF'ed here */
+ 	object *tran = NULL;
+ 	object *gdict, *npattern, *result;
+ 	if (!getargs(args, "S", &pattern)) {
+ 		err_clear();
+ 		if (!getargs(args, "(SS)", &pattern, &tran))
+ 			return NULL;
+ 	}
+ 	gdict = newmappingobject();
+ 	if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) {
+ 		XDECREF(gdict);	/* NULL when the dictionary allocation failed */
+ 		return NULL;
+ 	}
+ 	result = newregexobject(npattern, tran, pattern, gdict);
+ 	DECREF(npattern);	/* newregexobject INCREFs the pattern it keeps */
+ 	return result;
+ }
+
+
static object *cache_pat;
static object *cache_prog;

***************
*** 384,389 ****
--- 554,560 ----

static struct methodlist regex_global_methods[] = {
{"compile", regex_compile},
+ {"symcomp", regex_symcomp},
{"match", regex_match},
{"search", regex_search},
{"set_syntax", regex_set_syntax},
*** oregexpr.c Thu Nov 18 14:21:59 1993
--- regexpr.c Thu Nov 18 13:43:46 1993
***************
*** 114,121 ****
Rnum_ops
};

static int re_compile_initialized = 0;
- static int regexp_syntax = 0;
static unsigned char regexp_plain_ops[256];
static unsigned char regexp_quoted_ops[256];
static unsigned char regexp_precedences[Rnum_ops];
--- 114,121 ----
Rnum_ops
};

+ int re_syntax = 0;
static int re_compile_initialized = 0;
static unsigned char regexp_plain_ops[256];
static unsigned char regexp_quoted_ops[256];
static unsigned char regexp_precedences[Rnum_ops];
***************
*** 180,186 ****
for (a = '0'; a <= '9'; a++)
regexp_quoted_ops[a] = Rmemory;
regexp_plain_ops['\134'] = Rquote;
! if (regexp_syntax & RE_NO_BK_PARENS)
{
regexp_plain_ops['('] = Ropenpar;
regexp_plain_ops[')'] = Rclosepar;
--- 180,186 ----
for (a = '0'; a <= '9'; a++)
regexp_quoted_ops[a] = Rmemory;
regexp_plain_ops['\134'] = Rquote;
! if (re_syntax & RE_NO_BK_PARENS)
{
regexp_plain_ops['('] = Ropenpar;
regexp_plain_ops[')'] = Rclosepar;
***************
*** 190,201 ****
regexp_quoted_ops['('] = Ropenpar;
regexp_quoted_ops[')'] = Rclosepar;
}
! if (regexp_syntax & RE_NO_BK_VBAR)
regexp_plain_ops['\174'] = Ror;
else
regexp_quoted_ops['\174'] = Ror;
regexp_plain_ops['*'] = Rstar;
! if (regexp_syntax & RE_BK_PLUS_QM)
{
regexp_quoted_ops['+'] = Rplus;
regexp_quoted_ops['?'] = Roptional;
--- 190,201 ----
regexp_quoted_ops['('] = Ropenpar;
regexp_quoted_ops[')'] = Rclosepar;
}
! if (re_syntax & RE_NO_BK_VBAR)
regexp_plain_ops['\174'] = Ror;
else
regexp_quoted_ops['\174'] = Ror;
regexp_plain_ops['*'] = Rstar;
! if (re_syntax & RE_BK_PLUS_QM)
{
regexp_quoted_ops['+'] = Rplus;
regexp_quoted_ops['?'] = Roptional;
***************
*** 205,217 ****
regexp_plain_ops['+'] = Rplus;
regexp_plain_ops['?'] = Roptional;
}
! if (regexp_syntax & RE_NEWLINE_OR)
regexp_plain_ops['\n'] = Ror;
regexp_plain_ops['\133'] = Ropenset;
regexp_plain_ops['\136'] = Rbol;
regexp_plain_ops['$'] = Reol;
regexp_plain_ops['.'] = Ranychar;
! if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
{
#ifdef emacs
regexp_quoted_ops['='] = Remacs_at_dot;
--- 205,217 ----
regexp_plain_ops['+'] = Rplus;
regexp_plain_ops['?'] = Roptional;
}
! if (re_syntax & RE_NEWLINE_OR)
regexp_plain_ops['\n'] = Ror;
regexp_plain_ops['\133'] = Ropenset;
regexp_plain_ops['\136'] = Rbol;
regexp_plain_ops['$'] = Reol;
regexp_plain_ops['.'] = Ranychar;
! if (!(re_syntax & RE_NO_GNU_EXTENSIONS))
{
#ifdef emacs
regexp_quoted_ops['='] = Remacs_at_dot;
***************
*** 227,237 ****
regexp_quoted_ops['`'] = Rbegbuf;
regexp_quoted_ops['\''] = Rendbuf;
}
! if (regexp_syntax & RE_ANSI_HEX)
regexp_quoted_ops['v'] = Rextended_memory;
for (a = 0; a < Rnum_ops; a++)
regexp_precedences[a] = 4;
! if (regexp_syntax & RE_TIGHT_VBAR)
{
regexp_precedences[Ror] = 3;
regexp_precedences[Rbol] = 2;
--- 227,237 ----
regexp_quoted_ops['`'] = Rbegbuf;
regexp_quoted_ops['\''] = Rendbuf;
}
! if (re_syntax & RE_ANSI_HEX)
regexp_quoted_ops['v'] = Rextended_memory;
for (a = 0; a < Rnum_ops; a++)
regexp_precedences[a] = 4;
! if (re_syntax & RE_TIGHT_VBAR)
{
regexp_precedences[Ror] = 3;
regexp_precedences[Rbol] = 2;
***************
*** 245,252 ****
}
regexp_precedences[Rclosepar] = 1;
regexp_precedences[Rend] = 0;
! regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
! regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
}

int re_set_syntax(syntax)
--- 245,252 ----
}
regexp_precedences[Rclosepar] = 1;
regexp_precedences[Rend] = 0;
! regexp_context_indep_ops = (re_syntax & RE_CONTEXT_INDEP_OPS) != 0;
! regexp_ansi_sequences = (re_syntax & RE_ANSI_HEX) != 0;
}

int re_set_syntax(syntax)
***************
*** 254,261 ****
{
int ret;

! ret = regexp_syntax;
! regexp_syntax = syntax;
re_compile_initialize();
return ret;
}
--- 254,261 ----
{
int ret;

! ret = re_syntax;
! re_syntax = syntax;
re_compile_initialize();
return ret;
}
***************
*** 500,510 ****
goto store_opcode;
case Reol:
if (!((pos >= size) ||
! ((regexp_syntax & RE_NO_BK_VBAR) ?
(regex[pos] == '\174') :
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == '\174')) ||
! ((regexp_syntax & RE_NO_BK_PARENS)?
(regex[pos] == ')'):
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == ')'))))
--- 500,510 ----
goto store_opcode;
case Reol:
if (!((pos >= size) ||
! ((re_syntax & RE_NO_BK_VBAR) ?
(regex[pos] == '\174') :
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == '\174')) ||
! ((re_syntax & RE_NO_BK_PARENS)?
(regex[pos] == ')'):
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == ')'))))
*** oregexpr.h Thu Nov 18 14:22:15 1993
--- regexpr.h Thu Nov 18 13:43:46 1993
***************
*** 69,74 ****
--- 69,78 ----

#ifdef HAVE_PROTOTYPES

+ extern int re_syntax;
+ /* This is the actual syntax mask. It was added so that Python
+ could do syntax-dependent munging of patterns before compilation. */
+
int re_set_syntax(int syntax);
/* This sets the syntax to use and returns the previous syntax. The
syntax is specified by a bit mask of the above defined bits. */
***************
*** 129,134 ****
--- 133,139 ----

#else /* HAVE_PROTOTYPES */

+ extern int re_syntax;
int re_set_syntax();
char *re_compile_pattern();
int re_match();