Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 2042 → Rev 2043

/programs/develop/tinypy/modules/random/init.c
0,0 → 1,52
#include "random.c"
 
/*
* random_mod_init()
*
* random module initialization function
*/
void random_init(TP)
{
/*
* module dict for random
*/
tp_obj random_mod = tp_dict(tp);
 
/*
* bind functions to random module
*/
tp_set(tp, random_mod, tp_string("seed"), tp_fnc(tp, random_seed));
tp_set(tp, random_mod, tp_string("getstate"), tp_fnc(tp, random_getstate));
tp_set(tp, random_mod, tp_string("setstate"), tp_fnc(tp, random_setstate));
tp_set(tp, random_mod, tp_string("jumpahead"), tp_fnc(tp, random_jumpahead));
tp_set(tp, random_mod, tp_string("random"), tp_fnc(tp, random_random));
 
/*
* bind usual distribution random variable generator
*/
tp_set(tp, random_mod, tp_string("uniform"), tp_fnc(tp, random_uniform));
tp_set(tp, random_mod, tp_string("normalvariate"), tp_fnc(tp, random_normalvariate));
tp_set(tp, random_mod, tp_string("lognormvariate"), tp_fnc(tp, random_lognormvariate));
tp_set(tp, random_mod, tp_string("expovariate"), tp_fnc(tp, random_expovariate));
tp_set(tp, random_mod, tp_string("vonmisesvariate"), tp_fnc(tp, random_vonmisesvariate));
tp_set(tp, random_mod, tp_string("gammavariate"), tp_fnc(tp, random_gammavariate));
tp_set(tp, random_mod, tp_string("betavariate"), tp_fnc(tp, random_betavariate));
tp_set(tp, random_mod, tp_string("paretovariate"), tp_fnc(tp, random_paretovariate));
tp_set(tp, random_mod, tp_string("weibullvariate"), tp_fnc(tp, random_weibullvariate));
tp_set(tp, random_mod, tp_string("randrange"), tp_fnc(tp, random_randrange));
tp_set(tp, random_mod, tp_string("randint"), tp_fnc(tp, random_randint));
tp_set(tp, random_mod, tp_string("choice"), tp_fnc(tp, random_choice));
tp_set(tp, random_mod, tp_string("shuffle"), tp_fnc(tp, random_shuffle));
 
/*
* bind special attributes to random module
*/
tp_set(tp, random_mod, tp_string("__doc__"), tp_string("Random variable generators."));
tp_set(tp, random_mod, tp_string("__name__"), tp_string("random"));
tp_set(tp, random_mod, tp_string("__file__"), tp_string(__FILE__));
 
/*
* bind random module to tinypy modules[]
*/
tp_set(tp, tp->modules, tp_string("random"), random_mod);
}
/programs/develop/tinypy/modules/random/random.c
0,0 → 1,1107
#include <stdlib.h>
#include <time.h>
#include <errno.h>
 
#include <math.h>
#ifndef M_E
#define M_E 2.7182818284590452354
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
 
/*
* following random generators are mutual exclusive.
*/
#define __USE_MERSENNE_TWISTER_RANDOM_GENERATOR
/*#define __USE_WICHMANN_HILL_RANDOM_GENERATOR*/
/*#define __USE_POSIX_RANDOM_GENERATOR*/
 
 
#if defined(__USE_MERSENNE_TWISTER_RANDOM_GENERATOR)
 
/***********************************************************
* following code is borrowed from Python's _randommodule.c
***********************************************************/
/* Random objects */
 
/* ------------------------------------------------------------------
The code in this module was based on a download from:
http://www.math.keio.ac.jp/~matumoto/MT2002/emt19937ar.html
 
It was modified in 2002 by Raymond Hettinger as follows:
 
* the principal computational lines untouched except for tabbing.
 
* renamed genrand_res53() to random_random() and wrapped
in python calling/return code.
 
* genrand_int32() and the helper functions, init_genrand()
and init_by_array(), were declared static, wrapped in
Python calling/return code. also, their global data
references were replaced with structure references.
 
* unused functions from the original were deleted.
new, original C python code was added to implement the
Random() interface.
 
The following are the verbatim comments from the original code:
 
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
 
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
 
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
 
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
 
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
 
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
 
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
Any feedback is very welcome.
http://www.math.keio.ac.jp/matumoto/emt.html
email: matumoto@math.keio.ac.jp
*/
 
/* Period parameters -- These are all magic. Don't change. */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
 
typedef struct {
unsigned long state[N];
int index;
int has_seed;
} RandomObject;
 
/* generates a random number on [0,0xffffffff]-interval */
static unsigned long
genrand_int32(RandomObject *self)
{
unsigned long y;
static unsigned long mag01[2]={0x0UL, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */
unsigned long *mt;
 
mt = self->state;
if (self->index >= N) { /* generate N words at one time */
int kk;
 
for (kk=0;kk<N-M;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
for (;kk<N-1;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
 
self->index = 0;
}
 
y = mt[self->index++];
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680UL;
y ^= (y << 15) & 0xefc60000UL;
y ^= (y >> 18);
return y;
}
 
/* initializes mt[N] with a seed */
static void
init_genrand(RandomObject *self, unsigned long s)
{
int mti;
unsigned long *mt;
 
mt = self->state;
mt[0]= s & 0xffffffffUL;
for (mti=1; mti<N; mti++) {
mt[mti] =
(1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
/* In the previous versions, MSBs of the seed affect */
/* only MSBs of the array mt[]. */
/* 2002/01/09 modified by Makoto Matsumoto */
mt[mti] &= 0xffffffffUL;
/* for >32 bit machines */
}
self->index = mti;
return;
}
 
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
static void
init_by_array(RandomObject *self, unsigned long init_key[], unsigned long key_length)
{
unsigned int i, j, k; /* was signed in the original code. RDH 12/16/2002 */
unsigned long *mt;
 
mt = self->state;
init_genrand(self, 19650218UL);
i=1; j=0;
k = (N>key_length ? N : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ init_key[j] + j; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++; j++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
if (j>=key_length) j=0;
}
for (k=N-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
- i; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
}
 
mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
return;
}
 
/*----------------------end of Mersenne Twister Algorithm----------------*/
 
/*************************************************************************
* following are tinypy related stuffs.
*************************************************************************/
 
static RandomObject _gRandom; /* random object */
 
static tp_obj random_seed(TP)
{
tp_obj arg = TP_DEFAULT(tp_None);
 
if (arg.type == TP_NONE) {
time_t now;
 
(void)time(&now);
init_genrand(&_gRandom, (unsigned long)now);
_gRandom.has_seed = 1;
} else if (arg.type == TP_NUMBER) {
init_genrand(&_gRandom, (unsigned long)arg.number.val);
_gRandom.has_seed = 1;
} else if (arg.type == TP_STRING) {
unsigned long seed;
seed = (unsigned long)tp_hash(tp, arg);
init_genrand(&_gRandom, seed);
_gRandom.has_seed = 1;
} else {
tp_raise(tp_None,tp_printf(tp, "%s", "invalid argument for seed()"));
}
return (tp_None);
}
 
static tp_obj random_getstate(TP)
{
tp_obj state_list = tp_list(tp);
int i;
 
for (i = 0; i < N; i++) {
_tp_list_append(tp, state_list.list.val, tp_number(_gRandom.state[i]));
}
_tp_list_append(tp, state_list.list.val, tp_number(_gRandom.index));
 
return (state_list);
}
 
/*
* @state_list must contain exactly N+1(625) integer.
*/
static tp_obj random_setstate(TP)
{
tp_obj state_list = TP_OBJ();
tp_obj state_elem;
tp_obj len;
int i;
 
len = tp_len(tp, state_list);
if (len.number.val != N+1) {
tp_raise(tp_None,tp_printf(tp, "%s: state vector's size invalid(should be %d)",
__func__, N+1));
}
 
for (i = 0; i < N; i++) {
state_elem = tp_get(tp, state_list, tp_number(i));
_gRandom.state[i] = (unsigned long)state_elem.number.val;
}
state_elem = tp_get(tp, state_list, tp_number(i));
_gRandom.index = (int)state_elem.number.val;
 
return (tp_None);
}
 
/*
* Jumpahead should be a fast way advance the generator n-steps ahead, but
* lacking a formula for that, the next best is to use n and the existing
* state to create a new state far away from the original.
*
* The generator uses constant spaced additive feedback, so shuffling the
* state elements ought to produce a state which would not be encountered
* (in the near term) by calls to random(). Shuffling is normally
* implemented by swapping the ith element with another element ranging
* from 0 to i inclusive. That allows the element to have the possibility
* of not being moved. Since the goal is to produce a new, different
* state, the swap element is ranged from 0 to i-1 inclusive. This assures
* that each element gets moved at least once.
*
* To make sure that consecutive calls to jumpahead(n) produce different
* states (even in the rare case of involutory shuffles), i+1 is added to
* each element at position i. Successive calls are then guaranteed to
* have changing (growing) values as well as shuffled positions.
*
* Finally, the self->index value is set to N so that the generator itself
* kicks in on the next call to random(). This assures that all results
* have been through the generator and do not just reflect alterations to
* the underlying state.
*/
static tp_obj random_jumpahead(TP)
{
long n = (long)TP_NUM();
long i, j;
unsigned long *mt;
unsigned long tmp;
 
mt = _gRandom.state;
 
for (i = N-1; i > 1; i--) {
j = n % i;
if (j == -1L) {
tp_raise(tp_None,tp_printf(tp, "error: %s: j = %ld", __func__, j));
}
tmp = mt[i];
mt[i] = mt[j];
mt[j] = tmp;
}
 
for (i = 0; i < N; i++)
mt[i] += i + 1;
 
_gRandom.index = N;
 
return (tp_None);
}
 
/* random_random is the function named genrand_res53 in the original code;
* generates a random number on [0,1) with 53-bit resolution; note that
* 9007199254740992 == 2**53; I assume they're spelling "/2**53" as
* multiply-by-reciprocal in the (likely vain) hope that the compiler will
* optimize the division away at compile-time. 67108864 is 2**26. In
* effect, a contains 27 random bits shifted left 26, and b fills in the
* lower 26 bits of the 53-bit numerator.
* The orginal code credited Isaku Wada for this algorithm, 2002/01/09.
*/
static tp_obj random_random(TP)
{
RandomObject *self = &_gRandom;
unsigned long a, b;
 
if (! self->has_seed)
random_seed(tp);
 
a = genrand_int32(self)>>5;
b = genrand_int32(self)>>6;
return tp_number((a*67108864.0+b)*(1.0/9007199254740992.0));
}
 
 
#elif defined(__USE_WICHMANN_HILL_RANDOM_GENERATOR)
 
/*************************************************
* divmod(x, y)
* a helper function borrowed from Python
*************************************************/
/* Compute Python divmod(x, y), returning the quotient and storing the
* remainder into *r. The quotient is the floor of x/y, and that's
* the real point of this. C will probably truncate instead (C99
* requires truncation; C89 left it implementation-defined).
* Simplification: we *require* that y > 0 here. That's appropriate
* for all the uses made of it. This simplifies the code and makes
* the overflow case impossible (divmod(LONG_MIN, -1) is the only
* overflow case).
*/
#include <assert.h>
 
static int
divmod(int x, int y, int *r)
{
int quo;
 
assert(y > 0);
quo = x / y;
*r = x - quo * y;
if (*r < 0) {
--quo;
*r += y;
}
assert(0 <= *r && *r < y);
return quo;
}
 
 
typedef struct WH_RandomObject {
struct seed {
unsigned long x;
unsigned long y;
unsigned long z;
} seed;
int has_seed;
} WH_RandomObject;
 
static WH_RandomObject _gWhRandom;
 
static tp_obj random_seed(TP)
{
long a;
int x, y, z;
tp_obj arg = TP_DEFAULT(tp_None);
 
if (arg.type == TP_NONE) {
time_t now;
 
(void)time(&now);
a = (long)now * 256;
} else if (arg.type == TP_NUMBER) {
a = (long)arg.number.val;
} else {
tp_raise(tp_None,tp_printf(tp, "%s", "invalid argument for seed()"));
}
a = divmod(a, 30268, &x);
a = divmod(a, 30306, &y);
a = divmod(a, 30322, &z);
_gWhRandom.seed.x = (int)x + 1;
_gWhRandom.seed.y = (int)y + 1;
_gWhRandom.seed.z = (int)z + 1;
_gWhRandom.has_seed = 1;
 
return (tp_None);
}
 
/*
* following comments are from Python's random.py
*---------------------------------------
* Wichman-Hill random number generator.
*
* Wichmann, B. A. & Hill, I. D. (1982)
* Algorithm AS 183:
* An efficient and portable pseudo-random number generator
* Applied Statistics 31 (1982) 188-190
*
* see also:
* Correction to Algorithm AS 183
* Applied Statistics 33 (1984) 123
*
* McLeod, A. I. (1985)
* A remark on Algorithm AS 183
* Applied Statistics 34 (1985),198-200
* This part is thread-unsafe:
* BEGIN CRITICAL SECTION
*/
static tp_obj random_random(TP)
{
long x, y, z;
double r;
 
if (! _gWhRandom.has_seed)
random_seed(tp);
 
x = _gWhRandom.seed.x;
y = _gWhRandom.seed.y;
z = _gWhRandom.seed.z;
 
x = (171 * x) % 30269;
y = (172 * y) % 30307;
z = (170 * z) % 30323;
 
_gWhRandom.seed.x = x;
_gWhRandom.seed.y = y;
_gWhRandom.seed.z = z;
 
/*
* Note: on a platform using IEEE-754 double arithmetic, this can
* never return 0.0 (asserted by Tim; proof too long for a comment).
*/
errno = 0;
r = fmod(((double)x/30269.0+(double)y/30307.0+(double)z/30323.0), 1.0);
if (errno == EDOM)
tp_raise(tp_None,tp_printf(tp, "%s", "fmod(): denominator can't be zero"));
 
return tp_number(r);
}
 
/*
* for compatibility
*/
static tp_obj random_setstate(TP)
{
return (tp_None);
}
 
/*
* for compatibility
*/
static tp_obj random_getstate(TP)
{
return (tp_None);
}
 
/*
* FIXME: risk of overflow.
* following comments are from Python's random.py
* --------------------------------------------
* Act as if n calls to random() were made, but quickly.
*
* n is an int, greater than or equal to 0.
*
* Example use: If you have 2 threads and know that each will
* consume no more than a million random numbers, create two Random
* objects r1 and r2, then do
* r2.setstate(r1.getstate())
* r2.jumpahead(1000000)
* Then r1 and r2 will use guaranteed-disjoint segments of the full
* period.
*/
static tp_obj random_jumpahead(TP)
{
int n = (int)TP_NUM();
long x, y, z;
 
if (n < 0)
tp_raise(tp_None,tp_printf(tp, "%s: n = %d invalid, should >= 0", __func__, n));
 
x = _gWhRandom.seed.x;
y = _gWhRandom.seed.y;
z = _gWhRandom.seed.z;
 
x = (int)(x * ((long)pow(171, n) % 30269)) % 30269;
y = (int)(y * ((long)pow(172, n) % 30307)) % 30307;
z = (int)(z * ((long)pow(170, n) % 30323)) % 30323;
 
_gWhRandom.seed.x = x;
_gWhRandom.seed.y = y;
_gWhRandom.seed.z = z;
 
return (tp_None);
}
 
 
#elif defined(__USE_POSIX_RANDOM_GENERATOR)
 
/*
* judge whether seeded
*/
static int has_seed = 0;
 
static tp_obj random_seed(TP)
{
tp_obj arg = TP_DEFAULT(tp_None);
 
if (arg.type == TP_NONE) {
time_t now;
 
(void)time(&now);
srandom((unsigned int)now);
has_seed = 1;
} else if (arg.type == TP_NUMBER) {
srandom((unsigned long)arg.number.val);
has_seed = 1;
} else {
tp_raise(tp_None,tp_printf(tp, "%s", "invalid argument for seed()"));
}
return (tp_None);
}
 
/*
* random()
*
* generate successive pseudo random number ranging from [0.0, 1.0).
* usually RAND_MAX is huge number, thus the periods of the success-
* ive random number is very long, about 16*((2**31)-1).
* NOTE: if seed() not called before random(), random() will
* automatically call seed() with current time.
*/
tp_obj random_random(TP)
{
double r = 0.0;
 
if (! has_seed)
random_seed(tp);
 
r = (tp_num)random()/(tp_num)RAND_MAX;
 
return (tp_number(r));
}
 
/*
* setstate(state)
*
* for compatibility.
*/
tp_obj random_setstate(TP)
{
return (tp_None);
}
 
/*
* getstate()
*
* for compatibility.
*/
tp_obj random_getstate(TP)
{
return (tp_None);
}
 
/*
* jumpahead()
*
* for compatibility.
*/
tp_obj random_jumpahead(TP)
{
return (tp_None);
}
 
#else
 
#error no underlying random generator is specified
 
#endif
 
/************************************************************
* some usual distributions
************************************************************/
 
/*
* return real number in range [a, b)
* a and b can be negtive, but a must be less than b.
*/
tp_obj random_uniform(TP)
{
double a = TP_NUM();
double b = TP_NUM();
double r = 0.0;
tp_obj rvo; /* random variable object */
 
if (a >= b)
tp_raise(tp_None,tp_printf(tp, "%s: a(%f) must be less than b(%f)", a, b));
 
rvo = random_random(tp);
r = a + (b - a) * rvo.number.val;
return (tp_number(r));
}
 
/*
* Normal distribution
* @mu mean
* @sigma standard deviation
*-----------------------------
* Uses Kinderman and Monahan method. Reference: Kinderman,
* A.J. and Monahan, J.F., "Computer generation of random
* variables using the ratio of uniform deviates", ACM Trans
* Math Software, 3, (1977), pp257-260.
*/
tp_obj random_normalvariate(TP)
{
double mu = TP_NUM();
double sigma = TP_NUM();
double NV_MAGICCONST;
double u1, u2;
double z, zz;
double r = 0.0;
tp_obj rvo; /* random variable object */
 
NV_MAGICCONST = 4.0 * exp(-0.5) / sqrt(2.0);
while (1) {
rvo = random_random(tp);
u1 = rvo.number.val;
rvo = random_random(tp);
u2 = 1.0 - rvo.number.val;
z = NV_MAGICCONST * (u1 - 0.5) / u2;
zz = z * z / 4.0;
if (zz <= - log(u2))
break;
}
 
r = mu + z * sigma;
 
return (tp_number(r));
}
 
/*
* Log normal distribution
*
* If take natural logarithm on log normal distribution, normal
* distribution with mean mu and standard deviation sigma will
* return.
* @mu mean, can be any value
* @sigma standard deviation, must be > 0.
*/
tp_obj random_lognormvariate(TP)
{
double mu = TP_NUM();
double sigma = TP_NUM();
tp_obj params;
tp_obj normvar; /* normal distribution variate */
double r = 0.0;
 
/*
* call random_normalvariate() actually
*/
params = tp_params_v(tp, 2, tp_number(mu), tp_number(sigma));
normvar = tp_ez_call(tp, "random", "normalvariate", params);
r = exp(normvar.number.val);
 
return (tp_number(r));
}
 
/*
* Exponential distribution
*
* @lambda reciprocal of mean.
* return value range (0, +inf)
*/
tp_obj random_expovariate(TP)
{
double lambda = TP_NUM();
double u, r;
tp_obj rvo;
 
do {
rvo = random_random(tp);
u = rvo.number.val;
} while (u <= 0.0000001);
 
r = -log(u) / lambda;
 
return (tp_number(r));
}
 
/*
* Circular data distribution.
*
* mu is the mean angle, expressed in radians between 0 and 2*pi, and
* kappa is the concentration parameter, which must be greater than or
* equal to zero. If kappa is equal to zero, this distribution reduces
* to a uniform random angle over the range 0 to 2*pi.
*
* mu: mean angle (in radians between 0 and 2*pi)
* kappa: concentration parameter kappa (>= 0)
* if kappa = 0 generate uniform random angle
*
* Based upon an algorithm published in: Fisher, N.I.,
* "Statistical Analysis of Circular Data", Cambridge
* University Press, 1993.
*
* Thanks to Magnus Kessler for a correction to the
* implementation of step 4.
*/
tp_obj random_vonmisesvariate(TP)
{
double mu = TP_NUM();
double kappa = TP_NUM();
tp_obj rvo;
double a, b, c, r;
double u1, u2, u3, z, f;
double theta;
double TWOPI = 2.0 * M_PI;
 
if (kappa <= 1e-6) {
rvo = random_random(tp);
theta = TWOPI * rvo.number.val;
return (tp_number(theta));
}
 
a = 1.0 + sqrt(1.0 + 4.0 * kappa * kappa);
b = (a - sqrt(2.0 * a))/(2.0 * kappa);
r = (1.0 + b * b)/(2.0 * b);
 
while (1) {
rvo = random_random(tp);
u1 = rvo.number.val;
 
z = cos(M_PI * u1);
f = (1.0 + r * z)/(r + z);
c = kappa * (r - f);
 
rvo = random_random(tp);
u2 = rvo.number.val;
 
if ((u2 < (c * (2.0 - c))) ||
(u2 <= (c * exp(1.0 - c))))
break;
}
 
rvo = random_random(tp);
u3 = rvo.number.val;
if (u3 > 0.5)
theta = fmod(mu, TWOPI) + acos(f);
else
theta = fmod(mu, TWOPI) - acos(f);
 
return (tp_number(theta));
}
 
/*
* Gamma distribution. Not the gamma function!
*
* Conditions on the parameters are alpha > 0 and beta > 0.
*/
tp_obj random_gammavariate(TP)
{
double alpha = TP_NUM();
double beta = TP_NUM();
tp_obj rvo;
double res;
double LOG4 = log(4.0);
double SG_MAGICCONST = 1.0 + log(4.5);
 
/*
* alpha > 0, beta > 0, mean is alpha*beta, variance is alpha*beta**2
* Warning: a few older sources define the gamma distribution in terms
* of alpha > -1.0
*/
if ((alpha <= 0.0) || (beta <= 0.0))
tp_raise(tp_None,tp_printf(tp, "%s: alpha(%f) and beta(%f) must be > 0.0",
__func__, alpha, beta));
 
if (alpha > 1.0) {
 
/*
* Uses R.C.H. Cheng, "The generation of Gamma
* variables with non-integral shape parameters",
* Applied Statistics, (1977), 26, No. 1, p71-74
*/
 
double ainv;
double bbb, ccc;
double u1, u2;
double v, x, z, r;
 
ainv = sqrt(2.0 * alpha - 1.0);
bbb = alpha - LOG4;
ccc = alpha + ainv;
 
while (1) {
rvo = random_random(tp);
u1 = rvo.number.val;
if (! ((1e-7 < u1) && (u1 < 0.9999999)))
continue;
rvo = random_random(tp);
u2 = 1.0 - rvo.number.val;
v = log(u1 / (1.0 - u1)) / ainv;
x = alpha * exp(v);
z = u1 * u1 * u2;
r = bbb + ccc * v - x;
if ((r + SG_MAGICCONST - 4.5 * z >= 0.0) ||
(r >= log(z))) {
res = x * beta;
return (tp_number(res));
}
}
}
else if (alpha == 1.0) {
 
/*
* expovariate(1)
*/
 
double u;
 
do {
rvo = random_random(tp);
u = rvo.number.val;
} while (u <= 1e-7);
 
res = - log(u) * beta;
return (tp_number(res));
} else {
 
/*
* alpha is between 0 and 1 (exclusive)
*
* Uses ALGORITHM GS of Statistical Computing - Kennedy & Gentle
*/
 
double b, p, u, u1, x;
 
while (1) {
rvo = random_random(tp);
u = rvo.number.val;
b = (M_E + alpha) / M_E;
p = b * u;
if (p <= 1.0)
/*FIXME: x = p ** (1.0/alpha)*/
x = pow(p, 1.0/alpha);
else
x = - log((b - p) / alpha);
rvo = random_random(tp);
u1 = rvo.number.val;
if (p > 1.0) {
/*FIXME: if u1 <= x ** (alpha - 1.0):*/
if (u1 <= pow(x, alpha - 1.0))
break;
}
else if (u1 <= exp(-x))
break;
}
 
res = x * beta;
return (tp_number(res));
}
}
 
/*
* Beta distribution.
*
* Conditions on the parameters are alpha > 0 and beta > 0.
* Returned values range between 0 and 1.
*
* # This version due to Janne Sinkkonen, and matches all the std
* # texts (e.g., Knuth Vol 2 Ed 3 pg 134 "the beta distribution").
*
* See also:
* http://sourceforge.net/bugs/?func=detailbug&bug_id=130030&group_id=5470
* for Ivan Frohne's insightful analysis of why the original implementation:
*
* def betavariate(self, alpha, beta):
* # Discrete Event Simulation in C, pp 87-88.
*
* y = self.expovariate(alpha)
* z = self.expovariate(1.0/beta)
* return z/(y+z)
*
* was dead wrong, and how it probably got that way.
*
*/
tp_obj random_betavariate(TP)
{
double alpha = TP_NUM();
double beta = TP_NUM();
double t;
double r = 0.0;
tp_obj y;
tp_obj params;
 
params = tp_params_v(tp, 2, tp_number(alpha), tp_number(1.0));
y = tp_ez_call(tp, "random", "gammavariate", params);
if (y.number.val == 0) {
return (y);
} else {
params = tp_params_v(tp, 2, tp_number(beta), tp_number(1.0));
t = y.number.val;
y = tp_ez_call(tp, "random", "gammavariate", params);
r = t / (t + y.number.val);
return (tp_number(r));
}
}
 
/*
* Pareto distribution. alpha is the shape parameter.
* # Jain, pg. 495
*/
tp_obj random_paretovariate(TP)
{
double alpha = TP_NUM();
double u;
double r;
tp_obj rvo;
rvo = random_random(tp);
u = 1.0 - rvo.number.val;
r = 1.0 / pow(u, 1.0/alpha);
return (tp_number(r));
}
 
/*
* Weibull distribution.
*
* alpha is the scale parameter and beta is the shape parameter.
*
* Jain, pg. 499; bug fix courtesy Bill Arms
*/
tp_obj random_weibullvariate(TP)
{
double alpha = TP_NUM();
double beta = TP_NUM();
double u;
double r;
tp_obj rvo;
rvo = random_random(tp);
u = 1.0 - rvo.number.val;
r = alpha * pow(-log(u), 1.0/beta);
return (tp_number(r));
}
 
/*
* randomly select an element from range ([start,] stop[, step])
*
* 'stop' must be larger than 'start', both can be negative;
* 'step' must be integer larger than zero.
*/
tp_obj random_randrange(TP)
{
tp_obj start = TP_OBJ();
tp_obj stop = TP_DEFAULT(tp_None);
tp_obj step = TP_DEFAULT(tp_number(1));
tp_obj rvo = random_random(tp);
int istart = (int)start.number.val;
int istep = (int)step.number.val;
int istop;
int iwidth;
double res;
if (stop.type == TP_NONE) {
/*
* if only one argument, then start just means stop
*/
istop = istart;
res = (rvo.number.val * istop);
return (tp_number(res));
} else if (stop.type == TP_NUMBER) {
istop = (int)stop.number.val;
iwidth = istop - istart;
if (iwidth < 0)
tp_raise(tp_None,tp_printf(tp, "%s", "stop must be > start"));
if (istep <= 0)
tp_raise(tp_None,tp_printf(tp, "%s", "step must be integer larger than 0"));
if (istep == 1) {
res = (int)(istart + (int)(rvo.number.val * iwidth));
return (tp_number(res));
} else {
int n = (iwidth + istep - 1) / istep;
res = (int)(istart + istep * (int)(n * rvo.number.val));
return (tp_number(res));
}
} else {
tp_raise(tp_None,tp_printf(tp, "%s", "wrong type of stop"));
}
}
 
/*
* return random integer between [a, b]
*/
tp_obj random_randint(TP)
{
double a = TP_NUM();
double b = TP_NUM();
tp_obj r;
tp_obj params;
 
params = tp_params_v(tp, 2, tp_number(a), tp_number(b + 1));
r = tp_ez_call(tp, "random", "randrange", params);
return (r);
}
 
/*
* return a random element of sequence 'seq'. 'seq' mustn't be empty.
*/
tp_obj random_choice(TP)
{
tp_obj seq = TP_OBJ();
tp_obj len;
tp_obj rvo;
tp_obj r;
int i;
len = tp_len(tp, seq);
if (len.number.val <= 0)
tp_raise(tp_None,tp_printf(tp, "%s", "seq mustn't be empty"));
rvo = random_random(tp);
i = (int)(len.number.val * rvo.number.val);
r = tp_get(tp, seq, tp_number(i));
return (r);
}
 
/*
* shuffle sequence 'seq' in place, return None
*/
tp_obj random_shuffle(TP)
{
tp_obj seq = TP_OBJ();
tp_obj elmi;
tp_obj elmj;
tp_obj params;
tp_obj rvo;
tp_obj len = tp_len(tp, seq);
int i, j;
if (len.number.val <= 0)
return (tp_None);
for (i = len.number.val - 1; i > len.number.val / 2; i--) {
/*
* randomly exchange elment i and elment j, element i from the behind end of 'seq', while
* element j from the front end of 'seq'.
*/
params = tp_params_v(tp, 2, tp_number(0), tp_number(len.number.val / 2));
rvo = tp_ez_call(tp, "random", "randint", params);
j = (int)rvo.number.val;
elmi = tp_get(tp, seq, tp_number(i));
elmj = tp_get(tp, seq, tp_number(j));
tp_set(tp, seq, tp_number(i), elmj);
tp_set(tp, seq, tp_number(j), elmi);
}
for (i = len.number.val / 2; i >= 0; i--) {
/*
* randomly exchange elment i and elment j, element i from the front end of 'seq', while
* element j from the behind end of 'seq'.
*/
params = tp_params_v(tp, 2, tp_number(len.number.val / 2), tp_number(len.number.val - 1));
rvo = tp_ez_call(tp, "random", "randint", params);
j = (int)rvo.number.val;
elmi = tp_get(tp, seq, tp_number(i));
elmj = tp_get(tp, seq, tp_number(j));
tp_set(tp, seq, tp_number(i), elmj);
tp_set(tp, seq, tp_number(j), elmi);
}
return (tp_None);
}
/programs/develop/tinypy/modules/random/tests.py
0,0 → 1,176
#!/usr/bin/env python
 
import random
#from math import log, exp, sqrt, pi
 
def test_seed_state():
"""test seed() and getstate()/setstate()
"""
# random ought to be able to deal with seeds in any form, of follows.
# following code shouldn't cause an exception.
random.seed()
random.seed(0)
random.seed(-1)
random.seed(0.1)
random.seed(-0.1)
random.seed("a")
random.seed("abc")
random.seed("abcd")
random.seed("fasdfasdfasdfadgaldhgldahlgahdlghadlgladh")
random.seed("lxhlh90yowhldshlgah;")
# state1 and state2 should be different for different seeds
random.seed(1)
state1 = random.getstate()
random.seed(2)
state2 = random.getstate()
rep = 0
for ind in range(len(state1)):
elem1 = state1[ind]
elem2 = state2[ind]
if (elem1 == elem2): rep += 1
if (rep > len(state1) / 2):
print("rep = ", rep, "len(state1) = ", len(state1))
raise "state1 and state2 should be different"
# for the same seeds, state1 and state2 should be the same
random.seed(100)
state1 = random.getstate()
random.seed(100)
state2 = random.getstate()
rep = 0
for ind in range(len(state1)):
elem1 = state1[ind]
elem2 = state2[ind]
if (elem1 == elem2): rep += 1
if (rep != len(state1)):
raise "state1 and state2 should be the same"
 
def test_jumpahead():
"""jumpahead will change the pseudo-number generator's internal state
"""
random.seed()
state1 = random.getstate()
random.jumpahead(20)
state2 = random.getstate()
rep = 0
for ind in range(len(state1)):
elem1 = state1[ind]
elem2 = state2[ind]
if (elem1 == elem2): rep += 1
if (rep > len(state1) / 2):
raise "state1 and state2 can't be the same"
def test_setstate():
"""
"""
random.seed()
oldState = random.getstate()
oldRandSeq = [random.random() for i in range(10)]
random.setstate(oldState)
newRandSeq = [random.random() for i in range(10)]
rep = 0
for ind in range(len(oldRandSeq)):
elem1 = oldRandSeq[ind]
elem2 = newRandSeq[ind]
if (elem1 == elem2): rep += 1
if (rep != len(oldRandSeq)):
raise "oldRandSeq and newRandSeq should be the same"
 
def test_random():
"""generate a random number list
"""
x = [random.random() for i in range(100)]
def test_distribution():
"""these lines are borrowed from python, they shouldn't
cause any exception.
"""
g = random
g.uniform(1,10)
g.paretovariate(1.0)
g.expovariate(1.0)
g.weibullvariate(1.0, 1.0)
g.normalvariate(0.0, 1.0)
g.lognormvariate(0.0, 1.0)
g.vonmisesvariate(0.0, 1.0)
g.gammavariate(0.01, 1.0)
g.gammavariate(1.0, 1.0)
g.gammavariate(200.0, 1.0)
g.betavariate(3.0, 3.0)
 
def test_randrange():
"""these input to randrange() shouldn't cause any exception.
"""
random.randrange(100000)
random.randrange(-100000)
random.randrange(0)
random.randrange(-10.2)
random.randrange(-10, 10)
random.randrange(2, 1000)
random.randrange(0, 1)
random.randrange(-1, 0)
random.randrange(10, 2000, 2)
random.randrange(-2000, 100, 5)
random.randrange(-1000.3, 1000.7, 2)
 
def test_randint():
"""for any valid pair (a, b), randint(a, b) should lay between [a, b]
"""
for i in range(1000):
r = random.randint(-10000, 10000)
if (-10000 <= r <= 10000): continue
else: raise "error: random.randint()"
 
def test_choice():
"""random.choice() should be able to deal with string, list.
"""
S = "abcdefg123*@#$%)("
L = [1, 2, 3, -1, 0.2, -0.1, -10000, "cyc"]
if random.choice(S) not in S:
raise "error: random.choice(S)"
if random.choice(L) not in L:
raise "error: random.choice(L)"
 
def test_shuffle():
"""test random.shuffle() on list. since string is not writable in-place,
random.shuffle() can not be applied on string.
Note: to copy items from a list to a new list, must use syntax like:
newList = oldList[:]
if use syntax like: newList = oldList, newList is just an alias of oldList.
"""
oldL = [1, 2, 3, -1, 0.2, -0.1, -10000, "cyc"]
newL = oldL[:]
random.shuffle(newL)
rep = 0
for ind in range(len(oldL)):
elem1 = oldL[ind]
elem2 = newL[ind]
if (elem1 == elem2): rep += 1
if (rep > len(oldL) / 2):
raise "oldL and newL shouldn't be the same"
def test_53_bits_per_float():
pass
def main():
test_seed_state()
test_jumpahead()
test_setstate()
test_random()
test_distribution()
test_randrange()
test_randint()
test_choice()
test_shuffle()
test_53_bits_per_float()
print("#OK")
 
if __name__ == '__main__':
main()
/programs/develop/tinypy/modules/re/init.c
0,0 → 1,710
/*
* regular expression module
*
* Important Note: do not support group name index
*
* $Id$
*/
 
#include <stdio.h>
#include <assert.h>
#include "regexpr.c"
 
/* tinypy API to be use in this unit */
extern tp_obj tp_data(TP,int magic,void *v);
extern tp_obj tp_object_new(TP);
extern tp_obj tp_object(TP);
extern tp_obj tp_method(TP,tp_obj self,tp_obj v(TP));
extern tp_obj tp_string_copy(TP, const char *s, int n);
extern tp_obj tp_list(TP);
extern tp_obj tp_copy(TP);
 
/* last error message */
static const char * LastError = NULL;
 
/* lower level regex object */
typedef struct {
struct re_pattern_buffer re_patbuf; /* The compiled expression */
struct re_registers re_regs; /* The registers from the last match */
char re_fastmap[256]; /* Storage for fastmap */
unsigned char *re_translate; /* String object for translate table */
unsigned char *re_lastok; /* String object last matched/searched */
 
/* supplementary */
int re_errno; /* error num */
int re_syntax; /* syntax */
} regexobject;
 
/* local declarations */
static regexobject* getre(TP, tp_obj rmobj);
static tp_obj match_obj_group(TP);
static tp_obj match_obj_groups(TP);
static tp_obj match_obj_start(TP);
static tp_obj match_obj_end(TP);
static tp_obj match_obj_span(TP);
 
/*
* helper function: return lower level regex object
* rmobj - regex or match object
*/
static regexobject * getre(TP, tp_obj rmobj)
{
tp_obj reobj_data = tp_get(tp, rmobj, tp_string("__data__"));
regexobject *re = NULL;
 
/* validate magic */
if (reobj_data.data.magic != sizeof(regexobject)) {
LastError = "broken regex object";
return (NULL);
}
re = (regexobject*)reobj_data.data.val;
assert(re);
 
return (re);
}
 
/*
* derive match object from regex object
*/
static tp_obj match_object(TP, tp_obj reobj)
{
tp_obj mo = tp_object(tp); /* match object */
tp_obj redata; /* regex object data */
tp_obj madata; /* match object data */
regexobject *re = NULL; /* lower level regex object */
 
redata = tp_get(tp, reobj, tp_string("__data__"));
re = (regexobject *)redata.data.val;
assert(re);
madata = tp_data(tp, (int)sizeof(regexobject), re);
 
tp_set(tp, mo, tp_string("group"), tp_method(tp, mo, match_obj_group));
tp_set(tp, mo, tp_string("groups"), tp_method(tp, mo, match_obj_groups));
tp_set(tp, mo, tp_string("start"), tp_method(tp, mo, match_obj_start));
tp_set(tp, mo, tp_string("end"), tp_method(tp, mo, match_obj_end));
tp_set(tp, mo, tp_string("span"), tp_method(tp, mo, match_obj_span));
tp_set(tp, mo, tp_string("__data__"), madata);
 
return (mo);
}
 
/*
* FUNC: regexobj.search(str[,pos=0])
* self - regex object
* str - string to be searched
* pos - optional starting offset
*
* RETURN:
* match object - when matched
* None - not matched
*/
static tp_obj regex_obj_search(TP)
{
tp_obj self = TP_OBJ(); /* regex object */
tp_obj str = TP_STR();
tp_obj pos = TP_DEFAULT(tp_number(0));
tp_obj maobj; /* match object */
regexobject *re = NULL;
int r = -2; /* -2 indicate exception */
int range;
 
if (pos.number.val < 0 || pos.number.val > str.string.len) {
LastError = "search offset out of range";
goto exception;
}
range = str.string.len - pos.number.val;
 
re = getre(tp, self);
re->re_lastok = NULL;
r = re_search(&re->re_patbuf, (unsigned char *)str.string.val,
str.string.len, pos.number.val, range, &re->re_regs);
 
/* cannot match pattern */
if (r == -1)
goto notfind;
 
/* error occurred */
if (r == -2)
goto exception;
 
/* matched */
re->re_lastok = (unsigned char *)str.string.val;
 
/* match obj */
maobj = match_object(tp, self);
 
return (maobj);
 
notfind:
re->re_lastok = NULL;
return (tp_None);
exception:
re->re_lastok = NULL;
tp_raise(tp_None, tp_string("regex search error"));
}
 
/*
* FUNC: regexobj.match(str[,pos=0])
* self - regex object
* str - string to be matched
* pos - optional starting position
*
* RETURN:
* match object - when matched
* None - not matched
*/
static tp_obj regex_obj_match(TP)
{
tp_obj self = TP_OBJ(); /* regex object */
tp_obj str = TP_STR();
tp_obj pos = TP_DEFAULT(tp_number(0));
tp_obj maobj; /* match object */
regexobject *re = NULL;
int r = -2; /* -2 indicate exception */
 
re = getre(tp, self);
re->re_lastok = NULL;
r = re_match(&re->re_patbuf, (unsigned char *)str.string.val,
str.string.len, pos.number.val, &re->re_regs);
 
/* cannot match pattern */
if (r == -1)
goto nomatch;
 
/* error occurred */
if (r == -2)
goto exception;
 
/* matched */
re->re_lastok = (unsigned char *)str.string.val;
 
/* match obj */
maobj = match_object(tp, self);
 
return (maobj);
 
nomatch:
re->re_lastok = NULL;
return (tp_None);
exception:
re->re_lastok = NULL;
tp_raise(tp_None, tp_string("regex match error"));
}
 
/*
* regex object split()
* self - regex object
* restr - regex string
* maxsplit - max split field, default 0, mean no limit
*/
static tp_obj regex_obj_split(TP)
{
tp_obj self = TP_OBJ(); /* regex object */
tp_obj restr = TP_OBJ(); /* string */
tp_obj maxsplit = TP_DEFAULT(tp_number(0));
tp_obj maobj; /* match object */
regexobject *re = NULL; /* lower level regex object */
tp_obj result = tp_list(tp);
tp_obj grpstr; /* group string */
int slen; /* string length */
int srchloc; /* search location */
 
/* maxsplit == 0 means no limit */
if ((int)maxsplit.number.val == 0)
maxsplit.number.val = RE_NREGS;
assert(maxsplit.number.val > 0);
 
srchloc = 0;
slen = strlen((char *)restr.string.val);
 
do {
/* generate a temp match object */
tp_params_v(tp, 3, self, restr, tp_number(srchloc));
maobj = regex_obj_search(tp);
if (!tp_bool(tp, maobj))
break;
 
re = getre(tp, maobj);
if (re->re_lastok == NULL) {
tp_raise(tp_None, tp_string("no match for split()"));
}
 
/* extract fields */
if ((int)maxsplit.number.val > 0) {
int start = re->re_regs.start[0];
int end = re->re_regs.end[0];
/*printf("%s:start(%d),end(%d)\n", __func__, start, end);*/
if (start < 0 || end < 0)
break;
 
grpstr = tp_string_copy(tp,
(const char *)re->re_lastok + srchloc, start - srchloc);
 
if (tp_bool(tp, grpstr)) {
tp_set(tp, result, tp_None, grpstr);
maxsplit.number.val--;
}
 
srchloc = end;
}
} while (srchloc < slen && (int)maxsplit.number.val > 0);
 
/* collect remaining string, if necessary */
if (srchloc < slen) {
grpstr = tp_string_copy(tp,
(const char *)restr.string.val + srchloc, slen - srchloc);
if (tp_bool(tp, grpstr))
tp_set(tp, result, tp_None, grpstr);
}
 
return (result);
}
 
/*
* regex object findall()
* self - regex object
* restr - regex string
* pos - starting position, default 0
*/
static tp_obj regex_obj_findall(TP)
{
tp_obj self = TP_OBJ(); /* regex object */
tp_obj restr = TP_OBJ(); /* string */
tp_obj pos = TP_DEFAULT(tp_number(0));
tp_obj maobj; /* match object */
regexobject *re = NULL; /* lower level regex object */
tp_obj result = tp_list(tp);
tp_obj grpstr; /* group string */
int slen; /* string length */
int srchloc; /* search location */
 
srchloc = (int)pos.number.val;
slen = strlen((char *)restr.string.val);
if (srchloc < 0 || srchloc >= slen)
tp_raise(tp_None, tp_string("starting position out of range"));
 
do {
/* generate a temp match object */
tp_params_v(tp, 3, self, restr, tp_number(srchloc));
maobj = regex_obj_search(tp);
if (!tp_bool(tp, maobj))
break;
 
re = getre(tp, maobj);
if (re->re_lastok == NULL) {
tp_raise(tp_None, tp_string("no match for findall()"));
}
 
/* extract fields */
if (srchloc < slen) {
int start = re->re_regs.start[0];
int end = re->re_regs.end[0];
/*printf("%s:start(%d),end(%d)\n", __func__, start, end);*/
if (start < 0 || end < 0)
break;
 
grpstr = tp_string_copy(tp,
(const char *)re->re_lastok + start, end - start);
 
if (tp_bool(tp, grpstr)) {
tp_set(tp, result, tp_None, grpstr);
}
 
srchloc = end;
}
} while (srchloc < slen);
 
return (result);
}
 
/*
* FUNC: matchobj.group([group1, ...])
* self - match object
* args - optional group indices, default 0
*
* return specified group.
*/
static tp_obj match_obj_group(TP)
{
tp_obj self = TP_OBJ(); /* match object */
tp_obj grpidx; /* a group index */
regexobject *re = NULL;
int indices[RE_NREGS];
int start;
int end;
int i;
int single = 0; /* single group index? */
tp_obj result;
 
/* get lower level regex object representation */
re = getre(tp, self);
if (re->re_lastok == NULL)
tp_raise(tp_None,
tp_string("group() only valid after successful match/search"));
 
for (i = 0; i < RE_NREGS; i++)
indices[i] = -1;
 
/*
* if no group index provided, supply default group index 0; else
* fill in indices[] with provided group index list.
*/
if (tp->params.list.val->len == 0) {
indices[0] = 0;
single = 1;
} else if (tp->params.list.val->len == 1) {
indices[0] = (int)TP_NUM();
single = 1;
} else {
i = 0;
TP_LOOP(grpidx)
if (grpidx.number.val < 0 || grpidx.number.val > RE_NREGS)
tp_raise(tp_None, tp_string("group() grpidx out of range"));
indices[i++] = (int)grpidx.number.val;
TP_END
}
 
/* generate result string list */
result = tp_list(tp);
for (i = 0; i < RE_NREGS && indices[i] >= 0; i++) {
tp_obj grpstr;
start = re->re_regs.start[indices[i]];
end = re->re_regs.end[indices[i]];
if (start < 0 || end < 0) {
grpstr = tp_None;
} else {
grpstr = tp_string_copy(tp, (const char *)re->re_lastok + start,
end - start);
}
tp_set(tp, result, tp_None, grpstr);
}
return (single ? tp_get(tp, result, tp_number(0)) : result);
}
 
/*
* FUNC: matchobj.groups()
* self - match object.
* return all groups.
* Note: CPython allow a 'default' argument, but we disallow it.
*/
static tp_obj match_obj_groups(TP)
{
tp_obj self = TP_OBJ(); /* match object */
regexobject *re = NULL;
int start;
int end;
int i;
tp_obj result = tp_list(tp);
 
re = getre(tp, self);
if (re->re_lastok == NULL) {
tp_raise(tp_None,
tp_string("groups() only valid after successful match/search"));
}
 
for (i = 1; i < RE_NREGS; i++) {
start = re->re_regs.start[i];
end = re->re_regs.end[i];
if (start < 0 || end < 0)
break;
 
tp_obj grpstr = tp_string_copy(tp,
(const char *)re->re_lastok + start, end - start);
 
if (tp_bool(tp, grpstr))
tp_set(tp, result, tp_None, grpstr);
}
 
return (result);
}
 
/*
* FUNC: matchobj.start([group])
* self - match object
* group - group index
* return starting position of matched 'group' substring.
*/
static tp_obj match_obj_start(TP)
{
tp_obj self = TP_OBJ(); /* match object */
tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
regexobject *re = NULL;
int start;
 
re = getre(tp, self);
if (re->re_lastok == NULL) {
tp_raise(tp_None,
tp_string("start() only valid after successful match/search"));
}
 
if (group.number.val < 0 || group.number.val > RE_NREGS)
tp_raise(tp_None, tp_string("IndexError: group index out of range"));
 
start = re->re_regs.start[(int)group.number.val];
 
return (tp_number(start));
}
 
/*
* FUNC: matchobj.end([group])
* self - match object
* group - group index
* return ending position of matched 'group' substring.
*/
static tp_obj match_obj_end(TP)
{
tp_obj self = TP_OBJ(); /* match object */
tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
regexobject *re = NULL;
int end;
 
re = getre(tp, self);
if (re->re_lastok == NULL) {
tp_raise(tp_None,
tp_string("end() only valid after successful match/search"));
}
 
if (group.number.val < 0 || group.number.val > RE_NREGS)
tp_raise(tp_None, tp_string("IndexError: group index out of range"));
 
end = re->re_regs.end[(int)group.number.val];
 
return (tp_number(end));
}
 
/*
* FUNC: matchobj.span([group])
* self - match object
* group - group index
* return [start,end] position pair of matched 'group' substring.
*/
static tp_obj match_obj_span(TP)
{
tp_obj self = TP_OBJ(); /* match object */
tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
regexobject *re = NULL;
int start;
int end;
tp_obj result;
 
re = getre(tp, self);
if (re->re_lastok == NULL) {
tp_raise(tp_None,
tp_string("span() only valid after successful match/search"));
}
 
if (group.number.val < 0 || group.number.val > RE_NREGS)
tp_raise(tp_None, tp_string("IndexError: group index out of range"));
 
start = re->re_regs.start[(int)group.number.val];
end = re->re_regs.end[(int)group.number.val];
 
result = tp_list(tp);
tp_set(tp, result, tp_None, tp_number(start));
tp_set(tp, result, tp_None, tp_number(end));
 
return (result);
}
 
/*
* compile out a re object
* repat - regex pattern
* resyn - regex syntax
*/
static tp_obj regex_compile(TP)
{
char *error = NULL;
char const *pat = NULL;
int size = 0;
tp_obj reobj_data;
tp_obj repat = TP_TYPE(TP_STRING); /* pattern */
tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS)); /* syntax */
tp_obj reobj; /* regex object */
regexobject *re;
 
/*
* create regex object, its parent is builtin 'object'
*/
reobj = tp_object(tp);
 
re = (regexobject *)malloc(sizeof(regexobject));
if (!re) {
error = "malloc lower level regex object failed";
goto finally;
}
 
re->re_patbuf.buffer = NULL;
re->re_patbuf.allocated = 0;
re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
re->re_patbuf.translate = NULL;
re->re_translate = NULL;
re->re_lastok = NULL;
 
re->re_errno = 0;
re->re_syntax = (int)resyn.number.val;
 
pat = repat.string.val;
size = repat.string.len;
error = re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
if (error != NULL) {
LastError = error;
goto finally;
}
 
/* regexobject's size as magic */
reobj_data = tp_data(tp, (int)sizeof(regexobject), re);
 
/*
* bind to regex object
*/
tp_set(tp, reobj, tp_string("search"),
tp_method(tp, reobj, regex_obj_search));
tp_set(tp, reobj, tp_string("match"),
tp_method(tp, reobj, regex_obj_match));
tp_set(tp, reobj, tp_string("split"),
tp_method(tp, reobj, regex_obj_split));
tp_set(tp, reobj, tp_string("findall"),
tp_method(tp, reobj, regex_obj_findall));
tp_set(tp, reobj, tp_string("__data__"), reobj_data);
 
tp_set(tp, reobj, tp_string("__name__"),
tp_string("regular expression object"));
tp_set(tp, reobj, tp_string("__doc__"), tp_string(
"regular expression object, support methods:\n"
"search(str[,pos=0])-search 'str' from 'pos'\n"
"match(str[,pos=0]) -match 'str' from 'pos'\n"
));
 
return (reobj);
 
finally:
tp_raise(tp_None, tp_string(error));
}
 
/*
* module level search()
*/
static tp_obj regex_search(TP)
{
tp_obj repat = TP_OBJ(); /* pattern */
tp_obj restr = TP_OBJ(); /* string */
tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
tp_obj reobj; /* regex object */
tp_obj maobj; /* match object */
 
/* compile out regex object */
tp_params_v(tp, 2, repat, resyn);
reobj = regex_compile(tp);
/* call r.search() */
tp_params_v(tp, 3, reobj, restr, tp_number(0));
maobj = regex_obj_search(tp);
 
return (maobj);
}
 
/*
* module level match()
*/
static tp_obj regex_match(TP)
{
tp_obj repat = TP_OBJ(); /* pattern */
tp_obj restr = TP_OBJ(); /* string */
tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
tp_obj reobj; /* regex object */
tp_obj maobj; /* match object */
 
/* compile out regex object */
tp_params_v(tp, 2, repat, resyn);
reobj = regex_compile(tp);
/* call r.search() */
tp_params_v(tp, 3, reobj, restr, tp_number(0));
maobj = regex_obj_match(tp);
 
return (maobj);
}
 
/*
* module level split()
* repat - regex pattern
* restr - regex string
* maxsplit - max split field, default 0, mean no limit
*/
static tp_obj regex_split(TP)
{
tp_obj repat = TP_OBJ(); /* pattern */
tp_obj restr = TP_OBJ(); /* string */
tp_obj maxsplit = TP_DEFAULT(tp_number(0));
tp_obj reobj; /* regex object */
 
/* generate a temp regex object */
tp_params_v(tp, 2, repat, tp_number(RE_SYNTAX_EMACS));
reobj = regex_compile(tp);
tp_params_v(tp, 3, reobj, restr, maxsplit);
return regex_obj_split(tp);
}
 
/*
* module level findall()
* repat - regex pattern
* restr - regex string
* resyn - regex syntax, optional, default RE_SYNTAX_EMAC
*/
static tp_obj regex_findall(TP)
{
tp_obj repat = TP_OBJ(); /* pattern */
tp_obj restr = TP_OBJ(); /* string */
tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
tp_obj reobj; /* regex object */
 
/* generate a temp regex object */
tp_params_v(tp, 2, repat, resyn);
reobj = regex_compile(tp);
tp_params_v(tp, 2, reobj, restr);
return regex_obj_findall(tp);
}
 
 
/*
* re mod can only support 'set_syntax', 'get_syntax', and 'compile' functions,
* 'compile' function will return a 'reobj', and this 'reobj' will support
* methods 'search', 'match', 'group', 'groupall', el al.
*/
void re_init(TP)
{
/*
* module dict for re
*/
tp_obj re_mod = tp_dict(tp);
 
/*
* bind to re module
*/
tp_set(tp, re_mod, tp_string("compile"), tp_fnc(tp, regex_compile));
tp_set(tp, re_mod, tp_string("search"), tp_fnc(tp, regex_search));
tp_set(tp, re_mod, tp_string("match"), tp_fnc(tp, regex_match));
tp_set(tp, re_mod, tp_string("split"), tp_fnc(tp, regex_split));
tp_set(tp, re_mod, tp_string("findall"), tp_fnc(tp, regex_findall));
tp_set(tp, re_mod, tp_string("AWK_SYNTAX"), tp_number(RE_SYNTAX_AWK));
tp_set(tp, re_mod, tp_string("EGREP_SYNTAX"), tp_number(RE_SYNTAX_EGREP));
tp_set(tp, re_mod, tp_string("GREP_SYNTAX"), tp_number(RE_SYNTAX_GREP));
tp_set(tp, re_mod, tp_string("EMACS_SYNTAX"), tp_number(RE_SYNTAX_EMACS));
 
/*
* bind special attibutes to re module
*/
tp_set(tp, re_mod, tp_string("__name__"),
tp_string("regular expression module"));
tp_set(tp, re_mod, tp_string("__file__"), tp_string(__FILE__));
tp_set(tp, re_mod, tp_string("__doc__"),
tp_string("simple regular express implementation"));
 
/*
* bind regex module to tinypy modules[]
*/
tp_set(tp, tp->modules, tp_string("re"), re_mod);
}
 
/programs/develop/tinypy/modules/re/regexpr.c
0,0 → 1,2124
/*
* to eliminate dependence on CPython, I stripped out
* some CPython error handling function calls.
*/
 
/* regexpr.c
*
* Author: Tatu Ylonen <ylo@ngs.fi>
*
* Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without
* fee, provided that the above copyright notice appear in all copies.
* This software is provided "as is" without express or implied
* warranty.
*
* Created: Thu Sep 26 17:14:05 1991 ylo
* Last modified: Mon Nov 4 17:06:48 1991 ylo
* Ported to Think C: 19 Jan 1992 guido@cwi.nl
*
* This code draws many ideas from the regular expression packages by
* Henry Spencer of the University of Toronto and Richard Stallman of
* the Free Software Foundation.
*
* Emacs-specific code and syntax table code is almost directly borrowed
* from GNU regexp.
*
* Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
* 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
* Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
* probably one or two others that I'm forgetting.
*
* $Id$ */
 
#include <stdlib.h>
#include <string.h>
#include "regexpr.h"
 
/* The original code blithely assumed that sizeof(short) == 2. Not
* always true. Original instances of "(short)x" were replaced by
* SHORT(x), where SHORT is #defined below. */
 
#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
 
/* The stack implementation is taken from an idea by Andrew Kuchling.
* It's a doubly linked list of arrays. The advantages of this over a
* simple linked list are that the number of mallocs required are
* reduced. It also makes it possible to statically allocate enough
* space so that small patterns don't ever need to call malloc.
*
* The advantages over a single array is that is periodically
* realloced when more space is needed is that we avoid ever copying
* the stack. */
 
/* item_t is the basic stack element. Defined as a union of
* structures so that both registers, failure points, and counters can
* be pushed/popped from the stack. There's nothing built into the
* item to keep track of whether a certain stack item is a register, a
* failure point, or a counter. */
 
typedef union item_t
{
struct
{
int num;
int level;
unsigned char *start;
unsigned char *end;
} reg;
struct
{
int count;
int level;
int phantom;
unsigned char *code;
unsigned char *text;
} fail;
struct
{
int num;
int level;
int count;
} cntr;
} item_t;
 
#define STACK_PAGE_SIZE 256
#define NUM_REGISTERS 256
 
/* A 'page' of stack items. */
 
typedef struct item_page_t
{
item_t items[STACK_PAGE_SIZE];
struct item_page_t *prev;
struct item_page_t *next;
} item_page_t;
 
 
typedef struct match_state
{
/* The number of registers that have been pushed onto the stack
* since the last failure point. */
 
int count;
 
/* Used to control when registers need to be pushed onto the
* stack. */
int level;
/* The number of failure points on the stack. */
int point;
/* Storage for the registers. Each register consists of two
* pointers to characters. So register N is represented as
* start[N] and end[N]. The pointers must be converted to
* offsets from the beginning of the string before returning the
* registers to the calling program. */
unsigned char *start[NUM_REGISTERS];
unsigned char *end[NUM_REGISTERS];
/* Keeps track of whether a register has changed recently. */
int changed[NUM_REGISTERS];
/* Structure to encapsulate the stack. */
struct
{
/* index into the current page. If index == 0 and you need
* to pop an item, move to the previous page and set index
* = STACK_PAGE_SIZE - 1. Otherwise decrement index to
* push a page. If index == STACK_PAGE_SIZE and you need
* to push a page move to the next page and set index =
* 0. If there is no new next page, allocate a new page
* and link it in. Otherwise, increment index to push a
* page. */
int index;
item_page_t *current; /* Pointer to the current page. */
item_page_t first; /* First page is statically allocated. */
} stack;
} match_state;
 
/* Initialize a state object */
 
/* #define NEW_STATE(state) \ */
/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
/* state.stack.current = &state.stack.first; \ */
/* state.stack.first.prev = NULL; \ */
/* state.stack.first.next = NULL; \ */
/* state.stack.index = 0; \ */
/* state.level = 1 */
 
#define NEW_STATE(state, nregs) \
{ \
int i; \
for (i = 0; i < nregs; i++) \
{ \
state.start[i] = NULL; \
state.end[i] = NULL; \
state.changed[i] = 0; \
} \
state.stack.current = &state.stack.first; \
state.stack.first.prev = NULL; \
state.stack.first.next = NULL; \
state.stack.index = 0; \
state.level = 1; \
state.count = 0; \
state.level = 0; \
state.point = 0; \
}
 
/* Free any memory that might have been malloc'd */
 
#define FREE_STATE(state) \
while(state.stack.first.next != NULL) \
{ \
state.stack.current = state.stack.first.next; \
state.stack.first.next = state.stack.current->next; \
free(state.stack.current); \
}
 
/* Discard the top 'count' stack items. */
 
#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index += STACK_PAGE_SIZE; \
}
 
/* Store a pointer to the previous item on the stack. Used to pop an
* item off of the stack. */
 
#define STACK_PREV(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index = STACK_PAGE_SIZE - 1; \
} \
else \
{ \
stack.index--; \
} \
top = &(stack.current->items[stack.index])
 
/* Store a pointer to the next item on the stack. Used to push an item
* on to the stack. */
 
#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
if (stack.current->next == NULL) \
{ \
stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
if (stack.current->next == NULL) \
on_error; \
stack.current->next->prev = stack.current; \
stack.current->next->next = NULL; \
} \
stack.current = stack.current->next; \
stack.index = 0; \
} \
top = &(stack.current->items[stack.index++])
 
/* Store a pointer to the item that is 'count' items back in the
* stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
* STACK_TOP(stack, top, on_error). */
 
#define STACK_BACK(stack, top, count, on_error) \
{ \
int index; \
item_page_t *current; \
current = stack.current; \
index = stack.index - (count); \
while (index < 0) \
{ \
if (current->prev == NULL) \
on_error; \
current = current->prev; \
index += STACK_PAGE_SIZE; \
} \
top = &(current->items[index]); \
}
 
/* Store a pointer to the top item on the stack. Execute the
* 'on_error' code if there are no items on the stack. */
 
#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
} \
else \
{ \
top = &(stack.current->items[stack.index - 1]); \
}
 
/* Test to see if the stack is empty */
 
#define STACK_EMPTY(stack) ((stack.index == 0) && \
(stack.current->prev == NULL))
 
/* Return the start of register 'reg' */
 
#define GET_REG_START(state, reg) (state.start[reg])
 
/* Return the end of register 'reg' */
 
#define GET_REG_END(state, reg) (state.end[reg])
 
/* Set the start of register 'reg'. If the state of the register needs
* saving, push it on the stack. */
 
#define SET_REG_START(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.start[reg] = text
 
/* Set the end of register 'reg'. If the state of the register needs
* saving, push it on the stack. */
 
#define SET_REG_END(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.end[reg] = text
 
#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->fail.code = xcode; \
item->fail.text = xtext; \
item->fail.count = state.count; \
item->fail.level = state.level; \
item->fail.phantom = 0; \
state.count = 0; \
state.level++; \
state.point++; \
}
 
/* Update the last failure point with a new position in the text. */
 
#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
item_t *item; \
STACK_BACK(state.stack, item, state.count + 1, on_error); \
if (!item->fail.phantom) \
{ \
item_t *item2; \
STACK_NEXT(state.stack, item2, on_error); \
item2->fail.code = item->fail.code; \
item2->fail.text = xtext; \
item2->fail.count = state.count; \
item2->fail.level = state.level; \
item2->fail.phantom = 1; \
state.count = 0; \
state.level++; \
state.point++; \
} \
else \
{ \
STACK_DISCARD(state.stack, state.count, on_error); \
STACK_TOP(state.stack, item, on_error); \
item->fail.text = xtext; \
state.count = 0; \
state.level++; \
} \
}
 
#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
item_t *item; \
do \
{ \
while(state.count > 0) \
{ \
STACK_PREV(state.stack, item, on_error); \
state.start[item->reg.num] = item->reg.start; \
state.end[item->reg.num] = item->reg.end; \
state.changed[item->reg.num] = item->reg.level; \
state.count--; \
} \
STACK_PREV(state.stack, item, on_empty); \
xcode = item->fail.code; \
xtext = item->fail.text; \
state.count = item->fail.count; \
state.level = item->fail.level; \
state.point--; \
} \
while (item->fail.text == NULL); \
}
 
enum regexp_compiled_ops /* opcodes for compiled regexp */
{
Cend, /* end of pattern reached */
Cbol, /* beginning of line */
Ceol, /* end of line */
Cset, /* character set. Followed by 32 bytes of set. */
Cexact, /* followed by a byte to match */
Canychar, /* matches any character except newline */
Cstart_memory, /* set register start addr (followed by reg number) */
Cend_memory, /* set register end addr (followed by reg number) */
Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
Cjump, /* followed by two bytes (lsb,msb) of displacement. */
Cstar_jump, /* will change to jump/update_failure_jump at runtime */
Cfailure_jump, /* jump to addr on failure */
Cupdate_failure_jump, /* update topmost failure point and jump */
Cdummy_failure_jump, /* push a dummy failure point and jump */
Cbegbuf, /* match at beginning of buffer */
Cendbuf, /* match at end of buffer */
Cwordbeg, /* match at beginning of word */
Cwordend, /* match at end of word */
Cwordbound, /* match if at word boundary */
Cnotwordbound, /* match if not at word boundary */
Csyntaxspec, /* matches syntax code (1 byte follows) */
Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
Crepeat1
};
 
enum regexp_syntax_op /* syntax codes for plain and quoted characters */
{
Rend, /* special code for end of regexp */
Rnormal, /* normal character */
Ranychar, /* any character except newline */
Rquote, /* the quote character */
Rbol, /* match beginning of line */
Reol, /* match end of line */
Roptional, /* match preceding expression optionally */
Rstar, /* match preceding expr zero or more times */
Rplus, /* match preceding expr one or more times */
Ror, /* match either of alternatives */
Ropenpar, /* opening parenthesis */
Rclosepar, /* closing parenthesis */
Rmemory, /* match memory register */
Rextended_memory, /* \vnn to match registers 10-99 */
Ropenset, /* open set. Internal syntax hard-coded below. */
/* the following are gnu extensions to "normal" regexp syntax */
Rbegbuf, /* beginning of buffer */
Rendbuf, /* end of buffer */
Rwordchar, /* word character */
Rnotwordchar, /* not word character */
Rwordbeg, /* beginning of word */
Rwordend, /* end of word */
Rwordbound, /* word bound */
Rnotwordbound, /* not word bound */
Rnum_ops
};
 
/* customized errno */
int re_errno = TP_RE_NOERR;
 
static int re_compile_initialized = 0;
static int regexp_syntax = 0;
int re_syntax = 0; /* Exported copy of regexp_syntax */
static unsigned char regexp_plain_ops[256];
static unsigned char regexp_quoted_ops[256];
static unsigned char regexp_precedences[Rnum_ops];
static int regexp_context_indep_ops;
static int regexp_ansi_sequences;
 
#define NUM_LEVELS 5 /* number of precedence levels in use */
#define MAX_NESTING 100 /* max nesting level of operators */
 
#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
 
unsigned char re_syntax_table[256];
 
int re_err_occurred(void)
{
if (re_errno == TP_RE_NOERR)
return (0);
return (1);
}
 
void re_compile_initialize(void)
{
int a;
static int syntax_table_inited = 0;
 
if (!syntax_table_inited)
{
syntax_table_inited = 1;
memset(re_syntax_table, 0, 256);
for (a = 'a'; a <= 'z'; a++)
re_syntax_table[a] = Sword;
for (a = 'A'; a <= 'Z'; a++)
re_syntax_table[a] = Sword;
for (a = '0'; a <= '9'; a++)
re_syntax_table[a] = Sword | Sdigit | Shexdigit;
for (a = '0'; a <= '7'; a++)
re_syntax_table[a] |= Soctaldigit;
for (a = 'A'; a <= 'F'; a++)
re_syntax_table[a] |= Shexdigit;
for (a = 'a'; a <= 'f'; a++)
re_syntax_table[a] |= Shexdigit;
re_syntax_table['_'] = Sword;
for (a = 9; a <= 13; a++)
re_syntax_table[a] = Swhitespace;
re_syntax_table[' '] = Swhitespace;
}
re_compile_initialized = 1;
for (a = 0; a < 256; a++)
{
regexp_plain_ops[a] = Rnormal;
regexp_quoted_ops[a] = Rnormal;
}
for (a = '0'; a <= '9'; a++)
regexp_quoted_ops[a] = Rmemory;
regexp_plain_ops['\134'] = Rquote;
if (regexp_syntax & RE_NO_BK_PARENS)
{
regexp_plain_ops['('] = Ropenpar;
regexp_plain_ops[')'] = Rclosepar;
}
else
{
regexp_quoted_ops['('] = Ropenpar;
regexp_quoted_ops[')'] = Rclosepar;
}
if (regexp_syntax & RE_NO_BK_VBAR)
regexp_plain_ops['\174'] = Ror;
else
regexp_quoted_ops['\174'] = Ror;
regexp_plain_ops['*'] = Rstar;
if (regexp_syntax & RE_BK_PLUS_QM)
{
regexp_quoted_ops['+'] = Rplus;
regexp_quoted_ops['?'] = Roptional;
}
else
{
regexp_plain_ops['+'] = Rplus;
regexp_plain_ops['?'] = Roptional;
}
if (regexp_syntax & RE_NEWLINE_OR)
regexp_plain_ops['\n'] = Ror;
regexp_plain_ops['\133'] = Ropenset;
regexp_plain_ops['\136'] = Rbol;
regexp_plain_ops['$'] = Reol;
regexp_plain_ops['.'] = Ranychar;
if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
{
regexp_quoted_ops['w'] = Rwordchar;
regexp_quoted_ops['W'] = Rnotwordchar;
regexp_quoted_ops['<'] = Rwordbeg;
regexp_quoted_ops['>'] = Rwordend;
regexp_quoted_ops['b'] = Rwordbound;
regexp_quoted_ops['B'] = Rnotwordbound;
regexp_quoted_ops['`'] = Rbegbuf;
regexp_quoted_ops['\''] = Rendbuf;
}
if (regexp_syntax & RE_ANSI_HEX)
regexp_quoted_ops['v'] = Rextended_memory;
for (a = 0; a < Rnum_ops; a++)
regexp_precedences[a] = 4;
if (regexp_syntax & RE_TIGHT_VBAR)
{
regexp_precedences[Ror] = 3;
regexp_precedences[Rbol] = 2;
regexp_precedences[Reol] = 2;
}
else
{
regexp_precedences[Ror] = 2;
regexp_precedences[Rbol] = 3;
regexp_precedences[Reol] = 3;
}
regexp_precedences[Rclosepar] = 1;
regexp_precedences[Rend] = 0;
regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
}
 
int re_set_syntax(int syntax)
{
int ret;
ret = regexp_syntax;
regexp_syntax = syntax;
re_syntax = syntax; /* Exported copy */
re_compile_initialize();
return ret;
}
 
static int hex_char_to_decimal(int ch)
{
if (ch >= '0' && ch <= '9')
return ch - '0';
if (ch >= 'a' && ch <= 'f')
return ch - 'a' + 10;
if (ch >= 'A' && ch <= 'F')
return ch - 'A' + 10;
return 16;
}
 
static void re_compile_fastmap_aux(unsigned char *code, int pos,
unsigned char *visited,
unsigned char *can_be_null,
unsigned char *fastmap)
{
int a;
int b;
int syntaxcode;
if (visited[pos])
return; /* we have already been here */
visited[pos] = 1;
for (;;)
switch (code[pos++]) {
case Cend:
{
*can_be_null = 1;
return;
}
case Cbol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
for (a = 0; a < 256; a++)
fastmap[a] = 1;
break;
}
case Csyntaxspec:
{
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (SYNTAX(a) & syntaxcode)
fastmap[a] = 1;
return;
}
case Cnotsyntaxspec:
{
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (!(SYNTAX(a) & syntaxcode) )
fastmap[a] = 1;
return;
}
case Ceol:
{
fastmap['\n'] = 1;
if (*can_be_null == 0)
*can_be_null = 2; /* can match null, but only at end of buffer*/
return;
}
case Cset:
{
for (a = 0; a < 256/8; a++)
if (code[pos + a] != 0)
for (b = 0; b < 8; b++)
if (code[pos + a] & (1 << b))
fastmap[(a << 3) + b] = 1;
pos += 256/8;
return;
}
case Cexact:
{
fastmap[(unsigned char)code[pos]] = 1;
return;
}
case Canychar:
{
for (a = 0; a < 256; a++)
if (a != '\n')
fastmap[a] = 1;
return;
}
case Cstart_memory:
case Cend_memory:
{
pos++;
break;
}
case Cmatch_memory:
{
for (a = 0; a < 256; a++)
fastmap[a] = 1;
*can_be_null = 1;
return;
}
case Cjump:
case Cdummy_failure_jump:
case Cupdate_failure_jump:
case Cstar_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
pos += (int)SHORT(a);
if (visited[pos])
{
/* argh... the regexp contains empty loops. This is not
good, as this may cause a failure stack overflow when
matching. Oh well. */
/* this path leads nowhere; pursue other paths. */
return;
}
visited[pos] = 1;
break;
}
case Cfailure_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
a = pos + (int)SHORT(a);
re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
break;
}
case Crepeat1:
{
pos += 2;
break;
}
default:
{
/*PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");*/
re_errno = TP_RE_UNKNOWN_OPCODE;
return;
/*NOTREACHED*/
}
}
}
 
static int re_do_compile_fastmap(unsigned char *buffer, int used, int pos,
unsigned char *can_be_null,
unsigned char *fastmap)
{
unsigned char small_visited[512], *visited;
if (used <= sizeof(small_visited))
visited = small_visited;
else
{
visited = (unsigned char *)malloc(used);
if (!visited)
return 0;
}
*can_be_null = 0;
memset(fastmap, 0, 256);
memset(visited, 0, used);
re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
if (visited != small_visited)
free(visited);
return 1;
}
 
void re_compile_fastmap(regexp_t bufp)
{
if (!bufp->fastmap || bufp->fastmap_accurate)
return;
assert(bufp->used > 0);
if (!re_do_compile_fastmap(bufp->buffer,
bufp->used,
0,
&bufp->can_be_null,
bufp->fastmap))
return;
/*if (PyErr_Occurred()) return;*/
if (re_err_occurred()) return;
 
if (bufp->buffer[0] == Cbol)
bufp->anchor = 1; /* begline */
else
if (bufp->buffer[0] == Cbegbuf)
bufp->anchor = 2; /* begbuf */
else
bufp->anchor = 0; /* none */
bufp->fastmap_accurate = 1;
}
 
/*
* star is coded as:
* 1: failure_jump 2
* ... code for operand of star
* star_jump 1
* 2: ... code after star
*
* We change the star_jump to update_failure_jump if we can determine
* that it is safe to do so; otherwise we change it to an ordinary
* jump.
*
* plus is coded as
*
* jump 2
* 1: failure_jump 3
* 2: ... code for operand of plus
* star_jump 1
* 3: ... code after plus
*
* For star_jump considerations this is processed identically to star.
*
*/
 
static int re_optimize_star_jump(regexp_t bufp, unsigned char *code)
{
unsigned char map[256];
unsigned char can_be_null;
unsigned char *p1;
unsigned char *p2;
unsigned char ch;
int a;
int b;
int num_instructions = 0;
 
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)SHORT(a);
p1 = code + a + 3; /* skip the failure_jump */
/* Check that the jump is within the pattern */
if (p1<bufp->buffer || bufp->buffer+bufp->used<p1)
{
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (failure_jump opt)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
return 0;
}
assert(p1[-3] == Cfailure_jump);
p2 = code;
/* p1 points inside loop, p2 points to after loop */
if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
(int)(p2 - bufp->buffer),
&can_be_null, map))
goto make_normal_jump;
/* If we might introduce a new update point inside the
* loop, we can't optimize because then update_jump would
* update a wrong failure point. Thus we have to be
* quite careful here.
*/
/* loop until we find something that consumes a character */
loop_p1:
num_instructions++;
switch (*p1++)
{
case Cbol:
case Ceol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
goto loop_p1;
}
case Cstart_memory:
case Cend_memory:
{
p1++;
goto loop_p1;
}
case Cexact:
{
ch = (unsigned char)*p1++;
if (map[(int)ch])
goto make_normal_jump;
break;
}
case Canychar:
{
for (b = 0; b < 256; b++)
if (b != '\n' && map[b])
goto make_normal_jump;
break;
}
case Cset:
{
for (b = 0; b < 256; b++)
if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
goto make_normal_jump;
p1 += 256/8;
break;
}
default:
{
goto make_normal_jump;
}
}
/* now we know that we can't backtrack. */
while (p1 != p2 - 3)
{
num_instructions++;
switch (*p1++)
{
case Cend:
{
return 0;
}
case Cbol:
case Ceol:
case Canychar:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
break;
}
case Cset:
{
p1 += 256/8;
break;
}
case Cexact:
case Cstart_memory:
case Cend_memory:
case Cmatch_memory:
case Csyntaxspec:
case Cnotsyntaxspec:
{
p1++;
break;
}
case Cjump:
case Cstar_jump:
case Cfailure_jump:
case Cupdate_failure_jump:
case Cdummy_failure_jump:
{
goto make_normal_jump;
}
default:
{
return 0;
}
}
}
/* make_update_jump: */
code -= 3;
a += 3; /* jump to after the Cfailure_jump */
code[0] = Cupdate_failure_jump;
code[1] = a & 0xff;
code[2] = a >> 8;
if (num_instructions > 1)
return 1;
assert(num_instructions == 1);
/* if the only instruction matches a single character, we can do
* better */
p1 = code + 3 + a; /* start of sole instruction */
if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
*p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
code[0] = Crepeat1;
return 1;
make_normal_jump:
code -= 3;
*code = Cjump;
return 1;
}
 
static int re_optimize(regexp_t bufp)
{
unsigned char *code;
code = bufp->buffer;
while(1)
{
switch (*code++)
{
case Cend:
{
return 1;
}
case Canychar:
case Cbol:
case Ceol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
break;
}
case Cset:
{
code += 256/8;
break;
}
case Cexact:
case Cstart_memory:
case Cend_memory:
case Cmatch_memory:
case Csyntaxspec:
case Cnotsyntaxspec:
{
code++;
break;
}
case Cstar_jump:
{
if (!re_optimize_star_jump(bufp, code))
{
return 0;
}
/* fall through */
}
case Cupdate_failure_jump:
case Cjump:
case Cdummy_failure_jump:
case Cfailure_jump:
case Crepeat1:
{
code += 2;
break;
}
default:
{
return 0;
}
}
}
}
 
#define NEXTCHAR(var) \
{ \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
}
 
#define ALLOC(amount) \
{ \
if (pattern_offset+(amount) > alloc) \
{ \
alloc += 256 + (amount); \
pattern = (unsigned char *)realloc(pattern, alloc); \
if (!pattern) \
goto out_of_memory; \
} \
}
 
#define STORE(ch) pattern[pattern_offset++] = (ch)
 
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
 
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
 
#define PUSH_LEVEL_STARTS \
if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex \
 
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
 
#define PUT_ADDR(offset,addr) \
{ \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
}
 
#define INSERT_JUMP(pos,type,addr) \
{ \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
}
 
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
 
#define SET_FIELDS \
{ \
bufp->allocated = alloc; \
bufp->buffer = pattern; \
bufp->used = pattern_offset; \
}
#define GETHEX(var) \
{ \
unsigned char gethex_ch, gethex_value; \
NEXTCHAR(gethex_ch); \
gethex_value = hex_char_to_decimal(gethex_ch); \
if (gethex_value == 16) \
goto hex_error; \
NEXTCHAR(gethex_ch); \
gethex_ch = hex_char_to_decimal(gethex_ch); \
if (gethex_ch == 16) \
goto hex_error; \
(var) = gethex_value * 16 + gethex_ch; \
}
 
#define ANSI_TRANSLATE(ch) \
{ \
switch (ch) \
{ \
case 'a': \
case 'A': \
{ \
ch = 7; /* audible bell */ \
break; \
} \
case 'b': \
case 'B': \
{ \
ch = 8; /* backspace */ \
break; \
} \
case 'f': \
case 'F': \
{ \
ch = 12; /* form feed */ \
break; \
} \
case 'n': \
case 'N': \
{ \
ch = 10; /* line feed */ \
break; \
} \
case 'r': \
case 'R': \
{ \
ch = 13; /* carriage return */ \
break; \
} \
case 't': \
case 'T': \
{ \
ch = 9; /* tab */ \
break; \
} \
case 'v': \
case 'V': \
{ \
ch = 11; /* vertical tab */ \
break; \
} \
case 'x': /* hex code */ \
case 'X': \
{ \
GETHEX(ch); \
break; \
} \
default: \
{ \
/* other characters passed through */ \
if (translate) \
ch = translate[(unsigned char)ch]; \
break; \
} \
} \
}
 
char *re_compile_pattern(unsigned char *regex, int size, regexp_t bufp)
{
int a;
int pos;
int op;
int current_level;
int level;
int opcode;
int pattern_offset = 0, alloc;
int starts[NUM_LEVELS * MAX_NESTING];
int starts_base;
int future_jumps[MAX_NESTING];
int num_jumps;
unsigned char ch = '\0';
unsigned char *pattern;
unsigned char *translate;
int next_register;
int paren_depth;
int num_open_registers;
int open_registers[RE_NREGS];
int beginning_context;
if (!re_compile_initialized)
re_compile_initialize();
bufp->used = 0;
bufp->fastmap_accurate = 0;
bufp->uses_registers = 1;
bufp->num_registers = 1;
translate = bufp->translate;
pattern = bufp->buffer;
alloc = bufp->allocated;
if (alloc == 0 || pattern == NULL)
{
alloc = 256;
pattern = (unsigned char *)malloc(alloc);
if (!pattern)
goto out_of_memory;
}
pattern_offset = 0;
starts_base = 0;
num_jumps = 0;
current_level = 0;
SET_LEVEL_START;
num_open_registers = 0;
next_register = 1;
paren_depth = 0;
beginning_context = 1;
op = -1;
/* we use Rend dummy to ensure that pending jumps are updated
(due to low priority of Rend) before exiting the loop. */
pos = 0;
while (op != Rend)
{
if (pos >= size)
op = Rend;
else
{
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
op = regexp_plain_ops[(unsigned char)ch];
if (op == Rquote)
{
NEXTCHAR(ch);
op = regexp_quoted_ops[(unsigned char)ch];
if (op == Rnormal && regexp_ansi_sequences)
ANSI_TRANSLATE(ch);
}
}
level = regexp_precedences[op];
/* printf("ch='%c' op=%d level=%d current_level=%d
curlevstart=%d\n", ch, op, level, current_level,
CURRENT_LEVEL_START); */
if (level > current_level)
{
for (current_level++; current_level < level; current_level++)
SET_LEVEL_START;
SET_LEVEL_START;
}
else
if (level < current_level)
{
current_level = level;
for (;num_jumps > 0 &&
future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
num_jumps--)
PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
}
switch (op)
{
case Rend:
{
break;
}
case Rnormal:
{
normal_char:
opcode = Cexact;
store_opcode_and_arg: /* opcode & ch must be set */
SET_LEVEL_START;
ALLOC(2);
STORE(opcode);
STORE(ch);
break;
}
case Ranychar:
{
opcode = Canychar;
store_opcode:
SET_LEVEL_START;
ALLOC(1);
STORE(opcode);
break;
}
case Rquote:
{
/*Py_FatalError("Rquote");*/
re_errno = TP_RE_QUOTE_ERR;
abort(); /* XXX: may need to jump to error handler */
/*NOTREACHED*/
}
case Rbol:
{
if (!beginning_context) {
if (regexp_context_indep_ops)
goto op_error;
else
goto normal_char;
}
opcode = Cbol;
goto store_opcode;
}
case Reol:
{
if (!((pos >= size) ||
((regexp_syntax & RE_NO_BK_VBAR) ?
(regex[pos] == '\174') :
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == '\174')) ||
((regexp_syntax & RE_NO_BK_PARENS)?
(regex[pos] == ')'):
(pos+1 < size && regex[pos] == '\134' &&
regex[pos+1] == ')')))) {
if (regexp_context_indep_ops)
goto op_error;
else
goto normal_char;
}
opcode = Ceol;
goto store_opcode;
/* NOTREACHED */
break;
}
case Roptional:
{
if (beginning_context) {
if (regexp_context_indep_ops)
goto op_error;
else
goto normal_char;
}
if (CURRENT_LEVEL_START == pattern_offset)
break; /* ignore empty patterns for ? */
ALLOC(3);
INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
pattern_offset + 3);
break;
}
case Rstar:
case Rplus:
{
if (beginning_context) {
if (regexp_context_indep_ops)
goto op_error;
else
goto normal_char;
}
if (CURRENT_LEVEL_START == pattern_offset)
break; /* ignore empty patterns for + and * */
ALLOC(9);
INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
pattern_offset + 6);
INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
if (op == Rplus) /* jump over initial failure_jump */
INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
CURRENT_LEVEL_START + 6);
break;
}
case Ror:
{
ALLOC(6);
INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
pattern_offset + 6);
if (num_jumps >= MAX_NESTING)
goto too_complex;
STORE(Cjump);
future_jumps[num_jumps++] = pattern_offset;
STORE(0);
STORE(0);
SET_LEVEL_START;
break;
}
case Ropenpar:
{
SET_LEVEL_START;
if (next_register < RE_NREGS)
{
bufp->uses_registers = 1;
ALLOC(2);
STORE(Cstart_memory);
STORE(next_register);
open_registers[num_open_registers++] = next_register;
bufp->num_registers++;
next_register++;
}
paren_depth++;
PUSH_LEVEL_STARTS;
current_level = 0;
SET_LEVEL_START;
break;
}
case Rclosepar:
{
if (paren_depth <= 0)
goto parenthesis_error;
POP_LEVEL_STARTS;
current_level = regexp_precedences[Ropenpar];
paren_depth--;
if (paren_depth < num_open_registers)
{
bufp->uses_registers = 1;
ALLOC(2);
STORE(Cend_memory);
num_open_registers--;
STORE(open_registers[num_open_registers]);
}
break;
}
case Rmemory:
{
if (ch == '0')
goto bad_match_register;
assert(ch >= '0' && ch <= '9');
bufp->uses_registers = 1;
opcode = Cmatch_memory;
ch -= '0';
goto store_opcode_and_arg;
}
case Rextended_memory:
{
NEXTCHAR(ch);
if (ch < '0' || ch > '9')
goto bad_match_register;
NEXTCHAR(a);
if (a < '0' || a > '9')
goto bad_match_register;
ch = 10 * (a - '0') + ch - '0';
if (ch == 0 || ch >= RE_NREGS)
goto bad_match_register;
bufp->uses_registers = 1;
opcode = Cmatch_memory;
goto store_opcode_and_arg;
}
case Ropenset:
{
int complement;
int prev;
int offset;
int range;
int firstchar;
SET_LEVEL_START;
ALLOC(1+256/8);
STORE(Cset);
offset = pattern_offset;
for (a = 0; a < 256/8; a++)
STORE(0);
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
if (ch == '\136')
{
complement = 1;
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
}
else
complement = 0;
prev = -1;
range = 0;
firstchar = 1;
while (ch != '\135' || firstchar)
{
firstchar = 0;
if (regexp_ansi_sequences && ch == '\134')
{
NEXTCHAR(ch);
ANSI_TRANSLATE(ch);
}
if (range)
{
for (a = prev; a <= (int)ch; a++)
SETBIT(pattern, offset, a);
prev = -1;
range = 0;
}
else
if (prev != -1 && ch == '-')
range = 1;
else
{
SETBIT(pattern, offset, ch);
prev = ch;
}
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
}
if (range)
SETBIT(pattern, offset, '-');
if (complement)
{
for (a = 0; a < 256/8; a++)
pattern[offset+a] ^= 0xff;
}
break;
}
case Rbegbuf:
{
opcode = Cbegbuf;
goto store_opcode;
}
case Rendbuf:
{
opcode = Cendbuf;
goto store_opcode;
}
case Rwordchar:
{
opcode = Csyntaxspec;
ch = Sword;
goto store_opcode_and_arg;
}
case Rnotwordchar:
{
opcode = Cnotsyntaxspec;
ch = Sword;
goto store_opcode_and_arg;
}
case Rwordbeg:
{
opcode = Cwordbeg;
goto store_opcode;
}
case Rwordend:
{
opcode = Cwordend;
goto store_opcode;
}
case Rwordbound:
{
opcode = Cwordbound;
goto store_opcode;
}
case Rnotwordbound:
{
opcode = Cnotwordbound;
goto store_opcode;
}
default:
{
abort();
}
}
beginning_context = (op == Ropenpar || op == Ror);
}
if (starts_base != 0)
goto parenthesis_error;
assert(num_jumps == 0);
ALLOC(1);
STORE(Cend);
SET_FIELDS;
if(!re_optimize(bufp))
return "Optimization error";
return NULL;
 
op_error:
SET_FIELDS;
return "Badly placed special character";
 
bad_match_register:
SET_FIELDS;
return "Bad match register number";
hex_error:
SET_FIELDS;
return "Bad hexadecimal number";
parenthesis_error:
SET_FIELDS;
return "Badly placed parenthesis";
out_of_memory:
SET_FIELDS;
return "Out of memory";
ends_prematurely:
SET_FIELDS;
return "Regular expression ends prematurely";
 
too_complex:
SET_FIELDS;
return "Regular expression too complex";
}
 
#undef CHARAT
#undef NEXTCHAR
#undef GETHEX
#undef ALLOC
#undef STORE
#undef CURRENT_LEVEL_START
#undef SET_LEVEL_START
#undef PUSH_LEVEL_STARTS
#undef POP_LEVEL_STARTS
#undef PUT_ADDR
#undef INSERT_JUMP
#undef SETBIT
#undef SET_FIELDS
 
#define PREFETCH if (text == textend) goto fail
 
#define NEXTCHAR(var) \
PREFETCH; \
var = (unsigned char)*text++; \
if (translate) \
var = translate[var]
 
int re_match(regexp_t bufp, unsigned char *string, int size, int pos,
regexp_registers_t old_regs)
{
unsigned char *code;
unsigned char *translate;
unsigned char *text;
unsigned char *textstart;
unsigned char *textend;
int a;
int b;
int ch;
int reg;
int match_end;
unsigned char *regstart;
unsigned char *regend;
int regsize;
match_state state;
assert(pos >= 0 && size >= 0);
assert(pos <= size);
text = string + pos;
textstart = string;
textend = string + size;
code = bufp->buffer;
translate = bufp->translate;
NEW_STATE(state, bufp->num_registers);
 
continue_matching:
switch (*code++)
{
case Cend:
{
match_end = text - textstart;
if (old_regs)
{
old_regs->start[0] = pos;
old_regs->end[0] = match_end;
if (!bufp->uses_registers)
{
for (a = 1; a < RE_NREGS; a++)
{
old_regs->start[a] = -1;
old_regs->end[a] = -1;
}
}
else
{
for (a = 1; a < bufp->num_registers; a++)
{
if ((GET_REG_START(state, a) == NULL) ||
(GET_REG_END(state, a) == NULL))
{
old_regs->start[a] = -1;
old_regs->end[a] = -1;
continue;
}
old_regs->start[a] = GET_REG_START(state, a) - textstart;
old_regs->end[a] = GET_REG_END(state, a) - textstart;
}
for (; a < RE_NREGS; a++)
{
old_regs->start[a] = -1;
old_regs->end[a] = -1;
}
}
}
FREE_STATE(state);
return match_end - pos;
}
case Cbol:
{
if (text == textstart || text[-1] == '\n')
goto continue_matching;
goto fail;
}
case Ceol:
{
if (text == textend || *text == '\n')
goto continue_matching;
goto fail;
}
case Cset:
{
NEXTCHAR(ch);
if (code[ch/8] & (1<<(ch & 7)))
{
code += 256/8;
goto continue_matching;
}
goto fail;
}
case Cexact:
{
NEXTCHAR(ch);
if (ch != (unsigned char)*code++)
goto fail;
goto continue_matching;
}
case Canychar:
{
NEXTCHAR(ch);
if (ch == '\n')
goto fail;
goto continue_matching;
}
case Cstart_memory:
{
reg = *code++;
SET_REG_START(state, reg, text, goto error);
goto continue_matching;
}
case Cend_memory:
{
reg = *code++;
SET_REG_END(state, reg, text, goto error);
goto continue_matching;
}
case Cmatch_memory:
{
reg = *code++;
regstart = GET_REG_START(state, reg);
regend = GET_REG_END(state, reg);
if ((regstart == NULL) || (regend == NULL))
goto fail; /* or should we just match nothing? */
regsize = regend - regstart;
 
if (regsize > (textend - text))
goto fail;
if(translate)
{
for (; regstart < regend; regstart++, text++)
if (translate[*regstart] != translate[*text])
goto fail;
}
else
for (; regstart < regend; regstart++, text++)
if (*regstart != *text)
goto fail;
goto continue_matching;
}
case Cupdate_failure_jump:
{
UPDATE_FAILURE(state, text, goto error);
/* fall to next case */
}
/* treat Cstar_jump just like Cjump if it hasn't been optimized */
case Cstar_jump:
case Cjump:
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
code += (int)SHORT(a);
if (code<bufp->buffer || bufp->buffer+bufp->used<code) {
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cjump)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
FREE_STATE(state);
return -2;
}
goto continue_matching;
}
case Cdummy_failure_jump:
{
unsigned char *failuredest;
 
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)SHORT(a);
assert(*code == Cfailure_jump);
b = (unsigned char)code[1];
b |= (unsigned char)code[2] << 8;
failuredest = code + (int)SHORT(b) + 3;
if (failuredest<bufp->buffer || bufp->buffer+bufp->used < failuredest) {
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
FREE_STATE(state);
return -2;
}
PUSH_FAILURE(state, failuredest, NULL, goto error);
code += a;
if (code<bufp->buffer || bufp->buffer+bufp->used < code) {
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump code)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
FREE_STATE(state);
return -2;
}
goto continue_matching;
}
case Cfailure_jump:
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)SHORT(a);
if (code+a<bufp->buffer || bufp->buffer+bufp->used < code+a) {
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cfailure_jump)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
FREE_STATE(state);
return -2;
}
PUSH_FAILURE(state, code + a, text, goto error);
goto continue_matching;
}
case Crepeat1:
{
unsigned char *pinst;
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)SHORT(a);
pinst = code + a;
if (pinst<bufp->buffer || bufp->buffer+bufp->used<pinst) {
/*PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Crepeat1)");*/
re_errno = TP_RE_JUMP_OUT_BOUNDS;
FREE_STATE(state);
return -2;
}
/* pinst is sole instruction in loop, and it matches a
* single character. Since Crepeat1 was originally a
* Cupdate_failure_jump, we also know that backtracking
* is useless: so long as the single-character
* expression matches, it must be used. Also, in the
* case of +, we've already matched one character, so +
* can't fail: nothing here can cause a failure. */
switch (*pinst++)
{
case Cset:
{
if (translate)
{
while (text < textend)
{
ch = translate[(unsigned char)*text];
if (pinst[ch/8] & (1<<(ch & 7)))
text++;
else
break;
}
}
else
{
while (text < textend)
{
ch = (unsigned char)*text;
if (pinst[ch/8] & (1<<(ch & 7)))
text++;
else
break;
}
}
break;
}
case Cexact:
{
ch = (unsigned char)*pinst;
if (translate)
{
while (text < textend &&
translate[(unsigned char)*text] == ch)
text++;
}
else
{
while (text < textend && (unsigned char)*text == ch)
text++;
}
break;
}
case Canychar:
{
while (text < textend && (unsigned char)*text != '\n')
text++;
break;
}
case Csyntaxspec:
{
a = (unsigned char)*pinst;
if (translate)
{
while (text < textend &&
(SYNTAX(translate[*text]) & a) )
text++;
}
else
{
while (text < textend && (SYNTAX(*text) & a) )
text++;
}
break;
}
case Cnotsyntaxspec:
{
a = (unsigned char)*pinst;
if (translate)
{
while (text < textend &&
!(SYNTAX(translate[*text]) & a) )
text++;
}
else
{
while (text < textend && !(SYNTAX(*text) & a) )
text++;
}
break;
}
default:
{
FREE_STATE(state);
/*PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");*/
re_errno = TP_RE_UNKNOWN_OPCODE;
return -2;
/*NOTREACHED*/
}
}
/* due to the funky way + and * are compiled, the top
* failure- stack entry at this point is actually a
* success entry -- update it & pop it */
UPDATE_FAILURE(state, text, goto error);
goto fail; /* i.e., succeed <wink/sigh> */
}
case Cbegbuf:
{
if (text == textstart)
goto continue_matching;
goto fail;
}
case Cendbuf:
{
if (text == textend)
goto continue_matching;
goto fail;
}
case Cwordbeg:
{
if (text == textend)
goto fail;
if (!(SYNTAX(*text) & Sword))
goto fail;
if (text == textstart)
goto continue_matching;
if (!(SYNTAX(text[-1]) & Sword))
goto continue_matching;
goto fail;
}
case Cwordend:
{
if (text == textstart)
goto fail;
if (!(SYNTAX(text[-1]) & Sword))
goto fail;
if (text == textend)
goto continue_matching;
if (!(SYNTAX(*text) & Sword))
goto continue_matching;
goto fail;
}
case Cwordbound:
{
/* Note: as in gnu regexp, this also matches at the
* beginning and end of buffer. */
 
if (text == textstart || text == textend)
goto continue_matching;
if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
goto continue_matching;
goto fail;
}
case Cnotwordbound:
{
/* Note: as in gnu regexp, this never matches at the
* beginning and end of buffer. */
if (text == textstart || text == textend)
goto fail;
if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
goto continue_matching;
goto fail;
}
case Csyntaxspec:
{
NEXTCHAR(ch);
if (!(SYNTAX(ch) & (unsigned char)*code++))
goto fail;
goto continue_matching;
}
case Cnotsyntaxspec:
{
NEXTCHAR(ch);
if (SYNTAX(ch) & (unsigned char)*code++)
goto fail;
goto continue_matching;
}
default:
{
FREE_STATE(state);
/*PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");*/
re_errno = TP_RE_UNKNOWN_OPCODE;
return -2;
/*NOTREACHED*/
}
}
 
#if 0 /* This line is never reached --Guido */
abort();
#endif
/*
*NOTREACHED
*/
 
/* Using "break;" in the above switch statement is equivalent to "goto fail;" */
fail:
POP_FAILURE(state, code, text, goto done_matching, goto error);
goto continue_matching;
done_matching:
/* if(translated != NULL) */
/* free(translated); */
FREE_STATE(state);
return -1;
 
error:
/* if (translated != NULL) */
/* free(translated); */
FREE_STATE(state);
return -2;
}
 
#undef PREFETCH
#undef NEXTCHAR
 
int re_search(regexp_t bufp, unsigned char *string, int size, int pos,
int range, regexp_registers_t regs)
{
unsigned char *fastmap;
unsigned char *translate;
unsigned char *text;
unsigned char *partstart;
unsigned char *partend;
int dir;
int ret;
unsigned char anchor;
assert(size >= 0 && pos >= 0);
assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
fastmap = bufp->fastmap;
translate = bufp->translate;
if (fastmap && !bufp->fastmap_accurate) {
re_compile_fastmap(bufp);
if (re_err_occurred()) return -2;
}
anchor = bufp->anchor;
if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
fastmap = NULL;
 
if (range < 0)
{
dir = -1;
range = -range;
}
else
dir = 1;
 
if (anchor == 2) {
if (pos != 0)
return -1;
else
range = 0;
}
 
for (; range >= 0; range--, pos += dir)
{
if (fastmap)
{
if (dir == 1)
{ /* searching forwards */
 
text = string + pos;
partend = string + size;
partstart = text;
if (translate)
while (text != partend &&
!fastmap[(unsigned char) translate[(unsigned char)*text]])
text++;
else
while (text != partend && !fastmap[(unsigned char)*text])
text++;
pos += text - partstart;
range -= text - partstart;
if (pos == size && bufp->can_be_null == 0)
return -1;
}
else
{ /* searching backwards */
text = string + pos;
partstart = string + pos - range;
partend = text;
if (translate)
while (text != partstart &&
!fastmap[(unsigned char)
translate[(unsigned char)*text]])
text--;
else
while (text != partstart &&
!fastmap[(unsigned char)*text])
text--;
pos -= partend - text;
range -= partend - text;
}
}
if (anchor == 1)
{ /* anchored to begline */
if (pos > 0 && (string[pos - 1] != '\n'))
continue;
}
assert(pos >= 0 && pos <= size);
ret = re_match(bufp, string, size, pos, regs);
if (ret >= 0)
return pos;
if (ret == -2)
return -2;
}
return -1;
}
 
/*
** Local Variables:
** mode: c
** c-file-style: "python"
** End:
*/
/programs/develop/tinypy/modules/re/regexpr.h
0,0 → 1,160
/*
* -*- mode: c-mode; c-file-style: python -*-
*/
 
#ifndef Py_REGEXPR_H
#define Py_REGEXPR_H
#ifdef __cplusplus
extern "C" {
#endif
 
/*
* regexpr.h
*
* Author: Tatu Ylonen <ylo@ngs.fi>
*
* Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without fee,
* provided that the above copyright notice appear in all copies. This
* software is provided "as is" without express or implied warranty.
*
* Created: Thu Sep 26 17:15:36 1991 ylo
* Last modified: Mon Nov 4 15:49:46 1991 ylo
*/
 
/* $Id$ */
 
#ifndef REGEXPR_H
#define REGEXPR_H
 
#define RE_NREGS 100 /* number of registers available */
 
typedef struct re_pattern_buffer
{
unsigned char *buffer; /* compiled pattern */
int allocated; /* allocated size of compiled pattern */
int used; /* actual length of compiled pattern */
unsigned char *fastmap; /* fastmap[ch] is true if ch can start pattern */
unsigned char *translate; /* translation to apply during compilation/matching */
unsigned char fastmap_accurate; /* true if fastmap is valid */
unsigned char can_be_null; /* true if can match empty string */
unsigned char uses_registers; /* registers are used and need to be initialized */
int num_registers; /* number of registers used */
unsigned char anchor; /* anchor: 0=none 1=begline 2=begbuf */
} *regexp_t;
 
typedef struct re_registers
{
int start[RE_NREGS]; /* start offset of region */
int end[RE_NREGS]; /* end offset of region */
} *regexp_registers_t;
 
/* bit definitions for syntax */
#define RE_NO_BK_PARENS 1 /* no quoting for parentheses */
#define RE_NO_BK_VBAR 2 /* no quoting for vertical bar */
#define RE_BK_PLUS_QM 4 /* quoting needed for + and ? */
#define RE_TIGHT_VBAR 8 /* | binds tighter than ^ and $ */
#define RE_NEWLINE_OR 16 /* treat newline as or */
#define RE_CONTEXT_INDEP_OPS 32 /* ^$?*+ are special in all contexts */
#define RE_ANSI_HEX 64 /* ansi sequences (\n etc) and \xhh */
#define RE_NO_GNU_EXTENSIONS 128 /* no gnu extensions */
 
#define TP_RE_NOERR 0
#define TP_RE_UNKNOWN_OPCODE (-1)
#define TP_RE_JUMP_OUT_BOUNDS 1
#define TP_RE_QUOTE_ERR 2
 
/* definitions for some common regexp styles */
#define RE_SYNTAX_AWK (RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS)
#define RE_SYNTAX_EGREP (RE_SYNTAX_AWK|RE_NEWLINE_OR)
#define RE_SYNTAX_GREP (RE_BK_PLUS_QM|RE_NEWLINE_OR)
#define RE_SYNTAX_EMACS 0
 
#define Sword 1
#define Swhitespace 2
#define Sdigit 4
#define Soctaldigit 8
#define Shexdigit 16
 
/* Rename all exported symbols to avoid conflicts with similarly named
symbols in some systems' standard C libraries... */
 
#define re_syntax _Py_re_syntax
#define re_syntax_table _Py_re_syntax_table
#define re_compile_initialize _Py_re_compile_initialize
#define re_set_syntax _Py_re_set_syntax
#define re_compile_pattern _Py_re_compile_pattern
#define re_match _Py_re_match
#define re_search _Py_re_search
#define re_compile_fastmap _Py_re_compile_fastmap
#define re_comp _Py_re_comp
#define re_exec _Py_re_exec
 
#ifdef HAVE_PROTOTYPES
 
extern int re_syntax;
/* This is the actual syntax mask. It was added so that Python could do
* syntax-dependent munging of patterns before compilation. */
 
extern unsigned char re_syntax_table[256];
 
void re_compile_initialize(void);
 
int re_set_syntax(int syntax);
/* This sets the syntax to use and returns the previous syntax. The
* syntax is specified by a bit mask of the above defined bits. */
 
char *re_compile_pattern(unsigned char *regex, int regex_size, regexp_t compiled);
/* This compiles the regexp (given in regex and length in regex_size).
* This returns NULL if the regexp compiled successfully, and an error
* message if an error was encountered. The buffer field must be
* initialized to a memory area allocated by malloc (or to NULL) before
* use, and the allocated field must be set to its length (or 0 if
* buffer is NULL). Also, the translate field must be set to point to a
* valid translation table, or NULL if it is not used. */
 
int re_match(regexp_t compiled, unsigned char *string, int size, int pos,
regexp_registers_t old_regs);
/* This tries to match the regexp against the string. This returns the
* length of the matched portion, or -1 if the pattern could not be
* matched and -2 if an error (such as failure stack overflow) is
* encountered. */
 
int re_search(regexp_t compiled, unsigned char *string, int size, int startpos,
int range, regexp_registers_t regs);
/* This searches for a substring matching the regexp. This returns the
* first index at which a match is found. range specifies at how many
* positions to try matching; positive values indicate searching
* forwards, and negative values indicate searching backwards. mstop
* specifies the offset beyond which a match must not go. This returns
* -1 if no match is found, and -2 if an error (such as failure stack
* overflow) is encountered. */
 
void re_compile_fastmap(regexp_t compiled);
/* This computes the fastmap for the regexp. For this to have any effect,
* the calling program must have initialized the fastmap field to point
* to an array of 256 characters. */
 
#else /* HAVE_PROTOTYPES */
 
extern int re_syntax;
extern unsigned char re_syntax_table[256];
void re_compile_initialize();
int re_set_syntax();
char *re_compile_pattern();
int re_match();
int re_search();
void re_compile_fastmap();
 
#endif /* HAVE_PROTOTYPES */
 
#endif /* REGEXPR_H */
 
 
 
#ifdef __cplusplus
}
#endif
#endif /* !Py_REGEXPR_H */
/programs/develop/tinypy/modules/re/tests.py
0,0 → 1,648
"""
test case for re module
"""
 
import re
import testsuite
SUCCEED, FAIL, SYNTAX_ERROR = range(3)
 
def RAISE():
raise("testing failed")
 
def main():
#print("begin re tests")
 
assert(re.__name__ != None)
assert(re.__doc__ != None)
assert(re.__file__ != None)
 
test_re_obj_search()
test_re_obj_match()
test_re_mod_search()
test_re_mod_match()
test_re_obj_split()
test_re_mod_split()
test_re_obj_findall()
test_re_mod_findall()
test_mat_obj_groups()
test_mat_obj_start()
test_mat_obj_end()
test_mat_obj_span()
 
print("#OK: re tests passed")
 
def test_re_obj_search(verbose = None):
"""
some tests borrowed from cpython
testing re.compile(), reobj.search(), and matobj.group()
"""
regex_tests = testsuite.search_regex_tests
for t in regex_tests:
pattern=s=outcome=repl=expected=None
if len(t)==5:
pattern, s, outcome, repl, expected = t
elif len(t)==3:
pattern, s, outcome = t
else:
raise ('Test tuples should have 3 or 5 fields',t)
 
try:
obj=re.compile(pattern)
except:
if outcome==SYNTAX_ERROR: continue # Expected a syntax error
else:
# Regex syntax errors aren't yet reported, so for
# the official test suite they'll be quietly ignored.
pass
try:
matobj=obj.search(s)
except:
print('=== Unexpected exception:', obj, matobj, pattern, s)
RAISE()
 
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
elif outcome==FAIL:
if matobj==None: pass # No match, as expected
else: print('=== Succeeded incorrectly', obj, matobj, pattern, s)
elif outcome==SUCCEED:
if matobj!=None:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
found=matobj.group(0)
repl = repl.replace("found", str(found))
for i in range(1,11):
if "g"+str(i) in repl:
gi = str(matobj.group(i))
repl = repl.replace("g"+str(i), gi)
if len(t) == 5:
repl = repl.replace('+', '')
repl = repl.replace('\"', '')
if repl!=expected:
print( '=== grouping error', t,
str(repl)+' should be '+str(expected))
RAISE()
else:
print ('=== Failed incorrectly', t)
 
def test_re_obj_match(verbose = None):
"""
some tests borrowed from cpython
testing re.compile(), reobj.match() and matobj.group()
"""
regex_tests = testsuite.match_regex_tests
for t in regex_tests:
pattern=s=outcome=repl=expected=None
if len(t)==5:
pattern, s, outcome, repl, expected = t
elif len(t)==3:
pattern, s, outcome = t
else:
raise ('Test tuples should have 3 or 5 fields',t)
 
try:
obj=re.compile(pattern)
except:
if outcome==SYNTAX_ERROR: continue # Expected a syntax error
else:
# Regex syntax errors aren't yet reported, so for
# the official test suite they'll be quietly ignored.
pass
try:
matobj=obj.match(s)
except:
print('=== Unexpected exception:', obj, matobj, pattern, s)
 
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
elif outcome==FAIL:
if matobj==None: pass # No match, as expected
else: print('=== Succeeded incorrectly', obj, matobj, pattern, s)
elif outcome==SUCCEED:
if matobj!=None:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
found=matobj.group(0)
repl = repl.replace("found", str(found))
for i in range(1,11):
if "g"+str(i) in repl:
gi = str(matobj.group(i))
repl = repl.replace("g"+str(i), gi)
if len(t) == 5:
repl = repl.replace('+', '')
repl = repl.replace('\"', '')
if repl!=expected:
print( '=== grouping error', t,
str(repl)+' should be '+str(expected))
RAISE()
else:
print ('=== Failed incorrectly', obj, matobj, pattern, s)
 
def test_re_mod_search(verbose = None):
"""
some tests borrowed from cpython
testing re.search(), and matobj.group()
"""
regex_tests = testsuite.search_regex_tests
for t in regex_tests:
pattern=s=outcome=repl=expected=None
if len(t)==5:
pattern, s, outcome, repl, expected = t
elif len(t)==3:
pattern, s, outcome = t
else:
raise ('Test tuples should have 3 or 5 fields',t)
 
try:
matobj=re.search(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
else:
print('=== Unexpected exception:', matobj, pattern, s)
 
if outcome==FAIL:
if matobj==None: pass # No match, as expected
else: print('=== Succeeded incorrectly', obj, matobj, pattern, s)
elif outcome==SUCCEED:
if matobj!=None:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
found=matobj.group(0)
repl = repl.replace("found", str(found))
for i in range(1,11):
if "g"+str(i) in repl:
gi = str(matobj.group(i))
repl = repl.replace("g"+str(i), gi)
if len(t) == 5:
repl = repl.replace('+', '')
repl = repl.replace('\"', '')
if repl!=expected:
print( '=== grouping error', t,
str(repl)+' should be '+str(expected))
RAISE()
else:
print ('=== Failed incorrectly', t)
 
def test_re_mod_match(verbose = None):
"""
some tests borrowed from cpython
testing re.match(), and matobj.group()
"""
regex_tests = testsuite.match_regex_tests
for t in regex_tests:
pattern=s=outcome=repl=expected=None
if len(t)==5:
pattern, s, outcome, repl, expected = t
elif len(t)==3:
pattern, s, outcome = t
else:
raise ('Test tuples should have 3 or 5 fields',t)
 
try:
matobj=re.match(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
else:
print('=== Unexpected exception:', matobj, pattern, s)
 
if outcome==FAIL:
if matobj==None: pass # No match, as expected
else: print('=== Succeeded incorrectly', matobj, pattern, s)
elif outcome==SUCCEED:
if matobj!=None:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
found=matobj.group(0)
repl = repl.replace("found", str(found))
for i in range(1,11):
if "g"+str(i) in repl:
gi = str(matobj.group(i))
repl = repl.replace("g"+str(i), gi)
if len(t) == 5:
repl = repl.replace('+', '')
repl = repl.replace('\"', '')
if repl!=expected:
print( '=== grouping error', t,
str(repl)+' should be '+str(expected))
RAISE()
else:
print ('=== Failed incorrectly', t)
 
def test_re_obj_split(verbose = None):
"""
test re.compile(), and reobj.split()
"""
regex_tests = testsuite.split_regex_tests
for t in regex_tests:
pattern, s, outcome, maxsplit, fields = t
try:
reobj = re.compile(pattern)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
else:
print('=== Unexpected exception:', pattern, s,
outcome, maxsplit, fields)
try:
fldlst=reobj.split(s, maxsplit)
except:
if outcome == SYNTAX_ERROR:
continue
else:
print('=== Unexpected exception:', pattern, s,
outcome, maxsplit, fields)
 
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if fldlst:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
if verbose:
fldstr = fieldstr = ""
for item in fldlst:
fldstr = fldstr + str(item) + " | "
for item in fields:
fieldstr = fieldstr + str(item) + " | "
print(fldstr, "~~~", fieldstr)
if len(fields) != len(fldlst):
print('=== Not coherent 1')
RAISE()
 
for i in range(len(fields)):
if fields[i] != fldlst[i]:
if verbose:
print('=== Not coherent 2', pattern, s,
outcome, maxsplit, fields, i,
fields[i],'(',len(fields[i]),')', ' | ',
fldlst[i],'(',len(fldlst[i]),')')
else:
print('=== Not coherent 2')
RAISE()
else:
print ('=== Failed incorrectly', pattern, s,
outcome, maxsplit, fields)
 
def test_re_mod_split(verbose = None):
"""
test re.split()
"""
regex_tests = testsuite.split_regex_tests
for t in regex_tests:
pattern, s, outcome, maxsplit, fields = t
try:
fldlst=re.split(pattern, s, maxsplit)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception:', pattern, s,
outcome, maxsplit, fields)
 
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if fldlst:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
if verbose:
fldstr = fieldstr = ""
for item in fldlst:
fldstr = fldstr + str(item) + " | "
for item in fields:
fieldstr = fieldstr + str(item) + " | "
print(fldstr, "~~~", fieldstr)
 
if len(fields) != len(fldlst):
print('=== Not coherent 1')
RAISE()
 
for i in range(len(fields)):
if fields[i] != fldlst[i]:
if verbose:
print('=== Not coherent 2', pattern, s,
outcome, maxsplit, fields, i,
fields[i],'(',len(fields[i]),')', ' | ',
fldlst[i],'(',len(fldlst[i]),')')
else:
print('=== Not coherent 2')
RAISE()
else:
print ('=== Failed incorrectly', pattern, s,
outcome, maxsplit, fields)
 
def test_re_obj_findall(verbose = None):
"""
test re.compile(), and reobj.findall()
"""
regex_tests = testsuite.findall_regex_tests
for t in regex_tests:
pattern, s, outcome, pos, fields = t
try:
reobj = re.compile(pattern)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
else:
print('=== Unexpected exception:', pattern, s,
outcome, pos, fields)
try:
fldlst=reobj.findall(s, pos)
except:
if outcome == SYNTAX_ERROR:
continue
else:
print('=== Unexpected exception:', pattern, s,
outcome, pos, fields)
 
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if fldlst:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
if verbose:
fldstr = fieldstr = ""
for item in fldlst:
fldstr = fldstr + str(item) + " | "
for item in fields:
fieldstr = fieldstr + str(item) + " | "
print(fldstr, "~~~", fieldstr)
 
if len(fields) != len(fldlst):
print('=== Not coherent 1')
RAISE()
 
for i in range(len(fields)):
if fields[i] != fldlst[i]:
if verbose:
print('=== Not coherent 2', pattern, s,
outcome, maxsplit, fields, i,
fields[i],'(',len(fields[i]),')', ' | ',
fldlst[i],'(',len(fldlst[i]),')')
else:
print('=== Not coherent 2')
RAISE()
else:
print ('=== Failed incorrectly', pattern, s,
outcome, pos, fields)
 
def test_re_mod_findall(verbose = None):
"""
test re.findall()
"""
regex_tests = testsuite.mod_findall_regex_tests
for t in regex_tests:
pattern, s, outcome, pos, fields = t # pos is not used
try:
fldlst=re.findall(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception:', pattern, s,
outcome, pos, fields)
 
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if fldlst:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
if verbose:
fldstr = fieldstr = ""
for item in fldlst:
fldstr = fldstr + str(item) + " | "
for item in fields:
fieldstr = fieldstr + str(item) + " | "
print(fldstr, "~~~", fieldstr)
 
if len(fields) != len(fldlst):
print('=== Not coherent 1')
RAISE()
 
for i in range(len(fields)):
if fields[i] != fldlst[i]:
if verbose:
print('=== Not coherent 2', pattern, s,
outcome, maxsplit, fields, i,
fields[i],'(',len(fields[i]),')', ' | ',
fldlst[i],'(',len(fldlst[i]),')')
else:
print('=== Not coherent 2')
RAISE()
else:
print ('=== Failed incorrectly', pattern, s,
outcome, pos, fields)
 
def test_mat_obj_groups(verbose = None):
"""
test re.search(), and matobj.groups()
'verbose' is for debugging, when 'verbose' is true, print extra info
"""
regex_tests = testsuite.matobj_groups_regex_tests
for t in regex_tests:
pattern, s, outcome, fields, grpidx, start, end = t
try:
matobj=re.search(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 1:', pattern, s,
outcome,fields)
 
try:
if outcome==SUCCEED: assert(matobj != None)
fldlst = matobj.groups()
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 2:', pattern, s,
outcome,fields)
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if fldlst and fields:
# Matched, as expected, so now we compute the
# result string and compare it to our expected result.
if verbose:
fldstr = fieldstr = ""
for item in fldlst:
fldstr = fldstr + str(item) + " | "
for item in fields:
fieldstr = fieldstr + str(item) + " | "
print(fldstr, "~~~", fieldstr)
 
if len(fields) != len(fldlst):
print('=== Not coherent 2')
RAISE()
 
for i in range(len(fields)):
if fields[i] != fldlst[i]:
if verbose:
print('=== Not coherent', pattern, s,
outcome,fields, i,
fields[i],'(',len(fields[i]),')', ' | ',
fldlst[i],'(',len(fldlst[i]),')')
else:
print('=== Not coherent')
RAISE()
elif not len(fldlst) and not len(fields):
# output is empty, as expected
if verbose:
print("output is empty, as expected")
continue
else:
if verbose:
for item in fldlst:
print(item,)
print()
for item in fields:
print(item,)
print()
print ('=== Failed incorrectly', pattern, s,
outcome,fields,fldlst)
 
def test_mat_obj_start(verbose = None):
"""
test re.search(), and matobj.start()
'verbose' is for debugging, when 'verbose' is true, print extra info
"""
regex_tests = testsuite.matobj_groups_regex_tests
for t in regex_tests:
pattern, s, outcome, fields, grpidx, start, end = t
try:
matobj=re.search(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 1:', pattern, s,
outcome,fields)
 
try:
if outcome==SUCCEED: assert(matobj != None)
fldlst = matobj.groups()
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 2:', pattern, s,
outcome,fields)
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if grpidx > 0:
if matobj.start(grpidx) == start:
pass
else:
if verbose:
print ('=== Failed incorrectly', pattern, s,
outcome,fields,fldlst)
raise("testing failed")
 
 
def test_mat_obj_end(verbose = None):
"""
test re.search(), and matobj.end()
'verbose' is for debugging, when 'verbose' is true, print extra info
"""
regex_tests = testsuite.matobj_groups_regex_tests
for t in regex_tests:
pattern, s, outcome, fields, grpidx, start, end = t
try:
matobj=re.search(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 1:', pattern, s,
outcome,fields)
 
try:
if outcome==SUCCEED: assert(matobj != None)
fldlst = matobj.groups()
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 2:', pattern, s,
outcome,fields)
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if grpidx > 0:
if matobj.end(grpidx) == end:
pass
else:
if verbose:
print ('=== Failed incorrectly', pattern, s,
outcome,fields,fldlst, matobj.end(grpidx), end)
raise("testing failed")
 
def test_mat_obj_span(verbose = None):
"""
test re.search(), and matobj.span()
'verbose' is for debugging, when 'verbose' is true, print extra info
"""
regex_tests = testsuite.matobj_groups_regex_tests
for t in regex_tests:
pattern, s, outcome, fields, grpidx, start, end = t
try:
matobj=re.search(pattern, s)
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 1:', pattern, s,
outcome,fields)
 
try:
if outcome==SUCCEED: assert(matobj != None)
fldlst = matobj.groups()
except:
if outcome==SYNTAX_ERROR:
# This should have been a syntax error; forget it.
continue
else:
print('=== Unexpected exception 2:', pattern, s,
outcome,fields)
if outcome==FAIL:
pass # No match, as expected
elif outcome==SUCCEED:
if (grpidx > 0):
spstart, spend = matobj.span(grpidx)
if spstart == start and spend == end:
pass
else:
if verbose:
print ('=== Failed incorrectly', pattern, s,
outcome,fields,fldlst)
raise("testing failed")
 
if __name__ == "__main__":
main()
 
/programs/develop/tinypy/modules/re/testsuite.py
0,0 → 1,367
# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples. The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
# 1: the string to match against the pattern
# 2: the expected result (0 - SUCCEED, 1 - FAIL, 2 - SYNTAX_ERROR)
# 3: a string that will be eval()'ed to produce a test string.
# This is an arbitrary Python expression; the available
# variables are "found" (the whole match), and "g1", "g2", ...
# up to "g10" contain the contents of each group, or the
# string 'None' if the group wasn't given a value.
# 4: The expected result of evaluating the expression.
# If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.
 
# test suite for search
search_regex_tests=[
['abc', 'abc', 0, 'found', 'abc'],
['abc', 'xbc', 1],
['abc', 'axc', 1],
['abc', 'abx', 1],
['abc', 'xabcy', 0, 'found', 'abc'],
['abc', 'ababc', 0, 'found', 'abc'],
['ab*c', 'abc', 0, 'found', 'abc'],
['ab*bc', 'abc', 0, 'found', 'abc'],
['ab*bc', 'abbc', 0, 'found', 'abbc'],
['ab*bc', 'abbbbc', 0, 'found', 'abbbbc'],
['ab+bc', 'abbc', 0, 'found', 'abbc'],
['ab+bc', 'abc', 1],
['ab+bc', 'abq', 1],
['ab+bc', 'abbbbc', 0, 'found', 'abbbbc'],
['ab?bc', 'abbc', 0, 'found', 'abbc'],
['ab?bc', 'abc', 0, 'found', 'abc'],
['ab?bc', 'abbbbc', 1],
['ab?c', 'abc', 0, 'found', 'abc'],
['^abc$', 'abc', 0, 'found', 'abc'],
['^abc$', 'abcc', 1],
['^abc', 'abcc', 0, 'found', 'abc'],
['^abc$', 'aabc', 1],
['abc$', 'aabc', 0, 'found', 'abc'],
['^', 'abc', 0, 'found+"-"', '-'],
['$', 'abc', 0, 'found+"-"', '-'],
['a.c', 'abc', 0, 'found', 'abc'],
['a.c', 'axc', 0, 'found', 'axc'],
['a.*c', 'axyzc', 0, 'found', 'axyzc'],
['a.*c', 'axyzd', 1],
['a[bc]d', 'abc', 1],
['a[bc]d', 'abd', 0, 'found', 'abd'],
['a[b-d]e', 'abd', 1],
['a[b-d]e', 'ace', 0, 'found', 'ace'],
['a[b-d]', 'aac', 0, 'found', 'ac'],
['a[-b]', 'a-', 0, 'found', 'a-'],
['a[b-]', 'a-', 0, 'found', 'a-'],
['a[]b', '-', 2],
['a[', '-', 2],
['a\\', '-', 2],
['abc\\)', '-', 2],
['\\(abc', '-', 2],
['a]', 'a]', 0, 'found', 'a]'],
['a[]]b', 'a]b', 0, 'found', 'a]b'],
['a[^bc]d', 'aed', 0, 'found', 'aed'],
['a[^bc]d', 'abd', 1],
['a[^-b]c', 'adc', 0, 'found', 'adc'],
['a[^-b]c', 'a-c', 1],
['a[^]b]c', 'a]c', 1],
['a[^]b]c', 'adc', 0, 'found', 'adc'],
['\\ba\\b', 'a-', 0, '"-"', '-'],
['\\ba\\b', '-a', 0, '"-"', '-'],
['\\ba\\b', '-a-', 0, '"-"', '-'],
['\\by\\b', 'xy', 1],
['\\by\\b', 'yz', 1],
['\\by\\b', 'xyz', 1],
['ab\\|cd', 'abc', 0, 'found', 'ab'],
['ab\\|cd', 'abcd', 0, 'found', 'ab'],
['\\(\\)ef', 'def', 0, 'found+"-"+g1', 'ef-'],
['$b', 'b', 1],
['a(b', 'a(b', 0, 'found+"-"+g1', 'a(b-None'],
['a(*b', 'ab', 0, 'found', 'ab'],
['a(*b', 'a((b', 0, 'found', 'a((b'],
['a\\\\b', 'a\\b', 0, 'found', 'a\\b'],
['\\(\\(a\\)\\)', 'abc', 0, 'found+"-"+g1+"-"+g2', 'a-a-a'],
['\\(a\\)b\\(c\\)', 'abc', 0, 'found+"-"+g1+"-"+g2', 'abc-a-c'],
['a+b+c', 'aabbabc', 0, 'found', 'abc'],
['\\(a+\\|b\\)*', 'ab', 0, 'found+"-"+g1', 'ab-b'],
['\\(a+\\|b\\)+', 'ab', 0, 'found+"-"+g1', 'ab-b'],
['\\(a+\\|b\\)?', 'ab', 0, 'found+"-"+g1', 'a-a'],
['\\)\\(', '-', 2],
['[^ab]*', 'cde', 0, 'found', 'cde'],
['abc', '', 1],
['a*', '', 0, 'found', ''],
['a\\|b\\|c\\|d\\|e', 'e', 0, 'found', 'e'],
['\\(a\\|b\\|c\\|d\\|e\\)f', 'ef', 0, 'found+"-"+g1', 'ef-e'],
['abcd*efg', 'abcdefg', 0, 'found', 'abcdefg'],
['ab*', 'xabyabbbz', 0, 'found', 'ab'],
['ab*', 'xayabbbz', 0, 'found', 'a'],
['\\(ab\\|cd\\)e', 'abcde', 0, 'found+"-"+g1', 'cde-cd'],
['[abhgefdc]ij', 'hij', 0, 'found', 'hij'],
['^\\(ab\\|cd\\)e', 'abcde', 1, 'xg1y', 'xy'],
['\\(abc\\|\\)ef', 'abcdef', 0, 'found+"-"+g1', 'ef-'],
['\\(a\\|b\\)c*d', 'abcd', 0, 'found+"-"+g1', 'bcd-b'],
['\\(ab\\|ab*\\)bc', 'abc', 0, 'found+"-"+g1', 'abc-a'],
['a\\([bc]*\\)c*', 'abc', 0, 'found+"-"+g1', 'abc-bc'],
['a\\([bc]*\\)\\(c*d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'],
['a\\([bc]+\\)\\(c*d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'],
['a\\([bc]*\\)\\(c+d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'],
['a[bcd]*dcdcde', 'adcdcde', 0, 'found', 'adcdcde'],
['a[bcd]+dcdcde', 'adcdcde', 1],
['\\(ab\\|a\\)b*c', 'abc', 0, 'found+"-"+g1', 'abc-ab'],
['\\(\\(a\\)\\(b\\)c\\)\\(d\\)', 'abcd', 0, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'],
['[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', 0, 'found', 'alpha'],
['^a\\(bc+\\|b[eh]\\)g\\|.h$', 'abh', 0, 'found+"-"+g1', 'bh-None'],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effgz', 0, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'ij', 0, 'found+"-"+g1+"-"+g2', 'ij-ij-j'],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effg', 1],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'bcdd', 1],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'reffgz', 0, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'],
['\\(\\(\\(\\(\\(\\(\\(\\(\\(a\\)\\)\\)\\)\\)\\)\\)\\)\\)', 'a', 0, 'found', 'a'],
['multiple words of text', 'uh-uh', 1],
['multiple words', 'multiple words, yeah', 0, 'found', 'multiple words'],
['\\(.*\\)c\\(.*\\)', 'abcde', 0, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'],
['(\\(.*\\), \\(.*\\))', '(a, b)', 0, 'g2+"-"+g1', 'b-a'],
['[k]', 'ab', 1],
['a[-]?c', 'ac', 0, 'found', 'ac'],
['\\(abc\\)\\1', 'abcabc', 0, 'g1', 'abc'],
['\\([a-c]*\\)\\1', 'abcabc', 0, 'g1', 'abc'],
['^\\(.+\\)?B', 'AB', 0, 'g1', 'A'],
['\\(a+\\).\\1$', 'aaaaa', 0, 'found+"-"+g1', 'aaaaa-aa'],
['^\\(a+\\).\\1$', 'aaaa', 1],
['\\(abc\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\([a-c]+\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\(a\\)\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a+\\)\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a+\\)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a\\).+\\1', 'aba', 0, 'found+"-"+g1', 'aba-a'],
['\\(a\\)ba*\\1', 'aba', 0, 'found+"-"+g1', 'aba-a'],
['\\(aa\\|a\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\(a\\|aa\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\(a+\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\([abc]*\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\(a\\)\\(b\\)c\\|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None'],
['\\(a\\)+x', 'aaax', 0, 'found+"-"+g1', 'aaax-a'],
['\\([ac]\\)+x', 'aacx', 0, 'found+"-"+g1', 'aacx-c'],
['\\([^/]*/\\)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'],
['\\([^.]*\\)\\.\\([^:]*\\):[T ]+\\(.*\\)', 'track1.title:TBlah blah blah', 0, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'],
['\\([^N]*N\\)+', 'abNNxyzN', 0, 'found+"-"+g1', 'abNNxyzN-xyzN'],
['\\([^N]*N\\)+', 'abNNxyz', 0, 'found+"-"+g1', 'abNN-N'],
['\\([abc]*\\)x', 'abcx', 0, 'found+"-"+g1', 'abcx-abc'],
['\\([abc]*\\)x', 'abc', 1],
['\\([xyz]*\\)x', 'abcx', 0, 'found+"-"+g1', 'x-'],
['\\(a\\)+b\\|aac', 'aac', 0, 'found+"-"+g1', 'aac-None'],
['\\<a', 'a', 0, 'found', 'a'],
['\\<a', '!', 1],
['a\\<b', 'ab', 1],
['a\\>', 'ab', 1],
['a\\>', 'a!', 0, 'found', 'a'],
['a\\>', 'a', 0, 'found', 'a'],
]
 
 
# test suite for match
match_regex_tests=[
['abc', 'abc', 0, 'found', 'abc'],
['abc', 'xbc', 1],
['abc', 'axc', 1],
['abc', 'abx', 1],
['abc', 'xabcy', 1],
['abc', 'ababc', 1],
['ab*c', 'abc', 0, 'found', 'abc'],
['ab*bc', 'abc', 0, 'found', 'abc'],
['ab*bc', 'abbc', 0, 'found', 'abbc'],
['ab*bc', 'abbbbc', 0, 'found', 'abbbbc'],
['ab+bc', 'abbc', 0, 'found', 'abbc'],
['ab+bc', 'abc', 1],
['ab+bc', 'abq', 1],
['ab+bc', 'abbbbc', 0, 'found', 'abbbbc'],
['ab?bc', 'abbc', 0, 'found', 'abbc'],
['ab?bc', 'abc', 0, 'found', 'abc'],
['ab?bc', 'abbbbc', 1],
['ab?c', 'abc', 0, 'found', 'abc'],
['^abc$', 'abc', 0, 'found', 'abc'],
['^abc$', 'abcc', 1],
['^abc', 'abcc', 0, 'found', 'abc'],
['^abc$', 'aabc', 1],
['abc$', 'aabc', 1],
['^', 'abc', 0, 'found+"-"', '-'],
['$', 'abc', 1],
['a.c', 'abc', 0, 'found', 'abc'],
['a.c', 'axc', 0, 'found', 'axc'],
['a.*c', 'axyzc', 0, 'found', 'axyzc'],
['a.*c', 'axyzd', 1],
['a[bc]d', 'abc', 1],
['a[bc]d', 'abd', 0, 'found', 'abd'],
['a[b-d]e', 'abd', 1],
['a[b-d]e', 'ace', 0, 'found', 'ace'],
['a[b-d]', 'aac', 1],
['a[-b]', 'a-', 0, 'found', 'a-'],
['a[b-]', 'a-', 0, 'found', 'a-'],
['a[]b', '-', 2],
['a[', '-', 2],
['a\\', '-', 2],
['abc\\)', '-', 2],
['\\(abc', '-', 2],
['a]', 'a]', 0, 'found', 'a]'],
['a[]]b', 'a]b', 0, 'found', 'a]b'],
['a[^bc]d', 'aed', 0, 'found', 'aed'],
['a[^bc]d', 'abd', 1],
['a[^-b]c', 'adc', 0, 'found', 'adc'],
['a[^-b]c', 'a-c', 1],
['a[^]b]c', 'a]c', 1],
['a[^]b]c', 'adc', 0, 'found', 'adc'],
['\\ba\\b', 'a-', 0, '"-"', '-'],
['\\ba\\b', '-a', 1],
['\\ba\\b', '-a-', 1],
['\\by\\b', 'xy', 1],
['\\by\\b', 'yz', 1],
['\\by\\b', 'xyz', 1],
['ab\\|cd', 'abc', 0, 'found', 'ab'],
['ab\\|cd', 'abcd', 0, 'found', 'ab'],
['\\(\\)ef', 'def', 1],
['$b', 'b', 1],
['a(b', 'a(b', 0, 'found+"-"+g1', 'a(b-None'],
['a(*b', 'ab', 0, 'found', 'ab'],
['a(*b', 'a((b', 0, 'found', 'a((b'],
['a\\\\b', 'a\\b', 0, 'found', 'a\\b'],
['\\(\\(a\\)\\)', 'abc', 0, 'found+"-"+g1+"-"+g2', 'a-a-a'],
['\\(a\\)b\\(c\\)', 'abc', 0, 'found+"-"+g1+"-"+g2', 'abc-a-c'],
['a+b+c', 'aabbabc', 1],
['\\(a+\\|b\\)*', 'ab', 0, 'found+"-"+g1', 'ab-b'],
['\\(a+\\|b\\)+', 'ab', 0, 'found+"-"+g1', 'ab-b'],
['\\(a+\\|b\\)?', 'ab', 0, 'found+"-"+g1', 'a-a'],
['\\)\\(', '-', 2],
['[^ab]*', 'cde', 0, 'found', 'cde'],
['abc', '', 1],
['a*', '', 0, 'found', ''],
['a\\|b\\|c\\|d\\|e', 'e', 0, 'found', 'e'],
['\\(a\\|b\\|c\\|d\\|e\\)f', 'ef', 0, 'found+"-"+g1', 'ef-e'],
['abcd*efg', 'abcdefg', 0, 'found', 'abcdefg'],
['ab*', 'xabyabbbz', 1],
['ab*', 'xayabbbz', 1],
['\\(ab\\|cd\\)e', 'abcde', 1],
['[abhgefdc]ij', 'hij', 0, 'found', 'hij'],
['^\\(ab\\|cd\\)e', 'abcde', 1, 'xg1y', 'xy'],
['\\(abc\\|\\)ef', 'abcdef', 1],
['\\(a\\|b\\)c*d', 'abcd', 1],
['\\(ab\\|ab*\\)bc', 'abc', 0, 'found+"-"+g1', 'abc-a'],
['a\\([bc]*\\)c*', 'abc', 0, 'found+"-"+g1', 'abc-bc'],
['a\\([bc]*\\)\\(c*d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'],
['a\\([bc]+\\)\\(c*d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'],
['a\\([bc]*\\)\\(c+d\\)', 'abcd', 0, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'],
['a[bcd]*dcdcde', 'adcdcde', 0, 'found', 'adcdcde'],
['a[bcd]+dcdcde', 'adcdcde', 1],
['\\(ab\\|a\\)b*c', 'abc', 0, 'found+"-"+g1', 'abc-ab'],
['\\(\\(a\\)\\(b\\)c\\)\\(d\\)', 'abcd', 0, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'],
['[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', 0, 'found', 'alpha'],
['^a\\(bc+\\|b[eh]\\)g\\|.h$', 'abh', 1],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effgz', 0, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'ij', 0, 'found+"-"+g1+"-"+g2', 'ij-ij-j'],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effg', 1],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'bcdd', 1],
['\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'reffgz', 1],
['\\(\\(\\(\\(\\(\\(\\(\\(\\(a\\)\\)\\)\\)\\)\\)\\)\\)\\)', 'a', 0, 'found', 'a'],
['multiple words of text', 'uh-uh', 1],
['multiple words', 'multiple words, yeah', 0, 'found', 'multiple words'],
['\\(.*\\)c\\(.*\\)', 'abcde', 0, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'],
['(\\(.*\\), \\(.*\\))', '(a, b)', 0, 'g2+"-"+g1', 'b-a'],
['[k]', 'ab', 1],
['a[-]?c', 'ac', 0, 'found', 'ac'],
['\\(abc\\)\\1', 'abcabc', 0, 'g1', 'abc'],
['\\([a-c]*\\)\\1', 'abcabc', 0, 'g1', 'abc'],
['^\\(.+\\)?B', 'AB', 0, 'g1', 'A'],
['\\(a+\\).\\1$', 'aaaaa', 0, 'found+"-"+g1', 'aaaaa-aa'],
['^\\(a+\\).\\1$', 'aaaa', 1],
['\\(abc\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\([a-c]+\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\(a\\)\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a+\\)\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a+\\)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a'],
['\\(a\\).+\\1', 'aba', 0, 'found+"-"+g1', 'aba-a'],
['\\(a\\)ba*\\1', 'aba', 0, 'found+"-"+g1', 'aba-a'],
['\\(aa\\|a\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\(a\\|aa\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\(a+\\)a\\1$', 'aaa', 0, 'found+"-"+g1', 'aaa-a'],
['\\([abc]*\\)\\1', 'abcabc', 0, 'found+"-"+g1', 'abcabc-abc'],
['\\(a\\)\\(b\\)c\\|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None'],
['\\(a\\)+x', 'aaax', 0, 'found+"-"+g1', 'aaax-a'],
['\\([ac]\\)+x', 'aacx', 0, 'found+"-"+g1', 'aacx-c'],
['\\([^/]*/\\)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', 0, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'],
['\\([^.]*\\)\\.\\([^:]*\\):[T ]+\\(.*\\)', 'track1.title:TBlah blah blah', 0, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'],
['\\([^N]*N\\)+', 'abNNxyzN', 0, 'found+"-"+g1', 'abNNxyzN-xyzN'],
['\\([^N]*N\\)+', 'abNNxyz', 0, 'found+"-"+g1', 'abNN-N'],
['\\([abc]*\\)x', 'abcx', 0, 'found+"-"+g1', 'abcx-abc'],
['\\([abc]*\\)x', 'abc', 1],
['\\([xyz]*\\)x', 'abcx', 1],
['\\(a\\)+b\\|aac', 'aac', 0, 'found+"-"+g1', 'aac-None'],
['\\<a', 'a', 0, 'found', 'a'],
['\\<a', '!', 1],
['a\\<b', 'ab', 1],
['a\\>', 'ab', 1],
['a\\>', 'a!', 0, 'found', 'a'],
['a\\>', 'a', 0, 'found', 'a'],
]
 
# test suite for split()
# element 0: pattern
# 1: string to split
# 3: compile result
# 4: maxsplit
# 5: splitted fields list
split_regex_tests = [
["[ |,]", "with you, nothing, and me", 0, 0, ["with","you","nothing","and","me"]],
["[ |,]", "with you, nothing, and me", 0, 1, ["with", "you, nothing, and me"]],
["\\ ", "send email to apply", 0, 0, ["send", "email", "to", "apply"]],
["\\ ", "send email to apply", 0, 2, ["send", "email", "to apply"]],
["[+ | -]", "+86-028-83201034", 0, 0, ["86", "028", "83201034"]],
["[+ | -]", "+86-028-83201034", 0, 1, ["86", "028-83201034"]],
["[*|#]", "slide show", 0, 0, ["slide show"]],
["(", "whats ever", 0, 1, ["whats ever"]],
["@#!~$%^&*()<>\n", "who knows", 0, 1, ["who knows"]],
]
 
# test suite for findall()
# element 0: pattern
# 1: string to match
# 3: compile result
# 4: starting position
# 5: grouped fields list
 
# reobj.find()
findall_regex_tests = [
["\\ ", "send email to apply", 0, 0, [" ", " ", " "]],
["\\ ", "send email to apply", 0, 5, [" ", " "]],
["[+ | -]", "+86-028-83201034", 0, 0, ["+", "-", "-"]],
["[+ | -]", "+86-028-83201034", 0, 1, ["-", "-"]],
["sl.*e\\|#", "slide show at Room #3", 0, 0, ["slide", "#"]],
["w.+s\\|e.*r", "whats ever", 0, 0, ["whats", "ever"]],
["Euler\\|Gauss", "Both Euler and Gauss are great mathematicians", 0, 0, ["Euler", "Gauss"]],
]
 
# module re.findall()
mod_findall_regex_tests = [
["\\ ", "send email to apply", 0, 0, [" ", " ", " "]],
["\\ ", "send email to apply", 0, 0, [" ", " ", " "]],
["[+ | -]", "+86-028-83201034", 0, 0, ["+", "-", "-"]],
["[+ | -]", "+86-028-83201034", 0, 0, ["+", "-", "-"]],
["sl.*e\\|#", "slide show at Room #3", 0, 0, ["slide", "#"]],
["w.+s\\|e.*r", "whats ever", 0, 0, ["whats", "ever"]],
["Euler\\|Gauss", "Both Euler and Gauss are great mathematicians", 0, 0, ["Euler", "Gauss"]],
]
 
# test for match object's groups() method
# element 0: pattern
# 1: string
# 2: compile result
# 3: matched fields, for groups()
# 4: group index, valid when > 0, for start(), end(), and span()
# 5: pattern's starting index in string, for start() and span()
# 6: pattern's ending index in string, for end() and span
matobj_groups_regex_tests = [
["\\(abc\\(.*xyz\\)\\(.*31415926\\)\\)", "where is abc and flurry xyz, which is pi 31415926, derived from ms", 0, ["abc and flurry xyz, which is pi 31415926"," and flurry xyz",", which is pi 31415926"], 2, 12, 27],
 
["[a\\|b]\\(.+\\)shoe\\([t]+\\)d", "bbbshoetttdxrznmlkjp", 0, ["bb", "ttt"], 1, 1, 3],
 
["abcdef", "xyah2oewoyqe030uabcdefwhalsdewnkhgiohyczb", 0, [], -1, 0, 0],
]