2014 lines
44 KiB
C++
2014 lines
44 KiB
C++
// This is a part of the Active Template Library.
|
|
// Copyright (C) Microsoft Corporation
|
|
// All rights reserved.
|
|
//
|
|
// This source code is only intended as a supplement to the
|
|
// Active Template Library Reference and related
|
|
// electronic documentation provided with the library.
|
|
// See these sources for detailed information regarding the
|
|
// Active Template Library product.
|
|
|
|
#ifndef __ATLRX_H__
|
|
#define __ATLRX_H__
|
|
|
|
#pragma once
|
|
|
|
#include <atlbase.h>
|
|
#include <atlcoll.h>
|
|
#include <mbstring.h>
|
|
|
|
#ifndef ATL_REGEXP_MIN_STACK
|
|
#define ATL_REGEXP_MIN_STACK 256
|
|
#endif
|
|
|
|
/*
|
|
Regular Expression Grammar
|
|
|
|
R - top level grammar rule
|
|
RE - regular expression
|
|
AltE - Alternative expression
|
|
E - expression
|
|
SE - simple expression
|
|
|
|
R -> RE
|
|
'^'RE (matches begining of string)
|
|
|
|
RE -> AltE RE
|
|
AltE
|
|
|
|
|
|
AltE -> E
|
|
E '|' AltE
|
|
E -> SE (RepeatOp '?'?)?
|
|
SE -> Arg
|
|
Group
|
|
CharClass
|
|
'\'Abbrev (see below)
|
|
'\'EscapedChar (any character including reserved symbols)
|
|
'\'Digit+ (Arg back reference)
|
|
'!' (not)
|
|
'.' (any char)
|
|
'$' (end of input)
|
|
Symbol (any non-reserved character)
|
|
Arg -> '{'RE'}'
|
|
Group -> '('RE')'
|
|
CharClass -> '[' '^'? CharSet ']'
|
|
CharSet -> CharItem+
|
|
CharItem -> Char('-'Char)?
|
|
RepeatOp -> '*'
|
|
'+'
|
|
'?'
|
|
Abbrev -> Abbreviation defined in CAtlRECharTraits
|
|
Abbrev Expansion Meaning
|
|
a ([a-zA-Z0-9]) alpha numeric
|
|
b ([ \\t]) white space (blank)
|
|
c ([a-zA-Z]) alpha
|
|
d ([0-9]) digit
|
|
h ([0-9a-fA-F]) hex digit
|
|
n (\r|(\r?\n)) newline
|
|
q (\"[^\"]*\")|(\'[^\']*\') quoted string
|
|
w ([a-zA-Z]+) simple word
|
|
z ([0-9]+) integer
|
|
*/
|
|
|
|
#pragma pack(push,_ATL_PACKING)
|
|
namespace ATL {
|
|
|
|
//Convertion utility classes used to convert char* to RECHAR.
|
|
//Used by rx debugging printing.
|
|
template <typename RECHARTYPE=char>
|
|
class CAToREChar
|
|
{
|
|
public:
|
|
CAToREChar(const char* psz) throw()
|
|
: m_psz(psz)
|
|
{
|
|
}
|
|
operator const RECHARTYPE*() const throw() { return m_psz; }
|
|
const char* m_psz;
|
|
};
|
|
|
|
template<>
|
|
class CAToREChar<wchar_t>
|
|
{
|
|
public:
|
|
CAToREChar(const char* psz) throw()
|
|
: m_a2w(psz)
|
|
{
|
|
}
|
|
operator const wchar_t*() const throw() { return (wchar_t*)m_a2w; }
|
|
|
|
private:
|
|
CA2W m_a2w;
|
|
};
|
|
|
|
class CAtlRECharTraitsA
|
|
{
|
|
public:
|
|
typedef char RECHARTYPE;
|
|
|
|
static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
|
|
{
|
|
#ifndef ATL_NO_CHECK_BIT_FIELD
|
|
ATLASSERT(UseBitFieldForRange());
|
|
#endif
|
|
return static_cast<size_t>(static_cast<unsigned char>(*sz));
|
|
}
|
|
static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
|
|
{
|
|
return (RECHARTYPE *) (sz+1);
|
|
}
|
|
|
|
static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return strncmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return _strnicmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
_ATL_INSECURE_DEPRECATE("CAtlRECharTraitsA::Strlwr must be passed a buffer size.")
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
|
|
{
|
|
#pragma warning (push)
|
|
#pragma warning(disable : 4996)
|
|
return _strlwr(sz);
|
|
#pragma warning (pop)
|
|
}
|
|
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
|
|
{
|
|
Checked::strlwr_s(sz, nSize);
|
|
return sz;
|
|
}
|
|
|
|
static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
|
|
{
|
|
return strtol(sz, szEnd, nBase);
|
|
}
|
|
|
|
static int Isdigit(RECHARTYPE ch) throw()
|
|
{
|
|
return isdigit(static_cast<unsigned char>(ch));
|
|
}
|
|
|
|
static const RECHARTYPE** GetAbbrevs()
|
|
{
|
|
static const RECHARTYPE *s_szAbbrevs[] =
|
|
{
|
|
"a([a-zA-Z0-9])", // alpha numeric
|
|
"b([ \\t])", // white space (blank)
|
|
"c([a-zA-Z])", // alpha
|
|
"d([0-9])", // digit
|
|
"h([0-9a-fA-F])", // hex digit
|
|
"n(\r|(\r?\n))", // newline
|
|
"q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
|
|
"w([a-zA-Z]+)", // simple word
|
|
"z([0-9]+)", // integer
|
|
NULL
|
|
};
|
|
|
|
return s_szAbbrevs;
|
|
}
|
|
|
|
static BOOL UseBitFieldForRange() throw()
|
|
{
|
|
return TRUE;
|
|
}
|
|
|
|
static int ByteLen(const RECHARTYPE *sz) throw()
|
|
{
|
|
return int(strlen(sz));
|
|
}
|
|
};
|
|
|
|
class CAtlRECharTraitsW
|
|
{
|
|
public:
|
|
typedef WCHAR RECHARTYPE;
|
|
|
|
static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
|
|
{
|
|
#ifndef ATL_NO_CHECK_BIT_FIELD
|
|
ATLASSERT(UseBitFieldForRange());
|
|
#endif
|
|
return static_cast<size_t>(*sz);
|
|
}
|
|
static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
|
|
{
|
|
return (RECHARTYPE *) (sz+1);
|
|
}
|
|
|
|
static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return wcsncmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return _wcsnicmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
_ATL_INSECURE_DEPRECATE("CAtlRECharTraitsW::Strlwr must be passed a buffer size.")
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
|
|
{
|
|
#pragma warning (push)
|
|
#pragma warning(disable : 4996)
|
|
return _wcslwr(sz);
|
|
#pragma warning (pop)
|
|
}
|
|
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
|
|
{
|
|
Checked::wcslwr_s(sz, nSize);
|
|
return sz;
|
|
}
|
|
|
|
static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
|
|
{
|
|
return wcstol(sz, szEnd, nBase);
|
|
}
|
|
|
|
static int Isdigit(RECHARTYPE ch) throw()
|
|
{
|
|
return iswdigit(ch);
|
|
}
|
|
|
|
static const RECHARTYPE** GetAbbrevs()
|
|
{
|
|
static const RECHARTYPE *s_szAbbrevs[] =
|
|
{
|
|
L"a([a-zA-Z0-9])", // alpha numeric
|
|
L"b([ \\t])", // white space (blank)
|
|
L"c([a-zA-Z])", // alpha
|
|
L"d([0-9])", // digit
|
|
L"h([0-9a-fA-F])", // hex digit
|
|
L"n(\r|(\r?\n))", // newline
|
|
L"q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
|
|
L"w([a-zA-Z]+)", // simple word
|
|
L"z([0-9]+)", // integer
|
|
NULL
|
|
};
|
|
|
|
return s_szAbbrevs;
|
|
}
|
|
|
|
static BOOL UseBitFieldForRange() throw()
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
static int ByteLen(const RECHARTYPE *sz) throw()
|
|
{
|
|
return int(wcslen(sz)*sizeof(WCHAR));
|
|
}
|
|
};
|
|
|
|
class CAtlRECharTraitsMB
|
|
{
|
|
public:
|
|
typedef unsigned char RECHARTYPE;
|
|
|
|
static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE *sz) throw()
|
|
{
|
|
#ifndef ATL_NO_CHECK_BIT_FIELD
|
|
ATLASSERT(UseBitFieldForRange());
|
|
#endif
|
|
|
|
return static_cast<size_t>(*sz);
|
|
}
|
|
|
|
static RECHARTYPE *Next(const RECHARTYPE *sz) throw()
|
|
{
|
|
return _mbsinc(sz);
|
|
}
|
|
|
|
static int Strncmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return _mbsncmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
static int Strnicmp(const RECHARTYPE *szLeft, const RECHARTYPE *szRight, size_t nCount) throw()
|
|
{
|
|
return _mbsnicmp(szLeft, szRight, nCount);
|
|
}
|
|
|
|
_ATL_INSECURE_DEPRECATE("CAtlRECharTraitsMB::Strlwr must be passed a buffer size.")
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz) throw()
|
|
{
|
|
#pragma warning (push)
|
|
#pragma warning(disable : 4996)
|
|
return _mbslwr(sz);
|
|
#pragma warning (pop)
|
|
}
|
|
|
|
static RECHARTYPE *Strlwr(RECHARTYPE *sz, int nSize) throw()
|
|
{
|
|
Checked::mbslwr_s(sz, nSize);
|
|
return sz;
|
|
}
|
|
|
|
static long Strtol(const RECHARTYPE *sz, RECHARTYPE **szEnd, int nBase) throw()
|
|
{
|
|
return strtol((const char *) sz, (char **) szEnd, nBase);
|
|
}
|
|
|
|
static int Isdigit(RECHARTYPE ch) throw()
|
|
{
|
|
return _ismbcdigit((unsigned int) ch);
|
|
}
|
|
|
|
static const RECHARTYPE** GetAbbrevs()
|
|
{
|
|
return reinterpret_cast<const RECHARTYPE **>(CAtlRECharTraitsA::GetAbbrevs());
|
|
}
|
|
|
|
static BOOL UseBitFieldForRange() throw()
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
static int ByteLen(const RECHARTYPE *sz) throw()
|
|
{
|
|
return (int)strlen((const char *) sz);
|
|
}
|
|
};
|
|
|
|
#ifndef _UNICODE
|
|
typedef CAtlRECharTraitsA CAtlRECharTraits;
|
|
#else // _UNICODE
|
|
typedef CAtlRECharTraitsW CAtlRECharTraits;
|
|
#endif // !_UNICODE
|
|
// Note: If you want to use CAtlRECharTraitsMB you must pass it in
|
|
// as a template argument
|
|
|
|
template <class CharTraits=CAtlRECharTraits>
|
|
class CAtlRegExp; // forward declaration
|
|
|
|
template <class CharTraits=CAtlRECharTraits>
|
|
class CAtlREMatchContext
|
|
{
|
|
public:
|
|
friend CAtlRegExp<CharTraits>;
|
|
typedef typename CharTraits::RECHARTYPE RECHAR;
|
|
|
|
struct MatchGroup
|
|
{
|
|
const RECHAR *szStart;
|
|
const RECHAR *szEnd;
|
|
};
|
|
|
|
UINT m_uNumGroups;
|
|
|
|
MatchGroup m_Match;
|
|
|
|
void GetMatch(UINT nIndex, const RECHAR **szStart, const RECHAR **szEnd)
|
|
{
|
|
ATLENSURE(szStart != NULL);
|
|
ATLENSURE(szEnd != NULL);
|
|
ATLENSURE(nIndex >=0 && nIndex < m_uNumGroups);
|
|
*szStart = m_Matches[nIndex].szStart;
|
|
*szEnd = m_Matches[nIndex].szEnd;
|
|
}
|
|
|
|
void GetMatch(UINT nIndex, MatchGroup *pGroup)
|
|
{
|
|
|
|
ATLENSURE(pGroup != NULL);
|
|
ATLENSURE(nIndex >=0&&(static_cast<UINT>(nIndex))< m_uNumGroups);
|
|
pGroup->szStart = m_Matches[nIndex].szStart;
|
|
pGroup->szEnd = m_Matches[nIndex].szEnd;
|
|
}
|
|
|
|
protected:
|
|
CAutoVectorPtr<void *> m_Mem;
|
|
CAutoVectorPtr<MatchGroup> m_Matches;
|
|
CAtlArray<void *> m_stack;
|
|
size_t m_nTos;
|
|
|
|
public:
|
|
CAtlREMatchContext(size_t nInitStackSize=ATL_REGEXP_MIN_STACK)
|
|
{
|
|
m_uNumGroups = 0;
|
|
m_nTos = 0;
|
|
m_stack.SetCount(nInitStackSize);
|
|
m_Match.szStart = NULL;
|
|
m_Match.szEnd = NULL;
|
|
}
|
|
|
|
protected:
|
|
BOOL Initialize(UINT uRequiredMem, UINT uNumGroups) throw()
|
|
{
|
|
m_nTos = 0;
|
|
|
|
m_uNumGroups = 0;
|
|
m_Matches.Free();
|
|
|
|
if (!m_Matches.Allocate(uNumGroups))
|
|
return FALSE;
|
|
|
|
m_uNumGroups = uNumGroups;
|
|
|
|
m_Mem.Free();
|
|
|
|
if (!m_Mem.Allocate(uRequiredMem))
|
|
return FALSE;
|
|
|
|
memset(m_Mem.m_p, 0x00, uRequiredMem*sizeof(void *));
|
|
|
|
memset(m_Matches, 0x00, m_uNumGroups * sizeof(MatchGroup));
|
|
return TRUE;
|
|
}
|
|
|
|
BOOL Push(void *p)
|
|
{
|
|
m_nTos++;
|
|
if (m_stack.GetCount() <= (UINT) m_nTos)
|
|
{
|
|
if (!m_stack.SetCount((m_nTos+1)*2))
|
|
{
|
|
m_nTos--;
|
|
return FALSE;
|
|
}
|
|
}
|
|
m_stack[m_nTos] = p;
|
|
return TRUE;
|
|
}
|
|
|
|
BOOL Push(size_t n)
|
|
{
|
|
return Push((void *) n);
|
|
}
|
|
|
|
void *Pop() throw()
|
|
{
|
|
if (m_nTos==0)
|
|
{
|
|
// stack underflow
|
|
// this should never happen at match time.
|
|
// (the parsing succeeded when it shouldn't have)
|
|
ATLASSERT(FALSE);
|
|
return NULL;
|
|
}
|
|
void *p = m_stack[m_nTos];
|
|
m_nTos--;
|
|
return p;
|
|
}
|
|
};
|
|
|
|
enum REParseError {
|
|
REPARSE_ERROR_OK = 0, // No error occurred
|
|
REPARSE_ERROR_OUTOFMEMORY, // Out of memory
|
|
REPARSE_ERROR_BRACE_EXPECTED, // A closing brace was expected
|
|
REPARSE_ERROR_PAREN_EXPECTED, // A closing parenthesis was expected
|
|
REPARSE_ERROR_BRACKET_EXPECTED, // A closing bracket was expected
|
|
REPARSE_ERROR_UNEXPECTED, // An unspecified fatal error occurred
|
|
REPARSE_ERROR_EMPTY_RANGE, // A range expression was empty
|
|
REPARSE_ERROR_INVALID_GROUP, // A backreference was made to a group
|
|
// that did not exist
|
|
REPARSE_ERROR_INVALID_RANGE, // An invalid range was specified
|
|
REPARSE_ERROR_EMPTY_REPEATOP, // A possibly empty * or + was detected
|
|
REPARSE_ERROR_INVALID_INPUT, // The input string was invalid
|
|
};
|
|
|
|
template <class CharTraits /* =CAtlRECharTraits */>
|
|
class CAtlRegExp
|
|
{
|
|
public:
|
|
CAtlRegExp() throw()
|
|
{
|
|
m_uNumGroups = 0;
|
|
m_uRequiredMem = 0;
|
|
m_bCaseSensitive = TRUE;
|
|
m_LastError = REPARSE_ERROR_OK;
|
|
}
|
|
|
|
typedef typename CharTraits::RECHARTYPE RECHAR;
|
|
|
|
// CAtlRegExp::Parse
|
|
// Parses the regular expression
|
|
// returns REPARSE_ERROR_OK if successful, an REParseError otherwise
|
|
REParseError Parse(const RECHAR *szRE, BOOL bCaseSensitive=TRUE)
|
|
{
|
|
ATLASSERT(szRE);
|
|
if (!szRE)
|
|
return REPARSE_ERROR_INVALID_INPUT;
|
|
|
|
Reset();
|
|
|
|
m_bCaseSensitive = bCaseSensitive;
|
|
|
|
const RECHAR *szInput = szRE;
|
|
|
|
if (!bCaseSensitive)
|
|
{
|
|
// copy the string
|
|
int nSize = CharTraits::ByteLen(szRE)+sizeof(RECHAR);
|
|
szInput = (const RECHAR *) malloc(nSize);
|
|
if (!szInput)
|
|
return REPARSE_ERROR_OUTOFMEMORY;
|
|
|
|
Checked::memcpy_s((char *) szInput, nSize, szRE, nSize);
|
|
|
|
CharTraits::Strlwr(const_cast<RECHAR *>(szInput), nSize/sizeof(RECHAR));
|
|
}
|
|
const RECHAR *sz = szInput;
|
|
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return REPARSE_ERROR_OUTOFMEMORY;
|
|
|
|
if (*sz == '^')
|
|
{
|
|
if (AddInstruction(RE_FAIL) < 0)
|
|
return REPARSE_ERROR_OUTOFMEMORY;
|
|
sz++;
|
|
}
|
|
else
|
|
{
|
|
if (AddInstruction(RE_ADVANCE) < 0)
|
|
return REPARSE_ERROR_OUTOFMEMORY;
|
|
}
|
|
|
|
bool bEmpty = true;
|
|
ParseRE(&sz, bEmpty);
|
|
if (!GetLastParseError())
|
|
{
|
|
GetInstruction(nCall).call.nTarget = 2;
|
|
|
|
if (AddInstruction(RE_MATCH) < 0)
|
|
return REPARSE_ERROR_OUTOFMEMORY;
|
|
}
|
|
|
|
if (szInput != szRE)
|
|
free((void *) szInput);
|
|
|
|
return GetLastParseError();
|
|
}
|
|
|
|
BOOL Match(const RECHAR *szIn, CAtlREMatchContext<CharTraits> *pContext, const RECHAR **ppszEnd=NULL)
|
|
{
|
|
ATLASSERT(szIn);
|
|
ATLASSERT(pContext);
|
|
|
|
if (!szIn || !pContext)
|
|
return FALSE;
|
|
|
|
if (ppszEnd)
|
|
*ppszEnd = NULL;
|
|
|
|
const RECHAR *szInput = szIn;
|
|
|
|
if (!m_bCaseSensitive)
|
|
{
|
|
int nSize = CharTraits::ByteLen(szIn)+sizeof(RECHAR);
|
|
szInput = (const RECHAR *) malloc(nSize);
|
|
if (!szInput)
|
|
return FALSE;
|
|
|
|
Checked::memcpy_s((char *) szInput, nSize, szIn, nSize);
|
|
CharTraits::Strlwr(const_cast<RECHAR *>(szInput), nSize/sizeof(RECHAR));
|
|
}
|
|
|
|
if (!pContext->Initialize(m_uRequiredMem, m_uNumGroups))
|
|
{
|
|
if (szInput != szIn)
|
|
free((void *) szInput);
|
|
return FALSE;
|
|
}
|
|
|
|
size_t ip = 0;
|
|
|
|
const RECHAR *sz = szInput;
|
|
const RECHAR *szCurrInput = szInput;
|
|
|
|
#pragma warning(push)
|
|
#pragma warning(disable:4127) // conditional expression is constant
|
|
|
|
while (1)
|
|
{
|
|
#ifdef ATLRX_DEBUG
|
|
OnDebugEvent(ip, szInput, sz, pContext);
|
|
#endif
|
|
if (ip == 0)
|
|
pContext->m_Match.szStart = sz;
|
|
|
|
switch (GetInstruction(ip).type)
|
|
{
|
|
case RE_NOP:
|
|
ip++;
|
|
break;
|
|
|
|
case RE_SYMBOL:
|
|
if (GetInstruction(ip).symbol.nSymbol == static_cast<size_t>(static_cast<_TUCHAR>(*sz)))
|
|
{
|
|
sz = CharTraits::Next(sz);
|
|
ip++;
|
|
}
|
|
else
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
break;
|
|
|
|
case RE_ANY:
|
|
if (*sz)
|
|
{
|
|
sz = CharTraits::Next(sz);
|
|
ip++;
|
|
}
|
|
else
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
break;
|
|
|
|
case RE_GROUP_START:
|
|
pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = sz;
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GROUP_END:
|
|
pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = sz;
|
|
ip++;
|
|
break;
|
|
|
|
case RE_PUSH_CHARPOS:
|
|
pContext->Push((void *) sz);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_CHARPOS:
|
|
sz = (RECHAR *) pContext->Pop();
|
|
ip++;
|
|
break;
|
|
|
|
case RE_CALL:
|
|
pContext->Push(ip+1);
|
|
ip = GetInstruction(ip).call.nTarget;
|
|
break;
|
|
|
|
case RE_JMP:
|
|
ip = GetInstruction(ip).jmp.nTarget;
|
|
break;
|
|
|
|
case RE_RETURN:
|
|
ip = (size_t) pContext->Pop();
|
|
break;
|
|
|
|
case RE_PUSH_MEMORY:
|
|
pContext->Push((void *) (pContext->m_Mem[GetInstruction(ip).memory.nIndex]));
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_MEMORY:
|
|
pContext->m_Mem[GetInstruction(ip).memory.nIndex] = pContext->Pop();
|
|
ip++;
|
|
break;
|
|
|
|
case RE_STORE_CHARPOS:
|
|
pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) sz;
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GET_CHARPOS:
|
|
sz = (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex];
|
|
ip++;
|
|
break;
|
|
|
|
case RE_STORE_STACKPOS:
|
|
pContext->m_Mem[GetInstruction(ip).memory.nIndex] = (void *) pContext->m_nTos;
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GET_STACKPOS:
|
|
pContext->m_nTos = (size_t) pContext->m_Mem[GetInstruction(ip).memory.nIndex];
|
|
ip++;
|
|
break;
|
|
|
|
case RE_RET_NOMATCH:
|
|
if (sz == (RECHAR *) pContext->m_Mem[GetInstruction(ip).memory.nIndex])
|
|
{
|
|
// do a return
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
else
|
|
ip++;
|
|
break;
|
|
|
|
case RE_ADVANCE:
|
|
sz = CharTraits::Next(szCurrInput);
|
|
szCurrInput = sz;
|
|
if (*sz == '\0')
|
|
goto Error;
|
|
ip = 0;
|
|
pContext->m_nTos = 0;
|
|
break;
|
|
|
|
case RE_FAIL:
|
|
goto Error;
|
|
|
|
case RE_RANGE:
|
|
{
|
|
if (*sz == '\0')
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
break;
|
|
}
|
|
|
|
RECHAR *pBits = reinterpret_cast<RECHAR *>((&m_Instructions[ip]+1));
|
|
size_t u = CharTraits::GetBitFieldForRangeArrayIndex(sz);
|
|
if (pBits[u >> 3] & 1 << (u & 0x7))
|
|
{
|
|
ip += InstructionsPerRangeBitField();
|
|
ip++;
|
|
sz = CharTraits::Next(sz);
|
|
}
|
|
else
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
}
|
|
break;
|
|
|
|
case RE_NOTRANGE:
|
|
{
|
|
if (*sz == '\0')
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
break;
|
|
}
|
|
|
|
RECHAR *pBits = reinterpret_cast<RECHAR *>((&m_Instructions[ip]+1));
|
|
size_t u = static_cast<size_t>(static_cast<_TUCHAR>(* ((RECHAR *) sz)));
|
|
if (pBits[u >> 3] & 1 << (u & 0x7))
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
else
|
|
{
|
|
ip += InstructionsPerRangeBitField();
|
|
ip++;
|
|
sz = CharTraits::Next(sz);
|
|
}
|
|
}
|
|
break;
|
|
|
|
case RE_RANGE_EX:
|
|
{
|
|
if (*sz == '\0')
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
break;
|
|
}
|
|
|
|
BOOL bMatch = FALSE;
|
|
size_t inEnd = GetInstruction(ip).range.nTarget;
|
|
ip++;
|
|
|
|
while (ip < inEnd)
|
|
{
|
|
if (static_cast<size_t>(static_cast<_TUCHAR>(*sz)) >= GetInstruction(ip).memory.nIndex &&
|
|
static_cast<size_t>(static_cast<_TUCHAR>(*sz)) <= GetInstruction(ip+1).memory.nIndex)
|
|
{
|
|
// if we match, we jump to the end
|
|
sz = CharTraits::Next(sz);
|
|
ip = inEnd;
|
|
bMatch = TRUE;
|
|
}
|
|
else
|
|
{
|
|
ip += 2;
|
|
}
|
|
}
|
|
if (!bMatch)
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
}
|
|
break;
|
|
|
|
case RE_NOTRANGE_EX:
|
|
{
|
|
if (*sz == '\0')
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
break;
|
|
}
|
|
|
|
BOOL bMatch = TRUE;
|
|
size_t inEnd = GetInstruction(ip).range.nTarget;
|
|
ip++;
|
|
|
|
while (ip < inEnd)
|
|
{
|
|
if (static_cast<size_t>(static_cast<_TUCHAR>(*sz)) >= GetInstruction(ip).memory.nIndex &&
|
|
static_cast<size_t>(static_cast<_TUCHAR>(*sz)) <= GetInstruction(ip+1).memory.nIndex)
|
|
{
|
|
ip = (size_t) pContext->Pop();
|
|
bMatch = FALSE;
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
// if we match, we jump to the end
|
|
ip += 2;
|
|
}
|
|
}
|
|
if (bMatch)
|
|
sz = CharTraits::Next(sz);
|
|
}
|
|
break;
|
|
|
|
case RE_PREVIOUS:
|
|
{
|
|
BOOL bMatch = FALSE;
|
|
if (m_bCaseSensitive)
|
|
{
|
|
bMatch = !CharTraits::Strncmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart,
|
|
pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart);
|
|
}
|
|
else
|
|
{
|
|
bMatch = !CharTraits::Strnicmp(sz, pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart,
|
|
pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart);
|
|
}
|
|
if (bMatch)
|
|
{
|
|
sz += pContext->m_Matches[GetInstruction(ip).prev.nGroup].szEnd-pContext->m_Matches[GetInstruction(ip).prev.nGroup].szStart;
|
|
ip++;
|
|
break;
|
|
}
|
|
ip = (size_t) pContext->Pop();
|
|
}
|
|
break;
|
|
|
|
case RE_MATCH:
|
|
pContext->m_Match.szEnd = sz;
|
|
if (!m_bCaseSensitive)
|
|
FixupMatchContext(pContext, szIn, szInput);
|
|
if (ppszEnd)
|
|
*ppszEnd = szIn + (sz - szInput);
|
|
if (szInput != szIn)
|
|
free((void *) szInput);
|
|
return TRUE;
|
|
break;
|
|
|
|
case RE_PUSH_GROUP:
|
|
pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart);
|
|
pContext->Push((void *) pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_GROUP:
|
|
pContext->m_Matches[GetInstruction(ip).group.nGroup].szEnd = (const RECHAR *) pContext->Pop();
|
|
pContext->m_Matches[GetInstruction(ip).group.nGroup].szStart = (const RECHAR *) pContext->Pop();
|
|
ip++;
|
|
break;
|
|
|
|
default:
|
|
ATLASSERT(FALSE);
|
|
break;
|
|
}
|
|
}
|
|
|
|
#pragma warning(pop) // 4127
|
|
|
|
ATLASSERT(FALSE);
|
|
Error:
|
|
pContext->m_Match.szEnd = sz;
|
|
if (!m_bCaseSensitive)
|
|
FixupMatchContext(pContext, szIn, szInput);
|
|
if (ppszEnd)
|
|
*ppszEnd = szIn + (sz - szInput);
|
|
if (szInput != szIn)
|
|
free((void *) szInput);
|
|
return FALSE;
|
|
}
|
|
|
|
protected:
|
|
REParseError m_LastError;
|
|
|
|
REParseError GetLastParseError() throw()
|
|
{
|
|
return m_LastError;
|
|
}
|
|
|
|
void SetLastParseError(REParseError Error) throw()
|
|
{
|
|
m_LastError = Error;
|
|
}
|
|
// CAtlRegExp::Reset
|
|
// Removes all instructions to allow reparsing into the same instance
|
|
void Reset() throw()
|
|
{
|
|
m_Instructions.RemoveAll();
|
|
m_uRequiredMem = 0;
|
|
m_bCaseSensitive = TRUE;
|
|
m_uNumGroups = 0;
|
|
SetLastParseError(REPARSE_ERROR_OK);
|
|
}
|
|
|
|
|
|
enum REInstructionType {
|
|
RE_NOP,
|
|
RE_GROUP_START,
|
|
RE_GROUP_END,
|
|
RE_SYMBOL,
|
|
RE_ANY,
|
|
RE_RANGE,
|
|
RE_NOTRANGE,
|
|
RE_RANGE_EX,
|
|
RE_NOTRANGE_EX,
|
|
RE_PLUS,
|
|
RE_NG_PLUS,
|
|
RE_QUESTION,
|
|
RE_NG_QUESTION,
|
|
RE_JMP,
|
|
RE_PUSH_CHARPOS,
|
|
RE_POP_CHARPOS,
|
|
RE_CALL,
|
|
RE_RETURN,
|
|
RE_STAR_BEGIN,
|
|
RE_NG_STAR_BEGIN,
|
|
RE_PUSH_MEMORY,
|
|
RE_POP_MEMORY,
|
|
RE_STORE_CHARPOS,
|
|
RE_STORE_STACKPOS,
|
|
RE_GET_CHARPOS,
|
|
RE_GET_STACKPOS,
|
|
RE_RET_NOMATCH,
|
|
RE_PREVIOUS,
|
|
RE_FAIL,
|
|
RE_ADVANCE,
|
|
RE_MATCH,
|
|
RE_PUSH_GROUP,
|
|
RE_POP_GROUP,
|
|
};
|
|
|
|
struct INSTRUCTION_SYMBOL
|
|
{
|
|
size_t nSymbol;
|
|
};
|
|
|
|
struct INSTRUCTION_JMP
|
|
{
|
|
size_t nTarget;
|
|
};
|
|
|
|
struct INSTRUCTION_GROUP
|
|
{
|
|
size_t nGroup;
|
|
};
|
|
|
|
struct INSTRUCTION_CALL
|
|
{
|
|
size_t nTarget;
|
|
};
|
|
|
|
struct INSTRUCTION_MEMORY
|
|
{
|
|
size_t nIndex;
|
|
};
|
|
|
|
struct INSTRUCTION_PREVIOUS
|
|
{
|
|
size_t nGroup;
|
|
};
|
|
|
|
struct INSTRUCTION_RANGE_EX
|
|
{
|
|
size_t nTarget;
|
|
};
|
|
|
|
struct INSTRUCTION
|
|
{
|
|
REInstructionType type;
|
|
union
|
|
{
|
|
INSTRUCTION_SYMBOL symbol;
|
|
INSTRUCTION_JMP jmp;
|
|
INSTRUCTION_GROUP group;
|
|
INSTRUCTION_CALL call;
|
|
INSTRUCTION_MEMORY memory;
|
|
INSTRUCTION_PREVIOUS prev;
|
|
INSTRUCTION_RANGE_EX range;
|
|
};
|
|
};
|
|
|
|
inline int InstructionsPerRangeBitField() throw()
|
|
{
|
|
return (256/8) / sizeof(INSTRUCTION) + (((256/8) % sizeof(INSTRUCTION)) ? 1 : 0);
|
|
}
|
|
|
|
CAtlArray<INSTRUCTION> m_Instructions;
|
|
|
|
UINT m_uNumGroups;
|
|
UINT m_uRequiredMem;
|
|
BOOL m_bCaseSensitive;
|
|
|
|
|
|
// class used internally to restore
|
|
// parsing state when unwinding
|
|
class CParseState
|
|
{
|
|
public:
|
|
int m_nNumInstructions;
|
|
UINT m_uNumGroups;
|
|
UINT m_uRequiredMem;
|
|
|
|
CParseState(CAtlRegExp *pRegExp) throw()
|
|
{
|
|
m_nNumInstructions = (int) pRegExp->m_Instructions.GetCount();
|
|
m_uNumGroups = pRegExp->m_uNumGroups;
|
|
m_uRequiredMem = pRegExp->m_uRequiredMem;
|
|
}
|
|
|
|
void Restore(CAtlRegExp *pRegExp)
|
|
{
|
|
pRegExp->m_Instructions.SetCount(m_nNumInstructions);
|
|
pRegExp->m_uNumGroups = m_uNumGroups;
|
|
pRegExp->m_uRequiredMem = m_uRequiredMem;
|
|
}
|
|
};
|
|
|
|
int AddInstruction(REInstructionType type)
|
|
{
|
|
if (!m_Instructions.SetCount(m_Instructions.GetCount()+1))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_OUTOFMEMORY);
|
|
return -1;
|
|
}
|
|
|
|
m_Instructions[m_Instructions.GetCount()-1].type = type;
|
|
return (int) m_Instructions.GetCount()-1;
|
|
}
|
|
|
|
BOOL PeekToken(const RECHAR **ppszRE, int ch) throw()
|
|
{
|
|
if (**ppszRE != ch)
|
|
return FALSE;
|
|
return TRUE;
|
|
}
|
|
|
|
BOOL MatchToken(const RECHAR **ppszRE, int ch) throw()
|
|
{
|
|
if (!PeekToken(ppszRE, ch))
|
|
return FALSE;
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
return TRUE;
|
|
}
|
|
|
|
INSTRUCTION &GetInstruction(size_t nIndex) throw()
|
|
{
|
|
return m_Instructions[nIndex];
|
|
}
|
|
|
|
// ParseArg: parse grammar rule Arg
|
|
int ParseArg(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
int nPushGroup = AddInstruction(RE_PUSH_GROUP);
|
|
if (nPushGroup < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nPushGroup).group.nGroup = m_uNumGroups;
|
|
|
|
int p = AddInstruction(RE_GROUP_START);
|
|
if (p < 0)
|
|
return -1;
|
|
GetInstruction(p).group.nGroup = m_uNumGroups++;
|
|
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
int nPopGroup = AddInstruction(RE_POP_GROUP);
|
|
if (nPopGroup < 0)
|
|
return -1;
|
|
GetInstruction(nPopGroup).group.nGroup = GetInstruction(nPushGroup).group.nGroup;
|
|
|
|
if (AddInstruction(RE_RETURN) < 0)
|
|
return -1;
|
|
|
|
int nAlt = ParseRE(ppszRE, bEmpty);
|
|
if (nAlt < 0)
|
|
{
|
|
if (GetLastParseError())
|
|
return -1;
|
|
|
|
if (!PeekToken(ppszRE, '}'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
// in the case of an empty group, we add a nop
|
|
nAlt = AddInstruction(RE_NOP);
|
|
if (nAlt < 0)
|
|
return -1;
|
|
}
|
|
|
|
GetInstruction(nCall).call.nTarget = nAlt;
|
|
|
|
if (!MatchToken(ppszRE, '}'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
int nEnd = AddInstruction(RE_GROUP_END);
|
|
if (nEnd < 0)
|
|
return -1;
|
|
GetInstruction(nEnd).group.nGroup = GetInstruction(p).group.nGroup;
|
|
return nPushGroup;
|
|
}
|
|
|
|
// ParseGroup: parse grammar rule Group
|
|
int ParseGroup(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
if (AddInstruction(RE_RETURN) < 0)
|
|
return -1;
|
|
|
|
int nAlt = ParseRE(ppszRE, bEmpty);
|
|
if (nAlt < 0)
|
|
{
|
|
if (GetLastParseError())
|
|
return -1;
|
|
|
|
if (!PeekToken(ppszRE, ')'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
// in the case of an empty group, we add a nop
|
|
nAlt = AddInstruction(RE_NOP);
|
|
if (nAlt < 0)
|
|
return -1;
|
|
}
|
|
|
|
GetInstruction(nCall).call.nTarget = nAlt;
|
|
|
|
if (!MatchToken(ppszRE, ')'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
return nCall;
|
|
}
|
|
|
|
RECHAR GetEscapedChar(RECHAR ch) throw()
|
|
{
|
|
if (ch == 't')
|
|
return '\t';
|
|
return ch;
|
|
}
|
|
|
|
// ParseCharItem: parse grammar rule CharItem
|
|
int ParseCharItem(const RECHAR **ppszRE, RECHAR *pchStartChar, RECHAR *pchEndChar) throw()
|
|
{
|
|
if (**ppszRE == '\\')
|
|
{
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
*pchStartChar = GetEscapedChar(**ppszRE);
|
|
}
|
|
else
|
|
*pchStartChar = **ppszRE;
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
|
|
if (!MatchToken(ppszRE, '-'))
|
|
{
|
|
*pchEndChar = *pchStartChar;
|
|
return 0;
|
|
}
|
|
|
|
// check for unterminated range
|
|
if (!**ppszRE || PeekToken(ppszRE, ']'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
*pchEndChar = **ppszRE;
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
|
|
if (*pchEndChar < *pchStartChar)
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_INVALID_RANGE);
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int AddInstructions(int nNumInstructions)
|
|
{
|
|
size_t nCurr = m_Instructions.GetCount();
|
|
if (!m_Instructions.SetCount(nCurr+nNumInstructions))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_OUTOFMEMORY);
|
|
return -1;
|
|
}
|
|
return (int) nCurr;
|
|
}
|
|
|
|
// ParseCharSet: parse grammar rule CharSet
|
|
int ParseCharSet(const RECHAR **ppszRE, BOOL bNot)
|
|
{
|
|
int p = -1;
|
|
|
|
unsigned char *pBits = NULL;
|
|
|
|
if (CharTraits::UseBitFieldForRange())
|
|
{
|
|
// we use a bit field to represent the characters
|
|
// a 1 bit means match against the character
|
|
// the last 5 bits are used as an index into
|
|
// the byte array, and the first 3 bits
|
|
// are used to index into the selected byte
|
|
|
|
p = AddInstruction(bNot ? RE_NOTRANGE : RE_RANGE);
|
|
if (p < 0)
|
|
return -1;
|
|
|
|
// add the required space to hold the character
|
|
// set. We use one bit per character for ansi
|
|
if (AddInstructions(InstructionsPerRangeBitField()) < 0)
|
|
return -1;
|
|
|
|
pBits = (unsigned char *) (&m_Instructions[p+1]);
|
|
memset(pBits, 0x00, 256/8);
|
|
}
|
|
else
|
|
{
|
|
p = AddInstruction(bNot ? RE_NOTRANGE_EX : RE_RANGE_EX);
|
|
if (p < 0)
|
|
return -1;
|
|
}
|
|
|
|
RECHAR chStart;
|
|
RECHAR chEnd;
|
|
|
|
while (**ppszRE && **ppszRE != ']')
|
|
{
|
|
if (ParseCharItem(ppszRE, &chStart, &chEnd))
|
|
return -1;
|
|
|
|
if (CharTraits::UseBitFieldForRange())
|
|
{
|
|
for (int i=chStart; i<=chEnd; i++)
|
|
pBits[i >> 3] |= 1 << (i & 0x7);
|
|
}
|
|
else
|
|
{
|
|
int nStart = AddInstruction(RE_NOP);
|
|
if (nStart < 0)
|
|
return -1;
|
|
|
|
int nEnd = AddInstruction(RE_NOP);
|
|
if (nEnd < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nStart).memory.nIndex = (int) chStart;
|
|
GetInstruction(nEnd).memory.nIndex = (int) chEnd;
|
|
}
|
|
}
|
|
|
|
if (!CharTraits::UseBitFieldForRange())
|
|
GetInstruction(p).range.nTarget = m_Instructions.GetCount();
|
|
|
|
return p;
|
|
}
|
|
|
|
// ParseCharClass: parse grammar rule CharClass
|
|
int ParseCharClass(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
bEmpty = false;
|
|
if (MatchToken(ppszRE, ']'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_EMPTY_RANGE);
|
|
return -1;
|
|
}
|
|
|
|
BOOL bNot = FALSE;
|
|
if (MatchToken(ppszRE, '^'))
|
|
bNot = TRUE;
|
|
|
|
if (MatchToken(ppszRE, ']'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_EMPTY_RANGE);
|
|
return -1;
|
|
}
|
|
|
|
int p = ParseCharSet(ppszRE, bNot);
|
|
if (p < 0)
|
|
return p;
|
|
if (!MatchToken(ppszRE, ']'))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED);
|
|
return -1;
|
|
}
|
|
|
|
return p;
|
|
}
|
|
|
|
int AddMemInstruction(REInstructionType type)
|
|
{
|
|
int p = AddInstruction(type);
|
|
if (p < 0)
|
|
return p;
|
|
GetInstruction(p).memory.nIndex = m_uRequiredMem++;
|
|
return p;
|
|
}
|
|
|
|
// helper for parsing !SE
|
|
int ParseNot(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
int nStoreCP = AddMemInstruction(RE_STORE_CHARPOS);
|
|
int nStoreSP = AddMemInstruction(RE_STORE_STACKPOS);
|
|
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
int nGetCP = AddInstruction(RE_GET_CHARPOS);
|
|
if (nGetCP < 0)
|
|
return -1;
|
|
GetInstruction(nGetCP).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex;
|
|
|
|
int nGetSP = AddInstruction(RE_GET_STACKPOS);
|
|
if (nGetSP < 0)
|
|
return -1;
|
|
GetInstruction(nGetSP).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex;
|
|
|
|
int nJmp = AddInstruction(RE_JMP);
|
|
if (nJmp < 0)
|
|
return -1;
|
|
|
|
int nSE = ParseSE(ppszRE, bEmpty);
|
|
if (nSE < 0)
|
|
return nSE;
|
|
|
|
// patch the call
|
|
GetInstruction(nCall).call.nTarget = nSE;
|
|
|
|
int nGetCP1 = AddInstruction(RE_GET_CHARPOS);
|
|
if (nGetCP1 < 0)
|
|
return -1;
|
|
GetInstruction(nGetCP1).memory.nIndex = GetInstruction(nStoreCP).memory.nIndex;
|
|
|
|
int nGetSP1 = AddInstruction(RE_GET_STACKPOS);
|
|
if (nGetSP1 < 0)
|
|
return -1;
|
|
GetInstruction(nGetSP1).memory.nIndex = GetInstruction(nStoreSP).memory.nIndex;
|
|
|
|
int nRet = AddInstruction(RE_RETURN);
|
|
if (nRet < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nJmp).jmp.nTarget = nRet+1;
|
|
|
|
return nStoreCP;
|
|
}
|
|
|
|
// ParseAbbrev: parse grammar rule Abbrev
|
|
int ParseAbbrev(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
const RECHAR **szAbbrevs = CharTraits::GetAbbrevs();
|
|
|
|
while (*szAbbrevs)
|
|
{
|
|
if (**ppszRE == **szAbbrevs)
|
|
{
|
|
const RECHAR *szAbbrev = (*szAbbrevs)+1;
|
|
int p = ParseE(&szAbbrev, bEmpty);
|
|
if (p < 0)
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_UNEXPECTED);
|
|
return p;
|
|
}
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
return p;
|
|
}
|
|
szAbbrevs++;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// ParseSE: parse grammar rule SE (simple expression)
|
|
int ParseSE(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
|
|
if (MatchToken(ppszRE, '{'))
|
|
return ParseArg(ppszRE, bEmpty);
|
|
if (MatchToken(ppszRE, '('))
|
|
return ParseGroup(ppszRE, bEmpty);
|
|
if (MatchToken(ppszRE, '['))
|
|
return ParseCharClass(ppszRE, bEmpty);
|
|
|
|
if (MatchToken(ppszRE, '\\'))
|
|
{
|
|
if (!CharTraits::Isdigit(**ppszRE))
|
|
{
|
|
// check for abbreviations
|
|
int p;
|
|
p = ParseAbbrev(ppszRE, bEmpty);
|
|
if (p >= 0)
|
|
return p;
|
|
|
|
if (GetLastParseError())
|
|
return -1;
|
|
|
|
// escaped char
|
|
p = AddInstruction(RE_SYMBOL);
|
|
if (p < 0)
|
|
return -1;
|
|
GetInstruction(p).symbol.nSymbol = (int) **ppszRE;
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
return p;
|
|
}
|
|
// previous match
|
|
bEmpty = false;
|
|
int nPrev = AddInstruction(RE_PREVIOUS);
|
|
if (nPrev < 0)
|
|
return -1;
|
|
|
|
UINT uValue = (UINT) CharTraits::Strtol(*ppszRE, (RECHAR **) ppszRE, 10);
|
|
if (uValue >= m_uNumGroups)
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_INVALID_GROUP);
|
|
return -1;
|
|
}
|
|
GetInstruction(nPrev).prev.nGroup = (size_t) uValue;
|
|
return nPrev;
|
|
}
|
|
|
|
if (MatchToken(ppszRE, '!'))
|
|
return ParseNot(ppszRE, bEmpty);
|
|
|
|
if (**ppszRE == '}' || **ppszRE == ']' || **ppszRE == ')')
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
if (**ppszRE == '\0')
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
int p;
|
|
if (**ppszRE == '.')
|
|
{
|
|
p = AddInstruction(RE_ANY);
|
|
if (p < 0)
|
|
return -1;
|
|
bEmpty = false;
|
|
}
|
|
else if (**ppszRE == '$' && (*ppszRE)[1] == '\0')
|
|
{
|
|
p = AddInstruction(RE_SYMBOL);
|
|
if (p < 0)
|
|
return -1;
|
|
GetInstruction(p).symbol.nSymbol = 0;
|
|
bEmpty = false;
|
|
}
|
|
else
|
|
{
|
|
p = AddInstruction(RE_SYMBOL);
|
|
if (p < 0)
|
|
return -1;
|
|
GetInstruction(p).symbol.nSymbol = (int) **ppszRE;
|
|
bEmpty = false;
|
|
}
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
return p;
|
|
}
|
|
|
|
// ParseE: parse grammar rule E (expression)
|
|
int ParseE(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
CParseState ParseState(this);
|
|
const RECHAR *sz = *ppszRE;
|
|
|
|
int nSE;
|
|
|
|
int nFirst = ParseSE(ppszRE, bEmpty);
|
|
if (nFirst < 0)
|
|
return nFirst;
|
|
|
|
REInstructionType type = RE_MATCH;
|
|
|
|
if (MatchToken(ppszRE, '*'))
|
|
if(MatchToken(ppszRE, '?'))
|
|
type = RE_NG_STAR_BEGIN;
|
|
else
|
|
type = RE_STAR_BEGIN;
|
|
|
|
|
|
else if (MatchToken(ppszRE, '+'))
|
|
if(MatchToken(ppszRE, '?'))
|
|
type = RE_NG_PLUS;
|
|
else
|
|
type = RE_PLUS;
|
|
|
|
else if (MatchToken(ppszRE, '?'))
|
|
if(MatchToken(ppszRE, '?'))
|
|
type = RE_NG_QUESTION;
|
|
else
|
|
type = RE_QUESTION;
|
|
|
|
|
|
if (type == RE_MATCH)
|
|
return nFirst;
|
|
|
|
if (type == RE_STAR_BEGIN || type == RE_QUESTION|| type == RE_NG_STAR_BEGIN || type == RE_NG_QUESTION)
|
|
{
|
|
ParseState.Restore(this);
|
|
}
|
|
else
|
|
{
|
|
m_uNumGroups = ParseState.m_uNumGroups;
|
|
}
|
|
*ppszRE = sz;
|
|
|
|
int nE;
|
|
|
|
if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS || type == RE_NG_QUESTION) // Non-Greedy
|
|
{
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
bEmpty = false;
|
|
|
|
nSE = ParseSE(ppszRE, bEmpty);
|
|
if (nSE < 0)
|
|
return nSE;
|
|
|
|
if (bEmpty && (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP);
|
|
return -1;
|
|
}
|
|
bEmpty = true;
|
|
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
|
|
if (type == RE_NG_STAR_BEGIN || type == RE_NG_PLUS)
|
|
{
|
|
int nJmp = AddInstruction(RE_JMP);
|
|
if (nJmp < 0)
|
|
return -1;
|
|
GetInstruction(nCall).call.nTarget = nJmp+1;
|
|
GetInstruction(nJmp).jmp.nTarget = nCall;
|
|
}
|
|
else
|
|
GetInstruction(nCall).call.nTarget = nSE+1;
|
|
|
|
if (type == RE_NG_PLUS)
|
|
nE = nFirst;
|
|
else
|
|
nE = nCall;
|
|
}
|
|
else // Greedy
|
|
{
|
|
|
|
int nPushMem = AddInstruction(RE_PUSH_MEMORY);
|
|
if (nPushMem < 0)
|
|
return -1;
|
|
|
|
int nStore = AddInstruction(RE_STORE_CHARPOS);
|
|
if (nStore < 0)
|
|
return -1;
|
|
|
|
if (AddInstruction(RE_PUSH_CHARPOS) < 0)
|
|
return -1;
|
|
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
if (AddInstruction(RE_POP_CHARPOS) < 0)
|
|
return -1;
|
|
|
|
int nPopMem = AddInstruction(RE_POP_MEMORY);
|
|
if (nPopMem < 0)
|
|
return -1;
|
|
|
|
int nJmp = AddInstruction(RE_JMP);
|
|
if (nJmp < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nPushMem).memory.nIndex = m_uRequiredMem++;
|
|
GetInstruction(nStore).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
|
|
GetInstruction(nCall).call.nTarget = nJmp+1;
|
|
GetInstruction(nPopMem).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
|
|
|
|
bEmpty = false;
|
|
|
|
nSE = ParseSE(ppszRE, bEmpty);
|
|
if (nSE < 0)
|
|
return nSE;
|
|
|
|
if (bEmpty && (type == RE_STAR_BEGIN || type == RE_PLUS))
|
|
{
|
|
SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP);
|
|
return -1;
|
|
}
|
|
|
|
if (type != RE_PLUS && type != RE_NG_PLUS)
|
|
bEmpty = true;
|
|
|
|
*ppszRE = CharTraits::Next(*ppszRE);
|
|
|
|
|
|
int nRetNoMatch = AddInstruction(RE_RET_NOMATCH);
|
|
if (nRetNoMatch < 0)
|
|
return -1;
|
|
|
|
int nStore1 = AddInstruction(RE_STORE_CHARPOS);
|
|
if (nStore1 < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nRetNoMatch).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
|
|
GetInstruction(nStore1).memory.nIndex = GetInstruction(nPushMem).memory.nIndex;
|
|
|
|
if (type != RE_QUESTION)
|
|
{
|
|
int nJmp1 = AddInstruction(RE_JMP);
|
|
if (nJmp1 < 0)
|
|
return -1;
|
|
GetInstruction(nJmp1).jmp.nTarget = nPushMem;
|
|
}
|
|
|
|
GetInstruction(nJmp).jmp.nTarget = m_Instructions.GetCount();
|
|
if (type == RE_PLUS)
|
|
nE = nFirst;
|
|
else
|
|
nE = nPushMem;
|
|
}
|
|
|
|
return nE;
|
|
}
|
|
|
|
|
|
// ParseAltE: parse grammar rule AltE
|
|
int ParseAltE(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
const RECHAR *sz = *ppszRE;
|
|
CParseState ParseState(this);
|
|
|
|
int nPush = AddInstruction(RE_PUSH_CHARPOS);
|
|
if (nPush < 0)
|
|
return -1;
|
|
|
|
int nCall = AddInstruction(RE_CALL);
|
|
if (nCall < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nCall).call.nTarget = nPush+4;
|
|
if (AddInstruction(RE_POP_CHARPOS) < 0)
|
|
return -1;
|
|
|
|
int nJmpNext = AddInstruction(RE_JMP);
|
|
if (nJmpNext < 0)
|
|
return -1;
|
|
|
|
int nE = ParseE(ppszRE, bEmpty);
|
|
if (nE < 0)
|
|
{
|
|
if (GetLastParseError())
|
|
return -1;
|
|
ParseState.Restore(this);
|
|
return nE;
|
|
}
|
|
|
|
int nJmpEnd = AddInstruction(RE_JMP);
|
|
if (nJmpEnd < 0)
|
|
return -1;
|
|
|
|
GetInstruction(nJmpNext).jmp.nTarget = nJmpEnd+1;
|
|
|
|
if (!MatchToken(ppszRE, '|'))
|
|
{
|
|
ParseState.Restore(this);
|
|
*ppszRE = sz;
|
|
|
|
return ParseE(ppszRE, bEmpty);
|
|
}
|
|
|
|
bool bEmptyAltE;
|
|
int nAltE = ParseAltE(ppszRE, bEmptyAltE);
|
|
GetInstruction(nJmpEnd).jmp.nTarget = m_Instructions.GetCount();
|
|
GetInstruction(nJmpNext).jmp.nTarget = nAltE;
|
|
if (nAltE < 0)
|
|
{
|
|
if (GetLastParseError())
|
|
return -1;
|
|
ParseState.Restore(this);
|
|
return nAltE;
|
|
}
|
|
bEmpty = bEmpty | bEmptyAltE;
|
|
return nPush;
|
|
}
|
|
|
|
// ParseRE: parse grammar rule RE (regular expression)
|
|
int ParseRE(const RECHAR **ppszRE, bool &bEmpty)
|
|
{
|
|
if (**ppszRE == '\0')
|
|
return -1;
|
|
|
|
int p = ParseAltE(ppszRE, bEmpty);
|
|
if (p < 0)
|
|
return p;
|
|
|
|
bool bEmptyRE = true;
|
|
ParseRE(ppszRE, bEmptyRE);
|
|
if (GetLastParseError())
|
|
return -1;
|
|
bEmpty = bEmpty && bEmptyRE;
|
|
return p;
|
|
}
|
|
|
|
//pointers to the matched string and matched groups, currently point into an internal allocated
|
|
//buffer that hold a copy of the input string.
|
|
//This function fix these pointers to point into the original, user supplied buffer (first param to Match method).
|
|
//Example: If a ptr (szStart) currently point to <internal buffer>+3, it is fixed to <user supplied buffer>+3
|
|
void FixupMatchContext(CAtlREMatchContext<CharTraits> *pContext, const RECHAR *szOrig, const RECHAR *szNew)
|
|
{
|
|
ATLENSURE(pContext);
|
|
ATLASSERT(szOrig);
|
|
ATLASSERT(szNew);
|
|
|
|
pContext->m_Match.szStart = szOrig + (pContext->m_Match.szStart - szNew);
|
|
pContext->m_Match.szEnd = szOrig + (pContext->m_Match.szEnd - szNew);
|
|
for (UINT i=0; i<pContext->m_uNumGroups; i++)
|
|
{
|
|
if (pContext->m_Matches[i].szStart==NULL || pContext->m_Matches[i].szEnd==NULL)
|
|
{
|
|
continue; //Do not fix unmatched groups.
|
|
}
|
|
pContext->m_Matches[i].szStart = szOrig + (pContext->m_Matches[i].szStart - szNew);
|
|
pContext->m_Matches[i].szEnd = szOrig + (pContext->m_Matches[i].szEnd - szNew);
|
|
}
|
|
}
|
|
// implementation
|
|
// helpers for dumping and debugging the rx engine
|
|
public:
|
|
#ifdef ATL_REGEXP_DUMP
|
|
size_t DumpInstruction(size_t ip)
|
|
{
|
|
printf("%08x ", ip);
|
|
switch (GetInstruction(ip).type)
|
|
{
|
|
case RE_NOP:
|
|
printf("NOP\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_SYMBOL:
|
|
AtlprintfT<RECHAR>(CAToREChar<RECHAR>("Symbol %c\n"),GetInstruction(ip).symbol.nSymbol);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_ANY:
|
|
printf("Any\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_RANGE:
|
|
printf("Range\n");
|
|
ip++;
|
|
ip += InstructionsPerRangeBitField();
|
|
break;
|
|
|
|
case RE_NOTRANGE:
|
|
printf("NOT Range\n");
|
|
ip++;
|
|
ip += InstructionsPerRangeBitField();
|
|
break;
|
|
|
|
case RE_RANGE_EX:
|
|
printf("RangeEx %08x\n", GetInstruction(ip).range.nTarget);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_NOTRANGE_EX:
|
|
printf("NotRangeEx %08x\n", GetInstruction(ip).range.nTarget);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GROUP_START:
|
|
printf("Start group %d\n", GetInstruction(ip).group.nGroup);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GROUP_END:
|
|
printf("Group end %d\n", GetInstruction(ip).group.nGroup);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_PUSH_CHARPOS:
|
|
printf("Push char pos\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_CHARPOS:
|
|
printf("Pop char pos\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_STORE_CHARPOS:
|
|
printf("Store char pos %d\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GET_CHARPOS:
|
|
printf("Get char pos %d\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_STORE_STACKPOS:
|
|
printf("Store stack pos %d\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_GET_STACKPOS:
|
|
printf("Get stack pos %d\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_CALL:
|
|
printf("Call %08x\n", GetInstruction(ip).call.nTarget);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_JMP:
|
|
printf("Jump %08x\n", GetInstruction(ip).jmp.nTarget);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_RETURN:
|
|
printf("return\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_PUSH_MEMORY:
|
|
printf("Push memory %08x\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_MEMORY:
|
|
printf("Pop memory %08x\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_RET_NOMATCH:
|
|
printf("Return no match %08x\n", GetInstruction(ip).memory.nIndex);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_MATCH:
|
|
printf("END\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_ADVANCE:
|
|
printf("ADVANCE\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_FAIL:
|
|
printf("FAIL\n");
|
|
ip++;
|
|
break;
|
|
|
|
case RE_PREVIOUS:
|
|
printf("Prev %d\n", GetInstruction(ip).prev.nGroup);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_PUSH_GROUP:
|
|
printf("Push group %d\n", GetInstruction(ip).group.nGroup);
|
|
ip++;
|
|
break;
|
|
|
|
case RE_POP_GROUP:
|
|
printf("Pop group %d\n", GetInstruction(ip).group.nGroup);
|
|
ip++;
|
|
break;
|
|
|
|
|
|
default:
|
|
printf("????\n");
|
|
ip++;
|
|
break;
|
|
}
|
|
return ip;
|
|
}
|
|
|
|
void Dump(size_t ipCurrent = 0)
|
|
{
|
|
size_t ip = 0;
|
|
|
|
while (ip < m_Instructions.GetCount())
|
|
{
|
|
if (ip == ipCurrent)
|
|
printf("->");
|
|
ip = DumpInstruction(ip);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef ATLRX_DEBUG
|
|
void cls( HANDLE hConsole )
|
|
{
|
|
COORD coordScreen = { 0, 0 }; /* here's where we'll home the
|
|
cursor */
|
|
BOOL bSuccess;
|
|
DWORD cCharsWritten;
|
|
CONSOLE_SCREEN_BUFFER_INFO csbi; /* to get buffer info */
|
|
DWORD dwConSize; /* number of character cells in
|
|
the current buffer */
|
|
|
|
/* get the number of character cells in the current buffer */
|
|
|
|
bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi );
|
|
dwConSize = csbi.dwSize.X * csbi.dwSize.Y;
|
|
|
|
/* fill the entire screen with blanks */
|
|
|
|
bSuccess = FillConsoleOutputCharacter( hConsole, (TCHAR) ' ',
|
|
dwConSize, coordScreen, &cCharsWritten );
|
|
|
|
/* get the current text attribute */
|
|
|
|
bSuccess = GetConsoleScreenBufferInfo( hConsole, &csbi );
|
|
|
|
/* now set the buffer's attributes accordingly */
|
|
|
|
bSuccess = FillConsoleOutputAttribute( hConsole, csbi.wAttributes,
|
|
dwConSize, coordScreen, &cCharsWritten );
|
|
|
|
/* put the cursor at (0, 0) */
|
|
|
|
bSuccess = SetConsoleCursorPosition( hConsole, coordScreen );
|
|
return;
|
|
}
|
|
|
|
void DumpStack(CAtlREMatchContext<CharTraits> *pContext)
|
|
{
|
|
for (size_t i=pContext->m_nTos; i>0; i--)
|
|
{
|
|
if (pContext->m_stack[i] < (void *) m_Instructions.GetCount())
|
|
printf("0x%p\n", pContext->m_stack[i]);
|
|
else
|
|
{
|
|
// assume a pointer into the input
|
|
AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%s\n"), pContext->m_stack[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void DumpMemory(CAtlREMatchContext<CharTraits> *pContext)
|
|
{
|
|
for (UINT i=0; i<m_uRequiredMem; i++)
|
|
{
|
|
AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%d: %s\n"), i, pContext->m_Mem.m_p[i]);
|
|
}
|
|
}
|
|
|
|
virtual void OnDebugEvent(size_t ip, const RECHAR *szIn, const RECHAR *sz, CAtlREMatchContext<CharTraits> *pContext)
|
|
{
|
|
cls(GetStdHandle(STD_OUTPUT_HANDLE));
|
|
printf("----------Code---------\n");
|
|
Dump(ip);
|
|
printf("----------Input---------\n");
|
|
AtlprintfT<RECHAR>(CAToREChar<RECHAR>("%s\n"), szIn);
|
|
for (int s=0; szIn+s < sz; s++)
|
|
{
|
|
printf(" ");
|
|
}
|
|
printf("^\n");
|
|
printf("----------Memory---------\n");
|
|
DumpMemory(pContext);
|
|
printf("----------Stack---------\n");
|
|
DumpStack(pContext);
|
|
getchar();
|
|
}
|
|
#endif
|
|
|
|
};
|
|
|
|
} // namespace ATL
|
|
#pragma pack(pop)
|
|
|
|
#endif // __ATLRX_H__
|