00001 /*************************************************************************** 00002 *cr 00003 *cr (C) Copyright 1995-2019 The Board of Trustees of the 00004 *cr University of Illinois 00005 *cr All Rights Reserved 00006 *cr 00007 ***************************************************************************/ 00008 00009 /*************************************************************************** 00010 * RCS INFORMATION: 00011 * 00012 * $RCSfile: JRegex.C,v $ 00013 * $Author: johns $ $Locker: $ $State: Exp $ 00014 * $Revision: 1.15 $ $Date: 2024年03月01日 02:01:37 $ 00015 * 00016 *************************************************************************** 00017 * DESCRIPTION: 00018 * Interface for performing regular expression pattern matching, 00019 * encapsulating the PCRE regular expression package. 00020 ***************************************************************************/ 00021 00022 // 00023 // Online regex info and testing: 00024 // https://regex101.com/ 00025 // 00026 // PCRE regex library: 00027 // https://www.pcre.org/ 00028 // 00029 00030 #include "JRegex.h" 00031 #include "Inform.h" 00032 00033 #if defined(VMDUSEPCRE2) 00034 #define PCRE2_CODE_UNIT_WIDTH 8 00035 #include "pcre2.h" 00036 #else 00037 #include "pcre.h" 00038 #endif 00039 00040 JRegex::JRegex(const char *pattern, int) { 00041 if (pattern == NULL) { 00042 msgErr << "NULL pattern passed to JRegex!" << sendmsg; 00043 } 00044 else { 00045 #if defined(VMDUSEPCRE2) 00046 // 00047 // PCRE2 API 00048 // https://www.pcre.org/current/doc/html/pcre2api.html 00049 // https://www.pcre.org/current/doc/html/pcre2demo.html 00050 // 00051 int errornumber=0; 00052 size_t erroroffset=0; 00053 md = NULL; 00054 JIT=0; 00055 rpat = pcre2_compile((PCRE2_SPTR) pattern, // the regex pattern string 00056 PCRE2_ZERO_TERMINATED, // C style string 00057 0, // default options 00058 &errornumber, // error number 00059 &erroroffset, // offset to error location 00060 NULL); // use default compile context 00061 00062 if (rpat == NULL) { 00063 PCRE2_UCHAR errbuf[256]; 00064 pcre2_get_error_message(errornumber, errbuf, sizeof(errbuf)); 00065 msgWarn << "JRegex: Error in pcre2_compile(), " << sendmsg; 00066 msgWarn << "Error in regex pattern begins with " << pattern+erroroffset 00067 << sendmsg; 00068 } else { 00069 #if 1 00070 // 00071 // enable JIT compilation of the regex, for large selection traversals 00072 // https://www.pcre.org/current/doc/html/pcre2jit.html 00073 // 00074 int rc=0; 00075 rc=pcre2_jit_compile((pcre2_code *) rpat, PCRE2_JIT_COMPLETE); 00076 if (rc && rc != PCRE2_ERROR_JIT_BADOPTION) 00077 msgWarn << "JRegex: pcre2_jit_compile() returned an error." << sendmsg; 00078 00079 // check that JIT succeeded so we can use the fast-path if possible 00080 if (!rc) { 00081 size_t len=0; 00082 pcre2_pattern_info((pcre2_code *) rpat, PCRE2_INFO_JITSIZE, &len); 00083 // msgInfo << "PCRE2 JIT size: " << len << sendmsg; 00084 if (len != 0) 00085 JIT=1; 00086 } 00087 #endif 00088 00089 // md = pcre2_match_data_create(1, NULL); 00090 md = pcre2_match_data_create_from_pattern((pcre2_code *) rpat, NULL); 00091 } 00092 #else 00093 const char *errptr; 00094 int erroffset; 00095 rpat = vmdpcre_compile(pattern, // the regex pattern string 00096 0, // options 00097 &errptr, // points to error message, if any 00098 &erroffset, // offset to error location 00099 NULL); // Table pointer; NULL for use default 00100 if (rpat == NULL) { 00101 msgWarn << "JRegex: Error in pcre_compile(), " << errptr << sendmsg; 00102 msgWarn << "Error in regex pattern begins with " << pattern+erroffset 00103 << sendmsg; 00104 } 00105 #endif 00106 } 00107 } 00108 00109 JRegex::~JRegex() { 00110 #if defined(VMDUSEPCRE2) 00111 if (md) 00112 pcre2_match_data_free((pcre2_match_data *) md); 00113 if (rpat) 00114 pcre2_code_free((pcre2_code *) rpat); 00115 #else 00116 vmdpcre_free(rpat); 00117 #endif 00118 } 00119 00120 int JRegex::match(const char *str, int len) const { 00121 #if defined(VMDUSEPCRE2) 00122 if (rpat==NULL || md == NULL) { 00123 // msgWarn << "JRegex::match: bad regex pattern, no match" << sendmsg; 00124 return -1; 00125 } 00126 00127 int rc=0; 00128 00129 // if the regular expression has successfully JITted, we call the 00130 // JIT fast path to avoid overheads 00131 if (JIT) { 00132 rc=pcre2_jit_match((pcre2_code *) rpat, // compiled regex pattern 00133 (PCRE2_SPTR) str, // subject of the search 00134 len, // strlen of str 00135 0, // match starting offset 00136 0, // options 00137 (pcre2_match_data *) md, // match data block 00138 NULL); // match ctx, NULL for defaults 00139 return rc; 00140 } 00141 00142 rc=pcre2_match((pcre2_code *) rpat, // compiled regex pattern 00143 (PCRE2_SPTR) str, // subject of the search 00144 len, // strlen of str 00145 0, // match starting offset 00146 0, // options 00147 (pcre2_match_data *) md, // match data block 00148 NULL); // match ctx, NULL for defaults 00149 return rc; 00150 #else 00151 if (rpat==NULL) { 00152 // msgWarn << "JRegex::match: bad regex pattern, no match" << sendmsg; 00153 return -1; 00154 } 00155 int retval; 00156 retval=vmdpcre_exec(rpat, // compiled regex pattern 00157 NULL, // No extra study wisdom 00158 str, // subject of the search 00159 len, // strlen of str 00160 0, // offset at which to start finding substrings 00161 0, // options 00162 NULL, // return vector for location of substrings 00163 0); // size of return vector 00164 return retval; 00165 #endif 00166 } 00167 00168 int JRegex::search(const char *str, int len, int &length, int start) { 00169 #if defined(VMDUSEPCRE2) 00170 return -1; // not implemented 00171 #else 00172 if (rpat==NULL) { 00173 // msgWarn << "JRegex::search: bad regex pattern, no match" << sendmsg; 00174 return -1; 00175 } 00176 int ovec[6], retval; 00177 retval=vmdpcre_exec(rpat, // my regex pattern 00178 NULL, // No extra study wisdom 00179 str, // subject of the search 00180 len, // strlen of str 00181 start,// offset at which to start finding substrings 00182 0, // options 00183 ovec, // return vector for location of substrings 00184 6); // size of return vector 00185 if (retval < 0) 00186 return retval; 00187 length = ovec[1]-ovec[0]; 00188 return ovec[0]; 00189 #endif 00190 } 00191