00001 static char version[] = "$Header: /TDS Product Cycle/0 Development/Software/MiscTools/NewGrep/pattern.hpp 3 00-11-03 10:19 Vwagner $"; 00002 00003 /* 00004 Copyright © 2000 Teton Data Systems 00005 00006 Description: 00007 00008 */ 00009 00010 //! Needed because VS and VC++ (6) don't do namespaces well 00011 typedef std::vector<unsigned short> usv; 00012 00013 //!Holds a set of states for the pattern match algorithm 00014 class StateSet: public usv 00015 { 00016 private: 00017 public: 00018 00019 //!Put a state into the set if it isn't already there 00020 //!\return true if new_state is 0 00021 bool Put(unsigned short new_state) //!<new state to put in set 00022 { 00023 //! we only put in 'non-zero' states 00024 if (new_state) 00025 { 00026 //! if find doesn't, it returns end() 00027 if (usv::end() == std::find(usv::begin(), usv::end(), new_state)) 00028 usv::push_back(new_state); 00029 } 00030 00031 //! let caller know if we tried to put a 'zero' in 00032 return !new_state; 00033 } 00034 00035 //!override the standard one because it gives back the current vector 00036 void clear() 00037 { 00038 erase(begin(), end()); 00039 } 00040 }; 00041 00042 //!case_independent "pattern match". 00043 /*! 00044 In usage, two calls are made: 00045 The first is to the constructor to 'set' the pattern 00046 Then (possibly) multiple calls to operator () to check strings against the pattern. 00047 For example: 00048 \code 00049 Pattern magic("#?test#?"); // look for "test" anywhere in the string 00050 if (magic("this is a test")) // this would test true 00051 cout << "Yes, a match.\n"; 00052 if (magic("a rather contrived example")) // this would test false 00053 cout << "This is never gonna print\n"; 00054 else cout << "No copy of \"test\" in that string\n."; 00055 \endcode 00056 00057 Of course, both calls may be collapsed into a \b single "call" as follows: 00058 \code 00059 if (Pattern("#?test#?")("This is a test, isn't it?")) 00060 cout << "Yes, and successful at that!\n"; 00061 \endcode 00062 \note this is not particularly efficient as the Pattern will be destructed at the end of the statment, but for a "quick and dirty" it's probably OK 00063 00064 \b Pattern is designed to work with the \b STL algorithms as a functor. For example, 00065 assume a std::vector of string already filled which we wish to test against a pattern. 00066 The following would be one method: 00067 \code 00068 #include <vector> 00069 #include <algorithm> 00070 ... 00071 std::vector<string> text_stuff; 00072 ... // code to fill text_stuff 00073 ... 00074 // get an iterator which "points" to the first(1st) string in the vector 00075 std::vector<string>::const_iterator iter(text_stuff.begin()); 00076 // output all strings which contain the pattern "this matches" 00077 while (text_stuff.end() != (iter = find(iter, text_stuff.end(), Pattern("#?this'smatches#?")))) 00078 { 00079 cout << *iter; // output string which 'matches' 00080 ++iter; // so we start search after this string 00081 } 00082 \endcode 00083 \note we bounded our pattern with #? so that the match could occur anywhere in the target string 00084 \note we also used 's (whitespace) so that we will match either a space (' ') or tab ('\t') between the two words. 00085 */ 00086 class Pattern 00087 { 00088 private: 00089 std::string the_Pattern; //!<the actual pattern we're searching for 00090 unsigned short patp; //!<working pointer for the match function 00091 char ch; //!<current character from the_Pattern 00092 usv Compiled_Pattern; //!<holds 'next state' info 00093 enum {endstreamch = 0x0d}; 00094 00095 //!get the next character (or 'endstreamch') from the pattern 00096 //!\return the next character (put it in ch also) 00097 inline char rch() 00098 { 00099 if (patp>=the_Pattern.size()) // if we're past the end 00100 return ch=endstreamch; // return a marker 00101 return ch=the_Pattern[patp++]; // otherwise return the character AND bump the pointer 00102 } 00103 00104 //! get the next 'item' (remove squoted characters) 00105 //!\return then next character after skipping a squoted one if neccessary 00106 inline char nextitem() 00107 { 00108 if (ch=='\'') // if the last thing was a 'squote' 00109 rch(); // throw away what was squoted 00110 return rch(); // get the next thing 00111 } 00112 00113 //! replace each element of a list starting at 'list' with val 00114 void setexits(unsigned short list //!< the start of the list to replace 00115 ,unsigned short val) //!< the value to put in the list 00116 { 00117 while (list != 0) 00118 { 00119 unsigned short next(Compiled_Pattern[list]); 00120 Compiled_Pattern[list] = val; 00121 list = next; 00122 } 00123 } 00124 00125 //! append a value (or list) to a list 00126 //!\return either val (if the list is empty) or list (if it's not) 00127 unsigned short join(unsigned short list //!< start of list 00128 ,unsigned short val) //!< value to append (note this COULD be the start of a new list) 00129 { 00130 if (list == 0) return(val); 00131 unsigned short save_head(list); 00132 00133 //! follow the list to the end 00134 while (Compiled_Pattern[list] != 0) list = Compiled_Pattern[list]; 00135 00136 //! add new element to the end 00137 Compiled_Pattern[list] = val; 00138 return(save_head); 00139 } 00140 00141 //!collect stuff until we get an endstreamch or an unmatched ) 00142 unsigned short expand (unsigned short altp) 00143 { 00144 unsigned short exits = 0; 00145 unsigned short a; 00146 00147 for (;;) 00148 { 00149 a = primative(); 00150 if ((ch == '`') || (ch == '|') || (ch == ')') || (ch == endstreamch)) 00151 { 00152 exits = join(exits, a); 00153 00154 if ( ( ch != '`' ) && ( ch != '|' ) ) 00155 { 00156 return(exits); 00157 } 00158 00159 Compiled_Pattern[altp] = patp; 00160 altp = patp; 00161 nextitem(); 00162 } 00163 else // must be just an 'item' 00164 { 00165 setexits(a, patp); // 00166 } 00167 } 00168 } 00169 00170 //!get a primative from the pattern 00171 /*! 00172 \return pointer (well, patp) of the 1st char of the primative 00173 00174 a primative is: 00175 \li an item 00176 \li a '(' followed by a primative followed by a ')' 00177 \li a '#' followed by a primative 00178 */ 00179 unsigned short primative(void) 00180 { 00181 unsigned short a; 00182 char op; 00183 00184 a = patp; // save where we're starting 00185 op = ch; // save current ch for later switching 00186 nextitem(); // get the next item (we're gonna consume this one) 00187 switch (op) 00188 { 00189 case '(': 00190 a = expand(a); 00191 if (ch != ')') 00192 throw std::runtime_error("bad pattern: expected )"); 00193 nextitem(); // ch == ')' or we would have thrown 00194 return(a); 00195 00196 case endstreamch: 00197 case ')': 00198 case '`': 00199 case '|': 00200 throw std::runtime_error("bad pattern: unexpected ), `, |, or endstreamch"); 00201 00202 case '#': // 0 or more of 00203 setexits(primative(), a); // loop back to try it again 00204 default: 00205 return(a); 00206 } 00207 } 00208 00209 protected: 00210 public: 00211 00212 //!construct a Pattern object to search (case-independently) 00213 /*!\verbatim 00214 The following metacharacters are used in the pattern string 00215 ? = matches any single character except newline 00216 # = zero or more occurrances of following item 00217 % = Matches the null string 00218 () = enclose multiple items to be considered a single item 00219 'a = any alphabetic character 00220 'd = any digit 00221 'n = any alphanumeric 00222 's = any white space character 00223 ' = escape (match following metacharacter literally, '$ = $) 00224 | = or (a|b) = a or b 00225 ` = or (a`b) = a or b (added to allow easier input from MS command lines) 00226 00227 all other characters must be matched exactly 00228 \endverbatim 00229 \note The pattern much match the \b entire string presented to \b operator()() 00230 \note If you mean to look in only \b part of the string, enclose your pattern thusly: 00231 \note \b #?<your_pattern>#? 00232 00233 */ 00234 explicit Pattern(std::string pat) //!<the pattern we'll be attempting to match 00235 : the_Pattern(pat), patp(0), Compiled_Pattern(pat.size()+1) 00236 { 00237 rch(); //! prime the pump 00238 setexits(expand(0), 0); //! generate Compiled_Pattern 00239 } 00240 00241 bool operator() (const std::string& String_to_Test) 00242 { 00243 StateSet a_states, b_states; 00244 unsigned short Position_to_Test(0); 00245 bool success(false); //!<working variable that hold's success 00246 a_states.reserve(the_Pattern.size()); 00247 b_states.reserve(the_Pattern.size()); 00248 a_states.Put(1); 00249 a_states.Put(Compiled_Pattern[0]); 00250 for(;;) 00251 { 00252 //!for_all a_states ..... 00253 for (StateSet::iterator i = a_states.begin(); i != a_states.end(); ++i) 00254 { 00255 unsigned short p = *i; 00256 unsigned short q = Compiled_Pattern[p]; 00257 switch (the_Pattern[p-1]) 00258 { 00259 case '#': 00260 a_states.Put(p+1); 00261 case '%': 00262 success |= a_states.Put(q); 00263 break; 00264 case '(': 00265 case '`': 00266 case '|': 00267 a_states.Put(p+1); 00268 a_states.Put(q); 00269 break; 00270 default: 00271 break; 00272 } // end switch (the_Pattern[p-1]) 00273 } // end for (StateSet::iterator i = a_states.begin(); i != a_states.end(); ++i) 00274 00275 //! are we done? test for ONLY two (2) ways out of here 00276 if (Position_to_Test >= String_to_Test.size()) return success; 00277 if (!a_states.size()) return false; 00278 00279 //!Check the next character in the "input" string 00280 success = false; 00281 b_states.clear(); 00282 a_states.swap(b_states); 00283 ch = tolower(String_to_Test[Position_to_Test++]); //! to make everything 'case independent' 00284 00285 //!for_all b_states ..... 00286 for (i = b_states.begin(); i != b_states.end(); ++i) 00287 { 00288 unsigned short p = *i; 00289 switch (char k = tolower(the_Pattern[p-1])) //! to make everything 'case independent' 00290 { 00291 case '#': 00292 case '`': 00293 case '|': 00294 case '%': 00295 case '(': 00296 continue; // nothing to check here 00297 case '\'': // AHA!! one of our 'special' characters 00298 switch (k = tolower(the_Pattern[p])) //! to make everything 'case independent' 00299 { 00300 case 'a': 00301 if (isalpha(ch)) 00302 k = ch; // so they match later 00303 break; 00304 case 'd': 00305 if (isdigit(ch)) 00306 k = ch; // so they match later 00307 else 00308 k = '0'; // so they canNOT match 00309 break; 00310 case 'n': 00311 if (isalnum(ch)) 00312 k = ch; // so they match later 00313 break; 00314 case 's': 00315 if (isspace(ch)) 00316 k = ch; // so they match later 00317 else 00318 k = ' '; // so they canNOT match 00319 default:; //! nothing to do for 'non-special' characters 00320 } // end switch (k = tolower(the_Pattern[p])) 00321 00322 //!This is the important part... does the 'pattern' match the 'input string' 00323 default: 00324 if (ch == k) 00325 case '?': //! remember "?" matches anything 00326 success |= a_states.Put(Compiled_Pattern[p]); 00327 continue; 00328 } // end switch (char k = tolower(the_Pattern[p-1])) 00329 } // end for (i = b_states.begin(); i != b_states.end(); ++i) 00330 } // end for (;;) ...... no way out here 00331 } 00332 }; 00333
1.2.3 written by Dimitri van Heesch,
© 1997-2000