1 module tern.regex.builder; 2 3 import tern.state; 4 import tern.exception; 5 import std.string; 6 import std.conv; 7 import std.ascii; 8 import std.algorithm; 9 10 // Tokens 11 package enum : ubyte 12 { 13 BAD, 14 /// `(?...)` 15 /// Matches group ahead 16 LOOK_AHEAD, 17 /// `(...<...)` 18 /// Matches group behind 19 LOOK_BEHIND, 20 /// `[...]` 21 /// Stores a set of characters (matches any) 22 CHARACTERS, 23 /// `^` 24 /// Matches only if at the start of a line or the full text 25 ANCHOR_START, 26 /// `$` 27 /// Matches only if at the end of a line or the full text 28 ANCHOR_END, 29 /// `(...)` 30 /// Stores a set of elements 31 GROUP, 32 /// `.` 33 /// Matches any character 34 ANY, 35 /// ~`\gn`~ or `$n` (group) or `%n` (absolute) 36 /// Refers to a group or element 37 REFERENCE, 38 // Not used! Comments don't need to be parsed! 39 // (?#...) 40 //COMMENT 41 /// `\K` `\Kn`. 42 /// Resets match or group match 43 RESET, 44 /// `n->` or `|->` 45 /// Moves the current text position forward 46 PUSHFW, 47 /// `<-n` or `<-|` 48 /// Moves the current text position backward 49 PUSHBW 50 } 51 52 // Element modifiers 53 package enum : ubyte 54 { 55 /// No special rules, matches until the next element can match 56 NONE = 0, 57 /// `|` 58 /// If fails, the next element will attempt to match instead 59 ALTERNATE = 1, 60 /// `[^...]` 61 /// Matches only if no characters in the set match 62 EXCLUSIONARY = 2, 63 /// `{...}` 64 /// Has min/max 65 QUANTIFIED = 4, 66 // * 67 //GREEDY = 8, 68 // + 69 //MANY = 16, 70 // ? 71 //OPTIONAL = 32, 72 // Now defaults to POSITIVE/CAPTURE if set to NONE 73 // (...=...) 74 // Also (...) (capture group) 75 //CAPTURE = 64, 76 //POSITIVE = 64, 77 /// `(?:...)` 78 /// Acts as a group but does not capture match 79 NONCAPTURE = 8, 80 /// `(...!...)` 81 /// Matches if not matched 82 NEGATIVE = 16, 83 /// `...?` 84 /// Matches as few times as possible 85 LAZY = 32, 86 /// `...+` 87 /// Matches as many times as possible 88 GREEDY = 64 89 } 90 91 // Regex flags 92 package enum : ubyte 93 { 94 /// Match more than once 95 GLOBAL = 2, 96 /// ^ & $ match start & end 97 MULTILINE = 4, 98 /// Case insensitive 99 INSENSITIVE = 8, 100 /// Ignore whitespace 101 EXTENDED = 16, 102 /// . matches r\n\f 103 SINGLELINE = 32 104 } 105 106 package struct Element 107 { 108 public: 109 final: 110 align(8): 111 /// What kind of element is this? 112 /// eg: `CHARACTERS`. 113 ubyte token; 114 /// What are the special modifiers of this element? 115 /// eg: `EXCLUSIONARY`. 116 ubyte modifiers; 117 /// Characters mapped (like in a character set or literal) 118 /// Elements mapped (like in a group or reference) 119 union 120 { 121 string str; 122 Element[] elements; 123 } 124 /// Minimum times to require fulfillment 125 /// eg: `1`. 126 uint min; 127 /// Maximum times to allow fulfillment 128 /// eg: `1`. 129 uint max = 1; 130 131 pure string tokenName() 132 { 133 switch (token) 134 { 135 case BAD: return "BAD"; 136 case LOOK_AHEAD: return "LOOK_AHEAD"; 137 case LOOK_BEHIND: return "LOOK_BEHIND"; 138 case CHARACTERS: return "CHARACTERS"; 139 case ANCHOR_START: return "ANCHOR_START"; 140 case ANCHOR_END: return "ANCHOR_END"; 141 case GROUP: return "GROUP"; 142 case ANY: return "ANY"; 143 case REFERENCE: return "REFERENCE"; 144 case RESET: return "RESET"; 145 case PUSHFW: return "PUSHFW"; 146 case PUSHBW: return "PUSHBW"; 147 default: assert(0); 148 } 149 } 150 151 pure string modifiersName() 152 { 153 string ret; 154 switch (modifiers) 155 { 156 case NONE: ret = "NONE"; break; 157 case ALTERNATE: (ret != null) ? (ret ~= " | ALTERNATE") : (ret ~= "ALTERNATE"); break; 158 case EXCLUSIONARY: (ret != null) ? (ret ~= " | EXCLUSIONARY") : (ret ~= "EXCLUSIONARY"); break; 159 case QUANTIFIED: (ret != null) ? (ret ~= " | QUANTIFIED") : (ret ~= "QUANTIFIED"); break; 160 case NONCAPTURE: (ret != null) ? (ret ~= " | NONCAPTURE") : (ret ~= "NONCAPTURE"); break; 161 case NEGATIVE: (ret != null) ? (ret ~= " | NEGATIVE") : (ret ~= "NEGATIVE"); break; 162 case LAZY: (ret != null) ? (ret ~= " | LAZY") : (ret ~= "LAZY"); break; 163 case GREEDY: (ret != null) ? (ret ~= " | GREEDY") : (ret ~= "GREEDY"); break; 164 default: assert(0); 165 } 166 return ret; 167 } 168 } 169 170 package: 171 static: 172 pragma(inline, true) 173 pure @nogc bool modifierQuantifiable(Element element) 174 { 175 return !element.modifiers.hasFlag(QUANTIFIED); 176 } 177 178 pragma(inline, true) 179 pure @nogc bool tokenQuantifiable(Element element) 180 { 181 return element.token != ANCHOR_START && 182 element.token != ANCHOR_END && 183 element.token != PUSHFW && 184 element.token != PUSHBW && 185 element.token != RESET; 186 } 187 188 pragma(inline, true) 189 pure @nogc string getArgument(string pattern, int start, char opener, char closer) 190 { 191 int openers = 1; 192 foreach (i; (start + 1)..pattern.length) 193 { 194 if (pattern[i] == opener) 195 openers++; 196 else if (pattern[i] == closer) 197 openers--; 198 199 if (openers == 0) 200 return pattern[(start + 1)..i]; 201 } 202 return pattern[(start + 1)..pattern.length]; 203 } 204 205 pragma(inline, true) 206 pure string expand(string str, ref string[string] lookups) 207 { 208 if (str in lookups) 209 return lookups[str]; 210 211 string ret; 212 int i = 0; 213 while (i < str.length) 214 { 215 if (str[i] == '\\' && i + 1 < str.length && str[i..(i + 2)] in lookups) 216 { 217 ret ~= lookups[str[i..(i += 2)]]; 218 } 219 else if (i + 2 < str.length && str[i + 1] == '-') 220 { 221 char start = str[i]; 222 char end = str[i + 2]; 223 foreach (c; start..(end + 1)) 224 ret ~= c; 225 i += 3; 226 } 227 else 228 { 229 ret ~= str[i++]; 230 } 231 } 232 lookups[str] = ret; 233 return ret; 234 } 235 236 // TODO: Refer to future group/element 237 // b B (?:..) (..) lookahead lookbehind 238 pragma(inline, true) 239 pure Element[] build(string pattern, string[string] lookups) 240 { 241 Element[] elements; 242 for (int i; i < pattern.length; i++) 243 { 244 Element element; 245 char c = pattern[i]; 246 switch (c) 247 { 248 case '+': 249 if (!elements[$-1].tokenQuantifiable) 250 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '+' (non-quantifiable!)", pattern, i, i + 1); 251 252 if (elements[$-1].modifierQuantifiable) 253 { 254 elements[$-1].min = 1; 255 elements[$-1].max = uint.max; 256 elements[$-1].modifiers |= QUANTIFIED; 257 } 258 else 259 { 260 elements[$-1].modifiers |= GREEDY; 261 } 262 break; 263 264 case '*': 265 if (!elements[$-1].tokenQuantifiable) 266 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '*' (non-quantifiable!)", pattern, i, i + 1); 267 268 if (!elements[$-1].modifierQuantifiable) 269 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '*' (non-quantifiable!)", pattern, i, i + 1); 270 271 elements[$-1].min = 0; 272 elements[$-1].max = uint.max; 273 elements[$-1].modifiers |= QUANTIFIED; 274 break; 275 276 case '?': 277 if (!elements[$-1].tokenQuantifiable) 278 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '?' (non-quantifiable!)", pattern, i, i + 1); 279 280 if (elements[$-1].modifierQuantifiable) 281 { 282 elements[$-1].min = 0; 283 elements[$-1].max = 1; 284 elements[$-1].modifiers |= QUANTIFIED; 285 } 286 else 287 { 288 elements[$-1].modifiers |= LAZY; 289 } 290 break; 291 292 case '{': 293 if (!elements[$-1].tokenQuantifiable) 294 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '{' (non-quantifiable!)", pattern, i, i + 1); 295 296 if (!elements[$-1].modifierQuantifiable) 297 raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '{' (non-quantifiable!)", pattern, i, i + 1); 298 299 string arg = pattern.getArgument(i, '{', '}'); 300 string[] args = arg.split(".."); 301 302 if (args.length == 1) 303 { 304 elements[$-1].min = args[0].to!uint; 305 elements[$-1].max = args[0].to!uint; 306 } 307 else if (args.length == 2) 308 { 309 elements[$-1].min = args[0].to!uint; 310 elements[$-1].max = args[1].to!uint; 311 } 312 else 313 raise("Quantifier range ('{') expected 1-2 arguments (found "~args.length.to!string~"!)", pattern, i, i + 1); 314 315 i += arg.length + 1; 316 elements[$-1].modifiers |= QUANTIFIED; 317 break; 318 319 case '|': 320 if (i + 2 < pattern.length && pattern[i..(i + 3)] == "|->") 321 { 322 element.token = PUSHFW; 323 element.min = 1; 324 i += 2; 325 break; 326 } 327 elements[$-1].modifiers |= ALTERNATE; 328 break; 329 330 case '.': 331 element.token = ANY; 332 element.min = 1; 333 element.max = 1; 334 break; 335 336 case '[': 337 if (i + 1 < pattern.length && pattern[i + 1] == '^') 338 { 339 element.modifiers |= EXCLUSIONARY; 340 i++; 341 } 342 343 element.token = CHARACTERS; 344 element.str = pattern.getArgument(i, '[', ']').expand(lookups); 345 element.min = 1; 346 element.max = 1; 347 348 i += pattern.getArgument(i, '[', ']').length + 1; 349 break; 350 351 case '^': 352 element.token = ANCHOR_START; 353 break; 354 355 case '%': 356 if (i + 1 < pattern.length && pattern[i + 1].isDigit) 357 { 358 uint id = 0; 359 while (i + 1 < pattern.length && pattern[i + 1].isDigit) 360 id = id * 10 + (pattern[++i] - '0'); 361 362 if (id < elements.length) 363 { 364 element.token = REFERENCE; 365 element.elements = [ elements[id] ]; 366 } 367 else 368 raise("REFERENCE ('%n') refers to element "~id.to!string~", which is outside of valid range!", pattern, i, i + 1); 369 break; 370 } 371 break; 372 373 case '$': 374 if (i + 1 < pattern.length && pattern[i + 1].isDigit) 375 { 376 uint id = 0; 377 while (i + 1 < pattern.length && pattern[i + 1].isDigit) 378 id = id * 10 + (pattern[++i] - '0'); 379 380 for (uint ii = 0, visits = 0; ii < elements.length; ++ii) 381 { 382 if (elements[ii].token == GROUP && visits++ == id) 383 { 384 element.token = REFERENCE; 385 element.elements = [ elements[ii] ]; 386 break; 387 } 388 } 389 if (element.token != REFERENCE) 390 raise("REFERENCE ('$n') refers to group "~id.to!string~", which is outside of valid range!", pattern, i, i + 1); 391 } 392 element.token = ANCHOR_END; 393 break; 394 395 case '<': 396 if (i + 2 < pattern.length && pattern[i + 1] == '-' && pattern[i + 2] == '|') 397 { 398 element.token = PUSHBW; 399 element.min = 1; 400 i += 2; 401 break; 402 } 403 else if (i + 2 < pattern.length && pattern[i + 1] == '-' && pattern[i + 2].isDigit) 404 { 405 i++; 406 uint len = 0; 407 while (i + 1 < pattern.length && pattern[i + 1].isDigit) 408 len = len * 10 + (pattern[++i] - '0'); 409 410 element.token = PUSHBW; 411 element.min = len; 412 break; 413 } 414 else 415 raise("Expected syntax <-n or <-| for PUSHBW!", pattern, i, i + 1); 416 break; 417 418 case '(': 419 string arg = pattern.getArgument(i, '(', ')'); 420 element.token = GROUP; 421 element.elements = arg.build(lookups); 422 i += arg.length + 1; 423 break; 424 425 default: 426 if (c.isDigit) 427 { 428 uint ci = i; 429 uint len = c - '0'; 430 while (i + 1 < pattern.length && pattern[i + 1].isDigit) 431 len = len * 10 + (pattern[++i] - '0'); 432 433 if (i + 2 < pattern.length && pattern[(i + 1)..(i + 3)] == "->") 434 { 435 element.token = PUSHFW; 436 element.min = len; 437 i += 2; 438 break; 439 } 440 i = ci; 441 } 442 443 element.token = CHARACTERS; 444 // Will not be adding support for gn 445 // Expected to use $n 446 if (c == '\\' && i + 1 < pattern.length) 447 { 448 // Reset (local) 449 if (pattern[i..(i + 2)] == r"\K") 450 { 451 i++; 452 element.token = RESET; 453 if (i + 1 < pattern.length && pattern[i + 1].isDigit) 454 { 455 uint id = 0; 456 while (i + 1 < pattern.length && pattern[i + 1].isDigit) 457 id = id * 10 + (pattern[++i] - '0'); 458 459 for (uint ii = 0, visits = 0; ii < elements.length; ++ii) 460 { 461 if (elements[ii].token == GROUP && visits++ == id) 462 { 463 element.elements = [ elements[ii] ]; 464 break; 465 } 466 } 467 if (element.elements.length == 0) 468 raise(r"RESET ('\K') refers to group "~id.to!string~", which is outside of valid range!", pattern, i, i + 1); 469 } 470 break; 471 } 472 // Escaped escape 473 else if (pattern[i..(i + 2)] == r"\\") 474 { 475 element.str = c.to!string; 476 element.min = 1; 477 element.max = 1; 478 i++; 479 break; 480 } 481 else if (pattern[i..(i + 2)] == r"\x" && i + 3 < pattern.length) 482 { 483 string hex = pattern[i + 2 .. i + 4]; 484 element.str = (cast(char)hex.to!ubyte(16)).to!string; 485 element.min = 1; 486 element.max = 1; 487 i += 3; 488 break; 489 } 490 // Any escape 491 else 492 { 493 string arg = pattern[i..(++i + 1)]; 494 switch (arg) 495 { 496 case r"\W", r"\D", r"\S", r"\H", r"\V": 497 element.str = arg.toLower.expand(lookups); 498 element.min = 1; 499 element.max = 1; 500 element.modifiers |= EXCLUSIONARY; 501 break; 502 503 default: 504 element.str = arg.expand(lookups); 505 element.min = 1; 506 element.max = 1; 507 } 508 } 509 } 510 else 511 { 512 element.str = c.to!string; 513 element.min = 1; 514 element.max = 1; 515 } 516 break; 517 } 518 if (element.token != BAD) 519 elements ~= element; 520 } 521 return elements; 522 }