1 module tern.regex.builder;
2 
3 import tern.state;
4 import tern.exception;
5 import std.string;
6 import std.conv;
7 import std.ascii;
8 import std.algorithm;
9 
10 // Tokens
11 package enum : ubyte
12 {
13     BAD,
14     /// `(?...)`
15     /// Matches group ahead
16     LOOK_AHEAD,
17     /// `(...<...)`
18     /// Matches group behind
19     LOOK_BEHIND,
20     /// `[...]`
21     /// Stores a set of characters (matches any)
22     CHARACTERS,
23     /// `^`
24     /// Matches only if at the start of a line or the full text
25     ANCHOR_START,
26     /// `$`
27     /// Matches only if at the end of a line or the full text
28     ANCHOR_END,
29     /// `(...)`
30     /// Stores a set of elements
31     GROUP,
32     /// `.`
33     /// Matches any character
34     ANY,
35     /// ~`\gn`~ or `$n` (group) or `%n` (absolute)
36     /// Refers to a group or element
37     REFERENCE,
38     // Not used! Comments don't need to be parsed!
39     // (?#...)
40     //COMMENT
41     /// `\K` `\Kn`.
42     /// Resets match or group match
43     RESET,
44     /// `n->` or `|->`
45     /// Moves the current text position forward
46     PUSHFW,
47     /// `<-n` or `<-|`
48     /// Moves the current text position backward
49     PUSHBW
50 }
51 
52 // Element modifiers
53 package enum : ubyte
54 {
55     /// No special rules, matches until the next element can match
56     NONE = 0,
57     /// `|`
58     /// If fails, the next element will attempt to match instead
59     ALTERNATE = 1,
60     /// `[^...]`
61     /// Matches only if no characters in the set match
62     EXCLUSIONARY = 2,
63     /// `{...}`
64     /// Has min/max
65     QUANTIFIED = 4,
66     // *
67     //GREEDY = 8,
68     // +
69     //MANY = 16,
70     // ?
71     //OPTIONAL = 32,
72     // Now defaults to POSITIVE/CAPTURE if set to NONE
73     // (...=...)
74     // Also (...) (capture group)
75     //CAPTURE = 64,
76     //POSITIVE = 64,
77     /// `(?:...)`
78     /// Acts as a group but does not capture match
79     NONCAPTURE = 8,
80     /// `(...!...)`
81     /// Matches if not matched
82     NEGATIVE = 16,
83     /// `...?`
84     /// Matches as few times as possible
85     LAZY = 32,
86     /// `...+`
87     /// Matches as many times as possible
88     GREEDY = 64
89 }
90 
91 // Regex flags
92 package enum : ubyte
93 {
94     /// Match more than once
95     GLOBAL = 2,
96     /// ^ & $ match start & end
97     MULTILINE = 4,
98     /// Case insensitive
99     INSENSITIVE = 8,
100     /// Ignore whitespace
101     EXTENDED = 16,
102     /// . matches  r\n\f
103     SINGLELINE = 32
104 }
105 
106 package struct Element
107 {
108 public:
109 final:
110 align(8):
111     /// What kind of element is this?
112     /// eg: `CHARACTERS`.
113     ubyte token;
114     /// What are the special modifiers of this element?
115     /// eg: `EXCLUSIONARY`.
116     ubyte modifiers;
117     /// Characters mapped (like in a character set or literal)
118     /// Elements mapped (like in a group or reference)
119     union
120     {
121         string str;
122         Element[] elements;
123     }
124     /// Minimum times to require fulfillment
125     /// eg: `1`.
126     uint min;
127     /// Maximum times to allow fulfillment
128     /// eg: `1`.
129     uint max = 1;
130 
131     pure string tokenName()
132     {
133         switch (token)
134         {
135             case BAD: return "BAD";
136             case LOOK_AHEAD: return "LOOK_AHEAD";
137             case LOOK_BEHIND: return "LOOK_BEHIND";
138             case CHARACTERS: return "CHARACTERS";
139             case ANCHOR_START: return "ANCHOR_START";
140             case ANCHOR_END: return "ANCHOR_END";
141             case GROUP: return "GROUP";
142             case ANY: return "ANY";
143             case REFERENCE: return "REFERENCE";
144             case RESET: return "RESET";
145             case PUSHFW: return "PUSHFW";
146             case PUSHBW: return "PUSHBW";
147             default: assert(0);
148         }
149     }
150 
151     pure string modifiersName()
152     {
153         string ret;
154         switch (modifiers)
155         {
156             case NONE: ret = "NONE"; break;
157             case ALTERNATE: (ret != null) ? (ret ~= " | ALTERNATE") : (ret ~= "ALTERNATE"); break;
158             case EXCLUSIONARY: (ret != null) ? (ret ~= " | EXCLUSIONARY") : (ret ~= "EXCLUSIONARY"); break;
159             case QUANTIFIED: (ret != null) ? (ret ~= " | QUANTIFIED") : (ret ~= "QUANTIFIED"); break;
160             case NONCAPTURE: (ret != null) ? (ret ~= " | NONCAPTURE") : (ret ~= "NONCAPTURE"); break;
161             case NEGATIVE: (ret != null) ? (ret ~= " | NEGATIVE") : (ret ~= "NEGATIVE"); break;
162             case LAZY: (ret != null) ? (ret ~= " | LAZY") : (ret ~= "LAZY"); break;
163             case GREEDY: (ret != null) ? (ret ~= " | GREEDY") : (ret ~= "GREEDY"); break;
164             default: assert(0);
165         }
166         return ret;
167     }
168 }
169 
170 package:
171 static:
172 pragma(inline, true)
173 pure @nogc bool modifierQuantifiable(Element element)
174 {
175     return !element.modifiers.hasFlag(QUANTIFIED);
176 }
177 
178 pragma(inline, true)
179 pure @nogc bool tokenQuantifiable(Element element)
180 {
181     return element.token != ANCHOR_START && 
182         element.token != ANCHOR_END && 
183         element.token != PUSHFW && 
184         element.token != PUSHBW && 
185         element.token != RESET;
186 }
187 
188 pragma(inline, true)
189 pure @nogc string getArgument(string pattern, int start, char opener, char closer)
190 {
191     int openers = 1;
192     foreach (i; (start + 1)..pattern.length)
193     {
194         if (pattern[i] == opener)
195             openers++;
196         else if (pattern[i] == closer)
197             openers--;
198         
199         if (openers == 0)
200             return pattern[(start + 1)..i];
201     }
202     return pattern[(start + 1)..pattern.length];
203 }
204 
205 pragma(inline, true)
206 pure string expand(string str, ref string[string] lookups)
207 {
208     if (str in lookups)
209         return lookups[str];
210 
211     string ret;
212     int i = 0;
213     while (i < str.length)
214     {
215         if (str[i] == '\\' && i + 1 < str.length && str[i..(i + 2)] in lookups)
216         {
217             ret ~= lookups[str[i..(i += 2)]];
218         }
219         else if (i + 2 < str.length && str[i + 1] == '-')
220         {
221             char start = str[i];
222             char end = str[i + 2];
223             foreach (c; start..(end + 1))
224                 ret ~= c;
225             i += 3;
226         }
227         else
228         {
229             ret ~= str[i++];
230         }
231     }
232     lookups[str] = ret;
233     return ret;
234 }
235 
236 // TODO: Refer to future group/element
237 //        b  B (?:..) (..) lookahead lookbehind
238 pragma(inline, true)
239 pure Element[] build(string pattern, string[string] lookups)
240 {
241     Element[] elements;
242     for (int i; i < pattern.length; i++)
243     {
244         Element element;
245         char c = pattern[i];
246         switch (c)
247         {
248             case '+':
249                 if (!elements[$-1].tokenQuantifiable)
250                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '+' (non-quantifiable!)", pattern, i, i + 1);
251 
252                 if (elements[$-1].modifierQuantifiable)
253                 {
254                     elements[$-1].min = 1;
255                     elements[$-1].max = uint.max;
256                     elements[$-1].modifiers |= QUANTIFIED;
257                 }
258                 else
259                 {
260                     elements[$-1].modifiers |= GREEDY;
261                 }
262                 break;
263 
264             case '*':
265                 if (!elements[$-1].tokenQuantifiable)
266                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '*' (non-quantifiable!)", pattern, i, i + 1);
267 
268                 if (!elements[$-1].modifierQuantifiable)
269                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '*' (non-quantifiable!)", pattern, i, i + 1);
270 
271                 elements[$-1].min = 0;
272                 elements[$-1].max = uint.max;
273                 elements[$-1].modifiers |= QUANTIFIED;
274                 break;
275 
276             case '?':
277                 if (!elements[$-1].tokenQuantifiable)
278                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '?' (non-quantifiable!)", pattern, i, i + 1);
279 
280                 if (elements[$-1].modifierQuantifiable)
281                 {
282                     elements[$-1].min = 0;
283                     elements[$-1].max = 1;
284                     elements[$-1].modifiers |= QUANTIFIED;
285                 }
286                 else 
287                 {
288                     elements[$-1].modifiers |= LAZY;
289                 }
290                 break;
291 
292             case '{':
293                 if (!elements[$-1].tokenQuantifiable)
294                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '{' (non-quantifiable!)", pattern, i, i + 1);
295 
296                 if (!elements[$-1].modifierQuantifiable)
297                     raise(elements[$-1].tokenName()~" cannot be succeeded by quantifier token '{' (non-quantifiable!)", pattern, i, i + 1);
298 
299                 string arg = pattern.getArgument(i, '{', '}');
300                 string[] args = arg.split("..");
301 
302                 if (args.length == 1)
303                 {
304                     elements[$-1].min = args[0].to!uint;
305                     elements[$-1].max = args[0].to!uint;
306                 }
307                 else if (args.length == 2)
308                 {
309                     elements[$-1].min = args[0].to!uint;
310                     elements[$-1].max = args[1].to!uint;
311                 }
312                 else
313                     raise("Quantifier range ('{') expected 1-2 arguments (found "~args.length.to!string~"!)", pattern, i, i + 1);
314 
315                 i += arg.length + 1;
316                 elements[$-1].modifiers |= QUANTIFIED;
317                 break;
318 
319             case '|':
320                 if (i + 2 < pattern.length && pattern[i..(i + 3)] == "|->")
321                 {
322                     element.token = PUSHFW;
323                     element.min = 1;
324                     i += 2;
325                     break;
326                 }
327                 elements[$-1].modifiers |= ALTERNATE;
328                 break;
329 
330             case '.':
331                 element.token = ANY;
332                 element.min = 1;
333                 element.max = 1;
334                 break;
335 
336             case '[':
337                 if (i + 1 < pattern.length && pattern[i + 1] == '^')
338                 {
339                     element.modifiers |= EXCLUSIONARY;
340                     i++;
341                 }
342 
343                 element.token = CHARACTERS;
344                 element.str = pattern.getArgument(i, '[', ']').expand(lookups);
345                 element.min = 1;
346                 element.max = 1;
347 
348                 i += pattern.getArgument(i, '[', ']').length + 1;
349                 break;
350 
351             case '^':
352                 element.token = ANCHOR_START;
353                 break;
354 
355             case '%':
356                 if (i + 1 < pattern.length && pattern[i + 1].isDigit)
357                 {
358                     uint id = 0;
359                     while (i + 1 < pattern.length && pattern[i + 1].isDigit)
360                         id = id * 10 + (pattern[++i] - '0');
361 
362                     if (id < elements.length)
363                     {
364                         element.token = REFERENCE;
365                         element.elements = [ elements[id] ];
366                     }
367                     else
368                         raise("REFERENCE ('%n') refers to element "~id.to!string~", which is outside of valid range!", pattern, i, i + 1);
369                     break;
370                 }
371                 break;
372 
373             case '$':
374                 if (i + 1 < pattern.length && pattern[i + 1].isDigit)
375                 {
376                     uint id = 0;
377                     while (i + 1 < pattern.length && pattern[i + 1].isDigit)
378                         id = id * 10 + (pattern[++i] - '0');
379 
380                     for (uint ii = 0, visits = 0; ii < elements.length; ++ii)
381                     {
382                         if (elements[ii].token == GROUP && visits++ == id)
383                         {
384                             element.token = REFERENCE;
385                             element.elements = [ elements[ii] ];
386                             break;
387                         }
388                     }
389                     if (element.token != REFERENCE)
390                         raise("REFERENCE ('$n') refers to group "~id.to!string~", which is outside of valid range!", pattern, i, i + 1);
391                 }
392                 element.token = ANCHOR_END;
393                 break;
394 
395             case '<':
396                 if (i + 2 < pattern.length && pattern[i + 1] == '-' && pattern[i + 2] == '|')
397                 {
398                     element.token = PUSHBW;
399                     element.min = 1;
400                     i += 2;
401                     break;
402                 }
403                 else if (i + 2 < pattern.length && pattern[i + 1] == '-' && pattern[i + 2].isDigit)
404                 {
405                     i++;
406                     uint len = 0;
407                     while (i + 1 < pattern.length && pattern[i + 1].isDigit)
408                         len = len * 10 + (pattern[++i] - '0');
409 
410                     element.token = PUSHBW;
411                     element.min = len;
412                     break;
413                 }
414                 else
415                     raise("Expected syntax <-n or <-| for PUSHBW!", pattern, i, i + 1);
416                 break;
417 
418             case '(':
419                 string arg = pattern.getArgument(i, '(', ')');
420                 element.token = GROUP;
421                 element.elements = arg.build(lookups);
422                 i += arg.length + 1;
423                 break;
424 
425             default:
426                 if (c.isDigit)
427                 {
428                     uint ci = i;
429                     uint len = c - '0';
430                     while (i + 1 < pattern.length && pattern[i + 1].isDigit)
431                         len = len * 10 + (pattern[++i] - '0');
432                     
433                     if (i + 2 < pattern.length && pattern[(i + 1)..(i + 3)] == "->")
434                     {
435                         element.token = PUSHFW;
436                         element.min = len;
437                         i += 2;
438                         break;
439                     }
440                     i = ci;
441                 }
442 
443                 element.token = CHARACTERS;
444                 // Will not be adding support for  gn
445                 // Expected to use $n
446                 if (c == '\\' && i + 1 < pattern.length)
447                 {
448                     // Reset (local)
449                     if (pattern[i..(i + 2)] == r"\K")
450                     {
451                         i++;
452                         element.token = RESET;
453                         if (i + 1 < pattern.length && pattern[i + 1].isDigit)
454                         {
455                             uint id = 0;
456                             while (i + 1 < pattern.length && pattern[i + 1].isDigit)
457                                 id = id * 10 + (pattern[++i] - '0');
458 
459                             for (uint ii = 0, visits = 0; ii < elements.length; ++ii)
460                             {
461                                 if (elements[ii].token == GROUP && visits++ == id)
462                                 {
463                                     element.elements = [ elements[ii] ];
464                                     break;
465                                 }
466                             }
467                             if (element.elements.length == 0)
468                                 raise(r"RESET ('\K') refers to group "~id.to!string~", which is outside of valid range!", pattern, i, i + 1);
469                         }
470                         break;
471                     }
472                     // Escaped escape
473                     else if (pattern[i..(i + 2)] == r"\\")
474                     {
475                         element.str = c.to!string;
476                         element.min = 1;
477                         element.max = 1;
478                         i++;
479                         break;
480                     }
481                     else if (pattern[i..(i + 2)] == r"\x" && i + 3 < pattern.length)
482                     {
483                         string hex = pattern[i + 2 .. i + 4];
484                         element.str = (cast(char)hex.to!ubyte(16)).to!string;
485                         element.min = 1;
486                         element.max = 1;
487                         i += 3;
488                         break;
489                     }
490                     // Any escape
491                     else
492                     {
493                         string arg = pattern[i..(++i + 1)];
494                         switch (arg)
495                         {
496                             case r"\W", r"\D", r"\S", r"\H", r"\V":
497                                 element.str = arg.toLower.expand(lookups);
498                                 element.min = 1;
499                                 element.max = 1;
500                                 element.modifiers |= EXCLUSIONARY;
501                                 break;
502 
503                             default:
504                                 element.str = arg.expand(lookups);
505                                 element.min = 1;
506                                 element.max = 1;
507                         }
508                     }
509                 }
510                 else
511                 {
512                     element.str = c.to!string;
513                     element.min = 1;
514                     element.max = 1;
515                 }
516                 break;
517         }
518         if (element.token != BAD)
519             elements ~= element;
520     }
521     return elements;
522 }