// index.js (6.4 KB)
  1. var util = require('./util');
  2. var types = require('./types');
  3. var sets = require('./sets');
  4. var positions = require('./positions');
  5. module.exports = function(regexpStr) {
  6. var i = 0, l, c,
  7. start = { type: types.ROOT, stack: []},
  8. // Keep track of last clause/group and stack.
  9. lastGroup = start,
  10. last = start.stack,
  11. groupStack = [];
  12. var repeatErr = function(i) {
  13. util.error(regexpStr, 'Nothing to repeat at column ' + (i - 1));
  14. };
  15. // Decode a few escaped characters.
  16. var str = util.strToChars(regexpStr);
  17. l = str.length;
  18. // Iterate through each character in string.
  19. while (i < l) {
  20. c = str[i++];
  21. switch (c) {
  22. // Handle escaped characters, inclues a few sets.
  23. case '\\':
  24. c = str[i++];
  25. switch (c) {
  26. case 'b':
  27. last.push(positions.wordBoundary());
  28. break;
  29. case 'B':
  30. last.push(positions.nonWordBoundary());
  31. break;
  32. case 'w':
  33. last.push(sets.words());
  34. break;
  35. case 'W':
  36. last.push(sets.notWords());
  37. break;
  38. case 'd':
  39. last.push(sets.ints());
  40. break;
  41. case 'D':
  42. last.push(sets.notInts());
  43. break;
  44. case 's':
  45. last.push(sets.whitespace());
  46. break;
  47. case 'S':
  48. last.push(sets.notWhitespace());
  49. break;
  50. default:
  51. // Check if c is integer.
  52. // In which case it's a reference.
  53. if (/\d/.test(c)) {
  54. last.push({ type: types.REFERENCE, value: parseInt(c, 10) });
  55. // Escaped character.
  56. } else {
  57. last.push({ type: types.CHAR, value: c.charCodeAt(0) });
  58. }
  59. }
  60. break;
  61. // Positionals.
  62. case '^':
  63. last.push(positions.begin());
  64. break;
  65. case '$':
  66. last.push(positions.end());
  67. break;
  68. // Handle custom sets.
  69. case '[':
  70. // Check if this class is 'anti' i.e. [^abc].
  71. var not;
  72. if (str[i] === '^') {
  73. not = true;
  74. i++;
  75. } else {
  76. not = false;
  77. }
  78. // Get all the characters in class.
  79. var classTokens = util.tokenizeClass(str.slice(i), regexpStr);
  80. // Increase index by length of class.
  81. i += classTokens[1];
  82. last.push({
  83. type: types.SET,
  84. set: classTokens[0],
  85. not: not,
  86. });
  87. break;
  88. // Class of any character except \n.
  89. case '.':
  90. last.push(sets.anyChar());
  91. break;
  92. // Push group onto stack.
  93. case '(':
  94. // Create group.
  95. var group = {
  96. type: types.GROUP,
  97. stack: [],
  98. remember: true,
  99. };
  100. c = str[i];
  101. // If if this is a special kind of group.
  102. if (c === '?') {
  103. c = str[i + 1];
  104. i += 2;
  105. // Match if followed by.
  106. if (c === '=') {
  107. group.followedBy = true;
  108. // Match if not followed by.
  109. } else if (c === '!') {
  110. group.notFollowedBy = true;
  111. } else if (c !== ':') {
  112. util.error(regexpStr,
  113. 'Invalid group, character \'' + c +
  114. '\' after \'?\' at column ' + (i - 1));
  115. }
  116. group.remember = false;
  117. }
  118. // Insert subgroup into current group stack.
  119. last.push(group);
  120. // Remember the current group for when the group closes.
  121. groupStack.push(lastGroup);
  122. // Make this new group the current group.
  123. lastGroup = group;
  124. last = group.stack;
  125. break;
  126. // Pop group out of stack.
  127. case ')':
  128. if (groupStack.length === 0) {
  129. util.error(regexpStr, 'Unmatched ) at column ' + (i - 1));
  130. }
  131. lastGroup = groupStack.pop();
  132. // Check if this group has a PIPE.
  133. // To get back the correct last stack.
  134. last = lastGroup.options ?
  135. lastGroup.options[lastGroup.options.length - 1] : lastGroup.stack;
  136. break;
  137. // Use pipe character to give more choices.
  138. case '|':
  139. // Create array where options are if this is the first PIPE
  140. // in this clause.
  141. if (!lastGroup.options) {
  142. lastGroup.options = [lastGroup.stack];
  143. delete lastGroup.stack;
  144. }
  145. // Create a new stack and add to options for rest of clause.
  146. var stack = [];
  147. lastGroup.options.push(stack);
  148. last = stack;
  149. break;
  150. // Repetition.
  151. // For every repetition, remove last element from last stack
  152. // then insert back a RANGE object.
  153. // This design is chosen because there could be more than
  154. // one repetition symbols in a regex i.e. `a?+{2,3}`.
  155. case '{':
  156. var rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max;
  157. if (rs !== null) {
  158. if (last.length === 0) {
  159. repeatErr(i);
  160. }
  161. min = parseInt(rs[1], 10);
  162. max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min;
  163. i += rs[0].length;
  164. last.push({
  165. type: types.REPETITION,
  166. min: min,
  167. max: max,
  168. value: last.pop(),
  169. });
  170. } else {
  171. last.push({
  172. type: types.CHAR,
  173. value: 123,
  174. });
  175. }
  176. break;
  177. case '?':
  178. if (last.length === 0) {
  179. repeatErr(i);
  180. }
  181. last.push({
  182. type: types.REPETITION,
  183. min: 0,
  184. max: 1,
  185. value: last.pop(),
  186. });
  187. break;
  188. case '+':
  189. if (last.length === 0) {
  190. repeatErr(i);
  191. }
  192. last.push({
  193. type: types.REPETITION,
  194. min: 1,
  195. max: Infinity,
  196. value: last.pop(),
  197. });
  198. break;
  199. case '*':
  200. if (last.length === 0) {
  201. repeatErr(i);
  202. }
  203. last.push({
  204. type: types.REPETITION,
  205. min: 0,
  206. max: Infinity,
  207. value: last.pop(),
  208. });
  209. break;
  210. // Default is a character that is not `\[](){}?+*^$`.
  211. default:
  212. last.push({
  213. type: types.CHAR,
  214. value: c.charCodeAt(0),
  215. });
  216. }
  217. }
  218. // Check if any groups have not been closed.
  219. if (groupStack.length !== 0) {
  220. util.error(regexpStr, 'Unterminated group');
  221. }
  222. return start;
  223. };
  224. module.exports.types = types;