Source: becke-ch--regex--s0-0-v1--base--pl--lib.js

  1. /*
  2. * Copyright (c) 2017. becke.ch - All Rights Reserved.
  3. * Use is subject to (MIT-style) License terms.
  4. * You may obtain a copy of the License at http://becke.ch/tool/becke-ch--regex--s0-v1/becke-ch--regex--s0-v1--license.txt
  5. */
  6. /**
  7. * Created by raoul-becke--s0-v1 on 08.12.16. A JavaScript Regular Expression library, extending the standard RegExp
  8. * class with missing functionality.
  9. * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
  10. * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp-regular-expression-objects
  11. */
  // Compatibility shim for engines without native "Symbol" support (IE, older Safari and, more generally,
  // WebKit based runtimes such as JavaFX).
  // The well-known symbols are only used below as property keys on Regex.prototype. The fallback therefore
  // supplies one DISTINCT string key per method: with an empty placeholder every Symbol.xyz lookup would
  // yield undefined, so search/split/match/replace would all be stored under the single key "undefined"
  // and overwrite each other.
  if (typeof Symbol === "undefined") {
    Symbol = {
      search: "@@search",
      split: "@@split",
      match: "@@match",
      replace: "@@replace"
    };
  }
  /**
   * This class is an extension of the standard {@linkcode RegExp} class adding missing functionality.
   * For further descriptions see the corresponding overridden methods.
   * <br>
   * For a non-empty pattern the constructor derives the complete grouping structure of the pattern and
   * compiles the rewritten pattern (every part that is needed to compute group positions is wrapped in a
   * group) into the internal {@linkcode RegExp} stored in "regex". The instance copies "flags", "global",
   * "ignoreCase", "multiline", "sticky", "unicode" and "lastIndex" from that internal expression.
   * <br>
   * If "pattern" is a {@linkcode RegExp} and "options" is omitted, the flags of "pattern" are reused (same
   * as the native constructor).
   * @param {string|RegExp} [pattern]
   * @param {string} [options]
   * @throws {SyntaxError} if the pattern or the options are invalid, or if the internally rewritten pattern
   * cannot be compiled.
   * @constructor
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp-regular-expression-objects
   */
  function Regex(pattern, options) {
    var patternInstanceofRegExp = false;
    if (pattern instanceof RegExp) {
      // Do not silently drop the flags of the expression we were given. On engines without a "flags"
      // property this stays undefined, which is the previous behaviour.
      if (typeof options === "undefined") {
        options = pattern.flags;
      }
      pattern = pattern.source;
      patternInstanceofRegExp = true;
    }
    if (pattern) {
      this.regexGroupStructure = getRegexCompleteGroupingStructure(pattern);
      // RegExp#source is already escaped; for a string pattern the escaped form is computed by the parser.
      this.source = patternInstanceofRegExp ? pattern : this.regexGroupStructure[2][0];
      try {
        this.regex = new RegExp(this.regexGroupStructure[0][2], options);
      } catch (e) {
        // Compile the untouched pattern first: if the input itself is invalid the caller gets the error
        // message that refers to what was actually passed in.
        new RegExp(pattern, options);
        // The input is valid but the rewritten pattern is not. Surface that error instead of continuing
        // with an undefined "regex" (which used to fail with an unrelated TypeError right below).
        throw e;
      }
    } else {
      this.regex = new RegExp(pattern, options);
      this.source = this.regex.source;
    }
    this.flags = this.regex.flags;
    this.global = this.regex.global;
    this.ignoreCase = this.regex.ignoreCase;
    this.multiline = this.regex.multiline;
    this.sticky = this.regex.sticky;
    this.unicode = this.regex.unicode;
    this.lastIndex = this.regex.lastIndex;
  }
  // Regex inherits from RegExp.prototype. The accessor properties of RegExp.prototype listed below are
  // shadowed with plain, writable data properties (initial value null) so that the constructor can simply
  // assign the values copied from the wrapped expression.
  Regex.prototype = (function () {
    var shadowedNames = ["flags", "global", "ignoreCase", "multiline", "source", "sticky", "unicode"];
    var descriptors = {};
    for (var n = 0; n < shadowedNames.length; n++) {
      descriptors[shadowedNames[n]] = {
        value: null,
        enumerable: true,
        configurable: true,
        writable: true
      };
    }
    return Object.create(RegExp.prototype, descriptors);
  })();
  Regex.prototype.constructor = Regex;
  /**
   * Renders this expression in the same format as {@linkcode RegExp#toString}, based on the pattern that was
   * initially handed to the {@linkcode Regex} constructor (not the internally rewritten one).<br>
   * Layout: "/" + {@linkcode Regex#source} + "/" + {@linkcode Regex#flags}
   * @return {string}
   */
  Regex.prototype.toString = function () {
    var body = '/' + this.source;
    return body + '/' + this.flags;
  };
  /**
   * Delegates to {@linkcode RegExp#test} of the wrapped expression and afterwards mirrors its "lastIndex"
   * onto this instance (the same bookkeeping {@linkcode Regex#exec} performs), so that
   * {@linkcode Regex#lastIndex} does not go stale for global or sticky expressions.
   * @param {string} [str]
   * @return {boolean}
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/test
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype.test
   */
  Regex.prototype.test = function (str) {
    var matched = this.regex.test(str);
    this.lastIndex = this.regex.lastIndex;
    return matched;
  };
  /**
   * Forwards to {@linkcode RegExp[Symbol.search]} of the wrapped expression.
   * @param {string} str
   * @return {number} whatever the wrapped expression's search reports
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/@@search
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype-@@search
   * @alias Regex.search
   */
  Regex.prototype[Symbol.search] = function (str) {
    var wrapped = this.regex;
    return wrapped[Symbol.search](str);
  };
  /**
   * Forwards to {@linkcode RegExp[Symbol.split]} of the wrapped expression, including the optional "limit"
   * (which previously was accepted but never passed on).
   * @param {string} str
   * @param {number} [limit] maximum number of entries in the returned array
   * @return {string[]}
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/@@split
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype-@@split
   * @alias Regex.split
   */
  Regex.prototype[Symbol.split] = function (str, limit) {
    return this.regex[Symbol.split](str, limit);
  };
  /**
   * Fully indexed exec: based on {@linkcode RegExp#exec}, but instead of a single "index" (the start of the
   * whole match, i.e. group 0) the result carries "index[0..n]" holding the start position of every
   * matching group. Groups that did not participate in the match have the value undefined in both the
   * result and its "index" array.<br>
   * <code>
   * Syntax: <b>(new Regex(pattern, flags)).exec(string) = {string[0..n], index:number[0..n], input:string}</b><br>
   * Example:<br>
   * //Retrieve content and position of: opening-, closing tags and body content for: non-nested html-tags.
   * var pattern = '(&lt;([^ &gt;]+)[^&gt;]*&gt;)([^&lt;]*)(&lt;\\/\\2&gt;)';<br>
   * var str = '&lt;html&gt;&lt;code class="html plain"&gt;first&lt;/code&gt;&lt;div class="content"&gt;second&lt;/div&gt;&lt;/html&gt;';<br>
   * var regex = new Regex(pattern, 'g');<br>
   * var result = regex.exec(str);<br>
   * <br>
   * console.log(5 === result.length);<br>
   * console.log('&lt;code class="html plain"&gt;first&lt;/code&gt;'=== result[0]);<br>
   * console.log('&lt;code class="html plain"&gt;'=== result[1]);<br>
   * console.log('first'=== result[3]);<br>
   * console.log('&lt;/code&gt;'=== result[4]);<br>
   * console.log(5=== result.index.length);<br>
   * console.log(6=== result.index[0]);<br>
   * console.log(6=== result.index[1]);<br>
   * console.log(31=== result.index[3]);<br>
   * console.log(36=== result.index[4]);<br>
   * </code>
   * <br>
   * After the call "lastIndex" of this instance equals "lastIndex" of the wrapped expression.
   * @param {string} [str]
   * @return {Object} {string[0..n], index:number[0..n], input:string} or the (falsy) result of the wrapped
   * exec if there is no match
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype.exec
   */
  Regex.prototype.exec = function (str) {
    var nativeMatch = this.regex.exec(str);
    this.lastIndex = this.regex.lastIndex;
    if (!nativeMatch) {
      return nativeMatch;
    }
    var match = [nativeMatch[0]];
    match.index = [nativeMatch.index];
    match.input = str;
    var structure = this.regexGroupStructure;
    if (!structure || !structure[0][3]) {
      return match;
    }
    // Walks the sibling groups of one nesting level from left to right. Each entry is
    // [index in the rewritten pattern, index in the original pattern, pattern text, children]; the running
    // position advances by the length of whatever each sibling captured.
    (function locate(startPos, groups) {
      var pos = startPos;
      for (var g = 0; g < groups.length; g++) {
        var group = groups[g];
        var captured = nativeMatch[group[0]];
        var userIndex = group[1];
        if (userIndex) {
          // only groups that exist in the original pattern are reported to the caller
          match[userIndex] = captured;
          match.index[userIndex] = (typeof captured === "undefined") ? undefined : pos;
        }
        if (group[3]) {
          // children start where their parent starts
          locate(pos, group[3]);
        }
        if (typeof captured !== "undefined") {
          pos += captured.length;
        }
      }
    })(nativeMatch.index, structure[0][3]);
    return match;
  };
  /**
   * Fully detailed match. Based on {@linkcode RegExp[Symbol.match]}, with two differences in the return
   * value, both a consequence of {@linkcode Regex#exec} being used internally:<br>
   * 1. every match carries the start index of all of its matching groups;<br>
   * 2. the result is always a 2-dimensional array, independent of the global 'g' flag. Without 'g' the
   * first dimension holds exactly one element (the result of one {@linkcode Regex#exec} call); with 'g' it
   * holds one {@linkcode Regex#exec} result per match.<br>
   * Matching always starts at position 0 and "lastIndex" is reset to 0 once the matches are collected.
   * @param [str] {String}
   * @return {Object} Array[{string[0..n], index:number[0..n], input:string}] or null if nothing matches
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/@@match
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype-@@match
   * @alias Regex.match
   */
  Regex.prototype[Symbol.match] = function (str) {
    this.lastIndex = 0;
    this.regex.lastIndex = 0;
    var matches = [];
    for (var current = this.exec(str); current; current = this.global ? this.exec(str) : null) {
      matches.push(current);
      if (current[0].length === 0) {
        // an empty match does not advance the wrapped expression - step over it to avoid looping forever
        this.regex.lastIndex++;
      }
    }
    if (matches.length === 0) {
      return null;
    }
    this.lastIndex = 0;
    this.regex.lastIndex = 0;
    return matches;
  };
  /**
   * Group based search & replace. Based on {@linkcode RegExp[Symbol.replace]} but instead of only being able
   * to provide a single "replacement substring" or "replacement function" for the entire match (aka
   * matching-group[0]) an array [0..n] of "replacement substring" or "replacement function" elements can be
   * provided, one for each matching group [0..n].
   * <br>
   * Important: If a group shall not be replaced, provide undefined or null as the corresponding array
   * element!
   * <br>
   * Important: If a replacement is provided for a parent group then no replacement is performed on its
   * child groups. E.g. a replacement for group 0 replaces the entire match and none of the further groups.
   * <br>
   * <br>
   * <code>
   * Syntax: <b>(new Regex(pattern))[Symbol.replace](string, [array of replacement strings])</b><br>
   * Alternative Syntax: For browsers supporting "Symbol": Chrome & Firefox: string.replace(new Regex(pattern), [array of replacement strings])<br>
   * Example:<br>
   * //Convert plain text to html: Replace special characters (multiple spaces, tabs, ...) in plain text with their html "equivalent":<br>
   * var CONVERT_TEXT_SPECIAL_CHARACTER_TO_HTML_ESCAPE_CHARACTER_PATTERN = "( {2})|(\t)|(&)|(<)|(>)|(\n)";<br>
   * var CONVERT_TEXT_SPECIAL_CHARACTER_TO_HTML_ESCAPE_CHARACTER_PATTERN_REPLACE_STRING = [undefined, "&amp;nbsp;&amp;nbsp;", "&amp;emsp;", "&amp;amp;", "&amp;lt;", "&amp;gt;", "&lt;br&gt;"];<br>
   * var regex = new Regex(CONVERT_TEXT_SPECIAL_CHARACTER_TO_HTML_ESCAPE_CHARACTER_PATTERN, 'g');<br>
   * var result = regex[Symbol.replace](myPlainText,CONVERT_TEXT_SPECIAL_CHARACTER_TO_HTML_ESCAPE_CHARACTER_PATTERN_REPLACE_STRING);<br>
   * //Alternative Syntax: For browsers supporting "Symbol": Chrome & Firefox<br>
   * var resultAlternative = myPlainText.replace(regex,CONVERT_TEXT_SPECIAL_CHARACTER_TO_HTML_ESCAPE_CHARACTER_PATTERN_REPLACE_STRING);
   * </code>
   * <br>
   * <br>
   * Replacement strings understand "$$", "$&", "$`", "$'" and "$n". "$&", "$`" and "$'" are evaluated
   * relative to the group that is being replaced. Special (in addition to the standard): "$0" is supported
   * and stands for the entire match. "$n" for a group that exists but matched the empty string or did not
   * participate in the match yields the empty string; "$n" for a group that does not exist is kept
   * literally.
   * <br>
   * Replacement functions are called with: the text of the group being replaced, then p0 (the entire match),
   * p1, ..., pn, then the start offsets of group 0..n, then the complete input string.
   *
   * @param [str] {String}
   * @param [newSubstringFunctionArray] {String[]|function[]|String|function}
   * @return {string}
   * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/@@replace
   * @see http://www.ecma-international.org/ecma-262/6.0/#sec-regexp.prototype-@@replace
   * @alias Regex.replace
   */
  Regex.prototype[Symbol.replace] = function (str, newSubstringFunctionArray) {
    this.lastIndex = 0;
    this.regex.lastIndex = 0;
    if (!str) {
      return str;
    }
    var resultExec = this.exec(str);
    if (!resultExec) {
      return str;
    }
    if (!(newSubstringFunctionArray instanceof Array)) {
      // a single replacement is the replacement for group 0
      newSubstringFunctionArray = [newSubstringFunctionArray];
    }
    var resultString = '';
    // position in "str" up to which the input has already been copied into "resultString"
    var resultStringPosition = 0;
    // Computes the replacement text for group "newSubstringFunctionIndex" of the current match.
    var computeSubstringFunction = function (newSubstringFunctionIndex) {
      var computedString = '';
      var charAt = '';
      var newSubstringFunction = newSubstringFunctionArray[newSubstringFunctionIndex];
      if (typeof newSubstringFunction === 'string') {
        for (var i = 0; i < newSubstringFunction.length; i++) {
          charAt = newSubstringFunction.charAt(i);
          if (charAt !== '$') {
            computedString += charAt;
            continue;
          }
          i++;
          charAt = newSubstringFunction.charAt(i);
          if (charAt === '$') {
            computedString += '$';
          } else if (charAt === '&') {
            computedString += resultExec[newSubstringFunctionIndex];
          } else if (charAt === '`') {
            computedString += str.substring(0, resultExec.index[newSubstringFunctionIndex]);
          } else if (charAt === "'") {
            computedString += str.substring(resultExec.index[newSubstringFunctionIndex] + resultExec[newSubstringFunctionIndex].length);
          } else if (charAt >= '0' && charAt <= '9') {
            var int = charAt;
            i++;
            charAt = newSubstringFunction.charAt(i);
            while (charAt >= '0' && charAt <= '9') {
              int += charAt;
              i++;
              charAt = newSubstringFunction.charAt(i);
            }
            i--;
            if (Object.prototype.hasOwnProperty.call(resultExec, int)) {
              // The group exists. A group that matched "" or did not participate contributes the empty
              // string (a plain truthiness check would wrongly emit the literal "$n" here).
              if (typeof resultExec[int] !== 'undefined') {
                computedString += resultExec[int];
              }
            } else {
              computedString += '$' + int;
            }
          } else {
            // any other character after "$" has no special meaning: keep both characters
            computedString += '$' + charAt;
          }
        }
      } else if (newSubstringFunction instanceof Function) {
        var args = [resultExec[newSubstringFunctionIndex]];
        for (var j = 0; j < resultExec.length; j++) {
          args.push(resultExec[j]);
        }
        // the loop condition has to test "k" - testing "j" meant the offsets were never passed on
        for (var k = 0; k < resultExec.index.length; k++) {
          args.push(resultExec.index[k]);
        }
        args.push(str);
        computedString += newSubstringFunction.apply(this, args);
      }
      return computedString;
    };
    // Walks the group tree: the first group on a path that has a replacement is replaced, its children are
    // skipped; groups without a replacement hand over to their children.
    var traverseRegexGroupStructure = function (regexGroupStructureChildren) {
      for (var i = 0; i < regexGroupStructureChildren.length; i++) {
        var originalIndex = regexGroupStructureChildren[i][1];
        if (originalIndex) {
          if (newSubstringFunctionArray[originalIndex] || newSubstringFunctionArray[originalIndex] === "") {
            if (resultExec[originalIndex] || resultExec[originalIndex] === "") {
              resultString += str.substring(resultStringPosition, resultExec.index[originalIndex]) + computeSubstringFunction(originalIndex);
              resultStringPosition = resultExec.index[originalIndex] + resultExec[originalIndex].length;
            }
          } else if (regexGroupStructureChildren[i][3]) {
            traverseRegexGroupStructure(regexGroupStructureChildren[i][3]);
          }
        } else {
          traverseRegexGroupStructure(regexGroupStructureChildren[i][3]);
        }
      }
    };
    while (resultExec) {
      if (newSubstringFunctionArray[0] || newSubstringFunctionArray[0] === "") {
        resultString += str.substring(resultStringPosition, resultExec.index[0]) + computeSubstringFunction(0);
        resultStringPosition = resultExec.index[0] + resultExec[0].length;
      } else if (this.regexGroupStructure && this.regexGroupStructure[0][3]) {
        traverseRegexGroupStructure(this.regexGroupStructure[0][3]);
      }
      // 20170415 - avoid infinite loop according to specification!
      if (resultExec[0].length === 0) {
        this.regex.lastIndex++;
      }
      if (!this.global) {
        break;
      }
      resultExec = this.exec(str);
    }
    this.lastIndex = 0;
    this.regex.lastIndex = 0;
    return resultString + str.substring(resultStringPosition, str.length);
  };
  /**
   * Takes the regular expression "regex" as input and puts all the non-grouped regex characters in front of
   * a group opening "(" into additional groups "(xyz)", so that in the end all parts of the expression that
   * are required to calculate the starting position of each group are grouped.
   * Returns an array with three elements:
   * - [0]: the group structure of the rewritten expression:
   *   [index, originalIndex, 'regexForThisGroup', [[child01Index, child01OriginalIndex, 'regexForThisChild01Group', [...]], ...]]
   *   - index: the group index in the rewritten regular expression
   *   - originalIndex: the group index in the initial/original regular expression; "undefined" if the group
   *     was added by the group completion
   *   - regexForThisGroup: the regular expression of this group including its parentheses
   *   - child-array: an array containing the child groups
   * - [1]: indexMap: maps the original-index [0..n] to the actual-index [0..m]
   * - [2]: source: one-element array holding the initial pattern with every slash "/" escaped, as required
   *   for RegExp.source
   *
   * Back-references are renumbered only after the complete expression has been parsed, because the index map
   * can still change while additional surrounding parentheses are being added.
   *
   * Rule for group parsing: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
   * And: http://www.ecma-international.org/ecma-262/6.0/#sec-regexp-regular-expression-objects
   * The capturing groups are numbered according to the order of left parentheses of capturing groups,
   * starting from 1.
   *
   * @param {String} [regex]
   * @returns {*} the three-element array described above; an empty array (plus a console error) if "regex"
   * is empty
   */
  function getRegexCompleteGroupingStructure(regex) {
    if (!regex) {
      console.error('The "regex" is empty! Returning empty array!');
      return [];
    }
    var indexMap = [];
    var source = [''];
    var containsBackReference = [false];
    var rootGroup = getRegexCompleteGroupingStructureInternal(regex, [0, 0, 0], true, indexMap, source, containsBackReference);

    var isDigit = function (c) {
      return c >= '0' && c <= '9';
    };
    // Rewrites the back-reference numbers inside the pattern text of "group" (and, recursively, of all of
    // its children) from original indexes to actual indexes. Character sets are copied verbatim.
    var renumberBackReferences = function (group) {
      var text = group[2];
      var rewritten = '';
      var pos = 0;
      while (pos < text.length) {
        var ch = text.charAt(pos);
        rewritten += ch;
        if (ch === '\\' && pos + 1 < text.length) {
          pos++;
          ch = text.charAt(pos);
          var digits = '';
          while (isDigit(ch)) {
            digits += ch;
            pos++;
            ch = text.charAt(pos);
          }
          if (digits) {
            // "pos" already points at the first character behind the number
            rewritten += indexMap[digits];
          } else {
            // ordinary escape sequence: copy the escaped character untouched
            rewritten += ch;
            pos++;
          }
          continue;
        }
        if (ch === '[' && pos + 1 < text.length) {
          pos++;
          ch = text.charAt(pos);
          // copy up to the closing "]" - a "]" preceded by a single backslash does not close the set
          while (pos < text.length && (ch !== ']' || (text.charAt(pos - 1) === '\\' && text.charAt(pos - 2) !== '\\'))) {
            rewritten += ch;
            pos++;
            ch = text.charAt(pos);
          }
          rewritten += ch;
          pos++;
          continue;
        }
        pos++;
      }
      group[2] = rewritten;
      for (var c = 0; c < group[3].length; c++) {
        renumberBackReferences(group[3][c]);
      }
    };

    if (containsBackReference[0]) {
      renumberBackReferences(rootGroup);
    }
    return [rootGroup, indexMap, source];
  }
  /**
   * Recursive worker of getRegexCompleteGroupingStructure: parses one group level of "regex", starting at
   * the current parsing position, and returns its structure
   * [index, originalIndex, 'regexForThisGroup', [children]]. Parsing of a level ends at its closing ")"
   * or, for the top-most level (group 0), at the end of the expression.
   * @param regex the complete regular expression pattern
   * @param posIndexOrigIndex - this array has 3 values and is shared (and updated) by all recursion levels:
   * [0]:position: while iterating through the regular expression the parsing position is incremented starting at 0
   * [1]:index: the index of the current group we are parsing
   * [2]:original index: the original index the group had before we added additional groups
   * @param isCapturingGroup - tells us whether the characters enclosed within parentheses form a capturing
   * group or not. If not it is a non capturing group (?:xyz) or an assertion "(?=xyz)" / "(?!xyz)".
   * @param indexMap this array maps the original-index [0..n] to the actual-index [0..m]
   * @param source one-element array collecting the initial pattern with every slash "/" escaped, as
   * required for RegExp.source
   * @param containsBackReference one-element array; [0] is set to true as soon as a back-reference to an
   * already known group is found
   * @returns {*} the group structure of the parsed level
   */
  function getRegexCompleteGroupingStructureInternal(regex, posIndexOrigIndex, isCapturingGroup, indexMap, source, containsBackReference) {
    // slots of the shared cursor array
    var POS = 0;
    var IDX = 1;
    var ORIG = 2;
    var cursor = posIndexOrigIndex;

    var isDigit = function (c) {
      return c >= '0' && c <= '9';
    };
    // Consumes the run of digits starting at the cursor; afterwards the cursor points behind the run.
    var readDigits = function () {
      var run = '';
      var c = regex.charAt(cursor[POS]);
      while (isDigit(c) && cursor[POS] < regex.length) {
        run += c;
        cursor[POS]++;
        c = regex.charAt(cursor[POS]);
      }
      return run;
    };

    var node;
    if (isCapturingGroup) {
      node = [cursor[IDX], cursor[ORIG], '', []];
      indexMap[cursor[ORIG]] = cursor[IDX];
    } else {
      node = [undefined, undefined, '', []];
    }
    // characters of this level that have not been attached to the node's pattern text yet
    var pending = '';
    var ch;
    for (; cursor[POS] < regex.length; cursor[POS]++) {
      ch = regex.charAt(cursor[POS]);

      if (ch === '\\') {
        // escape sequence
        if (cursor[POS] + 1 === regex.length) {
          pending += '\\';
          source[0] += '\\';
          continue;
        }
        cursor[POS]++;
        // Back references above \9 behave strange and different in every language. In JavaScript they are
        // treated as back-reference if the group exists, otherwise as character escape sequence.
        var digits = readDigits();
        if (digits) {
          if (indexMap[digits]) {
            // Back-reference to an existing group. The number is kept as is: the index map can still
            // change when surrounding parentheses are added, so the renumbering is done by the caller
            // once parsing has finished.
            pending += '\\' + digits;
            containsBackReference[0] = true;
          } else if (digits.indexOf('8') >= 0 || digits.indexOf('9') >= 0) {
            // contains a non octal digit: treat it as simple number
            pending += digits;
          } else {
            // otherwise it is an (octal) character escape
            pending += '\\' + 'x' + ("0" + (parseInt(digits, 8).toString(16))).slice(-2).toUpperCase();
          }
          source[0] += '\\' + digits;
          // step back onto the last digit, the loop header moves on
          cursor[POS]--;
        } else {
          var escaped = regex.charAt(cursor[POS]);
          pending += '\\' + escaped;
          source[0] += '\\' + escaped;
        }
        continue;
      }

      if (ch === '[') {
        // character set: copied verbatim up to the closing "]"
        pending += '[';
        source[0] += '[';
        if (cursor[POS] + 1 !== regex.length) {
          cursor[POS]++;
          ch = regex.charAt(cursor[POS]);
          while (cursor[POS] < regex.length && (ch !== ']' || (regex.charAt(cursor[POS] - 1) === '\\' && regex.charAt(cursor[POS] - 2) !== '\\'))) {
            pending += ch;
            source[0] += ch;
            cursor[POS]++;
            ch = regex.charAt(cursor[POS]);
          }
          pending += ch;
          source[0] += ch;
        }
        continue;
      }

      if (ch === '|') {
        // alternation: pending characters need no extra group here
        node[2] += pending + '|';
        pending = '';
        source[0] += '|';
        continue;
      }

      if (ch === ')') {
        // end of this group: pending characters are already contained in the parent group
        node[2] += pending + ')';
        source[0] += ')';
        return node;
      }

      if (ch !== '(') {
        // ordinary character
        pending += ch;
        source[0] += (ch === '/') ? '\\' + ch : ch;
        continue;
      }

      // ---- start of a nested group ----
      if (pending) {
        // complete grouping: wrap the characters in front of the group into a group of their own
        cursor[IDX]++;
        pending = '(' + pending + ')';
        node[3].push([cursor[IDX], undefined, pending, []]);
      }
      cursor[POS]++;
      var wrapperIndex = cursor[IDX] + 1;
      var isNonCapturing = false;
      var child;
      var opensSpecialGroup = regex.charAt(cursor[POS]) === '?' &&
        cursor[POS] + 1 < regex.length &&
        '=!:'.indexOf(regex.charAt(cursor[POS] + 1)) >= 0;
      if (opensSpecialGroup) {
        cursor[POS]++;
        var kind = regex.charAt(cursor[POS]);
        cursor[POS]++;
        // Only a non capturing group "(?:" needs the extra parentheses further down. At the end of a
        // lookahead "(?=" / "(?!" the regex engine has not moved on the string, so an assertion does not
        // influence the position of the following groups.
        isNonCapturing = (kind === ':');
        source[0] += '(?' + kind;
        child = getRegexCompleteGroupingStructureInternal(regex, cursor, false, indexMap, source, containsBackReference);
        child[2] = '(?' + kind + child[2];
      } else {
        cursor[IDX]++;
        cursor[ORIG]++;
        source[0] += '(';
        child = getRegexCompleteGroupingStructureInternal(regex, cursor, true, indexMap, source, containsBackReference);
        child[2] = '(' + child[2];
      }

      // 20170327 - a group with a quantifier has to be put into additional parentheses, otherwise only
      // the first group matches!
      var beforeQuantifier = cursor[POS];
      var quantifier = '';
      if (cursor[POS] + 1 < regex.length) {
        var next = regex.charAt(cursor[POS] + 1);
        if ('*+?'.indexOf(next) >= 0) {
          cursor[POS]++;
          quantifier = next;
        } else if (next === '{') {
          // "{n}", "{n,}" or "{n,m}" - anything else is not a quantifier
          cursor[POS] += 2;
          quantifier = '{' + readDigits();
          var terminator = regex.charAt(cursor[POS]);
          if (terminator === ',') {
            cursor[POS]++;
            quantifier += ',' + readDigits();
            terminator = regex.charAt(cursor[POS]);
          }
          quantifier = (terminator === '}') ? quantifier + '}' : '';
        }
        if (quantifier.length > 0) {
          child[2] += quantifier;
          source[0] += quantifier;
          if (regex.charAt(cursor[POS] + 1) === '?') {
            // lazy quantifier
            cursor[POS]++;
            child[2] += '?';
            source[0] += '?';
          }
        } else {
          // nothing (valid) consumed: rewind
          cursor[POS] = beforeQuantifier;
        }
      }

      // 20170327 - a non capturing group has to be put into additional parentheses as well, otherwise we
      // don't know the size of the group!
      if (quantifier.length > 0 || isNonCapturing) {
        incrementRegexGroupStructureIndex(child, indexMap);
        child = [wrapperIndex, undefined, '(' + child[2] + ')', [child]];
        cursor[IDX]++;
      }
      node[2] += pending + child[2];
      node[3].push(child);
      pending = '';
    }
    // we only get here in the top-most iteration i.e. for matching group 0 which has no enclosing parentheses
    node[2] += pending;
    return node;
  }
  /**
   * Shifts the group index of "regexGroupStructure" and of all of its descendants up by one, which is needed
   * after the structure has been wrapped into one additional pair of parentheses. For every shifted group
   * that has an original index, the new index is recorded in "indexMap". Entries without an index are left
   * untouched, their children are still visited.
   * @param regexGroupStructure group structure [index, originalIndex, 'regexForThisGroup', [children]]
   * @param indexMap this array maps the original-index [0..n] to the actual-index [0..m]
   */
  function incrementRegexGroupStructureIndex(regexGroupStructure, indexMap) {
    var hasIndex = Boolean(regexGroupStructure[0]);
    if (hasIndex) {
      regexGroupStructure[0] += 1;
    }
    if (hasIndex && regexGroupStructure[1]) {
      indexMap[regexGroupStructure[1]] = regexGroupStructure[0];
    }
    var children = regexGroupStructure[3];
    for (var c = 0; c < children.length; c++) {
      incrementRegexGroupStructureIndex(children[c], indexMap);
    }
  }
  739. function initialize() {
  740. if (!(typeof module === "undefined")) {
  741. module.exports = Regex;
  742. }
  743. }
  744. initialize();