// re.js
  1. 'use strict';
  2. module.exports = function (opts) {
  3. var re = {};
  4. // Use direct extract instead of `regenerate` to reduse browserified size
  5. re.src_Any = require('uc.micro/properties/Any/regex').source;
  6. re.src_Cc = require('uc.micro/categories/Cc/regex').source;
  7. re.src_Z = require('uc.micro/categories/Z/regex').source;
  8. re.src_P = require('uc.micro/categories/P/regex').source;
  9. // \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
  10. re.src_ZPCc = [ re.src_Z, re.src_P, re.src_Cc ].join('|');
  11. // \p{\Z\Cc} (white spaces + control)
  12. re.src_ZCc = [ re.src_Z, re.src_Cc ].join('|');
  13. // Experimental. List of chars, completely prohibited in links
  14. // because can separate it from other part of text
  15. var text_separators = '[><\uff5c]';
  16. // All possible word characters (everything without punctuation, spaces & controls)
  17. // Defined via punctuation & spaces to save space
  18. // Should be something like \p{\L\N\S\M} (\w but without `_`)
  19. re.src_pseudo_letter = '(?:(?!' + text_separators + '|' + re.src_ZPCc + ')' + re.src_Any + ')';
  20. // The same as abothe but without [0-9]
  21. // var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')';
  22. ////////////////////////////////////////////////////////////////////////////////
  23. re.src_ip4 =
  24. '(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
  25. // Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
  26. re.src_auth = '(?:(?:(?!' + re.src_ZCc + '|[@/\\[\\]()]).)+@)?';
  27. re.src_port =
  28. '(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?';
  29. re.src_host_terminator =
  30. '(?=$|' + text_separators + '|' + re.src_ZPCc + ')(?!-|_|:\\d|\\.-|\\.(?!$|' + re.src_ZPCc + '))';
  31. re.src_path =
  32. '(?:' +
  33. '[/?#]' +
  34. '(?:' +
  35. '(?!' + re.src_ZCc + '|' + text_separators + '|[()[\\]{}.,"\'?!\\-]).|' +
  36. '\\[(?:(?!' + re.src_ZCc + '|\\]).)*\\]|' +
  37. '\\((?:(?!' + re.src_ZCc + '|[)]).)*\\)|' +
  38. '\\{(?:(?!' + re.src_ZCc + '|[}]).)*\\}|' +
  39. '\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' +
  40. "\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" +
  41. "\\'(?=" + re.src_pseudo_letter + '|[-]).|' + // allow `I'm_king` if no pair found
  42. '\\.{2,3}[a-zA-Z0-9%/]|' + // github has ... in commit range links. Restrict to
  43. // - english
  44. // - percent-encoded
  45. // - parts of file path
  46. // until more examples found.
  47. '\\.(?!' + re.src_ZCc + '|[.]).|' +
  48. (opts && opts['---'] ?
  49. '\\-(?!--(?:[^-]|$))(?:-*)|' // `---` => long dash, terminate
  50. :
  51. '\\-+|'
  52. ) +
  53. '\\,(?!' + re.src_ZCc + ').|' + // allow `,,,` in paths
  54. '\\!(?!' + re.src_ZCc + '|[!]).|' +
  55. '\\?(?!' + re.src_ZCc + '|[?]).' +
  56. ')+' +
  57. '|\\/' +
  58. ')?';
  59. re.src_email_name =
  60. '[\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]+';
  61. re.src_xn =
  62. 'xn--[a-z0-9\\-]{1,59}';
  63. // More to read about domain names
  64. // http://serverfault.com/questions/638260/
  65. re.src_domain_root =
  66. // Allow letters & digits (http://test1)
  67. '(?:' +
  68. re.src_xn +
  69. '|' +
  70. re.src_pseudo_letter + '{1,63}' +
  71. ')';
  72. re.src_domain =
  73. '(?:' +
  74. re.src_xn +
  75. '|' +
  76. '(?:' + re.src_pseudo_letter + ')' +
  77. '|' +
  78. // don't allow `--` in domain names, because:
  79. // - that can conflict with markdown &mdash; / &ndash;
  80. // - nobody use those anyway
  81. '(?:' + re.src_pseudo_letter + '(?:-(?!-)|' + re.src_pseudo_letter + '){0,61}' + re.src_pseudo_letter + ')' +
  82. ')';
  83. re.src_host =
  84. '(?:' +
  85. // Don't need IP check, because digits are already allowed in normal domain names
  86. // src_ip4 +
  87. // '|' +
  88. '(?:(?:(?:' + re.src_domain + ')\\.)*' + re.src_domain/*_root*/ + ')' +
  89. ')';
  90. re.tpl_host_fuzzy =
  91. '(?:' +
  92. re.src_ip4 +
  93. '|' +
  94. '(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))' +
  95. ')';
  96. re.tpl_host_no_ip_fuzzy =
  97. '(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))';
  98. re.src_host_strict =
  99. re.src_host + re.src_host_terminator;
  100. re.tpl_host_fuzzy_strict =
  101. re.tpl_host_fuzzy + re.src_host_terminator;
  102. re.src_host_port_strict =
  103. re.src_host + re.src_port + re.src_host_terminator;
  104. re.tpl_host_port_fuzzy_strict =
  105. re.tpl_host_fuzzy + re.src_port + re.src_host_terminator;
  106. re.tpl_host_port_no_ip_fuzzy_strict =
  107. re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator;
  108. ////////////////////////////////////////////////////////////////////////////////
  109. // Main rules
  110. // Rude test fuzzy links by host, for quick deny
  111. re.tpl_host_fuzzy_test =
  112. 'localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:' + re.src_ZPCc + '|>|$))';
  113. re.tpl_email_fuzzy =
  114. '(^|' + text_separators + '|\\(|' + re.src_ZCc + ')(' + re.src_email_name + '@' + re.tpl_host_fuzzy_strict + ')';
  115. re.tpl_link_fuzzy =
  116. // Fuzzy link can't be prepended with .:/\- and non punctuation.
  117. // but can start with > (markdown blockquote)
  118. '(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
  119. '((?![$+<=>^`|\uff5c])' + re.tpl_host_port_fuzzy_strict + re.src_path + ')';
  120. re.tpl_link_no_ip_fuzzy =
  121. // Fuzzy link can't be prepended with .:/\- and non punctuation.
  122. // but can start with > (markdown blockquote)
  123. '(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
  124. '((?![$+<=>^`|\uff5c])' + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ')';
  125. return re;
  126. };