265 lines
6.9 KiB
Python
265 lines
6.9 KiB
Python
from uc_micro.categories import Cc, Cf, P, Z
|
|
from uc_micro.properties import Any
|
|
|
|
# Regex sources for the Unicode classes imported from ``uc_micro``.
SRC_ANY = Any.REGEX
SRC_CC = Cc.REGEX
SRC_CF = Cf.REGEX
SRC_P = P.REGEX
SRC_Z = Z.REGEX

# Alternation of Z, P and Cc (white spaces + punctuation + control).
# NOTE: SRC_CF (format characters) is not part of this alternation.
SRC_ZPCC = SRC_Z + "|" + SRC_P + "|" + SRC_CC

# Alternation of Z and Cc (white spaces + control).
SRC_ZCC = SRC_Z + "|" + SRC_CC
|
|
|
|
# Experimental: characters that are completely prohibited in links, because
# they can separate a link from the rest of the text.
TEXT_SEPARATORS = "[><\uff5c]"

# All possible word characters (everything except punctuation, spaces and
# controls). Defined through punctuation & spaces (by exclusion) to save space.
# Should be something like \p{\L\N\S\M} (\w but without `_`).
SRC_PSEUDO_LETTER = "".join(
    ("(?:(?!", TEXT_SEPARATORS, "|", SRC_ZPCC, ")", SRC_ANY, ")")
)

# The same as above but without [0-9]
# var SRC_PSEUDO_LETTER_non_d = '(?:(?![0-9]|' + SRC_ZPCC + ')' + SRC_ANY + ')'
|
|
|
|
# =============================================================================
|
|
|
|
# Dotted quad: three captured "octet." groups followed by a final captured
# octet; each octet alternative only accepts values in the range 0-255.
SRC_IP4 = (
    r"(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
    r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
)
|
|
|
|
# Optional "...@" prefix. Prohibit any of "@/[]()" (as well as space/control
# characters) in user/pass to avoid wrong domain fetch.
SRC_AUTH = "".join(("(?:(?:(?!", SRC_ZCC, r"|[@/\[\]()]).)+@)?"))
|
|
|
|
# Optional ":<number>" suffix; the two alternatives together cap the number
# at 65535.
SRC_PORT = (
    r"(?::(?:"
    r"6(?:[0-4]\d{3}|5(?:[0-4]\d{2}|5(?:[0-2]\d|3[0-5])))"  # 60000-65535
    r"|[1-5]?\d{1,4}"  # everything up to 59999
    r"))?"
)
|
|
|
|
# Allow anything in markdown spec, forbid quote (") at the first position
# because emails enclosed in quotes are far more common.
SRC_EMAIL_NAME = (
    r"[\-:&=\+\$,\.a-zA-Z0-9_]"  # first character: quote not allowed
    r'[\-:&=\+\$,\"\.a-zA-Z0-9_]*'  # remaining characters: quote allowed
)

# Label starting with "xn--", followed by 1 to 59 characters out of
# lowercase ASCII letters, digits and "-".
SRC_XN = r"xn--[a-z0-9\-]{1,59}"
|
|
|
|
# More to read about domain names
# http://serverfault.com/questions/638260/

# Allow letters & digits (http://test1)
SRC_DOMAIN_ROOT = "(?:" + SRC_XN + "|" + SRC_PSEUDO_LETTER + "{1,63})"

# One domain label, as an alternation of:
#   - an "xn--" label,
#   - a single pseudo-letter,
#   - a pseudo-letter, up to 61 pseudo-letters or "-", and a closing
#     pseudo-letter (so "-" never appears at either end).
SRC_DOMAIN = "".join(
    (
        "(?:",
        SRC_XN,
        "|(?:",
        SRC_PSEUDO_LETTER,
        ")|(?:",
        SRC_PSEUDO_LETTER,
        "(?:-|",
        SRC_PSEUDO_LETTER,
        "){0,61}",
        SRC_PSEUDO_LETTER,
        "))",
    )
)
|
|
|
|
# Host: zero or more "label." groups followed by a final label.
# Don't need IP check, because digits are already allowed in normal domain
# names, so the following alternative stays disabled:
#   SRC_IP4 + "|" +
SRC_HOST = "(?:(?:(?:(?:" + SRC_DOMAIN + r")\.)*" + SRC_DOMAIN + "))"  # _root

# Fuzzy host without IP: one or more "label." groups followed by the literal
# "%TLDS%" marker (presumably substituted with the TLD list by the caller).
TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + r")\.)+(?:%TLDS%))"

# Fuzzy host: either an IPv4 address or the no-IP fuzzy host template.
TPL_HOST_FUZZY = "(?:" + SRC_IP4 + "|" + TPL_HOST_NO_IP_FUZZY + ")"
|
|
|
|
|
|
# =============================================================================
|
|
|
|
# Rude test fuzzy links by host, for quick deny.
TPL_HOST_FUZZY_TEST = "".join(
    (
        r"localhost|www\.|\.\d{1,3}\.|(?:\.(?:%TLDS%)(?:",
        SRC_ZPCC,
        "|>|$))",
    )
)
|
|
|
|
|
|
def _re_host_terminator(opts):
    """Build the pair of look-aheads appended after a host pattern.

    The result is a positive look-ahead (end of input, a ``TEXT_SEPARATORS``
    character or ``SRC_ZPCC``) followed by a negative look-ahead that rejects
    a dash, ``_``, ``:`` plus digit, ``.-``, and a dot that is not followed
    by end of input or ``SRC_ZPCC``.

    Args:
        opts (dict): options; only ``opts.get("---")`` is read. When truthy,
            the dash is rejected only if it is not followed by ``--``.

    Returns:
        str: regex source of the two look-aheads.
    """
    dash_guard = "-(?!--)|" if opts.get("---") else "-|"

    must_follow = "(?=$|" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")"
    must_not_follow = "".join(
        ("(?!", dash_guard, r"_|:\d|\.-|\.(?!$|", SRC_ZPCC, "))")
    )
    return must_follow + must_not_follow
|
|
|
|
|
|
def _re_src_path(opts):
    """Build the regex source for the optional path/query/fragment part.

    The pattern is an optional group matching either one of ``/``, ``?`` or
    ``#`` followed by one or more of the alternatives listed below, or a
    single ``/``.

    Args:
        opts (dict): options; only ``opts.get("---")`` is read. It selects
            which dash alternative is used.

    Returns:
        str: regex source string.
    """
    if opts.get("---"):
        dashes = "\\-(?!--(?:[^-]|$))(?:-*)"
    else:
        dashes = "\\-+"

    alternatives = (
        # Any char except space/control, text separators and the listed
        # punctuation (those are handled by the alternatives below).
        "(?!" + SRC_ZCC + "|" + TEXT_SEPARATORS + "|[()[\\]{}.,\"'?!\\-;]).",
        # Bracketed runs: [...], (...), {...} (content may be empty).
        "\\[(?:(?!" + SRC_ZCC + "|\\]).)*\\]",
        "\\((?:(?!" + SRC_ZCC + "|[)]).)*\\)",
        "\\{(?:(?!" + SRC_ZCC + "|[}]).)*\\}",
        # Quoted runs: "..." and '...' (content must not be empty).
        '\\"(?:(?!' + SRC_ZCC + '|["]).)+\\"',
        "\\'(?:(?!" + SRC_ZCC + "|[']).)+\\'",
        # Apostrophe followed by a pseudo-letter or a dash.
        "\\'(?=" + SRC_PSEUDO_LETTER + "|[-])",
        # google has many dots in "google search" links (#66, #81).
        # github has ... in commit range links,
        # Restrict to
        # - english
        # - percent-encoded
        # - parts of file path
        # - params separator
        # until more examples found.
        "\\.{2,}[a-zA-Z0-9%/&]",
        # Single dot, not followed by space/control, another dot or the end.
        "\\.(?!" + SRC_ZCC + "|[.]|$)",
        dashes,
        # allow `,,,` in paths
        ",(?!" + SRC_ZCC + "|$)",
        # allow `;` if not followed by a space-like char or the end
        ";(?!" + SRC_ZCC + "|$)",
        # allow `!!!` in paths, but not at the end
        "\\!+(?!" + SRC_ZCC + "|[!]|$)",
        # `?` not followed by space/control, another `?` or the end
        "\\?(?!" + SRC_ZCC + "|[?]|$)",
    )

    return "(?:[/?#](?:" + "|".join(alternatives) + ")+|\\/)?"
|
|
|
|
|
|
def build_re(opts):
    """Build the regex source strings used for link detection.

    Args:
        opts (dict): options. It is passed unchanged to
            ``_re_host_terminator`` and ``_re_src_path``, which only read
            ``opts.get("---")`` to pick their triple-dash-aware variants.

    Returns:
        dict: mapping of pattern name to regex source string. Keys prefixed
        with ``tpl_`` still contain the literal ``%TLDS%`` marker.
    """
    # Both helpers depend only on ``opts``: build each piece once and reuse it.
    host_terminator = _re_host_terminator(opts)
    src_path = _re_src_path(opts)

    src_host_strict = SRC_HOST + host_terminator
    tpl_host_fuzzy_strict = TPL_HOST_FUZZY + host_terminator
    src_host_port_strict = SRC_HOST + SRC_PORT + host_terminator
    tpl_host_port_fuzzy_strict = TPL_HOST_FUZZY + SRC_PORT + host_terminator
    tpl_host_port_no_ip_fuzzy_strict = (
        TPL_HOST_NO_IP_FUZZY + SRC_PORT + host_terminator
    )

    # Group 1: what may precede the address; group 2: the address itself.
    tpl_email_fuzzy = (
        "(^|"
        + TEXT_SEPARATORS
        + '|"|\\(|'
        + SRC_ZCC
        + ")"
        + "("
        + SRC_EMAIL_NAME
        + "@"
        + tpl_host_fuzzy_strict
        + ")"
    )

    # Fuzzy link can't be prepended with .:/\- and non punctuation.
    # but can start with > (markdown blockquote)
    link_prefix = "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + SRC_ZPCC + "))"
    link_open = "((?![$+<=>^`|\uff5c])"

    return {
        "src_Any": SRC_ANY,
        "src_Cc": SRC_CC,
        "src_Cf": SRC_CF,
        "src_Z": SRC_Z,
        "src_P": SRC_P,
        "src_ZPCc": SRC_ZPCC,
        "src_ZCc": SRC_ZCC,
        "src_pseudo_letter": SRC_PSEUDO_LETTER,
        "src_ip4": SRC_IP4,
        "src_auth": SRC_AUTH,
        "src_port": SRC_PORT,
        "src_host_terminator": host_terminator,
        "src_path": src_path,
        "src_email_name": SRC_EMAIL_NAME,
        "src_xn": SRC_XN,
        "src_domain_root": SRC_DOMAIN_ROOT,
        "src_domain": SRC_DOMAIN,
        "src_host": SRC_HOST,
        "tpl_host_fuzzy": TPL_HOST_FUZZY,
        "tpl_host_no_ip_fuzzy": TPL_HOST_NO_IP_FUZZY,
        "src_host_strict": src_host_strict,
        "tpl_host_fuzzy_strict": tpl_host_fuzzy_strict,
        "src_host_port_strict": src_host_port_strict,
        "tpl_host_port_fuzzy_strict": tpl_host_port_fuzzy_strict,
        # Must be the no-IP variant; it was previously mapped to the
        # IP-allowing template by mistake.
        "tpl_host_port_no_ip_fuzzy_strict": tpl_host_port_no_ip_fuzzy_strict,
        # Main rules
        "tpl_host_fuzzy_test": TPL_HOST_FUZZY_TEST,
        "tpl_email_fuzzy": tpl_email_fuzzy,
        "tpl_link_fuzzy": (
            link_prefix + link_open + tpl_host_port_fuzzy_strict + src_path + ")"
        ),
        "tpl_link_no_ip_fuzzy": (
            link_prefix
            + link_open
            + tpl_host_port_no_ip_fuzzy_strict
            + src_path
            + ")"
        ),
    }
|