Last active
November 24, 2021 23:59
-
-
Save rusty-snake/a82ffae09e820e053ac486694af777c3 to your computer and use it in GitHub Desktop.
ClearURLs to uBlock Origin converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright © 2021 rusty-snake | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import json | |
import sys | |
# Generated filters listed here are suppressed: print_rules() skips any
# filter string that is a member of this list.
KNOWN_BAD_FILTERS: list[str] = []
def normalize_url_pattern(url_pattern: str) -> str:
    """Reduce a provider ``urlPattern`` regex to a filter-list style pattern.

    The conversion is purely textual: well-known regex fragments are looked
    up literally and substituted, in a fixed order.

    Args:
        url_pattern: The regex source text of a provider's URL pattern.

    Returns:
        The pattern with protocol/subdomain prefixes removed, the TLD regex
        turned into ``.*``, all backslashes dropped and a few hard-coded
        special cases rewritten.
    """
    # (needle, replacement, count), applied strictly in this order.
    # A count of -1 replaces every occurrence.
    substitutions = (
        # No need for protocol and subdomain
        (r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "", 1),
        (r"https?:\/\/([a-z0-9-.]*\.)", "", 1),
        (r"^https?:\/\/", "", 1),
        # adb TLD globbing
        (r"(?:\.[a-z]{2,}){1,}", ".*", 1),
        # Remove backslashes
        ("\\", "", -1),
        # Specific fixups; these match the text *after* backslash removal
        ("(?:accounts.)?", "", 1),
        ("(?:support.)?", "", 1),
        ("(?:yandex.*|ya.ru)", "yandex.*", 1),
    )
    result = url_pattern
    for needle, replacement, count in substitutions:
        result = result.replace(needle, replacement, count)
    return result
def normalize_rule(rule: str) -> str:
    """Rewrite a parameter rule for use inside a ``$removeparam`` filter.

    Args:
        rule: Regex source text of a single rule.

    Returns:
        The rule with the first ``(?:%3F)?`` fragment removed, every
        non-capturing group opener ``(?:`` turned into ``(``, and every
        escaped dollar sign ``\\$`` written as ``\\x24``.
    """
    without_optional_prefix = rule.replace("(?:%3F)?", "", 1)
    with_plain_groups = without_optional_prefix.replace("(?:", "(")
    return with_plain_groups.replace(r"\$", r"\x24")
def normalize_exception(exception: str) -> tuple[str, str]:
    """Classify an exception regex and simplify it where possible.

    A fixed sequence of literal text substitutions tries to turn the regex
    into filter-list syntax. If the result still contains ``(`` or ``[`` the
    attempt is abandoned and the original regex is kept.

    Args:
        exception: Regex source text of one exception.

    Returns:
        A ``(kind, text)`` pair where ``kind`` is one of:

        * ``"regex"``  - ``text`` is the original input with ``(?:`` replaced
          by ``(``.
        * ``"path"``   - the simplified text contains ``/`` or ``?``.
        * ``"domain"`` - anything else; the first ``||`` is stripped.
    """
    # (needle, replacement, count), applied strictly in this order.
    # A count of -1 replaces every occurrence.
    simplifications = (
        (r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "||", 1),
        (r"^https?:\/\/", "||", 1),
        # FIXME: |ws://
        (r"^wss?:\/\/(?:[a-z0-9-]+\.)*?", "|wss://", 1),
        # Placeholder so that the later ".*?" rewrites cannot touch it
        (r"(?:\.[a-z]{2,}){1,}", "TLD_WILDCARD", 1),
        ("=[^/?&]*", "=", -1),
        ("=.*?", "=", -1),
        ("=.", "=", -1),
        ("[^?]*\\?.*?", "*?*", -1),
        ("[^?]+.*?&?", "*?*", -1),
        ("\\?.*?", "?", -1),
        (".*?&?", "*", -1),
        (".*?", "*", -1),
        ("\\", "", -1),
    )
    simplified = exception
    for needle, replacement, count in simplifications:
        simplified = simplified.replace(needle, replacement, count)

    # Leftover group or character-class syntax: fall back to the raw regex.
    if "(" in simplified or "[" in simplified:
        return "regex", exception.replace("(?:", "(")

    # Restoring the placeholder adds neither "/" nor "?", so doing it before
    # the path/domain decision does not change that decision.
    simplified = simplified.replace("TLD_WILDCARD", ".*", 1)
    if "/" in simplified or "?" in simplified:
        return "path", simplified.replace("|wss://zoom.us", "|wss://zoom.us^", 1)
    return "domain", simplified.replace("||", "", 1)
def expand_se(rule: str) -> list[str]:
    """Expand a rule with one simple alternation group into its variants.

    Only the simplest shape is handled: exactly one ``(``, exactly one ``)``
    after it, no backslash anywhere, and no quantifier directly behind the
    group. Everything else is returned unchanged as a one-element list.

    Args:
        rule: Regex source text of a single rule.

    Returns:
        One string per ``|``-separated alternative of the group, each wrapped
        in the fixed text before and after the group; or ``[rule]`` when the
        rule is not expanded.
    """
    # TODO:
    #  1. "foo_(1|2)_(bar|baz)" -> ["foo_1_bar", "foo_2_bar", "foo_1_baz", "foo_2_baz"]
    #  2. "foo_[12]_bar" -> ["foo_1_bar", "foo_2_bar"]
    #  3. "foo_?bar" -> ["foobar", "foo_bar"]
    #  But "foo_[a-z]*_bar" -> ["foo_[a-z]*_bar"]
    #
    # https://stackoverflow.com/questions/20061268/python-regex-string-expansion
    if rule.count("(") != 1 or rule.count(")") != 1 or "\\" in rule:
        return [rule]

    fixed_prefix, _, remains = rule.partition("(")
    variants, closing, fixed_suffix = remains.partition(")")
    if not closing:
        # The only ")" sits before the "(", so there is no group to expand.
        return [rule]
    if fixed_suffix.startswith(("?", "*", "+", "{")):
        # A quantifier applies to the whole group; dropping the parentheses
        # would bind it to the last character only, e.g. "utm(_x)?" would
        # become "utm_x?" and no longer match plain "utm".
        return [rule]

    return [fixed_prefix + variant + fixed_suffix for variant in variants.split("|")]
def is_regex(rule: str) -> bool:
    """Return True if *rule* contains at least one regex metacharacter."""
    metacharacters = frozenset(r".^$*+?{}[]\|()")
    return not metacharacters.isdisjoint(rule)
def print_rules(
    url_pattern: str, rules: list[str], regex_fromat: str, plain_format: str
) -> None:
    """Print one filter per rule, skipping filters listed as known bad.

    Args:
        url_pattern: Normalized URL pattern, available to templates as ``{1}``.
        rules: Rules to render; each is available to templates as ``{0}``.
        regex_fromat: ``str.format`` template used when ``is_regex(rule)``
            is true.
        plain_format: ``str.format`` template used for all other rules.
    """
    for rule in rules:
        if is_regex(rule):
            template = regex_fromat
        else:
            template = plain_format
        rendered = template.format(rule, url_pattern)
        if rendered in KNOWN_BAD_FILTERS:
            continue
        print(rendered)
def main() -> int:
    """Convert the JSON read from stdin into filters printed on stdout.

    The input must be an object with a ``"providers"`` mapping whose values
    carry a ``"urlPattern"`` and, optionally, ``"rules"`` and
    ``"exceptions"`` lists. Providers without rules produce no
    ``$removeparam`` filters; providers without exceptions produce no
    exception filters.

    Returns:
        ``0``, intended to be used as the process exit status.

    Raises:
        ValueError: If ``normalize_exception`` reports an unknown kind.
    """
    data_min_json = json.load(sys.stdin)
    provider_entries = list(data_min_json["providers"].values())

    # TODO: referralMarketing
    # .get(): a provider may omit "rules" entirely instead of giving [].
    providers = {
        provider["urlPattern"]: provider["rules"]
        for provider in provider_entries
        if provider.get("rules")
    }

    # TODO:
    #  - URL encoded
    #    $removeparam=%24deep_link,domain=reddit.com
    #  - Better is_regex
    #    $removeparam=/^p\[\]=/,domain=flipkart.com
    for raw_url_pattern, raw_rules in providers.items():
        url_pattern = normalize_url_pattern(raw_url_pattern)
        rules = [
            expanded
            for raw_rule in raw_rules
            for expanded in expand_se(normalize_rule(raw_rule))
        ]
        if url_pattern == ".*":
            # Matches everything: emit filters without any URL restriction.
            print_rules(url_pattern, rules, "$removeparam=/^{0}=/", "$removeparam={0}")
        elif "/" in url_pattern:
            # Pattern includes a path, so it cannot go into domain=.
            print_rules(
                url_pattern, rules, "||{1}$removeparam=/^{0}=/", "||{1}$removeparam={0}"
            )
        else:
            print_rules(
                url_pattern,
                rules,
                "$removeparam=/^{0}=/,domain={1}",
                "$removeparam={0},domain={1}",
            )

    # .get(): a provider may omit "exceptions" entirely instead of giving [].
    exceptions = [
        exception
        for provider in provider_entries
        for exception in provider.get("exceptions", ())
    ]
    for raw_exception in exceptions:
        kind, exception = normalize_exception(raw_exception.replace("\\\\", "\\"))
        if kind == "regex":
            print(f"@@/{exception}/$removeparam")
        elif kind == "path":
            print(f"@@{exception}$removeparam")
        elif kind == "domain":
            print(f"@@$removeparam,domain={exception}")
        else:
            raise ValueError(f"unknown exception kind: {kind!r}")
    return 0
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright © 2021 rusty-snake | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import json | |
import sys | |
def normalize_url_pattern(url_pattern: str) -> str:
    """Turn a provider ``urlPattern`` regex into a ``||``-anchored prefix.

    The conversion is purely textual: well-known regex fragments are looked
    up literally and substituted, in a fixed order.

    Args:
        url_pattern: The regex source text of a provider's URL pattern.

    Returns:
        An empty string when the normalized pattern is exactly ``.*``;
        otherwise the normalized pattern prefixed with ``||``.
    """
    # (needle, replacement, count), applied strictly in this order.
    # A count of -1 replaces every occurrence.
    substitutions = (
        # No need for protocol and subdomain
        (r"^https?:\/\/(?:[a-z0-9-]+\.)*?", "", 1),
        (r"https?:\/\/([a-z0-9-.]*\.)", "", 1),
        (r"^https?:\/\/", "", 1),
        # filterlist tld globbing
        (r"(?:\.[a-z]{2,}){1,}", ".", 1),
        # Remove backslashes
        ("\\", "", -1),
        # Specific fixups; these match the text *after* backslash removal
        ("(?:accounts.)?", "", 1),
        ("(?:support.)?", "", 1),
        ("(?:yandex.|ya.ru)", "yandex.", 1),
    )
    normalized = url_pattern
    for needle, replacement, count in substitutions:
        normalized = normalized.replace(needle, replacement, count)

    if normalized == ".*":
        return ""
    return f"||{normalized}"
def is_regex(rule: str) -> bool:
    """Return True if *rule* contains at least one regex metacharacter.

    Every character is inspected, including the first one, and an empty rule
    is simply not a regex. A backslash counts as a metacharacter, so a rule
    containing an escape sequence (for example ``\\x24``) is reported as a
    regex.

    Args:
        rule: Source text of a single rule.

    Returns:
        True when any character of ``rule`` is one of ``.^$*+?{}[]\\|()``.
    """
    metacharacters = r".^$*+?{}[]\|()"
    return any(char in metacharacters for char in rule)
def print_rules(
    filter_template: str,
    exceptions_template: str,
    url_pattern: str,
    rules: str,
    exceptions: str,
):
    """Print the filter for one provider and, if any, its exception filter.

    Args:
        filter_template: ``str.format`` template receiving ``url_pattern``
            as ``{0}`` and ``rules`` as ``{1}``.
        exceptions_template: ``str.format`` template receiving
            ``exceptions`` as ``{0}`` and ``rules`` as ``{1}``.
        url_pattern: Already normalized URL pattern.
        rules: The provider's rules as one string.
        exceptions: The provider's exceptions as one string; when empty, no
            exception line is printed.
    """
    print(filter_template.format(url_pattern, rules))
    if not exceptions:
        return
    print(exceptions_template.format(exceptions, rules))
def main() -> int:
    """Convert the JSON read from stdin into filters printed on stdout.

    The input must be an object with a ``"providers"`` mapping whose values
    carry a ``"urlPattern"`` and, optionally, ``"rules"`` and
    ``"exceptions"`` lists. Providers without rules are skipped. For every
    remaining provider all rules are combined into a single filter, followed
    by a single exception filter when the provider has exceptions.

    Returns:
        ``0``, intended to be used as the process exit status.
    """
    data_min_json = json.load(sys.stdin)
    # .get(): a provider may omit "rules"/"exceptions" instead of giving [].
    providers = {
        provider["urlPattern"]: {
            "rules": provider["rules"],
            "exceptions": provider.get("exceptions", []),
        }
        for provider in data_min_json["providers"].values()
        if provider.get("rules")
    }

    for raw_url_pattern, provider in providers.items():
        url_pattern = normalize_url_pattern(raw_url_pattern)
        # Escape the comma, and write an escaped "$" as \x24.
        rules = [
            rule.replace(",", r"\,").replace(r"\$", r"\x24")
            for rule in provider["rules"]
        ]
        if len(rules) == 1 and not is_regex(rules[0]):
            # A single literal parameter name needs no regex wrapper.
            filter_template = "{0}$removeparam={1}"
            exceptions_template = "@@/({0})/$removeparam={1}"
        else:
            filter_template = "{0}$removeparam=/^({1})=/,all"
            exceptions_template = "@@/({0})/$removeparam=/^({1})=/,all"
        print_rules(
            filter_template,
            exceptions_template,
            url_pattern,
            "|".join(rules),
            "|".join(provider["exceptions"]),
        )
    return 0
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
OMG I have been waiting for this. Thx!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A modified version with an automatically generated list can be found at
https://github.com/DandelionSprout/adfilt/tree/master/ClearURLs%20for%20uBo.