Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port script to Python 3 #163

Merged
merged 4 commits on
Oct 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 2.1
jobs:
build:
docker:
- image: circleci/python:2.7
- image: circleci/python:3.8
steps:
- checkout
- restore_cache:
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ generates safebrowsing-compatible digest list files to be served by

# Requirements

* python 2.x
* python ≥ 3.6
* (optional) virtualenv and/or virtualenvwrapper

# Run

1. (optional) Make a virtualenv for the project and activate it:

```
virtualenv shavar-list-creation
virtualenv -p python3.8 shavar-list-creation
source shavar-list-creation/bin/activate
```

Expand Down
50 changes: 24 additions & 26 deletions lists2safebrowsing.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/usr/bin/env python

import ConfigParser
import configparser
import hashlib
import json
import os
import re
import requests
import sys
import time
import urllib2
from urllib.parse import quote, unquote
from urllib.request import urlopen

from packaging import version as p_version
from publicsuffixlist import PublicSuffixList
Expand Down Expand Up @@ -58,15 +59,15 @@ def get_list_url(config, section, key):
"""Return the requested list URL (or the default, if it isn't found)"""
try:
url = config.get(section, key)
except ConfigParser.NoOptionError:
except configparser.NoOptionError:
url = config.get("main", "default_disconnect_url")
return url


def load_json_from_url(config, section, key):
url = get_list_url(config, section, key)
try:
loaded_json = json.loads(urllib2.urlopen(url).read())
loaded_json = json.loads(urlopen(url).read())
except Exception as e:
sys.stderr.write("Error loading %s: %s\n" % (url, repr(e)))
sys.exit(-1)
Expand All @@ -91,7 +92,7 @@ def canonicalize(d):
# repeatedly unescape until no more hex encodings
while (1):
_d = d
d = urllib2.unquote(_d)
d = unquote(_d)
# if decoding had no effect, stop
if (d == _d):
break
Expand Down Expand Up @@ -142,7 +143,7 @@ def canonicalize(d):
_url = ""
for i in url:
if (ord(i) <= 32 or ord(i) >= 127 or i == '#' or i == '%'):
_url += urllib2.quote(i)
_url += quote(i)
else:
_url += i

Expand All @@ -164,7 +165,7 @@ def add_domain_to_list(domain, canonicalized_domain, previous_domain,
if psl.publicsuffix(psl_d) == psl_d:
raise ValueError("Domain '%s' is in the public section of the "
"Public Suffix List" % psl_d)
domain_hash = hashlib.sha256(canonicalized_domain)
domain_hash = hashlib.sha256(canonicalized_domain.encode())
if log_file:
log_file.write("[m] %s >> %s\n" % (domain, canonicalized_domain))
log_file.write("[canonicalized] %s\n" % (canonicalized_domain))
Expand Down Expand Up @@ -328,14 +329,14 @@ def write_safebrowsing_blocklist(domains, output_name, log_file, chunk,
previous_domain = canonicalized_domain

# Write safebrowsing-list format header
output_string = "a:%u:32:%s\n" % (chunk, hashdata_bytes)
output_string += ''.join(output)
output_bytes = b"a:%d:32:%d\n" % (chunk, hashdata_bytes)
output_bytes += b''.join(output)
# When testing on shavar-prod-lists no output file is provided
if output_file:
output_file.write(output_string)
output_file.write(output_bytes)

print("Tracking protection(%s): publishing %d items; file size %d" % (
name, publishing, len(output_string)))
print("Tracking protection(%s): publishing %d items; file size %d" %
(name, publishing, len(output_bytes)))
return


Expand All @@ -349,28 +350,25 @@ def process_entitylist(incoming, chunk, output_file, log_file, list_variant):

for name, entity in sorted(incoming.items()):
urls = set()
name = name.encode('utf-8')
for prop in entity['properties']:
for res in entity['resources']:
prop = prop.encode('utf-8')
res = res.encode('utf-8')
if prop == res:
continue
urls.add(canonicalize('%s/?resource=%s' % (prop, res)))
urls = sorted(urls)
for url in urls:
h = hashlib.sha256(url)
h = hashlib.sha256(url.encode())
if log_file:
log_file.write(
"[entity] %s >> (canonicalized) %s, hash %s\n"
% (name, url, h.hexdigest())
)
publishing += 1
hashdata_bytes += 32
output.append(hashlib.sha256(url).digest())
output.append(h.digest())

# Write the data file
output_file.write("a:%u:32:%s\n" % (chunk, hashdata_bytes))
output_file.write(b"a:%d:32:%d\n" % (chunk, hashdata_bytes))
for o in output:
output_file.write(o)

Expand All @@ -391,7 +389,7 @@ def process_plugin_blocklist(incoming, chunk, output_file, log_file,
domains.sort(key=lambda d: d[1])
for domain, canonicalized_domain in domains:
if canonicalized_domain != previous_domain:
h = hashlib.sha256(canonicalized_domain)
h = hashlib.sha256(canonicalized_domain.encode())
if log_file:
log_file.write(
"[plugin-blocklist] %s >> (canonicalized) %s, hash %s\n"
Expand All @@ -400,10 +398,10 @@ def process_plugin_blocklist(incoming, chunk, output_file, log_file,
publishing += 1
hashdata_bytes += 32
previous_domain = canonicalized_domain
output.append(hashlib.sha256(canonicalized_domain).digest())
output.append(h.digest())

# Write the data file
output_file.write("a:%u:32:%s\n" % (chunk, hashdata_bytes))
output_file.write(b"a:%d:32:%d\n" % (chunk, hashdata_bytes))
for o in output:
output_file.write(o)

Expand Down Expand Up @@ -451,7 +449,7 @@ def get_tracker_lists(config, section, chunknum):
"Supported tags: %s\nConfig file tags: %s" %
(ALL_TAGS, desired_tags)
)
except ConfigParser.NoOptionError:
except configparser.NoOptionError:
desired_tags = DEFAULT_DISCONNECT_LIST_TAGS

# Retrieve domains that match filters
Expand Down Expand Up @@ -515,8 +513,8 @@ def get_plugin_lists(config, section, chunknum):
"configuration file is empty. A plugin "
"blocklist URL must be specified." % section)

for line in urllib2.urlopen(blocklist_url).readlines():
line = line.strip()
for line in urlopen(blocklist_url).readlines():
line = line.decode().strip()
# don't add blank lines or comments
if not line or line.startswith('#'):
continue
Expand Down Expand Up @@ -565,7 +563,7 @@ def version_configurations(config, section, version, revert=False):
new_source_url = initial_source_url_value
old_s3_key = versioned_key
new_s3_key = initial_s3_key_value
ver_val = None
ver_val = ""

# change the config
if config.has_option(section, source_url):
Expand Down Expand Up @@ -656,7 +654,7 @@ def start_versioning(config, chunknum, shavar_prod_lists_branches):


def main():
config = ConfigParser.ConfigParser()
config = configparser.ConfigParser()
filename = config.read(["shavar_list_creation.ini"])
if not filename:
sys.stderr.write("Error loading shavar_list_creation.ini\n")
Expand Down
10 changes: 5 additions & 5 deletions publish2cloud.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import ConfigParser
import configparser
import hashlib
import os
import requests
Expand All @@ -22,7 +22,7 @@
)
from packaging import version as p_version

CONFIG = ConfigParser.SafeConfigParser(os.environ)
CONFIG = configparser.ConfigParser(os.environ)
CONFIG.read(['shavar_list_creation.ini'])
try:
REMOTE_SETTINGS_URL = ''
Expand All @@ -46,7 +46,7 @@
)
CLOUDFRONT_USER_ID = os.environ.get('CLOUDFRONT_USER_ID', None)

except ConfigParser.NoOptionError as err:
except configparser.NoOptionError as err:
REMOTE_SETTINGS_URL = ''
REMOTE_SETTINGS_AUTH = None
REMOTE_SETTINGS_BUCKET = ''
Expand All @@ -56,7 +56,7 @@


def chunk_metadata(fp):
header = fp.readline().rstrip('\n')
header = fp.readline().decode().rstrip('\n')
chunktype, chunknum, hash_size, data_len = header.split(':')
return dict(
type=chunktype, num=chunknum, hash_size=hash_size, len=data_len,
Expand Down Expand Up @@ -92,7 +92,7 @@ def put_new_record_remote_settings(config, section, data):

if not rec_resp:
print('Failed to create/update record for %s. Error: %s' %
(data['Name'], rec_resp.content))
(data['Name'], rec_resp.content.decode()))
return rec_resp

attachment_url = record_url + '/attachment'
Expand Down
18 changes: 8 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
boto==2.40.0
publicsuffixlist==0.7.3
requests==2.20.0
trackingprotection_tools==0.4.6
packaging==19.2
setuptools==40.8.0
boto==2.49.0
publicsuffixlist==0.7.5
requests==2.24.0
trackingprotection-tools==0.6.1
packaging==20.4

# test requirements
pytest==4.6.9
pytest-cov==2.10.0
mock==3.0.5
moto==1.3.14
pytest==6.1.1
pytest-cov==2.10.1
moto==1.3.16
46 changes: 23 additions & 23 deletions tests/test_lists2safebrowsing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import ConfigParser
import configparser
import hashlib
import json
import time
from unittest.mock import call, patch, mock_open

import pytest
from mock import call, patch, mock_open
from trackingprotection_tools import DisconnectParser

import lists2safebrowsing as l2s
Expand Down Expand Up @@ -125,16 +125,16 @@
)

TEST_DOMAIN_HASH = (b"q\xd8Q\xbe\x8b#\xad\xd9\xde\xdf\xa7B\x12\xf0D\xa2"
"\xf2\x1d\xcfx\xeaHi\x7f8%\xb5\x99\x83\xc1\x111")
b"\xf2\x1d\xcfx\xeaHi\x7f8%\xb5\x99\x83\xc1\x111")
VERSIONED_TEST_DOMAIN_HASH = (b"C]~\x9e\xfeLL\xba\xf5\x17k!5\xe4t\xc4\xcc"
"\xd2g\x84\x9cJ\xcb\x83;\xf4\x9f`jjYg")
b"\xd2g\x84\x9cJ\xcb\x83;\xf4\x9f`jjYg")
DUMMYTRACKER_DOMAIN_HASH = (b"\xe5\xa9\x07\xc8\xff6r\xa9\xcb\xc8\xf1\xd3"
"\xa2\x11\x0c\\\xbe\x7f\xdb1\xbb^\xdfD\xbcX"
"\xa8\xf1U;#\xe2")
b"\xa2\x11\x0c\\\xbe\x7f\xdb1\xbb^\xdfD\xbcX"
b"\xa8\xf1U;#\xe2")
GOOGLE_DOMAIN_HASH = (b"\xbc\x9a\x8f+o\xff\xd5\x85q\xe1\x88\xbb\x11\x05E"
"\xf8\xfb:\xf5\x1c\xdf\x1acimPZ\x98p\xa8[\xe5")
b"\xf8\xfb:\xf5\x1c\xdf\x1acimPZ\x98p\xa8[\xe5")
EXAMPLE_DOMAIN_HASH = (b"s\xd9\x86\xe0\t\x06_\x18,\x10\xbc\xb6\xa4]\xb3"
"\xd6\xed\xa9I\x8f\x890eJ\xf2e?\x8a\x93\x8c\xd8\x01")
b"\xd6\xed\xa9I\x8f\x890eJ\xf2e?\x8a\x93\x8c\xd8\x01")
DOMAIN_HASHES = (DUMMYTRACKER_DOMAIN_HASH + EXAMPLE_DOMAIN_HASH
+ GOOGLE_DOMAIN_HASH)

Expand Down Expand Up @@ -168,15 +168,15 @@
b"a:%d:32:160\n",
(
(b"\xa0\xbc\xee\xcaR\x0f\xd6\"\x8e\xf6\x7f\xb1Y\x8dM\xa1#\xdd"
"\x0b\x18\nn\xb1\x1d\x02SW\x89\xfc;\xc5\xb3"),
b"\x0b\x18\nn\xb1\x1d\x02SW\x89\xfc;\xc5\xb3"),
(b"}UA\xa3\x89e\xe6\xa0v\x1fA\xa6[\xd5+\xc3\xd9\xfe\x1d\x83\x90"
"\x161*\xa1f\x1e\x9ee\x9cV:"),
b"\x161*\xa1f\x1e\x9ee\x9cV:"),
(b"\xd9`\xdd\xfe\x97\x96\xa3\xfdJ\xa89\x18\xa2Mgd}\x7f\xf2\xd1z"
"\x11\x13\xde(m}V{\xdb \xb2"),
b"\x11\x13\xde(m}V{\xdb \xb2"),
(b"\xf3\xfa\xe4\x8a}\xd8\x8a\xae\xf3\xa0B\xe9\xc8q\xe5\xe1xL"
"\xc3,\x07\x95\x0f;}nK7\x03u\xea\x0e"),
b"\xc3,\x07\x95\x0f;}nK7\x03u\xea\x0e"),
(b"\xa8\xe9\xe3EoF\xdb\xe4\x95Q\xc7\xda8`\xf6C\x93\xd8\xf9"
"\xd9oB\xb5\xae\x86\x92w\"Fuw\xdf"),
b"\xd9oB\xb5\xae\x86\x92w\"Fuw\xdf"),
),
)

Expand Down Expand Up @@ -255,8 +255,8 @@ def chunknum():

@pytest.fixture
def config():
config = ConfigParser.ConfigParser()
config.readfp(open("sample_shavar_list_creation.ini"))
config = configparser.ConfigParser()
config.read_file(open("sample_shavar_list_creation.ini"))
return config


Expand Down Expand Up @@ -302,8 +302,8 @@ def test_get_list_url(config, section, key, expected_url):
def test_load_json_from_url(config):
"""Test loading the JSON entity list from a URL."""
data = json.dumps(TEST_ENTITY_DICT)
with patch("lists2safebrowsing.urllib2.urlopen",
mock_open(read_data=data)) as mocked_open:
with patch("lists2safebrowsing.urlopen",
mock_open(read_data=data.encode())) as mocked_open:
loaded_json = l2s.load_json_from_url(config, "entity-whitelist",
"entity_url")

Expand All @@ -320,7 +320,7 @@ def test_load_json_from_url(config):
def test_load_json_from_url_exception(capsys, config):
"""Test load_json_from_url when opening the URL fails."""
error = Exception
with patch("lists2safebrowsing.urllib2.urlopen", side_effect=error):
with patch("lists2safebrowsing.urlopen", side_effect=error):
with pytest.raises(SystemExit) as e:
l2s.load_json_from_url(config, "entity-whitelist", "entity_url")

Expand Down Expand Up @@ -362,7 +362,7 @@ def test_canonicalize(url, expected):
def _add_domain_to_list(domain, canonicalized_domain, previous_domain,
output):
"""Auxiliary function for add_domain_to_list tests."""
domain_hash = hashlib.sha256(canonicalized_domain.encode("utf-8"))
domain_hash = hashlib.sha256(canonicalized_domain.encode())

with patch("test_lists2safebrowsing.open", mock_open()):
with open("test_blocklist.log", "w") as log_file:
Expand All @@ -377,7 +377,7 @@ def _add_domain_to_list(domain, canonicalized_domain, previous_domain,
def test_add_domain_to_list():
"""Test adding a domain to a blocklist."""
domain = "https://www.host.com"
canonicalized_domain = "www.host.com"
canonicalized_domain = "www.host.com/"
added, domain_hash, log_writes, output = (
_add_domain_to_list(domain, canonicalized_domain, None, [])
)
Expand Down Expand Up @@ -635,8 +635,8 @@ def test_process_list(capsys, chunknum, log, list_type):

def _get_entity_or_plugin_lists(chunknum, config, function, section, data):
"""Auxiliary function for get_entity_lists/get_plugin_lists tests."""
with patch("lists2safebrowsing.urllib2.urlopen",
mock_open(read_data=data)) as mocked_urlopen, \
with patch("lists2safebrowsing.urlopen",
mock_open(read_data=data.encode())) as mocked_urlopen, \
patch("lists2safebrowsing.open", mock_open()) as mocked_open:
output_file, _ = function(config, section, chunknum)

Expand Down Expand Up @@ -761,7 +761,7 @@ def test_get_tracker_lists(config, parser, chunknum, section, domains,
test_domains.append("%s-%s" % (version.replace(".", "-"),
test_domains[0]))
expected_domains = test_domains + sorted(domains)
expected_hashes = [hashlib.sha256(d.encode("utf-8")).digest()
expected_hashes = [hashlib.sha256(d.encode()).digest()
for d in expected_domains]
expected_bytes = hashlib.sha256().digest_size * len(expected_hashes)
expected_header = b"a:%d:32:%d\n" % (chunknum, expected_bytes)
Expand Down
Loading