Skip to content

Commit

Permalink
Migrate utilities/dead_links to PDM (#4623)
Browse files Browse the repository at this point in the history
* Migrate utilities/dead_links to PDM

Signed-off-by: Olga Bulat <[email protected]>

* Refactor the script to make host and port configurable

Signed-off-by: Olga Bulat <[email protected]>

---------

Signed-off-by: Olga Bulat <[email protected]>
  • Loading branch information
obulat authored Jul 19, 2024
1 parent 6198601 commit f6cdfe1
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 98 deletions.
13 changes: 0 additions & 13 deletions utilities/dead_links/Pipfile

This file was deleted.

54 changes: 0 additions & 54 deletions utilities/dead_links/Pipfile.lock

This file was deleted.

91 changes: 60 additions & 31 deletions utilities/dead_links/dead_link_tally.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,12 @@
from collections import defaultdict
from urllib.parse import urlparse

import click
from redis import Redis
from tqdm import tqdm


redis = Redis("localhost", decode_responses=True)

cursor = 0

tallies = defaultdict(dict)
errors = dict()


def handle_matches(matches):
def handle_matches(redis, matches, tallies, errors):
values = redis.mget(matches)
for value, match in zip(values, matches):
try:
Expand All @@ -45,26 +38,62 @@ def handle_matches(matches):
errors[value] = e


total_to_process = redis.eval("return #redis.pcall('keys', 'valid:*')", 0)

with tqdm(total=total_to_process, miniters=10) as pbar:
cursor, matches = redis.scan(cursor=0, match="valid:*", count=250)
handle_matches(matches)
pbar.update(len(matches))
iter_count = 1

while cursor != 0:
cursor, matches = redis.scan(cursor=cursor, match="valid:*", count=250)
handle_matches(matches)
@click.command()
@click.option(
"--host",
help="Redis host to connect to",
type=str,
default="localhost",
)
@click.option(
"--port",
help="Port to connect to",
type=int,
default=None,
show_default=True,
)
def main(host: str, port: int | None):
port_str = f":{port}" if port is not None else ""
click.echo(f"Connecting to Redis cluster at {host}{port_str}")

redis_params = {"host": host, "decode_responses": True}
if port is not None:
redis_params["port"] = port

redis = Redis(**redis_params)
try:
redis.ping()
except Exception as e:
click.echo(f"Error connecting to Redis: {e}")
return

tallies = defaultdict(dict)
errors = dict()

total_to_process = redis.eval("return #redis.pcall('keys', 'valid:*')", 0)

with tqdm(total=total_to_process, miniters=10) as pbar:
cursor, matches = redis.scan(cursor=0, match="valid:*", count=250)
handle_matches(redis, matches, tallies, errors)
pbar.update(len(matches))
iter_count += 1
if iter_count % 10 == 0:
# only print every 10 iterations to reduce time spent on I/O
pprint.pprint(dict(cursor=cursor, **tallies), compact=True)


print("\n\n\n\n============= FINAL RESULTS ============= \n\n")
pprint.pprint(tallies)

print("\n\n\n==================== ERRORS ===============\n\n")
pprint.pprint(errors)
iter_count = 1

while cursor != 0:
cursor, matches = redis.scan(cursor=cursor, match="valid:*", count=250)
handle_matches(redis, matches, tallies, errors)
pbar.update(len(matches))
iter_count += 1
if iter_count % 10 == 0:
# only print every 10 iterations to reduce time spent on I/O
tqdm.write(
pprint.pformat(dict(cursor=cursor, **tallies), compact=True) + "\n"
)
print("\n\n\n\n============= FINAL RESULTS ============= \n\n")
pprint.pprint(tallies)

print("\n\n\n==================== ERRORS ===============\n\n")
pprint.pprint(errors)


if __name__ == "__main__":
main()
59 changes: 59 additions & 0 deletions utilities/dead_links/pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions utilities/dead_links/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[project]
name = "dead_links"
version = "0.0.0"
description = "Retrieves dead link tallies from Openverse API Redis"
authors = [
{name = "Openverse Contributors", email = "[email protected]"},
]
dependencies = [
"redis >= 5.0",
"tqdm >= 4.66",
"click >=8.1.7",
]
requires-python = "==3.12.*"

[tool.pdm]
distribution = false

0 comments on commit f6cdfe1

Please sign in to comment.