Skip to content

Commit

Permalink
Canonicalize target of archive.org Wayback URLs
Browse files: browse the repository at this point in the history
  • Loading branch information
jstrieb committed Jan 5, 2023
1 parent 2ac55d0 commit de8e806
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 4 deletions.
6 changes: 6 additions & 0 deletions bloom-wrap.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ function canonicalizeUrl(rawUrl) {
.filter(p => p[0].startsWith("utm_"))
.forEach(p => url.searchParams.delete(p[0]));

// Use original URL for archive.org links
if (url.host === "web.archive.org" && url.pathname.startsWith("/web")) {
const new_url = url.pathname.replace(/\/web\/[^\/]*\//, "");
return canonicalizeUrl(new_url);
}

// Truncate index.html, index.php, and trailing slashes
if (url.pathname.endsWith("index.html")) {
url.pathname = url.pathname.slice(0, -"index.html".length);
Expand Down
19 changes: 16 additions & 3 deletions canonicalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import csv
import json
import re
import sys
import urllib.parse as urlparse

Expand Down Expand Up @@ -48,6 +49,7 @@ class URL(object):
"at_xt",
"_r",
]
archiveRegex = re.compile(r"/web/[^/]*/")

def __init__(self, url):
parsed = urlparse.urlsplit(url)
Expand All @@ -72,19 +74,29 @@ def queryStr(self):
def queryStr(self, value):
self.query = urlparse.parse_qs(value, keep_blank_values=True)

def canonicalize(self):
@classmethod
def canonicalize(cls, url):
"""
Transform the current URL object to make it as "canonical" as possible.
This includes removing unnecessary URL parameters, removing "www." from
the beginning of URLs, stripping unnecessary parts of the path, and
performing a few domain-specific adjustments.
Return a canonicalized URL object.
NOTE: The order in which the transformations take place is subtly
important. Do not change the order around without good reason.
NOTE: Any canonicalization changes made here *MUST* be reflected in the
`canonicalizeUrl` function within the `bloom-wrap.js` file!
"""
self = cls(url)

# Use the original URL for archive.org links
if self.netloc == "web.archive.org" and self.path.startswith("/web"):
new_url = URL.archiveRegex.sub("", self.path)
return cls.canonicalize(new_url)

# HTML files almost exclusively use URL parameters for tracking while
# the underlying page remains the same
if self.path.endswith(".html"):
Expand Down Expand Up @@ -126,6 +138,8 @@ def canonicalize(self):
if self.netloc == "en.m.wikipedia.org":
self.netloc = "en.wikipedia.org"

return self


###############################################################################
# Main function
Expand All @@ -134,8 +148,7 @@ def canonicalize(self):
def main():
csvReader = csv.DictReader(sys.stdin)
for entry in csvReader:
url = URL(entry["url"])
url.canonicalize()
url = URL.canonicalize(entry["url"])
print(url)


Expand Down
2 changes: 1 addition & 1 deletion manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"name": "Hacker News Discussion Button",
"description": "Links to the Hacker News discussion for the current page. Preserves privacy.",
"version": "0.6",
"version": "0.7.0",
"author": "Jacob Strieb",
"homepage_url": "https://github.com/jstrieb/hackernews-button",
"icons": {
Expand Down

0 comments on commit de8e806

Please sign in to comment.