Skip to content

Commit

Permalink
getDomain function
Browse files Browse the repository at this point in the history
  • Loading branch information
ttyridal committed Nov 26, 2021
1 parent f2311e8 commit 747f5a0
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 0 deletions.
36 changes: 36 additions & 0 deletions ext/webextension/src/lib/getdomain.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import {tldlookup, tldcommon} from './getdomain_lut.js'

/**
 * Reduce a host name to its registrable domain: the public suffix plus one
 * more label (e.g. `shop.amazon.co.uk` -> `amazon.co.uk`).
 *
 * The public suffix is found by walking `tldlookup` label by label, starting
 * at the TLD. `tldcommon` maps a second-level label (such as `co`) to the
 * list of TLDs under which that label forms a public suffix. Any host
 * containing a `blogspot` label is cut one label to the left of it.
 *
 * @param {string} url - Dot-separated host name. No scheme, port or path
 *     stripping is performed here.
 * @returns {string} The registrable domain, or the input unchanged when it
 *     has no label beyond its public suffix.
 */
export function getDomain(url) {
  //TODO decode Punycode-encoded urls (RFC 3492 and RFC 5891)

  // Own-property test, so labels such as "constructor" or "toString" can
  // never match members inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const parts = url.split('.').reverse();
  let res = [];
  let lut = tldlookup;
  let v;

  for (v = 0; v < parts.length; v++) {
    const part = parts[v];
    // A falsy node means the previous label ended the public suffix.
    if (!lut) break;
    if (hasOwn(lut, part)) {
      res.push(part);
      lut = lut[part];
    } else if (hasOwn(lut, '*')) {
      // Wildcard rule: this label belongs to the suffix, nothing deeper does.
      res.push(part);
      lut = null;
    } else if (v === 0) {
      // TLD without any multi-label rule in the table: the TLD itself is the
      // public suffix (the Public Suffix List's implicit "*" rule).
      res.push(part);
      lut = null;
    } else {
      break;
    }
  }
  // One label beyond the public suffix makes the registrable domain.
  if (v < parts.length) {
    res.push(parts[v]);
  }

  if (
    parts.length > 2 &&
    hasOwn(tldcommon, parts[1]) &&
    tldcommon[parts[1]].includes(parts[0]) &&
    res.length < 3
  ) {
    res = parts.slice(0, 3);
  }

  // blogspot hosts are special-cased: always keep "<name>.blogspot.<suffix>".
  v = parts.indexOf('blogspot');
  if (v >= 0) {
    res = parts.slice(0, v + 2);
  }

  return res.reverse().join('.');
}
25 changes: 25 additions & 0 deletions ext/webextension/src/lib/getdomain.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"use strict";
import {jest, it, expect, beforeEach} from '@jest/globals'
import {getDomain} from './getdomain.js'

it('gets the correct domain from url', () => {
  // Each row is [host passed to getDomain, expected registrable domain].
  const cases = [
    ['example.com', 'example.com'],
    ['amazon.com', 'amazon.com'],
    ['show.amazon.com', 'amazon.com'],
    ['amazon.co.uk', 'amazon.co.uk'],
    ['shop.amazon.co.uk', 'amazon.co.uk'],
    ['tyridal.no', 'tyridal.no'],
    ['digi.gitapp.si', 'digi.gitapp.si'],
    ['www.tyridal.no', 'tyridal.no'],
    ['torbjorn.tyridal.no', 'tyridal.no'],
    ['wilson.no.eu.org', 'wilson.no.eu.org'],
    ['xxx.wilson.no.eu.org', 'wilson.no.eu.org'],
    ['weare.org.om', 'weare.org.om'],
    ['rave.weare.org.om', 'weare.org.om'],
    ['rave.blogspot.co.nz', 'rave.blogspot.co.nz'],
    ['rave.blogspot.com', 'rave.blogspot.com'],
    ['xx.rave.blogspot.co.nz', 'rave.blogspot.co.nz'],
    ['xx.rave.blogspot.com', 'rave.blogspot.com'],
    ['blogspot.com', 'blogspot.com'],
  ];

  // Checked in order; the first mismatch fails the test.
  for (const [host, expected] of cases) {
    expect(getDomain(host)).toBe(expected);
  }
});
2 changes: 2 additions & 0 deletions ext/webextension/src/lib/getdomain_lut.js

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions publicsuffixlist/tld.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from __future__ import print_function

from collections import *
import sys
import json

# Counter instance; nothing else in this script reads or updates it.
c = Counter()

# Maps a second-level label (e.g. 'co') to the list of TLDs under which that
# label appears as a two-label public suffix. The lists start out empty and
# are filled while the suffix list is parsed.
common = {label: [] for label in (
    'org', 'com', 'net', 'gov', 'edu', 'co', 'mil', 'ac', 'info',
)}


def tree():
    """Return a nested defaultdict whose missing keys create sub-trees."""
    return defaultdict(tree)


# Every suffix rule not stored in `common`, as a label tree rooted at the TLD.
others = tree()

# Read the public suffix list and sort each multi-label rule into either
# `common` (two-label rules whose first label is a key of `common`) or the
# `others` label tree. The file is opened in a `with` block so the handle is
# closed as soon as parsing finishes.
with open("public_suffix_list.dat") as psl:
    for l in psl:
        l = l.strip()
        # Skip blank lines, '/'-prefixed comment lines and single-label rules.
        if not l or l[0] == '/' or '.' not in l: continue
        ## print(l)
        x = l.split('.')

        if l[0]=='!': continue ## deal with those later

        if 'blogspot' in x: ## special case them.. always x.blogspot....
            pass
        elif len(x) == 2 and x[-2] in common:
            # e.g. "co.uk": file 'uk' under common['co'].
            common[x[-2]].append(x[-1])
        else:
            # Insert the labels TLD-first; indexing the defaultdict tree
            # creates each missing node on the way down.
            x = x[::-1]
            d = others
            for q in x:
                d = d[q]

def walk(d, dst, lvl=0):
    """Copy the nested mapping `d` into `dst`, printing every key on the way.

    Each key is printed indented by `lvl` spaces. A key whose value is
    non-empty becomes a new dict in `dst` and is filled recursively one level
    deeper; a key whose value is empty is stored as 0.
    """
    indent = " " * lvl
    for key, child in d.items():
        print(indent + key)
        if not child:
            dst[key] = 0
            continue
        dst[key] = {}
        walk(child, dst[key], lvl + 1)


# Flatten the defaultdict tree into plain dicts (empty nodes become 0);
# walk() also prints the tree while doing so.
table = {}
walk(others, table)
## print("export const tldlookup =", json.dumps(table),";")
## print("export const tldcommon =", json.dumps(common),";")
# The script stops here, so nothing below this line is executed.
sys.exit(0)
def lookup(url, d):
    """Return the registrable domain of the dot-separated host name `url`.

    `d` is the nested suffix table produced by walk(): dicts keyed by label,
    starting at the TLD, with falsy leaves. The module-level `common` mapping
    is consulted for two-label suffixes such as "co.uk". The result is the
    public suffix plus one more label; a host that has no label beyond its
    suffix is returned unchanged.
    """
    k = url.split('.')[::-1]
    p = []
    for i, e in enumerate(k):
        if d and e in d:
            d = d[e]
        elif (d and '*' in d) or i == 0:
            # Wildcard rule, or a TLD with no multi-label rule in the table
            # (the TLD alone is then the public suffix). Either way this
            # label is the last one of the suffix.
            d = None
        else:
            # First label beyond the suffix: it completes the domain.
            p.append(e)
            break
        p.append(e)

    if len(k)>2 and k[1] in common and k[0] in common[k[1]] and len(p) < 3:
        p = k[:3]

    # blogspot hosts: keep one label to the left of "blogspot" when that
    # yields a longer result than the table lookup did.
    try:
        p1 = k[:k.index('blogspot')+2]
        if len(p1) > len(p):
            p = p1
    except ValueError:
        # No 'blogspot' label in this host.
        pass

    return ".".join(p[::-1])

# Sample hosts used to eyeball lookup(); each is printed as "host -> result".
SAMPLE_HOSTS = (
    'example.com',
    'amazon.com',
    'show.amazon.com',
    'amazon.co.uk',
    'shop.amazon.co.uk',
    'tyridal.no',
    'digi.gitapp.si',
    'www.tyridal.no',
    'torbjorn.tyridal.no',
    'wilson.no.eu.org',
    'xxx.wilson.no.eu.org',
    'weare.org.om',
    'rave.weare.org.om',
    'rave.blogspot.co.nz',
    'rave.blogspot.com',
    'xx.rave.blogspot.co.nz',
    'xx.rave.blogspot.com',
    'blogspot.com',
)

for test in SAMPLE_HOSTS:
    print("{} -> {}".format(test, lookup(test, table)))

0 comments on commit 747f5a0

Please sign in to comment.