replicant-frameworks_native/common/tools/make-iana-tld-pattern.py

145 lines
3.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python
try:
    from urllib2 import urlopen
except ImportError:
    # Python 3 moved urlopen into urllib.request.
    from urllib.request import urlopen
# Java source emitted ahead of the generated TOP_LEVEL_DOMAIN_STR alternation:
# a Javadoc comment followed by the constant's declaration up to the '='.
# NOTE: the "List accurate as of" date inside the text is hard-coded; it is
# not refreshed when the script is re-run.
TLD_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last Java string literal and terminates the statement.
TLD_SUFFIX = '";'
# Same role as TLD_PREFIX, but for the WEB_URL variant of the constant. The
# declaration is followed by a first string literal that opens a
# non-capturing group, "(?:".
URL_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
"(?:"
"""
# Only the statement terminator: for the WEB_URL variant makePattern itself
# appends the closing '))"' before this suffix.
URL_SUFFIX = ';'
class Bucket:
    """All top-level domains that share one leading character.

    Two-character entries are stored compactly as their second character in
    ``letters`` (so they can be rendered as a character class such as
    ``a[cd]``); every other entry is kept verbatim in ``words``.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for names starting with ``baseLetter``."""
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one fragment of the generated Java source.

        Args:
            isWebUrl: emit the WEB_URL flavour, which uses non-capturing
                groups ``(?:...)`` and always starts the fragment with ``+``.
            isFirst: this is the first fragment of the pattern, so no leading
                ``|`` is emitted (and, for the plain flavour, the outer ``(``
                is opened here).
            isLast: this is the final fragment; the Java string literal is
                left open and no newline is added so the caller can append
                the closing text on the same line.

        Returns:
            The fragment, or ``''`` when the bucket holds no entries.

        Side effects:
            ``words`` and ``letters`` are sorted in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # '\\\\' is two backslash characters: one survives Java string-literal
        # unescaping, leaving the regex escape "\-" in the compiled pattern.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            # A single two-letter name is spelled out rather than wrapped
            # in a one-element character class.
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            alternatives.append('%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)
        if self.words:
            # A group is only emitted when there is at least one long name.
            body = ('(?:' if isWebUrl else '(') + body + ')'

        pieces = [' ', opener, body]
        if not isLast:
            pieces.append('"\n')
        return ''.join(pieces)

    def add(self, line):
        """Record one already-stripped line of the TLD list.

        Empty lines and lines starting with ``#`` are ignored. The caller is
        expected to pass only names whose first character is this bucket's
        base letter; that is not checked here.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of ``line``.

    ``buckets`` is a dict mapping a single character to its Bucket. When no
    bucket is stored for that character yet, a new one is created, stored in
    ``buckets`` and returned. ``line`` must be non-empty, since ``line[0]``
    is used as the key.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Assemble the Java constant for all buckets 'a'..'z' and print it.

    Args:
        prefix: Java text emitted before the generated fragments.
        suffix: Java text emitted after the closing parenthesis/es.
        buckets: dict of first character -> Bucket. A bucket is created (via
            getBucket) for every letter 'a'..'z' that is missing.
        isWebUrl: generate the WEB_URL flavour (see Bucket.dump).

    The result is written to standard output; nothing is returned.
    """
    # Look up every letter, so the dict ends up with an entry for each of
    # 'a'..'z' exactly as before.
    ordered = [getBucket(buckets, chr(code))
               for code in range(ord('a'), ord('z') + 1)]

    # The "first" and "last" fragments are the first and last buckets that
    # actually hold entries. Pinning them to 'a' and 'z' would drop the opening
    # text or leave the string literal closed too early if one were empty.
    populated = [bucket for bucket in ordered if bucket.words or bucket.letters]
    lastIndex = len(populated) - 1

    parts = [prefix]
    for index, bucket in enumerate(populated):
        parts.append(bucket.dump(isWebUrl=isWebUrl,
                                 isFirst=(index == 0),
                                 isLast=(index == lastIndex)))

    if isWebUrl:
        # One ')' closes the "(?:" opened in the prefix, then the literal is
        # closed. NOTE(review): the second ')' presumably closes a group opened
        # by the pattern that embeds this constant -- confirm against the
        # consumer.
        parts.append('))"')
    else:
        # Closes the '(' opened by the first fragment; the suffix supplies
        # the closing quote.
        parts.append(')')

    parts.append(suffix)

    # Function-call form works on both Python 2 and Python 3.
    print(''.join(parts))
if __name__ == "__main__":
    # Fetch the current TLD list from IANA.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even if the read fails part-way through.
        f.close()

    buckets = {}

    for domain in domains:
        if not isinstance(domain, str):
            # On Python 3 urlopen yields bytes. The list is expected to be
            # plain ASCII, so fail loudly if it is not.
            domain = domain.decode('ascii')

        # Strip before picking the bucket so blank or indented lines cannot
        # create a bucket keyed on a whitespace character.
        domain = domain.strip().lower()

        if domain:
            # Lines starting with '#' are ignored by Bucket.add.
            getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)