2010-02-10 19:22:01 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
from urllib2 import urlopen
|
|
|
|
|
|
|
|
# Java source text printed before the generated alternation for the
# TOP_LEVEL_DOMAIN_STR constant.  The string must contain only Java source:
# everything in it is printed verbatim by makePattern().
TLD_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR =
"""

# Closes the Java string literal left open by the last dumped bucket and
# terminates the statement.
TLD_SUFFIX = '";'

# Java source text printed before the generated alternation for the
# TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL constant.  It already opens a
# non-capturing group, which makePattern() closes again.
URL_PREFIX = r"""
/**
* Regular expression to match all IANA top-level domains for WEB_URL.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
"(?:"
"""

# makePattern() emits the closing quote itself for the WEB_URL variant, so
# only the statement terminator is left to add.
URL_SUFFIX = ';'
|
2010-02-10 19:22:01 +00:00
|
|
|
|
|
|
|
class Bucket(object):
    """Entries sharing one base letter, rendered as one regex alternative.

    Attributes:
        base: the letter this bucket was created for.
        words: entries stored whole (every entry that is not exactly two
            characters long).
        letters: for two-character entries, only the second character; it
            is recombined with ``base`` when the bucket is dumped.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a '+'-concatenated string literal.

        Args:
            isWebUrl: when true, groups are written as non-capturing
                ``(?:...)`` and the first line starts with ``+ "`` instead
                of ``"(``.
            isFirst: true for the first emitted line; later lines start
                with ``+ "|`` so they continue the alternation.
            isLast: when true, the closing quote and newline are omitted so
                the caller can append its own closing text.

        Returns:
            The formatted line, or ``''`` when the bucket holds no entries.
            Both entry lists are sorted in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters, which escapes
        # the '-' character in the generated string literal.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            # Several two-character entries collapse into a character class.
            alternatives.append('%c[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # A group is only opened when whole words are present; a bucket
        # holding nothing but two-character entries is emitted bare.
        if self.words:
            if isWebUrl:
                body = '(?:' + body + ')'
            else:
                body = '(' + body + ')'

        output = ' ' + opener + body

        if not isLast:
            output += '"\n'

        return output

    def add(self, line):
        """File one entry into this bucket.

        Empty strings and lines starting with ``#`` are ignored.  For a
        two-character entry only the second character is kept (the first is
        taken to be ``self.base``); anything else is stored whole.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)
|
|
|
|
|
|
|
|
def getBucket(buckets, line):
    """Return the bucket registered under the first character of ``line``.

    If ``buckets`` has no entry (or a ``None`` entry) for that character, a
    new ``Bucket`` is created, stored in ``buckets`` under it, and returned.
    ``line`` must not be empty, because its first character is the key.
    """
    key = line[0]
    existing = buckets.get(key)

    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
|
|
|
|
|
|
|
|
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated pattern declaration built from ``buckets``.

    The printed text is ``prefix``, then one line per non-empty bucket for
    the letters 'a' through 'z' in order, then the closing parentheses
    (``))"`` for the WEB_URL variant, ``)`` otherwise), then ``suffix``.

    The "first" and "last" line formats go to the first and last buckets
    that actually contain entries, not unconditionally to 'a' and 'z'.
    An empty bucket dumps to nothing, so tying the flags to fixed letters
    would drop the opening text, or place the closing text after an already
    terminated line, whenever one of those two buckets is empty.

    Args:
        prefix: text printed before the bucket lines.
        suffix: text printed after the closing parentheses.
        buckets: dict mapping a letter to its Bucket; a missing letter gets
            an empty bucket created by getBucket().
        isWebUrl: selects the WEB_URL flavour of the bucket lines and of
            the closing text.
    """
    candidates = [getBucket(buckets, chr(code))
                  for code in range(ord('a'), ord('z') + 1)]
    populated = [bucket for bucket in candidates
                 if bucket.words or bucket.letters]

    pieces = [prefix]
    lastIndex = len(populated) - 1

    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    if isWebUrl:
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Parenthesised single-argument form prints the same text in Python 2.
    print(''.join(pieces))
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Fetch the TLD list; close the connection even if reading fails.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        f.close()

    buckets = {}

    for domain in domains:
        # Strip before looking at the first character so that surrounding
        # whitespace or a bare newline never selects a bucket.
        domain = domain.strip().lower()

        # Blank lines and '#' comment lines carry no entry; skipping them
        # here avoids creating buckets for '#' or whitespace.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
|