replicant-frameworks_native/common/tools/make-iana-tld-pattern.py
Shimeng (Simon) Wang 3207e29d10 Enhance URL regular expression to match more Unicode chars.
Enhance the URL regular expression to match legal one-byte Unicode characters
in Internationalized Resource Identifiers, as detailed in RFC 3987.
Specifically, two-byte Unicode characters are not included.  Not all of
RFC 3987 is implemented; this is just an enhancement to recognize the more
commonly used one-byte Unicode characters.

This change helps the Browser address bar identify more valid URLs typed
without a scheme, such as 현금영수증.kr

make-iana-tld-pattern.py is modified to contain only the Top Level Domain
regular expression generation.  The other parts of the WEB_URL pattern now
live solely in Patterns.java for better consistency and maintainability.
2010-02-11 14:07:44 -08:00
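
The enhancement described in the commit message concerns which characters the
host portion of a URL may contain. The following is a minimal Python sketch of
that idea, assuming an illustrative GOOD_IRI_CHAR-style character class and a
simplified HOST_RE helper; it is not the framework's actual pattern, but it
shows how a schemeless host such as 현금영수증.kr becomes matchable once the
host character class reaches beyond ASCII.

# -*- coding: utf-8 -*-
# Illustrative sketch only -- not the actual Patterns.java change.  The
# character ranges and the HOST_RE helper below are assumptions chosen to
# show the idea: once the host character class accepts more of the Basic
# Multilingual Plane than plain ASCII, a schemeless IRI host is recognized.
import re

# ASCII letters/digits plus a few non-ASCII BMP ranges (assumed, for illustration).
GOOD_IRI_CHAR = u'a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'

# A deliberately simplified "host.tld" matcher, not the full WEB_URL pattern.
HOST_RE = re.compile(u'^[' + GOOD_IRI_CHAR + u'][' + GOOD_IRI_CHAR + u'\\-]*'
                     u'\\.kr$', re.UNICODE)

print(HOST_RE.match(u'현금영수증.kr') is not None)  # True
print(HOST_RE.match(u'foo bar.kr') is not None)      # False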

#!/usr/bin/env python

from urllib2 import urlopen

TLD_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2010/02/05.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '";'

URL_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2010/02/05.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""
URL_SUFFIX = ';'


class Bucket:
    """Collects all TLDs that share the same leading letter."""

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []    # TLDs of three or more letters (e.g. 'com')
        self.letters = []  # second letter of each two-letter TLD (e.g. 'r' for 'kr')

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Emits one line of the Java string concatenation for this bucket."""
        if (len(self.words) == 0) and (len(self.letters) == 0):
            return ''

        self.words.sort()
        self.letters.sort()

        output = '    '  # indentation of the emitted Java line

        if isFirst:
            if isWebUrl:
                output += '+ "'
            else:
                output += '"('
        else:
            output += '+ "|'

        if len(self.words) != 0:
            output += '('

            if isWebUrl:
                output += '?:'  # non-capturing group for WEB_URL

        firstWord = 1
        for word in self.words:
            if firstWord == 0:
                output += '|'
            firstWord = 0
            for letter in word:
                if letter == '-':
                    output += '\\\\'  # escape the '-' character.
                output += letter

        if len(self.words) > 0 and len(self.letters) > 0:
            output += '|'

        # Two-letter TLDs collapse into a character class, e.g. 'b[aer]'.
        if len(self.letters) == 1:
            output += '%c%c' % (self.base, self.letters[0])
        elif len(self.letters) > 0:
            output += '%c[' % self.base
            for letter in self.letters:
                output += letter
            output += ']'

        if len(self.words) != 0:
            output += ')'

        if not isLast:
            output += '"'
            output += '\n'

        return output

    def add(self, line):
        length = len(line)

        if line.startswith('#') or (length == 0):
            return

        if length == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket

    return bucket


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    output = prefix

    output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)

    for letter in range(ord('b'), ord('z')):
        output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)

    output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)

    if isWebUrl:
        output += '))"'
    else:
        output += ')'

    output += suffix

    print output


if __name__ == "__main__":
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    domains = f.readlines()
    f.close()

    buckets = {}

    for domain in domains:
        domain = domain.lower()

        if len(domain) > 0:
            getBucket(buckets, domain[0]).add(domain.strip())

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
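
The generator can be exercised without hitting the IANA server, since the
download and the makePattern() calls are guarded by __main__. The snippet
below is a hypothetical smoke test, not part of the tool: it loads the script
as a module and feeds it a small hand-picked TLD list, just to show the shape
of the Java snippet that gets pasted into Patterns.java.

#!/usr/bin/env python
# Hypothetical smoke test (not part of the original tool).  Run it from the
# directory that contains make-iana-tld-pattern.py.
import imp

tool = imp.load_source('make_iana_tld_pattern', 'make-iana-tld-pattern.py')

# Tiny, hand-picked sample instead of the live IANA list.
sample = ['# comments are skipped', 'ARPA', 'BIZ', 'BE', 'BR', 'COM',
          'XN--WGBH1C', 'ZA', 'ZW']

buckets = {}
for domain in sample:
    domain = domain.lower()
    if len(domain) > 0:
        tool.getBucket(buckets, domain[0]).add(domain.strip())

# Prints the TOP_LEVEL_DOMAIN_STR declaration with one alternation line per
# leading letter, e.g. a line such as:    + "|(biz|b[er])"
tool.makePattern(tool.TLD_PREFIX, tool.TLD_SUFFIX, buckets, isWebUrl=False)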