replicant-frameworks_native/common/tools/make-iana-tld-pattern.py
Shimeng (Simon) Wang 56811abc37 Add back lost python script.
The script is used to generate top level domains' regular expressions.
This is enhanced and used to regenerate the new top level domains.

	new file:   common/tools/make-iana-tld-pattern.py
2010-02-10 11:22:01 -08:00

161 lines
4.7 KiB
Python
Executable File

#!/usr/bin/env python
from urllib2 import urlopen
# Java source text printed ahead of the generated alternation when the
# TOP_LEVEL_DOMAIN pattern is built (passed to makePattern as `prefix`).
# It is a raw string, so the text below is emitted exactly as written and
# ends with the still-open `Pattern.compile(` call.
TLD_PREFIX = r"""
/**
* Regular expression pattern to match all IANA top-level domains.
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""
# Text printed after the generated alternation (passed to makePattern as
# `suffix`): closes the Java string literal and the Pattern.compile( call.
TLD_SUFFIX = '");'
# Java source text printed ahead of the generated TLD alternation when the
# WEB_URL pattern is built (passed to makePattern as `prefix`).  It is a raw
# string, so every backslash below reaches the generated Java file unchanged.
# The text covers the optional scheme and user-info part plus the start of the
# named-host group, and stops right after opening the non-capturing group that
# the per-letter TLD fragments are appended to.
URL_PREFIX = r"""
/**
* Regular expression pattern to match RFC 1738 URLs
* List accurate as of 2010/02/05. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/
public static final Pattern WEB_URL = Pattern.compile(
"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
+ "(?:" // plus top level domain
"""
# Java source text printed after the generated TLD alternation when the
# WEB_URL pattern is built (passed to makePattern as `suffix`).  As labelled
# by the inline Java comments, it adds the IP-address alternative, the
# optional port, the optional path/query part and the trailing word-boundary
# check, then closes the Pattern.compile( call.  It is a raw string, so the
# backslashes are emitted exactly as written.
URL_SUFFIX = r"""
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ "|[1-9][0-9]|[0-9])))"
+ "(?:\\:\\d{1,5})?)" // plus option port number
+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ "(?:\\b|$)"); // and finally, a word boundary or end of
// input. This is to stop foo.sure from
// matching as foo.su
"""
class Bucket:
    """Domains that share one first letter, renderable as a regex fragment.

    Two-character names are remembered by their second character only (in
    ``letters``); every other accepted name is kept whole (in ``words``).
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one line of Java string-concatenation source.

        Returns '' when the bucket holds nothing.  Otherwise the line starts
        with a space, then an opener chosen by isFirst/isWebUrl, then the
        sorted words and the two-letter names (collapsed into ``base[...]``
        or ``base`` + letter) joined by '|'.  When there is at least one word
        the alternatives are wrapped in a group, non-capturing if isWebUrl.
        Unless isLast is set, a closing quote and a newline are appended.
        Both lists are sorted in place as a side effect.
        """
        if not (self.words or self.letters):
            return ''

        self.words.sort()
        self.letters.sort()
        grouped = len(self.words) != 0

        # Every line but the first continues the Java concatenation and
        # starts with the '|' that separates it from the previous bucket.
        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('
        pieces = [' ', opener]

        if grouped:
            pieces.append('(?:' if isWebUrl else '(')

        # Each '-' is written with two backslashes in front of it, which the
        # Java compiler later reduces to a single regex escape.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]
        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            alternatives.append(('%c[' % self.base) + ''.join(self.letters) + ']')
        pieces.append('|'.join(alternatives))

        if grouped:
            pieces.append(')')
        if not isLast:
            # The last fragment is left unterminated so the caller can close
            # the surrounding groups inside the same string literal.
            pieces.append('"\n')
        return ''.join(pieces)

    def add(self, line):
        """File one entry; empty strings and '#' comment lines are ignored."""
        if not line or line.startswith('#'):
            return
        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of `line`.

    `buckets` maps a single character to its Bucket.  When no bucket is
    stored for that character yet, a new one is created, stored in `buckets`
    and returned.  `line` must not be empty.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing
    created = Bucket(key)
    buckets[key] = created
    return created
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one complete Java pattern declaration to standard output.

    The output is `prefix`, then one fragment per non-empty bucket for the
    letters 'a' through 'z' in order, then the text that closes the groups
    opened by the first fragment and the prefix, then `suffix`.

    prefix   -- text emitted before the generated alternation.
    suffix   -- text emitted after the generated alternation.
    buckets  -- dict mapping a first letter to its Bucket; buckets missing
                for any letter a-z are created in it by getBucket.
    isWebUrl -- selects the WEB_URL flavour (non-capturing groups and a
                different closing sequence) instead of the plain TLD one.
    """
    # The opening and closing decoration must land on the first and last
    # fragment that is actually emitted.  Tying it to the letters 'a' and 'z'
    # would produce unbalanced output whenever one of those buckets is empty,
    # because an empty bucket renders as ''.
    populated = []
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        bucket = getBucket(buckets, letter)
        if bucket.words or bucket.letters:
            populated.append(bucket)

    fragments = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        fragments.append(bucket.dump(isWebUrl=isWebUrl,
                                     isFirst=(index == 0),
                                     isLast=(index == lastIndex)))

    # The last fragment is left without its closing quote, so the closing
    # parentheses go inside the same Java string literal.
    if isWebUrl:
        fragments.append('))"')
    else:
        fragments.append(')')
    fragments.append(suffix)

    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3.
    print(''.join(fragments))
# Script entry point: download the IANA TLD list, sort every entry into a
# per-first-letter bucket, then print both generated Java pattern declarations.
if __name__ == "__main__":
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even if the download fails part-way.
        f.close()

    buckets = {}
    for domain in domains:
        # Strip before picking the bucket so that blank or indented lines do
        # not create buckets keyed by whitespace characters.
        domain = domain.strip().lower()
        if not domain or domain.startswith('#'):
            # Blank lines and the '#' header comment carry no domain.
            continue
        getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)