replicant-frameworks_native/common/tools/make-iana-tld-pattern.py

#!/usr/bin/env python

from urllib2 import urlopen

TLD_PREFIX = r"""
    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""
TLD_SUFFIX = '");'

URL_PREFIX = r"""
    /**
     *  Regular expression pattern to match RFC 1738 URLs
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
        + "(?:"   // plus top level domain
"""

URL_SUFFIX = r"""
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus option port number
        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su
"""

class Bucket:
    def __init__(self, baseLetter):
        self.base=baseLetter
        self.words=[]
        self.letters=[]

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        if (len(self.words) == 0) and (len(self.letters) == 0):
            return ''

        self.words.sort()
        self.letters.sort()

        output = '        ';

        if isFirst:
            if isWebUrl:
                output += '+ "'
            else:
                output += '"('
        else:
            output += '+ "|'

        if len(self.words) != 0:
            output += '('

            if isWebUrl:
                output += '?:'

        firstWord = 1
        for word in self.words:
            if firstWord == 0:
                output += '|'
            firstWord = 0
            for letter in word:
                if letter == '-':
                    output += '\\\\'  # escape the '-' character.
                output += letter

        if len(self.words) > 0 and len(self.letters) > 0:
            output += '|'

        if len(self.letters) == 1:
            output += '%c%c' % (self.base, self.letters[0])
        elif len(self.letters) > 0:
            output += '%c[' % self.base

            for letter in self.letters:
                output += letter

            output += ']'

        if len(self.words) != 0:
            output += ')'

        if not isLast:
            output += '"'
            output += '\n'

        return output;

    def add(self, line):
        length = len(line)

        if line.startswith('#') or (length == 0):
            return;

        if length == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)

def getBucket(buckets, line):
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket

    return bucket

def makePattern(prefix, suffix, buckets, isWebUrl=False):
    output = prefix

    output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)

    for letter in range(ord('b'), ord('z')):
        output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)

    output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)

    if isWebUrl:
        output += '))"'
    else:
        output += ')'

    output += suffix

    print output

if __name__ == "__main__":
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    domains = f.readlines()
    f.close()

    buckets = {}

    for domain in domains:
        domain = domain.lower()

        if len(domain) > 0:
            getBucket(buckets, domain[0]).add(domain.strip())

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`#!/usr/bin/env python`

			`from urllib2 import urlopen`

			`TLD_PREFIX = r"""`
			`/**`
			`* Regular expression pattern to match all IANA top-level domains.`
			`* List accurate as of 2010/02/05. List taken from:`
			`* http://data.iana.org/TLD/tlds-alpha-by-domain.txt`
			`* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py`
			`*/`
			`public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(`
			`"""`
			`TLD_SUFFIX = '");'`

			`URL_PREFIX = r"""`
			`/**`
			`* Regular expression pattern to match RFC 1738 URLs`
			`* List accurate as of 2010/02/05. List taken from:`
			`* http://data.iana.org/TLD/tlds-alpha-by-domain.txt`
			`* This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py`
			`*/`
			`public static final Pattern WEB_URL = Pattern.compile(`
			`"((?:(http\|https\|Http\|Https\|rtsp\|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"`
			`+ "\\,\\;\\?\\&\\=]\|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"`
			`+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]\|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"`
			`+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host`
			`+ "(?:" // plus top level domain`
			`"""`

			`URL_SUFFIX = r"""`
			`+ "\|(?:(?:25[0-5]\|2[0-4]" // or ip address`
			`+ "[0-9]\|[0-1][0-9]{2}\|[1-9][0-9]\|[1-9])\\.(?:25[0-5]\|2[0-4][0-9]"`
			`+ "\|[0-1][0-9]{2}\|[1-9][0-9]\|[1-9]\|0)\\.(?:25[0-5]\|2[0-4][0-9]\|[0-1]"`
			`+ "[0-9]{2}\|[1-9][0-9]\|[1-9]\|0)\\.(?:25[0-5]\|2[0-4][0-9]\|[0-1][0-9]{2}"`
			`+ "\|[1-9][0-9]\|[0-9])))"`
			`+ "(?:\\:\\d{1,5})?)" // plus option port number`
			`+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params`
			`+ "\\-\\.\\+\\!\\\\'\\(\\)\\,\\_])\|(?:\\%[a-fA-F0-9]{2})))?"`
			`+ "(?:\\b\|$)"); // and finally, a word boundary or end of`
			`// input. This is to stop foo.sure from`
			`// matching as foo.su`
			`"""`

			`class Bucket:`
			`def __init__(self, baseLetter):`
			`self.base=baseLetter`
			`self.words=[]`
			`self.letters=[]`

			`def dump(self, isWebUrl=False, isFirst=False, isLast=False):`
			`if (len(self.words) == 0) and (len(self.letters) == 0):`
			`return ''`

			`self.words.sort()`
			`self.letters.sort()`

			`output = ' ';`

			`if isFirst:`
			`if isWebUrl:`
			`output += '+ "'`
			`else:`
			`output += '"('`
			`else:`
			`output += '+ "\|'`

			`if len(self.words) != 0:`
			`output += '('`

			`if isWebUrl:`
			`output += '?:'`

			`firstWord = 1`
			`for word in self.words:`
			`if firstWord == 0:`
			`output += '\|'`
			`firstWord = 0`
			`for letter in word:`
			`if letter == '-':`
			`output += '\\\\' # escape the '-' character.`
			`output += letter`

			`if len(self.words) > 0 and len(self.letters) > 0:`
			`output += '\|'`

			`if len(self.letters) == 1:`
			`output += '%c%c' % (self.base, self.letters[0])`
			`elif len(self.letters) > 0:`
			`output += '%c[' % self.base`

			`for letter in self.letters:`
			`output += letter`

			`output += ']'`

			`if len(self.words) != 0:`
			`output += ')'`

			`if not isLast:`
			`output += '"'`
			`output += '\n'`

			`return output;`

			`def add(self, line):`
			`length = len(line)`

			`if line.startswith('#') or (length == 0):`
			`return;`

			`if length == 2:`
			`self.letters.append(line[1:2])`
			`else:`
			`self.words.append(line)`

			`def getBucket(buckets, line):`
			`letter = line[0]`
			`bucket = buckets.get(letter)`

			`if bucket is None:`
			`bucket = Bucket(letter)`
			`buckets[letter] = bucket`

			`return bucket`

			`def makePattern(prefix, suffix, buckets, isWebUrl=False):`
			`output = prefix`

			`output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)`

			`for letter in range(ord('b'), ord('z')):`
			`output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)`

			`output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)`

			`if isWebUrl:`
			`output += '))"'`
			`else:`
			`output += ')'`

			`output += suffix`

			`print output`

			`if __name__ == "__main__":`
			`f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')`
			`domains = f.readlines()`
			`f.close()`

			`buckets = {}`

			`for domain in domains:`
			`domain = domain.lower()`

			`if len(domain) > 0:`
			`getBucket(buckets, domain[0]).add(domain.strip())`

			`makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)`
			`makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)`