replicant-frameworks_native/common/tools/make-iana-tld-pattern.py

#!/usr/bin/env python

from urllib2 import urlopen

# Java source text printed ahead of the generated TOP_LEVEL_DOMAIN_STR pattern:
# a Javadoc header followed by the start of the constant declaration.
# NOTE(review): the "accurate as of" date below is a fixed literal, so it does
# not follow the date on which the script is actually re-run.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the still-open Java string literal left by the last bucket line and
# terminates the statement.
TLD_SUFFIX = '";'

# Same as TLD_PREFIX, but for the WEB_URL variant; it also emits the first
# string fragment, which opens a non-capturing group.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the statement terminator: in WEB_URL mode makePattern itself appends the
# closing quote of the final string literal.
URL_SUFFIX = ';'

class Bucket:
    """Collects the domain names sharing one initial letter and renders them.

    Names that are exactly two characters long are stored only as their
    second character in ``letters``; every other name is stored whole in
    ``words``.
    """

    def __init__(self, baseLetter):
        # Initial letter shared by every entry of this bucket.
        self.base = baseLetter
        # Names that are not exactly two characters long, kept in full.
        self.words = []
        # Second characters of the two-character names.
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one line of a Java string concatenation.

        isWebUrl -- use a non-capturing ``(?:`` group around the bucket and
                    start the first line with ``+ "`` instead of ``"(``.
        isFirst  -- this is the first line, so no leading ``|`` is emitted.
        isLast   -- leave the string literal open (no closing quote and no
                    newline) so the caller can append the closing text.

        Returns '' for an empty bucket. Sorts ``words`` and ``letters`` in
        place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        has_words = len(self.words) != 0

        # Text that starts the Java string fragment for this line.
        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Whole words come first; each '-' is preceded by two backslashes so
        # that the Java literal yields an escaped '-' in the regex.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        # Two-letter names collapse to "<base><letter>" or "<base>[<letters>]".
        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif len(self.letters) > 0:
            alternatives.append('%c[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # A group is only opened when the bucket holds at least one word.
        if has_words:
            group_open = '(?:' if isWebUrl else '('
            body = group_open + body + ')'

        pieces = ['        ', opener, body]
        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """File one stripped line of the domain list into this bucket.

        Empty lines and lines starting with '#' are ignored.
        """
        size = len(line)

        if size == 0 or line.startswith('#'):
            return

        if size == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)

def getBucket(buckets, line):
    """Return the bucket keyed by the first character of ``line``.

    ``buckets`` maps a single character to its Bucket. When no bucket is
    stored for that character yet, a new one is created, stored in
    ``buckets`` and returned.
    """
    key = line[0]
    existing = buckets.get(key)

    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created

def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java constant built from the 'a'..'z' buckets.

    prefix   -- text emitted before the first bucket line.
    suffix   -- text emitted after the closing parenthesis/quote.
    buckets  -- dict mapping an initial letter to its Bucket; a missing
                letter gets an empty bucket created through getBucket.
    isWebUrl -- render the WEB_URL variant (non-capturing groups).

    The result is written to standard output; nothing is returned.

    The opening and closing fragments are attached to the first and last
    buckets that actually contain entries, so an empty 'a' or 'z' bucket
    no longer produces a malformed Java expression. When both 'a' and 'z'
    are populated the output is unchanged.
    """
    # Visit every letter so each one has a bucket, but keep only the
    # buckets that will render something.
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = getBucket(buckets, chr(code))
        if bucket.words or bucket.letters:
            populated.append(bucket)

    output = prefix

    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        output += bucket.dump(isWebUrl=isWebUrl,
                              isFirst=(index == 0),
                              isLast=(index == lastIndex))

    # The last bucket line leaves its string literal open; close it here.
    # NOTE(review): the WEB_URL variant appends two ')' although URL_PREFIX
    # opens only one group -- presumably the code using the constant opens
    # the other one; confirm before changing.
    if isWebUrl:
        output += '))"'
    else:
        output += ')'

    output += suffix

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(output)

if __name__ == "__main__":
    # Download the current list of top-level domains, one name per line.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even when the download fails part-way.
        f.close()

    buckets = {}

    for domain in domains:
        # Strip before inspecting the line: readlines() keeps the trailing
        # newline, so an unstripped line is never empty, and its first
        # character may be whitespace rather than the name's initial letter.
        domain = domain.strip().lower()

        # Skip blank lines and '#' comment lines.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

    # Emit both constants: the plain pattern, then the WEB_URL variant.
    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`#!/usr/bin/env python`

			`from urllib2 import urlopen`

			`TLD_PREFIX = r"""`
			`/**`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`* Regular expression to match all IANA top-level domains.`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`* List accurate as of 2010/02/05. List taken from:`
			`* http://data.iana.org/TLD/tlds-alpha-by-domain.txt`
			`* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py`
			`*/`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`public static final String TOP_LEVEL_DOMAIN_STR =`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`"""`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`TLD_SUFFIX = '";'`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00
			`URL_PREFIX = r"""`
			`/**`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`* Regular expression to match all IANA top-level domains for WEB_URL.`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`* List accurate as of 2010/02/05. List taken from:`
			`* http://data.iana.org/TLD/tlds-alpha-by-domain.txt`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`*/`
Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =`
			`"(?:"`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00			`"""`

Enhance URL regular expression to match more Unicode chars. Enhance URL regular expression to match legal one byte Unicode characters in Internationalized Resource Identifiers as detailed in RFC 3987. Specifically, two byte Unicode characters are not included. Not everything in RFC 3987 is implemented; this is just an enhancement for recognizing more commonly used one byte Unicode characters. This change helps the Browser address bar identify more valid URLs typed in without a scheme, such as 현금영수증.kr. make-iana-tld-pattern.py is modified to contain only Top Level Domain regular expression generation. Other parts of the WEB_URL pattern are solely in Patterns.java for better consistency and maintenance. 2010-02-11 22:07:44 +00:00			`URL_SUFFIX = ';'`
Add back lost python script. The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/make-iana-tld-pattern.py 2010-02-10 19:22:01 +00:00
			`class Bucket:`
			`def __init__(self, baseLetter):`
			`self.base=baseLetter`
			`self.words=[]`
			`self.letters=[]`

			`def dump(self, isWebUrl=False, isFirst=False, isLast=False):`
			`if (len(self.words) == 0) and (len(self.letters) == 0):`
			`return ''`

			`self.words.sort()`
			`self.letters.sort()`

			`output = ' ';`

			`if isFirst:`
			`if isWebUrl:`
			`output += '+ "'`
			`else:`
			`output += '"('`
			`else:`
			`output += '+ "\|'`

			`if len(self.words) != 0:`
			`output += '('`

			`if isWebUrl:`
			`output += '?:'`

			`firstWord = 1`
			`for word in self.words:`
			`if firstWord == 0:`
			`output += '\|'`
			`firstWord = 0`
			`for letter in word:`
			`if letter == '-':`
			`output += '\\\\' # escape the '-' character.`
			`output += letter`

			`if len(self.words) > 0 and len(self.letters) > 0:`
			`output += '\|'`

			`if len(self.letters) == 1:`
			`output += '%c%c' % (self.base, self.letters[0])`
			`elif len(self.letters) > 0:`
			`output += '%c[' % self.base`

			`for letter in self.letters:`
			`output += letter`

			`output += ']'`

			`if len(self.words) != 0:`
			`output += ')'`

			`if not isLast:`
			`output += '"'`
			`output += '\n'`

			`return output;`

			`def add(self, line):`
			`length = len(line)`

			`if line.startswith('#') or (length == 0):`
			`return;`

			`if length == 2:`
			`self.letters.append(line[1:2])`
			`else:`
			`self.words.append(line)`

			`def getBucket(buckets, line):`
			`letter = line[0]`
			`bucket = buckets.get(letter)`

			`if bucket is None:`
			`bucket = Bucket(letter)`
			`buckets[letter] = bucket`

			`return bucket`

			`def makePattern(prefix, suffix, buckets, isWebUrl=False):`
			`output = prefix`

			`output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)`

			`for letter in range(ord('b'), ord('z')):`
			`output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)`

			`output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)`

			`if isWebUrl:`
			`output += '))"'`
			`else:`
			`output += ')'`

			`output += suffix`

			`print output`

			`if __name__ == "__main__":`
			`f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')`
			`domains = f.readlines()`
			`f.close()`

			`buckets = {}`

			`for domain in domains:`
			`domain = domain.lower()`

			`if len(domain) > 0:`
			`getBucket(buckets, domain[0]).add(domain.strip())`

			`makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)`
			`makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)`