#!/usr/bin/env python
"""Generate Java regex ``Pattern`` declarations from the IANA TLD list.

The script downloads the alphabetical list of top-level domains published
by IANA, groups the entries by their first letter and prints two Java
snippets on stdout: ``TOP_LEVEL_DOMAIN`` (the TLD alternation alone) and
``WEB_URL`` (a URL pattern embedding the same alternation).

Runs on both Python 2 and Python 3.
"""

import contextlib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

# Location of the IANA list; also quoted in the generated Javadoc below.
IANA_TLD_URL = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Indentation placed in front of every generated Java continuation line.
LINE_INDENT = '        '

# NOTE: the "accurate as of" date inside the templates is static text; it is
# not derived from the downloaded list.
TLD_PREFIX = r"""
    /**
     * Regular expression pattern to match all IANA top-level domains.
     * List accurate as of 2010/02/05. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""
TLD_SUFFIX = '");'

URL_PREFIX = r"""
    /**
     * Regular expression pattern to match RFC 1738 URLs
     * List accurate as of 2010/02/05. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
        + "(?:" // plus top level domain
"""

URL_SUFFIX = r"""
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus option port number
        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input. This is to stop foo.sure from
                        // matching as foo.su
"""


class Bucket:
    """All TLDs that share one initial letter.

    Two-character TLDs are stored by their second character in ``letters``
    so they can be emitted as a compact character class (``c[ao]``); every
    other entry is kept verbatim in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: use a non-capturing ``(?:`` group around the bucket
                and, for the first bucket, open with ``+ "`` because the
                WEB_URL template has already started the expression.
            isFirst: this is the first line of the alternation, so no
                leading ``|`` is emitted.
            isLast: leave the Java string literal open (no closing quote,
                no newline) so the caller can append the closing
                parentheses inside it.

        Returns:
            The generated line, or ``''`` when the bucket holds no entries.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # '-' is emitted as \\- so that the Java string literal yields the
        # regex escape \-.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]
        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            alternatives.append('%s[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)
        if self.words:
            # Group only when full words are present.
            body = ('(?:' if isWebUrl else '(') + body + ')'

        output = LINE_INDENT + opener + body
        if not isLast:
            output += '"\n'
        return output

    def add(self, line):
        """Record one TLD; empty strings and ``#`` comment lines are ignored."""
        if not line or line.startswith('#'):
            return
        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    """Return the Bucket for the first character of ``line``.

    A new Bucket is created and stored in ``buckets`` when none exists yet.
    ``line`` must not be empty.
    """
    letter = line[0]
    bucket = buckets.get(letter)
    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket
    return bucket


def buildBuckets(lines):
    """Group raw lines of the IANA list into a ``{letter: Bucket}`` dict.

    Lines may be text or bytes. Each line is stripped and lower-cased;
    blank lines and ``#`` comment lines are skipped.
    """
    buckets = {}
    for rawLine in lines:
        # Python 3 HTTP responses yield bytes; on Python 2 bytes is str.
        if isinstance(rawLine, bytes) and not isinstance(rawLine, str):
            rawLine = rawLine.decode('ascii')
        domain = rawLine.strip().lower()
        if not domain or domain.startswith('#'):
            continue
        getBucket(buckets, domain).add(domain)
    return buckets


def buildPattern(prefix, suffix, buckets, isWebUrl=False):
    """Return the Java source text for one pattern declaration.

    Only the non-empty buckets for 'a'..'z' are emitted, in alphabetical
    order. The first and last of *those* receive the opening and closing
    treatment, so a missing 'a' or 'z' bucket cannot corrupt the output.

    Raises:
        ValueError: if no bucket for 'a'..'z' contains any entry.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = buckets.get(chr(code))
        if bucket is not None and (bucket.words or bucket.letters):
            populated.append(bucket)
    if not populated:
        raise ValueError('no top-level domains to build a pattern from')

    parts = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        parts.append(bucket.dump(isWebUrl=isWebUrl,
                                 isFirst=(index == 0),
                                 isLast=(index == lastIndex)))
    # Close the groups inside the still-open Java string literal.
    parts.append('))"' if isWebUrl else ')')
    parts.append(suffix)
    return ''.join(parts)


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java pattern declaration built from ``buckets`` to stdout."""
    print(buildPattern(prefix, suffix, buckets, isWebUrl=isWebUrl))


def main():
    """Download the IANA list and print both generated Java patterns."""
    with contextlib.closing(urlopen(IANA_TLD_URL)) as response:
        domains = response.readlines()

    buckets = buildBuckets(domains)
    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    main()