diff options
author | Rob Austein <sra@hactrn.net> | 2010-10-04 13:59:50 +0000 |
---|---|---|
committer | Rob Austein <sra@hactrn.net> | 2010-10-04 13:59:50 +0000 |
commit | 6d24ef5781a37290231543f41d6a08062d51f9e0 (patch) | |
tree | 0d7a8c0e0eebb8ce0522fcdb1b9381c35cb77939 /scripts/arin-to-csv.py | |
parent | bd451c90f19c10c899a75795366d98da9977ef56 (diff) |
Update for XML file that ate Manhattan
svn path=/scripts/arin-to-csv.py; revision=3455
Diffstat (limited to 'scripts/arin-to-csv.py')
-rw-r--r-- | scripts/arin-to-csv.py | 129 |
1 file changed, 48 insertions, 81 deletions
diff --git a/scripts/arin-to-csv.py b/scripts/arin-to-csv.py index 2f1a3a14..fc97fd5e 100644 --- a/scripts/arin-to-csv.py +++ b/scripts/arin-to-csv.py @@ -6,9 +6,14 @@ NB: The input data for this script comes from ARIN under an agreement that allows research use but forbids redistribution, so if you think you need a copy of the data, please talk to ARIN about it, not us. +Input format used to be RPSL WHOIS dump, but ARIN recently went Java, +so we have to parse a 3.5GB XML "document". Credit to Liza Daly for +explaining the incantations needed to convince lxml to do this nicely, +see: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + $Id$ -Copyright (C) 2009 Internet Systems Consortium ("ISC") +Copyright (C) 2009-2010 Internet Systems Consortium ("ISC") Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above @@ -23,96 +28,58 @@ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
""" -import rpki.myrpki - -class Handle(object): - - want_tags = () - - debug = False - - def set(self, tag, val): - if tag in self.want_tags: - setattr(self, tag, "".join(val.split(" "))) - - def check(self): - for tag in self.want_tags: - if not hasattr(self, tag): - return False - if self.debug: - print repr(self) - return True - -class ASHandle(Handle): - - want_tags = ("ASHandle", "ASNumber", "OrgID") - - def __repr__(self): - return "<%s %s.%s %s>" % (self.__class__.__name__, - self.OrgID, self.ASHandle, self.ASNumber) - - def finish(self, ctx): - if self.check(): - ctx.asns.writerow((ctx.translations.get(self.OrgID, self.OrgID), self.ASNumber)) +import sys, lxml.etree, rpki.myrpki -class NetHandle(Handle): +def ns(tag): + return "{http://www.arin.net/bulkwhois/core/v1}" + tag - NetType = None +tag_asn = ns("asn") +tag_net = ns("net") +tag_org = ns("org") +tag_poc = ns("poc") +tag_orgHandle = ns("orgHandle") +tag_netBlock = ns("netBlock") +tag_type = ns("type") +tag_startAddress = ns("startAddress") +tag_endAddress = ns("endAddress") +tag_startAsNumber = ns("startAsNumber") +tag_endAsNumber = ns("endAsNumber") - want_tags = ("NetHandle", "NetRange", "NetType", "OrgID") +def find(node, tag): + return node.findtext(tag).strip() - def finish(self, ctx): - if self.NetType in ("allocation", "assignment") and self.check(): - ctx.prefixes.writerow((ctx.translations.get(self.OrgID, self.OrgID), self.NetRange)) +def do_asn(node): + asns.writerow((find(node, tag_orgHandle), + "%s-%s" % (find(node, tag_startAsNumber), + find(node, tag_endAsNumber)))) - def __repr__(self): - return "<%s %s.%s %s %s>" % (self.__class__.__name__, - self.OrgID, self.NetHandle, - self.NetType, self.NetRange) +def do_net(node): + handle = find(node, tag_orgHandle) + for netblock in node.iter(tag_netBlock): + if find(netblock, tag_type) in ("DS", "DA", "IU"): + prefixes.writerow((handle, + "%s-%s" % (find(netblock, tag_startAddress), + find(netblock, tag_endAddress)))) -class 
V6NetHandle(NetHandle): +dispatch = { tag_asn : do_asn, tag_net : do_net } - want_tags = ("V6NetHandle", "NetRange", "NetType", "OrgID") +asns = rpki.myrpki.csv_writer("asns.csv") +prefixes = rpki.myrpki.csv_writer("prefixes.csv") - def __repr__(self): - return "<%s %s.%s %s %s>" % (self.__class__.__name__, - ctx.translations.get(self.OrgID, self.OrgID), - self.V6NetHandle, self.NetType, self.NetRange) +root = None -class main(object): +for event, node in lxml.etree.iterparse(sys.stdin): - types = { - "ASHandle" : ASHandle, - "NetHandle" : NetHandle, - "V6NetHandle" : V6NetHandle } + if root is None: + root = node + while root.getparent() is not None: + root = root.getparent() - translations = {} + if node.getparent() is root: - def __init__(self): - self.asns = rpki.myrpki.csv_writer("asns.csv") - self.prefixes = rpki.myrpki.csv_writer("prefixes.csv") - try: - self.translations = dict((src, dst) for src, dst in rpki.myrpki.csv_reader("translations.csv", columns = 2)) - except IOError: - pass - f = open("arin_db.txt") - cur = None - for line in f: - line = line.expandtabs().strip() - if not line: - if cur: - cur.finish(self) - cur = None - elif not line.startswith("#"): - tag, sep, val = tuple(s.strip() for s in line.partition(":")) - if not sep: - # This should not happen, but ARIN's "legacy" RPSL contains errors - continue - if cur is None: - cur = self.types[tag]() if tag in self.types else False - if cur: - cur.set(tag, val) - if cur: - cur.finish(self) + if node.tag in dispatch: + dispatch[node.tag](node) -main() + node.clear() + while node.getprevious() is not None: + del node.getparent()[0] |