diff options
author | Rob Austein <sra@hactrn.net> | 2014-04-05 22:42:12 +0000 |
---|---|---|
committer | Rob Austein <sra@hactrn.net> | 2014-04-05 22:42:12 +0000 |
commit | fe0bf509f528dbdc50c7182f81057c6a4e15e4bd (patch) | |
tree | 07c9a923d4a0ccdfea11c49cd284f6d5757c5eda /potpourri/arin-to-csv.py | |
parent | aa28ef54c271fbe4d52860ff8cf13cab19e2207c (diff) |
Source tree reorg, phase 1. Almost everything moved, no file contents changed.
svn path=/branches/tk685/; revision=5757
Diffstat (limited to 'potpourri/arin-to-csv.py')
-rw-r--r-- | potpourri/arin-to-csv.py | 114 |
1 files changed, 114 insertions, 0 deletions
# $Id$
#
# Copyright (C) 2009-2012 Internet Systems Consortium ("ISC")
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
Parse an ARIN database research dump and write out (just) the
RPKI-relevant fields in myrpki-format CSV syntax.

NB: The input data for this script comes from ARIN under an agreement
that allows research use but forbids redistribution, so if you think
you need a copy of the data, please talk to ARIN about it, not us.

Input format used to be RPSL WHOIS dump, but ARIN recently went Java,
so we have to parse a 3.5GB XML "document". Credit to Liza Daly for
explaining the incantations needed to convince lxml to do this nicely,
see: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
"""

import sys
import lxml.etree

from rpki.csv_utils import csv_writer


def ns(tag):
    """Return `tag` qualified with the ARIN bulk-whois XML namespace."""
    return "{http://www.arin.net/bulkwhois/core/v1}" + tag


# Namespace-qualified element names used while walking the dump.
tag_asn = ns("asn")
tag_net = ns("net")
tag_org = ns("org")
tag_poc = ns("poc")
tag_orgHandle = ns("orgHandle")
tag_netBlock = ns("netBlock")
tag_type = ns("type")
tag_startAddress = ns("startAddress")
tag_endAddress = ns("endAddress")
tag_startAsNumber = ns("startAsNumber")
tag_endAsNumber = ns("endAsNumber")


def find(node, tag):
    """
    Return the whitespace-stripped text of the first `tag` child of `node`.

    No default is supplied to findtext(), so a missing child yields None
    and the .strip() call raises AttributeError.
    """
    text = node.findtext(tag)
    return text.strip()


def do_asn(node):
    """Write one (orgHandle, "start-end") row to asns.csv for an <asn> node."""
    handle = find(node, tag_orgHandle)
    first = find(node, tag_startAsNumber)
    last = find(node, tag_endAsNumber)
    asns.writerow((handle, "%s-%s" % (first, last)))


# Netblock type codes that are routed to erx.csv, keyed to the registry
# name written in the first column of that file.
erx_table = {
    "AF": "afrinic",
    "AP": "apnic",
    "AR": "arin",
    "AV": "arin",
    "FX": "afrinic",
    "LN": "lacnic",
    "LX": "lacnic",
    "PV": "apnic",
    "PX": "apnic",
    "RN": "ripe",
    "RV": "ripe",
    "RX": "ripe",
}


def do_net(node):
    """
    Write the netBlock ranges found under a <net> node.

    Blocks are skipped unless the start address ends in ".000" or ":0000"
    and the end address ends in ".255" or ":FFFF".  Of the remainder,
    types DS, DA and IU go to prefixes.csv keyed by the net's orgHandle;
    types listed in erx_table go to erx.csv keyed by the mapped registry
    name; any other type is ignored.
    """
    handle = find(node, tag_orgHandle)
    for block in node.iter(tag_netBlock):
        # All three fields are read before any filtering, as a missing
        # one must fail the same way regardless of the others' values.
        kind = find(block, tag_type)
        first = find(block, tag_startAddress)
        last = find(block, tag_endAddress)

        aligned = (first.endswith((".000", ":0000")) and
                   last.endswith((".255", ":FFFF")))
        if not aligned:
            continue

        span = "%s-%s" % (first, last)
        if kind in ("DS", "DA", "IU"):
            prefixes.writerow((handle, span))
        elif kind in erx_table:
            erx.writerow((erx_table[kind], span))


# Handlers for the top-level record types we care about; every other
# record type is parsed and thrown away.
dispatch = {tag_asn: do_asn, tag_net: do_net}

asns = csv_writer("asns.csv")
prefixes = csv_writer("prefixes.csv")
erx = csv_writer("erx.csv")

root = None

for _event, element in lxml.etree.iterparse(sys.stdin):

    if root is None:
        # The first element handed back is nested somewhere inside the
        # document; climb the parent chain to find the document root.
        top = element
        parent = top.getparent()
        while parent is not None:
            top, parent = parent, parent.getparent()
        root = top

    # Only completed direct children of the root are whole records.
    if element.getparent() is not root:
        continue

    handler = dispatch.get(element.tag)
    if handler is not None:
        handler(element)

    # Drop the record's contents and unlink the siblings ahead of it so
    # the in-memory tree does not grow with the size of the input.
    element.clear()
    while element.getprevious() is not None:
        del root[0]

asns.close()
prefixes.close()
erx.close()