# $Id$
#
# Copyright (C) 2009-2012  Internet Systems Consortium ("ISC")
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
Parse an ARIN database research dump and write out (just) the
RPKI-relevant fields in myrpki-format CSV syntax.

NB: The input data for this script comes from ARIN under an agreement
that allows research use but forbids redistribution, so if you think
you need a copy of the data, please talk to ARIN about it, not us.

Input format used to be RPSL WHOIS dump, but ARIN recently went Java,
so we have to parse a 3.5GB XML "document".  Credit to Liza Daly for
explaining the incantations needed to convince lxml to do this nicely,
see: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
"""

import sys
import lxml.etree

from rpki.csv_utils import csv_writer


def ns(tag):
    """
    Return tag qualified with the ARIN bulk-whois XML namespace, in the
    "{namespace}tag" form that lxml uses for element names.
    """
    namespace = "{http://www.arin.net/bulkwhois/core/v1}"
    return namespace + tag


# Fully qualified element names used below.
tag_asn = ns("asn")
tag_net = ns("net")
tag_org = ns("org")
tag_poc = ns("poc")
tag_orgHandle = ns("orgHandle")
tag_netBlock = ns("netBlock")
tag_type = ns("type")
tag_startAddress = ns("startAddress")
tag_endAddress = ns("endAddress")
tag_startAsNumber = ns("startAsNumber")
tag_endAsNumber = ns("endAsNumber")


def find(node, tag):
    """
    Return the text of the first tag element found under node, with
    surrounding whitespace stripped.

    Raises AttributeError if no such element exists, since findtext()
    then returns None.
    """
    text = node.findtext(tag)
    return text.strip()


def do_asn(node):
    """
    Handle one <asn> element: write a row to asns.csv holding the org
    handle and the AS number range as "start-end".
    """
    owner = find(node, tag_orgHandle)
    first = find(node, tag_startAsNumber)
    last = find(node, tag_endAsNumber)
    asns.writerow((owner, "%s-%s" % (first, last)))


# Maps netBlock type codes to the registry name written to erx.csv.
erx_table = {
    "AF": "afrinic",
    "AP": "apnic",
    "AR": "arin",
    "AV": "arin",
    "FX": "afrinic",
    "LN": "lacnic",
    "LX": "lacnic",
    "PV": "apnic",
    "PX": "apnic",
    "RN": "ripe",
    "RV": "ripe",
    "RX": "ripe",
}


def do_net(node):
    """
    Handle one <net> element: for each netBlock beneath it, write the
    address range either to prefixes.csv (keyed by the net's org
    handle) or to erx.csv (keyed by registry name), depending on the
    netBlock type.  Blocks of any other type are ignored.
    """
    owner = find(node, tag_orgHandle)
    for block in node.iter(tag_netBlock):
        kind = find(block, tag_type)
        first = find(block, tag_startAddress)
        last = find(block, tag_endAddress)

        # Only keep ranges whose start ends in ".000" / ":0000" and whose
        # end ends in ".255" / ":FFFF".
        # NOTE(review): presumably this restricts output to ranges aligned
        # on a final-octet / final-group boundary -- confirm intent.
        if not first.endswith((".000", ":0000")):
            continue
        if not last.endswith((".255", ":FFFF")):
            continue

        span = "%s-%s" % (first, last)
        if kind in ("DS", "DA", "IU"):
            prefixes.writerow((owner, span))
        elif kind in erx_table:
            erx.writerow((erx_table[kind], span))


# Handlers for the top-level element types we care about.
dispatch = {tag_asn: do_asn, tag_net: do_net}

asns = csv_writer("asns.csv")
prefixes = csv_writer("prefixes.csv")
erx = csv_writer("erx.csv")

root = None

for event, node in lxml.etree.iterparse(sys.stdin):

    # On the first element delivered, climb to the top of the tree so we
    # know which element is the document root.
    if root is None:
        root = node
        while root.getparent() is not None:
            root = root.getparent()

    # Only direct children of the root are complete records; anything
    # deeper is handled as part of its enclosing record.
    if node.getparent() is not root:
        continue

    handler = dispatch.get(node.tag)
    if handler is not None:
        handler(node)

    # Done with this record: empty it and drop the siblings that precede
    # it, so the in-memory tree does not keep growing with the input.
    node.clear()
    while node.getprevious() is not None:
        del node.getparent()[0]

for writer in (asns, prefixes, erx):
    writer.close()