# potpourri/arin-to-csv.py

# $Id$
#
# Copyright (C) 2009-2012  Internet Systems Consortium ("ISC")
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
Parse an ARIN database research dump and write out (just) the
RPKI-relevant fields in myrpki-format CSV syntax.

NB: The input data for this script comes from ARIN under an agreement
that allows research use but forbids redistribution, so if you think
you need a copy of the data, please talk to ARIN about it, not us.

Input format used to be RPSL WHOIS dump, but ARIN recently went Java,
so we have to parse a 3.5GB XML "document".  Credit to Liza Daly for
explaining the incantations needed to convince lxml to do this nicely,
see: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
"""

import sys
import lxml.etree

from rpki.csv_utils import csv_writer

def ns(tag):
    """
    Return tag qualified with the ARIN bulk whois core v1 XML
    namespace, in the "{namespace-uri}localname" form used for
    element tag names.
    """
    qualifier = "{http://www.arin.net/bulkwhois/core/v1}"
    return qualifier + tag

# Namespace-qualified names of the top-level record elements.
tag_asn = ns("asn")
tag_net = ns("net")
tag_org = ns("org")
tag_poc = ns("poc")

# Namespace-qualified names of the fields read from inside those records.
tag_orgHandle = ns("orgHandle")
tag_netBlock = ns("netBlock")
tag_type = ns("type")
tag_startAddress = ns("startAddress")
tag_endAddress = ns("endAddress")
tag_startAsNumber = ns("startAsNumber")
tag_endAsNumber = ns("endAsNumber")

def find(node, tag):
    """
    Return the text of the first subelement of node matching tag, with
    leading and trailing whitespace removed.

    If no subelement matches, findtext() returns None and the call to
    .strip() raises AttributeError.
    """
    text = node.findtext(tag)
    return text.strip()

def do_asn(node):
    """
    Handle one <asn> record: write a row to asns.csv consisting of the
    record's orgHandle and its AS number range, formatted as
    "startAsNumber-endAsNumber".
    """
    org_handle = find(node, tag_orgHandle)
    first_asn = find(node, tag_startAsNumber)
    last_asn = find(node, tag_endAsNumber)
    asns.writerow((org_handle, "%s-%s" % (first_asn, last_asn)))

# Maps a netBlock <type> code to the name that do_net() writes in the
# first column of erx.csv for blocks of that type.
erx_table = {
    "AF": "afrinic",
    "AP": "apnic",
    "AR": "arin",
    "AV": "arin",
    "FX": "afrinic",
    "LN": "lacnic",
    "LX": "lacnic",
    "PV": "apnic",
    "PX": "apnic",
    "RN": "ripe",
    "RV": "ripe",
    "RX": "ripe",
}

def do_net(node):
    """
    Handle one <net> record: examine each <netBlock> beneath it and
    write the block's address range to prefixes.csv or erx.csv,
    depending on the block's type code.

    A block is skipped unless its start address ends in ".000" or
    ":0000" and its end address ends in ".255" or ":FFFF".
    NOTE(review): presumably this keeps only ranges aligned on the
    final octet / 16-bit group of the zero-padded addresses -- confirm
    against the dump format.
    """
    handle = find(node, tag_orgHandle)
    for block in node.iter(tag_netBlock):
        kind = find(block, tag_type)
        first = find(block, tag_startAddress)
        last = find(block, tag_endAddress)

        starts_aligned = first.endswith((".000", ":0000"))
        ends_aligned = last.endswith((".255", ":FFFF"))
        if not (starts_aligned and ends_aligned):
            continue

        address_range = "%s-%s" % (first, last)
        if kind in ("DS", "DA", "IU"):
            # These types are recorded against the owning org handle.
            prefixes.writerow((handle, address_range))
        elif kind in erx_table:
            # Types listed in erx_table are recorded against the name
            # the table maps them to; other types are ignored.
            erx.writerow((erx_table[kind], address_range))

# Handler to call for each kind of top-level record we care about;
# records with any other tag are not dispatched.
dispatch = {
    tag_asn: do_asn,
    tag_net: do_net,
}

# One CSV output per kind of data extracted from the dump.
asns = csv_writer("asns.csv")
prefixes = csv_writer("prefixes.csv")
erx = csv_writer("erx.csv")

root = None

# lxml's iterparse() needs a byte stream.  On Python 3 sys.stdin is a
# text-mode wrapper, so read from its underlying binary buffer when
# there is one; Python 2's sys.stdin has no .buffer attribute and is
# used as is.
xml_input = getattr(sys.stdin, "buffer", sys.stdin)

# Stream through the document one completed element at a time rather
# than building the whole tree in memory.
for event, node in lxml.etree.iterparse(xml_input):

    # On the first event, walk up from the first completed element to
    # find the document root, so that its direct children can be
    # recognized below.
    if root is None:
        root = node
        while root.getparent() is not None:
            root = root.getparent()

    # Only direct children of the root are whole records; events for
    # elements nested deeper are ignored until their record completes.
    if node.getparent() is root:

        if node.tag in dispatch:
            dispatch[node.tag](node)

        # Done with this record: drop its content, then remove the
        # already-processed siblings ahead of it from the root so the
        # tree does not keep growing as parsing proceeds.
        node.clear()
        while node.getprevious() is not None:
            del node.getparent()[0]

# Close the outputs only once the whole document has been processed.
asns.close()
prefixes.close()
erx.close()