1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# $Id$
#
# Copyright (C) 2009-2012 Internet Systems Consortium ("ISC")
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
"""
Parse an ARIN database research dump and write out (just) the
RPKI-relevant fields in myrpki-format CSV syntax.
NB: The input data for this script comes from ARIN under an agreement
that allows research use but forbids redistribution, so if you think
you need a copy of the data, please talk to ARIN about it, not us.
Input format used to be RPSL WHOIS dump, but ARIN recently went Java,
so we have to parse a 3.5GB XML "document". Credit to Liza Daly for
explaining the incantations needed to convince lxml to do this nicely,
see: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
"""
import sys
import lxml.etree
from rpki.csv_utils import csv_writer
def ns(tag):
    """
    Return the namespace-qualified form of an ARIN bulk-whois tag name.

    The result uses the "{namespace-uri}localname" spelling that lxml
    reports in element .tag values and accepts in find/iter calls.
    """
    namespace = "{http://www.arin.net/bulkwhois/core/v1}"
    return namespace + tag
# Namespace-qualified tag names, computed once at import time so the
# parsing code can compare against them directly.

# Record-level elements.
tag_asn = ns("asn")
tag_net = ns("net")
tag_org = ns("org")
tag_poc = ns("poc")

# Field shared by the record types handled below.
tag_orgHandle = ns("orgHandle")

# Fields of an <asn> record.
tag_startAsNumber = ns("startAsNumber")
tag_endAsNumber = ns("endAsNumber")

# <netBlock> elements inside a <net> record, and their fields.
tag_netBlock = ns("netBlock")
tag_type = ns("type")
tag_startAddress = ns("startAddress")
tag_endAddress = ns("endAddress")
def find(node, tag):
    """
    Return the text of the first sub-element of node matching tag,
    with leading and trailing whitespace removed.

    If nothing matches, findtext() returns None and the .strip() call
    raises AttributeError; callers here rely on the field being present.
    """
    text = node.findtext(tag)
    return text.strip()
def do_asn(node):
    """
    Handle one <asn> element: write a row to the asns CSV writer
    consisting of the organization handle and the AS number range,
    formatted as "start-end".
    """
    org_handle = find(node, tag_orgHandle)
    first_asn = find(node, tag_startAsNumber)
    last_asn = find(node, tag_endAsNumber)
    asns.writerow((org_handle, "%s-%s" % (first_asn, last_asn)))
# Maps a netBlock type code to the registry name that do_net() writes
# to erx.csv for blocks of that type.  Listed per registry; the
# resulting dict is keyed by type code.
erx_table = dict(
    (code, registry)
    for registry, codes in (
        ("afrinic", ("AF", "FX")),
        ("apnic", ("AP", "PV", "PX")),
        ("arin", ("AR", "AV")),
        ("lacnic", ("LN", "LX")),
        ("ripe", ("RN", "RV", "RX")),
    )
    for code in codes
)
def do_net(node):
    """
    Handle one <net> element: examine each <netBlock> inside it and
    write the address range ("start-end") to the appropriate CSV.

    Blocks of type DS, DA or IU go to the prefixes writer, keyed by the
    net's organization handle.  Blocks whose type appears in erx_table
    go to the erx writer, keyed by the registry name from that table.
    Blocks of any other type are ignored.
    """
    org_handle = find(node, tag_orgHandle)

    for block in node.iter(tag_netBlock):
        block_type = find(block, tag_type)
        first = find(block, tag_startAddress)
        last = find(block, tag_endAddress)

        # Keep only ranges whose start ends in ".000" / ":0000" and whose
        # end ends in ".255" / ":FFFF".
        # NOTE(review): presumably this drops ranges that are not aligned
        # on a /24 (IPv4) or 16-bit (IPv6) boundary, relying on the dump
        # using zero-padded, upper-case addresses -- confirm against data.
        if not first.endswith((".000", ":0000")):
            continue
        if not last.endswith((".255", ":FFFF")):
            continue

        address_range = "%s-%s" % (first, last)

        if block_type in ("DS", "DA", "IU"):
            prefixes.writerow((org_handle, address_range))
        elif block_type in erx_table:
            erx.writerow((erx_table[block_type], address_range))
# Handler for each kind of top-level record we extract data from;
# records with any other tag are parsed and then simply discarded.
dispatch = {
    tag_asn: do_asn,
    tag_net: do_net,
}

# Output writers.  do_asn() and do_net() refer to these module-level names.
asns = csv_writer("asns.csv")
prefixes = csv_writer("prefixes.csv")
erx = csv_writer("erx.csv")

root = None

# Stream the document from stdin.  iterparse() is called without an
# events argument, so it reports elements as they are completed.
for event, node in lxml.etree.iterparse(sys.stdin):

    # On the first element we see, climb the parent chain to locate the
    # document root so that its direct children can be recognized later.
    if root is None:
        root = node
        while root.getparent() is not None:
            root = root.getparent()

    # Anything that is not a direct child of the root is part of a
    # record still being assembled; leave it intact for now.
    if node.getparent() is not root:
        continue

    handler = dispatch.get(node.tag)
    if handler is not None:
        handler(node)

    # The record has been dealt with: empty it and drop the siblings
    # that precede it under the root, so the in-memory tree stays small
    # no matter how large the input is.
    node.clear()
    while node.getprevious() is not None:
        del node.getparent()[0]

for writer in (asns, prefixes, erx):
    writer.close()
|