potpourri/whack-ripe-prefixes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

# $Id$
# 
# Copyright (C) 2010  Internet Systems Consortium ("ISC")
# 
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# 
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
Fix problems in prefixes.csv generated from RIPE's database.

RIPE's database contains inconsistancies, overlaps, and format errors
that make it impossible to feed the output of ripe-to-csv.awk directly
into testbed-rootcert.py without OpenSSL rejecting the resulting
root.conf.  This script takes a brute force approach to fixing this:
it converts all prefixes and address ranges into pairs of unsigned
decimal integers representing range min and range max, runs the
resulting 3+ million entry file through the unix sort program to put
the data into canonical order, then reads it back, merging overlaps
and converting everything back to ranges of IP addresses, and writing
the result in a form acceptable to testbed-rootcert.py.

Since we're doing all this anyway, the script also merges adjacent
address blocks, not because OpenSSL can't handle them (it can) but
because doing so cuts out a lot of unnecessary I/O.

Ordinarily, it would be dangerous to have the same program act as both
the source and sink of a pipeline, particularly for such a large data
set, as the chance of deadlock would approach 100%, but in this case
we know that the sort program must consume and buffer (somehow) all of
its input before writing a single line of output, so a single script
can safely act as a filter both before and after sort.
"""

import sys
import subprocess
import rpki.ipaddrs

sorter = subprocess.Popen(("sort", "-T.", "-n"),
                          stdin = subprocess.PIPE,
                          stdout = subprocess.PIPE)

for line in sys.stdin:
  handle, prefix = line.split()

  if "-" in prefix:
    range_min, range_max = prefix.split("-")
    range_min = rpki.ipaddrs.parse(range_min)
    range_max = rpki.ipaddrs.parse(range_max)

  else:
    address, length = prefix.split("/")
    address = rpki.ipaddrs.parse(address)
    mask = (1L << (address.bits - int(length))) - 1
    range_min = address & ~mask
    range_max = address |  mask

  sorter.stdin.write("%d %d\n" % (long(range_min), long(range_max)))

sorter.stdin.close()

prev_min = None
prev_max = None

def address(number):
  if number > 0xffffffff:
    return rpki.ipaddrs.v6addr(number)
  else:
    return rpki.ipaddrs.v4addr(number)

def show():
  if prev_min and prev_max:
    sys.stdout.write("x\t%s-%s\n" % (address(prev_min), address(prev_max)))

for line in sorter.stdout:
  this_min, this_max = line.split()
  this_min = long(this_min)
  this_max = long(this_max)

  if prev_min and prev_max and prev_max + 1 >= this_min:
    prev_min = min(prev_min, this_min)
    prev_max = max(prev_max, this_max)

  else:
    show()
    prev_min = this_min
    prev_max = this_max

show()

sorter.stdout.close()

sys.exit(sorter.wait())