diff options
Diffstat (limited to 'potpourri/whack-ripe-prefixes.py')
-rw-r--r-- | potpourri/whack-ripe-prefixes.py | 101 |
1 files changed, 101 insertions, 0 deletions
# $Id$
#
# Copyright (C) 2010 Internet Systems Consortium ("ISC")
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

"""
Fix problems in prefixes.csv generated from RIPE's database.

RIPE's database contains inconsistencies, overlaps, and format errors
that make it impossible to feed the output of ripe-to-csv.awk directly
into testbed-rootcert.py without OpenSSL rejecting the resulting
root.conf. This script takes a brute force approach to fixing this:
it converts all prefixes and address ranges into pairs of unsigned
decimal integers representing range min and range max, runs the
resulting 3+ million entry file through the unix sort program to put
the data into canonical order, then reads it back, merging overlaps
and converting everything back to ranges of IP addresses, and writing
the result in a form acceptable to testbed-rootcert.py.

Since we're doing all this anyway, the script also merges adjacent
address blocks, not because OpenSSL can't handle them (it can) but
because doing so cuts out a lot of unnecessary I/O.

Ordinarily, it would be dangerous to have the same program act as both
the source and sink of a pipeline, particularly for such a large data
set, as the chance of deadlock would approach 100%, but in this case
we know that the sort program must consume and buffer (somehow) all of
its input before writing a single line of output, so a single script
can safely act as a filter both before and after sort.
"""

import subprocess
import sys


def to_range(prefix):
    """
    Convert one textual prefix or address range to integer bounds.

    prefix is either "first-last" (two addresses separated by a hyphen)
    or "address/length".  Returns a (range_min, range_max) tuple of
    plain integers.  Raises ValueError if prefix cannot be split into
    exactly two parts; other parsing errors come from rpki.ipaddrs.
    """

    # Imported here rather than at module level so that merge_ranges()
    # stays importable on systems without the rpki package.
    import rpki.ipaddrs

    if "-" in prefix:
        first, last = prefix.split("-")
        # int() strips the address subclass so "%d" formats the number
        # rather than the dotted/colon text form.
        return int(rpki.ipaddrs.parse(first)), int(rpki.ipaddrs.parse(last))

    base, length = prefix.split("/")
    base = rpki.ipaddrs.parse(base)
    # Host mask: all bits to the right of the prefix length set to one.
    mask = (1 << (base.bits - int(length))) - 1
    return int(base & ~mask), int(base | mask)


def merge_ranges(ranges):
    """
    Merge overlapping and adjacent integer ranges.

    ranges is an iterable of (range_min, range_max) pairs ordered by
    range_min.  Yields (range_min, range_max) pairs in which no two
    consecutive results overlap or touch.
    """

    prev_min = None
    prev_max = None

    for this_min, this_max in ranges:
        # Compare against None explicitly: zero is a perfectly valid
        # address, and a truthiness test would silently discard any
        # range that starts at address zero.
        if prev_min is not None and prev_max + 1 >= this_min:
            prev_min = min(prev_min, this_min)
            prev_max = max(prev_max, this_max)
        else:
            if prev_min is not None:
                yield prev_min, prev_max
            prev_min = this_min
            prev_max = this_max

    if prev_min is not None:
        yield prev_min, prev_max


def address(number):
    """
    Convert an integer back into an rpki.ipaddrs address object.

    Anything that does not fit in 32 bits is treated as IPv6, everything
    else as IPv4.  The integer form carries no address family, so an
    IPv6 address numerically below 2**32 would be rendered as IPv4.
    """

    import rpki.ipaddrs

    if number > 0xffffffff:
        return rpki.ipaddrs.v6addr(number)
    return rpki.ipaddrs.v4addr(number)


def format_range(range_min, range_max):
    """
    Return one output line for a merged range, in the tab-separated
    "x<TAB>first-last" form this script has always written.
    """

    return "x\t%s-%s\n" % (address(range_min), address(range_max))


def main():
    """
    Filter stdin to stdout: parse "handle prefix" lines, sort the
    resulting integer ranges with the external sort program, merge
    them, and write them back out as address ranges.

    Returns the exit status of the sort subprocess.
    """

    # universal_newlines gives text-mode pipes, so str can be written
    # and read on Python 3 as well as Python 2.
    sorter = subprocess.Popen(("sort", "-T.", "-n"),
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)

    for line in sys.stdin:
        # The handle column is ignored; unpacking still enforces that
        # each line has exactly two whitespace-separated fields.
        _handle, prefix = line.split()
        sorter.stdin.write("%d %d\n" % to_range(prefix))

    # Closing stdin is what lets sort start producing output.
    sorter.stdin.close()

    def sorted_ranges():
        for sorted_line in sorter.stdout:
            low, high = sorted_line.split()
            yield int(low), int(high)

    for range_min, range_max in merge_ranges(sorted_ranges()):
        sys.stdout.write(format_range(range_min, range_max))

    sorter.stdout.close()

    return sorter.wait()


if __name__ == "__main__":
    sys.exit(main())