RPSL line continuation is not my friend

svn path=/scripts/ripe-to-csv.awk; revision=3496
author: Rob Austein <sra@hactrn.net> 2010-10-26 22:56:34 +0000
committer: Rob Austein <sra@hactrn.net> 2010-10-26 22:56:34 +0000
commit: 1672b574f9b0b3cb2ee8c3b8a947ae2f411c4904 (patch)
tree: 8128c068b0211cff94f7ba0208e046360f806875
parent: 7c2f0aa4ff0b101484458e0c1296df6b694bac1b (diff)
1 files changed, 39 insertions, 26 deletions
diff --git a/scripts/ripe-to-csv.awk b/scripts/ripe-to-csv.awk
index bebcb64a..a7073c38 100644
--- a/scripts/ripe-to-csv.awk
+++ b/scripts/ripe-to-csv.awk
@@ -18,13 +18,6 @@
 # fairly simple stream parser that has to process a ridiculous amount
 # of text.  AWK turns out to be significantly faster for this.
 #
-# There are a few known screw cases in RPSL format that this script
-# doesn't attempt to handle, so if you just can't resist using
-# newlines between the begin and end addresses of an IPv4 address
-# range, this script will not understand your WHOIS entry.  So don't.
-#
-# Feh.
-#
 # NB: The input data for this script is publicly available via FTP, but
 # you'll have to fetch the data from RIPE yourself, and be sure to see
 # the terms and conditions referenced by the data file header comments.
@@ -52,52 +45,72 @@ BEGIN {
     OFS = "\t";
 }
 
-# Clean up comments and trailing whitespace; skip lines that are empty
-# after cleanup.  If we were attempting to handle line continuation,
-# this is where we'd start.
-!/^$/ {
-    sub(/#.*$/, "");
+# Clean up trailing whitespace.
+{
     sub(/[ \t]+$/, "");
-    if (!NF)
-	next;
 }
 
-# Non-empty line and we have no tag, must be start of a new block.
+# Continuation line: strip comment, if any, then append value, if any,
+# to what we had from previous line(s).
+/^[^A-Z]/ {
+    sub(/[ \t]*#.*$/, "");
+    if (NF)
+	val = val $0;
+    next;
+}
+
+# Anything other than line continuation terminates the previous line,
+# so if we were working on a line, we're done with it now, process it.
+key {
+    do_line();
+}
+
+# Non-empty line and we have no tag, this must be start of a new block.
 NF && !tag {
     tag = $1;
 }
 
 # One of the tags we care about, clean up and save the data.
 /^(AS-NAME|AUT-NUM|INET6NUM|INETNUM|MNT-BY|NETNAME|STATUS):/ {
-    t = $1;
+    key = $1;
     sub(/^[^ \t]+:/, "");
-    gsub(/[ \t]/, "");
-    tags[t] = $0;
+    sub(/[ \t]*#.*$/, "");
+    val = $0;
 }
 
 # Blank line and we have something, process it.
 !NF && tag {
-    got_one();
+    do_block();
 }
 
-# End of file, process last entry, if any.
+# End of file, process final data, if any.
 END {
-    got_one();
+    do_line();
+    do_block();
+}
+
+# Handle one logical line, after dealing with icky RPSL line continuation.
+function do_line() {
+    gsub(/[ \t]/, "", val);
+    if (key && val)
+	tags[key] = val;
+    key = "";
+    val = "";
 }
 
 # Dispatch to handle known block types, then clean up so we can start
 # a new block.
-function got_one() {
+function do_block() {
     if (tag == "INETNUM" || tag == "INET6NUM")
-	got_inetnum();
+	do_prefix();
     else if (tag == "AUT-NUM")
-	got_aut_num();
+	do_asn();
     delete tags;
     tag = "";
 }
 
 # Handle an AUT-NUM block: extract the ASN, use MNT-BY as the handle.
-function got_aut_num() {
+function do_asn() {
     sub(/^AS/, "", tags[tag]);
     if (tags["MNT-BY"] && tags[tag])
 	print tags["MNT-BY"], tags[tag] >"asns.csv";
@@ -105,7 +118,7 @@ function got_aut_num() {
 
 # Handle an INETNUM or INET6NUM block: check for the status values we
 # care about, use NETNAME as the handle.
-function got_inetnum() {
+function do_prefix() {
     if (tags["STATUS"] ~ /^ASSIGNED(P[AI])?$/ && tags["NETNAME"] && tags[tag])
 	print tags["NETNAME"], tags[tag] >"prefixes.csv";
 }
author	Rob Austein <sra@hactrn.net>	2010-10-26 22:56:34 +0000
committer	Rob Austein <sra@hactrn.net>	2010-10-26 22:56:34 +0000
commit	1672b574f9b0b3cb2ee8c3b8a947ae2f411c4904 (patch)
tree	8128c068b0211cff94f7ba0208e046360f806875
parent	7c2f0aa4ff0b101484458e0c1296df6b694bac1b (diff)