Convert ip ranges to CIDR netblock

2 scripts to convert IP ranges to CIDR notation using awk, gawk or mawk. The scripts are much faster than using ipcalc and will return the same results. The first script is reliably compatible with awk, gawk and mawk but is over 3 times as slow as the second script which is reliably compatible with gawk.

ip2cidr.awk - script 1

# Convert IP ranges to CIDR notation
# awk, gawk, mawk compatible

function range2cidr(ipStart, ipEnd, result, bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = bit_or(ipStart, mask)
        if ((newip > ipEnd) || ((bit_lshift(bit_rshift(ipStart,bits),bits)) != ipStart)) {
            bits--
            mask = bit_rshift(mask,1)
            break
        }
        bits++
        mask = bit_lshift(mask,1)+1
    }
    newip = bit_or(ipStart, mask)
    bits = 32 - bits
    result = (result)?result ORS dec2ip(ipStart) "/" bits : dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd,result)
    return result
}

# convert dotted quads to long decimal ip
# int ip2dec("192.168.0.15")
#
function ip2dec(ip, slice) {
    split(ip, slice, /[.]/)
    return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
# str dec2ip(1171259392)
#
function dec2ip(dec, ip, quad) {
    for (i=3; i>=1; i--) {
        quad = 256^i
        ip = ip int(dec/quad) "."
        dec = dec%quad
    }
    return ip dec
}

# Bitwise OR of var1 and var2
function bit_or(a, b, r, i, c) {
    for (r=i=0;i<32;i++) {
        c = 2 ^ i
        if ((int(a/c) % 2) || (int(b/c) % 2)) r += c
    }
    return r
}

# Rotate bytevalue left x times
function bit_lshift(var, x) {
    while(x--) var*=2;
    return var;
}

# Rotate bytevalue right x times
function bit_rshift(var, x) {
    while(x--) var=int(var/2);
    return var;
}

function sanitize(ip) {
    split(ip, slice, /[.]/)
    return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}

BEGIN{
    FS=" - |-|:"
}

# sanitize ip's
!/^#/ && NF {
    f1 = sanitize($(NF-1))
    f2 = sanitize($NF)
    print range2cidr(ip2dec(f1), ip2dec(f2))
}

END {print ""}

Benchmarks processing file containing approximately 236K IP ranges:

  • mawk - 1m 2s
  • gawk - 1m 49s
  • awk - 2m 35s
  • ipcalc - 15m 32s

ip2cidr.awk - script 2

# Convert IP ranges to CIDR notation
# gawk compatible

function range2cidr(ipStart, ipEnd, result, bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = or(ipStart, mask)
        if ((newip > ipEnd) || ((lshift(rshift(ipStart,bits),bits)) != ipStart)) {
            bits--
            mask = rshift(mask,1)
            break
        }
        bits++
        mask = lshift(mask,1)+1
    }
    newip = or(ipStart, mask)
    bits = 32 - bits
    result = (result)?result ORS dec2ip(ipStart) "/" bits : dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd,result)
    return result
}

# convert dotted quads to long decimal ip
# int ip2dec("192.168.0.15")
#
function ip2dec(ip, slice) {
    split(ip, slice, /[.]/)
    return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
# str dec2ip(1171259392)
#
function dec2ip(dec, ip, quad) {
    for (i=3; i>=1; i--) {
        quad = 256^i
        ip = ip int(dec/quad) "."
        dec = dec%quad
    }
    return ip dec
}

function sanitize(ip) {
    split(ip, slice, /[.]/)
    return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}

BEGIN{
    FS=" - |-|:"
}

# sanitize ip's
!/^#/ && NF {
    f1 = sanitize($(NF-1))
    f2 = sanitize($NF)
    print range2cidr(ip2dec(f1), ip2dec(f2))
}

END {print ""}

Benchmarks processing file containing approximately 236K IP ranges:

  • gawk - 34s

Original script from Convert ip ranges to CIDR netblocks post.

The original script did not accurately convert some IP ranges to proper CIDR notation. Thanks to vgersh99 for the changes to the scripts. They now properly convert IP ranges to CIDR notation.

Example use:

awk -f ip2cidr.awk ipranges.txt > cidr.txt
gawk -f ip2cidr.awk ipranges.txt > cidr.txt
mawk -f ip2cidr.awk ipranges.txt > cidr.txt

Did you search here first?

2 Likes

This thread is a treasure - stealing.
Thanks for sharing

1 Like

After reviewing the suggested post I've come up with the following code:

#!/bin/sh

# Library with various ip manipulation functions
# convert ip ranges to CIDR notation

function range2cidr(ipStart, ipEnd,  bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = or(ipStart, mask)
        if ((newip>ipEnd) || ((lshift(rshift(ipStart,bits),bits)) != ipStart)) {
           bits--
           mask = rshift(mask,1)
           break
        }
        bits++
        mask = lshift(mask,1)+1
    }
    newip = or(ipStart, mask)
    bits = 32 - bits
    result = dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd)
    return result
}

# convert dotted quads to long decimal ip
#	int ip2dec("192.168.0.15")
#
function ip2dec(ip,   slice) {
	split(ip, slice, ".")
	return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
#	str dec2ip(1171259392)
#
function dec2ip(dec,    ip, quad) {
	for (i=3; i>=1; i--) {
		quad = 256^i
		ip = ip int(dec/quad) "."
		dec = dec%quad
	}
	return ip dec
}

function sanitize(ip) {
	split(ip, slice, ".")
	return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}


BEGIN{
	FS=" , | - "
}

# sanitize ip's
{$1 = sanitize($1); $2 = sanitize($2)}

# range with a single IP
$1==$2 {printf "%s\n", $1}

# ranges with multiple IP's
$1!=$2{print range2cidr(ip2dec($1), ip2dec($2))}

# footer
END {print "COMMIT\n"}

The ip address ranges are correctly converted to CIDR and printed to the console. Instead of printing to the console how do I send the information to a file instead? I don't need anything printed to the console. Also I was interested in mawk since might be faster than gawk. mawk doesn't have the lshift rshift etc. Can someone make the code more portable so I can use try mawk.

Thank you

Why do you have #!/bin/sh at the beginning of awk code? It doesn't belong there. Hopefully it's doing nothing right now.

awk -f command inputfile > filename
1 Like

Post #2 in the thread contained the written or() lshift() and rshift() functions if they're missing in the version of awk you're using.

1 Like

This works well. awk -f test.awk temp > done

Thanks I didn't notice that. Added it to my code.

So here is the code I ended up with:

#!/bin/awk -f

# Library with various ip manipulation functions
# convert ip ranges to CIDR notation

function range2cidr(ipStart, ipEnd,  bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = bit_or(ipStart, mask)
        if ((newip>ipEnd) || ((bit_lshift(bit_rshift(ipStart,bits),bits)) != ipStart)) {
           bits--
           mask = bit_rshift(mask,1)
           break
        }
        bits++
        mask = bit_lshift(mask,1)+1
    }
    newip = bit_or(ipStart, mask)
    bits = 32 - bits
    result = dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd)
    return result
}

# convert dotted quads to long decimal ip
#	int ip2dec("192.168.0.15")
#
function ip2dec(ip,   slice) {
	split(ip, slice, ".")
	return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
#	str dec2ip(1171259392)
#
function dec2ip(dec,    ip, quad) {
	for (i=3; i>=1; i--) {
		quad = 256^i
		ip = ip int(dec/quad) "."
		dec = dec%quad
	}
	return ip dec
}

# Bitwise OR of var1 and var2
function bit_or(a, b, r, i, c) {
    for (r=i=0;i<32;i++) {
        c = 2 ^ i
        if ((int(a/c) % 2) || (int(b/c) % 2)) r += c
    }
    return r
}

# Rotate bytevalue left x times
function bit_lshift(var, x) {
  while(x--) var*=2;
  return var;
}

# Rotate bytevalue right x times
function bit_rshift(var, x) {
  while(x--) var=int(var/2);
  return var;
}

function sanitize(ip) {
	split(ip, slice, ".")
	return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}


BEGIN{
	FS=" | - "
}

# sanitize ip's
{$1 = sanitize($1); $2 = sanitize($2)}

{print range2cidr(ip2dec($1), ip2dec($2))}

END {print ""}

Here are some benchmarks processing a file containing ip address ranges. The output file after running the script contained 236,315 lines.

  • ipcacl 15 min
  • mawk 59 sec
  • gawk 1 min 45 sec
  • awk 2 min 46 sec

Edit: Benchmarks were changed since they were wrong
Edit: Changed script since it wasn't appending /32 to a single IP address range
Edit: Changed field separator to be either a space or hyphen. See post #9.

mawk is indeed designed for speed above all else. Occasionally its features are lacking but that can usually be worked around.

1 Like

are you sure you're getting the desired results?
I'm getting:

222.104.193.5/32
222.106.31.112/32
222.106.31.123/32
222.126.13.224/32
222.191.251.186/32

on your sample/modified file:

222.104.193.5  222.104.193.5
222.106.31.112  222.106.31.112
222.106.31.123  222.106.31.123
222.126.13.224  222.126.13.231
222.191.251.186  222.191.251.186

At least 222.126.13.224 222.126.13.231 should convert to 222.126.13.224/29

1 Like

vgersh99 the sample I provided had a hyphen between the IP address ranges and the one you provided has a space for a field separator. Thank you for addressing the issue though, since I also noticed that when a hyphen was used as a field separator for a single IP address range the output wasn't being appended with /32. I've refined the code and removed the bit_or(), bit_lshift () and bit_rshift () functions since they were much slower than using gawk's built in or(), lshift() and rshift() functions. Can't use mawk now but the code below is still faster now using gawk. For some reason I get a segmentation fault with awk.

#!/usr/local/bin/gawk

# Library with various ip manipulation functions
# convert ip ranges to CIDR notation

function range2cidr(ipStart, ipEnd, bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = or(ipStart, mask)
        if ((newip>ipEnd) || ((lshift(rshift(ipStart,bits),bits)) != ipStart)) {
           bits--
           mask = rshift(mask,1)
           break
        }
        bits++
        mask = lshift(mask,1)+1
    }
    newip = or(ipStart, mask)
    bits = 32 - bits
    result = dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd)
    return result
}

# convert dotted quads to long decimal ip
#       int ip2dec("192.168.0.15")
#
function ip2dec(ip, slice) {
        split(ip, slice, ".")
        return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
#       str dec2ip(1171259392)
#
function dec2ip(dec, ip, quad) {
        for (i=3; i>=1; i--) {
                quad = 256^i
                ip = ip int(dec/quad) "."
                dec = dec%quad
        }
        return ip dec
}

function sanitize(ip) {
        split(ip, slice, ".")
        return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}

BEGIN{
        FS=" | - "
}

# sanitize ip's
{$1 = sanitize($1); $2 = sanitize($2)}

{print range2cidr(ip2dec($1), ip2dec($2))}

END {print ""}

Here are benchmarks processing a file containing ip address ranges. The output file after running the script contained 236,315 lines.

  • ipcacl 15 min
  • mawk does not work with this script
  • gawk 27 sec
  • awk segmentation fault

wrt segmentation fault...
What's your OS?
or(), lshift() and rshift() are gawk specific
what does which awk come back with?
What does ls -l $(which awk) come back with

OpenBSD

/usr/bin/awk

-r-xr-xr-x 1 root bin 180464 Oct 11 12:18 /usr/bin/awk

OpenBSD man page for awk: Here
The version of awk that OpenBSD supports or(), lshift() and rshift()

FYI, the list that I've been converting to CIDR notation is from iblock.com. Link

Looks like an old branch of gawk with non-POSIX extensions.
Probably a buggy and/or not so forgiving as gawk...
The BUGS section at the bottom of the man page might give some hints of what can be tweaked to make it run...
Stick with gawk if it's an option, or get a newer version of a native BSD version of awk.

P.S. I'm getting the following timing with gawk under Cygwin with the referenced file:

real    0m12.081s
user    0m10.202s
sys     0m0.967s

with the slightly modified awk code:

function range2cidr(ipStart, ipEnd, bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = or(ipStart, mask)
        if ((newip>ipEnd) || ((lshift(rshift(ipStart,bits),bits)) != ipStart)) {
           bits--
           mask = rshift(mask,1)
           break
        }
        bits++
        mask = lshift(mask,1)+1
    }
    newip = or(ipStart, mask)
    bits = 32 - bits
    result = dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd)
    return result
}

# convert dotted quads to long decimal ip
#       int ip2dec("192.168.0.15")
#
function ip2dec(ip, slice) {
        split(ip, slice, ".")
        return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
#       str dec2ip(1171259392)
#
function dec2ip(dec, ip, quad) {
        for (i=3; i>=1; i--) {
                quad = 256^i
                ip = ip int(dec/quad) "."
                dec = dec%quad
        }
        return ip dec
}

function sanitize(ip) {
        split(ip, slice, ".")
        return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}

BEGIN{
        FS="[- :]"
}

# sanitize ip's
!/^#/ && NF {$1 = sanitize($(NF-1)); $2 = sanitize($NF)}

!/^#/ && NF {print range2cidr(ip2dec($(NF-1)), ip2dec($NF))}

END {print ""}
1 Like

Thank you vgersh99 for the modified script. I'm using an Intel Celeron N2930 on my OpenBSD box so the timings are definitely different. I appreciative the time you've spent looking into this script. Hopfeuly others can find this beneficial in the future.

I wish I would have posted how I downloaded and parsed the downloaded file which was redirected to a temp file before you refined the script. I'm going to post it now to show you what my downloaded file looks like prior to me processing it using the script and gawk.

ftp -V -o - http://list.iblocklist.com/?list=ydxerpxkpcfqjaybcssw&fileformat=p2p&archiveformat=gz | gunzip -c | grep -v '#' | sed '/^$/d' | sed 's/.*://' | sed 's/-/ - /' >> temp

Also the awk version that OpenBSD is currenly using is:
awk version 20110810

The version was obtained using the command awk -V

you can get rid of all the grep-s/sed-s and pipe it directly into the modified script (without any temp file).
Good luck.

1 Like

Thank you vgersh99. I conducted benchmarks with the way I had previously downloaded the file and used my script and compared it to just downloading the file and using your script and had similar timing results. Again thank you for your time.

vgersh99 while doing some tests with your script I noticed it seems to be producing wrong output. I removed the following line from your script and the output then seems to be correct.

!/^#/ && NF {$1 = sanitize($(NF-1)); $2 = sanitize($NF)

I compared the output with your original script and with these 2 lines the output appears to be wrong:

!/^#/ && NF {$1 = sanitize($(NF-1)); $2 = sanitize($NF)}

!/^#/ && NF {print range2cidr(ip2dec($(NF-1)), ip2dec($NF))}

I piped the downloaded file like so:

ftp -V -o - http://list.iblocklist.com/?list=ydxerpxkpcfqjaybcssw&fileformat=p2p&archiveformat=gz | gunzip -c | gawk -f ip2cidr.awk > final

sorry, these 2 lines should read:

!/^#/ && NF {$(NF-1)= sanitize($(NF-1)); $NF = sanitize($NF)}

!/^#/ && NF {print range2cidr(ip2dec($(NF-1)), ip2dec($NF))}

Let me know how it goes...

P.S. actually, to speed things up a bit we can combine both lines into one without making awk rebuilding $0 after each "sanitize":

!/^#/ && NF {
    f1=sanitize($(NF-1))
    f2=sanitize($NF)
    print range2cidr(ip2dec(f1), ip2dec(f2))
}
1 Like

Thanks that fixed it. Overall there is a flaw in the script somewhere though. There are approx 470 IP ranges in the downloaded file that are converted wrong. For example these aren't converted correctly:

1.62.189.215 - 1.62.189.222
4.0.25.146 - 4.0.25.148
4.0.26.14 - 4.0.29.24
24.149.30.0 - 24.149.30.4

The script converts the above IP ranges to this:

1.62.189.222/32
4.0.25.148/32
4.0.29.24/32
24.149.30.4/32

They should have been converted to:

1.62.189.215/32
1.62.189.216/30
1.62.189.220/31
1.62.189.222/32
4.0.25.146/31
4.0.25.148/32
4.0.26.14/31
4.0.26.16/28
4.0.26.32/27
4.0.26.64/26
4.0.26.128/25
4.0.27.0/24
4.0.28.0/24
4.0.29.0/28
4.0.29.16/29
4.0.29.24/32
24.149.30.0/30
24.149.30.4/32

I discovered this issue while using ipcalc which accurately converts the IP ranges. The IP ranges in question convert to several CIDR blocks instead of one. The vast majority of IP address ranges in the file convert to only one CIDR block which are correct.

Anyone interested in fixing the script?

Good catch - thanks for the perseverance!
how about:

function range2cidr(ipStart, ipEnd, result, bits, mask, newip) {
    bits = 1
    mask = 1
    while (bits < 32) {
        newip = or(ipStart, mask)
        if ((newip>ipEnd) || ((lshift(rshift(ipStart,bits),bits)) != ipStart)) {
           bits--
           mask = rshift(mask,1)
           break
        }
        bits++
        mask = lshift(mask,1)+1
    }
    newip = or(ipStart, mask)
    bits = 32 - bits
    result = (result)?result ORS dec2ip(ipStart) "/" bits : dec2ip(ipStart) "/" bits
    if (newip < ipEnd) result = range2cidr(newip + 1, ipEnd,result)
    return result
}

# convert dotted quads to long decimal ip
#       int ip2dec("192.168.0.15")
#
function ip2dec(ip, slice) {
        split(ip, slice, ".")
        return (slice[1] * 2^24) + (slice[2] * 2^16) + (slice[3] * 2^8) + slice[4]
}

# convert decimal long ip to dotted quads
#       str dec2ip(1171259392)
#
function dec2ip(dec, ip, quad) {
        for (i=3; i>=1; i--) {
                quad = 256^i
                ip = ip int(dec/quad) "."
                dec = dec%quad
        }
        return ip dec
}

function sanitize(ip) {
        split(ip, slice, ".")
        return slice[1]/1 "." slice[2]/1 "." slice[3]/1 "." slice[4]/1
}

BEGIN{
        #FS=" |-|:"
        # to match BOTH formats: 'ip ip' AND the one in ydxerpxkpcfqjaybcssw file
        FS="[- :]"
}

# sanitize ip's
!/^#/ && NF {
  f1= sanitize($(NF-1))
  f2= sanitize($NF)
  print range2cidr(ip2dec(f1), ip2dec(f2))
}

END {print ""}
1 Like