Ok, back to the drawing board.
Instead of building arrays or hashes and comparing them, this Perl program relies on the input being sorted on the key fields. It makes a single pass through the data stream, printing the differing pairs and the singletons as it goes.
$cmp and $key/$prevkey are the main variables upon which the logic is built.
$
$ cat data.txt
KB0005 1019 T IFVATVPVI 0.691 PKC YES
KB0005 1036 T YFLQTSQQL 0.785 PKC YES
KB0005 1037 S FLQTSQQLK 0.585 DNAPK YES
KB0005 1045 S KQLESEGRS 0.669 PKC YES
KB0005 1045 S KQLESEGRS 0.880 unsp YES
KB204320 1019 T IFVATVPVI 0.699 PKC YES
KB204320 1036 T YFLQTSQQL 0.789 PKC YES
KB204320 1037 S FLQTSQQLK 0.589 DNAPK YES
KB204320 1045 S KQLESEGRS 0.880 unsp YES
$
$ cat testscr1.pl
#!/usr/bin/perl
use strict;
use warnings;

# testscr1.pl - report differing pairs and unpaired lines in a sorted stream.
#
# Input (files named on the command line, or STDIN): lines of at least seven
# whitespace-separated fields.  Field 1 is an identifier, field 5 is a number,
# and fields 2, 3, 4, 6 and 7 together form the comparison key.
#
# The input MUST already be sorted on the key fields, because each line is
# only ever compared with the line immediately before it.
#
# Output (STDOUT):
#   * two consecutive lines with the same key and different numbers are both
#     printed, labelled [DIFF = <previous number - current number>];
#   * two consecutive lines with the same key and equal numbers print nothing;
#   * a line whose key matches neither neighbour is printed, labelled
#     [NO_REPETITION].
# Blank lines are ignored; lines with fewer than seven fields are skipped
# with a warning on STDERR.

# Render one record (array ref of fields) with the given bracketed label.
# The label is inserted while formatting rather than substituted into an
# already-built string, so an identifier that happens to contain the
# placeholder text can never be rewritten by mistake.
sub format_record {
    my ($label, $fields) = @_;
    return sprintf "%-10s [%s] => %s %s %s %s %s %s\n",
        $fields->[0], $label, @{$fields}[ 1 .. 6 ];
}

my $prevkey;     # key of the previous accepted line; undef before the first
my $prev;        # fields of the previous accepted line (array ref)

# $cmp == 1 means "start of comparison": the previous line has not been
# paired yet and must be compared with the next one.  $cmp == 0 means "end of
# comparison": the previous line was already consumed as part of a pair (or
# no line has been read yet).
my $cmp = 0;

while (my $line = <>) {
    chomp $line;
    my @x = split ' ', $line;

    next if !@x;    # blank line
    if (@x < 7) {
        warn "line $.: expected at least 7 fields, got "
            . scalar(@x) . "; skipped\n";
        next;
    }

    # Fields never contain whitespace after split, so a tab separator makes
    # the key unambiguous even when a field itself contains ':'.
    my $key = join "\t", @x[ 1, 2, 3, 5, 6 ];

    if (defined $prevkey && $key eq $prevkey) {
        # We found a pair; report it only when the numbers differ.
        $cmp = 0;
        if ($prev->[4] != $x[4]) {
            my $label = sprintf 'DIFF = %6.3f', $prev->[4] - $x[4];
            print STDOUT format_record($label, $prev),
                         format_record($label, \@x);
        }
    }
    else {
        # Key changed (or this is the first line).  If the previous line was
        # still waiting for a partner, it is a singleton.
        print STDOUT format_record('NO_REPETITION', $prev) if $cmp;
        $cmp = 1;
    }

    $prevkey = $key;
    $prev    = \@x;
}

# The last line has no successor; flush it if it never found a partner.
print STDOUT format_record('NO_REPETITION', $prev) if $cmp;
$
$ # Sorted input is absolutely essential for this perl program
$ # In the data below, all lines except line # 7 occur in pairs
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt
KB0005 1019 T IFVATVPVI 0.691 PKC YES
KB204320 1019 T IFVATVPVI 0.699 PKC YES
KB0005 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 1037 S FLQTSQQLK 0.589 DNAPK YES
KB0005 1045 S KQLESEGRS 0.669 PKC YES
KB0005 1045 S KQLESEGRS 0.880 unsp YES
KB204320 1045 S KQLESEGRS 0.880 unsp YES
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt | perl testscr1.pl
KB0005 [DIFF = -0.008] => 1019 T IFVATVPVI 0.691 PKC YES
KB204320 [DIFF = -0.008] => 1019 T IFVATVPVI 0.699 PKC YES
KB0005 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.589 DNAPK YES
KB0005 [NO_REPETITION] => 1045 S KQLESEGRS 0.669 PKC YES
$
$ # All lines except the last two occur in pairs
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt | sed -n 1,8p
KB0005 1019 T IFVATVPVI 0.691 PKC YES
KB204320 1019 T IFVATVPVI 0.699 PKC YES
KB0005 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 1037 S FLQTSQQLK 0.589 DNAPK YES
KB0005 1045 S KQLESEGRS 0.669 PKC YES
KB0005 1045 S KQLESEGRS 0.880 unsp YES
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt | sed -n 1,8p | perl testscr1.pl
KB0005 [DIFF = -0.008] => 1019 T IFVATVPVI 0.691 PKC YES
KB204320 [DIFF = -0.008] => 1019 T IFVATVPVI 0.699 PKC YES
KB0005 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.589 DNAPK YES
KB0005 [NO_REPETITION] => 1045 S KQLESEGRS 0.669 PKC YES
KB0005 [NO_REPETITION] => 1045 S KQLESEGRS 0.880 unsp YES
$
$ # No line is repeated
$
$ sed -n 1,5p data.txt
KB0005 1019 T IFVATVPVI 0.691 PKC YES
KB0005 1036 T YFLQTSQQL 0.785 PKC YES
KB0005 1037 S FLQTSQQLK 0.585 DNAPK YES
KB0005 1045 S KQLESEGRS 0.669 PKC YES
KB0005 1045 S KQLESEGRS 0.880 unsp YES
$
$ sed -n 1,5p data.txt | perl testscr1.pl
KB0005 [NO_REPETITION] => 1019 T IFVATVPVI 0.691 PKC YES
KB0005 [NO_REPETITION] => 1036 T YFLQTSQQL 0.785 PKC YES
KB0005 [NO_REPETITION] => 1037 S FLQTSQQLK 0.585 DNAPK YES
KB0005 [NO_REPETITION] => 1045 S KQLESEGRS 0.669 PKC YES
KB0005 [NO_REPETITION] => 1045 S KQLESEGRS 0.880 unsp YES
$
$ # Three pairs of lines; no single-occurring line
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt | sed -n 1,6p
KB0005 1019 T IFVATVPVI 0.691 PKC YES
KB204320 1019 T IFVATVPVI 0.699 PKC YES
KB0005 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 1037 S FLQTSQQLK 0.589 DNAPK YES
$
$ sort -k2,2 -k3,3 -k4,4 -k6,6 -k7,7 data.txt | sed -n 1,6p | perl testscr1.pl
KB0005 [DIFF = -0.008] => 1019 T IFVATVPVI 0.691 PKC YES
KB204320 [DIFF = -0.008] => 1019 T IFVATVPVI 0.699 PKC YES
KB0005 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.785 PKC YES
KB204320 [DIFF = -0.004] => 1036 T YFLQTSQQL 0.789 PKC YES
KB0005 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.585 DNAPK YES
KB204320 [DIFF = -0.004] => 1037 S FLQTSQQLK 0.589 DNAPK YES
$
$ # Only one line
$
$ head -1 data.txt
KB0005 1019 T IFVATVPVI 0.691 PKC YES
$
$ head -1 data.txt | perl testscr1.pl
KB0005 [NO_REPETITION] => 1019 T IFVATVPVI 0.691 PKC YES
$
$
HTH,
tyler_durden