Match Fields between two files, print portions of each file together when matched in ([g]awk)'

I've written an awk script to compare two fields in two different files and then print portions of each file on the same line when matched. It works reasonably well, but every now and again, I notice some errors and cannot seem to figure out what the issue may be and am turning to you for help.

My ultimate goal is to do this: If $1 and $2 of file2 match $1 and $2 of file1 I would like to print $1, $2, and $3 of file2 followed by fields $3 through $23 (making them $4 through $24 in the output) of file1 on the same line. Else, if $2 of file2 has a "?", then I would like to print $1, $2, and $3 of file2 followed by "-" in $4-$24 of the output. My two input files look like this:

File1

PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1
PS009,001 H                   0   0 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   0   0  -1      -1      -1      -1   -1   -1    -1
PS009,001 NYX                 0   1  7  1  1  1 -1    6  0  1  2     2   2   5   2      -1      -1      -1    0  521     0
           * 0 1 100 2 100 0 0 .N 0 LineNr 1 ClauseNr 1: 1: 1: 200: 0 0 SentenceNr 1 TxtType: ? Pargr: 1 ClType:NmCl
PS009,001 <LMWT               0   2 -1 -1 -1  1 -1   -1 -1  1  1     2   2   2   1      -1      -1      -1    0  521     0
PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1
PS009,001 H                   0   0 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   0   0  -1      -1      -1      -1   -1   -1    -1
PS009,001 BN                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     2   2   5   2      -1      -1      -1  -11  582     0
           * 0 -1 100 0 0 .. 2 LineNr 2 ClauseNr 1: 1: 1: 200: 0 0 SentenceNr 2 TxtType: ? Pargr: 1 ClType:NmCl
PS009,001 MZMWR               0   2 -1 -1 -1  1 -1   -1 -1  1  2     2   2   2   1      -1      -1      -1    0  521     0
PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1
PS009,001 DWD                 0   3 -1 -1 -1  1 -1   -1 -1  1  2     2   3   5   2      -1      -1      -1  -11  582     0
           * 0 -2 100 0 0 .. 1 LineNr 3 ClauseNr 1: 1: 1: 200: 0 0 SentenceNr 3 TxtType: ? Pargr: 1 ClType:NmCl
PS009,002 JDH                 0   1  4  2  1 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0
PS009,002 JHWH                0   3 -1 -1 -1  1 -1   -1 -1  1  2     2   3   3   2      -1      -1      -1    0  503     0
PS009,002 B                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1
PS009,002 KL                 -6   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1       2      -1      -1   -1   -1    -1
PS009,002 LB                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1      -1      -1      -1   -1   -1    -1
PS009,002 J                  -1   7 -1 -1 -1 -1 -1   -1  1  1 -1    -1   7   5   2  -20102      -1      -1    0  505     0
           * 0 1 200 18 411 27 131 33 131 0 0 .N 0 LineNr 4 ClauseNr 1: 1: 3: 101: 0 0 SentenceNr 1 TxtType: Q Pargr: 2 ClType:ZYq0
PS009,002 SPR                 0   1  4  1 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0
PS009,002 KL                 -6   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1       2      -1      -1   -1   -1    -1
PS009,002 PL>                 0   1  0  3  1 12 -1    6  0  3  1     1   2   0  -1      -1      -1      -1   -1   -1    -1
PS009,002 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   2   2  -20102      -1      -1    0  503     0
           * 0 -1 200 1 200 0 0 .. 4 LineNr 5 ClauseNr 1: 1: 2: 101: 0 0 SentenceNr 2 TxtType: Q Pargr: 2 ClType:ZYq0
PS009,003 FMX                 0   1  4  0 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0
           * 0 -1 200 1 201 2 200 0 0 .. 4 LineNr 6 ClauseNr 1: 1: 1: 101: 0 0 SentenceNr 3 TxtType: Q Pargr: 2 ClType:ZYq0
PS009,003 W                   0   6 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   6   6  -1      -1      -1      -1    0  509     0
PS009,003 <LY                 0   1  4  0 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0
PS009,003 B                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1
PS009,003 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   5   2      -1      -1      -1    0  504     0
           * 0 -1 201 0 0 .. 5 LineNr 7 ClauseNr 1: 1: 3: 151: 0 0 SentenceNr 4 TxtType: Q Pargr: 2 ClType:WYq0
PS009,003 ZMR                 0   1  4  1 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0
PS009,003 CM                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1      -1      -1      -1   -1   -1    -1
PS009,003 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   2   2      -1      -1      -1    0  503     0

File2

PS009,001 L          ?         
PS009,001 H          ?         
PS009,001 NYX        ?         
PS009,001 <LMWT      ?         
PS009,001 L          ?         
PS009,001 H          ?         
PS009,001 BN         ?         
PS009,001 MZMWR      ?         
PS009,001 L          ?         
PS009,001 DWD        ?         
PS009,002 JDH        JDJ       
PS009,002 ?          L         
PS009,002 JHWH       MRJ>      
PS009,002 B          MN        
PS009,002 KL         KL        
PS009,002 ?          H         
PS009,002 LB         LB        
PS009,002 J          J         
PS009,002 ?          W         
PS009,002 SPR        C<J       
PS009,002 KL         KL        
PS009,002 ?          HJN       
PS009,002 PL>        TDMWR>    
PS009,002 K          K         
PS009,003 FMX        XDJ       
PS009,003 W          W         
PS009,003 <LY        RWZ=      
PS009,003 B          B         
PS009,003 K          K         
PS009,003 ?          W         
PS009,003 ZMR        ZMR==     
PS009,003 ?          L         
PS009,003 CM         CM        
PS009,003 K          K       

The awk script that I have been using is:

example$ awk -f script.awk file1 file2

Where 'script.awk' is:

FNR==NR {
         q=$2
         $1=$2=""
         A[q]=$0
         next
}
{
if($2 !~/\?/ && $1 && $2 in A)
         print $0,A[$2]
else if($2 ~/\?/)
         print $0, " - - - - - - - - - - - - - - - - - - - - - "
}

My output with the above script is:

PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 H          ?            0 0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 NYX        ?            0 1 7 1 1 1 -1 6 0 1 2 2 2 5 2 -1 -1 -1 0 521 0
PS009,001 <LMWT      ?            0 2 -1 -1 -1 1 -1 -1 -1 1 1 2 2 2 1 -1 -1 -1 0 521 0
PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 H          ?            0 0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 BN         ?            0 2 -1 -1 -1 1 -1 -1 -1 1 2 2 2 5 2 -1 -1 -1 -11 582 0
PS009,001 MZMWR      ?            0 2 -1 -1 -1 1 -1 -1 -1 1 2 2 2 2 1 -1 -1 -1 0 521 0
PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 DWD        ?            0 3 -1 -1 -1 1 -1 -1 -1 1 2 2 3 5 2 -1 -1 -1 -11 582 0
PS009,002 JDH        JDJ          0 1 4 2 1 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,002 ?          L           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 JHWH       MRJ>         0 3 -1 -1 -1 1 -1 -1 -1 1 2 2 3 3 2 -1 -1 -1 0 562 0
PS009,002 B          MN           0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 KL         KL           -6 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 2 -1 -1 -1 -1 -1
PS009,002 ?          H           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 LB         LB           0 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 J          J            -1 7 -1 -1 -1 -1 -1 -1 1 1 -1 -1 7 7 2 -1 -1 -1 0 503 0
PS009,002 ?          W           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 SPR        C<J          0 1 4 1 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,002 KL         KL           -6 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 2 -1 -1 -1 -1 -1
PS009,002 ?          HJN         - - - - - - - - - - - - - - - - - - - - - 
PS009,002 PL>        TDMWR>       0 1 0 3 1 12 -1 6 0 3 1 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 K          K            -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 5 2 -1 -1 -1 0 505 0
PS009,003 FMX        XDJ          0 1 4 0 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 W          W            0 6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 6 6 -1 -1 -1 -1 0 509 0
PS009,003 <LY        RWZ=         0 1 4 0 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 B          B            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,003 K          K            -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 5 2 -1 -1 -1 0 505 0
PS009,003 ?          W           - - - - - - - - - - - - - - - - - - - - - 
PS009,003 ZMR        ZMR==        0 1 1 1 7 -1 -1 3 2 3 2 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 ?          L           - - - - - - - - - - - - - - - - - - - - - 
PS009,003 CM         CM           0 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,003 K          K            -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 5 2 -1 -1 -1 0 505 0

The output is mostly correct, but for instance on the next to last field on the line beginning PS009,002 JHWH (line 13) should be 503 as per file1 and in another case the line beginning PS009,003 ZMR (line 31) fields $4 until the end of the line are all incorrect.

However, if I change the order of the input files:

example$ awk -f script.awk file2 file1

I seem to get most of the data correct but the fields are out of order and naturally the "-" for when $3 in file2 is a "?" do not print.

Output:

PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1   ?
PS009,001 H                   0   0 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   0   0  -1      -1      -1      -1   -1   -1    -1   ?
PS009,001 NYX                 0   1  7  1  1  1 -1    6  0  1  2     2   2   5   2      -1      -1      -1    0  521     0   ?
PS009,001 <LMWT               0   2 -1 -1 -1  1 -1   -1 -1  1  1     2   2   2   1      -1      -1      -1    0  521     0   ?
PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1   ?
PS009,001 H                   0   0 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   0   0  -1      -1      -1      -1   -1   -1    -1   ?
PS009,001 BN                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     2   2   5   2      -1      -1      -1  -11  582     0   ?
PS009,001 MZMWR               0   2 -1 -1 -1  1 -1   -1 -1  1  2     2   2   2   1      -1      -1      -1    0  521     0   ?
PS009,001 L                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1   ?
PS009,001 DWD                 0   3 -1 -1 -1  1 -1   -1 -1  1  2     2   3   5   2      -1      -1      -1  -11  582     0   ?
PS009,002 JDH                 0   1  4  2  1 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0   JDJ
PS009,002 JHWH                0   3 -1 -1 -1  1 -1   -1 -1  1  2     2   3   3   2      -1      -1      -1    0  503     0   ?
PS009,002 B                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1   B
PS009,002 KL                 -6   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1       2      -1      -1   -1   -1    -1   KL
PS009,002 LB                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1      -1      -1      -1   -1   -1    -1   LB
PS009,002 J                  -1   7 -1 -1 -1 -1 -1   -1  1  1 -1    -1   7   5   2  -20102      -1      -1    0  505     0   NJ
PS009,002 SPR                 0   1  4  1 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0   C<J
PS009,002 KL                 -6   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1       2      -1      -1   -1   -1    -1   KL
PS009,002 PL>                 0   1  0  3  1 12 -1    6  0  3  1     1   2   0  -1      -1      -1      -1   -1   -1    -1   TDMWR>
PS009,002 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   2   2  -20102      -1      -1    0  503     0   K
PS009,003 FMX                 0   1  4  0 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0   XDJ
PS009,003 W                   0   6 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   6   6  -1      -1      -1      -1    0  509     0   W
PS009,003 <LY                 0   1  4  0 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0   RWZ=
PS009,003 B                   0   5 -1 -1 -1 -1 -1   -1 -1 -1 -1    -1   5   0  -1      -1      -1      -1   -1   -1    -1   B
PS009,003 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   5   2      -1      -1      -1    0  504     0   K
PS009,003 ZMR                 0   1  4  1 18 -1 -1    1  1  1  0    -1   1   1  -1      -1      -1      -1    0  501     0   ZMR==
PS009,003 CM                  0   2 -1 -1 -1  1 -1   -1 -1  1  2     1   2   0  -1      -1      -1      -1   -1   -1    -1   CM
PS009,003 K                  -1   7 -1 -1 -1 -1 -1   -1  2  1  2    -1   7   2   2      -1      -1      -1    0  503     0   K

Hopefully this makes sense and I really appreciate that I can turn to this forum when I am stuck.

Perhaps these modifications make it more what you are looking for:

FNR==NR {
         q=$1 SUBSEP $2
         $1=$2=""
         A[q]=$0
         next
}
{
if($2 !~/\?/ && ($1,$2) in A)
         print $0,A[$1,$2]
else if($2 ~/\?/)
         print $0, " - - - - - - - - - - - - - - - - - - - - - "
}

This is using both field 1 and 2 instead of only field 2.

It produces:

PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 H          ?            0 0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 NYX        ?            0 1 7 1 1 1 -1 6 0 1 2 2 2 5 2 -1 -1 -1 0 521 0
PS009,001 <LMWT      ?            0 2 -1 -1 -1 1 -1 -1 -1 1 1 2 2 2 1 -1 -1 -1 0 521 0
PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 H          ?            0 0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 BN         ?            0 2 -1 -1 -1 1 -1 -1 -1 1 2 2 2 5 2 -1 -1 -1 -11 582 0
PS009,001 MZMWR      ?            0 2 -1 -1 -1 1 -1 -1 -1 1 2 2 2 2 1 -1 -1 -1 0 521 0
PS009,001 L          ?            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,001 DWD        ?            0 3 -1 -1 -1 1 -1 -1 -1 1 2 2 3 5 2 -1 -1 -1 -11 582 0
PS009,002 JDH        JDJ          0 1 4 2 1 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,002 ?          L           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 JHWH       MRJ>         0 3 -1 -1 -1 1 -1 -1 -1 1 2 2 3 3 2 -1 -1 -1 0 503 0
PS009,002 B          MN           0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 KL         KL           -6 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 2 -1 -1 -1 -1 -1
PS009,002 ?          H           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 LB         LB           0 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 J          J            -1 7 -1 -1 -1 -1 -1 -1 1 1 -1 -1 7 5 2 -20102 -1 -1 0 505 0
PS009,002 ?          W           - - - - - - - - - - - - - - - - - - - - - 
PS009,002 SPR        C<J          0 1 4 1 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,002 KL         KL           -6 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 2 -1 -1 -1 -1 -1
PS009,002 ?          HJN         - - - - - - - - - - - - - - - - - - - - - 
PS009,002 PL>        TDMWR>       0 1 0 3 1 12 -1 6 0 3 1 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,002 K          K            -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 2 2 -20102 -1 -1 0 503 0
PS009,003 FMX        XDJ          0 1 4 0 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 W          W            0 6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 6 6 -1 -1 -1 -1 0 509 0
PS009,003 <LY        RWZ=         0 1 4 0 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 B          B            0 5 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 5 0 -1 -1 -1 -1 -1 -1 -1
PS009,003 K          K            -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 2 2 -1 -1 -1 0 503 0
PS009,003 ?          W           - - - - - - - - - - - - - - - - - - - - - 
PS009,003 ZMR        ZMR==        0 1 4 1 18 -1 -1 1 1 1 0 -1 1 1 -1 -1 -1 -1 0 501 0
PS009,003 ?          L           - - - - - - - - - - - - - - - - - - - - - 
PS009,003 CM         CM           0 2 -1 -1 -1 1 -1 -1 -1 1 2 1 2 0 -1 -1 -1 -1 -1 -1 -1
PS009,003 K          K   -1 7 -1 -1 -1 -1 -1 -1 2 1 2 -1 7 2 2 -1 -1 -1 0 503 0
1 Like

Thank you so very much!!