Assign zero to strings that don't appear in block, store result in AWK array

Ophiuchus · October 22, 2011, 7:31pm

Hi to all,

I have this input:

<group>                    
<x "2">Group D</x>
<x "3">Group B</x>
<x "1">Group A</x>
</group>                    
<group>                    
<x "1">Group E</x>
<x "0">Group B</x>
<x "1">Group C</x>
</group>                    
<group>                    
<x "3">Group C</x>
<x "2">Group B</x>
<x "7">Group A</x>
</group>

And I would like this output stored in an AWK array.

2|Group D
3|Group B
1|Group A
0|Group C
0|Group E
1|Group E
0|Group B
1|Group C
0|Group A
0|Group B
3|Group C
2|Group B
7|Group A
0|Group E
0|Group D

the unique Groups are

Group A
Group B
Group C
Group D
Group E

As you can see, some Groups could be in all blocks, but sometime one or more Groups appear only in some blocks.

For the Groups that don't appear in a specific block, I need to generate output that assigns them a zero value for that block, as
shown by the "0|Group X" lines in the desired output above (highlighted in red in the original post).

I am able to get the output in the format "number|Group X" with the script below, but I don't know how to
add the groups that don't appear in a specific block and assign them zero value.

awk '
    # Each <x "N">Name</x> line becomes one entry "N|Name" in A, indexed 1..c.
    # gensub() is a gawk extension.
    /<x "/ { A[++c] = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g") }

    # Print the entries in input order. The [i] subscript is required:
    # a bare array name is not a printable value.
    END { for (i = 1; i <= c; i++) print A[i] }
' groups

I really need it in awk because I need that array to include it in a main AWK code.

Many thanks for your help in advance.

ahamed101 · October 22, 2011, 11:43pm

Try this...

awk '
# Stores every <x "N">Name</x> line as "N|Name" in a[1..j], with a -1
# sentinel at each </group>. The END rule replays the entries and, at each
# sentinel, prints "0|Name" for every known name missing from that block.
# Requires gawk (gensub).
/<x /{
  A=gensub(/.+"([0-9]+)">(.+)<.*/, "\\1|\\2", "g")
  split(A, arr, "|");
  a[++j]=arr[1]"|"arr[2]; u[arr[2]]     # u: set of all group names seen in the file
}
/<\/g/{a[++j]=-1}                        # block terminator

END{
  for(i=1;i<=j;i++) {
    if(a[i] == -1) {
      # End of a block: emit zero rows for the names not seen in it.
      # for-in order is unspecified, so the zero rows come out in arbitrary order.
      for(k in u)
        if(!(k in t)) print "0|"k
      delete t; continue
    }
    print a[i]
    split(a[i],arr,"|"); t[arr[2]]       # t: names seen in the current block
  }
}' input_file

--ahamed

agama · October 22, 2011, 11:46pm

I'd do it this way:

awk  '
    # Prints "count|name" for every known group at the end of each <group>
    # block; groups absent from a block are reported with count 0.
    BEGIN {
        # Fixed list of group names; it defines both the set of names and
        # the output order within each block.
        soup = "Group A,Group B,Group C,Group D,Group E"
        nlist = split( soup, list, "," );
    }

    /^<x / {
        n = substr( $2, 2 ) + 0;   # $2 looks like "2">Group; skip the quote, +0 keeps the leading number
        gsub( "<[^>]*>", "" );  # assumes <x ...>stuff</x> is the ONLY tag on the line!
        group[$0] = n;
        next;
    }

    /^<\/group>/ {   # print out last collection including zeros
        for( i = 1; i <= nlist; i++ )
            printf( "%d|%s\n", group[list[i]], list[i] );   # an unset entry formats as 0 with %d
        delete group;   # clear for next go round
        next;
    }
' input-file

It makes some BIG assumptions; if your input file is more complex than you've indicated it might have issues. Specifically, if there is more than one 'tag' on the <x ... </x> line, it will break. Also, each is printed in the order that they are defined in the 'soup' and not in the order presented in the input file.

Ophiuchus · October 23, 2011, 2:10am

Hi ahamed and agama,

Thanks for your help, both work just great!.

But how can I include it in the first part of the main awk code I already have?

I would like to have the array with the output of your codes indexed numerically in ascending order from 1 to last element.

My code looks like this:

awk 'NR==FNR{
        # Outline of the main script: this first rule handles file1 only.
        if($0 ~ /XYZ/){Var1++} #Counting occurrences of "XYZ" and storing in Var1
        if($0 ~ /<x "/){           
            # NOTE(review): c is post-incremented here and again for D below, so on
            # each matching line A and D receive consecutive, different indexes.
            A[1 + c++]=gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")  # I would like your output in this array
            
            B[gensub(/pattern/,"how","g")] #Storing desired data in array B
            C[gensub(/pattern/,"how","g")] # #Storing desired data in array C
            D[1 + c++]=gensub(/pattern/, "\\2|\\4", "g") # #Storing desired data in array D
        }
next}
{
        some other code
} 
END{
    for ( i=1;i<=N;i++ ) { #Loop for printing values of 4 arrays after some manipulations
        some code to manipulate 4 arrays created when NR=FNR
    }
}' file1 file2

As you can see in the code, the data stored in array A (in red) is given by the "gensub()" function and the array is indexed
from 1 to last element.

Then, I would like to have inside the array A the ouptut of your codes insted of output of gensub(). Is possible only to generate
data of A in that way to use it later in my code?

Something like:

awk 'NR==FNR{
        if($0 ~ /XYZ/){Var1++}
        if($0 ~ /<x "/){
            A[1 + c++]="new output" # (new output = output generated by your codes)
            
            B[gensub(/pattern/,"how","g")] ...
            C[gensub(/pattern/,"how","g")] ...
            D[1 + c++]=gensub(/pattern/, "\\2|\\4", "g") ..
        }
.
.
.

Thanks for help so far.

agama · October 23, 2011, 7:01pm

You could try something like this:

awk  '
    # Builds A (and the parallel array D) with one entry per known group for
    # every <group> block of file1; groups absent from a block get the value 0.
    BEGIN {
        # Fixed list of group names; it also fixes the order of the entries.
        soup = "Group A,Group B,Group C,Group D,Group E"
        nlist = split( soup, list, "," );
    }

    NR != FNR {
        # some other processing for file2
        next;
    }

    # ----------- blocks for processing file 1 ------------------------
    /XYZ/ { Var1++; }   # count lines with XYZ

    /^<x / {
        str = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")
        split( str, a, "|" );      # a[1] = number, a[2] = group name
        agroup[a[2]] = a[1];

        # your original code
        B[gensub(/pattern/,"how","g")]      #Storing desired data in array B
        C[gensub(/pattern/,"how","g")]      # #Storing desired data in array C

        # small change to match D with A
        dgroup[a[2]] = gensub(/pattern/, "\\2|\\4", "g") # #Storing desired data in array D

        next;
    }

    /^<\/group>/ {
        for( i = 1; i <= nlist; i++ )       # end of group, it is now safe to fill in D and A
        {
            A[++aidx] = sprintf( "%d|%s", agroup[list[i]], list[i] );   # an unset entry formats as 0
            D[aidx] = dgroup[list[i]];                                   # same index as A
        }
        delete agroup;      # clear the per-block data for the next block
        delete dgroup;
        next;
    }

    END {
        for( i = 1; i <= length( A ); i++ )         # my testing to ensure they align
            printf( "(%s) (%s)\n", A[i], D[i] );
    }
' file1 file2

Notice the change to process FNR != NR so that later rules for the first file can be separate blocks.

There was one bug in the sample you posted. You incremented c twice in the same block of code. The result would have been that the array A would have had values stored with odd indexes, and D would have values stored starting at 2 with even indexes. The code above doesn't have this issue, and ensures that the values in array D match the values in array A -- they aren't in the order seen in the input, but the order that matches the list in the BEGIN block.

Hope this gets you closer.

Ophiuchus · October 23, 2011, 7:19pm

Hi agama,

Thanks for your reply and help. Yes, I saw now the bug about double incrementing c. Thanks :D.

Regarding your code I've been trying to adapt it to my main awk code, but the main problem I have is that the "soup"
array is pre-defined at the beginning and since the values of Group A, Group B,.. Group E, etc are taken from the same
file groups, the array"soup" should be generated first.

I've been trying a modified version of your code, shown below, removing the "BEGIN{}" statement and
defining the array list[] in the line highlighted in red:

*(in blue what I added or modified)

awk  '
    # Attempt to derive the group list from the input itself instead of a
    # predefined list.
    # NOTE(review): A is both the asorti() destination and the result array.
    # Every asorti() call replaces the contents of A, so results stored by the
    # </group> rule are overwritten, and the name list is only complete once
    # the whole file has been read.
    /^<x / {
        list[gensub(/(.+">)(.+)(<\/.+$)/,"\\2","g")];asorti(list,A) # Generating list[] array that will contain unique group strings
         n=gensub(/.+ "|">.+/,"","g") # extracting only values (numbers)
        group[gensub(/.+">|<\/.+/,"","g")]=n;
        next;
    }
    /^<\/group>/ {   # print out last collection including zeros
        for( i = 1; i <= length(list); i++ )
            A[++w]=sprintf("%d|%s", group[A[i]], A[i] );
        delete group;   # clear for next go round
        next;
    } END{for (i=1;i<=length(A);i++) print A[i]}' groups

I think I don't get correct output because the code needs a predefined "soup" array:(.

Many thanks for help so far.

agama · October 23, 2011, 7:59pm

Having a predefined list is key to knowing what is missing, and yes, that is why you're getting odd output.

Using a technique similar to ahamed101's suggestion might help. Since you're not processing the contents of A and D until the end, you could save everything using doubly indexed agroup and dgroup arrays while building your list. The problem with this, and why I didn't suggest it, is that if there is a group X that is missing from all blocks of the input file, it will not be accounted for in the output.

I'll think about it some more.

---------- Post updated at 19:59 ---------- Previous update was at 19:35 ----------

Ok, this collects the various group names as it reads through the file and builds the A and D arrays at the end. If a group name is missing completely it will not be accounted for:

awk  '
    # Collects the group names while reading file1 and builds A and D in the
    # END rule, so no predefined name list is needed. A name that never occurs
    # in file1 cannot be reported.
    NR != FNR {
        # some other processing for file2
        next;
    }

    # ----------- blocks for processing file 1 ------------------------
    /^<x / {
        str = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")
        split( str, a, "|" );            # a[1] = number, a[2] = group name

        if( !seen[a[2]]++ )              # new group name, add it to the list
            list[++nlist] = a[2];

        # group counts the closed <group> blocks; +0 makes the first key 0
        # instead of the empty string.
        agroup[group+0,a[2]] = a[1];   # changed to track across whole file

        # your original code
        B[gensub(/pattern/,"how","g")]      #Storing desired data in array B
        C[gensub(/pattern/,"how","g")]      # #Storing desired data in array C

        # small change to match D with A
        dgroup[group+0,a[2]] = gensub(/pattern/, "\\2|\\4", "g") # changed to track across whole file

        next;
    }

    /^<\/group>/ {
        group++;
        next;
    }

    END {
        asort( list );                          # sort the names; also fixes the output order
        for( g = 0; g < group; g++ )            # build A and D with groups seen
        {
            for( i = 1; i <= nlist; i++ )
            {
                A[++aidx] = sprintf( "%d|%s", agroup[g,list[i]], list[i] );   # an unset entry formats as 0
                D[aidx] = dgroup[g,list[i]];
            }
        }

        # whatever end processing on A and D can be done here
        for( i = 1; i <= length( A ); i++ )         # my testing to ensure they align
            printf( "(%s) (%s)\n", A[i], D[i] );
    }
' file1 file2

Ophiuchus · October 24, 2011, 5:21am

agama,

Many thanks, I'll try your new code right away and let you know asap.

I think if a group is missing for all groups blocks won't be a problem because it'll mean that only exist those groups that appear as unique Groups in the
file.

Best regards

---------- Post updated at 09:20 PM ---------- Previous update was at 08:21 PM ----------

Tested and works independently. Now I'll test it including it in my main awk script and following the structure of code as you suggested me. I'll let you know

Many thanks.

---------- Post updated at 11:00 PM ---------- Previous update was at 09:20 PM ----------

Hi again agama,

Is possible to process file1 first? how would be the structure?

I ask this because when awk code is processing file2 generates another array, but doing comparison with one of the arrays created when the code reads file1.

I've tried change the order as follow, but doesn't work (changes in red):

awk  '
    NR == FNR { 
    # ----------- blocks for processing file 1 ------------------------
    /^<x / {
        str = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")
        split( str, a, "|" );

        if( !seen[a[2]]++ )              # new group name, add it to the list
            list[++nlist] = a[2];

        agroup[group+0,a[2]] = a[1];   # changed to track across whole file

        # your original code
        B[gensub(/pattern/,"how","g")]      #Storing desired data in array B
        C[gensub(/pattern/,"how","g")]      # #Storing desired data in array C

        # small change to match D with A
        dgroup[group+0,a[2]] = gensub(/pattern/, "\\2|\\4", "g") # changed to track across whole file

        next;
    }

    /^<\/group>/ {
        group++;
        next;
    }

{
        # some other processing for file2
        if($0 == Arr1[d+1]) {Ln[d+1]=FNR;if(d<length(Arr1)-1){d++}} # Arr1 is created when processing file1
        next;
    }

    END {
        asort( list );                          # sort the collected group names
        for( g = 0; g < group; g++ )            # build A and D with groups seen
        {
            for( i = 1; i <= nlist; i++ )
            {
                A[++aidx] = sprintf( "%d|%s", agroup[g,list[i]], list[i] );   # an unset entry formats as 0
                D[aidx] = dgroup[g,list[i]];
            }
        }

        # whatever end processing on A and D can be done here
        for( i = 1; i <= length( A ); i++ )         # my testing to ensure they align
            printf( "(%s) (%s)\n", A[i], D[i] );
    }
' file1 file2

Thanks again for your help

---------- Post updated 10-24-11 at 05:21 AM ---------- Previous update was 10-23-11 at 11:00 PM ----------

Hi agama again,

I've been able to adapt your code and suggestions into my main code. I saw that was much more complicated to
generate the array in the same awk code, then I generated an array and stored data in bash array. This bash array
is the input to main awk code.

At the beginning I had some issues, but I was able to set the correct format of the array expected by the split() function.

The final code is as below:

oldIFS=$IFS # Save the current IFS (bash default: space, tab, newline)
IFS=$'\n' # Temporarily split on newlines only, so the single output line below stays one array element
UnqGroups=( $( awk '/^<x /{print gensub(/(.+">)(.+)(<\/.+$)/,"\\2","g")}' file1 | sort -u | tr '\n' '|') )  #Unique groups, joined as "name1|name2|...|"
IFS=$oldIFS # Restore the saved IFS

# z carries the "|"-terminated list of unique group names into awk.
awk -v z="${UnqGroups[*]}" 'BEGIN {nlist=split(z,list,"|")-1}   # -1: the trailing "|" yields one empty last field
    NR==FNR{    
        if($0 ~ /j v="/){
            B[gensub(/pattern/,"\\2","g")] 
            x[gensub(/pattern/,"","g")];asorti(x,C) 
            
            # To generate Array A
            n=gensub(/.+="|".+$/,"","g")
            group[gensub(/pattern/,"\\2","g")]=n;
            next;
            
        }
        if($0 ~ /<\/group>/) {
        for( i = 1; i <= nlist; i++ )
            A[++w]=sprintf("%d|%s", group[list[i]], list[i] );   # an unset entry formats as 0
        delete group;   # clear for next go round
        next;} 
next}
{ 
    # code to work with file2
} 
END{ Print arrays info    }' file1 file2

Many thanks again both for all help and time.

Best regards

agama · October 24, 2011, 10:01pm

Glad you were able to work it out!

Yes, it would have been possible to generate the arrays before the processing of the second file. You would just have needed to change END { to a function (something like function gen_array() { ) and then invoke it once at the start of the second file:

# Rule for records that do not come from the first input file:
# FNR restarts at 1 for each file while NR keeps counting.
FNR != NR {
    if( FNR == 1 )
          gen_array();   # only invoked on 1st record of each new file

    # rest of your code
}

Doubt you'll go back and make the changes (I wouldn't once I had something working), but figure I'd post it for some closure here.

Ophiuchus · October 25, 2011, 3:37am

Hi again agama,

Even when I could adapt your code to the main awk code with success using a bash array, It would be great to know how would be the way you say using the function.

How would be the function, so I can test?

Regards.

agama · October 25, 2011, 6:26pm

This is the way I'd do it:

awk  '
    # build(): fills A and D with one entry per (block, group name) pair,
    # using the data collected from file1. g and i are local variables.
    function build(     g, i )
     {
        asort( list );                          # sort the names; also fixes the output order
        for( g = 0; g < group; g++ )            # build A and D with groups seen
        {
            for( i = 1; i <= nlist; i++ )       # end of group, it is now safe to fill in D and A
            {
                A[++aidx] = sprintf( "%d|%s", agroup[g,list[i]], list[i] );   # an unset entry formats as 0
                D[aidx] = dgroup[g,list[i]];
            }
        }
    }

    NR != FNR {
        if( FNR == 1 )          # build the arrays when 1st record of second file is read
            build();

        # some other processing for file2
        next;
    }

    # ----------- blocks for processing file 1 ------------------------
    /^<x / {
        str = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")
        split( str, a, "|" );            # a[1] = number, a[2] = group name
        agroup[group+0,a[2]] = a[1];     # group counts closed blocks; +0 makes the first key 0
        if( !seen[a[2]]++ )              # first time this name is seen, add it to the list
            list[++nlist] = a[2];

        # your original code
        B[gensub(/pattern/,"how","g")]      #Storing desired data in array B
        C[gensub(/pattern/,"how","g")]      # #Storing desired data in array C

        # small change to match D with A
        dgroup[group+0,a[2]] = gensub(/pattern/, "\\2|\\4", "g") # #Storing desired data in array D

        next;
    }

    /^<\/group>/ {
        group++;
        next;
    }
' file1 file2

Ophiuchus · October 25, 2011, 11:39pm

agama, hello,

I must say, many thanks!!! it works just perfect:D:b:.

It works faster than the other way because all is in awk and I dont need anymore at least 2 arrays that I was generating before.

I've learned many things from this thread. I'm fairly new to awk, and completely new to awk user-defined functions; I now have a better idea of how to use them.

In the end, I'm not sure why the "NR != FNR" structure is not working for me, but it worked great this way:

awk  '
    # build(): fills A and D with one entry per (block, group name) pair,
    # using the data collected from file1. g and i are local variables.
    function build(     g, i )
     {
        asort( list );                          # sort the names; also fixes the output order
        for( g = 0; g < group; g++ )            # build A and D with groups seen
        {
            for( i = 1; i <= nlist; i++ )       # end of group, it is now safe to fill in D and A
            {
                A[++aidx] = sprintf( "%d|%s", agroup[g,list[i]], list[i] );   # an unset entry formats as 0
                D[aidx] = dgroup[g,list[i]];
            }
        }
    }

    # ----------- blocks for processing file 1 ------------------------
    # The opening brace must be on the same line as the pattern. A pattern
    # alone on a line is a complete rule whose default action prints the
    # record, and a brace block on the next line then runs for every record
    # of both files.
    NR == FNR {
        if( $0 ~ /^<x / ) {
            str = gensub(/(.+")([0-9]+)(">)(.+)(<\/.+)/, "\\2|\\4", "g")
            split( str, a, "|" );            # a[1] = number, a[2] = group name
            agroup[group+0,a[2]] = a[1];     # group counts closed blocks; +0 makes the first key 0
            if( !seen[a[2]]++ )              # first time this name is seen, add it to the list
                list[++nlist] = a[2];

            # your original code
            B[gensub(/pattern/,"how","g")]      #Storing desired data in array B

            next;
        }

        if( $0 ~ /^<\/group>/ ) {
            group++;
            next;
        }

        next;       # no other file1 record may reach the file2 rule below
    }

    # ----------- processing of file2 ------------------------
    {
        if( FNR == 1 )          # build the arrays when 1st record of second file is read
            build();

        # some other processing for file2
        next;
    }
' file1 file2

Many thanks again for your great help, support and time.

Much appreciated.

Best regards