Hi everyone,
I have a somewhat complicated task to accomplish with AWK. Here it is:
I have a data file in xml format which looks like this
<data>
a1 a2 a3 a4 a5
b1 b2 b3 b4 b5
c1 c2 c3 c4 c5
d1 d2 d3 d4 d5
e1 e2 e3 e4 e5
</data>
Let's say each data block contains 5 rows and 5 columns. What I need to do is this:
I have a condition, and I need to find the row that satisfies this condition. Then
I need to add an extra field to each row, whose value will be calculated using
the columns of the row that satisfies the condition as well as the columns of
the other rows. As an example, let's say row "c" satisfies my condition; then I add
an extra field to the data, which will look like this:
<data>
a1 a2 a3 a4 a5 a6
b1 b2 b3 b4 b5 b6
c1 c2 c3 c4 c5 c6
d1 d2 d3 d4 d5 d6
e1 e2 e3 e4 e5 e6
</data>
where the last fields are calculated as follows:
a6 = c2*a2 + c3*a3 + c4*a4 + c5*a5
b6 = c2*b2 + c3*b3 + c4*b4 + c5*b5
c6 = c2*c2 + c3*c3 + c4*c4 + c5*c5
d6 = c2*d2 + c3*d3 + c4*d4 + c5*d5
e6 = c2*e2 + c3*e3 + c4*e4 + c5*e5
The algebra in the real calculation may not necessarily be as simple as this.
Thanks for any help.
Please post a sample of the input and the desired output.
Mention the condition to be tested for.
Show how the real data is, how the "data" blocks are separated from one another.
Also, mention the "real" calculations.
Do not oversimplify. This often leads to frequent changes in requirements with subsequent confusion.
Here are two data blocks from my real data file:
<data>
21 0.00 0.00 0.57 0.57
21 0.00 0.00 -0.19 0.19
6 -0.63 0.12 0.31 0.37
24 -0.44 0.15 0.25 0.30
-13 -0.23 0.37 0.13 0.14
</data>
<data>
1 0.00 0.00 0.10 0.10
-1 0.00 0.00 -0.66 0.66
6 -0.17 0.40 0.27 0.32
24 -0.48 -0.24 0.12 0.15
-13 0.17 -0.44 0.33 0.18
</data>
Take the row where $1==6 (row "c" in the notation below) and use its fields
in the following algebraic calculations:
a6 = c2*a2 - c3*a3 - c4*a4 - c5*a5
b6 = c2*b2 - c3*b3 - c4*b4 - c5*b5
c6 = c2*c2 - c3*c3 - c4*c4 - c5*c5
d6 = c2*d2 - c3*d3 - c4*d4 - c5*d5
e6 = c2*e2 - c3*e3 - c4*e4 - c5*e5
After this operation these two data blocks will look like
<data>
21 0.00 0.00 0.57 0.57 a6
21 0.00 0.00 -0.19 0.19 b6
6 -0.63 0.12 0.31 0.37 c6
24 -0.44 0.15 0.25 0.30 d6
-13 -0.23 0.37 0.13 0.14 e6
</data>
<data>
1 0.00 0.00 0.10 0.10 a6
-1 0.00 0.00 -0.66 0.66 b6
6 -0.17 0.40 0.27 0.32 c6
24 -0.48 -0.24 0.12 0.15 d6
-13 0.17 -0.44 0.33 0.18 e6
</data>
where the numerical values of the last column (for the 2nd data block, as an example) are:
a6 = (-0.17)*(0.00) -(0.40)*(0.00) -(0.27)*(0.10) -(0.32)*(0.10) = -0.0590
b6 = (-0.17)*(0.00) -(0.40)*(0.00) -(0.27)*(-0.66)-(0.32)*(0.66) = -0.0330
c6 = (-0.17)*(-0.17)-(0.40)*(0.40) -(0.27)*(0.27) -(0.32)*(0.32) = -0.3064
d6 = (-0.17)*(-0.48)-(0.40)*(-0.24)-(0.27)*(0.12) -(0.32)*(0.15) = 0.0972
e6 = (-0.17)*(0.17) -(0.40)*(-0.44)-(0.27)*(0.33) -(0.32)*(0.18) = 0.0004
Yoda
February 23, 2013, 11:06am
4
Try this code:
awk ' /<data>/ {
# First attempt: append to every row of a <data> block a value built from
# that row and the third row of the block. Known defects are marked below.
f = 1; # f = 1 while inside a <data> block
print $0
next;
} /<\/data>/ && s {
# Runs on </data> only when s is set, i.e. when a[1,11] == 6.
f = 0;
m = 0; # running element counter, advanced once per printed field
k = 12; # second index of the element a[2,12] (2nd field of the third row, assuming 5 fields per row)
while(m < j)
{
for(i=1;i<=nf;i++)
{
printf "%s\t", a[i,++m];
if(i >= 2)
{
# NOTE(review): every product is subtracted, including the i == 2 term,
# whereas the requested formula is c2*x2 - c3*x3 - c4*x4 - c5*x5.
a6 -= (a[i,k] * a[i,m]);
++k;
}
}
# NOTE(review): a6 is never reset here, so it keeps accumulating from row to row.
printf "%.4f", a6;
printf "\n"
if(k > nf) k=12; # rewind k to the start of the third row for the next output row
}
j = 0; # restart the element counter for the next block
print $0;
} f == 1 {
# Store each field as a[column, running element count].
for(i=1;i<=NF;i++)
{
a[i,++j] = $i;
}
# a[1,11] is the first field of the third row when every row has 5 fields;
# the row position is hard-coded.
if(a[1,11] == 6 )
s = 1;
if(a[1,11] != 6 )
s = 0;
nf = NF; # number of fields in the current line
}' file
Hi bipi,
Thanks for your efforts, but this code does not give the desired answers.
Could you please check it again?
Yoda
February 23, 2013, 11:39am
6
Oops I forgot to reinitialize a6
Modified code:
awk ' /<data>/ {
# Second attempt: same as the first, but a6 is reset after each row.
# One defect remains and is marked below.
f = 1; # f = 1 while inside a <data> block
print $0
next;
} /<\/data>/ && s {
# Runs on </data> only when s is set, i.e. when a[1,11] == 6.
f = 0;
m = 0; # running element counter, advanced once per printed field
k = 12; # second index of the element a[2,12] (2nd field of the third row, assuming 5 fields per row)
while(m < j)
{
for(i=1;i<=nf;i++)
{
printf "%s\t", a[i,++m];
if(i >= 2)
{
# NOTE(review): the i == 2 product is subtracted too, so this yields
# -c2*x2 - c3*x3 - c4*x4 - c5*x5 instead of c2*x2 - c3*x3 - c4*x4 - c5*x5.
# Rows whose 2nd field is 0 are unaffected; the other rows come out wrong.
a6 -= (a[i,k] * a[i,m]);
++k;
}
}
printf "%.4f", a6;
a6 = 0; # start the next row from zero
printf "\n"
if(k > nf) k=12; # rewind k to the start of the third row for the next output row
}
j = 0; # restart the element counter for the next block
print $0;
} f == 1 {
# Store each field as a[column, running element count].
for(i=1;i<=NF;i++)
{
a[i,++j] = $i;
}
# a[1,11] is the first field of the third row when every row has 5 fields;
# the row position is hard-coded.
if(a[1,11] == 6 )
s = 1;
if(a[1,11] != 6 )
s = 0;
nf = NF; # number of fields in the current line
}' file
Here is the O/P that I am getting:
$ ./hayreter
<data>
21 0.00 0.00 0.57 0.57 -0.3876
21 0.00 0.00 -0.19 0.19 -0.0114
6 -0.63 0.12 0.31 0.37 -0.6443
24 -0.44 0.15 0.25 0.30 -0.4837
-13 -0.23 0.37 0.13 0.14 -0.2814
</data>
<data>
1 0.00 0.00 0.10 0.10 -0.0590
-1 0.00 0.00 -0.66 0.66 -0.0330
6 -0.17 0.40 0.27 0.32 -0.3642
24 -0.48 -0.24 0.12 0.15 -0.0660
-13 0.17 -0.44 0.33 0.18 0.0582
This is getting better.
The first 2 rows are correct, but from the 3rd row on (the row that satisfies the condition)
the answers are wrong.
To check your results, look at the table that I posted yesterday above.
Yoda
February 23, 2013, 12:09pm
8
OK fixed it. But I don't know why it didn't work in the previous code!
awk ' /<data>/ {
    # Opening tag: start collecting rows and echo the tag.
    f = 1;
    print $0
    next;
} /<\/data>/ && s {
    # Closing tag of a block whose third row starts with 6:
    # print every stored row followed by the computed extra field.
    f = 0;
    m = 0;      # running element counter used to walk the stored fields
    k = 12;     # running index of the third row fields, starting at its 2nd field
    while(m < j)
    {
        for(i=1;i<=nf;i++)
        {
            printf "%s\t", a[i,++m];
            if(i >= 2)
            {
                # One product per column. b has to be subscripted by i:
                # it is read back as b[2]..b[5] below, and awk refuses to
                # use the same name as both a scalar and an array.
                b[i] = (a[i,k] * a[i,m]);
                ++k;
            }
        }
        # Requested formula: c2*x2 - c3*x3 - c4*x4 - c5*x5
        a6 = b[2] - b[3] - b[4] - b[5];
        printf "%.4f", a6;
        a6 = 0;
        printf "\n"
        if(k > nf) k=12;    # rewind k to the third row for the next output row
    }
    j = 0;      # restart the element counter for the next block
    print $0;
} f == 1 {
    # Store each field as a[column, running element count].
    for(i=1;i<=NF;i++)
    {
        a[i,++j] = $i;
    }
    # a[1,11] is the first field of the third row when rows have 5 fields;
    # the position of the matching row is hard-coded.
    if(a[1,11] == 6 )
        s = 1;
    if(a[1,11] != 6 )
        s = 0;
    nf = NF;    # number of fields in the current line
}' file
Current O/P:
$ ./hayreter
<data>
21 0.00 0.00 0.57 0.57 -0.3876
21 0.00 0.00 -0.19 0.19 -0.0114
6 -0.63 0.12 0.31 0.37 0.1495
24 -0.44 0.15 0.25 0.30 0.0707
-13 -0.23 0.37 0.13 0.14 0.0084
</data>
<data>
1 0.00 0.00 0.10 0.10 -0.0590
-1 0.00 0.00 -0.66 0.66 -0.0330
6 -0.17 0.40 0.27 0.32 -0.3064
24 -0.48 -0.24 0.12 0.15 0.0972
-13 0.17 -0.44 0.33 0.18 0.0004
</data>
1 Like
Try this (with assumptions about your data):
awk '/^[ \t]*<data>/{data=1;countrow=0;rec[countrow++]=$0;next}
# Buffer each <data> block in rec[]; rec[0] holds the opening tag and
# rec[1..countrow-1] hold the data rows. On </data>, a block that contains
# a row with $1 == "6" is printed with one extra computed field per row.
/^[ \t]*<\/data>/{
  if(output) {
    print rec[0]
    # Fields of the first row whose $1 is "6" (the pattern row).
    n=split(rec[foundat],pattline)
    for(i=1;i<countrow;i++) {
      # Split the i-th buffered row; the subscript is required because
      # split needs a string, not the whole array.
      split(rec[i],otherline)
      c=0
      for(j=2;j<=n;j++) {
        # First term is added, every later term is subtracted.
        if(j==2) {c=pattline[j]*otherline[j];continue}
        c-=(pattline[j]*otherline[j])
      }
      print rec[i],c
    }
    print
  }
  else   # no matching row: discard the buffered block without printing it
    for(i in rec) delete rec[i]
  output=foundat=data=0;next
}
data{
  # Remember the position of the first row whose first field is "6".
  if(!output && $1=="6") { output=1; foundat=countrow }
  rec[countrow++]=$0
}' file
1 Like
Both scripts work quite well; thank you both so much.
Would you mind explaining your code line by line? I would appreciate it a lot.
thanks again.
Yoda
February 23, 2013, 11:31pm
11
Here is a brief explanation:
awk ' /<data>/ {                        # Search for pattern: <data>
    f = 1;                              # If found set f = 1
    print $0                            # Print <data> tag
    next;                               # Stop processing current record
} /<\/data>/ && s {                     # Search for pattern: </data> (only when s is set)
    f = 0;                              # If found set f = 0
    m = 0;                              # m = 0 (counter for fetching all array elements)
    k = 12;                             # k = 12 (index of 2nd element in row c)
    while(m < j)                        # While there are stored elements left
    {
        for(i=1;i<=nf;i++)              # For each field of the current row
        {
            printf "%s\t", a[i,++m];    # Print the field
            if(i >= 2)                  # If field index i >= 2 (starting from second element)
            {
                b[i] = (a[i,k] * a[i,m]);   # Multiply current field with the row c field; b is an array indexed by i
                ++k;
            }
        }
        a6 = b[2] - b[3] - b[4] - b[5]; # Subtract elements in array b and assign to a6
        printf "%.4f", a6;              # Print a6
        a6 = 0;                         # Reinitialize a6 = 0
        printf "\n"
        if(k > nf) k=12;                # If k went past the number of fields, reset to 12 (start of row c)
    }
    j = 0;                              # Reset element counter for the next block
    print $0;                           # Print </data> tag
} f == 1 {                              # If f == 1 (inside a <data> block)
    for(i=1;i<=NF;i++)                  # Creating a 2D array with elements in <data> tag.
    {
        a[i,++j] = $i;
    }
    if(a[1,11] == 6 )                   # If row c first element == 6
        s = 1;                          # Set s = 1
    if(a[1,11] != 6 )                   # If row c first element != 6
        s = 0;                          # Set s = 0
    nf = NF;                            # Set nf = number of fields in the line (NF)
}' file
Thanks again for this brief (but quite detailed) explanation.
I have just one question: how did you assign row c?
And if our condition were satisfied in row d instead of c,
would this script still work, or is it not that generic?
Yoda
February 23, 2013, 11:50pm
13
Here are the indices (i,j)
for each data elements when stored in array:
1 (1,1) 0.00 (2,2) 0.00 (3,3) 0.10 (4,4) 0.10 (5,5)
-1 (1,6) 0.00 (2,7) 0.00 (3,8) -0.66 (4,9) 0.66 (5,10)
6 (1,11) -0.17 (2,12) 0.40 (3,13) 0.27 (4,14) 0.32 (5,15)
24 (1,16) -0.48 (2,17) -0.24 (3,18) 0.12 (4,19) 0.15 (5,20)
-13 (1,21) 0.17 (2,22) -0.44 (3,23) 0.33 (4,24) 0.18 (5,25)
This is why in the code I am using j
index 11 - 15
to identify row c elements. Hence you have to change the j
index as per your requirement. I hope you understood.
don't worry, I perfectly understood what you mean
thanks a lot bipi.