In the awk , thanks you @RavinderSingh13, for the help in below, hopefully it is close as I am trying to update the value in $12 of the tab-delimetedfile2 with the matching value in $1 of the space delimetedfile1 . I have added comments for each line as well. Thank you :).
awk
awk '$12 == /NM_/{ # search $12 for pattern NM_
match($12,/p..*/); # using match the regex will match in $12 from the p. to the end
VAL=substr($12,RSTART+1,RLENGTH-2); # Putting the values of subtring whose value starts from RSTART+1 to RSTART-2
for(i=1;i<=num;i++){ # Starting a loop which will start from value 1 of variable i to till value of variable num
awk 'FNR==NR {a[$1]=$2; next} a[$i]{VAL=a[$i]} # Store $2 value from file1 in array a and update array i matching VAL
} # close block
next # process next line
}1' OFS="\t" file1 FS'\t' file2 # define file1 as space delimited and file2 as tab-delimited with the output also being tab-delimited
file1
C Cys
D Asp
V Val
W Trp
Y Tyr
file2
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 948846 948846 - A upstream ISG15 dist=1 . . . rs3841266
2 chr1 948870 948870 C G UTR5 ISG15 NM_005101:c.-84C>G . . . rs4615788
3 chr1 948921 948921 T C UTR5 ISG15 NM_005101:c.-33T>C . . . rs15842
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.D79D rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V rs8997
6 chr1 1269554 1269554 T C exonic TAS1R3 . . nonsynonymous SNV TAS1R3:NM_152228:exon6:c.2269T>C:p.C757R rs307377
desired output the 3 lines in bold are updated because the one-letter code before and after the digit matched $1 in file1 , so it is updated to the three-letter code in $2 .
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 948846 948846 - A upstream ISG15 dist=1 . . . rs3841266
2 chr1 948870 948870 C G UTR5 ISG15 NM_005101:c.-84C>G . . . rs4615788
3 chr1 948921 948921 T C UTR5 ISG15 NM_005101:c.-33T>C . . . rs15842
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.Val98Val rs8997
6 chr1 1269554 1269554 T C exonic TAS1R3 . . nonsynonymous SNV TAS1R3:NM_152228:exon6:c.2269T>C:p.Cys757Arg rs307377
awk '
# BEGIN runs before any of the input files is opened
BEGIN { FS=OFS="\t" }
# The input files are processed one by one and the following code runs for each line
# FNR is equal to NR when processing file1
# a[ ] is indexed by the one letter code, its value is the three letter code
FNR==NR { a[$1]=$2; next }
# The next goes to the next input cycle
# The following code runs for file2 (and further files)
$12 ~ /:NM_/{ # search $12 for pattern :NM_
match($12,/p..*/) # using match the regex will match in $12 from the p. to the
VAL=substr($12,RSTART+1,RLENGTH-2) # Put the values of substring whose value starts from RSTART+1 to RSTART-2
# more to come
}
{ print }
' file1 file2
Both awk command below execute but I do not get the intended result:
I added some more comments to both as well. Thank you :).
awk 1
awk '
# BEGIN runs before any of the input files is opened
BEGIN { FS=OFS="\t" }
# The input files are processed one by one and the following code runs for each line
# FNR is equal to NR when processing file1
# a[ ] is indexed by the one letter code, its value is the three letter code
FNR==NR { a[$1]=$2; next }
# The next goes to the next input cycle
# The following code runs for file2 (and further files)
$12 ~ /:NM_/{ # search $12 for pattern :NM_
match($12,/p..*/) # using match the regex will match in $12 from the p. to the
VAL=substr($12,RSTART+1,RLENGTH-2) # Put the values of substring whose value starts from RSTART+1 to RSTART-2
# update one letter code to three letter by storing the value of $12 in array then updating if it matches file1
{ if(a[$12]){$12=a[$12] };
}
print}' file1 file2 > out
out
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.D79D rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V rs8997
awk 2
# store value of $2 in file1 in array A and update each sub-patter of p. matching $12 in file2
awk 'FNR==NR {A[$1]=$2; next} $12 in A {sub ($12, $12 "p." A[$12]) }1' OFS="\t" file1 FS='\t' file2 > out
out
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 948846 948846 - A upstream ISG15 dist=1 . . . rs3841266
2 chr1 948870 948870 C G UTR5 ISG15 NM_005101:c.-84C>G . . . rs4615788
3 chr1 948921 948921 T C UTR5 ISG15 NM_005101:c.-33T>C . . . rs15842
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.D79D rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V rs8997
You need to cycle through each character, as you maybe intended in post#1; you need substr() to cut out a single character.
awk '
BEGIN { OFS="\t" }
# The input files are processed one by one and the following code runs for each line
# FNR is equal to NR when processing file1
# a[ ] is indexed by the one letter code, its value is the three letter code
FNR==NR { a[$1]=$2; next }
# The next goes to the next input cycle
# The following code runs for file2 (and further files)
($12 ~ /:NM_/ && match($12,/p..*/)) { # search for :NM_ and p..*
# Get the substring after p.
VAL=substr($12,RSTART+2)
# Get its length
lenVAL=length(VAL)
ostring=""
# Cycle through each character, append to ostring, if in a[ ] replace by its value
for (i=1; i<=lenVAL; i++) {
c=substr(VAL,i,1)
ostring=(ostring ((c in a) ? a[c] : c))
}
# copy ostring back to $12 (unconditionally), retaining the part up to p.
$12=(substr($12,1,RSTART+1) ostring)
}
# always print
{ print }
' file1 FS="\t" file2
I have reverted to define FS after reading file1, because file1 might not be TAB-separated.
---------- Post updated 09-12-17 at 07:45 AM ---------- Previous update was 09-11-17 at 05:19 PM ----------
A line with multiple NM_ values in $12 seperated by a ; seems to chage a matching c. as well. I have tried adding in ; as a FS , but that splits eaach into multiple tabs in the output. I also added tried a break after the ostring=(ostring ((c in a) ? a[c] : c)) , thinking that would process each, then break, and loop to the next... but that only processed one and then stopped. Maybe I added it in the wrong place or is there a better way? Thank you
line
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.D79D rs8997
current output
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.Val98Val;ISG15:NM_005101:exon2:c.237Cys>T:p.Asp79Asp rs8997
Im not sure I follow completly, but it is close... line 3 is the multiple NM_ , but only the second p . looks to be updated. Can you add comments if possible? Thank you :).
1 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp rs61766284
2 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.Val98Val rs8997
3 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp rs8997
In your post#5 you haven't mentioned that we need to change p.D79D after : , so I had written my logic to change D79D not before DOT, could you please post exact output you need so that I could try to help you.
I apologize and below is the complete file2 is below, it is tab-delimeted and $12 may have multiple (up to 30) NM_ in it. In this file2 line 6 has multiple NM_ in $12 seperated by a ; . All the matching p. and both p. are updated if it matches file1 . So using line 6 as an example the V in p.V98V and D in p.D79D .
The last thing I am going to do, though I have not thought that far ahead yet :)... is since the V in p.V98V and D in p.D79D are the same, that is the letter before and after the digit are the same, print p.= , but only if the letters before and after the digit are the same. So in this case line 6 would look like
6 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.=;ISG15:NM_005101:exon2:c.237C>T:p.=
this is the ultimate goal but the last step and maybe it is easier to do at a later time. Thanks :).
file2
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 948846 948846 - A upstream ISG15 dist=1 . . . rs3841266
2 chr1 948870 948870 C G UTR5 ISG15 NM_005101:c.-84C>G . . . rs4615788
3 chr1 948921 948921 T C UTR5 ISG15 NM_005101:c.-33T>C . . . rs15842
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.D79D rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V rs8997
6 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.D79D rs8997
desired output
R_Index Chr Start End Ref Alt Func.refGene Gene.refGene GeneDetail.refGene Inheritence ExonicFunc.refGene AAChange.refGene avsnp147
1 chr1 948846 948846 - A upstream ISG15 dist=1 . . . rs3841266
2 chr1 948870 948870 C G UTR5 ISG15 NM_005101:c.-84C>G . . . rs4615788
3 chr1 948921 948921 T C UTR5 ISG15 NM_005101:c.-33T>C . . . rs15842
4 chr1 949597 949597 C T exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp rs61766284
5 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.Val98Val rs8997
6 chr1 949654 949654 A G exonic ISG15 . . synonymous SNV ISG15:NM_005101:exon2:c.294A>G:p.Val98Val;ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp rs8997
Have you made any progress?
Here is my attempt.
I have added another loop over the ";"-separated parts in $12.
In each part it still cycles through all characters after "p.".
awk '
BEGIN { OFS="\t" }
# The input files are processed one by one and the following code runs for each line
# FNR is equal to NR when processing file1
# a[ ] is indexed by the one letter code, its value is the three letter code
FNR==NR { a[$1]=$2; next }
# The next goes to the next input cycle
# The following code runs for file2 (and further files)
$12 ~ /:NM_/ {
ostring=""
# split $12 by ";" and cycle through them
nNM=split($12,NM,";")
for (n=1; n<=nNM; n++) {
if (n>1) ostring=(ostring ";") # append ";"
if (match(NM[n],/p[.].*/)) {
# copy up to "p."
ostring=(ostring substr(NM[n],1,RSTART+1))
# Get the substring after "p."
VAL=substr(NM[n],RSTART+2)
# Get its length
lenVAL=length(VAL)
# Cycle through each character, append to ostring, if in a[ ] replace by its value
for (i=1; i<=lenVAL; i++) {
c=substr(VAL,i,1)
ostring=(ostring ((c in a) ? a[c] : c))
}
} else {
# append the unchanged string
ostring=(ostring NM[n])
}
}
# copy ostring back to $12 (unconditionally)
$12=ostring
}
# always print
{ print }
' file1 FS="\t" file2
I was having no luck and now I see why (or think I do anyway).... I was spiting but not cycling though and appending the new value. Thank you very much :).