In the awk
below I am splitting $7
on the :
(colon) then -
(hyphen) as array a
. The word chr
is printed at the start of every $1
line.
Next, $4
is split on the >
(greater then) as array b
. I am not sure how to account for the two other possibilities in $4
so the correct output is printed. Every line in the input file with either be a del, an ins or have a >
in it. The split of $4
captures the last condition, but I am not sure about the first two. I added comments to each line in the awk
and hope it is close. Thank you :).
The ---
are not part of the file they are just there as examples.
1. if $4 has del then the using the del to the right (whatever letter(s), the format of that line is REF=C;OBS= --- line 3 --- OBS is a null string
2. if $4 has ins then the using del the to the right (whatever letter(s), the format of that line is REF=;OBS=ACA --- line 4 --- REF is a null string
file tab-delimited
Gene Accession_number COSMIC_id CDS_mut_syntax AA_mut_syntax Strand HG19_coordinates Amplicon_id Insert Target_url
SMARCB1 NM_003073.2 1002 c.118C>T p.R40* + 22:24133967-24133967 CHP2_SMARCB1_1 CCTCCGTATGTTCCGAGGTTCTCTGTACAAGAGATACCCCTCACTCTGGAGGCGACTAGCCACTGTGGAAGAGAGGAAGAAAATAGTTGCATCGTCACATGGTAAAAAAAC http://www.sanger.ac.uk/perl/genetics/CGP/cosmic?action=mut_summary&id=1002
RB1 NM_000321 1042 c.2107-2A>G p.? + 13:49037865-49037865 CHP2_RB1_9 TACATCAATTTATTTACTAGATTATGATGTGTTCCATGTATGGCATATGCAAAGTGAAGAATATAGACCTTAAATTCAAAATCATT http://www.sanger.ac.uk/perl/genetics/CGP/cosmic?action=mut_summary&id=1042
SMARCB1 NM_003073.2 1057 c.1148delC p.P383fs + 22:24176357-24176357 CHP2_SMARCB1_4 CCTTGGGAAGGGCAGCGCCCAGGCTGGGAGCTGGCCCCGACTCATTGCCCTCCCCACTCCTCTTCCAGGCGGATGAGGCGTCTTGCCAACACGGCCCCGGCCTGGTAACCAGCCCATCAGCACACGGCTCCC http://www.sanger.ac.uk/perl/genetics/CGP/cosmic?action=mut_summary&id=1057
BRAF NM_004333 144982 c.1797_1798insACA p.T599_V600insT - 7:140453137-140453138 CHP2_BRAF_2 TCAAACTGATGGGACCCACTCCATCGAGATTTCACTGTAGCTAGACCAAAATCACCTATTTTTACTGTGAGGTCTTCATGAAGAAATATATCTGAGGTGTAGTAAGTAAAGGAAAACAG http://www.sanger.ac.uk/perl/genetics/CGP/cosmic?action=mut_summary&id=144982
desired output tab-delimited
chr22 24133967 24133967 c.118C>T REF=C;OBS=T SMARCB1
chr13 49037865 49037865 c.2107-2A>G REF=A;OBS=G RB1
chr22 24176357 24176357 c.1148delC REF=C;OBS= SMARCB1
chr7 140453138 140453138 c.1797_1798insACA REF;OBS=TACTACG BRAF
awk
awk 'BEGIN{ FS=OFS="\t" }NR>1{ split($7,a,":|-"); k="chr"; # define FS and OFS as tab, skip header, split $7 on : then - reading values in array a, set k equal to chr
{ split($4,b,">"); # split $4 on > reading values into array b
printf(k a[1],a[2],a[3],$4,"REF="b[1],"";"OBS="b[2],$1) }' file # print desired output
Hello cmccabe,
Could you please try following and let me know if this helps you.
awk 'NR>1{
if($4 !~ /del/ && $4 !~ /ins/){
REF=substr($4,index($4,">")-1,1);
OBS=substr($4,index($4,">")+1,1)
}
else {
e=$4;
if($4 ~ /ins/){
sub(/.*ins/,"",e);
REF="";
OBS=e
};
if($4 ~ /del/){
sub(/.*del/,"",e);
REF=e;
OBS=""
}
};
CHR=substr($7,1,index($7,":")-1);
CHR_1=substr($7,index($7,":")+1,index($7,"-")-4);
CHR_2=substr($7,index($7,"-")+1);
if($4 !~ /ins/) {
print "chr"CHR,CHR_1 CHR_2$4,"REF="REF,"OBS="OBS,$1
}
else {
print "chr"CHR,CHR_1,CHR_2,$4,"REF="REF,"OBS="OBS,$1
}
}
' Input_file
Thanks,
R. Singh
1 Like
It works great.... thank you
I added comments to each line of the awk
, if you have time and are able to could you please look at them to see if I understand?
awk -F'\t' -v OFS="\t" 'NR>1{ # define FS and OFS as tab and skip header, and start block
if($4 !~ /del/ && $4 !~ /ins/){ # if $4 not equal to del or ins then (start processing)
REF=substr($4,index($4,">")-1,1); # extract the value to the left of the > as REF (why -1,1)?
OBS=substr($4,index($4,">")+1,1) # extract the value to the right of the > as OBS (why +1,1)?
} # end block
else { # start block
e=$4 # set $4 to e
if($4 ~ /ins/){ # if pattern ins, start block
sub(/.*ins/,"",e); # not quite sure, is this creting an pattern index for ins
REF=""; # where REF is null
OBS=e # and OBS is whatever the value is
}; # end block
if($4 ~ /del/){ # if pattern del and start block
sub(/.*del/,"",e); # not quite sure, is this creting an pattern index for del
REF=e; # where OBS is whatever the value is
OBS="" # and OBS is null
} # end block
}; # end block
CHR=substr($7,1,index($7,":")-1); # split $7 on : and store value to left as CHR
CHR_1=substr($7,index($7,":")+1,index($7,"-")-4); # split $7 on - and store value to left as CHR_1 (why are two index needed)?
CHR_2=substr($7,index($7,"-")+1); # split $7 on - and store value to right as CHR_2 (why are two index needed)?
if($4 !~ /ins/) { # print output for > ($4 not equal ins and start block
print "chr"CHR,CHR_1,CHR_2,$4,"REF="REF";""OBS="OBS,$1
} # close block
else { # start block
print "chr"CHR,CHR_1,CHR_2,$4,"REF="REF";""OBS="OBS,$1 # print output if $4 has del or ins in it (set by the pattern if statements)
} # close block
} # end processing
' input # define input
Thanks again :).
Hello cmccabe,
Nice explanation. I too added complete explanation here.
awk 'NR>1{ ###checking here if line number is greater than 1 here, if yes then perform following steps.
if($4 !~ /del/ && $4 !~ /ins/){ ###Checking here if 4th field is NOT having del and ins strings in them.
REF=substr($4,index($4,">")-1,1); ###creating variable REF which is a substring starting from index of char > -1 to 1 character.
OBS=substr($4,index($4,">")+1,1) ###creating variable OBS which is having substring of 4th field whose starting index is the index number of > +1 to till 1 character.
}
else { ###If a line does not have ins and del strings within them then perform following actions.
e=$4; ###creating a variable named e whose value is 4th field.
if($4 ~ /ins/){ ###checking if 4th field has ins string in it.
sub(/.*ins/,"",e); ###substituting all characters from starting to till ins string with NULL in e variable.
REF=""; ###Making variable REF to NULL.
OBS=e ###Making variable named OBS to new value of variable e now.
};
if($4 ~ /del/){ ###Checking here if any 4th field has string del in it.
sub(/.*del/,"",e); ###substituting all characters from starting to till string del with NULL in variable named e.
REF=e; ###creating variable REF to new value of variable e here.
OBS="" ###creating OBS variable with NULL value.
}
};
CHR=substr($7,1,index($7,":")-1); ###creating variable CHR with substring whose starting point is 1st position to till index of colons index -1.
CHR_1=substr($7,index($7,":")+1,index($7,"-")-4);### creating variable named CHR_1 with a substring whose starting point is index of colon +1 till index of dash -4 value in 7th field.
CHR_2=substr($7,index($7,"-")+1);### creating a variable named CHR_2 here whose value is substring whose starting point is index of dash value +1 till the end of 7th field value.
if($4 !~ /ins/) { ###Checking if 4th field does not have ins string in it, if yes then perform following.
print "chr"CHR,CHR_1 CHR_2$4,"REF="REF,"OBS="OBS,$1 ###printing values of variables CHR,CHR_1,CHR_2 etc.
}
else { ###Checking if 4th field does not have ins value in it, then perform following.
print "chr"CHR,CHR_1,CHR_2,$4,"REF="REF,"OBS="OBS,$1 ###print the value of variables CHR,CHR_1,CHR_2 etc.
}
}
' Input_file ###Mentioning the Input_file name here.
Thanks,
R. Singh