In the awk
below I am trying to match the value in $4
of file1
with the split value from $4
in file2
. I store the value of $4 in file1
in A
and the split value (using the _ for the split) in array. I then strore the value in $2
as min
, the value in $3
as max
, and the value in $1
as chr
.
If A
is equal to array
, then i use the values stored in min
, max
, and chr
to check if there is overlap or not between the $2
, $3
, and $1
values in file2
. If there is then overlap
is printing but if there is not missing
is printed. I am trying to ensure that the lines match and that the coordinates are in covered from file1
to file2
. My actual data is several thousands of lines all in the below format and a match should result for each line in file2
. I commented the awk
as well and hope it helps as I am getting multiple syntax errors and maybe there is a better way, but I wanted to try and see. Thank you :).
file1 tab-delimeted
chr19 42373737 42373856 RPS19
chr6 32790021 32790140 TAP2
file2 tab-delimeted
chr19 42364844 42364915 RPS19_cds_1_0_chr19_42364845_f 0 +
chr19 42365180 42365281 RPS19_cds_2_0_chr19_42365181_f 0 +
chr19 42373100 42373284 RPS19_cds_3_0_chr19_42373101_f 0 +
chr19 42373768 42373823 RPS19_cds_4_0_chr19_42373769_f 0 +
chr19 42375418 42375445 RPS19_cds_5_0_chr19_42375419_f 0 +
chr6 32790065 32790095 TAP2_cds_0_0_chr6_32790066_r 0 -
chr6 32797176 32797313 TAP2_cds_1_0_chr6_32797177_r 0 -
chr6 32797706 32797866 TAP2_cds_2_0_chr6_32797707_r 0 -
chr6 32798043 32798217 TAP2_cds_3_0_chr6_32798044_r 0 -
chr6 32798394 32798583 TAP2_cds_4_0_chr6_32798395_r 0 -
chr6 32800109 32800238 TAP2_cds_5_0_chr6_32800110_r 0 -
chr6 32800403 32800601 TAP2_cds_6_0_chr6_32800404_r 0 -
chr6 32802930 32803136 TAP2_cds_7_0_chr6_32802931_r 0 -
chr6 32803419 32803550 TAP2_cds_8_0_chr6_32803420_r 0 -
chr6 32805313 32805428 TAP2_cds_9_0_chr6_32805314_r 0 -
chr6 32805517 32806010 TAP2_cds_10_0_chr6_32805518_r 0 -
desired output tab-delimeted
chr19 42364844 42364915 RPS19_cds_1_0_chr19_42364845_f 0 + missing
chr19 42365180 42365281 RPS19_cds_2_0_chr19_42365181_f 0 + missing
chr19 42373100 42373284 RPS19_cds_3_0_chr19_42373101_f 0 + missing
chr19 42373768 42373823 RPS19_cds_4_0_chr19_42373769_f 0 + overlap
chr19 42375418 42375445 RPS19_cds_5_0_chr19_42375419_f 0 + missing
chr6 32790065 32790095 TAP2_cds_0_0_chr6_32790066_r 0 - overlap
chr6 32797176 32797313 TAP2_cds_1_0_chr6_32797177_r 0 - missing
chr6 32797706 32797866 TAP2_cds_2_0_chr6_32797707_r 0 - missing
chr6 32798043 32798217 TAP2_cds_3_0_chr6_32798044_r 0 - missing
chr6 32798394 32798583 TAP2_cds_4_0_chr6_32798395_r 0 - missing
chr6 32800109 32800238 TAP2_cds_5_0_chr6_32800110_r 0 - missing
chr6 32800403 32800601 TAP2_cds_6_0_chr6_32800404_r 0 - missing
chr6 32802930 32803136 TAP2_cds_7_0_chr6_32802931_r 0 - missing
chr6 32803419 32803550 TAP2_cds_8_0_chr6_32803420_r 0 - missing
chr6 32805313 32805428 TAP2_cds_9_0_chr6_32805314_r 0 - missing
chr6 32805517 32806010 TAP2_cds_10_0_chr6_32805518_r 0 - missing
awk
awk '
BEGIN { FS=OFS="\t" } # define FS and OFS as tab
NR==FNR{ # process same line in file1 and file2
{
A[$1]=$4;next} # store $4 value from file1 into A
{min[NR]=$2; max[NR]=$3; chr[NR]=$1; next} # store $1,$2,$3 values into seperate arrays
{
split($4,array,"_") # split $4 in file2 by the _
}
{
for (id in min)
if([A] ~ array) && (($1==chr[NR])&&(min[id] <= $2 && $3 < max[id])) { # match $4 in A with array split and check for overlap using min max and chr from file1
$7 = print "overlap" else "missing"; # print value in $7 of file 2
}' file1 file2