The four lines in the tab-delimeted input
are a sample format from my actual data. The awk
is meant to go line by line and check if a pattern is satisfied and if it is follow a particular format (there are 3). All the lines in the file should follow one of the three formats below. I added comments to the awk
but can not get it to execute and there is probably a better way. Thank you :).
format1= only text (alpha characters) are stored in variable p --- so only NHLRC1 is stored in $p as the other parenthesis is a #
format2= parenthesis with a number in them are removed along with the parenthesis --- so in line 3 the (10866) is removed
format3= split $4 on the _ (underscore) and print the 3 field
input tab-delimited
6 18122723 18122843 469_380805_378884(NHLRC1)_1.1_1
6 31114121 31114241 344047_16724314_rs746647_1
6 31430946 31431066 344049_16724385_HCP5(10866)_1_1
6 32808479 32808599 445446_18754304_PSMB8-exon6_1
desired output tab-delimited
chr6 18122723 18122843 chr6:18122723-18122843 NHLRC1
chr6 31114121 31114241 chr6:31114121-31114241 rs746647
chr6 31430946 31431066 chr6:31430946-31431066 HCP5
chr6 32808479 32808599 chr6:32808479-32808599 PSMB8-exon6
awk
awk 'BEGIN{FS=OFS="\t"} # define fs and output
FNR==NR{ # process each field in each line of file
if(/([A-Z])/) { # pattern 1 for extracting only alpha in () not number
p=$(awk -F"[()]" '{print $2}') # extract string in variable p
print "chr"$1,$2,$3,"chr:"$2"-"$3,$p # print desired output
next
}
if(/([0-9])/) { # pattern remove # in ()
n=$(awk -F"[()]" '{print $2}') # extract number in ()in variable n
awk -v num=$n 'BEGIN {sub([0-9],"",num) && sub (),"",$4) ; print name} # substitute # with null value and print
next
}
if($4 ~ /_/) { # pattern 3 for _ spilt
awk '{split($0,a,"_"); print "chr"$1,$2,$3,"chr:"$2"-"$3,a[3]}' # if conditions 1 and 2 not meet then split on _ and print 3rd field along with desired fields
next
}
}' input