awk to format each line by pattern

The four lines in the tab-delimited input are a sample of my actual data. The awk is meant to go line by line, check which pattern the line satisfies, and apply the corresponding format (there are 3). Every line in the file should follow one of the three formats below. I added comments to the awk but cannot get it to execute, and there is probably a better way. Thank you :).

format1 = parentheses containing text (alpha characters): only that text is stored in variable p   --- so in line 1 only NHLRC1 is stored in p (the parentheses in line 3 hold a number, so format2 applies there)
format2 = parentheses containing a number: the number is removed along with the parentheses --- so in line 3 the (10866) is removed
format3 = no parentheses: split $4 on the _ (underscore) and print the 3rd field

input tab-delimited

6	18122723	18122843	469_380805_378884(NHLRC1)_1.1_1
6	31114121	31114241	344047_16724314_rs746647_1
6	31430946	31431066	344049_16724385_HCP5(10866)_1_1
6	32808479	32808599	445446_18754304_PSMB8-exon6_1

desired output tab-delimited

chr6	18122723	18122843	chr6:18122723-18122843	NHLRC1
chr6	31114121	31114241	chr6:31114121-31114241	rs746647
chr6	31430946	31431066	chr6:31430946-31431066	HCP5
chr6	32808479	32808599	chr6:32808479-32808599	PSMB8-exon6

awk

awk 'BEGIN { FS = OFS = "\t" }   # tab-delimited input and output
{
  # The name always lives in the 3rd "_"-separated piece of column 4.
  split($4, part, "_")
  name = part[3]

  # Does that piece carry a parenthesised token, e.g. 378884(NHLRC1) or HCP5(10866)?
  if (match(name, /\([^()]*\)/)) {
    inner = substr(name, RSTART + 1, RLENGTH - 2)   # text between the parentheses
    if (inner ~ /[A-Za-z]/)
      name = inner                                  # format 1: keep only the text inside ()
    else                                            # format 2: drop the (number), keep the rest
      name = substr(name, 1, RSTART - 1) substr(name, RSTART + RLENGTH)
  }
  # format 3 (no parentheses): the 3rd piece is already the name.

  # chrN, start, end, chrN:start-end, name
  print "chr" $1, $2, $3, "chr" $1 ":" $2 "-" $3, name
}' input

Try:

awk -F'\t' -v OFS='\t' '
  # For every record: take the 3rd "_"-separated piece of column 4 and
  # reduce it to a label, then print chrN, start, end, chrN:start-end, label.
  {
    split($4, piece, /_/)
    label = piece[3]

    # Cut the piece at any parenthesis.  tok[2] is whatever sat inside "(...)":
    # use it when it contains a letter, otherwise fall back to the text
    # in front of the parenthesis (the whole piece if there is none).
    if (split(label, tok, /[)(]/) > 0)
      label = (tok[2] ~ /[[:alpha:]]/) ? tok[2] : tok[1]

    print "chr" $1, $2, $3, "chr" $1 ":" $2 "-" $3, label
  }
' file
1 Like

The awk works great... thank you. I found two additional format types and commented your code to try to capture these two additional formats. However, I don't think I am understanding it correctly. Would you be able to comment it so I can try to make the changes? I added the bold portion to capture the pattern in line 5 (split $4 on the _ and capture the 2nd value if alpha). Also, I can't figure out how a numeric value inside a () does not get printed. Thank you very much :).

awk '
  {
    # Break column 4 into its "_"-separated pieces and remember how many there are.
    n = split($4, F, /_/)

    # Skip the leading ID pieces (digits only).  There may be one or two of
    # them, so the name is not always the 3rd piece (19186497_AK2-Exon1_1).
    first = 1
    while (first <= n && F[first] ~ /^[0-9]+$/)
      first++

    # Skip the trailing counter pieces (digits and dots only, e.g. 1.1 or 1).
    last = n
    while (last >= first && F[last] ~ /^[0-9.]+$/)
      last--

    # Re-join what is left, so that a name which itself contains "_"
    # (RPL11-NM_000975-exon6) stays in one piece.
    p = ""
    for (i = first; i <= last; i++)
      p = p (i > first ? "_" : "") F[i]

    # Parentheses: splitting on ( and ) puts the text before them in G[1] and
    # the text inside them in G[2].  Keep G[2] when it holds a letter (NHLRC1);
    # otherwise keep G[1], which is how a numeric (10866) is never printed.
    if (split(p, G, /[)(]/) > 1)
      p = (G[2] ~ /[[:alpha:]]/) ? G[2] : G[1]

    print "chr" $1, $2, $3, "chr" $1 ":" $2 "-" $3, p   # print desired output
  }
' FS='\t' OFS='\t' in   # FS and OFS are set to tab before the file "in" is read

in tab-delimited

6	18122723	18122843	469_380805_378884(NHLRC1)_1.1_1
6	31114121	31114241	344047_16724314_rs746647_1
6	31430946	31431066	344049_16724385_HCP5(10866)_1_1
6	32808479	32808599	445446_18754304_PSMB8-exon6_1
1	33478785	33478905	19186497_AK2-Exon1_1
1	24022788	24022908	466743_18956150_RPL11-NM_000975-exon6_1

desired output tab-delimited

chr6	18122723	18122843	chr6:18122723-18122843	NHLRC1
chr6	31114121	31114241	chr6:31114121-31114241	rs746647
chr6	31430946	31431066	chr6:31430946-31431066	HCP5
chr6	32808479	32808599	chr6:32808479-32808599	PSMB8-exon6
chr1	33478785	33478905	chr1:33478785-33478905	AK2-Exon1
chr1	24022788	24022908	chr1:24022788-24022908	RPL11-NM_000975-exon6

Hi try this instead:

awk -F'\t' -v OFS='\t' '
  # Reduce column 4 to the bare name by deleting, in one pass:
  #   - the leading run of digits/underscores up to and including the "_" or "("
  #     that precedes the name, and
  #   - the trailing run that starts at ")", "(" or "_" and consists only of
  #     digits, dots, underscores and parentheses.
  # Then print chrN, start, end, chrN:start-end, name.
  {
    name = $4
    gsub(/^[0-9_]+[_(]|[)(_][_)(0-9.]+$/, "", name)
    print "chr" $1, $2, $3, "chr" $1 ":" $2 "-" $3, name
  }
' file
1 Like

Thank you very much :).