shell script performance issues --Urgent

I need help in awk please help immediatly.

This below function is taking lot of time
Please help me to fine tune it so that it runs faster.
The file count is around 3million records

# Process Body
processbody() {

    \#set -x

    while read line
    do
            ENTITY_TYPE=\`print "$line" | cut -d'|' -f2 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            if [ $\{ENTITY_TYPE\} == "O" ]
            then
                    ENTITY_TYPE="B"
            else
                    ENTITY_TYPE="P"
            fi
            CUSTOMER_ID=\`print "$line" | cut -d'|' -f1 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            \#Branch and Account Numbers should be left blank

            BRANCH_NUMBER=
            ACCOUNT_NUMBER=
            ACCOUNT\_DATE_OPEN=\`print "$line" | cut -d'|' -f3 |sed 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-8\`
            CORPORATE_NAME=\`print "$line" | cut -d'|' -f4 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            LAST_NAME=\`print "$line" | cut -d'|' -f5 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            FIRST_NAME=\`print "$line" | cut -d'|' -f6 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            MIDDLE_NAME=\`print "$line" | cut -d'|' -f7 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            NAME_SUFFIX=\`print "$line" | cut -d'|' -f8 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            \# Extracting person gender information
            PERSON_GENDER=\`print "$line" | cut -d'|' -f9 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            \# If gender is anything other than M or F,replace it with blank
            if [[ $\{PERSON_GENDER\} != "M" && $\{PERSON_GENDER\} != "F" ]]
            then
                    PERSON_GENDER=
            fi

            BIRTH_DATE=\`print $line | cut -d'|' -f10 | sed 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-8\`
            \#AGE should be left blank
            AGE=



            \# Extracting citizenship code information
            CITIZEN\_COUNTRY_NAME=\`print $line | cut -d'|' -f11 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`


            if [[ $\{CITIZEN\_COUNTRY_NAME\} == "US" || $\{CITIZEN\_COUNTRY_NAME\} == "USA" || $\{CITIZEN\_COUNTRY_NAME\} == "UNITED STATES" || $\{CITIZEN\_COUNTRY_NAME\} == "UNITED STATES OF AMERICA" ]]
            then
                    CITIZENSHIP_CODE="USA"
                    FED_ID=\`print $line | cut -d'|' -f12 | sed -e 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            else
                    CITIZENSHIP_CODE=\`print $line | cut -d'|' -f11 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-3\`
                    FED_ID=
            fi

                    if [[ $\{ENTITY_TYPE\} == "P" ]]
                    then
                            FED\_ID_TYPE="S"
                    else
                            FED\_ID_TYPE="T"
                    fi

            \#Extracting National ID information

            ID\_INFORMATION_1=\`print $line | cut -d'|' -f13 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ID\_INFORMATION_2=\`print $line | cut -d'|' -f14 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`


            if [[ ! -z $\{ID\_INFORMATION_1\} && $\{ID\_INFORMATION_1\} != "" ]]
            then
                    NATIONAL\_ID=$\{ID\_INFORMATION_1\}

                    \# Remove all non numeric characters in NATIONAL_ID field
                    NATIONAL_ID=\`print $\{NATIONAL_ID\} | sed 's/[^0-9a-zA-Z]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
                    NATIONAL\_ID_TYPE="DL"
            elif [[ ! -z $\{ID\_INFORMATION_2\} && $\{ID\_INFORMATION_2\} != "" ]]
            then
                    NATIONAL\_ID=$\{ID\_INFORMATION_2\}


                    \# Remove all non numeric characters in NATIONAL_ID field
                    NATIONAL_ID=\`print $\{NATIONAL_ID\} | sed 's/[^0-9a-zA-Z]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
                    NATIONAL\_ID_TYPE="PP"
            else
                    NATIONAL_ID=
                    NATIONAL\_ID_TYPE=
            fi

            \#Extracting street address information

            ADDRESS_1=\`print $line | cut -d'|' -f15 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ADDRESS_2=\`print $line | cut -d'|' -f16 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            STREET\_ADDRESS=$\{ADDRESS\_1\}$\{ADDRESS_2\}
            STREET_ADDRESS=\`print $\{STREET_ADDRESS\} | cut -c1-60\`

            \#Extracting city information

            ADDRESS_3=\`print $line | cut -d'|' -f17 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            CITY\_NAME=$\{ADDRESS_3\}

            \#Extracting country information

            COUNTRY=\`print $line | cut -d'|' -f20 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ADDRESS_4=\`print $line | cut -d'|' -f18 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            COUNTRY_NAME=$\{COUNTRY\}

            if [[ $\{COUNTRY_NAME\} == "US" || $\{COUNTRY_NAME\} == "USA" || $\{COUNTRY_NAME\} == "UNITED STATES" || $\{COUNTRY_NAME\} == "UNITED STATES OF AMERICA" ]]
            then
                    COUNTRY_CODE="USA"
            else
                    COUNTRY_CODE=\`print $\{COUNTRY\} | sed 's/ //g' | cut -c1-3\`
            fi

            \#POSTCODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' |cut -c1-5\`
            if [[ $\{COUNTRY_CODE\} == "USA" ]]
            then
                    STATE\_CODE=$\{ADDRESS_4\}
                    POSTCODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' |cut -c1-5\`

                    FOREIGN_PROVINCE=
                    FOREIGN\_POSTAL_CODE=
            else
                    STATE_CODE=
                    POSTCODE=
                    FOREIGN\_PROVINCE=$\{ADDRESS_4\}
                    FOREIGN\_POSTAL_CODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            fi

            PROCESSBODY="CDCI|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ENTITY_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CUSTOMER_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{BRANCH_NUMBER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ACCOUNT_NUMBER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ACCOUNT\_DATE_OPEN\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CORPORATE_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{LAST_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FIRST_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{MIDDLE_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NAME_SUFFIX\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{PERSON_GENDER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{BIRTH_DATE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{AGE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CITIZENSHIP_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FED_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FED\_ID_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NATIONAL_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NATIONAL\_ID_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{STREET_ADDRESS\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CITY_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{STATE_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{POSTCODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FOREIGN_PROVINCE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FOREIGN\_POSTAL_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{COUNTRY_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{COUNTRY_CODE\}"

            print "$\{PROCESSBODY\}" >> $\{INQ\_TEMP_FILE\}
    done < $\{EDD_HOME\}/tmp/inquiry.txt

}

looks like you could have done all of this in ONE awk program withOUT constant chopping of lines with print|sed|cut|awk.

vgersh99,
can you please let me know how to use
cut and sed inside awk program?

Thanks & Regards

example
how to write below line inside awk
ACCOUNT_DATE_OPEN=`print "$line" | cut -d'|' -f3 |sed 's/[^0-9]//g' | awk '{gsub(/^[ \t]+|[ \t]+$/,"");print}' | cut -c1-8`

you don't need to use cut/sed - awk provides most of the cut/sed functions natively.

what does the 'line' look like AND what part of it you want?
A sample pls!


  1. \t ↩︎

example of line
COL001 | P | 2007-02-01-00.00.00.000000 | | sam | babu | | | M | 1949-01-04-00.00.00.000000 | INDIA | | C60 | | 110 S | | ENNIS | IN | 46563 | INDIA |

ok, I assume this is all ONE line.
you didn't say what part of it you wanted....
In the future pls use vB Codes when posting sample data and/or quoting others.

Sure vgersh99,
I will do that from next time.
Can you please give code for doing that?

Thanks & Regards,

echo 'COL001 | P | 2007-02-01-00.00.00.000000 | | sam | babu | | | M | 1949-01-04-00.00.00.000000 | INDIA | | C60 | | 110 S | | ENNIS | IN | 46563 | INDIA |' | awk -F'|' '{gsub("[^0-9]", "", $3); print "[" $3 "]"}'

But I'd advise to do EVERYTHING in awk - including reading line/records - no need for the shell 'while read line....'

I have changed the script to use awk only
example

awk -F "|" '{

            \#ENTITY_TYPE=\`print "$line" | cut -d'|' -f2 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ENTITY_TYPE=$2

            if [ $\{ENTITY_TYPE\} == "O" ]
            then
                    ENTITY_TYPE="B"
            else
                    ENTITY_TYPE="P"
            fi
            \#CUSTOMER_ID=\`print "$line" | cut -d'|' -f1 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            CUSTOMER_ID=$1
	
	BIRTH_DATE=\`print $10 | sed 's/[^0-9]//g' | cut -c1-8\`

\}' inquiry.txt

My question is can I use sed and cut inside awk?
If now can you please tell me the alternative way to do this?

Thanks in advance.

Can someone please help me on this?
Its kind of urgent..

Thanks a lot in advance.

you can use ANY utility from within awk, but it's not recommended as you're defeating the purpose of using just ONE tool capabale of doing it all natively.
Furthermore, I've provided a sample awk code in my previous post. Is there something in the sample code that you're stuck with?

Can you send some format of lines of your Input file..

Although I am also new to scripting but I think..
You are using awk inside while statement . The number of times the awk appears the programme will take that much time. Thats the problem I think.

you no need to put awk inside while..instead process all your data once you inside awk .

Here your all the lines of your input file is processed as many times the awk appears..

user_prady

I am not sure abt your requirment
but try with the below code

nawk '{
      split($0,arr1,"|")    
      split(arr1[3],arr2,"-")
      print arr2[1]arr2[2]arr2[3]
}' sample

  1. \t ↩︎

Try to use as less as awk statement to reduce your time

like

nawk '{
      cnt = split($0,arr1,"|")
     # for (i = 1 ; i <= cnt ; i++){
     #        print arr1
     # }
 
      ENTITY_TYPE = arr1[2]
      if (ENTITY_TYPE ~ "O"){
         ENTITY_TYPE="B"
      }
      else{
         ENTITY_TYPE="P"
      }
      CUSTOMER_ID = arr1[1]
      BRANCH_NUMBER= ""
      ACCOUNT_NUMBER=""
      split(arr1[3],arr2,"-")
      ACCOUNT_DATE_OPEN= sprint arr2[1]arr2[2]arr2[3]
      #print ACCOUNT_DATE_OPEN
      CORPORATE_NAME= arr1[4]
      #print CORPORATE_NAME
      LAST_NAME= arr1[5]
      FIRST_NAME=arr1[6]
      MIDDLE_NAME=arr1[7]
      NAME_SUFFIX=arr1[8]
      #print MIDDLE_NAME
 
# Extracting person gender information
 
    PERSON_GENDER=arr1[9]
 
    #print PERSON_GENDER
    if( PERSON_GENDER ~ "F" || PERSON_GENDER ~ "M" ){}
    else {
         PERSON_GENDER=""
    }
 
      split(arr1[10],arr3,"-")
      BIRTH_DATE= sprint arr3[1]arr3[2]arr3[3]
 
      #AGE should be left blank
      AGE=""
      CITIZEN_COUNTRY_NAME = arr1[11]
      if (CITIZEN_COUNTRY_NAME ~ "US" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES OF AMERICA" || CITIZEN_COUNTRY_NAME ~ "USA"){
             CITIZEN_COUNTRY_NAME = "USA"
     }
}' sample_input_file

  1. \t ↩︎

  2. \t ↩︎

  3. \t ↩︎

  4. \t ↩︎

  5. \t ↩︎

  6. \t ↩︎

  7. \t ↩︎

  8. \t ↩︎

  9. \t ↩︎

  10. \t ↩︎

  11. \t ↩︎

  12. \t ↩︎

  13. \t ↩︎

  14. \t ↩︎

  15. \t ↩︎

  16. \t ↩︎

  17. \t ↩︎

  18. \t ↩︎

  19. \t ↩︎

  20. \t ↩︎

  21. \t ↩︎

  22. \t ↩︎

  23. \t ↩︎

  24. \t ↩︎

  25. \t ↩︎

#set -x

nawk '{
                split($0,arr1,"|")

                ENTITY_TYPE = arr1[2]
                if (ENTITY_TYPE ~ "O"){
                    ENTITY_TYPE = "B"
                }
                else {
                    ENTITY_TYPE = "P"
                }

                CUSTOMER_ID = arr1[1]

                #Branch and Account Numbers should be left blank

                BRANCH_NUMBER = ""
                ACCOUNT_NUMBER = ""
                split(arr1[3],arr2,"-")
                ACCOUNT_DATE_OPEN = sprint arr2[1]arr2[2]arr2[3]
                CORPORATE_NAME = arr1[4]
                LAST_NAME = arr1[5]
                FIRST_NAME = arr1[6]
                MIDDLE_NAME = arr1[7]
                NAME_SUFFIX = arr1[8]

                # Extracting person gender information
                PERSON_GENDER = arr1[9]
                # If gender is anything other than M or F,replace it with blank
                if (PERSON_GENDER ~ "F" || PERSON_GENDER ~ "M") {
                }
                else {
                   PERSON_GENDER=""
                }

                split(arr1[10],arr3,"-")
                BIRTH_DATE = sprint arr3[1]arr3[2]arr3[3]
                #AGE should be left blank
                AGE = ""

                # Extracting citizenship code information
                CITIZEN_COUNTRY_NAME = arr1[11]


                if (CITIZEN_COUNTRY_NAME ~ "US" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES OF AMERICA" || CITIZEN_COUNTRY_NAME ~ "USA") {
                  CITIZEN_COUNTRY_NAME = "USA"
                  #FED_ID = `print arr1[12] | sed -e 's/[^0-9]//g'`
                  FED_ID = arr1[12]
                }
                else {
                  CITIZENSHIP_CODE = arr1[11]
                  FED_ID = ""
                }

                        if (ENTITY_TYPE ~ "P") {
                          FED_ID_TYPE = "S"
                        }
                        else {
                           FED_ID_TYPE = "T"
                        }

                #Extracting National ID information

                ID_INFORMATION_1 = arr1[13]
                ID_INFORMATION_2 = arr1[14]

                if (length(ID_INFORMATION_1) > 0 && ID_INFORMATION_1 != "") {
                  NATIONAL_ID = ID_INFORMATION_1
                  #NATIONAL_ID = `print NATIONAL_ID | sed 's/[^0-9a-zA-Z]//g'`
                  NATIONAL_ID_TYPE = "DL"
                }
                else if (length(ID_INFORMATION_2) > 0 && ID_INFORMATION_2 != "") {
                  NATIONAL_ID = D_INFORMATION_2
                  #NATIONAL_ID = `print NATIONAL_ID | sed 's/[^0-9a-zA-Z]//g'`
                  NATIONAL_ID_TYPE = "DL"
                }
                else
                  NATIONAL_ID = ""
                  NATIONAL_ID_TYPE = ""
                }





                #Extracting street address information

                ADDRESS_1 = arr1[15]
                ADDRESS_2 = arr1[16]
                STREET_ADDRESS = sprint ADDRESS_1 ADDRESS_2
                STREET_ADDRESS = substr(STREET_ADDRESS,1,60)

                #Extracting city information

                ADDRESS_3 = arr1[17]
                CITY_NAME = ADDRESS_3

                #Extracting country information
                COUNTRY = arr1[20]
                ADDRESS_4 = arr1[18]
                COUNTRY_NAME = COUNTRY

                if (COUNTRY_NAME ~ "US" || COUNTRY_NAME ~ "USA" || COUNTRY_NAME ~ "UNITED STATES" || COUNTRY_NAME ~ "UNITED STATES OF AMERICA") {
                  COUNTRY_CODE = "USA"
                }
                else {
                   COUNTRY_CODE = substr(COUNTRY,1,3)
                }


                if (COUNTRY_CODE ~ "USA") {
                        STATE_CODE = ADDRESS_4
                        POSTCODE = substr(arr1[19],1,5)
                        FOREIGN_PROVINCE = ""
                        FOREIGN_POSTAL_CODE = ""
                }
                else {
                        STATE_CODE = ""
                        POSTCODE = ""
                        FOREIGN_PROVINCE = ADDRESS_4
                        FOREIGN_POSTAL_CODE = arr1[19]
                }

printf("%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n", CDCI, ENTITY_TYPE, CUSTOMER_ID, BRANCH_NUMBER, ACCOUNT_NUMBER, ACCOUNT_DATE_OPEN, CORPORATE_NAME, LAST_NAME, FIRST_NAME, MIDDLE_NAME, NAME_SUFFIX, PERSON_GENDER, BIRTH_DATE, AGE, CITIZENSHIP_CODE, FED_ID, FED_ID_TYPE, NATIONAL_ID, NATIONAL_ID_TYPE, STREET_ADDRESS, CITY_NAME, STATE_CODE, POSTCODE, FOREIGN_PROVINCE, FOREIGN_POSTAL_CODE, COUNTRY_NAME, COUNTRY_CODE) >> test_out_file.txt


}' input_inq1.txt

I have changes the code as above.
when i execute I get the below error
Syntax Error The source line is 102.
The error context is
>>> if <<< (COUNTRY_NAME ~ "US" || COUNTRY_NAME ~ "USA" || COUNTRY_NAME ~ "UNITED STATES" || COUNTRY_NAME ~ "UNITED STATES OF AMERICA") {
awk: 0602-542 There is an extra } character.
awk: 0602-500 Quitting The source line is 102.

Can someone tell me why this error is comming?

Thanks

I think I found the error.
Thanks all for the great help and inputs. really appreciate all.

One final question
how to remove all chars except numbers in a variable inside awk?
example
FED_ID = `print arr1[12] | sed -e 's/[^0-9]//g'`
is not working
Can someone suggest an alternative?

Thanks all,
I got it.
Have a nice weekend.