shell script performance issues --Urgent

icefish · December 13, 2007, 11:18am

I need help in awk please help immediatly.

This below function is taking lot of time
Please help me to fine tune it so that it runs faster.
The file count is around 3million records

# Process Body
processbody() {

    \#set -x

    while read line
    do
            ENTITY_TYPE=\`print "$line" | cut -d'|' -f2 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            if [ $\{ENTITY_TYPE\} == "O" ]
            then
                    ENTITY_TYPE="B"
            else
                    ENTITY_TYPE="P"
            fi
            CUSTOMER_ID=\`print "$line" | cut -d'|' -f1 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            \#Branch and Account Numbers should be left blank

            BRANCH_NUMBER=
            ACCOUNT_NUMBER=
            ACCOUNT\_DATE_OPEN=\`print "$line" | cut -d'|' -f3 |sed 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-8\`
            CORPORATE_NAME=\`print "$line" | cut -d'|' -f4 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            LAST_NAME=\`print "$line" | cut -d'|' -f5 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            FIRST_NAME=\`print "$line" | cut -d'|' -f6 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            MIDDLE_NAME=\`print "$line" | cut -d'|' -f7 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            NAME_SUFFIX=\`print "$line" | cut -d'|' -f8 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            \# Extracting person gender information
            PERSON_GENDER=\`print "$line" | cut -d'|' -f9 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            \# If gender is anything other than M or F,replace it with blank
            if [[ $\{PERSON_GENDER\} != "M" && $\{PERSON_GENDER\} != "F" ]]
            then
                    PERSON_GENDER=
            fi

            BIRTH_DATE=\`print $line | cut -d'|' -f10 | sed 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-8\`
            \#AGE should be left blank
            AGE=



            \# Extracting citizenship code information
            CITIZEN\_COUNTRY_NAME=\`print $line | cut -d'|' -f11 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`


            if [[ $\{CITIZEN\_COUNTRY_NAME\} == "US" || $\{CITIZEN\_COUNTRY_NAME\} == "USA" || $\{CITIZEN\_COUNTRY_NAME\} == "UNITED STATES" || $\{CITIZEN\_COUNTRY_NAME\} == "UNITED STATES OF AMERICA" ]]
            then
                    CITIZENSHIP_CODE="USA"
                    FED_ID=\`print $line | cut -d'|' -f12 | sed -e 's/[^0-9]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            else
                    CITIZENSHIP_CODE=\`print $line | cut -d'|' -f11 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' | cut -c1-3\`
                    FED_ID=
            fi

                    if [[ $\{ENTITY_TYPE\} == "P" ]]
                    then
                            FED\_ID_TYPE="S"
                    else
                            FED\_ID_TYPE="T"
                    fi

            \#Extracting National ID information

            ID\_INFORMATION_1=\`print $line | cut -d'|' -f13 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ID\_INFORMATION_2=\`print $line | cut -d'|' -f14 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`


            if [[ ! -z $\{ID\_INFORMATION_1\} && $\{ID\_INFORMATION_1\} != "" ]]
            then
                    NATIONAL\_ID=$\{ID\_INFORMATION_1\}

                    \# Remove all non numeric characters in NATIONAL_ID field
                    NATIONAL_ID=\`print $\{NATIONAL_ID\} | sed 's/[^0-9a-zA-Z]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
                    NATIONAL\_ID_TYPE="DL"
            elif [[ ! -z $\{ID\_INFORMATION_2\} && $\{ID\_INFORMATION_2\} != "" ]]
            then
                    NATIONAL\_ID=$\{ID\_INFORMATION_2\}


                    \# Remove all non numeric characters in NATIONAL_ID field
                    NATIONAL_ID=\`print $\{NATIONAL_ID\} | sed 's/[^0-9a-zA-Z]//g' | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
                    NATIONAL\_ID_TYPE="PP"
            else
                    NATIONAL_ID=
                    NATIONAL\_ID_TYPE=
            fi

            \#Extracting street address information

            ADDRESS_1=\`print $line | cut -d'|' -f15 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ADDRESS_2=\`print $line | cut -d'|' -f16 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            STREET\_ADDRESS=$\{ADDRESS\_1\}$\{ADDRESS_2\}
            STREET_ADDRESS=\`print $\{STREET_ADDRESS\} | cut -c1-60\`

            \#Extracting city information

            ADDRESS_3=\`print $line | cut -d'|' -f17 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            CITY\_NAME=$\{ADDRESS_3\}

            \#Extracting country information

            COUNTRY=\`print $line | cut -d'|' -f20 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ADDRESS_4=\`print $line | cut -d'|' -f18 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`

            COUNTRY_NAME=$\{COUNTRY\}

            if [[ $\{COUNTRY_NAME\} == "US" || $\{COUNTRY_NAME\} == "USA" || $\{COUNTRY_NAME\} == "UNITED STATES" || $\{COUNTRY_NAME\} == "UNITED STATES OF AMERICA" ]]
            then
                    COUNTRY_CODE="USA"
            else
                    COUNTRY_CODE=\`print $\{COUNTRY\} | sed 's/ //g' | cut -c1-3\`
            fi

            \#POSTCODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' |cut -c1-5\`
            if [[ $\{COUNTRY_CODE\} == "USA" ]]
            then
                    STATE\_CODE=$\{ADDRESS_4\}
                    POSTCODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}' |cut -c1-5\`

                    FOREIGN_PROVINCE=
                    FOREIGN\_POSTAL_CODE=
            else
                    STATE_CODE=
                    POSTCODE=
                    FOREIGN\_PROVINCE=$\{ADDRESS_4\}
                    FOREIGN\_POSTAL_CODE=\`print $line | cut -d'|' -f19 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            fi

            PROCESSBODY="CDCI|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ENTITY_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CUSTOMER_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{BRANCH_NUMBER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ACCOUNT_NUMBER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{ACCOUNT\_DATE_OPEN\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CORPORATE_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{LAST_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FIRST_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{MIDDLE_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NAME_SUFFIX\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{PERSON_GENDER\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{BIRTH_DATE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{AGE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CITIZENSHIP_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FED_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FED\_ID_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NATIONAL_ID\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{NATIONAL\_ID_TYPE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{STREET_ADDRESS\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{CITY_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{STATE_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{POSTCODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FOREIGN_PROVINCE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{FOREIGN\_POSTAL_CODE\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{COUNTRY_NAME\}|"
            PROCESSBODY="$\{PROCESSBODY\}$\{COUNTRY_CODE\}"

            print "$\{PROCESSBODY\}" &gt;&gt; $\{INQ\_TEMP_FILE\}
    done &lt; $\{EDD_HOME\}/tmp/inquiry.txt

}

vgersh99 · December 13, 2007, 11:50am

looks like you could have done all of this in ONE awk program withOUT constant chopping of lines with print|sed|cut|awk.

icefish · December 13, 2007, 2:03pm

vgersh99,
can you please let me know how to use
cut and sed inside awk program?

Thanks & Regards

icefish · December 13, 2007, 2:07pm

example
how to write below line inside awk
ACCOUNT_DATE_OPEN=`print "$line" | cut -d'|' -f3 |sed 's/[^0-9]//g' | awk '{gsub(/^[ \t]+|[ \t]+$/,"");print}' | cut -c1-8`

vgersh99 · December 13, 2007, 2:15pm

you don't need to use cut/sed - awk provides most of the cut/sed functions natively.

vgersh99 · December 13, 2007, 2:16pm

what does the 'line' look like AND what part of it you want?
A sample pls!

\t ↩︎

icefish · December 13, 2007, 2:21pm

example of line
COL001 | P | 2007-02-01-00.00.00.000000 | | sam | babu | | | M | 1949-01-04-00.00.00.000000 | INDIA | | C60 | | 110 S | | ENNIS | IN | 46563 | INDIA |

vgersh99 · December 13, 2007, 2:25pm

ok, I assume this is all ONE line.
you didn't say what part of it you wanted....
In the future pls use vB Codes when posting sample data and/or quoting others.

icefish · December 13, 2007, 2:31pm

Sure vgersh99,
I will do that from next time.
Can you please give code for doing that?

Thanks & Regards,

vgersh99 · December 13, 2007, 2:32pm

echo 'COL001 | P | 2007-02-01-00.00.00.000000 | | sam | babu | | | M | 1949-01-04-00.00.00.000000 | INDIA | | C60 | | 110 S | | ENNIS | IN | 46563 | INDIA |' | awk -F'|' '{gsub("[^0-9]", "", $3); print "[" $3 "]"}'

But I'd advise to do EVERYTHING in awk - including reading line/records - no need for the shell 'while read line....'

icefish · December 13, 2007, 2:41pm

I have changed the script to use awk only
example

awk -F "|" '{

            \#ENTITY_TYPE=\`print "$line" | cut -d'|' -f2 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            ENTITY_TYPE=$2

            if [ $\{ENTITY_TYPE\} == "O" ]
            then
                    ENTITY_TYPE="B"
            else
                    ENTITY_TYPE="P"
            fi
            \#CUSTOMER_ID=\`print "$line" | cut -d'|' -f1 | awk '\{gsub\(/^[ \\t]\+|[ \\t]\+$/,""\);print\}'\`
            CUSTOMER_ID=$1
	
	BIRTH_DATE=\`print $10 | sed 's/[^0-9]//g' | cut -c1-8\`

\}' inquiry.txt

My question is can I use sed and cut inside awk?
If now can you please tell me the alternative way to do this?

Thanks in advance.

icefish · December 13, 2007, 3:10pm

Can someone please help me on this?
Its kind of urgent..

Thanks a lot in advance.

vgersh99 · December 13, 2007, 3:19pm

you can use ANY utility from within awk, but it's not recommended as you're defeating the purpose of using just ONE tool capabale of doing it all natively.
Furthermore, I've provided a sample awk code in my previous post. Is there something in the sample code that you're stuck with?

user_prady · December 13, 2007, 8:18pm

Can you send some format of lines of your Input file..

Although I am also new to scripting but I think..
You are using awk inside while statement . The number of times the awk appears the programme will take that much time. Thats the problem I think.

you no need to put awk inside while..instead process all your data once you inside awk .

Here your all the lines of your input file is processed as many times the awk appears..

user_prady

user_prady · December 13, 2007, 8:57pm

I am not sure abt your requirment
but try with the below code

nawk '{
      split($0,arr1,"|")    
      split(arr1[3],arr2,"-")
      print arr2[1]arr2[2]arr2[3]
}' sample

\t ↩︎

user_prady · December 13, 2007, 10:50pm

icefish:

I need help in awk please help immediatly.

This below function is taking lot of time
Please help me to fine tune it so that it runs faster.
The file count is around 3million records

# Process Body
processbody() {

#set -x

while read line
do
ENTITY_TYPE=`print "$line" | cut -d'|' -f2 | awk '{gsub(/^[1]+|[ \t]+$/,"");print}'`

if [ ${ENTITY_TYPE} == "O" ]
then
ENTITY_TYPE="B"
else
ENTITY_TYPE="P"
fi
CUSTOMER_ID=`print "$line" | cut -d'|' -f1 | awk '{gsub(/^[2]+|[ \t]+$/,"");print}'`

#Branch and Account Numbers should be left blank

BRANCH_NUMBER=
ACCOUNT_NUMBER=
ACCOUNT_DATE_OPEN=`print "$line" | cut -d'|' -f3 |sed 's/[^0-9]//g' | awk '{gsub(/^[3]+|[ \t]+$/,"");print}' | cut -c1-8`
CORPORATE_NAME=`print "$line" | cut -d'|' -f4 | awk '{gsub(/^[4]+|[ \t]+$/,"");print}'`
LAST_NAME=`print "$line" | cut -d'|' -f5 | awk '{gsub(/^[5]+|[ \t]+$/,"");print}'`
FIRST_NAME=`print "$line" | cut -d'|' -f6 | awk '{gsub(/^[6]+|[ \t]+$/,"");print}'`
MIDDLE_NAME=`print "$line" | cut -d'|' -f7 | awk '{gsub(/^[7]+|[ \t]+$/,"");print}'`
NAME_SUFFIX=`print "$line" | cut -d'|' -f8 | awk '{gsub(/^[8]+|[ \t]+$/,"");print}'`

# Extracting person gender information
PERSON_GENDER=`print "$line" | cut -d'|' -f9 | awk '{gsub(/^[9]+|[ \t]+$/,"");print}'`
# If gender is anything other than M or F,replace it with blank
if [[ ${PERSON_GENDER} != "M" && ${PERSON_GENDER} != "F" ]]
then
PERSON_GENDER=
fi

BIRTH_DATE=`print $line | cut -d'|' -f10 | sed 's/[^0-9]//g' | awk '{gsub(/^[10]+|[ \t]+$/,"");print}' | cut -c1-8`
#AGE should be left blank
AGE=

# Extracting citizenship code information
CITIZEN_COUNTRY_NAME=`print $line | cut -d'|' -f11 | awk '{gsub(/^[11]+|[ \t]+$/,"");print}'`

if [[ ${CITIZEN_COUNTRY_NAME} == "US" || ${CITIZEN_COUNTRY_NAME} == "USA" || ${CITIZEN_COUNTRY_NAME} == "UNITED STATES" || ${CITIZEN_COUNTRY_NAME} == "UNITED STATES OF AMERICA" ]]
then
CITIZENSHIP_CODE="USA"
FED_ID=`print $line | cut -d'|' -f12 | sed -e 's/[^0-9]//g' | awk '{gsub(/^[12]+|[ \t]+$/,"");print}'`
else
CITIZENSHIP_CODE=`print $line | cut -d'|' -f11 | awk '{gsub(/^[13]+|[ \t]+$/,"");print}' | cut -c1-3`
FED_ID=
fi

if [[ ${ENTITY_TYPE} == "P" ]]
then
FED_ID_TYPE="S"
else
FED_ID_TYPE="T"
fi

#Extracting National ID information

ID_INFORMATION_1=`print $line | cut -d'|' -f13 | awk '{gsub(/^[14]+|[ \t]+$/,"");print}'`
ID_INFORMATION_2=`print $line | cut -d'|' -f14 | awk '{gsub(/^[15]+|[ \t]+$/,"");print}'`

if [[ ! -z ${ID_INFORMATION_1} && ${ID_INFORMATION_1} != "" ]]
then
NATIONAL_ID=${ID_INFORMATION_1}

# Remove all non numeric characters in NATIONAL_ID field
NATIONAL_ID=`print ${NATIONAL_ID} | sed 's/[^0-9a-zA-Z]//g' | awk '{gsub(/^[16]+|[ \t]+$/,"");print}'`
NATIONAL_ID_TYPE="DL"
elif [[ ! -z ${ID_INFORMATION_2} && ${ID_INFORMATION_2} != "" ]]
then
NATIONAL_ID=${ID_INFORMATION_2}

# Remove all non numeric characters in NATIONAL_ID field
NATIONAL_ID=`print ${NATIONAL_ID} | sed 's/[^0-9a-zA-Z]//g' | awk '{gsub(/^[17]+|[ \t]+$/,"");print}'`
NATIONAL_ID_TYPE="PP"
else
NATIONAL_ID=
NATIONAL_ID_TYPE=
fi

#Extracting street address information

ADDRESS_1=`print $line | cut -d'|' -f15 | awk '{gsub(/^[18]+|[ \t]+$/,"");print}'`
ADDRESS_2=`print $line | cut -d'|' -f16 | awk '{gsub(/^[19]+|[ \t]+$/,"");print}'`
STREET_ADDRESS=${ADDRESS_1}${ADDRESS_2}
STREET_ADDRESS=`print ${STREET_ADDRESS} | cut -c1-60`

#Extracting city information

ADDRESS_3=`print $line | cut -d'|' -f17 | awk '{gsub(/^[20]+|[ \t]+$/,"");print}'`
CITY_NAME=${ADDRESS_3}

#Extracting country information

COUNTRY=`print $line | cut -d'|' -f20 | awk '{gsub(/^[21]+|[ \t]+$/,"");print}'`
ADDRESS_4=`print $line | cut -d'|' -f18 | awk '{gsub(/^[22]+|[ \t]+$/,"");print}'`

COUNTRY_NAME=${COUNTRY}

if [[ ${COUNTRY_NAME} == "US" || ${COUNTRY_NAME} == "USA" || ${COUNTRY_NAME} == "UNITED STATES" || ${COUNTRY_NAME} == "UNITED STATES OF AMERICA" ]]
then
COUNTRY_CODE="USA"
else
COUNTRY_CODE=`print ${COUNTRY} | sed 's/ //g' | cut -c1-3`
fi

#POSTCODE=`print $line | cut -d'|' -f19 | awk '{gsub(/^[23]+|[ \t]+$/,"");print}' |cut -c1-5`
if [[ ${COUNTRY_CODE} == "USA" ]]
then
STATE_CODE=${ADDRESS_4}
POSTCODE=`print $line | cut -d'|' -f19 | awk '{gsub(/^[24]+|[ \t]+$/,"");print}' |cut -c1-5`

FOREIGN_PROVINCE=
FOREIGN_POSTAL_CODE=
else
STATE_CODE=
POSTCODE=
FOREIGN_PROVINCE=${ADDRESS_4}
FOREIGN_POSTAL_CODE=`print $line | cut -d'|' -f19 | awk '{gsub(/^[25]+|[ \t]+$/,"");print}'`
fi

PROCESSBODY="CDCI|"
PROCESSBODY="${PROCESSBODY}${ENTITY_TYPE}|"
PROCESSBODY="${PROCESSBODY}${CUSTOMER_ID}|"
PROCESSBODY="${PROCESSBODY}${BRANCH_NUMBER}|"
PROCESSBODY="${PROCESSBODY}${ACCOUNT_NUMBER}|"
PROCESSBODY="${PROCESSBODY}${ACCOUNT_DATE_OPEN}|"
PROCESSBODY="${PROCESSBODY}${CORPORATE_NAME}|"
PROCESSBODY="${PROCESSBODY}${LAST_NAME}|"
PROCESSBODY="${PROCESSBODY}${FIRST_NAME}|"
PROCESSBODY="${PROCESSBODY}${MIDDLE_NAME}|"
PROCESSBODY="${PROCESSBODY}${NAME_SUFFIX}|"
PROCESSBODY="${PROCESSBODY}${PERSON_GENDER}|"
PROCESSBODY="${PROCESSBODY}${BIRTH_DATE}|"
PROCESSBODY="${PROCESSBODY}${AGE}|"
PROCESSBODY="${PROCESSBODY}${CITIZENSHIP_CODE}|"
PROCESSBODY="${PROCESSBODY}${FED_ID}|"
PROCESSBODY="${PROCESSBODY}${FED_ID_TYPE}|"
PROCESSBODY="${PROCESSBODY}${NATIONAL_ID}|"
PROCESSBODY="${PROCESSBODY}${NATIONAL_ID_TYPE}|"
PROCESSBODY="${PROCESSBODY}${STREET_ADDRESS}|"
PROCESSBODY="${PROCESSBODY}${CITY_NAME}|"
PROCESSBODY="${PROCESSBODY}${STATE_CODE}|"
PROCESSBODY="${PROCESSBODY}${POSTCODE}|"
PROCESSBODY="${PROCESSBODY}${FOREIGN_PROVINCE}|"
PROCESSBODY="${PROCESSBODY}${FOREIGN_POSTAL_CODE}|"
PROCESSBODY="${PROCESSBODY}${COUNTRY_NAME}|"
PROCESSBODY="${PROCESSBODY}${COUNTRY_CODE}"

print "${PROCESSBODY}" >> ${INQ_TEMP_FILE}
done < ${EDD_HOME}/tmp/inquiry.txt

}

Try to use as less as awk statement to reduce your time

like

nawk '{
      cnt = split($0,arr1,"|")
     # for (i = 1 ; i <= cnt ; i++){
     #        print arr1
     # }
 
      ENTITY_TYPE = arr1[2]
      if (ENTITY_TYPE ~ "O"){
         ENTITY_TYPE="B"
      }
      else{
         ENTITY_TYPE="P"
      }
      CUSTOMER_ID = arr1[1]
      BRANCH_NUMBER= ""
      ACCOUNT_NUMBER=""
      split(arr1[3],arr2,"-")
      ACCOUNT_DATE_OPEN= sprint arr2[1]arr2[2]arr2[3]
      #print ACCOUNT_DATE_OPEN
      CORPORATE_NAME= arr1[4]
      #print CORPORATE_NAME
      LAST_NAME= arr1[5]
      FIRST_NAME=arr1[6]
      MIDDLE_NAME=arr1[7]
      NAME_SUFFIX=arr1[8]
      #print MIDDLE_NAME
 
# Extracting person gender information
 
    PERSON_GENDER=arr1[9]
 
    #print PERSON_GENDER
    if( PERSON_GENDER ~ "F" || PERSON_GENDER ~ "M" ){}
    else {
         PERSON_GENDER=""
    }
 
      split(arr1[10],arr3,"-")
      BIRTH_DATE= sprint arr3[1]arr3[2]arr3[3]
 
      #AGE should be left blank
      AGE=""
      CITIZEN_COUNTRY_NAME = arr1[11]
      if (CITIZEN_COUNTRY_NAME ~ "US" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES OF AMERICA" || CITIZEN_COUNTRY_NAME ~ "USA"){
             CITIZEN_COUNTRY_NAME = "USA"
     }
}' sample_input_file

\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎
\t ↩︎

icefish · December 14, 2007, 7:29am

#set -x

nawk '{
                split($0,arr1,"|")

                ENTITY_TYPE = arr1[2]
                if (ENTITY_TYPE ~ "O"){
                    ENTITY_TYPE = "B"
                }
                else {
                    ENTITY_TYPE = "P"
                }

                CUSTOMER_ID = arr1[1]

                #Branch and Account Numbers should be left blank

                BRANCH_NUMBER = ""
                ACCOUNT_NUMBER = ""
                split(arr1[3],arr2,"-")
                ACCOUNT_DATE_OPEN = sprint arr2[1]arr2[2]arr2[3]
                CORPORATE_NAME = arr1[4]
                LAST_NAME = arr1[5]
                FIRST_NAME = arr1[6]
                MIDDLE_NAME = arr1[7]
                NAME_SUFFIX = arr1[8]

                # Extracting person gender information
                PERSON_GENDER = arr1[9]
                # If gender is anything other than M or F,replace it with blank
                if (PERSON_GENDER ~ "F" || PERSON_GENDER ~ "M") {
                }
                else {
                   PERSON_GENDER=""
                }

                split(arr1[10],arr3,"-")
                BIRTH_DATE = sprint arr3[1]arr3[2]arr3[3]
                #AGE should be left blank
                AGE = ""

                # Extracting citizenship code information
                CITIZEN_COUNTRY_NAME = arr1[11]


                if (CITIZEN_COUNTRY_NAME ~ "US" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES" || CITIZEN_COUNTRY_NAME ~ "UNITED STATES OF AMERICA" || CITIZEN_COUNTRY_NAME ~ "USA") {
                  CITIZEN_COUNTRY_NAME = "USA"
                  #FED_ID = `print arr1[12] | sed -e 's/[^0-9]//g'`
                  FED_ID = arr1[12]
                }
                else {
                  CITIZENSHIP_CODE = arr1[11]
                  FED_ID = ""
                }

                        if (ENTITY_TYPE ~ "P") {
                          FED_ID_TYPE = "S"
                        }
                        else {
                           FED_ID_TYPE = "T"
                        }

                #Extracting National ID information

                ID_INFORMATION_1 = arr1[13]
                ID_INFORMATION_2 = arr1[14]

                if (length(ID_INFORMATION_1) > 0 && ID_INFORMATION_1 != "") {
                  NATIONAL_ID = ID_INFORMATION_1
                  #NATIONAL_ID = `print NATIONAL_ID | sed 's/[^0-9a-zA-Z]//g'`
                  NATIONAL_ID_TYPE = "DL"
                }
                else if (length(ID_INFORMATION_2) > 0 && ID_INFORMATION_2 != "") {
                  NATIONAL_ID = D_INFORMATION_2
                  #NATIONAL_ID = `print NATIONAL_ID | sed 's/[^0-9a-zA-Z]//g'`
                  NATIONAL_ID_TYPE = "DL"
                }
                else
                  NATIONAL_ID = ""
                  NATIONAL_ID_TYPE = ""
                }





                #Extracting street address information

                ADDRESS_1 = arr1[15]
                ADDRESS_2 = arr1[16]
                STREET_ADDRESS = sprint ADDRESS_1 ADDRESS_2
                STREET_ADDRESS = substr(STREET_ADDRESS,1,60)

                #Extracting city information

                ADDRESS_3 = arr1[17]
                CITY_NAME = ADDRESS_3

                #Extracting country information
                COUNTRY = arr1[20]
                ADDRESS_4 = arr1[18]
                COUNTRY_NAME = COUNTRY

                if (COUNTRY_NAME ~ "US" || COUNTRY_NAME ~ "USA" || COUNTRY_NAME ~ "UNITED STATES" || COUNTRY_NAME ~ "UNITED STATES OF AMERICA") {
                  COUNTRY_CODE = "USA"
                }
                else {
                   COUNTRY_CODE = substr(COUNTRY,1,3)
                }


                if (COUNTRY_CODE ~ "USA") {
                        STATE_CODE = ADDRESS_4
                        POSTCODE = substr(arr1[19],1,5)
                        FOREIGN_PROVINCE = ""
                        FOREIGN_POSTAL_CODE = ""
                }
                else {
                        STATE_CODE = ""
                        POSTCODE = ""
                        FOREIGN_PROVINCE = ADDRESS_4
                        FOREIGN_POSTAL_CODE = arr1[19]
                }

printf("%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s\n", CDCI, ENTITY_TYPE, CUSTOMER_ID, BRANCH_NUMBER, ACCOUNT_NUMBER, ACCOUNT_DATE_OPEN, CORPORATE_NAME, LAST_NAME, FIRST_NAME, MIDDLE_NAME, NAME_SUFFIX, PERSON_GENDER, BIRTH_DATE, AGE, CITIZENSHIP_CODE, FED_ID, FED_ID_TYPE, NATIONAL_ID, NATIONAL_ID_TYPE, STREET_ADDRESS, CITY_NAME, STATE_CODE, POSTCODE, FOREIGN_PROVINCE, FOREIGN_POSTAL_CODE, COUNTRY_NAME, COUNTRY_CODE) >> test_out_file.txt


}' input_inq1.txt

I have changes the code as above.
when i execute I get the below error
Syntax Error The source line is 102.
The error context is
>>> if <<< (COUNTRY_NAME ~ "US" || COUNTRY_NAME ~ "USA" || COUNTRY_NAME ~ "UNITED STATES" || COUNTRY_NAME ~ "UNITED STATES OF AMERICA") {
awk: 0602-542 There is an extra } character.
awk: 0602-500 Quitting The source line is 102.

Can someone tell me why this error is comming?

Thanks

icefish · December 14, 2007, 9:02am

I think I found the error.
Thanks all for the great help and inputs. really appreciate all.

One final question
how to remove all chars except numbers in a variable inside awk?
example
FED_ID = `print arr1[12] | sed -e 's/[^0-9]//g'`
is not working
Can someone suggest an alternative?

icefish · December 14, 2007, 9:43am

Thanks all,
I got it.
Have a nice weekend.