Fixed Width file creation from csv

Hello All, I'm able to achieve my goal of creating a fixed-width file from a comma-delimited one, but I know I'm not doing it as efficiently as possible.

Original File

checksab
004429876883,O,342040,981.98,10232014
004429876883,O,322389,2615.00,10232014
004429876883,V,323624,10826.51,10232014
004429876883,O,383734,30000.00,10232014
004429876883,O,349626,929.25,10232014
004429876883,O,389727,14095.75,10232014
004429876883,O,379910,1669.23,10232014

Desired/Achieved Output

final.txt
004429876883O0000342040000000098198102320140
004429876883O0000322389000000261500102320140
004429876883V0000323624000001082651102320140
004429876883O0000383734000003000000102320140
004429876883O0000349626000000092925102320140
004429876883O0000389627000001409575102320140
004429876883O0000379910000000166923102320140

I was able to achieve the desired output in this manner.

#!/bin/ksh
# Convert the comma-delimited file checksab into the fixed-width final.txt.

# Replace the field-separating commas with spaces.
tr ',' ' ' < checksab > check1
# Delete the decimal points.
tr -d '[.]' < check1 > check2
# Append field 2 to field 1.
awk '{$1= $1 $2}1' check2 > check3
# Append field 5 to field 4.
awk '{$4= $4 $5}1' check3 > check4
# Right-justify fields 1, 3 and 4 in widths 11, 9 and 19, each followed by a space.
awk '{ printf("%11s %9s %19s \n", $1, $3, $4); }' check4 > check5
# Turn every space (padding and separators) into a 0.
sed 's/ /0/g' check5 > final.txt

Creating 5 files to get to the final.txt isn't ideal but it's much quicker than our previous way of running it through excel formulas. Any direction y'all could provide to clean this up would be greatly appreciated.

I wrote a C tool for this and similar tasks (it still needs tr to delete the .), but use pipes! Set its options to pad with 0, split columns on ',' and right-justify the numbers. Its original use was to align tab-separated columns in bulk data (assuming a fixed-pitch font):

$ cat mysrc/autotab.c
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/* File-scope state shared by the measuring pass and the printing pass of main(). */
static  FILE    *tmp ;          /* temp file: written in pass 1, re-read in pass 2 */
static  char    *just = "l" ;   /* justification letters, one per column (-j) */
static  char    *osep = "  " ;  /* output column separator string (-os) */
static  char    j ;             /* justification letter of the current column */
static  char    sav[4096] ;     /* text of the current column while it is right/center justified */
static  char    savl[65536];/* output line being assembled */
static  int     c ;             /* character read */
static  int     cl = 0 ;        /* length of the current column's text */
static  int     col = 0 ;       /* current column index, counted from 0 */
static  int     fs = 0 ;        /* pass 1: blanks seen inside a column, not yet written; pass 2: padding width */
static  int     gen_hdr = 0 ;   /* nonzero when -gh asks for a "Col. N" header line */
static  int     i ;             /* utility int (argv index) */
static  int     no ;            /* nonzero for -no: text ended by a newline is not measured */
static  int     isep = '\t' ;   /* input column separator character (-is) */
static  int     jlen = 1 ;      /* number of letters in just */
static  int     l[4096] ;       /* widest text measured for each column */
static  int     ll = 0 ;        /* number of characters stored in savl */
static  int     maxcol = 0 ;    /* largest column count seen while measuring */
int main( int argc, char **argv ){
        for ( i = 1 ; i < argc ; i++ ){
                if ( !strcmp( argv, "-is" ) && ( i + 1 ) < argc ){
                        isep = argv[++i][0] ;
                        continue ;
                 }
                if ( !strcmp( argv, "-os" ) && ( i + 1 ) < argc ){
                        osep = argv[++i] ;
                        continue ;
                 }
                if ( !strcmp( argv, "-no" )){
                        no = 1 ;
                        continue ;
                 }
                if ( !strcmp( argv, "-j" ) && ( i + 1 ) < argc ){
                        just = argv[++i] ;
                        jlen = strlen( just );
                        continue ;
                 }
                if ( !strcmp( argv, "-gh" ) ){
                        gen_hdr = 1 ;
                        continue ;
                 }
                fprintf( stderr,
"\n"
"Usage: autotab [ -is <i_sep> ] [ -os <o_sep> ] [ -gh ] [ -no ] [ -j <just> ]\n"
"\n"
"Scans input as columns defined by <i_sep> (default tab), measuring maximum\n"
"column width without blank padding and saving the input.  Lines with no\n"
"<i_sep> are not measured.  If -no is present (narrow, overlapping), the\n"
"characters between the last <i_sep> on a line and the line feed are not\n"
"measured.  (The -no option is only useful with left justification.)\n"
"After reading EOF, the saved input is printed, padded to the measured\n"
"column width and separated by the <o_sep> string (default 2 spaces) with\n"
"empty right side columns, blanks and carriage returns suppressed.\n"
"If -j is present, the characters of <just> define the justification of each\n"
"column with the same relative offset:\n"
" r for right, c for centered, and anything else means left.\n"
"If -gh is present, the saved input is prefixed by a numbered column header,\n"
"which is padded and aligned like the data.\n"
"The size limits are: %d measured columns, output line %d characters\n"
"and right or center justified column data width %d characters.\n"
"\n",
                        sizeof( l )/sizeof(int),
                        sizeof( savl ),
                        sizeof( sav ));
                exit( 1 );
         }
        if ( NULL == ( tmp = tmpfile() )){
                perror( "tmpfile()" );
                exit( 1 );
         }
        memset( (char*)l, 0, sizeof( l ) );
        do {
                switch( c = getchar() ){
                case EOF:
                        if ( ferror( stdin ) ){
                                perror( "stdin" );
                                exit( 1 );
                         }
                        continue ;      /* Out of loop */
                case '\n':
                        if ( no ){
                                col = 0 ;
                                cl = 0 ;
                                fs = 0 ;
                                break ;
                         }
                        /* Intentional Fall Through */
                case '\f':
                        if ( cl && col ){
                                if ( col == ( sizeof( l ) / sizeof( int ) )){
                                        fprintf( stderr,
                                                 "Too many columns!\n" );
                                        exit( 1 );
                                 }
                                if ( cl > l[col] ){
                                        l[col++] = cl ;
                                 }
                         }
                        if ( col > maxcol ){
                                maxcol = col ;
                         }
                        col = 0 ;
                        cl = 0 ;
                        fs = 0 ;
                        break ;
                case ' ':
                        if ( cl ){
                                fs++ ;
                         }
                case '\r':
                        continue ;
                default:
                        if ( c == isep ){
                                if ( cl ){
                                        if ( col == ( sizeof( l )
                                                        / sizeof( int ) )){
                                                fprintf( stderr,
                                                         "Too many columns!\n"
                                                        );
                                                exit( 1 );
                                         }
                                        if ( cl > l[col] ){
                                                l[col] = cl ;
                                         }
                                 }
                                col++ ;
                                cl = 0 ;
                                fs = 0 ;
                                break ;
                         }
                        cl++ ;
                        cl += fs ;
                        while ( fs ){
                                fs-- ;
                                if ( EOF == putc( ' ', tmp )){
                                        perror( "putc(tmp)" );
                                        exit( 1 );
                                 }
                         }
                        break ;
                 }
                if ( EOF == putc( c, tmp )){
                        perror( "putc(tmp)" );
                        exit( 1 );
                 }
        } while ( c != EOF );
        rewind( tmp );
        if ( gen_hdr ){
                col = 0 ;
                do {
                        if ( 0 > ( cl = printf( "Col. %d", col + 1 ))){
                                if ( ferror( stdout )){
                                        perror( "stdout" );
                                        exit( 1 );
                                 }
                                exit( 0 );
                         }
                        if ( cl > l[col] ){
                                l[col] = cl ;
                         } else while ( cl++ < l[col] ){
                                if ( EOF == putchar( ' ' )){
                                        if ( ferror( stdout )){
                                                perror( "stdout" );
                                                exit( 1 );
                                         }
                                        exit( 0 );
                                 }
                         }
                        if ( ++col == maxcol ){
                                break ;
                         }
                        if ( EOF == fputs( osep, stdout )){
                                if ( ferror( stdout )){
                                        perror( "stdout" );
                                        exit( 1 );
                                 }
                                exit( 0 );
                         }
                 } while ( 1 );
                if ( EOF == putchar( '\n' )){
                        if ( ferror( stdout ) ){
                                perror( "stdout" );
                                exit( 1 );
                         }
                        exit( 0 );
                 }
                cl = col = 0 ;
         }
        j = *just ;
        do {
                switch ( c = getc( tmp )){
                case EOF:
                        if ( ferror( tmp )){
                                perror( "getc(tmp)" );
                                exit( 1 );
                         }
                        if ( !ll && !cl ){
                                exit( 0 );
                         }
                        c = '\n' ;
                        /* Intentional fall through for EOF as linefeed */
                case '\f':
                case '\n':
                        if ( cl ){
                                if ( col ){
                                        fs = l[col] - cl ;
                                 } else {
                                        fs = 0 ;
                                 }
                                switch ( j ){
                                case 'c':
                                        fs >>= 1 ;
                                case 'r':
                                        if ( ll > ( sizeof( savl ) - fs - cl )){
                                                fputs(
"Output line too long!\n",                              stderr );
                                                exit( -1 );
                                        }
                                        ll += sprintf( savl + ll,
                                                "%*s%.*s",
                                                fs,
                                                "",
                                                cl,
                                                sav );
                                        break ;
                                 }
                         }
                        while ( savl[--ll] == ' '
                             || savl[ll] == '\t' ){
                                /* nothing */
                         }
                        if ( 0 > printf( "%.*s%c", ++ll, savl, c )){
                                if ( ferror( stdout )){
                                        perror( "stdout" );
                                 }
                                exit( 1 );
                         }
                        ll = 0 ;
                        col = 0 ;
                        cl = 0 ;
                        fs = 0 ;
                        j = *just ;
                        break ;
                default:
                        if ( c == isep ){
                                fs = l[col] - cl ;
                                if ( ll >
                                  ( sizeof( savl ) - fs - cl - strlen( osep ))){
                                        fputs(
"Output line too long!\n",                      stderr );
                                        exit( 1 );
                                }
                                switch ( j ){
                                case 'c':
                                        ll += sprintf( savl + ll,
                                                "%*s%.*s%*s",
                                                fs >> 1,
                                                "",
                                                cl,
                                                sav,
                                                fs - ( fs >> 1 ),
                                                "" );
                                        break ;
                                case 'r':
                                        ll += sprintf( savl + ll,
                                                "%*s%.*s",
                                                fs,
                                                "",
                                                cl,
                                                sav );
                                        break ;
                                default:
                                        ll += sprintf( savl + ll, "%*s", fs, ""
                                                );
                                        break ;
                                 }
                                ll += sprintf( savl + ll, "%s", osep );
                                if ( ++col < jlen ){
                                        j = just[col] ;
                                 } else {
                                        j = 'l' ;
                                 }
                                fs = 0 ;
                                cl = 0 ;
                                continue ;
                         }
                        if ( j == 'r' || j == 'c' ){
                                if ( cl >= sizeof( sav )){
                                        fprintf( stderr,
"\nFatal: Column %d too wide.\n",
                                                ++col );
                                        exit( 1 );
                                 }
                                sav[cl++] = c ;
                                continue ;
                         }
                        if ( ll >= sizeof( savl )){
                                fprintf( stderr, "Output line too long!\n" );
                                exit( 1 );
                         }
                        cl++ ;
                        savl[ll++] = c ;
                        break ;
                 }
         } while ( c != EOF );
        exit( 0 );
 }

Did you need that last 0 at the end added?

$ awk 'BEGIN { FS = "," } { gsub(/[.]/, "", $4); printf "%11s%s%010d%012d%s\n", $1, $2, $3, $4, $5 }' checksab
004429876883O000034204000000009819810232014
004429876883O000032238900000026150010232014
004429876883V000032362400000108265110232014
004429876883O000038373400000300000010232014
004429876883O000034962600000009292510232014
004429876883O000038972700000140957510232014
004429876883O000037991000000016692310232014

Yes, the last '0' at the end is actually needed.

Just literally add it to the printf format specifier then.

awk 'BEGIN { FS = "," } { gsub(/[.]/, "", $4); printf "%11s%s%010d%012d%s0\n", $1, $2, $3, $4, $5 }' checksab
2 Likes

It could be safer to multiply field 4 by 100 rather than removing the decimal point. I'm thinking of input like 12.5:

# %.0f rounds the product; %d would truncate it (1.15*100 is 114.99999999999999 in floating point).
awk -F, '{printf("%11s%s%010d%012.0f%s0\n",$1,$2,$3,$4*100,$5)}' checksab
1 Like

Without 5 temp files, awk, sed, or tr, and just using ksh built-ins, you could try:

#!/bin/ksh
# Split each record on commas and the decimal point, then print the six
# pieces at their fixed widths followed by a literal 0.
while IFS=',.' read id flag num whole frac tail
do
        printf '%s%s%010d%010d%02d%08d0\n' \
                "$id" "$flag" "$num" "$whole" "$frac" "$tail"
done < checksab > final.txt

but, for the 6th input line:

004429876883,O,389727,14095.75,10232014

in your sample, it produces:

004429876883O0000389727000001409575102320140

instead of the:

004429876883O0000389627000001409575102320140

you said you wanted.