Need sql query to string split and normalize data

calredd · May 11, 2017, 7:47am

Hello gurus,
I have data in one of the oracle tables as as below:

Column 1    Column 2
1               NY,NJ,CA
2               US,UK,
3               AS,EU,NA

fyi, Column 2 above has data delimited with a comma as shown.

I need a sql query the produce the below output in two columns as.

Column 1        Column 2
1                   NY
1                   NJ
1                   CA
2                   US
2                   UK
3                   AS
3                   EU
3                   NA

Basically, I need to split data in one field based on a delimiter which is a comma and then normalize the data to get the required output. I have been trying sql using regex but without success. Any inputs are appreciated.
Thanks,
Carl

vgersh99 · May 11, 2017, 8:37am

as a workaround for post-processing:

awk 'FNR==1{print;next}{n=split($2,a,",");for(i=1;i<=n;i++) if(a) print $1,a}' mySQLextactedFile

calredd · May 11, 2017, 8:42am

Thanks but need the solution in sql query. Its easy to implement using sed/awk.

disedorgue · May 11, 2017, 9:35am

Hi,
Example (in red for you) :

with T As
  (select 123 as c1, 'NY,NJ,CA' as c2 from dual
  union
  select 124 as c1, 'NY,PA,' as c2 from dual)
  SELECT DISTINCT C1, regexp_substr(C2,'[^,]+', 1, LEVEL)
  FROM T
  CONNECT BY regexp_substr(C2, '[^,]+', 1, LEVEL) IS NOT NULL
  ORDER BY C1;

Result:

        C1 REGEXP_SUBSTR(C2,'[^,]+'
---------- ------------------------
       123 CA
       123 NJ
       123 NY
       124 NY
       124 PA

Regards.

calredd · May 11, 2017, 12:25pm

disedorgue:

Hi,
Example (in red for you) :

with T As
  (select 123 as c1, 'NY,NJ,CA' as c2 from dual
  union
  select 124 as c1, 'NY,PA,' as c2 from dual)
  SELECT DISTINCT C1, regexp_substr(C2,'[^,]+', 1, LEVEL)
  FROM T
  CONNECT BY regexp_substr(C2, '[^,]+', 1, LEVEL) IS NOT NULL
  ORDER BY C1;

Result:

   C1 REGEXP_SUBSTR(C2,'[^,]+'
---------- ------------------------
   123 CA
   123 NJ
   123 NY
   124 NY
   124 PA

Regards.

works great although runs a little slow, thanks a lot.

durden_tyler · May 12, 2017, 11:41am

SQL>
SQL> --
SQL> select * from t;
  
         X Y
---------- ----------------------------------------
         1 NY,NJ,CA
         2 US,UK
         3 AS,EU,NA
         4 AAA,BBBB,C,DDDDD,EE,F,GGGGGG
         5
         6 XYZ
  
 6 rows selected.
  
SQL>
SQL> -- Using SUBSTR, INSTR functions.
SQL> select x,
  2         case when iter.pos = 1 and length(y)-length(replace(y,','))+1 = 1 then y
  3              when iter.pos = 1 then substr(y,1,instr(y,',',1,iter.pos)-1)
  4              when iter.pos = length(y)-length(replace(y,','))+1 then substr(y,instr(y,',',1,iter.pos-1)+1)
  5              else substr(y, instr(y,',',1,iter.pos-1)+1, instr(y,',',1,iter.pos) - instr(y,',',1,iter.pos-1) - 1)
  6         end as token
  7    from t,
  8         (  select level as pos
  9              from dual
 10           connect by level <= (select max(length(y)-length(replace(y,','))+1) from t)
 11         ) iter
 12   where iter.pos <= nvl(length(y)-length(replace(y,','))+1,1)
 13   order by x, pos
 14  ;
  
         X TOKEN
---------- ----------------------------------------
         1 NY
         1 NJ
         1 CA
         2 US
         2 UK
         3 AS
         3 EU
         3 NA
         4 AAA
         4 BBBB
         4 C
         4 DDDDD
         4 EE
         4 F
         4 GGGGGG
         5
         6 XYZ
  
 17 rows selected.
  
SQL>
SQL> -- Same query in stages.
SQL> with iter(pos) as (
  2      select level as pos
  3        from dual
  4     connect by level <= (select max(length(y) - length(replace(y,',')) + 1) from t)
  5  ),
  6  data(x, y, token_count) as (
  7      select x, y, length(y) - length(replace(y, ',')) + 1 as token_count
  8        from t
  9  ),
 10  combined as (
 11      select d.x, d.y, iter.pos, d.token_count,
 12             case when iter.pos > 1 then instr(y, ',', 1, iter.pos-1)
 13             end as prev_indx,
 14             instr(y, ',', 1, iter.pos) as indx
 15        from data d, iter
 16       where iter.pos <= nvl(d.token_count, 1)
 17  )
 18  select x,
 19         case when pos = 1 and token_count = 1 then y
 20              when pos = 1 then substr(y, 1, indx - 1)
 21              when pos = token_count then substr(y, prev_indx + 1)
 22              else substr(y, prev_indx + 1, indx - prev_indx - 1)
 23         end as token
 24    from combined
 25   order by x, pos
 26  ;
  
         X TOKEN
---------- ----------------------------------------
         1 NY
         1 NJ
         1 CA
         2 US
         2 UK
         3 AS
         3 EU
         3 NA
         4 AAA
         4 BBBB
         4 C
         4 DDDDD
         4 EE
         4 F
         4 GGGGGG
         5
         6 XYZ
  
 17 rows selected.
  
SQL>
SQL>
SQL> -- Another one using regular expressions.
SQL> -- regexp_count is in version 11g Release 1 and higher
SQL> select x,
  2         regexp_substr(y,'[^,]+',1,iter.pos) as token
  3    from t,
  4         (  select level as pos
  5              from dual
  6           connect by level <= (select max(regexp_count(y,',')+1) from t)
  7         ) iter
  8   where iter.pos <= nvl(regexp_count(y,',')+1,1)
  9   order by x, pos
 10  ;
  
         X TOKEN
---------- ----------------------------------------
         1 NY
         1 NJ
         1 CA
         2 US
         2 UK
         3 AS
         3 EU
         3 NA
         4 AAA
         4 BBBB
         4 C
         4 DDDDD
         4 EE
         4 F
         4 GGGGGG
         5
         6 XYZ
  
 17 rows selected.
  
SQL>
SQL>