awk Moving Average

theflamingmoe · August 4, 2017, 2:45am

Hi, I'm using awk to try and get a moving average for the second column of numbers ($2) in the below example broken out by unique identifier in column 1 ($1) :

H1,1.2
H1,2.3
H1,5.5
H1,6.6
H1,8.7
H1,4.1
H1,6.4
H1,7.8
H1,9.6
H1,3.2
H5,50.1
H5,54.2
H5,58.8
H5,60.9
H5,54.3
H5,52.7
H5,53.8
H5,60.1
H5,59.9
H5,59.4
H5,57.1

My expected output is:

H1,1.2,3.0 
H1,2.3,3.9 
H1,5.5,4.9 
H1,6.6,5.4 
H1,8.7,6.3 
H1,4.1,6.7 
H1,6.4,7.3 
H1,7.8,6.2 
H1,9.6,6.8 
H1,3.2,6.9 
H5,50.1,54.4 
H5,54.2,56.0 
H5,58.8,55.7 
H5,60.9,56.2 
H5,54.3,56.1 
H5,52.7,56.4 
H5,53.8,56.2 
H5,60.1,57.2 
H5,59.9,58.1 
H5,59.4,59.1 
H5,57.1,58.8

The window for the moving average is 5, except for the first 2 row numbers and the last two row numbers for each unique identifier where the window will be 3 and 4.

I have tried modifying the below code. The first problem i'm having is setting up an associative array e.g. n[$1]=$2. The second problem is the changing window for the moving average. Any help would be appreciated

gawk -F"," 'BEGIN {N=5} {n[NR]=$2} NR>=N {x=0; delete n[NR-N]; for(i in n) x+=n; print $1,n[NR-(N-1)],x/N}' OFS=, Test.csv

RudiC · August 4, 2017, 6:28am

Not too simple nor straightforward, really messy, but try

awk -F, '
function PRE()  {for (j=CNT-DT1+1; j<=CNT; j++) {SUM = 0
                                                 for (i=j-DT1; i<=CNT; i++) SUM += VAL
                                                 printf "%s,%s,%.1f\n", LAST, VAL[j], SUM/(CNT-j+2*DT1-1)
                                                }
                }

NR == 1         {DT1 = int(TGTN/2)
                }

NR > 1 &&
$1 != LAST      {PRE()
                 CNT = 0
                }

                {VAL[++CNT] = $2
                 LAST       = $1
                 DT2        = CNT - TGTN
                 if (CNT > DT1) {ST = CNT - 2 * DT1 - (DT2<0?DT2:0)
                                 SUM = 0
                                 for (i=ST; i<=CNT; i++) SUM += VAL
                                 printf "%s,%s,%.1f\n", LAST, VAL[CNT-DT1], SUM/(CNT-ST+1)
                                }
                }
END             {PRE()
                }
' OFS=,  TGTN=5 file
H1,1.2,3.0
H1,2.3,3.9
H1,5.5,4.9
H1,6.6,5.4
H1,8.7,6.3
H1,4.1,6.7
H1,6.4,7.3
H1,7.8,6.2
H1,9.6,6.7
H1,3.2,6.9
H5,50.1,54.4
H5,54.2,56.0
H5,58.8,55.7
H5,60.9,56.2
H5,54.3,56.1
H5,52.7,56.4
H5,53.8,56.2
H5,60.1,57.2
H5,59.9,58.1
H5,59.4,59.1
H5,57.1,58.8

There seems to be a rounding discrepancy in line 9.

theflamingmoe · August 4, 2017, 6:06pm

Wow, thanks so much RudiC. I'm still going over your reply trying to digest parts of your code. Much appreciated.

durden_tyler · August 5, 2017, 3:34am

$ 
$ cat data.txt
H1,1.2
H1,2.3
H1,5.5
H1,6.6
H1,8.7
H1,4.1
H1,6.4
H1,7.8
H1,9.6
H1,3.2
H5,50.1
H5,54.2
H5,58.8
H5,60.9
H5,54.3
H5,52.7
H5,53.8
H5,60.1
H5,59.9
H5,59.4
H5,57.1
$ 
$ awk -F, '
    BEGIN {window=5; half=1+int(window/2); iter=1; cnt=1; n=1; off=1; ptr=1}
    { if ($1 != prev) {
          while (cnt > half) {
              cnt--
              split(line[off], delta, ",")
              sum -= delta[2]
              off++
              avg = sum / cnt
              printf("%s,%.1f\n", line[ptr], avg)
              ptr++
          }
          delete line
          sum = 0; iter = 1; cnt = 1;
          off = 1; ptr = 1; n = 1
      }
      line[n] = $0
      n++
      sum += $2
      if (iter >= half) {
          if (iter == half) {
              cnt = half
          } else {
              cnt++
              if (cnt > window) {
                  cnt = window
                  split(line[off], delta, ",")
                  sum -= delta[2]
                  off++
              }
          }
          avg = sum / cnt
          printf("%s,%.1f\n", line[ptr], avg)
          ptr++
      }
      prev = $1
      iter++
    }
    END {
        while (cnt > half) {
            cnt--
            split(line[off], delta, ",")
            sum -= delta[2]
            off++
            avg = sum / cnt
            printf("%s,%.1f\n", line[ptr], avg)
            ptr++
        }
    }
' data.txt
H1,1.2,3.0
H1,2.3,3.9
H1,5.5,4.9
H1,6.6,5.4
H1,8.7,6.3
H1,4.1,6.7
H1,6.4,7.3
H1,7.8,6.2
H1,9.6,6.8
H1,3.2,6.9
H5,50.1,54.4
H5,54.2,56.0
H5,58.8,55.7
H5,60.9,56.2
H5,54.3,56.1
H5,52.7,56.4
H5,53.8,56.2
H5,60.1,57.2
H5,59.9,58.1
H5,59.4,59.1
H5,57.1,58.8
$ 
$

---------- Post updated at 03:34 AM ---------- Previous update was at 02:46 AM ----------

Calculating moving average appears to be pretty convoluted in imperative languages; however a declarative language like (modern) SQL provides syntactic sugar for solving this problem.

By any chance if you are extracting your data from a database, then you may want to check and leverage the power of SQL (specifically, "window" functions) to calculate moving average.
Here's an example with PostgreSQL 9.6, although window functions are available in Oracle, DB2, SQL Server, MySQL and MariaDB as well.

mydb=> 
mydb=> SELECT VERSION();
                                              version                                              
---------------------------------------------------------------------------------------------------
 PostgreSQL 9.6.3 on x86_64-pc-linux-gnu, compiled by gcc (Debian 6.3.0-18) 6.3.0 20170516, 64-bit
(1 row)

mydb=> 
mydb=> SELECT * FROM data;
 id | val  
----+------
 H1 |  1.2
 H1 |  2.3
 H1 |  5.5
 H1 |  6.6
 H1 |  8.7
 H1 |  4.1
 H1 |  6.4
 H1 |  7.8
 H1 |  9.6
 H1 |  3.2
 H5 | 50.1
 H5 | 54.2
 H5 | 58.8
 H5 | 60.9
 H5 | 54.3
 H5 | 52.7
 H5 | 53.8
 H5 | 60.1
 H5 | 59.9
 H5 | 59.4
 H5 | 57.1
(21 rows)

mydb=> 
mydb=> SELECT id,
mydb->        val,
mydb->        ROUND(
mydb(>            AVG(val) OVER (PARTITION BY id
mydb(>                           ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
mydb(>                          )::NUMERIC,
mydb(>            1
mydb(>        ) AS moving_avg
mydb->   FROM data
mydb-> ;
 id | val  | moving_avg 
----+------+------------
 H1 |  1.2 |        3.0
 H1 |  2.3 |        3.9
 H1 |  5.5 |        4.9
 H1 |  6.6 |        5.4
 H1 |  8.7 |        6.3
 H1 |  4.1 |        6.7
 H1 |  6.4 |        7.3
 H1 |  7.8 |        6.2
 H1 |  9.6 |        6.8
 H1 |  3.2 |        6.9
 H5 | 50.1 |       54.4
 H5 | 54.2 |       56.0
 H5 | 58.8 |       55.7
 H5 | 60.9 |       56.2
 H5 | 54.3 |       56.1
 H5 | 52.7 |       56.4
 H5 | 53.8 |       56.2
 H5 | 60.1 |       57.2
 H5 | 59.9 |       58.1
 H5 | 59.4 |       59.1
 H5 | 57.1 |       58.8
(21 rows)

mydb=> 
mydb=>

theflamingmoe · August 5, 2017, 6:51am

Thanks Durden_Tyler. I am extracting the table from a database and do have access to SQL Server. I never tried using SQL Server but from what you tell me it will make my life easier and maybe i should give it a go. Thanks again for the code and extra tips.
Theflamingmoe.

durden_tyler · August 7, 2017, 1:33pm

If you have the SQL Server client installed in your computer, then you will have the GUI tool "Microsoft SQL Server Management Studio", where you can write and execute SQL queries.
Window or analytic functions were introduced in SQL Server 2012.
You could determine the SQL Server version by issuing the following query:

select @@version
go

For SQL Server 2012 and higher, either of the following queries should work - assuming the columns "id", and "val" in the table "data" have the data you presented in your 1st post.

-- Query1 : Tested on MS SQL Server 2014 at sqlfiddle.com
select x.id,
       x.val,
       format(avg(x.val) over (partition by x.id
                               order by x.rn
                               rows between 2 preceding and 2 following
                              ),
              'N1'
             ) as mov_avg
  from (
            select id, val,
                   row_number() over (partition by 1 order by (select 1) asc) as rn
              from data
       ) x
go
  
-- Query2 : Tested on MS SQL Server 2014 at sqlfiddle.com
with x (id, val, rn) as (
    select id, val,
           row_number() over (partition by 1 order by (select 1) asc) as rn
      from data
)
select x.id,
       x.val,
       format(avg(x.val) over (partition by x.id
                               order by x.rn
                               rows between 2 preceding and 2 following
                              ),
              'N1'
             ) as mov_avg
  from x
go

For versions lower than 2012 that do not have analytic functions, you could join the "data" table by itself and determine 5-day moving averages:

with x (id, val, rn) as (
    select id, val,
           row_number() over (partition by id order by (select 1) asc) as rn
      from data
)
select x1.id as id,
       x1.val as val,
       convert(decimal(6,1), sum(x2.val)/count(x2.id)) as moving_avg
  from x as x1
       join x as x2
       on (x2.id = x1.id and
           x1.rn >= x2.rn - 2 and
           x1.rn <= x2.rn + 2
          )
group by x1.id, x1.rn, x1.val
go

Here's a log of my execution of the final query in my very old MS SQL Server 2008.
I have used the command-line utility "sqlcmd" instead of SQL Server Management Studio.

C:\data>
C:\data>sqlcmd -S <server_name> -U <login_id>
Password: <entered_password_here>
1>
2> select @@version
3> go
 ------------------------------------------------------------------------------------------
Microsoft SQL Server 2008 R2 (SP3-OD) (KB3144114) - 10.50.6542.0 (X64)
        Feb 22 2016 18:07:23
        Copyright (c) Microsoft Corporation
        Enterprise Edition (64-bit) on Windows NT 6.1 <X64> (Build 7601: Service Pack 1)
 
(1 rows affected)
1>
2> --
3> with data (id, val) as (
4>     select 'H1', 1.2  union all
5>     select 'H1', 2.3  union all
6>     select 'H1', 5.5  union all
7>     select 'H1', 6.6  union all
8>     select 'H1', 8.7  union all
9>     select 'H1', 4.1  union all
10>     select 'H1', 6.4  union all
11>     select 'H1', 7.8  union all
12>     select 'H1', 9.6  union all
13>     select 'H1', 3.2  union all
14>     select 'H5', 50.1 union all
15>     select 'H5', 54.2 union all
16>     select 'H5', 58.8 union all
17>     select 'H5', 60.9 union all
18>     select 'H5', 54.3 union all
19>     select 'H5', 52.7 union all
20>     select 'H5', 53.8 union all
21>     select 'H5', 60.1 union all
22>     select 'H5', 59.9 union all
23>     select 'H5', 59.4 union all
24>     select 'H5', 57.1
25> ),
26> x (id, val, rn) as (
27>     select id, val,
28>            row_number() over (partition by id order by (select 1) asc) as rn
29>       from data
30> )
31> select x1.id as id,
32>        x1.val as val,
33>        convert(decimal(6,1), sum(x2.val)/count(x2.id)) as moving_avg
34>   from x as x1
35>        join x as x2
36>        on (x2.id = x1.id and
37>            x1.rn >= x2.rn - 2 and
38>            x1.rn <= x2.rn + 2
39>           )
40> group by x1.id, x1.rn, x1.val
41> go
id val   moving_avg
-- ----- ----------
H1   1.2        3.0
H1   2.3        3.9
H1   5.5        4.9
H1   6.6        5.4
H1   8.7        6.3
H1   4.1        6.7
H1   6.4        7.3
H1   7.8        6.2
H1   9.6        6.8
H1   3.2        6.9
H5  50.1       54.4
H5  54.2       56.0
H5  58.8       55.7
H5  60.9       56.2
H5  54.3       56.1
H5  52.7       56.4
H5  53.8       56.2
H5  60.1       57.2
H5  59.9       58.1
H5  59.4       59.1
H5  57.1       58.8
  
(21 rows affected)
1>
2> exit
  
C:\data>
C:\data>

theflamingmoe · August 7, 2017, 5:46pm

Thanks again Durden_Tyler, your post was very helpfull.
Regards,
Theflamingmoe

rdrtx1 · August 7, 2017, 8:10pm

awk -F, '
{v[$1]=$1; b[$1, ++c[$1]]=($2+=0)}
END{
   for (k in v) {
      for (i=1; i<=c[k]; i++) {
         s=0; ac=0;
         for (j=-2; j<=+2; j++) if (length(b[k, j + i])) {s+=b[k, j + i]; ac++}
         printf("%s,%s,%0.1f\n", k, b[k, i], (s/ac));
      }
   }
}
' infile

theflamingmoe · August 8, 2017, 6:49am

Overwhelmed by the response to my question. I was worried i wouldn't get any replys. Thanks drtx1.