I came across a previous thread "awk-calculating-simple-correlation-rows" which calculated correlations across rows in awk.
Code:
awk '{
# Mean of fields 2..NF (field 1 is the row label, hence NF-1 values).
a = 0; for (i = 2; i <= NF; ++i) a += $i; a /= NF-1
# L2 norm of the centered row.
b = 0; for (i = 2; i <= NF; ++i) b += ($i - a) ^ 2; b = sqrt(b)
# A constant row has zero norm; skip it to avoid dividing by zero.
if (b <= 0) next
# Store the normalized row; dot products of these are Pearson correlations.
for (i = 2; i <= NF; ++i) x[NR, i] = ($i - a) / b
n[NR] = $1
# Correlate this row against every stored row, itself included.
# FIX: start at 1 (the original started at 2, so row 1 never paired).
for (i = 1; i <= NR; ++i) {
if (!(i in n)) continue
a = 0
for (k = 2; k <= NF; ++k)
a += x[NR, k] * x[i, k]
# FIX: print the element n[i]; the original printed the array "n",
# which is a fatal "attempt to use array in scalar context" in awk.
print n[NR], n[i], a
}
}'
I need something similar which can calculate correlations across columns and produce a similar output
At the moment I have got around this by transposing the file first and then running the existing code.
------ Post updated at 06:10 PM ------
Apologies, I sent the reply before I had finished. The actual file is quite large (1.5 million rows x 50240 columns). If there were a solution without the need to transpose, that would be very useful.
The first row in the desired output (C1 C1 1) is the correlation between column 2 and column 2, which is a correlation of 1 as it is the same data. The second row in the desired output (C1 C2 -0.4) is the correlation between column 2 and column 3, etc.
I've broken my code up into stdev and pearson parts for readability.
$ cat stdev.awk
# Mean of all values accumulated for TITLE.
function stdev_mean(TITLE) {
	return DATA[TITLE,"T"] / DATA[TITLE,"C"];
}
# Sample-size denominator (count minus one, Bessel's correction).
function stdev_count(TITLE) {
	return DATA[TITLE,"C"] - 1;
}
# Pass 1: feed each value once, accumulating total and count for the mean.
function stdev_pass1(TITLE, VAL) {
	DATA[TITLE,"C"]++;
	DATA[TITLE,"T"] += VAL + 0;
}
# Pass 2: feed each value again, accumulating squared deviations from the
# (now valid) mean.  DEV is a local (extra-parameter awk convention).
function stdev_pass2(TITLE, VAL,    DEV) {
	DEV = stdev_mean(TITLE) - VAL + 0;
	DATA[TITLE,"D"] += DEV * DEV;
}
# Sample standard deviation, valid only after both passes have run.
function stdev(TITLE) {
	return sqrt(DATA[TITLE,"D"] / stdev_count(TITLE));
}
$ cat pearson.awk
# Integrate titles and skip first line
# Header line (fires once per pass): record column titles and column count.
FNR == 1 {
	MAX = NF
	for (C = 2; C <= NF; C++) COL[C] = $C
	next
}
# Pass 1 (first copy of the file): accumulate totals/counts for the means.
FNR == NR {
	for (C = 2; C <= NF; C++) stdev_pass1(C, $C)
	next
}
# Pass 2 (second copy): means are final, so accumulate squared deviations
# plus the cross-deviation product for every column pair (upper triangle).
{
	for (C = 2; C <= NF; C++) stdev_pass2(C, $C)
	for (C = 2; C <= NF; C++)
		for (D = C; D <= NF; D++)
			CORR[C,D] += (stdev_mean(C) - ($C+0)) * (stdev_mean(D) - ($D+0))
}
# Emit the upper triangle of the Pearson correlation matrix.
END {
	for (C = 2; C <= MAX; C++)
		for (D = C; D <= MAX; D++)
			print COL[C], COL[D], CORR[C,D] / (stdev_count(C) * stdev(C) * stdev(D))
}
# Not a typo, awk is fed inputfile twice.
# This avoids needing to store the entire massive file in memory.
$ awk -f stdev.awk -f pearson.awk inputfile inputfile
C1 C1 1
C1 C2 -0.4
C1 C3 -0.069843
C1 C4 -0.597614
C1 C5 -0.175412
C1 C6 -0.362738
C2 C2 1
C2 C3 0.488901
C2 C4 0.239046
C2 C5 0.30697
C2 C6 0.362738
C3 C3 1
C3 C4 0.667827
C3 C5 0.581936
C3 C6 0.861381
C4 C4 1
C4 C5 0.576557
C4 C6 0.932143
C5 C5 1
C5 C6 0.731727
C6 C6 1
$
Be sure to use GNU awk, other versions may have a 2048-byte line size limit.