Try this awk program, which doesn't use the split function:
# For every book in kevin1.dat, print "book:other1,other2,..." listing the
# other books that share at least one list with it, ordered by the number of
# shared lists (descending) and then by book name.
#   kevin2.dat (read first):  list:book,book,...
#   kevin1.dat (read second): book:list,list,...
# Fields are split on both ":" and ",".
awk -v Q="'" -F'[:,]' '
BEGIN {
    # Q holds a single quote so the inner awk program can be quoted inside
    # this single-quoted script.  The pipeline sorts "book other count"
    # records by count (numeric, descending) then by other-book name, and
    # joins them into a single "book:other,other,..." line.
    cmd = "sort -k3,3nr -k2,2 | awk " Q "{ out=out (NR==1 ? $1 \":\" : \",\") $2 } END { print out }" Q;
}

# First file (list definitions): remember the members of each list.
NR==FNR {
    list = $1;
    all = "";
    for (i=2; i<=NF; i++) {
        all = all SUBSEP $i;
        books[list, i-1] = $i;
    }
    # SUBSEP-delimited member string, used for exact membership tests.
    books[list, "all"  ] = all SUBSEP;
    books[list, "count"] = NF-1;
    next;
}

# Second file (one book per record): count, for every other book, how many
# of the lists named on this record it shares with the current book.
{
    book = $1;
    delete bookCount;
    for (i=2; i<=NF; i++) {
        list = $i;
        # Only use lists that really contain this book.  index() does a
        # literal match, so book names holding regex metacharacters are safe.
        if (index(books[list, "all"], SUBSEP book SUBSEP)) {
            for (ib=1; ib<=books[list, "count"]; ib++) {
                bookCount[books[list, ib]]++;
            }
        }
    }
    for (b in bookCount) {
        if (b != book) {
            # Print the count for b; the array name alone is not a scalar.
            print book, b, bookCount[b] | cmd;
        }
    }
    # Close the pipe so the sorted line for this book is emitted now and a
    # fresh pipeline is started for the next book.
    close(cmd);
}
' kevin2.dat kevin1.dat
Input file 1 (kevin1.dat) :
A:list1,list2,list3,list4
B:list1,list2,list3,list4
C:list1,list2,list6
D:list3
F:list2,list4,list5
G:list7
H:list2,list5
Input file 2 (kevin2.dat) :
list1:A,B,C
list2:A,B,C,F,H
list3:A,B,D
list4:A,B,F
list5:H,F
list6:C
list7:G
Output:
$ time ./kevin.sh
A:B,C,F,D,H
B:A,C,F,D,H
C:A,B,F,H
D:A,B
F:A,B,H,C
H:F,A,B,C
real 0m1.142s
user 0m0.590s
sys 0m0.580s
$
The problem with that script is that it runs a separate sort command for every book.
The following solution uses only one sort command:
# Same result as the per-book version, but with a single sort for all books:
#   stage 1 (awk)  emits one "book other count" record per related pair,
#   stage 2 (sort) groups by book, then count (descending), then other name,
#   stage 3 (awk)  joins each group into "book:other1,other2,...".
#   kevin2.dat (read first):  list:book,book,...
#   kevin1.dat (read second): book:list,list,...
awk -F'[:,]' '
# First file (list definitions): remember the members of each list.
NR==FNR {
    list = $1;
    all = "";
    for (i=2; i<=NF; i++) {
        all = all SUBSEP $i;
        books[list, i-1] = $i;
    }
    # SUBSEP-delimited member string, used for exact membership tests.
    books[list, "all"  ] = all SUBSEP;
    books[list, "count"] = NF-1;
    next;
}

# Second file (one book per record): count, for every other book, how many
# of the lists named on this record it shares with the current book.
{
    book = $1;
    delete bookCount;
    for (i=2; i<=NF; i++) {
        list = $i;
        # Only use lists that really contain this book.  index() does a
        # literal match, so book names holding regex metacharacters are safe.
        if (index(books[list, "all"], SUBSEP book SUBSEP)) {
            for (ib=1; ib<=books[list, "count"]; ib++) {
                bookCount[books[list, ib]]++;
            }
        }
    }
    for (b in bookCount) {
        if (b != book) {
            # Print the count for b; the array name alone is not a scalar.
            print book, b, bookCount[b];
        }
    }
}
' kevin2.dat kevin1.dat |
sort -k1,1 -k3,3nr -k2,2 |
awk '
# Input is grouped by book ($1); $2 is a related book, already in final order.
{
    book = $1;
    if (book == prev) {
        out = out "," $2;
    } else {
        # Flush the previous group.  Test NR rather than out so that a group
        # whose only member is named "0" is not silently dropped.
        if (NR > 1) print prev ":" out;
        out = $2;
        prev = book;
    }
}
# Flush the last group, if any input was read at all.
END { if (NR > 0) print prev ":" out; }
'
With the same input files, the output is the same, but the timings are better:
$ time ./kevin2.sh
A:B,C,F,D,H
B:A,C,F,D,H
C:A,B,F,H
D:A,B
F:A,B,H,C
H:F,A,B,C
real 0m0.419s
user 0m0.152s
sys 0m0.169s
$
Jean-Pierre.