awk to average field if matching string in another

In the awk below I am trying to get the average of the sum of $7 if the string in $4 matches in the line below it. The --- in the desired out is not needed, it is just to illustrate the calculation. The awk executes and produces the current out. I am not sure why the middle line is skipped and the math seems to be off slightly.

All the numbers come from the tab-delimeted file and if string in $4 matches in the line below it then the average of the sum of $7 is printed along with the $5 value of the matching lines. The output is then sorted in natural ordering. Thank you :).

awk -F'\t' '{gene[$4]=$5; sum[$4]+=$7; count[$4]++} 
    END{for(k in sum) printf "%s\t%s\t%.1f\n",  k, gene[k], sum[k]/count[k]}' file_less | sort -k1,1 -V -s > out

current out

chr1:1167485-1167705	B3GALT6	13.0
chr1:1167605-1167825	B3GALT6	11.0

desired out

chr1:1167485-1167705	B3GALT6	14.5   --- (680/47)
chr1:1167573-1167679	B3GALT6	12.4   --- (1323/107)
chr1:1167605-1167825	B3GALT6	11.0   --- (811/74)

file

chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	1	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	2	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	3	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	4	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	5	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	6	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	7	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	8	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	9	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	10	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	11	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	12	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	13	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	14	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	15	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	16	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	17	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	18	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	19	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	20	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	21	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	22	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	23	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	24	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	25	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	26	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	27	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	28	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	29	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	30	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	31	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	32	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	33	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	34	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	35	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	36	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	37	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	38	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	39	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	40	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	41	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	42	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	43	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	44	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	45	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	46	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	47	19
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	88	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	89	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	90	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	91	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	92	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	93	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	94	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	95	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	96	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	97	18
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	98	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	99	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	100	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	101	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	102	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	103	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	104	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	105	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	106	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	107	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	108	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	109	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	110	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	111	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	112	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	113	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	114	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	115	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	116	13
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	117	12
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	118	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	119	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	120	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	121	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	122	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	123	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	124	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	125	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	126	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	127	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	128	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	129	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	130	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	131	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	132	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	133	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	134	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	135	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	136	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	137	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	138	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	139	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	140	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	141	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	142	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	143	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	144	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	145	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	146	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	147	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	148	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	149	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	150	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	151	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	152	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	153	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	154	6
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	155	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	156	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	157	6
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	158	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	159	7
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	160	8
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	161	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	162	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	163	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	164	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	165	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	166	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	167	9
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	168	11
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	169	11
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	170	12
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	171	12
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	172	12
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	173	14
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	174	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	175	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	176	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	177	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	178	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	179	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	180	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	181	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	182	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	183	15
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	184	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	185	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	186	16
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	187	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	188	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	189	17
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	190	19
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	191	19
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	192	19
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	193	19
chr1	1167485	1167705	chr1:1167485-1167705	B3GALT6	194	19
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	1	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	2	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	3	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	4	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	5	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	6	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	7	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	8	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	9	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	10	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	11	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	12	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	13	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	14	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	15	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	16	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	17	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	18	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	19	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	20	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	21	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	22	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	23	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	24	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	25	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	26	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	27	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	28	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	29	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	30	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	31	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	32	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	33	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	34	6
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	35	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	36	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	37	6
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	38	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	39	7
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	40	8
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	41	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	42	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	43	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	44	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	45	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	46	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	47	9
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	48	11
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	49	11
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	50	12
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	51	12
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	52	12
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	53	14
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	54	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	55	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	56	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	57	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	58	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	59	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	60	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	61	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	62	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	63	15
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	64	16
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	65	16
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	66	16
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	67	17
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	68	17
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	69	17
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	70	19
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	71	19
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	72	19
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	73	19
chr1	1167605	1167825	chr1:1167605-1167825	B3GALT6	74	19

I apologize for the long post, but I am not sure why bit line 1 and line 2 seem to be getting grouped together, that is the values in $7 for both the lines seems to be getting summed together ($7=2003/total lines=154)

here's a debug version:

awk '{gene[$4]=$5; sum[$4]+=$7; count[$4]++}
    END{for(k in sum) printf "%s\t%s\t%.1f [%d/%d]\n",  k, gene[k], sum[k]/count[k], sum[k], count[k]}' ccm1.txt | sort -k1,1 -V -s

Produces:

chr1:1167485-1167705    B3GALT6 13.0 [2003/154]
chr1:1167605-1167825    B3GALT6 11.0 [811/74]

Also:

$ grep -c chr1:1167485-1167705 ccm1.txt
154

Looks right

1 Like

Thank you very much :).