Help sorting and storing data records to calculate N50 in C

Input_file_1

#content_1
A
#content_2
AF
#content_3
AAR
#content_4
ASEI
#content_5
AS
#content_6
ADFSFGS

Rules:

  1. Use a C program to calculate the content length of each "#" record. The results obtained from the above Input_file_1 are 1, 2, 3, 4, 2, 7;
  2. Sort length on reverse order (descending order). 7, 4, 3, 2, 2, 1, 1;
  3. Hope that the program able to store the above record (7, 4, 3, 2, 2, 1, 1) temporary for downstream analysis;
  4. Sum all the total of Input_file_1: 7 + 4 + 3 + 2 + 2 + 1 + 1 = 20;
  5. Divide the total sum of Input_file_1 by two (50%) to get a threshold value: 20/2 = 10;
  6. N50 must be equal to or greater than 50% of the total sum in Input_file_1 (10);
  7. 7+4 = 11 (greater than 10);
    Desired output result after running c program:
4

Many thanks for any advice.

I don't understand step 7. Why is the output 4? I googled N50; it typically means how many of the largest integers need to be added together to reach 50% of the total, so 7+4=11 requires 2 integers and the output would be 2. I guess you want the smallest member of that set instead.

I think this is proper solution, finally. Using your example file:

[mute@geek ~/test]$ ./n50 n50.txt
4
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Capacity of the lines[] table below. */
#define MAXLINES        32768

/* One length per recorded input line; filled and then sorted in place by main(). */
int lines[MAXLINES];
/* Number of entries of lines[] that main() has filled in so far. */
int line_count = 0;

/*
 * qsort() comparator that orders ints from largest to smallest.
 *
 * p1, p2: pointers to the two int elements being compared.
 * Returns a negative value when *p1 must come first (it is larger),
 * a positive value when *p2 must come first, and 0 when they are equal.
 */
int qsort_cmp(const void *p1, const void *p2)
{
        const int a = *(const int *)p1;
        const int b = *(const int *)p2;

        /* Compare instead of subtracting: b - a overflows (undefined
         * behaviour) when the operands are far apart, e.g. INT_MIN/INT_MAX. */
        return (a < b) - (a > b);
}

int main(int argc, char**argv)
{
        FILE *fh;
        char buf[512];
        int i, sum = 0, threshold;

        if (argc != 2)
        {
                printf("Usage: %s <input file>\n", argv[0]);
                return -1;
        }

        if ((fh = fopen(argv[1], "r")) == NULL)
        {
                perror("fopen");
                return -1;
        }

        while (fgets(buf, sizeof(buf), fh))
        {
                int len = strlen(buf);

                /* skip comments */
                if (buf[0] == '#') continue;

                /* strip newline at end */
                buf[--len] = 0;

                /* add to sum. */
                sum += len;

                /* keep record */
                lines[line_count++] = len;
        }

        qsort(lines, line_count, sizeof(int), qsort_cmp);

        threshold = sum / 2;

        for (i = sum = 0; (sum += lines) < threshold; i++)
                ;

        printf("%d\n", lines);

        return 0;
}
1 Like

Hi, friend.
This is one of the threads that explains the N50 calculation well: Calculating an N50 from Velvet output | (R news & tutorials)
The N50 of my example should be 4 instead of 2.
I'm trying with your approaches now with test file.
Hopefully we end up with the same approach :slight_smile:

As per your PM, the content should handle newlines. Also, I added DEBUG statements so you can view what the program is doing...

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* When defined, main() prints a trace of every step (block lengths, sum,
 * threshold, the table before and after sorting) in addition to the result. */
#define DEBUG
/* Capacity of the lines[] table below. */
#define MAXLINES        32768


/* Accumulated content length of each block; lines[0] collects anything that
 * appears before the first '#' line.  Sorted in place by main(). */
int lines[MAXLINES];
/* Index of the block currently being filled, so entries 0..line_count of
 * lines[] are in use. */
int line_count = 0;

/*
 * qsort() comparator producing descending order for an array of int.
 *
 * p1, p2: pointers to the two int elements being compared.
 * Returns < 0 if *p1 is larger (sorts first), > 0 if *p2 is larger,
 * and 0 if both are equal.
 */
int qsort_cmp(const void *p1, const void *p2)
{
        const int lhs = *(const int *)p1;
        const int rhs = *(const int *)p2;

        /* A three-way comparison cannot overflow, unlike rhs - lhs, which
         * is undefined behaviour for operands of large opposite magnitude. */
        return (lhs < rhs) - (lhs > rhs);
}

int main(int argc, char**argv)
{
        FILE *fh;
        char buf[512];
#ifdef  DEBUG
        char contentblock[512];
#endif
        int i, sum = 0, threshold;

        if (argc != 2)
        {
                printf("Usage: %s <input file>\n", argv[0]);
                return -1;
        }

        if ((fh = fopen(argv[1], "r")) == NULL)
        {
                perror("fopen");
                return -1;
        }

#ifdef  DEBUG
        /* if a block is given before first '#' ... */
        strcpy(contentblock, "null");
#endif
        lines[0] = 0;
        while (fgets(buf, sizeof(buf), fh))
        {
                int len = strlen(buf);
                /* strip newline at end */
                buf[--len] = 0;
                /* new content block */
                if (buf[0] == '#') {
#ifdef  DEBUG
                        printf("%s length %d\n", contentblock, lines[line_count]);
                        strcpy(contentblock, buf + 1);
#endif
                        lines[++line_count] = 0;
                        continue;
                }
                /* keep record */
                lines[line_count] += len;
                /* add to sum. */
                sum += len;
        }
        threshold = sum / 2;
#ifdef  DEBUG
        printf("%s length %d\n", contentblock, lines[line_count]);
        printf("sum = %d, threshold = %d\n", sum, threshold);
        printf("Before sort: ");
        for (i = 0; i <= line_count; i++)
                printf("%d%s", lines, (i == line_count) ? "\n" : " + ");
#endif

        qsort(lines, line_count + 1, sizeof(int), qsort_cmp);
#ifdef  DEBUG
        printf("After sort: ");
        for (i = 0; i <= line_count; i++)
                printf("%d%s", lines, (i == line_count) ? "\n" : " + ");
#endif
        for (i = sum = 0; (sum += lines) < threshold; i++)
        {
#ifdef  DEBUG
                printf("%d%s", lines, (sum >= threshold) ? "\n" : " + ");
#endif
        }

        printf("%d\n", lines);

        return 0;
}
1 Like

Many thanks, neutronscott.
Your program work very fast for huge data :slight_smile:
It is amazing.
Do you have any idea how to edit the program to allow it print out only the N50 number instead of whole data analysis detail?
I try to edit it.
But can't work :frowning:
Thanks for your assist.

#undef DEBUG

1 Like