Hello, I am trying to reverse complement one string and reverse another (NO complement!), both with pointer. My code compiled without error, but did not do the job I wanted.
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>
#include "kseq.h"
// STEP 1: declare the type of file handler and the read() function
KSEQ_INIT(gzFile, gzread)
char *rev_comp(char *seq);
char *strrev(char *str);
int main(int argc, char *argv[])
{
gzFile fp;
kseq_t *seq;
int l;
char *entry_rc;
char *qual_rv;
if (argc != 2) {
fprintf(stderr, "Usage: %s <infile.fastq> ", argv[0]);
return 1;
}
fp = gzopen(argv[1], "r"); // STEP 2: open the file handler
seq = kseq_init(fp); // STEP 3: initialize seq
while ((l = kseq_read(seq)) >= 0) { // STEP 4: read sequence
if (strlen(seq->seq.s) == strlen(seq->qual.s))
{
// entry_rc = malloc(sizeof(char)*strlen(seq->seq.s)); //Line 39: This line is not needed! But Why????
qual_rv = malloc(sizeof(char)*strlen(seq->qual.s)); //Line 40: Not sure this is the right way either.
entry_rc = rev_comp(seq->seq.s); //Line 41: Seems fine to have the DNA rev_complemented!
qual_rv = strrev(seq->qual.s); //Line 42: This line did not work!
printf(">%s\n%s\n%s\n%s\n%s\n", seq->name.s, seq->seq.s, seq->qual.s, entry_rc, qual_rv );
}
}
kseq_destroy(seq); // STEP 5: destroy seq
gzclose(fp); // STEP 6: close the file handler
return 0;
}
/*************************** Functions ***************************************************/
/*1. reverse and complement the sequence*/
char *rev_comp(char *seq) {
if( seq == NULL || !(*seq) )
return NULL;
int i, j = strlen(seq)-1;
char *seq_rc;
seq_rc = malloc(sizeof(char) * (j+1));
for(i=0; j>=0; i++, j--) {
*(seq_rc+i) = *(seq+j);
if (*(seq_rc+i) == 'A') *(seq_rc+i) ='T';
else if (*(seq_rc+i) == 'C') *(seq_rc+i) ='G';
else if (*(seq_rc+i) == 'G') *(seq_rc+i) ='C';
else if (*(seq_rc+i) == 'T') *(seq_rc+i) ='A';
else
*(seq_rc+i) ='N';
}
return seq_rc;
}
/*2. reverse the quality string */
char *strrev( char *s)
{
char c;
char* s0 = s - 1;
char* s1 = s;
/* Find the end of the string */
while (*s1) ++s1;
/* Reverse it */
while (s1-- > ++s0)
{
c = *s0;
*s0 = *s1;
*s1 = c;
}
return s;
}
The problem seems to be with Line 42.
I believe strrev() function is fine as it was tested to be working well with another program. It may be related to the fact of seq->qual.s, which is a string pointer like seq->seq.s, which function rev_comp() worked well with. From my previous post, I seemed to understand the difference between string pointer and string array, so I tried use malloc to allocate memory for both of them, especially here I want reverse the string seq->qual.s in place with strrev() function.
My questions are:
1) What is wrong with Line 42 and Line 40?
2) Why Line 39 is not needed (tested!) and Line 41 worked well with string pointer as parameter?
infile:
@HWI-EAS121
CGTTACGAGATCGGAAGAGCGGTTCAGCAG
+HWI-EAS121
aaaaa`b_aa`aa`YaX]aZ`aZM^Z]YRa
@HWI-EAS122
GGGTGGGCATTTCCACTCGCAGTATGGGTT
+HWI-EAS122
a``^\__`_```^a``a`^a_^__]a_]\]
@HWI-EAS123
CGTTTATGTTTTTGAATATGTCTTATCTTA
+HWI-EAS123
abaa`^aaaaabbbaababbbbbb`bbbb_
I have attached the kseq.h header file which is needed to handle zipped or unzipped sequence files. What I expected to have 2nd, 6th, 10th row of string reverse complemented (DNA), and the 4th, 8th, 12th row of strings reversed of the infile, but the program did not do the job. Here is the output:
output:
>HWI-EAS121
CGTTACGAGATCGGAAGAGCGGTTCAGCAG
aRY]Z^MZa`Za]XaY`aa`aa_b`aaaaa
CTGCTGAACCGCTCTTCCGATCTCGTAACG
aRY]Z^MZa`Za]XaY`aa`aa_b`aaaaa
>HWI-EAS122
GGGTGGGCATTTCCACTCGCAGTATGGGTT
]\]_a]__^_a^`a``a^```_`__\^``a
AACCCATACTGCGAGTGGAAATGCCCACCC
]\]_a]__^_a^`a``a^```_`__\^``a
>HWI-EAS123
CGTTTATGTTTTTGAATATGTCTTATCTTA
_bbbb`bbbbbbabaabbbaaaaa^`aaba
TAAGATAAGACATATTCAAAAACATAAACG
_bbbb`bbbbbbabaabbbaaaaa^`aaba
I know this post is a little too long, but the idea of code is to call two functions to manipulate the string within the file. Could not figure out the bug by myself while trying to catch string pointer. Thanks a lot in advance!