I have written a code in C where the objective is to search for strings.
There are two files:
- Database file which has more than one billion entries. This file is read in argv[1] in the C code below. The format of the file is like this:
a.txt apple
b.txt candle
c.txt glue
- There is another file which has strings each in a newline. Strings from this file have to be searched in the database file. This file is read using argv[2] in the C code below. The format of this file is this:
apple
candle
glue
computer
database
The objective is to read each string present in the second file and search for that string in the database file.
This is what I have tried:
//This program will read the bigram, trigram and quadgram file generated from the Wikipedia and search for the entities from it.
//replace space with - before running this code in both files.
#define _GNU_SOURCE
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<search.h>
int main ( int argc , char **argv )
{
FILE *wikipedia_ngram = NULL; //This is the file on which searching has to be done. We will call this the "database file".
FILE *entities = NULL; //This file contains the list of strings to be searched.
char *line = NULL;
char *file_name = NULL;
char *word = NULL;
size_t len = 0;
ssize_t read;
//Defining the hash variables
ENTRY e, *ep;
unsigned long int lines_ngram = 0; //number of lines in the n-gram file
unsigned long iterator = 0;
wikipedia_ngram = fopen ( argv [ 1 ] , "r" );
if ( wikipedia_ngram == NULL )
{
fprintf ( stderr , "Wikipedia n-gram file open error\n" );
return ( EXIT_FAILURE );
}
entities = fopen ( argv [ 2 ] , "r" );
if ( entities == NULL )
{
fprintf ( stderr , "Entities file open error\n" );
return ( EXIT_FAILURE );
}
file_name = ( char * ) malloc ( 5000 * sizeof ( char ) );
if ( file_name == NULL )
{
fprintf ( stderr , "malloc() memory allocation failure in file_name\n" );
return ( EXIT_FAILURE );
}
word = ( char * ) malloc ( 1000 * sizeof ( char ) );
if ( word == NULL )
{
fprintf ( stderr , "malloc() memory allocation failure in word\n" );
return ( EXIT_FAILURE );
}
while ( ( read = getline ( &line , &len , wikipedia_ngram ) ) != -1 )
{
lines_ngram ++; //finding the number of lines in the database file
}
rewind ( wikipedia_ngram );
//got the number of lines above
//create the hash table now.
hcreate ( lines_ngram ); //the code below is an adaption of an example in the hsearch() man page on LInux system
for ( iterator = 0; iterator < lines_ngram; iterator++ )
{
fscanf ( wikipedia_ngram , "%s %s\n" , file_name , word ); //read data line by line from the database file
e.key = word;
e.data = (char *) file_name;
ep = hsearch ( e , ENTER ); //create a hash table
/* there should be no failures */
if (ep == NULL)
{
fprintf(stderr, "Entry failed\n");
exit ( EXIT_FAILURE );
}
}
memset ( word , 0 , 1000 );
//find the entities in the hash table.
while ( !feof ( entities ) )
{
fscanf ( entities , "%s\n" , word ); //read the strings to be searched line by line
e.key = word;
ep = hsearch (e, FIND);
if ( ep == NULL )
{
fprintf ( stderr , "ep search error\n" );
exit ( EXIT_FAILURE );
}
printf ("%s %s\n" , ep->key, ( char * ) ( ep->data ) );
}
if ( line )
{
free ( line );
}
fclose ( wikipedia_ngram );
fclose ( entities );
free ( file_name );
free ( word );
hdestroy();
return ( EXIT_SUCCESS );
}
The code above compiles on a Linux system using gcc (gcc version 4.8.2 (GCC)), but the output is this:
apple c.txt
candle c.txt
glue c.txt
ep search error
I am not able to figure out where the problem lies. I even used GDB to debug the code, but I could not locate the problem.