+ added missing free for line array
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 16 May 2008 07:58:21 +0000 (07:58 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 16 May 2008 07:58:21 +0000 (07:58 +0000)
+ added check to ensure consistency with python parser
+ parsing is much faster now

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9044 e1793c9e-67f9-0310-80fc-b846ff1f7b36

cparser/qparser.c
cparser/read.h
cparser/test.py

index f8df208..47fcdb3 100644 (file)
@@ -14,18 +14,22 @@ unsigned long *id_map;
 
 int map_idx;
 
+const int offset = 1000000000000;
+
 Read **read_array;
 
 int num_reads;
 
+const char* pos_strand = "+";
+const char* neg_strand = "-";
+
 
 // the line format is defined as follows
 // id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
 const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
 
 int create_read_from_line(Read* newRead, const char* current_line) {
-   int buffer_size= 64;                                                                                                                                                                                                        
-   printf("current line is %s\n",current_line);
+   //printf("current line is %s\n",current_line);
 
    int entries_found = sscanf(current_line,line_format,&(newRead->id),
    &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
@@ -36,18 +40,13 @@ int create_read_from_line(Read* newRead, const char* current_line) {
       return -1;
    }
 
+   // make sequence lowercase but don't destroy brackets
    Py_ssize_t idx;
    for(idx=0;idx<strlen(newRead->seq);idx++) {
       if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
          newRead->seq[idx] = newRead->seq[idx]+32;
    }
 
-   for(idx=0;idx<newRead->size;idx++) {
-      newRead->prb[idx]       -= 50;
-      newRead->cal_prb[idx]   -= 64;
-      newRead->chastity[idx]  += 10;
-   }
-
    if ( newRead->strand == 'D' )
       newRead->strand = '+';
 
@@ -75,6 +74,13 @@ static int set_item_from_line(const char* current_line) {
    if(status != 0)
       return status;
 
+   //print_read(current_read);
+
+   //int read_idx = current_read->id - offset;
+   //if (read_idx > num_reads)
+   //   printf("offset error\n");
+   //read_array[read_idx] = current_read;
+   
    read_array[num_reads] = current_read;
    id_map[num_reads] = current_read->id;
    num_reads++;
@@ -93,7 +99,13 @@ static void assign_read(PyObject *read_dict, int index) {
    status = PyDict_SetItem(read_dict, PyString_FromString("id"),         PyInt_FromLong(current_read->id) );
    status = PyDict_SetItem(read_dict, PyString_FromString("chr"),        PyInt_FromLong(current_read->chr) );
    status = PyDict_SetItem(read_dict, PyString_FromString("seq"),        PyString_FromString(current_read->seq) );
-   status = PyDict_SetItem(read_dict, PyString_FromString("strand"),     PyString_FromString(current_read->strand) );
+   
+   if ( current_read->strand == '+' )
+      status = PyDict_SetItem(read_dict, PyString_FromString("strand"),     PyString_FromString(pos_strand) );
+
+   if ( current_read->strand == '-' )
+      status = PyDict_SetItem(read_dict, PyString_FromString("strand"),     PyString_FromString(neg_strand) );
+
    status = PyDict_SetItem(read_dict, PyString_FromString("splitpos"),   PyInt_FromLong(current_read->splitpos) );
    status = PyDict_SetItem(read_dict, PyString_FromString("read_size"),       PyInt_FromLong(current_read->size) );
    status = PyDict_SetItem(read_dict, PyString_FromString("true_cut"),       PyInt_FromLong(current_read->true_cut) );
@@ -104,9 +116,9 @@ static void assign_read(PyObject *read_dict, int index) {
 
    size_t idx;
    for(idx=0;idx<current_read->size;idx++) {
-      status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]) );
-      status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]) );
-      status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]) );
+      status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
+      status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
+      status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
    }
 
    status = PyDict_SetItem(read_dict, PyString_FromString("prb"),        prb_list );
@@ -135,6 +147,9 @@ static PyObject * Py_fetch_read(PyObject *obj, PyObject *args) {
   
    PyObject* read_dict = PyDict_New();
 
+   //size_t idx = read_id - offset;
+   //assign_read(read_dict,idx);
+
    size_t idx;
    for(idx=0;idx<num_reads;idx++) {
       current_read_id = id_map[idx];
@@ -178,14 +193,14 @@ static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
    }
 
    off_t reads_filesize = reads_stat.st_size;                                                                                                                                                                                  
-   printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+   //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
 
    // ATTENTION this is an overestimator of the reads in the file
    // it is NOT the exact number
-   int numReads = reads_filesize / 200.0;
+   int num_init_reads = reads_filesize / 200.0;
 
-   read_array = malloc(sizeof(Read*)*numReads);
-   id_map = malloc(sizeof(unsigned long)*numReads);
+   read_array = malloc(sizeof(Read*)*num_init_reads);
+   id_map = malloc(sizeof(unsigned long)*num_init_reads);
 
    //printf("Found %d reads.",numReads);
 
@@ -242,7 +257,9 @@ static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
    if(status != 0)
       perror("munmap");
 
-   return PyInt_FromLong(0);
+   free(current_line);
+
+   return PyInt_FromLong(num_reads);
 }
 
 
index 318364e..881e8be 100644 (file)
@@ -36,7 +36,6 @@ void init_read(Read *r) {
    r->exon_start  = 0;
    r->p_stop      = 0;
    r->true_cut    = 0;
-   r->gene_id      = 0;
 
    r->seq      = malloc(sizeof(char)*buffer_size);
    r->prb      = malloc(sizeof(char)*buffer_size);
@@ -100,5 +99,16 @@ void create_read(Read* newRead, unsigned long id, int chr, char strand, char* se
 }
 
 
+void print_read(Read* r) {
+
+   const char* read_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
+
+   printf("read has entries:\n");
+   printf(read_format,r->id,r->chr,r->strand,r->seq,r->splitpos,
+   r->size,r->prb,r->cal_prb,r->chastity,r->gene_id,
+   r->p_start,r->exon_stop,r->exon_start,r->p_stop,r->true_cut);
+}
+
+
 
 #endif // __READ_H__
index 81c3cb1..6c75662 100644 (file)
@@ -12,9 +12,50 @@ def cpu():
    resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
 
 
+def compare_entries(elem_1,elem_2):
+   all_eq = True
+   
+   att = "chr"
+   eq = elem_1[att] == elem_2[att]
+   att = "seq"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "strand"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "splitpos"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "read_size"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "true_cut"
+   eq = eq and elem_1[att] == elem_2[att]
+
+
+   att = "prb"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "cal_prb"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "chastity"
+   eq = eq and elem_1[att] == elem_2[att]
+
+
+   att = "gene_id"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "p_start"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "exon_stop"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "exon_start"
+   eq = eq and elem_1[att] == elem_2[att]
+   att = "p_stop"
+   eq = eq and elem_1[att] == elem_2[att]
+
+   all_eq = all_eq and eq
+
+   return all_eq
+
+
 def test_module():
-   filename = 'allReads.full_20'
-   #filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full_10k'
+   #filename = 'allReads.full_20'
+   filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
 
    start = cpu()
    num_reads = qparser.parse_reads(filename)
@@ -22,8 +63,17 @@ def test_module():
 
    print 'found %d num reads' % num_reads
 
-   current_read = qparser.fetch_read(1000000000002)
-   print current_read
+   #start = cpu()
+   #result_2 = parse_filtered_reads(filename)
+   #stop = cpu()
+
+   #for current_key in result_2.keys():
+   #   current_read_1 = qparser.fetch_read(current_key)
+
+   #   current_read_2 = result_2[current_key]
+
+   #   print compare_entries(current_read_1,current_read_2)
+   #   #pdb.set_trace()
 
 def _test_module():
    filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full_10k'
@@ -45,46 +95,6 @@ def _test_module():
 
    assert keys_1 == keys_2
 
-   all_eq = True
-   
-   for key in keys_1:
-      elem_1 = result_1[key]
-      elem_2 = result_2[key]
-
-      att = "chr"
-      eq = elem_1[map[att]] == elem_2[att]
-      att = "seq"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "strand"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "splitpos"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "read_size"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "true_cut"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-
-
-      att = "prb"
-      #eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "cal_prb"
-      #eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "chastity"
-      #eq = eq and elem_1[map[att]] == elem_2[att]
-
-
-      att = "gene_id"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "p_start"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "exon_stop"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "exon_start"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-      att = "p_stop"
-      eq = eq and elem_1[map[att]] == elem_2[att]
-
-      all_eq = all_eq and eq
 
    print all_eq