+ added small program for intron-position comparison
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 24 Jun 2008 13:01:14 +0000 (13:01 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 24 Jun 2008 13:01:14 +0000 (13:01 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9736 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/run_specific_scripts/transcriptome_analysis/compare_predictions/.compare.c.swp [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py

diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/.compare.c.swp b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/.compare.c.swp
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile
new file mode 100644 (file)
index 0000000..f69b9c9
--- /dev/null
@@ -0,0 +1,14 @@
+CC=gcc
+CFLAGS=-ggdb -Wall -Wshadow -lm #-pedantic
+
+#SRCS= parser.c\
+#OBJS = $(SRCS:%.cpp=%.o)
+
+all: $(OBJS)
+       @ $(CC) $(CFLAGS) -o compare compare.c
+
+#@ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
+
+clean:
+       @ rm -rf *.o
+
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
new file mode 100644 (file)
index 0000000..126f287
--- /dev/null
@@ -0,0 +1,204 @@
+// use macro for GNU getline
+#define _GNU_SOURCE 
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+/*
+ * This program compares two lists of intron positions stemming for example from
+ * a gff file in order to evaluate predictions made by QPalma.
+ *
+ * It is assumed that the first file contains the ground truth and the second
+ * file the predicted positions.
+ *
+ */
+
+int* gt_intron_starts;
+int* gt_intron_stops;
+int gt_size;
+
+int* pred_intron_starts;
+int* pred_intron_stops;
+int pred_size;
+
+int matching_introns;
+
+static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
+
+
+/* 
+ * This function compares the intron boundaries of the two files.
+ *
+ */
+
+void compare_introns() {
+
+   matching_introns = 0;
+   
+   int gt_idx = 0;
+   int pred_idx = 0;
+   int gt_start,gt_stop,pred_start,pred_stop;
+
+   printf("sizes are %d/%d\n",gt_size,pred_size);
+
+   while(1) {
+      if (gt_idx == gt_size || pred_idx == pred_size)
+         break;
+
+      gt_start = gt_intron_starts[gt_idx];
+      gt_stop = gt_intron_stops[gt_idx];
+
+      pred_start = pred_intron_starts[pred_idx];
+      pred_stop = pred_intron_stops[pred_idx];
+
+      //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
+      if (gt_start == pred_start && gt_stop == pred_stop) {
+         matching_introns++;
+         gt_idx++;
+         pred_idx++;
+         continue;
+      }
+
+      // check for all possible overlappings / positions
+      if ( gt_stop <= pred_start ) {
+         gt_idx++;
+         continue;
+      }
+
+      if ( pred_stop <= gt_start ) {
+         pred_idx++;
+         continue;
+      }
+
+      if (gt_stop <= pred_stop && gt_stop >= pred_start) {
+         gt_idx++;
+         continue;
+      }
+
+      if (pred_stop <= gt_stop && pred_stop >= gt_start) {
+         pred_idx++;
+         continue;
+      }
+   }
+}
+
+
+/*
+ *
+ *
+ *
+ */
+
+void load_introns(char* fn, int** starts, int** stops, int* size) {
+
+   FILE *fs = fopen(fn,"r");
+   if(fs == NULL) {
+      printf("Error: Could not open file: %s",fn);
+      exit(EXIT_FAILURE);
+   }
+
+   size_t line_size = 256;
+   char* current_line = malloc(sizeof(char)*line_size);
+   size_t status;
+
+   // we count the number of lines the file has
+   int line_ctr = 0;
+   while( getline(&current_line,&line_size,fs) != -1 ) line_ctr++;
+
+
+   *size = line_ctr;
+   // now we allocate memory to store all positions
+   *starts  = malloc(sizeof(int)*line_ctr);
+   *stops   = malloc(sizeof(int)*line_ctr);
+
+   if ( *starts == NULL || *stops == NULL )
+      perror("Could not allocate memory for position arrays");
+
+   fs = freopen(fn,"r",fs);
+   if(fs == NULL) {
+      printf("Error: Could not open file: %s",fn);
+      exit(EXIT_FAILURE);
+   }
+
+   char* first_num  = malloc(sizeof(char)*line_size);
+   char* second_num = malloc(sizeof(char)*line_size);
+
+   int idx;
+   int intron_start;
+   int intron_stop;
+   int ctr = 0;
+
+   while(1) { 
+      status = getline(&current_line,&line_size,fs);
+      if ( status == -1 )
+         break;
+      
+      //printf("%s",current_line);
+      
+      for(idx=0;idx<line_size;idx++) {
+         if (current_line[idx] == ' ')
+            break;
+      }
+      strncpy(first_num,current_line,idx);
+      strncpy(second_num,&current_line[idx],line_size-idx);
+      intron_start = atoi(first_num);
+      intron_stop = atoi(second_num);
+      //printf("start/stop: %d/%d\n",intron_start,intron_stop);
+    
+      (*starts)[ctr] = intron_start;
+      (*stops)[ctr]  = intron_stop;
+      ctr++;
+   }
+
+   printf("ctr is %d\n",ctr);
+   if (fclose(fs) != 0)
+      perror("Closing of filestream failed!");
+}
+
+int main(int argc, char* argv[]) {
+
+   if(argc != 4) {
+      printf("%s\n",info);
+      exit(EXIT_FAILURE);
+   }
+
+   int filenameSize = 256;
+   char* gt_fn    = malloc(sizeof(char)*filenameSize);
+   char* pred_fn  = malloc(sizeof(char)*filenameSize);
+   char* result_fn= malloc(sizeof(char)*filenameSize);
+
+   if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
+      perror("Could not allocate memory for filenames");
+
+   strncpy(gt_fn,argv[1],strlen(argv[1]));
+   strncpy(pred_fn,argv[2],strlen(argv[2]));
+   strncpy(result_fn,argv[3],strlen(argv[3]));
+
+   FILE *result_fs = fopen(result_fn,"r");
+   if(result_fs == NULL) {
+      printf("Error: Could not open file: %s",result_fn);
+      exit(EXIT_FAILURE);
+   }
+
+
+   load_introns(gt_fn,&gt_intron_starts,&gt_intron_stops,&gt_size);
+   load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
+
+   compare_introns();
+
+   int f_status = fclose(result_fs);
+   if(f_status != 0)
+      printf("closing of gff filestream failed!\n");
+
+   free(gt_fn);
+   free(pred_fn);
+   free(result_fn);
+
+   printf("Found %d matching intron(s).\n",matching_introns);
+   exit(EXIT_SUCCESS);
+}
+
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test
new file mode 100644 (file)
index 0000000..89a7b1c
--- /dev/null
@@ -0,0 +1,4 @@
+1 10
+10 12
+12 14
+20 22
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test
new file mode 100644 (file)
index 0000000..d856dd5
--- /dev/null
@@ -0,0 +1,4 @@
+1 10
+10 14
+14 20
+20 22
index 17afe77..ba68e47 100644 (file)
@@ -1,13 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import sys
 import array
-import random
+import cPickle
 import os
-import re
 import pdb
-import cPickle
+import random
+import sys
 
 import qpalma
 import qpalma.tools
@@ -24,6 +23,29 @@ import qpalma.Configuration as Conf
 
 
 class DatasetGenerator:
+   """
+   This class wraps the dataset generation for the matches of the second vmatch
+   run and the heuristic-filtered matches of the first run.
+
+   An example in our dataset consists of:
+
+   - information on the original read:
+         * id
+         * nucleotide sequence
+         * quality vectors (at the moment: prb and chastity)
+
+   - information on the target dna sequence:
+         * chromosome
+         * strand
+         * fragment start 
+         * fragment end positions
+
+   this information is stored in tuples which are stored then in a dictionary
+   indexed by the reads unique id.
+
+   CAUTION: The dictionary stores a list of the above defined tuples under each
+   'id'. This is because one read can be found several times on the genome.
+   """
 
    def __init__(self,map_file,map_2nd_file):
       assert os.path.exists(map_file), 'Error: Can not find map file'
@@ -33,13 +55,17 @@ class DatasetGenerator:
 
       self.dataset = []
 
-      self.read_size = 38
+      #self.read_size = 38
+      self.read_size = 36
 
 
    def saveAs(self,dataset_file):
       dataset_fn = '%s.pickle'% dataset_file
       dataset_keys_fn = '%s.keys.pickle'%dataset_file
 
+      print dataset_fn 
+      print dataset_keys_fn 
+
       assert not os.path.exists(dataset_fn), 'The data_file already exists!'
       assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
 
@@ -49,9 +75,6 @@ class DatasetGenerator:
 
 
    def parse_map_file(self,dataset,map_file,first_map):
-      """
-      """
-
       strand_map = ['-','+']
       instance_counter = 0
 
@@ -59,6 +82,9 @@ class DatasetGenerator:
          if instance_counter > 0 and instance_counter % 50000 == 0:
             print 'processed %d examples' % instance_counter
 
+         #if instance_counter == 10:
+         #   break
+
          slist    = line.split()
 
          id       = int(slist[0])
@@ -108,9 +134,9 @@ class DatasetGenerator:
          # Take into account that the results of the second run do not contain
          # the original reads anymore.
          if first_map:
-            original_est = read_seq
+            original_read = read_seq
          else:
-            original_est = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
+            original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
 
          # in order to save some space we use a signed char to store the
          # qualities. Each quality element can range as follows: -128 <= elem <= 127
@@ -130,11 +156,10 @@ class DatasetGenerator:
 
          # as one single read can have several vmatch matches we store all
          # these matches under the unique id of the read
-         dataset.setdefault(id, []).append((currentSeqInfo,original_est,currentQualities))
+         dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
 
          instance_counter += 1
 
-
       print 'Total examples processed: %d' % instance_counter
       return dataset
 
@@ -145,7 +170,7 @@ class DatasetGenerator:
       # usually we have two files to parse:
       # the map file from the second run and a subset of the map file from the
       # first run
-      dataset = self.parse_map_file(dataset,self.map_file,True)
+      #dataset = self.parse_map_file(dataset,self.map_file,True)
       dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
 
       self.dataset = dataset
index 89fd86c..42ed713 100644 (file)
@@ -8,12 +8,18 @@ from compile_dataset import DatasetGenerator
 
 jp = os.path.join
 
-working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+
+working_dir='/fml/ag-raetsch/share/projects/qpalma/solexa/new_run2/mapping/spliced'
+
+
 result_dir='/fml/ag-raetsch/home/fabio/tmp/sandbox'
 
-map_1_fn = jp(working_dir,'map.vm.spliced')
-map_2_fn = jp(working_dir,'map_2nd.vm')
+#map_1_fn = jp(working_dir,'map.vm.spliced')
+map_1_fn = jp(working_dir,'spliced.heuristic')
+map_2_fn = jp(working_dir,'map.vm')
 
 dg = DatasetGenerator(map_1_fn,map_2_fn)
 dg.compile_dataset()
-dg.saveAs(jp(result_dir,'dataset_transcriptome_run_1'))
+dg.saveAs(jp(result_dir,'dataset_neg_strand_testcase'))