+ found negative strand bug
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 11 Aug 2008 07:34:50 +0000 (07:34 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 11 Aug 2008 07:34:50 +0000 (07:34 +0000)
+ fix this version as a base for the distribution

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10357 e1793c9e-67f9-0310-80fc-b846ff1f7b36

16 files changed:
cparser/qparser.c
makePackage.sh [new file with mode: 0755]
pipeline/Pipeline.py [new file with mode: 0644]
scripts/Lookup.py [deleted file]
scripts/createAlignmentFileFromPrediction.py
scripts/grid_alignment.py
scripts/grid_heuristic.py
scripts/grid_predict.py
scripts/qpalma_main.py
scripts/qpalma_pipeline.py
setup.py [new file with mode: 0644]
tests/test_qpalma_prediction.py
tools/data_tools/filterReads.c
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py

index 47fcdb3..c99f3a1 100644 (file)
@@ -4,7 +4,6 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 
-
 #include "read.h"
 
 #define WITH_QUALITIES 0
@@ -282,116 +281,3 @@ int main(int argc, char *argv[]) {
        initqparser();
 }
 
-
-/*
-static int set_item_from_line(PyObject *result_dict, const char* current_line) {
-   
-   // increment the reference count for the result_dict object because we want
-   // ot modify it
-   Py_INCREF( result_dict );
-
-   //printf("current line is %s\n",current_line);
-   int entries_found = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,&p_start,&exon_stop,&exon_start,&p_stop,&true_cut);
-
-   if (entries_found != 15) {
-      return entries_found;
-   }
-
-   //printf("after sscanf\n");
-   int status;
-
-   // create dictionary representing one line
-   //PyObject* entry_dict = PyDict_New();
-
-   // alternative way using a list instead of a dictionary
-   PyObject* entry_list = PyList_New(15);
-   Py_INCREF( entry_list );
-
-   PyObject *id_py = PyInt_FromLong(id);
-   PyObject *strand_py = PyString_FromString("--");
-
-   if ( strand == 'D' )
-      strand_py = PyString_FromString("+");
-
-   if ( strand == 'P' )
-      strand_py = PyString_FromString("-");
-
-   //printf("before : %s\n",seq);
-   Py_ssize_t idx;
-   for(idx=0;idx<strlen(seq);idx++) {
-      if ( 65 <= seq[idx] && seq[idx] < 85)
-         seq[idx] = seq[idx]+32;
-   }
-   //printf("after : %s\n",seq);
-
-   // add entries of that line
-   
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("id"),         id_py );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("chr"),        PyInt_FromLong(chr) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("seq"),        PyString_FromString(seq) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("strand"),     strand_py );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("splitpos"),   PyInt_FromLong(splitpos) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("read_size"),       PyInt_FromLong(size) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("true_cut"),       PyInt_FromLong(true_cut) );
-
-   status = PyList_SetItem(entry_list, 0,  id_py );
-   status = PyList_SetItem(entry_list, 1,  PyInt_FromLong(chr) );
-   status = PyList_SetItem(entry_list, 2,  PyString_FromString(seq) );
-   status = PyList_SetItem(entry_list, 3,  strand_py );
-   status = PyList_SetItem(entry_list, 4,  PyInt_FromLong(splitpos) );
-   status = PyList_SetItem(entry_list, 5,       PyInt_FromLong(size) );
-   status = PyList_SetItem(entry_list, 6,      PyInt_FromLong(true_cut) );
-
-   PyObject* prb_list = PyList_New(size);
-   PyObject* cal_prb_list = PyList_New(size);
-   PyObject* chastity_list = PyList_New(size);
-
-#if WITH_QUALITIES
-   for(idx=0;idx<size;idx++) {
-      status = PyList_SetItem( prb_list, idx, PyInt_FromLong(prb[idx]-50) );
-      status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(cal_prb[idx]-64) );
-      status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(chastity[idx]+10) );
-   }
-
-   status = PyList_SetItem(entry_list, 7,    prb_list );
-   status = PyList_SetItem(entry_list, 8,    cal_prb_list );
-   status = PyList_SetItem(entry_list, 9,    chastity_list );
-#else
-   status = PyList_SetItem(entry_list, 7,    PyString_FromString("") );
-   status = PyList_SetItem(entry_list, 8,    PyString_FromString("") );
-   status = PyList_SetItem(entry_list, 9,    PyString_FromString("") );
-#endif 
-
-
-   status = PyList_SetItem(entry_list, 10, PyString_FromString(geneId) );
-   status = PyList_SetItem(entry_list, 11,    PyInt_FromLong(p_start) );
-   status = PyList_SetItem(entry_list, 12,  PyInt_FromLong(exon_stop) );
-   status = PyList_SetItem(entry_list, 13, PyInt_FromLong(exon_start) );
-   status = PyList_SetItem(entry_list, 14,     PyInt_FromLong(p_stop) );
-
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("prb"),        prb_list );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("cal_prb"),    cal_prb_list );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("chastity"),   chastity_list );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("gene_id"), PyString_FromString(geneId) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("p_start"),    PyInt_FromLong(p_start) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_stop"),  PyInt_FromLong(exon_stop) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
-   //status = PyDict_SetItem(entry_dict, PyString_FromString("p_stop"),     PyInt_FromLong(p_stop) );
-
-   // now save the dictionary representing one line in the dictionary
-   // representing the whole file
-   //status = PyDict_SetItem(result_dict, id_py, entry_dict);
-
-   status = PyDict_SetItem(result_dict, id_py, entry_list);
-   if (status != 0) {
-               PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
-   }
-
-   Py_DECREF( entry_list );
-   // decrement the reference count as we are finished with the local
-   // modification of the object
-   Py_DECREF( result_dict );
-
-   return status;
-}
-*/
diff --git a/makePackage.sh b/makePackage.sh
new file mode 100755 (executable)
index 0000000..7e2eea1
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+#
+# The purpose of this script is to create a python package
+# for QPalma
+#
+
+name=qpalma-0.8
+tmp_dir=/tmp/$name
+result_fn=/tmp/$name.tar.gz
+
+# first we create a temporary directory to store the needed files:
+mkdir $tmp_dir
+
+# copy all needed files to the package directory
+cp setup.py $tmp_dir
+
+# create a gzipped tarball containing the QPalma package
+cd $tmp_dir
+cd ..
+tar -czvf $result_fn $name
+
+echo "Finished package-building result is under $result_fn"
diff --git a/pipeline/Pipeline.py b/pipeline/Pipeline.py
new file mode 100644 (file)
index 0000000..32c69be
--- /dev/null
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import subprocess
+
+class PipelineStep:
+  """
+  The purpose of this class is to wrap a pipeline step which can be the
+  execution of a binary a small sequences of shell commands
+  """
+
+  def __init__(self,cL):
+     self.commandList = cL
+
+  def __call__(self):
+     success,pre_mes = self.checkPreConditions()
+     if not success:
+        return False,pre_mes
+
+     all_output = []
+     for currentCommand in self.commandList:
+        streams = subprocess.Popen(currentCommand,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+        out,err = streams.communicate()
+
+        if err != '':
+           return False,err
+
+        all_output.append(out)
+
+     success,post_mes = self.checkPostConditions()
+     if not success:
+        return False,post_mes
+
+     return True,all_output
+
+
+  def checkPreConditions(self):
+     return True,''
+
+
+  def checkPostConditions(self):
+     return True,''
+
+
+if __name__ == '__main__':
+  macro = PipelineStep(['date','w'])
+  status,mes = macro()
+  print status 
+  print mes                                                                                                                                                                     
diff --git a/scripts/Lookup.py b/scripts/Lookup.py
deleted file mode 100644 (file)
index 4c29143..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import random
-
-from numpy.matlib import inf
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
-
-
-class LookupTable:
-   """
-   Speed up genomic and splice site score queries by prefetching the sequences
-   and the scores once for all chromosomes.
-   """
-
-   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
-      accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
-      self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
-
-      self.strands = ['+','-']
-   
-      # take care that it also works say if we get a chromo_list [1,3,5]
-      # those are mapped then to 0,1,2
-      self.chromo_list = chromo_list
-      self.chromo_map = []
-      for elem in chromo_list:
-         self.chromo_map.append(elem)
-
-      self.genomicSequencesP   = [0]*(len(self.chromo_map)+1)
-      self.acceptorScoresP     = [0]*(len(self.chromo_map)+1)
-      self.donorScoresP        = [0]*(len(self.chromo_map)+1)
-
-      self.genomicSequencesN   = [0]*(len(self.chromo_map)+1)
-      self.acceptorScoresN     = [0]*(len(self.chromo_map)+1)
-      self.donorScoresN        = [0]*(len(self.chromo_map)+1)
-     
-      for chrId in self.chromo_list:
-         for strandId in self.strands:
-            print (chrId,strandId)
-            self.prefetch_seq_and_scores(chrId,strandId)
-
-   def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
-      assert chromo in self.chromo_list
-      assert strand in self.strands
-
-      chromo_idx = self.chromo_map[chromo]
-
-      if strand == '+':
-         currentSequence         = self.genomicSequencesP[chromo_idx][start:stop]
-         currentDonorScores      = self.donorScoresP[chromo_idx][start:stop]
-         currentAcceptorScores   = self.acceptorScoresP[chromo_idx][start:stop]
-      elif strand == '-':
-         currentSequence         = self.genomicSequencesN[chromo_idx][start:stop]
-         currentDonorScores      = self.donorScoresN[chromo_idx][start:stop]
-         currentAcceptorScores   = self.acceptorScoresN[chromo_idx][start:stop]
-      else:
-         assert False
-
-      return currentSequence, currentAcceptorScores, currentDonorScores
-
-
-   def prefetch_seq_and_scores(self,chromo,strand):
-      """
-      This function expects an interval, chromosome and strand information and
-      returns then the genomic sequence of this interval and the associated scores.
-      """
-
-      chromo_idx = self.chromo_map[chromo]
-
-      genomicSeq_start  = 0
-
-      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-      genomicSeq = genomicSeq.lower()
-
-      if strand == '+':
-         self.genomicSequencesP[chromo_idx] = genomicSeq
-         self.acceptorScoresP[chromo_idx]   = currentAcc
-         self.donorScoresP[chromo_idx]      = currentDon
-      elif strand == '-':
-         self.genomicSequencesN[chromo_idx] = genomicSeq
-         self.acceptorScoresN[chromo_idx]   = currentAcc
-         self.donorScoresN[chromo_idx]      = currentDon
-      else:
-         assert False
index 64fb471..351c294 100644 (file)
@@ -88,14 +88,27 @@ def create_alignment_file(current_loc,out_fname):
    accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
    seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
-   # implement uniqueness filtering using alignment score for reads with
-   # several alignments
+   # Uniqueness filtering using alignment score for reads with several
+   # alignments
+   allUniquePredictions = {}
 
-   #for pred_idx,current_prediction in enumerate(allPredictions):
-   
+   for new_prediction in allPredictions:
+      id = new_prediction['id']
+      if allUniquePredictions.has_key(id):
+         current_prediction = allUniquePredictions[id]
+
+         current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+         new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
+
+         if current_a_score < new_score :
+            allUniquePredictions[id] = new_prediction
+
+      else:
+         allUniquePredictions[id] = new_prediction
 
    written_lines = 0
-   for pred_idx,current_prediction in enumerate(allPredictions):
+   for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
+
       id = current_prediction['id']
 
       #current_ground_truth = qparser.fetch_read(id)
index 21b5d81..93457e0 100644 (file)
@@ -12,10 +12,11 @@ import math
 from pythongrid import KybJob, Usage
 from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
 
-from createAlignmentFileFromPrediction import *
+from createAlignmentFileFromPrediction import create_alignment_file
 
 import grid_alignment
 
+
 def g_alignment(chunk_fn,result_fn):
    create_alignment_file(chunk_fn,result_fn)
 
@@ -23,17 +24,8 @@ def g_alignment(chunk_fn,result_fn):
 def create_and_submit():
    jp = os.path.join
 
-   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'   
-   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
-
-   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
-
-   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
-   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
+   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/alignment'
 
    chunks_fn = []
    for fn in os.listdir(run_dir):
@@ -41,8 +33,9 @@ def create_and_submit():
          chunks_fn.append(fn)
 
    #chunks_fn = [\
-   #'chunk_0.predictions.pickle',\
-   #'chunk_4.predictions.pickle']
+   #'chunk_9.predictions.pickle',\
+   #'chunk_14.predictions.pickle',\
+   #'chunk_24.predictions.pickle']
 
    print chunks_fn
 
@@ -50,7 +43,7 @@ def create_and_submit():
 
    for chunk_fn in chunks_fn:
       chunk_name  = chunk_fn[:chunk_fn.find('.')]
-      result_fn   = jp(data_dir,'%s.align_remap'%chunk_name)
+      result_fn   = jp(result_dir,'%s.align_remap'%chunk_name)
       chunk_fn = jp(run_dir,chunk_fn) 
 
       #pdb.set_trace()
@@ -66,7 +59,6 @@ def create_and_submit():
       (sid, jobids) = submit_jobs(functionJobs)
       time.sleep(10)
 
-      #break
 
 if __name__ == '__main__':
    create_and_submit()
index 424a61f..77b93c1 100644 (file)
@@ -37,13 +37,10 @@ def create_and_submit():
 
    data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
 
-
    run_fname      = jp(run_dir,'run_obj.pickle')
 
-   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm'
-   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/new_map.vm'
-   original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
-   split_file(original_map_fname,data_dir,num_splits)
+   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
+   #split_file(original_map_fname,data_dir,num_splits)
 
    param_fname    = jp(run_dir,'param_526.pickle')
 
index ebeecd2..95f8306 100644 (file)
@@ -39,14 +39,14 @@ def get_slices(dataset_size,num_nodes):
    return all_instances
 
 
-def makeJobs(run,dataset_fn,chunks,param):
+def makeJobs(run,dataset_fn,chunks,param_fn):
    """
    """
 
    jobs=[]
 
    for c_name,current_chunk in chunks:
-      current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
+      current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,c_name])
       current_job.h_vmem = '20.0G'
       #current_job.express = 'True'
 
@@ -69,21 +69,11 @@ def create_and_submit():
    run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
    run['name'] = 'saved_run'
 
-   param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
+   param_fn = jp(run_dir,'param_526.pickle')
 
-   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.pickle'
-   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.keys.pickle'
-
-   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
-   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
-
-   #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
-   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.pickle'
-   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.keys.pickle'
-
-   run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
-   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.pickle'
-   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.keys.pickle'
+   run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
+   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
+   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
 
    prediction_keys = cPickle.load(open(prediction_keys_fn))
 
@@ -98,48 +88,31 @@ def create_and_submit():
          c_name = 'chunk_%d' % idx
          chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
 
-   functionJobs = makeJobs(run,dataset_fn,chunks,param)
+   functionJobs = makeJobs(run,dataset_fn,chunks,param_fn)
 
    sum = 0
    for size in [len(elem) for name,elem in chunks]:
       sum += size
    
-   #assert sum == len(prediction_keys)
-
    print 'Got %d job(s)' % len(functionJobs)
 
    #print "output ret field in each job before sending it onto the cluster"
    #for (i, job) in enumerate(functionJobs):
    #   print "Job with id: ", i, "- ret: ", job.ret
-
    #print ""
    #print "sending function jobs to cluster"
    #print ""
 
-   #pdb.set_trace()
-
    (sid, jobids) = submit_jobs(functionJobs)
 
-   #print 'checking whether finished'
-   #while not get_status(sid, jobids):
-   #   time.sleep(7)
-   #print 'collecting jobs'
-   #retjobs = collect_jobs(sid, jobids, functionJobs)
-   #print "ret fields AFTER execution on cluster"
-   #for (i, job) in enumerate(retjobs):
-   #   print "Job #", i, "- ret: ", job.ret
-
-   #print '--------------'
 
-
-def g_predict(run,dataset_fn,prediction_keys,param,set_name):
+def g_predict(run,dataset_fn,prediction_keys,param_fn,set_name):
    """
   
    """
 
-   qp = QPalma()
-   qp.predict(run,dataset_fn,prediction_keys,param,set_name)
-
+   qp = QPalma(False)
+   qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,set_name)
    return 'finished prediction of set %s.' % set_name
 
 
index ee70a37..af47cd0 100644 (file)
 # 
 ###########################################################
 
-import sys
+import array
 import cPickle
-import pdb
 import os.path
-import array
+import pdb
+import sys
 
-from qpalma.sequence_utils import get_seq_and_scores,reverse_complement,unbracket_seq
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
@@ -107,8 +107,9 @@ class QPalma:
    the alignment.
    """
    
-   def __init__(self):
+   def __init__(self,dmode=False):
       self.ARGS = Param()
+      self.qpalma_debug_mode = dmode
 
 
    def plog(self,string):
@@ -564,11 +565,12 @@ class QPalma:
 #
 ###############################################################################
 
-   def predict(self,run,dataset_fn,prediction_keys,param,set_name):
+   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
       """
       Performing a prediction takes...
       """
       self.run = run
+      self.set_name = set_name
 
       #full_working_path = os.path.join(run['alignment_dir'],run['name'])
       full_working_path = run['result_dir']
@@ -586,13 +588,6 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
 
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
@@ -606,6 +601,27 @@ class QPalma:
 
       # we do not need the full dataset anymore
       del dataset
+   
+      # Load parameter vector to predict with
+      param = cPickle.load(open(param_fn))
+
+      self.predict(run,prediction_set,param)
+
+
+   def predict(self,run,prediction_set,param):
+      """
+      This method...
+      """
+
+      self.run = run
+
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
 
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
@@ -614,8 +630,9 @@ class QPalma:
       #############################################################################################
       # Prediction
       #############################################################################################
-
-      self.plog('Starting prediction...\n')
+      
+      if not self.qpalma_debug_mode:
+         self.plog('Starting prediction...\n')
 
       donSP       = self.run['numDonSuppPoints']
       accSP       = self.run['numAccSuppPoints']
@@ -634,24 +651,32 @@ class QPalma:
       # we take the first quality vector of the tuple of quality vectors
       quality_index = 0
 
+      # fetch the data needed
+      g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      num_chromo = 6
+
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
          print 'Current example %d' % example_key
 
          for example in prediction_set[example_key]:
 
-            currentSeqInfo,original_read,currentQualities = example
+            currentSeqInfo,read,currentQualities = example
 
             id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
             currentSeqInfo 
 
-            # remove brackets from the original read annotation
-            read = unbracket_seq(original_read)
-
-            if strand == '-':
-               read = reverse_complement(read)
-
-            self.plog('Loading example id: %d...\n'% int(id))
+            if not self.qpalma_debug_mode:
+               self.plog('Loading example id: %d...\n'% int(id))
 
             if run['mode'] == 'normal':
                quality = [40]*len(read)
@@ -663,7 +688,7 @@ class QPalma:
                quality = [40]*len(read)
 
             try:
-               currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+               currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
             except:
                problem_ctr += 1
                continue
@@ -680,16 +705,22 @@ class QPalma:
             current_prediction = self.calc_alignment(currentDNASeq, read,\
             quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
 
-            current_prediction['id'] = id
-            #current_prediction['start_pos'] = up_cut
-            current_prediction['start_pos'] = genomicSeq_start
-            current_prediction['chr'] = chromo
-            current_prediction['strand'] = strand
+            current_prediction['id']         = id
+            current_prediction['chr']        = chromo
+            current_prediction['strand']     = strand
+            current_prediction['start_pos']  = genomicSeq_start
 
             allPredictions.append(current_prediction)
 
+      if not self.qpalma_debug_mode:
+         self.finalizePrediction(allPredictions,problem_ctr)
+      else:
+         return allPredictions
+
+
+   def finalizePrediction(self,allPredictions,problem_ctr):
       # end of the prediction loop we save all predictions in a pickle file and exit
-      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(set_name),'w+'))
+      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
       print 'Prediction completed'
       self.plog('Prediction completed\n')
       mes =  'Problem ctr %d' % problem_ctr
@@ -728,28 +759,22 @@ class QPalma:
       # old code removed
       newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
       newWeightMatch = newWeightMatch.reshape(1,mm_len)
-      true_map = [0]*2
+      true_map    = [0]*2
       true_map[0] = 1
-      pathNr = 0
+      pathNr      = 0
 
       _newSpliceAlign   = array.array('B',newSpliceAlign.flatten().tolist()[0])
       _newEstAlign      = array.array('B',newEstAlign.flatten().tolist()[0])
        
       alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
 
-      dna_array = array.array('B',dna_array)
-      read_array = array.array('B',read_array)
-
-      #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-      #self.plog(line1+'\n')
-      #self.plog(line2+'\n')
-      #self.plog(line3+'\n')
+      dna_array   = array.array('B',dna_array)
+      read_array  = array.array('B',read_array)
 
       newExons = self.calculatePredictedExons(newSpliceAlign)
 
       current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
-      'alignment':alignment,\
-      'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+      'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
       'dna_array':dna_array, 'read_array':read_array }
 
       return current_prediction
@@ -773,27 +798,3 @@ class QPalma:
             newExons.append(pos)
 
       return newExons
-
-
-###########################
-# A simple command line 
-# interface
-###########################
-
-if __name__ == '__main__':
-   assert len(sys.argv) == 4
-
-   run_fn      = sys.argv[1]
-   dataset_fn  = sys.argv[2]
-   param_fn    = sys.argv[3]
-
-   run_obj = cPickle.load(open(run_fn))
-   dataset_obj = cPickle.load(open(dataset_fn))
-
-   qpalma = QPalma()
-
-   if param_fn == 'train':
-      qpalma.train(run_obj,dataset_obj)
-   else:
-      param_obj = cPickle.load(open(param_fn))
-      qpalma.predict(run_obj,dataset_obj,param_obj)
index 3670dd7..0e2d215 100644 (file)
@@ -1,6 +1,15 @@
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*-
 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+
 #
 # This file contains the main interface to the QPalma pipeline.
 #
diff --git a/setup.py b/setup.py
new file mode 100644 (file)
index 0000000..c65ac1e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,17 @@
+from distutils.core import setup, Extension
+
+
+setup (name = 'QPalma' 
+   description = 'A package for optimal alignment of short reads',
+   version = '1.0', 
+   long_description = '''
+   A
+   ''', 
+   author = 'Fabio De Bona',
+   author_email = 'fabio@tuebingen.mpg.de',
+   url = 'http://',
+   license = 'GNU GPL version 3',
+   ext_package = "qpalma",
+   #ext_modules = extmods,
+   package_dir = {"qpalma": "python"},
+   packages = ["qpalma"])
index 3b9c759..2830382 100644 (file)
@@ -1,15 +1,17 @@
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*- 
 
-import pdb
-import unittest
+import cPickle
+import math
 import numpy
 import os.path
-import cPickle
+import pdb
+import unittest
 
 from qpalma_main import QPalma
-from Utils import pprint_alignment
-from qpalma.sequence_utils import alphabet
+from Utils import print_prediction
+
+from createAlignmentFileFromPrediction import alignment_reconstruct
 
 jp = os.path.join
 
@@ -19,11 +21,22 @@ class TestQPalmaPrediction(unittest.TestCase):
    This class...
    """
 
+   def _setUp(self):
+      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
+
+      self.prediction_set = cPickle.load(open(data_fn))
+
+      
+
    def setUp(self):
+      print
       self.prediction_set = {}
 
-      read = 'catctcacagtcttcttcttcttctcgattgcagtagc'      
-      currentQualities = [[40]*len(read)]
+      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
+      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
+
+      read = 'tattttggaagttccaattcccttaatacatctcacag'
+      currentQualities = [[30]*len(read)]
 
       id       = 1
       chromo   = 3
@@ -31,15 +44,37 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       tsize = 23470805
 
-      genomicSeq_start  = tsize - 2048
-      genomicSeq_stop   = tsize - 1990
+      genomicSeq_start  = tsize - (2038+10)
+      genomicSeq_stop   = tsize - 1900 + 10
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
 
       currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
 
       example = (currentSeqInfo,read,currentQualities)
-
       self.prediction_set[id] = [example]
 
+      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
+      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
+
+      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
+      currentQualities = [[40]*30+[22]*8]
+
+      id       = 2
+      chromo   = 3
+      strand   = '+'
+
+      tsize = 23470805
+
+      genomicSeq_start  = tsize - (2038+10)
+      genomicSeq_stop   = tsize - 1900 + 10
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
 
    
    def testAlignments(self):
@@ -64,20 +99,22 @@ class TestQPalmaPrediction(unittest.TestCase):
       qp = QPalma(True)
       #qp.init_prediction(run,set_name)
       allPredictions = qp.predict(run,self.prediction_set,param)
-      for elem in allPredictions:
-         dna_array   = elem['dna_array']
-         read_array  = elem['read_array']
-
-         dna   = map(lambda x: alphabet[x],dna_array)
-         read  = map(lambda x: alphabet[x],read_array)
-
-         spliceAlign = elem['spliceAlign']
-         estAlign    = elem['estAlign']
-
-         line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
-         print line1
-         print line2
-         print line3
+      for current_prediction in allPredictions:
+         align_str = print_prediction(current_prediction)
+         print align_str
+
+         id = current_prediction['id']
+         seq         = current_prediction['read']
+         dna         = current_prediction['dna']
+         chromo      = current_prediction['chr']
+         strand      = current_prediction['strand']
+         start_pos   = current_prediction['start_pos']
+         predExons   = current_prediction['predExons']
+
+         numExons = int(math.ceil(len(predExons) / 2))
+         
+         print alignment_reconstruct(current_prediction,numExons)
+         #print id,start_pos,predExons
 
 
 if __name__ == '__main__':
index 2dba63a..139c681 100644 (file)
@@ -2,13 +2,10 @@
 // The purpose of this program is to read a gff and a    
 // solexa reads file and create a data set used by QPalma. 
 //
-//
-//
 // Notes:
 //
 // - Both the read indices and the gff gene indices are one-based
 //
-//
 ////////////////////////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
@@ -58,8 +55,6 @@ char current_strand;
 //#define _FDEBUG_ 0
 //#define DEBUG_READ 38603
 
-
-//
 /*
  * TODO:
  * - Check strand -> done simple (only if equal)
@@ -453,6 +448,7 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
       }
    }
 
+   exit(0);
    free(up_used_flag);
    free(down_used_flag);
 }
@@ -504,7 +500,7 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    int u_off = p_start - up_read_start;
    int d_off = exon_start - down_read_start;
 
-   FA(u_off >= 0 && d_off >= 0);
+   FA( u_off >= 0 && d_off >= 0 );
    FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
    FA( u_size + d_size == read_size );
 
@@ -517,27 +513,36 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    if ( current_strand == 'P' ) {
       printf("flipping read sequences...\n");
       printf("%s %s\n",up_read->seq,down_read->seq);
-
-      char *tmp = malloc(sizeof(char)*strlen(up_read->seq+1));
+      char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
       strncpy(tmp,up_read->seq,strlen(up_read->seq));
       tmp[strlen(up_read->seq)]='\0';
-      realloc(up_read->seq,sizeof(char)*strlen(down_read->seq+1));
+
+      realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
       strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
       up_read->seq[strlen(down_read->seq)] = '\0';
-      realloc(down_read->seq,sizeof(char)*strlen(tmp));
+
+      if (up_read->seq == NULL)
+         perror("realloc\n");
+
+      realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
       strncpy(down_read->seq,tmp,strlen(tmp));
       down_read->seq[strlen(tmp)] = '\0';
-
+      
+      if (down_read->seq == NULL)
+         perror("realloc\n");
+      
       free(tmp);
-
       printf("flipping done...\n");
       printf("%s %s\n",up_read->seq,down_read->seq);
    }
-
+   
    printf("start joining...\n");
    Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
    printf("end of joining...\n");
 
+   printf("new_seq is %s\n",new_seq);
+
+
    int cut_pos = jinfo.first;
    int additional_pos = jinfo.second;
    printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
@@ -548,15 +553,11 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    char* new_prb        = malloc(sizeof(char)*buf_size);
    char* new_cal_prb    = malloc(sizeof(char)*buf_size);
    char* new_chastity   = malloc(sizeof(char)*buf_size);
+
    if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
       perror("malloc\n");
-
-   if( jinfo.first == -1 ) {
-      retval = 0;
-      goto free;
-   }
-
-   if( additional_pos > (down_range - d_size) ) {
+   
+   if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
       retval = 0;
       goto free;
    }
@@ -564,26 +565,24 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    printf("joining qualities...\n");
    strncpy(new_prb, up_read->prb+u_off, u_size);
    strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
-   new_prb[buf_size] = '\0';
+   new_prb[buf_size-1] = '\0';
 
    strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
    strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
-   new_cal_prb[buf_size] = '\0';
+   new_cal_prb[buf_size-1] = '\0';
 
    strncpy(new_chastity, up_read->chastity+u_off, u_size);
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
-   new_chastity[buf_size] = '\0';
+   new_chastity[buf_size-1] = '\0';
    printf("end of joining qualities...\n");
 
    //printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
    //printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
-
    //if ( current_strand == 'P' ) {
    //   int alpha   = read_size - u_off - 1;
    //   int beta    = alpha - u_size ;
    //   p_start     = up_read->pos + beta + 1;
    //   exon_stop   = up_read->pos + alpha;
-
    //   alpha   = read_size - d_off - 1;
    //   beta    = alpha - (read_size - u_size);
    //   exon_start  = down_read->pos + beta + 1; 
@@ -641,6 +640,7 @@ void reverse_complement(char** seq, int len) {
       if (idx == len || temp_idx == -1)
          break;
    }
+   free(temp);
 }
 
 
@@ -674,7 +674,6 @@ void open_file(const char* filename, const char* mode, FILE** fs) {
  *
  */
 
-
 int main(int argc, char* argv[]) {
 
    if(argc != 8) {
@@ -733,6 +732,8 @@ int main(int argc, char* argv[]) {
       if (! (old_gene_stop <  currentGene->start ) ) {
          printf("Gene %s is overlapping\n",currentGene->id);
          old_gene_stop = currentGene->stop;
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
          allGenes[g_idx] = 0;
          nulled_genes++;
          continue;
index f69b9c9..7a907e8 100644 (file)
@@ -4,7 +4,8 @@ CFLAGS=-ggdb -Wall -Wshadow -lm #-pedantic
 #SRCS= parser.c\
 #OBJS = $(SRCS:%.cpp=%.o)
 
-all: $(OBJS)
+all: compare.c
+       @ echo "Compiling..."
        @ $(CC) $(CFLAGS) -o compare compare.c
 
 #@ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
index 126f287..46ca423 100644 (file)
@@ -178,7 +178,7 @@ int main(int argc, char* argv[]) {
    strncpy(pred_fn,argv[2],strlen(argv[2]));
    strncpy(result_fn,argv[3],strlen(argv[3]));
 
-   FILE *result_fs = fopen(result_fn,"r");
+   FILE *result_fs = fopen(result_fn,"w+");
    if(result_fs == NULL) {
       printf("Error: Could not open file: %s",result_fn);
       exit(EXIT_FAILURE);
index 338fe20..3e4f093 100644 (file)
@@ -12,7 +12,7 @@ import qpalma
 import qpalma.tools
 
 from qpalma.parsers import *
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
 
 import QPalmaConfiguration as Conf
 
@@ -58,13 +58,14 @@ class DatasetGenerator:
 
       self.read_size = Conf.read_size
 
+      self.half_window_size = 1500
+
 
    def saveAs(self,dataset_file):
       dataset_fn = '%s.pickle'% dataset_file
       dataset_keys_fn = '%s.keys.pickle'%dataset_file
 
-      print dataset_fn 
-      print dataset_keys_fn 
+      print dataset_fn, dataset_keys_fn 
 
       assert not os.path.exists(dataset_fn), 'The data_file already exists!'
       assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
@@ -74,8 +75,7 @@ class DatasetGenerator:
       cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
 
 
-   def parse_map_file(self,dataset,map_file):
-      strand_map = ['-','+']
+   def parse_map_file(self,dataset,map_file,first_round):
       instance_counter = 0
 
       g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
@@ -90,10 +90,14 @@ class DatasetGenerator:
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
       seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
+      not_consistent_ctr = 0
+
       for line in open(map_file):
          if instance_counter > 0 and instance_counter % 5000 == 0:
             print 'processed %d examples' % instance_counter
-            #if instance_counter == 10000:
+
+         if instance_counter == 10000:
+            print 'Not consistent ctr: %d' % not_consistent_ctr
             #   break
 
          slist    = line.split()
@@ -102,69 +106,75 @@ class DatasetGenerator:
          chromo   = int(slist[1])
          pos      = int(slist[2])
 
-         # convert D/P strand info to +/-
-         strand   = slist[3]
-         strand   = strand_map[strand == 'D']
-
-         read_seq = slist[4]
-
-         # QPalma uses lowercase characters
-         read_seq = read_seq.lower()
-         read_seq = unbracket_seq(read_seq)
-         
-         # fetch the prb quality vector
-         _prb      = slist[5]
-
          # for the time being we only consider chromosomes 1-5
          if not chromo in range(1,6):
             continue
 
-         if strand == '-':
-            pos = seqInfo.chromo_sizes[chromo]-pos-self.read_size
+         # convert D/P strand info to +/-
+         if slist[3] == 'D':
+            strand = '+'
+         else:
+            strand = '-'
+
+         # QPalma uses lowercase characters
+         bracket_seq = slist[4].lower()
+         read_seq = unbracket_seq(bracket_seq)
+         reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
+
+         if strand == '-' and first_round:
             read_seq = reverse_complement(read_seq)
+            #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
 
-         # we use an area of +/- 1500 nucleotides around the seed position
-         if pos > 1501:
-            ds_offset = 1500
+         assert not '[' in read_seq and not ']' in read_seq
+         
+         # we use an area of +/-  `self.half_window_size` nucleotides around the seed position
+         if pos > self.half_window_size+1:
+            us_offset = self.half_window_size
          else:
-            ds_offset = pos-1
+            us_offset = pos - 1 
 
-         us_offset = 1500
+         if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
+            ds_offset = self.half_window_size
+         else:
+            ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
             
-         genomicSeq_start = pos - ds_offset
-         genomicSeq_stop  = pos + us_offset
+         genomicSeq_start = pos - us_offset
+         genomicSeq_stop  = pos + ds_offset
 
-         #try:
          dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
-         #except:
+         
+         #if strand == '-':
+         #   dna_part = dna_seq[us_offset-len(read_seq):us_offset]
+         #else:
+         #   dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
+
+         #if reconstructed_dna_seq != dna_part:
+         #   not_consistent_ctr += 1
          #   pdb.set_trace()
 
-         # Take into account that the results of the second run do not contain
-         # the original reads anymore.
-         #if first_map:
-         #   original_read = read_seq
-         #else:
-         #   original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
+         ## check whether everything is ok
+         #   try:
+         #      currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+         #   except:
+         #      problem_ctr += 1
+         #      continue
 
-         # in order to save some space we use a signed char to store the
+         # In order to save some space we use a signed char to store the
          # qualities. Each quality element can range as follows: -128 <= elem <= 127
-         prb = array.array('b',map(lambda x: ord(x)-64,_prb))
-
-         #pdb.set_trace()
+         prb = array.array('b',map(lambda x: ord(x)-64,slist[5]))
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
          currentQualities = [prb]
 
-         # as one single read can have several vmatch matches we store all
+         # As one single read can have several vmatch matches we store all
          # these matches under the unique id of the read
-
-         #dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
          dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
 
          instance_counter += 1
 
       print 'Total examples processed: %d' % instance_counter
+      print 'Not consistent ctr: %d' % not_consistent_ctr
       return dataset
 
 
@@ -174,7 +184,8 @@ class DatasetGenerator:
       # usually we have two files to parse:
       # the map file from the second run and a subset of the map file from the
       # first run
-      dataset = self.parse_map_file(dataset,self.map_file)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+      dataset = self.parse_map_file(dataset,self.map_file,True)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
 
       self.dataset = dataset