+ pruned many unused import declarations
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 30 Jun 2008 15:16:48 +0000 (15:16 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 30 Jun 2008 15:16:48 +0000 (15:16 +0000)
+ switched to the new pythongrid version
+ renamed some ambiguous variables
+ cleaned up the code base a bit

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9800 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Experiment.py
scripts/PipelineHeuristic.py
scripts/Utils.py
scripts/check_and_init.py
scripts/check_dataset.py
scripts/createAlignmentFileFromPrediction.py
scripts/grid_alignment.py
scripts/grid_heuristic.py
scripts/grid_predict.py
scripts/qpalma_main.py

index c414d5a..f26e7be 100644 (file)
@@ -10,7 +10,7 @@
 #
 ###############################################################################
 
-import qpalma.Configuration as Conf
+import QPalmaConfiguration as Conf
 from Run import *
 import pdb
 import os
index 5d036cd..ee17503 100644 (file)
@@ -9,16 +9,9 @@ import os.path
 import math
 import resource
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.Configuration import *
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
@@ -27,9 +20,9 @@ from qpalma.parsers import *
 
 from ParaParser import *
 
-from Lookup import *
+from Lookup import Lookup
 
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import reverse_complement,unbracket_seq
 
 
 
@@ -236,13 +229,16 @@ class PipelineHeuristic:
 
          strand = strand_map[strand]
 
-         #if strand == '-':
-         #   continue
 
          if not chr in range(1,6):
             continue
 
-         unb_seq = unbracket_est(seq)
+         unb_seq = unbracket_seq(seq)
+
+         # reads mapped to the '-' strand must be reverse-complemented
+         if strand == '-':
+            unb_seq = reverse_complement(unb_seq)
+
          effective_len = len(unb_seq)
 
          genomicSeq_start  = pos
@@ -251,6 +247,7 @@ class PipelineHeuristic:
          start = cpu()
          #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
          currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+
          stop = cpu()
          self.get_time += stop-start
 
@@ -393,7 +390,7 @@ class PipelineHeuristic:
       cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
-      est = unbracket_est(original_est)
+      est = unbracket_seq(original_est)
       effective_len = len(est)
 
       genomicSeq_start  = pos - self.max_intron_size
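
Note: the fix above reverse-complements minus-strand reads after stripping the
bracket annotation. Both helpers live in qpalma.sequence_utils and are not part
of this diff; as a point of reference, a minimal sketch of reverse_complement
over the lower-case alphabet used here (illustrative, not the library
implementation):

   # sketch only; the real sequence_utils version may handle more characters
   def reverse_complement(seq):
      complement = {'a':'t', 'c':'g', 'g':'c', 't':'a', 'n':'n'}
      return ''.join([complement[base] for base in reversed(seq)])

   print reverse_complement('acgtn')   # prints 'nacgt'

unbracket_seq strips the '[..]' mismatch annotation from a read so that only
plain nucleotides remain; the exact bracket convention is defined in
sequence_utils itself.
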
index 1c3c4f7..ea4fb26 100644 (file)
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import pdb
+import os.path
 
 from numpy.matlib import mat,zeros,ones,inf
 
@@ -147,7 +148,9 @@ def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
 ##########
 
-def split_file_join_results(filename,parts):
+def split_file(filename,result_dir,parts):
+   jp = os.path.join
+
    all_intervals = []
 
    print 'counting lines'
@@ -170,22 +173,15 @@ def split_file_join_results(filename,parts):
          end = begin+part
 
       params = (begin,end)
-      print begin,end
-
       all_intervals.append(params)
 
-   pdb.set_trace()
-
-   infile = open(filename,'r')
-
    parts_fn = []
    for pos,params in enumerate(all_intervals):
       beg,end = params
-      print params
-      out_fn = '%s.part_%d'%(filename,pos)
-      parts_fn.append(out_fn)
-      #out_fh = open(out_fn,'w+')
+      out_fn = jp(result_dir,'map.part_%d'%pos)
       print out_fn
+      parts_fn.append(out_fn)
+      out_fh = open(out_fn,'w+')
 
    parts_fn.reverse()
    all_intervals.reverse()
@@ -201,6 +197,7 @@ def split_file_join_results(filename,parts):
          params = all_intervals.pop()
          beg,end = params
          out_fn = parts_fn.pop()
+         out_fh.close()
          out_fh = open(out_fn,'w+')
 
    out_fh.close()
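
The rewritten split_file now takes an explicit result_dir and writes
result_dir/map.part_0 .. map.part_<parts-1>; the added out_fh.close() before
each reopen ensures at most one part file is held open at a time. A usage
sketch matching the (commented-out) call in grid_heuristic.py below, with
placeholder paths:

   # paths are illustrative; see grid_heuristic.py for the real ones
   split_file('/tmp/map.vm', '/tmp/run_0', 25)   # writes map.part_0 .. map.part_24
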
index aed62ad..27c9e19 100644 (file)
@@ -1,10 +1,15 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+#
+# This script checks and initializes the QPalma configuration settings
+# before any training or prediction run is started.
+#
+
 import os.path
 import cPickle
 
-import qpalma.Configuration as Conf
+import QPalmaConfiguration as Conf
 
 
 def check_vmatch_params(Config):
index 74817b5..411f727 100644 (file)
@@ -5,69 +5,59 @@
 # The purpose of this script is to check the created dataset pickle files for
 # consistency before doing any kind of training/prediction on the data.
 #
-#
 
 import sys
 import pdb
 import cPickle
 
-from compile_dataset import get_seq_and_scores
+from qpalma.sequence_utils import get_seq_and_scores,unbracket_seq,reverse_complement
 
 
 def checkAll(filename):
+   """
+   This function loads the dataset and performs some sanity checks and
+   assertions to make sure that the set is in the right shape for QPalma
+   to train on and to predict on.
+   """
+
    dataset = cPickle.load(open(filename))
 
-   for idx,(current_key,current_value) in enumerate(dataset.iteritems()):
-      (currentSeqInfo,original_est,currentQualities) = current_value
-      currentSeqInfo    = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-      currentQualities  = (prb,cal_prb,chastity)
+   # we take the first quality vector of the tuple of quality vectors
+   quality_index = 0
+
+   status = True
+   mes = '---'
 
-      if idx > 0 and idx % 250 == 0:
-         print 'processed %d examples' % idx
+   idx = 0
+   for example_key in dataset.keys():
+      matches = dataset[example_key]
+      print 'Current example %d has %d matches' % (example_key,len(matches))
 
-      currentInfo = SeqInfo[idx]
-      chr,strand,p1,p2 = currentInfo
-      currentExon = Exons[idx]
-      currentExon[0,1] += 1
-      currentExon -= 1
+      for example in matches:
+         (currentSeqInfo,original_est,currentQualities) = example
 
-      currentEst  = OriginalEsts[idx]
-      originalEst  = OriginalEsts[idx]
+         (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
 
-      if currentEst.find('[') != -1:
-         #print currentEst
-         currentEst = unbracket_est(currentEst)
-         #print currentEst
+         assert chromo in range(1,6), pdb.set_trace()
+         assert strand in ['+','-'], pdb.set_trace()
 
-      assert len(currentEst) == 36, pdb.set_trace()
+         quality = currentQualities[quality_index]
 
-      first_seq = get_seq( currentExon[0,0], currentExon[0,1], True )
-      end = first_seq[-2:]
-      first_seq = first_seq[:-2]
-      second_seq = get_seq( currentExon[1,0], currentExon[1,1]+1, False )
-      begin = second_seq[:2]
-      second_seq = second_seq[2:]
+         # check for key consistency
+         assert id == example_key
 
-      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      seq, acc, don =\
-      get_seq_and_scores(chr,strand,currentExon[0,0],currentExon[1,1]+1,dna_flat_files)
+         if idx > 0 and idx % 1000 == 0:
+            print 'processed %d examples' % idx
 
-      assert (len(first_seq) + len(second_seq)) == 36, pdb.set_trace()
+         dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 
-      if not (end == 'gt' or end == 'gc'):
-         print 'invalid donor in example %d'% idx
-         print SeqInfo[idx]
-         print currentExon
+         genomic_seq_pos,acc_pos,don_pos = get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
+         genomic_seq_neg,acc_neg,don_neg = get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
 
-         #invalid_donor_ctr += 1
-         #continue
+         assert reverse_complement(genomic_seq_neg) == genomic_seq_pos
 
-      if not (begin == 'ag'):
-         print 'invalid acceptor in example %d'% idx
-         print SeqInfo[idx]
-         print currentExon
 
-   pdb.set_trace() 
+   return status,mes
 
 
 if __name__ == '__main__':
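
The assertion above checks that get_seq_and_scores returns, for the '-'
strand, exactly the reverse complement of the '+' strand sequence. Since
reverse_complement is an involution, a toy illustration using the helper
imported above (made-up sequence, no flat-file access):

   plus_strand  = 'ggtaag'
   minus_strand = reverse_complement(plus_strand)      # 'cttacc'
   assert reverse_complement(minus_strand) == plus_strand
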
index 922dc70..885149c 100644 (file)
@@ -22,38 +22,30 @@ def alignment_reconstruct(current_prediction,num_exons):
    We reconstruct the exact alignment to infer the positions of indels and the
    sizes of the respective exons.
    """
+   translation = '-acgtn_' # how the aligned read and dna sequences are displayed (gap_char, 5 nucleotides, intron_char)
 
    SpliceAlign = current_prediction['spliceAlign']
    estAlign    = current_prediction['estAlign']
 
    dna_array   = current_prediction['dna_array']
-   est_array   = current_prediction['est_array']
+   read_array  = current_prediction['read_array']
 
-   dna_array   = "".join(map(lambda x: str(x),dna_array))
-   est_array   = "".join(map(lambda x: str(x),est_array))
+   dna_array   = "".join(map(lambda x: translation[int(x)],dna_array))
+   read_array  = "".join(map(lambda x: translation[int(x)],read_array))
 
-   translation = '-ACGTN_'    #how aligned est and dna sequence is displayed
-                              #(gap_char, 5 nucleotides, intron_char) 
-
-   # this array hold a number for each exons indicating its number of matches
+   # this array holds a number for each exon indicating its number of matches
    exonIdentities = [0]*num_exons
-   exonGaps = [0]*num_exons
-
-   est_array = map(lambda x: translation[int(x)],est_array)
-   dna_array = map(lambda x: translation[int(x)],dna_array)
+   exonGaps       = [0]*num_exons
 
-   gaps = 0
+   gaps     = 0
    identity = 0
-   idx = 0
+   idx      = 0
 
-   #if num_exons > 1:
-   #   pdb.set_trace()
-
-   for ridx in range(len(est_array)):
-      read_elem = est_array[ridx]
+   for ridx in range(len(read_array)):
+      read_elem = read_array[ridx]
       dna_elem = dna_array[ridx]
 
-      if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_':
+      if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
          idx += 1
          gaps = 0
          identity = 0
@@ -74,17 +66,16 @@ def alignment_reconstruct(current_prediction,num_exons):
 
 
 def create_alignment_file(current_loc,out_fname):
-   import numpy
-   #qparser.parse_reads(filtered_reads)
+   print 'Working on %s'%current_loc
 
    out_fh = open(out_fname,'w+')
-
    allPredictions = cPickle.load(open(current_loc))
    #allPredictions = current_loc
 
-   allPositions = {}
+   print 'Loaded %d examples' % len(allPredictions)
 
-   for current_prediction in allPredictions:
+   written_lines = 0
+   for pred_idx,current_prediction in enumerate(allPredictions):
       id = current_prediction['id']
 
       #current_ground_truth = qparser.fetch_read(id)
@@ -92,7 +83,7 @@ def create_alignment_file(current_loc,out_fname):
       #seq = current_ground_truth['seq']
       #q1 = current_ground_truth['prb']
 
-      seq         = current_prediction['est']
+      seq         = current_prediction['read']
       dna         = current_prediction['dna']
 
       # CHECK !!!
@@ -111,6 +102,10 @@ def create_alignment_file(current_loc,out_fname):
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
+      if len(qExonSizes) != num_exons:
+         print 'BUG exon sizes %d'%id
+         continue
+
       EST2GFF = False
       new_line = ''
 
@@ -170,22 +165,26 @@ def create_alignment_file(current_loc,out_fname):
 
       else:
 
-         num_exons = len(qExonSizes)
-         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+         try:
+            exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+         except:
+            print 'Bug in example %d (idx: %d)' %\
+            (current_prediction['id'],pred_idx)
+            continue
 
          pp = lambda x : str(x)[1:-1].replace(' ','')
 
          new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
          (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-         str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-         str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-         str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\
-         str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ',''))
+         pp(qExonSizes),pp(qStarts),\
+         pp(qEnds),pp(tExonSizes),\
+         pp(tStarts),pp(tEnds),\
+         pp(exonIdentities),pp(exonGaps))
+         written_lines += 1
 
       out_fh.write(new_line)
 
-   return allPositions
-
+   print 'Wrote out %d lines' % written_lines
 
 if __name__ == '__main__':
    current_dir = sys.argv[1]
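
The translation string replaces the old str()-based join: the alignment arrays
hold small integers that index into '-acgtn_'. A toy decoding example (arrays
made up for illustration):

   translation = '-acgtn_'   # gap_char, 5 nucleotides, intron_char
   dna_array   = [1, 2, 3, 6, 6, 4]
   print "".join(map(lambda x: translation[int(x)], dna_array))   # 'acg__t'

Exon boundaries are then counted whenever the decoded read goes from a
non-'_' character to '_', as in the loop in alignment_reconstruct above.
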
index 0959c5d..21b5d81 100644 (file)
@@ -3,20 +3,19 @@
 
 import cPickle
 import sys
+import time
 import pdb
 import os
 import os.path
 import math
 
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
 
 from createAlignmentFileFromPrediction import *
 
 import grid_alignment
 
-from Utils import split_file_join_results
-
-
 def g_alignment(chunk_fn,result_fn):
    create_alignment_file(chunk_fn,result_fn)
 
@@ -25,29 +24,28 @@ def create_and_submit():
    jp = os.path.join
 
    #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-   
-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
+   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'   
+   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
+
+   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+
+   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
 
    chunks_fn = []
    for fn in os.listdir(run_dir):
       if fn.startswith('chunk'):
          chunks_fn.append(fn)
 
-
-   chunks_fn = [\
-   'chunk_17.predictions.pickle',\
-   'chunk_18.predictions.pickle',\
-   'chunk_19.predictions.pickle',\
-   'chunk_20.predictions.pickle',\
-   'chunk_21.predictions.pickle',\
-   'chunk_23.predictions.pickle',\
-   'chunk_24.predictions.pickle'
-   ]
+   #chunks_fn = [\
+   #'chunk_0.predictions.pickle',\
+   #'chunk_4.predictions.pickle']
 
    print chunks_fn
 
-
    functionJobs=[]
 
    for chunk_fn in chunks_fn:
@@ -59,18 +57,16 @@ def create_and_submit():
 
       current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
       current_job.h_vmem = '15.0G'
-      #current_job.express = 'True'
+      current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
 
-      functionJobs.append(current_job)
-
-   processedFunctionJobs = processJobs(functionJobs)
-   print "ret fields AFTER execution on cluster"
-   for (i, job) in enumerate(processedFunctionJobs):
-      print "Job with id: ", i, "- ret: ", job.ret
+      functionJobs = [current_job]
+      #functionJobs.append(current_job)
+      (sid, jobids) = submit_jobs(functionJobs)
+      time.sleep(10)
 
+      #break
 
 if __name__ == '__main__':
-   #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
    create_and_submit()
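
Note the changed submission pattern in this file: instead of batching all jobs
into one processJobs() call, each KybJob is now submitted individually via
submit_jobs() with a sleep in between to throttle the scheduler. Schematically
(names as in the loop above; result_fn is derived per chunk):

   for chunk_fn in chunks_fn:
      current_job = KybJob(grid_alignment.g_alignment, [chunk_fn, result_fn])
      current_job.h_vmem  = '15.0G'
      current_job.express = 'True'
      (sid, jobids) = submit_jobs([current_job])
      time.sleep(10)   # throttle consecutive submissions
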
index 64381d1..78f9bd6 100644 (file)
@@ -4,54 +4,68 @@
 import cPickle
 import sys
 import pdb
+import time
 import os
 import os.path
 import math
 
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
 
 from PipelineHeuristic import *
 
 import grid_heuristic
 
-from Utils import split_file_join_results
+from Utils import split_file
+
 
 def g_heuristic(run_fname,data_fname,param_fname,result_fname):
    #print run_fname,data_fname,param_fname,result_fname
    ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
    ph1.filter()
-   
 
    return 'finished filtering set %s.' % data_fname
 
+
 def create_and_submit():
    jp = os.path.join
 
    run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0'
 
    run_fname      = jp(run_dir,'run_obj.pickle')
-   data_fname     = jp(data_dir,'map.vm')
+   original_map_fname = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/main/map.vm'
+   #split_file(original_map_fname,data_dir,25)
+
    param_fname    = jp(run_dir,'param_526.pickle')
 
    functionJobs=[]
 
-   for idx in range(10):
-      data_fname     = jp(data_dir,'map.vm.part_%d'%idx)
+   for idx in range(3,25):
+      data_fname     = jp(data_dir,'map.part_%d'%idx)
       result_fname   = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
 
+      #pdb.set_trace()
+
       current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
       current_job.h_vmem = '25.0G'
-      #current_job.express = 'True'
+      current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
 
       functionJobs.append(current_job)
 
-   processedFunctionJobs = processJobs(functionJobs)
-   print "ret fields AFTER execution on cluster"
-   for (i, job) in enumerate(processedFunctionJobs):
-      print "Job with id: ", i, "- ret: ", job.ret
+   (sid, jobids) = submit_jobs(functionJobs)
+   #print 'checking whether finished'
+   #while not get_status(sid, jobids):
+   #   time.sleep(7)
+   #print 'collecting jobs'
+   #retjobs = collect_jobs(sid, jobids, functionJobs)
+   #print "ret fields AFTER execution on cluster"
+   #for (i, job) in enumerate(retjobs):
+   #   print "Job #", i, "- ret: ", job.ret
+
+   #print '--------------'
 
 
 if __name__ == '__main__':
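
The commented-out block above preserves the synchronous wait-and-collect flow
for the new pythongrid API; re-enabled, it would read (same calls as imported
at the top of the file):

   (sid, jobids) = submit_jobs(functionJobs)
   while not get_status(sid, jobids):
      time.sleep(7)
   retjobs = collect_jobs(sid, jobids, functionJobs)
   for i, job in enumerate(retjobs):
      print "Job #", i, "- ret: ", job.ret
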
index 3d962f7..ebeecd2 100644 (file)
@@ -3,12 +3,14 @@
 
 import cPickle
 import sys
+import time
 import pdb
 import os
 import os.path
 import math
 
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
 
 from qpalma_main import *
 
@@ -45,7 +47,7 @@ def makeJobs(run,dataset_fn,chunks,param):
 
    for c_name,current_chunk in chunks:
       current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
-      current_job.h_vmem = '30.0G'
+      current_job.h_vmem = '20.0G'
       #current_job.express = 'True'
 
       print "job #1: ", current_job.nativeSpecification
@@ -69,19 +71,32 @@ def create_and_submit():
 
    param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
 
-   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
-   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
+   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.pickle'
+   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.keys.pickle'
+
+   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
+   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
+
+   #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.pickle'
+   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.keys.pickle'
+
+   run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.pickle'
+   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.keys.pickle'
 
    prediction_keys = cPickle.load(open(prediction_keys_fn))
 
    print 'Found %d keys for prediction.' % len(prediction_keys)
 
    num_splits = 25
+   #num_splits = 1
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
    for idx,slice in enumerate(slices):
-      c_name = 'chunk_%d' % idx
-      chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
+      #if idx != 0:
+         c_name = 'chunk_%d' % idx
+         chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
 
    functionJobs = makeJobs(run,dataset_fn,chunks,param)
 
@@ -89,23 +104,32 @@ def create_and_submit():
    for size in [len(elem) for name,elem in chunks]:
       sum += size
    
-   assert sum == len(prediction_keys)
+   #assert sum == len(prediction_keys)
 
    print 'Got %d job(s)' % len(functionJobs)
 
-   print "output ret field in each job before sending it onto the cluster"
-   for (i, job) in enumerate(functionJobs):
-      print "Job with id: ", i, "- ret: ", job.ret
+   #print "output ret field in each job before sending it onto the cluster"
+   #for (i, job) in enumerate(functionJobs):
+   #   print "Job with id: ", i, "- ret: ", job.ret
+
+   #print ""
+   #print "sending function jobs to cluster"
+   #print ""
+
+   #pdb.set_trace()
 
-   print ""
-   print "sending function jobs to cluster"
-   print ""
+   (sid, jobids) = submit_jobs(functionJobs)
 
-   processedFunctionJobs = processJobs(functionJobs)
+   #print 'checking whether finished'
+   #while not get_status(sid, jobids):
+   #   time.sleep(7)
+   #print 'collecting jobs'
+   #retjobs = collect_jobs(sid, jobids, functionJobs)
+   #print "ret fields AFTER execution on cluster"
+   #for (i, job) in enumerate(retjobs):
+   #   print "Job #", i, "- ret: ", job.ret
 
-   print "ret fields AFTER execution on cluster"
-   for (i, job) in enumerate(processedFunctionJobs):
-      print "Job with id: ", i, "- ret: ", job.ret
+   #print '--------------'
 
 
 def g_predict(run,dataset_fn,prediction_keys,param,set_name):
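
get_slices itself is not shown in this diff; a minimal sketch consistent with
its use in create_and_submit above (hypothetical implementation, returning
(begin, end) index pairs that cover all prediction keys):

   import math

   def get_slices(num_keys, num_splits):
      # ceil division, so the last slice may be shorter than the others
      part = int(math.ceil(num_keys / float(num_splits)))
      return [(b, min(b + part, num_keys)) for b in range(0, num_keys, part)]

   print get_slices(10, 3)   # [(0, 4), (4, 8), (8, 10)]
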
index d0a17ad..ee70a37 100644 (file)
 import sys
 import cPickle
 import pdb
-import re
 import os.path
+import array
 
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import get_seq_and_scores,reverse_complement,unbracket_seq
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
@@ -31,23 +31,19 @@ import qpalma
 
 #from qpalma.SIQP_CPX import SIQPSolver
 #from qpalma.SIQP_CVXOPT import SIQPSolver
-import array
 
-from qpalma.DataProc import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
 from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
+from qpalma.Plif import Plf,compute_donacc
 
-from qpalma.Configuration import *
+import QPalmaConfiguration as Conf
 
-# this two imports are needed for the load genomic resp. interval query
+# these two imports are needed for the load genomic resp. interval query
 # functions
-from Genefinding import *
-from genome_utils import load_genomic
+#from Genefinding import *
+#from genome_utils import load_genomic
 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
@@ -75,13 +71,11 @@ def getData(training_set,exampleKey,run):
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
    # splice score is located at g of ag
-   ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
-   assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+   #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+   #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+   #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
 
-   gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
-   assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
-
-   #original_exons = Exons[exampleIdx]
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -149,8 +143,8 @@ class QPalma:
       # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
       currentAlignment.myalign( current_num_path, dna, dna_len,\
        est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-       acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-       print_matrix)
+       acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
+       run['print_matrix'] )
 
       c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
       c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
@@ -576,7 +570,8 @@ class QPalma:
       """
       self.run = run
 
-      full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      full_working_path = run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
 
@@ -612,13 +607,14 @@ class QPalma:
       # we do not need the full dataset anymore
       del dataset
 
-      # Set the parameters such as limits penalties for the Plifs
+      # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
       set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
 
       #############################################################################################
       # Prediction
       #############################################################################################
+
       self.plog('Starting prediction...\n')
 
       donSP       = self.run['numDonSuppPoints']
@@ -635,40 +631,39 @@ class QPalma:
       # where we store the predictions
       allPredictions = []
 
+      # we take the first quality vector of the tuple of quality vectors
+      quality_index = 0
+
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
          print 'Current example %d' % example_key
 
          for example in prediction_set[example_key]:
 
-            currentSeqInfo,original_est,currentQualities = example
+            currentSeqInfo,original_read,currentQualities = example
 
-            id,chr,strand,genomicSeq_start,genomicSeq_stop =\
+            id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
             currentSeqInfo 
 
-            assert id == example_key
+            # remove brackets from the original read annotation
+            read = unbracket_seq(original_read)
 
-            if not chr in range(1,6):
-               continue
+            if strand == '-':
+               read = reverse_complement(read)
 
             self.plog('Loading example id: %d...\n'% int(id))
 
-            est = original_est
-            est = unbracket_est(est)
-
             if run['mode'] == 'normal':
-               quality = [40]*len(est)
+               quality = [40]*len(read)
 
             if run['mode'] == 'using_quality_scores':
-               quality = currentQualities[0]
+               quality = currentQualities[quality_index]
 
             if not run['enable_quality_scores']:
-               quality = [40]*len(est)
-
-            current_example_predictions = []
+               quality = [40]*len(read)
 
             try:
-               currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+               currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
             except:
                problem_ctr += 1
                continue
@@ -682,13 +677,13 @@ class QPalma:
                   if elem != -inf:
                      currentAcc[idx] = 0.0
 
-            current_prediction = self.calc_alignment(currentDNASeq, est,\
+            current_prediction = self.calc_alignment(currentDNASeq, read,\
             quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
 
             current_prediction['id'] = id
             #current_prediction['start_pos'] = up_cut
             current_prediction['start_pos'] = genomicSeq_start
-            current_prediction['chr'] = chr
+            current_prediction['chr'] = chromo
             current_prediction['strand'] = strand
 
             allPredictions.append(current_prediction)
@@ -703,7 +698,7 @@ class QPalma:
       self.logfh.close()
 
 
-   def calc_alignment(self, dna, est, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
       """
      Given two sequences and the parameters we calculate an alignment
       """
@@ -712,21 +707,21 @@ class QPalma:
       donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
       dna = str(dna)
-      est = str(est)
+      read = str(read)
 
-      if '-' in est:
+      if '-' in read:
          self.plog('found gap\n')
-         est = est.replace('-','')
-         assert len(est) == Conf.read_size
+         read = read.replace('-','')
+         assert len(read) == Conf.read_size
 
       dna_len = len(dna)
-      est_len = len(est)
+      read_len = len(read)
 
       ps = h.convert2SWIG()
 
       newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-      newQualityPlifsFeatures, dna_array, est_array =\
-      self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+      newQualityPlifsFeatures, dna_array, read_array =\
+      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
 
       mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
@@ -740,10 +735,10 @@ class QPalma:
       _newSpliceAlign   = array.array('B',newSpliceAlign.flatten().tolist()[0])
       _newEstAlign      = array.array('B',newEstAlign.flatten().tolist()[0])
        
-      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
 
       dna_array = array.array('B',dna_array)
-      est_array = array.array('B',est_array)
+      read_array = array.array('B',read_array)
 
       #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
       #self.plog(line1+'\n')
@@ -752,10 +747,10 @@ class QPalma:
 
       newExons = self.calculatePredictedExons(newSpliceAlign)
 
-      current_prediction = {'predExons':newExons, 'dna':dna, 'est':est, 'DPScores':newDPScores,\
+      current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
       'alignment':alignment,\
       'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
-      'dna_array':dna_array, 'est_array':est_array }
+      'dna_array':dna_array, 'read_array':read_array }
 
       return current_prediction
 
@@ -779,6 +774,7 @@ class QPalma:
 
       return newExons
 
+
 ###########################
 # A simple command line 
 # interface
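
Downstream consumers of the prediction pickles must switch to the key names
renamed in this commit; illustrative access (allPredictions as written by the
prediction loop above):

   for pred in allPredictions:
      read       = pred['read']         # was pred['est']
      read_array = pred['read_array']   # was pred['est_array']
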