+ minor changes in the paths
author: fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 1 Sep 2008 12:21:29 +0000 (12:21 +0000)
committer: fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 1 Sep 2008 12:21:29 +0000 (12:21 +0000)
+ fixed some stuff in the approximation
+ freeze version for git checkout

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10491 e1793c9e-67f9-0310-80fc-b846ff1f7b36

14 files changed:
qpalma/Lookup.py
qpalma/gridtools.py
scripts/PipelineHeuristic.py
scripts/grid_alignment.py
scripts/grid_heuristic.py
scripts/grid_predict.py
tests/test_sequence_utils.py
tools/data_tools/parser.c
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py
tools/run_specific_scripts/transcriptome_analysis/createGenefindingInfo.sh
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py

index 7d78bcb..8b4ac2a 100644 (file)
@@ -43,7 +43,7 @@ class LookupTable:
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
-   def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+   def get_seq_and_scores(self,chromo,strand,start,stop):
       assert chromo in self.chromo_list
       assert strand in self.strands
 
index be0a7fe..da809c3 100644 (file)
@@ -1,6 +1,14 @@
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*- 
 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
 import cPickle
 import math
 import os
index b19eb63..f4050bc 100644 (file)
@@ -15,8 +15,6 @@ from qpalma.computeSpliceAlignWithQuality import *
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
-#from qpalma.parsers import *
-
 from ParaParser import ParaParser,IN_VECTOR
 
 from qpalma.Lookup import LookupTable
@@ -91,8 +89,8 @@ class PipelineHeuristic:
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'
 
-      #self.chromo_range = range(1,6)
-      self.chromo_range = range(1,2)
+      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,2)
 
       num_chromo = 2
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
@@ -123,6 +121,8 @@ class PipelineHeuristic:
       self.qualityPlifs = qualityPlifs
 
       self.read_size    = 38
+      self.prb_offset   = 64
+      #self.prb_offset   = 50
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -257,12 +257,13 @@ class PipelineHeuristic:
          genomicSeq_stop   = pos+effective_len
 
          start = cpu()
-         currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
-         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
 
-         assert currentDNASeq_ == currentDNASeq
-         assert currentAcc_ == currentAcc_
-         assert currentDon_ == currentDon_
+         # just performed some tests to check LookupTable results
+         #assert currentDNASeq_ == currentDNASeq
+         #assert currentAcc_ == currentAcc_
+         #assert currentDon_ == currentDon_
 
          stop = cpu()
          self.get_time += stop-start
@@ -273,19 +274,19 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = map(lambda x:ord(x)-64,prb)
+         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
 
          currentVMatchAlignment = dna, exons, est, original_est, quality,\
          currentAcc, currentDon
 
          #pdb.set_trace()
 
-         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         #try:
-         #   alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         #except:
-         #   alternativeAlignmentScores = []
+         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
+         try:
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
 
          if alternativeAlignmentScores == []:
              # no alignment necessary
@@ -295,10 +296,11 @@ class PipelineHeuristic:
              maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
              # compute alignment for vmatch unspliced read
              vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+             
 
          start = cpu()
          
-         #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
          #print 'all candidates %s' % str(alternativeAlignmentScores)
 
          new_id = id - 1000000300000
@@ -370,7 +372,6 @@ class PipelineHeuristic:
       #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
       #distal_acc=distal_acc[-2:]
 
-
       proximal_don   = []
       for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
          if currentDon[idx] >= splice_thresh:
@@ -405,7 +406,7 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      quality        = map(lambda x:ord(x)-64,quality)
+      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
       
       original_est = original_est.lower()
       est = unbracket_seq(original_est)
@@ -423,14 +424,16 @@ class PipelineHeuristic:
 
       start = cpu()
       #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
-      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
-   
+
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
        
       alternativeScores = []
+
+      #pdb.set_trace()
       
       # inlined
       h = self.h
@@ -519,6 +522,8 @@ class PipelineHeuristic:
               if acc_score>=self.splice_stop_thresh:
                   break
               
+      #pdb.set_trace()
+
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start
 
index 93457e0..68c12c8 100644 (file)
@@ -24,9 +24,9 @@ def g_alignment(chunk_fn,result_fn):
 def create_and_submit():
    jp = os.path.join
 
-   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
-   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/alignment'
-
+   run_dir     = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/alignment'
+   
    chunks_fn = []
    for fn in os.listdir(run_dir):
       if fn.startswith('chunk'):
index 77b93c1..1fdd697 100644 (file)
@@ -30,17 +30,18 @@ def g_heuristic(run_fname,data_fname,param_fname,result_fname):
 def create_and_submit():
    jp = os.path.join
 
-   num_splits = 25
+   num_splits = 50
 
    run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
    #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
 
-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   data_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
 
    run_fname      = jp(run_dir,'run_obj.pickle')
 
-   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
-   #split_file(original_map_fname,data_dir,num_splits)
+   original_map_fname = jp(data_dir,'map.vm')
+
+   split_file(original_map_fname,data_dir,num_splits)
 
    param_fname    = jp(run_dir,'param_526.pickle')
 
index 95f8306..25197cb 100644 (file)
@@ -71,15 +71,19 @@ def create_and_submit():
 
    param_fn = jp(run_dir,'param_526.pickle')
 
-   run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
-   dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
-   prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
+   #run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/prediction'
+   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.pickle'
+   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.keys.pickle'
+
+   run['result_dir']    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+   dataset_fn           = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
+   prediction_keys_fn   = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'
 
    prediction_keys = cPickle.load(open(prediction_keys_fn))
 
    print 'Found %d keys for prediction.' % len(prediction_keys)
 
-   num_splits = 25
+   num_splits = 50
    #num_splits = 1
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
index b83c40a..00e8870 100644 (file)
@@ -26,6 +26,8 @@ class TestSequenceUtils(unittest.TestCase):
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
       self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
+      print self.seqInfo.chromo_sizes
+
       #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
 
 
@@ -182,7 +184,7 @@ class TestLookupTable(unittest.TestCase):
       pass
 
 
-   def _testTableThalianaData(self):
+   def testTableThalianaData(self):
       g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
       acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
       don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
@@ -190,10 +192,31 @@ class TestLookupTable(unittest.TestCase):
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'
 
-      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
+      num_chromo = 2
+
+      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,num_chromo))
+
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
+      seq = reverse_complement(seq)
+      chromo = 1
+      strand = '-'
+      pos = 10475515
+      dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      dna = lt1.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      pdb.set_trace()
+      self.assertEqual(seq,dna)
+
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
+      dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
+
 
 
-   def testTableLyrataData(self):
+   def _testTableLyrataData(self):
       g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
       acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
       don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
@@ -236,6 +259,6 @@ def check_wrapper():
 
 
 if __name__ == '__main__':
-   suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
-   #suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
+   suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
    unittest.TextTestRunner(verbosity=2).run(suite)
index 6a0d4eb..c314e05 100644 (file)
  * In the current gff files from the TAIR repository
  * the exons boundaries are defined as follows: 
  *
- * exon start points at:
+ * exon starts at:
  *
  * agxy
  *    ^
- * whereas exon stop points at:
+ * whereas exon stops at:
  *
  * xygt
  *   ^
index fe51eaa..86e174e 100644 (file)
@@ -25,19 +25,90 @@ int* pred_intron_starts;
 int* pred_intron_stops;
 int pred_size;
 
-int matching_introns;
+int matching_values;
 
 static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
 
+static int chromo_sizes[5] = {30432563, 19705359, 23470805, 18585042, 26992728};
+
+short strand_info;
+
+
+void check_seeds() {
+
+   matching_values = 0;
+   int pred_idx, gt_idx, chromo, tmp;
+   int pos,gt_start,gt_stop,pred_start,pred_stop;
+
+   printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
+
+   for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
+
+      pred_start = pos-1500;
+      pred_stop = pos+1500;
+      pos = pred_intron_stops[pred_idx];
+      chromo = pred_intron_starts[pred_idx];
+
+      for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
+
+         gt_start = gt_intron_starts[gt_idx];
+         gt_stop = gt_intron_stops[gt_idx];
+
+         if(strand_info == 0) {
+            tmp = gt_start;
+            gt_start = chromo_sizes[chromo] - gt_stop;
+            gt_stop =  chromo_sizes[chromo] - tmp;
+         }
+
+         //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
+         if ( pred_start <= gt_start && gt_stop <= pred_stop ) {
+            matching_values++;
+            break;
+         }
+
+         //if (gt_start == pred_start && gt_stop == pred_stop)
+         //   matching_values++;
+   }}
+}
+
+
+void compare_introns() {
+
+   matching_values = 0;
+   int pred_idx, gt_idx;
+   int gt_start,gt_stop,pred_start,pred_stop;
+
+   printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
+
+   for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
+
+      pred_start = pred_intron_starts[pred_idx];
+      pred_stop = pred_intron_stops[pred_idx];
+
+      for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
+
+         gt_start = gt_intron_starts[gt_idx];
+         gt_stop = gt_intron_stops[gt_idx];
+
+         //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
+         if ( fabs(gt_start - pred_start) <= 2 && fabs(gt_stop - pred_stop) <= 2) {
+            matching_values++;
+            break;
+         }
+         //if (gt_start == pred_start && gt_stop == pred_stop)
+         //   matching_values++;
+   }}
+}
+
 
 /* 
  * This function compares the intron boundaries of the two files.
  *
  */
 
-void compare_introns() {
+void _compare_introns() {
 
-   matching_introns = 0;
+   matching_values = 0;
    
    int gt_idx = 0;
    int pred_idx = 0;
@@ -57,7 +128,18 @@ void compare_introns() {
 
       //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
       if (gt_start == pred_start && gt_stop == pred_stop) {
-         matching_introns++;
+         matching_values++;
+
+         // take doubles into account
+#if 1
+         int idx = pred_idx+1;
+         for(;idx<pred_size;idx++) {
+            if (pred_intron_starts[idx] == pred_start && pred_intron_stops[idx] == pred_stop)
+               matching_values++;
+            else
+               break;
+         }
+#endif
          gt_idx++;
          pred_idx++;
          continue;
@@ -110,6 +192,7 @@ void load_introns(char* fn, int** starts, int** stops, int* size) {
    while( getline(&current_line,&line_size,fs) != -1 ) (*size)++;
 
    // now we allocate memory to store all positions
+   //*id      = malloc(sizeof(char)*(*size));
    *starts  = malloc(sizeof(int)*(*size));
    *stops   = malloc(sizeof(int)*(*size));
 
@@ -122,6 +205,7 @@ void load_introns(char* fn, int** starts, int** stops, int* size) {
       exit(EXIT_FAILURE);
    }
 
+   //char* id_string  = malloc(sizeof(char)*line_size);
    char* first_num  = malloc(sizeof(char)*line_size);
    char* second_num = malloc(sizeof(char)*line_size);
 
@@ -152,7 +236,8 @@ void load_introns(char* fn, int** starts, int** stops, int* size) {
       ctr++;
    }
 
-   printf("ctr is %d\n",ctr);
+   //printf("ctr is %d\n",ctr);
+
    if (fclose(fs) != 0)
       perror("Closing of filestream failed!");
 }
@@ -160,7 +245,7 @@ void load_introns(char* fn, int** starts, int** stops, int* size) {
 
 int main(int argc, char* argv[]) {
 
-   if(argc != 4) {
+   if(argc != 6) {
       printf("%s\n",info);
       exit(EXIT_FAILURE);
    }
@@ -173,9 +258,29 @@ int main(int argc, char* argv[]) {
    if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
       perror("Could not allocate memory for filenames");
 
-   strncpy(gt_fn,argv[1],strlen(argv[1]));
-   strncpy(pred_fn,argv[2],strlen(argv[2]));
-   strncpy(result_fn,argv[3],strlen(argv[3]));
+   short intron_mode = -1;
+   if ( strcmp(argv[1],"-intron") == 0 )
+      intron_mode = 1;
+
+   if ( strcmp(argv[1],"-seed") == 0 )
+      intron_mode = 0;
+
+   if (intron_mode == -1)
+      perror("You have to choose between -intron or -seed mode!");
+
+   strand_info = -1;
+   if ( strcmp(argv[2],"-strand=+") == 0 )
+      strand_info = 1;
+
+   if ( strcmp(argv[2],"-strand=-") == 0 )
+      strand_info = 0;
+
+   if (strand_info == -1)
+      perror("You have to choose between -strand=+ or -strand=- !");
+
+   strncpy(gt_fn,argv[3],strlen(argv[3]));
+   strncpy(pred_fn,argv[4],strlen(argv[4]));
+   strncpy(result_fn,argv[5],strlen(argv[5]));
 
    FILE *result_fs = fopen(result_fn,"w+");
    if(result_fs == NULL) {
@@ -186,7 +291,10 @@ int main(int argc, char* argv[]) {
    load_introns(gt_fn,&gt_intron_starts,&gt_intron_stops,&gt_size);
    load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
 
-   compare_introns();
+   if (intron_mode == 1)
+      compare_introns();
+   else
+      check_seeds();
 
    int f_status = fclose(result_fs);
    if(f_status != 0)
@@ -196,6 +304,6 @@ int main(int argc, char* argv[]) {
    free(pred_fn);
    free(result_fn);
 
-   printf("Found %d matching intron(s).\n",matching_introns);
+   printf("Found %d matching intron(s)/seed(s).\n",matching_values);
    exit(EXIT_SUCCESS);
 }
index 3e4f093..7cc028c 100644 (file)
@@ -56,7 +56,9 @@ class DatasetGenerator:
 
       self.dataset = []
 
-      self.read_size = Conf.read_size
+      self.read_size = 38
+      #self.prb_offset = 50
+      self.prb_offset = 64
 
       self.half_window_size = 1500
 
@@ -70,6 +72,7 @@ class DatasetGenerator:
       assert not os.path.exists(dataset_fn), 'The data_file already exists!'
       assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
 
+      #pdb.set_trace()
       # saving new dataset and single keys as well
       cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
       cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
@@ -161,7 +164,8 @@ class DatasetGenerator:
 
          # In order to save some space we use a signed char to store the
          # qualities. Each quality element can range as follows: -128 <= elem <= 127
-         prb = array.array('b',map(lambda x: ord(x)-64,slist[5]))
+         
+         prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
index dba0b67..db3200c 100755 (executable)
@@ -69,6 +69,13 @@ def run(chunk_dir,outfile):
       starts   = [int(elem) + start_pos for elem in starts]
       ends     = [int(elem) + start_pos for elem in ends]
 
+      if strand == '+':
+         starts   = [e+1 for e in starts]
+         ends     = [e+1 for e in ends]
+      else:
+         starts   = [e-3001 for e in starts]
+         ends     = [e-3001 for e in ends]
+
       #out_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),str(start_pos),pp(ids),pp(gaps)))
       out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
 
index 1d58abe..f7a6b3f 100755 (executable)
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-for((idx=1;idx<4;idx++))
+for((idx=1;idx<2;idx++))
 do
-   current_dir=/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_${idx}
+   current_dir=/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_${idx}/spliced
    input=$current_dir/alignment 
    result=$current_dir/alignment/alignmentInfo.genefinding
    result2=$current_dir/alignment/alignmentInfo.genefinding.chr1_only
@@ -11,7 +11,8 @@ do
    intron_starts=$current_dir/alignment/intron_starts
    intron_info=$current_dir/alignment/intron_info
 
-   #python createExonInfoForGenefinding.py $input $result
+   python createExonInfoForGenefinding.py $input $result
+
    #cat $result | grep '^1' > $result2
    #cat $result2 | sort > $result3
 
index 19cb0c7..ffa07ab 100644 (file)
@@ -2,28 +2,52 @@
 # -*- coding: utf-8 -*-
 
 import os.path
+import sys
 import cProfile
 
 from compile_dataset import DatasetGenerator
 
+jp = os.path.join
+
+def combine_spliced_map_parts(data_dir,result_fn):
+   """
+   """
+   
+   assert os.path.exists(data_dir)
+
+   result_fn = jp(data_dir,result_fn)
+
+   if os.path.exists(result_fn):
+      os.remove(result_fn)
+
+   for chunk_fn in os.listdir(data_dir):
+      if chunk_fn.endswith('.spliced'):
+         full_chunk_fn = jp(data_dir,chunk_fn)
+         cmd = 'cat %s >>  %s' % (full_chunk_fn,result_fn)
+         os.system(cmd)
+
+   cmd = "cat %s | sed -e \'/^#/d\' > tmp ; mv tmp %s" % (result_fn,result_fn)
+   os.system(cmd)
+         
 
 def run():
-   jp = os.path.join
 
-   main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
+   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
 
-   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
-   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
+   main_dir    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
 
-   spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
-   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
+   combine_spliced_map_parts(main_dir,'map.vm.spliced')
 
    map_1_fn = jp(main_dir,'map.vm.spliced')
    map_2_fn = jp(spliced_dir,'map.vm')
 
    dg = DatasetGenerator(map_1_fn,map_2_fn)
    dg.compile_dataset()
-   dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
+   dg.saveAs(jp(result_dir,'dataset_run_1'))
 
 if __name__ == '__main__':
    #cProfile.run('run()')