From 3244420704b220bf95694ae0d49cbba3653a68ac Mon Sep 17 00:00:00 2001
From: fabio
Date: Mon, 1 Sep 2008 12:21:29 +0000
Subject: [PATCH] + minor changes in the paths + fixed some stuff in the
 approximation + freeze version for git checkout

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10491 e1793c9e-67f9-0310-80fc-b846ff1f7b36
---
 qpalma/Lookup.py                                   |   2 +-
 qpalma/gridtools.py                                |   8 ++
 scripts/PipelineHeuristic.py                       |  45 +++---
 scripts/grid_alignment.py                          |   6 +-
 scripts/grid_heuristic.py                          |   9 +-
 scripts/grid_predict.py                            |  12 +-
 tests/test_sequence_utils.py                       |  33 ++++-
 tools/data_tools/parser.c                          |   4 +-
 .../compare_predictions/compare.c                  | 130 ++++++++++++++++--
 .../compare_predictions/pred_test                  |   5 +
 .../transcriptome_analysis/compile_dataset.py      |   8 +-
 .../createExonInfoForGenefinding.py                |   7 +
 .../createGenefindingInfo.sh                       |   7 +-
 .../createNewDataset.py                            |  38 ++++-
 14 files changed, 252 insertions(+), 62 deletions(-)

diff --git a/qpalma/Lookup.py b/qpalma/Lookup.py
index 7d78bcb..8b4ac2a 100644
--- a/qpalma/Lookup.py
+++ b/qpalma/Lookup.py
@@ -43,7 +43,7 @@ class LookupTable:

       self.prefetch_seq_and_scores(chrId,strandId)

-   def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+   def get_seq_and_scores(self,chromo,strand,start,stop):
       assert chromo in self.chromo_list
       assert strand in self.strands

diff --git a/qpalma/gridtools.py b/qpalma/gridtools.py
index be0a7fe..da809c3 100644
--- a/qpalma/gridtools.py
+++ b/qpalma/gridtools.py
@@ -1,6 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
 import cPickle
 import math
 import os
diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
index b19eb63..f4050bc 100644
--- a/scripts/PipelineHeuristic.py
+++ b/scripts/PipelineHeuristic.py
@@ -15,8 +15,6 @@ from qpalma.computeSpliceAlignWithQuality import *
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean

-#from qpalma.parsers import *
-
 from ParaParser import ParaParser,IN_VECTOR

 from qpalma.Lookup import LookupTable
@@ -91,8 +89,8 @@ class PipelineHeuristic:
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'

-      #self.chromo_range = range(1,6)
-      self.chromo_range = range(1,2)
+      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,2)

       num_chromo = 2

       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
@@ -123,6 +121,8 @@ class PipelineHeuristic:
       self.qualityPlifs = qualityPlifs

       self.read_size = 38
+      self.prb_offset = 64
+      #self.prb_offset = 50

       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh = 0.005
@@ -257,12 +257,13 @@ class PipelineHeuristic:
       genomicSeq_stop = pos+effective_len

       start = cpu()
-      currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
-      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+      #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)

-      assert currentDNASeq_ == currentDNASeq
-      assert currentAcc_ == currentAcc_
-      assert currentDon_ == currentDon_
+      # just performed some tests to check LookupTable results
+      #assert currentDNASeq_ == currentDNASeq
+      #assert currentAcc_ == currentAcc_
+      #assert currentDon_ == currentDon_

       stop = cpu()
       self.get_time += stop-start
@@ -273,19 +274,19 @@ class PipelineHeuristic:
       exons[1,0] = effective_len
       est = unb_seq
       original_est = seq
-      quality = map(lambda x:ord(x)-64,prb)
+      quality = map(lambda x:ord(x)-self.prb_offset,prb)

       currentVMatchAlignment = dna, exons, est, original_est, quality,\
         currentAcc, currentDon

       #pdb.set_trace()

-      alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-      #try:
-      #   alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-      #except:
-      #   alternativeAlignmentScores = []
+      #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+      try:
+         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+      except:
+         alternativeAlignmentScores = []

       if alternativeAlignmentScores == []:
          # no alignment necessary
@@ -295,10 +296,11 @@ class PipelineHeuristic:
          maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
          # compute alignment for vmatch unspliced read
          vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+         #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+

       start = cpu()

-      #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
       #print 'all candidates %s' % str(alternativeAlignmentScores)

       new_id = id - 1000000300000
@@ -370,7 +372,6 @@ class PipelineHeuristic:
       #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
       #distal_acc=distal_acc[-2:]

-
       proximal_don = []
       for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
         if currentDon[idx] >= splice_thresh:
@@ -405,7 +406,7 @@ class PipelineHeuristic:
       strand = location['strand']
       original_est = location['seq']
       quality = location['prb']
-      quality = map(lambda x:ord(x)-64,quality)
+      quality = map(lambda x:ord(x)-self.prb_offset,quality)

       original_est = original_est.lower()
       est = unbracket_seq(original_est)
@@ -423,14 +424,16 @@ class PipelineHeuristic:
       start = cpu()
       #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
-      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
       stop = cpu()
       self.get_time += stop-start

       dna = currentDNASeq
-
+
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)

       alternativeScores = []
+
+      #pdb.set_trace()

       # inlined
       h = self.h
@@ -519,6 +522,8 @@ class PipelineHeuristic:
             if acc_score>=self.splice_stop_thresh:
                break

+      #pdb.set_trace()
+
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start

diff --git a/scripts/grid_alignment.py b/scripts/grid_alignment.py
index 93457e0..68c12c8 100644
--- a/scripts/grid_alignment.py
+++ b/scripts/grid_alignment.py
@@ -24,9 +24,9 @@ def g_alignment(chunk_fn,result_fn):
 def create_and_submit():
    jp = os.path.join

-   run_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
-   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/alignment'
-
+   run_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+   result_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/alignment'
+
    chunks_fn = []
    for fn in os.listdir(run_dir):
       if fn.startswith('chunk'):
diff --git a/scripts/grid_heuristic.py b/scripts/grid_heuristic.py
index 77b93c1..1fdd697 100644
--- a/scripts/grid_heuristic.py
+++ b/scripts/grid_heuristic.py
@@ -30,17 +30,18 @@ def g_heuristic(run_fname,data_fname,param_fname,result_fname):
 def create_and_submit():
    jp = os.path.join

-   num_splits = 25
+   num_splits = 50

    run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
    #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'

-   data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   data_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'

    run_fname = jp(run_dir,'run_obj.pickle')

-   #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
-   #split_file(original_map_fname,data_dir,num_splits)
+   original_map_fname = jp(data_dir,'map.vm')
+
+   split_file(original_map_fname,data_dir,num_splits)

    param_fname = jp(run_dir,'param_526.pickle')

diff --git a/scripts/grid_predict.py b/scripts/grid_predict.py
index 95f8306..25197cb 100644
--- a/scripts/grid_predict.py
+++ b/scripts/grid_predict.py
@@ -71,15 +71,19 @@ def create_and_submit():

    param_fn = jp(run_dir,'param_526.pickle')

-   run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
-   dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
-   prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
+   #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/prediction'
+   #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.pickle'
+   #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.keys.pickle'
+
+   run['result_dir'] = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+   dataset_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
+   prediction_keys_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'

    prediction_keys = cPickle.load(open(prediction_keys_fn))

    print 'Found %d keys for prediction.' % len(prediction_keys)

-   num_splits = 25
+   num_splits = 50
    #num_splits = 1
    slices = get_slices(len(prediction_keys),num_splits)
    chunks = []
diff --git a/tests/test_sequence_utils.py b/tests/test_sequence_utils.py
index b83c40a..00e8870 100644
--- a/tests/test_sequence_utils.py
+++ b/tests/test_sequence_utils.py
@@ -26,6 +26,8 @@ class TestSequenceUtils(unittest.TestCase):
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
       self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))

+      print self.seqInfo.chromo_sizes
+
       #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))


@@ -182,7 +184,7 @@ class TestLookupTable(unittest.TestCase):
       pass


-   def _testTableThalianaData(self):
+   def testTableThalianaData(self):
       g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
       acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
       don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
@@ -190,10 +192,31 @@ class TestLookupTable(unittest.TestCase):
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'

-      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
+      num_chromo = 2
+
+      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,num_chromo))
+
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
+      seq = reverse_complement(seq)
+      chromo = 1
+      strand = '-'
+      pos = 10475515
+      dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      dna = lt1.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      pdb.set_trace()
+      self.assertEqual(seq,dna)
+
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
+      dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
+

-   def testTableLyrataData(self):
+   def _testTableLyrataData(self):
       g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
       acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
       don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
@@ -236,6 +259,6 @@ def check_wrapper():

 if __name__ == '__main__':
-   suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
-   #suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
+   suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
    unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tools/data_tools/parser.c b/tools/data_tools/parser.c
index 6a0d4eb..c314e05 100644
--- a/tools/data_tools/parser.c
+++ b/tools/data_tools/parser.c
@@ -11,11 +11,11 @@
  * In the current gff files from the TAIR repository
  * the exons boundaries are defined as follows:
  *
- * exon start points at:
+ * exon starts at:
  *
  * agxy
  *   ^
- * whereas exon stop points at:
+ * whereas exon stops at:
  *
  * xygt
  *   ^
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
index fe51eaa..86e174e 100644
--- a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
+++ b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
@@ -25,19 +25,90 @@ int* pred_intron_starts;
 int* pred_intron_stops;
 int pred_size;

-int matching_introns;
+int matching_values;

 static char* info = "Usage: ./ ";

+static int chromo_sizes[5] = {30432563, 19705359, 23470805, 18585042, 26992728};
+
+short strand_info;
+
+
+void check_seeds() {
+
+   matching_values = 0;
+   int pred_idx, gt_idx, chromo, tmp;
+   int pos,gt_start,gt_stop,pred_start,pred_stop;
+
+   printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
+
+   for(pred_idx=0;pred_idx $result2
 #cat $result2 | sort > $result3
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py b/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
index 19cb0c7..ffa07ab 100644
--- a/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
+++ b/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
@@ -2,28 +2,52 @@
 # -*- coding: utf-8 -*-

 import os.path
+import sys
 import cProfile

 from compile_dataset import DatasetGenerator

+jp = os.path.join
+
+def combine_spliced_map_parts(data_dir,result_fn):
+   """
+   """
+
+   assert os.path.exists(data_dir)
+
+   result_fn = jp(data_dir,result_fn)
+
+   if os.path.exists(result_fn):
+      os.remove(result_fn)
+
+   for chunk_fn in os.listdir(data_dir):
+      if chunk_fn.endswith('.spliced'):
+         full_chunk_fn = jp(data_dir,chunk_fn)
+         cmd = 'cat %s >> %s' % (full_chunk_fn,result_fn)
+         os.system(cmd)
+
+   cmd = "cat %s | sed -e \'/^#/d\' > tmp ; mv tmp %s" % (result_fn,result_fn)
+   os.system(cmd)
+
 def run():
-   jp = os.path.join
-   main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
+   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'

-   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
-   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
+   main_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+   result_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'

-   spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
-   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
+   combine_spliced_map_parts(main_dir,'map.vm.spliced')

    map_1_fn = jp(main_dir,'map.vm.spliced')
    map_2_fn = jp(spliced_dir,'map.vm')

    dg = DatasetGenerator(map_1_fn,map_2_fn)
    dg.compile_dataset()
-   dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
+   dg.saveAs(jp(result_dir,'dataset_run_1'))

 if __name__ == '__main__':
    #cProfile.run('run()')
-- 
2.20.1
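
On the quality handling touched above: PipelineHeuristic.py now subtracts a configurable self.prb_offset (64, with 50 kept as a commented-out alternative) instead of a hard-coded 64 when turning the ASCII-encoded prb string into integer quality values. A minimal sketch of that decoding step, assuming prb is an ASCII-offset-encoded quality string as in the patch; decode_prb is a hypothetical helper, not a QPalma function:

def decode_prb(prb, offset=64):
    # Map each ASCII character to an integer quality score. offset=64
    # mirrors self.prb_offset in the patch; reads encoded with another
    # convention would use a different offset (e.g. the commented-out 50,
    # or 33 for Sanger-style FASTQ).
    return [ord(c) - offset for c in prb]

# 'h' is ASCII 104, so it decodes to quality 40 with offset 64.
assert decode_prb('hhhh') == [40, 40, 40, 40]

Keeping the offset on the object makes switching quality conventions a one-field change instead of a hunt for literal 64s.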
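
combine_spliced_map_parts() in createNewDataset.py merges every '*.spliced' chunk file in data_dir into one result file and then strips '#' comment lines by shelling out to cat and sed. The same merge can be done with plain file I/O in a single pass; the following is only a sketch of that alternative (the sorted() ordering and the explicit self-exclusion of the result file are choices made here, not part of the patch):

import os

def combine_spliced_map_parts(data_dir, result_fn):
    # Snapshot the chunk list before creating the output so the merged file
    # (e.g. 'map.vm.spliced', which itself matches '*.spliced') is never
    # read back in as one of its own inputs.
    chunks = sorted(fn for fn in os.listdir(data_dir)
                    if fn.endswith('.spliced') and fn != result_fn)

    with open(os.path.join(data_dir, result_fn), 'w') as out:
        for chunk_fn in chunks:
            with open(os.path.join(data_dir, chunk_fn)) as chunk:
                for line in chunk:
                    if not line.startswith('#'):  # same filter as sed '/^#/d'
                        out.write(line)

Filtering the comment lines while writing avoids the second cat | sed > tmp ; mv pass and the quoting pitfalls of os.system.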