self.prefetch_seq_and_scores(chrId,strandId)
- def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+ def get_seq_and_scores(self,chromo,strand,start,stop):
assert chromo in self.chromo_list
assert strand in self.strands
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
import cPickle
import math
import os
from numpy.matlib import mat,zeros,ones,inf
from numpy import inf,mean
-#from qpalma.parsers import *
-
from ParaParser import ParaParser,IN_VECTOR
from qpalma.Lookup import LookupTable
g_fmt = 'chr%d.dna.flat'
s_fmt = 'contig_%d%s'
- #self.chromo_range = range(1,6)
- self.chromo_range = range(1,2)
+ self.chromo_range = range(1,6)
+ #self.chromo_range = range(1,2)
num_chromo = 2
accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
self.qualityPlifs = qualityPlifs
self.read_size = 38
+ self.prb_offset = 64
+ #self.prb_offset = 50
# parameters of the heuristics to decide whether the read is spliced
#self.splice_thresh = 0.005
genomicSeq_stop = pos+effective_len
start = cpu()
- currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
- currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+ #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+ currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
- assert currentDNASeq_ == currentDNASeq
- assert currentAcc_ == currentAcc_
- assert currentDon_ == currentDon_
+ # just performed some tests to check LookupTable results
+ #assert currentDNASeq_ == currentDNASeq
+        #assert currentAcc_ == currentAcc
+        #assert currentDon_ == currentDon
stop = cpu()
self.get_time += stop-start
exons[1,0] = effective_len
est = unb_seq
original_est = seq
- quality = map(lambda x:ord(x)-64,prb)
+ quality = map(lambda x:ord(x)-self.prb_offset,prb)
currentVMatchAlignment = dna, exons, est, original_est, quality,\
currentAcc, currentDon
#pdb.set_trace()
- alternativeAlignmentScores = self.calcAlternativeAlignments(location)
- #try:
- # alternativeAlignmentScores = self.calcAlternativeAlignments(location)
- #except:
- # alternativeAlignmentScores = []
+ #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+ try:
+ alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+ except:
+ alternativeAlignmentScores = []
if alternativeAlignmentScores == []:
# no alignment necessary
maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
# compute alignment for vmatch unspliced read
vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+ #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+
start = cpu()
- #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
#print 'all candidates %s' % str(alternativeAlignmentScores)
new_id = id - 1000000300000
#distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
#distal_acc=distal_acc[-2:]
-
proximal_don = []
for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
if currentDon[idx] >= splice_thresh:
strand = location['strand']
original_est = location['seq']
quality = location['prb']
- quality = map(lambda x:ord(x)-64,quality)
+ quality = map(lambda x:ord(x)-self.prb_offset,quality)
original_est = original_est.lower()
est = unbracket_seq(original_est)
start = cpu()
#currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
- currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+ currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
stop = cpu()
self.get_time += stop-start
dna = currentDNASeq
-
+
proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
alternativeScores = []
+
+ #pdb.set_trace()
# inlined
h = self.h
if acc_score>=self.splice_stop_thresh:
break
+ #pdb.set_trace()
+
_stop = cpu()
self.alternativeScoresTime += _stop-_start
def create_and_submit():
jp = os.path.join
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
- result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/alignment'
-
+ run_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+ result_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/alignment'
+
chunks_fn = []
for fn in os.listdir(run_dir):
if fn.startswith('chunk'):
def create_and_submit():
jp = os.path.join
- num_splits = 25
+ num_splits = 50
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
#data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+ data_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
run_fname = jp(run_dir,'run_obj.pickle')
- #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
- #split_file(original_map_fname,data_dir,num_splits)
+ original_map_fname = jp(data_dir,'map.vm')
+
+ split_file(original_map_fname,data_dir,num_splits)
param_fname = jp(run_dir,'param_526.pickle')
param_fn = jp(run_dir,'param_526.pickle')
- run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
- dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
+ #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/prediction'
+ #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.pickle'
+ #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.keys.pickle'
+
+ run['result_dir'] = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
+ dataset_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
+ prediction_keys_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'
prediction_keys = cPickle.load(open(prediction_keys_fn))
print 'Found %d keys for prediction.' % len(prediction_keys)
- num_splits = 25
+ num_splits = 50
#num_splits = 1
slices = get_slices(len(prediction_keys),num_splits)
chunks = []
accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+ print self.seqInfo.chromo_sizes
+
#self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
pass
- def _testTableThalianaData(self):
+ def testTableThalianaData(self):
g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
g_fmt = 'chr%d.dna.flat'
s_fmt = 'contig_%d%s'
- lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
+ num_chromo = 2
+
+ lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,num_chromo))
+
+ accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+ seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+ seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
+ seq = reverse_complement(seq)
+ chromo = 1
+ strand = '-'
+ pos = 10475515
+ dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+ self.assertEqual(seq,dna)
+
+ dna = lt1.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+ pdb.set_trace()
+ self.assertEqual(seq,dna)
+
+ dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
+ dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
+
- def testTableLyrataData(self):
+ def _testTableLyrataData(self):
g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
if __name__ == '__main__':
- suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
- #suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+ #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
+ suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
unittest.TextTestRunner(verbosity=2).run(suite)
* In the current gff files from the TAIR repository
* the exons boundaries are defined as follows:
*
- * exon start points at:
+ * exon starts at:
*
* agxy
* ^
- * whereas exon stop points at:
+ * whereas exon stops at:
*
* xygt
* ^
int* pred_intron_stops;
int pred_size;
-int matching_introns;
+int matching_values;
static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
+static int chromo_sizes[5] = {30432563, 19705359, 23470805, 18585042, 26992728};
+
+short strand_info;
+
+
+/*
+ * Count ground-truth introns that fall inside a +/-1500nt window around
+ * each predicted seed position. Result is left in matching_values.
+ * For seed entries the "starts" array holds the chromosome index and the
+ * "stops" array holds the seed position.
+ */
+void check_seeds() {
+
+  matching_values = 0;
+  int pred_idx, gt_idx, chromo, tmp;
+  int pos,gt_start,gt_stop,pred_start,pred_stop;
+
+  printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
+
+  for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
+
+    /* Read the current entry BEFORE deriving the window: the original
+     * added code computed pred_start/pred_stop from `pos` while it was
+     * still uninitialized (first iteration) or stale (later iterations). */
+    pos = pred_intron_stops[pred_idx];
+    chromo = pred_intron_starts[pred_idx];
+
+    pred_start = pos-1500;
+    pred_stop = pos+1500;
+
+    for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
+
+      gt_start = gt_intron_starts[gt_idx];
+      gt_stop = gt_intron_stops[gt_idx];
+
+      /* on the reverse strand, mirror ground-truth coordinates to
+       * forward-strand positions via the chromosome length table */
+      if(strand_info == 0) {
+        tmp = gt_start;
+        gt_start = chromo_sizes[chromo] - gt_stop;
+        gt_stop = chromo_sizes[chromo] - tmp;
+      }
+
+      /* hit: the whole ground-truth intron lies inside the seed window */
+      if ( pred_start <= gt_start && gt_stop <= pred_stop ) {
+        matching_values++;
+        break;
+      }
+    }
+  }
+}
+
+
+/*
+ * Count predicted introns whose boundaries agree with some ground-truth
+ * intron within a 2nt tolerance on each side. Result is left in
+ * matching_values.
+ */
+void compare_introns() {
+
+  matching_values = 0;
+  int pred_idx, gt_idx;
+  int gt_start,gt_stop,pred_start,pred_stop;
+
+  printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
+
+  for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
+
+    pred_start = pred_intron_starts[pred_idx];
+    pred_stop = pred_intron_stops[pred_idx];
+
+    for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
+
+      gt_start = gt_intron_starts[gt_idx];
+      gt_stop = gt_intron_stops[gt_idx];
+
+      /* integer abs() instead of fabs(): the operands are ints, and
+       * fabs() without a visible #include <math.h> would be an implicit
+       * declaration (undefined behavior) besides the useless int->double
+       * round-trip */
+      if ( abs(gt_start - pred_start) <= 2 && abs(gt_stop - pred_stop) <= 2 ) {
+        matching_values++;
+        break;
+      }
+    }
+  }
+}
+
/*
* This function compares the intron boundaries of the two files.
*
*/
-void compare_introns() {
+void _compare_introns() {
- matching_introns = 0;
+ matching_values = 0;
int gt_idx = 0;
int pred_idx = 0;
//printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
if (gt_start == pred_start && gt_stop == pred_stop) {
- matching_introns++;
+ matching_values++;
+
+ // take doubles into account
+#if 1
+ int idx = pred_idx+1;
+ for(;idx<pred_size;idx++) {
+ if (pred_intron_starts[idx] == pred_start && pred_intron_stops[idx] == pred_stop)
+ matching_values++;
+ else
+ break;
+ }
+#endif
gt_idx++;
pred_idx++;
continue;
  while( getline(&current_line,&line_size,fs) != -1 ) (*size)++;
// now we allocate memory to store all positions
+ //*id = malloc(sizeof(char)*(*size));
*starts = malloc(sizeof(int)*(*size));
*stops = malloc(sizeof(int)*(*size));
exit(EXIT_FAILURE);
}
+ //char* id_string = malloc(sizeof(char)*line_size);
char* first_num = malloc(sizeof(char)*line_size);
char* second_num = malloc(sizeof(char)*line_size);
ctr++;
}
- printf("ctr is %d\n",ctr);
+ //printf("ctr is %d\n",ctr);
+
if (fclose(fs) != 0)
perror("Closing of filestream failed!");
}
int main(int argc, char* argv[]) {
- if(argc != 4) {
+ if(argc != 6) {
printf("%s\n",info);
exit(EXIT_FAILURE);
}
if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
perror("Could not allocate memory for filenames");
- strncpy(gt_fn,argv[1],strlen(argv[1]));
- strncpy(pred_fn,argv[2],strlen(argv[2]));
- strncpy(result_fn,argv[3],strlen(argv[3]));
+ short intron_mode = -1;
+ if ( strcmp(argv[1],"-intron") == 0 )
+ intron_mode = 1;
+
+ if ( strcmp(argv[1],"-seed") == 0 )
+ intron_mode = 0;
+
+ if (intron_mode == -1)
+ perror("You have to choose between -intron or -seed mode!");
+
+ strand_info = -1;
+ if ( strcmp(argv[2],"-strand=+") == 0 )
+ strand_info = 1;
+
+ if ( strcmp(argv[2],"-strand=-") == 0 )
+ strand_info = 0;
+
+ if (strand_info == -1)
+ perror("You have to choose between -strand=+ or -strand=- !");
+
+ strncpy(gt_fn,argv[3],strlen(argv[3]));
+ strncpy(pred_fn,argv[4],strlen(argv[4]));
+ strncpy(result_fn,argv[5],strlen(argv[5]));
FILE *result_fs = fopen(result_fn,"w+");
if(result_fs == NULL) {
  load_introns(gt_fn,&gt_intron_starts,&gt_intron_stops,&gt_size);
load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
- compare_introns();
+ if (intron_mode == 1)
+ compare_introns();
+ else
+ check_seeds();
int f_status = fclose(result_fs);
if(f_status != 0)
free(pred_fn);
free(result_fn);
- printf("Found %d matching intron(s).\n",matching_introns);
+ printf("Found %d matching intron(s)/seed(s).\n",matching_values);
exit(EXIT_SUCCESS);
}
1 10
+1 10
+1 10
+10 14
+10 14
10 14
14 20
20 22
+20 22
self.dataset = []
- self.read_size = Conf.read_size
+ self.read_size = 38
+ #self.prb_offset = 50
+ self.prb_offset = 64
self.half_window_size = 1500
assert not os.path.exists(dataset_fn), 'The data_file already exists!'
assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+ #pdb.set_trace()
# saving new dataset and single keys as well
cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
# In order to save some space we use a signed char to store the
# qualities. Each quality element can range as follows: -128 <= elem <= 127
- prb = array.array('b',map(lambda x: ord(x)-64,slist[5]))
+
+ prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
starts = [int(elem) + start_pos for elem in starts]
ends = [int(elem) + start_pos for elem in ends]
+ if strand == '+':
+ starts = [e+1 for e in starts]
+ ends = [e+1 for e in ends]
+ else:
+ starts = [e-3001 for e in starts]
+ ends = [e-3001 for e in ends]
+
#out_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),str(start_pos),pp(ids),pp(gaps)))
out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
#!/bin/bash
-for((idx=1;idx<4;idx++))
+for((idx=1;idx<2;idx++))
do
- current_dir=/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_${idx}
+ current_dir=/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_${idx}/spliced
input=$current_dir/alignment
result=$current_dir/alignment/alignmentInfo.genefinding
result2=$current_dir/alignment/alignmentInfo.genefinding.chr1_only
intron_starts=$current_dir/alignment/intron_starts
intron_info=$current_dir/alignment/intron_info
- #python createExonInfoForGenefinding.py $input $result
+ python createExonInfoForGenefinding.py $input $result
+
#cat $result | grep '^1' > $result2
#cat $result2 | sort > $result3
# -*- coding: utf-8 -*-
import os.path
+import sys
import cProfile
from compile_dataset import DatasetGenerator
+jp = os.path.join
+
+def combine_spliced_map_parts(data_dir,result_fn):
+    """
+    Concatenate every chunk file in data_dir whose name ends in '.spliced'
+    into one file (result_fn, taken relative to data_dir), then strip all
+    comment lines beginning with '#' from the combined result.
+    """
+
+    assert os.path.exists(data_dir)
+
+    result_fn = jp(data_dir,result_fn)
+
+    # remove any stale result first: chunks are APPENDED below, so an old
+    # file would silently be extended instead of replaced
+    if os.path.exists(result_fn):
+        os.remove(result_fn)
+
+    for chunk_fn in os.listdir(data_dir):
+        if chunk_fn.endswith('.spliced'):
+            full_chunk_fn = jp(data_dir,chunk_fn)
+            cmd = 'cat %s >> %s' % (full_chunk_fn,result_fn)
+            os.system(cmd)
+
+    # NOTE(review): sed drops '#' comment lines; the pipeline writes a
+    # scratch file named 'tmp' in the CURRENT working directory -- confirm
+    # this is safe when several instances run concurrently
+    cmd = "cat %s | sed -e \'/^#/d\' > tmp ; mv tmp %s" % (result_fn,result_fn)
+    os.system(cmd)
+
def run():
- jp = os.path.join
- main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+ #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+ #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
+ #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
- #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
- #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
+ main_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+ spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+ result_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
- spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
- result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
+ combine_spliced_map_parts(main_dir,'map.vm.spliced')
map_1_fn = jp(main_dir,'map.vm.spliced')
map_2_fn = jp(spliced_dir,'map.vm')
dg = DatasetGenerator(map_1_fn,map_2_fn)
dg.compile_dataset()
- dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
+ dg.saveAs(jp(result_dir,'dataset_run_1'))
if __name__ == '__main__':
#cProfile.run('run()')