from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
-translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
+# how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
+translation = '-acgtn_'
+
+# convenience function for pretty-printing arrays
+pp = lambda x : str(x)[1:-1].replace(' ','')
def alignment_reconstruct(current_prediction,num_exons):
def getUniquePrediction(allPredictions):
"""
Since one read can have several alignments due to several possible seed
- positions we have to preprocess all prediction. This function return for
+ positions we have to preprocess all predictions. This function returns for
every read only one alignment that is the highest scoring under the QPalma
model.
"""
return allUniquePredictions
-def createOutput(current_loc,out_fname,output_format):
+def recalculatePositions(chromo,start_pos,strand,start,ends,seqInfo,window_size):
+ """
+ If we work on the negative strand we have to recalculate the indices other
+ wise we just have to add an offset i.e. the start position.
+ """
+
+ if strand == '-':
+ total_size = seqInfo.chromo_sizes[chromo]
+ start_pos = total_size - start_pos
+
+ starts = [int(elem) + start_pos for elem in starts]
+ ends = [int(elem) + start_pos for elem in ends]
+
+ if strand == '-':
+ starts = [e-window_size for e in starts]
+ ends = [e-window_size for e in ends]
+
+ return starts,ends
+
+
+def createAlignmentOutput(settings,chunk_fn,result_fn):
"""
Given all the predictions and a result file name. This function calculates
all aligmnment information and returns the result in the specified format.
"""
- out_fh = open(out_fname,'w+')
- allPredictions = cPickle.load(open(current_loc))
+ out_fh = open(result_fn,'w+')
+ allPredictions = cPickle.load(open(chunk_fn))
#allPredictions = current_loc
print 'Loaded %d examples' % len(allPredictions)
# make predictions unique
allUniquePredictions = getUniquePrediction(allPredictions)
+ output_format = settings['output_format']
+
written_lines = 0
for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
if output_format == 'BLAT':
# BLAT output
- new_line = createBlatOutput(current_prediction)
+ new_line = createBlatOutput(current_prediction,settings)
elif output_format == 'ShoRe':
# ShoRe output
- new_line = createShoReOutput(current_prediction)
+ new_line = createShoReOutput(current_prediction,settings)
elif output_format == 'mGene':
# Genefinding output for mGene
- new_line = createGenefindingOutput(current_prediction)
+ new_line = createGenefindingOutput(current_prediction,settings)
else:
assert False
print 'Wrote out %d lines' % written_lines
-def createBlatOutput(current_prediction):
+def createBlatOutput(current_prediction,settings):
"""
This function takes one prediction as input and returns the corresponding
alignment in BLAT format.
"""
+ # fetch the data needed
+ allowed_fragments = settings['allowed_fragments']
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,range(1,allowed_fragments ))
+
+ # this is the total size of the seed region
+ window_size = 2*settings['half_window_size']
+
+ # now we fetch the data of the read itself
id = current_prediction['id']
seq = current_prediction['read']
alignment = current_prediction['alignment']
DPScores = current_prediction['DPScores']
- predExons = current_prediction['predExons']
- predExons = [e+start_pos for e in predExons]
(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
tExonSizes,tStarts, tEnds) = alignment
print 'BUG exon sizes %d'%id
continue
- EST2GFF = False
new_line = ''
-
- if EST2GFF:
- predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-
- match = predExons[0,1] - predExons[0,0]
- if predExons.shape == (2,2):
- match += predExons[1,1] - predExons[1,0]
-
- mismatch = 0
- repmatch = 0
- Ns = 0
- Qgapcnt = 0
- Qgapbs = 0
-
- Tgapcnt = 0
- Tgapbs = 0
-
- if predExons.shape == (2,2):
- Tgapbs += predExons[1,0] - predExons[0,1]
- Tgapcnt = 1
-
- # &strand, Qname, &Qsize,
- # &Qstart, &Qend, Tname, &Tsize,
- # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
- #
- # ATTENTION
- #
- # we enforce equal exons sizes for q and t because we are missing indel
- # positions of the alignment
-
- if qExonSizes[0] != tExonSizes[0]:
- continue
-
- Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
- Qsize = len(seq)
- Qstart = qStart
- Qend = qEnd
- Tname = 'CHR%d'%chromo
-
- start_pos -= 2
-
- Tsize = tEnd+1 + start_pos
- Tstart = tStart + start_pos
- Tend = tEnd + start_pos
- blockcnt = Tgapcnt+1
- exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
- Qstarts_str = str(qStarts)[1:-1].replace(' ','')
- Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
- new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
- % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
- Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
- Tname, Tsize, Tstart, Tend,\
- blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
- else:
- #try:
- exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
- #except:
- # print 'Bug in example %d (idx: %d)' %\
- # (current_prediction['id'],pred_idx)
- # continue
-
- #t_size = tEnd - tStart
- #read_size = 38
- #if strand == '-':
- # total_size = seqInfo.chromo_sizes[chromo]
-
- # start_pos = total_size - start_pos
-
- # qExonSizes.reverse()
- # qStarts = [read_size-e for e in qEnds]
- # qStarts.reverse()
- # qEnds = [read_size-e for e in qStarts]
- # qEnds.reverse()
-
- # tExonSizes.reverse()
- # tStarts = [t_size-e for e in tEnds]
- # tStarts.reverse()
- # tEnds = [t_size-e for e in tStarts]
- # tEnds.reverse()
-
- # exonIdentities.reverse()
- # exonGaps.reverse()
-
- pp = lambda x : str(x)[1:-1].replace(' ','')
-
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- pp(qExonSizes),pp(qStarts),\
- pp(qEnds),pp(tExonSizes),\
- pp(tStarts),pp(tEnds),\
- pp(exonIdentities),pp(exonGaps))
+ #try:
+ exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+ #except:
+ # print 'Bug in example %d (idx: %d)' %\
+ # (current_prediction['id'],pred_idx)
+ # continue
+
+ tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
+
+ ids = exonIdentities
+ ids = [int(elem) for elem in ids]
+ gaps = exonGaps
+ gaps = [int(elem) for elem in gaps]
+
+ new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
+ pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
return new_line
def createGenefindingOutput(current_loc,out_fname):
"""
+ This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
"""
+
+ #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
pass
def createShoReOutput(current_loc,out_fname):
"""
+ This format is used by the additional steps implemented in the ShoRe pipeline.
"""
+
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
import cPickle
import math
import numpy
import unittest
from qpalma_main import QPalma
-from Utils import print_prediction
+from qpalma.utils import print_prediction
from createAlignmentFileFromPrediction import alignment_reconstruct
- def setUp(self):
+ def _setUp(self):
print
self.prediction_set = {}
example = (currentSeqInfo,read,currentQualities)
self.prediction_set[id] = [example]
+
+ def setUp(self):
+ self.prediction_set = {}
+
+ # chr1 + 20-120
+ read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
+ currentQualities = [[40]*len(read)]
+
+ id = 3
+ chromo = 1
+ strand = '+'
+
+ genomicSeq_start = 3500
+ genomicSeq_stop = 6500
+ print 'Position: ',
+ print genomicSeq_start,genomicSeq_stop
+
+ currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+ example = (currentSeqInfo,read,currentQualities)
+ self.prediction_set[id] = [example]
+
+ # chr1 - 5000-5100
+ read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
+ currentQualities = [[40]*len(read)]
+
+ id = 4
+ chromo = 1
+ strand = '-'
+
+ total_size = 30432563
+
+ genomicSeq_start = total_size - 6500
+ genomicSeq_stop = total_size - 3500
+ print 'Position: ',
+ print genomicSeq_start,genomicSeq_stop
+
+ currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+ example = (currentSeqInfo,read,currentQualities)
+ self.prediction_set[id] = [example]
+
def testAlignments(self):
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
print len(example)
# fetch the data needed
- g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
- don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+ settings = {}
- g_fmt = 'chr%d.dna.flat'
- s_fmt = 'contig_%d%s'
+ settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+ settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+ settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
- num_chromo = 6
+ settings['genome_file_fmt'] = 'chr%d.dna.flat'
+ settings['splice_score_file_fmt']= 'contig_%d%s'
- accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
- seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+ allowed_fragments = [1]
+ settings['allowed_fragments'] = allowed_fragments
+
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
qp = QPalma(run,seqInfo,True)
- #qp.init_prediction(run,set_name)
allPredictions = qp.predict(self.prediction_set,param)
for current_prediction in allPredictions:
numExons = int(math.ceil(len(predExons) / 2))
print alignment_reconstruct(current_prediction,numExons)
- #print id,start_pos,predExons
+ print id,start_pos,predExons
print 'Problem counter is %d' % qp.problem_ctr
+
if __name__ == '__main__':
suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
unittest.TextTestRunner(verbosity=2).run(suite)