import random
from numpy.matlib import inf
-from qpalma.sequence_utils import get_seq_and_scores,get_flatfile_size
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
class Lookup:
self.chromosomes = range(1,6)
self.strands = ['+','-']
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo_list = [1,2,3,4,5,8,9,10,11,12]
+ self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
+
self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
self.donorScoresP = [0]*(len(self.chromosomes)+1)
self.donorScoresN = [0]*(len(self.chromosomes)+1)
#for chrId in self.chromosomes:
- for chrId in range(1,6):
+ #for chrId in range(1,6):
+ for chrId in range(1,2):
for strandId in self.strands:
print (chrId,strandId)
self.prefetch_seq_and_scores(chrId,strandId)
assert chromo in self.chromosomes
assert strand in self.strands
- if strand == '+':
- currentSequence = self.genomicSequencesP[chromo][start:stop+1]
- currentDonorScores = self.donorScoresP[chromo][start:stop+1]
- currentAcceptorScores = self.acceptorScoresP[chromo][start:stop+1]
- elif strand == '-':
- end = len(self.genomicSequencesN[chromo])
- currentSequence = self.genomicSequencesN[chromo][end-stop-1:end-start]
- #length = len(currentSequence)
- currentDonorScores = self.donorScoresN[chromo][end-stop-1:end-start]
- #currentDonorScores += [-inf]*(length-len(currentDonorScores))
- currentAcceptorScores = self.acceptorScoresN[chromo][end-stop-1:end-start]
- else:
- pdb.set_trace()
+ currentSequence = self.genomicSequencesP[chromo][start:stop]
+ currentDonorScores = self.donorScoresP[chromo][start:stop]
+ currentAcceptorScores = self.acceptorScoresP[chromo][start:stop]
- return currentSequence.lower(), currentAcceptorScores, currentDonorScores
+ return currentSequence, currentAcceptorScores, currentDonorScores
def prefetch_seq_and_scores(self,chromo,strand):
returns then the genomic sequence of this interval and the associated scores.
"""
- fn = 'chr%d.dna.flat' % chromo
- filename = os.path.join(self.dna_flat_files,fn)
- size = get_flatfile_size(filename)
+ genomicSeq_start = 0
- # we do not use the first/last 205 position as they are can not be
- # predicted due to the window size used for splice site prediction
- genomicSeq_start = 205
- genomicSeq_stop = size-205
- genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
- genomicSeq = genomicSeq.lower()
+ if strand == '-':
+ genomicSeq_stop = self.seqInfo.chromo_sizes[chromo+7]
+ else:
+ genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]
- U = 'u'*205
- INF_F = [-inf]*205
+ genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+ genomicSeq = genomicSeq.lower()
if strand == '+':
- self.genomicSequencesP[chromo] = U + genomicSeq + U
- self.acceptorScoresP[chromo] = INF_F + currentAcc + INF_F
- self.donorScoresP[chromo] = INF_F + currentDon + INF_F
+ self.genomicSequencesP[chromo] = genomicSeq
+ self.acceptorScoresP[chromo] = currentAcc
+ self.donorScoresP[chromo] = currentDon
elif strand == '-':
- self.genomicSequencesN[chromo] = U + genomicSeq + U
- self.acceptorScoresN[chromo] = INF_F + currentAcc + INF_F
- self.donorScoresN[chromo] = INF_F + currentDon + INF_F
+ self.genomicSequencesN[chromo] = genomicSeq
+ self.acceptorScoresN[chromo] = currentAcc
+ self.donorScoresN[chromo] = currentDon
else:
assert False
- def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
- """
- Now we want to use interval_query to get the predicted splice scores trained
- on the TAIR sequence and annotation.
- """
-
- assert intervalEnd > intervalBegin
-
- size = intervalEnd-intervalBegin
-
- acc = size*[0.0]
- don = size*[0.0]
-
- interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
- pos_size = new_intp()
- intp_assign(pos_size,1)
-
- # fetch acceptor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
- # fetch donor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
- return acc, don
-
-
if __name__ == '__main__':
dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
lt1 = Lookup(dna_flat_files)
+
+ #self.lt1.seqInfo.get_seq_and_scores