import random
from numpy.matlib import inf
-
-from Genefinding import *
-from genome_utils import load_genomic
-
-# only for checking purposes
-from qpalma.sequence_utils import get_seq_and_scores
+from qpalma.sequence_utils import get_seq_and_scores,get_flatfile_size
class Lookup:
and the scores once for all chromosomes.
"""
-
def __init__(self,dna_flat_files):
self.dna_flat_files = dna_flat_files
- self.max_size = 40000000
-
self.chromosomes = range(1,6)
- self.strands = ['-']
+ self.strands = ['+','-']
self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
self.donorScoresN = [0]*(len(self.chromosomes)+1)
#for chrId in self.chromosomes:
- for chrId in range(4,6):
+ for chrId in range(1,6):
for strandId in self.strands:
print (chrId,strandId)
self.prefetch_seq_and_scores(chrId,strandId)
- def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
- assert chr in self.chromosomes
+ def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+ assert chromo in self.chromosomes
assert strand in self.strands
if strand == '+':
- currentSequence = self.genomicSequencesP[chr][start-1:stop]
- length = len(currentSequence)
- currentDonorScores = self.donorScoresP[chr][start:stop]
- currentDonorScores += [-inf]*(length-len(currentDonorScores))
- currentAcceptorScores = self.acceptorScoresP[chr][start:stop+2]
- currentAcceptorScores = currentAcceptorScores[1:]
+ currentSequence = self.genomicSequencesP[chromo][start:stop+1]
+ currentDonorScores = self.donorScoresP[chromo][start:stop+1]
+ currentAcceptorScores = self.acceptorScoresP[chromo][start:stop+1]
elif strand == '-':
- currentSequence = self.genomicSequencesN[chr][start-1:stop]
- length = len(currentSequence)
- currentDonorScores = self.donorScoresN[chr][start:stop]
- currentDonorScores += [-inf]*(length-len(currentDonorScores))
- currentAcceptorScores = self.acceptorScoresN[chr][start:stop+2]
- currentAcceptorScores = currentAcceptorScores[1:]
+ end = len(self.genomicSequencesN[chromo])
+ currentSequence = self.genomicSequencesN[chromo][end-stop-1:end-start]
+ #length = len(currentSequence)
+ currentDonorScores = self.donorScoresN[chromo][end-stop-1:end-start]
+ #currentDonorScores += [-inf]*(length-len(currentDonorScores))
+ currentAcceptorScores = self.acceptorScoresN[chromo][end-stop-1:end-start]
else:
pdb.set_trace()
fn = 'chr%d.dna.flat' % chromo
filename = os.path.join(self.dna_flat_files,fn)
- cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
- import subprocess
- obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- if err != '':
- print 'Error occurred while trying to obtain file size'
- size = int(out)
+ size = get_flatfile_size(filename)
+ # we do not use the first/last 205 position as they are can not be
+ # predicted due to the window size used for splice site prediction
genomicSeq_start = 205
genomicSeq_stop = size-205
genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
self.genomicSequencesP[chromo] = U + genomicSeq + U
self.acceptorScoresP[chromo] = INF_F + currentAcc + INF_F
self.donorScoresP[chromo] = INF_F + currentDon + INF_F
-
elif strand == '-':
self.genomicSequencesN[chromo] = U + genomicSeq + U
self.acceptorScoresN[chromo] = INF_F + currentAcc + INF_F
self.donorScoresN[chromo] = INF_F + currentDon + INF_F
-
- else:
- assert False
-
-
- def _prefetch_seq_and_scores(self,chr,strand):
- """
- This function expects an interval, chromosome and strand information and
- returns then the genomic sequence of this interval and the associated scores.
- """
-
- chrom = 'chr%d' % chr
-
- genomicSeq_start = 0
- genomicSeq_stop = self.max_size
- genomicSeq = load_genomic(chrom,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.lower()
-
- no_base = re.compile('[^acgt]')
- genomicSeq = no_base.sub('n',genomicSeq)
- # check the obtained dna sequence
- assert genomicSeq != '', 'load_genomic returned empty sequence!'
-
- intervalBegin = 0
- intervalEnd = self.max_size
-
- currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
- if strand == '+':
- self.genomicSequencesP[chr] = genomicSeq
- self.acceptorScoresP[chr] = currentAcc
- self.donorScoresP[chr] = currentDon
-
- elif strand == '-':
- self.genomicSequencesN[chr] = genomicSeq
- self.acceptorScoresN[chr] = currentAcc
- self.donorScoresN[chr] = currentDon
-
else:
assert False
if __name__ == '__main__':
dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
lt1 = Lookup(dna_flat_files)
-
- #seq,acc,don = lt1.get_seq_and_scores(1,'+',1001,2002,'')
- #_seq,_acc,_don = get_seq_and_scores(1,'+',1001,2002,dna_flat_files)
-
- for chro in range(1,6):
- start = random.randint(40,10000)
- stop = start + random.randint(10,100)
-
- seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
- _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
-
- assert _seq == seq, pdb.set_trace()
- assert _acc == acc
- assert _don == don
-
- #seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
- #_seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
-
- #assert _seq == seq
- #assert _acc == acc
- #assert _don == don
-
- pdb.set_trace()
-