assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
assert len(currentAcc) == len(currentDon)
- pdb.set_trace()
-
return genomicSeq, currentAcc, currentDon
# build reverse complement if on negative strand
print 'Error occurred while trying to obtain file size'
end = int(out)
- intervalBegin = genomicSeq_start#-100
- intervalEnd = genomicSeq_stop#+100
+ intervalBegin = genomicSeq_start-100
+ intervalEnd = genomicSeq_stop+100
total_size = end
currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
- #currentAcc = currentAcc[100:-98]
+ currentAcc = currentAcc[100:-98]
currentAcc = currentAcc[1:]
- #currentDon = currentDon[100:-100]
+ currentDon = currentDon[100:-100]
length = len(genomicSeq)
currentAcc = currentAcc[:length]
currentDon = currentDon+[-inf]*(length-len(currentDon))
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
assert len(currentAcc) == len(currentDon), pdb.set_trace()
return genomicSeq, currentAcc, currentDon
+
+
+def checks():
+ start = 1001
+ stop = start+1234
+ dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+ for chromo in range(1,6):
+ p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
+ n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
+
+ print p_seq == reverse_complement(n_seq)
+
+
+
+if __name__ == '__main__':
+ checks()
from genome_utils import load_genomic
# only for checking purposes
-#from compile_dataset import get_seq_and_scores
+from qpalma.sequence_utils import get_seq_and_scores
class Lookup:
self.max_size = 40000000
self.chromosomes = range(1,6)
- self.strands = ['+']
+ self.strands = ['-']
self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
self.donorScoresN = [0]*(len(self.chromosomes)+1)
- for chrId in self.chromosomes:
+ #for chrId in self.chromosomes:
+ for chrId in range(4,6):
for strandId in self.strands:
print (chrId,strandId)
self.prefetch_seq_and_scores(chrId,strandId)
else:
pdb.set_trace()
-
return currentSequence.lower(), currentAcceptorScores, currentDonorScores
- def prefetch_seq_and_scores(self,chr,strand):
+ def prefetch_seq_and_scores(self,chromo,strand):
+ """
+ This function expects an interval, chromosome and strand information and
+ returns then the genomic sequence of this interval and the associated scores.
+ """
+
+ fn = 'chr%d.dna.flat' % chromo
+ filename = os.path.join(self.dna_flat_files,fn)
+ cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
+ import subprocess
+ obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out,err = obj.communicate()
+ if err != '':
+ print 'Error occurred while trying to obtain file size'
+ size = int(out)
+
+ genomicSeq_start = 205
+ genomicSeq_stop = size-205
+ genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
+ genomicSeq = genomicSeq.lower()
+
+ U = 'u'*205
+ INF_F = [-inf]*205
+
+ if strand == '+':
+ self.genomicSequencesP[chromo] = U + genomicSeq + U
+ self.acceptorScoresP[chromo] = INF_F + currentAcc + INF_F
+ self.donorScoresP[chromo] = INF_F + currentDon + INF_F
+
+ elif strand == '-':
+ self.genomicSequencesN[chromo] = U + genomicSeq + U
+ self.acceptorScoresN[chromo] = INF_F + currentAcc + INF_F
+ self.donorScoresN[chromo] = INF_F + currentDon + INF_F
+
+ else:
+ assert False
+
+
+ def _prefetch_seq_and_scores(self,chr,strand):
"""
This function expects an interval, chromosome and strand information and
returns then the genomic sequence of this interval and the associated scores.
start = random.randint(40,10000)
stop = start + random.randint(10,100)
- seq,acc,don = lt1.get_seq_and_scores(chro,'+',start,stop,'')
- _seq,_acc,_don = get_seq_and_scores(chro,'+',start,stop,dna_flat_files)
+ seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
+ _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
- assert _seq == seq
+ assert _seq == seq, pdb.set_trace()
assert _acc == acc
assert _don == don
#assert _acc == acc
#assert _don == don
-
pdb.set_trace()
+