+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / tools / data_tools / filterReads.py
index 71646c5..93fb93d 100644 (file)
@@ -7,8 +7,68 @@ import pdb
 import qpalma
 from qpalma.tools.parseGff import parse_gff
 
+class Read:
+
+   def __init__(self):
+      pass
+
+def createRead(line):
+   chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity = line.split()
+   newRead = Read()
+   newRead.chr = int(chr)
+   newRead.pos = int(pos)
+   newRead.seq = seq
+
+   if strand == 'D':
+      newRead.strand = '+'
+   if strand == 'P':
+      newRead.strand = '-'
+
+   newRead.mismatch = int(mismatch)
+   newRead.occurrence = int(occurrence)
+   newRead.size = int(size)
+
+   newRead.prb = prb
+   newRead.cal_prb = cal_prb
+   newRead.chastity = chastity
+
+   return newRead
+
+
 def filter(gff_file,reads,out):
-   allGenes = parse_gff(gff_file)
+   read_size = 36
+
+   d,allGenes = parse_gff(gff_file)
+   del d
+
+   print 'Found %d genes.' % len(allGenes)
+
+   old_gene_pos = -1
+   for g_idx in range(len(allGenes)):
+      currentGene = allGenes[g_idx]
+
+      if not old_gene_pos < currentGene.start:
+         allGenes[g_idx] = 'nil'
+
+      old_gene_pos = currentGene.stop
+   
+   allGenes = [e for e in allGenes if e != 'nil']
+
+   print '%d genes remained.' % len(allGenes)
+
+   read_fh = open(reads)
+
+   for currentGene in allGenes:
+      currentGene.overlappingReads = []
+      currentRead = createRead(read_fh.next())
+
+      while currentGene.start < currentRead.pos and currentRead.pos+read_size-1 < currentGene.stop:
+         currentGene.overlappingReads.append(currentRead)
+         currentRead = createRead(read_fh.next())
+
+   #for currentGene in allGenes:
+   #   for (exon_start,exons_stop) in currentGene.exons:
+
 
    out_fh = open(out,'w+')