+ resolved an issue with the memory mapping in filterReads
[qpalma.git] / tools / data_tools / filterReads.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import pdb
6
7 import qpalma
8 from qpalma.tools.parseGff import parse_gff
9
10 class Read:
11
12 def __init__(self):
13 pass
14
15 def createRead(line):
16 chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity = line.split()
17 newRead = Read()
18 newRead.chr = int(chr)
19 newRead.pos = int(pos)
20 newRead.seq = seq
21
22 if strand == 'D':
23 newRead.strand = '+'
24 if strand == 'P':
25 newRead.strand = '-'
26
27 newRead.mismatch = int(mismatch)
28 newRead.occurrence = int(occurrence)
29 newRead.size = int(size)
30
31 newRead.prb = prb
32 newRead.cal_prb = cal_prb
33 newRead.chastity = chastity
34
35 return newRead
36
37
38 def filter(gff_file,reads,out):
39 read_size = 36
40
41 d,allGenes = parse_gff(gff_file)
42 del d
43
44 print 'Found %d genes.' % len(allGenes)
45
46 old_gene_pos = -1
47 for g_idx in range(len(allGenes)):
48 currentGene = allGenes[g_idx]
49
50 if not old_gene_pos < currentGene.start:
51 allGenes[g_idx] = 'nil'
52
53 old_gene_pos = currentGene.stop
54
55 allGenes = [e for e in allGenes if e != 'nil']
56
57 print '%d genes remained.' % len(allGenes)
58
59 read_fh = open(reads)
60
61 for currentGene in allGenes:
62 currentGene.overlappingReads = []
63 currentRead = createRead(read_fh.next())
64
65 while currentGene.start < currentRead.pos and currentRead.pos+read_size-1 < currentGene.stop:
66 currentGene.overlappingReads.append(currentRead)
67 currentRead = createRead(read_fh.next())
68
69 #for currentGene in allGenes:
70 # for (exon_start,exons_stop) in currentGene.exons:
71
72
73 out_fh = open(out,'w+')
74
75
76 out_fh.close()
77
78 if __name__ == '__main__':
79 assert len(sys.argv) == 4, 'Usage: '
80 (gff_file,reads,out) = sys.argv[1:4]
81 filter(gff_file,reads,out)
82