+ renamed main dir in order to create python module hierarchy
[qpalma.git] / qpalma / tools / createTestSet.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os.path
6 import csv
7 from PyGff import *
8 import cPickle
9
10 info=\
11 """
12 You have to supply two files. One containing the gff information and the other
13 one containing the information of the Solexa(R) reads.
14
15 Usage: ./createTestSet.py gff.pickle reads.txt
16 """
17
18 doc=\
19 """
20 Make sure that you split the data into chromosome files beforehand because
21 this method does not check the chromosome info
22 """
23
24 def printRow(row):
25 print "".join([elem+' ' for elem in row]).strip()
26
27 def check(annotation,reads):
28 skippedReadsCounter = 0
29
30 reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
31 for pos,row in enumerate(reader):
32 if len(row) != 12:
33 skippedReadsCounter += 1
34 continue
35
36 if pos % 10000 == 0:
37 print >> sys.stderr, pos
38
39 chr = row[0]
40 pos = int(row[1])
41 readStart = pos
42 readStop = pos+35
43
44 for gene in annotation:
45 # read inside current gene positions
46
47 if gene.start <= readStart and pos+35 <= gene.stop:
48 overlapping = True
49
50 for exonStart,exonStop in gene.exons:
51 if exonStart <= readStart and readStop <= exonStop:
52 overlapping = False
53 break
54
55 if overlapping:
56 printRow(row)
57 continue
58
59 # read is overlapping with gene boundaries
60 if pos < gene.start and pos+35 > gene.start:
61 printRow(row)
62 continue
63
64 if pos < gene.stop and pos+35 > gene.stop:
65 printRow(row)
66 continue
67
68 print skippedReadsCounter
69
70 if __name__ == '__main__':
71 assert len(sys.argv) >= 3, info
72 annotFile = sys.argv[1]
73 readsFile = sys.argv[2]
74 assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
75 assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
76 print >> sys.stderr, doc
77
78 annotation = cPickle.load(open(annotFile))
79 reads = open(readsFile)
80
81 check(annotation,reads)