1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 __author__ = "Michael Imelfort"
28 __copyright__ = "Copyright 2014"
29 __credits__ = ["Michael Imelfort"]
30 __license__ = "LGPLv3"
31 __maintainer__ = "Michael Imelfort"
32 __email__ = "mike@mikeimelfort.com"
33
34
35
36
37 import os
38 import ctypes as c
39 from multiprocessing import Pool, Manager
40 import multiprocessing as mp
41 import numpy as np
42 import sys
43
44
45 from cWrapper import *
46 from bamLink import *
47 from bamFile import *
48 from bammExceptions import *
49
50
51
52
53
54
55
56
57
58
59
60
66 '''Single-process BAMfile parsing
67
68 cTypes pointers are unpickleable unless they are top level, so this function
69 lives outside the class. In this case we reduce the number of member
70 variables passed to it by passing the class instead. Any implicit copy
71 operations do not affect the workflow as it stands now. If you modify this
72 function you need to be aware of the limitations of python multiprocessing,
73 Queues, pickling and shared memory.
74
75 Extra logic is also contained in BamParser._parseOneBam
76
77 Inputs:
78 bAMpARSER - BamParser instance, a valid BamParser instance
79 parseQueue - Manager.Queue, bids (BAMs) yet to be parsed
80 BFI_list - Manager.List, place all processed BFIs on this list
81 verbose - == True -> be verbose
82 doContigNames - == True -> load contigs names from the C-land BFI struct
83 '''
84 CW = CWrapper()
85 while True:
86
87 bid = parseQueue.get(block=True, timeout=None)
88 if bid is None:
89 break
90
91 if verbose:
92 print "Parsing file: %s" % bAMpARSER.bamFiles[bid]
93
94
95 coverages = []
96 contig_lengths = []
97 contig_names = []
98 links = {}
99
100 BFI = bAMpARSER._parseOneBam(bid)
101
102
103 if bAMpARSER.doCovs or bAMpARSER.doLinks:
104 contig_lengths = \
105 np.array([int(i) for i in
106 c.cast(BFI.contigLengths,
107 c.POINTER(c.c_uint32*BFI.numContigs)).contents
108 ])
109
110 coverages = np.array([[float(j) for j in c.cast(i, c.POINTER(c.c_float*BFI.numBams)).contents] for i in
111 c.cast(BFI.coverages, c.POINTER(c.POINTER(c.c_float*BFI.numBams)*BFI.numContigs)).contents])
112
113
114 if doContigNames:
115 contig_names = []
116 contig_name_lengths = \
117 np.array([int(i) for i in
118 c.cast(BFI.contigNameLengths,
119 c.POINTER(c.c_uint16*BFI.numContigs)
120 ).contents
121 ])
122
123 contig_name_array = \
124 c.cast(BFI.contigNames,
125 c.POINTER(c.POINTER(c.c_char)*BFI.numContigs)
126 ).contents
127
128 for i in range(BFI.numContigs):
129 contig_names.append((c.cast(contig_name_array[i],
130 c.POINTER(c.c_char * \
131 contig_name_lengths[i]
132 )
133 ).contents
134 ).value
135 )
136
137
138 bam_file_name = bAMpARSER.bamFiles[bid]
139 BF = BM_bamFile(bid, bam_file_name)
140 BF_C = \
141 (c.cast(BFI.bamFiles,
142 c.POINTER(c.POINTER(BM_bamFile_C)*1)).contents)[0].contents
143
144 num_types = BF_C.numTypes
145 BTs_C = c.cast(BF_C.types,
146 c.POINTER(c.POINTER(BM_bamType_C)*num_types)).contents
147
148 for bt_c in BTs_C:
149 BT = BM_bamType((bt_c.contents).orientationType,
150 (bt_c.contents).insertSize,
151 (bt_c.contents).insertStdev,
152 (bt_c.contents).supporting)
153 BF.types.append(BT)
154
155 if bAMpARSER.doLinks:
156 links = pythonizeLinks(BFI, BF)
157 else:
158 links = {}
159
160
161 BBFI = BM_fileInfo(coverages,
162 contig_lengths,
163 BFI.numBams,
164 BFI.numContigs,
165 contig_names,
166 [BF],
167 links)
168
169
170 BFI_list.append(BBFI)
171
172
173 pBFI = c.POINTER(BM_fileInfo_C)
174 pBFI = c.pointer(BFI)
175 CW._destroyBFI(pBFI)
176
177 if doContigNames:
178
179 doContigNames = False
180
182 '''Unpeel the links-associated C structs and return a python dictionary
183 of LinkPair instances
184
185 Inputs:
186 BFI - BM_fileInfo_C, C-land BamFileInfo struct
187 bamFile - uid of the bamFile associated with the BFI
188
189 Outputs:
190 A python dictionary of LinkPair instances
191 '''
192 links = {}
193 CW = CWrapper()
194 pBFI = c.POINTER(BM_fileInfo_C)
195 pBFI = c.pointer(BFI)
196
197 LW = BM_LinkWalker_C()
198 pLW = c.POINTER(BM_LinkWalker_C)
199 pLW = c.pointer(LW)
200 success = CW._initLW(pLW, pBFI)
201 if(success == 2):
202 ret_val = 2
203 LP = None
204 while(ret_val != 0):
205 if ret_val == 2:
206
207 LP = BM_linkPair(((LW.pair).contents).cid1,
208 ((LW.pair).contents).cid2)
209
210 links[LP.makeKey()] = LP
211
212 LI = (LW.LI).contents
213 LP.addLink(LI.reversed1,
214 LI.reversed2,
215 LI.pos1,
216 LI.pos2,
217 bamFile)
218 ret_val = CW._stepLW(pLW)
219 CW._destroyLW(pLW)
220
221 return links
222
223
224
225
226
227
229 ''' class for generating coverage profiles and linking read information
230 from several BAM files
231
232 Uses python multiprocessing to parallelize processing
233 '''
234
235 - def __init__(self,
236 coverageType,
237 minLength=0,
238 baseQuality=0,
239 mappingQuality=0,
240 useSuppAlignments=False,
241 useSecondaryAlignments=False,
242 maxMisMatches=1000
243 ):
244 '''
245 Default constructor.
246
247 Set quality thresholds used in later parsing
248
249 Inputs:
250 coverageType - BM_coverageType, stores type of coverage to calculate
251 minLength - int, ignore contigs shorter than this length
252 mappingQuality - int, ignore positions with a lower base quality score
253 mappingQuality - int, skip all reads with a lower mapping quality score
254 useSuppAlignments - == True -> DON'T skip supplementary alignments
255 useSecondaryAlignments - == True -> DON'T skip secondary alignments
256 maxMisMatches - int, skip all reads with more mismatches (NM aux files)
257
258 Outputs:
259 None
260 '''
261
262
263
264 self.baseQuality = baseQuality
265 self.mappingQuality = mappingQuality
266 self.minLength = minLength
267 self.maxMisMatches = maxMisMatches
268
269 if useSuppAlignments:
270 self.ignoreSuppAlignments = 0
271 else:
272 self.ignoreSuppAlignments = 1
273
274 if useSuppAlignments:
275 self.ignoreSecondaryAlignments = 0
276 else:
277 self.ignoreSecondaryAlignments = 1
278
279 self.coverageType = coverageType
280
281
282
283
284 self.BFI = None
285
286
287 self.bamFiles = []
288 self.types = []
289 self.doLinks = False
290 self.doCovs = False
291
292
293
294
295 - def parseBams(self,
296 bamFiles,
297 doLinks=False,
298 doCovs=False,
299 types=None,
300 threads=1,
301 verbose=False):
302 '''Parse bam files to get coverage and linking reads.
303
304 Manages threading
305 Stores results in internal mapping results list
306
307 Inputs:
308 bamFiles - [string], full paths to BAMs
309 doLinks - == True -> find linking pairs
310 doCovs - == True -> calculate coverage profiles
311 types - [int] or None, number of insert types per bamfile
312 threads - int, max number of threads to use
313 verbose - == True -> be verbose
314
315 Outputs:
316 0 if the parsing worked, 1 otherwise
317 '''
318
319 self.bamFiles = bamFiles
320
321
322 if types is None:
323 self.types = [1]*len(self.bamFiles)
324 else:
325 self.types = types
326
327 if len(self.types) != len(self.bamFiles):
328 raise InvalidNumberOfTypesException("%d types for %d BAM files" % \
329 (len(self.types),
330 len(self.bamFiles)))
331
332
333 self.doLinks = doLinks
334 self.doCovs = doCovs
335
336
337 for bam in bamFiles:
338 if not os.path.isfile(bam):
339 raise BAMFileNotFoundException("BAM file %s not found" % bam)
340 elif not os.path.isfile("%s.bai" % bam):
341 raise BAMIndexNotFoundException("Index file %s not found" % \
342 ("%s.bai" % bam))
343
344
345 parse_queue = mp.Queue()
346
347 BFI_list = mp.Manager().list()
348
349
350 for bid in range(len(bamFiles)):
351 parse_queue.put(bid)
352
353
354 for _ in range(threads):
355 parse_queue.put(None)
356
357 try:
358
359 parse_proc = [mp.Process(target=externalParseWrapper,
360 args = (self,
361 parse_queue,
362 BFI_list,
363 verbose,
364 True)
365 )
366 ]
367
368 parse_proc += [mp.Process(target=externalParseWrapper,
369 args = (self,
370 parse_queue,
371 BFI_list,
372 verbose,
373 True)
374 ) for _ in range(threads-1)
375 ]
376
377 for p in parse_proc:
378 p.start()
379
380 for p in parse_proc:
381 p.join()
382
383
384 self.collapseBFIs(BFI_list)
385
386
387 return 0
388
389 except (KeyboardInterrupt, SystemExit):
390
391 for p in parse_proc:
392 p.terminate()
393
394
395 return 1
396
398 '''Parse a single BAM file and append the result
399 to the internal mapping results list
400
401 Called from the ExternalParseWrapper
402
403 Inputs:
404 bid - unique identifier of the BAM to parse
405
406 Outputs:
407 A populated BM_FileInfo_C struct containing the parsing results
408 '''
409
410
411 BFI = BM_fileInfo_C()
412 pBFI = c.POINTER(BM_fileInfo_C)
413 pBFI = c.pointer(BFI)
414
415 BCT = BM_coverageType_C()
416 BCT.type = self.coverageType.cType
417 BCT.upperCut = float(self.coverageType.cUpper)
418 BCT.lowerCut = float(self.coverageType.cLower)
419 pBCT = c.POINTER(BM_coverageType_C)
420 pBCT = c.pointer(BCT)
421
422 bamfiles_c_array = (c.c_char_p * 1)()
423 bamfiles_c_array[:] = [self.bamFiles[bid]]
424
425 types_c_array = (c.c_int * 1)()
426 types_c_array[:] = [self.types[bid]]
427
428 CW = CWrapper()
429 if self.doLinks or self.doCovs:
430 CW._parseCoverageAndLinks(self.doLinks,
431 self.doCovs,
432 1,
433 self.baseQuality,
434 self.mappingQuality,
435 self.minLength,
436 self.maxMisMatches,
437 types_c_array,
438 self.ignoreSuppAlignments,
439 self.ignoreSecondaryAlignments,
440 pBCT,
441 bamfiles_c_array,
442 pBFI)
443 else:
444
445 BCT.type = CT.NONE
446 CW._parseCoverageAndLinks(self.doLinks,
447 self.doCovs,
448 1,
449 0,
450 0,
451 0,
452 0,
453 types_c_array,
454 1,
455 1,
456 pBCT,
457 bamfiles_c_array,
458 pBFI)
459
460 return BFI
461
463 '''Collapse multiple BFI objects into one and
464 set it as member variable BFI
465
466 Only one of the threads will bother to parse contig names. Find it's
467 BFI and then merge all the other BFIs (from other threads) into it
468
469 Inputs:
470 BFI_list - Manager.List, list of all BFIs created during parsing
471
472 Outputs:
473 None
474 '''
475 baseBFI_index = 0
476 if self.doCovs or self.doLinks:
477
478 for i in range(len(BFI_list)):
479 if len(BFI_list[i].contigNames) > 0:
480 baseBFI_index = i
481 break
482
483
484 self.BFI = BFI_list[baseBFI_index]
485 for i in range(len(BFI_list)):
486 if i != baseBFI_index:
487 self.BFI.consume(BFI_list[i])
488
489
490
491
493 '''Print template size and orientation information
494 to a file or to stdout.
495
496 Inputs:
497 fileName - string, full path to output file or ""
498
499 Outputs:
500 None
501 '''
502 if self.BFI is None:
503 raise NoBAMSFoundException
504 else:
505 if fileName == "":
506 self.BFI.printBamTypes(sys.stdout)
507 else:
508 with open(fileName, "w") as fh:
509 self.BFI.printBamTypes(fh)
510
512 '''Print coverage profile information to a file or to stdout
513
514 Inputs:
515 fileName - string, full path to output file or ""
516
517 Outputs:
518 None
519 '''
520 if self.BFI is None:
521 raise NoBAMSFoundException
522 else:
523 if fileName == "":
524 self.BFI.printCoverages(sys.stdout)
525 else:
526 with open(fileName, "w") as fh:
527 self.BFI.printCoverages(fh)
528
530 '''Print paired links information to a file or to stdout
531
532 Inputs:
533 fileName - string, full path to output file or ""
534
535 Outputs:
536 None
537 '''
538 if self.BFI is None:
539 raise NoBAMSFoundException
540 else:
541 if fileName == "":
542 self.BFI.printLinks(sys.stdout)
543 else:
544 with open(fileName, "w") as fh:
545 self.BFI.printLinks(dict(zip(range(len(self.bamFiles)),self.bamFiles)), fh)
546
547
548
549
550
551