-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathRecplot.py
More file actions
1944 lines (1667 loc) · 81.1 KB
/
Recplot.py
File metadata and controls
1944 lines (1667 loc) · 81.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
import sys
import re
import bisect
import sqlite3
import argparse
import platform
import importlib
import os
import zlib
import time
import tempfile
import subprocess
from array import array
from struct import unpack
from urllib.request import pathname2url
if platform.system() != "Windows":
try:
lib = importlib.import_module("pysam")
except:
print("", end = "")
else:
import pysam
def get_sys():
return(platform.system())
CtoPy = { 'A':'<c', 'c':'<b', 'C':'<B', 's':'<h', 'S':'<H', 'i':'<i', 'I':'<I', 'f':'<f' }
py4py = { 'A': 1 , 'c': 1 , 'C': 1 , 's': 2 , 'S': 2 , 'i': 4 , 'I': 4 , 'f': 4 }
dna_codes = '=ACMGRSVTWYHKDBN'
cigar_codes = 'MIDNSHP=X'
parse_codes = {
'sam': ' The current alignment in SAM format.',
'bam': ' All the bytes that make up the current alignment ("read"),\n still in binary just as it was in the BAM file. Useful\n when creating a new BAM file of filtered alignments.',
'sam_qname': ' [1st column in SAM] The QNAME (fragment ID) of the alignment.',
'bam_qname': ' The original bytes before decoding to sam_qname.',
'sam_flag': ' [2nd column in SAM] The FLAG number of the alignment.',
'bam_flag': ' The original bytes before decoding to sam_flag.',
'sam_refID': ' The chromosome ID (not the same as the name!).\n Chromosome names are stored in the BAM header (file_chromosomes),\n so to convert refIDs to chromsome names one needs to do:\n "my_bam.file_chromosomes[read.sam_refID]" (or use sam_rname)\n But for comparisons, using the refID is much faster that using\n the actual chromosome name (for example, when reading through a\n sorted BAM file and looking for where last_refID != this_refID)\n Note that when negative the alignment is not aligned, and thus one\n must not perform my_bam.file_chromosomes[read.sam_refID]\n without checking that the value is positive first.',
'sam_rname': ' [3rd column in SAM] The actual chromosome/contig name for the\n alignment. Will return "*" if refID is negative.',
'bam_refID': ' The original bytes before decoding to sam_refID.',
'sam_pos1': ' [4th column in SAM] The 1-based position of the alignment. Note\n that in SAM format values less than 1 are converted to "0" for\n "no data" and sam_pos1 will also do this.',
'sam_pos0': ' The 0-based position of the alignment. Note that in SAM all\n positions are 1-based, but in BAM they are stored as 0-based.\n Unlike sam_pos1, negative values are kept as negative values,\n essentially giving one the decoded value as it was stored.',
'bam_pos': ' The original bytes before decoding to sam_pos*.',
'sam_mapq': ' [5th column in SAM] The Mapping Quality of the current alignment.',
'bam_mapq': ' The original bytes before decoding to sam_mapq.',
'sam_cigar_string': ' [6th column in SAM] The CIGAR string, as per the SAM format.\n Allowed values are "MIDNSHP=X".',
'sam_cigar_list': ' A list of tuples with 2 values per tuple:\n the number of bases, and the CIGAR operation applied to those\n bases. Faster to calculate than sam_cigar_string.',
'bam_cigar': ' The original bytes before decoding to sam_cigar_*.',
'sam_next_refID': ' The sam_refID of the alignment\'s mate (if any). Note that as per\n sam_refID, this value can be negative and is not the actual\n chromosome name (see sam_pnext1).',
'sam_rnext': ' [7th column in SAM] The chromosome name of the alignment\'s mate.\n Value is "*" if unmapped. Note that in a SAM file this value\n is "=" if it is the same as the sam_rname, however pybam will\n only do this if the user prints the whole SAM entry with "sam".',
'bam_next_refID': ' The original bytes before decoding to sam_next_refID.',
'sam_pnext1': ' [8th column in SAM] The 1-based position of the alignment\'s mate.\n Note that in SAM format values less than 1 are converted to "0"\n for "no data", and sam_pnext1 will also do this.',
'sam_pnext0': ' The 0-based position of the alignment\'s mate. Note that in SAM all\n positions are 1-based, but in BAM they are stored as 0-based.\n Unlike sam_pnext1, negative values are kept as negative values\n here, essentially giving you the value as it was stored in BAM.',
'bam_pnext': ' The original bytes before decoding to sam_pnext0.',
'sam_tlen': ' [9th column in SAM] The TLEN value.',
'bam_tlen': ' The original bytes before decoding to sam_tlen.',
'sam_seq': ' [10th column in SAM] The SEQ value (DNA sequence of the alignment).\n Allowed values are "ACGTMRSVWYHKDBN and =".',
'bam_seq': ' The original bytes before decoding to sam_seq.',
'sam_qual': ' [11th column in SAM] The QUAL value (quality scores per DNA base\n in SEQ) of the alignment.',
'bam_qual': ' The original bytes before decoding to sam_qual.',
'sam_tags_list': ' A list of tuples with 3 values per tuple: a two-letter TAG ID, the\n type code used to describe the data in the TAG value (see SAM spec.\n for details), and the value of the TAG. Note that the BAM format\n has type codes like "c" for a number in the range -127 to +127,\n and "C" for a number in the range of 0 to 255.\n In a SAM file however, all numerical codes appear to just be stored\n using "i", which is a number in the range -2147483647 to +2147483647.\n sam_tags_list will therefore return the code used in the BAM file,\n and not "i" for all numbers.',
'sam_tags_string': ' [12th column a SAM] Returns the TAGs in the same format as would be found \n in a SAM file (with all numbers having a signed 32bit code of "i").',
'bam_tags': ' The original bytes before decoding to sam_tags_*.',
'sam_bin': ' The bin value of the alignment (used for indexing reads).\n Please refer to section 5.3 of the SAM spec for how this\n value is calculated.',
'bam_bin': ' The original bytes before decoding to sam_bin.',
'sam_block_size': ' The number of bytes the current alignment takes up in the BAM\n file minus the four bytes used to store the block_size value\n itself. Essentially sam_block_size +4 == bytes needed to store\n the current alignment.',
'bam_block_size': ' The original bytes before decoding to sam_block_size.',
'sam_l_read_name': ' The length of the QNAME plus 1 because the QNAME is terminated\n with a NUL byte.',
'bam_l_read_name': ' The original bytes before decoding to sam_l_read_name.',
'sam_l_seq': ' The number of bases in the seq. Useful if you just want to know\n how many bases are in the SEQ but do not need to know what those\n bases are (which requires more decoding effort).',
'bam_l_seq': ' The original bytes before decoding to sam_l_seq.',
'sam_n_cigar_op': ' The number of CIGAR operations in the CIGAR field. Useful if one\n wants to know how many CIGAR operations there are, but does not\n need to know what they are.',
'bam_n_cigar_op': ' The original bytes before decoding to sam_n_cigar_op.',
'file_alignments_read': ' A running counter of the number of alignments ("reads"),\n processed thus far. Note the BAM format does not store\n how many reads are in a file, so the usefulness of this\n metric is somewhat limited unless one already knows how\n many reads are in the file.',
'file_binary_header': ' From the first byte in the file, until the first byte of\n the first read. The original binary header.',
'file_bytes_read': ' A running counter of the bytes read from the file. Note\n that as data is read in arbitary chunks, this is literally\n the amount of data read from the file/pipe by pybam.',
'file_chromosome_lengths': ' The binary header of the BAM file includes chromosome names\n and chromosome lengths. This is a dictionary of chromosome-name\n keys and chromosome-length values.',
'file_chromosomes': ' A list of chromosomes from the binary header.',
'file_decompressor': ' BAM files are compressed with bgzip. The value here reflects\n the decompressor used. "internal" if pybam\'s internal\n decompressor is being used, "gzip" or "pigz" if the system\n has these binaries installed and pybam can find them.\n Any other value reflects a custom decompression command.',
'file_directory': ' The directory the input BAM file can be found in. This will be\n correct if the input file is specified via a string or python\n file object, however if the input is a pipe such as sys.stdin, \n then the current working directory will be used.',
'file_header': ' The ASCII portion of the BAM header. This is the typical header\n users of samtools will be familiar with.',
'file_name': ' The file name (base name) of input file if input is a string or\n python file object. If input is via stdin this will be "<stdin>"'
}
class read():
'''
[ Dynamic Parser Example ]
for alignment in pybam.read('/my/data.bam'):
print alignment.sam_seq
[ Static Parser Example ]
for seq,mapq in pybam.read('/my/data.bam',['sam_seq','sam_mapq']):
print seq
print mapq
[ Mixed Parser Example ]
my_bam = pybam.read('/my/data.bam',['sam_seq','sam_mapq'])
print my_bam._static_parser_code
for seq,mapq in my_bam:
if seq.startswith('ACGT') and mapq > 10:
print my_bam.sam
[ Custom Decompressor (from file path) Example ]
my_bam = pybam.read('/my/data.bam.lzma',decompressor='lzma --decompress --stdout /my/data.bam.lzma')
[ Custom Decompressor (from file object) Example ]
my_bam = pybam.read(sys.stdin,decompressor='lzma --decompress --stdout') # data given to lzma via stdin
[ Force Internal bgzip Decompressor ]
my_bam = pybam.read('/my/data.bam',decompressor='internal')
"print pybam.wat" in the python terminal to see the possible parsable values,
or visit http://github.com/JohnLonginotto/pybam for the latest info.
'''
def __init__(self,f,fields=False, decompressor="internal"):
self.file_bytes_read = 0
self.file_chromosomes = []
self.file_alignments_read = 0
self.file_chromosome_lengths = {}
if fields is not False:
print(fields)
if type(fields) is not list or len(fields) == 0:
raise PybamError('\n\nFields for the static parser must be provided as a non-empty list. You gave a ' + str(type(fields)) + '\n')
else:
for field in fields:
if field.startswith('sam') or field.startswith('bam'):
if field not in list(parse_codes.keys()):
raise PybamError('\n\nStatic parser field "' + str(field) + '" from fields ' + str(fields) + ' is not known to this version of pybam!\nPrint "pybam.wat" to see available field names with explinations.\n')
else:
raise PybamError('\n\nStatic parser field "' + str(field) + '" from fields ' + str(fields) + ' does not start with "sam" or "bam" and thus is not an avaliable field for the static parsing.\nPrint "pybam.wat" in interactive python to see available field names with explinations.\n')
if decompressor:
if type(decompressor) is str:
if decompressor != 'internal' and '{}' not in decompressor: raise PybamError('\n\nWhen a custom decompressor is used and the input file is a string, the decompressor string must contain at least one occurence of "{}" to be substituted with a filepath by pybam.\n')
else: raise PybamError('\n\nUser-supplied decompressor must be a string that when run on the command line decompresses a named file (or stdin), to stdout:\ne.g. "lzma --decompress --stdout {}" if pybam is provided a path as input file, where {} is substituted for that path.\nor just "lzma --decompress --stdout" if pybam is provided a file object instead of a file path, as data from that file object will be piped via stdin to the decompression program.\n')
## First we make a generator that will return chunks of uncompressed data, regardless of how we choose to decompress:
def generator():
DEVNULL = open(os.devnull, 'wb')
# First we need to figure out what sort of file we have - whether it's gzip compressed, uncompressed, or something else entirely!
if type(f) is str:
try:
self._file = open(f,'rb')
except:
raise PybamError('\n\nCould not open "' + str(self._file.name) + '" for reading!\n')
try:
magic = self._file.read(4)
except:
raise PybamError('\n\nCould not read from "' + str(self._file.name) + '"!\n')
elif type(f) is file:
self._file = f
try:
magic = self._file.read(4)
except:
raise PybamError('\n\nCould not read from "' + str(self._file.name) + '"!\n')
else:
raise PybamError('\n\nInput file was not a string or a file object. It was: "' + str(f) + '"\n')
self.file_name = os.path.basename(os.path.realpath(self._file.name))
self.file_directory = os.path.dirname(os.path.realpath(self._file.name))
if magic == b'BAM\1':
# The user has passed us already unzipped BAM data! Job done :)
data = b'BAM\1' + self._file.read(35536)
self.file_bytes_read += len(data)
self.file_decompressor = 'None'
while data:
yield data
data = self._file.read(35536)
self.file_bytes_read += len(data)
self._file.close()
DEVNULL.close()
return
elif magic == b"\x1f\x8b\x08\x04": # The user has passed us compressed gzip/bgzip data, which is typical for a BAM file
if decompressor is not False and decompressor != 'internal':
if type(f) is str:
self._subprocess = subprocess.Popen( decompressor.replace('{}',f), shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
else:
self._subprocess = subprocess.Popen('{ printf "'+magic+'"; cat; } | ' + decompressor, stdin=self._file, shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
self.file_decompressor = decompressor
data = self._subprocess.stdout.read(35536)
self.file_bytes_read += len(data)
while data:
yield data
data = self._subprocess.stdout.read(35536)
self.file_bytes_read += len(data)
self._file.close()
DEVNULL.close()
return
# else look for pigz or gzip:
else:
try:
self._subprocess = subprocess.Popen(["pigz"],stdin=DEVNULL,stdout=DEVNULL,stderr=DEVNULL)
if self._subprocess.returncode is None: self._subprocess.kill()
use = 'pigz'
except OSError:
try:
self._subprocess = subprocess.Popen(["gzip"],stdin=DEVNULL,stdout=DEVNULL,stderr=DEVNULL)
if self._subprocess.returncode is None: self._subprocess.kill()
use = 'gzip'
except OSError:
use = 'internal'
if use != 'internal' and decompressor != 'internal':
if type(f) is str: self._subprocess = subprocess.Popen([ use , '--decompress','--stdout', f ], stdout=subprocess.PIPE, stderr=DEVNULL)
else: self._subprocess = subprocess.Popen('{ printf "'+magic+'"; cat; } | ' + use + ' --decompress --stdout', stdin=f, shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
time.sleep(1)
if self._subprocess.poll() == None:
data = self._subprocess.stdout.read(35536)
self.file_decompressor = use
self.file_bytes_read += len(data)
while data:
yield data
data = self._subprocess.stdout.read(35536)
self.file_bytes_read += len(data)
self._file.close()
DEVNULL.close()
return
# Python's gzip module can't read from a stream that doesn't support seek(), and the zlib module cannot read the bgzip format without a lot of help:
self.file_decompressor = 'internal'
raw_data = magic + self._file.read(65536)
self.file_bytes_read = len(raw_data)
internal_cache = []
blocks_left_to_grab = 50
bs = 0
checkpoint = 0
decompress = zlib.decompress
while raw_data:
if len(raw_data) - bs < 35536:
raw_data = raw_data[bs:] + self._file.read(65536)
self.file_bytes_read += len(raw_data) - bs
bs = 0
magic = raw_data[bs:bs+4]
if not magic: break # a child's heart
if magic != b"\x1f\x8b\x08\x04": raise PybamError('\n\nThe input file is not in a format I understand. First four bytes: ' + repr(magic) + '\n')
try:
more_bs = bs + unpack("<H", raw_data[bs+16:bs+18])[0] +1
internal_cache.append(decompress(raw_data[bs+18:more_bs-8],-15))
bs = more_bs
except: ## zlib doesnt have a nice exception for when things go wrong. just "error"
header_data = magic + raw_data[bs+4:bs+12]
header_size = 12
extra_len = unpack("<H", header_data[-2:])[0]
while header_size-12 < extra_len:
header_data += raw_data[bs+12:bs+16]
subfield_id = header_data[-4:-2]
subfield_len = unpack("<H", header_data[-2:])[0]
subfield_data = raw_data[bs+16:bs+16+subfield_len]
header_data += subfield_data
header_size += subfield_len + 4
if subfield_id == 'BC': block_size = unpack("<H", subfield_data)[0]
raw_data = raw_data[bs+16+subfield_len:bs+16+subfield_len+block_size-extra_len-19]
crc_data = raw_data[bs+16+subfield_len+block_size-extra_len-19:bs+16+subfield_len+block_size-extra_len-19+8] # I have left the numbers in verbose, because the above try is the optimised code.
bs = bs+16+subfield_len+block_size-extra_len-19+8
zipped_data = header_data + raw_data + crc_data
internal_cache.append(decompress(zipped_data,47)) # 31 works the same as 47.
# Although the following in the bgzip code from biopython, its not needed if you let zlib decompress the whole zipped_data, header and crc, because it checks anyway (in C land)
# I've left the manual crc checks in for documentation purposes:
'''
expected_crc = crc_data[:4]
expected_size = unpack("<I", crc_data[4:])[0]
if len(unzipped_data) != expected_size: print 'ERROR: Failed to unpack due to a Type 1 CRC error. Could the BAM be corrupted?'; exit()
crc = zlib.crc32(unzipped_data)
if crc < 0: crc = pack("<i", crc)
else: crc = pack("<I", crc)
if expected_crc != crc: print 'ERROR: Failed to unpack due to a Type 2 CRC error. Could the BAM be corrupted?'; exit()
'''
blocks_left_to_grab -= 1
if blocks_left_to_grab == 0:
yield b''.join(internal_cache)
internal_cache = []
blocks_left_to_grab = 50
self._file.close()
DEVNULL.close()
if internal_cache != b'':
yield b''.join(internal_cache)
return
elif decompressor is not False and decompressor != 'internal':
# It wouldn't be safe to just print to the shell four random bytes from the beginning of a file, so instead it's
# written to a temp file and cat'd. The idea here being that we trust the decompressor string as it was written by
# someone with access to python, so it has system access anyway. The file/data, however, should not be trusted.
magic_file = os.path.join(tempfile.mkdtemp(),'magic')
with open(magic_file,'wb') as mf:
mf.write(magic)
if type(f) is str:
self._subprocess = subprocess.Popen(decompressor.replace('{}',f), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
self._subprocess = subprocess.Popen('{ cat "'+magic_file+'"; cat; } | ' + decompressor, stdin=self._file, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
self.file_decompressor = decompressor
data = self._subprocess.stdout.read(35536)
self.file_bytes_read += len(data)
while data:
yield data
data = self._subprocess.stdout.read(35536)
self.file_bytes_read += len(data)
self._file.close()
DEVNULL.close()
return
else:
raise PybamError('\n\nThe input file is not in a format I understand. First four bytes: ' + repr(magic) + '\n')
## At this point, we know that whatever decompression method was used, a call to self._generator will return some uncompressed data.
self._generator = generator()
print(self._generator)
## So lets parse the BAM header:
header_cache = b''
while len(header_cache) < 4:
header_cache += next(self._generator)
p_from = 0; p_to = 4
if header_cache[p_from:p_to] != b'BAM\x01':
raise PybamError('\n\nInput file ' + self.file_name + ' does not appear to be a BAM file.\n')
## Parse the BAM header:
p_from = p_to; p_to += 4
length_of_header = unpack('<i',header_cache[p_from:p_to])[0]
p_from = p_to; p_to += length_of_header
while len(header_cache) < p_to: header_cache += str(next(self._generator))
self.file_header = header_cache[p_from:p_to]
p_from = p_to; p_to += 4
while len(header_cache) < p_to: header_cache += str(next(self._generator))
number_of_reference_sequences = unpack('<i',header_cache[p_from:p_to])[0]
for _ in range(number_of_reference_sequences):
p_from = p_to; p_to += 4
while len(header_cache) < p_to: header_cache += next(self._generator)
l_name = unpack('<l',header_cache[p_from:p_to])[0]
p_from = p_to; p_to += l_name
while len(header_cache) < p_to: header_cache += next(self._generator)
self.file_chromosomes.append(header_cache[p_from:p_to -1].decode('ascii'))
p_from = p_to; p_to += 4
while len(header_cache) < p_to: header_cache += next(self._generator)
self.file_chromosome_lengths[self.file_chromosomes[-1]] = unpack('<l',header_cache[p_from:p_to])[0]
self.file_bytes_read = p_to
self.file_binary_header = memoryview(header_cache[:p_to])
header_cache = header_cache[p_to:]
# A quick check to make sure the header of this BAM file makes sense:
chromosomes_from_header = []
for line in str(self.file_header).split('\\n'):
if line.startswith('@SQ\\tSN:'):
chromosomes_from_header.append(line.split('\\t')[1][3:])
if chromosomes_from_header != self.file_chromosomes:
raise PybamWarn('For some reason the BAM format stores the chromosome names in two locations,\n the ASCII text header we all know and love, viewable with samtools view -H, and another special binary header\n which is used to translate the chromosome refID (a number) into a chromosome RNAME when you do bam -> sam.\n\nThese two headers should always be the same, but apparently they are not:\nThe ASCII header looks like: ' + self.file_header + '\nWhile the binary header has the following chromosomes: ' + self.file_chromosomes + '\n')
#HL - decoding header
self.file_header = self.file_header.decode()
## Variable parsing:
def new_entry(header_cache):
cache = header_cache # we keep a small cache of X bytes of decompressed BAM data, to smoothen out disk access.
p = 0 # where the next alignment/entry starts in the cache
while True:
try:
while len(cache) < p + 4: cache = cache[p:] + next(self._generator); p = 0 # Grab enough bytes to parse blocksize
self.sam_block_size = unpack('<i',cache[p:p+4])[0]
self.file_alignments_read += 1
while len(cache) < p + 4 + self.sam_block_size:
cache = cache[p:] + next(self._generator); p = 0 # Grab enough bytes to parse entry
except StopIteration: break
self.bam = cache[p:p + 4 + self.sam_block_size]
p = p + 4 + self.sam_block_size
yield self
self._new_entry = new_entry(header_cache)
def compile_parser(self,fields):
temp_code = ''
end_of_qname = False
end_of_cigar = False
end_of_seq = False
end_of_qual = False
dependencies = set(fields)
if 'bam' in fields:
fields[fields.index('bam')] = 'self.bam'
if 'sam_block_size' in fields:
fields[fields.index('sam_block_size')] = 'self.sam_block_size'
if 'sam' in dependencies:
dependencies.update(['sam_qname','sam_flag','sam_rname','sam_pos1','sam_mapq','sam_cigar_string','bam_refID','bam_next_refID','sam_rnext','sam_pnext1','sam_tlen','sam_seq','sam_qual','sam_tags_string'])
if 'sam_tags_string' in dependencies:
dependencies.update(['sam_tags_list'])
if 'sam_pos1' in dependencies:
temp_code += "\n sam_pos1 = (0 if sam_pos0 < 0 else sam_pos0 + 1)"
dependencies.update(['sam_pos0'])
if 'sam_pnext1' in dependencies:
temp_code += "\n sam_pnext1 = (0 if sam_pnext0 < 0 else sam_pnext0 + 1)"
dependencies.update(['sam_pnext0'])
if 'sam_qname' in dependencies or 'bam_qname' in dependencies:
temp_code += "\n _end_of_qname = 36 + sam_l_read_name"
dependencies.update(['sam_l_read_name'])
end_of_qname = True
if 'sam_cigar_string' in dependencies or 'sam_cigar_list' in dependencies or 'bam_cigar' in dependencies:
if end_of_qname:
pass
else:
temp_code += "\n _end_of_qname = 36 + sam_l_read_name"
temp_code += "\n _end_of_cigar = _end_of_qname + (4*sam_n_cigar_op)"
dependencies.update(['sam_l_read_name','sam_n_cigar_op'])
end_of_cigar = True
if 'sam_seq' in dependencies or 'bam_seq' in dependencies:
if end_of_cigar:
pass
elif end_of_qname:
temp_code += "\n _end_of_cigar = _end_of_qname + (4*sam_n_cigar_op)"
else:
temp_code += "\n _end_of_cigar = 36 + sam_l_read_name + (4*sam_n_cigar_op)"
temp_code += "\n _end_of_seq = _end_of_cigar + (-((-sam_l_seq)//2))"
dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
end_of_seq = True
if 'sam_qual' in dependencies or 'bam_qual' in dependencies:
if end_of_seq:
pass
elif end_of_cigar:
temp_code += "\n _end_of_seq = _end_of_cigar + (-((-sam_l_seq)//2))"
elif end_of_qname:
temp_code += "\n _end_of_seq = _end_of_qname + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2))"
else:
temp_code += "\n _end_of_seq = 36 + sam_l_read_name + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2))"
temp_code += "\n _end_of_qual = _end_of_seq + sam_l_seq"
dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
end_of_qual = True
if 'sam_tags_list' in dependencies or 'bam_tags' in dependencies:
if end_of_qual:
pass
elif end_of_seq:
temp_code += "\n _end_of_qual = _end_of_seq + sam_l_seq"
elif end_of_cigar:
temp_code += "\n _end_of_qual = _end_of_cigar + (-((-sam_l_seq)//2)) + sam_l_seq"
elif end_of_qname:
temp_code += "\n _end_of_qual = _end_of_qname + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2)) + sam_l_seq"
else:
temp_code += "\n _end_of_qual = 36 + sam_l_read_name + (4*sam_n_cigar_op) + (-((-sam_l_seq)//2)) + sam_l_seq"
dependencies.update(['sam_l_seq','sam_n_cigar_op','sam_l_read_name'])
if 'sam_rname' in dependencies:
temp_code += "\n sam_rname = '*' if sam_refID < 0 else self.file_chromosomes[sam_refID]"
dependencies.update(['sam_refID'])
if 'sam_rnext' in dependencies:
temp_code += "\n sam_rnext = '*' if sam_next_refID < 0 else self.file_chromosomes[sam_next_refID]"
dependencies.update(['sam_next_refID'])
## First we figure out what data from the static portion of the BAM entry we'll need:
tmp = {}
tmp['code'] = 'def parser(self):\n from array import array\n from struct import unpack\n for _ in self._new_entry:'
tmp['last_start'] = None
tmp['name_list'] = []
tmp['dtype_list'] = []
def pack_up(name,dtype,length,end,tmp):
if name in dependencies:
if tmp['last_start'] is None:
tmp['last_start'] = end - length
tmp['name_list'].append(name)
tmp['dtype_list'].append(dtype)
elif tmp['last_start'] is not None:
tmp['code'] += '\n ' + ', '.join(tmp['name_list']) + ' = unpack("<' + ''.join(tmp['dtype_list']) + '",self.bam[' + str(tmp['last_start']) + ':' + str(end-length) + '])'
if len(tmp['dtype_list']) == 1:
tmp['code'] += '[0]'
tmp['last_start'] = None
tmp['name_list'] = []
tmp['dtype_list'] = []
pack_up('sam_refID', 'i',4, 8,tmp)
pack_up('sam_pos0', 'i',4,12,tmp)
pack_up('sam_l_read_name', 'B',1,13,tmp)
pack_up('sam_mapq', 'B',1,14,tmp)
pack_up('sam_bin', 'H',2,16,tmp)
pack_up('sam_n_cigar_op', 'H',2,18,tmp)
pack_up('sam_flag', 'H',2,20,tmp)
pack_up('sam_l_seq', 'i',4,24,tmp)
pack_up('sam_next_refID', 'i',4,28,tmp)
pack_up('sam_pnext0', 'i',4,32,tmp)
pack_up('sam_tlen', 'i',4,36,tmp)
pack_up( None, None,0,36,tmp) # To add anything not yet added.
code = tmp['code']
del tmp
code += temp_code
# Fixed-length BAM data (where we just grab the bytes, we dont unpack) can, however, be grabbed individually.
if 'bam_block_size' in dependencies: code += "\n bam_block_size = self.bam[0 : 4 ]"
if 'bam_refID' in dependencies: code += "\n bam_refID = self.bam[4 : 8 ]"
if 'bam_pos' in dependencies: code += "\n bam_pos = self.bam[8 : 12 ]"
if 'bam_l_read_name' in dependencies: code += "\n bam_l_read_name = self.bam[12 : 13 ]"
if 'bam_mapq' in dependencies: code += "\n bam_mapq = self.bam[13 : 14 ]"
if 'bam_bin' in dependencies: code += "\n bam_bin = self.bam[14 : 16 ]"
if 'bam_n_cigar_op' in dependencies: code += "\n bam_n_cigar_op = self.bam[16 : 18 ]"
if 'bam_flag' in dependencies: code += "\n bam_flag = self.bam[18 : 20 ]"
if 'bam_l_seq' in dependencies: code += "\n bam_l_seq = self.bam[20 : 24 ]"
if 'bam_next_refID' in dependencies: code += "\n bam_next_refID = self.bam[24 : 28 ]"
if 'bam_pnext' in dependencies: code += "\n bam_pnext = self.bam[28 : 32 ]"
if 'bam_tlen' in dependencies: code += "\n bam_tlen = self.bam[32 : 36 ]"
if 'bam_qname' in dependencies: code += "\n bam_qname = self.bam[36 : _end_of_qname ]"
if 'bam_cigar' in dependencies: code += "\n bam_cigar = self.bam[_end_of_qname : _end_of_cigar ]"
if 'bam_seq' in dependencies: code += "\n bam_seq = self.bam[_end_of_cigar : _end_of_seq ]"
if 'bam_qual' in dependencies: code += "\n bam_qual = self.bam[_end_of_seq : _end_of_qual ]"
if 'bam_tags' in dependencies: code += "\n bam_tags = self.bam[_end_of_qual : ]"
if 'sam_qname' in dependencies:
if 'bam_qname' in dependencies: code += "\n sam_qname = bam_qname[:-1]"
else: code += "\n sam_qname = self.bam[36 : _end_of_qname -1 ]"
if 'sam_cigar_list' in dependencies:
if 'bam_cigar' in dependencies: code += "\n sam_cigar_list = [( cig >> 4 , cigar_codes[cig & 0b1111]) for cig in array('I', bam_cigar) ]"
else: code += "\n sam_cigar_list = [( cig >> 4 , cigar_codes[cig & 0b1111]) for cig in array('I', self.bam[_end_of_qname : _end_of_cigar]) ]"
if 'sam_cigar_string'in dependencies:
if 'bam_cigar' in dependencies: code += "\n sam_cigar_string = ''.join([ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', bam_cigar)])"
else: code += "\n sam_cigar_string = ''.join([ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', self.bam[_end_of_qname : _end_of_cigar]) ])"
if 'sam_seq' in dependencies:
if 'bam_seq' in dependencies: code += "\n sam_seq = ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', bam_seq)])[:sam_l_seq]"
else: code += "\n sam_seq = ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', self.bam[_end_of_cigar : _end_of_seq])])[:sam_l_seq]"
if 'sam_qual' in dependencies:
if 'bam_qual' in dependencies: code += "\n sam_qual = b''.join( [ chr(ord(quality) + 33) for quality in bam_qual ])"
else: code += "\n sam_qual = b''.join( [ chr(ord(quality) + 33) for quality in self.bam[_end_of_seq : _end_of_qual ]])"
if 'sam_tags_list' in dependencies:
code += '''
sam_tags_list = []
offset = _end_of_qual
while offset != len(self.bam):
tag_name = self.bam[offset:offset+2]
tag_type = self.bam[offset+2]
if tag_type == 'Z':
offset_end = self.bam.index('\\0',offset+3)+1
tag_data = self.bam[offset+3:offset_end-1]
elif tag_type in CtoPy:
offset_end = offset+3+py4py[tag_type]
tag_data = unpack(CtoPy[tag_type],self.bam[offset+3:offset_end])[0]
elif tag_type == 'B':
offset_end = offset+8+(unpack('<i',self.bam[offset+4:offset+8])[0]*py4py[self.bam[offset+3]])
tag_data = array(self.bam[offset+3] , self.bam[offset+8:offset_end] )
else:
print 'PYBAM ERROR: I dont know how to parse BAM tags in this format: ',repr(tag_type)
print ' This is simply because I never saw this kind of tag during development.'
print ' If you could mail the following chunk of text to john at john.uk.com, i will fix this up for everyone :)'
print repr(tag_type),repr(self.bam[offset+3:end])
exit()
sam_tags_list.append((tag_name,tag_type,tag_data))
offset = offset_end'''
if 'sam_tags_string' in dependencies:
code += "\n sam_tags_string = '\t'.join(A + ':' + ('i' if B in 'cCsSI' else B) + ':' + ((C.typecode + ',' + ','.join(map(str,C))) if type(C)==array else str(C)) for A,B,C in self.sam_tags_list)"
if 'sam' in dependencies:
code += "\n sam = sam_qname + '\t' + str(sam_flag) + '\t' + sam_rname + '\t' + str(sam_pos1) + '\t' + str(sam_mapq) + '\t' + ('*' if sam_cigar_string == '' else sam_cigar_string) + '\t' + ('=' if bam_refID == bam_next_refID else sam_rnext) + '\t' + str(sam_pnext1) + '\t' + str(sam_tlen) + '\t' + sam_seq + '\t' + sam_qual + '\t' + sam_tags_string"
code += '\n yield ' + ','.join([x for x in fields]) + '\n'
self._static_parser_code = code # "code" is the static parser's code as a string (a function called "parser")
exec_dict = { # This dictionary stores things the exec'd code needs to know about, and will store the compiled function after exec()
'unpack':unpack,
'array':array,
'dna_codes':dna_codes,
'CtoPy':CtoPy,
'py4py':py4py,
'cigar_codes':cigar_codes
}
exec(code, exec_dict) # exec() compiles "code" to real code, creating the "parser" function and adding it to exec_dict['parser']
return(exec_dict['parser'])
if fields:
static_parser = compile_parser(self,fields)(self)
def next_read(): return next(self._new_entry)
else:
def next_read(): return next(self._new_entry)
self.next = next_read
def __next__(self):
return next(self._new_entry)
def __iter__(self): return self
def __str__(self): return self.sam
## Methods to pull out raw bam data from entry (so still in its binary encoding). This can be helpful in some scenarios.
@property
def bam_block_size(self): return self.bam[ : 4 ]
@property
def bam_refID(self): return self.bam[ 4 : 8 ]
@property
def bam_pos(self): return self.bam[ 8 : 12 ]
@property
def bam_l_read_name(self): return self.bam[ 12 : 13 ]
@property
def bam_mapq(self): return self.bam[ 13 : 14 ]
@property
def bam_bin(self): return self.bam[ 14 : 16 ]
@property
def bam_n_cigar_op(self): return self.bam[ 16 : 18 ]
@property
def bam_flag(self): return self.bam[ 18 : 20 ]
@property
def bam_l_seq(self): return self.bam[ 20 : 24 ]
@property
def bam_next_refID(self): return self.bam[ 24 : 28 ]
@property
def bam_pnext(self): return self.bam[ 28 : 32 ]
@property
def bam_tlen(self): return self.bam[ 32 : 36 ]
@property
def bam_qname(self): return self.bam[ 36 : self._end_of_qname ]
@property
def bam_cigar(self): return self.bam[ self._end_of_qname : self._end_of_cigar ]
@property
def bam_seq(self): return self.bam[ self._end_of_cigar : self._end_of_seq ]
@property
def bam_qual(self): return self.bam[ self._end_of_seq : self._end_of_qual ]
@property
def bam_tags(self): return self.bam[ self._end_of_qual : ]
@property
def sam_refID(self): return unpack( '<i', self.bam[ 4 : 8 ] )[0]
@property
def sam_pos0(self): return unpack( '<i', self.bam[ 8 : 12 ] )[0]
@property
def sam_l_read_name(self): return unpack( '<B', self.bam[ 12 : 13 ] )[0]
@property
def sam_mapq(self): return unpack( '<B', self.bam[ 13 : 14 ] )[0]
@property
def sam_bin(self): return unpack( '<H', self.bam[ 14 : 16 ] )[0]
@property
def sam_n_cigar_op(self): return unpack( '<H', self.bam[ 16 : 18 ] )[0]
@property
def sam_flag(self): return unpack( '<H', self.bam[ 18 : 20 ] )[0]
@property
def sam_l_seq(self): return unpack( '<i', self.bam[ 20 : 24 ] )[0]
@property
def sam_next_refID(self): return unpack( '<i', self.bam[ 24 : 28 ] )[0]
@property
def sam_pnext0(self): return unpack( '<i', self.bam[ 28 : 32 ] )[0]
@property
def sam_tlen(self): return unpack( '<i', self.bam[ 32 : 36 ] )[0]
@property
def sam_qname(self): return self.bam[ 36 : self._end_of_qname -1 ].decode() # -1 to remove trailing NUL byte
@property
def sam_cigar_list(self): return [ (cig >> 4 , cigar_codes[cig & 0b1111] ) for cig in array('I', self.bam[self._end_of_qname : self._end_of_cigar ])]
@property
def sam_cigar_string(self): return ''.join( [ str(cig >> 4) + cigar_codes[cig & 0b1111] for cig in array('I', self.bam[self._end_of_qname : self._end_of_cigar ])])
@property
def sam_seq(self): return ''.join( [ dna_codes[dna >> 4] + dna_codes[dna & 0b1111] for dna in array('B', self.bam[self._end_of_cigar : self._end_of_seq ])])[:self.sam_l_seq] # As DNA is 4 bits packed 2-per-byte, there might be a trailing '0000', so we can either
@property
def sam_qual(self):
return ''.join( [ chr(quality + 33) for quality in self.bam[self._end_of_seq : self._end_of_qual ]])
@property
def sam_tags_list(self):
result = []
offset = self._end_of_qual
while offset != len(self.bam):
tag_name = self.bam[offset:offset+2].decode()
tag_type = chr(self.bam[offset+2])
if tag_type == 'Z':
offset_end = self.bam.index(b'\x00',offset+3)+1
tag_data = self.bam[offset+3:offset_end-1].decode()
elif tag_type in CtoPy:
offset_end = offset+3+py4py[tag_type]
tag_data = unpack(CtoPy[tag_type],self.bam[offset+3:offset_end])[0]
elif tag_type == 'B':
offset_end = offset+8+(unpack('<i',self.bam[offset+4:offset+8])[0]*py4py[self.bam[offset+3]])
tag_data = array(self.bam[offset+3] , self.bam[offset+8:offset_end] )
else:
print('PYBAM ERROR: I dont know how to parse BAM tags in this format: ',repr(tag_type))
print(' This is simply because I never saw this kind of tag during development.')
print(' If you could mail the following chunk of text to john at john.uk.com, ill fix this up :)')
print(repr(tag_type),repr(self.bam[offset+3:end]))
exit()
result.append((tag_name,tag_type,tag_data))
offset = offset_end
return result
@property
def sam_tags_string(self):
return '\t'.join(A + ':' + ('i' if B in 'cCsSI' else B) + ':' + ((C.typecode + ',' + ','.join(map(str,C))) if type(C)==array else str(C)) for A,B,C in self.sam_tags_list)
## BONUS methods - methods that mimic how samtools works.
@property
def sam_pos1(self): return 0 if self.sam_pos0 < 0 else self.sam_pos0 + 1
@property
def sam_pnext1(self): return 0 if self.sam_pnext0 < 0 else self.sam_pnext0 + 1
@property
def sam_rname(self): return '*' if self.sam_refID < 0 else self.file_chromosomes[self.sam_refID ]
@property
def sam_rnext(self): return '*' if self.sam_next_refID < 0 else self.file_chromosomes[self.sam_next_refID]
@property
def sam(self): return (
self.sam_qname + '\t' +
str(self.sam_flag) + '\t' +
self.sam_rname + '\t' +
str(self.sam_pos1) + '\t' +
str(self.sam_mapq) + '\t' +
('*' if self.sam_cigar_string == '' else self.sam_cigar_string) + '\t' +
('=' if self.bam_refID == self.bam_next_refID else self.sam_rnext) + '\t' +
str(self.sam_pnext1) + '\t' +
str(self.sam_tlen) + '\t' +
self.sam_seq + '\t' +
self.sam_qual + '\t' +
self.sam_tags_string
)
## Internal methods - methods used to calculate where variable-length blocks start/end
@property
def _end_of_qname(self): return self.sam_l_read_name + 36 # fixed-length stuff at the beginning takes up 36 bytes.
@property
def _end_of_cigar(self): return self._end_of_qname + (4*self.sam_n_cigar_op) # 4 bytes per n_cigar_op
@property
def _end_of_seq(self): return self._end_of_cigar + (-((-self.sam_l_seq)//2)) # {blurgh}
@property
def _end_of_qual(self): return self._end_of_seq + self.sam_l_seq # qual has the same length as seq
def __del__(self):
if self._subprocess.returncode is None: self._subprocess.kill()
self._file.close()
class PybamWarn(Exception): pass
class PybamError(Exception): pass
def parse_to_mags_identical(contig_file_name, out_file_name):
contigs = open(contig_file_name, 'r')
mags = open(out_file_name, "w")
for line in contigs:
if line[0] == ">":
#set to new contig. One final loop of starts ends counts is needed
current_contig = line[1:].strip().split()[0]
print(current_contig, current_contig, sep = "\t", file = mags)
contigs.close()
mags.close()
return("")
def tables_in_sqlite_db(conn):
tabs = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = [
v[0] for v in tabs.fetchall()
if v[0] != "sqlite_sequence"
]
return tables
def sqldb_creation(contigs, mags, sample_reads, map_format, database):
""" Read information provided by user and creates SQLite3 database
Arguments:
contigs {str} -- Location of fasta file with contigs of interest.
mags {str} -- Location of tab separated file with contigs and their corresponding mags.
sample_reads {list} -- Location of one or more read mapping results file(s).
format {str} -- Format of read mapping (blast or sam).
database {str} -- Name (location) of database to create.
"""
# ===== Database and table creation =====
# Create or open database
conn = sqlite3.connect(database)
cursor = conn.cursor()
#Check if there are tables ahead of time
tables = tables_in_sqlite_db(cursor)
#Clean out the old DB to begin with; effectively reinitialize.
for table in tables:
cursor.execute('DROP TABLE IF EXISTS '+ table)
# Create lookup table (always creates a new one)
cursor.execute('DROP TABLE IF EXISTS lookup_table')
cursor.execute('CREATE TABLE lookup_table \
(mag_name TEXT, mag_id INTEGER, contig_name TEXT, contig_id INTEGER)')
# Create sample_info, mag_info, mags_per_sample, and gene_info tables
cursor.execute('DROP TABLE IF EXISTS mag_info')
cursor.execute('DROP TABLE IF EXISTS sample_info')
cursor.execute('DROP TABLE IF EXISTS mags_per_sample')
cursor.execute('CREATE TABLE mag_info \
(mag_id INTEGER, contig_id INTEGER, contig_len INTEGER)')
cursor.execute('CREATE TABLE sample_info \
(sample_name TEXT, sample_id TEXT, sample_number INTEGER)')
cursor.execute('CREATE TABLE mags_per_sample \
(sample_name TEXT, mag_name TEXT)')
# ========
# === Extract sample information and save in into DB ===
# Rename samples provided to avoid illegal names on files
sampleid_to_sample = {}
samples_to_db = []
sample_number = 1
for sample_name in sample_reads:
sample_id = "sample_" + str(sample_number)
samples_to_db.append((sample_name, sample_id, sample_number))
sampleid_to_sample[sample_id] = sample_name
sample_number += 1
# Enter information into table
cursor.execute("begin")
cursor.executemany('INSERT INTO sample_info VALUES(?, ?, ?)', samples_to_db)
cursor.execute('CREATE UNIQUE INDEX sample_index ON sample_info (sample_name)')
cursor.execute("commit")
# ========
# === Extract contig information and MAG correspondence. Save into DB. ===
# Get contig sizes
contig_sizes = read_contigs(contigs)
# Get contig - MAG information
contig_mag_corresp = get_mags(mags)
# Initialize variables
contig_identifiers = []
mag_ids = {}
mag_id = 0
contig_id = 1
# The dictionary contig_information is important for speed when filling tables
contig_information = {}
# Iterate through contig - MAG pairs
for contig_name, mag_name in contig_mag_corresp.items():
# Store MAG (and contig) names and ids
if mag_name in mag_ids:
contig_identifiers.append((mag_name, mag_ids[mag_name], contig_name, contig_id))
contig_information[contig_name] = [mag_name, mag_ids[mag_name], contig_id]
contig_id += 1
else:
mag_id += 1
mag_ids[mag_name] = mag_id
contig_identifiers.append((mag_name, mag_ids[mag_name], contig_name, contig_id))
contig_information[contig_name] = [mag_name, mag_ids[mag_name], contig_id]
contig_id += 1
cursor.executemany('INSERT INTO lookup_table VALUES(?, ?, ?, ?)', contig_identifiers)
cursor.execute('CREATE INDEX mag_name_index ON lookup_table (mag_name)')
conn.commit()
# ========
# === Fill contig length table ===
contig_lengths = []
for contig, contig_len in contig_sizes.items():
# Get mag_id and contig_id
sql_command = 'SELECT mag_id, contig_id from lookup_table WHERE contig_name = ?'
cursor.execute(sql_command, (contig,))
mag_contig_id = cursor.fetchone()
contig_lengths.append((mag_contig_id[0], mag_contig_id[1], contig_len))
cursor.executemany('INSERT INTO mag_info VALUES(?, ?, ?)', contig_lengths)
cursor.execute('CREATE INDEX mag_id_index ON mag_info (mag_id)')
conn.commit()
# ========
# === Create one table with information per sample ===
for sample_name in sampleid_to_sample.keys():
# Drop if they exist
cursor.execute('DROP TABLE IF EXISTS ' + sample_name)
# Create tables once again
cursor.execute('CREATE TABLE ' + sample_name + \
' (mag_id INTEGER, contig_id INTEGER, identity FLOAT, start INTEGER, stop INTEGER)')
# === Retrieve information from read mapping and store it ===
# Read read mapping file for each sample and fill corresponding table
for sample_name, mapping_file in sampleid_to_sample.items():
mags_in_sample = []
contigs_in_sample = save_reads_mapped(mapping_file, sample_name, map_format, cursor, conn)
cursor.execute('SELECT contig_name, mag_name, mag_id FROM lookup_table')
all_contigs = cursor.fetchall()
for element in all_contigs:
if element[0] in contigs_in_sample:
if element[1] not in mags_in_sample:
mags_in_sample.append(element[1])
else:
continue
else:
continue
mags_in_sample = [(mapping_file, x) for x in mags_in_sample]
cursor.executemany('INSERT INTO mags_per_sample VALUES(?, ?)', mags_in_sample)
conn.commit()
cursor.close()
conn.commit()
conn.close()
#This is now written with bamnostic so that windows supports bam format, too.
def save_reads_mapped(mapping_file, sample_name, map_format, cursor, conn):
""" This script reads a read mapping file, extracts the contig to which each read maps,
the percent id, the start and stop, and stores it in a table per sample.
Arguments:
mapping_file {str} -- Location of read mapping file.
sample_name {str} -- Name of database sample name (form sample_#)
map_format {str} -- Format of read mapping results (blast or sam)
cursor {obj} -- Cursor to execute db instructions
conn {obj} -- Connection handler to db.
"""
assert (map_format == "sam" or map_format == "bam" or map_format == "blast"), "Mapping format not recognized. Must be one of 'sam' 'bam' or 'blast'"
contig_mag_corresp = {}
contigs_in_sample = []
# Retrieve mag information as mag_name, mag_id, contig_name, contig_id
sql_command = 'SELECT * from lookup_table'
cursor.execute(sql_command)
contig_correspondence = cursor.fetchall()
for contig_mag in contig_correspondence:
contig_mag_corresp[contig_mag[2]] = [contig_mag[0], contig_mag[1], contig_mag[3]]
# Read mapping files and fill sample tables
if map_format == "blast":
with open(mapping_file) as input_reads:
record_counter = 0
records = []
for line in input_reads:
# Commit changes after 500000 records
if record_counter == 500000:
cursor.execute("begin")
cursor.executemany('INSERT INTO ' + sample_name + ' VALUES(?, ?, ?, ?, ?)', records)
cursor.execute("commit")
record_counter = 0
records = []
if line.startswith("#"):
pass
else:
segment = line.split("\t")
contig_ref = segment[1]
# Exclude reads not associated with MAGs of interest
if contig_ref not in contig_mag_corresp:
continue
else:
if contig_ref not in contigs_in_sample:
contigs_in_sample.append(contig_ref)
pct_id = float(segment[2])
pos1 = int(segment[8])
pos2 = int(segment[9])
start = min(pos1, pos2)
end = start+(max(pos1, pos2)-min(pos1, pos2))
mag_id = contig_mag_corresp[contig_ref][1]
contig_id = contig_mag_corresp[contig_ref][2]
records.append((mag_id, contig_id, pct_id, start, end))
record_counter += 1
# Commit remaining records
if record_counter > 0:
cursor.execute("begin")
cursor.executemany('INSERT INTO ' + sample_name + ' VALUES(?, ?, ?, ?, ?)', records)
cursor.execute("commit")
# Create index for faster access
cursor.execute('CREATE INDEX ' + sample_name + '_index on ' + sample_name + ' (mag_id)')
if map_format == "sam":
record_counter = 0
records = []
with open(mapping_file) as input_reads:
for line in input_reads:
if record_counter == 500000:
cursor.execute("begin")
cursor.executemany('INSERT INTO ' + sample_name + ' VALUES(?, ?, ?, ?, ?)', records)
cursor.execute("commit")
record_counter = 0
records = []
if "MD:Z:" not in line:
continue
else :
segment = line.split()
contig_ref = segment[2]
# Exclude reads not associated with MAGs of interest
if contig_ref not in contig_mag_corresp:
continue
else:
if contig_ref not in contigs_in_sample:
contigs_in_sample.append(contig_ref)
# Often the MD:Z: field will be the last one in a magicblast output, but not always.
# Therefore, start from the end and work in.
iter = len(segment)-1
mdz_seg = segment[iter]
# If it's not the correct field, proceed until it is.
while not mdz_seg.startswith("MD:Z:"):
iter -= 1
mdz_seg = segment[iter]
#Remove the MD:Z: flag from the start
mdz_seg = mdz_seg[5:]
match_count = re.findall('[0-9]+', mdz_seg)