SeqAn3 3.2.0-rc.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/bit>
16#include <iterator>
17#include <seqan3/std/ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
49class format_bam : private detail::format_sam_base
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67 {
68 { "bam" }
69 };
70
71protected:
72 template <typename stream_type, // constraints checked by file
73 typename seq_legal_alph_type,
74 typename ref_seqs_type,
75 typename ref_ids_type,
76 typename stream_pos_type,
77 typename seq_type,
78 typename id_type,
79 typename offset_type,
80 typename ref_seq_type,
81 typename ref_id_type,
82 typename ref_offset_type,
83 typename align_type,
84 typename cigar_type,
85 typename flag_type,
86 typename mapq_type,
87 typename qual_type,
88 typename mate_type,
89 typename tag_dict_type,
90 typename e_value_type,
91 typename bit_score_type>
92 void read_alignment_record(stream_type & stream,
93 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
94 ref_seqs_type & ref_seqs,
96 stream_pos_type & position_buffer,
97 seq_type & seq,
98 qual_type & qual,
99 id_type & id,
100 offset_type & offset,
101 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
102 ref_id_type & ref_id,
103 ref_offset_type & ref_offset,
104 align_type & align,
105 cigar_type & cigar_vector,
106 flag_type & flag,
107 mapq_type & mapq,
108 mate_type & mate,
109 tag_dict_type & tag_dict,
110 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
111 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
112
113 template <typename stream_type,
114 typename header_type,
115 typename seq_type,
116 typename id_type,
117 typename ref_seq_type,
118 typename ref_id_type,
119 typename align_type,
120 typename cigar_type,
121 typename qual_type,
122 typename mate_type,
123 typename tag_dict_type>
124 void write_alignment_record([[maybe_unused]] stream_type & stream,
125 [[maybe_unused]] sam_file_output_options const & options,
126 [[maybe_unused]] header_type && header,
127 [[maybe_unused]] seq_type && seq,
128 [[maybe_unused]] qual_type && qual,
129 [[maybe_unused]] id_type && id,
130 [[maybe_unused]] int32_t const offset,
131 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
132 [[maybe_unused]] ref_id_type && ref_id,
133 [[maybe_unused]] std::optional<int32_t> ref_offset,
134 [[maybe_unused]] align_type && align,
135 [[maybe_unused]] cigar_type && cigar_vector,
136 [[maybe_unused]] sam_flag const flag,
137 [[maybe_unused]] uint8_t const mapq,
138 [[maybe_unused]] mate_type && mate,
139 [[maybe_unused]] tag_dict_type && tag_dict,
140 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
141 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
142
143private:
145 bool header_was_read{false};
146
148 std::string string_buffer{};
149
151 struct alignment_record_core
152 { // naming corresponds to official SAM/BAM specifications
153 int32_t block_size;
154 int32_t refID;
155 int32_t pos;
156 uint32_t l_read_name:8;
157 uint32_t mapq:8;
158 uint32_t bin:16;
159 uint32_t n_cigar_op:16;
160 sam_flag flag;
161 int32_t l_seq;
162 int32_t next_refID;
163 int32_t next_pos;
164 int32_t tlen;
165 };
166
168 static constexpr std::array<uint8_t, 256> char_to_sam_rank
169 {
170 [] () constexpr
171 {
173
174 using index_t = std::make_unsigned_t<char>;
175
176 // ret['M'] = 0; set anyway by initialization
177 ret[static_cast<index_t>('I')] = 1;
178 ret[static_cast<index_t>('D')] = 2;
179 ret[static_cast<index_t>('N')] = 3;
180 ret[static_cast<index_t>('S')] = 4;
181 ret[static_cast<index_t>('H')] = 5;
182 ret[static_cast<index_t>('P')] = 6;
183 ret[static_cast<index_t>('=')] = 7;
184 ret[static_cast<index_t>('X')] = 8;
185
186 return ret;
187 }()
188 };
189
191 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
192 {
193 --end;
194 if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
195 if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
196 if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
197 if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
198 if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
199 return 0;
200 }
201
208 template <typename stream_view_type, std::integral number_type>
209 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
210 {
211 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
212 }
213
219 template <typename stream_view_type>
220 void read_float_byte_field(stream_view_type && stream_view, float & target)
221 {
222 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
223 }
224
225 template <typename stream_view_type, typename value_type>
226 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
227 stream_view_type && stream_view,
228 value_type const & SEQAN3_DOXYGEN_ONLY(value));
229
230 template <typename stream_view_type>
231 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
232
233 template <typename cigar_input_type>
234 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
235
236 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
237};
238
240template <typename stream_type, // constraints checked by file
241 typename seq_legal_alph_type,
242 typename ref_seqs_type,
243 typename ref_ids_type,
244 typename stream_pos_type,
245 typename seq_type,
246 typename id_type,
247 typename offset_type,
248 typename ref_seq_type,
249 typename ref_id_type,
250 typename ref_offset_type,
251 typename align_type,
252 typename cigar_type,
253 typename flag_type,
254 typename mapq_type,
255 typename qual_type,
256 typename mate_type,
257 typename tag_dict_type,
258 typename e_value_type,
259 typename bit_score_type>
260inline void format_bam::read_alignment_record(stream_type & stream,
261 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
262 ref_seqs_type & ref_seqs,
264 stream_pos_type & position_buffer,
265 seq_type & seq,
266 qual_type & qual,
267 id_type & id,
268 offset_type & offset,
269 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
270 ref_id_type & ref_id,
271 ref_offset_type & ref_offset,
272 align_type & align,
273 cigar_type & cigar_vector,
274 flag_type & flag,
275 mapq_type & mapq,
276 mate_type & mate,
277 tag_dict_type & tag_dict,
278 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
279 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
280{
281 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
282 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
283 "The ref_offset must be a specialisation of std::optional.");
284
285 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
286 "The type of field::mapq must be uint8_t.");
287
288 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
289 "The type of field::flag must be seqan3::sam_flag.");
290
291 auto stream_view = seqan3::detail::istreambuf(stream);
292
293 // these variables need to be stored to compute the ALIGNMENT
294 [[maybe_unused]] int32_t offset_tmp{};
295 [[maybe_unused]] int32_t soft_clipping_end{};
296 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
297 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
298
299 // Header
300 // -------------------------------------------------------------------------------------------------------------
301 if (!header_was_read)
302 {
303 // magic BAM string
304 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
305 throw format_error{"File is not in BAM format."};
306
307 int32_t l_text{}; // length of header text including \0 character
308 int32_t n_ref{}; // number of reference sequences
309 int32_t l_name{}; // 1 + length of reference name including \0 character
310 int32_t l_ref{}; // length of reference sequence
311
312 read_integral_byte_field(stream_view, l_text);
313
314 if (l_text > 0) // header text is present
315 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
316
317 read_integral_byte_field(stream_view, n_ref);
318
319 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
320 {
321 read_integral_byte_field(stream_view, l_name);
322
323 string_buffer.resize(l_name - 1);
324 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
325 ++std::ranges::begin(stream_view); // skip \0 character
326
327 read_integral_byte_field(stream_view, l_ref);
328
329 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
330 {
331 // If there was no header text, we parse reference sequences block as header information
332 if (l_text == 0)
333 {
334 auto & reference_ids = header.ref_ids();
335 // put the length of the reference sequence into ref_id_info
336 header.ref_id_info.emplace_back(l_ref, "");
337 // put the reference name into reference_ids
338 reference_ids.push_back(string_buffer);
339 // assign the reference name an ascending reference id (starts at index 0).
340 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
341 continue;
342 }
343 }
344
345 auto id_it = header.ref_dict.find(string_buffer);
346
347 // sanity checks of reference information to existing header object:
348 if (id_it == header.ref_dict.end()) // [unlikely]
349 {
350 throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
351 "' found in BAM file header (header.ref_ids():",
352 header.ref_ids(), ").")};
353 }
354 else if (id_it->second != ref_idx) // [unlikely]
355 {
356 throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
357 " does not correspond to the position ", id_it->second,
358 " in the header (header.ref_ids():", header.ref_ids(), ").")};
359 }
360 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
361 {
362 throw format_error{"Provided reference has unequal length as specified in the header."};
363 }
364 }
365
366 header_was_read = true;
367
368 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
369 return;
370 }
371
372 // read alignment record into buffer
373 // -------------------------------------------------------------------------------------------------------------
374 position_buffer = stream.tellg();
375 alignment_record_core core;
376 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
377
378 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
379 {
380 throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
381 "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
382 }
383 else if (core.refID > -1) // not unmapped
384 {
385 ref_id = core.refID; // field::ref_id
386 }
387
388 flag = core.flag; // field::flag
389 mapq = core.mapq; // field::mapq
390
391 if (core.pos > -1) // [[likely]]
392 ref_offset = core.pos; // field::ref_offset
393
394 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
395 {
396 if (core.next_refID > -1)
397 get<0>(mate) = core.next_refID;
398
399 if (core.next_pos > -1) // [[likely]]
400 get<1>(mate) = core.next_pos;
401
402 get<2>(mate) = core.tlen;
403 }
404
405 // read id
406 // -------------------------------------------------------------------------------------------------------------
407 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
408 if constexpr (!detail::decays_to_ignore_v<id_type>)
409 read_forward_range_field(id_view, id); // field::id
410 else
411 detail::consume(id_view);
412 ++std::ranges::begin(stream_view); // skip '\0'
413
414 // read cigar string
415 // -------------------------------------------------------------------------------------------------------------
416 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
417 {
418 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
419 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
420 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
421 }
422 else
423 {
424 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
425 }
426
427 offset = offset_tmp;
428
429 // read sequence
430 // -------------------------------------------------------------------------------------------------------------
431 if (core.l_seq > 0) // sequence information is given
432 {
433 auto seq_stream = stream_view
434 | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
436 {
437 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
438 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
439 });
440
441 if constexpr (detail::decays_to_ignore_v<seq_type>)
442 {
443 auto skip_sequence_bytes = [&] ()
444 {
445 detail::consume(seq_stream);
446 if (core.l_seq & 1)
447 ++std::ranges::begin(stream_view);
448 };
449
450 if constexpr (!detail::decays_to_ignore_v<align_type>)
451 {
453 "If you want to read ALIGNMENT but not SEQ, the alignment"
454 " object must store a sequence container at the second (query) position.");
455
456 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
457 {
458 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
459 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
460 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
461
462 get<1>(align).reserve(seq_length);
463
464 auto tmp_iter = std::ranges::begin(seq_stream);
465 std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
466
467 if (offset_tmp & 1)
468 {
469 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
470 ++tmp_iter;
471 --seq_length;
472 }
473
474 for (size_t i = (seq_length / 2); i > 0; --i)
475 {
476 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
477 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
478 ++tmp_iter;
479 }
480
481 if (seq_length & 1)
482 {
483 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
484 ++tmp_iter;
485 --soft_clipping_end;
486 }
487
488 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
489 }
490 else
491 {
492 skip_sequence_bytes();
493 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
494 }
495 }
496 else
497 {
498 skip_sequence_bytes();
499 }
500 }
501 else
502 {
503 using alph_t = std::ranges::range_value_t<decltype(seq)>;
504 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
505
506 for (auto [d1, d2] : seq_stream)
507 {
508 seq.push_back(from_dna16[to_rank(d1)]);
509 seq.push_back(from_dna16[to_rank(d2)]);
510 }
511
512 if (core.l_seq & 1)
513 {
514 dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
515 seq.push_back(from_dna16[to_rank(d)]);
516 ++std::ranges::begin(stream_view);
517 }
518
519 if constexpr (!detail::decays_to_ignore_v<align_type>)
520 {
521 assign_unaligned(get<1>(align),
522 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
523 std::ranges::distance(seq) - soft_clipping_end));
524 }
525 }
526 }
527
528 // read qual string
529 // -------------------------------------------------------------------------------------------------------------
530 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
531 | std::views::transform([] (char chr)
532 {
533 return static_cast<char>(chr + 33);
534 });
535 if constexpr (!detail::decays_to_ignore_v<qual_type>)
536 read_forward_range_field(qual_view, qual);
537 else
538 detail::consume(qual_view);
539
540 // All remaining optional fields if any: SAM tags dictionary
541 // -------------------------------------------------------------------------------------------------------------
542 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
543 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
544 assert(remaining_bytes >= 0);
545 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
546
547 while (tags_view.size() > 0)
548 {
549 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
550 read_sam_dict_field(tags_view, tag_dict);
551 else
552 detail::consume(tags_view);
553 }
554
555 // DONE READING - wrap up
556 // -------------------------------------------------------------------------------------------------------------
557 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
558 {
559 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
560 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
561 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
562 if (core.l_seq != 0 && offset_tmp == core.l_seq)
563 {
564 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
565 { // maybe only throw in debug mode and otherwise return an empty alignment?
566 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
567 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
568 "stored in the optional field CG. You need to read in the field::tags and "
569 "field::seq in order to access this information.")};
570 }
571 else
572 {
573 auto it = tag_dict.find("CG"_tag);
574
575 if (it == tag_dict.end())
576 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
577 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
578 "stored in the optional field CG but this tag is not present in the given ",
579 "record.")};
580
581 auto cigar_view = std::views::all(std::get<std::string>(it->second));
582 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
583 offset_tmp = soft_clipping_end = 0;
584 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
585 tag_dict.erase(it); // remove redundant information
586
587 if constexpr (!detail::decays_to_ignore_v<align_type>)
588 {
589 assign_unaligned(get<1>(align),
590 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
591 std::ranges::distance(seq) - soft_clipping_end));
592 }
593 }
594 }
595 }
596
597 // Alignment object construction
598 if constexpr (!detail::decays_to_ignore_v<align_type>)
599 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
600
601 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
602 std::swap(cigar_vector, tmp_cigar_vector);
603}
604
606template <typename stream_type,
607 typename header_type,
608 typename seq_type,
609 typename id_type,
610 typename ref_seq_type,
611 typename ref_id_type,
612 typename align_type,
613 typename cigar_type,
614 typename qual_type,
615 typename mate_type,
616 typename tag_dict_type>
617inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
618 [[maybe_unused]] sam_file_output_options const & options,
619 [[maybe_unused]] header_type && header,
620 [[maybe_unused]] seq_type && seq,
621 [[maybe_unused]] qual_type && qual,
622 [[maybe_unused]] id_type && id,
623 [[maybe_unused]] int32_t const offset,
624 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
625 [[maybe_unused]] ref_id_type && ref_id,
626 [[maybe_unused]] std::optional<int32_t> ref_offset,
627 [[maybe_unused]] align_type && align,
628 [[maybe_unused]] cigar_type && cigar_vector,
629 [[maybe_unused]] sam_flag const flag,
630 [[maybe_unused]] uint8_t const mapq,
631 [[maybe_unused]] mate_type && mate,
632 [[maybe_unused]] tag_dict_type && tag_dict,
633 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
634 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
635{
636 // ---------------------------------------------------------------------
637 // Type Requirements (as static asserts for user friendliness)
638 // ---------------------------------------------------------------------
639 static_assert((std::ranges::forward_range<seq_type> &&
641 "The seq object must be a std::ranges::forward_range over "
642 "letters that model seqan3::alphabet.");
643
644 static_assert((std::ranges::forward_range<id_type> &&
646 "The id object must be a std::ranges::forward_range over "
647 "letters that model seqan3::alphabet.");
648
649 static_assert((std::ranges::forward_range<ref_seq_type> &&
651 "The ref_seq object must be a std::ranges::forward_range "
652 "over letters that model seqan3::alphabet.");
653
654 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
655 {
656 static_assert((std::ranges::forward_range<ref_id_type> ||
657 std::integral<std::remove_reference_t<ref_id_type>> ||
658 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
659 "The ref_id object must be a std::ranges::forward_range "
660 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
661 }
662
664 "The align object must be a std::pair of two ranges whose "
665 "value_type is comparable to seqan3::gap");
666
667 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
668 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
669 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
670 "The align object must be a std::pair of two ranges whose "
671 "value_type is comparable to seqan3::gap");
672
673 static_assert((std::ranges::forward_range<qual_type> &&
675 "The qual object must be a std::ranges::forward_range "
676 "over letters that model seqan3::alphabet.");
677
679 "The mate object must be a std::tuple of size 3 with "
680 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
681 "2) a std::integral or std::optional<std::integral>, and "
682 "3) a std::integral.");
683
684 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
685 std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
686 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
687 (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
688 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
689 std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
690 "The mate object must be a std::tuple of size 3 with "
691 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
692 "2) a std::integral or std::optional<std::integral>, and "
693 "3) a std::integral.");
694
695 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
696 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
697
698 if constexpr (detail::decays_to_ignore_v<header_type>)
699 {
700 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
701 "You can either construct the output file with ref_ids and ref_seqs information and "
702 "the header will be created for you, or you can access the `header` member directly."};
703 }
704 else
705 {
706 // ---------------------------------------------------------------------
707 // logical Requirements
708 // ---------------------------------------------------------------------
709
710 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
711 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
712
713 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
714
715 // ---------------------------------------------------------------------
716 // Writing the BAM Header on first call
717 // ---------------------------------------------------------------------
718 if (!header_was_written)
719 {
720 stream << "BAM\1";
722 write_header(os, options, header); // write SAM header to temporary stream to query the size.
723 int32_t l_text{static_cast<int32_t>(os.str().size())};
724 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
725
726 stream << os.str();
727
728 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
729 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
730
731 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
732 {
733 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
734 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
735 // write reference name:
736 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
737 stream_it = '\0';
738 // write reference sequence length:
739 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
740 }
741
742 header_was_written = true;
743 }
744
745 // ---------------------------------------------------------------------
746 // Writing the Record
747 // ---------------------------------------------------------------------
748 int32_t ref_length{};
749
750 // if alignment is non-empty, replace cigar_vector.
751 // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
752 if (!std::ranges::empty(cigar_vector))
753 {
754 int32_t dummy_seq_length{};
755 for (auto & [count, operation] : cigar_vector)
756 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
757 }
758 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
759 {
760 ref_length = std::ranges::distance(get<1>(align));
761
762 // compute possible distance from alignment end to sequence end
763 // which indicates soft clipping at the end.
764 // This should be replaced by a free count_gaps function for
765 // aligned sequences which is more efficient if possible.
766 int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
767
768 for (auto chr : get<1>(align))
769 if (chr == gap{})
770 ++off_end;
771
772 off_end -= ref_length;
773 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
774 }
775
776 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
777 {
778 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
779 cigar_vector.resize(2);
780 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
781 cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
782 }
783
784 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
785
786 // Compute the value for the l_read_name field for the bam record.
787 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
788 // the data type to store the value is uint8_t and 255 is the maximal size.
789 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
790 // 2 bytes.
791 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
792 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
793
794 alignment_record_core core
795 {
796 /* block_size */ 0, // will be initialised right after
797 /* refID */ -1, // will be initialised right after
798 /* pos */ ref_offset.value_or(-1),
799 /* l_read_name */ read_name_size,
800 /* mapq */ mapq,
801 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
802 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
803 /* flag */ flag,
804 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
805 /* next_refId */ -1, // will be initialised right after
806 /* next_pos */ get<1>(mate).value_or(-1),
807 /* tlen */ get<2>(mate)
808 };
809
810 auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
811 [[maybe_unused]] auto & id_target)
812 {
813 using id_t = std::remove_reference_t<decltype(id_source)>;
814
815 if constexpr (!detail::decays_to_ignore_v<id_t>)
816 {
817 if constexpr (std::integral<id_t>)
818 {
819 id_target = id_source;
820 }
821 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
822 {
823 id_target = id_source.value_or(-1);
824 }
825 else
826 {
827 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
828 {
829 auto id_it = header.ref_dict.end();
830
831 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
832 std::ranges::sized_range<decltype(id_source)> &&
833 std::ranges::borrowed_range<decltype(id_source)>)
834 {
835 id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
836 std::ranges::size(id_source)});
837 }
838 else
839 {
840 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
841
842 static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
843 "The ref_id type is not convertible to the reference id information stored in the "
844 "reference dictionary of the header object.");
845
846 id_it = header.ref_dict.find(id_source);
847 }
848
849 if (id_it == header.ref_dict.end())
850 {
851 throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
852 "not be found in BAM header ref_dict: ",
853 header.ref_dict, ".")};
854 }
855
856 id_target = id_it->second;
857 }
858 }
859 }
860 };
861
862 // initialise core.refID
863 check_and_assign_id_to(ref_id, core.refID);
864
865 // initialise core.next_refID
866 check_and_assign_id_to(get<0>(mate), core.next_refID);
867
868 // initialise core.block_size
869 core.block_size = sizeof(core) - 4/*block_size excluded*/ +
870 core.l_read_name +
871 core.n_cigar_op * 4 + // each int32_t has 4 bytes
872 (core.l_seq + 1) / 2 + // bitcompressed seq
873 core.l_seq + // quality string
874 tag_dict_binary_str.size();
875
876 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
877
878 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
879 stream_it = '*';
880 else
881 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
882 stream_it = '\0';
883
884 // write cigar
885 for (auto [cigar_count, op] : cigar_vector)
886 {
887 cigar_count = cigar_count << 4;
888 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
889 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
890 }
891
892 // write seq (bit-compressed: dna16sam characters go into one byte)
893 using alph_t = std::ranges::range_value_t<seq_type>;
894 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
895
896 auto sit = std::ranges::begin(seq);
897 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
898 {
899 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
900 ++sidx, ++sit;
901 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
902 stream_it = static_cast<char>(compressed_chr);
903 }
904
905 if (core.l_seq & 1) // write one more
906 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
907
908 // write qual
909 if (std::ranges::empty(qual))
910 {
911 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
912 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
913 }
914 else
915 {
916 if (std::ranges::distance(qual) != core.l_seq)
917 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
918 core.l_seq, ". Got quality with size ",
919 std::ranges::distance(qual), " instead.")};
920
921 auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
922 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
923 }
924
925 // write optional fields
926 stream << tag_dict_binary_str;
927 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
928}
929
931template <typename stream_view_type, typename value_type>
932inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
933 stream_view_type && stream_view,
934 value_type const & SEQAN3_DOXYGEN_ONLY(value))
935{
936 int32_t count;
937 read_integral_byte_field(stream_view, count); // read length of vector
938 std::vector<value_type> tmp_vector;
939 tmp_vector.reserve(count);
940
941 while (count > 0)
942 {
943 value_type tmp{};
944 if constexpr(std::integral<value_type>)
945 {
946 read_integral_byte_field(stream_view, tmp);
947 }
948 else if constexpr(std::same_as<value_type, float>)
949 {
950 read_float_byte_field(stream_view, tmp);
951 }
952 else
953 {
954 constexpr bool always_false = std::is_same_v<value_type, void>;
955 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
956 }
957 tmp_vector.push_back(std::move(tmp));
958 --count;
959 }
960 variant = std::move(tmp_vector);
961}
962
980template <typename stream_view_type>
981inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
982{
983 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
984 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
985 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
986 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
987 by the length (int32_t) of the array, followed by the values.
988 */
989 auto it = std::ranges::begin(stream_view);
990 uint16_t tag = static_cast<uint16_t>(*it) << 8;
991 ++it; // skip char read before
992
993 tag += static_cast<uint16_t>(*it);
994 ++it; // skip char read before
995
996 char type_id = *it;
997 ++it; // skip char read before
998
999 switch (type_id)
1000 {
1001 case 'A' : // char
1002 {
1003 target[tag] = *it;
1004 ++it; // skip char that has been read
1005 break;
1006 }
1007 // all integer sizes are possible
1008 case 'c' : // int8_t
1009 {
1010 int8_t tmp;
1011 read_integral_byte_field(stream_view, tmp);
1012 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1013 break;
1014 }
1015 case 'C' : // uint8_t
1016 {
1017 uint8_t tmp;
1018 read_integral_byte_field(stream_view, tmp);
1019 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1020 break;
1021 }
1022 case 's' : // int16_t
1023 {
1024 int16_t tmp;
1025 read_integral_byte_field(stream_view, tmp);
1026 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1027 break;
1028 }
1029 case 'S' : // uint16_t
1030 {
1031 uint16_t tmp;
1032 read_integral_byte_field(stream_view, tmp);
1033 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1034 break;
1035 }
1036 case 'i' : // int32_t
1037 {
1038 int32_t tmp;
1039 read_integral_byte_field(stream_view, tmp);
1040 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1041 break;
1042 }
1043 case 'I' : // uint32_t
1044 {
1045 uint32_t tmp;
1046 read_integral_byte_field(stream_view, tmp);
1047 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1048 break;
1049 }
1050 case 'f' : // float
1051 {
1052 float tmp;
1053 read_float_byte_field(stream_view, tmp);
1054 target[tag] = tmp;
1055 break;
1056 }
1057 case 'Z' : // string
1058 {
1059 string_buffer.clear();
1060 while (!is_char<'\0'>(*it))
1061 {
1062 string_buffer.push_back(*it);
1063 ++it;
1064 }
1065 ++it; // skip \0
1066 target[tag] = string_buffer;
1067 break;
1068 }
1069 case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1070 {
1071 std::vector<std::byte> byte_array;
1072 std::byte value;
1073 while (!is_char<'\0'>(*it))
1074 {
1075 string_buffer.clear();
1076 string_buffer.push_back(*it);
1077 ++it;
1078
1079 if (*it == '\0')
1080 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1081
1082 string_buffer.push_back(*it);
1083 ++it;
1084 read_byte_field(string_buffer, value);
1085 byte_array.push_back(value);
1086 }
1087 ++it; // skip \0
1088 target[tag] = byte_array;
1089 break;
1090 }
1091 case 'B' : // Array. Value type depends on second char [cCsSiIf]
1092 {
1093 char array_value_type_id = *it;
1094 ++it; // skip char read before
1095
1096 switch (array_value_type_id)
1097 {
1098 case 'c' : // int8_t
1099 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1100 break;
1101 case 'C' : // uint8_t
1102 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1103 break;
1104 case 's' : // int16_t
1105 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1106 break;
1107 case 'S' : // uint16_t
1108 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1109 break;
1110 case 'i' : // int32_t
1111 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1112 break;
1113 case 'I' : // uint32_t
1114 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1115 break;
1116 case 'f' : // float
1117 read_sam_dict_vector(target[tag], stream_view, float{});
1118 break;
1119 default:
1120 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1121 "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1122 }
1123 break;
1124 }
1125 default:
1126 throw format_error{detail::to_string("The second character in the numerical id of a "
1127 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1128 }
1129}
1130
1145template <typename cigar_input_type>
1146inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1147{
1148 std::vector<cigar> operations{};
1149 char operation{'\0'};
1150 uint32_t count{};
1151 int32_t ref_length{}, seq_length{};
1152 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1153 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1154 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1155
1156 if (n_cigar_op == 0) // [[unlikely]]
1157 return std::tuple{operations, ref_length, seq_length};
1158
1159 // parse the rest of the cigar
1160 // -------------------------------------------------------------------------------------------------------------
1161 while (n_cigar_op > 0) // until stream is not empty
1162 {
1163 std::ranges::copy_n(std::ranges::begin(cigar_input),
1164 sizeof(operation_and_count),
1165 reinterpret_cast<char*>(&operation_and_count));
1166 operation = cigar_mapping[operation_and_count & cigar_mask];
1167 count = operation_and_count >> 4;
1168
1169 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1170 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1171 --n_cigar_op;
1172 }
1173
1174 return std::tuple{operations, ref_length, seq_length};
1175}
1176
1180inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1181{
1182 std::string result{};
1183
1184 auto stream_variant_fn = [&result] (auto && arg) // helper to print a std::variant
1185 {
1186 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1187 using T = std::remove_cvref_t<decltype(arg)>;
1188
1189 if constexpr (std::same_as<T, int32_t>)
1190 {
1191 // always choose the smallest possible representation [cCsSiI]
1192 size_t const absolute_arg = std::abs(arg);
1193 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1194 bool const negative = arg < 0;
1195 n = n * n + 2 * negative; // for switch case order
1196
1197 switch (n)
1198 {
1199 case 0:
1200 {
1201 result[result.size() - 1] = 'C';
1202 result.append(reinterpret_cast<char const *>(&arg), 1);
1203 break;
1204 }
1205 case 1:
1206 {
1207 result[result.size() - 1] = 'S';
1208 result.append(reinterpret_cast<char const *>(&arg), 2);
1209 break;
1210 }
1211 case 2:
1212 {
1213 result[result.size() - 1] = 'c';
1214 int8_t tmp = static_cast<int8_t>(arg);
1215 result.append(reinterpret_cast<char const *>(&tmp), 1);
1216 break;
1217 }
1218 case 3:
1219 {
1220 result[result.size() - 1] = 's';
1221 int16_t tmp = static_cast<int16_t>(arg);
1222 result.append(reinterpret_cast<char const *>(&tmp), 2);
1223 break;
1224 }
1225 default:
1226 {
1227 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1228 break;
1229 }
1230 }
1231 }
1232 else if constexpr (std::same_as<T, std::string>)
1233 {
1234 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1235 }
1236 else if constexpr (!std::ranges::range<T>) // char, float
1237 {
1238 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1239 }
1240 else // std::vector of some arithmetic_type type
1241 {
1242 int32_t sz{static_cast<int32_t>(arg.size())};
1243 result.append(reinterpret_cast<char *>(&sz), 4);
1244 result.append(reinterpret_cast<char const *>(arg.data()),
1245 arg.size() * sizeof(std::ranges::range_value_t<T>));
1246 }
1247 };
1248
1249 for (auto & [tag, variant] : tag_dict)
1250 {
1251 result.push_back(static_cast<char>(tag / 256));
1252 result.push_back(static_cast<char>(tag % 256));
1253
1254 result.push_back(detail::sam_tag_type_char[variant.index()]);
1255
1256 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1257 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1258
1259 std::visit(stream_variant_fn, variant);
1260 }
1261
1262 return result;
1263}
1264
1265} // namespace seqan3
T begin(T... args)
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:617
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:260
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:67
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:139
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:178
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:175
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:162
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:495
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: cleanup.hpp:4
Provides seqan3::debug_stream and related types.
T push_back(T... args)
The <ranges> header from C++20's standard library.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)