72 template <
typename stream_type,
73 typename seq_legal_alph_type,
74 typename ref_seqs_type,
75 typename ref_ids_type,
76 typename stream_pos_type,
80 typename ref_seq_type,
82 typename ref_offset_type,
89 typename tag_dict_type,
90 typename e_value_type,
91 typename bit_score_type>
94 ref_seqs_type & ref_seqs,
96 stream_pos_type & position_buffer,
100 offset_type & offset,
101 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
102 ref_id_type & ref_id,
103 ref_offset_type & ref_offset,
105 cigar_type & cigar_vector,
109 tag_dict_type & tag_dict,
110 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
111 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
113 template <
typename stream_type,
114 typename header_type,
117 typename ref_seq_type,
118 typename ref_id_type,
123 typename tag_dict_type>
126 [[maybe_unused]] header_type && header,
127 [[maybe_unused]] seq_type && seq,
128 [[maybe_unused]] qual_type && qual,
129 [[maybe_unused]] id_type &&
id,
130 [[maybe_unused]] int32_t
const offset,
131 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
132 [[maybe_unused]] ref_id_type && ref_id,
134 [[maybe_unused]] align_type && align,
135 [[maybe_unused]] cigar_type && cigar_vector,
136 [[maybe_unused]]
sam_flag const flag,
137 [[maybe_unused]] uint8_t
const mapq,
138 [[maybe_unused]] mate_type && mate,
139 [[maybe_unused]] tag_dict_type && tag_dict,
140 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
141 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
145 bool header_was_read{
false};
151 struct alignment_record_core
156 uint32_t l_read_name:8;
159 uint32_t n_cigar_op:16;
177 ret[
static_cast<index_t
>(
'I')] = 1;
178 ret[
static_cast<index_t
>(
'D')] = 2;
179 ret[
static_cast<index_t
>(
'N')] = 3;
180 ret[
static_cast<index_t
>(
'S')] = 4;
181 ret[
static_cast<index_t
>(
'H')] = 5;
182 ret[
static_cast<index_t
>(
'P')] = 6;
183 ret[
static_cast<index_t
>(
'=')] = 7;
184 ret[
static_cast<index_t
>(
'X')] = 8;
191 static uint16_t reg2bin(int32_t beg, int32_t end)
noexcept
194 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
195 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
196 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
197 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
198 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
208 template <
typename stream_view_type, std::
integral number_type>
209 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
211 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
219 template <
typename stream_view_type>
220 void read_float_byte_field(stream_view_type && stream_view,
float & target)
222 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
225 template <
typename stream_view_type,
typename value_type>
227 stream_view_type && stream_view,
228 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
230 template <
typename stream_view_type>
231 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
233 template <
typename cigar_input_type>
234 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
236 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
240template <
typename stream_type,
241 typename seq_legal_alph_type,
242 typename ref_seqs_type,
243 typename ref_ids_type,
244 typename stream_pos_type,
247 typename offset_type,
248 typename ref_seq_type,
249 typename ref_id_type,
250 typename ref_offset_type,
257 typename tag_dict_type,
258 typename e_value_type,
259 typename bit_score_type>
262 ref_seqs_type & ref_seqs,
264 stream_pos_type & position_buffer,
268 offset_type & offset,
269 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
270 ref_id_type & ref_id,
271 ref_offset_type & ref_offset,
273 cigar_type & cigar_vector,
277 tag_dict_type & tag_dict,
278 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
279 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
281 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
282 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
283 "The ref_offset must be a specialisation of std::optional.");
285 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
286 "The type of field::mapq must be uint8_t.");
288 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
289 "The type of field::flag must be seqan3::sam_flag.");
291 auto stream_view = seqan3::detail::istreambuf(stream);
294 [[maybe_unused]] int32_t offset_tmp{};
295 [[maybe_unused]] int32_t soft_clipping_end{};
297 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
301 if (!header_was_read)
304 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4),
std::string_view{
"BAM\1"}))
312 read_integral_byte_field(stream_view, l_text);
315 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
317 read_integral_byte_field(stream_view, n_ref);
319 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
321 read_integral_byte_field(stream_view, l_name);
323 string_buffer.
resize(l_name - 1);
324 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.
data());
325 ++std::ranges::begin(stream_view);
327 read_integral_byte_field(stream_view, l_ref);
329 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
334 auto & reference_ids = header.
ref_ids();
338 reference_ids.push_back(string_buffer);
340 header.
ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
345 auto id_it = header.
ref_dict.find(string_buffer);
350 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
351 "' found in BAM file header (header.ref_ids():",
354 else if (id_it->second != ref_idx)
356 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
357 " does not correspond to the position ", id_it->second,
358 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
360 else if (std::get<0>(header.
ref_id_info[id_it->second]) != l_ref)
362 throw format_error{
"Provided reference has unequal length as specified in the header."};
366 header_was_read =
true;
368 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
374 position_buffer = stream.tellg();
375 alignment_record_core core;
376 std::ranges::copy(stream_view | detail::take_exactly_or_throw(
sizeof(core)),
reinterpret_cast<char *
>(&core));
378 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
380 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
381 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
383 else if (core.refID > -1)
394 if constexpr (!detail::decays_to_ignore_v<mate_type>)
396 if (core.next_refID > -1)
397 get<0>(
mate) = core.next_refID;
399 if (core.next_pos > -1)
400 get<1>(
mate) = core.next_pos;
402 get<2>(
mate) = core.tlen;
407 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
408 if constexpr (!detail::decays_to_ignore_v<id_type>)
409 read_forward_range_field(id_view,
id);
411 detail::consume(id_view);
412 ++std::ranges::begin(stream_view);
416 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
418 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
419 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
424 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
433 auto seq_stream = stream_view
434 | detail::take_exactly_or_throw(core.l_seq / 2)
441 if constexpr (detail::decays_to_ignore_v<seq_type>)
443 auto skip_sequence_bytes = [&] ()
445 detail::consume(seq_stream);
447 ++std::ranges::begin(stream_view);
450 if constexpr (!detail::decays_to_ignore_v<align_type>)
453 "If you want to read ALIGNMENT but not SEQ, the alignment"
454 " object must store a sequence container at the second (query) position.");
456 if (!tmp_cigar_vector.empty())
458 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
459 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
460 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
462 get<1>(align).reserve(seq_length);
464 auto tmp_iter = std::ranges::begin(seq_stream);
465 std::ranges::advance(tmp_iter, offset_tmp / 2);
469 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
474 for (
size_t i = (seq_length / 2); i > 0; --i)
476 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
477 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
483 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
488 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
492 skip_sequence_bytes();
498 skip_sequence_bytes();
503 using alph_t = std::ranges::range_value_t<
decltype(
seq)>;
504 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
506 for (
auto [d1, d2] : seq_stream)
516 ++std::ranges::begin(stream_view);
519 if constexpr (!detail::decays_to_ignore_v<align_type>)
521 assign_unaligned(get<1>(align),
522 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
523 std::ranges::distance(
seq) - soft_clipping_end));
530 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
533 return static_cast<char>(chr + 33);
535 if constexpr (!detail::decays_to_ignore_v<qual_type>)
536 read_forward_range_field(qual_view,
qual);
538 detail::consume(qual_view);
542 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
543 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
544 assert(remaining_bytes >= 0);
545 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
547 while (tags_view.size() > 0)
549 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
550 read_sam_dict_field(tags_view, tag_dict);
552 detail::consume(tags_view);
557 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
562 if (core.l_seq != 0 && offset_tmp == core.l_seq)
564 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
566 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
567 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
568 "stored in the optional field CG. You need to read in the field::tags and "
569 "field::seq in order to access this information.")};
573 auto it = tag_dict.find(
"CG"_tag);
575 if (it == tag_dict.end())
576 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
577 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
578 "stored in the optional field CG but this tag is not present in the given ",
581 auto cigar_view = std::views::all(std::get<std::string>(it->second));
582 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
583 offset_tmp = soft_clipping_end = 0;
584 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
587 if constexpr (!detail::decays_to_ignore_v<align_type>)
589 assign_unaligned(get<1>(align),
590 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
591 std::ranges::distance(
seq) - soft_clipping_end));
598 if constexpr (!detail::decays_to_ignore_v<align_type>)
599 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
601 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
602 std::swap(cigar_vector, tmp_cigar_vector);
606template <
typename stream_type,
607 typename header_type,
610 typename ref_seq_type,
611 typename ref_id_type,
616 typename tag_dict_type>
619 [[maybe_unused]] header_type && header,
620 [[maybe_unused]] seq_type && seq,
621 [[maybe_unused]] qual_type && qual,
622 [[maybe_unused]] id_type &&
id,
623 [[maybe_unused]] int32_t
const offset,
624 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
625 [[maybe_unused]] ref_id_type && ref_id,
627 [[maybe_unused]] align_type && align,
628 [[maybe_unused]] cigar_type && cigar_vector,
629 [[maybe_unused]]
sam_flag const flag,
630 [[maybe_unused]] uint8_t
const mapq,
631 [[maybe_unused]] mate_type && mate,
632 [[maybe_unused]] tag_dict_type && tag_dict,
633 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
634 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
639 static_assert((std::ranges::forward_range<seq_type> &&
641 "The seq object must be a std::ranges::forward_range over "
642 "letters that model seqan3::alphabet.");
644 static_assert((std::ranges::forward_range<id_type> &&
646 "The id object must be a std::ranges::forward_range over "
647 "letters that model seqan3::alphabet.");
649 static_assert((std::ranges::forward_range<ref_seq_type> &&
651 "The ref_seq object must be a std::ranges::forward_range "
652 "over letters that model seqan3::alphabet.");
654 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
656 static_assert((std::ranges::forward_range<ref_id_type> ||
657 std::integral<std::remove_reference_t<ref_id_type>> ||
658 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
659 "The ref_id object must be a std::ranges::forward_range "
660 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
664 "The align object must be a std::pair of two ranges whose "
665 "value_type is comparable to seqan3::gap");
667 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
668 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
669 std::equality_comparable_with<
gap, std::ranges::range_reference_t<
decltype(std::get<1>(align))>>),
670 "The align object must be a std::pair of two ranges whose "
671 "value_type is comparable to seqan3::gap");
673 static_assert((std::ranges::forward_range<qual_type> &&
675 "The qual object must be a std::ranges::forward_range "
676 "over letters that model seqan3::alphabet.");
679 "The mate object must be a std::tuple of size 3 with "
680 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
681 "2) a std::integral or std::optional<std::integral>, and "
682 "3) a std::integral.");
684 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
687 (std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>> ||
689 std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
690 "The mate object must be a std::tuple of size 3 with "
691 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
692 "2) a std::integral or std::optional<std::integral>, and "
693 "3) a std::integral.");
696 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
698 if constexpr (detail::decays_to_ignore_v<header_type>)
700 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
701 "You can either construct the output file with ref_ids and ref_seqs information and "
702 "the header will be created for you, or you can access the `header` member directly."};
713 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
718 if (!header_was_written)
722 write_header(os, options, header);
723 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
724 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
728 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
729 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
731 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
733 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
734 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
736 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
739 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
742 header_was_written =
true;
748 int32_t ref_length{};
752 if (!std::ranges::empty(cigar_vector))
754 int32_t dummy_seq_length{};
755 for (
auto & [
count, operation] : cigar_vector)
756 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
758 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
760 ref_length = std::ranges::distance(get<1>(align));
766 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) -
offset};
768 for (
auto chr : get<1>(align))
772 off_end -= ref_length;
773 cigar_vector = detail::get_cigar_vector(align,
offset, off_end);
776 if (cigar_vector.size() >= (1 << 16))
778 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
779 cigar_vector.resize(2);
780 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
781 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_operation};
784 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
791 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
792 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
794 alignment_record_core core
802 static_cast<uint16_t
>(cigar_vector.size()),
804 static_cast<int32_t
>(std::ranges::distance(
seq)),
806 get<1>(
mate).value_or(-1),
810 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
811 [[maybe_unused]]
auto & id_target)
815 if constexpr (!detail::decays_to_ignore_v<id_t>)
817 if constexpr (std::integral<id_t>)
819 id_target = id_source;
821 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
823 id_target = id_source.value_or(-1);
827 if (!std::ranges::empty(id_source))
831 if constexpr (std::ranges::contiguous_range<
decltype(id_source)> &&
832 std::ranges::sized_range<
decltype(id_source)> &&
833 std::ranges::borrowed_range<
decltype(id_source)>)
843 "The ref_id type is not convertible to the reference id information stored in the "
844 "reference dictionary of the header object.");
846 id_it = header.
ref_dict.find(id_source);
851 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
852 "not be found in BAM header ref_dict: ",
856 id_target = id_it->second;
863 check_and_assign_id_to(
ref_id, core.refID);
866 check_and_assign_id_to(get<0>(
mate), core.next_refID);
869 core.block_size =
sizeof(core) - 4 +
871 core.n_cigar_op * 4 +
872 (core.l_seq + 1) / 2 +
874 tag_dict_binary_str.
size();
876 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
878 if (std::ranges::empty(
id))
881 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
885 for (
auto [cigar_count, op] : cigar_vector)
887 cigar_count = cigar_count << 4;
888 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
889 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
893 using alph_t = std::ranges::range_value_t<seq_type>;
894 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
896 auto sit = std::ranges::begin(
seq);
897 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
902 stream_it =
static_cast<char>(compressed_chr);
906 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
909 if (std::ranges::empty(
qual))
912 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
916 if (std::ranges::distance(
qual) != core.l_seq)
917 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
918 core.l_seq,
". Got quality with size ",
919 std::ranges::distance(
qual),
" instead.")};
922 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
926 stream << tag_dict_binary_str;
931template <
typename stream_view_type,
typename value_type>
933 stream_view_type && stream_view,
934 value_type
const & SEQAN3_DOXYGEN_ONLY(value))
937 read_integral_byte_field(stream_view,
count);
944 if constexpr(std::integral<value_type>)
946 read_integral_byte_field(stream_view, tmp);
948 else if constexpr(std::same_as<value_type, float>)
950 read_float_byte_field(stream_view, tmp);
954 constexpr bool always_false = std::is_same_v<value_type, void>;
955 static_assert(always_false,
"format_bam::read_sam_dict_vector: unsupported value_type");
960 variant = std::move(tmp_vector);
980template <
typename stream_view_type>
981inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
990 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
993 tag +=
static_cast<uint16_t
>(*it);
1011 read_integral_byte_field(stream_view, tmp);
1012 target[tag] =
static_cast<int32_t
>(tmp);
1018 read_integral_byte_field(stream_view, tmp);
1019 target[tag] =
static_cast<int32_t
>(tmp);
1025 read_integral_byte_field(stream_view, tmp);
1026 target[tag] =
static_cast<int32_t
>(tmp);
1032 read_integral_byte_field(stream_view, tmp);
1033 target[tag] =
static_cast<int32_t
>(tmp);
1039 read_integral_byte_field(stream_view, tmp);
1040 target[tag] = std::move(tmp);
1046 read_integral_byte_field(stream_view, tmp);
1047 target[tag] =
static_cast<int32_t
>(tmp);
1053 read_float_byte_field(stream_view, tmp);
1059 string_buffer.
clear();
1060 while (!is_char<'\0'>(*it))
1066 target[tag] = string_buffer;
1073 while (!is_char<'\0'>(*it))
1075 string_buffer.
clear();
1080 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1084 read_byte_field(string_buffer, value);
1088 target[tag] = byte_array;
1093 char array_value_type_id = *it;
1096 switch (array_value_type_id)
1099 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1102 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1105 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1108 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1111 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1114 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1117 read_sam_dict_vector(target[tag], stream_view,
float{});
1120 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1121 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1126 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1127 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1145template <
typename cigar_input_type>
1146inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1149 char operation{
'\0'};
1151 int32_t ref_length{}, seq_length{};
1152 uint32_t operation_and_count{};
1153 constexpr char const * cigar_mapping =
"MIDNSHP=X*******";
1154 constexpr uint32_t cigar_mask = 0x0f;
1156 if (n_cigar_op == 0)
1157 return std::tuple{operations, ref_length, seq_length};
1161 while (n_cigar_op > 0)
1163 std::ranges::copy_n(std::ranges::begin(cigar_input),
1164 sizeof(operation_and_count),
1165 reinterpret_cast<char*
>(&operation_and_count));
1166 operation = cigar_mapping[operation_and_count & cigar_mask];
1167 count = operation_and_count >> 4;
1169 detail::update_alignment_lengths(ref_length, seq_length, operation,
count);
1174 return std::tuple{operations, ref_length, seq_length};
1180inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1184 auto stream_variant_fn = [&result] (
auto && arg)
1189 if constexpr (std::same_as<T, int32_t>)
1192 size_t const absolute_arg = std::abs(arg);
1194 bool const negative = arg < 0;
1195 n = n * n + 2 * negative;
1201 result[result.size() - 1] =
'C';
1202 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1207 result[result.size() - 1] =
'S';
1208 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1213 result[result.size() - 1] =
'c';
1214 int8_t tmp =
static_cast<int8_t
>(arg);
1215 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1220 result[result.size() - 1] =
's';
1221 int16_t tmp =
static_cast<int16_t
>(arg);
1222 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1227 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1232 else if constexpr (std::same_as<T, std::string>)
1234 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1236 else if constexpr (!std::ranges::range<T>)
1238 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1242 int32_t sz{
static_cast<int32_t
>(arg.size())};
1243 result.append(
reinterpret_cast<char *
>(&sz), 4);
1244 result.append(
reinterpret_cast<char const *
>(arg.data()),
1245 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1249 for (
auto & [tag, variant] : tag_dict)
1251 result.push_back(
static_cast<char>(tag / 256));
1252 result.push_back(
static_cast<char>(tag % 256));
1254 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1256 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1257 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
Provides seqan3::dna16sam.
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:162
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:495
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cleanup.hpp:4
Provides seqan3::debug_stream and related types.
The <ranges> header from C++20's standard library.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::views::slice.
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.