51 namespace seqan3::detail
68 format_sam_base() noexcept = default;
69 format_sam_base(format_sam_base const &) noexcept = default;
70 format_sam_base & operator=(format_sam_base const &) noexcept = default;
71 format_sam_base(format_sam_base &&) noexcept = default;
72 format_sam_base & operator=(format_sam_base &&) noexcept = default;
73 ~format_sam_base() noexcept = default;
77 static constexpr
std::array format_version{
'1',
'.',
'6'};
83 bool header_was_written{
false};
86 bool ref_info_present_in_header{
false};
88 template <
typename ref_id_type,
89 typename ref_id_tmp_type,
91 typename ref_seqs_type>
92 void check_and_assign_ref_id(ref_id_type &
ref_id,
93 ref_id_tmp_type & ref_id_tmp,
97 static void update_alignment_lengths(int32_t & ref_length,
99 char const cigar_operation,
100 uint32_t
const cigar_count);
102 template <
typename align_type,
typename ref_seqs_type>
103 void construct_alignment(align_type & align,
105 [[maybe_unused]] int32_t rid,
106 [[maybe_unused]] ref_seqs_type & ref_seqs,
107 [[maybe_unused]] int32_t ref_start,
110 void transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end)
const;
112 template <
typename cigar_input_type>
115 template <
typename stream_view_type>
116 void read_field(stream_view_type && stream_view, detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target));
118 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
119 void read_field(stream_view_type && stream_view, target_range_type & target);
121 template <
typename stream_view_t, arithmetic arithmetic_target_type>
122 void read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
124 template <
typename stream_view_type,
typename optional_value_type>
127 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
128 void read_header(stream_view_type && stream_view,
129 alignment_file_header<ref_ids_type> & hdr,
132 template <
typename stream_t,
typename ref_
ids_type>
133 void write_header(stream_t & stream,
134 alignment_file_output_options
const & options,
135 alignment_file_header<ref_ids_type> & header);
148 template <
typename ref_id_type,
149 typename ref_id_tmp_type,
150 typename header_type,
151 typename ref_seqs_type>
152 inline void format_sam_base::check_and_assign_ref_id(ref_id_type &
ref_id,
153 ref_id_tmp_type & ref_id_tmp,
154 header_type & header,
157 if (!std::ranges::empty(ref_id_tmp))
159 auto search = header.ref_dict.find(ref_id_tmp);
161 if (
search == header.ref_dict.end())
163 if constexpr(detail::decays_to_ignore_v<ref_seqs_type>)
165 if (ref_info_present_in_header)
167 throw format_error{
"Unknown reference id found in record which is not present in the header."};
171 header.ref_ids().push_back(ref_id_tmp);
173 header.ref_dict[header.ref_ids()[pos]] = pos;
179 throw format_error{
"Unknown reference id found in record which is not present in the given ids."};
195 inline void format_sam_base::update_alignment_lengths(int32_t & ref_length,
196 int32_t & seq_length,
197 char const cigar_operation,
198 uint32_t
const cigar_count)
200 switch (cigar_operation)
202 case 'M':
case '=':
case 'X': ref_length += cigar_count, seq_length += cigar_count;
break;
203 case 'D':
case 'N': ref_length += cigar_count;
break;
204 case 'I' : seq_length += cigar_count;
break;
205 case 'S':
case 'H':
case 'P':
break;
206 default:
throw format_error{
"Illegal cigar operation: " +
std::string{cigar_operation}};
215 inline void format_sam_base::transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector,
217 int32_t & sc_end)
const
220 auto soft_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'S'_cigar_op; };
222 auto hard_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'H'_cigar_op; };
224 auto vector_size_at_least = [&] (
size_t const min_size) {
return cigar_vector.
size() >= min_size; };
226 auto cigar_count_at = [&] (
size_t const index) {
return get<0>(cigar_vector[index]); };
229 if (vector_size_at_least(1) && soft_clipping_at(0))
230 sc_begin = cigar_count_at(0);
231 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
232 sc_begin = cigar_count_at(1);
237 auto last_index = cigar_vector.
size() - 1;
238 auto second_last_index = last_index - 1;
240 if (vector_size_at_least(2) && soft_clipping_at(last_index))
241 sc_end = cigar_count_at(last_index);
242 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
243 sc_end = cigar_count_at(second_last_index);
259 template <
typename cigar_input_type>
264 char cigar_operation{};
265 uint32_t cigar_count{};
266 int32_t ref_length{}, seq_length{};
273 while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view))
276 cigar_operation = *std::ranges::begin(cigar_view);
277 std::ranges::next(std::ranges::begin(cigar_view));
280 throw format_error{
"Corrupted cigar string encountered"};
282 update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
283 operations.emplace_back(cigar_count, cigar_op{}.assign_char(cigar_operation));
286 return {operations, ref_length, seq_length};
299 template <
typename align_type,
typename ref_seqs_type>
300 inline void format_sam_base::construct_alignment(align_type & align,
302 [[maybe_unused]] int32_t rid,
303 [[maybe_unused]] ref_seqs_type & ref_seqs,
304 [[maybe_unused]] int32_t ref_start,
307 if (rid > -1 && ref_start > -1 &&
308 !cigar_vector.
empty() &&
309 !std::ranges::empty(get<1>(align)))
311 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
313 assert(static_cast<size_t>(ref_start + ref_length) <=
std::ranges::size(ref_seqs[rid]));
319 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
320 auto dummy_seq =
views::repeat_n(value_type_t<unaligned_t>{}, ref_length)
322 static_assert(
std::same_as<unaligned_t, decltype(dummy_seq)>,
323 "No reference information was given so the type of the first alignment tuple position"
324 "must have an unaligned sequence type of a dummy sequence ("
325 "views::repeat_n(dna5{}, size_t{}) | "
326 "std::views::transform(detail::access_restrictor_fn{}))");
332 detail::alignment_from_cigar(align, cigar_vector);
336 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
343 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
356 template <
typename stream_view_type>
357 inline void format_sam_base::read_field(stream_view_type && stream_view,
358 detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target))
360 detail::consume(stream_view);
370 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
371 inline void format_sam_base::read_field(stream_view_type && stream_view, target_range_type & target)
373 if (!is_char<'*'>(*std::ranges::begin(stream_view)))
374 std::ranges::copy(stream_view |
views::char_to<value_type_t<target_range_type>>,
375 std::ranges::back_inserter(target));
377 std::ranges::next(std::ranges::begin(stream_view));
390 template <
typename stream_view_t, arithmetic arithmetic_target_type>
391 inline void format_sam_base::read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target)
394 auto [ignore,
end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
398 if (res.ec == std::errc::invalid_argument || res.ptr != end)
399 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '") +
401 "' could not be cast into type " +
402 detail::type_name_as_string<arithmetic_target_type>};
404 if (res.ec == std::errc::result_out_of_range)
406 "' into type " + detail::type_name_as_string<arithmetic_target_type> +
407 " would cause an overflow."};
420 template <
typename stream_view_type,
typename optional_value_type>
423 optional_value_type tmp;
424 read_field(std::forward<stream_view_type>(stream_view), tmp);
444 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
445 inline void format_sam_base::read_header(stream_view_type && stream_view,
446 alignment_file_header<ref_ids_type> & hdr,
449 auto parse_tag_value = [&stream_view,
this] (
auto & value)
452 std::ranges::next(std::ranges::begin(stream_view));
458 parse_tag_value(hdr.format_version);
461 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
463 std::ranges::next(std::ranges::begin(stream_view));
466 if (is_char<'S'>(*std::ranges::begin(stream_view)))
468 std::ranges::next(std::ranges::begin(stream_view));
470 if (is_char<'O'>(*std::ranges::begin(stream_view)))
472 else if (is_char<'S'>(*std::ranges::begin(stream_view)))
475 throw format_error{
std::string{
"Illegal SAM header tag: S"} +
476 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
478 else if (!is_char<'G'>(*std::ranges::begin(stream_view)))
480 throw format_error{
std::string{
"Illegal SAM header tag in @HG starting with:"} +
481 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
484 parse_tag_value(*who);
486 std::ranges::next(std::ranges::begin(stream_view));
490 while (is_char<'@'>(*std::ranges::begin(stream_view)))
492 std::ranges::next(std::ranges::begin(stream_view));
494 if (is_char<'S'>(*std::ranges::begin(stream_view)))
496 ref_info_present_in_header =
true;
501 std::ranges::next(std::ranges::begin(stream_view));
502 parse_tag_value(get<0>(info));
504 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
506 std::ranges::next(std::ranges::begin(stream_view));
509 std::ranges::next(std::ranges::begin(stream_view));
513 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
515 auto id_it = hdr.ref_dict.find(
id);
517 if (id_it == hdr.ref_dict.end())
518 throw format_error{detail::to_string(
"Unknown reference name '",
id,
"' found in SAM header ",
519 "(header.ref_ids(): ", hdr.ref_ids(),
").")};
521 auto & given_ref_info = hdr.ref_id_info[id_it->second];
523 if (std::get<0>(given_ref_info) != std::get<0>(info))
524 throw format_error{
"Provided reference has unequal length as specified in the header."};
526 hdr.ref_id_info[id_it->second] =
std::move(info);
530 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()),
std::deque>,
531 "The range over reference ids must be of type std::deque such that "
532 "pointers are not invalidated.");
534 hdr.ref_ids().push_back(
id);
535 hdr.ref_id_info.push_back(info);
536 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).
size() - 1]] = (hdr.ref_ids()).
size() - 1;
539 else if (is_char<'R'>(*std::ranges::begin(stream_view)))
543 parse_tag_value(get<0>(tmp));
545 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
547 std::ranges::next(std::ranges::begin(stream_view));
550 std::ranges::next(std::ranges::begin(stream_view));
552 hdr.read_groups.emplace_back(
std::move(tmp));
554 else if (is_char<'P'>(*std::ranges::begin(stream_view)))
556 typename alignment_file_header<ref_ids_type>::program_info_t tmp{};
558 parse_tag_value(tmp.id);
561 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
563 std::ranges::next(std::ranges::begin(stream_view));
566 if (is_char<'P'>(*std::ranges::begin(stream_view)))
568 std::ranges::next(std::ranges::begin(stream_view));
570 if (is_char<'N'>(*std::ranges::begin(stream_view)))
575 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
577 who = &tmp.command_line_call;
579 else if (is_char<'D'>(*std::ranges::begin(stream_view)))
581 who = &tmp.description;
583 else if (!is_char<'V'>(*std::ranges::begin(stream_view)))
585 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
586 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
589 parse_tag_value(*who);
591 std::ranges::next(std::ranges::begin(stream_view));
593 hdr.program_infos.emplace_back(
std::move(tmp));
595 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
598 std::ranges::next(std::ranges::begin(stream_view));
599 std::ranges::next(std::ranges::begin(stream_view));
600 std::ranges::next(std::ranges::begin(stream_view));
602 std::ranges::next(std::ranges::begin(stream_view));
604 hdr.comments.emplace_back(
std::move(tmp));
608 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
609 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
630 template <
typename stream_t,
typename ref_
ids_type>
631 inline void format_sam_base::write_header(stream_t & stream,
632 alignment_file_output_options
const & options,
633 alignment_file_header<ref_ids_type> & header)
641 if (!header.sorting.empty() &&
642 !(header.sorting ==
"unknown" ||
643 header.sorting ==
"unsorted" ||
644 header.sorting ==
"queryname" ||
645 header.sorting ==
"coordinate" ))
646 throw format_error{
"SAM format error: The header.sorting member must be "
647 "one of [unknown, unsorted, queryname, coordinate]."};
649 if (!header.grouping.empty() &&
650 !(header.grouping ==
"none" ||
651 header.grouping ==
"query" ||
652 header.grouping ==
"reference"))
653 throw format_error{
"SAM format error: The header.grouping member must be "
654 "one of [none, query, reference]."};
673 stream <<
"@HD\tVN:";
674 std::ranges::copy(format_version, stream_it);
676 if (!header.sorting.empty())
677 stream <<
"\tSO:" << header.sorting;
679 if (!header.subsorting.empty())
680 stream <<
"\tSS:" << header.subsorting;
682 if (!header.grouping.empty())
683 stream <<
"\tGO:" << header.grouping;
685 detail::write_eol(stream_it, options.add_carriage_return);
688 for (
auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
690 stream <<
"@SQ\tSN:";
692 std::ranges::copy(ref_name, stream_it);
694 stream <<
"\tLN:" << get<0>(ref_info);
696 if (!get<1>(ref_info).
empty())
697 stream <<
"\t" << get<1>(ref_info);
699 detail::write_eol(stream_it, options.add_carriage_return);
703 for (
auto const & read_group : header.read_groups)
706 <<
"\tID:" << get<0>(read_group);
708 if (!get<1>(read_group).
empty())
709 stream <<
"\t" << get<1>(read_group);
711 detail::write_eol(stream_it, options.add_carriage_return);
715 for (
auto const & program : header.program_infos)
718 <<
"\tID:" << program.id;
720 if (!program.name.empty())
721 stream <<
"\tPN:" << program.name;
723 if (!program.command_line_call.empty())
724 stream <<
"\tCL:" << program.command_line_call;
726 if (!program.previous.empty())
727 stream <<
"\tPP:" << program.previous;
729 if (!program.description.empty())
730 stream <<
"\tDS:" << program.description;
732 if (!program.version.empty())
733 stream <<
"\tVN:" << program.version;
735 detail::write_eol(stream_it, options.add_carriage_return);
739 for (
auto const &
comment : header.comments)
742 detail::write_eol(stream_it, options.add_carriage_return);