SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
simd_algorithm_sse4.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 
21 
22 //-----------------------------------------------------------------------------
23 // forward declare sse4 simd algorithms that use sse4 intrinsics
24 //-----------------------------------------------------------------------------
25 
26 namespace seqan3::detail
27 {
// Forward declarations only: the definitions further down in this file are compiled solely when
// __SSE4_2__ is defined.

// Loads a simd vector of type simd_t from the memory location mem_addr.
31 template <simd::simd_concept simd_t>
32 constexpr simd_t load_sse4(void const * mem_addr);
33 
// Transposes the given matrix in place; the matrix has as many rows as simd_t has elements.
37 template <simd::simd_concept simd_t>
38 inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix);
39 
// Widens elements of src to the larger scalar type of target_simd_t with sign extension.
43 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
44 constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src);
45 
// Widens elements of src to the larger scalar type of target_simd_t with zero extension.
49 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
50 constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src);
51 
// Moves the half of src selected by index into the lowest bytes of the result.
55 template <uint8_t index, simd::simd_concept simd_t>
56 constexpr simd_t extract_half_sse4(simd_t const & src);
57 
// Moves the quarter of src selected by index into the lowest bytes of the result.
61 template <uint8_t index, simd::simd_concept simd_t>
62 constexpr simd_t extract_quarter_sse4(simd_t const & src);
63 
// Moves the eighth of src selected by index into the lowest bytes of the result.
67 template <uint8_t index, simd::simd_concept simd_t>
68 constexpr simd_t extract_eighth_sse4(simd_t const & src);
69 
70 }
71 
72 //-----------------------------------------------------------------------------
73 // implementation
74 //-----------------------------------------------------------------------------
75 
76 #ifdef __SSE4_2__
77 
78 namespace seqan3::detail
79 {
80 
81 template <simd::simd_concept simd_t>
82 constexpr simd_t load_sse4(void const * mem_addr)
83 {
84  return reinterpret_cast<simd_t>(_mm_loadu_si128(reinterpret_cast<__m128i const *>(mem_addr)));
85 }
86 
87 template <simd::simd_concept simd_t>
88 inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
89 {
90  static_assert(simd_traits<simd_t>::length == simd_traits<simd_t>::max_length, "Expects byte scalar type.");
91  static_assert(is_native_builtin_simd_v<simd_t>, "The passed simd vector is not a native SSE4 simd vector type.");
92  static_assert(is_builtin_simd_v<simd_t>, "The passed simd vector is not a builtin vector type.");
93 
94  // we need a look-up table to reverse the lowest 4 bits
95  // in order to place the permute the transposed rows
96  constexpr std::array<char, 16> bit_reverse{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
97 
98  // transpose a 16x16 byte matrix
99  //
100  // matrix =
101  // A0 A1 A2 ... Ae Af
102  // B0 B1 B2 ... Be Bf
103  // ...
104  // P0 P1 P2 ... Pe Pf
105  __m128i tmp1[16];
106  for (int i = 0; i < 8; ++i)
107  {
108  tmp1[i] = _mm_unpacklo_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
109  reinterpret_cast<__m128i &>(matrix[2*i+1]));
110  tmp1[i+8] = _mm_unpackhi_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
111  reinterpret_cast<__m128i &>(matrix[2*i+1]));
112  }
113  // tmp1[0] = A0 B0 A1 B1 ... A7 B7
114  // tmp1[1] = C0 D0 C1 D1 ... C7 D7
115  // ...
116  // tmp1[7] = O0 P0 O1 P1 ... O7 P7
117  // tmp1[8] = A8 B8 A9 B9 ... Af Bf
118  // ...
119  // tmp1[15] = O8 P8 O9 P9 ... Of Pf
120  __m128i tmp2[16];
121  for (int i = 0; i < 8; ++i)
122  {
123  tmp2[i] = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
124  tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
125  }
126  // tmp2[0] = A0 B0 C0 D0 ... A3 B3 C3 D3
127  // tmp2[1] = E0 F0 G0 H0 ... E3 F3 G3 H3
128  // ...
129  // tmp2[3] = M0 N0 O0 P0 ... M3 N3 O3 P3
130  // tmp2[4] = A8 B8 C8 D8 ... Ab Bb Cb Db
131  // ...
132  // tmp2[7] = M8 N8 O8 P8 ... Mb Nb Ob Pb
133  // tmp2[8] = A4 B4 C4 D4 ... A7 B7 C7 D7
134  // ..
135  // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
136  // ...
137  // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
138  for (int i = 0; i < 8; ++i)
139  {
140  tmp1[i] = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
141  tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
142  }
143  // tmp1[0] = A0 B0 .... H0 A1 B1 .... H1
144  // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
145  // ...
146  // tmp1[4] = A0 B0 .... H0 A1 B1 .... H1
147  // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
148  for (int i = 0; i < 8; ++i)
149  {
150  matrix[bit_reverse[i]] = reinterpret_cast<simd_t>(_mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
151  matrix[bit_reverse[i+8]] = reinterpret_cast<simd_t>(_mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
152  }
153 }
154 
155 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
156 constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src)
157 {
158  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
159  {
160  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
161  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi16(reinterpret_cast<__m128i const &>(src)));
162  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
163  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi32(reinterpret_cast<__m128i const &>(src)));
164  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
165  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi64(reinterpret_cast<__m128i const &>(src)));
166  }
167  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
168  {
169  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
170  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi32(reinterpret_cast<__m128i const &>(src)));
171  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
172  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi64(reinterpret_cast<__m128i const &>(src)));
173  }
174  else // cast from epi32 to epi64
175  {
176  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
177  return reinterpret_cast<target_simd_t>(_mm_cvtepi32_epi64(reinterpret_cast<__m128i const &>(src)));
178  }
179 }
180 
181 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
182 constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src)
183 {
184  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
185  {
186  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
187  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi16(reinterpret_cast<__m128i const &>(src)));
188  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
189  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi32(reinterpret_cast<__m128i const &>(src)));
190  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
191  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi64(reinterpret_cast<__m128i const &>(src)));
192  }
193  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
194  {
195  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
196  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi32(reinterpret_cast<__m128i const &>(src)));
197  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
198  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi64(reinterpret_cast<__m128i const &>(src)));
199  }
200  else // cast from epi32 to epi64
201  {
202  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
203  return reinterpret_cast<target_simd_t>(_mm_cvtepu32_epi64(reinterpret_cast<__m128i const &>(src)));
204  }
205 }
206 
207 template <uint8_t index, simd::simd_concept simd_t>
208 constexpr simd_t extract_half_sse4(simd_t const & src)
209 {
210  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), (index) << 3));
211 }
212 
213 template <uint8_t index, simd::simd_concept simd_t>
214 constexpr simd_t extract_quarter_sse4(simd_t const & src)
215 {
216  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 2));
217 }
218 
219 template <uint8_t index, simd::simd_concept simd_t>
220 constexpr simd_t extract_eighth_sse4(simd_t const & src)
221 {
222  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 1));
223 }
224 
225 } // namespace seqan3::detail
226 
227 #endif // __SSE4_2__
Provides seqan3::simd::simd_concept.
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Provides seqan3::simd::simd_traits.