/// General-purpose memory optimized memory utilities. For memory management, see `tern.typecons.automem`.
module tern.memory;

// TODO: Support AVX, add C fallback compilation for when SSE/AVX isn't supported, add memset.
import tern.experimental.heap_allocator;
import tern.traits;
import core.bitop;
import inteli.tmmintrin;
import inteli.emmintrin;
import inteli.smmintrin;

public:
@nogc:
/**
 * Finds the index of the first (`DIR == 0`) or last (`DIR == 1`) set bit in `mask`.
 *
 * Params:
 *  DIR = Direction to scan in: `0` scans from the least significant bit (`bsf`),
 *        `1` scans from the most significant bit (`bsr`).
 *  mask = Mask to be scanned.
 *
 * Returns:
 *  Bit index of the first set bit in `DIR` direction, or `-1` (wraps to
 *  `size_t.max`, used as a not-found sentinel by callers) when `mask` is zero.
 */
pragma(inline)
size_t ctz(uint DIR)(size_t mask)
{
    if (mask == 0)
        return -1;

    static if (DIR == 0)
        return bsf(mask);
    else
        return bsr(mask);
}

/**
 * Finds the index of `elem` in `src` within `0..len` using SIMD intrinsics.
 *
 * Assumes that `len` is a multiple of 16; behavior is undefined if `T.sizeof`
 * is not an integral or vector size (1, 2, 4, 8 or 16 bytes).
 *
 * Params:
 *  DIR = Direction to search in: `0` finds the first index, `1` the last index.
 *  src = Data source pointer.
 *  len = Length in bytes of the data to be searched.
 *  elem = Data to be searched for.
 *
 * Returns:
 *  Index of the match in units of `T.sizeof`-sized elements, or `-1` if not found.
46 */ 47 pragma(inline) 48 size_t memchr(uint DIR, T)(const scope void* src, size_t len, const scope T elem) 49 { 50 static if (T.sizeof == 16) 51 { 52 __m128d val = _mm_loadu_pd(cast(double*)&elem); 53 static if (DIR == 0) 54 { 55 foreach (i; 0..(len / 16)) 56 { 57 if (_mm_cmpeq_pd(_mm_loadu_pd(cast(double*)(cast(__m128d*)src + i)), val) != 0) 58 return i; 59 } 60 } 61 else 62 { 63 foreach_reverse (i; 0..(len / 16)) 64 { 65 if (_mm_cmpeq_pd(_mm_loadu_pd(cast(double*)(cast(__m128d*)src + i)), val) != 0) 66 return i; 67 } 68 } 69 return -1; 70 } 71 else static if (T.sizeof == 8) 72 { 73 ulong val = *cast(ulong*)&elem; 74 static if (DIR == 0) 75 { 76 foreach (i; 0..(len / 16)) 77 { 78 __m128i cmp = _mm_cmpeq_epi64(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi64x(val)); 79 size_t mask = cast(size_t)_mm_movemask_pd(cast(__m128d)cmp); 80 size_t index = ctz!DIR(mask); 81 82 if (index != -1) 83 return index + (i * 2); 84 } 85 } 86 else 87 { 88 foreach_reverse (i; 0..(len / 16)) 89 { 90 __m128i cmp = _mm_cmpeq_epi64(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi64x(val)); 91 size_t mask = cast(size_t)_mm_movemask_pd(cast(__m128d)cmp); 92 size_t index = ctz!DIR(mask); 93 94 if (index != -1) 95 return index + (i * 2); 96 } 97 } 98 return -1; 99 } 100 else static if (T.sizeof == 4) 101 { 102 uint val = *cast(uint*)&elem; 103 static if (DIR == 0) 104 { 105 foreach (i; 0..(len / 16)) 106 { 107 __m128i cmp = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi32(val)); 108 size_t mask = cast(size_t)_mm_movemask_ps(cast(__m128)cmp); 109 size_t index = ctz!DIR(mask); 110 111 if (index != -1) 112 return index + (i * 4); 113 } 114 } 115 else 116 { 117 foreach_reverse (i; 0..(len / 16)) 118 { 119 __m128i cmp = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi32(val)); 120 size_t mask = cast(size_t)_mm_movemask_ps(cast(__m128)cmp); 121 size_t index = ctz!DIR(mask); 122 123 if (index != -1) 124 return index + (i * 4); 125 } 126 } 127 
return -1; 128 } 129 else static if (T.sizeof == 2) 130 { 131 ushort val = *cast(ushort*)&elem; 132 static if (DIR == 0) 133 { 134 foreach (i; 0..(len / 16)) 135 { 136 __m128i cmp = _mm_cmpeq_epi16(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi16(val)); 137 size_t mask = cast(size_t)_mm_movemask_epi8(cast(__m128)cmp); 138 size_t index = ctz!DIR(mask); 139 140 if (index != -1) 141 return (index / 2) + (i * 8); 142 } 143 } 144 else 145 { 146 foreach_reverse (i; 0..(len / 16)) 147 { 148 __m128i cmp = _mm_cmpeq_epi16(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi16(val)); 149 size_t mask = cast(size_t)_mm_movemask_epi8(cast(__m128)cmp); 150 size_t index = ctz!DIR(mask); 151 152 if (index != -1) 153 return (index / 2) + (i * 8); 154 } 155 } 156 return -1; 157 } 158 else 159 { 160 ubyte val = *cast(ubyte*)&elem; 161 static if (DIR == 0) 162 { 163 foreach (i; 0..(len / 16)) 164 { 165 __m128i cmp = _mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi8(val)); 166 size_t mask = cast(size_t)_mm_movemask_epi8(cast(__m128d)cmp); 167 size_t index = ctz!DIR(mask); 168 169 if (index != -1) 170 return index + (i * 16); 171 } 172 } 173 else 174 { 175 foreach_reverse (i; 0..(len / 16)) 176 { 177 __m128i cmp = _mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi8(val)); 178 size_t mask = cast(size_t)_mm_movemask_epi8(cast(__m128d)cmp); 179 size_t index = ctz!DIR(mask); 180 181 if (index != -1) 182 return index + (i * 16); 183 } 184 } 185 return -1; 186 } 187 } 188 189 /** 190 * Allocates an entry of `size` 191 * 192 * Params: 193 * size = Size to be allocated. 194 * 195 * Returns: 196 * Pointer to the allocated entry. 197 */ 198 @trusted void* malloc(size_t size) => tern.experimental.heap_allocator.malloc!true(size); 199 200 /** 201 * Allocates an entry of `size` and clears the entry. 202 * 203 * Params: 204 * size = Size of the new entry. 205 * 206 * Returns: 207 * Pointer to the allocated entry. 
208 */ 209 @trusted void* calloc(size_t size) => tern.experimental.heap_allocator.calloc!true(size); 210 211 /** 212 * Reallocates `ptr` with `size` 213 * Tries to avoid actually doing a new allocation if possible. 214 * 215 * Params: 216 * ptr = Pointer to entry to be reallocated. 217 * size = Size of the new entry. 218 */ 219 @trusted void realloc(ref void* ptr, size_t size) => tern.experimental.heap_allocator.realloc!true(ptr, size); 220 221 /** 222 * Zeroes the entry pointed to by `ptr`. 223 * 224 * Params: 225 * ptr = Pointer to entry to be zeroed. 226 */ 227 @trusted void wake(void* ptr) => tern.experimental.heap_allocator.wake!true(ptr); 228 229 /** 230 * Frees `ptr`, self explanatory. 231 * 232 * Params: 233 * ptr = Pointer to entry to be freed. 234 * 235 * Returns: 236 * True if this succeeded, otherwise false. 237 */ 238 @trusted bool free(void* ptr) => tern.experimental.heap_allocator.free!true(ptr); 239 240 pure: 241 /** 242 * Creates a reference to `V` instance data. 243 * 244 * Returns: 245 * `void*` to `V` instance data. 246 */ 247 pragma(inline) 248 @trusted scope void* reference(alias V)() 249 { 250 static if (is(typeof(V) == class)) 251 return cast(void*)V; 252 else static if (isDynamicArray!(typeof(V))) 253 return cast(void*)V.ptr; 254 else 255 return cast(void*)&V; 256 } 257 258 static: 259 /** 260 * Copies all data from `src` to `dst` within range `0..len`. 261 * 262 * Params: 263 * src = Data source pointer. 264 * dst = Data destination pointer. 265 * len = Length of data to be copied. 
266 */ 267 pragma(inline) 268 void memcpy(const scope void* src, const scope void* dst, size_t len) 269 { 270 switch (len % 16) 271 { 272 case 0: 273 foreach (i; 0..(len / 16)) 274 _mm_storeu_pd(cast(double*)dst + i, _mm_loadu_pd(cast(double*)(cast(__m128d*)src + i))); 275 break; 276 case 8: 277 foreach (i; 0..(len / 8)) 278 (cast(ulong*)dst)[i] = (cast(ulong*)src)[i]; 279 break; 280 case 4: 281 foreach (i; 0..(len / 4)) 282 (cast(uint*)dst)[i] = (cast(uint*)src)[i]; 283 break; 284 case 2: 285 foreach (i; 0..(len / 2)) 286 (cast(ushort*)dst)[i] = (cast(ushort*)src)[i]; 287 break; 288 default: 289 foreach (i; 0..len) 290 (cast(ubyte*)dst)[i] = (cast(ubyte*)src)[i]; 291 break; 292 } 293 } 294 295 /// ditto 296 pragma(inline) 297 void memcpy(size_t len)(const scope void* src, const scope void* dst) 298 { 299 static if (len % 16 == 0) 300 { 301 static foreach (i; 0..(len / 16)) 302 _mm_storeu_pd(cast(double*)dst + i, _mm_loadu_pd(cast(__m128d*)src + i)); 303 } 304 else static if (len % 8 == 0) 305 { 306 static foreach (i; 0..(len / 8)) 307 (cast(ulong*)dst)[i] = (cast(ulong*)src)[i]; 308 } 309 else static if (len % 4 == 0) 310 { 311 static foreach (i; 0..(len / 4)) 312 (cast(uint*)dst)[i] = (cast(uint*)src)[i]; 313 } 314 else static if (len % 2 == 0) 315 { 316 static foreach (i; 0..(len / 2)) 317 (cast(ushort*)dst)[i] = (cast(ushort*)src)[i]; 318 } 319 else 320 { 321 static foreach (i; 0..len) 322 (cast(ubyte*)dst)[i] = (cast(ubyte*)src)[i]; 323 } 324 } 325 326 /** 327 * Zeroes all bytes at `src` within range `0..len`. 328 * 329 * Params: 330 * src = Data source pointer. 331 * len = Length of data to be copied. 
332 */ 333 pragma(inline) 334 void memzero(const scope void* src, size_t len) 335 { 336 switch (len % 16) 337 { 338 case 0: 339 foreach (i; 0..(len / 16)) 340 _mm_storeu_pd(cast(double*)src + i, [0, 0]); 341 break; 342 case 8: 343 foreach (i; 0..(len / 8)) 344 (cast(ulong*)src)[i] = 0; 345 break; 346 case 4: 347 foreach (i; 0..(len / 4)) 348 (cast(uint*)src)[i] = 0; 349 break; 350 case 2: 351 foreach (i; 0..(len / 2)) 352 (cast(ushort*)src)[i] = 0; 353 break; 354 default: 355 foreach (i; 0..len) 356 (cast(ubyte*)src)[i] = 0; 357 break; 358 } 359 } 360 361 /** 362 * Swaps all bytes at `src` from `0..len`. 363 * 364 * Params: 365 * src = Data source pointer. 366 * len = Length of data to be byte-swapped. 367 */ 368 pragma(inline) 369 void byteswap(const scope void* src, size_t len) 370 { 371 if (len % 16 == 0) 372 { 373 foreach (i; 0..(len / 16)) 374 { 375 __m128i mask = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 376 _mm_storeu_pd(cast(double*)(cast(__m128i*)src + i), cast(__m128d)_mm_shuffle_epi8(_mm_loadu_si128(cast(__m128i*)src + i), mask)); 377 } 378 379 size_t end = (len / 32); 380 foreach (i; 0..(len / 32)) 381 { 382 __m128d t = _mm_loadu_pd(cast(double*)(cast(__m128*)src + i)); 383 _mm_storeu_pd(cast(double*)(cast(__m128*)src + i), _mm_loadu_pd(cast(double*)(cast(__m128*)src + (end - i)))); 384 _mm_storeu_pd(cast(double*)(cast(__m128*)src + (end - i)), t); 385 } 386 return; 387 } 388 389 // TODO: Not this 390 import std.algorithm : reverse; 391 (cast(ubyte*)src)[0..len].reverse(); 392 } 393 394 pragma(inline) 395 void emplace(T)(T* ptr) 396 { 397 if (!hasChildren!T) 398 { 399 T t; 400 memcpy!T.sizeof(cast(void*)ptr, cast(void*)&t); 401 } 402 else 403 { 404 static foreach (field; Fields!T) 405 { 406 typeof(getChild!(T, field)) t; 407 memcpy!T.sizeof(cast(void*)ptr + getChild!(T, field).offsetof, cast(void*)&t); 408 } 409 } 410 }