/// General-purpose, SIMD-optimized memory utilities. For memory management, see `tern.typecons.automem`.
2 module tern.memory;
3 
// TODO: Support AVX, add a C fallback for when SSE/AVX isn't supported, add memset
5 import tern.experimental.heap_allocator;
6 import tern.traits;
7 import core.bitop;
8 import inteli.tmmintrin;
9 import inteli.emmintrin;
10 import inteli.smmintrin;
11 
12 public:
13 @nogc:
14 /**
15  * Counts the number of trailing zeroes in `DIR` direction in `mask`.
16  *
17  * Params:
18  *  DIR = Direction to count in, will find index if `DIR == 0` or last index if `DIR == 1`.
19  *  mask = Mask to be counted from.
20  * 
21  * Returns:
22  *  Number of trailing zeroes in `DIR` direction in `mask`.
23  */
24 pragma(inline)
25 size_t ctz(uint DIR)(size_t mask) 
26 {
27     if (mask == 0)
28         return -1;
29 
30     static if (DIR == 0)
31         return bsf(mask);
32     else
33         return bsr(mask);
34 }
35 
36 /**
37  * Finds the index of `elem` in `src` within `0..len` using SIMD intrinsics.
38  *
39  * Assumes that `len` is a multiple of 16 and undefined behavior if `T` is not an integral or vector size.
40  *
41  * Params:
42  *  DIR = Direction to count in, will find index if `DIR == 0` or last index if `DIR == 1`.
43  *  src = Data source pointer.
44  *  len = Length of data to be memchrned.
45  *  elem = Data to be searched for.
46  */
47 pragma(inline)
48 size_t memchr(uint DIR, T)(const scope void* src, size_t len, const scope T elem)
49 {
    static if (T.sizeof == 16)
    {
        __m128i val = _mm_loadu_si128(cast(__m128i*)&elem);
        static if (DIR == 0)
        {
            foreach (i; 0..(len / 16))
            {
                // All 16 bytes must match for a 16-byte element to be considered equal.
                if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), val)) == 0xFFFF)
                    return i;
            }
        }
        else
        {
            foreach_reverse (i; 0..(len / 16))
            {
                if (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), val)) == 0xFFFF)
                    return i;
            }
        }
        return -1;
    }
71     else static if (T.sizeof == 8)
72     {
73         ulong val = *cast(ulong*)&elem;
74         static if (DIR == 0)
75         {
76             foreach (i; 0..(len / 16))
77             {
78                 __m128i cmp = _mm_cmpeq_epi64(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi64x(val));
79                 size_t mask = cast(size_t)_mm_movemask_pd(cast(__m128d)cmp);
80                 size_t index = ctz!DIR(mask);
81 
82                 if (index != -1)
83                     return index + (i * 2);
84             }
85         }
86         else
87         {
88             foreach_reverse (i; 0..(len / 16))
89             {
90                 __m128i cmp = _mm_cmpeq_epi64(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi64x(val));
91                 size_t mask = cast(size_t)_mm_movemask_pd(cast(__m128d)cmp);
92                 size_t index = ctz!DIR(mask);
93 
94                 if (index != -1)
95                     return index + (i * 2);
96             }
97         }
98         return -1;
99     }
100     else static if (T.sizeof == 4)
101     {
102         uint val = *cast(uint*)&elem;
103         static if (DIR == 0)
104         {
105             foreach (i; 0..(len / 16))
106             {
107                 __m128i cmp = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi32(val));
108                 size_t mask = cast(size_t)_mm_movemask_ps(cast(__m128)cmp);
109                 size_t index = ctz!DIR(mask);
110 
111                 if (index != -1)
112                     return index + (i * 4);
113             }
114         }
115         else
116         {
117             foreach_reverse (i; 0..(len / 16))
118             {
119                 __m128i cmp = _mm_cmpeq_epi32(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi32(val));
120                 size_t mask = cast(size_t)_mm_movemask_ps(cast(__m128)cmp);
121                 size_t index = ctz!DIR(mask);
122 
123                 if (index != -1)
124                     return index + (i * 4);
125             }
126         }
127         return -1;
128     }
129     else static if (T.sizeof == 2)
130     {
131         ushort val = *cast(ushort*)&elem;
132         static if (DIR == 0)
133         {
134             foreach (i; 0..(len / 16))
135             {
136                 __m128i cmp = _mm_cmpeq_epi16(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi16(val));
                size_t mask = cast(size_t)_mm_movemask_epi8(cmp);
138                 size_t index = ctz!DIR(mask);
139 
140                 if (index != -1)
141                     return (index / 2) + (i * 8);
142             }
143         }
144         else
145         {
146             foreach_reverse (i; 0..(len / 16))
147             {
148                 __m128i cmp = _mm_cmpeq_epi16(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi16(val));
                size_t mask = cast(size_t)_mm_movemask_epi8(cmp);
150                 size_t index = ctz!DIR(mask);
151 
152                 if (index != -1)
153                     return (index / 2) + (i * 8);
154             }
155         }
156         return -1;
157     }
158     else
159     {
160         ubyte val = *cast(ubyte*)&elem;
161         static if (DIR == 0)
162         {
163             foreach (i; 0..(len / 16))
164             {
165                 __m128i cmp = _mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi8(val));
                size_t mask = cast(size_t)_mm_movemask_epi8(cmp);
167                 size_t index = ctz!DIR(mask);
168 
169                 if (index != -1)
170                     return index + (i * 16);
171             }
172         }
173         else
174         {
175             foreach_reverse (i; 0..(len / 16))
176             {
177                 __m128i cmp = _mm_cmpeq_epi8(_mm_loadu_si128(cast(__m128i*)src + i), _mm_set1_epi8(val));
                size_t mask = cast(size_t)_mm_movemask_epi8(cmp);
179                 size_t index = ctz!DIR(mask);
180 
181                 if (index != -1)
182                     return index + (i * 16);
183             }
184         }
185         return -1;
186     }
187 }
188 
189 /**
 * Allocates an entry of `size` bytes.
191  *
192  * Params:
193  *  size = Size to be allocated.
194  *
195  * Returns:
196  *  Pointer to the allocated entry.
197  */
198 @trusted void* malloc(size_t size) => tern.experimental.heap_allocator.malloc!true(size);
199 
200 /**
201  * Allocates an entry of `size` and clears the entry.
202  *
203  * Params:
204  *  size = Size of the new entry.
205  *
206  * Returns:
207  *  Pointer to the allocated entry.
208  */
209 @trusted void* calloc(size_t size) => tern.experimental.heap_allocator.calloc!true(size);
210 
211 /**
 * Reallocates `ptr` to `size` bytes.
 * Tries to avoid doing a new allocation if possible.
214  *
215  * Params:
216  *  ptr = Pointer to entry to be reallocated.
217  *  size = Size of the new entry.
218  */
219 @trusted void realloc(ref void* ptr, size_t size) => tern.experimental.heap_allocator.realloc!true(ptr, size);
220 
221 /**
222  * Zeroes the entry pointed to by `ptr`.
223  *
224  * Params:
225  *  ptr = Pointer to entry to be zeroed.
226  */
227 @trusted void wake(void* ptr) => tern.experimental.heap_allocator.wake!true(ptr);
228 
229 /**
 * Frees the entry pointed to by `ptr`.
231  *
232  * Params:
233  *  ptr = Pointer to entry to be freed.
234  *
235  * Returns:
236  *  True if this succeeded, otherwise false.
237  */
238 @trusted bool free(void* ptr) => tern.experimental.heap_allocator.free!true(ptr);
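
// Example (minimal sketch, assuming the experimental heap allocator behaves like a
// conventional allocator): allocate an entry, grow it, then free it.
unittest
{
    void* ptr = malloc(32);
    assert(ptr !is null);
    realloc(ptr, 64);
    free(ptr);
}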
239 
240 pure:
241 /**
 * Creates a reference to the instance data of `V`.
 *
 * Returns:
 *  `void*` to the instance data of `V`.
246  */
247 pragma(inline)
248 @trusted scope void* reference(alias V)()
249 {
250     static if (is(typeof(V) == class))
251         return cast(void*)V;
252     else static if (isDynamicArray!(typeof(V)))
253         return cast(void*)V.ptr;
254     else
255         return cast(void*)&V;
256 }
257 
258 static:
259 /** 
260  * Copies all data from `src` to `dst` within range `0..len`.
261  *
262  * Params:
263  *  src = Data source pointer.
264  *  dst = Data destination pointer.
265  *  len = Length of data to be copied.
266  */
267 pragma(inline)
268 void memcpy(const scope void* src, const scope void* dst, size_t len)
269 {
270     switch (len % 16)
271     {
272         case 0:
273             foreach (i; 0..(len / 16))
                _mm_storeu_pd(cast(double*)(cast(__m128d*)dst + i), _mm_loadu_pd(cast(double*)(cast(__m128d*)src + i)));
275             break;
276         case 8:
277             foreach (i; 0..(len / 8))
278                 (cast(ulong*)dst)[i] = (cast(ulong*)src)[i];
279             break;
280         case 4:
281             foreach (i; 0..(len / 4))
282                 (cast(uint*)dst)[i] = (cast(uint*)src)[i];
283             break;
284         case 2:
285             foreach (i; 0..(len / 2))
286                 (cast(ushort*)dst)[i] = (cast(ushort*)src)[i];
287             break;
288         default:
289             foreach (i; 0..len)
290                 (cast(ubyte*)dst)[i] = (cast(ubyte*)src)[i];
291             break;
292     }
293 }
294 
295 /// ditto
296 pragma(inline)
297 void memcpy(size_t len)(const scope void* src, const scope void* dst)
298 {
299     static if (len % 16 == 0)
300     {
301         static foreach (i; 0..(len / 16))
            _mm_storeu_pd(cast(double*)(cast(__m128d*)dst + i), _mm_loadu_pd(cast(double*)(cast(__m128d*)src + i)));
303     }
304     else static if (len % 8 == 0)
305     {            
306         static foreach (i; 0..(len / 8))
307             (cast(ulong*)dst)[i] = (cast(ulong*)src)[i];
308     }
309     else static if (len % 4 == 0)
310     {            
311         static foreach (i; 0..(len / 4))
312             (cast(uint*)dst)[i] = (cast(uint*)src)[i];
313     }
314     else static if (len % 2 == 0)
315     {            
316         static foreach (i; 0..(len / 2))
317             (cast(ushort*)dst)[i] = (cast(ushort*)src)[i];
318     }
319     else
320     {            
321         static foreach (i; 0..len)
322             (cast(ubyte*)dst)[i] = (cast(ubyte*)src)[i];
323     }
324 }
325 
326 /** 
327  * Zeroes all bytes at `src` within range `0..len`.
328  *
329  * Params:
330  *  src = Data source pointer.
 *  len = Length of data to be zeroed.
332  */
333 pragma(inline)
334 void memzero(const scope void* src, size_t len)
335 {
336     switch (len % 16)
337     {
338         case 0:
339             foreach (i; 0..(len / 16))
                _mm_storeu_pd(cast(double*)(cast(__m128d*)src + i), _mm_setzero_pd());
341             break;
342         case 8:
343             foreach (i; 0..(len / 8))
344                 (cast(ulong*)src)[i] = 0;
345             break;
346         case 4:
347             foreach (i; 0..(len / 4))
348                 (cast(uint*)src)[i] = 0;
349             break;
350         case 2:
351             foreach (i; 0..(len / 2))
352                 (cast(ushort*)src)[i] = 0;
353             break;
354         default:
355             foreach (i; 0..len)
356                 (cast(ubyte*)src)[i] = 0;
357             break;
358     }
359 }
360 
361 /**
 * Reverses the order of all bytes at `src` within range `0..len`.
363  *
364  * Params:
365  *  src = Data source pointer.
366  *  len = Length of data to be byte-swapped.
367  */
368 pragma(inline)
369 void byteswap(const scope void* src, size_t len)
370 {
371     if (len % 16 == 0)
372     {
        // Reverse the bytes within each 16-byte block, then reverse the order of the blocks.
        __m128i mask = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        foreach (i; 0..(len / 16))
            _mm_storeu_si128(cast(__m128i*)src + i, _mm_shuffle_epi8(_mm_loadu_si128(cast(__m128i*)src + i), mask));

        size_t blocks = len / 16;
        foreach (i; 0..(blocks / 2))
        {
            __m128i t = _mm_loadu_si128(cast(__m128i*)src + i);
            _mm_storeu_si128(cast(__m128i*)src + i, _mm_loadu_si128(cast(__m128i*)src + (blocks - 1 - i)));
            _mm_storeu_si128(cast(__m128i*)src + (blocks - 1 - i), t);
        }
386         return;
387     }
388 
389     // TODO: Not this
390     import std.algorithm : reverse;
391     (cast(ubyte*)src)[0..len].reverse();
392 }
393 
/**
 * Default-initializes the `T` pointed to by `ptr` in place.
 *
 * Params:
 *  ptr = Pointer to the `T` instance to be emplaced.
 */
pragma(inline)
void emplace(T)(T* ptr)
{
    static if (!hasChildren!T)
    {
        T t;
        memcpy!(T.sizeof)(cast(void*)&t, cast(void*)ptr);
    }
    else
    {
        static foreach (field; Fields!T)
        {{
            typeof(getChild!(T, field)) t;
            memcpy!(typeof(getChild!(T, field)).sizeof)(cast(void*)&t, cast(void*)ptr + getChild!(T, field).offsetof);
        }}
    }
}