/* Exercises whole-structure and single-lane loads/stores of a
   uint32x4x2_t through the helper macros below.

   NOTE(review): __load_lanes, __store_lanes, __load_lane, __store_lane and
   __array_ref are not defined in this file; presumably they are
   compiler-provided extensions -- confirm against the toolchain this file
   is built with.  */

#include "arm_neon.h"

/* Assign to (DEST).val the result of __load_lanes applied to
   __array_ref (SRC, X * Y), with TYPE##x##Y##_t[X] as its second operand
   (e.g. uint32x4_t[2] for TYPE=uint32, X=2, Y=4).  */
#define LOAD(DEST, TYPE, X, Y, SRC) \
  (DEST).val = __load_lanes (__array_ref (SRC, X * Y), TYPE##x##Y##_t[X])

/* Assign to __array_ref (DEST, X * Y) the result of __store_lanes applied to
   (SRC).val, with TYPE##_t [X * Y] as its second operand
   (e.g. uint32_t [2 * 4] for TYPE=uint32, X=2, Y=4).  */
#define STORE(DEST, TYPE, X, Y, SRC) \
  __array_ref (DEST, X * Y) = __store_lanes ((SRC).val, TYPE##_t [X * Y])

/* Assign to (DEST).val the result of __load_lane applied to
   __array_ref (SRC, X * Y), the current (DEST).val and LANE.
   TYPE is accepted but not used in the expansion.
   NOTE(review): this passes X * Y to __array_ref whereas STORE_LANE passes
   X -- confirm whether the difference is intentional.  */
#define LOAD_LANE(DEST, TYPE, X, Y, SRC, LANE) \
  (DEST).val = __load_lane (__array_ref (SRC, X * Y), (DEST).val, LANE)

/* Assign to __array_ref (DEST, X) the result of __store_lane applied to
   (SRC).val and LANE, with TYPE##_t[X] as its third operand
   (e.g. uint32_t[2] for TYPE=uint32, X=2).
   Y is accepted but not used in the expansion.  */
#define STORE_LANE(DEST, TYPE, X, Y, SRC, LANE) \
  __array_ref (DEST, X) = __store_lane ((SRC).val, LANE, TYPE##_t[X])

/* LOAD x from A and y from A + 12 (offset in uint32_t elements), add the
   two structures vector by vector, and STORE the sum back through A.  */
void
foo (uint32_t *a)
{
  uint32x4x2_t x, y;

  LOAD (x, uint32, 2, 4, a);
  LOAD (y, uint32, 2, 4, a + 12);
  /* Add corresponding vectors of x and y; the sum is kept in x.  */
  x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
  x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
  STORE (a, uint32, 2, 4, x);
}

/* Same sequence as foo, except that both x and y are LOADed from the same
   address A before being added and stored back through A.  */
void
bar (uint32_t *a)
{
  uint32x4x2_t x, y;

  LOAD (x, uint32, 2, 4, a);
  LOAD (y, uint32, 2, 4, a);
  /* Add corresponding vectors of x and y; the sum is kept in x.  */
  x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
  x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
  STORE (a, uint32, 2, 4, x);
}

/* LOAD x and y as in foo, then apply LOAD_LANE to each (lane 1 of x from
   A + 32, lane 2 of y from A + 36), add the structures vector by vector,
   and write the sum out with four STORE_LANE operations.  */
void
frob (uint32_t *a)
{
  uint32x4x2_t x, y;

  LOAD (x, uint32, 2, 4, a);
  LOAD (y, uint32, 2, 4, a + 12);
  /* Lane loads take the previously loaded value as an operand, so they
     come after the full loads above.  */
  LOAD_LANE (x, uint32, 2, 4, a + 32, 1);
  LOAD_LANE (y, uint32, 2, 4, a + 36, 2);
  /* Add corresponding vectors of x and y; the sum is kept in x.  */
  x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
  x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
  /* Store lanes 3, 1, 0 and 2 of x at A, A + 4, A + 8 and A + 12
     respectively.  */
  STORE_LANE (a, uint32, 2, 4, x, 3);
  STORE_LANE (a + 4, uint32, 2, 4, x, 1);
  STORE_LANE (a + 8, uint32, 2, 4, x, 0);
  STORE_LANE (a + 12, uint32, 2, 4, x, 2);
}