diff options
author | Eric <ewild@sysmocom.de> | 2020-07-23 02:16:46 +0200 |
---|---|---|
committer | laforge <laforge@osmocom.org> | 2020-08-06 16:47:40 +0000 |
commit | 3afc1d1777e1c0dc0fe832db2c9746fb8a767fe3 (patch) | |
tree | 96f104934eb857ba1d66516f4df7e8e6865c9790 /src/conv_acc_neon.c | |
parent | 2c962f5de1eeea119cfac7d9d92db31c570353b9 (diff) |
libomsocoding: NEON viterbi acceleration
configure flag required to enable this: --enable-neon
Although autodetection according to __ARM_NEON would work because this
is only defined if the fpu is neon neon-fp16 neon-vfpv3 neon-vfpv4
neon-fp-armv8 crypto-neon-fp-armv8 doing that would lead to a unknown
performance impact, so it needs to be enabled manually.
Speedup is about ~1.3-1.5 on a unspecified single core Cortex A9. This
requires handling a special case for RACH with len 14 which is far too
short for neon and would actually incur a performance penalty of 25%.
Related: OS#4585
Change-Id: I58ff2cb4ce3514f43390ff0a2121f81e6a4983b5
Diffstat (limited to 'src/conv_acc_neon.c')
-rw-r--r-- | src/conv_acc_neon.c | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/src/conv_acc_neon.c b/src/conv_acc_neon.c new file mode 100644 index 00000000..72449468 --- /dev/null +++ b/src/conv_acc_neon.c @@ -0,0 +1,110 @@ +/*! \file conv_acc_neon.c + * Accelerated Viterbi decoder implementation + * for architectures with only NEON available. */ +/* + * (C) 2020 by sysmocom - s.f.m.c. GmbH + * Author: Eric Wild + * + * All Rights Reserved + * + * SPDX-License-Identifier: GPL-2.0+ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdlib.h> +#include <stdint.h> +#include <malloc.h> +#include "config.h" + +#if defined(HAVE_NEON) +#include <arm_neon.h> +#endif + +/* align req is 16 on android because google was confused, 8 on sane platforms */ +#define NEON_ALIGN 8 + +#include <conv_acc_neon_impl.h> + +/* Aligned Memory Allocator + * NEON requires 8-byte memory alignment. We store relevant trellis values + * (accumulated sums, outputs, and path decisions) as 16 bit signed integers + * so the allocated memory is casted as such. + */ +__attribute__ ((visibility("hidden"))) +int16_t *osmo_conv_neon_vdec_malloc(size_t n) +{ + return (int16_t *) memalign(NEON_ALIGN, sizeof(int16_t) * n); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_vdec_free(int16_t *ptr) +{ + free(ptr); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k5_n2(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[0], val[1] }; + + _neon_metrics_k5_n2(_val, out, sums, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k5_n3(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[2], 0 }; + + _neon_metrics_k5_n4(_val, out, sums, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k5_n4(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[2], val[3] }; + + _neon_metrics_k5_n4(_val, out, sums, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k7_n2(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[0], val[1] }; + + _neon_metrics_k7_n2(_val, out, sums, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k7_n3(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[2], 0 }; + + _neon_metrics_k7_n4(_val, out, sums, paths, norm); +} + +__attribute__ ((visibility("hidden"))) +void osmo_conv_neon_metrics_k7_n4(const int8_t *val, const int16_t *out, + int16_t *sums, int16_t *paths, int norm) +{ + const int16_t _val[4] = { val[0], val[1], val[2], val[3] }; + + _neon_metrics_k7_n4(_val, out, sums, paths, norm); +} |