Line 0
Link Here
|
|
|
1 |
From 2368b59a81003cb87869a8f01615e28bbf8326bf Mon Sep 17 00:00:00 2001 |
2 |
From: Ilya Kurdyukov <jpegqs@gmail.com> |
3 |
Date: Fri, 26 Mar 2021 14:57:57 +0700 |
4 |
Subject: [PATCH] ffmpeg-4.3.1 e2k support |
5 |
|
6 |
--- |
7 |
configure | 20 +- |
8 |
libavcodec/audiodsp.c | 2 + |
9 |
libavcodec/audiodsp.h | 1 + |
10 |
libavcodec/blockdsp.c | 2 + |
11 |
libavcodec/blockdsp.h | 1 + |
12 |
libavcodec/e2k/Makefile | 31 + |
13 |
libavcodec/e2k/audiodsp.c | 62 + |
14 |
libavcodec/e2k/blockdsp.c | 72 + |
15 |
libavcodec/e2k/dctdsp.h | 27 + |
16 |
libavcodec/e2k/fdctdsp.c | 389 +++++ |
17 |
libavcodec/e2k/fft.c | 1043 +++++++++++++ |
18 |
libavcodec/e2k/fft.h | 29 + |
19 |
libavcodec/e2k/fft_init.c | 152 ++ |
20 |
libavcodec/e2k/fmtconvert.c | 55 + |
21 |
libavcodec/e2k/h264chroma.c | 63 + |
22 |
libavcodec/e2k/h264chroma_template.c | 113 ++ |
23 |
libavcodec/e2k/h264dsp.c | 820 +++++++++++ |
24 |
libavcodec/e2k/h264qpel.c | 255 ++++ |
25 |
libavcodec/e2k/h264qpel_template.c | 354 +++++ |
26 |
libavcodec/e2k/hevcdsp.c | 94 ++ |
27 |
libavcodec/e2k/hpeldsp.c | 302 ++++ |
28 |
libavcodec/e2k/hpeldsp.h | 30 + |
29 |
libavcodec/e2k/idctdsp.c | 237 +++ |
30 |
libavcodec/e2k/lossless_audiodsp.c | 75 + |
31 |
libavcodec/e2k/lossless_videodsp.c | 59 + |
32 |
libavcodec/e2k/mdct15.c | 187 +++ |
33 |
libavcodec/e2k/me_cmp.c | 461 ++++++ |
34 |
libavcodec/e2k/mpegaudiodsp.c | 142 ++ |
35 |
libavcodec/e2k/mpegvideo.c | 100 ++ |
36 |
libavcodec/e2k/mpegvideodsp.c | 86 ++ |
37 |
libavcodec/e2k/mpegvideoencdsp.c | 75 + |
38 |
libavcodec/e2k/pixblockdsp.c | 83 ++ |
39 |
libavcodec/e2k/svq1enc.c | 68 + |
40 |
libavcodec/e2k/vc1dsp.c | 303 ++++ |
41 |
libavcodec/e2k/videodsp.c | 36 + |
42 |
libavcodec/e2k/vorbisdsp.c | 62 + |
43 |
libavcodec/e2k/vp3dsp.c | 169 +++ |
44 |
libavcodec/e2k/vp8dsp.c | 428 ++++++ |
45 |
libavcodec/e2k/vp9dsp.c | 1740 ++++++++++++++++++++++ |
46 |
libavcodec/fdctdsp.c | 2 + |
47 |
libavcodec/fdctdsp.h | 2 + |
48 |
libavcodec/fft.h | 1 + |
49 |
libavcodec/fft_template.c | 1 + |
50 |
libavcodec/fmtconvert.c | 2 + |
51 |
libavcodec/fmtconvert.h | 1 + |
52 |
libavcodec/h264chroma.c | 2 + |
53 |
libavcodec/h264chroma.h | 1 + |
54 |
libavcodec/h264dsp.c | 1 + |
55 |
libavcodec/h264dsp.h | 2 + |
56 |
libavcodec/h264qpel.c | 2 + |
57 |
libavcodec/h264qpel.h | 1 + |
58 |
libavcodec/hevcdsp.c | 2 + |
59 |
libavcodec/hevcdsp.h | 1 + |
60 |
libavcodec/hpeldsp.c | 2 + |
61 |
libavcodec/hpeldsp.h | 1 + |
62 |
libavcodec/idctdsp.c | 2 + |
63 |
libavcodec/idctdsp.h | 2 + |
64 |
libavcodec/lossless_audiodsp.c | 2 + |
65 |
libavcodec/lossless_audiodsp.h | 1 + |
66 |
libavcodec/lossless_videodsp.c | 2 + |
67 |
libavcodec/lossless_videodsp.h | 1 + |
68 |
libavcodec/mdct15.c | 2 + |
69 |
libavcodec/mdct15.h | 1 + |
70 |
libavcodec/me_cmp.c | 2 + |
71 |
libavcodec/me_cmp.h | 1 + |
72 |
libavcodec/mpegaudiodsp.c | 1 + |
73 |
libavcodec/mpegaudiodsp.h | 1 + |
74 |
libavcodec/mpegvideo.c | 2 + |
75 |
libavcodec/mpegvideo.h | 1 + |
76 |
libavcodec/mpegvideodsp.c | 2 + |
77 |
libavcodec/mpegvideodsp.h | 1 + |
78 |
libavcodec/mpegvideoencdsp.c | 2 + |
79 |
libavcodec/mpegvideoencdsp.h | 2 + |
80 |
libavcodec/pixblockdsp.c | 2 + |
81 |
libavcodec/pixblockdsp.h | 2 + |
82 |
libavcodec/svq1enc.c | 2 + |
83 |
libavcodec/svq1enc.h | 1 + |
84 |
libavcodec/tests/dct.c | 2 + |
85 |
libavcodec/tests/e2k/dct.c | 31 + |
86 |
libavcodec/vc1dsp.c | 2 + |
87 |
libavcodec/vc1dsp.h | 1 + |
88 |
libavcodec/videodsp.c | 2 + |
89 |
libavcodec/videodsp.h | 1 + |
90 |
libavcodec/vorbisdsp.c | 2 + |
91 |
libavcodec/vorbisdsp.h | 1 + |
92 |
libavcodec/vp3dsp.c | 2 + |
93 |
libavcodec/vp3dsp.h | 1 + |
94 |
libavcodec/vp8dsp.c | 2 + |
95 |
libavcodec/vp8dsp.h | 1 + |
96 |
libavcodec/vp9dsp.c | 1 + |
97 |
libavcodec/vp9dsp.h | 1 + |
98 |
libavutil/cpu.c | 8 + |
99 |
libavutil/cpu.h | 2 + |
100 |
libavutil/cpu_internal.h | 2 + |
101 |
libavutil/e2k/Makefile | 2 + |
102 |
libavutil/e2k/cpu.c | 41 + |
103 |
libavutil/e2k/cpu.h | 27 + |
104 |
libavutil/e2k/float_dsp.c | 188 +++ |
105 |
libavutil/e2k/intreadwrite.h | 54 + |
106 |
libavutil/e2k/timer.h | 35 + |
107 |
libavutil/e2k/util_e2k.h | 146 ++ |
108 |
libavutil/float_dsp.c | 2 + |
109 |
libavutil/float_dsp.h | 1 + |
110 |
libavutil/intreadwrite.h | 2 + |
111 |
libavutil/tests/cpu.c | 2 + |
112 |
libavutil/timer.h | 2 + |
113 |
libswresample/audioconvert.c | 1 + |
114 |
libswresample/e2k/Makefile | 1 + |
115 |
libswresample/e2k/audio_convert.c | 110 ++ |
116 |
libswresample/swresample_internal.h | 4 + |
117 |
libswscale/e2k/Makefile | 3 + |
118 |
libswscale/e2k/swscale.c | 2046 ++++++++++++++++++++++++++ |
119 |
libswscale/e2k/yuv2rgb.c | 248 ++++ |
120 |
libswscale/e2k/yuv2rgb.h | 52 + |
121 |
libswscale/e2k/yuv2yuv.c | 146 ++ |
122 |
libswscale/swscale.c | 2 + |
123 |
libswscale/swscale_internal.h | 5 + |
124 |
libswscale/swscale_unscaled.c | 2 + |
125 |
libswscale/utils.c | 13 + |
126 |
libswscale/yuv2rgb.c | 2 + |
127 |
tests/checkasm/checkasm.c | 2 + |
128 |
tests/checkasm/huffyuvdsp.c | 8 +- |
129 |
122 files changed, 11491 insertions(+), 5 deletions(-) |
130 |
create mode 100644 libavcodec/e2k/Makefile |
131 |
create mode 100644 libavcodec/e2k/audiodsp.c |
132 |
create mode 100644 libavcodec/e2k/blockdsp.c |
133 |
create mode 100644 libavcodec/e2k/dctdsp.h |
134 |
create mode 100644 libavcodec/e2k/fdctdsp.c |
135 |
create mode 100644 libavcodec/e2k/fft.c |
136 |
create mode 100644 libavcodec/e2k/fft.h |
137 |
create mode 100644 libavcodec/e2k/fft_init.c |
138 |
create mode 100644 libavcodec/e2k/fmtconvert.c |
139 |
create mode 100644 libavcodec/e2k/h264chroma.c |
140 |
create mode 100644 libavcodec/e2k/h264chroma_template.c |
141 |
create mode 100644 libavcodec/e2k/h264dsp.c |
142 |
create mode 100644 libavcodec/e2k/h264qpel.c |
143 |
create mode 100644 libavcodec/e2k/h264qpel_template.c |
144 |
create mode 100644 libavcodec/e2k/hevcdsp.c |
145 |
create mode 100644 libavcodec/e2k/hpeldsp.c |
146 |
create mode 100644 libavcodec/e2k/hpeldsp.h |
147 |
create mode 100644 libavcodec/e2k/idctdsp.c |
148 |
create mode 100644 libavcodec/e2k/lossless_audiodsp.c |
149 |
create mode 100644 libavcodec/e2k/lossless_videodsp.c |
150 |
create mode 100644 libavcodec/e2k/mdct15.c |
151 |
create mode 100644 libavcodec/e2k/me_cmp.c |
152 |
create mode 100644 libavcodec/e2k/mpegaudiodsp.c |
153 |
create mode 100644 libavcodec/e2k/mpegvideo.c |
154 |
create mode 100644 libavcodec/e2k/mpegvideodsp.c |
155 |
create mode 100644 libavcodec/e2k/mpegvideoencdsp.c |
156 |
create mode 100644 libavcodec/e2k/pixblockdsp.c |
157 |
create mode 100644 libavcodec/e2k/svq1enc.c |
158 |
create mode 100644 libavcodec/e2k/vc1dsp.c |
159 |
create mode 100644 libavcodec/e2k/videodsp.c |
160 |
create mode 100644 libavcodec/e2k/vorbisdsp.c |
161 |
create mode 100644 libavcodec/e2k/vp3dsp.c |
162 |
create mode 100644 libavcodec/e2k/vp8dsp.c |
163 |
create mode 100644 libavcodec/e2k/vp9dsp.c |
164 |
create mode 100644 libavcodec/tests/e2k/dct.c |
165 |
create mode 100644 libavutil/e2k/Makefile |
166 |
create mode 100644 libavutil/e2k/cpu.c |
167 |
create mode 100644 libavutil/e2k/cpu.h |
168 |
create mode 100644 libavutil/e2k/float_dsp.c |
169 |
create mode 100644 libavutil/e2k/intreadwrite.h |
170 |
create mode 100644 libavutil/e2k/timer.h |
171 |
create mode 100644 libavutil/e2k/util_e2k.h |
172 |
create mode 100644 libswresample/e2k/Makefile |
173 |
create mode 100644 libswresample/e2k/audio_convert.c |
174 |
create mode 100644 libswscale/e2k/Makefile |
175 |
create mode 100644 libswscale/e2k/swscale.c |
176 |
create mode 100644 libswscale/e2k/yuv2rgb.c |
177 |
create mode 100644 libswscale/e2k/yuv2rgb.h |
178 |
create mode 100644 libswscale/e2k/yuv2yuv.c |
179 |
|
180 |
diff --git a/configure b/configure |
181 |
index 19c1865..3c2a9ab 100755 |
182 |
--- a/configure |
183 |
+++ b/configure |
184 |
@@ -1989,6 +1989,7 @@ ARCH_LIST=" |
185 |
parisc |
186 |
ppc |
187 |
ppc64 |
188 |
+ e2k |
189 |
s390 |
190 |
sh4 |
191 |
sparc |
192 |
@@ -2060,6 +2061,10 @@ ARCH_EXT_LIST_PPC=" |
193 |
vsx |
194 |
" |
195 |
|
196 |
+ARCH_EXT_LIST_E2K=" |
197 |
+ e2k |
198 |
+" |
199 |
+ |
200 |
ARCH_EXT_LIST_X86=" |
201 |
$ARCH_EXT_LIST_X86_SIMD |
202 |
cpunop |
203 |
@@ -2069,6 +2074,7 @@ ARCH_EXT_LIST_X86=" |
204 |
ARCH_EXT_LIST=" |
205 |
$ARCH_EXT_LIST_ARM |
206 |
$ARCH_EXT_LIST_PPC |
207 |
+ $ARCH_EXT_LIST_E2K |
208 |
$ARCH_EXT_LIST_X86 |
209 |
$ARCH_EXT_LIST_MIPS |
210 |
$ARCH_EXT_LIST_LOONGSON |
211 |
@@ -2594,10 +2600,10 @@ for ext in $(filter_out mmx $ARCH_EXT_LIST_X86_SIMD); do |
212 |
done |
213 |
|
214 |
aligned_stack_if_any="aarch64 ppc x86" |
215 |
-fast_64bit_if_any="aarch64 alpha ia64 mips64 parisc64 ppc64 sparc64 x86_64" |
216 |
-fast_clz_if_any="aarch64 alpha avr32 mips ppc x86" |
217 |
+fast_64bit_if_any="aarch64 alpha ia64 mips64 parisc64 ppc64 e2k sparc64 x86_64" |
218 |
+fast_clz_if_any="aarch64 alpha avr32 mips ppc e2k x86" |
219 |
fast_unaligned_if_any="aarch64 ppc x86" |
220 |
-simd_align_16_if_any="altivec neon sse" |
221 |
+simd_align_16_if_any="altivec e2k neon sse" |
222 |
simd_align_32_if_any="avx" |
223 |
simd_align_64_if_any="avx512" |
224 |
|
225 |
@@ -4889,6 +4895,9 @@ case "$arch" in |
226 |
"Power Macintosh"|ppc*|powerpc*) |
227 |
arch="ppc" |
228 |
;; |
229 |
+ e2k|elbrus) |
230 |
+ arch="e2k" |
231 |
+ ;; |
232 |
s390|s390x) |
233 |
arch="s390" |
234 |
;; |
235 |
@@ -5177,6 +5186,11 @@ elif enabled ppc; then |
236 |
;; |
237 |
esac |
238 |
|
239 |
+elif enabled e2k; then |
240 |
+ |
241 |
+ cpu="e2k" |
242 |
+ cpuflags="-msse4.1 -mno-avx" |
243 |
+ |
244 |
elif enabled sparc; then |
245 |
|
246 |
case $cpu in |
247 |
diff --git a/libavcodec/audiodsp.c b/libavcodec/audiodsp.c |
248 |
index efcb0a8..36b8528 100644 |
249 |
--- a/libavcodec/audiodsp.c |
250 |
+++ b/libavcodec/audiodsp.c |
251 |
@@ -113,6 +113,8 @@ av_cold void ff_audiodsp_init(AudioDSPContext *c) |
252 |
ff_audiodsp_init_arm(c); |
253 |
if (ARCH_PPC) |
254 |
ff_audiodsp_init_ppc(c); |
255 |
+ if (ARCH_E2K) |
256 |
+ ff_audiodsp_init_e2k(c); |
257 |
if (ARCH_X86) |
258 |
ff_audiodsp_init_x86(c); |
259 |
} |
260 |
diff --git a/libavcodec/audiodsp.h b/libavcodec/audiodsp.h |
261 |
index aa6fa78..9c05e28 100644 |
262 |
--- a/libavcodec/audiodsp.h |
263 |
+++ b/libavcodec/audiodsp.h |
264 |
@@ -55,6 +55,7 @@ typedef struct AudioDSPContext { |
265 |
void ff_audiodsp_init(AudioDSPContext *c); |
266 |
void ff_audiodsp_init_arm(AudioDSPContext *c); |
267 |
void ff_audiodsp_init_ppc(AudioDSPContext *c); |
268 |
+void ff_audiodsp_init_e2k(AudioDSPContext *c); |
269 |
void ff_audiodsp_init_x86(AudioDSPContext *c); |
270 |
|
271 |
#endif /* AVCODEC_AUDIODSP_H */ |
272 |
diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c |
273 |
index c7efe7e..704c723 100644 |
274 |
--- a/libavcodec/blockdsp.c |
275 |
+++ b/libavcodec/blockdsp.c |
276 |
@@ -71,6 +71,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) |
277 |
ff_blockdsp_init_arm(c); |
278 |
if (ARCH_PPC) |
279 |
ff_blockdsp_init_ppc(c); |
280 |
+ if (ARCH_E2K) |
281 |
+ ff_blockdsp_init_e2k(c); |
282 |
if (ARCH_X86) |
283 |
ff_blockdsp_init_x86(c, avctx); |
284 |
if (ARCH_MIPS) |
285 |
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h |
286 |
index 26fc2ea..8eccb47 100644 |
287 |
--- a/libavcodec/blockdsp.h |
288 |
+++ b/libavcodec/blockdsp.h |
289 |
@@ -44,6 +44,7 @@ void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); |
290 |
void ff_blockdsp_init_alpha(BlockDSPContext *c); |
291 |
void ff_blockdsp_init_arm(BlockDSPContext *c); |
292 |
void ff_blockdsp_init_ppc(BlockDSPContext *c); |
293 |
+void ff_blockdsp_init_e2k(BlockDSPContext *c); |
294 |
void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx); |
295 |
void ff_blockdsp_init_mips(BlockDSPContext *c); |
296 |
|
297 |
diff --git a/libavcodec/e2k/Makefile b/libavcodec/e2k/Makefile |
298 |
new file mode 100644 |
299 |
index 0000000..3564b97 |
300 |
--- /dev/null |
301 |
+++ b/libavcodec/e2k/Makefile |
302 |
@@ -0,0 +1,31 @@ |
303 |
+# subsystems |
304 |
+OBJS-$(CONFIG_AUDIODSP) += e2k/audiodsp.o |
305 |
+OBJS-$(CONFIG_BLOCKDSP) += e2k/blockdsp.o |
306 |
+OBJS-$(CONFIG_FFT) += e2k/fft_init.o e2k/fft.o |
307 |
+OBJS-$(CONFIG_FDCTDSP) += e2k/fdctdsp.o |
308 |
+OBJS-$(CONFIG_FMTCONVERT) += e2k/fmtconvert.o |
309 |
+OBJS-$(CONFIG_H264CHROMA) += e2k/h264chroma.o |
310 |
+OBJS-$(CONFIG_H264DSP) += e2k/h264dsp.o e2k/hpeldsp.o |
311 |
+OBJS-$(CONFIG_H264QPEL) += e2k/h264qpel.o |
312 |
+OBJS-$(CONFIG_HPELDSP) += e2k/hpeldsp.o |
313 |
+OBJS-$(CONFIG_IDCTDSP) += e2k/idctdsp.o |
314 |
+OBJS-$(CONFIG_LLVIDDSP) += e2k/lossless_videodsp.o |
315 |
+OBJS-$(CONFIG_MDCT15) += e2k/mdct15.o |
316 |
+OBJS-$(CONFIG_ME_CMP) += e2k/me_cmp.o |
317 |
+OBJS-$(CONFIG_MPEGAUDIODSP) += e2k/mpegaudiodsp.o |
318 |
+OBJS-$(CONFIG_MPEGVIDEO) += e2k/mpegvideo.o e2k/mpegvideodsp.o |
319 |
+OBJS-$(CONFIG_MPEGVIDEOENC) += e2k/mpegvideoencdsp.o |
320 |
+OBJS-$(CONFIG_PIXBLOCKDSP) += e2k/pixblockdsp.o |
321 |
+OBJS-$(CONFIG_VC1DSP) += e2k/vc1dsp.o |
322 |
+OBJS-$(CONFIG_VIDEODSP) += e2k/videodsp.o |
323 |
+OBJS-$(CONFIG_VP3DSP) += e2k/vp3dsp.o |
324 |
+OBJS-$(CONFIG_VP8DSP) += e2k/vp8dsp.o |
325 |
+ |
326 |
+# decoders/encoders |
327 |
+OBJS-$(CONFIG_HEVC_DECODER) += e2k/hevcdsp.o |
328 |
+OBJS-$(CONFIG_LLAUDDSP) += e2k/lossless_audiodsp.o |
329 |
+OBJS-$(CONFIG_SVQ1_ENCODER) += e2k/svq1enc.o |
330 |
+OBJS-$(CONFIG_VORBIS_DECODER) += e2k/vorbisdsp.o |
331 |
+OBJS-$(CONFIG_VP7_DECODER) += e2k/vp8dsp.o |
332 |
+OBJS-$(CONFIG_VP9_DECODER) += e2k/vp9dsp.o |
333 |
+ |
334 |
diff --git a/libavcodec/e2k/audiodsp.c b/libavcodec/e2k/audiodsp.c |
335 |
new file mode 100644 |
336 |
index 0000000..c2e4433 |
337 |
--- /dev/null |
338 |
+++ b/libavcodec/e2k/audiodsp.c |
339 |
@@ -0,0 +1,62 @@ |
340 |
+/* |
341 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
342 |
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> |
343 |
+ * |
344 |
+ * This file is part of FFmpeg. |
345 |
+ * |
346 |
+ * FFmpeg is free software; you can redistribute it and/or |
347 |
+ * modify it under the terms of the GNU Lesser General Public |
348 |
+ * License as published by the Free Software Foundation; either |
349 |
+ * version 2.1 of the License, or (at your option) any later version. |
350 |
+ * |
351 |
+ * FFmpeg is distributed in the hope that it will be useful, |
352 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
353 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
354 |
+ * Lesser General Public License for more details. |
355 |
+ * |
356 |
+ * You should have received a copy of the GNU Lesser General Public |
357 |
+ * License along with FFmpeg; if not, write to the Free Software |
358 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
359 |
+ */ |
360 |
+ |
361 |
+/** |
362 |
+ * @file |
363 |
+ * miscellaneous audio operations |
364 |
+ */ |
365 |
+ |
366 |
+#include "config.h" |
367 |
+ |
368 |
+#include "libavutil/attributes.h" |
369 |
+#include "libavutil/cpu.h" |
370 |
+#include "libavutil/e2k/cpu.h" |
371 |
+#include "libavutil/e2k/util_e2k.h" |
372 |
+ |
373 |
+#include "libavcodec/audiodsp.h" |
374 |
+ |
375 |
+static int32_t scalarproduct_int16_e2k(const int16_t *v1, const int16_t *v2, int order) |
376 |
+{ |
377 |
+ int i; |
378 |
+ vec_s16 vec1, vec2; |
379 |
+ vec_s32 res = _mm_setzero_si128(), tmp; |
380 |
+ |
381 |
+ PRAGMA_E2K("ivdep") |
382 |
+ for (i = 0; i < order; i += 8) { |
383 |
+ vec1 = VEC_LD(v1); |
384 |
+ vec2 = VEC_LD(v2); |
385 |
+ tmp = _mm_madd_epi16(vec1, vec2); |
386 |
+ res = _mm_add_epi32(res, tmp); |
387 |
+ v1 += 8; |
388 |
+ v2 += 8; |
389 |
+ } |
390 |
+ |
391 |
+ res = _mm_hadd_epi32(res, res); |
392 |
+ return _mm_extract_epi32(res, 0) + _mm_extract_epi32(res, 1); |
393 |
+} |
394 |
+ |
395 |
+av_cold void ff_audiodsp_init_e2k(AudioDSPContext *c) |
396 |
+{ |
397 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
398 |
+ return; |
399 |
+ |
400 |
+ c->scalarproduct_int16 = scalarproduct_int16_e2k; |
401 |
+} |
402 |
diff --git a/libavcodec/e2k/blockdsp.c b/libavcodec/e2k/blockdsp.c |
403 |
new file mode 100644 |
404 |
index 0000000..f85dce1 |
405 |
--- /dev/null |
406 |
+++ b/libavcodec/e2k/blockdsp.c |
407 |
@@ -0,0 +1,72 @@ |
408 |
+/* |
409 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
410 |
+ * Copyright (c) 2002 Brian Foley |
411 |
+ * Copyright (c) 2002 Dieter Shirley |
412 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
413 |
+ * |
414 |
+ * This file is part of FFmpeg. |
415 |
+ * |
416 |
+ * FFmpeg is free software; you can redistribute it and/or |
417 |
+ * modify it under the terms of the GNU Lesser General Public |
418 |
+ * License as published by the Free Software Foundation; either |
419 |
+ * version 2.1 of the License, or (at your option) any later version. |
420 |
+ * |
421 |
+ * FFmpeg is distributed in the hope that it will be useful, |
422 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
423 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
424 |
+ * Lesser General Public License for more details. |
425 |
+ * |
426 |
+ * You should have received a copy of the GNU Lesser General Public |
427 |
+ * License along with FFmpeg; if not, write to the Free Software |
428 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
429 |
+ */ |
430 |
+ |
431 |
+#include "config.h" |
432 |
+ |
433 |
+#include <string.h> |
434 |
+ |
435 |
+#include "libavutil/attributes.h" |
436 |
+#include "libavutil/cpu.h" |
437 |
+#include "libavutil/mem.h" |
438 |
+#include "libavutil/e2k/cpu.h" |
439 |
+#include "libavutil/e2k/util_e2k.h" |
440 |
+ |
441 |
+#include "libavcodec/blockdsp.h" |
442 |
+ |
443 |
+static void clear_block_e2k(int16_t *block) |
444 |
+{ |
445 |
+ LOAD_ZERO; |
446 |
+ VEC_ST(block, zerov); |
447 |
+ VEC_ST(block + 8, zerov); |
448 |
+ VEC_ST(block + 8 * 2, zerov); |
449 |
+ VEC_ST(block + 8 * 3, zerov); |
450 |
+ VEC_ST(block + 8 * 4, zerov); |
451 |
+ VEC_ST(block + 8 * 5, zerov); |
452 |
+ VEC_ST(block + 8 * 6, zerov); |
453 |
+ VEC_ST(block + 8 * 7, zerov); |
454 |
+} |
455 |
+ |
456 |
+static void clear_blocks_e2k(int16_t *blocks) |
457 |
+{ |
458 |
+ int i; |
459 |
+ LOAD_ZERO; |
460 |
+ for (i = 0; i < 6; i++, blocks += 64) { |
461 |
+ VEC_ST(blocks, zerov); |
462 |
+ VEC_ST(blocks + 8, zerov); |
463 |
+ VEC_ST(blocks + 8 * 2, zerov); |
464 |
+ VEC_ST(blocks + 8 * 3, zerov); |
465 |
+ VEC_ST(blocks + 8 * 4, zerov); |
466 |
+ VEC_ST(blocks + 8 * 5, zerov); |
467 |
+ VEC_ST(blocks + 8 * 6, zerov); |
468 |
+ VEC_ST(blocks + 8 * 7, zerov); |
469 |
+ } |
470 |
+} |
471 |
+ |
472 |
+av_cold void ff_blockdsp_init_e2k(BlockDSPContext *c) |
473 |
+{ |
474 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
475 |
+ return; |
476 |
+ |
477 |
+ c->clear_block = clear_block_e2k; |
478 |
+ c->clear_blocks = clear_blocks_e2k; |
479 |
+} |
480 |
diff --git a/libavcodec/e2k/dctdsp.h b/libavcodec/e2k/dctdsp.h |
481 |
new file mode 100644 |
482 |
index 0000000..1281dc7 |
483 |
--- /dev/null |
484 |
+++ b/libavcodec/e2k/dctdsp.h |
485 |
@@ -0,0 +1,27 @@ |
486 |
+/* |
487 |
+ * This file is part of FFmpeg. |
488 |
+ * |
489 |
+ * FFmpeg is free software; you can redistribute it and/or |
490 |
+ * modify it under the terms of the GNU Lesser General Public |
491 |
+ * License as published by the Free Software Foundation; either |
492 |
+ * version 2.1 of the License, or (at your option) any later version. |
493 |
+ * |
494 |
+ * FFmpeg is distributed in the hope that it will be useful, |
495 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
496 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
497 |
+ * Lesser General Public License for more details. |
498 |
+ * |
499 |
+ * You should have received a copy of the GNU Lesser General Public |
500 |
+ * License along with FFmpeg; if not, write to the Free Software |
501 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
502 |
+ */ |
503 |
+ |
504 |
+#ifndef AVCODEC_E2K_DCTDSP_H |
505 |
+#define AVCODEC_E2K_DCTDSP_H |
506 |
+ |
507 |
+#include <stdint.h> |
508 |
+ |
509 |
+void ff_fdct_e2k(int16_t *block); |
510 |
+void ff_idct_e2k(int16_t *block); |
511 |
+ |
512 |
+#endif /* AVCODEC_E2K_DCTDSP_H */ |
513 |
diff --git a/libavcodec/e2k/fdctdsp.c b/libavcodec/e2k/fdctdsp.c |
514 |
new file mode 100644 |
515 |
index 0000000..568a67f |
516 |
--- /dev/null |
517 |
+++ b/libavcodec/e2k/fdctdsp.c |
518 |
@@ -0,0 +1,389 @@ |
519 |
+/* |
520 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
521 |
+ * Copyright (C) 2003 James Klicman <james@klicman.org> |
522 |
+ * |
523 |
+ * This file is part of FFmpeg. |
524 |
+ * |
525 |
+ * FFmpeg is free software; you can redistribute it and/or |
526 |
+ * modify it under the terms of the GNU Lesser General Public |
527 |
+ * License as published by the Free Software Foundation; either |
528 |
+ * version 2.1 of the License, or (at your option) any later version. |
529 |
+ * |
530 |
+ * FFmpeg is distributed in the hope that it will be useful, |
531 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
532 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
533 |
+ * Lesser General Public License for more details. |
534 |
+ * |
535 |
+ * You should have received a copy of the GNU Lesser General Public |
536 |
+ * License along with FFmpeg; if not, write to the Free Software |
537 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
538 |
+ */ |
539 |
+ |
540 |
+#include "config.h" |
541 |
+ |
542 |
+#include "libavutil/attributes.h" |
543 |
+#include "libavutil/cpu.h" |
544 |
+#include "libavutil/e2k/cpu.h" |
545 |
+#include "libavutil/e2k/util_e2k.h" |
546 |
+ |
547 |
+#include "libavcodec/fdctdsp.h" |
548 |
+ |
549 |
+#include "dctdsp.h" |
550 |
+ |
551 |
+#define C1 0.98078528040323044912618224 /* cos(1 * PI / 16) */ |
552 |
+#define C2 0.92387953251128675612818319 /* cos(2 * PI / 16) */ |
553 |
+#define C3 0.83146961230254523707878838 /* cos(3 * PI / 16) */ |
554 |
+#define C4 0.70710678118654752440084436 /* cos(4 * PI / 16) */ |
555 |
+#define C5 0.55557023301960222474283081 /* cos(5 * PI / 16) */ |
556 |
+#define C6 0.38268343236508977172845998 /* cos(6 * PI / 16) */ |
557 |
+#define C7 0.19509032201612826784828487 /* cos(7 * PI / 16) */ |
558 |
+ |
559 |
+#define W0 -(2 * C2) |
560 |
+#define W1 (2 * C6) |
561 |
+#define W2 (M_SQRT2 * C6) |
562 |
+#define W3 (M_SQRT2 * C3) |
563 |
+#define W4 (M_SQRT2 * (-C1 + C3 + C5 - C7)) |
564 |
+#define W5 (M_SQRT2 * (C1 + C3 - C5 + C7)) |
565 |
+#define W6 (M_SQRT2 * (C1 + C3 + C5 - C7)) |
566 |
+#define W7 (M_SQRT2 * (C1 + C3 - C5 - C7)) |
567 |
+#define W8 (M_SQRT2 * (C7 - C3)) |
568 |
+#define W9 (M_SQRT2 * (-C1 - C3)) |
569 |
+#define WA (M_SQRT2 * (-C3 - C5)) |
570 |
+#define WB (M_SQRT2 * (C5 - C3)) |
571 |
+ |
572 |
+#define LD_W0 _mm_set1_ps(W0) |
573 |
+#define LD_W1 _mm_set1_ps(W1) |
574 |
+#define LD_W2 _mm_set1_ps(W2) |
575 |
+#define LD_W3 _mm_set1_ps(W3) |
576 |
+#define LD_W4 _mm_set1_ps(W4) |
577 |
+#define LD_W5 _mm_set1_ps(W5) |
578 |
+#define LD_W6 _mm_set1_ps(W6) |
579 |
+#define LD_W7 _mm_set1_ps(W7) |
580 |
+#define LD_W8 _mm_set1_ps(W8) |
581 |
+#define LD_W9 _mm_set1_ps(W9) |
582 |
+#define LD_WA _mm_set1_ps(WA) |
583 |
+#define LD_WB _mm_set1_ps(WB) |
584 |
+ |
585 |
+#define _mm_madd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c) |
586 |
+ |
587 |
+#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \ |
588 |
+ x0 = _mm_add_ps(b0, b7); /* x0 = b0 + b7; */ \ |
589 |
+ x7 = _mm_sub_ps(b0, b7); /* x7 = b0 - b7; */ \ |
590 |
+ x1 = _mm_add_ps(b1, b6); /* x1 = b1 + b6; */ \ |
591 |
+ x6 = _mm_sub_ps(b1, b6); /* x6 = b1 - b6; */ \ |
592 |
+ x2 = _mm_add_ps(b2, b5); /* x2 = b2 + b5; */ \ |
593 |
+ x5 = _mm_sub_ps(b2, b5); /* x5 = b2 - b5; */ \ |
594 |
+ x3 = _mm_add_ps(b3, b4); /* x3 = b3 + b4; */ \ |
595 |
+ x4 = _mm_sub_ps(b3, b4); /* x4 = b3 - b4; */ \ |
596 |
+ \ |
597 |
+ b7 = _mm_add_ps(x0, x3); /* b7 = x0 + x3; */ \ |
598 |
+ b1 = _mm_add_ps(x1, x2); /* b1 = x1 + x2; */ \ |
599 |
+ b0 = _mm_add_ps(b7, b1); /* b0 = b7 + b1; */ \ |
600 |
+ b4 = _mm_sub_ps(b7, b1); /* b4 = b7 - b1; */ \ |
601 |
+ \ |
602 |
+ b2 = _mm_sub_ps(x0, x3); /* b2 = x0 - x3; */ \ |
603 |
+ b6 = _mm_sub_ps(x1, x2); /* b6 = x1 - x2; */ \ |
604 |
+ b5 = _mm_add_ps(b6, b2); /* b5 = b6 + b2; */ \ |
605 |
+ cnst = LD_W2; \ |
606 |
+ b5 = _mm_mul_ps(cnst, b5); /* b5 = b5 * W2; */ \ |
607 |
+ cnst = LD_W1; \ |
608 |
+ b2 = _mm_madd_ps(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ |
609 |
+ cnst = LD_W0; \ |
610 |
+ b6 = _mm_madd_ps(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ |
611 |
+ \ |
612 |
+ x0 = _mm_add_ps(x4, x7); /* x0 = x4 + x7; */ \ |
613 |
+ x1 = _mm_add_ps(x5, x6); /* x1 = x5 + x6; */ \ |
614 |
+ x2 = _mm_add_ps(x4, x6); /* x2 = x4 + x6; */ \ |
615 |
+ x3 = _mm_add_ps(x5, x7); /* x3 = x5 + x7; */ \ |
616 |
+ x8 = _mm_add_ps(x2, x3); /* x8 = x2 + x3; */ \ |
617 |
+ cnst = LD_W3; \ |
618 |
+ x8 = _mm_mul_ps(cnst, x8); /* x8 = x8 * W3; */ \ |
619 |
+ \ |
620 |
+ cnst = LD_W8; \ |
621 |
+ x0 = _mm_mul_ps(cnst, x0); /* x0 *= W8; */ \ |
622 |
+ cnst = LD_W9; \ |
623 |
+ x1 = _mm_mul_ps(cnst, x1); /* x1 *= W9; */ \ |
624 |
+ cnst = LD_WA; \ |
625 |
+ x2 = _mm_madd_ps(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ |
626 |
+ cnst = LD_WB; \ |
627 |
+ x3 = _mm_madd_ps(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ |
628 |
+ \ |
629 |
+ cnst = LD_W4; \ |
630 |
+ b7 = _mm_madd_ps(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ |
631 |
+ cnst = LD_W5; \ |
632 |
+ b5 = _mm_madd_ps(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ |
633 |
+ cnst = LD_W6; \ |
634 |
+ b3 = _mm_madd_ps(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ |
635 |
+ cnst = LD_W7; \ |
636 |
+ b1 = _mm_madd_ps(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ |
637 |
+ \ |
638 |
+ b7 = _mm_add_ps(b7, x2); /* b7 = b7 + x2; */ \ |
639 |
+ b5 = _mm_add_ps(b5, x3); /* b5 = b5 + x3; */ \ |
640 |
+ b3 = _mm_add_ps(b3, x2); /* b3 = b3 + x2; */ \ |
641 |
+ b1 = _mm_add_ps(b1, x3) /* b1 = b1 + x3; */ \ |
642 |
+ /* }}} */ |
643 |
+ |
644 |
+#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \ |
645 |
+ x0 = _mm_add_ps(b0, b7); /* x0 = b0 + b7; */ \ |
646 |
+ x7 = _mm_sub_ps(b0, b7); /* x7 = b0 - b7; */ \ |
647 |
+ x1 = _mm_add_ps(b1, b6); /* x1 = b1 + b6; */ \ |
648 |
+ x6 = _mm_sub_ps(b1, b6); /* x6 = b1 - b6; */ \ |
649 |
+ x2 = _mm_add_ps(b2, b5); /* x2 = b2 + b5; */ \ |
650 |
+ x5 = _mm_sub_ps(b2, b5); /* x5 = b2 - b5; */ \ |
651 |
+ x3 = _mm_add_ps(b3, b4); /* x3 = b3 + b4; */ \ |
652 |
+ x4 = _mm_sub_ps(b3, b4); /* x4 = b3 - b4; */ \ |
653 |
+ \ |
654 |
+ b7 = _mm_add_ps(x0, x3); /* b7 = x0 + x3; */ \ |
655 |
+ b1 = _mm_add_ps(x1, x2); /* b1 = x1 + x2; */ \ |
656 |
+ b0 = _mm_add_ps(b7, b1); /* b0 = b7 + b1; */ \ |
657 |
+ b4 = _mm_sub_ps(b7, b1); /* b4 = b7 - b1; */ \ |
658 |
+ \ |
659 |
+ b2 = _mm_sub_ps(x0, x3); /* b2 = x0 - x3; */ \ |
660 |
+ b6 = _mm_sub_ps(x1, x2); /* b6 = x1 - x2; */ \ |
661 |
+ b5 = _mm_add_ps(b6, b2); /* b5 = b6 + b2; */ \ |
662 |
+ cnst = LD_W2; \ |
663 |
+ b5 = _mm_mul_ps(cnst, b5); /* b5 = b5 * W2; */ \ |
664 |
+ cnst = LD_W1; \ |
665 |
+ b2 = _mm_madd_ps(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ |
666 |
+ cnst = LD_W0; \ |
667 |
+ b6 = _mm_madd_ps(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ |
668 |
+ \ |
669 |
+ x0 = _mm_add_ps(x4, x7); /* x0 = x4 + x7; */ \ |
670 |
+ x1 = _mm_add_ps(x5, x6); /* x1 = x5 + x6; */ \ |
671 |
+ x2 = _mm_add_ps(x4, x6); /* x2 = x4 + x6; */ \ |
672 |
+ x3 = _mm_add_ps(x5, x7); /* x3 = x5 + x7; */ \ |
673 |
+ x8 = _mm_add_ps(x2, x3); /* x8 = x2 + x3; */ \ |
674 |
+ cnst = LD_W3; \ |
675 |
+ x8 = _mm_mul_ps(cnst, x8); /* x8 = x8 * W3; */ \ |
676 |
+ \ |
677 |
+ cnst = LD_W8; \ |
678 |
+ x0 = _mm_mul_ps(cnst, x0); /* x0 *= W8; */ \ |
679 |
+ cnst = LD_W9; \ |
680 |
+ x1 = _mm_mul_ps(cnst, x1); /* x1 *= W9; */ \ |
681 |
+ cnst = LD_WA; \ |
682 |
+ x2 = _mm_madd_ps(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ |
683 |
+ cnst = LD_WB; \ |
684 |
+ x3 = _mm_madd_ps(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ |
685 |
+ \ |
686 |
+ cnst = LD_W4; \ |
687 |
+ b7 = _mm_madd_ps(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ |
688 |
+ cnst = LD_W5; \ |
689 |
+ b5 = _mm_madd_ps(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ |
690 |
+ cnst = LD_W6; \ |
691 |
+ b3 = _mm_madd_ps(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ |
692 |
+ cnst = LD_W7; \ |
693 |
+ b1 = _mm_madd_ps(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ |
694 |
+ \ |
695 |
+ b7 = _mm_add_ps(b7, x2); /* b7 += x2; */ \ |
696 |
+ b5 = _mm_add_ps(b5, x3); /* b5 += x3; */ \ |
697 |
+ b3 = _mm_add_ps(b3, x2); /* b3 += x2; */ \ |
698 |
+ b1 = _mm_add_ps(b1, x3) /* b1 += x3; */ \ |
699 |
+ /* }}} */ |
700 |
+ |
701 |
+/* two dimensional discrete cosine transform */ |
702 |
+void ff_fdct_e2k(int16_t *block) |
703 |
+{ |
704 |
+ vec_f b00, b10, b20, b30, b40, b50, b60, b70; |
705 |
+ vec_f b01, b11, b21, b31, b41, b51, b61, b71; |
706 |
+ vec_f cnst; |
707 |
+ vec_f x0, x1, x2, x3, x4, x5, x6, x7, x8; |
708 |
+ vec_s16 a0, a1, a2, a3, a4, a5, a6, a7; |
709 |
+ vec_s16 z0, z1, z2, z3, z4, z5, z6, z7; |
710 |
+ |
711 |
+ a0 = VEC_LD(block + 8 * 0); |
712 |
+ a4 = VEC_LD(block + 8 * 4); |
713 |
+ a1 = VEC_LD(block + 8 * 1); |
714 |
+ a5 = VEC_LD(block + 8 * 5); |
715 |
+ a2 = VEC_LD(block + 8 * 2); |
716 |
+ a6 = VEC_LD(block + 8 * 6); |
717 |
+ a3 = VEC_LD(block + 8 * 3); |
718 |
+ a7 = VEC_LD(block + 8 * 7); |
719 |
+ |
720 |
+ TRANSPOSE8(a0, a1, a2, a3, a4, a5, a6, a7); |
721 |
+ |
722 |
+ /* Some of the initial calculations can be done as vector short |
723 |
+ * before conversion to vec_f. The following code section |
724 |
+ * takes advantage of this. */ |
725 |
+ |
726 |
+ /* fdct rows {{{ */ |
727 |
+ z0 = _mm_add_epi16(a0, a7); |
728 |
+ z7 = _mm_sub_epi16(a0, a7); |
729 |
+ z1 = _mm_add_epi16(a1, a6); |
730 |
+ z6 = _mm_sub_epi16(a1, a6); |
731 |
+ z2 = _mm_add_epi16(a2, a5); |
732 |
+ z5 = _mm_sub_epi16(a2, a5); |
733 |
+ z3 = _mm_add_epi16(a3, a4); |
734 |
+ z4 = _mm_sub_epi16(a3, a4); |
735 |
+ |
736 |
+#define CTF0(n) \ |
737 |
+ b##n##0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a##n, a##n), 16));\ |
738 |
+ b##n##1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a##n, a##n), 16)); |
739 |
+ |
740 |
+ a7 = _mm_add_epi16(z0, z3); |
741 |
+ a1 = _mm_add_epi16(z1, z2); |
742 |
+ a0 = _mm_add_epi16(a7, a1); |
743 |
+ a4 = _mm_sub_epi16(a7, a1); |
744 |
+ CTF0(0); |
745 |
+ CTF0(4); |
746 |
+ |
747 |
+ a2 = _mm_sub_epi16(z0, z3); |
748 |
+ a6 = _mm_sub_epi16(z1, z2); |
749 |
+ CTF0(2); |
750 |
+ CTF0(6); |
751 |
+ |
752 |
+#undef CTF0 |
753 |
+ |
754 |
+ x0 = _mm_add_ps(b60, b20); |
755 |
+ x1 = _mm_add_ps(b61, b21); |
756 |
+ |
757 |
+ cnst = LD_W2; |
758 |
+ x0 = _mm_mul_ps(cnst, x0); |
759 |
+ x1 = _mm_mul_ps(cnst, x1); |
760 |
+ cnst = LD_W1; |
761 |
+ b20 = _mm_madd_ps(cnst, b20, x0); |
762 |
+ b21 = _mm_madd_ps(cnst, b21, x1); |
763 |
+ cnst = LD_W0; |
764 |
+ b60 = _mm_madd_ps(cnst, b60, x0); |
765 |
+ b61 = _mm_madd_ps(cnst, b61, x1); |
766 |
+ |
767 |
+#define CTFX(x, b) \ |
768 |
+ b##0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16)); \ |
769 |
+ b##1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16)); |
770 |
+ |
771 |
+ CTFX(z4, b7); |
772 |
+ CTFX(z5, b5); |
773 |
+ CTFX(z6, b3); |
774 |
+ CTFX(z7, b1); |
775 |
+ |
776 |
+#undef CTFX |
777 |
+ |
778 |
+ x0 = _mm_add_ps(b70, b10); |
779 |
+ x1 = _mm_add_ps(b50, b30); |
780 |
+ x2 = _mm_add_ps(b70, b30); |
781 |
+ x3 = _mm_add_ps(b50, b10); |
782 |
+ x8 = _mm_add_ps(x2, x3); |
783 |
+ cnst = LD_W3; |
784 |
+ x8 = _mm_mul_ps(cnst, x8); |
785 |
+ |
786 |
+ cnst = LD_W8; |
787 |
+ x0 = _mm_mul_ps(cnst, x0); |
788 |
+ cnst = LD_W9; |
789 |
+ x1 = _mm_mul_ps(cnst, x1); |
790 |
+ cnst = LD_WA; |
791 |
+ x2 = _mm_madd_ps(cnst, x2, x8); |
792 |
+ cnst = LD_WB; |
793 |
+ x3 = _mm_madd_ps(cnst, x3, x8); |
794 |
+ |
795 |
+ cnst = LD_W4; |
796 |
+ b70 = _mm_madd_ps(cnst, b70, x0); |
797 |
+ cnst = LD_W5; |
798 |
+ b50 = _mm_madd_ps(cnst, b50, x1); |
799 |
+ cnst = LD_W6; |
800 |
+ b30 = _mm_madd_ps(cnst, b30, x1); |
801 |
+ cnst = LD_W7; |
802 |
+ b10 = _mm_madd_ps(cnst, b10, x0); |
803 |
+ |
804 |
+ b70 = _mm_add_ps(b70, x2); |
805 |
+ b50 = _mm_add_ps(b50, x3); |
806 |
+ b30 = _mm_add_ps(b30, x2); |
807 |
+ b10 = _mm_add_ps(b10, x3); |
808 |
+ |
809 |
+ x0 = _mm_add_ps(b71, b11); |
810 |
+ x1 = _mm_add_ps(b51, b31); |
811 |
+ x2 = _mm_add_ps(b71, b31); |
812 |
+ x3 = _mm_add_ps(b51, b11); |
813 |
+ x8 = _mm_add_ps(x2, x3); |
814 |
+ cnst = LD_W3; |
815 |
+ x8 = _mm_mul_ps(cnst, x8); |
816 |
+ |
817 |
+ cnst = LD_W8; |
818 |
+ x0 = _mm_mul_ps(cnst, x0); |
819 |
+ cnst = LD_W9; |
820 |
+ x1 = _mm_mul_ps(cnst, x1); |
821 |
+ cnst = LD_WA; |
822 |
+ x2 = _mm_madd_ps(cnst, x2, x8); |
823 |
+ cnst = LD_WB; |
824 |
+ x3 = _mm_madd_ps(cnst, x3, x8); |
825 |
+ |
826 |
+ cnst = LD_W4; |
827 |
+ b71 = _mm_madd_ps(cnst, b71, x0); |
828 |
+ cnst = LD_W5; |
829 |
+ b51 = _mm_madd_ps(cnst, b51, x1); |
830 |
+ cnst = LD_W6; |
831 |
+ b31 = _mm_madd_ps(cnst, b31, x1); |
832 |
+ cnst = LD_W7; |
833 |
+ b11 = _mm_madd_ps(cnst, b11, x0); |
834 |
+ |
835 |
+ b71 = _mm_add_ps(b71, x2); |
836 |
+ b51 = _mm_add_ps(b51, x3); |
837 |
+ b31 = _mm_add_ps(b31, x2); |
838 |
+ b11 = _mm_add_ps(b11, x3); |
839 |
+ /* }}} */ |
840 |
+ |
841 |
+ /* 8x8 matrix transpose (vec_f[8][2]) {{{ */ |
842 |
+ x0 = _mm_unpacklo_ps(b00, b10); |
843 |
+ x1 = _mm_unpackhi_ps(b00, b10); |
844 |
+ x2 = _mm_unpacklo_ps(b20, b30); |
845 |
+ x3 = _mm_unpackhi_ps(b20, b30); |
846 |
+ b00 = _mm_unpacklo_ps2(x0, x2); |
847 |
+ b10 = _mm_unpackhi_ps2(x0, x2); |
848 |
+ b20 = _mm_unpacklo_ps2(x1, x3); |
849 |
+ b30 = _mm_unpackhi_ps2(x1, x3); |
850 |
+ |
851 |
+ x4 = _mm_unpacklo_ps(b41, b51); |
852 |
+ x5 = _mm_unpackhi_ps(b41, b51); |
853 |
+ x6 = _mm_unpacklo_ps(b61, b71); |
854 |
+ x7 = _mm_unpackhi_ps(b61, b71); |
855 |
+ b41 = _mm_unpacklo_ps2(x4, x6); |
856 |
+ b51 = _mm_unpackhi_ps2(x4, x6); |
857 |
+ b61 = _mm_unpacklo_ps2(x5, x7); |
858 |
+ b71 = _mm_unpackhi_ps2(x5, x7); |
859 |
+ |
860 |
+ x0 = _mm_unpacklo_ps(b01, b11); |
861 |
+ x1 = _mm_unpackhi_ps(b01, b11); |
862 |
+ x2 = _mm_unpacklo_ps(b21, b31); |
863 |
+ x3 = _mm_unpackhi_ps(b21, b31); |
864 |
+ x4 = _mm_unpacklo_ps(b40, b50); |
865 |
+ x5 = _mm_unpackhi_ps(b40, b50); |
866 |
+ x6 = _mm_unpacklo_ps(b60, b70); |
867 |
+ x7 = _mm_unpackhi_ps(b60, b70); |
868 |
+ b40 = _mm_unpacklo_ps2(x0, x2); |
869 |
+ b50 = _mm_unpackhi_ps2(x0, x2); |
870 |
+ b60 = _mm_unpacklo_ps2(x1, x3); |
871 |
+ b70 = _mm_unpackhi_ps2(x1, x3); |
872 |
+ b01 = _mm_unpacklo_ps2(x4, x6); |
873 |
+ b11 = _mm_unpackhi_ps2(x4, x6); |
874 |
+ b21 = _mm_unpacklo_ps2(x5, x7); |
875 |
+ b31 = _mm_unpackhi_ps2(x5, x7); |
876 |
+ /* }}} */ |
877 |
+ |
878 |
+ FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); |
879 |
+ FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); |
880 |
+ |
881 |
+ /* round, convert back to short */ |
882 |
+#define CTS(n) \ |
883 |
+ a##n = _mm_packs_epi32(_mm_cvtps_epi32(b##n##0), _mm_cvtps_epi32(b##n##1)); \ |
884 |
+ VEC_ST(block + 8 * n, a##n) |
885 |
+ |
886 |
+ CTS(0); CTS(1); CTS(2); CTS(3); |
887 |
+ CTS(4); CTS(5); CTS(6); CTS(7); |
888 |
+ |
889 |
+#undef CTS |
890 |
+} |
891 |
+ |
892 |
+av_cold void ff_fdctdsp_init_e2k(FDCTDSPContext *c, AVCodecContext *avctx, |
893 |
+ unsigned high_bit_depth) |
894 |
+{ |
895 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
896 |
+ return; |
897 |
+ |
898 |
+ // !checkasm |
899 |
+ // libavcodec/tests/dct |
900 |
+ |
901 |
+ if (!high_bit_depth) { |
902 |
+ if (avctx->dct_algo == FF_DCT_AUTO || |
903 |
+ avctx->dct_algo == FF_DCT_ALTIVEC) { |
904 |
+ c->fdct = ff_fdct_e2k; |
905 |
+ } |
906 |
+ } |
907 |
+} |
908 |
diff --git a/libavcodec/e2k/fft.c b/libavcodec/e2k/fft.c |
909 |
new file mode 100644 |
910 |
index 0000000..5b58202 |
911 |
--- /dev/null |
912 |
+++ b/libavcodec/e2k/fft.c |
913 |
@@ -0,0 +1,1043 @@ |
914 |
+/* |
915 |
+ * FFT transform |
916 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
917 |
+ * Copyright (c) 2014 Rong Yan |
918 |
+ * Copyright (c) 2009 Loren Merritt |
919 |
+ * |
920 |
+ * This algorithm (though not any of the implementation details) is |
921 |
+ * based on libdjbfft by D. J. Bernstein. |
922 |
+ * |
923 |
+ * This file is part of FFmpeg. |
924 |
+ * |
925 |
+ * FFmpeg is free software; you can redistribute it and/or |
926 |
+ * modify it under the terms of the GNU Lesser General Public |
927 |
+ * License as published by the Free Software Foundation; either |
928 |
+ * version 2.1 of the License, or (at your option) any later version. |
929 |
+ * |
930 |
+ * FFmpeg is distributed in the hope that it will be useful, |
931 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
932 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
933 |
+ * Lesser General Public License for more details. |
934 |
+ * |
935 |
+ * You should have received a copy of the GNU Lesser General Public |
936 |
+ * License along with FFmpeg; if not, write to the Free Software |
937 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
938 |
+ */ |
939 |
+ |
940 |
+ |
941 |
+#include "config.h" |
942 |
+#include "libavutil/cpu.h" |
943 |
+#include "libavutil/e2k/util_e2k.h" |
944 |
+#include "libavcodec/fft.h" |
945 |
+#include "libavcodec/fft-internal.h" |
946 |
+#include "fft.h" |
947 |
+ |
948 |
+#define _mm_madd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c) |
949 |
+#define _mm_msub_ps(a, b, c) _mm_sub_ps(_mm_mul_ps(a, b), c) |
950 |
+#define _mm_nmsub_ps(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) |
951 |
+ |
952 |
+static av_always_inline |
953 |
+void pass_e2k_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n) |
954 |
+{ |
955 |
+ int i1 = n * 4, i2 = n * 8, i3 = n * 12; |
956 |
+ FFTSample *out = (FFTSample*)z; |
957 |
+ const FFTSample *wim = wre + n * 2; |
958 |
+ vec_f vz0, vzo1, vzo2, vzo3; |
959 |
+ vec_f x0, x1, x2, x3; |
960 |
+ vec_f x4, x5, x6, x7; |
961 |
+ vec_f x8, x9, x10, x11; |
962 |
+ vec_f x12, x13, x14, x15; |
963 |
+ vec_f x16, x17, x18, x19; |
964 |
+ vec_f x20, x21, x22, x23; |
965 |
+ vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1; |
966 |
+ vec_f y0, y1, y2, y3; |
967 |
+ vec_f y4, y5, y8, y9; |
968 |
+ vec_f y10, y13, y14, y15; |
969 |
+ vec_f y16, y17, y18, y19; |
970 |
+ vec_f y20, y21, y22, y23; |
971 |
+ vec_f wr1, wi1, wr0, wi0; |
972 |
+ vec_f wr2, wi2, wr3, wi3; |
973 |
+ vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3; |
974 |
+ |
975 |
+ n = n - 2; |
976 |
+ vzo2 = _mm_loadu_ps(out + i2); // zo2.r zo2.i z(o2+1).r z(o2+1).i |
977 |
+ vzo2plus1 = _mm_loadu_ps(out + i2 + 4); |
978 |
+ vzo3 = _mm_loadu_ps(out + i3); // zo3.r zo3.i z(o3+1).r z(o3+1).i |
979 |
+ vzo3plus1 = _mm_loadu_ps(out + i3 + 4); |
980 |
+ vz0 = _mm_loadu_ps(out); // z0.r z0.i z1.r z1.i |
981 |
+ vz0plus1 = _mm_loadu_ps(out + 4); |
982 |
+ vzo1 = _mm_loadu_ps(out + i1); // zo1.r zo1.i z(o1+1).r z(o1+1).i |
983 |
+ vzo1plus1 = _mm_loadu_ps(out + i1 + 4); |
984 |
+ |
985 |
+ x0 = _mm_add_ps(vzo2, vzo3); |
986 |
+ x1 = _mm_sub_ps(vzo2, vzo3); |
987 |
+ y0 = _mm_add_ps(vzo2plus1, vzo3plus1); |
988 |
+ y1 = _mm_sub_ps(vzo2plus1, vzo3plus1); |
989 |
+ |
990 |
+ wr1 = _mm_set1_ps(wre[1]); |
991 |
+ wi1 = _mm_set1_ps(wim[-1]); |
992 |
+ wi2 = _mm_set1_ps(wim[-2]); |
993 |
+ wi3 = _mm_set1_ps(wim[-3]); |
994 |
+ wr2 = _mm_set1_ps(wre[2]); |
995 |
+ wr3 = _mm_set1_ps(wre[3]); |
996 |
+ |
997 |
+ x2 = _mm_unpackhi_ps(x0, x1); |
998 |
+ x3 = _mm_shuffle_ps(x2, x2, 0x1b); |
999 |
+ |
1000 |
+ y2 = _mm_unpacklo_ps(y0, y1); |
1001 |
+ y3 = _mm_unpackhi_ps(y0, y1); |
1002 |
+ y4 = _mm_shuffle_ps(y2, y2, 0x1b); |
1003 |
+ y5 = _mm_shuffle_ps(y3, y3, 0x1b); |
1004 |
+ |
1005 |
+ ymulwi2 = _mm_mul_ps(y4, wi2); |
1006 |
+ ymulwi3 = _mm_mul_ps(y5, wi3); |
1007 |
+ x4 = _mm_mul_ps(x2, wr1); |
1008 |
+ x5 = _mm_mul_ps(x3, wi1); |
1009 |
+ y8 = _mm_madd_ps(y2, wr2, ymulwi2); |
1010 |
+ y9 = _mm_msub_ps(y2, wr2, ymulwi2); |
1011 |
+ x6 = _mm_add_ps(x4, x5); |
1012 |
+ x7 = _mm_sub_ps(x4, x5); |
1013 |
+ y13 = _mm_madd_ps(y3, wr3, ymulwi3); |
1014 |
+ y14 = _mm_msub_ps(y3, wr3, ymulwi3); |
1015 |
+ |
1016 |
+ x8 = _mm_shuffle_ps(x6, x7, 0xe4); |
1017 |
+ y10 = _mm_shuffle_ps(y8, y9, 0xe4); |
1018 |
+ y15 = _mm_shuffle_ps(y13, y14, 0xe4); |
1019 |
+ |
1020 |
+ x9 = _mm_shuffle_ps(x0, x8, 0x84); |
1021 |
+ x10 = _mm_shuffle_ps(x1, x8, 0x71); |
1022 |
+ |
1023 |
+ y16 = _mm_shuffle_ps(y10, y15, 0x88); |
1024 |
+ y17 = _mm_shuffle_ps(y10, y15, 0x77); |
1025 |
+ |
1026 |
+ x11 = _mm_add_ps(vz0, x9); |
1027 |
+ x12 = _mm_sub_ps(vz0, x9); |
1028 |
+ x13 = _mm_add_ps(vzo1, x10); |
1029 |
+ x14 = _mm_sub_ps(vzo1, x10); |
1030 |
+ |
1031 |
+ y18 = _mm_add_ps(vz0plus1, y16); |
1032 |
+ y19 = _mm_sub_ps(vz0plus1, y16); |
1033 |
+ y20 = _mm_add_ps(vzo1plus1, y17); |
1034 |
+ y21 = _mm_sub_ps(vzo1plus1, y17); |
1035 |
+ |
1036 |
+ x15 = _mm_blend_ps(x13, x14, 0xa); |
1037 |
+ x16 = _mm_blend_ps(x14, x13, 0xa); |
1038 |
+ y22 = _mm_blend_ps(y20, y21, 0xa); |
1039 |
+ y23 = _mm_blend_ps(y21, y20, 0xa); |
1040 |
+ |
1041 |
+ _mm_storeu_ps(out, x11); |
1042 |
+ _mm_storeu_ps(out + 4, y18); |
1043 |
+ _mm_storeu_ps(out + i1, x15); |
1044 |
+ _mm_storeu_ps(out + i1 + 4, y22); |
1045 |
+ _mm_storeu_ps(out + i2, x12); |
1046 |
+ _mm_storeu_ps(out + i2 + 4, y19); |
1047 |
+ _mm_storeu_ps(out + i3, x16); |
1048 |
+ _mm_storeu_ps(out + i3 + 4, y23); |
1049 |
+ |
1050 |
+ do { |
1051 |
+ out += 8; |
1052 |
+ wre += 4; |
1053 |
+ wim -= 4; |
1054 |
+ wr0 = _mm_set1_ps(wre[0]); |
1055 |
+ wr1 = _mm_set1_ps(wre[1]); |
1056 |
+ wi0 = _mm_set1_ps(wim[0]); |
1057 |
+ wi1 = _mm_set1_ps(wim[-1]); |
1058 |
+ |
1059 |
+ wr2 = _mm_set1_ps(wre[2]); |
1060 |
+ wr3 = _mm_set1_ps(wre[3]); |
1061 |
+ wi2 = _mm_set1_ps(wim[-2]); |
1062 |
+ wi3 = _mm_set1_ps(wim[-3]); |
1063 |
+ |
1064 |
+ vzo2 = _mm_loadu_ps(out + i2); // zo2.r zo2.i z(o2+1).r z(o2+1).i |
1065 |
+ vzo2plus1 = _mm_loadu_ps(out + i2 + 4); |
1066 |
+ vzo3 = _mm_loadu_ps(out + i3); // zo3.r zo3.i z(o3+1).r z(o3+1).i |
1067 |
+ vzo3plus1 = _mm_loadu_ps(out + i3 + 4); |
1068 |
+ vz0 = _mm_loadu_ps(out + 0); // z0.r z0.i z1.r z1.i |
1069 |
+ vz0plus1 = _mm_loadu_ps(out + 4); |
1070 |
+ vzo1 = _mm_loadu_ps(out + i1); // zo1.r zo1.i z(o1+1).r z(o1+1).i |
1071 |
+ vzo1plus1 = _mm_loadu_ps(out + i1 + 4); |
1072 |
+ |
1073 |
+ x0 = _mm_add_ps(vzo2, vzo3); |
1074 |
+ x1 = _mm_sub_ps(vzo2, vzo3); |
1075 |
+ |
1076 |
+ y0 = _mm_add_ps(vzo2plus1, vzo3plus1); |
1077 |
+ y1 = _mm_sub_ps(vzo2plus1, vzo3plus1); |
1078 |
+ |
1079 |
+ x2 = _mm_unpacklo_ps(x0, x1); |
1080 |
+ x3 = _mm_unpackhi_ps(x0, x1); |
1081 |
+ x4 = _mm_shuffle_ps(x2, x2, 0x1b); |
1082 |
+ x5 = _mm_shuffle_ps(x3, x3, 0x1b); |
1083 |
+ |
1084 |
+ y2 = _mm_unpacklo_ps(y0, y1); |
1085 |
+ y3 = _mm_unpackhi_ps(y0, y1); |
1086 |
+ y4 = _mm_shuffle_ps(y2, y2, 0x1b); |
1087 |
+ y5 = _mm_shuffle_ps(y3, y3, 0x1b); |
1088 |
+ |
1089 |
+ xmulwi0 = _mm_mul_ps(x4, wi0); |
1090 |
+ xmulwi1 = _mm_mul_ps(x5, wi1); |
1091 |
+ x8 = _mm_madd_ps(x2, wr0, xmulwi0); |
1092 |
+ x9 = _mm_msub_ps(x2, wr0, xmulwi0); |
1093 |
+ |
1094 |
+ ymulwi2 = _mm_mul_ps(y4, wi2); |
1095 |
+ ymulwi3 = _mm_mul_ps(y5, wi3); |
1096 |
+ x13 = _mm_madd_ps(x3, wr1, xmulwi1); |
1097 |
+ x14 = _mm_msub_ps(x3, wr1, xmulwi1); |
1098 |
+ |
1099 |
+ y8 = _mm_madd_ps(y2, wr2, ymulwi2); |
1100 |
+ y9 = _mm_msub_ps(y2, wr2, ymulwi2); |
1101 |
+ y13 = _mm_madd_ps(y3, wr3, ymulwi3); |
1102 |
+ y14 = _mm_msub_ps(y3, wr3, ymulwi3); |
1103 |
+ |
1104 |
+ x10 = _mm_shuffle_ps(x8, x9, 0xe4); |
1105 |
+ x15 = _mm_shuffle_ps(x13, x14, 0xe4); |
1106 |
+ |
1107 |
+ y10 = _mm_shuffle_ps(y8, y9, 0xe4); |
1108 |
+ y15 = _mm_shuffle_ps(y13, y14, 0xe4); |
1109 |
+ |
1110 |
+ x16 = _mm_shuffle_ps(x10, x15, 0x88); |
1111 |
+ x17 = _mm_shuffle_ps(x10, x15, 0x77); |
1112 |
+ |
1113 |
+ y16 = _mm_shuffle_ps(y10, y15, 0x88); |
1114 |
+ y17 = _mm_shuffle_ps(y10, y15, 0x77); |
1115 |
+ |
1116 |
+ x18 = _mm_add_ps(vz0, x16); |
1117 |
+ x19 = _mm_sub_ps(vz0, x16); |
1118 |
+ x20 = _mm_add_ps(vzo1, x17); |
1119 |
+ x21 = _mm_sub_ps(vzo1, x17); |
1120 |
+ |
1121 |
+ y18 = _mm_add_ps(vz0plus1, y16); |
1122 |
+ y19 = _mm_sub_ps(vz0plus1, y16); |
1123 |
+ y20 = _mm_add_ps(vzo1plus1, y17); |
1124 |
+ y21 = _mm_sub_ps(vzo1plus1, y17); |
1125 |
+ |
1126 |
+ x22 = _mm_blend_ps(x20, x21, 0xa); |
1127 |
+ x23 = _mm_blend_ps(x21, x20, 0xa); |
1128 |
+ y22 = _mm_blend_ps(y20, y21, 0xa); |
1129 |
+ y23 = _mm_blend_ps(y21, y20, 0xa); |
1130 |
+ |
1131 |
+ _mm_storeu_ps(out, x18); |
1132 |
+ _mm_storeu_ps(out + 4, y18); |
1133 |
+ _mm_storeu_ps(out + i1, x22); |
1134 |
+ _mm_storeu_ps(out + i1 + 4, y22); |
1135 |
+ _mm_storeu_ps(out + i2, x19); |
1136 |
+ _mm_storeu_ps(out + i2 + 4, y19); |
1137 |
+ _mm_storeu_ps(out + i3, x23); |
1138 |
+ _mm_storeu_ps(out + i3 + 4, y23); |
1139 |
+ } while (n -= 2); |
1140 |
+} |
1141 |
+ |
1142 |
+static av_always_inline |
1143 |
+void fft2_e2k_interleave(FFTComplex *z) |
1144 |
+{ |
1145 |
+#if 1 |
1146 |
+ vec_f a, b, c; |
1147 |
+ float *out = (float*)z; |
1148 |
+ vec_f sign = _mm_castsi128_ps(_mm_set1_epi32(-1 << 31)); |
1149 |
+ |
1150 |
+ a = _mm_loadu_ps(out); |
1151 |
+ b = _mm_unpacklo_ps2(a, a); |
1152 |
+ c = _mm_unpackhi_ps2(a, _mm_xor_ps(a, sign)); |
1153 |
+ a = _mm_add_ps(b, c); |
1154 |
+ _mm_storeu_ps(out, a); |
1155 |
+#else |
1156 |
+ FFTSample r0, i0, r1, i1; |
1157 |
+ r0 = z[0].re; i0 = z[0].im; |
1158 |
+ r1 = z[1].re; i1 = z[1].im; |
1159 |
+ z[0].re = r0 + r1; |
1160 |
+ z[0].im = i0 + i1; |
1161 |
+ z[1].re = r0 - r1; |
1162 |
+ z[1].im = i0 - i1; |
1163 |
+#endif |
1164 |
+} |
1165 |
+ |
1166 |
+static av_always_inline |
1167 |
+void fft4_e2k_interleave(FFTComplex *z) |
1168 |
+{ |
1169 |
+ vec_f a, b, c, d; |
1170 |
+ float *out = (float*)z; |
1171 |
+ a = _mm_loadu_ps(out); |
1172 |
+ b = _mm_loadu_ps(out + 4); |
1173 |
+ |
1174 |
+ c = _mm_shuffle_ps(a, b, 0x64); |
1175 |
+ d = _mm_shuffle_ps(a, b, 0xce); |
1176 |
+ a = _mm_add_ps(c, d); |
1177 |
+ b = _mm_sub_ps(c, d); |
1178 |
+ |
1179 |
+ c = _mm_unpacklo_ps2(a, b); |
1180 |
+ d = _mm_shuffle_ps(a, b, 0xbe); |
1181 |
+ |
1182 |
+ a = _mm_add_ps(c, d); |
1183 |
+ b = _mm_sub_ps(c, d); |
1184 |
+ _mm_storeu_ps(out, a); |
1185 |
+ _mm_storeu_ps(out + 4, b); |
1186 |
+} |
1187 |
+ |
1188 |
+static av_always_inline |
1189 |
+void fft8_e2k_interleave(FFTComplex *z) |
1190 |
+{ |
1191 |
+ vec_f vz0, vz1, vz2, vz3; |
1192 |
+ vec_f x0, x1, x2, x3; |
1193 |
+ vec_f x4, x5, x6, x7; |
1194 |
+ vec_f x8, x9, x10, x11; |
1195 |
+ vec_f x12, x13, x14, x15; |
1196 |
+ vec_f x16, x17, x18, x19; |
1197 |
+ vec_f x20, x21, x22, x23; |
1198 |
+ vec_f x24, x25, x26, x27; |
1199 |
+ vec_f x28, x29, x30, x31; |
1200 |
+ vec_f x32, x33, x34; |
1201 |
+ |
1202 |
+ float *out = (float*)z; |
1203 |
+ vec_f vc1 = _mm_set1_ps(sqrthalf); |
1204 |
+ |
1205 |
+ vz0 = _mm_loadu_ps(out); |
1206 |
+ vz1 = _mm_loadu_ps(out + 4); |
1207 |
+ vz2 = _mm_loadu_ps(out + 8); |
1208 |
+ vz3 = _mm_loadu_ps(out + 12); |
1209 |
+ |
1210 |
+ x0 = _mm_shuffle_ps(vz0, vz1, 0x64); |
1211 |
+ x1 = _mm_shuffle_ps(vz0, vz1, 0xce); |
1212 |
+ x2 = _mm_shuffle_ps(vz2, vz3, 0x46); |
1213 |
+ x3 = _mm_shuffle_ps(vz2, vz3, 0xec); |
1214 |
+ |
1215 |
+ x4 = _mm_add_ps(x0, x1); |
1216 |
+ x5 = _mm_sub_ps(x0, x1); |
1217 |
+ x6 = _mm_add_ps(x2, x3); |
1218 |
+ x7 = _mm_sub_ps(x2, x3); |
1219 |
+ |
1220 |
+ x8 = _mm_unpacklo_ps2(x4, x5); |
1221 |
+ x9 = _mm_shuffle_ps(x4, x5, 0xbe); |
1222 |
+ x10 = _mm_shuffle_ps(x6, x7, 0x66); |
1223 |
+ x11 = _mm_shuffle_ps(x6, x7, 0xcc); |
1224 |
+ |
1225 |
+ x12 = _mm_add_ps(x8, x9); |
1226 |
+ x13 = _mm_sub_ps(x8, x9); |
1227 |
+ x14 = _mm_add_ps(x10, x11); |
1228 |
+ x15 = _mm_sub_ps(x10, x11); |
1229 |
+ x16 = _mm_unpacklo_ps(x12, x13); |
1230 |
+ x17 = _mm_unpacklo_ps(x14, x15); |
1231 |
+ x18 = _mm_shuffle_ps(x17, x17, 0x6c); |
1232 |
+ x19 = _mm_add_ps(x16, x18); // z0.r z2.r z0.i z2.i |
1233 |
+ x20 = _mm_sub_ps(x16, x18); // z4.r z6.r z4.i z6.i |
1234 |
+ |
1235 |
+ x21 = _mm_unpackhi_ps(x12, x13); |
1236 |
+ x22 = _mm_unpackhi_ps(x14, x15); |
1237 |
+ x23 = _mm_unpackhi_ps2(x22, x22); |
1238 |
+ x24 = _mm_add_ps(x22, x23); |
1239 |
+ x25 = _mm_sub_ps(x22, x23); |
1240 |
+ |
1241 |
+ x26 = _mm_unpacklo_ps2(x24, x25); |
1242 |
+ x26 = _mm_mul_ps(_mm_shuffle_ps(x26, x26, 0x8d), vc1); // 1,s1,0,s0 |
1243 |
+ |
1244 |
+ x27 = _mm_add_ps(x21, x26); // z1.r z7.r z1.i z3.i |
1245 |
+ x28 = _mm_sub_ps(x21, x26); // z5.r z3.r z5.i z7.i |
1246 |
+ |
1247 |
+ x29 = _mm_shuffle_ps(x19, x27, 0x88); // z0.r z0.i z1.r z1.i |
1248 |
+ x30 = _mm_shuffle_ps(x19, x27, 0xdd); // z2.r z2.i z7.r z3.i |
1249 |
+ x31 = _mm_shuffle_ps(x20, x28, 0x88); // z4.r z4.i z5.r z5.i |
1250 |
+ x32 = _mm_shuffle_ps(x20, x28, 0xdd); // z6.r z6.i z3.r z7.i |
1251 |
+ x33 = _mm_blend_ps(x30, x32, 0x4); // z2.r z2.i z3.r z3.i |
1252 |
+ x34 = _mm_blend_ps(x32, x30, 0x4); // z6.r z6.i z7.r z7.i |
1253 |
+ |
1254 |
+ _mm_storeu_ps(out, x29); |
1255 |
+ _mm_storeu_ps(out + 4, x33); |
1256 |
+ _mm_storeu_ps(out + 8, x31); |
1257 |
+ _mm_storeu_ps(out + 12, x34); |
1258 |
+} |
1259 |
+ |
1260 |
+static av_always_inline |
1261 |
+void fft16_e2k_interleave(FFTComplex *z) |
1262 |
+{ |
1263 |
+ float *out = (float*)z; |
1264 |
+ vec_f vc0 = _mm_set1_ps(sqrthalf); |
1265 |
+ vec_f vc1 = _mm_set1_ps(ff_cos_16[1]); |
1266 |
+ vec_f vc2 = _mm_set1_ps(ff_cos_16[3]); |
1267 |
+ vec_f vz0, vz1, vz2, vz3; |
1268 |
+ vec_f vz4, vz5, vz6, vz7; |
1269 |
+ vec_f x0, x1, x2, x3; |
1270 |
+ vec_f x4, x5, x6, x7; |
1271 |
+ vec_f x8, x9, x10, x11; |
1272 |
+ vec_f x12, x13, x14, x15; |
1273 |
+ vec_f x16, x17, x18, x19; |
1274 |
+ vec_f x20, x21, x22, x23; |
1275 |
+ vec_f x24, x25, x26, x27; |
1276 |
+ vec_f x28, x29, x30, x31; |
1277 |
+ vec_f x32, x33, x34, x35; |
1278 |
+ vec_f x36, x37, x38, x39; |
1279 |
+ vec_f x40, x41, x42, x43; |
1280 |
+ vec_f x44, x45, x46, x47; |
1281 |
+ vec_f x48, x49, x50, x51; |
1282 |
+ vec_f x52, x53, x54, x55; |
1283 |
+ vec_f x56, x57, x58, x59; |
1284 |
+ vec_f x60, x61, x62, x63; |
1285 |
+ vec_f x64, x65, x66, x67; |
1286 |
+ vec_f x68, x69, x70, x71; |
1287 |
+ vec_f x72, x73, x74, x75; |
1288 |
+ vec_f x76, x77, x78, x79; |
1289 |
+ vec_f x80, x81, x82, x83; |
1290 |
+ vec_f x84, x85, x86; |
1291 |
+ |
1292 |
+ vz0 = _mm_loadu_ps(out); |
1293 |
+ vz1 = _mm_loadu_ps(out + 4); |
1294 |
+ vz2 = _mm_loadu_ps(out + 8); |
1295 |
+ vz3 = _mm_loadu_ps(out + 12); |
1296 |
+ vz4 = _mm_loadu_ps(out + 16); |
1297 |
+ vz5 = _mm_loadu_ps(out + 20); |
1298 |
+ vz6 = _mm_loadu_ps(out + 24); |
1299 |
+ vz7 = _mm_loadu_ps(out + 28); |
1300 |
+ |
1301 |
+ x0 = _mm_shuffle_ps(vz0, vz1, 0x64); |
1302 |
+ x1 = _mm_shuffle_ps(vz0, vz1, 0xce); |
1303 |
+ x2 = _mm_unpacklo_ps2(vz2, vz3); |
1304 |
+ x3 = _mm_unpackhi_ps2(vz2, vz3); |
1305 |
+ |
1306 |
+ x4 = _mm_shuffle_ps(vz4, vz5, 0x64); |
1307 |
+ x5 = _mm_shuffle_ps(vz4, vz5, 0xce); |
1308 |
+ x6 = _mm_shuffle_ps(vz6, vz7, 0x64); |
1309 |
+ x7 = _mm_shuffle_ps(vz6, vz7, 0xce); |
1310 |
+ |
1311 |
+ x8 = _mm_add_ps(x0, x1); |
1312 |
+ x9 = _mm_sub_ps(x0, x1); |
1313 |
+ x10 = _mm_add_ps(x2, x3); |
1314 |
+ x11 = _mm_sub_ps(x2, x3); |
1315 |
+ |
1316 |
+ x12 = _mm_add_ps(x4, x5); |
1317 |
+ x13 = _mm_sub_ps(x4, x5); |
1318 |
+ x14 = _mm_add_ps(x6, x7); |
1319 |
+ x15 = _mm_sub_ps(x6, x7); |
1320 |
+ |
1321 |
+ x16 = _mm_unpacklo_ps2(x8, x9); |
1322 |
+ x17 = _mm_shuffle_ps(x8, x9, 0xbe); |
1323 |
+ x18 = _mm_shuffle_ps(x10, x11, 0x96); |
1324 |
+ x19 = _mm_shuffle_ps(x10, x11, 0xcc); |
1325 |
+ x20 = _mm_unpacklo_ps2(x12, x14); |
1326 |
+ x21 = _mm_unpackhi_ps2(x12, x14); |
1327 |
+ x22 = _mm_unpacklo_ps2(x13, x15); |
1328 |
+ x23 = _mm_shuffle_ps(x13, x15, 0xbb); |
1329 |
+ |
1330 |
+ x24 = _mm_add_ps(x16, x17); |
1331 |
+ x25 = _mm_sub_ps(x16, x17); |
1332 |
+ x26 = _mm_add_ps(x18, x19); |
1333 |
+ x27 = _mm_sub_ps(x18, x19); |
1334 |
+ x28 = _mm_add_ps(x20, x21); |
1335 |
+ x29 = _mm_sub_ps(x20, x21); |
1336 |
+ x30 = _mm_add_ps(x22, x23); |
1337 |
+ x31 = _mm_sub_ps(x22, x23); |
1338 |
+ |
1339 |
+ x32 = _mm_add_ps(x24, x26); |
1340 |
+ x33 = _mm_sub_ps(x24, x26); |
1341 |
+ x34 = _mm_unpacklo_ps2(x32, x33); |
1342 |
+ |
1343 |
+ x35 = _mm_shuffle_ps(x28, x29, 0x96); |
1344 |
+ x36 = _mm_shuffle_ps(x28, x29, 0xcc); |
1345 |
+ x37 = _mm_add_ps(x35, x36); |
1346 |
+ x38 = _mm_sub_ps(x35, x36); |
1347 |
+ x39 = _mm_shuffle_ps(x37, x38, 0x14); |
1348 |
+ |
1349 |
+ x40 = _mm_shuffle_ps(x27, x38, 0xeb); |
1350 |
+ x41 = _mm_shuffle_ps(x26, x37, 0xbe); |
1351 |
+ x42 = _mm_add_ps(x40, x41); |
1352 |
+ x43 = _mm_sub_ps(x40, x41); |
1353 |
+ x44 = _mm_mul_ps(x42, vc0); |
1354 |
+ x45 = _mm_mul_ps(x43, vc0); |
1355 |
+ |
1356 |
+ x46 = _mm_add_ps(x34, x39); // z0.r z0.i z4.r z4.i |
1357 |
+ x47 = _mm_sub_ps(x34, x39); // z8.r z8.i z12.r z12.i |
1358 |
+ |
1359 |
+ x48 = _mm_shuffle_ps(x30, x31, 0x96); |
1360 |
+ x49 = _mm_shuffle_ps(x30, x31, 0x3c); |
1361 |
+ x50 = _mm_add_ps(x48, x49); |
1362 |
+ x51 = _mm_sub_ps(x48, x49); |
1363 |
+ x52 = _mm_mul_ps(x50, vc1); |
1364 |
+ x53 = _mm_mul_ps(x50, vc2); |
1365 |
+ x54 = _mm_mul_ps(x51, vc1); |
1366 |
+ x55 = _mm_mul_ps(x51, vc2); |
1367 |
+ |
1368 |
+ x56 = _mm_unpackhi_ps2(x24, x25); |
1369 |
+ x57 = _mm_shuffle_ps(x44, x45, 0x14); |
1370 |
+ x58 = _mm_add_ps(x56, x57); |
1371 |
+ x59 = _mm_sub_ps(x56, x57); |
1372 |
+ |
1373 |
+ x60 = _mm_shuffle_ps(x54, x54, 0xb1); |
1374 |
+ x61 = _mm_shuffle_ps(x55, x55, 0xb1); |
1375 |
+ x62 = _mm_add_ps(x52, x61); |
1376 |
+ x63 = _mm_sub_ps(x52, x61); |
1377 |
+ x64 = _mm_add_ps(x60, x53); |
1378 |
+ x65 = _mm_sub_ps(x60, x53); |
1379 |
+ x66 = _mm_shuffle_ps(x62, x64, 0xb4); |
1380 |
+ x67 = _mm_shuffle_ps(x65, x63, 0xb4); |
1381 |
+ |
1382 |
+ x68 = _mm_add_ps(x58, x66); // z1.r z1.i z3.r z3.i |
1383 |
+ x69 = _mm_sub_ps(x58, x66); // z9.r z9.i z11.r z11.i |
1384 |
+ x70 = _mm_add_ps(x59, x67); // z5.r z5.i z15.r z15.i |
1385 |
+ x71 = _mm_sub_ps(x59, x67); // z13.r z13.i z7.r z7.i |
1386 |
+ |
1387 |
+ x72 = _mm_shuffle_ps(x27, x27, 0xe1); |
1388 |
+ x73 = _mm_add_ps(x25, x72); |
1389 |
+ x74 = _mm_sub_ps(x25, x72); |
1390 |
+ x75 = _mm_unpacklo_ps2(x73, x74); |
1391 |
+ x76 = _mm_shuffle_ps(x44, x45, 0xeb); |
1392 |
+ x77 = _mm_add_ps(x75, x76); // z2.r z2.i z6.r z6.i |
1393 |
+ x78 = _mm_sub_ps(x75, x76); // z10.r z10.i z14.r z14.i |
1394 |
+ |
1395 |
+ x79 = _mm_unpacklo_ps2(x46, x68); // z0.r z0.i z1.r z1.i |
1396 |
+ x80 = _mm_shuffle_ps(x77, x68, 0xe4); // z2.r z2.i z3.r z3.i |
1397 |
+ x81 = _mm_shuffle_ps(x46, x70, 0x4e); // z4.r z4.i z5.r z5.i |
1398 |
+ x82 = _mm_unpackhi_ps2(x77, x71); // z6.r z6.i z7.r z7.i |
1399 |
+ _mm_storeu_ps(out, x79); |
1400 |
+ _mm_storeu_ps(out + 4, x80); |
1401 |
+ _mm_storeu_ps(out + 8, x81); |
1402 |
+ _mm_storeu_ps(out + 12, x82); |
1403 |
+ x83 = _mm_unpacklo_ps2(x47, x69); // z8.r z8.i z9.r z9.i |
1404 |
+ x84 = _mm_shuffle_ps(x78, x69, 0xe4); // z10.r z10.i z11.r z11.i |
1405 |
+ x85 = _mm_shuffle_ps(x47, x71, 0x4e); // z12.r z12.i z13.r z13.i |
1406 |
+ x86 = _mm_unpackhi_ps2(x78, x70); // z14.r z14.i z15.r z15.i |
1407 |
+ _mm_storeu_ps(out + 16, x83); |
1408 |
+ _mm_storeu_ps(out + 20, x84); |
1409 |
+ _mm_storeu_ps(out + 24, x85); |
1410 |
+ _mm_storeu_ps(out + 28, x86); |
1411 |
+} |
1412 |
+ |
1413 |
+static av_always_inline |
1414 |
+void fft4_e2k(FFTComplex *z) |
1415 |
+{ |
1416 |
+ vec_f a, b, c, d; |
1417 |
+ float *out = (float*)z; |
1418 |
+ a = _mm_loadu_ps(out); |
1419 |
+ b = _mm_loadu_ps(out + 4); |
1420 |
+ |
1421 |
+ c = _mm_shuffle_ps(a, b, 0x64); |
1422 |
+ d = _mm_shuffle_ps(a, b, 0xce); |
1423 |
+ a = _mm_add_ps(c, d); |
1424 |
+ b = _mm_sub_ps(c, d); |
1425 |
+ |
1426 |
+ c = _mm_unpacklo_ps(a, b); |
1427 |
+ d = _mm_unpackhi_ps(a, _mm_shuffle_ps(b, b, 0xb1)); |
1428 |
+ |
1429 |
+ a = _mm_add_ps(c, d); |
1430 |
+ b = _mm_sub_ps(c, d); |
1431 |
+ |
1432 |
+ c = _mm_unpacklo_ps2(a, b); |
1433 |
+ d = _mm_unpackhi_ps2(a, b); |
1434 |
+ |
1435 |
+ _mm_storeu_ps(out, c); |
1436 |
+ _mm_storeu_ps(out + 4, d); |
1437 |
+} |
1438 |
+ |
1439 |
+static av_always_inline |
1440 |
+void fft8_e2k(FFTComplex *z) |
1441 |
+{ |
1442 |
+ vec_f vz0, vz1, vz2, vz3; |
1443 |
+ vec_f vz4, vz5, vz6, vz7, vz8; |
1444 |
+ |
1445 |
+ float *out = (float*)z; |
1446 |
+ vec_f vc0 = _mm_setzero_ps(); |
1447 |
+ vec_f vc1 = _mm_setr_ps(-sqrthalf, sqrthalf, sqrthalf, -sqrthalf); |
1448 |
+ vec_f vc2 = _mm_set1_ps(sqrthalf); |
1449 |
+ |
1450 |
+ vz0 = _mm_loadu_ps(out); |
1451 |
+ vz1 = _mm_loadu_ps(out + 4); |
1452 |
+ vz2 = _mm_loadu_ps(out + 8); |
1453 |
+ vz3 = _mm_loadu_ps(out + 12); |
1454 |
+ |
1455 |
+ vz6 = _mm_unpacklo_ps(vz2, vz3); |
1456 |
+ vz7 = _mm_unpackhi_ps(vz2, vz3); |
1457 |
+ vz4 = _mm_shuffle_ps(vz0, vz1, 0x64); |
1458 |
+ vz5 = _mm_shuffle_ps(vz0, vz1, 0xce); |
1459 |
+ |
1460 |
+ vz2 = _mm_add_ps(vz6, vz7); |
1461 |
+ vz3 = _mm_sub_ps(vz6, vz7); |
1462 |
+ vz8 = _mm_shuffle_ps(vz3, vz3, 0x4e); |
1463 |
+ |
1464 |
+ vz0 = _mm_add_ps(vz4, vz5); |
1465 |
+ vz1 = _mm_sub_ps(vz4, vz5); |
1466 |
+ |
1467 |
+ vz3 = _mm_madd_ps(vz3, vc1, vc0); |
1468 |
+ vz3 = _mm_madd_ps(vz8, vc2, vz3); |
1469 |
+ |
1470 |
+ vz4 = _mm_unpacklo_ps(vz0, vz1); |
1471 |
+ vz5 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1)); |
1472 |
+ vz6 = _mm_shuffle_ps(vz2, vz3, 0x39); |
1473 |
+ vz7 = _mm_shuffle_ps(vz2, vz3, 0x6c); |
1474 |
+ |
1475 |
+ vz0 = _mm_add_ps(vz4, vz5); |
1476 |
+ vz1 = _mm_sub_ps(vz4, vz5); |
1477 |
+ vz2 = _mm_add_ps(vz6, vz7); |
1478 |
+ vz3 = _mm_sub_ps(vz6, vz7); |
1479 |
+ |
1480 |
+ vz4 = _mm_unpacklo_ps2(vz0, vz1); |
1481 |
+ vz5 = _mm_unpackhi_ps2(vz0, vz1); |
1482 |
+ vz6 = _mm_shuffle_ps(vz2, vz3, 0xd8); |
1483 |
+ vz7 = _mm_shuffle_ps(vz2, vz3, 0x8d); |
1484 |
+ |
1485 |
+ vz2 = _mm_sub_ps(vz4, vz6); |
1486 |
+ vz3 = _mm_sub_ps(vz5, vz7); |
1487 |
+ |
1488 |
+ vz0 = _mm_add_ps(vz4, vz6); |
1489 |
+ vz1 = _mm_add_ps(vz5, vz7); |
1490 |
+ |
1491 |
+ _mm_storeu_ps(out, vz0); |
1492 |
+ _mm_storeu_ps(out + 4, vz1); |
1493 |
+ _mm_storeu_ps(out + 8, vz2); |
1494 |
+ _mm_storeu_ps(out + 12, vz3); |
1495 |
+} |
1496 |
+ |
1497 |
+static av_always_inline |
1498 |
+void fft16_e2k(FFTComplex *z) |
1499 |
+{ |
1500 |
+ float *out = (float*)z; |
1501 |
+ vec_f vc0 = _mm_setzero_ps(); |
1502 |
+ vec_f vc1 = _mm_setr_ps(-sqrthalf, sqrthalf, sqrthalf, -sqrthalf); |
1503 |
+ vec_f vc2 = _mm_set1_ps(sqrthalf); |
1504 |
+ vec_f vc3 = _mm_setr_ps(1.0, 0.92387953, sqrthalf, 0.38268343); |
1505 |
+ vec_f vc4 = _mm_setr_ps(0.0, 0.38268343, sqrthalf, 0.92387953); |
1506 |
+ vec_f vc5 = _mm_setr_ps(-0.0, -0.38268343, -sqrthalf, -0.92387953); |
1507 |
+ |
1508 |
+ vec_f vz0, vz1, vz2, vz3; |
1509 |
+ vec_f vz4, vz5, vz6, vz7; |
1510 |
+ vec_f vz8, vz9, vz10, vz11; |
1511 |
+ vec_f vz12, vz13; |
1512 |
+ |
1513 |
+ vz0 = _mm_loadu_ps(out + 16); |
1514 |
+ vz1 = _mm_loadu_ps(out + 20); |
1515 |
+ vz2 = _mm_loadu_ps(out + 24); |
1516 |
+ vz3 = _mm_loadu_ps(out + 28); |
1517 |
+ |
1518 |
+ vz4 = _mm_shuffle_ps(vz0, vz1, 0x64); |
1519 |
+ vz5 = _mm_shuffle_ps(vz0, vz1, 0xce); |
1520 |
+ vz6 = _mm_shuffle_ps(vz2, vz3, 0x64); |
1521 |
+ vz7 = _mm_shuffle_ps(vz2, vz3, 0xce); |
1522 |
+ |
1523 |
+ vz0 = _mm_add_ps(vz4, vz5); |
1524 |
+ vz1= _mm_sub_ps(vz4, vz5); |
1525 |
+ vz2 = _mm_add_ps(vz6, vz7); |
1526 |
+ vz3 = _mm_sub_ps(vz6, vz7); |
1527 |
+ |
1528 |
+ vz4 = _mm_unpacklo_ps(vz0, vz1); |
1529 |
+ vz5 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1)); |
1530 |
+ vz6 = _mm_unpacklo_ps(vz2, vz3); |
1531 |
+ vz7 = _mm_unpackhi_ps(vz2, _mm_shuffle_ps(vz3, vz3, 0xb1)); |
1532 |
+ |
1533 |
+ vz0 = _mm_add_ps(vz4, vz5); |
1534 |
+ vz1 = _mm_sub_ps(vz4, vz5); |
1535 |
+ vz2 = _mm_add_ps(vz6, vz7); |
1536 |
+ vz3 = _mm_sub_ps(vz6, vz7); |
1537 |
+ |
1538 |
+ vz4 = _mm_unpacklo_ps2(vz0, vz1); |
1539 |
+ vz5 = _mm_unpackhi_ps2(vz0, vz1); |
1540 |
+ |
1541 |
+ vz6 = _mm_unpacklo_ps2(vz2, vz3); |
1542 |
+ vz7 = _mm_unpackhi_ps2(vz2, vz3); |
1543 |
+ |
1544 |
+ vz0 = _mm_loadu_ps(out); |
1545 |
+ vz1 = _mm_loadu_ps(out + 4); |
1546 |
+ vz2 = _mm_loadu_ps(out + 8); |
1547 |
+ vz3 = _mm_loadu_ps(out + 12); |
1548 |
+ vz10 = _mm_unpacklo_ps(vz2, vz3); |
1549 |
+ vz11 = _mm_unpackhi_ps(vz2, vz3); |
1550 |
+ vz8 = _mm_shuffle_ps(vz0, vz1, 0x64); |
1551 |
+ vz9 = _mm_shuffle_ps(vz0, vz1, 0xce); |
1552 |
+ |
1553 |
+ vz2 = _mm_add_ps(vz10, vz11); |
1554 |
+ vz3 = _mm_sub_ps(vz10, vz11); |
1555 |
+ vz12 = _mm_shuffle_ps(vz3, vz3, 0x4e); |
1556 |
+ vz0 = _mm_add_ps(vz8, vz9); |
1557 |
+ vz1 = _mm_sub_ps(vz8, vz9); |
1558 |
+ |
1559 |
+ vz3 = _mm_madd_ps(vz3, vc1, vc0); |
1560 |
+ vz3 = _mm_madd_ps(vz12, vc2, vz3); |
1561 |
+ vz8 = _mm_unpacklo_ps(vz0, vz1); |
1562 |
+ vz9 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1)); |
1563 |
+ vz10 = _mm_shuffle_ps(vz2, vz3, 0x39); |
1564 |
+ vz11 = _mm_shuffle_ps(vz2, vz3, 0x6c); |
1565 |
+ |
1566 |
+ vz0 = _mm_add_ps(vz8, vz9); |
1567 |
+ vz1 = _mm_sub_ps(vz8, vz9); |
1568 |
+ vz2 = _mm_add_ps(vz10, vz11); |
1569 |
+ vz3 = _mm_sub_ps(vz10, vz11); |
1570 |
+ |
1571 |
+ vz8 = _mm_unpacklo_ps2(vz0, vz1); |
1572 |
+ vz9 = _mm_unpackhi_ps2(vz0, vz1); |
1573 |
+ vz10 = _mm_shuffle_ps(vz2, vz3, 0xd8); |
1574 |
+ vz11 = _mm_shuffle_ps(vz2, vz3, 0x8d); |
1575 |
+ |
1576 |
+ vz2 = _mm_sub_ps(vz8, vz10); |
1577 |
+ vz3 = _mm_sub_ps(vz9, vz11); |
1578 |
+ vz0 = _mm_add_ps(vz8, vz10); |
1579 |
+ vz1 = _mm_add_ps(vz9, vz11); |
1580 |
+ |
1581 |
+ vz8 = _mm_madd_ps(vz4, vc3, vc0); |
1582 |
+ vz9 = _mm_madd_ps(vz5, vc3, vc0); |
1583 |
+ vz10 = _mm_madd_ps(vz6, vc3, vc0); |
1584 |
+ vz11 = _mm_madd_ps(vz7, vc3, vc0); |
1585 |
+ |
1586 |
+ vz8 = _mm_madd_ps(vz5, vc4, vz8); |
1587 |
+ vz9 = _mm_madd_ps(vz4, vc5, vz9); |
1588 |
+ vz10 = _mm_madd_ps(vz7, vc5, vz10); |
1589 |
+ vz11 = _mm_madd_ps(vz6, vc4, vz11); |
1590 |
+ |
1591 |
+ vz12 = _mm_sub_ps(vz10, vz8); |
1592 |
+ vz10 = _mm_add_ps(vz10, vz8); |
1593 |
+ |
1594 |
+ vz13 = _mm_sub_ps(vz9, vz11); |
1595 |
+ vz11 = _mm_add_ps(vz9, vz11); |
1596 |
+ |
1597 |
+ vz4 = _mm_sub_ps(vz0, vz10); |
1598 |
+ vz0 = _mm_add_ps(vz0, vz10); |
1599 |
+ |
1600 |
+ vz7 = _mm_sub_ps(vz3, vz12); |
1601 |
+ vz3 = _mm_add_ps(vz3, vz12); |
1602 |
+ |
1603 |
+ vz5 = _mm_sub_ps(vz1, vz11); |
1604 |
+ vz1 = _mm_add_ps(vz1, vz11); |
1605 |
+ |
1606 |
+ vz6 = _mm_sub_ps(vz2, vz13); |
1607 |
+ vz2 = _mm_add_ps(vz2, vz13); |
1608 |
+ |
1609 |
+ _mm_storeu_ps(out, vz0); |
1610 |
+ _mm_storeu_ps(out + 4, vz1); |
1611 |
+ _mm_storeu_ps(out + 8, vz2); |
1612 |
+ _mm_storeu_ps(out + 12, vz3); |
1613 |
+ _mm_storeu_ps(out + 16, vz4); |
1614 |
+ _mm_storeu_ps(out + 20, vz5); |
1615 |
+ _mm_storeu_ps(out + 24, vz6); |
1616 |
+ _mm_storeu_ps(out + 28, vz7); |
1617 |
+} |
1618 |
+ |
1619 |
+static av_always_inline |
1620 |
+void pass_e2k(FFTComplex *z, const FFTSample *wre, unsigned int n) |
1621 |
+{ |
1622 |
+ int i1 = n * 4, i2 = n * 8, i3 = n * 12; |
1623 |
+ FFTSample *out = (FFTSample*)z; |
1624 |
+ const FFTSample *wim = wre + n * 2; |
1625 |
+ vec_f v0, v1, v2, v3; |
1626 |
+ vec_f v4, v5, v6, v7; |
1627 |
+ vec_f v8, v9, v10, v11; |
1628 |
+ vec_f v12, v13; |
1629 |
+ |
1630 |
+ n = n - 2; |
1631 |
+ |
1632 |
+ v8 = _mm_loadu_ps(wre); |
1633 |
+#if 0 |
1634 |
+ v9 = _mm_loadu_ps(wim - 3); |
1635 |
+#else |
1636 |
+ v10 = _mm_loadu_ps(wim); |
1637 |
+ v9 = _mm_loadu_ps(wim - 4); |
1638 |
+ v9 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(v10), _mm_castps_si128(v9), 4)); |
1639 |
+#endif |
1640 |
+ v9 = _mm_shuffle_ps(v9, v9, 0x1b); |
1641 |
+ |
1642 |
+ v4 = _mm_loadu_ps(out + i2); |
1643 |
+ v5 = _mm_loadu_ps(out + i2 + 4); |
1644 |
+ v6 = _mm_loadu_ps(out + i3); |
1645 |
+ v7 = _mm_loadu_ps(out + i3 + 4); |
1646 |
+ v10 = _mm_mul_ps(v4, v8); // r2*wre |
1647 |
+ v11 = _mm_mul_ps(v5, v8); // i2*wre |
1648 |
+ v12 = _mm_mul_ps(v6, v8); // r3*wre |
1649 |
+ v13 = _mm_mul_ps(v7, v8); // i3*wre |
1650 |
+ |
1651 |
+ v0 = _mm_loadu_ps(out); // r0 |
1652 |
+ v3 = _mm_loadu_ps(out + i1 + 4); // i1 |
1653 |
+ v10 = _mm_madd_ps(v5, v9, v10); // r2*wim |
1654 |
+ v11 = _mm_nmsub_ps(v4, v9, v11); // i2*wim |
1655 |
+ v12 = _mm_nmsub_ps(v7, v9, v12); // r3*wim |
1656 |
+ v13 = _mm_madd_ps(v6, v9, v13); // i3*wim |
1657 |
+ |
1658 |
+ v1 = _mm_loadu_ps(out + 4); // i0 |
1659 |
+ v2 = _mm_loadu_ps(out + i1); // r1 |
1660 |
+ v8 = _mm_sub_ps(v12, v10); |
1661 |
+ v12 = _mm_add_ps(v12, v10); |
1662 |
+ v9 = _mm_sub_ps(v11, v13); |
1663 |
+ v13 = _mm_add_ps(v11, v13); |
1664 |
+ v4 = _mm_sub_ps(v0, v12); |
1665 |
+ v0 = _mm_add_ps(v0, v12); |
1666 |
+ v7 = _mm_sub_ps(v3, v8); |
1667 |
+ v3 = _mm_add_ps(v3, v8); |
1668 |
+ |
1669 |
+ _mm_storeu_ps(out, v0); // r0 |
1670 |
+ _mm_storeu_ps(out + i1 + 4, v3); // i1 |
1671 |
+ _mm_storeu_ps(out + i2, v4); // r2 |
1672 |
+ _mm_storeu_ps(out + i3 + 4, v7);// i3 |
1673 |
+ |
1674 |
+ v5 = _mm_sub_ps(v1, v13); |
1675 |
+ v1 = _mm_add_ps(v1, v13); |
1676 |
+ v6 = _mm_sub_ps(v2, v9); |
1677 |
+ v2 = _mm_add_ps(v2, v9); |
1678 |
+ |
1679 |
+ _mm_storeu_ps(out + 4, v1); // i0 |
1680 |
+ _mm_storeu_ps(out + i1, v2); // r1 |
1681 |
+ _mm_storeu_ps(out + i2 + 4, v5); // i2 |
1682 |
+ _mm_storeu_ps(out + i3, v6); // r3 |
1683 |
+ |
1684 |
+ do { |
1685 |
+ out += 8; |
1686 |
+ wre += 4; |
1687 |
+ wim -= 4; |
1688 |
+ |
1689 |
+ v8 = _mm_loadu_ps(wre); |
1690 |
+#if 0 |
1691 |
+ v9 = _mm_loadu_ps(wim - 3); |
1692 |
+#else |
1693 |
+ v10 = _mm_loadu_ps(wim); |
1694 |
+ v9 = _mm_loadu_ps(wim - 4); |
1695 |
+ v9 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(v10), _mm_castps_si128(v9), 4)); |
1696 |
+#endif |
1697 |
+ v9 = _mm_shuffle_ps(v9, v9, 0x1b); |
1698 |
+ |
1699 |
+ v4 = _mm_loadu_ps(out + i2); // r2 |
1700 |
+ v5 = _mm_loadu_ps(out + i2 + 4); // i2 |
1701 |
+ v6 = _mm_loadu_ps(out + i3); // r3 |
1702 |
+ v7 = _mm_loadu_ps(out + i3 + 4);// i3 |
1703 |
+ v10 = _mm_mul_ps(v4, v8); // r2*wre |
1704 |
+ v11 = _mm_mul_ps(v5, v8); // i2*wre |
1705 |
+ v12 = _mm_mul_ps(v6, v8); // r3*wre |
1706 |
+ v13 = _mm_mul_ps(v7, v8); // i3*wre |
1707 |
+ |
1708 |
+ v0 = _mm_loadu_ps(out); // r0 |
1709 |
+ v3 = _mm_loadu_ps(out + i1 + 4); // i1 |
1710 |
+ v10 = _mm_madd_ps(v5, v9, v10); // r2*wim |
1711 |
+ v11 = _mm_nmsub_ps(v4, v9, v11); // i2*wim |
1712 |
+ v12 = _mm_nmsub_ps(v7, v9, v12); // r3*wim |
1713 |
+ v13 = _mm_madd_ps(v6, v9, v13); // i3*wim |
1714 |
+ |
1715 |
+ v1 = _mm_loadu_ps(out + 4); // i0 |
1716 |
+ v2 = _mm_loadu_ps(out + i1); // r1 |
1717 |
+ v8 = _mm_sub_ps(v12, v10); |
1718 |
+ v12 = _mm_add_ps(v12, v10); |
1719 |
+ v9 = _mm_sub_ps(v11, v13); |
1720 |
+ v13 = _mm_add_ps(v11, v13); |
1721 |
+ v4 = _mm_sub_ps(v0, v12); |
1722 |
+ v0 = _mm_add_ps(v0, v12); |
1723 |
+ v7 = _mm_sub_ps(v3, v8); |
1724 |
+ v3 = _mm_add_ps(v3, v8); |
1725 |
+ |
1726 |
+ _mm_storeu_ps(out, v0); // r0 |
1727 |
+ _mm_storeu_ps(out + i1 + 4, v3); // i1 |
1728 |
+ _mm_storeu_ps(out + i2, v4); // r2 |
1729 |
+ _mm_storeu_ps(out + i3 + 4, v7); // i3 |
1730 |
+ |
1731 |
+ v5 = _mm_sub_ps(v1, v13); |
1732 |
+ v1 = _mm_add_ps(v1, v13); |
1733 |
+ v6 = _mm_sub_ps(v2, v9); |
1734 |
+ v2 = _mm_add_ps(v2, v9); |
1735 |
+ |
1736 |
+ _mm_storeu_ps(out + 4, v1); // i0 |
1737 |
+ _mm_storeu_ps(out + i1, v2); // r1 |
1738 |
+ _mm_storeu_ps(out + i2 + 4, v5); // i2 |
1739 |
+ _mm_storeu_ps(out + i3, v6); // r3 |
1740 |
+ } while (n -= 2); |
1741 |
+} |
1742 |
+ |
1743 |
+static void fft32_e2k_interleave(FFTComplex *z) |
1744 |
+{ |
1745 |
+ fft16_e2k_interleave(z); |
1746 |
+ fft8_e2k_interleave(z+16); |
1747 |
+ fft8_e2k_interleave(z+24); |
1748 |
+ pass_e2k_interleave(z,ff_cos_32,4); |
1749 |
+} |
1750 |
+ |
1751 |
+static void fft64_e2k_interleave(FFTComplex *z) |
1752 |
+{ |
1753 |
+ fft32_e2k_interleave(z); |
1754 |
+ fft16_e2k_interleave(z+32); |
1755 |
+ fft16_e2k_interleave(z+48); |
1756 |
+ pass_e2k_interleave(z,ff_cos_64, 8); |
1757 |
+} |
1758 |
+ |
1759 |
+static void fft128_e2k_interleave(FFTComplex *z) |
1760 |
+{ |
1761 |
+ fft64_e2k_interleave(z); |
1762 |
+ fft32_e2k_interleave(z+64); |
1763 |
+ fft32_e2k_interleave(z+96); |
1764 |
+ pass_e2k_interleave(z,ff_cos_128,16); |
1765 |
+} |
1766 |
+ |
1767 |
+static void fft256_e2k_interleave(FFTComplex *z) |
1768 |
+{ |
1769 |
+ fft128_e2k_interleave(z); |
1770 |
+ fft64_e2k_interleave(z+128); |
1771 |
+ fft64_e2k_interleave(z+192); |
1772 |
+ pass_e2k_interleave(z,ff_cos_256,32); |
1773 |
+} |
1774 |
+ |
1775 |
+static void fft512_e2k_interleave(FFTComplex *z) |
1776 |
+{ |
1777 |
+ fft256_e2k_interleave(z); |
1778 |
+ fft128_e2k_interleave(z+256); |
1779 |
+ fft128_e2k_interleave(z+384); |
1780 |
+ pass_e2k_interleave(z,ff_cos_512,64); |
1781 |
+} |
1782 |
+ |
1783 |
+static void fft1024_e2k_interleave(FFTComplex *z) |
1784 |
+{ |
1785 |
+ fft512_e2k_interleave(z); |
1786 |
+ fft256_e2k_interleave(z+512); |
1787 |
+ fft256_e2k_interleave(z+768); |
1788 |
+ pass_e2k_interleave(z,ff_cos_1024,128); |
1789 |
+} |
1790 |
+ |
1791 |
+static void fft2048_e2k_interleave(FFTComplex *z) |
1792 |
+{ |
1793 |
+ fft1024_e2k_interleave(z); |
1794 |
+ fft512_e2k_interleave(z+1024); |
1795 |
+ fft512_e2k_interleave(z+1536); |
1796 |
+ pass_e2k_interleave(z,ff_cos_2048,256); |
1797 |
+} |
1798 |
+ |
1799 |
+static void fft4096_e2k_interleave(FFTComplex *z) |
1800 |
+{ |
1801 |
+ fft2048_e2k_interleave(z); |
1802 |
+ fft1024_e2k_interleave(z+2048); |
1803 |
+ fft1024_e2k_interleave(z+3072); |
1804 |
+ pass_e2k_interleave(z,ff_cos_4096, 512); |
1805 |
+} |
1806 |
+ |
1807 |
+static void fft8192_e2k_interleave(FFTComplex *z) |
1808 |
+{ |
1809 |
+ fft4096_e2k_interleave(z); |
1810 |
+ fft2048_e2k_interleave(z+4096); |
1811 |
+ fft2048_e2k_interleave(z+6144); |
1812 |
+ pass_e2k_interleave(z,ff_cos_8192,1024); |
1813 |
+} |
1814 |
+ |
1815 |
+static void fft16384_e2k_interleave(FFTComplex *z) |
1816 |
+{ |
1817 |
+ fft8192_e2k_interleave(z); |
1818 |
+ fft4096_e2k_interleave(z+8192); |
1819 |
+ fft4096_e2k_interleave(z+12288); |
1820 |
+ pass_e2k_interleave(z,ff_cos_16384,2048); |
1821 |
+} |
1822 |
+ |
1823 |
+static void fft32768_e2k_interleave(FFTComplex *z) |
1824 |
+{ |
1825 |
+ fft16384_e2k_interleave(z); |
1826 |
+ fft8192_e2k_interleave(z+16384); |
1827 |
+ fft8192_e2k_interleave(z+24576); |
1828 |
+ pass_e2k_interleave(z,ff_cos_32768,4096); |
1829 |
+} |
1830 |
+ |
1831 |
+static void fft65536_e2k_interleave(FFTComplex *z) |
1832 |
+{ |
1833 |
+ fft32768_e2k_interleave(z); |
1834 |
+ fft16384_e2k_interleave(z+32768); |
1835 |
+ fft16384_e2k_interleave(z+49152); |
1836 |
+ pass_e2k_interleave(z,ff_cos_65536,8192); |
1837 |
+} |
1838 |
+ |
1839 |
+static void fft32_e2k(FFTComplex *z) |
1840 |
+{ |
1841 |
+ fft16_e2k(z); |
1842 |
+ fft8_e2k(z+16); |
1843 |
+ fft8_e2k(z+24); |
1844 |
+ pass_e2k(z,ff_cos_32,4); |
1845 |
+} |
1846 |
+ |
1847 |
+static void fft64_e2k(FFTComplex *z) |
1848 |
+{ |
1849 |
+ fft32_e2k(z); |
1850 |
+ fft16_e2k(z+32); |
1851 |
+ fft16_e2k(z+48); |
1852 |
+ pass_e2k(z,ff_cos_64, 8); |
1853 |
+} |
1854 |
+ |
1855 |
+static void fft128_e2k(FFTComplex *z) |
1856 |
+{ |
1857 |
+ fft64_e2k(z); |
1858 |
+ fft32_e2k(z+64); |
1859 |
+ fft32_e2k(z+96); |
1860 |
+ pass_e2k(z,ff_cos_128,16); |
1861 |
+} |
1862 |
+ |
1863 |
+static void fft256_e2k(FFTComplex *z) |
1864 |
+{ |
1865 |
+ fft128_e2k(z); |
1866 |
+ fft64_e2k(z+128); |
1867 |
+ fft64_e2k(z+192); |
1868 |
+ pass_e2k(z,ff_cos_256,32); |
1869 |
+} |
1870 |
+ |
1871 |
+static void fft512_e2k(FFTComplex *z) |
1872 |
+{ |
1873 |
+ fft256_e2k(z); |
1874 |
+ fft128_e2k(z+256); |
1875 |
+ fft128_e2k(z+384); |
1876 |
+ pass_e2k(z,ff_cos_512,64); |
1877 |
+} |
1878 |
+ |
1879 |
+static void fft1024_e2k(FFTComplex *z) |
1880 |
+{ |
1881 |
+ fft512_e2k(z); |
1882 |
+ fft256_e2k(z+512); |
1883 |
+ fft256_e2k(z+768); |
1884 |
+ pass_e2k(z,ff_cos_1024,128); |
1885 |
+ |
1886 |
+} |
1887 |
+ |
1888 |
+static void fft2048_e2k(FFTComplex *z) |
1889 |
+{ |
1890 |
+ fft1024_e2k(z); |
1891 |
+ fft512_e2k(z+1024); |
1892 |
+ fft512_e2k(z+1536); |
1893 |
+ pass_e2k(z,ff_cos_2048,256); |
1894 |
+} |
1895 |
+ |
1896 |
+static void fft4096_e2k(FFTComplex *z) |
1897 |
+{ |
1898 |
+ fft2048_e2k(z); |
1899 |
+ fft1024_e2k(z+2048); |
1900 |
+ fft1024_e2k(z+3072); |
1901 |
+ pass_e2k(z,ff_cos_4096, 512); |
1902 |
+} |
1903 |
+ |
1904 |
+static void fft8192_e2k(FFTComplex *z) |
1905 |
+{ |
1906 |
+ fft4096_e2k(z); |
1907 |
+ fft2048_e2k(z+4096); |
1908 |
+ fft2048_e2k(z+6144); |
1909 |
+ pass_e2k(z,ff_cos_8192,1024); |
1910 |
+} |
1911 |
+ |
1912 |
+static void fft16384_e2k(FFTComplex *z) |
1913 |
+{ |
1914 |
+ fft8192_e2k(z); |
1915 |
+ fft4096_e2k(z+8192); |
1916 |
+ fft4096_e2k(z+12288); |
1917 |
+ pass_e2k(z,ff_cos_16384,2048); |
1918 |
+} |
1919 |
+ |
1920 |
+static void fft32768_e2k(FFTComplex *z) |
1921 |
+{ |
1922 |
+ fft16384_e2k(z); |
1923 |
+ fft8192_e2k(z+16384); |
1924 |
+ fft8192_e2k(z+24576); |
1925 |
+ pass_e2k(z,ff_cos_32768,4096); |
1926 |
+} |
1927 |
+ |
1928 |
+static void fft65536_e2k(FFTComplex *z) |
1929 |
+{ |
1930 |
+ fft32768_e2k(z); |
1931 |
+ fft16384_e2k(z+32768); |
1932 |
+ fft16384_e2k(z+49152); |
1933 |
+ pass_e2k(z,ff_cos_65536,8192); |
1934 |
+} |
1935 |
+ |
1936 |
+static void (* const fft_dispatch_e2k[])(FFTComplex*) = { |
1937 |
+ fft4_e2k, fft8_e2k, fft16_e2k, fft32_e2k, fft64_e2k, fft128_e2k, fft256_e2k, fft512_e2k, fft1024_e2k, |
1938 |
+ fft2048_e2k, fft4096_e2k, fft8192_e2k, fft16384_e2k, fft32768_e2k, fft65536_e2k, |
1939 |
+}; |
1940 |
+ |
1941 |
+static void (* const fft_dispatch_e2k_interleave[])(FFTComplex*) = { |
1942 |
+ fft4_e2k_interleave, fft8_e2k_interleave, fft16_e2k_interleave, fft32_e2k_interleave, fft64_e2k_interleave, |
1943 |
+ fft128_e2k_interleave, fft256_e2k_interleave, fft512_e2k_interleave, fft1024_e2k_interleave, |
1944 |
+ fft2048_e2k_interleave, fft4096_e2k_interleave, fft8192_e2k_interleave, fft16384_e2k_interleave, fft32768_e2k_interleave, fft65536_e2k_interleave, |
1945 |
+}; |
1946 |
+ |
1947 |
+void ff_fft_calc_interleave_e2k(FFTContext *s, FFTComplex *z) |
1948 |
+{ |
1949 |
+ fft_dispatch_e2k_interleave[s->nbits-2](z); |
1950 |
+} |
1951 |
+ |
1952 |
+void ff_fft_calc_e2k(FFTContext *s, FFTComplex *z) |
1953 |
+{ |
1954 |
+ fft_dispatch_e2k[s->nbits-2](z); |
1955 |
+} |
1956 |
+ |
1957 |
diff --git a/libavcodec/e2k/fft.h b/libavcodec/e2k/fft.h |
1958 |
new file mode 100644 |
1959 |
index 0000000..62ae2f3 |
1960 |
--- /dev/null |
1961 |
+++ b/libavcodec/e2k/fft.h |
1962 |
@@ -0,0 +1,29 @@ |
1963 |
+/* |
1964 |
+ * This file is part of FFmpeg. |
1965 |
+ * |
1966 |
+ * FFmpeg is free software; you can redistribute it and/or |
1967 |
+ * modify it under the terms of the GNU Lesser General Public |
1968 |
+ * License as published by the Free Software Foundation; either |
1969 |
+ * version 2.1 of the License, or (at your option) any later version. |
1970 |
+ * |
1971 |
+ * FFmpeg is distributed in the hope that it will be useful, |
1972 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
1973 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
1974 |
+ * Lesser General Public License for more details. |
1975 |
+ * |
1976 |
+ * You should have received a copy of the GNU Lesser General Public |
1977 |
+ * License along with FFmpeg; if not, write to the Free Software |
1978 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1979 |
+ */ |
1980 |
+ |
1981 |
+#ifndef AVCODEC_E2K_FFT_H |
1982 |
+#define AVCODEC_E2K_FFT_H |
1983 |
+ |
1984 |
+#include "config.h" |
1985 |
+#include "libavcodec/fft.h" |
1986 |
+#include "libavcodec/fft-internal.h" |
1987 |
+ |
1988 |
+void ff_fft_calc_interleave_e2k(FFTContext *s, FFTComplex *z); |
1989 |
+void ff_fft_calc_e2k(FFTContext *s, FFTComplex *z); |
1990 |
+ |
1991 |
+#endif /* AVCODEC_E2K_FFT_H */ |
1992 |
diff --git a/libavcodec/e2k/fft_init.c b/libavcodec/e2k/fft_init.c |
1993 |
new file mode 100644 |
1994 |
index 0000000..116236d |
1995 |
--- /dev/null |
1996 |
+++ b/libavcodec/e2k/fft_init.c |
1997 |
@@ -0,0 +1,152 @@ |
1998 |
+/* |
1999 |
+ * FFT/IFFT transforms |
2000 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
2001 |
+ * Copyright (c) 2009 Loren Merritt |
2002 |
+ * |
2003 |
+ * This file is part of FFmpeg. |
2004 |
+ * |
2005 |
+ * FFmpeg is free software; you can redistribute it and/or |
2006 |
+ * modify it under the terms of the GNU Lesser General Public |
2007 |
+ * License as published by the Free Software Foundation; either |
2008 |
+ * version 2.1 of the License, or (at your option) any later version. |
2009 |
+ * |
2010 |
+ * FFmpeg is distributed in the hope that it will be useful, |
2011 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
2012 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2013 |
+ * Lesser General Public License for more details. |
2014 |
+ * |
2015 |
+ * You should have received a copy of the GNU Lesser General Public |
2016 |
+ * License along with FFmpeg; if not, write to the Free Software |
2017 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2018 |
+ */ |
2019 |
+ |
2020 |
+#include "config.h" |
2021 |
+#include "libavutil/cpu.h" |
2022 |
+#include "libavutil/e2k/cpu.h" |
2023 |
+#include "libavutil/e2k/util_e2k.h" |
2024 |
+#include "libavcodec/fft.h" |
2025 |
+ |
2026 |
+#include "fft.h" |
2027 |
+ |
2028 |
+/** |
2029 |
+ * Do a complex FFT with the parameters defined in ff_fft_init(). |
2030 |
+ * The input data must be permuted before with s->revtab table. |
2031 |
+ * No 1.0 / sqrt(n) normalization is done. |
2032 |
+ * |
2033 |
+ * This code assumes that the 'z' pointer is 16 bytes-aligned. |
2034 |
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. |
2035 |
+ */ |
2036 |
+ |
2037 |
+static void imdct_half_e2k(FFTContext *s, FFTSample *output, const FFTSample *input) |
2038 |
+{ |
2039 |
+ int j, k; |
2040 |
+ int n = 1 << s->mdct_bits; |
2041 |
+ int n4 = n >> 2; |
2042 |
+ int n8 = n >> 3; |
2043 |
+ int n32 = n >> 5; |
2044 |
+ const uint16_t *revtabj = s->revtab; |
2045 |
+ const uint16_t *revtabk = s->revtab + n4; |
2046 |
+ const vec_f *tcos = (const vec_f*)(s->tcos + n8); |
2047 |
+ const vec_f *tsin = (const vec_f*)(s->tsin + n8); |
2048 |
+ const vec_f *pin = (const vec_f*)(input + n4); |
2049 |
+ vec_f *pout = (vec_f*)(output + n4); |
2050 |
+ |
2051 |
+ /* pre rotation */ |
2052 |
+ k = n32 - 1; |
2053 |
+ do { |
2054 |
+ vec_f cos, sin, cos0, sin0, cos1, sin1; |
2055 |
+ vec_f re, im, r0, i0, r1, i1, a, b; |
2056 |
+#define CMULA(p, perm) \ |
2057 |
+ a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */ \ |
2058 |
+ b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */ \ |
2059 |
+ re = _mm_shuffle_ps(a, b, 0x88); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */ \ |
2060 |
+ im = _mm_shuffle_ps(b, a, 0x77); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */ \ |
2061 |
+ cos = _mm_shuffle_ps(cos0, cos1, perm); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */ \ |
2062 |
+ sin = _mm_shuffle_ps(sin0, sin1, perm); \ |
2063 |
+ r##p = _mm_sub_ps(_mm_mul_ps(im, cos), _mm_mul_ps(re, sin)); \ |
2064 |
+ i##p = _mm_add_ps(_mm_mul_ps(re, cos), _mm_mul_ps(im, sin)); |
2065 |
+#define STORE2(L, v, dst) \ |
2066 |
+ VEC_ST##L(output + dst * 2, _mm_castps_si128(v)); |
2067 |
+#define STORE8(p) \ |
2068 |
+ a = _mm_unpacklo_ps(r##p, i##p); \ |
2069 |
+ b = _mm_unpackhi_ps(r##p, i##p); \ |
2070 |
+ STORE2(L, a, revtabk[ p*2-4]); \ |
2071 |
+ STORE2(H, a, revtabk[ p*2-3]); \ |
2072 |
+ STORE2(L, b, revtabj[-p*2+2]); \ |
2073 |
+ STORE2(H, b, revtabj[-p*2+3]); |
2074 |
+ |
2075 |
+ cos0 = tcos[k]; |
2076 |
+ sin0 = tsin[k]; |
2077 |
+ cos1 = tcos[-k-1]; |
2078 |
+ sin1 = tsin[-k-1]; |
2079 |
+ CMULA(0, 0xe4); |
2080 |
+ CMULA(1, 0x4e); |
2081 |
+ STORE8(0); |
2082 |
+ STORE8(1); |
2083 |
+ revtabj += 4; |
2084 |
+ revtabk -= 4; |
2085 |
+ k--; |
2086 |
+ } while (k >= 0); |
2087 |
+ |
2088 |
+ ff_fft_calc_e2k(s, (FFTComplex*)output); |
2089 |
+ |
2090 |
+ /* post rotation + reordering */ |
2091 |
+ j = -n32; |
2092 |
+ k = n32 - 1; |
2093 |
+ do { |
2094 |
+ vec_f cos, sin, re, im, a, b, c, d; |
2095 |
+#define CMULB(d0, d1, o) \ |
2096 |
+ re = pout[o*2]; im = pout[o*2+1]; \ |
2097 |
+ cos = tcos[o]; sin = tsin[o]; \ |
2098 |
+ d0 = _mm_sub_ps(_mm_mul_ps(im, sin), _mm_mul_ps(re, cos)); \ |
2099 |
+ d1 = _mm_add_ps(_mm_mul_ps(re, sin), _mm_mul_ps(im, cos)); |
2100 |
+ |
2101 |
+ CMULB(a, b, j); |
2102 |
+ CMULB(c, d, k); |
2103 |
+ d = _mm_shuffle_ps(d, d, 0x1b); |
2104 |
+ b = _mm_shuffle_ps(b, b, 0x1b); |
2105 |
+ pout[2*j] = _mm_unpacklo_ps(a, d); |
2106 |
+ pout[2*j+1] = _mm_unpackhi_ps(a, d); |
2107 |
+ pout[2*k] = _mm_unpacklo_ps(c, b); |
2108 |
+ pout[2*k+1] = _mm_unpackhi_ps(c, b); |
2109 |
+ j++; |
2110 |
+ k--; |
2111 |
+ } while (k >= 0); |
2112 |
+} |
2113 |
+ |
2114 |
+static void imdct_calc_e2k(FFTContext *s, FFTSample *output, const FFTSample *input) |
2115 |
+{ |
2116 |
+ int k; |
2117 |
+ int n = 1 << s->mdct_bits; |
2118 |
+ int n4 = n >> 2; |
2119 |
+ int n16 = n >> 4; |
2120 |
+ vec_u32 sign = _mm_set1_epi32(-1 << 31); |
2121 |
+ vec_u32 *p0 = (vec_u32*)(output + n4); |
2122 |
+ vec_u32 *p1 = (vec_u32*)(output + n4 * 3); |
2123 |
+ |
2124 |
+ imdct_half_e2k(s, output + n4, input); |
2125 |
+ |
2126 |
+ for (k = 0; k < n16; k++) { |
2127 |
+ vec_u32 a = p0[k] ^ sign; |
2128 |
+ vec_u32 b = p1[-1 - k]; |
2129 |
+ p0[-1 - k] = _mm_shuffle_epi32(a, 0x1b); |
2130 |
+ p1[k] = _mm_shuffle_epi32(b, 0x1b); |
2131 |
+ } |
2132 |
+} |
2133 |
+ |
2134 |
+av_cold void ff_fft_init_e2k(FFTContext *s) |
2135 |
+{ |
2136 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
2137 |
+ return; |
2138 |
+ |
2139 |
+ // !checkasm |
2140 |
+ // libavcodec/tests/fft -n 2..14 [-i] |
2141 |
+ // libavcodec/tests/fft -{m|d|r} -n 4..14 [-i] |
2142 |
+ |
2143 |
+ s->fft_calc = ff_fft_calc_interleave_e2k; |
2144 |
+ |
2145 |
+ if (s->mdct_bits >= 5) { |
2146 |
+ s->imdct_calc = imdct_calc_e2k; |
2147 |
+ s->imdct_half = imdct_half_e2k; |
2148 |
+ } |
2149 |
+} |
2150 |
diff --git a/libavcodec/e2k/fmtconvert.c b/libavcodec/e2k/fmtconvert.c |
2151 |
new file mode 100644 |
2152 |
index 0000000..bfd9cb5 |
2153 |
--- /dev/null |
2154 |
+++ b/libavcodec/e2k/fmtconvert.c |
2155 |
@@ -0,0 +1,55 @@ |
2156 |
+/* |
2157 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
2158 |
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
2159 |
+ * |
2160 |
+ * This file is part of FFmpeg. |
2161 |
+ * |
2162 |
+ * FFmpeg is free software; you can redistribute it and/or |
2163 |
+ * modify it under the terms of the GNU Lesser General Public |
2164 |
+ * License as published by the Free Software Foundation; either |
2165 |
+ * version 2.1 of the License, or (at your option) any later version. |
2166 |
+ * |
2167 |
+ * FFmpeg is distributed in the hope that it will be useful, |
2168 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
2169 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2170 |
+ * Lesser General Public License for more details. |
2171 |
+ * |
2172 |
+ * You should have received a copy of the GNU Lesser General Public |
2173 |
+ * License along with FFmpeg; if not, write to the Free Software |
2174 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2175 |
+ */ |
2176 |
+ |
2177 |
+#include "config.h" |
2178 |
+#include "libavutil/attributes.h" |
2179 |
+#include "libavutil/cpu.h" |
2180 |
+#include "libavutil/mem.h" |
2181 |
+#include "libavutil/e2k/cpu.h" |
2182 |
+#include "libavutil/e2k/util_e2k.h" |
2183 |
+#include "libavcodec/fmtconvert.h" |
2184 |
+ |
2185 |
+static void int32_to_float_fmul_scalar_e2k(float *dst, const int32_t *src, |
2186 |
+ float mul, int len) |
2187 |
+{ |
2188 |
+ int i; |
2189 |
+ __m128 src1, src2, dst1, dst2, mul_v; |
2190 |
+ mul_v = _mm_set1_ps(mul); |
2191 |
+ |
2192 |
+ PRAGMA_E2K("ivdep") |
2193 |
+ for (i = 0; i < len; i += 8) { |
2194 |
+ src1 = _mm_cvtepi32_ps(VEC_LD(src + i)); |
2195 |
+ src2 = _mm_cvtepi32_ps(VEC_LD(src + i + 4)); |
2196 |
+ dst1 = _mm_mul_ps(src1, mul_v); |
2197 |
+ dst2 = _mm_mul_ps(src2, mul_v); |
2198 |
+ _mm_storeu_ps(dst + i, dst1); |
2199 |
+ _mm_storeu_ps(dst + i + 4, dst2); |
2200 |
+ } |
2201 |
+} |
2202 |
+ |
2203 |
+av_cold void ff_fmt_convert_init_e2k(FmtConvertContext *c, |
2204 |
+ AVCodecContext *avctx) |
2205 |
+{ |
2206 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
2207 |
+ return; |
2208 |
+ |
2209 |
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_e2k; |
2210 |
+} |
2211 |
diff --git a/libavcodec/e2k/h264chroma.c b/libavcodec/e2k/h264chroma.c |
2212 |
new file mode 100644 |
2213 |
index 0000000..802a26c |
2214 |
--- /dev/null |
2215 |
+++ b/libavcodec/e2k/h264chroma.c |
2216 |
@@ -0,0 +1,63 @@ |
2217 |
+/* |
2218 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
2219 |
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
2220 |
+ * |
2221 |
+ * This file is part of FFmpeg. |
2222 |
+ * |
2223 |
+ * FFmpeg is free software; you can redistribute it and/or |
2224 |
+ * modify it under the terms of the GNU Lesser General Public |
2225 |
+ * License as published by the Free Software Foundation; either |
2226 |
+ * version 2.1 of the License, or (at your option) any later version. |
2227 |
+ * |
2228 |
+ * FFmpeg is distributed in the hope that it will be useful, |
2229 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
2230 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2231 |
+ * Lesser General Public License for more details. |
2232 |
+ * |
2233 |
+ * You should have received a copy of the GNU Lesser General Public |
2234 |
+ * License along with FFmpeg; if not, write to the Free Software |
2235 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2236 |
+ */ |
2237 |
+ |
2238 |
+#include "config.h" |
2239 |
+ |
2240 |
+#include "libavutil/attributes.h" |
2241 |
+#include "libavutil/cpu.h" |
2242 |
+#include "libavutil/intreadwrite.h" |
2243 |
+#include "libavutil/e2k/cpu.h" |
2244 |
+#include "libavutil/e2k/util_e2k.h" |
2245 |
+ |
2246 |
+#include "libavcodec/h264chroma.h" |
2247 |
+ |
2248 |
+#define PUT_OP_U8_E2K(d, s, dst) d = s |
2249 |
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s) |
2250 |
+ |
2251 |
+#define OP_U8_E2K PUT_OP_U8_E2K |
2252 |
+#define PREFIX_h264_chroma_mc8_e2k put_h264_chroma_mc8_e2k |
2253 |
+#define PREFIX_h264_chroma_mc8_num e2k_put_h264_chroma_mc8_num |
2254 |
+#include "h264chroma_template.c" |
2255 |
+#undef OP_U8_E2K |
2256 |
+#undef PREFIX_h264_chroma_mc8_e2k |
2257 |
+#undef PREFIX_h264_chroma_mc8_num |
2258 |
+ |
2259 |
+#define OP_U8_E2K AVG_OP_U8_E2K |
2260 |
+#define PREFIX_h264_chroma_mc8_e2k avg_h264_chroma_mc8_e2k |
2261 |
+#define PREFIX_h264_chroma_mc8_num e2k_avg_h264_chroma_mc8_num |
2262 |
+#include "h264chroma_template.c" |
2263 |
+#undef OP_U8_E2K |
2264 |
+#undef PREFIX_h264_chroma_mc8_e2k |
2265 |
+#undef PREFIX_h264_chroma_mc8_num |
2266 |
+ |
2267 |
+av_cold void ff_h264chroma_init_e2k(H264ChromaContext *c, int bit_depth) |
2268 |
+{ |
2269 |
+ const int high_bit_depth = bit_depth > 8; |
2270 |
+ |
2271 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
2272 |
+ return; |
2273 |
+ |
2274 |
+ // !checkasm |
2275 |
+ if (!high_bit_depth) { |
2276 |
+ c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_e2k; |
2277 |
+ c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_e2k; |
2278 |
+ } |
2279 |
+} |
2280 |
diff --git a/libavcodec/e2k/h264chroma_template.c b/libavcodec/e2k/h264chroma_template.c |
2281 |
new file mode 100644 |
2282 |
index 0000000..623e0b2 |
2283 |
--- /dev/null |
2284 |
+++ b/libavcodec/e2k/h264chroma_template.c |
2285 |
@@ -0,0 +1,113 @@ |
2286 |
+/* |
2287 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
2288 |
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
2289 |
+ * |
2290 |
+ * This file is part of FFmpeg. |
2291 |
+ * |
2292 |
+ * FFmpeg is free software; you can redistribute it and/or |
2293 |
+ * modify it under the terms of the GNU Lesser General Public |
2294 |
+ * License as published by the Free Software Foundation; either |
2295 |
+ * version 2.1 of the License, or (at your option) any later version. |
2296 |
+ * |
2297 |
+ * FFmpeg is distributed in the hope that it will be useful, |
2298 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
2299 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2300 |
+ * Lesser General Public License for more details. |
2301 |
+ * |
2302 |
+ * You should have received a copy of the GNU Lesser General Public |
2303 |
+ * License along with FFmpeg; if not, write to the Free Software |
2304 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2305 |
+ */ |
2306 |
+ |
2307 |
+#include "libavutil/mem.h" |
2308 |
+#include "libavutil/e2k/util_e2k.h" |
2309 |
+ |
2310 |
+/* this code assume that stride % 16 == 0 */ |
2311 |
+ |
2312 |
+#define CHROMA_MC8_E2K_CORE() \ |
2313 |
+ v1 = _mm_add_epi16(_mm_maddubs_epi16(v0, vA), bias); \ |
2314 |
+ v1 = _mm_add_epi16(_mm_maddubs_epi16(v2, vB), v1); \ |
2315 |
+ v1 = _mm_srai_epi16(v1, 6); \ |
2316 |
+ v1 = _mm_packus_epi16(v1, v1); \ |
2317 |
+ OP_U8_E2K(v1, v1, VEC_LD8(dst)); \ |
2318 |
+ VEC_STL(dst, v1); dst += stride; \ |
2319 |
+ v0 = v2; |
2320 |
+ |
2321 |
+#define CHROMA_MC8_E2K_CORE_SIMPLE(v1) \ |
2322 |
+ v1 = _mm_add_epi16(_mm_maddubs_epi16(v1, vA), bias); \ |
2323 |
+ v1 = _mm_srai_epi16(v1, 6); \ |
2324 |
+ v1 = _mm_packus_epi16(v1, v1); \ |
2325 |
+ OP_U8_E2K(v1, v1, VEC_LD8(dst)); \ |
2326 |
+ VEC_STL(dst, v1); dst += stride; |
2327 |
+ |
2328 |
+#define GET_VSRC1(v0) v0 = VEC_LD8(src); src += stride; |
2329 |
+#define GET_VSRC2(v0, v1) \ |
2330 |
+ v0 = VEC_LD8(src); \ |
2331 |
+ v1 = VEC_LD8(src + 1); \ |
2332 |
+ v0 = _mm_unpacklo_epi8(v0, v1); src += stride; |
2333 |
+ |
2334 |
+#ifdef PREFIX_h264_chroma_mc8_e2k |
2335 |
+static void PREFIX_h264_chroma_mc8_e2k(uint8_t *dst, uint8_t *src, |
2336 |
+ ptrdiff_t stride, int h, |
2337 |
+ int x, int y) |
2338 |
+{ |
2339 |
+ int i, xm = (8 - x) | x << 8; |
2340 |
+ __m128i vA, vB, bias = _mm_set1_epi16(32); |
2341 |
+ __m128i v0, v1, v2, v3; |
2342 |
+ |
2343 |
+ if (y) { |
2344 |
+ if (x) { |
2345 |
+ vA = _mm_set1_epi16(xm * (8 - y)); |
2346 |
+ vB = _mm_set1_epi16(xm * y); |
2347 |
+ GET_VSRC2(v0, v1); |
2348 |
+ PRAGMA_E2K("ivdep") |
2349 |
+ PRAGMA_E2K("unroll(2)") |
2350 |
+ for (i = 0; i < h; i++) { |
2351 |
+ GET_VSRC2(v2, v3); |
2352 |
+ CHROMA_MC8_E2K_CORE(); |
2353 |
+ } |
2354 |
+ } else { |
2355 |
+ vA = _mm_set1_epi16(((8 - y) | y << 8) * 8); |
2356 |
+ GET_VSRC1(v0); |
2357 |
+ PRAGMA_E2K("ivdep") |
2358 |
+ PRAGMA_E2K("unroll(2)") |
2359 |
+ for (i = 0; i < h; i++) { |
2360 |
+ GET_VSRC1(v2); |
2361 |
+ v1 = _mm_unpacklo_epi8(v0, v2); |
2362 |
+ CHROMA_MC8_E2K_CORE_SIMPLE(v1); |
2363 |
+ v0 = v2; |
2364 |
+ } |
2365 |
+ } |
2366 |
+ } else { |
2367 |
+ vA = _mm_set1_epi16(xm * 8); |
2368 |
+ PRAGMA_E2K("ivdep") |
2369 |
+ PRAGMA_E2K("unroll(2)") |
2370 |
+ for (i = 0; i < h; i++) { |
2371 |
+ GET_VSRC2(v0, v1); |
2372 |
+ CHROMA_MC8_E2K_CORE_SIMPLE(v0); |
2373 |
+ } |
2374 |
+ } |
2375 |
+} |
2376 |
+#endif |
2377 |
+ |
2378 |
+#ifdef PREFIX_no_rnd_vc1_chroma_mc8_e2k |
2379 |
+static void PREFIX_no_rnd_vc1_chroma_mc8_e2k(uint8_t *dst, uint8_t *src, |
2380 |
+ ptrdiff_t stride, int h, |
2381 |
+ int x, int y) |
2382 |
+{ |
2383 |
+ int i, xm = (8 - x) | x << 8; |
2384 |
+ __m128i vA = _mm_set1_epi16(xm * (8 - y)); |
2385 |
+ __m128i vB = _mm_set1_epi16(xm * y); |
2386 |
+ __m128i bias = _mm_set1_epi16(28); |
2387 |
+ __m128i v0, v1, v2, v3; |
2388 |
+ |
2389 |
+ GET_VSRC2(v0, v1); |
2390 |
+ PRAGMA_E2K("ivdep") |
2391 |
+ PRAGMA_E2K("unroll(2)") |
2392 |
+ for (i = 0; i < h; i++) { |
2393 |
+ GET_VSRC2(v2, v3); |
2394 |
+ CHROMA_MC8_E2K_CORE(); |
2395 |
+ } |
2396 |
+} |
2397 |
+#endif |
2398 |
+ |
2399 |
diff --git a/libavcodec/e2k/h264dsp.c b/libavcodec/e2k/h264dsp.c |
2400 |
new file mode 100644 |
2401 |
index 0000000..ff7be7d |
2402 |
--- /dev/null |
2403 |
+++ b/libavcodec/e2k/h264dsp.c |
2404 |
@@ -0,0 +1,820 @@ |
2405 |
+/* |
2406 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
2407 |
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
2408 |
+ * |
2409 |
+ * This file is part of FFmpeg. |
2410 |
+ * |
2411 |
+ * FFmpeg is free software; you can redistribute it and/or |
2412 |
+ * modify it under the terms of the GNU Lesser General Public |
2413 |
+ * License as published by the Free Software Foundation; either |
2414 |
+ * version 2.1 of the License, or (at your option) any later version. |
2415 |
+ * |
2416 |
+ * FFmpeg is distributed in the hope that it will be useful, |
2417 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
2418 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
2419 |
+ * Lesser General Public License for more details. |
2420 |
+ * |
2421 |
+ * You should have received a copy of the GNU Lesser General Public |
2422 |
+ * License along with FFmpeg; if not, write to the Free Software |
2423 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2424 |
+ */ |
2425 |
+ |
2426 |
+#include "config.h" |
2427 |
+ |
2428 |
+#include <stdint.h> |
2429 |
+#include <string.h> |
2430 |
+ |
2431 |
+#include "libavutil/attributes.h" |
2432 |
+#include "libavutil/cpu.h" |
2433 |
+#include "libavutil/intreadwrite.h" |
2434 |
+#include "libavutil/mem.h" |
2435 |
+#include "libavutil/e2k/cpu.h" |
2436 |
+#include "libavutil/e2k/util_e2k.h" |
2437 |
+ |
2438 |
+#include "libavcodec/h264dec.h" |
2439 |
+#include "libavcodec/h264dsp.h" |
2440 |
+ |
2441 |
+/**************************************************************************** |
2442 |
+ * IDCT transform: |
2443 |
+ ****************************************************************************/ |
2444 |
+ |
2445 |
+#define VEC_1D_DCT(vb0, vb1, vb2, vb3, va0, va1, va2, va3) \ |
2446 |
+ /* 1st stage */ \ |
2447 |
+ vz0 = _mm_add_epi16(vb0, vb2); /* temp[0] = Y[0] + Y[2] */ \ |
2448 |
+ vz1 = _mm_sub_epi16(vb0, vb2); /* temp[1] = Y[0] - Y[2] */ \ |
2449 |
+ vz2 = _mm_srai_epi16(vb1, 1); \ |
2450 |
+ vz2 = _mm_sub_epi16(vz2, vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ |
2451 |
+ vz3 = _mm_srai_epi16(vb3, 1); \ |
2452 |
+ vz3 = _mm_add_epi16(vb1, vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ |
2453 |
+ /* 2nd stage: output */ \ |
2454 |
+ va0 = _mm_add_epi16(vz0, vz3); /* x[0] = temp[0] + temp[3] */ \ |
2455 |
+ va1 = _mm_add_epi16(vz1, vz2); /* x[1] = temp[1] + temp[2] */ \ |
2456 |
+ va2 = _mm_sub_epi16(vz1, vz2); /* x[2] = temp[1] - temp[2] */ \ |
2457 |
+ va3 = _mm_sub_epi16(vz0, vz3) /* x[3] = temp[0] - temp[3] */ |
2458 |
+ |
2459 |
+#define VEC_TRANSPOSE_4(a0, a1, a2, a3, b0, b1, b2, b3) \ |
2460 |
+ b0 = _mm_unpacklo_epi16(a0, a0); \ |
2461 |
+ b1 = _mm_unpacklo_epi16(a1, a0); \ |
2462 |
+ b2 = _mm_unpacklo_epi16(a2, a0); \ |
2463 |
+ b3 = _mm_unpacklo_epi16(a3, a0); \ |
2464 |
+ a0 = _mm_unpacklo_epi16(b0, b2); \ |
2465 |
+ a1 = _mm_unpackhi_epi16(b0, b2); \ |
2466 |
+ a2 = _mm_unpacklo_epi16(b1, b3); \ |
2467 |
+ a3 = _mm_unpackhi_epi16(b1, b3); \ |
2468 |
+ b0 = _mm_unpacklo_epi16(a0, a2); \ |
2469 |
+ b1 = _mm_unpackhi_epi16(a0, a2); \ |
2470 |
+ b2 = _mm_unpacklo_epi16(a1, a3); \ |
2471 |
+ b3 = _mm_unpackhi_epi16(a1, a3) |
2472 |
+ |
2473 |
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ |
2474 |
+ va = _mm_srai_epi16(va, 6); \ |
2475 |
+ vdst = _mm_cvtsi32_si128(*(uint32_t*)dst); \ |
2476 |
+ vdst = _mm_unpacklo_epi8(vdst, zerov); \ |
2477 |
+ va = _mm_add_epi16(va, vdst); \ |
2478 |
+ va = _mm_packus_epi16(va, va); \ |
2479 |
+ *(uint32_t*)dst = _mm_extract_epi32(va, 0); \ |
2480 |
+ dst += stride; |
2481 |
+ |
2482 |
+static void h264_idct_add_e2k(uint8_t *dst, int16_t *block, int stride) |
2483 |
+{ |
2484 |
+ vec_s16 va0, va1, va2, va3; |
2485 |
+ vec_s16 vz0, vz1, vz2, vz3; |
2486 |
+ vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; |
2487 |
+ vec_u8 vdst; |
2488 |
+ LOAD_ZERO; |
2489 |
+ |
2490 |
+ block[0] += 32; /* add 32 as a DC-level for rounding */ |
2491 |
+ |
2492 |
+ vtmp0 = VEC_LD(block); |
2493 |
+ vtmp1 = VEC_ALIGNR8(vtmp0, vtmp0); |
2494 |
+ vtmp2 = VEC_LD(block + 8); |
2495 |
+ vtmp3 = VEC_ALIGNR8(vtmp2, vtmp2); |
2496 |
+ VEC_ST(block, zerov); |
2497 |
+ VEC_ST(block + 8, zerov); |
2498 |
+ |
2499 |
+ VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3); |
2500 |
+ VEC_TRANSPOSE_4(va0, va1, va2, va3, vtmp0, vtmp1, vtmp2, vtmp3); |
2501 |
+ VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3); |
2502 |
+ |
2503 |
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va0); |
2504 |
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va1); |
2505 |
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va2); |
2506 |
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va3); |
2507 |
+} |
2508 |
+ |
2509 |
+#define IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) { \ |
2510 |
+ /* a0 = SRC(0) + SRC(4); */ \ |
2511 |
+ /* a2 = SRC(0) - SRC(4); */ \ |
2512 |
+ /* a4 = (SRC(2)>>1) - SRC(6); */ \ |
2513 |
+ /* a6 = (SRC(6)>>1) + SRC(2); */ \ |
2514 |
+ vec_s16 a0v = _mm_add_epi16(s0, s4); \ |
2515 |
+ vec_s16 a2v = _mm_sub_epi16(s0, s4); \ |
2516 |
+ vec_s16 a4v = _mm_sub_epi16(_mm_srai_epi16(s2, 1), s6); \ |
2517 |
+ vec_s16 a6v = _mm_add_epi16(_mm_srai_epi16(s6, 1), s2); \ |
2518 |
+ /* b0 = a0 + a6; */ \ |
2519 |
+ /* b2 = a2 + a4; */ \ |
2520 |
+ /* b4 = a2 - a4; */ \ |
2521 |
+ /* b6 = a0 - a6; */ \ |
2522 |
+ vec_s16 b0v = _mm_add_epi16(a0v, a6v); \ |
2523 |
+ vec_s16 b2v = _mm_add_epi16(a2v, a4v); \ |
2524 |
+ vec_s16 b4v = _mm_sub_epi16(a2v, a4v); \ |
2525 |
+ vec_s16 b6v = _mm_sub_epi16(a0v, a6v); \ |
2526 |
+ /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ |
2527 |
+ /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ |
2528 |
+ /* a5 = (SRC(7)-SRC(1)) + (SRC(5) + (SRC(5)>>1)); */ \ |
2529 |
+ /* a7 = (SRC(5)+SRC(3)) + (SRC(1) + (SRC(1)>>1)); */ \ |
2530 |
+ vec_s16 a1v = _mm_sub_epi16(_mm_sub_epi16(s5, s3), _mm_add_epi16(s7, _mm_srai_epi16(s7, 1))); \ |
2531 |
+ vec_s16 a3v = _mm_sub_epi16(_mm_add_epi16(s7, s1), _mm_add_epi16(s3, _mm_srai_epi16(s3, 1))); \ |
2532 |
+ vec_s16 a5v = _mm_add_epi16(_mm_sub_epi16(s7, s1), _mm_add_epi16(s5, _mm_srai_epi16(s5, 1))); \ |
2533 |
+ vec_s16 a7v = _mm_add_epi16(_mm_add_epi16(s5, s3), _mm_add_epi16(s1, _mm_srai_epi16(s1, 1))); \ |
2534 |
+ /* b1 = (a7>>2) + a1; */ \ |
2535 |
+ /* b3 = a3 + (a5>>2); */ \ |
2536 |
+ /* b5 = (a3>>2) - a5; */ \ |
2537 |
+ /* b7 = a7 - (a1>>2); */ \ |
2538 |
+ vec_s16 b1v = _mm_add_epi16(_mm_srai_epi16(a7v, 2), a1v); \ |
2539 |
+ vec_s16 b3v = _mm_add_epi16(a3v, _mm_srai_epi16(a5v, 2)); \ |
2540 |
+ vec_s16 b5v = _mm_sub_epi16(_mm_srai_epi16(a3v, 2), a5v); \ |
2541 |
+ vec_s16 b7v = _mm_sub_epi16(a7v, _mm_srai_epi16(a1v, 2)); \ |
2542 |
+ /* DST(0, b0 + b7); */ \ |
2543 |
+ /* DST(1, b2 + b5); */ \ |
2544 |
+ /* DST(2, b4 + b3); */ \ |
2545 |
+ /* DST(3, b6 + b1); */ \ |
2546 |
+ /* DST(4, b6 - b1); */ \ |
2547 |
+ /* DST(5, b4 - b3); */ \ |
2548 |
+ /* DST(6, b2 - b5); */ \ |
2549 |
+ /* DST(7, b0 - b7); */ \ |
2550 |
+ d0 = _mm_add_epi16(b0v, b7v); \ |
2551 |
+ d1 = _mm_add_epi16(b2v, b5v); \ |
2552 |
+ d2 = _mm_add_epi16(b4v, b3v); \ |
2553 |
+ d3 = _mm_add_epi16(b6v, b1v); \ |
2554 |
+ d4 = _mm_sub_epi16(b6v, b1v); \ |
2555 |
+ d5 = _mm_sub_epi16(b4v, b3v); \ |
2556 |
+ d6 = _mm_sub_epi16(b2v, b5v); \ |
2557 |
+ d7 = _mm_sub_epi16(b0v, b7v); \ |
2558 |
+} |
2559 |
+ |
2560 |
+#define E2K_STORE_SUM_CLIP(dest, idctv) { \ |
2561 |
+ /* unaligned load */ \ |
2562 |
+ __m128i dstv = VEC_LD8(dest); \ |
2563 |
+ dstv = _mm_unpacklo_epi8(dstv, zerov); \ |
2564 |
+ idctv = _mm_srai_epi16(idctv, 6); \ |
2565 |
+ dstv = _mm_add_epi16(dstv, idctv); \ |
2566 |
+ dstv = _mm_packus_epi16(dstv, dstv); \ |
2567 |
+ /* unaligned store */ \ |
2568 |
+ VEC_STL(dest, dstv); \ |
2569 |
+} |
2570 |
+ |
2571 |
/* Full 8x8 H.264 inverse transform: reconstructs the residual from 'dct'
 * and adds it to 'dst' with clipping. The coefficient block is cleared to
 * zero after it has been read (NOTE(review): presumably the decoder relies
 * on the block being zeroed here — confirm against callers). */
static void h264_idct8_add_e2k(uint8_t *dst, int16_t *dct, int stride)
{
    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;   /* input coefficient rows */
    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;   /* after the first 1-D pass */
    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = VEC_LD(dct + 8 * 0);
    s1 = VEC_LD(dct + 8 * 1);
    s2 = VEC_LD(dct + 8 * 2);
    s3 = VEC_LD(dct + 8 * 3);
    s4 = VEC_LD(dct + 8 * 4);
    s5 = VEC_LD(dct + 8 * 5);
    s6 = VEC_LD(dct + 8 * 6);
    s7 = VEC_LD(dct + 8 * 7);
    /* clear the coefficient block now that it is loaded */
    VEC_ST(dct + 8 * 0, zerov);
    VEC_ST(dct + 8 * 1, zerov);
    VEC_ST(dct + 8 * 2, zerov);
    VEC_ST(dct + 8 * 3, zerov);
    VEC_ST(dct + 8 * 4, zerov);
    VEC_ST(dct + 8 * 5, zerov);
    VEC_ST(dct + 8 * 6, zerov);
    VEC_ST(dct + 8 * 7, zerov);

    /* 1-D IDCT over the rows */
    IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7,
                 d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8(d0, d1, d2, d3, d4, d5, d6, d7);

    /* 1-D IDCT over the columns */
    IDCT8_1D_E2K(d0, d1, d2, d3, d4, d5, d6, d7,
                 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    /* add to destination with clipping; >>6 happens inside the macro */
    E2K_STORE_SUM_CLIP(&dst[0*stride], idct0);
    E2K_STORE_SUM_CLIP(&dst[1*stride], idct1);
    E2K_STORE_SUM_CLIP(&dst[2*stride], idct2);
    E2K_STORE_SUM_CLIP(&dst[3*stride], idct3);
    E2K_STORE_SUM_CLIP(&dst[4*stride], idct4);
    E2K_STORE_SUM_CLIP(&dst[5*stride], idct5);
    E2K_STORE_SUM_CLIP(&dst[6*stride], idct6);
    E2K_STORE_SUM_CLIP(&dst[7*stride], idct7);
}
2615 |
+ |
2616 |
/* DC-only 4x4 IDCT add: the single DC coefficient is rounded, split into a
 * non-negative "plus" part and a non-negative "minus" part (the packs_pu16
 * pack clamps negatives to 0), and applied to the pixels with saturating
 * unsigned add/sub — a branchless clip to 0..255. */
static void h264_idct_dc_add_e2k(uint8_t *dst, int16_t *block, int stride)
{
    __m64 dc16, zerov = _mm_setzero_si64();
    __m64 dcplus, dcminus, v0, v1, v2, v3;
    int i, dc;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;                       /* clear the coefficient after use */
    dc16 = _mm_set1_pi16(dc);
    dcplus = _mm_packs_pu16(dc16, dc16);    /* max(dc, 0), clamped to 255 */
    dc16 = _mm_sub_pi16(zerov, dc16);
    dcminus = _mm_packs_pu16(dc16, dc16);   /* max(-dc, 0), clamped to 255 */

    /* NOTE(review): the loop runs exactly once (4 rows, 4 per iteration);
     * kept in loop form to mirror the 8x8 variant below. */
    PRAGMA_E2K("ivdep")
    for (i = 0; i < 4; i += 4) {
        v0 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 0 * stride));
        v1 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 1 * stride));
        v2 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 2 * stride));
        v3 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 3 * stride));

        /* pair up rows so two rows are processed per 64-bit op */
        v0 = _mm_unpacklo_pi32(v0, v1);
        v2 = _mm_unpacklo_pi32(v2, v3);
        v0 = _mm_adds_pu8(v0, dcplus);
        v2 = _mm_adds_pu8(v2, dcplus);
        v0 = _mm_subs_pu8(v0, dcminus);
        v2 = _mm_subs_pu8(v2, dcminus);

        *(uint32_t*)(dst + 0 * stride) = _mm_extract_pi32(v0, 0);
        *(uint32_t*)(dst + 1 * stride) = _mm_extract_pi32(v0, 1);
        *(uint32_t*)(dst + 2 * stride) = _mm_extract_pi32(v2, 0);
        *(uint32_t*)(dst + 3 * stride) = _mm_extract_pi32(v2, 1);
        dst += 4 * stride;
    }
}
2650 |
+ |
2651 |
/* DC-only 8x8 IDCT add: same plus/minus saturating-add trick as the 4x4
 * variant, but 128-bit wide, processing two 8-pixel rows per vector. */
static void h264_idct8_dc_add_e2k(uint8_t *dst, int16_t *block, int stride)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3;
    LOAD_ZERO;
    int i, dc;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;                        /* clear the coefficient after use */
    dc16 = _mm_set1_epi16(dc);
    dcplus = _mm_packus_epi16(dc16, dc16);   /* max(dc, 0), clamped to 255 */
    dc16 = _mm_sub_epi16(zerov, dc16);
    dcminus = _mm_packus_epi16(dc16, dc16);  /* max(-dc, 0), clamped to 255 */

    PRAGMA_E2K("ivdep")
    for (i = 0; i < 8; i += 4) {             /* two iterations of 4 rows */
        v0 = VEC_LD8(dst + 0 * stride);
        v1 = VEC_LD8(dst + 1 * stride);
        v2 = VEC_LD8(dst + 2 * stride);
        v3 = VEC_LD8(dst + 3 * stride);

        /* pack two rows per 128-bit register */
        v0 = _mm_unpacklo_epi64(v0, v1);
        v2 = _mm_unpacklo_epi64(v2, v3);
        v0 = _mm_adds_epu8(v0, dcplus);
        v2 = _mm_adds_epu8(v2, dcplus);
        v0 = _mm_subs_epu8(v0, dcminus);
        v2 = _mm_subs_epu8(v2, dcminus);

        /* store low / high halves back to their rows */
        VEC_STL(dst + 0 * stride, v0);
        VEC_STH(dst + 1 * stride, v0);
        VEC_STL(dst + 2 * stride, v2);
        VEC_STH(dst + 3 * stride, v2);
        dst += 4 * stride;
    }
}
2686 |
+ |
2687 |
+static void h264_idct_add16_e2k(uint8_t *dst, const int *block_offset, |
2688 |
+ int16_t *block, int stride, |
2689 |
+ const uint8_t nnzc[15 * 8]) |
2690 |
+{ |
2691 |
+ int i; |
2692 |
+ for (i = 0; i < 16; i++) { |
2693 |
+ int nnz = nnzc[scan8[i]]; |
2694 |
+ if (nnz) { |
2695 |
+ if (nnz == 1 && block[i * 16]) |
2696 |
+ h264_idct_dc_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2697 |
+ else |
2698 |
+ h264_idct_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2699 |
+ } |
2700 |
+ } |
2701 |
+} |
2702 |
+ |
2703 |
+static void h264_idct_add16intra_e2k(uint8_t *dst, const int *block_offset, |
2704 |
+ int16_t *block, int stride, |
2705 |
+ const uint8_t nnzc[15 * 8]) |
2706 |
+{ |
2707 |
+ int i; |
2708 |
+ for (i = 0; i < 16; i++) { |
2709 |
+ if (nnzc[scan8[i]]) |
2710 |
+ h264_idct_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2711 |
+ else if(block[i * 16]) |
2712 |
+ h264_idct_dc_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2713 |
+ } |
2714 |
+} |
2715 |
+ |
2716 |
+static void h264_idct8_add4_e2k(uint8_t *dst, const int *block_offset, |
2717 |
+ int16_t *block, int stride, |
2718 |
+ const uint8_t nnzc[15 * 8]) |
2719 |
+{ |
2720 |
+ int i; |
2721 |
+ for (i = 0; i < 16; i += 4){ |
2722 |
+ int nnz = nnzc[scan8[i]]; |
2723 |
+ if (nnz) { |
2724 |
+ if (nnz == 1 && block[i * 16]) |
2725 |
+ h264_idct8_dc_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2726 |
+ else |
2727 |
+ h264_idct8_add_e2k(dst + block_offset[i], block + i * 16, stride); |
2728 |
+ } |
2729 |
+ } |
2730 |
+} |
2731 |
+ |
2732 |
+static void h264_idct_add8_e2k(uint8_t **dest, const int *block_offset, |
2733 |
+ int16_t *block, int stride, |
2734 |
+ const uint8_t nnzc[15 * 8]) |
2735 |
+{ |
2736 |
+ int i, j; |
2737 |
+ for (j = 1; j < 3; j++) { |
2738 |
+ for (i = j * 16; i < j * 16 + 4; i++) { |
2739 |
+ if (nnzc[scan8[i]]) |
2740 |
+ h264_idct_add_e2k(dest[j - 1] + block_offset[i], block + i * 16, stride); |
2741 |
+ else if (block[i * 16]) |
2742 |
+ h264_idct_dc_add_e2k(dest[j - 1] + block_offset[i], block + i * 16, stride); |
2743 |
+ } |
2744 |
+ } |
2745 |
+} |
2746 |
+ |
2747 |
/* In-place 4x16 byte transpose via two interleave passes: on exit the i-th
 * 4-byte group across r0..r3 holds the former column values ("set i"). */
#define transpose4x16(r0, r1, r2, r3) { \
    vec_u8 r4, r5, r6, r7; \
    \
    r4 = _mm_unpacklo_epi8(r0, r2); /*0, 2 set 0*/ \
    r5 = _mm_unpackhi_epi8(r0, r2); /*0, 2 set 1*/ \
    r6 = _mm_unpacklo_epi8(r1, r3); /*1, 3 set 0*/ \
    r7 = _mm_unpackhi_epi8(r1, r3); /*1, 3 set 1*/ \
    \
    r0 = _mm_unpacklo_epi8(r4, r6); /*all set 0*/ \
    r1 = _mm_unpackhi_epi8(r4, r6); /*all set 1*/ \
    r2 = _mm_unpacklo_epi8(r5, r7); /*all set 2*/ \
    r3 = _mm_unpackhi_epi8(r5, r7); /*all set 3*/ \
}
2760 |
+ |
2761 |
/* Store 32-bit lane j of register r<i> to destination row i*4 + j. */
#define WRITE4(i, j) ((uint32_t*)dst)[(i * 4 + j) * (dst_stride >> 2)] = _mm_extract_epi32(r##i, j);

/* Scatter four transposed 16-byte registers (see transpose4x16) back to
 * 16 rows of 4 bytes each. dst_stride must be a multiple of 4 and dst
 * 4-byte aligned (the stores go through uint32_t*). */
static av_always_inline void write16x4(uint8_t *dst, int dst_stride,
                                       vec_u8 r0, vec_u8 r1,
                                       vec_u8 r2, vec_u8 r3) {

    WRITE4(0, 0) WRITE4(0, 1) WRITE4(0, 2) WRITE4(0, 3)
    WRITE4(1, 0) WRITE4(1, 1) WRITE4(1, 2) WRITE4(1, 3)
    WRITE4(2, 0) WRITE4(2, 1) WRITE4(2, 2) WRITE4(2, 3)
    WRITE4(3, 0) WRITE4(3, 1) WRITE4(3, 2) WRITE4(3, 3)
}
2772 |
+ |
2773 |
/* Loads 16 rows of (at least) 6 bytes from src and transposes them:
 * r8..r13 each end up holding one former column across all 16 rows.
 * Columns 6 and 7 of the loads are never produced (not needed by the
 * luma deblocking caller). */
#define read_and_transpose16x6(src, st, r8, r9, r10, r11, r12, r13) { \
    vec_u8 r0, r1, r2, r3, r4, r5, r6, r7, r14, r15; \
    r0 = VEC_LD8(src); \
    r1 = VEC_LD8(src + st); \
    r2 = VEC_LD8(src + st * 2); \
    r3 = VEC_LD8(src + st * 3); \
    r4 = VEC_LD8(src + st * 4); \
    r5 = VEC_LD8(src + st * 5); \
    r6 = VEC_LD8(src + st * 6); \
    r7 = VEC_LD8(src + st * 7); \
    r8 = VEC_LD8(src + st * 8); \
    r9 = VEC_LD8(src + st * 9); \
    r10 = VEC_LD8(src + st * 10); \
    r11 = VEC_LD8(src + st * 11); \
    r12 = VEC_LD8(src + st * 12); \
    r13 = VEC_LD8(src + st * 13); \
    r14 = VEC_LD8(src + st * 14); \
    r15 = VEC_LD8(src + st * 15); \
    \
    /* Merge first pairs */ \
    r0 = _mm_unpacklo_epi8(r0, r4);   /* 0, 4 */ \
    r1 = _mm_unpacklo_epi8(r1, r5);   /* 1, 5 */ \
    r2 = _mm_unpacklo_epi8(r2, r6);   /* 2, 6 */ \
    r3 = _mm_unpacklo_epi8(r3, r7);   /* 3, 7 */ \
    r4 = _mm_unpacklo_epi8(r8, r12);  /* 8,12 */ \
    r5 = _mm_unpacklo_epi8(r9, r13);  /* 9,13 */ \
    r6 = _mm_unpacklo_epi8(r10, r14); /* 10,14 */ \
    r7 = _mm_unpacklo_epi8(r11, r15); /* 11,15 */ \
    \
    /* Merge second pairs */ \
    r8 = _mm_unpacklo_epi8(r0, r2);   /* 0, 2, 4, 6 set 0 */ \
    r9 = _mm_unpackhi_epi8(r0, r2);   /* 0, 2, 4, 6 set 1 */ \
    r10 = _mm_unpacklo_epi8(r1, r3);  /* 1, 3, 5, 7 set 0 */ \
    r11 = _mm_unpackhi_epi8(r1, r3);  /* 1, 3, 5, 7 set 1 */ \
    r12 = _mm_unpacklo_epi8(r4, r6);  /* 8,10,12,14 set 0 */ \
    r13 = _mm_unpackhi_epi8(r4, r6);  /* 8,10,12,14 set 1 */ \
    r14 = _mm_unpacklo_epi8(r5, r7);  /* 9,11,13,15 set 0 */ \
    r15 = _mm_unpackhi_epi8(r5, r7);  /* 9,11,13,15 set 1 */ \
    \
    /* Third merge */ \
    r0 = _mm_unpacklo_epi8(r8, r10);  /* 0..7 set 0 */ \
    r1 = _mm_unpackhi_epi8(r8, r10);  /* 0..7 set 1 */ \
    r2 = _mm_unpacklo_epi8(r9, r11);  /* 0..7 set 2 */ \
    r4 = _mm_unpacklo_epi8(r12, r14); /* 8..15 set 0 */ \
    r5 = _mm_unpackhi_epi8(r12, r14); /* 8..15 set 1 */ \
    r6 = _mm_unpacklo_epi8(r13, r15); /* 8..15 set 2 */ \
    /* Don't need to compute 3 and 7*/ \
    \
    /* Final merge */ \
    r8 = _mm_unpacklo_epi64(r0, r4);  /* all set 0 */ \
    r9 = _mm_unpackhi_epi64(r0, r4);  /* all set 1 */ \
    r10 = _mm_unpacklo_epi64(r1, r5); /* all set 2 */ \
    r11 = _mm_unpackhi_epi64(r1, r5); /* all set 3 */ \
    r12 = _mm_unpacklo_epi64(r2, r6); /* all set 4 */ \
    r13 = _mm_unpackhi_epi64(r2, r6); /* all set 5 */ \
    /* Don't need to compute 14 and 15*/ \
}
2831 |
+ |
2832 |
/* Loads 8 rows of 4 bytes each and transposes them with 64-bit (__m64)
 * interleaves: r8..r11 each receive one former column across all 8 rows.
 * src must be readable as uint32_t (4-byte loads). */
#define read_and_transpose8x4(src, st, r8, r9, r10, r11) { \
    __m64 r0, r1, r2, r3, r4, r5, r6, r7; \
    r0 = _mm_cvtsi32_si64(*(uint32_t*)(src)); \
    r1 = _mm_cvtsi32_si64(*(uint32_t*)(src + st)); \
    r2 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 2)); \
    r3 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 3)); \
    r4 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 4)); \
    r5 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 5)); \
    r6 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 6)); \
    r7 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 7)); \
    \
    /* first interleave pass: rows i and i+4 */ \
    r0 = _mm_unpacklo_pi8(r0, r4); \
    r1 = _mm_unpacklo_pi8(r1, r5); \
    r2 = _mm_unpacklo_pi8(r2, r6); \
    r3 = _mm_unpacklo_pi8(r3, r7); \
    \
    /* second pass: stride-2 merge */ \
    r4 = _mm_unpacklo_pi8(r0, r2); \
    r5 = _mm_unpackhi_pi8(r0, r2); \
    r6 = _mm_unpacklo_pi8(r1, r3); \
    r7 = _mm_unpackhi_pi8(r1, r3); \
    \
    /* final pass: full columns */ \
    r8 = _mm_unpacklo_pi8(r4, r6); \
    r9 = _mm_unpackhi_pi8(r4, r6); \
    r10 = _mm_unpacklo_pi8(r5, r7); \
    r11 = _mm_unpackhi_pi8(r5, r7); \
}
2858 |
+ |
2859 |
/* Generates the deblock comparison helpers for both vector widths:
 * DEF_HELPERS(16, __m128i, ...) -> 16-lane, DEF_HELPERS(8, __m64, ...) -> 8-lane.
 * alpha/beta arrive biased by -128 by the callers, so the unsigned
 * difference can be compared with a signed compare after its sign bit
 * is flipped. */
#define DEF_HELPERS(n, vec, p, si) \
/* out: o = |x-y| < a */ \
static av_always_inline vec diff_lt##n(vec x, vec y, vec a) { \
    vec o = _mm_or_##si(_mm_subs_##p##u8(x, y), _mm_subs_##p##u8(y, x)); /* |x-y| */ \
    return _mm_cmpgt_##p##i8(a, _mm_xor_##si(o, _mm_set1_##p##i8(-128))); \
} \
/* out: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */ \
static av_always_inline vec deblock_mask##n(vec p0, vec p1, vec q0, \
                                            vec q1, vec alpha, vec beta) { \
    vec mask, tempmask; \
    mask = diff_lt##n(p0, q0, alpha); \
    tempmask = diff_lt##n(p1, p0, beta); \
    mask = _mm_and_##si(mask, tempmask); \
    tempmask = diff_lt##n(q1, q0, beta); \
    return _mm_and_##si(mask, tempmask); \
}

DEF_HELPERS(16, __m128i, ep, si128)
DEF_HELPERS(8, __m64, p, si64)
2877 |
+ |
2878 |
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
2879 |
+static av_always_inline vec_u8 h264_deblock_q1(vec_u8 p0, vec_u8 p1, vec_u8 p2, |
2880 |
+ vec_u8 q0, vec_u8 tc0) { |
2881 |
+ |
2882 |
+ vec_u8 average = _mm_avg_epu8(p0, q0); |
2883 |
+ vec_u8 temp, uncliped; |
2884 |
+ vec_u8 ones = _mm_set1_epi8(1), maxv, minv; |
2885 |
+ |
2886 |
+ temp = _mm_xor_si128(average, p2); |
2887 |
+ average = _mm_avg_epu8(average, p2); /* avg(p2, avg(p0, q0)) */ |
2888 |
+ temp = _mm_and_si128(temp, ones); /* (p2^avg(p0, q0)) & 1 */ |
2889 |
+ uncliped = _mm_subs_epu8(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */ |
2890 |
+ maxv = _mm_adds_epu8(p1, tc0); |
2891 |
+ minv = _mm_subs_epu8(p1, tc0); |
2892 |
+ return _mm_min_epu8(maxv, _mm_max_epu8(minv, uncliped)); |
2893 |
+} |
2894 |
+ |
2895 |
/* Applies the clipped p0/q0 delta of the H.264 normal filter, computed
 * branchlessly in the unsigned domain with averages (see the per-line
 * notes); tc0masked bounds |delta|, and the positive/negative parts are
 * applied with saturating byte adds/subs. Works for both vector widths
 * via the vec/x/si token parameters. */
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked, vec, x, si) { \
    vec pq0bit = _mm_xor_##si(p0, q0); \
    vec q1minus, p0minus, stage1, stage2, c127 = _mm_set1_##x##i8(127); \
    vec vec160 = _mm_set1_##x##i8(160), delta, deltaneg, notv = _mm_set1_##x##i8(-1); \
    \
    q1minus = _mm_xor_##si(q1, notv); /* 255 - q1 */ \
    stage1 = _mm_avg_##x##u8(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
    stage2 = _mm_srli_##x##i16(stage1, 1); \
    stage2 = _mm_and_##si(stage2, c127); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
    p0minus = _mm_xor_##si(p0, notv); /* 255 - p0 */ \
    stage1 = _mm_avg_##x##u8(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
    pq0bit = _mm_and_##si(pq0bit, _mm_set1_##x##i8(1)); /* low-bit rounding term */ \
    stage2 = _mm_avg_##x##u8(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
    stage2 = _mm_adds_##x##u8(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
    deltaneg = _mm_subs_##x##u8(vec160, stage2); /* -d */ \
    delta = _mm_subs_##x##u8(stage2, vec160); /* d */ \
    /* clip |delta| to the per-pixel tc bound */ \
    deltaneg = _mm_min_##x##u8(tc0masked, deltaneg); \
    delta = _mm_min_##x##u8(tc0masked, delta); \
    p0 = _mm_subs_##x##u8(p0, deltaneg); \
    q0 = _mm_subs_##x##u8(q0, delta); \
    p0 = _mm_adds_##x##u8(p0, delta); \
    q0 = _mm_adds_##x##u8(q0, deltaneg); \
}
2918 |
+ |
2919 |
/* 16-lane luma normal deblock filter: updates p1/p0/q0/q1 in place.
 * tc0 is 4 int8 thresholds; each is replicated x4 across the 16 lanes. */
#define h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
    vec_u8 alphavec, betavec, mask, p1mask, q1mask; \
    vec_s8 tc0vec; \
    vec_u8 finaltc0, tc0masked, newp1, newq1; \
    \
    /* -128 bias lets diff_lt16 use a signed compare (see DEF_HELPERS) */ \
    betavec = _mm_set1_epi8(beta - 128); \
    alphavec = _mm_set1_epi8(alpha - 128); \
    mask = deblock_mask16(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \
    \
    tc0vec = _mm_cvtsi32_si128(*(uint32_t*)tc0); \
    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \
    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); /* each tc0 byte now x4 */ \
    /* blendv picks zero where tc0's sign bit is set: tc0[i] < 0 disables */ \
    mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec); \
    finaltc0 = _mm_and_si128(tc0vec, mask); /* if (tc0[i] >= 0) tc = tc0 */ \
    \
    p1mask = diff_lt16(p2, p0, betavec); \
    p1mask = _mm_and_si128(p1mask, mask); /* if(|p2 - p0| < beta) */ \
    tc0masked = _mm_and_si128(p1mask, tc0vec); \
    finaltc0 = _mm_sub_epi8(finaltc0, p1mask); /* tc++ (mask is -1 where set) */ \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
    /*end if*/ \
    \
    q1mask = diff_lt16(q2, q0, betavec); \
    q1mask = _mm_and_si128(q1mask, mask); /* if(|q2 - q0| < beta) */ \
    tc0masked = _mm_and_si128(q1mask, tc0vec); \
    finaltc0 = _mm_sub_epi8(finaltc0, q1mask); /* tc++ */ \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
    /*end if*/ \
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m128i, ep, si128); \
    p1 = newp1; \
    q1 = newq1; \
}
2952 |
+ |
2953 |
/* 8-lane (__m64) chroma deblock filter: only p0/q0 are modified.
 * Each tc0 byte is replicated x2 to cover 8 lanes. */
#define h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0) { \
    __m64 alphavec, betavec, mask, tc0vec, finaltc0; \
    \
    /* -128 bias for the signed compares inside deblock_mask8 */ \
    betavec = _mm_set1_pi8(beta - 128); \
    alphavec = _mm_set1_pi8(alpha - 128); \
    mask = deblock_mask8(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \
    \
    tc0vec = _mm_cvtsi32_si64(*(uint32_t*)tc0); \
    tc0vec = _mm_unpacklo_pi8(tc0vec, tc0vec); /* each tc0 byte x2 */ \
    /* tc0[i] < 0 (sign bit set) disables filtering in those lanes */ \
    mask = _mm_blendv_pi8(mask, _mm_setzero_si64(), tc0vec); \
    finaltc0 = _mm_and_si64(tc0vec, mask); /* if (tc0[i] >= 0) tc = tc0 */ \
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m64, p, si64); \
}
2967 |
+ |
2968 |
/* 16-lane (__m128i) chroma deblock filter used by the 4:2:2 path:
 * only p0/q0 are modified; each tc0 byte is replicated x4. */
#define h264_loop_filter_chroma4_e2k(p1, p0, q0, q1, alpha, beta, tc0) { \
    __m128i alphavec, betavec, mask, tc0vec, finaltc0; \
    \
    /* -128 bias for the signed compares inside deblock_mask16 */ \
    betavec = _mm_set1_epi8(beta - 128); \
    alphavec = _mm_set1_epi8(alpha - 128); \
    mask = deblock_mask16(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \
    \
    tc0vec = _mm_cvtsi32_si128(*(uint32_t*)tc0); \
    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \
    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); /* each tc0 byte x4 */ \
    /* tc0[i] < 0 (sign bit set) disables filtering in those lanes */ \
    mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec); \
    finaltc0 = _mm_and_si128(tc0vec, mask); /* if (tc0[i] >= 0) tc = tc0 */ \
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m128i, ep, si128); \
}
2983 |
+ |
2984 |
+static void h264_v_loop_filter_luma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) { |
2985 |
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |
2986 |
+ vec_u8 p2, p1, p0, q0, q1, q2; |
2987 |
+ p2 = VEC_LD(pix - 3 * stride); |
2988 |
+ p1 = VEC_LD(pix - 2 * stride); |
2989 |
+ p0 = VEC_LD(pix - stride); |
2990 |
+ q0 = VEC_LD(pix); |
2991 |
+ q1 = VEC_LD(pix + stride); |
2992 |
+ q2 = VEC_LD(pix + 2 * stride); |
2993 |
+ h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
2994 |
+ VEC_ST(pix - 2 * stride, p1); |
2995 |
+ VEC_ST(pix - 1 * stride, p0); |
2996 |
+ VEC_ST(pix, q0); |
2997 |
+ VEC_ST(pix + stride, q1); |
2998 |
+ } |
2999 |
+} |
3000 |
+ |
3001 |
+static void h264_v_loop_filter_chroma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) { |
3002 |
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |
3003 |
+ __m64 p1, p0, q0, q1; |
3004 |
+ p1 = *(__m64*)(pix - 2 * stride); |
3005 |
+ p0 = *(__m64*)(pix - stride); |
3006 |
+ q0 = *(__m64*)pix; |
3007 |
+ q1 = *(__m64*)(pix + stride); |
3008 |
+ h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0); |
3009 |
+ *(__m64*)(pix - 1 * stride) = p0; |
3010 |
+ *(__m64*)pix = q0; |
3011 |
+ } |
3012 |
+} |
3013 |
+ |
3014 |
+static void h264_h_loop_filter_luma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) { |
3015 |
+ vec_u8 p2, p1, p0, q0, q1, q2; |
3016 |
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return; |
3017 |
+ read_and_transpose16x6(pix - 3, stride, p2, p1, p0, q0, q1, q2); |
3018 |
+ h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
3019 |
+ transpose4x16(p1, p0, q0, q1); |
3020 |
+ write16x4(pix - 2, stride, p1, p0, q0, q1); |
3021 |
+} |
3022 |
+ |
3023 |
/* Horizontal (vertical-edge) chroma deblock for 8 rows: transpose the
 * 4 columns around the edge, filter, then scatter the two modified
 * columns (p0, q0) back as 16-bit pairs. */
static void h264_h_loop_filter_chroma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
    __m64 p1, p0, q0, q1;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
    read_and_transpose8x4(pix - 2, stride, p1, p0, q0, q1);
    h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0);
    /* interleave p0/q0 so each 16-bit lane is one row's (p0, q0) pair */
    p1 = _mm_unpacklo_pi8(p0, q0);
    q1 = _mm_unpackhi_pi8(p0, q0);
#define WRITE2(v, i) *(uint16_t*)(pix + i * stride) = _mm_extract_pi16(v, i);
    pix--;  /* pairs land at pix-1 / pix (the p0 and q0 columns) */
    WRITE2(p1, 0)
    WRITE2(p1, 1)
    WRITE2(p1, 2)
    WRITE2(p1, 3)
    pix += stride * 4;  /* second group of four rows */
    WRITE2(q1, 0)
    WRITE2(q1, 1)
    WRITE2(q1, 2)
    WRITE2(q1, 3)
#undef WRITE2
}
3043 |
+ |
3044 |
/* 4:2:2 horizontal chroma deblock: 16 rows instead of 8. Two 8x4
 * transposes (low/high half) are stitched into 128-bit vectors so the
 * 16-lane filter can run once, then everything is transposed back. */
static void h264_h_loop_filter_chroma422_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
    __m128i p1, p0, q0, q1;
    __m64 p1l, p0l, q0l, q1l, p1h, p0h, q0h, q1h;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
    read_and_transpose8x4(pix - 2, stride, p1l, p0l, q0l, q1l);
    read_and_transpose8x4(pix - 2 + stride * 8, stride, p1h, p0h, q0h, q1h);
    /* combine the two 64-bit halves into one 128-bit column each */
    p1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(p1l), _mm_movpi64_epi64(p1h));
    p0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(p0l), _mm_movpi64_epi64(p0h));
    q0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(q0l), _mm_movpi64_epi64(q0h));
    q1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(q1l), _mm_movpi64_epi64(q1h));
    h264_loop_filter_chroma4_e2k(p1, p0, q0, q1, alpha, beta, tc0);
    /* p1/q1 are unchanged by the chroma filter, so writing all four
     * columns back is safe */
    transpose4x16(p1, p0, q0, q1);
    write16x4(pix - 2, stride, p1, p0, q0, q1);
}
3058 |
+ |
3059 |
+static void weight_h264_pixels16_e2k(uint8_t *block, ptrdiff_t stride, int height, |
3060 |
+ int log2_denom, int weight, int offset) |
3061 |
+{ |
3062 |
+ int y; |
3063 |
+ vec_u8 vblock; |
3064 |
+ vec_s16 vweight, voffset, v0, v1; |
3065 |
+ LOAD_ZERO; |
3066 |
+ |
3067 |
+ offset <<= log2_denom; |
3068 |
+ if (log2_denom) offset += 1 << (log2_denom - 1); |
3069 |
+ |
3070 |
+ vweight = _mm_set1_epi16(weight); |
3071 |
+ voffset = _mm_set1_epi16(offset); |
3072 |
+ |
3073 |
+ PRAGMA_E2K("ivdep") |
3074 |
+ for (y = 0; y < height; y++) { |
3075 |
+ vblock = VEC_LD(block); |
3076 |
+ v0 = _mm_unpacklo_epi8(vblock, zerov); |
3077 |
+ v1 = _mm_unpackhi_epi8(vblock, zerov); |
3078 |
+ |
3079 |
+ v0 = _mm_mullo_epi16(v0, vweight); |
3080 |
+ v1 = _mm_mullo_epi16(v1, vweight); |
3081 |
+ v0 = _mm_adds_epi16(v0, voffset); |
3082 |
+ v1 = _mm_adds_epi16(v1, voffset); |
3083 |
+ v0 = _mm_srai_epi16(v0, log2_denom); |
3084 |
+ v1 = _mm_srai_epi16(v1, log2_denom); |
3085 |
+ |
3086 |
+ vblock = _mm_packus_epi16(v0, v1); |
3087 |
+ VEC_ST(block, vblock); |
3088 |
+ block += stride; |
3089 |
+ } |
3090 |
+} |
3091 |
+ |
3092 |
+static void weight_h264_pixels8_e2k(uint8_t *block, ptrdiff_t stride, int height, |
3093 |
+ int log2_denom, int weight, int offset) |
3094 |
+{ |
3095 |
+ int y; |
3096 |
+ vec_u8 vblock; |
3097 |
+ vec_s16 vweight, voffset, v0; |
3098 |
+ LOAD_ZERO; |
3099 |
+ |
3100 |
+ offset <<= log2_denom; |
3101 |
+ if (log2_denom) offset += 1 << (log2_denom - 1); |
3102 |
+ |
3103 |
+ vweight = _mm_set1_epi16(weight); |
3104 |
+ voffset = _mm_set1_epi16(offset); |
3105 |
+ |
3106 |
+ PRAGMA_E2K("ivdep") |
3107 |
+ for (y = 0; y < height; y++) { |
3108 |
+ vblock = VEC_LD8(block); |
3109 |
+ v0 = _mm_unpacklo_epi8(vblock, zerov); |
3110 |
+ |
3111 |
+ v0 = _mm_mullo_epi16(v0, vweight); |
3112 |
+ v0 = _mm_adds_epi16(v0, voffset); |
3113 |
+ v0 = _mm_srai_epi16(v0, log2_denom); |
3114 |
+ |
3115 |
+ vblock = _mm_packus_epi16(v0, v0); |
3116 |
+ VEC_STL(block, vblock); |
3117 |
+ block += stride; |
3118 |
+ } |
3119 |
+} |
3120 |
+ |
3121 |
+static void biweight_h264_pixels16_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, |
3122 |
+ int log2_denom, int weightd, int weights, int offset) |
3123 |
+{ |
3124 |
+ int y; |
3125 |
+ vec_u8 vsrc, vdst; |
3126 |
+ vec_s16 vweights, vweightd, voffset, v0, v1, v2, v3; |
3127 |
+ LOAD_ZERO; |
3128 |
+ |
3129 |
+ offset = ((offset + 1) | 1) << log2_denom; |
3130 |
+ vweights = _mm_set1_epi16(weights); |
3131 |
+ vweightd = _mm_set1_epi16(weightd); |
3132 |
+ voffset = _mm_set1_epi16(offset); |
3133 |
+ |
3134 |
+ PRAGMA_E2K("ivdep") |
3135 |
+ for (y = 0; y < height; y++) { |
3136 |
+ vdst = VEC_LD(dst); |
3137 |
+ vsrc = VEC_LD(src); |
3138 |
+ v0 = _mm_unpacklo_epi8(vdst, zerov); |
3139 |
+ v1 = _mm_unpackhi_epi8(vdst, zerov); |
3140 |
+ v2 = _mm_unpacklo_epi8(vsrc, zerov); |
3141 |
+ v3 = _mm_unpackhi_epi8(vsrc, zerov); |
3142 |
+ |
3143 |
+ v0 = _mm_mullo_epi16(v0, vweightd); |
3144 |
+ v1 = _mm_mullo_epi16(v1, vweightd); |
3145 |
+ v2 = _mm_mullo_epi16(v2, vweights); |
3146 |
+ v3 = _mm_mullo_epi16(v3, vweights); |
3147 |
+ v0 = _mm_adds_epi16(v0, voffset); |
3148 |
+ v1 = _mm_adds_epi16(v1, voffset); |
3149 |
+ v0 = _mm_adds_epi16(v0, v2); |
3150 |
+ v1 = _mm_adds_epi16(v1, v3); |
3151 |
+ v0 = _mm_srai_epi16(v0, log2_denom + 1); |
3152 |
+ v1 = _mm_srai_epi16(v1, log2_denom + 1); |
3153 |
+ |
3154 |
+ vdst = _mm_packus_epi16(v0, v1); |
3155 |
+ VEC_ST(dst, vdst); |
3156 |
+ dst += stride; |
3157 |
+ src += stride; |
3158 |
+ } |
3159 |
+} |
3160 |
+ |
3161 |
+static void biweight_h264_pixels8_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, |
3162 |
+ int log2_denom, int weightd, int weights, int offset) |
3163 |
+{ |
3164 |
+ int y; |
3165 |
+ vec_u8 vsrc, vdst; |
3166 |
+ vec_s16 vweights, vweightd, voffset, v0, v2; |
3167 |
+ LOAD_ZERO; |
3168 |
+ |
3169 |
+ offset = ((offset + 1) | 1) << log2_denom; |
3170 |
+ vweights = _mm_set1_epi16(weights); |
3171 |
+ vweightd = _mm_set1_epi16(weightd); |
3172 |
+ voffset = _mm_set1_epi16(offset); |
3173 |
+ |
3174 |
+ PRAGMA_E2K("ivdep") |
3175 |
+ for (y = 0; y < height; y++) { |
3176 |
+ vdst = VEC_LD8(dst); |
3177 |
+ vsrc = VEC_LD8(src); |
3178 |
+ v0 = _mm_unpacklo_epi8(vdst, zerov); |
3179 |
+ v2 = _mm_unpacklo_epi8(vsrc, zerov); |
3180 |
+ |
3181 |
+ v0 = _mm_mullo_epi16(v0, vweightd); |
3182 |
+ v2 = _mm_mullo_epi16(v2, vweights); |
3183 |
+ v0 = _mm_adds_epi16(v0, voffset); |
3184 |
+ v0 = _mm_adds_epi16(v0, v2); |
3185 |
+ v0 = _mm_srai_epi16(v0, log2_denom + 1); |
3186 |
+ |
3187 |
+ vdst = _mm_packus_epi16(v0, v0); |
3188 |
+ VEC_STL(dst, vdst); |
3189 |
+ dst += stride; |
3190 |
+ src += stride; |
3191 |
+ } |
3192 |
+} |
3193 |
+ |
3194 |
+av_cold void ff_h264dsp_init_e2k(H264DSPContext *c, const int bit_depth, |
3195 |
+ const int chroma_format_idc) |
3196 |
+{ |
3197 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
3198 |
+ return; |
3199 |
+ |
3200 |
+ if (bit_depth == 8) { |
3201 |
+ c->h264_idct_add = h264_idct_add_e2k; |
3202 |
+ if (chroma_format_idc <= 1) |
3203 |
+ c->h264_idct_add8 = h264_idct_add8_e2k; // !checkasm |
3204 |
+ |
3205 |
+ c->h264_idct_add16 = h264_idct_add16_e2k; |
3206 |
+ c->h264_idct_add16intra = h264_idct_add16intra_e2k; |
3207 |
+ c->h264_idct_dc_add = h264_idct_dc_add_e2k; |
3208 |
+ c->h264_idct8_dc_add = h264_idct8_dc_add_e2k; |
3209 |
+ c->h264_idct8_add = h264_idct8_add_e2k; |
3210 |
+ c->h264_idct8_add4 = h264_idct8_add4_e2k; |
3211 |
+ c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_e2k; |
3212 |
+ c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_e2k; |
3213 |
+ c->h264_v_loop_filter_chroma = h264_v_loop_filter_chroma_e2k; |
3214 |
+ if (chroma_format_idc <= 1) { |
3215 |
+ c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma_e2k; |
3216 |
+ } else { |
3217 |
+ c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma422_e2k; |
3218 |
+ } |
3219 |
+ c->weight_h264_pixels_tab[0] = weight_h264_pixels16_e2k; // !checkasm |
3220 |
+ c->weight_h264_pixels_tab[1] = weight_h264_pixels8_e2k; // !checkasm |
3221 |
+ c->biweight_h264_pixels_tab[0] = biweight_h264_pixels16_e2k; // !checkasm |
3222 |
+ c->biweight_h264_pixels_tab[1] = biweight_h264_pixels8_e2k; // !checkasm |
3223 |
+ } |
3224 |
+} |
3225 |
diff --git a/libavcodec/e2k/h264qpel.c b/libavcodec/e2k/h264qpel.c |
3226 |
new file mode 100644 |
3227 |
index 0000000..f8fe094 |
3228 |
--- /dev/null |
3229 |
+++ b/libavcodec/e2k/h264qpel.c |
3230 |
@@ -0,0 +1,255 @@ |
3231 |
+/* |
3232 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
3233 |
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
3234 |
+ * |
3235 |
+ * This file is part of FFmpeg. |
3236 |
+ * |
3237 |
+ * FFmpeg is free software; you can redistribute it and/or |
3238 |
+ * modify it under the terms of the GNU Lesser General Public |
3239 |
+ * License as published by the Free Software Foundation; either |
3240 |
+ * version 2.1 of the License, or (at your option) any later version. |
3241 |
+ * |
3242 |
+ * FFmpeg is distributed in the hope that it will be useful, |
3243 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
3244 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
3245 |
+ * Lesser General Public License for more details. |
3246 |
+ * |
3247 |
+ * You should have received a copy of the GNU Lesser General Public |
3248 |
+ * License along with FFmpeg; if not, write to the Free Software |
3249 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3250 |
+ */ |
3251 |
+ |
3252 |
+#include "config.h" |
3253 |
+ |
3254 |
+#include "libavutil/attributes.h" |
3255 |
+#include "libavutil/cpu.h" |
3256 |
+#include "libavutil/intreadwrite.h" |
3257 |
+#include "libavutil/e2k/cpu.h" |
3258 |
+#include "libavutil/e2k/util_e2k.h" |
3259 |
+ |
3260 |
+#include "libavcodec/h264qpel.h" |
3261 |
+ |
3262 |
+#include "hpeldsp.h" |
3263 |
+ |
3264 |
+#define PUT_OP_U8_E2K(d, s, dst) d = s |
3265 |
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s) |
3266 |
+ |
3267 |
+#define OP_U8_E2K PUT_OP_U8_E2K |
3268 |
+#define PREFIX_h264_qpel16_h_lowpass_e2k put_h264_qpel16_h_lowpass_e2k |
3269 |
+#define PREFIX_h264_qpel16_v_lowpass_e2k put_h264_qpel16_v_lowpass_e2k |
3270 |
+#define PREFIX_h264_qpel16_hv_lowpass_e2k put_h264_qpel16_hv_lowpass_e2k |
3271 |
+#include "h264qpel_template.c" |
3272 |
+#undef OP_U8_E2K |
3273 |
+#undef PREFIX_h264_qpel16_h_lowpass_e2k |
3274 |
+#undef PREFIX_h264_qpel16_v_lowpass_e2k |
3275 |
+#undef PREFIX_h264_qpel16_hv_lowpass_e2k |
3276 |
+ |
3277 |
+#define OP_U8_E2K AVG_OP_U8_E2K |
3278 |
+#define PREFIX_h264_qpel16_h_lowpass_e2k avg_h264_qpel16_h_lowpass_e2k |
3279 |
+#define PREFIX_h264_qpel16_v_lowpass_e2k avg_h264_qpel16_v_lowpass_e2k |
3280 |
+#define PREFIX_h264_qpel16_hv_lowpass_e2k avg_h264_qpel16_hv_lowpass_e2k |
3281 |
+#include "h264qpel_template.c" |
3282 |
+#undef OP_U8_E2K |
3283 |
+#undef PREFIX_h264_qpel16_h_lowpass_e2k |
3284 |
+#undef PREFIX_h264_qpel16_v_lowpass_e2k |
3285 |
+#undef PREFIX_h264_qpel16_hv_lowpass_e2k |
3286 |
+ |
3287 |
+#define H264_MC(OPNAME, SIZE, CODETYPE) \ |
3288 |
+static void OPNAME##h264_qpel##SIZE##_mc00_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3289 |
+{\ |
3290 |
+ ff_##OPNAME##pixels##SIZE##_##CODETYPE(dst, src, stride, SIZE);\ |
3291 |
+}\ |
3292 |
+\ |
3293 |
+static void OPNAME##h264_qpel##SIZE##_mc10_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3294 |
+{ \ |
3295 |
+ DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
3296 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(half, src, SIZE, stride);\ |
3297 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src, half, stride, stride, SIZE);\ |
3298 |
+}\ |
3299 |
+\ |
3300 |
+static void OPNAME##h264_qpel##SIZE##_mc20_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3301 |
+{\ |
3302 |
+ OPNAME##h264_qpel##SIZE##_h_lowpass_##CODETYPE(dst, src, stride, stride);\ |
3303 |
+}\ |
3304 |
+\ |
3305 |
+static void OPNAME##h264_qpel##SIZE##_mc30_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3306 |
+{\ |
3307 |
+ DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
3308 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(half, src, SIZE, stride);\ |
3309 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src+1, half, stride, stride, SIZE);\ |
3310 |
+}\ |
3311 |
+\ |
3312 |
+static void OPNAME##h264_qpel##SIZE##_mc01_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3313 |
+{\ |
3314 |
+ DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
3315 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(half, src, SIZE, stride);\ |
3316 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src, half, stride, stride, SIZE);\ |
3317 |
+}\ |
3318 |
+\ |
3319 |
+static void OPNAME##h264_qpel##SIZE##_mc02_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3320 |
+{\ |
3321 |
+ OPNAME##h264_qpel##SIZE##_v_lowpass_##CODETYPE(dst, src, stride, stride);\ |
3322 |
+}\ |
3323 |
+\ |
3324 |
+static void OPNAME##h264_qpel##SIZE##_mc03_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3325 |
+{\ |
3326 |
+ DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
3327 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(half, src, SIZE, stride);\ |
3328 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ |
3329 |
+}\ |
3330 |
+\ |
3331 |
+static void OPNAME##h264_qpel##SIZE##_mc11_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3332 |
+{\ |
3333 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3334 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3335 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\ |
3336 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\ |
3337 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
3338 |
+}\ |
3339 |
+\ |
3340 |
+static void OPNAME##h264_qpel##SIZE##_mc31_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3341 |
+{\ |
3342 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3343 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3344 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\ |
3345 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\ |
3346 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
3347 |
+}\ |
3348 |
+\ |
3349 |
+static void OPNAME##h264_qpel##SIZE##_mc13_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3350 |
+{\ |
3351 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3352 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3353 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\ |
3354 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\ |
3355 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
3356 |
+}\ |
3357 |
+\ |
3358 |
+static void OPNAME##h264_qpel##SIZE##_mc33_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3359 |
+{\ |
3360 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3361 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3362 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\ |
3363 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\ |
3364 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
3365 |
+}\ |
3366 |
+\ |
3367 |
+static void OPNAME##h264_qpel##SIZE##_mc22_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3368 |
+{\ |
3369 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
3370 |
+ OPNAME##h264_qpel##SIZE##_hv_lowpass_##CODETYPE(dst, tmp, src, stride, SIZE, stride);\ |
3371 |
+}\ |
3372 |
+\ |
3373 |
+static void OPNAME##h264_qpel##SIZE##_mc21_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3374 |
+{\ |
3375 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3376 |
+ DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ |
3377 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
3378 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\ |
3379 |
+ put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
3380 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ |
3381 |
+}\ |
3382 |
+\ |
3383 |
+static void OPNAME##h264_qpel##SIZE##_mc23_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3384 |
+{\ |
3385 |
+ DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
3386 |
+ DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ |
3387 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
3388 |
+ put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\ |
3389 |
+ put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
3390 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ |
3391 |
+}\ |
3392 |
+\ |
3393 |
+static void OPNAME##h264_qpel##SIZE##_mc12_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3394 |
+{\ |
3395 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3396 |
+ DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ |
3397 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
3398 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\ |
3399 |
+ put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
3400 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ |
3401 |
+}\ |
3402 |
+\ |
3403 |
+static void OPNAME##h264_qpel##SIZE##_mc32_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ |
3404 |
+{\ |
3405 |
+ DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
3406 |
+ DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ |
3407 |
+ DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
3408 |
+ put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\ |
3409 |
+ put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
3410 |
+ OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ |
3411 |
+}\ |
3412 |
+ |
3413 |
+#if 1 |
3414 |
+static av_always_inline void put_pixels16_l2_e2k(uint8_t *dst, const uint8_t *src1, |
3415 |
+ const uint8_t *src2, int dst_stride, |
3416 |
+ int src_stride1, int h) |
3417 |
+{ |
3418 |
+ int i; |
3419 |
+ vec_u8 a, b, d; |
3420 |
+ |
3421 |
+ for (i = 0; i < h; i++) { |
3422 |
+ a = VEC_LD(src1 + i * src_stride1); |
3423 |
+ b = VEC_LD(src2 + i * 16); |
3424 |
+ d = _mm_avg_epu8(a, b); |
3425 |
+ VEC_ST(dst, d); |
3426 |
+ dst += dst_stride; |
3427 |
+ } |
3428 |
+} |
3429 |
+ |
3430 |
+static av_always_inline void avg_pixels16_l2_e2k(uint8_t *dst, const uint8_t *src1, |
3431 |
+ const uint8_t *src2, int dst_stride, |
3432 |
+ int src_stride1, int h) |
3433 |
+{ |
3434 |
+ int i; |
3435 |
+ vec_u8 a, b, d; |
3436 |
+ |
3437 |
+ for (i = 0; i < h; i++) { |
3438 |
+ a = VEC_LD(src1 + i * src_stride1); |
3439 |
+ b = VEC_LD(src2 + i * 16); |
3440 |
+ d = _mm_avg_epu8(a, b); |
3441 |
+ a = _mm_avg_epu8(VEC_LD(dst), d); |
3442 |
+ VEC_ST(dst, a); |
3443 |
+ dst += dst_stride; |
3444 |
+ } |
3445 |
+} |
3446 |
+ |
3447 |
+#else // Implemented but could be faster |
3448 |
+#define put_pixels16_l2_e2k(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) |
3449 |
+#define avg_pixels16_l2_e2k(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) |
3450 |
+#endif |
3451 |
+ |
3452 |
+H264_MC(put_, 16, e2k) |
3453 |
+H264_MC(avg_, 16, e2k) |
3454 |
+ |
3455 |
+av_cold void ff_h264qpel_init_e2k(H264QpelContext *c, int bit_depth) |
3456 |
+{ |
3457 |
+ const int high_bit_depth = bit_depth > 8; |
3458 |
+ |
3459 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
3460 |
+ return; |
3461 |
+ |
3462 |
+ if (!high_bit_depth) { |
3463 |
+#define dspfunc(PFX, IDX, NUM) \ |
3464 |
+ c->PFX##_pixels_tab[IDX][ 0] = PFX##NUM##_mc00_e2k; \ |
3465 |
+ c->PFX##_pixels_tab[IDX][ 1] = PFX##NUM##_mc10_e2k; \ |
3466 |
+ c->PFX##_pixels_tab[IDX][ 2] = PFX##NUM##_mc20_e2k; \ |
3467 |
+ c->PFX##_pixels_tab[IDX][ 3] = PFX##NUM##_mc30_e2k; \ |
3468 |
+ c->PFX##_pixels_tab[IDX][ 4] = PFX##NUM##_mc01_e2k; \ |
3469 |
+ c->PFX##_pixels_tab[IDX][ 5] = PFX##NUM##_mc11_e2k; \ |
3470 |
+ c->PFX##_pixels_tab[IDX][ 6] = PFX##NUM##_mc21_e2k; \ |
3471 |
+ c->PFX##_pixels_tab[IDX][ 7] = PFX##NUM##_mc31_e2k; \ |
3472 |
+ c->PFX##_pixels_tab[IDX][ 8] = PFX##NUM##_mc02_e2k; \ |
3473 |
+ c->PFX##_pixels_tab[IDX][ 9] = PFX##NUM##_mc12_e2k; \ |
3474 |
+ c->PFX##_pixels_tab[IDX][10] = PFX##NUM##_mc22_e2k; \ |
3475 |
+ c->PFX##_pixels_tab[IDX][11] = PFX##NUM##_mc32_e2k; \ |
3476 |
+ c->PFX##_pixels_tab[IDX][12] = PFX##NUM##_mc03_e2k; \ |
3477 |
+ c->PFX##_pixels_tab[IDX][13] = PFX##NUM##_mc13_e2k; \ |
3478 |
+ c->PFX##_pixels_tab[IDX][14] = PFX##NUM##_mc23_e2k; \ |
3479 |
+ c->PFX##_pixels_tab[IDX][15] = PFX##NUM##_mc33_e2k |
3480 |
+ |
3481 |
+ dspfunc(put_h264_qpel, 0, 16); |
3482 |
+ dspfunc(avg_h264_qpel, 0, 16); |
3483 |
+#undef dspfunc |
3484 |
+ } |
3485 |
+} |
3486 |
diff --git a/libavcodec/e2k/h264qpel_template.c b/libavcodec/e2k/h264qpel_template.c |
3487 |
new file mode 100644 |
3488 |
index 0000000..bbd6516 |
3489 |
--- /dev/null |
3490 |
+++ b/libavcodec/e2k/h264qpel_template.c |
3491 |
@@ -0,0 +1,354 @@ |
3492 |
+/* |
3493 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
3494 |
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
3495 |
+ * |
3496 |
+ * This file is part of FFmpeg. |
3497 |
+ * |
3498 |
+ * FFmpeg is free software; you can redistribute it and/or |
3499 |
+ * modify it under the terms of the GNU Lesser General Public |
3500 |
+ * License as published by the Free Software Foundation; either |
3501 |
+ * version 2.1 of the License, or (at your option) any later version. |
3502 |
+ * |
3503 |
+ * FFmpeg is distributed in the hope that it will be useful, |
3504 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
3505 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
3506 |
+ * Lesser General Public License for more details. |
3507 |
+ * |
3508 |
+ * You should have received a copy of the GNU Lesser General Public |
3509 |
+ * License along with FFmpeg; if not, write to the Free Software |
3510 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3511 |
+ */ |
3512 |
+ |
3513 |
+#include "config.h" |
3514 |
+#if HAVE_UNISTD_H |
3515 |
+#include <unistd.h> |
3516 |
+#endif |
3517 |
+ |
3518 |
+#include "libavutil/avassert.h" |
3519 |
+#include "libavutil/mem.h" |
3520 |
+#include "libavutil/e2k/util_e2k.h" |
3521 |
+ |
3522 |
+#define load_alignment() { \ |
3523 |
+ srcM2 = VEC_LD(src - 2); \ |
3524 |
+ srcM1 = VEC_LD(src - 1); \ |
3525 |
+ srcP0 = VEC_LD(src); \ |
3526 |
+ srcP1 = VEC_LD(src + 1); \ |
3527 |
+ srcP2 = VEC_LD(src + 2); \ |
3528 |
+ srcP3 = VEC_LD(src + 3); \ |
3529 |
+} |
3530 |
+ |
3531 |
+/* this code assume stride % 16 == 0 */ |
3532 |
+#ifdef PREFIX_h264_qpel16_h_lowpass_e2k |
3533 |
+static void PREFIX_h264_qpel16_h_lowpass_e2k(uint8_t *dst, |
3534 |
+ const uint8_t *src, |
3535 |
+ int dstStride, int srcStride) |
3536 |
+{ |
3537 |
+ int i; |
3538 |
+ |
3539 |
+ LOAD_ZERO; |
3540 |
+ const vec_s16 v5ss = _mm_set1_epi16(5); |
3541 |
+ const vec_s16 v20ss = _mm_set1_epi16(20); |
3542 |
+ const vec_s16 v16ss = _mm_set1_epi16(16); |
3543 |
+ |
3544 |
+ vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
3545 |
+ vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, |
3546 |
+ srcP2A, srcP2B, srcP3A, srcP3B, |
3547 |
+ srcM1A, srcM1B, srcM2A, srcM2B, |
3548 |
+ sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
3549 |
+ pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
3550 |
+ sumA, sumB; |
3551 |
+ vec_u8 sum, fsum; |
3552 |
+ |
3553 |
+ PRAGMA_E2K("ivdep") |
3554 |
+ for (i = 0; i < 16; i++) { |
3555 |
+ load_alignment(); |
3556 |
+ |
3557 |
+ srcP0A = _mm_unpacklo_epi8(srcP0, zerov); |
3558 |
+ srcP0B = _mm_unpackhi_epi8(srcP0, zerov); |
3559 |
+ srcP1A = _mm_unpacklo_epi8(srcP1, zerov); |
3560 |
+ srcP1B = _mm_unpackhi_epi8(srcP1, zerov); |
3561 |
+ |
3562 |
+ srcP2A = _mm_unpacklo_epi8(srcP2, zerov); |
3563 |
+ srcP2B = _mm_unpackhi_epi8(srcP2, zerov); |
3564 |
+ srcP3A = _mm_unpacklo_epi8(srcP3, zerov); |
3565 |
+ srcP3B = _mm_unpackhi_epi8(srcP3, zerov); |
3566 |
+ |
3567 |
+ srcM1A = _mm_unpacklo_epi8(srcM1, zerov); |
3568 |
+ srcM1B = _mm_unpackhi_epi8(srcM1, zerov); |
3569 |
+ srcM2A = _mm_unpacklo_epi8(srcM2, zerov); |
3570 |
+ srcM2B = _mm_unpackhi_epi8(srcM2, zerov); |
3571 |
+ |
3572 |
+ sum1A = _mm_adds_epi16(srcP0A, srcP1A); |
3573 |
+ sum1B = _mm_adds_epi16(srcP0B, srcP1B); |
3574 |
+ sum2A = _mm_adds_epi16(srcM1A, srcP2A); |
3575 |
+ sum2B = _mm_adds_epi16(srcM1B, srcP2B); |
3576 |
+ sum3A = _mm_adds_epi16(srcM2A, srcP3A); |
3577 |
+ sum3B = _mm_adds_epi16(srcM2B, srcP3B); |
3578 |
+ |
3579 |
+ pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), v16ss); |
3580 |
+ pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), v16ss); |
3581 |
+ pp2A = _mm_mullo_epi16(sum2A, v5ss); |
3582 |
+ pp2B = _mm_mullo_epi16(sum2B, v5ss); |
3583 |
+ pp3A = _mm_add_epi16(sum3A, pp1A); |
3584 |
+ pp3B = _mm_add_epi16(sum3B, pp1B); |
3585 |
+ sumA = _mm_sub_epi16(pp3A, pp2A); |
3586 |
+ sumB = _mm_sub_epi16(pp3B, pp2B); |
3587 |
+ sumA = _mm_srai_epi16(sumA, 5); |
3588 |
+ sumB = _mm_srai_epi16(sumB, 5); |
3589 |
+ sum = _mm_packus_epi16(sumA, sumB); |
3590 |
+ |
3591 |
+ OP_U8_E2K(fsum, sum, VEC_LD(dst)); |
3592 |
+ VEC_ST(dst, fsum); |
3593 |
+ |
3594 |
+ src += srcStride; |
3595 |
+ dst += dstStride; |
3596 |
+ } |
3597 |
+} |
3598 |
+#endif /* PREFIX_h264_qpel16_h_lowpass_e2k */ |
3599 |
+ |
3600 |
+/* this code assume stride % 16 == 0 */ |
3601 |
+#ifdef PREFIX_h264_qpel16_v_lowpass_e2k |
3602 |
+static void PREFIX_h264_qpel16_v_lowpass_e2k(uint8_t *dst, |
3603 |
+ const uint8_t *src, |
3604 |
+ int dstStride, int srcStride) |
3605 |
+{ |
3606 |
+ int i; |
3607 |
+ |
3608 |
+ LOAD_ZERO; |
3609 |
+ const vec_s16 v20ss = _mm_set1_epi16(20); |
3610 |
+ const vec_s16 v5ss = _mm_set1_epi16(5); |
3611 |
+ const vec_s16 v16ss = _mm_set1_epi16(16); |
3612 |
+ |
3613 |
+ const vec_u8 srcM2 = VEC_LD(src - srcStride * 2); |
3614 |
+ const vec_u8 srcM1 = VEC_LD(src - srcStride); |
3615 |
+ const vec_u8 srcP0 = VEC_LD(src); |
3616 |
+ const vec_u8 srcP1 = VEC_LD(src + srcStride); |
3617 |
+ const vec_u8 srcP2 = VEC_LD(src + srcStride * 2); |
3618 |
+ |
3619 |
+ vec_s16 srcM2ssA = _mm_unpacklo_epi8(srcM2, zerov); |
3620 |
+ vec_s16 srcM2ssB = _mm_unpackhi_epi8(srcM2, zerov); |
3621 |
+ vec_s16 srcM1ssA = _mm_unpacklo_epi8(srcM1, zerov); |
3622 |
+ vec_s16 srcM1ssB = _mm_unpackhi_epi8(srcM1, zerov); |
3623 |
+ vec_s16 srcP0ssA = _mm_unpacklo_epi8(srcP0, zerov); |
3624 |
+ vec_s16 srcP0ssB = _mm_unpackhi_epi8(srcP0, zerov); |
3625 |
+ vec_s16 srcP1ssA = _mm_unpacklo_epi8(srcP1, zerov); |
3626 |
+ vec_s16 srcP1ssB = _mm_unpackhi_epi8(srcP1, zerov); |
3627 |
+ vec_s16 srcP2ssA = _mm_unpacklo_epi8(srcP2, zerov); |
3628 |
+ vec_s16 srcP2ssB = _mm_unpackhi_epi8(srcP2, zerov); |
3629 |
+ |
3630 |
+ vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
3631 |
+ sumA, sumB, srcP3ssA, srcP3ssB, |
3632 |
+ sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
3633 |
+ vec_u8 sum, fsum, srcP3; |
3634 |
+ |
3635 |
+ src += srcStride * 3; |
3636 |
+ PRAGMA_E2K("ivdep") |
3637 |
+ for (i = 0; i < 16; i++) { |
3638 |
+ srcP3 = VEC_LD(src); |
3639 |
+ src += srcStride; |
3640 |
+ |
3641 |
+ srcP3ssA = _mm_unpacklo_epi8(srcP3, zerov); |
3642 |
+ srcP3ssB = _mm_unpackhi_epi8(srcP3, zerov); |
3643 |
+ |
3644 |
+ sum1A = _mm_adds_epi16(srcP0ssA, srcP1ssA); |
3645 |
+ sum1B = _mm_adds_epi16(srcP0ssB, srcP1ssB); |
3646 |
+ sum2A = _mm_adds_epi16(srcM1ssA, srcP2ssA); |
3647 |
+ sum2B = _mm_adds_epi16(srcM1ssB, srcP2ssB); |
3648 |
+ sum3A = _mm_adds_epi16(srcM2ssA, srcP3ssA); |
3649 |
+ sum3B = _mm_adds_epi16(srcM2ssB, srcP3ssB); |
3650 |
+ |
3651 |
+ srcM2ssA = srcM1ssA; |
3652 |
+ srcM2ssB = srcM1ssB; |
3653 |
+ srcM1ssA = srcP0ssA; |
3654 |
+ srcM1ssB = srcP0ssB; |
3655 |
+ srcP0ssA = srcP1ssA; |
3656 |
+ srcP0ssB = srcP1ssB; |
3657 |
+ srcP1ssA = srcP2ssA; |
3658 |
+ srcP1ssB = srcP2ssB; |
3659 |
+ srcP2ssA = srcP3ssA; |
3660 |
+ srcP2ssB = srcP3ssB; |
3661 |
+ |
3662 |
+ pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), v16ss); |
3663 |
+ pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), v16ss); |
3664 |
+ pp2A = _mm_mullo_epi16(sum2A, v5ss); |
3665 |
+ pp2B = _mm_mullo_epi16(sum2B, v5ss); |
3666 |
+ pp3A = _mm_add_epi16(sum3A, pp1A); |
3667 |
+ pp3B = _mm_add_epi16(sum3B, pp1B); |
3668 |
+ sumA = _mm_sub_epi16(pp3A, pp2A); |
3669 |
+ sumB = _mm_sub_epi16(pp3B, pp2B); |
3670 |
+ sumA = _mm_srai_epi16(sumA, 5); |
3671 |
+ sumB = _mm_srai_epi16(sumB, 5); |
3672 |
+ sum = _mm_packus_epi16(sumA, sumB); |
3673 |
+ |
3674 |
+ OP_U8_E2K(fsum, sum, VEC_LD(dst)); |
3675 |
+ VEC_ST(dst, fsum); |
3676 |
+ dst += dstStride; |
3677 |
+ } |
3678 |
+} |
3679 |
+#endif /* PREFIX_h264_qpel16_v_lowpass_e2k */ |
3680 |
+ |
3681 |
+/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ |
3682 |
+#ifdef PREFIX_h264_qpel16_hv_lowpass_e2k |
3683 |
+static void PREFIX_h264_qpel16_hv_lowpass_e2k(uint8_t *dst, int16_t *tmp, |
3684 |
+ const uint8_t *src, |
3685 |
+ int dstStride, int tmpStride, |
3686 |
+ int srcStride) |
3687 |
+{ |
3688 |
+ int i; |
3689 |
+ LOAD_ZERO; |
3690 |
+ const vec_s16 v20ss = _mm_set1_epi16(20); |
3691 |
+ const vec_s16 v5ss = _mm_set1_epi16(5); |
3692 |
+ const vec_s32 v512si = _mm_set1_epi32(512); |
3693 |
+ |
3694 |
+ vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, |
3695 |
+ srcP2A, srcP2B, srcP3A, srcP3B, |
3696 |
+ srcM1A, srcM1B, srcM2A, srcM2B, |
3697 |
+ sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
3698 |
+ pp1A, pp1B, pp2A, pp2B, sumA, sumB; |
3699 |
+ int16_t *tmpbis = tmp; |
3700 |
+ |
3701 |
+ vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, |
3702 |
+ tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, |
3703 |
+ tmpP2ssA, tmpP2ssB; |
3704 |
+ |
3705 |
+ vec_s32 pp1Al, pp1Ah, pp1Bl, pp1Bh, pp2Al, pp2Ah, pp2Bl, pp2Bh, |
3706 |
+ pp3Al, pp3Ah, pp3Bl, pp3Bh, sumAl, sumAh, sumBl, sumBh; |
3707 |
+ vec_u8 fsum, sum; |
3708 |
+ |
3709 |
+ src -= 2 * srcStride; |
3710 |
+ PRAGMA_E2K("ivdep") |
3711 |
+ for (i = 0; i < 21; i ++) { |
3712 |
+ vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
3713 |
+ |
3714 |
+ load_alignment(); |
3715 |
+ |
3716 |
+ srcP0A = _mm_unpacklo_epi8(srcP0, zerov); |
3717 |
+ srcP0B = _mm_unpackhi_epi8(srcP0, zerov); |
3718 |
+ srcP1A = _mm_unpacklo_epi8(srcP1, zerov); |
3719 |
+ srcP1B = _mm_unpackhi_epi8(srcP1, zerov); |
3720 |
+ |
3721 |
+ srcP2A = _mm_unpacklo_epi8(srcP2, zerov); |
3722 |
+ srcP2B = _mm_unpackhi_epi8(srcP2, zerov); |
3723 |
+ srcP3A = _mm_unpacklo_epi8(srcP3, zerov); |
3724 |
+ srcP3B = _mm_unpackhi_epi8(srcP3, zerov); |
3725 |
+ |
3726 |
+ srcM1A = _mm_unpacklo_epi8(srcM1, zerov); |
3727 |
+ srcM1B = _mm_unpackhi_epi8(srcM1, zerov); |
3728 |
+ srcM2A = _mm_unpacklo_epi8(srcM2, zerov); |
3729 |
+ srcM2B = _mm_unpackhi_epi8(srcM2, zerov); |
3730 |
+ |
3731 |
+ sum1A = _mm_adds_epi16(srcP0A, srcP1A); |
3732 |
+ sum1B = _mm_adds_epi16(srcP0B, srcP1B); |
3733 |
+ sum2A = _mm_adds_epi16(srcM1A, srcP2A); |
3734 |
+ sum2B = _mm_adds_epi16(srcM1B, srcP2B); |
3735 |
+ sum3A = _mm_adds_epi16(srcM2A, srcP3A); |
3736 |
+ sum3B = _mm_adds_epi16(srcM2B, srcP3B); |
3737 |
+ |
3738 |
+ pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), sum3A); |
3739 |
+ pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), sum3B); |
3740 |
+ pp2A = _mm_mullo_epi16(sum2A, v5ss); |
3741 |
+ pp2B = _mm_mullo_epi16(sum2B, v5ss); |
3742 |
+ sumA = _mm_sub_epi16(pp1A, pp2A); |
3743 |
+ sumB = _mm_sub_epi16(pp1B, pp2B); |
3744 |
+ |
3745 |
+ VEC_ST(tmp, sumA); |
3746 |
+ VEC_ST(tmp + 8, sumB); |
3747 |
+ |
3748 |
+ src += srcStride; |
3749 |
+ tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ |
3750 |
+ } |
3751 |
+ |
3752 |
+ tmpM2ssA = VEC_LD(tmpbis); |
3753 |
+ tmpM2ssB = VEC_LD(tmpbis + 8); |
3754 |
+ tmpbis += tmpStride; |
3755 |
+ tmpM1ssA = VEC_LD(tmpbis); |
3756 |
+ tmpM1ssB = VEC_LD(tmpbis + 8); |
3757 |
+ tmpbis += tmpStride; |
3758 |
+ tmpP0ssA = VEC_LD(tmpbis); |
3759 |
+ tmpP0ssB = VEC_LD(tmpbis + 8); |
3760 |
+ tmpbis += tmpStride; |
3761 |
+ tmpP1ssA = VEC_LD(tmpbis); |
3762 |
+ tmpP1ssB = VEC_LD(tmpbis + 8); |
3763 |
+ tmpbis += tmpStride; |
3764 |
+ tmpP2ssA = VEC_LD(tmpbis); |
3765 |
+ tmpP2ssB = VEC_LD(tmpbis + 8); |
3766 |
+ tmpbis += tmpStride; |
3767 |
+ |
3768 |
+ PRAGMA_E2K("ivdep") |
3769 |
+ for (i = 0; i < 16; i++) { |
3770 |
+ vec_s16 tmp0, tmp1; |
3771 |
+ const vec_s16 tmpP3ssA = VEC_LD(tmpbis); |
3772 |
+ const vec_s16 tmpP3ssB = VEC_LD(tmpbis + 8); |
3773 |
+ |
3774 |
+ const vec_s16 sum1A = _mm_adds_epi16(tmpP0ssA, tmpP1ssA); |
3775 |
+ const vec_s16 sum1B = _mm_adds_epi16(tmpP0ssB, tmpP1ssB); |
3776 |
+ const vec_s16 sum2A = _mm_adds_epi16(tmpM1ssA, tmpP2ssA); |
3777 |
+ const vec_s16 sum2B = _mm_adds_epi16(tmpM1ssB, tmpP2ssB); |
3778 |
+ vec_s16 sum3A = _mm_adds_epi16(tmpM2ssA, tmpP3ssA); |
3779 |
+ vec_s16 sum3B = _mm_adds_epi16(tmpM2ssB, tmpP3ssB); |
3780 |
+ |
3781 |
+ tmpbis += tmpStride; |
3782 |
+ |
3783 |
+ tmpM2ssA = tmpM1ssA; |
3784 |
+ tmpM2ssB = tmpM1ssB; |
3785 |
+ tmpM1ssA = tmpP0ssA; |
3786 |
+ tmpM1ssB = tmpP0ssB; |
3787 |
+ tmpP0ssA = tmpP1ssA; |
3788 |
+ tmpP0ssB = tmpP1ssB; |
3789 |
+ tmpP1ssA = tmpP2ssA; |
3790 |
+ tmpP1ssB = tmpP2ssB; |
3791 |
+ tmpP2ssA = tmpP3ssA; |
3792 |
+ tmpP2ssB = tmpP3ssB; |
3793 |
+ |
3794 |
+ tmp0 = _mm_mullo_epi16(sum1A, v20ss); |
3795 |
+ tmp1 = _mm_mulhi_epi16(sum1A, v20ss); |
3796 |
+ pp1Al = _mm_unpacklo_epi16(tmp0, tmp1); |
3797 |
+ pp1Ah = _mm_unpackhi_epi16(tmp0, tmp1); |
3798 |
+ tmp0 = _mm_mullo_epi16(sum1B, v20ss); |
3799 |
+ tmp1 = _mm_mulhi_epi16(sum1B, v20ss); |
3800 |
+ pp1Bl = _mm_unpacklo_epi16(tmp0, tmp1); |
3801 |
+ pp1Bh = _mm_unpackhi_epi16(tmp0, tmp1); |
3802 |
+ |
3803 |
+ pp1Al = _mm_add_epi32(pp1Al, v512si); |
3804 |
+ pp1Ah = _mm_add_epi32(pp1Ah, v512si); |
3805 |
+ pp1Bl = _mm_add_epi32(pp1Bl, v512si); |
3806 |
+ pp1Bh = _mm_add_epi32(pp1Bh, v512si); |
3807 |
+ |
3808 |
+ tmp0 = _mm_mullo_epi16(sum2A, v5ss); |
3809 |
+ tmp1 = _mm_mulhi_epi16(sum2A, v5ss); |
3810 |
+ pp2Al = _mm_unpacklo_epi16(tmp0, tmp1); |
3811 |
+ pp2Ah = _mm_unpackhi_epi16(tmp0, tmp1); |
3812 |
+ tmp0 = _mm_mullo_epi16(sum2B, v5ss); |
3813 |
+ tmp1 = _mm_mulhi_epi16(sum2B, v5ss); |
3814 |
+ pp2Bl = _mm_unpacklo_epi16(tmp0, tmp1); |
3815 |
+ pp2Bh = _mm_unpackhi_epi16(tmp0, tmp1); |
3816 |
+ |
3817 |
+ tmp0 = _mm_srai_epi32(_mm_unpacklo_epi16(sum3A, sum3A), 16); |
3818 |
+ tmp1 = _mm_srai_epi32(_mm_unpackhi_epi16(sum3A, sum3A), 16); |
3819 |
+ pp3Al = _mm_add_epi32(tmp0, pp1Al); |
3820 |
+ pp3Ah = _mm_add_epi32(tmp1, pp1Ah); |
3821 |
+ tmp0 = _mm_srai_epi32(_mm_unpacklo_epi16(sum3B, sum3B), 16); |
3822 |
+ tmp1 = _mm_srai_epi32(_mm_unpackhi_epi16(sum3B, sum3B), 16); |
3823 |
+ pp3Bl = _mm_add_epi32(tmp0, pp1Bl); |
3824 |
+ pp3Bh = _mm_add_epi32(tmp1, pp1Bh); |
3825 |
+ |
3826 |
+ sumAl = _mm_sub_epi32(pp3Al, pp2Al); |
3827 |
+ sumAh = _mm_sub_epi32(pp3Ah, pp2Ah); |
3828 |
+ sumBl = _mm_sub_epi32(pp3Bl, pp2Bl); |
3829 |
+ sumBh = _mm_sub_epi32(pp3Bh, pp2Bh); |
3830 |
+ |
3831 |
+ sumAl = _mm_srai_epi32(sumAl, 10); |
3832 |
+ sumAh = _mm_srai_epi32(sumAh, 10); |
3833 |
+ sumBl = _mm_srai_epi32(sumBl, 10); |
3834 |
+ sumBh = _mm_srai_epi32(sumBh, 10); |
3835 |
+ |
3836 |
+ sumA = _mm_packs_epi32(sumAl, sumAh); |
3837 |
+ sumB = _mm_packs_epi32(sumBl, sumBh); |
3838 |
+ sum = _mm_packus_epi16(sumA, sumB); |
3839 |
+ |
3840 |
+ OP_U8_E2K(fsum, sum, VEC_LD(dst)); |
3841 |
+ VEC_ST(dst, fsum); |
3842 |
+ dst += dstStride; |
3843 |
+ } |
3844 |
+} |
3845 |
+#endif /* PREFIX_h264_qpel16_hv_lowpass_e2k */ |
3846 |
diff --git a/libavcodec/e2k/hevcdsp.c b/libavcodec/e2k/hevcdsp.c |
3847 |
new file mode 100644 |
3848 |
index 0000000..74004d7 |
3849 |
--- /dev/null |
3850 |
+++ b/libavcodec/e2k/hevcdsp.c |
3851 |
@@ -0,0 +1,94 @@ |
3852 |
+/* |
3853 |
+ * SIMD-optimized IDCT functions for HEVC decoding |
3854 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
3855 |
+ * Copyright (c) Alexandra Hajkova |
3856 |
+ * |
3857 |
+ * This file is part of FFmpeg. |
3858 |
+ * |
3859 |
+ * FFmpeg is free software; you can redistribute it and/or |
3860 |
+ * modify it under the terms of the GNU Lesser General Public |
3861 |
+ * License as published by the Free Software Foundation; either |
3862 |
+ * version 2.1 of the License, or (at your option) any later version. |
3863 |
+ * |
3864 |
+ * FFmpeg is distributed in the hope that it will be useful, |
3865 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
3866 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
3867 |
+ * Lesser General Public License for more details. |
3868 |
+ * |
3869 |
+ * You should have received a copy of the GNU Lesser General Public |
3870 |
+ * License along with FFmpeg; if not, write to the Free Software |
3871 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3872 |
+ */ |
3873 |
+ |
3874 |
+#include "config.h" |
3875 |
+ |
3876 |
+#include "libavutil/attributes.h" |
3877 |
+#include "libavutil/cpu.h" |
3878 |
+#include "libavutil/e2k/cpu.h" |
3879 |
+#include "libavutil/e2k/util_e2k.h" |
3880 |
+ |
3881 |
+#include "libavcodec/hevcdsp.h" |
3882 |
+ |
3883 |
+#define transform4x4(shift) { \ |
3884 |
+ vec_s16 src_02, src_13; \ |
3885 |
+ vec_s32 e0, o0, e1, o1, add; \ |
3886 |
+ src_02 = _mm_unpacklo_epi16(src_01, src_23); \ |
3887 |
+ src_13 = _mm_unpackhi_epi16(src_01, src_23); \ |
3888 |
+ e0 = _mm_madd_epi16(src_02, trans0); \ |
3889 |
+ o0 = _mm_madd_epi16(src_13, trans1); \ |
3890 |
+ e1 = _mm_madd_epi16(src_02, trans2); \ |
3891 |
+ o1 = _mm_madd_epi16(src_13, trans3); \ |
3892 |
+ add = _mm_set1_epi32(1 << (shift - 1)); \ |
3893 |
+ e0 = _mm_add_epi32(e0, add); \ |
3894 |
+ e1 = _mm_add_epi32(e1, add); \ |
3895 |
+ res0 = _mm_add_epi32(e0, o0); \ |
3896 |
+ res1 = _mm_add_epi32(e1, o1); \ |
3897 |
+ res2 = _mm_sub_epi32(e1, o1); \ |
3898 |
+ res3 = _mm_sub_epi32(e0, o0); \ |
3899 |
+ res0 = _mm_srai_epi32(res0, shift); \ |
3900 |
+ res1 = _mm_srai_epi32(res1, shift); \ |
3901 |
+ res2 = _mm_srai_epi32(res2, shift); \ |
3902 |
+ res3 = _mm_srai_epi32(res3, shift); \ |
3903 |
+ packed0 = _mm_packs_epi32(res0, res1); \ |
3904 |
+ packed1 = _mm_packs_epi32(res2, res3); \ |
3905 |
+ \ |
3906 |
+ res0 = _mm_unpacklo_epi16(packed0, packed1); \ |
3907 |
+ res1 = _mm_unpackhi_epi16(packed0, packed1); \ |
3908 |
+ src_01 = _mm_unpacklo_epi16(res0, res1); \ |
3909 |
+ src_23 = _mm_unpackhi_epi16(res0, res1); \ |
3910 |
+} |
3911 |
+ |
3912 |
+#define HEVC_IDCT4X4_E2K(depth) \ |
3913 |
+static void ff_hevc_idct_4x4##_##depth##_e2k(int16_t *coeffs, int col_limit) \ |
3914 |
+{ \ |
3915 |
+ const int shift = 7; \ |
3916 |
+ const int shift2 = 20 - depth; \ |
3917 |
+ vec_s16 src_01, src_23; \ |
3918 |
+ vec_s32 res0, res1, res2, res3; \ |
3919 |
+ vec_s16 packed0, packed1; \ |
3920 |
+ vec_s16 trans0 = _mm_set1_epi32(64 | 64 << 16); \ |
3921 |
+ vec_s16 trans1 = _mm_set1_epi32(83 | 36 << 16); \ |
3922 |
+ vec_s16 trans2 = _mm_set1_epi32(64 | -64 << 16); \ |
3923 |
+ vec_s16 trans3 = _mm_set1_epi32(36 | -83 << 16); \ |
3924 |
+ \ |
3925 |
+ src_01 = VEC_LD(coeffs); \ |
3926 |
+ src_23 = VEC_LD(coeffs + 8); \ |
3927 |
+ transform4x4(shift); \ |
3928 |
+ transform4x4(shift2); \ |
3929 |
+ VEC_ST(coeffs, src_01); \ |
3930 |
+ VEC_ST(coeffs + 8, src_23); \ |
3931 |
+} |
3932 |
+ |
3933 |
+HEVC_IDCT4X4_E2K(8) |
3934 |
+HEVC_IDCT4X4_E2K(10) |
3935 |
+ |
3936 |
+av_cold void ff_hevc_dsp_init_e2k(HEVCDSPContext *c, const int bit_depth) |
3937 |
+{ |
3938 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
3939 |
+ return; |
3940 |
+ |
3941 |
+ if (bit_depth == 8) |
3942 |
+ c->idct[0] = ff_hevc_idct_4x4_8_e2k; |
3943 |
+ if (bit_depth == 10) |
3944 |
+ c->idct[0] = ff_hevc_idct_4x4_10_e2k; |
3945 |
+} |
3946 |
diff --git a/libavcodec/e2k/hpeldsp.c b/libavcodec/e2k/hpeldsp.c |
3947 |
new file mode 100644 |
3948 |
index 0000000..9ff59bb |
3949 |
--- /dev/null |
3950 |
+++ b/libavcodec/e2k/hpeldsp.c |
3951 |
@@ -0,0 +1,302 @@ |
3952 |
+/* |
3953 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
3954 |
+ * Copyright (c) 2002 Brian Foley |
3955 |
+ * Copyright (c) 2002 Dieter Shirley |
3956 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
3957 |
+ * |
3958 |
+ * This file is part of FFmpeg. |
3959 |
+ * |
3960 |
+ * FFmpeg is free software; you can redistribute it and/or |
3961 |
+ * modify it under the terms of the GNU Lesser General Public |
3962 |
+ * License as published by the Free Software Foundation; either |
3963 |
+ * version 2.1 of the License, or (at your option) any later version. |
3964 |
+ * |
3965 |
+ * FFmpeg is distributed in the hope that it will be useful, |
3966 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
3967 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
3968 |
+ * Lesser General Public License for more details. |
3969 |
+ * |
3970 |
+ * You should have received a copy of the GNU Lesser General Public |
3971 |
+ * License along with FFmpeg; if not, write to the Free Software |
3972 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
3973 |
+ */ |
3974 |
+ |
3975 |
+#include "config.h" |
3976 |
+ |
3977 |
+#include "libavutil/attributes.h" |
3978 |
+#include "libavutil/cpu.h" |
3979 |
+#include "libavutil/e2k/cpu.h" |
3980 |
+#include "libavutil/e2k/util_e2k.h" |
3981 |
+ |
3982 |
+#include "libavcodec/hpeldsp.h" |
3983 |
+ |
3984 |
+#include "hpeldsp.h" |
3985 |
+ |
3986 |
+/* next one assumes that ((line_size % 16) == 0) */ |
3987 |
+void ff_put_pixels16_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
3988 |
+{ |
3989 |
+ vec_u8 v0, v1, v2, v3; |
3990 |
+ int i; |
3991 |
+ |
3992 |
+ PRAGMA_E2K("ivdep") |
3993 |
+ for (i = 0; i < h; i += 4) { |
3994 |
+ v0 = VEC_LD(pixels); |
3995 |
+ v1 = VEC_LD(pixels + line_size); |
3996 |
+ v2 = VEC_LD(pixels + line_size * 2); |
3997 |
+ v3 = VEC_LD(pixels + line_size * 3); |
3998 |
+ VEC_ST(block, v0); |
3999 |
+ VEC_ST(block + line_size, v1); |
4000 |
+ VEC_ST(block + line_size * 2, v2); |
4001 |
+ VEC_ST(block + line_size * 3, v3); |
4002 |
+ pixels += line_size * 4; |
4003 |
+ block += line_size * 4; |
4004 |
+ } |
4005 |
+} |
4006 |
+ |
4007 |
+/* next one assumes that ((line_size % 16) == 0) */ |
4008 |
+#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) |
4009 |
+void ff_avg_pixels16_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
4010 |
+{ |
4011 |
+ vec_u8 pixelsv, blockv; |
4012 |
+ int i; |
4013 |
+ |
4014 |
+ PRAGMA_E2K("ivdep") |
4015 |
+ for (i = 0; i < h; i++) { |
4016 |
+ blockv = VEC_LD(block); |
4017 |
+ pixelsv = VEC_LD(pixels); |
4018 |
+ blockv = _mm_avg_epu8(blockv, pixelsv); |
4019 |
+ VEC_ST(block, blockv); |
4020 |
+ pixels += line_size; |
4021 |
+ block += line_size; |
4022 |
+ } |
4023 |
+} |
4024 |
+ |
4025 |
+/* next one assumes that ((line_size % 8) == 0) */ |
4026 |
+static void avg_pixels8_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) |
4027 |
+{ |
4028 |
+ __m64 pixelsv, blockv; |
4029 |
+ int i; |
4030 |
+ |
4031 |
+ PRAGMA_E2K("ivdep") |
4032 |
+ for (i = 0; i < h; i++) { |
4033 |
+ blockv = *(__m64*)block; |
4034 |
+ pixelsv = *(__m64*)pixels; |
4035 |
+ blockv = _mm_avg_pu8(blockv, pixelsv); |
4036 |
+ *(__m64*)block = blockv; |
4037 |
+ pixels += line_size; |
4038 |
+ block += line_size; |
4039 |
+ } |
4040 |
+} |
4041 |
+ |
4042 |
+/* next one assumes that ((line_size % 8) == 0) */ |
4043 |
+static void put_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
4044 |
+{ |
4045 |
+ int i; |
4046 |
+ vec_u8 pixelsv1, pixelsv2, blockv; |
4047 |
+ vec_u16 pixelssum1, pixelssum2, temp3; |
4048 |
+ LOAD_ZERO; |
4049 |
+ const vec_u16 vctwo = _mm_set1_epi16(2); |
4050 |
+ |
4051 |
+ pixelsv1 = VEC_LD8(pixels); |
4052 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4053 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4054 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4055 |
+ pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2); |
4056 |
+ pixelssum1 = _mm_add_epi16(pixelssum1, vctwo); |
4057 |
+ |
4058 |
+ PRAGMA_E2K("ivdep") |
4059 |
+ for (i = 0; i < h; i++) { |
4060 |
+ pixels += line_size; |
4061 |
+ blockv = VEC_LD8(block); |
4062 |
+ pixelsv1 = VEC_LD8(pixels); |
4063 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4064 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4065 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4066 |
+ pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2); |
4067 |
+ temp3 = _mm_add_epi16(pixelssum1, pixelssum2); |
4068 |
+ temp3 = _mm_srai_epi16(temp3, 2); |
4069 |
+ pixelssum1 = _mm_add_epi16(pixelssum2, vctwo); |
4070 |
+ |
4071 |
+ blockv = _mm_packus_epi16(temp3, temp3); |
4072 |
+ VEC_STL(block, blockv); |
4073 |
+ block += line_size; |
4074 |
+ } |
4075 |
+} |
4076 |
+ |
4077 |
+/* next one assumes that ((line_size % 8) == 0) */ |
4078 |
+static void put_no_rnd_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
4079 |
+{ |
4080 |
+ int i; |
4081 |
+ vec_u8 pixelsv1, pixelsv2, blockv; |
4082 |
+ vec_u16 pixelssum1, pixelssum2, temp3; |
4083 |
+ LOAD_ZERO; |
4084 |
+ const vec_u16 vcone = _mm_set1_epi16(1); |
4085 |
+ |
4086 |
+ pixelsv1 = VEC_LD8(pixels); |
4087 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4088 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4089 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4090 |
+ pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2); |
4091 |
+ pixelssum1 = _mm_add_epi16(pixelssum1, vcone); |
4092 |
+ |
4093 |
+ PRAGMA_E2K("ivdep") |
4094 |
+ for (i = 0; i < h; i++) { |
4095 |
+ pixels += line_size; |
4096 |
+ blockv = VEC_LD8(block); |
4097 |
+ pixelsv1 = VEC_LD8(pixels); |
4098 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4099 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4100 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4101 |
+ pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2); |
4102 |
+ temp3 = _mm_add_epi16(pixelssum1, pixelssum2); |
4103 |
+ temp3 = _mm_srai_epi16(temp3, 2); |
4104 |
+ pixelssum1 = _mm_add_epi16(pixelssum2, vcone); |
4105 |
+ |
4106 |
+ blockv = _mm_packus_epi16(temp3, temp3); |
4107 |
+ VEC_STL(block, blockv); |
4108 |
+ block += line_size; |
4109 |
+ } |
4110 |
+} |
4111 |
+ |
4112 |
+/* next one assumes that ((line_size % 16) == 0) */ |
4113 |
+static void put_pixels16_xy2_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) |
4114 |
+{ |
4115 |
+ int i; |
4116 |
+ vec_u8 pixelsv1, pixelsv2, pixelsv3, pixelsv4, blockv; |
4117 |
+ vec_u16 temp3, temp4, pixelssum1, pixelssum2, pixelssum3, pixelssum4; |
4118 |
+ LOAD_ZERO; |
4119 |
+ const vec_u16 vctwo = _mm_set1_epi16(2); |
4120 |
+ |
4121 |
+ pixelsv1 = VEC_LD(pixels); |
4122 |
+ pixelsv2 = VEC_LD(pixels + 1); |
4123 |
+ pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov); |
4124 |
+ pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov); |
4125 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4126 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4127 |
+ pixelssum3 = _mm_add_epi16(pixelsv3, pixelsv4); |
4128 |
+ pixelssum3 = _mm_add_epi16(pixelssum3, vctwo); |
4129 |
+ pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2); |
4130 |
+ pixelssum1 = _mm_add_epi16(pixelssum1, vctwo); |
4131 |
+ |
4132 |
+ PRAGMA_E2K("ivdep") |
4133 |
+ for (i = 0; i < h; i++) { |
4134 |
+ pixels += line_size; |
4135 |
+ blockv = VEC_LD(block); |
4136 |
+ pixelsv1 = VEC_LD(pixels); |
4137 |
+ pixelsv2 = VEC_LD(pixels + 1); |
4138 |
+ pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov); |
4139 |
+ pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov); |
4140 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4141 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4142 |
+ pixelssum4 = _mm_add_epi16(pixelsv3, pixelsv4); |
4143 |
+ pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2); |
4144 |
+ temp4 = _mm_add_epi16(pixelssum3, pixelssum4); |
4145 |
+ temp4 = _mm_srai_epi16(temp4, 2); |
4146 |
+ temp3 = _mm_add_epi16(pixelssum1, pixelssum2); |
4147 |
+ temp3 = _mm_srai_epi16(temp3, 2); |
4148 |
+ pixelssum3 = _mm_add_epi16(pixelssum4, vctwo); |
4149 |
+ pixelssum1 = _mm_add_epi16(pixelssum2, vctwo); |
4150 |
+ blockv = _mm_packus_epi16(temp3, temp4); |
4151 |
+ VEC_ST(block, blockv); |
4152 |
+ block += line_size; |
4153 |
+ } |
4154 |
+} |
4155 |
+ |
4156 |
+/* next one assumes that ((line_size % 16) == 0) */ |
4157 |
+static void put_no_rnd_pixels16_xy2_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h) |
4158 |
+{ |
4159 |
+ int i; |
4160 |
+ vec_u8 pixelsv1, pixelsv2, pixelsv3, pixelsv4, blockv; |
4161 |
+ vec_u16 temp3, temp4, pixelssum1, pixelssum2, pixelssum3, pixelssum4; |
4162 |
+ LOAD_ZERO; |
4163 |
+ const vec_u16 vcone = _mm_set1_epi16(1); |
4164 |
+ |
4165 |
+ pixelsv1 = VEC_LD(pixels); |
4166 |
+ pixelsv2 = VEC_LD(pixels + 1); |
4167 |
+ pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov); |
4168 |
+ pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov); |
4169 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4170 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4171 |
+ pixelssum3 = _mm_add_epi16(pixelsv3, pixelsv4); |
4172 |
+ pixelssum3 = _mm_add_epi16(pixelssum3, vcone); |
4173 |
+ pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2); |
4174 |
+ pixelssum1 = _mm_add_epi16(pixelssum1, vcone); |
4175 |
+ |
4176 |
+ PRAGMA_E2K("ivdep") |
4177 |
+ for (i = 0; i < h; i++) { |
4178 |
+ pixels += line_size; |
4179 |
+ blockv = VEC_LD(block); |
4180 |
+ pixelsv1 = VEC_LD(pixels); |
4181 |
+ pixelsv2 = VEC_LD(pixels + 1); |
4182 |
+ pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov); |
4183 |
+ pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov); |
4184 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4185 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4186 |
+ pixelssum4 = _mm_add_epi16(pixelsv3, pixelsv4); |
4187 |
+ pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2); |
4188 |
+ temp4 = _mm_add_epi16(pixelssum3, pixelssum4); |
4189 |
+ temp4 = _mm_srai_epi16(temp4, 2); |
4190 |
+ temp3 = _mm_add_epi16(pixelssum1, pixelssum2); |
4191 |
+ temp3 = _mm_srai_epi16(temp3, 2); |
4192 |
+ pixelssum3 = _mm_add_epi16(pixelssum4, vcone); |
4193 |
+ pixelssum1 = _mm_add_epi16(pixelssum2, vcone); |
4194 |
+ blockv = _mm_packus_epi16(temp3, temp4); |
4195 |
+ VEC_ST(block, blockv); |
4196 |
+ block += line_size; |
4197 |
+ } |
4198 |
+} |
4199 |
+ |
4200 |
+/* next one assumes that ((line_size % 8) == 0) */ |
4201 |
+static void avg_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
4202 |
+{ |
4203 |
+ int i; |
4204 |
+ vec_u8 pixelsv1, pixelsv2, blockv, blocktemp; |
4205 |
+ vec_u16 pixelssum1, pixelssum2, temp3; |
4206 |
+ LOAD_ZERO; |
4207 |
+ const vec_u16 vctwo = _mm_set1_epi16(2); |
4208 |
+ |
4209 |
+ pixelsv1 = VEC_LD8(pixels); |
4210 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4211 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4212 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4213 |
+ pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2); |
4214 |
+ pixelssum1 = _mm_add_epi16(pixelssum1, vctwo); |
4215 |
+ |
4216 |
+ PRAGMA_E2K("ivdep") |
4217 |
+ for (i = 0; i < h; i++) { |
4218 |
+ pixels += line_size; |
4219 |
+ blockv = VEC_LD8(block); |
4220 |
+ pixelsv1 = VEC_LD8(pixels); |
4221 |
+ pixelsv2 = VEC_LD8(pixels + 1); |
4222 |
+ pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov); |
4223 |
+ pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov); |
4224 |
+ pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2); |
4225 |
+ temp3 = _mm_add_epi16(pixelssum1, pixelssum2); |
4226 |
+ temp3 = _mm_srai_epi16(temp3, 2); |
4227 |
+ pixelssum1 = _mm_add_epi16(pixelssum2, vctwo); |
4228 |
+ blocktemp = _mm_packus_epi16(temp3, temp3); |
4229 |
+ blockv = _mm_avg_epu8(blocktemp, blockv); |
4230 |
+ VEC_STL(block, blockv); |
4231 |
+ block += line_size; |
4232 |
+ } |
4233 |
+} |
4234 |
+ |
4235 |
+av_cold void ff_hpeldsp_init_e2k(HpelDSPContext *c, int flags) |
4236 |
+{ |
4237 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
4238 |
+ return; |
4239 |
+ |
4240 |
+ // !checkasm |
4241 |
+ |
4242 |
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_e2k; |
4243 |
+ c->avg_pixels_tab[1][0] = avg_pixels8_e2k; |
4244 |
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_e2k; // fate vsynth1-mpeg2-422 |
4245 |
+ |
4246 |
+ c->put_pixels_tab[0][0] = ff_put_pixels16_e2k; |
4247 |
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_e2k; |
4248 |
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_e2k; |
4249 |
+ |
4250 |
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_e2k; |
4251 |
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_e2k; |
4252 |
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_e2k; |
4253 |
+} |
4254 |
diff --git a/libavcodec/e2k/hpeldsp.h b/libavcodec/e2k/hpeldsp.h |
4255 |
new file mode 100644 |
4256 |
index 0000000..0ade264 |
4257 |
--- /dev/null |
4258 |
+++ b/libavcodec/e2k/hpeldsp.h |
4259 |
@@ -0,0 +1,30 @@ |
4260 |
+/* |
4261 |
+ * This file is part of FFmpeg. |
4262 |
+ * |
4263 |
+ * FFmpeg is free software; you can redistribute it and/or |
4264 |
+ * modify it under the terms of the GNU Lesser General Public |
4265 |
+ * License as published by the Free Software Foundation; either |
4266 |
+ * version 2.1 of the License, or (at your option) any later version. |
4267 |
+ * |
4268 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4269 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4270 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4271 |
+ * Lesser General Public License for more details. |
4272 |
+ * |
4273 |
+ * You should have received a copy of the GNU Lesser General Public |
4274 |
+ * License along with FFmpeg; if not, write to the Free Software |
4275 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4276 |
+ */ |
4277 |
+ |
4278 |
+#ifndef AVCODEC_E2K_HPELDSP_H |
4279 |
+#define AVCODEC_E2K_HPELDSP_H |
4280 |
+ |
4281 |
+#include <stddef.h> |
4282 |
+#include <stdint.h> |
4283 |
+ |
4284 |
+void ff_avg_pixels16_e2k(uint8_t *block, const uint8_t *pixels, |
4285 |
+ ptrdiff_t line_size, int h); |
4286 |
+void ff_put_pixels16_e2k(uint8_t *block, const uint8_t *pixels, |
4287 |
+ ptrdiff_t line_size, int h); |
4288 |
+ |
4289 |
+#endif /* AVCODEC_E2K_HPELDSP_H */ |
4290 |
diff --git a/libavcodec/e2k/idctdsp.c b/libavcodec/e2k/idctdsp.c |
4291 |
new file mode 100644 |
4292 |
index 0000000..db9d2ca |
4293 |
--- /dev/null |
4294 |
+++ b/libavcodec/e2k/idctdsp.c |
4295 |
@@ -0,0 +1,237 @@ |
4296 |
+/* |
4297 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
4298 |
+ * Copyright (c) 2001 Michel Lespinasse |
4299 |
+ * |
4300 |
+ * This file is part of FFmpeg. |
4301 |
+ * |
4302 |
+ * FFmpeg is free software; you can redistribute it and/or |
4303 |
+ * modify it under the terms of the GNU Lesser General Public |
4304 |
+ * License as published by the Free Software Foundation; either |
4305 |
+ * version 2.1 of the License, or (at your option) any later version. |
4306 |
+ * |
4307 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4308 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4309 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4310 |
+ * Lesser General Public License for more details. |
4311 |
+ * |
4312 |
+ * You should have received a copy of the GNU Lesser General Public |
4313 |
+ * License along with FFmpeg; if not, write to the Free Software |
4314 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4315 |
+ */ |
4316 |
+ |
4317 |
+/* NOTE: This code is based on GPL code from the libmpeg2 project. The |
4318 |
+ * author, Michel Lespinasses, has given explicit permission to release |
4319 |
+ * under LGPL as part of FFmpeg. |
4320 |
+ * |
4321 |
+ * FFmpeg integration by Dieter Shirley |
4322 |
+ * |
4323 |
+ * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 |
4324 |
+ * project. I've deleted all of the libmpeg2-specific code, renamed the |
4325 |
+ * functions and reordered the function parameters. The only change to the |
4326 |
+ * IDCT function itself was to factor out the partial transposition, and to |
4327 |
+ * perform a full transpose at the end of the function. */ |
4328 |
+ |
4329 |
+#include "config.h" |
4330 |
+ |
4331 |
+#include <stdlib.h> |
4332 |
+#include <string.h> |
4333 |
+ |
4334 |
+#include "libavutil/attributes.h" |
4335 |
+#include "libavutil/cpu.h" |
4336 |
+#include "libavutil/e2k/cpu.h" |
4337 |
+#include "libavutil/e2k/util_e2k.h" |
4338 |
+ |
4339 |
+#include "libavcodec/idctdsp.h" |
4340 |
+ |
4341 |
+#include "dctdsp.h" |
4342 |
+ |
4343 |
+#define IDCT_HALF \ |
4344 |
+ /* 1st stage */ \ |
4345 |
+ t1 = _mm_adds_epi16(_mm_mulhrs_epi16(a1, vx7), vx1); \ |
4346 |
+ t8 = _mm_adds_epi16(_mm_mulhrs_epi16(a1, vx1), \ |
4347 |
+ _mm_subs_epi16(zero, vx7)); \ |
4348 |
+ t7 = _mm_adds_epi16(_mm_mulhrs_epi16(a2, vx5), vx3); \ |
4349 |
+ t3 = _mm_adds_epi16(_mm_mulhrs_epi16(ma2, vx3), vx5); \ |
4350 |
+ \ |
4351 |
+ /* 2nd stage */ \ |
4352 |
+ t5 = _mm_adds_epi16(vx0, vx4); \ |
4353 |
+ t0 = _mm_subs_epi16(vx0, vx4); \ |
4354 |
+ t2 = _mm_adds_epi16(_mm_mulhrs_epi16(a0, vx6), vx2); \ |
4355 |
+ t4 = _mm_adds_epi16(_mm_mulhrs_epi16(a0, vx2), \ |
4356 |
+ _mm_subs_epi16(zero, vx6)); \ |
4357 |
+ t6 = _mm_adds_epi16(t8, t3); \ |
4358 |
+ t3 = _mm_subs_epi16(t8, t3); \ |
4359 |
+ t8 = _mm_subs_epi16(t1, t7); \ |
4360 |
+ t1 = _mm_adds_epi16(t1, t7); \ |
4361 |
+ \ |
4362 |
+ /* 3rd stage */ \ |
4363 |
+ t7 = _mm_adds_epi16(t5, t2); \ |
4364 |
+ t2 = _mm_subs_epi16(t5, t2); \ |
4365 |
+ t5 = _mm_adds_epi16(t0, t4); \ |
4366 |
+ t0 = _mm_subs_epi16(t0, t4); \ |
4367 |
+ t4 = _mm_subs_epi16(t8, t3); \ |
4368 |
+ t3 = _mm_adds_epi16(t8, t3); \ |
4369 |
+ \ |
4370 |
+ /* 4th stage */ \ |
4371 |
+ vy0 = _mm_adds_epi16(t7, t1); \ |
4372 |
+ vy7 = _mm_subs_epi16(t7, t1); \ |
4373 |
+ vy1 = _mm_adds_epi16(_mm_mulhrs_epi16(c4, t3), t5); \ |
4374 |
+ vy6 = _mm_adds_epi16(_mm_mulhrs_epi16(mc4, t3), t5); \ |
4375 |
+ vy2 = _mm_adds_epi16(_mm_mulhrs_epi16(c4, t4), t0); \ |
4376 |
+ vy5 = _mm_adds_epi16(_mm_mulhrs_epi16(mc4, t4), t0); \ |
4377 |
+ vy3 = _mm_adds_epi16(t2, t6); \ |
4378 |
+ vy4 = _mm_subs_epi16(t2, t6) |
4379 |
+ |
4380 |
+#define IDCT \ |
4381 |
+ vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ |
4382 |
+ vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ |
4383 |
+ vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ |
4384 |
+ \ |
4385 |
+ vec_s16 c4 = _mm_set1_epi16(23170); \ |
4386 |
+ vec_s16 a0 = _mm_set1_epi16(13573); \ |
4387 |
+ vec_s16 a1 = _mm_set1_epi16(6518); \ |
4388 |
+ vec_s16 a2 = _mm_set1_epi16(21895); \ |
4389 |
+ vec_s16 mc4 = _mm_set1_epi16(-23170); \ |
4390 |
+ vec_s16 ma2 = _mm_set1_epi16(-21895); \ |
4391 |
+ vec_s16 bias = _mm_set1_epi32(32 | 31 << 16); \ |
4392 |
+ \ |
4393 |
+ vec_s16 zero = _mm_setzero_si128(); \ |
4394 |
+ \ |
4395 |
+ t0 = VEC_LD(constants[0]); \ |
4396 |
+ t1 = VEC_LD(constants[1]); \ |
4397 |
+ t2 = VEC_LD(constants[2]); \ |
4398 |
+ t3 = VEC_LD(constants[3]); \ |
4399 |
+ \ |
4400 |
+ vx0 = _mm_mulhrs_epi16(_mm_slli_epi16(block[0], 4), t0); \ |
4401 |
+ vx1 = _mm_mulhrs_epi16(_mm_slli_epi16(block[1], 4), t1); \ |
4402 |
+ vx2 = _mm_mulhrs_epi16(_mm_slli_epi16(block[2], 4), t2); \ |
4403 |
+ vx3 = _mm_mulhrs_epi16(_mm_slli_epi16(block[3], 4), t3); \ |
4404 |
+ vx4 = _mm_mulhrs_epi16(_mm_slli_epi16(block[4], 4), t0); \ |
4405 |
+ vx5 = _mm_mulhrs_epi16(_mm_slli_epi16(block[5], 4), t3); \ |
4406 |
+ vx6 = _mm_mulhrs_epi16(_mm_slli_epi16(block[6], 4), t2); \ |
4407 |
+ vx7 = _mm_mulhrs_epi16(_mm_slli_epi16(block[7], 4), t1); \ |
4408 |
+ \ |
4409 |
+ IDCT_HALF; \ |
4410 |
+ \ |
4411 |
+ vx0 = _mm_unpacklo_epi16(vy0, vy4); \ |
4412 |
+ vx1 = _mm_unpackhi_epi16(vy0, vy4); \ |
4413 |
+ vx2 = _mm_unpacklo_epi16(vy1, vy5); \ |
4414 |
+ vx3 = _mm_unpackhi_epi16(vy1, vy5); \ |
4415 |
+ vx4 = _mm_unpacklo_epi16(vy2, vy6); \ |
4416 |
+ vx5 = _mm_unpackhi_epi16(vy2, vy6); \ |
4417 |
+ vx6 = _mm_unpacklo_epi16(vy3, vy7); \ |
4418 |
+ vx7 = _mm_unpackhi_epi16(vy3, vy7); \ |
4419 |
+ \ |
4420 |
+ vy0 = _mm_unpacklo_epi16(vx0, vx4); \ |
4421 |
+ vy1 = _mm_unpackhi_epi16(vx0, vx4); \ |
4422 |
+ vy2 = _mm_unpacklo_epi16(vx1, vx5); \ |
4423 |
+ vy3 = _mm_unpackhi_epi16(vx1, vx5); \ |
4424 |
+ vy4 = _mm_unpacklo_epi16(vx2, vx6); \ |
4425 |
+ vy5 = _mm_unpackhi_epi16(vx2, vx6); \ |
4426 |
+ vy6 = _mm_unpacklo_epi16(vx3, vx7); \ |
4427 |
+ vy7 = _mm_unpackhi_epi16(vx3, vx7); \ |
4428 |
+ \ |
4429 |
+ vx0 = _mm_adds_epi16(_mm_unpacklo_epi16(vy0, vy4), bias); \ |
4430 |
+ vx1 = _mm_unpackhi_epi16(vy0, vy4); \ |
4431 |
+ vx2 = _mm_unpacklo_epi16(vy1, vy5); \ |
4432 |
+ vx3 = _mm_unpackhi_epi16(vy1, vy5); \ |
4433 |
+ vx4 = _mm_unpacklo_epi16(vy2, vy6); \ |
4434 |
+ vx5 = _mm_unpackhi_epi16(vy2, vy6); \ |
4435 |
+ vx6 = _mm_unpacklo_epi16(vy3, vy7); \ |
4436 |
+ vx7 = _mm_unpackhi_epi16(vy3, vy7); \ |
4437 |
+ \ |
4438 |
+ IDCT_HALF; \ |
4439 |
+ \ |
4440 |
+ vx0 = _mm_srai_epi16(vy0, 6); \ |
4441 |
+ vx1 = _mm_srai_epi16(vy1, 6); \ |
4442 |
+ vx2 = _mm_srai_epi16(vy2, 6); \ |
4443 |
+ vx3 = _mm_srai_epi16(vy3, 6); \ |
4444 |
+ vx4 = _mm_srai_epi16(vy4, 6); \ |
4445 |
+ vx5 = _mm_srai_epi16(vy5, 6); \ |
4446 |
+ vx6 = _mm_srai_epi16(vy6, 6); \ |
4447 |
+ vx7 = _mm_srai_epi16(vy7, 6) |
4448 |
+ |
4449 |
+static const int16_t ALIGNED(16) constants[4][8] = { |
4450 |
+ { 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 }, |
4451 |
+ { 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 }, |
4452 |
+ { 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 }, |
4453 |
+ { 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 } |
4454 |
+}; |
4455 |
+ |
4456 |
+void ff_idct_e2k(int16_t *blk) |
4457 |
+{ |
4458 |
+ vec_s16 *block = (vec_s16*)blk; |
4459 |
+ |
4460 |
+ IDCT; |
4461 |
+ |
4462 |
+ block[0] = vx0; |
4463 |
+ block[1] = vx1; |
4464 |
+ block[2] = vx2; |
4465 |
+ block[3] = vx3; |
4466 |
+ block[4] = vx4; |
4467 |
+ block[5] = vx5; |
4468 |
+ block[6] = vx6; |
4469 |
+ block[7] = vx7; |
4470 |
+} |
4471 |
+ |
4472 |
+#define COPY(vx0, vx1, i) \ |
4473 |
+ tmp = _mm_packus_epi16(vx0, vx1); \ |
4474 |
+ VEC_STL(dest, tmp); dest += stride; \ |
4475 |
+ VEC_STH(dest, tmp); dest += stride |
4476 |
+ |
4477 |
+static void idct_put_e2k(uint8_t *dest, ptrdiff_t stride, int16_t *blk) |
4478 |
+{ |
4479 |
+ vec_s16 *block = (vec_s16*)blk; |
4480 |
+ vec_u8 tmp; |
4481 |
+ |
4482 |
+ IDCT; |
4483 |
+ |
4484 |
+ COPY(vx0, vx1, 0); |
4485 |
+ COPY(vx2, vx3, 2); |
4486 |
+ COPY(vx4, vx5, 4); |
4487 |
+ COPY(vx6, vx7, 6); |
4488 |
+} |
4489 |
+ |
4490 |
+#define ADD(vx0, vx1, i) \ |
4491 |
+ tmp = VEC_LD8(dest); \ |
4492 |
+ t0 = _mm_unpacklo_epi8(tmp, zero); \ |
4493 |
+ tmp = VEC_LD8(dest + stride); \ |
4494 |
+ t1 = _mm_unpacklo_epi8(tmp, zero); \ |
4495 |
+ t0 = _mm_adds_epi16(t0, vx0); \ |
4496 |
+ t1 = _mm_adds_epi16(t1, vx1); \ |
4497 |
+ tmp = _mm_packus_epi16(t0, t1); \ |
4498 |
+ VEC_STL(dest, tmp); dest += stride; \ |
4499 |
+ VEC_STH(dest, tmp); dest += stride |
4500 |
+ |
4501 |
+static void idct_add_e2k(uint8_t *dest, ptrdiff_t stride, int16_t *blk) |
4502 |
+{ |
4503 |
+ vec_s16 *block = (vec_s16*)blk; |
4504 |
+ vec_u8 tmp; |
4505 |
+ |
4506 |
+ IDCT; |
4507 |
+ |
4508 |
+ ADD(vx0, vx1, 0); |
4509 |
+ ADD(vx2, vx3, 2); |
4510 |
+ ADD(vx4, vx5, 4); |
4511 |
+ ADD(vx6, vx7, 6); |
4512 |
+} |
4513 |
+ |
4514 |
+av_cold void ff_idctdsp_init_e2k(IDCTDSPContext *c, AVCodecContext *avctx, |
4515 |
+ unsigned high_bit_depth) |
4516 |
+{ |
4517 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
4518 |
+ return; |
4519 |
+ |
4520 |
+ // !checkasm |
4521 |
+ // libavcodec/tests/dct -i |
4522 |
+ |
4523 |
+ if (!high_bit_depth && avctx->lowres == 0) { |
4524 |
+ if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || |
4525 |
+ (avctx->idct_algo == FF_IDCT_ALTIVEC)) { |
4526 |
+ c->idct = ff_idct_e2k; |
4527 |
+ c->idct_add = idct_add_e2k; // untested |
4528 |
+ c->idct_put = idct_put_e2k; // untested |
4529 |
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE; |
4530 |
+ } |
4531 |
+ } |
4532 |
+} |
4533 |
diff --git a/libavcodec/e2k/lossless_audiodsp.c b/libavcodec/e2k/lossless_audiodsp.c |
4534 |
new file mode 100644 |
4535 |
index 0000000..1bb7c45 |
4536 |
--- /dev/null |
4537 |
+++ b/libavcodec/e2k/lossless_audiodsp.c |
4538 |
@@ -0,0 +1,75 @@ |
4539 |
+/* |
4540 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
4541 |
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> |
4542 |
+ * |
4543 |
+ * This file is part of FFmpeg. |
4544 |
+ * |
4545 |
+ * FFmpeg is free software; you can redistribute it and/or |
4546 |
+ * modify it under the terms of the GNU Lesser General Public |
4547 |
+ * License as published by the Free Software Foundation; either |
4548 |
+ * version 2.1 of the License, or (at your option) any later version. |
4549 |
+ * |
4550 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4551 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4552 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4553 |
+ * Lesser General Public License for more details. |
4554 |
+ * |
4555 |
+ * You should have received a copy of the GNU Lesser General Public |
4556 |
+ * License along with FFmpeg; if not, write to the Free Software |
4557 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4558 |
+ */ |
4559 |
+ |
4560 |
+#include "config.h" |
4561 |
+ |
4562 |
+#include "libavutil/attributes.h" |
4563 |
+#include "libavutil/cpu.h" |
4564 |
+#include "libavutil/e2k/cpu.h" |
4565 |
+#include "libavutil/e2k/util_e2k.h" |
4566 |
+ |
4567 |
+#include "libavcodec/lossless_audiodsp.h" |
4568 |
+ |
4569 |
+#define GET_T(tt0, tt1, src, a, b) { \ |
4570 |
+ tt0 = VEC_LD(src); \ |
4571 |
+ tt1 = VEC_LD(src + 8); \ |
4572 |
+} |
4573 |
+ |
4574 |
+static int32_t scalarproduct_and_madd_int16_e2k(int16_t *v1, |
4575 |
+ const int16_t *v2, |
4576 |
+ const int16_t *v3, |
4577 |
+ int order, int mul) |
4578 |
+{ |
4579 |
+ int i; |
4580 |
+ LOAD_ZERO; |
4581 |
+ vec_s16 *pv1 = (vec_s16*)v1; |
4582 |
+ vec_s16 muls = _mm_set1_epi16(mul); |
4583 |
+ vec_s16 t0, t1, i0, i1; |
4584 |
+ vec_s32 res = zerov; |
4585 |
+ |
4586 |
+ PRAGMA_E2K("ivdep") |
4587 |
+ for (i = 0; i < order; i += 16) { |
4588 |
+ GET_T(t0, t1, v2, i1, i2); |
4589 |
+ i0 = pv1[0]; |
4590 |
+ i1 = pv1[1]; |
4591 |
+ t0 = _mm_madd_epi16(t0, i0); |
4592 |
+ t1 = _mm_madd_epi16(t1, i1); |
4593 |
+ res = _mm_add_epi32(res, _mm_add_epi32(t0, t1)); |
4594 |
+ GET_T(t0, t1, v3, i4, i3); |
4595 |
+ pv1[0] = _mm_add_epi16(_mm_mullo_epi16(t0, muls), i0); |
4596 |
+ pv1[1] = _mm_add_epi16(_mm_mullo_epi16(t1, muls), i1); |
4597 |
+ pv1 += 2; |
4598 |
+ v2 += 16; |
4599 |
+ v3 += 16; |
4600 |
+ } |
4601 |
+ |
4602 |
+ res = _mm_hadd_epi32(res, res); |
4603 |
+ return _mm_extract_epi32(res, 0) + _mm_extract_epi32(res, 1); |
4604 |
+} |
4605 |
+ |
4606 |
+av_cold void ff_llauddsp_init_e2k(LLAudDSPContext *c) |
4607 |
+{ |
4608 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
4609 |
+ return; |
4610 |
+ |
4611 |
+ // !checkasm |
4612 |
+ c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_e2k; |
4613 |
+} |
4614 |
diff --git a/libavcodec/e2k/lossless_videodsp.c b/libavcodec/e2k/lossless_videodsp.c |
4615 |
new file mode 100644 |
4616 |
index 0000000..a055ac7 |
4617 |
--- /dev/null |
4618 |
+++ b/libavcodec/e2k/lossless_videodsp.c |
4619 |
@@ -0,0 +1,59 @@ |
4620 |
+/* |
4621 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
4622 |
+ * Copyright (c) 2002 Brian Foley |
4623 |
+ * Copyright (c) 2002 Dieter Shirley |
4624 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
4625 |
+ * |
4626 |
+ * This file is part of FFmpeg. |
4627 |
+ * |
4628 |
+ * FFmpeg is free software; you can redistribute it and/or |
4629 |
+ * modify it under the terms of the GNU Lesser General Public |
4630 |
+ * License as published by the Free Software Foundation; either |
4631 |
+ * version 2.1 of the License, or (at your option) any later version. |
4632 |
+ * |
4633 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4634 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4635 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4636 |
+ * Lesser General Public License for more details. |
4637 |
+ * |
4638 |
+ * You should have received a copy of the GNU Lesser General Public |
4639 |
+ * License along with FFmpeg; if not, write to the Free Software |
4640 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4641 |
+ */ |
4642 |
+ |
4643 |
+#include "config.h" |
4644 |
+ |
4645 |
+#include "libavutil/attributes.h" |
4646 |
+#include "libavutil/cpu.h" |
4647 |
+#include "libavutil/e2k/cpu.h" |
4648 |
+#include "libavutil/e2k/util_e2k.h" |
4649 |
+ |
4650 |
+#include "libavcodec/lossless_videodsp.h" |
4651 |
+ |
4652 |
+static void add_bytes_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t w) |
4653 |
+{ |
4654 |
+ int i; |
4655 |
+ __m128i vdst, vsrc; |
4656 |
+ |
4657 |
+ /* dst and src are 16 bytes-aligned (guaranteed). */ |
4658 |
+ PRAGMA_E2K("ivdep") |
4659 |
+ for (i = 0; i + 15 < w; i += 16) { |
4660 |
+ vdst = _mm_load_si128((const __m128i*)(dst + i)); |
4661 |
+ vsrc = _mm_load_si128((const __m128i*)(src + i)); |
4662 |
+ vdst = _mm_add_epi8(vsrc, vdst); |
4663 |
+ _mm_store_si128((__m128i*)(dst + i), vdst); |
4664 |
+ } |
4665 |
+ /* If w is not a multiple of 16. */ |
4666 |
+ PRAGMA_E2K("ivdep") |
4667 |
+ for (; i < w; i++) |
4668 |
+ dst[i] = dst[i] + src[i]; |
4669 |
+} |
4670 |
+ |
4671 |
+av_cold void ff_llviddsp_init_e2k(LLVidDSPContext *c) |
4672 |
+{ |
4673 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
4674 |
+ return; |
4675 |
+ |
4676 |
+ // checkasm |
4677 |
+ c->add_bytes = add_bytes_e2k; |
4678 |
+} |
4679 |
diff --git a/libavcodec/e2k/mdct15.c b/libavcodec/e2k/mdct15.c |
4680 |
new file mode 100644 |
4681 |
index 0000000..9b3c809 |
4682 |
--- /dev/null |
4683 |
+++ b/libavcodec/e2k/mdct15.c |
4684 |
@@ -0,0 +1,187 @@ |
4685 |
+/* |
4686 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
4687 |
+ * Copyright (c) 2013-2014 Mozilla Corporation |
4688 |
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com> |
4689 |
+ * |
4690 |
+ * This file is part of FFmpeg. |
4691 |
+ * |
4692 |
+ * FFmpeg is free software; you can redistribute it and/or |
4693 |
+ * modify it under the terms of the GNU Lesser General Public |
4694 |
+ * License as published by the Free Software Foundation; either |
4695 |
+ * version 2.1 of the License, or (at your option) any later version. |
4696 |
+ * |
4697 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4698 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4699 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4700 |
+ * Lesser General Public License for more details. |
4701 |
+ * |
4702 |
+ * You should have received a copy of the GNU Lesser General Public |
4703 |
+ * License along with FFmpeg; if not, write to the Free Software |
4704 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4705 |
+ */ |
4706 |
+ |
4707 |
+#include "config.h" |
4708 |
+#include "libavutil/cpu.h" |
4709 |
+#include "libavutil/e2k/cpu.h" |
4710 |
+#include "libavutil/e2k/util_e2k.h" |
4711 |
+ |
4712 |
+#include "libavutil/attributes.h" |
4713 |
+#include "libavutil/common.h" |
4714 |
+ |
4715 |
+#include "libavcodec/mdct15.h" |
4716 |
+ |
4717 |
+#define CMUL(dre, dim, are, aim, bre, bim) do { \ |
4718 |
+ (dre) = (are) * (bre) - (aim) * (bim); \ |
4719 |
+ (dim) = (are) * (bim) + (aim) * (bre); \ |
4720 |
+ } while (0) |
4721 |
+ |
4722 |
+#define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im) |
4723 |
+ |
4724 |
+static av_always_inline void fft5(float *out, float *in, FFTComplex exptab[2]) |
4725 |
+{ |
4726 |
+ __m128 z0r, z1r, z2r, z3r, z0i, z1i, z2i, z3i; |
4727 |
+ __m128 t0r, t1r, t2r, t3r, t4r, t5r; |
4728 |
+ __m128 t0i, t1i, t2i, t3i, t4i, t5i; |
4729 |
+ __m128 i0r, i1r, i2r, i3r, i4r; |
4730 |
+ __m128 i0i, i1i, i2i, i3i, i4i; |
4731 |
+ __m128 e0r = _mm_set1_ps(exptab[0].re); |
4732 |
+ __m128 e0i = _mm_set1_ps(exptab[0].im); |
4733 |
+ __m128 e1r = _mm_set1_ps(exptab[1].re); |
4734 |
+ __m128 e1i = _mm_set1_ps(exptab[1].im); |
4735 |
+ |
4736 |
+ i0r = _mm_load_ps(in); |
4737 |
+ i0i = _mm_load_ps(in + 4); |
4738 |
+ i1r = _mm_load_ps(in + 8); |
4739 |
+ i1i = _mm_load_ps(in + 12); |
4740 |
+ i2r = _mm_load_ps(in + 16); |
4741 |
+ i2i = _mm_load_ps(in + 20); |
4742 |
+ i3r = _mm_load_ps(in + 24); |
4743 |
+ i3i = _mm_load_ps(in + 28); |
4744 |
+ i4r = _mm_load_ps(in + 32); |
4745 |
+ i4i = _mm_load_ps(in + 36); |
4746 |
+ |
4747 |
+ t0r = _mm_add_ps(i1r, i4r); |
4748 |
+ t0i = _mm_add_ps(i1i, i4i); |
4749 |
+ t1i = _mm_sub_ps(i1r, i4r); |
4750 |
+ t1r = _mm_sub_ps(i1i, i4i); |
4751 |
+ t2r = _mm_add_ps(i2r, i3r); |
4752 |
+ t2i = _mm_add_ps(i2i, i3i); |
4753 |
+ t3i = _mm_sub_ps(i2r, i3r); |
4754 |
+ t3r = _mm_sub_ps(i2i, i3i); |
4755 |
+ |
4756 |
+ t4r = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(i0r, i1r), i2r), i3r), i4r); |
4757 |
+ t4i = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(i0i, i1i), i2i), i3i), i4i); |
4758 |
+ _mm_store_ps(out + 0, t4r); |
4759 |
+ _mm_store_ps(out + 4, t4i); |
4760 |
+ |
4761 |
+ t4r = _mm_sub_ps(_mm_mul_ps(e0r, t2r), _mm_mul_ps(e1r, t0r)); |
4762 |
+ t4i = _mm_sub_ps(_mm_mul_ps(e0r, t2i), _mm_mul_ps(e1r, t0i)); |
4763 |
+ t0r = _mm_sub_ps(_mm_mul_ps(e0r, t0r), _mm_mul_ps(e1r, t2r)); |
4764 |
+ t0i = _mm_sub_ps(_mm_mul_ps(e0r, t0i), _mm_mul_ps(e1r, t2i)); |
4765 |
+ t5r = _mm_sub_ps(_mm_mul_ps(e0i, t3r), _mm_mul_ps(e1i, t1r)); |
4766 |
+ t5i = _mm_sub_ps(_mm_mul_ps(e0i, t3i), _mm_mul_ps(e1i, t1i)); |
4767 |
+ t1r = _mm_add_ps(_mm_mul_ps(e0i, t1r), _mm_mul_ps(e1i, t3r)); |
4768 |
+ t1i = _mm_add_ps(_mm_mul_ps(e0i, t1i), _mm_mul_ps(e1i, t3i)); |
4769 |
+ |
4770 |
+ z0r = _mm_sub_ps(t0r, t1r); |
4771 |
+ z0i = _mm_sub_ps(t0i, t1i); |
4772 |
+ z1r = _mm_add_ps(t4r, t5r); |
4773 |
+ z1i = _mm_add_ps(t4i, t5i); |
4774 |
+ |
4775 |
+ z2r = _mm_sub_ps(t4r, t5r); |
4776 |
+ z2i = _mm_sub_ps(t4i, t5i); |
4777 |
+ z3r = _mm_add_ps(t0r, t1r); |
4778 |
+ z3i = _mm_add_ps(t0i, t1i); |
4779 |
+ |
4780 |
+ _mm_store_ps(out + 8, _mm_add_ps(i0r, z3r)); |
4781 |
+ _mm_store_ps(out + 12, _mm_add_ps(i0i, z0i)); |
4782 |
+ _mm_store_ps(out + 16, _mm_add_ps(i0r, z2r)); |
4783 |
+ _mm_store_ps(out + 20, _mm_add_ps(i0i, z1i)); |
4784 |
+ _mm_store_ps(out + 24, _mm_add_ps(i0r, z1r)); |
4785 |
+ _mm_store_ps(out + 28, _mm_add_ps(i0i, z2i)); |
4786 |
+ _mm_store_ps(out + 32, _mm_add_ps(i0r, z0r)); |
4787 |
+ _mm_store_ps(out + 36, _mm_add_ps(i0i, z3i)); |
4788 |
+} |
4789 |
+ |
4790 |
+#define CMUL4(c, a, b) CMUL((c).re, (c).im, tmp[k * 8 + a], tmp[k * 8 + 4 + a], (b).re, (b).im) |
4791 |
+ |
4792 |
+static void fft15_e2k(FFTComplex *out, float *in, FFTComplex *exptab, ptrdiff_t stride) |
4793 |
+{ |
4794 |
+ int k; |
4795 |
+ DECLARE_ALIGNED(16, float, tmp)[5 * 8]; |
4796 |
+ |
4797 |
+ fft5(tmp, in, exptab + 19); |
4798 |
+ |
4799 |
+ PRAGMA_E2K("ivdep") |
4800 |
+ for (k = 0; k < 5; k++) { |
4801 |
+ FFTComplex t[2]; |
4802 |
+ |
4803 |
+ CMUL4(t[0], 2, exptab[k]); |
4804 |
+ CMUL4(t[1], 3, exptab[2 * k]); |
4805 |
+ out[stride*k].re = tmp[k * 8] + t[0].re + t[1].re; |
4806 |
+ out[stride*k].im = tmp[k * 8 + 4] + t[0].im + t[1].im; |
4807 |
+ |
4808 |
+ CMUL4(t[0], 2, exptab[k + 5]); |
4809 |
+ CMUL4(t[1], 3, exptab[2 * (k + 5)]); |
4810 |
+ out[stride*(k + 5)].re = tmp[k * 8] + t[0].re + t[1].re; |
4811 |
+ out[stride*(k + 5)].im = tmp[k * 8 + 4] + t[0].im + t[1].im; |
4812 |
+ |
4813 |
+ CMUL4(t[0], 2, exptab[k + 10]); |
4814 |
+ CMUL4(t[1], 3, exptab[2 * k + 5]); |
4815 |
+ out[stride*(k + 10)].re = tmp[k * 8] + t[0].re + t[1].re; |
4816 |
+ out[stride*(k + 10)].im = tmp[k * 8 + 4] + t[0].im + t[1].im; |
4817 |
+ } |
4818 |
+} |
4819 |
+ |
4820 |
+static void imdct15_half_e2k(MDCT15Context *s, float *dst, const float *src, |
4821 |
+ ptrdiff_t stride) |
4822 |
+{ |
4823 |
+ DECLARE_ALIGNED(16, float, fft15in)[5 * 8]; |
4824 |
+ FFTComplex *z = (FFTComplex *)dst; |
4825 |
+ int i, j, k, len8 = s->len4 >> 1, l_ptwo = 1 << s->ptwo_fft.nbits; |
4826 |
+ const float *in1 = src, *in2 = src + (s->len2 - 1) * stride; |
4827 |
+ |
4828 |
+ /* Reindex input, putting it into a buffer and doing an Nx15 FFT */ |
4829 |
+ for (i = 0; i < l_ptwo; i++) { |
4830 |
+ PRAGMA_E2K("ivdep") |
4831 |
+ for (k = j = 0; j < 15; j += 3, k += 8) { |
4832 |
+ int k0 = s->pfa_prereindex[i * 15 + j]; |
4833 |
+ int k1 = s->pfa_prereindex[i * 15 + j + 1]; |
4834 |
+ int k2 = s->pfa_prereindex[i * 15 + j + 2]; |
4835 |
+ float are, aim; FFTComplex b; |
4836 |
+ |
4837 |
+ are = in2[-k0 * stride]; aim = in1[k0 * stride]; |
4838 |
+ b = s->twiddle_exptab[k0 >> 1]; |
4839 |
+ fft15in[k ] = are * b.re - aim * b.im; |
4840 |
+ fft15in[k + 4] = are * b.im + aim * b.re; |
4841 |
+ fft15in[k + 1] = 0; |
4842 |
+ fft15in[k + 5] = 0; |
4843 |
+ |
4844 |
+ are = in2[-k1 * stride]; aim = in1[k1 * stride]; |
4845 |
+ b = s->twiddle_exptab[k1 >> 1]; |
4846 |
+ fft15in[k + 2] = are * b.re - aim * b.im; |
4847 |
+ fft15in[k + 6] = are * b.im + aim * b.re; |
4848 |
+ |
4849 |
+ are = in2[-k2 * stride]; aim = in1[k2 * stride]; |
4850 |
+ b = s->twiddle_exptab[k2 >> 1]; |
4851 |
+ fft15in[k + 3] = are * b.re - aim * b.im; |
4852 |
+ fft15in[k + 7] = are * b.im + aim * b.re; |
4853 |
+ } |
4854 |
+ fft15_e2k(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo); |
4855 |
+ } |
4856 |
+ |
4857 |
+ /* Then a 15xN FFT (where N is a power of two) */ |
4858 |
+ for (i = 0; i < 15; i++) |
4859 |
+ s->ptwo_fft.fft_calc(&s->ptwo_fft, s->tmp + l_ptwo*i); |
4860 |
+ |
4861 |
+ /* Reindex again, apply twiddles and output */ |
4862 |
+ s->postreindex(z, s->tmp, s->twiddle_exptab, s->pfa_postreindex, len8); |
4863 |
+} |
4864 |
+ |
4865 |
+av_cold void ff_mdct15_init_e2k(MDCT15Context *s) |
4866 |
+{ |
4867 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
4868 |
+ return; |
4869 |
+ |
4870 |
+ s->imdct_half = imdct15_half_e2k; |
4871 |
+} |
4872 |
diff --git a/libavcodec/e2k/me_cmp.c b/libavcodec/e2k/me_cmp.c |
4873 |
new file mode 100644 |
4874 |
index 0000000..e6eda38 |
4875 |
--- /dev/null |
4876 |
+++ b/libavcodec/e2k/me_cmp.c |
4877 |
@@ -0,0 +1,461 @@ |
4878 |
+/* |
4879 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
4880 |
+ * Copyright (c) 2002 Brian Foley |
4881 |
+ * Copyright (c) 2002 Dieter Shirley |
4882 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
4883 |
+ * |
4884 |
+ * This file is part of FFmpeg. |
4885 |
+ * |
4886 |
+ * FFmpeg is free software; you can redistribute it and/or |
4887 |
+ * modify it under the terms of the GNU Lesser General Public |
4888 |
+ * License as published by the Free Software Foundation; either |
4889 |
+ * version 2.1 of the License, or (at your option) any later version. |
4890 |
+ * |
4891 |
+ * FFmpeg is distributed in the hope that it will be useful, |
4892 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
4893 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4894 |
+ * Lesser General Public License for more details. |
4895 |
+ * |
4896 |
+ * You should have received a copy of the GNU Lesser General Public |
4897 |
+ * License along with FFmpeg; if not, write to the Free Software |
4898 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
4899 |
+ */ |
4900 |
+ |
4901 |
+#include "config.h" |
4902 |
+ |
4903 |
+#include "libavutil/attributes.h" |
4904 |
+#include "libavutil/cpu.h" |
4905 |
+#include "libavutil/e2k/cpu.h" |
4906 |
+#include "libavutil/e2k/util_e2k.h" |
4907 |
+ |
4908 |
+#include "libavcodec/avcodec.h" |
4909 |
+#include "libavcodec/mpegvideo.h" |
4910 |
+#include "libavcodec/me_cmp.h" |
4911 |
+ |
4912 |
+#define LOAD_PIX(v1, v2, pix) { \ |
4913 |
+ v1 = VEC_LD(pix); \ |
4914 |
+ v2 = VEC_LD(pix + 1); \ |
4915 |
+} |
4916 |
+ |
4917 |
+static int sad16_x2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
4918 |
+ ptrdiff_t stride, int h) |
4919 |
+{ |
4920 |
+ int i; |
4921 |
+ __m128i v0, v1, v2, sum = _mm_setzero_si128(); |
4922 |
+ |
4923 |
+ PRAGMA_E2K("ivdep") |
4924 |
+ for (i = 0; i < h; i++) { |
4925 |
+ LOAD_PIX(v1, v2, pix2); |
4926 |
+ v0 = VEC_LD(pix1); |
4927 |
+ v1 = _mm_avg_epu8(v1, v2); |
4928 |
+ sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1)); |
4929 |
+ |
4930 |
+ pix1 += stride; |
4931 |
+ pix2 += stride; |
4932 |
+ } |
4933 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2); |
4934 |
+} |
4935 |
+ |
4936 |
+static int sad8_x2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
4937 |
+ ptrdiff_t stride, int h) |
4938 |
+{ |
4939 |
+ int i; |
4940 |
+ __m64 v0, v1, v2, sum = _mm_setzero_si64(); |
4941 |
+ |
4942 |
+ PRAGMA_E2K("ivdep") |
4943 |
+ for (i = 0; i < h; i++) { |
4944 |
+ v1 = *(__m64*)pix2; |
4945 |
+ v2 = *(__m64*)(pix2 + 1); |
4946 |
+ v0 = *(__m64*)pix1; |
4947 |
+ v1 = _mm_avg_pu8(v1, v2); |
4948 |
+ sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1)); |
4949 |
+ |
4950 |
+ pix1 += stride; |
4951 |
+ pix2 += stride; |
4952 |
+ } |
4953 |
+ return _mm_extract_pi32(sum, 0); |
4954 |
+} |
4955 |
+ |
4956 |
+static int sad16_y2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
4957 |
+ ptrdiff_t stride, int h) |
4958 |
+{ |
4959 |
+ int i; |
4960 |
+ __m128i v0, v1, v2, sum = _mm_setzero_si128(); |
4961 |
+ |
4962 |
+ v2 = VEC_LD(pix2); |
4963 |
+ pix2 += stride; |
4964 |
+ |
4965 |
+ PRAGMA_E2K("ivdep") |
4966 |
+ for (i = 0; i < h; i++) { |
4967 |
+ v1 = v2; |
4968 |
+ v2 = VEC_LD(pix2); |
4969 |
+ v0 = VEC_LD(pix1); |
4970 |
+ v1 = _mm_avg_epu8(v1, v2); |
4971 |
+ sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1)); |
4972 |
+ pix1 += stride; |
4973 |
+ pix2 += stride; |
4974 |
+ } |
4975 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2); |
4976 |
+} |
4977 |
+ |
4978 |
+static int sad8_y2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
4979 |
+ ptrdiff_t stride, int h) |
4980 |
+{ |
4981 |
+ int i; |
4982 |
+ __m64 v0, v1, v2, sum = _mm_setzero_si64(); |
4983 |
+ |
4984 |
+ v2 = *(__m64*)pix2; |
4985 |
+ pix2 += stride; |
4986 |
+ |
4987 |
+ PRAGMA_E2K("ivdep") |
4988 |
+ for (i = 0; i < h; i++) { |
4989 |
+ v1 = v2; |
4990 |
+ v2 = *(__m64*)pix2; |
4991 |
+ v0 = *(__m64*)pix1; |
4992 |
+ v1 = _mm_avg_pu8(v1, v2); |
4993 |
+ sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1)); |
4994 |
+ pix1 += stride; |
4995 |
+ pix2 += stride; |
4996 |
+ } |
4997 |
+ return _mm_extract_pi32(sum, 0); |
4998 |
+} |
4999 |
+ |
5000 |
+static int sad16_xy2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5001 |
+ ptrdiff_t stride, int h) |
5002 |
+{ |
5003 |
+ int i; |
5004 |
+ LOAD_ZERO; |
5005 |
+ __m128i v0, v1, v2, sum = zerov; |
5006 |
+ __m128i t0, t1, t2, t3, t4, t5; |
5007 |
+ __m128i c2 = _mm_set1_epi16(2); |
5008 |
+ |
5009 |
+ LOAD_PIX(v1, v2, pix2); |
5010 |
+ t2 = _mm_unpacklo_epi8(v1, zerov); |
5011 |
+ t3 = _mm_unpackhi_epi8(v1, zerov); |
5012 |
+ t4 = _mm_unpacklo_epi8(v2, zerov); |
5013 |
+ t5 = _mm_unpackhi_epi8(v2, zerov); |
5014 |
+ t2 = _mm_add_epi16(t2, t4); |
5015 |
+ t3 = _mm_add_epi16(t3, t5); |
5016 |
+ pix2 += stride; |
5017 |
+ |
5018 |
+ PRAGMA_E2K("ivdep") |
5019 |
+ for (i = 0; i < h; i++) { |
5020 |
+ t0 = t2; t1 = t3; |
5021 |
+ LOAD_PIX(v1, v2, pix2); |
5022 |
+ v0 = VEC_LD(pix1); |
5023 |
+ t2 = _mm_unpacklo_epi8(v1, zerov); |
5024 |
+ t3 = _mm_unpackhi_epi8(v1, zerov); |
5025 |
+ t4 = _mm_unpacklo_epi8(v2, zerov); |
5026 |
+ t5 = _mm_unpackhi_epi8(v2, zerov); |
5027 |
+ t2 = _mm_add_epi16(t2, t4); |
5028 |
+ t3 = _mm_add_epi16(t3, t5); |
5029 |
+ |
5030 |
+ v1 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t2), c2), 2); |
5031 |
+ v2 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t3), c2), 2); |
5032 |
+ v1 = _mm_packus_epi16(v1, v2); |
5033 |
+ |
5034 |
+ sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1)); |
5035 |
+ pix1 += stride; |
5036 |
+ pix2 += stride; |
5037 |
+ } |
5038 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2); |
5039 |
+} |
5040 |
+ |
5041 |
+static int sad8_xy2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5042 |
+ ptrdiff_t stride, int h) |
5043 |
+{ |
5044 |
+ int i; |
5045 |
+ LOAD_ZERO; |
5046 |
+ __m64 v0, sum = _mm_movepi64_pi64(zerov); |
5047 |
+ __m128i v1, v2, t0, t1, t2, c2 = _mm_set1_epi16(2); |
5048 |
+ |
5049 |
+ v1 = VEC_LD8(pix2); |
5050 |
+ v2 = VEC_LD8(pix2 + 1); |
5051 |
+ t1 = _mm_unpacklo_epi8(v1, zerov); |
5052 |
+ t2 = _mm_unpacklo_epi8(v2, zerov); |
5053 |
+ t1 = _mm_add_epi16(t1, t2); |
5054 |
+ pix2 += stride; |
5055 |
+ |
5056 |
+ PRAGMA_E2K("ivdep") |
5057 |
+ for (i = 0; i < h; i++) { |
5058 |
+ t0 = t1; |
5059 |
+ v1 = VEC_LD8(pix2); |
5060 |
+ v2 = VEC_LD8(pix2 + 1); |
5061 |
+ v0 = *(__m64*)pix1; |
5062 |
+ t1 = _mm_unpacklo_epi8(v1, zerov); |
5063 |
+ t2 = _mm_unpacklo_epi8(v2, zerov); |
5064 |
+ t1 = _mm_add_epi16(t1, t2); |
5065 |
+ |
5066 |
+ v1 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t1), c2), 2); |
5067 |
+ v1 = _mm_packus_epi16(v1, v1); |
5068 |
+ |
5069 |
+ sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, _mm_movepi64_pi64(v1))); |
5070 |
+ pix1 += stride; |
5071 |
+ pix2 += stride; |
5072 |
+ } |
5073 |
+ return _mm_extract_pi32(sum, 0); |
5074 |
+} |
5075 |
+ |
5076 |
+static int sad16_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5077 |
+ ptrdiff_t stride, int h) |
5078 |
+{ |
5079 |
+ int i; |
5080 |
+ __m128i v0, v1, sum = _mm_setzero_si128(); |
5081 |
+ |
5082 |
+ PRAGMA_E2K("ivdep") |
5083 |
+ for (i = 0; i < h; i++) { |
5084 |
+ v0 = VEC_LD(pix1); |
5085 |
+ v1 = VEC_LD(pix2); |
5086 |
+ sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1)); |
5087 |
+ pix1 += stride; |
5088 |
+ pix2 += stride; |
5089 |
+ } |
5090 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2); |
5091 |
+} |
5092 |
+ |
5093 |
+static int sad8_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5094 |
+ ptrdiff_t stride, int h) |
5095 |
+{ |
5096 |
+ int i; |
5097 |
+ __m64 v0, v1, sum = _mm_setzero_si64(); |
5098 |
+ |
5099 |
+ PRAGMA_E2K("ivdep") |
5100 |
+ for (i = 0; i < h; i++) { |
5101 |
+ v0 = *(__m64*)pix1; |
5102 |
+ v1 = *(__m64*)pix2; |
5103 |
+ sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1)); |
5104 |
+ pix1 += stride; |
5105 |
+ pix2 += stride; |
5106 |
+ } |
5107 |
+ return _mm_extract_pi32(sum, 0); |
5108 |
+} |
5109 |
+ |
5110 |
+/* Sum of Squared Errors for an 8x8 block. */ |
5111 |
+static int sse8_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5112 |
+ ptrdiff_t stride, int h) |
5113 |
+{ |
5114 |
+ int i; |
5115 |
+ LOAD_ZERO; |
5116 |
+ __m128i v0, v1, sum = zerov; |
5117 |
+ |
5118 |
+ PRAGMA_E2K("ivdep") |
5119 |
+ for (i = 0; i < h; i++) { |
5120 |
+ v0 = VEC_LD8(pix1); |
5121 |
+ v1 = VEC_LD8(pix2); |
5122 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
5123 |
+ v1 = _mm_unpacklo_epi8(v1, zerov); |
5124 |
+ v0 = _mm_sub_epi16(v0, v1); |
5125 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0)); |
5126 |
+ pix1 += stride; |
5127 |
+ pix2 += stride; |
5128 |
+ } |
5129 |
+ sum = _mm_hadd_epi32(sum, sum); |
5130 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5131 |
+} |
5132 |
+ |
5133 |
+/* Sum of Squared Errors for a 16x16 block. */ |
5134 |
+static int sse16_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
5135 |
+ ptrdiff_t stride, int h) |
5136 |
+{ |
5137 |
+ int i; |
5138 |
+ LOAD_ZERO; |
5139 |
+ __m128i v0, v1, v2, v3, sum = zerov; |
5140 |
+ |
5141 |
+ PRAGMA_E2K("ivdep") |
5142 |
+ for (i = 0; i < h; i++) { |
5143 |
+ v2 = VEC_LD(pix1); |
5144 |
+ v3 = VEC_LD(pix2); |
5145 |
+ v0 = _mm_unpacklo_epi8(v2, zerov); |
5146 |
+ v1 = _mm_unpacklo_epi8(v3, zerov); |
5147 |
+ v2 = _mm_unpackhi_epi8(v2, zerov); |
5148 |
+ v3 = _mm_unpackhi_epi8(v3, zerov); |
5149 |
+ v0 = _mm_sub_epi16(v0, v1); |
5150 |
+ v2 = _mm_sub_epi16(v2, v3); |
5151 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0)); |
5152 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v2, v2)); |
5153 |
+ pix1 += stride; |
5154 |
+ pix2 += stride; |
5155 |
+ } |
5156 |
+ sum = _mm_hadd_epi32(sum, sum); |
5157 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5158 |
+} |
5159 |
+ |
5160 |
+#define HADAMARD8_FIN(t, sum) \ |
5161 |
+ v0 = _mm_add_epi16(t##0, t##1); \ |
5162 |
+ v1 = _mm_sub_epi16(t##0, t##1); \ |
5163 |
+ v2 = _mm_add_epi16(t##2, t##3); \ |
5164 |
+ v3 = _mm_sub_epi16(t##2, t##3); \ |
5165 |
+ v4 = _mm_add_epi16(t##4, t##5); \ |
5166 |
+ v5 = _mm_sub_epi16(t##4, t##5); \ |
5167 |
+ v6 = _mm_add_epi16(t##6, t##7); \ |
5168 |
+ v7 = _mm_sub_epi16(t##6, t##7); \ |
5169 |
+ \ |
5170 |
+ t0 = _mm_add_epi16(v0, v2); \ |
5171 |
+ t2 = _mm_sub_epi16(v0, v2); \ |
5172 |
+ t1 = _mm_add_epi16(v1, v3); \ |
5173 |
+ t3 = _mm_sub_epi16(v1, v3); \ |
5174 |
+ t4 = _mm_add_epi16(v4, v6); \ |
5175 |
+ t6 = _mm_sub_epi16(v4, v6); \ |
5176 |
+ t5 = _mm_add_epi16(v5, v7); \ |
5177 |
+ t7 = _mm_sub_epi16(v5, v7); \ |
5178 |
+ \ |
5179 |
+ v0 = _mm_add_epi16(t0, t4); \ |
5180 |
+ v4 = _mm_sub_epi16(t0, t4); \ |
5181 |
+ v1 = _mm_add_epi16(t1, t5); \ |
5182 |
+ v5 = _mm_sub_epi16(t1, t5); \ |
5183 |
+ v2 = _mm_add_epi16(t2, t6); \ |
5184 |
+ v6 = _mm_sub_epi16(t2, t6); \ |
5185 |
+ v3 = _mm_add_epi16(t3, t7); \ |
5186 |
+ v7 = _mm_sub_epi16(t3, t7); \ |
5187 |
+ \ |
5188 |
+ v0 = _mm_madd_epi16(_mm_abs_epi16(v0), onev); \ |
5189 |
+ v1 = _mm_madd_epi16(_mm_abs_epi16(v1), onev); \ |
5190 |
+ v2 = _mm_madd_epi16(_mm_abs_epi16(v2), onev); \ |
5191 |
+ v3 = _mm_madd_epi16(_mm_abs_epi16(v3), onev); \ |
5192 |
+ v4 = _mm_madd_epi16(_mm_abs_epi16(v4), onev); \ |
5193 |
+ v5 = _mm_madd_epi16(_mm_abs_epi16(v5), onev); \ |
5194 |
+ v6 = _mm_madd_epi16(_mm_abs_epi16(v6), onev); \ |
5195 |
+ v7 = _mm_madd_epi16(_mm_abs_epi16(v7), onev); \ |
5196 |
+ \ |
5197 |
+ v0 = _mm_add_epi32(v0, v1); \ |
5198 |
+ v2 = _mm_add_epi32(v2, v3); \ |
5199 |
+ v4 = _mm_add_epi32(v4, v5); \ |
5200 |
+ v6 = _mm_add_epi32(v6, v7); \ |
5201 |
+ v0 = _mm_add_epi32(v0, v2); \ |
5202 |
+ v4 = _mm_add_epi32(v4, v6); \ |
5203 |
+ sum = _mm_add_epi32(v0, v4); |
5204 |
+ |
5205 |
+static int hadamard8_diff_e2k(MpegEncContext *s, uint8_t *dst, |
5206 |
+ uint8_t *src, ptrdiff_t stride, int h) |
5207 |
+{ |
5208 |
+ LOAD_ZERO; |
5209 |
+ vec_s16 v0, v1, v2, v3, v4, v5, v6, v7; |
5210 |
+ vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, sum; |
5211 |
+ const vec_s16 onev = _mm_set1_epi16(1); |
5212 |
+ const vec_s16 vprod1 = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); |
5213 |
+ const vec_s16 vprod2 = _mm_setr_epi16(1, 1, -1, -1, 1, 1, -1, -1); |
5214 |
+ const vec_s16 vprod3 = _mm_setr_epi16(1, 1, 1, 1, -1, -1, -1, -1); |
5215 |
+ const vec_u8 perm1 = _mm_setr_epi8( |
5216 |
+ 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
5217 |
+ 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); |
5218 |
+ |
5219 |
+#define ITER(i) { \ |
5220 |
+ v0 = VEC_LD8(src + stride * i); \ |
5221 |
+ v1 = VEC_LD8(dst + stride * i); \ |
5222 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); \ |
5223 |
+ v1 = _mm_unpacklo_epi8(v1, zerov); \ |
5224 |
+ v0 = _mm_sub_epi16(v0, v1); \ |
5225 |
+ v1 = _mm_shuffle_epi8(v0, perm1); \ |
5226 |
+ v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod1), v1); \ |
5227 |
+ v1 = _mm_shuffle_epi32(v0, 0xb1); \ |
5228 |
+ v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod2), v1); \ |
5229 |
+ v1 = _mm_shuffle_epi32(v0, 0x4e); \ |
5230 |
+ t##i = _mm_add_epi16(_mm_sign_epi16(v0, vprod3), v1); \ |
5231 |
+} |
5232 |
+ ITER(0); ITER(1); ITER(2); ITER(3); |
5233 |
+ ITER(4); ITER(5); ITER(6); ITER(7); |
5234 |
+#undef ITER |
5235 |
+ |
5236 |
+ HADAMARD8_FIN(t, sum) |
5237 |
+ |
5238 |
+ sum = _mm_hadd_epi32(sum, sum); |
5239 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5240 |
+} |
5241 |
+ |
5242 |
+#if 1 |
5243 |
+static int hadamard8_diff16_e2k(MpegEncContext *s, uint8_t *dst, |
5244 |
+ uint8_t *src, ptrdiff_t stride, int h) |
5245 |
+{ |
5246 |
+ LOAD_ZERO; |
5247 |
+ vec_s16 v0, v1, v2, v3, v4, v5, v6, v7; |
5248 |
+ vec_s16 x0, x1, x2, x3, x4, x5, x6, x7; |
5249 |
+ vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, sum = zerov; |
5250 |
+ const vec_s16 onev = _mm_set1_epi16(1); |
5251 |
+ const vec_s16 vprod1 = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); |
5252 |
+ const vec_s16 vprod2 = _mm_setr_epi16(1, 1, -1, -1, 1, 1, -1, -1); |
5253 |
+ const vec_s16 vprod3 = _mm_setr_epi16(1, 1, 1, 1, -1, -1, -1, -1); |
5254 |
+ const vec_u8 perm1 = _mm_setr_epi8( |
5255 |
+ 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
5256 |
+ 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); |
5257 |
+ int i; |
5258 |
+ |
5259 |
+ PRAGMA_E2K("ivdep") |
5260 |
+ for (i = 0; i < h; i += 8) { |
5261 |
+ |
5262 |
+#define ITER(i) { \ |
5263 |
+ v2 = VEC_LD(src + stride * i); \ |
5264 |
+ v3 = VEC_LD(dst + stride * i); \ |
5265 |
+ v0 = _mm_unpacklo_epi8(v2, zerov); \ |
5266 |
+ v1 = _mm_unpacklo_epi8(v3, zerov); \ |
5267 |
+ v2 = _mm_unpackhi_epi8(v2, zerov); \ |
5268 |
+ v3 = _mm_unpackhi_epi8(v3, zerov); \ |
5269 |
+ v0 = _mm_sub_epi16(v0, v1); \ |
5270 |
+ v2 = _mm_sub_epi16(v2, v3); \ |
5271 |
+ v1 = _mm_shuffle_epi8(v0, perm1); \ |
5272 |
+ v3 = _mm_shuffle_epi8(v2, perm1); \ |
5273 |
+ v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod1), v1); \ |
5274 |
+ v2 = _mm_add_epi16(_mm_sign_epi16(v2, vprod1), v3); \ |
5275 |
+ v1 = _mm_shuffle_epi32(v0, 0xb1); \ |
5276 |
+ v3 = _mm_shuffle_epi32(v2, 0xb1); \ |
5277 |
+ v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod2), v1); \ |
5278 |
+ v2 = _mm_add_epi16(_mm_sign_epi16(v2, vprod2), v3); \ |
5279 |
+ v1 = _mm_shuffle_epi32(v0, 0x4e); \ |
5280 |
+ v3 = _mm_shuffle_epi32(v2, 0x4e); \ |
5281 |
+ t##i = _mm_add_epi16(_mm_sign_epi16(v0, vprod3), v1); \ |
5282 |
+ x##i = _mm_add_epi16(_mm_sign_epi16(v2, vprod3), v3); \ |
5283 |
+} |
5284 |
+ ITER(0); ITER(1); ITER(2); ITER(3); |
5285 |
+ ITER(4); ITER(5); ITER(6); ITER(7); |
5286 |
+#undef ITER |
5287 |
+ |
5288 |
+ HADAMARD8_FIN(t, v0) |
5289 |
+ sum = _mm_add_epi32(sum, v0); |
5290 |
+ HADAMARD8_FIN(x, v0) |
5291 |
+ sum = _mm_add_epi32(sum, v0); |
5292 |
+ dst += 8 * stride; |
5293 |
+ src += 8 * stride; |
5294 |
+ } |
5295 |
+ sum = _mm_hadd_epi32(sum, sum); |
5296 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5297 |
+} |
5298 |
+#else |
5299 |
+static int hadamard8_diff16_e2k(MpegEncContext *s, uint8_t *dst, |
5300 |
+ uint8_t *src, ptrdiff_t stride, int h) |
5301 |
+{ |
5302 |
+ int i, score = 0; |
5303 |
+ for (i = 0; i < h; i += 8) { |
5304 |
+ score += hadamard8_diff_e2k(s, dst, src, stride, 8); |
5305 |
+ score += hadamard8_diff_e2k(s, dst + 8, src + 8, stride, 8); |
5306 |
+ dst += 8 * stride; |
5307 |
+ src += 8 * stride; |
5308 |
+ } |
5309 |
+ return score; |
5310 |
+} |
5311 |
+#endif |
5312 |
+ |
5313 |
+av_cold void ff_me_cmp_init_e2k(MECmpContext *c, AVCodecContext *avctx) |
5314 |
+{ |
5315 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5316 |
+ return; |
5317 |
+ |
5318 |
+ // !checkasm |
5319 |
+ |
5320 |
+ // fate lavf-mxf |
5321 |
+ c->pix_abs[0][0] = sad16_e2k; |
5322 |
+ c->pix_abs[0][1] = sad16_x2_e2k; |
5323 |
+ c->pix_abs[0][2] = sad16_y2_e2k; |
5324 |
+ c->pix_abs[0][3] = sad16_xy2_e2k; |
5325 |
+ c->pix_abs[1][0] = sad8_e2k; |
5326 |
+ c->pix_abs[1][1] = sad8_x2_e2k; |
5327 |
+ c->pix_abs[1][2] = sad8_y2_e2k; |
5328 |
+ c->pix_abs[1][3] = sad8_xy2_e2k; |
5329 |
+ |
5330 |
+ c->sad[0] = sad16_e2k; |
5331 |
+ c->sad[1] = sad8_e2k; |
5332 |
+ c->sse[0] = sse16_e2k; |
5333 |
+ c->sse[1] = sse8_e2k; |
5334 |
+ |
5335 |
+ // fate vsynth1-mpeg4-qprd |
5336 |
+ c->hadamard8_diff[0] = hadamard8_diff16_e2k; |
5337 |
+ c->hadamard8_diff[1] = hadamard8_diff_e2k; |
5338 |
+} |
5339 |
diff --git a/libavcodec/e2k/mpegaudiodsp.c b/libavcodec/e2k/mpegaudiodsp.c |
5340 |
new file mode 100644 |
5341 |
index 0000000..2751453 |
5342 |
--- /dev/null |
5343 |
+++ b/libavcodec/e2k/mpegaudiodsp.c |
5344 |
@@ -0,0 +1,142 @@ |
5345 |
+/* |
5346 |
+ * Elbrus optimized MP3 decoding functions |
5347 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5348 |
+ * Copyright (c) 2010 Vitor Sessak |
5349 |
+ * |
5350 |
+ * This file is part of FFmpeg. |
5351 |
+ * |
5352 |
+ * FFmpeg is free software; you can redistribute it and/or |
5353 |
+ * modify it under the terms of the GNU Lesser General Public |
5354 |
+ * License as published by the Free Software Foundation; either |
5355 |
+ * version 2.1 of the License, or (at your option) any later version. |
5356 |
+ * |
5357 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5358 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5359 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5360 |
+ * Lesser General Public License for more details. |
5361 |
+ * |
5362 |
+ * You should have received a copy of the GNU Lesser General Public |
5363 |
+ * License along with FFmpeg; if not, write to the Free Software |
5364 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5365 |
+ */ |
5366 |
+ |
5367 |
+#include "config.h" |
5368 |
+#include "libavutil/attributes.h" |
5369 |
+#include "libavutil/cpu.h" |
5370 |
+#include "libavutil/internal.h" |
5371 |
+#include "libavutil/e2k/cpu.h" |
5372 |
+#include "libavutil/e2k/util_e2k.h" |
5373 |
+#include "libavcodec/mpegaudiodsp.h" |
5374 |
+#include "libavcodec/mpegaudio.h" |
5375 |
+ |
5376 |
+#define MACS(rt, ra, rb) rt += (ra) * (rb) |
5377 |
+#define MLSS(rt, ra, rb) rt -= (ra) * (rb) |
5378 |
+ |
5379 |
+#define SUM8(op, sum, w, p) { \ |
5380 |
+ op(sum, (w)[0 * 64], (p)[0 * 64]); \ |
5381 |
+ op(sum, (w)[1 * 64], (p)[1 * 64]); \ |
5382 |
+ op(sum, (w)[2 * 64], (p)[2 * 64]); \ |
5383 |
+ op(sum, (w)[3 * 64], (p)[3 * 64]); \ |
5384 |
+ op(sum, (w)[4 * 64], (p)[4 * 64]); \ |
5385 |
+ op(sum, (w)[5 * 64], (p)[5 * 64]); \ |
5386 |
+ op(sum, (w)[6 * 64], (p)[6 * 64]); \ |
5387 |
+ op(sum, (w)[7 * 64], (p)[7 * 64]); \ |
5388 |
+} |
5389 |
+ |
5390 |
+static av_always_inline |
5391 |
+void apply_window(const float *buf, const float *win1, |
5392 |
+ const float *win2, float *sum1, float *sum2) |
5393 |
+{ |
5394 |
+ vec_f v0, v1, v2, v3, v4, v5; |
5395 |
+ int i; |
5396 |
+ |
5397 |
+#define MULT(j) \ |
5398 |
+ v1 = _mm_load_ps(win1 + j * 64); \ |
5399 |
+ v2 = _mm_load_ps(win2 + j * 16); \ |
5400 |
+ v3 = _mm_load_ps(buf + j * 64); \ |
5401 |
+ v0 = _mm_sub_ps(v0, _mm_mul_ps(v3, v1)); \ |
5402 |
+ v4 = _mm_sub_ps(v4, _mm_mul_ps(v2, v3)) |
5403 |
+ |
5404 |
+ v0 = v4 = _mm_setzero_ps(); |
5405 |
+ MULT(0); MULT(1); MULT(2); MULT(3); |
5406 |
+ MULT(4); MULT(5); MULT(6); MULT(7); |
5407 |
+ |
5408 |
+ PRAGMA_E2K("ivdep") |
5409 |
+ PRAGMA_E2K("unroll(3)") |
5410 |
+ for (i = 4; i < 16; i += 4) { |
5411 |
+ win1 += 4; win2 += 4; buf += 4; |
5412 |
+ _mm_store_ps(sum1, v0); v5 = v4; |
5413 |
+ |
5414 |
+ v0 = v4 = _mm_setzero_ps(); |
5415 |
+ MULT(0); MULT(1); MULT(2); MULT(3); |
5416 |
+ MULT(4); MULT(5); MULT(6); MULT(7); |
5417 |
+ _mm_store_ps(sum2, _mm_alignr_ps(v4, v5, 1)); |
5418 |
+ sum1 += 4; sum2 += 4; |
5419 |
+ } |
5420 |
+ _mm_store_ps(sum1, v0); |
5421 |
+ _mm_store_ps(sum2, _mm_bsrli_ps(v4, 1)); |
5422 |
+ |
5423 |
+#undef MULT |
5424 |
+} |
5425 |
+ |
5426 |
+static void apply_window_e2k(float *in, float *win, int *unused, float *out, |
5427 |
+ ptrdiff_t incr) |
5428 |
+{ |
5429 |
+ float ALIGNED(16) suma[16]; |
5430 |
+ float ALIGNED(16) sumb[16]; |
5431 |
+ float ALIGNED(16) sumc[16]; |
5432 |
+ float ALIGNED(16) sumd[16]; |
5433 |
+ float sum; |
5434 |
+ |
5435 |
+ /* copy to avoid wrap */ |
5436 |
+ memcpy(in + 512, in, 32 * sizeof(*in)); |
5437 |
+ |
5438 |
+ apply_window(in + 16, win , win + 512, suma, sumc); |
5439 |
+ apply_window(in + 32, win + 48, win + 640, sumb, sumd); |
5440 |
+ |
5441 |
+ sum = suma[0]; |
5442 |
+ SUM8(MACS, sum, win + 32, in + 48); |
5443 |
+ suma[0] = sum; |
5444 |
+ |
5445 |
+#define SUMS(a, b) \ |
5446 |
+ v0 = _mm_load_ps(sumd + b); \ |
5447 |
+ v1 = _mm_load_ps(sumc + a); \ |
5448 |
+ v0 = _mm_shuffle_ps(v0, v0, 0x1b); \ |
5449 |
+ v1 = _mm_shuffle_ps(v1, v1, 0x1b); \ |
5450 |
+ v0 = _mm_sub_ps(v0, _mm_load_ps(suma + a)); \ |
5451 |
+ v1 = _mm_add_ps(v1, _mm_load_ps(sumb + b)); \ |
5452 |
+ _mm_storeu_ps(out + a, v0); \ |
5453 |
+ _mm_storeu_ps(out + b + 16, v1) |
5454 |
+ |
5455 |
+ if (incr == 1) { |
5456 |
+ vec_f v0, v1; |
5457 |
+ SUMS(0, 12); SUMS(4, 8); SUMS(8, 4); SUMS(12, 0); |
5458 |
+ out += 16 * incr; |
5459 |
+ } else { |
5460 |
+ int j; |
5461 |
+ float *out2 = out + 32 * incr; |
5462 |
+ out[0] = -suma[0]; |
5463 |
+ out += incr; |
5464 |
+ out2 -= incr; |
5465 |
+ PRAGMA_E2K("ivdep") |
5466 |
+ for (j = 1; j < 16; j++) { |
5467 |
+ *out = sumd[15 - j] - suma[j]; |
5468 |
+ *out2 = sumb[16 - j] + sumc[j - 1]; |
5469 |
+ out += incr; |
5470 |
+ out2 -= incr; |
5471 |
+ } |
5472 |
+ } |
5473 |
+ |
5474 |
+ sum = 0; |
5475 |
+ SUM8(MLSS, sum, win + 16 + 32, in + 32); |
5476 |
+ *out = sum; |
5477 |
+} |
5478 |
+ |
5479 |
+av_cold void ff_mpadsp_init_e2k(MPADSPContext *s) |
5480 |
+{ |
5481 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5482 |
+ return; |
5483 |
+ |
5484 |
+ // !checkasm |
5485 |
+ s->apply_window_float = apply_window_e2k; // fate audiomatch-square-mp3 |
5486 |
+} |
5487 |
diff --git a/libavcodec/e2k/mpegvideo.c b/libavcodec/e2k/mpegvideo.c |
5488 |
new file mode 100644 |
5489 |
index 0000000..36bf975 |
5490 |
--- /dev/null |
5491 |
+++ b/libavcodec/e2k/mpegvideo.c |
5492 |
@@ -0,0 +1,100 @@ |
5493 |
+/* |
5494 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5495 |
+ * Copyright (c) 2002 Dieter Shirley |
5496 |
+ * |
5497 |
+ * This file is part of FFmpeg. |
5498 |
+ * |
5499 |
+ * FFmpeg is free software; you can redistribute it and/or |
5500 |
+ * modify it under the terms of the GNU Lesser General Public |
5501 |
+ * License as published by the Free Software Foundation; either |
5502 |
+ * version 2.1 of the License, or (at your option) any later version. |
5503 |
+ * |
5504 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5505 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5506 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5507 |
+ * Lesser General Public License for more details. |
5508 |
+ * |
5509 |
+ * You should have received a copy of the GNU Lesser General Public |
5510 |
+ * License along with FFmpeg; if not, write to the Free Software |
5511 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5512 |
+ */ |
5513 |
+ |
5514 |
+#include <stdlib.h> |
5515 |
+#include <stdio.h> |
5516 |
+ |
5517 |
+#include "config.h" |
5518 |
+ |
5519 |
+#include "libavutil/attributes.h" |
5520 |
+#include "libavutil/cpu.h" |
5521 |
+#include "libavutil/e2k/cpu.h" |
5522 |
+#include "libavutil/e2k/util_e2k.h" |
5523 |
+ |
5524 |
+#include "libavcodec/mpegvideo.h" |
5525 |
+ |
5526 |
+/* this code assumes `block' is 16 bytes-aligned */ |
5527 |
+static void dct_unquantize_h263_intra_e2k(MpegEncContext *s, |
5528 |
+ int16_t *block, int n, int qscale) |
5529 |
+{ |
5530 |
+ int level, qmul, qadd = 0, nCoeffs = 63, j; |
5531 |
+ __m128i qmulv, qaddv, v0, v1; |
5532 |
+ |
5533 |
+ qmul = qscale << 1; |
5534 |
+ level = block[0]; |
5535 |
+ |
5536 |
+ if (!s->h263_aic) { |
5537 |
+ level *= n < 4 ? s->y_dc_scale : s->c_dc_scale; |
5538 |
+ qadd = (qscale - 1) | 1; |
5539 |
+ } else { |
5540 |
+ av_assert2(s->block_last_index[n] >= 0); |
5541 |
+ } |
5542 |
+ if (!s->ac_pred) { |
5543 |
+ nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]]; |
5544 |
+ } |
5545 |
+ |
5546 |
+ qmulv = _mm_set1_epi16(qmul); |
5547 |
+ qaddv = _mm_set1_epi16(qadd); |
5548 |
+ PRAGMA_E2K("ivdep") |
5549 |
+ for (j = 0; j <= nCoeffs; j += 8) { |
5550 |
+ v0 = _mm_load_si128((const __m128i*)(block + j)); |
5551 |
+ v1 = _mm_mullo_epi16(v0, qmulv); |
5552 |
+ v1 = _mm_add_epi16(v1, _mm_sign_epi16(qaddv, v0)); |
5553 |
+ _mm_store_si128((__m128i*)(block + j), v1); |
5554 |
+ } |
5555 |
+ |
5556 |
+ block[0] = level; |
5557 |
+} |
5558 |
+ |
5559 |
+static void dct_unquantize_h263_inter_e2k(MpegEncContext *s, |
5560 |
+ int16_t *block, int n, int qscale) |
5561 |
+{ |
5562 |
+ int qmul, qadd, nCoeffs, j; |
5563 |
+ __m128i qmulv, qaddv, v0, v1; |
5564 |
+ |
5565 |
+ qmul = qscale << 1; |
5566 |
+ qadd = (qscale - 1) | 1; |
5567 |
+ |
5568 |
+ av_assert2(s->block_last_index[n] >= 0 || s->h263_aic); |
5569 |
+ nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]]; |
5570 |
+ |
5571 |
+ qmulv = _mm_set1_epi16(qmul); |
5572 |
+ qaddv = _mm_set1_epi16(qadd); |
5573 |
+ PRAGMA_E2K("ivdep") |
5574 |
+ for (j = 0; j <= nCoeffs; j += 8) { |
5575 |
+ v0 = _mm_load_si128((const __m128i*)(block + j)); |
5576 |
+ v1 = _mm_mullo_epi16(v0, qmulv); |
5577 |
+ v1 = _mm_add_epi16(v1, _mm_sign_epi16(qaddv, v0)); |
5578 |
+ _mm_store_si128((__m128i*)(block + j), v1); |
5579 |
+ } |
5580 |
+} |
5581 |
+ |
5582 |
+av_cold void ff_mpv_common_init_e2k(MpegEncContext *s) |
5583 |
+{ |
5584 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5585 |
+ return; |
5586 |
+ |
5587 |
+ // !checkasm |
5588 |
+ // fate flv-add_keyframe_index |
5589 |
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_e2k; |
5590 |
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_e2k; |
5591 |
+} |
5592 |
+ |
5593 |
diff --git a/libavcodec/e2k/mpegvideodsp.c b/libavcodec/e2k/mpegvideodsp.c |
5594 |
new file mode 100644 |
5595 |
index 0000000..3d44735 |
5596 |
--- /dev/null |
5597 |
+++ b/libavcodec/e2k/mpegvideodsp.c |
5598 |
@@ -0,0 +1,86 @@ |
5599 |
+/* |
5600 |
+ * GMC (Global Motion Compensation) |
5601 |
+ * |
5602 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5603 |
+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> |
5604 |
+ * |
5605 |
+ * This file is part of FFmpeg. |
5606 |
+ * |
5607 |
+ * FFmpeg is free software; you can redistribute it and/or |
5608 |
+ * modify it under the terms of the GNU Lesser General Public |
5609 |
+ * License as published by the Free Software Foundation; either |
5610 |
+ * version 2.1 of the License, or (at your option) any later version. |
5611 |
+ * |
5612 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5613 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5614 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5615 |
+ * Lesser General Public License for more details. |
5616 |
+ * |
5617 |
+ * You should have received a copy of the GNU Lesser General Public |
5618 |
+ * License along with FFmpeg; if not, write to the Free Software |
5619 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5620 |
+ */ |
5621 |
+ |
5622 |
+#include "libavutil/cpu.h" |
5623 |
+#include "libavutil/mem.h" |
5624 |
+#include "libavutil/e2k/cpu.h" |
5625 |
+#include "libavutil/e2k/util_e2k.h" |
5626 |
+ |
5627 |
+#include "libavcodec/mpegvideodsp.h" |
5628 |
+ |
5629 |
+/* ATM this code assumes stride is a multiple of 8 |
5630 |
+ * to preserve proper dst alignment. */ |
5631 |
+static void gmc1_e2k(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, |
5632 |
+ int stride, int h, int x16, int y16, int rounder) |
5633 |
+{ |
5634 |
+ int i; |
5635 |
+ LOAD_ZERO; |
5636 |
+ vec_u8 dstv, srcvA, srcvB; |
5637 |
+ vec_u16 t0, t1, t2, t3; |
5638 |
+ |
5639 |
+ vec_u16 Av = _mm_set1_epi16((16 - x16) * (16 - y16)); |
5640 |
+ vec_u16 Bv = _mm_set1_epi16( x16 * (16 - y16)); |
5641 |
+ vec_u16 Cv = _mm_set1_epi16((16 - x16) * y16); |
5642 |
+ vec_u16 Dv = _mm_set1_epi16( x16 * y16); |
5643 |
+ vec_u16 rounderV = _mm_set1_epi16(rounder); |
5644 |
+ |
5645 |
+ vec_u8 srcvC = VEC_LD8(src); |
5646 |
+ vec_u8 srcvD = VEC_LD8(src + 1); |
5647 |
+ srcvC = _mm_unpacklo_epi8(srcvC, zerov); |
5648 |
+ srcvD = _mm_unpacklo_epi8(srcvD, zerov); |
5649 |
+ |
5650 |
+ PRAGMA_E2K("ivdep") |
5651 |
+ for (i = 0; i < h; i++) { |
5652 |
+ src += stride; |
5653 |
+ |
5654 |
+ srcvA = srcvC; |
5655 |
+ srcvB = srcvD; |
5656 |
+ srcvC = VEC_LD8(src); |
5657 |
+ srcvD = VEC_LD8(src + 1); |
5658 |
+ srcvC = _mm_unpacklo_epi8(srcvC, zerov); |
5659 |
+ srcvD = _mm_unpacklo_epi8(srcvD, zerov); |
5660 |
+ |
5661 |
+ t0 = _mm_mullo_epi16(srcvA, Av); |
5662 |
+ t1 = _mm_mullo_epi16(srcvB, Bv); |
5663 |
+ t0 = _mm_add_epi16(t0, t1); |
5664 |
+ t2 = _mm_mullo_epi16(srcvC, Cv); |
5665 |
+ t3 = _mm_mullo_epi16(srcvD, Dv); |
5666 |
+ t0 = _mm_add_epi16(t0, rounderV); |
5667 |
+ t2 = _mm_add_epi16(t2, t3); |
5668 |
+ t0 = _mm_add_epi16(t0, t2); |
5669 |
+ t0 = _mm_srli_epi16(t0, 8); |
5670 |
+ dstv = _mm_packus_epi16(t0, t0); |
5671 |
+ |
5672 |
+ VEC_STL(dst, dstv); |
5673 |
+ dst += stride; |
5674 |
+ } |
5675 |
+} |
5676 |
+ |
5677 |
+av_cold void ff_mpegvideodsp_init_e2k(MpegVideoDSPContext *c) |
5678 |
+{ |
5679 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5680 |
+ return; |
5681 |
+ |
5682 |
+ // !checkasm |
5683 |
+ c->gmc1 = gmc1_e2k; |
5684 |
+} |
5685 |
diff --git a/libavcodec/e2k/mpegvideoencdsp.c b/libavcodec/e2k/mpegvideoencdsp.c |
5686 |
new file mode 100644 |
5687 |
index 0000000..c5d3e4d |
5688 |
--- /dev/null |
5689 |
+++ b/libavcodec/e2k/mpegvideoencdsp.c |
5690 |
@@ -0,0 +1,75 @@ |
5691 |
+/* |
5692 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5693 |
+ * |
5694 |
+ * This file is part of FFmpeg. |
5695 |
+ * |
5696 |
+ * FFmpeg is free software; you can redistribute it and/or |
5697 |
+ * modify it under the terms of the GNU Lesser General Public |
5698 |
+ * License as published by the Free Software Foundation; either |
5699 |
+ * version 2.1 of the License, or (at your option) any later version. |
5700 |
+ * |
5701 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5702 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5703 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5704 |
+ * Lesser General Public License for more details. |
5705 |
+ * |
5706 |
+ * You should have received a copy of the GNU Lesser General Public |
5707 |
+ * License along with FFmpeg; if not, write to the Free Software |
5708 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5709 |
+ */ |
5710 |
+ |
5711 |
+#include "config.h" |
5712 |
+ |
5713 |
+#include <stdint.h> |
5714 |
+ |
5715 |
+#include "libavutil/attributes.h" |
5716 |
+#include "libavutil/cpu.h" |
5717 |
+#include "libavutil/e2k/cpu.h" |
5718 |
+#include "libavutil/e2k/util_e2k.h" |
5719 |
+ |
5720 |
+#include "libavcodec/mpegvideoencdsp.h" |
5721 |
+ |
5722 |
+static int pix_norm1_e2k(uint8_t *pix, int line_size) |
5723 |
+{ |
5724 |
+ int i; |
5725 |
+ LOAD_ZERO; |
5726 |
+ __m128i v0, v1, sum = zerov; |
5727 |
+ |
5728 |
+ PRAGMA_E2K("ivdep") |
5729 |
+ for (i = 0; i < 16; i++) { |
5730 |
+ v1 = VEC_LD(pix); |
5731 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); |
5732 |
+ v1 = _mm_unpackhi_epi8(v1, zerov); |
5733 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0)); |
5734 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v1, v1)); |
5735 |
+ pix += line_size; |
5736 |
+ } |
5737 |
+ sum = _mm_hadd_epi32(sum, sum); |
5738 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5739 |
+} |
5740 |
+ |
5741 |
+static int pix_sum_e2k(uint8_t *pix, int line_size) |
5742 |
+{ |
5743 |
+ int i; |
5744 |
+ LOAD_ZERO; |
5745 |
+ __m128i v0, sum = zerov; |
5746 |
+ |
5747 |
+ PRAGMA_E2K("ivdep") |
5748 |
+ for (i = 0; i < 16; i++) { |
5749 |
+ v0 = VEC_LD(pix); |
5750 |
+ sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, zerov)); |
5751 |
+ pix += line_size; |
5752 |
+ } |
5753 |
+ return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2); |
5754 |
+} |
5755 |
+ |
5756 |
+av_cold void ff_mpegvideoencdsp_init_e2k(MpegvideoEncDSPContext *c, |
5757 |
+ AVCodecContext *avctx) |
5758 |
+{ |
5759 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5760 |
+ return; |
5761 |
+ |
5762 |
+ // !checkasm |
5763 |
+ c->pix_norm1 = pix_norm1_e2k; |
5764 |
+ c->pix_sum = pix_sum_e2k; |
5765 |
+} |
5766 |
diff --git a/libavcodec/e2k/pixblockdsp.c b/libavcodec/e2k/pixblockdsp.c |
5767 |
new file mode 100644 |
5768 |
index 0000000..f5a5060 |
5769 |
--- /dev/null |
5770 |
+++ b/libavcodec/e2k/pixblockdsp.c |
5771 |
@@ -0,0 +1,83 @@ |
5772 |
+/* |
5773 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5774 |
+ * Copyright (c) 2002 Brian Foley |
5775 |
+ * Copyright (c) 2002 Dieter Shirley |
5776 |
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
5777 |
+ * |
5778 |
+ * This file is part of FFmpeg. |
5779 |
+ * |
5780 |
+ * FFmpeg is free software; you can redistribute it and/or |
5781 |
+ * modify it under the terms of the GNU Lesser General Public |
5782 |
+ * License as published by the Free Software Foundation; either |
5783 |
+ * version 2.1 of the License, or (at your option) any later version. |
5784 |
+ * |
5785 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5786 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5787 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5788 |
+ * Lesser General Public License for more details. |
5789 |
+ * |
5790 |
+ * You should have received a copy of the GNU Lesser General Public |
5791 |
+ * License along with FFmpeg; if not, write to the Free Software |
5792 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5793 |
+ */ |
5794 |
+ |
5795 |
+#include "config.h" |
5796 |
+ |
5797 |
+#include "libavutil/attributes.h" |
5798 |
+#include "libavutil/cpu.h" |
5799 |
+#include "libavutil/e2k/cpu.h" |
5800 |
+#include "libavutil/e2k/util_e2k.h" |
5801 |
+ |
5802 |
+#include "libavcodec/avcodec.h" |
5803 |
+#include "libavcodec/pixblockdsp.h" |
5804 |
+ |
5805 |
+static void get_pixels_e2k(int16_t * restrict block, const uint8_t *pixels, |
5806 |
+ ptrdiff_t stride) |
5807 |
+{ |
5808 |
+ LOAD_ZERO; |
5809 |
+ __m128i v0; |
5810 |
+ int i; |
5811 |
+ |
5812 |
+ PRAGMA_E2K("ivdep") |
5813 |
+ for (i = 0; i < 8; i++) { |
5814 |
+ v0 = VEC_LD8(pixels); |
5815 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
5816 |
+ VEC_ST(block + i * 8, v0); |
5817 |
+ pixels += stride; |
5818 |
+ } |
5819 |
+} |
5820 |
+ |
5821 |
+static void diff_pixels_e2k(int16_t * restrict block, const uint8_t *s1, |
5822 |
+ const uint8_t *s2, ptrdiff_t stride) |
5823 |
+{ |
5824 |
+ LOAD_ZERO; |
5825 |
+ __m128i v0, v1; |
5826 |
+ int i; |
5827 |
+ |
5828 |
+ PRAGMA_E2K("ivdep") |
5829 |
+ for (i = 0; i < 8; i++) { |
5830 |
+ v0 = VEC_LD8(s1); |
5831 |
+ v1 = VEC_LD8(s2); |
5832 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
5833 |
+ v1 = _mm_unpacklo_epi8(v1, zerov); |
5834 |
+ v0 = _mm_sub_epi16(v0, v1); |
5835 |
+ VEC_ST(block + i * 8, v0); |
5836 |
+ s1 += stride; |
5837 |
+ s2 += stride; |
5838 |
+ } |
5839 |
+} |
5840 |
+ |
5841 |
+av_cold void ff_pixblockdsp_init_e2k(PixblockDSPContext *c, |
5842 |
+ AVCodecContext *avctx, |
5843 |
+ unsigned high_bit_depth) |
5844 |
+{ |
5845 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5846 |
+ return; |
5847 |
+ |
5848 |
+ // checkasm |
5849 |
+ |
5850 |
+ c->diff_pixels = diff_pixels_e2k; |
5851 |
+ |
5852 |
+ if (!high_bit_depth) |
5853 |
+ c->get_pixels = get_pixels_e2k; |
5854 |
+} |
5855 |
diff --git a/libavcodec/e2k/svq1enc.c b/libavcodec/e2k/svq1enc.c |
5856 |
new file mode 100644 |
5857 |
index 0000000..263ac60 |
5858 |
--- /dev/null |
5859 |
+++ b/libavcodec/e2k/svq1enc.c |
5860 |
@@ -0,0 +1,68 @@ |
5861 |
+/* |
5862 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5863 |
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> |
5864 |
+ * |
5865 |
+ * This file is part of FFmpeg. |
5866 |
+ * |
5867 |
+ * FFmpeg is free software; you can redistribute it and/or |
5868 |
+ * modify it under the terms of the GNU Lesser General Public |
5869 |
+ * License as published by the Free Software Foundation; either |
5870 |
+ * version 2.1 of the License, or (at your option) any later version. |
5871 |
+ * |
5872 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5873 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5874 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5875 |
+ * Lesser General Public License for more details. |
5876 |
+ * |
5877 |
+ * You should have received a copy of the GNU Lesser General Public |
5878 |
+ * License along with FFmpeg; if not, write to the Free Software |
5879 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5880 |
+ */ |
5881 |
+ |
5882 |
+#include "config.h" |
5883 |
+ |
5884 |
+#include <stdint.h> |
5885 |
+ |
5886 |
+#include "libavutil/attributes.h" |
5887 |
+#include "libavutil/cpu.h" |
5888 |
+#include "libavutil/e2k/cpu.h" |
5889 |
+#include "libavutil/e2k/util_e2k.h" |
5890 |
+ |
5891 |
+#include "libavcodec/svq1enc.h" |
5892 |
+ |
5893 |
+static int ssd_int8_vs_int16_e2k(const int8_t *pix1, const int16_t *pix2, |
5894 |
+ intptr_t size) |
5895 |
+{ |
5896 |
+ int i, res; |
5897 |
+ __m128i v0, v1, v2, v3, sum = _mm_setzero_si128(); |
5898 |
+ |
5899 |
+ for (i = 0; i + 15 < size; i += 16) { |
5900 |
+ v1 = VEC_LD(pix1); |
5901 |
+ v0 = _mm_srai_epi16(_mm_unpacklo_epi8(v1, v1), 8); |
5902 |
+ v1 = _mm_srai_epi16(_mm_unpackhi_epi8(v1, v1), 8); |
5903 |
+ v2 = VEC_LD(pix2); |
5904 |
+ v3 = VEC_LD(pix2 + 8); |
5905 |
+ v0 = _mm_sub_epi16(v0, v2); |
5906 |
+ v1 = _mm_sub_epi16(v1, v3); |
5907 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0)); |
5908 |
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v1, v1)); |
5909 |
+ pix1 += 16; |
5910 |
+ pix2 += 16; |
5911 |
+ } |
5912 |
+ sum = _mm_hadd_epi32(sum, sum); |
5913 |
+ res = _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1); |
5914 |
+ |
5915 |
+ for (; i < size; i++) |
5916 |
+ res += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]); |
5917 |
+ |
5918 |
+ return res; |
5919 |
+} |
5920 |
+ |
5921 |
+av_cold void ff_svq1enc_init_e2k(SVQ1EncContext *c) |
5922 |
+{ |
5923 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
5924 |
+ return; |
5925 |
+ |
5926 |
+ // !checkasm |
5927 |
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_e2k; |
5928 |
+} |
5929 |
diff --git a/libavcodec/e2k/vc1dsp.c b/libavcodec/e2k/vc1dsp.c |
5930 |
new file mode 100644 |
5931 |
index 0000000..91307a9 |
5932 |
--- /dev/null |
5933 |
+++ b/libavcodec/e2k/vc1dsp.c |
5934 |
@@ -0,0 +1,303 @@ |
5935 |
+/* |
5936 |
+ * VC-1 and WMV3 decoder - DSP functions |
5937 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
5938 |
+ * Copyright (c) 2006 Konstantin Shishkov |
5939 |
+ * |
5940 |
+ * This file is part of FFmpeg. |
5941 |
+ * |
5942 |
+ * FFmpeg is free software; you can redistribute it and/or |
5943 |
+ * modify it under the terms of the GNU Lesser General Public |
5944 |
+ * License as published by the Free Software Foundation; either |
5945 |
+ * version 2.1 of the License, or (at your option) any later version. |
5946 |
+ * |
5947 |
+ * FFmpeg is distributed in the hope that it will be useful, |
5948 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
5949 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
5950 |
+ * Lesser General Public License for more details. |
5951 |
+ * |
5952 |
+ * You should have received a copy of the GNU Lesser General Public |
5953 |
+ * License along with FFmpeg; if not, write to the Free Software |
5954 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5955 |
+ */ |
5956 |
+ |
5957 |
+#include "config.h" |
5958 |
+ |
5959 |
+#include "libavutil/attributes.h" |
5960 |
+#include "libavutil/cpu.h" |
5961 |
+#include "libavutil/e2k/cpu.h" |
5962 |
+#include "libavutil/e2k/util_e2k.h" |
5963 |
+ |
5964 |
+#include "libavcodec/vc1dsp.h" |
5965 |
+ |
5966 |
+// main steps of 8x8 transform |
5967 |
+#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) do { \ |
5968 |
+ t0 = _mm_slli_epi32(_mm_add_epi32(s0, s4), 2); \ |
5969 |
+ t0 = _mm_add_epi32(_mm_slli_epi32(t0, 1), t0); \ |
5970 |
+ t0 = _mm_add_epi32(t0, vec_rnd); \ |
5971 |
+ t1 = _mm_slli_epi32(_mm_sub_epi32(s0, s4), 2); \ |
5972 |
+ t1 = _mm_add_epi32(_mm_slli_epi32(t1, 1), t1); \ |
5973 |
+ t1 = _mm_add_epi32(t1, vec_rnd); \ |
5974 |
+ t2 = _mm_add_epi32(_mm_slli_epi32(s6, 2), _mm_slli_epi32(s6, 1)); \ |
5975 |
+ t2 = _mm_add_epi32(t2, _mm_slli_epi32(s2, 4)); \ |
5976 |
+ t3 = _mm_add_epi32(_mm_slli_epi32(s2, 2), _mm_slli_epi32(s2, 1)); \ |
5977 |
+ t3 = _mm_sub_epi32(t3, _mm_slli_epi32(s6, 4)); \ |
5978 |
+ t4 = _mm_add_epi32(t0, t2); \ |
5979 |
+ t5 = _mm_add_epi32(t1, t3); \ |
5980 |
+ t6 = _mm_sub_epi32(t1, t3); \ |
5981 |
+ t7 = _mm_sub_epi32(t0, t2); \ |
5982 |
+\ |
5983 |
+ t0 = _mm_slli_epi32(_mm_add_epi32(s1, s3), 4); \ |
5984 |
+ t0 = _mm_add_epi32(t0, _mm_slli_epi32(s5, 3)); \ |
5985 |
+ t0 = _mm_add_epi32(t0, _mm_slli_epi32(s7, 2)); \ |
5986 |
+ t0 = _mm_add_epi32(t0, _mm_sub_epi32(s5, s3)); \ |
5987 |
+\ |
5988 |
+ t1 = _mm_slli_epi32(_mm_sub_epi32(s1, s5), 4); \ |
5989 |
+ t1 = _mm_sub_epi32(t1, _mm_slli_epi32(s7, 3)); \ |
5990 |
+ t1 = _mm_sub_epi32(t1, _mm_slli_epi32(s3, 2)); \ |
5991 |
+ t1 = _mm_sub_epi32(t1, _mm_add_epi32(s1, s7)); \ |
5992 |
+\ |
5993 |
+ t2 = _mm_slli_epi32(_mm_sub_epi32(s7, s3), 4); \ |
5994 |
+ t2 = _mm_add_epi32(t2, _mm_slli_epi32(s1, 3)); \ |
5995 |
+ t2 = _mm_add_epi32(t2, _mm_slli_epi32(s5, 2)); \ |
5996 |
+ t2 = _mm_add_epi32(t2, _mm_sub_epi32(s1, s7)); \ |
5997 |
+\ |
5998 |
+ t3 = _mm_slli_epi32(_mm_sub_epi32(s5, s7), 4); \ |
5999 |
+ t3 = _mm_sub_epi32(t3, _mm_slli_epi32(s3, 3)); \ |
6000 |
+ t3 = _mm_add_epi32(t3, _mm_slli_epi32(s1, 2)); \ |
6001 |
+ t3 = _mm_sub_epi32(t3, _mm_add_epi32(s3, s5)); \ |
6002 |
+\ |
6003 |
+ s0 = _mm_add_epi32(t4, t0); \ |
6004 |
+ s1 = _mm_add_epi32(t5, t1); \ |
6005 |
+ s2 = _mm_add_epi32(t6, t2); \ |
6006 |
+ s3 = _mm_add_epi32(t7, t3); \ |
6007 |
+ s4 = _mm_sub_epi32(t7, t3); \ |
6008 |
+ s5 = _mm_sub_epi32(t6, t2); \ |
6009 |
+ s6 = _mm_sub_epi32(t5, t1); \ |
6010 |
+ s7 = _mm_sub_epi32(t4, t0); \ |
6011 |
+}while(0) |
6012 |
+ |
6013 |
+#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) do { \ |
6014 |
+ s0 = _mm_srai_epi32(s0, 3); \ |
6015 |
+ s1 = _mm_srai_epi32(s1, 3); \ |
6016 |
+ s2 = _mm_srai_epi32(s2, 3); \ |
6017 |
+ s3 = _mm_srai_epi32(s3, 3); \ |
6018 |
+ s4 = _mm_srai_epi32(s4, 3); \ |
6019 |
+ s5 = _mm_srai_epi32(s5, 3); \ |
6020 |
+ s6 = _mm_srai_epi32(s6, 3); \ |
6021 |
+ s7 = _mm_srai_epi32(s7, 3); \ |
6022 |
+} while(0) |
6023 |
+ |
6024 |
+#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) do { \ |
6025 |
+ s0 = _mm_srai_epi32(s0, 7); \ |
6026 |
+ s1 = _mm_srai_epi32(s1, 7); \ |
6027 |
+ s2 = _mm_srai_epi32(s2, 7); \ |
6028 |
+ s3 = _mm_srai_epi32(s3, 7); \ |
6029 |
+ s4 = _mm_srai_epi32(_mm_add_epi32(s4, c1), 7); \ |
6030 |
+ s5 = _mm_srai_epi32(_mm_add_epi32(s5, c1), 7); \ |
6031 |
+ s6 = _mm_srai_epi32(_mm_add_epi32(s6, c1), 7); \ |
6032 |
+ s7 = _mm_srai_epi32(_mm_add_epi32(s7, c1), 7); \ |
6033 |
+} while(0) |
6034 |
+ |
6035 |
+/* main steps of 4x4 transform */ |
6036 |
+#define STEP4(s0, s1, s2, s3, vec_rnd) do { \ |
6037 |
+ t1 = _mm_add_epi32(_mm_slli_epi32(s0, 4), s0); \ |
6038 |
+ t1 = _mm_add_epi32(t1, vec_rnd); \ |
6039 |
+ t2 = _mm_add_epi32(_mm_slli_epi32(s2, 4), s2); \ |
6040 |
+ t0 = _mm_add_epi32(t1, t2); \ |
6041 |
+ t1 = _mm_sub_epi32(t1, t2); \ |
6042 |
+ t3 = _mm_slli_epi32(_mm_sub_epi32(s3, s1), 1); \ |
6043 |
+ t3 = _mm_add_epi32(t3, _mm_slli_epi32(t3, 2)); \ |
6044 |
+ t2 = _mm_add_epi32(t3, _mm_slli_epi32(s1, 5)); \ |
6045 |
+ t3 = _mm_add_epi32(t3, _mm_slli_epi32(s3, 3)); \ |
6046 |
+ t3 = _mm_add_epi32(t3, _mm_slli_epi32(s3, 2)); \ |
6047 |
+ s0 = _mm_add_epi32(t0, t2); \ |
6048 |
+ s1 = _mm_sub_epi32(t1, t3); \ |
6049 |
+ s2 = _mm_add_epi32(t1, t3); \ |
6050 |
+ s3 = _mm_sub_epi32(t0, t2); \ |
6051 |
+} while (0) |
6052 |
+ |
6053 |
+#define SHIFT_HOR4(s0, s1, s2, s3) \ |
6054 |
+ s0 = _mm_srai_epi32(s0, 3); \ |
6055 |
+ s1 = _mm_srai_epi32(s1, 3); \ |
6056 |
+ s2 = _mm_srai_epi32(s2, 3); \ |
6057 |
+ s3 = _mm_srai_epi32(s3, 3) |
6058 |
+ |
6059 |
+#define SHIFT_VERT4(s0, s1, s2, s3) \ |
6060 |
+ s0 = _mm_srai_epi32(s0, 7); \ |
6061 |
+ s1 = _mm_srai_epi32(s1, 7); \ |
6062 |
+ s2 = _mm_srai_epi32(s2, 7); \ |
6063 |
+ s3 = _mm_srai_epi32(s3, 7) |
6064 |
+ |
6065 |
+#define _mm_unpacklo1_epi16(v) _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16) |
6066 |
+#define _mm_unpackhi1_epi16(v) _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16) |
6067 |
+ |
6068 |
+#define TRANSPOSE4_32(s0, s1, s2, s3) \ |
6069 |
+ t0 = _mm_unpacklo_epi32(s0, s1); \ |
6070 |
+ t1 = _mm_unpackhi_epi32(s0, s1); \ |
6071 |
+ t2 = _mm_unpacklo_epi32(s2, s3); \ |
6072 |
+ t3 = _mm_unpackhi_epi32(s2, s3); \ |
6073 |
+ s0 = _mm_unpacklo_epi64(t0, t2); \ |
6074 |
+ s1 = _mm_unpackhi_epi64(t0, t2); \ |
6075 |
+ s2 = _mm_unpacklo_epi64(t1, t3); \ |
6076 |
+ s3 = _mm_unpackhi_epi64(t1, t3); |
6077 |
+ |
6078 |
+/* Do inverse transform on 8x8 block */ |
6079 |
+static void vc1_inv_trans_8x8_e2k(int16_t block[64]) |
6080 |
+{ |
6081 |
+ vec_s16 src0, src1, src2, src3, src4, src5, src6, src7; |
6082 |
+ vec_s32 s0, s1, s2, s3, s4, s5, s6, s7; |
6083 |
+ vec_s32 s8, s9, sA, sB, sC, sD, sE, sF; |
6084 |
+ vec_s32 t0, t1, t2, t3, t4, t5, t6, t7; |
6085 |
+ const vec_s32 c64 = _mm_set1_epi32(64); |
6086 |
+ const vec_s32 c4 = _mm_set1_epi32(4); |
6087 |
+ const vec_s32 c1 = _mm_set1_epi32(1); |
6088 |
+ |
6089 |
+ src0 = VEC_LD(block + 8 * 0); |
6090 |
+ src1 = VEC_LD(block + 8 * 1); |
6091 |
+ src2 = VEC_LD(block + 8 * 2); |
6092 |
+ src3 = VEC_LD(block + 8 * 3); |
6093 |
+ src4 = VEC_LD(block + 8 * 4); |
6094 |
+ src5 = VEC_LD(block + 8 * 5); |
6095 |
+ src6 = VEC_LD(block + 8 * 6); |
6096 |
+ src7 = VEC_LD(block + 8 * 7); |
6097 |
+ |
6098 |
+ s0 = _mm_unpacklo1_epi16(src0); |
6099 |
+ s1 = _mm_unpacklo1_epi16(src1); |
6100 |
+ s2 = _mm_unpacklo1_epi16(src2); |
6101 |
+ s3 = _mm_unpacklo1_epi16(src3); |
6102 |
+ s4 = _mm_unpacklo1_epi16(src4); |
6103 |
+ s5 = _mm_unpacklo1_epi16(src5); |
6104 |
+ s6 = _mm_unpacklo1_epi16(src6); |
6105 |
+ s7 = _mm_unpacklo1_epi16(src7); |
6106 |
+ s8 = _mm_unpackhi1_epi16(src0); |
6107 |
+ s9 = _mm_unpackhi1_epi16(src1); |
6108 |
+ sA = _mm_unpackhi1_epi16(src2); |
6109 |
+ sB = _mm_unpackhi1_epi16(src3); |
6110 |
+ sC = _mm_unpackhi1_epi16(src4); |
6111 |
+ sD = _mm_unpackhi1_epi16(src5); |
6112 |
+ sE = _mm_unpackhi1_epi16(src6); |
6113 |
+ sF = _mm_unpackhi1_epi16(src7); |
6114 |
+ STEP8(s0, s1, s2, s3, s4, s5, s6, s7, c4); |
6115 |
+ SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); |
6116 |
+ STEP8(s8, s9, sA, sB, sC, sD, sE, sF, c4); |
6117 |
+ SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); |
6118 |
+ |
6119 |
+ TRANSPOSE4_32(s0, s1, s2, s3) |
6120 |
+ TRANSPOSE4_32(s4, s5, s6, s7) |
6121 |
+ TRANSPOSE4_32(s8, s9, sA, sB) |
6122 |
+ TRANSPOSE4_32(sC, sD, sE, sF) |
6123 |
+ |
6124 |
+ STEP8(s0, s1, s2, s3, s8, s9, sA, sB, c64); |
6125 |
+ SHIFT_VERT8(s0, s1, s2, s3, s8, s9, sA, sB); |
6126 |
+ STEP8(s4, s5, s6, s7, sC, sD, sE, sF, c64); |
6127 |
+ SHIFT_VERT8(s4, s5, s6, s7, sC, sD, sE, sF); |
6128 |
+ src0 = _mm_packs_epi32(s0, s4); |
6129 |
+ src1 = _mm_packs_epi32(s1, s5); |
6130 |
+ src2 = _mm_packs_epi32(s2, s6); |
6131 |
+ src3 = _mm_packs_epi32(s3, s7); |
6132 |
+ src4 = _mm_packs_epi32(s8, sC); |
6133 |
+ src5 = _mm_packs_epi32(s9, sD); |
6134 |
+ src6 = _mm_packs_epi32(sA, sE); |
6135 |
+ src7 = _mm_packs_epi32(sB, sF); |
6136 |
+ |
6137 |
+ VEC_ST(block + 8 * 0, src0); |
6138 |
+ VEC_ST(block + 8 * 1, src1); |
6139 |
+ VEC_ST(block + 8 * 2, src2); |
6140 |
+ VEC_ST(block + 8 * 3, src3); |
6141 |
+ VEC_ST(block + 8 * 4, src4); |
6142 |
+ VEC_ST(block + 8 * 5, src5); |
6143 |
+ VEC_ST(block + 8 * 6, src6); |
6144 |
+ VEC_ST(block + 8 * 7, src7); |
6145 |
+} |
6146 |
+ |
6147 |
+/* Do inverse transform on 8x4 part of block */ |
6148 |
+static void vc1_inv_trans_8x4_e2k(uint8_t *dest, ptrdiff_t stride, |
6149 |
+ int16_t *block) |
6150 |
+{ |
6151 |
+ LOAD_ZERO; |
6152 |
+ vec_s16 src0, src1, src2, src3; |
6153 |
+ vec_s32 s0, s1, s2, s3, s4, s5, s6, s7; |
6154 |
+ vec_s32 t0, t1, t2, t3, t4, t5, t6, t7; |
6155 |
+ const vec_s32 c64 = _mm_set1_epi32(64); |
6156 |
+ const vec_s32 c4 = _mm_set1_epi32(4); |
6157 |
+ __m128i tmp; |
6158 |
+ |
6159 |
+ src0 = VEC_LD(block + 8 * 0); |
6160 |
+ src1 = VEC_LD(block + 8 * 1); |
6161 |
+ src2 = VEC_LD(block + 8 * 2); |
6162 |
+ src3 = VEC_LD(block + 8 * 3); |
6163 |
+ |
6164 |
+ t0 = _mm_unpacklo_epi16(src0, src1); |
6165 |
+ t1 = _mm_unpackhi_epi16(src0, src1); |
6166 |
+ t2 = _mm_unpacklo_epi16(src2, src3); |
6167 |
+ t3 = _mm_unpackhi_epi16(src2, src3); |
6168 |
+ |
6169 |
+ t4 = _mm_unpacklo_epi32(t0, t2); |
6170 |
+ t5 = _mm_unpackhi_epi32(t0, t2); |
6171 |
+ t6 = _mm_unpacklo_epi32(t1, t3); |
6172 |
+ t7 = _mm_unpackhi_epi32(t1, t3); |
6173 |
+ |
6174 |
+ s0 = _mm_unpacklo1_epi16(t4); |
6175 |
+ s1 = _mm_unpackhi1_epi16(t4); |
6176 |
+ s2 = _mm_unpacklo1_epi16(t5); |
6177 |
+ s3 = _mm_unpackhi1_epi16(t5); |
6178 |
+ s4 = _mm_unpacklo1_epi16(t6); |
6179 |
+ s5 = _mm_unpackhi1_epi16(t6); |
6180 |
+ s6 = _mm_unpacklo1_epi16(t7); |
6181 |
+ s7 = _mm_unpackhi1_epi16(t7); |
6182 |
+ |
6183 |
+ STEP8(s0, s1, s2, s3, s4, s5, s6, s7, c4); |
6184 |
+ SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); |
6185 |
+ |
6186 |
+ TRANSPOSE4_32(s0, s1, s2, s3) |
6187 |
+ TRANSPOSE4_32(s4, s5, s6, s7) |
6188 |
+ |
6189 |
+ STEP4(s0, s1, s2, s3, c64); |
6190 |
+ SHIFT_VERT4(s0, s1, s2, s3); |
6191 |
+ STEP4(s4, s5, s6, s7, c64); |
6192 |
+ SHIFT_VERT4(s4, s5, s6, s7); |
6193 |
+ src0 = _mm_packs_epi32(s0, s4); |
6194 |
+ src1 = _mm_packs_epi32(s1, s5); |
6195 |
+ src2 = _mm_packs_epi32(s2, s6); |
6196 |
+ src3 = _mm_packs_epi32(s3, s7); |
6197 |
+ |
6198 |
+#define ADD(dest, src) \ |
6199 |
+ tmp = VEC_LD8(dest); \ |
6200 |
+ tmp = _mm_unpacklo_epi8(tmp, zerov); \ |
6201 |
+ tmp = _mm_adds_epi16(tmp, src); \ |
6202 |
+ tmp = _mm_packus_epi16(tmp, tmp); \ |
6203 |
+ VEC_STL(dest, tmp) |
6204 |
+ |
6205 |
+ ADD(dest, src0); dest += stride; |
6206 |
+ ADD(dest, src1); dest += stride; |
6207 |
+ ADD(dest, src2); dest += stride; |
6208 |
+ ADD(dest, src3); |
6209 |
+} |
6210 |
+ |
6211 |
+#define PUT_OP_U8_E2K(d, s, dst) d = s |
6212 |
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s) |
6213 |
+ |
6214 |
+#define OP_U8_E2K PUT_OP_U8_E2K |
6215 |
+#define PREFIX_no_rnd_vc1_chroma_mc8_e2k put_no_rnd_vc1_chroma_mc8_e2k |
6216 |
+#include "h264chroma_template.c" |
6217 |
+#undef OP_U8_E2K |
6218 |
+#undef PREFIX_no_rnd_vc1_chroma_mc8_e2k |
6219 |
+ |
6220 |
+#define OP_U8_E2K AVG_OP_U8_E2K |
6221 |
+#define PREFIX_no_rnd_vc1_chroma_mc8_e2k avg_no_rnd_vc1_chroma_mc8_e2k |
6222 |
+#include "h264chroma_template.c" |
6223 |
+#undef OP_U8_E2K |
6224 |
+#undef PREFIX_no_rnd_vc1_chroma_mc8_e2k |
6225 |
+ |
6226 |
+av_cold void ff_vc1dsp_init_e2k(VC1DSPContext *dsp) |
6227 |
+{ |
6228 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
6229 |
+ return; |
6230 |
+ |
6231 |
+ // !checkasm |
6232 |
+ |
6233 |
+ dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_e2k; // fate mss2-wmv |
6234 |
+ dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_e2k; // fate wmv3-drm-dec |
6235 |
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_e2k; |
6236 |
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_e2k; |
6237 |
+} |
6238 |
diff --git a/libavcodec/e2k/videodsp.c b/libavcodec/e2k/videodsp.c |
6239 |
new file mode 100644 |
6240 |
index 0000000..d831d68 |
6241 |
--- /dev/null |
6242 |
+++ b/libavcodec/e2k/videodsp.c |
6243 |
@@ -0,0 +1,36 @@ |
6244 |
+/* |
6245 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
6246 |
+ * Copyright (c) 2003-2004 Romain Dolbeau |
6247 |
+ * |
6248 |
+ * This file is part of FFmpeg. |
6249 |
+ * |
6250 |
+ * FFmpeg is free software; you can redistribute it and/or |
6251 |
+ * modify it under the terms of the GNU Lesser General Public |
6252 |
+ * License as published by the Free Software Foundation; either |
6253 |
+ * version 2.1 of the License, or (at your option) any later version. |
6254 |
+ * |
6255 |
+ * FFmpeg is distributed in the hope that it will be useful, |
6256 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
6257 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
6258 |
+ * Lesser General Public License for more details. |
6259 |
+ * |
6260 |
+ * You should have received a copy of the GNU Lesser General Public |
6261 |
+ * License along with FFmpeg; if not, write to the Free Software |
6262 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
6263 |
+ */ |
6264 |
+ |
6265 |
+#include "libavutil/attributes.h" |
6266 |
+#include "libavcodec/videodsp.h" |
6267 |
+ |
6268 |
+static void prefetch_e2k(uint8_t *mem, ptrdiff_t stride, int h) |
6269 |
+{ |
6270 |
+ do { |
6271 |
+ __builtin_prefetch(mem); |
6272 |
+ mem += stride; |
6273 |
+ } while (--h); |
6274 |
+} |
6275 |
+ |
6276 |
+av_cold void ff_videodsp_init_e2k(VideoDSPContext *ctx, int bpc) |
6277 |
+{ |
6278 |
+ ctx->prefetch = prefetch_e2k; |
6279 |
+} |
6280 |
diff --git a/libavcodec/e2k/vorbisdsp.c b/libavcodec/e2k/vorbisdsp.c |
6281 |
new file mode 100644 |
6282 |
index 0000000..7a7619e |
6283 |
--- /dev/null |
6284 |
+++ b/libavcodec/e2k/vorbisdsp.c |
6285 |
@@ -0,0 +1,62 @@ |
6286 |
+/* |
6287 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
6288 |
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
6289 |
+ * |
6290 |
+ * This file is part of FFmpeg. |
6291 |
+ * |
6292 |
+ * FFmpeg is free software; you can redistribute it and/or |
6293 |
+ * modify it under the terms of the GNU Lesser General Public |
6294 |
+ * License as published by the Free Software Foundation; either |
6295 |
+ * version 2.1 of the License, or (at your option) any later version. |
6296 |
+ * |
6297 |
+ * FFmpeg is distributed in the hope that it will be useful, |
6298 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
6299 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
6300 |
+ * Lesser General Public License for more details. |
6301 |
+ * |
6302 |
+ * You should have received a copy of the GNU Lesser General Public |
6303 |
+ * License along with FFmpeg; if not, write to the Free Software |
6304 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
6305 |
+ */ |
6306 |
+ |
6307 |
+#include "config.h" |
6308 |
+ |
6309 |
+#include "libavutil/attributes.h" |
6310 |
+#include "libavutil/cpu.h" |
6311 |
+#include "libavutil/e2k/cpu.h" |
6312 |
+#include "libavutil/e2k/util_e2k.h" |
6313 |
+ |
6314 |
+#include "libavcodec/vorbisdsp.h" |
6315 |
+ |
6316 |
+static void vorbis_inverse_coupling_e2k(float *mag, float *ang, |
6317 |
+ intptr_t blocksize) |
6318 |
+{ |
6319 |
+ int i; |
6320 |
+ vec_f m, a, t0, t1, zerov = _mm_setzero_ps(); |
6321 |
+ vec_f sign = _mm_castsi128_ps(_mm_set1_epi32(1 << 31)); |
6322 |
+ |
6323 |
+ PRAGMA_E2K("ivdep") |
6324 |
+ for (i = 0; i < blocksize; i += 4) { |
6325 |
+ m = _mm_load_ps(mag + i); |
6326 |
+ a = _mm_load_ps(ang + i); |
6327 |
+ t0 = _mm_cmple_ps(m, zerov); |
6328 |
+ t1 = _mm_cmple_ps(a, zerov); |
6329 |
+ a = _mm_xor_ps(a, _mm_and_ps(t0, sign)); |
6330 |
+ t0 = _mm_andnot_ps(t1, a); |
6331 |
+ t1 = _mm_and_ps(t1, a); |
6332 |
+ a = _mm_sub_ps(m, t0); |
6333 |
+ m = _mm_add_ps(m, t1); |
6334 |
+ _mm_store_ps(ang + i, a); |
6335 |
+ _mm_store_ps(mag + i, m); |
6336 |
+ } |
6337 |
+} |
6338 |
+ |
6339 |
+av_cold void ff_vorbisdsp_init_e2k(VorbisDSPContext *c) |
6340 |
+{ |
6341 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
6342 |
+ return; |
6343 |
+ |
6344 |
+ // !checkasm |
6345 |
+ // fate vorbis-encode |
6346 |
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_e2k; |
6347 |
+} |
6348 |
diff --git a/libavcodec/e2k/vp3dsp.c b/libavcodec/e2k/vp3dsp.c |
6349 |
new file mode 100644 |
6350 |
index 0000000..f086096 |
6351 |
--- /dev/null |
6352 |
+++ b/libavcodec/e2k/vp3dsp.c |
6353 |
@@ -0,0 +1,169 @@ |
6354 |
+/* |
6355 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
6356 |
+ * Copyright (C) 2009 David Conrad |
6357 |
+ * |
6358 |
+ * This file is part of FFmpeg. |
6359 |
+ * |
6360 |
+ * FFmpeg is free software; you can redistribute it and/or |
6361 |
+ * modify it under the terms of the GNU Lesser General Public |
6362 |
+ * License as published by the Free Software Foundation; either |
6363 |
+ * version 2.1 of the License, or (at your option) any later version. |
6364 |
+ * |
6365 |
+ * FFmpeg is distributed in the hope that it will be useful, |
6366 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
6367 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
6368 |
+ * Lesser General Public License for more details. |
6369 |
+ * |
6370 |
+ * You should have received a copy of the GNU Lesser General Public |
6371 |
+ * License along with FFmpeg; if not, write to the Free Software |
6372 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
6373 |
+ */ |
6374 |
+ |
6375 |
+#include <string.h> |
6376 |
+ |
6377 |
+#include "config.h" |
6378 |
+ |
6379 |
+#include "libavutil/attributes.h" |
6380 |
+#include "libavutil/cpu.h" |
6381 |
+#include "libavutil/e2k/cpu.h" |
6382 |
+#include "libavutil/e2k/util_e2k.h" |
6383 |
+ |
6384 |
+#include "libavcodec/vp3dsp.h" |
6385 |
+ |
6386 |
+#define IDCT_START(extra) \ |
6387 |
+ vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; \ |
6388 |
+ vec_s16 Ed, Gd, Add, Bdd, Fd, Hd; \ |
6389 |
+ vec_s16 addv = _mm_set1_epi16(extra + 8); \ |
6390 |
+ \ |
6391 |
+ vec_s16 C1 = _mm_set1_epi16(64277); \ |
6392 |
+ vec_s16 C2 = _mm_set1_epi16(60547); \ |
6393 |
+ vec_s16 C3 = _mm_set1_epi16(54491); \ |
6394 |
+ vec_s16 C4 = _mm_set1_epi16(46341); \ |
6395 |
+ vec_s16 C5 = _mm_set1_epi16(36410); \ |
6396 |
+ vec_s16 C6 = _mm_set1_epi16(25080); \ |
6397 |
+ vec_s16 C7 = _mm_set1_epi16(12785); \ |
6398 |
+ \ |
6399 |
+ vec_s16 b0 = VEC_LD(block + 8 * 0); \ |
6400 |
+ vec_s16 b1 = VEC_LD(block + 8 * 1); \ |
6401 |
+ vec_s16 b2 = VEC_LD(block + 8 * 2); \ |
6402 |
+ vec_s16 b3 = VEC_LD(block + 8 * 3); \ |
6403 |
+ vec_s16 b4 = VEC_LD(block + 8 * 4); \ |
6404 |
+ vec_s16 b5 = VEC_LD(block + 8 * 5); \ |
6405 |
+ vec_s16 b6 = VEC_LD(block + 8 * 6); \ |
6406 |
+ vec_s16 b7 = VEC_LD(block + 8 * 7); |
6407 |
+ |
6408 |
+// these functions do (a*C)>>16 |
6409 |
+// things are tricky because a is signed, but C unsigned. |
6410 |
+// M15 is used if C fits in 15 bit unsigned (C6,C7) |
6411 |
+// M16 is used if C requires 16 bits unsigned |
6412 |
+#define M15(a, C) _mm_mulhi_epi16(a, C) |
6413 |
+#define M16(a, C) _mm_add_epi16(a, M15(a, C)) |
6414 |
+ |
6415 |
+#define IDCT_1D(ADD, SHIFT)\ |
6416 |
+ A = _mm_add_epi16(M16(b1, C1), M15(b7, C7)); \ |
6417 |
+ B = _mm_sub_epi16(M15(b1, C7), M16(b7, C1)); \ |
6418 |
+ C = _mm_add_epi16(M16(b3, C3), M16(b5, C5)); \ |
6419 |
+ D = _mm_sub_epi16(M16(b5, C3), M16(b3, C5)); \ |
6420 |
+ \ |
6421 |
+ Ad = M16(_mm_sub_epi16(A, C), C4); \ |
6422 |
+ Bd = M16(_mm_sub_epi16(B, D), C4); \ |
6423 |
+ \ |
6424 |
+ Cd = _mm_add_epi16(A, C); \ |
6425 |
+ Dd = _mm_add_epi16(B, D); \ |
6426 |
+ \ |
6427 |
+ E = ADD(M16(_mm_add_epi16(b0, b4), C4)); \ |
6428 |
+ F = ADD(M16(_mm_sub_epi16(b0, b4), C4)); \ |
6429 |
+ \ |
6430 |
+ G = _mm_add_epi16(M16(b2, C2), M15(b6, C6)); \ |
6431 |
+ H = _mm_sub_epi16(M15(b2, C6), M16(b6, C2)); \ |
6432 |
+ \ |
6433 |
+ Ed = _mm_sub_epi16(E, G); \ |
6434 |
+ Gd = _mm_add_epi16(E, G); \ |
6435 |
+ \ |
6436 |
+ Add = _mm_add_epi16(F, Ad); \ |
6437 |
+ Bdd = _mm_sub_epi16(Bd, H); \ |
6438 |
+ \ |
6439 |
+ Fd = _mm_sub_epi16(F, Ad); \ |
6440 |
+ Hd = _mm_add_epi16(Bd, H); \ |
6441 |
+ \ |
6442 |
+ b0 = SHIFT(_mm_add_epi16(Gd, Cd)); \ |
6443 |
+ b7 = SHIFT(_mm_sub_epi16(Gd, Cd)); \ |
6444 |
+ \ |
6445 |
+ b1 = SHIFT(_mm_add_epi16(Add, Hd)); \ |
6446 |
+ b2 = SHIFT(_mm_sub_epi16(Add, Hd)); \ |
6447 |
+ \ |
6448 |
+ b3 = SHIFT(_mm_add_epi16(Ed, Dd)); \ |
6449 |
+ b4 = SHIFT(_mm_sub_epi16(Ed, Dd)); \ |
6450 |
+ \ |
6451 |
+ b5 = SHIFT(_mm_add_epi16(Fd, Bdd)); \ |
6452 |
+ b6 = SHIFT(_mm_sub_epi16(Fd, Bdd)); |
6453 |
+ |
6454 |
+#define NOP(a) a |
6455 |
+#define ADD8(a) _mm_add_epi16(a, addv) |
6456 |
+#define SHIFT4(a) _mm_srai_epi16(a, 4) |
6457 |
+ |
6458 |
+static void vp3_idct_put_e2k(uint8_t *dst, ptrdiff_t stride, int16_t block[64]) |
6459 |
+{ |
6460 |
+ vec_u8 vdst; |
6461 |
+ IDCT_START(2048) |
6462 |
+ |
6463 |
+ IDCT_1D(NOP, NOP) |
6464 |
+ TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); |
6465 |
+ IDCT_1D(ADD8, SHIFT4) |
6466 |
+ |
6467 |
+#define PUT(a) \ |
6468 |
+ vdst = _mm_packus_epi16(a, a); \ |
6469 |
+ VEC_STL(dst, vdst); |
6470 |
+ |
6471 |
+ PUT(b0) dst += stride; |
6472 |
+ PUT(b1) dst += stride; |
6473 |
+ PUT(b2) dst += stride; |
6474 |
+ PUT(b3) dst += stride; |
6475 |
+ PUT(b4) dst += stride; |
6476 |
+ PUT(b5) dst += stride; |
6477 |
+ PUT(b6) dst += stride; |
6478 |
+ PUT(b7) |
6479 |
+ memset(block, 0, sizeof(*block) * 64); |
6480 |
+} |
6481 |
+ |
6482 |
+static void vp3_idct_add_e2k(uint8_t *dst, ptrdiff_t stride, int16_t block[64]) |
6483 |
+{ |
6484 |
+ LOAD_ZERO; |
6485 |
+ vec_u8 vdst; |
6486 |
+ vec_s16 vdst_16; |
6487 |
+ |
6488 |
+ IDCT_START(0) |
6489 |
+ |
6490 |
+ IDCT_1D(NOP, NOP) |
6491 |
+ TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); |
6492 |
+ IDCT_1D(ADD8, SHIFT4) |
6493 |
+ |
6494 |
+#define ADD(a) \ |
6495 |
+ vdst = VEC_LD8(dst); \ |
6496 |
+ vdst_16 = _mm_unpacklo_epi8(vdst, zerov); \ |
6497 |
+ vdst_16 = _mm_adds_epi16(a, vdst_16); \ |
6498 |
+ vdst = _mm_packus_epi16(vdst_16, vdst_16); \ |
6499 |
+ VEC_STL(dst, vdst); |
6500 |
+ |
6501 |
+ ADD(b0) dst += stride; |
6502 |
+ ADD(b1) dst += stride; |
6503 |
+ ADD(b2) dst += stride; |
6504 |
+ ADD(b3) dst += stride; |
6505 |
+ ADD(b4) dst += stride; |
6506 |
+ ADD(b5) dst += stride; |
6507 |
+ ADD(b6) dst += stride; |
6508 |
+ ADD(b7) |
6509 |
+ memset(block, 0, sizeof(*block) * 64); |
6510 |
+} |
6511 |
+ |
6512 |
+av_cold void ff_vp3dsp_init_e2k(VP3DSPContext *c, int flags) |
6513 |
+{ |
6514 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
6515 |
+ return; |
6516 |
+ |
6517 |
+ // !checkasm |
6518 |
+ // fate theora-coeff-level64 |
6519 |
+ |
6520 |
+ c->idct_put = vp3_idct_put_e2k; |
6521 |
+ c->idct_add = vp3_idct_add_e2k; |
6522 |
+} |
6523 |
diff --git a/libavcodec/e2k/vp8dsp.c b/libavcodec/e2k/vp8dsp.c |
6524 |
new file mode 100644 |
6525 |
index 0000000..61b46b3 |
6526 |
--- /dev/null |
6527 |
+++ b/libavcodec/e2k/vp8dsp.c |
6528 |
@@ -0,0 +1,428 @@ |
6529 |
+/* |
6530 |
+ * VP8 compatible video decoder |
6531 |
+ * |
6532 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
6533 |
+ * Copyright (C) 2010 David Conrad |
6534 |
+ * |
6535 |
+ * This file is part of FFmpeg. |
6536 |
+ * |
6537 |
+ * FFmpeg is free software; you can redistribute it and/or |
6538 |
+ * modify it under the terms of the GNU Lesser General Public |
6539 |
+ * License as published by the Free Software Foundation; either |
6540 |
+ * version 2.1 of the License, or (at your option) any later version. |
6541 |
+ * |
6542 |
+ * FFmpeg is distributed in the hope that it will be useful, |
6543 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
6544 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
6545 |
+ * Lesser General Public License for more details. |
6546 |
+ * |
6547 |
+ * You should have received a copy of the GNU Lesser General Public |
6548 |
+ * License along with FFmpeg; if not, write to the Free Software |
6549 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
6550 |
+ */ |
6551 |
+ |
6552 |
+#include "config.h" |
6553 |
+ |
6554 |
+#include "libavutil/cpu.h" |
6555 |
+#include "libavutil/mem.h" |
6556 |
+#include "libavutil/e2k/cpu.h" |
6557 |
+#include "libavutil/e2k/util_e2k.h" |
6558 |
+ |
6559 |
+#include "libavcodec/vp8dsp.h" |
6560 |
+ |
6561 |
+#include "hpeldsp.h" |
6562 |
+ |
6563 |
+#define REPT4(a, b, c, d) { a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d } |
6564 |
+ |
6565 |
+// h subpel filter uses msum to multiply+add 4 pixel taps at once |
6566 |
+static const uint8_t ALIGNED(16) h_subpel_filters_inner[7][16] = |
6567 |
+{ |
6568 |
+ REPT4( -6, 123, 12, -1), |
6569 |
+ REPT4(-11, 108, 36, -8), |
6570 |
+ REPT4( -9, 93, 50, -6), |
6571 |
+ REPT4(-16, 77, 77, -16), |
6572 |
+ REPT4( -6, 50, 93, -9), |
6573 |
+ REPT4( -8, 36, 108, -11), |
6574 |
+ REPT4( -1, 12, 123, -6) |
6575 |
+}; |
6576 |
+ |
6577 |
+// for 6tap filters, these are the outer two taps |
6578 |
+// The zeros mask off pixels 4-7 when filtering 0-3 |
6579 |
+// and vice-versa |
6580 |
+static const uint8_t ALIGNED(16) h_subpel_filters_outer[3][16] = |
6581 |
+{ |
6582 |
+ REPT4(2, 1, 2, 1), |
6583 |
+ REPT4(3, 3, 3, 3), |
6584 |
+ REPT4(1, 2, 1, 2) |
6585 |
+}; |
6586 |
+ |
6587 |
+#define INNER_PERM(x) x, x+1, x+2, x+3, x+1, x+2, x+3, x+4 |
6588 |
+ |
6589 |
+#define INIT_H_SUBPEL_FILTER(j, n, is6tap) \ |
6590 |
+ vec_s8 filter_inner = *(__m128i*)h_subpel_filters_inner[j]; \ |
6591 |
+ is6tap( \ |
6592 |
+ vec_s8 filter_outer = *(__m128i*)h_subpel_filters_outer[(j) >> 1]; \ |
6593 |
+ vec_u8 perm_outer = _mm_setr_epi8(0,5, 1,6, 2,7, 3,8, 4,9, 5,10, 6,11, 7,12); \ |
6594 |
+ ) \ |
6595 |
+ vec_s32 c64 = _mm_set1_epi16(64); \ |
6596 |
+ vec_u8 perm_inner_l = _mm_setr_epi8(INNER_PERM(n), INNER_PERM(n + 2)); \ |
6597 |
+ vec_u8 perm_inner_h = _mm_setr_epi8(INNER_PERM(n + 4), INNER_PERM(n + 6)); \ |
6598 |
+ __m128i v0, v1; \ |
6599 |
+ int i |
6600 |
+ |
6601 |
+#define FILTER_H(a, is6tap) \ |
6602 |
+ v0 = _mm_shuffle_epi8(a, perm_inner_l); \ |
6603 |
+ v1 = _mm_shuffle_epi8(a, perm_inner_h); \ |
6604 |
+ v0 = _mm_maddubs_epi16(v0, filter_inner); \ |
6605 |
+ v1 = _mm_maddubs_epi16(v1, filter_inner); \ |
6606 |
+ v0 = _mm_hadds_epi16(v0, v1); \ |
6607 |
+ is6tap( \ |
6608 |
+ a = _mm_shuffle_epi8(a, perm_outer); \ |
6609 |
+ v0 = _mm_adds_epi16(v0, _mm_maddubs_epi16(a, filter_outer)); \ |
6610 |
+ ) \ |
6611 |
+ v0 = _mm_adds_epi16(v0, c64); \ |
6612 |
+ a = _mm_srai_epi16(v0, 7) |
6613 |
+ |
6614 |
+#define INIT_H_SUBPEL_FILTER4(j, n, is6tap) \ |
6615 |
+ __m64 filter_inner = *(__m64*)h_subpel_filters_inner[j]; \ |
6616 |
+ is6tap( \ |
6617 |
+ __m64 filter_outer = *(__m64*)h_subpel_filters_outer[(j) >> 1]; \ |
6618 |
+ __m64 perm_outer = _mm_setr_pi8(0,5, 1,6, 2,7, 3,8); \ |
6619 |
+ __m64 a1; \ |
6620 |
+ ) \ |
6621 |
+ __m64 c64 = _mm_set1_pi16(64); \ |
6622 |
+ __m64 perm_inner_l = _mm_setr_pi8(INNER_PERM(n)); \ |
6623 |
+ __m64 perm_inner_h = _mm_setr_pi8(INNER_PERM(n + 2)); \ |
6624 |
+ __m64 v0, v1, a0; \ |
6625 |
+ int i |
6626 |
+ |
6627 |
+#define FILTER_H4(is6tap) \ |
6628 |
+ v0 = _mm_shuffle_pi8(a0, perm_inner_l); \ |
6629 |
+ v1 = _mm_shuffle_pi8(a0, perm_inner_h); \ |
6630 |
+ v0 = _mm_maddubs_pi16(v0, filter_inner); \ |
6631 |
+ v1 = _mm_maddubs_pi16(v1, filter_inner); \ |
6632 |
+ v0 = _mm_hadds_pi16(v0, v1); \ |
6633 |
+ is6tap( \ |
6634 |
+ a0 = _mm_shuffle2_pi8(a0, a1, perm_outer); \ |
6635 |
+ v0 = _mm_adds_pi16(v0, _mm_maddubs_pi16(a0, filter_outer)); \ |
6636 |
+ ) \ |
6637 |
+ v0 = _mm_adds_pi16(v0, c64); \ |
6638 |
+ a0 = _mm_srai_pi16(v0, 7); \ |
6639 |
+ a0 = _mm_packs_pu16(a0, a0); \ |
6640 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(a0) |
6641 |
+ |
6642 |
+#define COPY(code) code |
6643 |
+#define NOP(code) |
6644 |
+#define IF6TAP(code) code |
6645 |
+ |
6646 |
+static void put_vp8_epel16_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6647 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6648 |
+{ |
6649 |
+ INIT_H_SUBPEL_FILTER(mx - 1, 1, IF6TAP); |
6650 |
+ __m128i a0, a1; |
6651 |
+ |
6652 |
+ PRAGMA_E2K("ivdep") |
6653 |
+ for (i = 0; i < h; i++) { |
6654 |
+ a0 = VEC_LD(src - 2); |
6655 |
+ a1 = VEC_LD(src - 2 + 8); |
6656 |
+ FILTER_H(a0, IF6TAP); |
6657 |
+ FILTER_H(a1, IF6TAP); |
6658 |
+ a0 = _mm_packus_epi16(a0, a1); |
6659 |
+ VEC_ST(dst, a0); |
6660 |
+ src += src_stride; |
6661 |
+ dst += dst_stride; |
6662 |
+ } |
6663 |
+} |
6664 |
+ |
6665 |
+static void put_vp8_epel8_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6666 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6667 |
+{ |
6668 |
+ INIT_H_SUBPEL_FILTER(mx - 1, 1, IF6TAP); |
6669 |
+ __m128i a0; |
6670 |
+ |
6671 |
+ PRAGMA_E2K("ivdep") |
6672 |
+ for (i = 0; i < h; i++) { |
6673 |
+ a0 = VEC_LD(src - 2); |
6674 |
+ FILTER_H(a0, IF6TAP); |
6675 |
+ a0 = _mm_packus_epi16(a0, a0); |
6676 |
+ VEC_STL(dst, a0); |
6677 |
+ src += src_stride; |
6678 |
+ dst += dst_stride; |
6679 |
+ } |
6680 |
+} |
6681 |
+ |
6682 |
+static void put_vp8_epel8_h4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6683 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6684 |
+{ |
6685 |
+ INIT_H_SUBPEL_FILTER(mx - 1, 0, NOP); |
6686 |
+ __m128i a0; |
6687 |
+ |
6688 |
+ PRAGMA_E2K("ivdep") |
6689 |
+ for (i = 0; i < h; i++) { |
6690 |
+ a0 = VEC_LD(src - 1); |
6691 |
+ FILTER_H(a0, NOP); |
6692 |
+ a0 = _mm_packus_epi16(a0, a0); |
6693 |
+ VEC_STL(dst, a0); |
6694 |
+ src += src_stride; |
6695 |
+ dst += dst_stride; |
6696 |
+ } |
6697 |
+} |
6698 |
+ |
6699 |
+static void put_vp8_epel4_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6700 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6701 |
+{ |
6702 |
+ INIT_H_SUBPEL_FILTER4(mx - 1, 1, IF6TAP); |
6703 |
+ |
6704 |
+ PRAGMA_E2K("ivdep") |
6705 |
+ for (i = 0; i < h; i++) { |
6706 |
+ a0 = *(__m64*)(src - 2); |
6707 |
+ a1 = _mm_cvtsi32_si64(src[8 - 2]); |
6708 |
+ FILTER_H4(IF6TAP); |
6709 |
+ src += src_stride; |
6710 |
+ dst += dst_stride; |
6711 |
+ } |
6712 |
+} |
6713 |
+ |
6714 |
+static void put_vp8_epel4_h4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6715 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6716 |
+{ |
6717 |
+ INIT_H_SUBPEL_FILTER4(mx - 1, 0, NOP); |
6718 |
+ |
6719 |
+ PRAGMA_E2K("ivdep") |
6720 |
+ for (i = 0; i < h; i++) { |
6721 |
+ a0 = *(__m64*)(src - 1); |
6722 |
+ FILTER_H4(NOP); |
6723 |
+ src += src_stride; |
6724 |
+ dst += dst_stride; |
6725 |
+ } |
6726 |
+} |
6727 |
+ |
6728 |
+#define PAIR_8X2(a, b) (a & 255) | b * 256 |
6729 |
+static const int16_t v_subpel_filters[7][3] = |
6730 |
+{ |
6731 |
+ { PAIR_8X2( -6, 123), PAIR_8X2( 12, -1), PAIR_8X2(0, 0) }, |
6732 |
+ { PAIR_8X2(-11, 108), PAIR_8X2( 36, -8), PAIR_8X2(2, 1) }, |
6733 |
+ { PAIR_8X2( -9, 93), PAIR_8X2( 50, -6), PAIR_8X2(0, 0) }, |
6734 |
+ { PAIR_8X2(-16, 77), PAIR_8X2( 77, -16), PAIR_8X2(3, 3) }, |
6735 |
+ { PAIR_8X2( -6, 50), PAIR_8X2( 93, -9), PAIR_8X2(0, 0) }, |
6736 |
+ { PAIR_8X2( -8, 36), PAIR_8X2(108, -11), PAIR_8X2(1, 2) }, |
6737 |
+ { PAIR_8X2( -1, 12), PAIR_8X2(123, -6), PAIR_8X2(0, 0) } |
6738 |
+}; |
6739 |
+ |
6740 |
+#define INIT_V_SUBPEL_FILTER(p, type, j, is6tap) \ |
6741 |
+ type v0, v1, r0; \ |
6742 |
+ type c64 = _mm_set1_##p(64); \ |
6743 |
+ type f0 = _mm_set1_##p(v_subpel_filters[j][0]); \ |
6744 |
+ type f1 = _mm_set1_##p(v_subpel_filters[j][1]); \ |
6745 |
+ is6tap(type f2 = _mm_set1_##p(v_subpel_filters[j][2]);) \ |
6746 |
+ int i |
6747 |
+ |
6748 |
+#define FILTER_V(p, dstv, lo, CVT, is6tap) \ |
6749 |
+ v0 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s1), CVT(s2)), f0); \ |
6750 |
+ v1 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s3), CVT(s4)), f1); \ |
6751 |
+ v0 = _mm_adds_##p(v0, v1); \ |
6752 |
+ is6tap( \ |
6753 |
+ v1 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s0), CVT(s5)), f2); \ |
6754 |
+ v0 = _mm_adds_##p(v0, v1); \ |
6755 |
+ ) \ |
6756 |
+ v0 = _mm_adds_##p(v0, c64); \ |
6757 |
+ dstv = _mm_srai_##p(v0, 7) |
6758 |
+ |
6759 |
+static void put_vp8_epel16_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6760 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6761 |
+{ |
6762 |
+ INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, IF6TAP); |
6763 |
+ __m128i s0, s1, s2, s3, s4, s5; |
6764 |
+ |
6765 |
+ s0 = VEC_LD(src - 2 * src_stride); |
6766 |
+ s1 = VEC_LD(src - 1 * src_stride); |
6767 |
+ s2 = VEC_LD(src); |
6768 |
+ s3 = VEC_LD(src + 1 * src_stride); |
6769 |
+ s4 = VEC_LD(src + 2 * src_stride); |
6770 |
+ src += src_stride * 3; |
6771 |
+ |
6772 |
+ PRAGMA_E2K("ivdep") |
6773 |
+ for (i = 0; i < h; i++) { |
6774 |
+ s5 = VEC_LD(src); |
6775 |
+ FILTER_V(epi16, r0, lo_epi8, COPY, IF6TAP); |
6776 |
+ FILTER_V(epi16, v0, hi_epi8, COPY, IF6TAP); |
6777 |
+ r0 = _mm_packus_epi16(r0, v0); |
6778 |
+ VEC_ST(dst, r0); |
6779 |
+ s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; |
6780 |
+ dst += dst_stride; |
6781 |
+ src += src_stride; |
6782 |
+ } |
6783 |
+} |
6784 |
+ |
6785 |
+static void put_vp8_epel8_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6786 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6787 |
+{ |
6788 |
+ INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, IF6TAP); |
6789 |
+ __m64 s0, s1, s2, s3, s4, s5; |
6790 |
+ |
6791 |
+ s0 = *(__m64*)(src - 2 * src_stride); |
6792 |
+ s1 = *(__m64*)(src - 1 * src_stride); |
6793 |
+ s2 = *(__m64*)src; |
6794 |
+ s3 = *(__m64*)(src + 1 * src_stride); |
6795 |
+ s4 = *(__m64*)(src + 2 * src_stride); |
6796 |
+ src += src_stride * 3; |
6797 |
+ |
6798 |
+ PRAGMA_E2K("ivdep") |
6799 |
+ for (i = 0; i < h; i++) { |
6800 |
+ s5 = *(__m64*)src; |
6801 |
+ FILTER_V(epi16, r0, lo_epi8, _mm_movpi64_epi64, IF6TAP); |
6802 |
+ r0 = _mm_packus_epi16(r0, r0); |
6803 |
+ VEC_STL(dst, r0); |
6804 |
+ s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; |
6805 |
+ dst += dst_stride; |
6806 |
+ src += src_stride; |
6807 |
+ } |
6808 |
+} |
6809 |
+ |
6810 |
+static void put_vp8_epel4_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6811 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6812 |
+{ |
6813 |
+ INIT_V_SUBPEL_FILTER(pi16, __m64, my - 1, IF6TAP); |
6814 |
+ __m64 s0, s1, s2, s3, s4, s5; |
6815 |
+ |
6816 |
+ s0 = _mm_cvtsi32_si64(*(uint32_t*)(src - 2 * src_stride)); |
6817 |
+ s1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride)); |
6818 |
+ s2 = _mm_cvtsi32_si64(*(uint32_t*)src); |
6819 |
+ s3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride)); |
6820 |
+ s4 = _mm_cvtsi32_si64(*(uint32_t*)(src + 2 * src_stride)); |
6821 |
+ src += src_stride * 3; |
6822 |
+ |
6823 |
+ PRAGMA_E2K("ivdep") |
6824 |
+ for (i = 0; i < h; i++) { |
6825 |
+ s5 = _mm_cvtsi32_si64(*(uint32_t*)src); |
6826 |
+ FILTER_V(pi16, r0, lo_pi8, COPY, IF6TAP); |
6827 |
+ r0 = _mm_packs_pu16(r0, r0); |
6828 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(r0); |
6829 |
+ s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; |
6830 |
+ dst += dst_stride; |
6831 |
+ src += src_stride; |
6832 |
+ } |
6833 |
+} |
6834 |
+ |
6835 |
+static void put_vp8_epel8_v4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6836 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6837 |
+{ |
6838 |
+ INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, NOP); |
6839 |
+ __m64 s1, s2, s3, s4; |
6840 |
+ |
6841 |
+ s1 = *(__m64*)(src - 1 * src_stride); |
6842 |
+ s2 = *(__m64*)src; |
6843 |
+ s3 = *(__m64*)(src + 1 * src_stride); |
6844 |
+ src += src_stride * 2; |
6845 |
+ |
6846 |
+ PRAGMA_E2K("ivdep") |
6847 |
+ for (i = 0; i < h; i++) { |
6848 |
+ s4 = *(__m64*)src; |
6849 |
+ FILTER_V(epi16, r0, lo_epi8, _mm_movpi64_epi64, NOP); |
6850 |
+ r0 = _mm_packus_epi16(r0, r0); |
6851 |
+ VEC_STL(dst, r0); |
6852 |
+ s1 = s2; s2 = s3; s3 = s4; |
6853 |
+ dst += dst_stride; |
6854 |
+ src += src_stride; |
6855 |
+ } |
6856 |
+} |
6857 |
+ |
6858 |
+static void put_vp8_epel4_v4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, |
6859 |
+ ptrdiff_t src_stride, int h, int mx, int my) |
6860 |
+{ |
6861 |
+ INIT_V_SUBPEL_FILTER(pi16, __m64, my - 1, NOP); |
6862 |
+ __m64 s1, s2, s3, s4; |
6863 |
+ |
6864 |
+ s1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride)); |
6865 |
+ s2 = _mm_cvtsi32_si64(*(uint32_t*)src); |
6866 |
+ s3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride)); |
6867 |
+ src += src_stride * 2; |
6868 |
+ |
6869 |
+ PRAGMA_E2K("ivdep") |
6870 |
+ for (i = 0; i < h; i++) { |
6871 |
+ s4 = _mm_cvtsi32_si64(*(uint32_t*)src); |
6872 |
+ FILTER_V(pi16, r0, lo_pi8, COPY, NOP); |
6873 |
+ r0 = _mm_packs_pu16(r0, r0); |
6874 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(r0); |
6875 |
+ s1 = s2; s2 = s3; s3 = s4; |
6876 |
+ dst += dst_stride; |
6877 |
+ src += src_stride; |
6878 |
+ } |
6879 |
+} |
6880 |
+ |
6881 |
+#define EPEL_HV(WIDTH, HTAPS, VTAPS) \ |
6882 |
+static void put_vp8_epel##WIDTH##_h##HTAPS##v##VTAPS##_e2k(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \ |
6883 |
+{ \ |
6884 |
+ DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \ |
6885 |
+ if (VTAPS == 6) { \ |
6886 |
+ put_vp8_epel##WIDTH##_h##HTAPS##_e2k(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \ |
6887 |
+ put_vp8_epel##WIDTH##_v##VTAPS##_e2k(dst, dstride, tmp+2*16, 16, h, mx, my); \ |
6888 |
+ } else { \ |
6889 |
+ put_vp8_epel##WIDTH##_h##HTAPS##_e2k(tmp, 16, src-sstride, sstride, h+4, mx, my); \ |
6890 |
+ put_vp8_epel##WIDTH##_v##VTAPS##_e2k(dst, dstride, tmp+16, 16, h, mx, my); \ |
6891 |
+ } \ |
6892 |
+} |
6893 |
+ |
6894 |
+EPEL_HV(16, 6,6) |
6895 |
+EPEL_HV(8, 6,6) |
6896 |
+EPEL_HV(8, 4,6) |
6897 |
+EPEL_HV(8, 6,4) |
6898 |
+EPEL_HV(8, 4,4) |
6899 |
+EPEL_HV(4, 6,6) |
6900 |
+EPEL_HV(4, 4,6) |
6901 |
+EPEL_HV(4, 6,4) |
6902 |
+EPEL_HV(4, 4,4) |
6903 |
+ |
6904 |
+static void put_vp8_pixels16_e2k(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, |
6905 |
+ ptrdiff_t sstride, int h, int mx, int my) |
6906 |
+{ |
6907 |
+ __m128i v0, v1, v2, v3; |
6908 |
+ int i; |
6909 |
+ |
6910 |
+ PRAGMA_E2K("ivdep") |
6911 |
+ for (i = 0; i < h; i += 4) { |
6912 |
+ v0 = VEC_LD(src); |
6913 |
+ v1 = VEC_LD(src + sstride); |
6914 |
+ v2 = VEC_LD(src + sstride * 2); |
6915 |
+ v3 = VEC_LD(src + sstride * 3); |
6916 |
+ VEC_ST(dst, v0); |
6917 |
+ VEC_ST(dst + dstride, v1); |
6918 |
+ VEC_ST(dst + dstride * 2, v2); |
6919 |
+ VEC_ST(dst + dstride * 3, v3); |
6920 |
+ src += sstride * 4; |
6921 |
+ dst += dstride * 4; |
6922 |
+ } |
6923 |
+} |
6924 |
+ |
6925 |
+ |
6926 |
+av_cold void ff_vp78dsp_init_e2k(VP8DSPContext *c) |
6927 |
+{ |
6928 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
6929 |
+ return; |
6930 |
+ |
6931 |
+ // checkasm |
6932 |
+ c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_e2k; |
6933 |
+ c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_e2k; |
6934 |
+ c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_e2k; |
6935 |
+ c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_e2k; |
6936 |
+ |
6937 |
+ c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_e2k; |
6938 |
+ c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_e2k; |
6939 |
+ c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_e2k; |
6940 |
+ c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_e2k; |
6941 |
+ |
6942 |
+ c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_e2k; |
6943 |
+ c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_e2k; |
6944 |
+ c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_e2k; |
6945 |
+ c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_e2k; |
6946 |
+ |
6947 |
+ c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_e2k; |
6948 |
+ c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_e2k; |
6949 |
+ c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_e2k; |
6950 |
+ c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_e2k; |
6951 |
+ |
6952 |
+ c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_e2k; |
6953 |
+ c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_e2k; |
6954 |
+ c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_e2k; |
6955 |
+ c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_e2k; |
6956 |
+} |
6957 |
diff --git a/libavcodec/e2k/vp9dsp.c b/libavcodec/e2k/vp9dsp.c |
6958 |
new file mode 100644 |
6959 |
index 0000000..5b80070 |
6960 |
--- /dev/null |
6961 |
+++ b/libavcodec/e2k/vp9dsp.c |
6962 |
@@ -0,0 +1,1740 @@ |
6963 |
+/* |
6964 |
+ * VP9 compatible video decoder |
6965 |
+ * |
6966 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
6967 |
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> |
6968 |
+ * Copyright (C) 2013 Clément Bœsch <u pkh me> |
6969 |
+ * |
6970 |
+ * This file is part of FFmpeg. |
6971 |
+ * |
6972 |
+ * FFmpeg is free software; you can redistribute it and/or |
6973 |
+ * modify it under the terms of the GNU Lesser General Public |
6974 |
+ * License as published by the Free Software Foundation; either |
6975 |
+ * version 2.1 of the License, or (at your option) any later version. |
6976 |
+ * |
6977 |
+ * FFmpeg is distributed in the hope that it will be useful, |
6978 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
6979 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
6980 |
+ * Lesser General Public License for more details. |
6981 |
+ * |
6982 |
+ * You should have received a copy of the GNU Lesser General Public |
6983 |
+ * License along with FFmpeg; if not, write to the Free Software |
6984 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
6985 |
+ */ |
6986 |
+ |
6987 |
+#define BIT_DEPTH 8 |
6988 |
+ |
6989 |
+#include "config.h" |
6990 |
+#include "libavutil/cpu.h" |
6991 |
+#include "libavutil/e2k/cpu.h" |
6992 |
+#include "libavutil/e2k/util_e2k.h" |
6993 |
+ |
6994 |
+#include "libavutil/common.h" |
6995 |
+#include "libavutil/intreadwrite.h" |
6996 |
+#include "libavcodec/vp9dsp.h" |
6997 |
+ |
6998 |
+#define pixel uint8_t |
6999 |
+ |
7000 |
+#define itxfm_wrapper4(type_a, type_b, sz, bits, has_dconly) \ |
7001 |
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \ |
7002 |
+ ptrdiff_t stride, \ |
7003 |
+ int16_t *block, int eob) \ |
7004 |
+{ \ |
7005 |
+ int j; \ |
7006 |
+ int16_t tmp[sz * sz], out[sz * sz]; \ |
7007 |
+ __m64 h0, h1, h2, round, zerov = _mm_setzero_si64(); \ |
7008 |
+\ |
7009 |
+ if (has_dconly && eob == 1) { \ |
7010 |
+ int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \ |
7011 |
+ * 11585 + (1 << 13)) >> 14; \ |
7012 |
+ block[0] = 0; \ |
7013 |
+ t = bits ? (t + (1 << (bits - 1))) >> bits : t; \ |
7014 |
+ h1 = _mm_set1_pi16(t); \ |
7015 |
+ h2 = _mm_set1_pi16(-t); \ |
7016 |
+ h1 = _mm_packs_pu16(h1, h1); \ |
7017 |
+ h2 = _mm_packs_pu16(h2, h2); \ |
7018 |
+ \ |
7019 |
+ PRAGMA_E2K("ivdep") \ |
7020 |
+ for (j = 0; j < sz; j++, dst += stride) { \ |
7021 |
+ h0 = _mm_cvtsi32_si64(*(uint32_t*)dst); \ |
7022 |
+ h0 = _mm_subs_pu8(_mm_adds_pu8(h0, h1), h2); \ |
7023 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(h0); \ |
7024 |
+ } \ |
7025 |
+ return; \ |
7026 |
+ } \ |
7027 |
+ \ |
7028 |
+ type_a##sz##_1d(block, tmp, 0); \ |
7029 |
+ memset(block, 0, sz * sz * sizeof(*block)); \ |
7030 |
+ type_b##sz##_1d(tmp, out, 1); \ |
7031 |
+ round = _mm_set1_pi16((1 << bits) >> 1); \ |
7032 |
+ PRAGMA_E2K("ivdep") \ |
7033 |
+ for (j = 0; j < sz; j++, dst += stride) { \ |
7034 |
+ h1 = *(__m64*)(out + j * sz); \ |
7035 |
+ h1 = _mm_srai_pi16(_mm_add_pi16(h1, round), bits); \ |
7036 |
+ h0 = _mm_cvtsi32_si64(*(uint32_t*)dst); \ |
7037 |
+ h0 = _mm_unpacklo_pi8(h0, zerov); \ |
7038 |
+ h0 = _mm_add_pi16(h0, h1); \ |
7039 |
+ h0 = _mm_packs_pu16(h0, h0); \ |
7040 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(h0); \ |
7041 |
+ } \ |
7042 |
+} |
7043 |
+ |
7044 |
+#define itxfm_wrapper8(type_a, type_b, sz, bits, has_dconly) \ |
7045 |
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \ |
7046 |
+ ptrdiff_t stride, \ |
7047 |
+ int16_t *block, int eob) \ |
7048 |
+{ \ |
7049 |
+ int j; \ |
7050 |
+ int16_t tmp[sz * sz], out[sz * sz]; \ |
7051 |
+ __m128i v0, v1, round; \ |
7052 |
+ LOAD_ZERO; \ |
7053 |
+\ |
7054 |
+ if (has_dconly && eob == 1) { \ |
7055 |
+ __m64 h0, h1, h2; \ |
7056 |
+ int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \ |
7057 |
+ * 11585 + (1 << 13)) >> 14; \ |
7058 |
+ block[0] = 0; \ |
7059 |
+ t = bits ? (t + (1 << (bits - 1))) >> bits : t; \ |
7060 |
+ h1 = _mm_set1_pi16(t); \ |
7061 |
+ h2 = _mm_set1_pi16(-t); \ |
7062 |
+ h1 = _mm_packs_pu16(h1, h1); \ |
7063 |
+ h2 = _mm_packs_pu16(h2, h2); \ |
7064 |
+ \ |
7065 |
+ PRAGMA_E2K("ivdep") \ |
7066 |
+ for (j = 0; j < sz; j++, dst += stride) { \ |
7067 |
+ h0 = *(__m64*)dst; \ |
7068 |
+ h0 = _mm_subs_pu8(_mm_adds_pu8(h0, h1), h2); \ |
7069 |
+ *(__m64*)dst = h0; \ |
7070 |
+ } \ |
7071 |
+ return; \ |
7072 |
+ } \ |
7073 |
+ \ |
7074 |
+ type_a##sz##_1d(block, tmp, 0); \ |
7075 |
+ memset(block, 0, sz * sz * sizeof(*block)); \ |
7076 |
+ type_b##sz##_1d(tmp, out, 1); \ |
7077 |
+ round = _mm_set1_epi16((1 << bits) >> 1); \ |
7078 |
+ PRAGMA_E2K("ivdep") \ |
7079 |
+ for (j = 0; j < sz; j++, dst += stride) { \ |
7080 |
+ v1 = VEC_LD(out + j * sz); \ |
7081 |
+ v1 = _mm_srai_epi16(_mm_add_epi16(v1, round), bits); \ |
7082 |
+ v0 = VEC_LD8(dst); \ |
7083 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); \ |
7084 |
+ v0 = _mm_add_epi16(v0, v1); \ |
7085 |
+ v0 = _mm_packus_epi16(v0, v0); \ |
7086 |
+ VEC_STL(dst, v0); \ |
7087 |
+ } \ |
7088 |
+} |
7089 |
+ |
7090 |
+#define itxfm_wrapper16(type_a, type_b, sz, bits, has_dconly) \ |
7091 |
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \ |
7092 |
+ ptrdiff_t stride, \ |
7093 |
+ int16_t *block, int eob) \ |
7094 |
+{ \ |
7095 |
+ int i, j; \ |
7096 |
+ int16_t tmp[sz * sz], out[sz * sz]; \ |
7097 |
+ __m128i v0, v1, v2, v3, round; \ |
7098 |
+ LOAD_ZERO; \ |
7099 |
+\ |
7100 |
+ if (has_dconly && eob == 1) { \ |
7101 |
+ int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \ |
7102 |
+ * 11585 + (1 << 13)) >> 14; \ |
7103 |
+ block[0] = 0; \ |
7104 |
+ t = bits ? (t + (1 << (bits - 1))) >> bits : t; \ |
7105 |
+ v1 = _mm_set1_epi16(t); \ |
7106 |
+ v2 = _mm_set1_epi16(-t); \ |
7107 |
+ v1 = _mm_packus_epi16(v1, v1); \ |
7108 |
+ v2 = _mm_packus_epi16(v2, v2); \ |
7109 |
+ \ |
7110 |
+ for (j = 0; j < sz; j++, dst += stride) \ |
7111 |
+ PRAGMA_E2K("ivdep") \ |
7112 |
+ for (i = 0; i < sz; i += 16) { \ |
7113 |
+ v0 = VEC_LD(dst + i); \ |
7114 |
+ v0 = _mm_subs_epu8(_mm_adds_epu8(v0, v1), v2); \ |
7115 |
+ VEC_ST(dst + i, v0); \ |
7116 |
+ } \ |
7117 |
+ return; \ |
7118 |
+ } \ |
7119 |
+ \ |
7120 |
+ type_a##sz##_1d(block, tmp, 0); \ |
7121 |
+ memset(block, 0, sz * sz * sizeof(*block)); \ |
7122 |
+ type_b##sz##_1d(tmp, out, 1); \ |
7123 |
+ round = _mm_set1_epi16((1 << bits) >> 1); \ |
7124 |
+ for (j = 0; j < sz; j++, dst += stride) \ |
7125 |
+ PRAGMA_E2K("ivdep") \ |
7126 |
+ for (i = 0; i < sz; i += 16) { \ |
7127 |
+ v2 = VEC_LD(out + j * sz + i); \ |
7128 |
+ v3 = VEC_LD(out + j * sz + i + 8); \ |
7129 |
+ v2 = _mm_srai_epi16(_mm_add_epi16(v2, round), bits); \ |
7130 |
+ v3 = _mm_srai_epi16(_mm_add_epi16(v3, round), bits); \ |
7131 |
+ v1 = VEC_LD(dst + i); \ |
7132 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); \ |
7133 |
+ v1 = _mm_unpackhi_epi8(v1, zerov); \ |
7134 |
+ v0 = _mm_add_epi16(v0, v2); \ |
7135 |
+ v1 = _mm_add_epi16(v1, v3); \ |
7136 |
+ v0 = _mm_packus_epi16(v0, v1); \ |
7137 |
+ VEC_ST(dst + i, v0); \ |
7138 |
+ } \ |
7139 |
+} |
7140 |
+ |
7141 |
+#define IN(x) VEC_LD8(in + (x) * sz) |
7142 |
+ |
7143 |
+#define X1(x, a, b) \ |
7144 |
+ __m128i x = _mm_set1_epi32((a & 0xffff) | b << 16); |
7145 |
+ |
7146 |
+#define X2(x, y, i0, i1) \ |
7147 |
+ v1 = _mm_unpacklo_epi16(IN(i0), IN(i1)); \ |
7148 |
+ v0 = _mm_madd_epi16(v1, f##x); \ |
7149 |
+ v1 = _mm_madd_epi16(v1, f##y); \ |
7150 |
+ t##x##a = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \ |
7151 |
+ t##y##a = _mm_srai_epi32(_mm_add_epi32(v1, round), 14); |
7152 |
+ |
7153 |
+#define X3(x, y, i0, i1) \ |
7154 |
+ v0 = _mm_mullo_epi32(_mm_sub_epi32(i0, i1), c11585); \ |
7155 |
+ v1 = _mm_mullo_epi32(_mm_add_epi32(i0, i1), c11585); \ |
7156 |
+ x = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \ |
7157 |
+ y = _mm_srai_epi32(_mm_add_epi32(v1, round), 14); |
7158 |
+ |
7159 |
+#define X4(x, y, i0, i1, m0, m1) \ |
7160 |
+ v0 = _mm_add_epi32(_mm_mullo_epi32(i0, m0), _mm_mullo_epi32(i1, m1)); \ |
7161 |
+ v1 = _mm_sub_epi32(_mm_mullo_epi32(i0, m1), _mm_mullo_epi32(i1, m0)); \ |
7162 |
+ x = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \ |
7163 |
+ y = _mm_srai_epi32(_mm_add_epi32(v1, round), 14); |
7164 |
+ |
7165 |
+#define X5(d, add, a0, a1, a2, a3, b0, b1, b2, b3) \ |
7166 |
+ v0 = _mm_##add##_epi32(a0, b0); \ |
7167 |
+ v1 = _mm_##add##_epi32(a1, b1); \ |
7168 |
+ v2 = _mm_##add##_epi32(a2, b2); \ |
7169 |
+ v3 = _mm_##add##_epi32(a3, b3); \ |
7170 |
+ v0 = _mm_packs_epi32(v0, v1); \ |
7171 |
+ v1 = _mm_packs_epi32(v2, v3); \ |
7172 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
7173 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
7174 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
7175 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
7176 |
+ VEC_STL(out + d + sz * 0, v0); \ |
7177 |
+ VEC_STH(out + d + sz * 1, v0); \ |
7178 |
+ VEC_STL(out + d + sz * 2, v1); \ |
7179 |
+ VEC_STH(out + d + sz * 3, v1); |
7180 |
+ |
7181 |
+#define X6(d, add, a0, a1, a2, a3, b0, b1, b2, b3) \ |
7182 |
+ v0 = _mm_##add##_epi32(a0, b0); \ |
7183 |
+ v1 = _mm_##add##_epi32(a1, b1); \ |
7184 |
+ v2 = _mm_##add##_epi32(a2, b2); \ |
7185 |
+ v3 = _mm_##add##_epi32(a3, b3); \ |
7186 |
+ v0 = _mm_packs_epi32(v0, v1); \ |
7187 |
+ v1 = _mm_packs_epi32(v2, v3); \ |
7188 |
+ VEC_STL(out + sz * (d + 0), v0); \ |
7189 |
+ VEC_STH(out + sz * (d + 1), v0); \ |
7190 |
+ VEC_STL(out + sz * (d + 2), v1); \ |
7191 |
+ VEC_STH(out + sz * (d + 3), v1); |
7192 |
+ |
7193 |
+static av_always_inline void idct4_1d(const int16_t *in, |
7194 |
+ int16_t *out, int pass) |
7195 |
+{ |
7196 |
+ __m128i v0, v1, v2, v3; |
7197 |
+ __m128i t0a, t1a, t2a, t3a; |
7198 |
+ __m128i round = _mm_set1_epi32(1 << 13); |
7199 |
+ int sz = 4; |
7200 |
+ |
7201 |
+ X1(f0, 11585, 11585) |
7202 |
+ X1(f1, 11585, -11585) |
7203 |
+ X1(f2, 6270, -15137) |
7204 |
+ X1(f3, 15137, 6270) |
7205 |
+ |
7206 |
+ X2(0, 1, 0, 2) |
7207 |
+ X2(2, 3, 1, 3) |
7208 |
+ |
7209 |
+ v0 = _mm_add_epi32(t0a, t3a); |
7210 |
+ v1 = _mm_add_epi32(t1a, t2a); |
7211 |
+ v2 = _mm_sub_epi32(t1a, t2a); |
7212 |
+ v3 = _mm_sub_epi32(t0a, t3a); |
7213 |
+ v0 = _mm_packs_epi32(v0, v1); |
7214 |
+ v1 = _mm_packs_epi32(v2, v3); |
7215 |
+ if (!pass) { |
7216 |
+ v2 = _mm_unpacklo_epi16(v0, v1); |
7217 |
+ v3 = _mm_unpackhi_epi16(v0, v1); |
7218 |
+ v0 = _mm_unpacklo_epi16(v2, v3); |
7219 |
+ v1 = _mm_unpackhi_epi16(v2, v3); |
7220 |
+ } |
7221 |
+ VEC_STL(out + sz * 0, v0); |
7222 |
+ VEC_STH(out + sz * 1, v0); |
7223 |
+ VEC_STL(out + sz * 2, v1); |
7224 |
+ VEC_STH(out + sz * 3, v1); |
7225 |
+} |
7226 |
+ |
7227 |
+static av_always_inline void idct8_1d(const int16_t *in, |
7228 |
+ int16_t *out, int pass) |
7229 |
+{ |
7230 |
+ __m128i v0, v1, v2, v3; |
7231 |
+ __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; |
7232 |
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
7233 |
+ __m128i round = _mm_set1_epi32(1 << 13); |
7234 |
+ __m128i c11585 = _mm_set1_epi32(11585); |
7235 |
+ |
7236 |
+ X1(f0, 11585, 11585) |
7237 |
+ X1(f1, 11585, -11585) |
7238 |
+ X1(f2, 6270, -15137) |
7239 |
+ X1(f3, 15137, 6270) |
7240 |
+ |
7241 |
+ X1(f4, 3196, -16069) |
7242 |
+ X1(f7, 16069, 3196) |
7243 |
+ X1(f5, 13623, -9102) |
7244 |
+ X1(f6, 9102, 13623) |
7245 |
+ |
7246 |
+ int i, sz = 8; |
7247 |
+ PRAGMA_E2K("ivdep") |
7248 |
+ for (i = 0; i < sz; i += 4, in += 4) { |
7249 |
+ X2(0, 1, 0, 4) |
7250 |
+ X2(2, 3, 2, 6) |
7251 |
+ X2(4, 7, 1, 7) |
7252 |
+ X2(5, 6, 5, 3) |
7253 |
+ |
7254 |
+ t0 = _mm_add_epi32(t0a, t3a); |
7255 |
+ t1 = _mm_add_epi32(t1a, t2a); |
7256 |
+ t2 = _mm_sub_epi32(t1a, t2a); |
7257 |
+ t3 = _mm_sub_epi32(t0a, t3a); |
7258 |
+ t4 = _mm_add_epi32(t4a, t5a); |
7259 |
+ t5a = _mm_sub_epi32(t4a, t5a); |
7260 |
+ t7 = _mm_add_epi32(t7a, t6a); |
7261 |
+ t6a = _mm_sub_epi32(t7a, t6a); |
7262 |
+ |
7263 |
+ X3(t5, t6, t6a, t5a) |
7264 |
+ |
7265 |
+ if (!pass) { |
7266 |
+ X5(0, add, t0, t1, t2, t3, t7, t6, t5, t4) |
7267 |
+ X5(4, sub, t3, t2, t1, t0, t4, t5, t6, t7) |
7268 |
+ out += 4 * sz; |
7269 |
+ } else { |
7270 |
+ X6(0, add, t0, t1, t2, t3, t7, t6, t5, t4) |
7271 |
+ X6(4, sub, t3, t2, t1, t0, t4, t5, t6, t7) |
7272 |
+ out += 4; |
7273 |
+ } |
7274 |
+ } |
7275 |
+} |
7276 |
+ |
7277 |
+static av_always_inline void idct16_1d(const int16_t *in, |
7278 |
+ int16_t *out, int pass) |
7279 |
+{ |
7280 |
+ __m128i v0, v1, v2, v3; |
7281 |
+ __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; |
7282 |
+ __m128i t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; |
7283 |
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
7284 |
+ __m128i t8, t9, t10, t11, t12, t13, t14, t15; |
7285 |
+ __m128i round = _mm_set1_epi32(1 << 13); |
7286 |
+ __m128i c11585 = _mm_set1_epi32(11585); |
7287 |
+ __m128i m15137 = _mm_set1_epi32(-15137), c6270 = _mm_set1_epi32(6270); |
7288 |
+ |
7289 |
+ X1(f0, 11585, 11585) |
7290 |
+ X1(f1, 11585, -11585) |
7291 |
+ X1(f2, 6270, -15137) |
7292 |
+ X1(f3, 15137, 6270) |
7293 |
+ |
7294 |
+ X1(f4, 3196, -16069) |
7295 |
+ X1(f7, 16069, 3196) |
7296 |
+ X1(f5, 13623, -9102) |
7297 |
+ X1(f6, 9102, 13623) |
7298 |
+ |
7299 |
+ X1(f8, 1606, -16305) |
7300 |
+ X1(f15, 16305, 1606) |
7301 |
+ X1(f9, 12665, -10394) |
7302 |
+ X1(f14, 10394, 12665) |
7303 |
+ X1(f10, 7723, -14449) |
7304 |
+ X1(f13, 14449, 7723) |
7305 |
+ X1(f11, 15679, -4756) |
7306 |
+ X1(f12, 4756, 15679) |
7307 |
+ |
7308 |
+ int i, sz = 16; |
7309 |
+ PRAGMA_E2K("ivdep") |
7310 |
+ for (i = 0; i < sz; i += 4, in += 4) { |
7311 |
+ X2( 0, 1, 0, 8) |
7312 |
+ X2( 2, 3, 4, 12) |
7313 |
+ X2( 4, 7, 2, 14) |
7314 |
+ X2( 5, 6, 10, 6) |
7315 |
+ X2( 8, 15, 1, 15) |
7316 |
+ X2( 9, 14, 9, 7) |
7317 |
+ X2(10, 13, 5, 11) |
7318 |
+ X2(11, 12, 13, 3) |
7319 |
+ |
7320 |
+ t0 = _mm_add_epi32(t0a, t3a); |
7321 |
+ t1 = _mm_add_epi32(t1a, t2a); |
7322 |
+ t2 = _mm_sub_epi32(t1a, t2a); |
7323 |
+ t3 = _mm_sub_epi32(t0a, t3a); |
7324 |
+ t4 = _mm_add_epi32(t4a, t5a); |
7325 |
+ t5 = _mm_sub_epi32(t4a, t5a); |
7326 |
+ t6 = _mm_sub_epi32(t7a, t6a); |
7327 |
+ t7 = _mm_add_epi32(t7a, t6a); |
7328 |
+ t8 = _mm_add_epi32(t8a, t9a); |
7329 |
+ t9 = _mm_sub_epi32(t8a, t9a); |
7330 |
+ t10 = _mm_sub_epi32(t11a, t10a); |
7331 |
+ t11 = _mm_add_epi32(t11a, t10a); |
7332 |
+ t12 = _mm_add_epi32(t12a, t13a); |
7333 |
+ t13 = _mm_sub_epi32(t12a, t13a); |
7334 |
+ t14 = _mm_sub_epi32(t15a, t14a); |
7335 |
+ t15 = _mm_add_epi32(t15a, t14a); |
7336 |
+ |
7337 |
+ X3( t5a, t6a, t6, t5) |
7338 |
+ X4( t9a, t14a, t9, t14, m15137, c6270) |
7339 |
+ X4(t13a, t10a, t13, t10, c6270, m15137) |
7340 |
+ |
7341 |
+ t0a = _mm_add_epi32(t0, t7); |
7342 |
+ t1a = _mm_add_epi32(t1, t6a); |
7343 |
+ t2a = _mm_add_epi32(t2, t5a); |
7344 |
+ t3a = _mm_add_epi32(t3, t4); |
7345 |
+ t4 = _mm_sub_epi32(t3, t4); |
7346 |
+ t5 = _mm_sub_epi32(t2, t5a); |
7347 |
+ t6 = _mm_sub_epi32(t1, t6a); |
7348 |
+ t7 = _mm_sub_epi32(t0, t7); |
7349 |
+ t8a = _mm_add_epi32(t8, t11); |
7350 |
+ t9 = _mm_add_epi32(t9a, t10a); |
7351 |
+ t10 = _mm_sub_epi32(t9a, t10a); |
7352 |
+ t11a = _mm_sub_epi32(t8, t11); |
7353 |
+ t12a = _mm_sub_epi32(t15, t12); |
7354 |
+ t13 = _mm_sub_epi32(t14a, t13a); |
7355 |
+ t14 = _mm_add_epi32(t14a, t13a); |
7356 |
+ t15a = _mm_add_epi32(t15, t12); |
7357 |
+ |
7358 |
+ X3(t10a, t13a, t13, t10) |
7359 |
+ X3(t11, t12, t12a, t11a) |
7360 |
+ |
7361 |
+ if (!pass) { |
7362 |
+ X5( 0, add, t0a, t1a, t2a, t3a, t15a, t14, t13a, t12) |
7363 |
+ X5( 4, add, t4, t5, t6, t7, t11, t10a, t9, t8a) |
7364 |
+ X5( 8, sub, t7, t6, t5, t4, t8a, t9, t10a, t11) |
7365 |
+ X5(12, sub, t3a, t2a, t1a, t0a, t12, t13a, t14, t15a) |
7366 |
+ out += 4 * sz; |
7367 |
+ } else { |
7368 |
+ X6( 0, add, t0a, t1a, t2a, t3a, t15a, t14, t13a, t12) |
7369 |
+ X6( 4, add, t4, t5, t6, t7, t11, t10a, t9, t8a) |
7370 |
+ X6( 8, sub, t7, t6, t5, t4, t8a, t9, t10a, t11) |
7371 |
+ X6(12, sub, t3a, t2a, t1a, t0a, t12, t13a, t14, t15a) |
7372 |
+ out += 4; |
7373 |
+ } |
7374 |
+ } |
7375 |
+} |
7376 |
+ |
7377 |
+static av_always_inline void idct32_1d(const int16_t *in, |
7378 |
+ int16_t *out, int pass) |
7379 |
+{ |
7380 |
+ __m128i v0, v1, v2, v3; |
7381 |
+ __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; |
7382 |
+ __m128i t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; |
7383 |
+ __m128i t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a; |
7384 |
+ __m128i t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a; |
7385 |
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
7386 |
+ __m128i t8, t9, t10, t11, t12, t13, t14, t15; |
7387 |
+ __m128i t16, t17, t18, t19, t20, t21, t22, t23; |
7388 |
+ __m128i t24, t25, t26, t27, t28, t29, t30, t31; |
7389 |
+ __m128i round = _mm_set1_epi32(1 << 13); |
7390 |
+ __m128i c11585 = _mm_set1_epi32(11585); |
7391 |
+ __m128i m15137 = _mm_set1_epi32(-15137), c6270 = _mm_set1_epi32(6270); |
7392 |
+ __m128i m16069 = _mm_set1_epi32(-16069), c3196 = _mm_set1_epi32(3196); |
7393 |
+ __m128i m9102 = _mm_set1_epi32(-9102), c13623 = _mm_set1_epi32(13623); |
7394 |
+ |
7395 |
+ X1(f0, 11585, 11585) |
7396 |
+ X1(f1, 11585, -11585) |
7397 |
+ X1(f2, 6270, -15137) |
7398 |
+ X1(f3, 15137, 6270) |
7399 |
+ |
7400 |
+ X1(f4, 3196, -16069) |
7401 |
+ X1(f7, 16069, 3196) |
7402 |
+ X1(f5, 13623, -9102) |
7403 |
+ X1(f6, 9102, 13623) |
7404 |
+ |
7405 |
+ X1(f8, 1606, -16305) |
7406 |
+ X1(f15, 16305, 1606) |
7407 |
+ X1(f9, 12665, -10394) |
7408 |
+ X1(f14, 10394, 12665) |
7409 |
+ X1(f10, 7723, -14449) |
7410 |
+ X1(f13, 14449, 7723) |
7411 |
+ X1(f11, 15679, -4756) |
7412 |
+ X1(f12, 4756, 15679) |
7413 |
+ |
7414 |
+ X1(f16, 804, -16364) |
7415 |
+ X1(f31, 16364, 804) |
7416 |
+ X1(f17, 12140, -11003) |
7417 |
+ X1(f30, 11003, 12140) |
7418 |
+ X1(f18, 7005, -14811) |
7419 |
+ X1(f29, 14811, 7005) |
7420 |
+ X1(f19, 15426, -5520) |
7421 |
+ X1(f28, 5520, 15426) |
7422 |
+ X1(f20, 3981, -15893) |
7423 |
+ X1(f27, 15893, 3981) |
7424 |
+ X1(f21, 14053, -8423U) |
7425 |
+ X1(f26, 8423U, 14053) |
7426 |
+ X1(f22, 9760, -13160) |
7427 |
+ X1(f25, 13160, 9760) |
7428 |
+ X1(f23, 16207, -2404) |
7429 |
+ X1(f24, 2404, 16207) |
7430 |
+ |
7431 |
+ int i, sz = 32; |
7432 |
+ PRAGMA_E2K("ivdep") |
7433 |
+ for (i = 0; i < sz; i += 4, in += 4) { |
7434 |
+ X2( 0, 1, 0, 16) |
7435 |
+ X2( 2, 3, 8, 24) |
7436 |
+ X2( 4, 7, 4, 28) |
7437 |
+ X2( 5, 6, 20, 12) |
7438 |
+ X2( 8, 15, 2, 30) |
7439 |
+ X2( 9, 14, 18, 14) |
7440 |
+ X2(10, 13, 10, 22) |
7441 |
+ X2(11, 12, 26, 6) |
7442 |
+ X2(16, 31, 1, 31) |
7443 |
+ X2(17, 30, 17, 15) |
7444 |
+ X2(18, 29, 9, 23) |
7445 |
+ X2(19, 28, 25, 7) |
7446 |
+ X2(20, 27, 5, 27) |
7447 |
+ X2(21, 26, 21, 11) |
7448 |
+ X2(22, 25, 13, 19) |
7449 |
+ X2(23, 24, 29, 3) |
7450 |
+ |
7451 |
+ t0 = _mm_add_epi32(t0a, t3a); |
7452 |
+ t1 = _mm_add_epi32(t1a, t2a); |
7453 |
+ t2 = _mm_sub_epi32(t1a, t2a); |
7454 |
+ t3 = _mm_sub_epi32(t0a, t3a); |
7455 |
+ t4 = _mm_add_epi32(t4a, t5a); |
7456 |
+ t5 = _mm_sub_epi32(t4a, t5a); |
7457 |
+ t6 = _mm_sub_epi32(t7a, t6a); |
7458 |
+ t7 = _mm_add_epi32(t7a, t6a); |
7459 |
+ t8 = _mm_add_epi32(t8a, t9a); |
7460 |
+ t9 = _mm_sub_epi32(t8a, t9a); |
7461 |
+ t10 = _mm_sub_epi32(t11a, t10a); |
7462 |
+ t11 = _mm_add_epi32(t11a, t10a); |
7463 |
+ t12 = _mm_add_epi32(t12a, t13a); |
7464 |
+ t13 = _mm_sub_epi32(t12a, t13a); |
7465 |
+ t14 = _mm_sub_epi32(t15a, t14a); |
7466 |
+ t15 = _mm_add_epi32(t15a, t14a); |
7467 |
+ t16 = _mm_add_epi32(t16a, t17a); |
7468 |
+ t17 = _mm_sub_epi32(t16a, t17a); |
7469 |
+ t18 = _mm_sub_epi32(t19a, t18a); |
7470 |
+ t19 = _mm_add_epi32(t19a, t18a); |
7471 |
+ t20 = _mm_add_epi32(t20a, t21a); |
7472 |
+ t21 = _mm_sub_epi32(t20a, t21a); |
7473 |
+ t22 = _mm_sub_epi32(t23a, t22a); |
7474 |
+ t23 = _mm_add_epi32(t23a, t22a); |
7475 |
+ t24 = _mm_add_epi32(t24a, t25a); |
7476 |
+ t25 = _mm_sub_epi32(t24a, t25a); |
7477 |
+ t26 = _mm_sub_epi32(t27a, t26a); |
7478 |
+ t27 = _mm_add_epi32(t27a, t26a); |
7479 |
+ t28 = _mm_add_epi32(t28a, t29a); |
7480 |
+ t29 = _mm_sub_epi32(t28a, t29a); |
7481 |
+ t30 = _mm_sub_epi32(t31a, t30a); |
7482 |
+ t31 = _mm_add_epi32(t31a, t30a); |
7483 |
+ |
7484 |
+ X3( t5a, t6a, t6, t5) |
7485 |
+ X4( t9a, t14a, t9, t14, m15137, c6270) |
7486 |
+ X4(t13a, t10a, t13, t10, c6270, m15137) |
7487 |
+ X4(t17a, t30a, t17, t30, m16069, c3196) |
7488 |
+ X4(t29a, t18a, t29, t18, c3196, m16069) |
7489 |
+ X4(t21a, t26a, t21, t26, m9102, c13623) |
7490 |
+ X4(t25a, t22a, t25, t22, c13623, m9102) |
7491 |
+ |
7492 |
+ t0a = _mm_add_epi32(t0, t7); |
7493 |
+ t1a = _mm_add_epi32(t1, t6a); |
7494 |
+ t2a = _mm_add_epi32(t2, t5a); |
7495 |
+ t3a = _mm_add_epi32(t3, t4); |
7496 |
+ t4a = _mm_sub_epi32(t3, t4); |
7497 |
+ t5 = _mm_sub_epi32(t2, t5a); |
7498 |
+ t6 = _mm_sub_epi32(t1, t6a); |
7499 |
+ t7a = _mm_sub_epi32(t0, t7); |
7500 |
+ t8a = _mm_add_epi32(t8, t11); |
7501 |
+ t9 = _mm_add_epi32(t9a, t10a); |
7502 |
+ t10 = _mm_sub_epi32(t9a, t10a); |
7503 |
+ t11a = _mm_sub_epi32(t8, t11); |
7504 |
+ t12a = _mm_sub_epi32(t15, t12); |
7505 |
+ t13 = _mm_sub_epi32(t14a, t13a); |
7506 |
+ t14 = _mm_add_epi32(t14a, t13a); |
7507 |
+ t15a = _mm_add_epi32(t15, t12); |
7508 |
+ t16a = _mm_add_epi32(t16, t19); |
7509 |
+ t17 = _mm_add_epi32(t17a, t18a); |
7510 |
+ t18 = _mm_sub_epi32(t17a, t18a); |
7511 |
+ t19a = _mm_sub_epi32(t16, t19); |
7512 |
+ t20a = _mm_sub_epi32(t23, t20); |
7513 |
+ t21 = _mm_sub_epi32(t22a, t21a); |
7514 |
+ t22 = _mm_add_epi32(t22a, t21a); |
7515 |
+ t23a = _mm_add_epi32(t23, t20); |
7516 |
+ t24a = _mm_add_epi32(t24, t27); |
7517 |
+ t25 = _mm_add_epi32(t25a, t26a); |
7518 |
+ t26 = _mm_sub_epi32(t25a, t26a); |
7519 |
+ t27a = _mm_sub_epi32(t24, t27); |
7520 |
+ t28a = _mm_sub_epi32(t31, t28); |
7521 |
+ t29 = _mm_sub_epi32(t30a, t29a); |
7522 |
+ t30 = _mm_add_epi32(t30a, t29a); |
7523 |
+ t31a = _mm_add_epi32(t31, t28); |
7524 |
+ |
7525 |
+ X3(t10a, t13a, t13, t10) |
7526 |
+ X3(t11, t12, t12a, t11a) |
7527 |
+ X4(t18a, t29a, t18, t29, m15137, c6270) |
7528 |
+ X4(t19, t28, t19a, t28a, m15137, c6270) |
7529 |
+ X4(t27, t20, t27a, t20a, c6270, m15137) |
7530 |
+ X4(t26a, t21a, t26, t21, c6270, m15137) |
7531 |
+ |
7532 |
+ t0 = _mm_add_epi32(t0a, t15a); |
7533 |
+ t1 = _mm_add_epi32(t1a, t14); |
7534 |
+ t2 = _mm_add_epi32(t2a, t13a); |
7535 |
+ t3 = _mm_add_epi32(t3a, t12); |
7536 |
+ t4 = _mm_add_epi32(t4a, t11); |
7537 |
+ t5a = _mm_add_epi32(t5, t10a); |
7538 |
+ t6a = _mm_add_epi32(t6, t9); |
7539 |
+ t7 = _mm_add_epi32(t7a, t8a); |
7540 |
+ t8 = _mm_sub_epi32(t7a, t8a); |
7541 |
+ t9a = _mm_sub_epi32(t6, t9); |
7542 |
+ t10 = _mm_sub_epi32(t5, t10a); |
7543 |
+ t11a = _mm_sub_epi32(t4a, t11); |
7544 |
+ t12a = _mm_sub_epi32(t3a, t12); |
7545 |
+ t13 = _mm_sub_epi32(t2a, t13a); |
7546 |
+ t14a = _mm_sub_epi32(t1a, t14); |
7547 |
+ t15 = _mm_sub_epi32(t0a, t15a); |
7548 |
+ t16 = _mm_add_epi32(t16a, t23a); |
7549 |
+ t17a = _mm_add_epi32(t17, t22); |
7550 |
+ t18 = _mm_add_epi32(t18a, t21a); |
7551 |
+ t19a = _mm_add_epi32(t19, t20); |
7552 |
+ t20a = _mm_sub_epi32(t19, t20); |
7553 |
+ t21 = _mm_sub_epi32(t18a, t21a); |
7554 |
+ t22a = _mm_sub_epi32(t17, t22); |
7555 |
+ t23 = _mm_sub_epi32(t16a, t23a); |
7556 |
+ t24 = _mm_sub_epi32(t31a, t24a); |
7557 |
+ t25a = _mm_sub_epi32(t30, t25); |
7558 |
+ t26 = _mm_sub_epi32(t29a, t26a); |
7559 |
+ t27a = _mm_sub_epi32(t28, t27); |
7560 |
+ t28a = _mm_add_epi32(t28, t27); |
7561 |
+ t29 = _mm_add_epi32(t29a, t26a); |
7562 |
+ t30a = _mm_add_epi32(t30, t25); |
7563 |
+ t31 = _mm_add_epi32(t31a, t24a); |
7564 |
+ |
7565 |
+ X3(t20, t27, t27a, t20a) |
7566 |
+ X3(t21a, t26a, t26, t21) |
7567 |
+ X3(t22, t25, t25a, t22a) |
7568 |
+ X3(t23a, t24a, t24, t23) |
7569 |
+ |
7570 |
+ if (!pass) { |
7571 |
+ X5( 0, add, t0, t1, t2, t3, t31, t30a, t29, t28a) |
7572 |
+ X5( 4, add, t4, t5a, t6a, t7, t27, t26a, t25, t24a) |
7573 |
+ X5( 8, add, t8, t9a, t10, t11a, t23a, t22, t21a, t20) |
7574 |
+ X5(12, add, t12a, t13, t14a, t15, t19a, t18, t17a, t16) |
7575 |
+ X5(16, sub, t15, t14a, t13, t12a, t16, t17a, t18, t19a) |
7576 |
+ X5(20, sub, t11a, t10, t9a, t8, t20, t21a, t22, t23a) |
7577 |
+ X5(24, sub, t7, t6a, t5a, t4, t24a, t25, t26a, t27) |
7578 |
+ X5(28, sub, t3, t2, t1, t0, t28a, t29, t30a, t31) |
7579 |
+ out += 4 * sz; |
7580 |
+ } else { |
7581 |
+ X6( 0, add, t0, t1, t2, t3, t31, t30a, t29, t28a) |
7582 |
+ X6( 4, add, t4, t5a, t6a, t7, t27, t26a, t25, t24a) |
7583 |
+ X6( 8, add, t8, t9a, t10, t11a, t23a, t22, t21a, t20) |
7584 |
+ X6(12, add, t12a, t13, t14a, t15, t19a, t18, t17a, t16) |
7585 |
+ X6(16, sub, t15, t14a, t13, t12a, t16, t17a, t18, t19a) |
7586 |
+ X6(20, sub, t11a, t10, t9a, t8, t20, t21a, t22, t23a) |
7587 |
+ X6(24, sub, t7, t6a, t5a, t4, t24a, t25, t26a, t27) |
7588 |
+ X6(28, sub, t3, t2, t1, t0, t28a, t29, t30a, t31) |
7589 |
+ out += 4; |
7590 |
+ } |
7591 |
+ } |
7592 |
+} |
7593 |
+ |
7594 |
+#undef IN |
7595 |
+#undef X1 |
7596 |
+#undef X2 |
7597 |
+#undef X3 |
7598 |
+#undef X4 |
7599 |
+#undef X5 |
7600 |
+#undef X6 |
7601 |
+ |
7602 |
+itxfm_wrapper4(idct, idct, 4, 4, 1) |
7603 |
+itxfm_wrapper8(idct, idct, 8, 5, 1) |
7604 |
+itxfm_wrapper16(idct, idct, 16, 6, 1) |
7605 |
+itxfm_wrapper16(idct, idct, 32, 6, 1) |
7606 |
+ |
7607 |
+#undef itxfm_wrapper4 |
7608 |
+#undef itxfm_wrapper8 |
7609 |
+#undef itxfm_wrapper16 |
7610 |
+ |
7611 |
+static av_cold void ff_vp9dsp_itxfm_init_8_e2k(VP9DSPContext *dsp) |
7612 |
+{ |
7613 |
+ |
7614 |
+#define init_idct(tx, nm) \ |
7615 |
+ dsp->itxfm_add[tx][DCT_DCT] = \ |
7616 |
+ dsp->itxfm_add[tx][ADST_DCT] = \ |
7617 |
+ dsp->itxfm_add[tx][DCT_ADST] = \ |
7618 |
+ dsp->itxfm_add[tx][ADST_ADST] = nm##_add_e2k |
7619 |
+ |
7620 |
+ dsp->itxfm_add[TX_4X4][DCT_DCT] = idct_idct_4x4_add_e2k; |
7621 |
+ dsp->itxfm_add[TX_8X8][DCT_DCT] = idct_idct_8x8_add_e2k; |
7622 |
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = idct_idct_16x16_add_e2k; |
7623 |
+ |
7624 |
+ init_idct(TX_32X32, idct_idct_32x32); |
7625 |
+ |
7626 |
+#undef init_idct |
7627 |
+} |
7628 |
+ |
7629 |
+#define LOAD_TRANSPOSE8(dst, a0, a1, a2, a3, a4, a5, a6, a7) \ |
7630 |
+ t0 = VEC_LD8(dst + stride * 0); \ |
7631 |
+ t1 = VEC_LD8(dst + stride * 1); \ |
7632 |
+ t2 = VEC_LD8(dst + stride * 2); \ |
7633 |
+ t3 = VEC_LD8(dst + stride * 3); \ |
7634 |
+ t4 = VEC_LD8(dst + stride * 4); \ |
7635 |
+ t5 = VEC_LD8(dst + stride * 5); \ |
7636 |
+ t6 = VEC_LD8(dst + stride * 6); \ |
7637 |
+ t7 = VEC_LD8(dst + stride * 7); \ |
7638 |
+ t0 = _mm_unpacklo_epi8(t0, t4); \ |
7639 |
+ t1 = _mm_unpacklo_epi8(t1, t5); \ |
7640 |
+ t2 = _mm_unpacklo_epi8(t2, t6); \ |
7641 |
+ t3 = _mm_unpacklo_epi8(t3, t7); \ |
7642 |
+ t4 = _mm_unpacklo_epi8(t0, t2); \ |
7643 |
+ t5 = _mm_unpackhi_epi8(t0, t2); \ |
7644 |
+ t6 = _mm_unpacklo_epi8(t1, t3); \ |
7645 |
+ t7 = _mm_unpackhi_epi8(t1, t3); \ |
7646 |
+ t0 = _mm_unpacklo_epi8(t4, t6); \ |
7647 |
+ t1 = _mm_unpackhi_epi8(t4, t6); \ |
7648 |
+ t2 = _mm_unpacklo_epi8(t5, t7); \ |
7649 |
+ t3 = _mm_unpackhi_epi8(t5, t7); \ |
7650 |
+ a0 = _mm_unpacklo_epi8(t0, zerov); \ |
7651 |
+ a1 = _mm_unpackhi_epi8(t0, zerov); \ |
7652 |
+ a2 = _mm_unpacklo_epi8(t1, zerov); \ |
7653 |
+ a3 = _mm_unpackhi_epi8(t1, zerov); \ |
7654 |
+ a4 = _mm_unpacklo_epi8(t2, zerov); \ |
7655 |
+ a5 = _mm_unpackhi_epi8(t2, zerov); \ |
7656 |
+ a6 = _mm_unpacklo_epi8(t3, zerov); \ |
7657 |
+ a7 = _mm_unpackhi_epi8(t3, zerov) |
7658 |
+ |
7659 |
+#define STORE_TRANSPOSE8(dst, a0, a1, a2, a3, a4, a5, a6, a7) \ |
7660 |
+ t0 = _mm_packus_epi16(a0, a1); \ |
7661 |
+ t1 = _mm_packus_epi16(a2, a3); \ |
7662 |
+ t2 = _mm_packus_epi16(a4, a5); \ |
7663 |
+ t3 = _mm_packus_epi16(a6, a7); \ |
7664 |
+ t4 = _mm_unpacklo_epi8(t0, t2); \ |
7665 |
+ t5 = _mm_unpackhi_epi8(t0, t2); \ |
7666 |
+ t6 = _mm_unpacklo_epi8(t1, t3); \ |
7667 |
+ t7 = _mm_unpackhi_epi8(t1, t3); \ |
7668 |
+ t0 = _mm_unpacklo_epi8(t4, t6); \ |
7669 |
+ t1 = _mm_unpackhi_epi8(t4, t6); \ |
7670 |
+ t2 = _mm_unpacklo_epi8(t5, t7); \ |
7671 |
+ t3 = _mm_unpackhi_epi8(t5, t7); \ |
7672 |
+ t4 = _mm_unpacklo_epi8(t0, t2); \ |
7673 |
+ t5 = _mm_unpackhi_epi8(t0, t2); \ |
7674 |
+ t6 = _mm_unpacklo_epi8(t1, t3); \ |
7675 |
+ t7 = _mm_unpackhi_epi8(t1, t3); \ |
7676 |
+ VEC_STL(dst + stride * 0, t4); \ |
7677 |
+ VEC_STH(dst + stride * 1, t4); \ |
7678 |
+ VEC_STL(dst + stride * 2, t5); \ |
7679 |
+ VEC_STH(dst + stride * 3, t5); \ |
7680 |
+ VEC_STL(dst + stride * 4, t6); \ |
7681 |
+ VEC_STH(dst + stride * 5, t6); \ |
7682 |
+ VEC_STL(dst + stride * 6, t7); \ |
7683 |
+ VEC_STH(dst + stride * 7, t7) |
7684 |
+ |
7685 |
+ |
7686 |
+static av_always_inline void loop_filter_h(uint8_t *dst, int E, int I, int H, |
7687 |
+ ptrdiff_t stride, int wd) |
7688 |
+{ |
7689 |
+ int F = 1; |
7690 |
+ LOAD_ZERO; |
7691 |
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
7692 |
+ __m128i vfm, vflat8out, vflat8in, vhev; |
7693 |
+ __m128i p2a, p1a, p0a, q0a, q1a, q2a; |
7694 |
+ __m128i p6a, p5a, p4a, p3a, q3a, q4a, q5a, q6a; |
7695 |
+ |
7696 |
+ __m128i vF = _mm_set1_epi16(F); |
7697 |
+ __m128i vI = _mm_set1_epi16(I); |
7698 |
+ __m128i vE = _mm_set1_epi16(E); |
7699 |
+ __m128i vH = _mm_set1_epi16(H); |
7700 |
+ |
7701 |
+ __m128i p7, p6, p5, p4, p3, p2, p1, p0; |
7702 |
+ __m128i q0, q1, q2, q3, q4, q5, q6, q7; |
7703 |
+ |
7704 |
+ if (wd >= 16) { |
7705 |
+ LOAD_TRANSPOSE8(dst - 8, p7, p6, p5, p4, p3, p2, p1, p0); |
7706 |
+ LOAD_TRANSPOSE8(dst , q0, q1, q2, q3, q4, q5, q6, q7); |
7707 |
+ p6a = p6; p5a = p5; |
7708 |
+ p4a = p4; p3a = p3; |
7709 |
+ q3a = q3; q4a = q4; |
7710 |
+ q5a = q5; q6a = q6; |
7711 |
+ } else { |
7712 |
+ LOAD_TRANSPOSE8(dst - 4, p3, p2, p1, p0, q0, q1, q2, q3); |
7713 |
+ } |
7714 |
+ |
7715 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p2)); |
7716 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p1)); |
7717 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0)); |
7718 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0)); |
7719 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q1)); |
7720 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q2)); |
7721 |
+ vhev = _mm_or_si128(_mm_cmpgt_epi16(t2, vH), _mm_cmpgt_epi16(t3, vH)); |
7722 |
+ t0 = _mm_cmpgt_epi16(t0, vI); |
7723 |
+ t1 = _mm_cmpgt_epi16(t1, vI); |
7724 |
+ t2 = _mm_cmpgt_epi16(t2, vI); |
7725 |
+ t3 = _mm_cmpgt_epi16(t3, vI); |
7726 |
+ t4 = _mm_cmpgt_epi16(t4, vI); |
7727 |
+ t5 = _mm_cmpgt_epi16(t5, vI); |
7728 |
+ |
7729 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2); |
7730 |
+ t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5); |
7731 |
+ t0 = _mm_or_si128(t0, t3); |
7732 |
+ |
7733 |
+ t6 = _mm_abs_epi16(_mm_sub_epi16(p0, q0)); |
7734 |
+ t7 = _mm_abs_epi16(_mm_sub_epi16(p1, q1)); |
7735 |
+ t6 = _mm_add_epi16(_mm_slli_epi16(t6, 1), _mm_srai_epi16(t7, 1)); |
7736 |
+ t6 = _mm_cmpgt_epi16(t6, vE); |
7737 |
+ vfm = _mm_or_si128(t0, t6); // !fm |
7738 |
+ |
7739 |
+ if (_mm_movemask_epi8(vfm) == 0xffff) return; |
7740 |
+ |
7741 |
+ if (wd >= 8) { |
7742 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p0)); |
7743 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p0)); |
7744 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0)); |
7745 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0)); |
7746 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q0)); |
7747 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q0)); |
7748 |
+ t0 = _mm_cmpgt_epi16(t0, vF); |
7749 |
+ t1 = _mm_cmpgt_epi16(t1, vF); |
7750 |
+ t2 = _mm_cmpgt_epi16(t2, vF); |
7751 |
+ t3 = _mm_cmpgt_epi16(t3, vF); |
7752 |
+ t4 = _mm_cmpgt_epi16(t4, vF); |
7753 |
+ t5 = _mm_cmpgt_epi16(t5, vF); |
7754 |
+ |
7755 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2); |
7756 |
+ t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5); |
7757 |
+ vflat8in = _mm_or_si128(_mm_or_si128(t0, t3), vfm); |
7758 |
+ } |
7759 |
+ |
7760 |
+ { |
7761 |
+ __m128i c1 = _mm_set1_epi16(1); |
7762 |
+ __m128i c127 = _mm_set1_epi16(127); |
7763 |
+ __m128i m128 = _mm_set1_epi16(-128); |
7764 |
+ __m128i c43 = _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3); |
7765 |
+ t0 = _mm_and_si128(vhev, _mm_sub_epi16(p1, q1)); |
7766 |
+ t1 = _mm_sub_epi16(q0, p0); |
7767 |
+ t0 = _mm_min_epi16(_mm_max_epi16(t0, m128), c127); |
7768 |
+ // f = av_clip_intp2(p1 - q1, 7) & hev; |
7769 |
+ t1 = _mm_add_epi16(_mm_add_epi16(t1, t0), _mm_add_epi16(t1, t1)); |
7770 |
+ t1 = _mm_andnot_si128(vfm, t1); |
7771 |
+ t1 = _mm_packs_epi16(t1, t1); |
7772 |
+ // f = av_clip_intp2(3 * (q0 - p0) + f, 7); |
7773 |
+ t3 = _mm_adds_epi8(t1, c43); |
7774 |
+ t2 = _mm_srai_epi16(_mm_unpacklo_epi8(t3, t3), 8 + 3); |
7775 |
+ // f1 = FFMIN(f + 4, 0x7f) >> 3; |
7776 |
+ t3 = _mm_srai_epi16(_mm_unpackhi_epi8(t3, t3), 8 + 3); |
7777 |
+ // f2 = FFMIN(f + 3, 0x7f) >> 3; |
7778 |
+ t4 = _mm_srai_epi16(_mm_add_epi16(t2, c1), 1); |
7779 |
+ t4 = _mm_andnot_si128(vhev, t4); // f3 = ((f1 + 1) >> 1) & ~hev; |
7780 |
+ p1a = _mm_add_epi16(p1, t4); // av_clip_uint8(p1 + f3); |
7781 |
+ p0a = _mm_add_epi16(p0, t3); // av_clip_uint8(p0 + f2); |
7782 |
+ q0a = _mm_sub_epi16(q0, t2); // av_clip_uint8(q0 - f1); |
7783 |
+ q1a = _mm_sub_epi16(q1, t4); // av_clip_uint8(q1 - f3); |
7784 |
+ } |
7785 |
+ |
7786 |
+ p2a = p2; q2a = q2; |
7787 |
+ |
7788 |
+ if (wd >= 8 && (_mm_movemask_epi8(vflat8in) != 0xffff)) { |
7789 |
+ __m128i c4 = _mm_set1_epi16(4); |
7790 |
+ t0 = _mm_add_epi16(_mm_slli_epi16(p3, 2), c4); |
7791 |
+ t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2)); |
7792 |
+ t0 = _mm_sub_epi16(t0, p3); |
7793 |
+ t0 = _mm_add_epi16(t0, t1); |
7794 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 3); |
7795 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p3)); |
7796 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 3); |
7797 |
+ p2a = _mm_blendv_epi8(t1, p2, vflat8in); |
7798 |
+ p1a = _mm_blendv_epi8(t2, p1a, vflat8in); |
7799 |
+ |
7800 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p3)); |
7801 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 3); |
7802 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p3)); |
7803 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 3); |
7804 |
+ p0a = _mm_blendv_epi8(t1, p0a, vflat8in); |
7805 |
+ q0a = _mm_blendv_epi8(t2, q0a, vflat8in); |
7806 |
+ |
7807 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p2)); |
7808 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 3); |
7809 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p1)); |
7810 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 3); |
7811 |
+ q1a = _mm_blendv_epi8(t1, q1a, vflat8in); |
7812 |
+ q2a = _mm_blendv_epi8(t2, q2, vflat8in); |
7813 |
+ |
7814 |
+ if (wd >= 16) { |
7815 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p7, p0)); |
7816 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p6, p0)); |
7817 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p5, p0)); |
7818 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(p4, p0)); |
7819 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q4, q0)); |
7820 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q5, q0)); |
7821 |
+ t6 = _mm_abs_epi16(_mm_sub_epi16(q6, q0)); |
7822 |
+ t7 = _mm_abs_epi16(_mm_sub_epi16(q7, q0)); |
7823 |
+ |
7824 |
+ t0 = _mm_cmpgt_epi16(t0, vF); |
7825 |
+ t1 = _mm_cmpgt_epi16(t1, vF); |
7826 |
+ t2 = _mm_cmpgt_epi16(t2, vF); |
7827 |
+ t3 = _mm_cmpgt_epi16(t3, vF); |
7828 |
+ t4 = _mm_cmpgt_epi16(t4, vF); |
7829 |
+ t5 = _mm_cmpgt_epi16(t5, vF); |
7830 |
+ t6 = _mm_cmpgt_epi16(t6, vF); |
7831 |
+ t7 = _mm_cmpgt_epi16(t7, vF); |
7832 |
+ |
7833 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), _mm_or_si128(t2, t3)); |
7834 |
+ t4 = _mm_or_si128(_mm_or_si128(t4, t5), _mm_or_si128(t6, t7)); |
7835 |
+ vflat8out = _mm_or_si128(t0, t4); |
7836 |
+ vflat8out = _mm_or_si128(vflat8out, vflat8in); |
7837 |
+ |
7838 |
+ if (_mm_movemask_epi8(vflat8out) != 0xffff) { |
7839 |
+ __m128i c8 = _mm_set1_epi16(8); |
7840 |
+ t0 = _mm_add_epi16(_mm_slli_epi16(p7, 3), c8); |
7841 |
+ t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2)); |
7842 |
+ t2 = _mm_add_epi16(_mm_add_epi16(p3, p4), _mm_add_epi16(p5, p6)); |
7843 |
+ t0 = _mm_sub_epi16(t0, p7); |
7844 |
+ t0 = _mm_add_epi16(t0, t1); |
7845 |
+ t0 = _mm_add_epi16(t0, t2); |
7846 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p6, t0), 4); |
7847 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p7)); |
7848 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p5, t0), 4); |
7849 |
+ p6a = _mm_blendv_epi8(t1, p6, vflat8out); |
7850 |
+ p5a = _mm_blendv_epi8(t2, p5, vflat8out); |
7851 |
+ |
7852 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p7)); |
7853 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p4, t0), 4); |
7854 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p7)); |
7855 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p3, t0), 4); |
7856 |
+ p4a = _mm_blendv_epi8(t1, p4, vflat8out); |
7857 |
+ p3a = _mm_blendv_epi8(t2, p3, vflat8out); |
7858 |
+ |
7859 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q4, p7)); |
7860 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 4); |
7861 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q5, p7)); |
7862 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 4); |
7863 |
+ p2a = _mm_blendv_epi8(t1, p2a, vflat8out); |
7864 |
+ p1a = _mm_blendv_epi8(t2, p1a, vflat8out); |
7865 |
+ |
7866 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q6, p7)); |
7867 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 4); |
7868 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p7)); |
7869 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 4); |
7870 |
+ p0a = _mm_blendv_epi8(t1, p0a, vflat8out); |
7871 |
+ q0a = _mm_blendv_epi8(t2, q0a, vflat8out); |
7872 |
+ |
7873 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p6)); |
7874 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 4); |
7875 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p5)); |
7876 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 4); |
7877 |
+ q1a = _mm_blendv_epi8(t1, q1a, vflat8out); |
7878 |
+ q2a = _mm_blendv_epi8(t2, q2a, vflat8out); |
7879 |
+ |
7880 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p4)); |
7881 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q3, t0), 4); |
7882 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p3)); |
7883 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q4, t0), 4); |
7884 |
+ q3a = _mm_blendv_epi8(t1, q3, vflat8out); |
7885 |
+ q4a = _mm_blendv_epi8(t2, q4, vflat8out); |
7886 |
+ |
7887 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p2)); |
7888 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q5, t0), 4); |
7889 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p1)); |
7890 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q6, t0), 4); |
7891 |
+ q5a = _mm_blendv_epi8(t1, q5, vflat8out); |
7892 |
+ q6a = _mm_blendv_epi8(t2, q6, vflat8out); |
7893 |
+ } |
7894 |
+ } |
7895 |
+ } |
7896 |
+ |
7897 |
+ if (wd >= 16) { |
7898 |
+ STORE_TRANSPOSE8(dst - 8, p7, p6a, p5a, p4a, p3a, p2a, p1a, p0a); |
7899 |
+ STORE_TRANSPOSE8(dst , q0a, q1a, q2a, q3a, q4a, q5a, q6a, q7 ); |
7900 |
+ } else if (wd >= 8) { |
7901 |
+ STORE_TRANSPOSE8(dst - 4, p3, p2a, p1a, p0a, q0a, q1a, q2a, q3 ); |
7902 |
+ } else { |
7903 |
+ t0 = _mm_packus_epi16(p1a, p0a); |
7904 |
+ t1 = _mm_packus_epi16(q0a, q1a); |
7905 |
+ t2 = _mm_unpacklo_epi8(t0, t1); |
7906 |
+ t3 = _mm_unpackhi_epi8(t0, t1); |
7907 |
+ t0 = _mm_unpacklo_epi8(t2, t3); |
7908 |
+ t1 = _mm_unpackhi_epi8(t2, t3); |
7909 |
+ *(uint32_t*)(dst - 2 + stride * 0) = _mm_extract_epi32(t0, 0); |
7910 |
+ *(uint32_t*)(dst - 2 + stride * 1) = _mm_extract_epi32(t0, 1); |
7911 |
+ *(uint32_t*)(dst - 2 + stride * 2) = _mm_extract_epi32(t0, 2); |
7912 |
+ *(uint32_t*)(dst - 2 + stride * 3) = _mm_extract_epi32(t0, 3); |
7913 |
+ *(uint32_t*)(dst - 2 + stride * 4) = _mm_extract_epi32(t1, 0); |
7914 |
+ *(uint32_t*)(dst - 2 + stride * 5) = _mm_extract_epi32(t1, 1); |
7915 |
+ *(uint32_t*)(dst - 2 + stride * 6) = _mm_extract_epi32(t1, 2); |
7916 |
+ *(uint32_t*)(dst - 2 + stride * 7) = _mm_extract_epi32(t1, 3); |
7917 |
+ } |
7918 |
+} |
7919 |
+ |
7920 |
+#undef LOAD_TRANSPOSE8 |
7921 |
+#undef STORE_TRANSPOSE8 |
7922 |
+ |
7923 |
+static av_always_inline void loop_filter_v(uint8_t *dst, int E, int I, int H, |
7924 |
+ ptrdiff_t stride, int wd) |
7925 |
+{ |
7926 |
+ int F = 1; |
7927 |
+ LOAD_ZERO; |
7928 |
+ __m128i t0, t1, t2, t3, t4, t5, t6, t7; |
7929 |
+ __m128i vfm, vflat8out, vflat8in, vhev; |
7930 |
+ __m128i p2a, p1a, p0a, q0a, q1a, q2a; |
7931 |
+ |
7932 |
+ __m128i vF = _mm_set1_epi16(F); |
7933 |
+ __m128i vI = _mm_set1_epi16(I); |
7934 |
+ __m128i vE = _mm_set1_epi16(E); |
7935 |
+ __m128i vH = _mm_set1_epi16(H); |
7936 |
+ |
7937 |
+ __m128i p7, p6, p5, p4; |
7938 |
+ __m128i p3 = VEC_LD8(dst - stride * 4), p2 = VEC_LD8(dst - stride * 3); |
7939 |
+ __m128i p1 = VEC_LD8(dst - stride * 2), p0 = VEC_LD8(dst - stride * 1); |
7940 |
+ __m128i q0 = VEC_LD8(dst + stride * 0), q1 = VEC_LD8(dst + stride * 1); |
7941 |
+ __m128i q2 = VEC_LD8(dst + stride * 2), q3 = VEC_LD8(dst + stride * 3); |
7942 |
+ __m128i q4, q5, q6, q7; |
7943 |
+ |
7944 |
+ p3 = _mm_unpacklo_epi8(p3, zerov); |
7945 |
+ p2 = _mm_unpacklo_epi8(p2, zerov); |
7946 |
+ p1 = _mm_unpacklo_epi8(p1, zerov); |
7947 |
+ p0 = _mm_unpacklo_epi8(p0, zerov); |
7948 |
+ q0 = _mm_unpacklo_epi8(q0, zerov); |
7949 |
+ q1 = _mm_unpacklo_epi8(q1, zerov); |
7950 |
+ q2 = _mm_unpacklo_epi8(q2, zerov); |
7951 |
+ q3 = _mm_unpacklo_epi8(q3, zerov); |
7952 |
+ |
7953 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p2)); |
7954 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p1)); |
7955 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0)); |
7956 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0)); |
7957 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q1)); |
7958 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q2)); |
7959 |
+ vhev = _mm_or_si128(_mm_cmpgt_epi16(t2, vH), _mm_cmpgt_epi16(t3, vH)); |
7960 |
+ t0 = _mm_cmpgt_epi16(t0, vI); |
7961 |
+ t1 = _mm_cmpgt_epi16(t1, vI); |
7962 |
+ t2 = _mm_cmpgt_epi16(t2, vI); |
7963 |
+ t3 = _mm_cmpgt_epi16(t3, vI); |
7964 |
+ t4 = _mm_cmpgt_epi16(t4, vI); |
7965 |
+ t5 = _mm_cmpgt_epi16(t5, vI); |
7966 |
+ |
7967 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2); |
7968 |
+ t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5); |
7969 |
+ t0 = _mm_or_si128(t0, t3); |
7970 |
+ |
7971 |
+ t6 = _mm_abs_epi16(_mm_sub_epi16(p0, q0)); |
7972 |
+ t7 = _mm_abs_epi16(_mm_sub_epi16(p1, q1)); |
7973 |
+ t6 = _mm_add_epi16(_mm_slli_epi16(t6, 1), _mm_srai_epi16(t7, 1)); |
7974 |
+ t6 = _mm_cmpgt_epi16(t6, vE); |
7975 |
+ vfm = _mm_or_si128(t0, t6); // !fm |
7976 |
+ |
7977 |
+ if (_mm_movemask_epi8(vfm) == 0xffff) return; |
7978 |
+ |
7979 |
+ if (wd >= 8) { |
7980 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p0)); |
7981 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p0)); |
7982 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0)); |
7983 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0)); |
7984 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q0)); |
7985 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q0)); |
7986 |
+ t0 = _mm_cmpgt_epi16(t0, vF); |
7987 |
+ t1 = _mm_cmpgt_epi16(t1, vF); |
7988 |
+ t2 = _mm_cmpgt_epi16(t2, vF); |
7989 |
+ t3 = _mm_cmpgt_epi16(t3, vF); |
7990 |
+ t4 = _mm_cmpgt_epi16(t4, vF); |
7991 |
+ t5 = _mm_cmpgt_epi16(t5, vF); |
7992 |
+ |
7993 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2); |
7994 |
+ t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5); |
7995 |
+ vflat8in = _mm_or_si128(_mm_or_si128(t0, t3), vfm); |
7996 |
+ } |
7997 |
+ |
7998 |
+ { |
7999 |
+ __m128i c1 = _mm_set1_epi16(1); |
8000 |
+ __m128i c127 = _mm_set1_epi16(127); |
8001 |
+ __m128i m128 = _mm_set1_epi16(-128); |
8002 |
+ __m128i c43 = _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3); |
8003 |
+ t0 = _mm_and_si128(vhev, _mm_sub_epi16(p1, q1)); |
8004 |
+ t1 = _mm_sub_epi16(q0, p0); |
8005 |
+ t0 = _mm_min_epi16(_mm_max_epi16(t0, m128), c127); |
8006 |
+ // f = av_clip_intp2(p1 - q1, 7) & hev; |
8007 |
+ t1 = _mm_add_epi16(_mm_add_epi16(t1, t0), _mm_add_epi16(t1, t1)); |
8008 |
+ t1 = _mm_andnot_si128(vfm, t1); |
8009 |
+ t1 = _mm_packs_epi16(t1, t1); |
8010 |
+ // f = av_clip_intp2(3 * (q0 - p0) + f, 7); |
8011 |
+ t3 = _mm_adds_epi8(t1, c43); |
8012 |
+ t2 = _mm_srai_epi16(_mm_unpacklo_epi8(t3, t3), 8 + 3); |
8013 |
+ // f1 = FFMIN(f + 4, 0x7f) >> 3; |
8014 |
+ t3 = _mm_srai_epi16(_mm_unpackhi_epi8(t3, t3), 8 + 3); |
8015 |
+ // f2 = FFMIN(f + 3, 0x7f) >> 3; |
8016 |
+ t4 = _mm_srai_epi16(_mm_add_epi16(t2, c1), 1); |
8017 |
+ t4 = _mm_andnot_si128(vhev, t4); // f3 = ((f1 + 1) >> 1) & ~hev; |
8018 |
+ p1a = _mm_add_epi16(p1, t4); // av_clip_uint8(p1 + f3); |
8019 |
+ p0a = _mm_add_epi16(p0, t3); // av_clip_uint8(p0 + f2); |
8020 |
+ q0a = _mm_sub_epi16(q0, t2); // av_clip_uint8(q0 - f1); |
8021 |
+ q1a = _mm_sub_epi16(q1, t4); // av_clip_uint8(q1 - f3); |
8022 |
+ } |
8023 |
+ |
8024 |
+ if (wd >= 8 && _mm_movemask_epi8(vflat8in) != 0xffff) { |
8025 |
+ __m128i c4 = _mm_set1_epi16(4); |
8026 |
+ t0 = _mm_add_epi16(_mm_slli_epi16(p3, 2), c4); |
8027 |
+ t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2)); |
8028 |
+ t0 = _mm_sub_epi16(t0, p3); |
8029 |
+ t0 = _mm_add_epi16(t0, t1); |
8030 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 3); |
8031 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p3)); |
8032 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 3); |
8033 |
+ p2a = _mm_blendv_epi8(t1, p2, vflat8in); |
8034 |
+ p1a = _mm_blendv_epi8(t2, p1a, vflat8in); |
8035 |
+ |
8036 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p3)); |
8037 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 3); |
8038 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p3)); |
8039 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 3); |
8040 |
+ p0a = _mm_blendv_epi8(t1, p0a, vflat8in); |
8041 |
+ q0a = _mm_blendv_epi8(t2, q0a, vflat8in); |
8042 |
+ |
8043 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p2)); |
8044 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 3); |
8045 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p1)); |
8046 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 3); |
8047 |
+ q1a = _mm_blendv_epi8(t1, q1a, vflat8in); |
8048 |
+ q2a = _mm_blendv_epi8(t2, q2, vflat8in); |
8049 |
+ |
8050 |
+ if (wd >= 16) { |
8051 |
+ p7 = VEC_LD8(dst - stride * 8); |
8052 |
+ p6 = VEC_LD8(dst - stride * 7); |
8053 |
+ p5 = VEC_LD8(dst - stride * 6); |
8054 |
+ p4 = VEC_LD8(dst - stride * 5); |
8055 |
+ q4 = VEC_LD8(dst + stride * 4); |
8056 |
+ q5 = VEC_LD8(dst + stride * 5); |
8057 |
+ q6 = VEC_LD8(dst + stride * 6); |
8058 |
+ q7 = VEC_LD8(dst + stride * 7); |
8059 |
+ |
8060 |
+ p7 = _mm_unpacklo_epi8(p7, zerov); |
8061 |
+ p6 = _mm_unpacklo_epi8(p6, zerov); |
8062 |
+ p5 = _mm_unpacklo_epi8(p5, zerov); |
8063 |
+ p4 = _mm_unpacklo_epi8(p4, zerov); |
8064 |
+ q4 = _mm_unpacklo_epi8(q4, zerov); |
8065 |
+ q5 = _mm_unpacklo_epi8(q5, zerov); |
8066 |
+ q6 = _mm_unpacklo_epi8(q6, zerov); |
8067 |
+ q7 = _mm_unpacklo_epi8(q7, zerov); |
8068 |
+ |
8069 |
+ t0 = _mm_abs_epi16(_mm_sub_epi16(p7, p0)); |
8070 |
+ t1 = _mm_abs_epi16(_mm_sub_epi16(p6, p0)); |
8071 |
+ t2 = _mm_abs_epi16(_mm_sub_epi16(p5, p0)); |
8072 |
+ t3 = _mm_abs_epi16(_mm_sub_epi16(p4, p0)); |
8073 |
+ t4 = _mm_abs_epi16(_mm_sub_epi16(q4, q0)); |
8074 |
+ t5 = _mm_abs_epi16(_mm_sub_epi16(q5, q0)); |
8075 |
+ t6 = _mm_abs_epi16(_mm_sub_epi16(q6, q0)); |
8076 |
+ t7 = _mm_abs_epi16(_mm_sub_epi16(q7, q0)); |
8077 |
+ |
8078 |
+ t0 = _mm_cmpgt_epi16(t0, vF); |
8079 |
+ t1 = _mm_cmpgt_epi16(t1, vF); |
8080 |
+ t2 = _mm_cmpgt_epi16(t2, vF); |
8081 |
+ t3 = _mm_cmpgt_epi16(t3, vF); |
8082 |
+ t4 = _mm_cmpgt_epi16(t4, vF); |
8083 |
+ t5 = _mm_cmpgt_epi16(t5, vF); |
8084 |
+ t6 = _mm_cmpgt_epi16(t6, vF); |
8085 |
+ t7 = _mm_cmpgt_epi16(t7, vF); |
8086 |
+ |
8087 |
+ t0 = _mm_or_si128(_mm_or_si128(t0, t1), _mm_or_si128(t2, t3)); |
8088 |
+ t4 = _mm_or_si128(_mm_or_si128(t4, t5), _mm_or_si128(t6, t7)); |
8089 |
+ vflat8out = _mm_or_si128(t0, t4); |
8090 |
+ |
8091 |
+ vflat8out = _mm_or_si128(vflat8out, vflat8in); |
8092 |
+ if (_mm_movemask_epi8(vflat8out) != 0xffff) { |
8093 |
+ __m128i c8 = _mm_set1_epi16(8); |
8094 |
+ t0 = _mm_add_epi16(_mm_slli_epi16(p7, 3), c8); |
8095 |
+ t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2)); |
8096 |
+ t2 = _mm_add_epi16(_mm_add_epi16(p3, p4), _mm_add_epi16(p5, p6)); |
8097 |
+ t0 = _mm_sub_epi16(t0, p7); |
8098 |
+ t0 = _mm_add_epi16(t0, t1); |
8099 |
+ t0 = _mm_add_epi16(t0, t2); |
8100 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p6, t0), 4); |
8101 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p7)); |
8102 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p5, t0), 4); |
8103 |
+ t1 = _mm_blendv_epi8(t1, p6, vflat8out); |
8104 |
+ t2 = _mm_blendv_epi8(t2, p5, vflat8out); |
8105 |
+ t1 = _mm_packus_epi16(t1, t2); |
8106 |
+ VEC_STL(dst - stride * 7, t1); |
8107 |
+ VEC_STH(dst - stride * 6, t1); |
8108 |
+ |
8109 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p7)); |
8110 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p4, t0), 4); |
8111 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p7)); |
8112 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p3, t0), 4); |
8113 |
+ t1 = _mm_blendv_epi8(t1, p4, vflat8out); |
8114 |
+ t2 = _mm_blendv_epi8(t2, p3, vflat8out); |
8115 |
+ t1 = _mm_packus_epi16(t1, t2); |
8116 |
+ VEC_STL(dst - stride * 5, t1); |
8117 |
+ VEC_STH(dst - stride * 4, t1); |
8118 |
+ |
8119 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q4, p7)); |
8120 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 4); |
8121 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q5, p7)); |
8122 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 4); |
8123 |
+ p2a = _mm_blendv_epi8(t1, p2a, vflat8out); |
8124 |
+ p1a = _mm_blendv_epi8(t2, p1a, vflat8out); |
8125 |
+ |
8126 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q6, p7)); |
8127 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 4); |
8128 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p7)); |
8129 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 4); |
8130 |
+ p0a = _mm_blendv_epi8(t1, p0a, vflat8out); |
8131 |
+ q0a = _mm_blendv_epi8(t2, q0a, vflat8out); |
8132 |
+ |
8133 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p6)); |
8134 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 4); |
8135 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p5)); |
8136 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 4); |
8137 |
+ q1a = _mm_blendv_epi8(t1, q1a, vflat8out); |
8138 |
+ q2a = _mm_blendv_epi8(t2, q2a, vflat8out); |
8139 |
+ |
8140 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p4)); |
8141 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q3, t0), 4); |
8142 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p3)); |
8143 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q4, t0), 4); |
8144 |
+ t1 = _mm_blendv_epi8(t1, q3, vflat8out); |
8145 |
+ t2 = _mm_blendv_epi8(t2, q4, vflat8out); |
8146 |
+ t1 = _mm_packus_epi16(t1, t2); |
8147 |
+ VEC_STL(dst + stride * 3, t1); |
8148 |
+ VEC_STH(dst + stride * 4, t1); |
8149 |
+ |
8150 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p2)); |
8151 |
+ t1 = _mm_srai_epi16(_mm_add_epi16(q5, t0), 4); |
8152 |
+ t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p1)); |
8153 |
+ t2 = _mm_srai_epi16(_mm_add_epi16(q6, t0), 4); |
8154 |
+ t1 = _mm_blendv_epi8(t1, q5, vflat8out); |
8155 |
+ t2 = _mm_blendv_epi8(t2, q6, vflat8out); |
8156 |
+ t1 = _mm_packus_epi16(t1, t2); |
8157 |
+ VEC_STL(dst + stride * 5, t1); |
8158 |
+ VEC_STH(dst + stride * 6, t1); |
8159 |
+ } |
8160 |
+ } |
8161 |
+ t1 = _mm_packus_epi16(p2a, q2a); |
8162 |
+ VEC_STL(dst - stride * 3, t1); |
8163 |
+ VEC_STH(dst + stride * 2, t1); |
8164 |
+ } |
8165 |
+ |
8166 |
+ t0 = _mm_packus_epi16(p1a, p0a); |
8167 |
+ t1 = _mm_packus_epi16(q0a, q1a); |
8168 |
+ VEC_STL(dst - stride * 2, t0); |
8169 |
+ VEC_STH(dst - stride * 1, t0); |
8170 |
+ VEC_STL(dst + stride * 0, t1); |
8171 |
+ VEC_STH(dst + stride * 1, t1); |
8172 |
+} |
8173 |
+ |
8174 |
+#define lf_8_fns(wd) \ |
8175 |
+static void loop_filter_h_##wd##_8_e2k(uint8_t *dst, \ |
8176 |
+ ptrdiff_t stride, \ |
8177 |
+ int E, int I, int H) \ |
8178 |
+{ \ |
8179 |
+ loop_filter_h(dst, E, I, H, stride, wd); \ |
8180 |
+} \ |
8181 |
+static void loop_filter_v_##wd##_8_e2k(uint8_t *dst, \ |
8182 |
+ ptrdiff_t stride, \ |
8183 |
+ int E, int I, int H) \ |
8184 |
+{ \ |
8185 |
+ loop_filter_v(dst, E, I, H, stride, wd); \ |
8186 |
+} |
8187 |
+ |
8188 |
+lf_8_fns(4) |
8189 |
+lf_8_fns(8) |
8190 |
+lf_8_fns(16) |
8191 |
+ |
8192 |
+#undef lf_8_fn |
8193 |
+#undef lf_8_fns |
8194 |
+ |
8195 |
+#define lf_16_fn(dir, stridea) \ |
8196 |
+static void loop_filter_##dir##_16_16_e2k(uint8_t *dst, \ |
8197 |
+ ptrdiff_t stride, \ |
8198 |
+ int E, int I, int H) \ |
8199 |
+{ \ |
8200 |
+ loop_filter_##dir##_16_8_e2k(dst, stride, E, I, H); \ |
8201 |
+ loop_filter_##dir##_16_8_e2k(dst + 8 * stridea, stride, E, I, H); \ |
8202 |
+} |
8203 |
+ |
8204 |
+lf_16_fn(h, stride) |
8205 |
+lf_16_fn(v, sizeof(pixel)) |
8206 |
+ |
8207 |
+#undef lf_16_fn |
8208 |
+ |
8209 |
+#define lf_mix_fn(dir, wd1, wd2, stridea) \ |
8210 |
+static void loop_filter_##dir##_##wd1##wd2##_16_e2k(uint8_t *dst, \ |
8211 |
+ ptrdiff_t stride, \ |
8212 |
+ int E, int I, int H) \ |
8213 |
+{ \ |
8214 |
+ loop_filter_##dir##_##wd1##_8_e2k(dst, stride, E & 0xff, I & 0xff, H & 0xff); \ |
8215 |
+ loop_filter_##dir##_##wd2##_8_e2k(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \ |
8216 |
+} |
8217 |
+ |
8218 |
+#define lf_mix_fns(wd1, wd2) \ |
8219 |
+lf_mix_fn(h, wd1, wd2, stride) \ |
8220 |
+lf_mix_fn(v, wd1, wd2, sizeof(pixel)) |
8221 |
+ |
8222 |
+lf_mix_fns(4, 4) |
8223 |
+lf_mix_fns(4, 8) |
8224 |
+lf_mix_fns(8, 4) |
8225 |
+lf_mix_fns(8, 8) |
8226 |
+ |
8227 |
+#undef lf_mix_fn |
8228 |
+#undef lf_mix_fns |
8229 |
+ |
8230 |
+static av_cold void ff_vp9dsp_loopfilter_init_8_e2k(VP9DSPContext *dsp) |
8231 |
+{ |
8232 |
+ dsp->loop_filter_8[0][0] = loop_filter_h_4_8_e2k; |
8233 |
+ dsp->loop_filter_8[0][1] = loop_filter_v_4_8_e2k; |
8234 |
+ dsp->loop_filter_8[1][0] = loop_filter_h_8_8_e2k; |
8235 |
+ dsp->loop_filter_8[1][1] = loop_filter_v_8_8_e2k; |
8236 |
+ dsp->loop_filter_8[2][0] = loop_filter_h_16_8_e2k; |
8237 |
+ dsp->loop_filter_8[2][1] = loop_filter_v_16_8_e2k; |
8238 |
+ |
8239 |
+ dsp->loop_filter_16[0] = loop_filter_h_16_16_e2k; |
8240 |
+ dsp->loop_filter_16[1] = loop_filter_v_16_16_e2k; |
8241 |
+ |
8242 |
+ dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_e2k; |
8243 |
+ dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_e2k; |
8244 |
+ dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_e2k; |
8245 |
+ dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_e2k; |
8246 |
+ dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_e2k; |
8247 |
+ dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_e2k; |
8248 |
+ dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_e2k; |
8249 |
+ dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_e2k; |
8250 |
+} |
8251 |
+ |
8252 |
+#if BIT_DEPTH != 12 |
8253 |
+ |
8254 |
+static av_always_inline void copy_e2k(uint8_t *dst, ptrdiff_t dst_stride, |
8255 |
+ const uint8_t *src, ptrdiff_t src_stride, |
8256 |
+ int w, int h) |
8257 |
+{ |
8258 |
+ do { |
8259 |
+ memcpy(dst, src, w); |
8260 |
+ dst += dst_stride; |
8261 |
+ src += src_stride; |
8262 |
+ } while (--h); |
8263 |
+} |
8264 |
+ |
8265 |
+static av_always_inline void avg_e2k(uint8_t *dst, ptrdiff_t dst_stride, |
8266 |
+ const uint8_t *src, ptrdiff_t src_stride, |
8267 |
+ int w, int h) |
8268 |
+{ |
8269 |
+ int y; |
8270 |
+ __m128i v0, v1, v2, v3; __m64 h0, h1; |
8271 |
+ if (w >= 64) { |
8272 |
+ PRAGMA_E2K("ivdep") |
8273 |
+ for (y = 0; y < h; y++) { |
8274 |
+ v0 = VEC_LD(src); |
8275 |
+ v1 = VEC_LD(src + 16); |
8276 |
+ v2 = VEC_LD(src + 32); |
8277 |
+ v3 = VEC_LD(src + 48); |
8278 |
+ v0 = _mm_avg_epu8(v0, VEC_LD(dst)); |
8279 |
+ v1 = _mm_avg_epu8(v1, VEC_LD(dst + 16)); |
8280 |
+ v2 = _mm_avg_epu8(v2, VEC_LD(dst + 32)); |
8281 |
+ v3 = _mm_avg_epu8(v3, VEC_LD(dst + 48)); |
8282 |
+ VEC_ST(dst, v0); |
8283 |
+ VEC_ST(dst + 16, v1); |
8284 |
+ VEC_ST(dst + 32, v2); |
8285 |
+ VEC_ST(dst + 48, v3); |
8286 |
+ src += src_stride; |
8287 |
+ dst += dst_stride; |
8288 |
+ } |
8289 |
+ } else if (w >= 32) { |
8290 |
+ PRAGMA_E2K("ivdep") |
8291 |
+ for (y = 0; y < h; y++) { |
8292 |
+ v0 = VEC_LD(src); |
8293 |
+ v1 = VEC_LD(src + 16); |
8294 |
+ v0 = _mm_avg_epu8(v0, VEC_LD(dst)); |
8295 |
+ v1 = _mm_avg_epu8(v1, VEC_LD(dst + 16)); |
8296 |
+ VEC_ST(dst, v0); |
8297 |
+ VEC_ST(dst + 16, v1); |
8298 |
+ src += src_stride; |
8299 |
+ dst += dst_stride; |
8300 |
+ } |
8301 |
+ } else if (w >= 16) { |
8302 |
+ PRAGMA_E2K("ivdep") |
8303 |
+ for (y = 0; y < h; y++) { |
8304 |
+ v0 = VEC_LD(src); |
8305 |
+ v0 = _mm_avg_epu8(v0, VEC_LD(dst)); |
8306 |
+ VEC_ST(dst, v0); |
8307 |
+ src += src_stride; |
8308 |
+ dst += dst_stride; |
8309 |
+ } |
8310 |
+ } else if (w >= 8) { |
8311 |
+ PRAGMA_E2K("ivdep") |
8312 |
+ for (y = 0; y < h; y++) { |
8313 |
+ h0 = *(__m64*)src; |
8314 |
+ h1 = *(__m64*)dst; |
8315 |
+ h0 = _mm_avg_pu8(h0, h1); |
8316 |
+ *(__m64*)dst = h0; |
8317 |
+ src += src_stride; |
8318 |
+ dst += dst_stride; |
8319 |
+ } |
8320 |
+ } else { |
8321 |
+ PRAGMA_E2K("ivdep") |
8322 |
+ for (y = 0; y < h; y++) { |
8323 |
+ h0 = _mm_cvtsi32_si64(*(uint32_t*)src); |
8324 |
+ h1 = _mm_cvtsi32_si64(*(uint32_t*)dst); |
8325 |
+ h0 = _mm_avg_pu8(h0, h1); |
8326 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(h0); |
8327 |
+ src += src_stride; |
8328 |
+ dst += dst_stride; |
8329 |
+ } |
8330 |
+ } |
8331 |
+} |
8332 |
+ |
8333 |
+#define fpel_fn(type, sz) \ |
8334 |
+static void type##sz##_e2k(uint8_t *dst, ptrdiff_t dst_stride, \ |
8335 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
8336 |
+ int h, int mx, int my) \ |
8337 |
+{ \ |
8338 |
+ type##_e2k(dst, dst_stride, src, src_stride, sz, h); \ |
8339 |
+} |
8340 |
+ |
8341 |
+#define copy_avg_fn(sz) \ |
8342 |
+fpel_fn(copy, sz) \ |
8343 |
+fpel_fn(avg, sz) |
8344 |
+ |
8345 |
+copy_avg_fn(64) |
8346 |
+copy_avg_fn(32) |
8347 |
+copy_avg_fn(16) |
8348 |
+copy_avg_fn(8) |
8349 |
+copy_avg_fn(4) |
8350 |
+ |
8351 |
+#undef fpel_fn |
8352 |
+#undef copy_avg_fn |
8353 |
+ |
8354 |
+#endif /* BIT_DEPTH != 12 */ |
8355 |
+ |
8356 |
+static av_always_inline void do_8tap_1d_v_e2k(uint8_t *dst, ptrdiff_t dst_stride, |
8357 |
+ const uint8_t *src, ptrdiff_t src_stride, |
8358 |
+ int w, int h, const int16_t *filter, int avg) |
8359 |
+{ |
8360 |
+ int x, y; |
8361 |
+ const uint8_t *s; uint8_t *d; |
8362 |
+ |
8363 |
+ if (w >= 8) { |
8364 |
+ __m128i a0, a1, a2, a3, a4, a5, a6, a7; |
8365 |
+ __m128i v0, v1, v2, v3; |
8366 |
+ __m128i f0, f1, f2, f3, c64; __m64 h0, h1; |
8367 |
+ f0 = _mm_set1_epi16((filter[0] & 255) | filter[1] << 8); |
8368 |
+ f1 = _mm_set1_epi16((filter[2] & 255) | filter[3] << 8); |
8369 |
+ f2 = _mm_set1_epi16((filter[4] & 255) | filter[5] << 8); |
8370 |
+ f3 = _mm_set1_epi16((filter[6] & 255) | filter[7] << 8); |
8371 |
+ c64 = _mm_set1_epi16(64); |
8372 |
+ |
8373 |
+ for (x = 0; x < w; x += 8) { |
8374 |
+ a0 = VEC_LD8(&src[x - 3 * src_stride]); |
8375 |
+ a1 = VEC_LD8(&src[x - 2 * src_stride]); |
8376 |
+ a2 = VEC_LD8(&src[x - 1 * src_stride]); |
8377 |
+ a3 = VEC_LD8(&src[x + 0 * src_stride]); |
8378 |
+ a4 = VEC_LD8(&src[x + 1 * src_stride]); |
8379 |
+ a5 = VEC_LD8(&src[x + 2 * src_stride]); |
8380 |
+ a6 = VEC_LD8(&src[x + 3 * src_stride]); |
8381 |
+ s = src + x + 4 * src_stride; |
8382 |
+ d = dst + x; |
8383 |
+ |
8384 |
+ PRAGMA_E2K("ivdep") |
8385 |
+ for (y = 0; y < h; y++) { |
8386 |
+ a7 = VEC_LD8(s); |
8387 |
+ v0 = _mm_unpacklo_epi8(a0, a1); |
8388 |
+ v1 = _mm_unpacklo_epi8(a2, a3); |
8389 |
+ v2 = _mm_unpacklo_epi8(a4, a5); |
8390 |
+ v3 = _mm_unpacklo_epi8(a6, a7); |
8391 |
+ v0 = _mm_maddubs_epi16(v0, f0); |
8392 |
+ v1 = _mm_maddubs_epi16(v1, f1); |
8393 |
+ v2 = _mm_maddubs_epi16(v2, f2); |
8394 |
+ v3 = _mm_maddubs_epi16(v3, f3); |
8395 |
+ v0 = _mm_add_epi16(v0, v2); |
8396 |
+ v1 = _mm_add_epi16(v1, v3); |
8397 |
+ v0 = _mm_add_epi16(v0, c64); |
8398 |
+ v0 = _mm_adds_epi16(v0, v1); |
8399 |
+ v0 = _mm_srai_epi16(v0, 7); |
8400 |
+ v0 = _mm_packus_epi16(v0, v0); |
8401 |
+ h0 = _mm_movepi64_pi64(v0); |
8402 |
+ if (avg) { |
8403 |
+ h1 = *(__m64*)d; |
8404 |
+ h0 = _mm_avg_pu8(h0, h1); |
8405 |
+ } |
8406 |
+ *(__m64*)d = h0; |
8407 |
+ s += src_stride; |
8408 |
+ d += dst_stride; |
8409 |
+ a0 = a1; a1 = a2; a2 = a3; a3 = a4; |
8410 |
+ a4 = a5; a5 = a6; a6 = a7; |
8411 |
+ } |
8412 |
+ } |
8413 |
+ } else { |
8414 |
+ __m64 a0, a1, a2, a3, a4, a5, a6, a7; |
8415 |
+ __m64 v0, v1, v2, v3; |
8416 |
+ __m64 f0, f1, f2, f3, c64; |
8417 |
+ f0 = _mm_set1_pi16((filter[0] & 255) | filter[1] << 8); |
8418 |
+ f1 = _mm_set1_pi16((filter[2] & 255) | filter[3] << 8); |
8419 |
+ f2 = _mm_set1_pi16((filter[4] & 255) | filter[5] << 8); |
8420 |
+ f3 = _mm_set1_pi16((filter[6] & 255) | filter[7] << 8); |
8421 |
+ c64 = _mm_set1_pi16(64); |
8422 |
+ |
8423 |
+ a0 = _mm_cvtsi32_si64(*(uint32_t*)(src - 3 * src_stride)); |
8424 |
+ a1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 2 * src_stride)); |
8425 |
+ a2 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride)); |
8426 |
+ a3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 0 * src_stride)); |
8427 |
+ a4 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride)); |
8428 |
+ a5 = _mm_cvtsi32_si64(*(uint32_t*)(src + 2 * src_stride)); |
8429 |
+ a6 = _mm_cvtsi32_si64(*(uint32_t*)(src + 3 * src_stride)); |
8430 |
+ s = src + 4 * src_stride; |
8431 |
+ d = dst; |
8432 |
+ |
8433 |
+ PRAGMA_E2K("ivdep") |
8434 |
+ for (y = 0; y < h; y++) { |
8435 |
+ a7 = _mm_cvtsi32_si64(*(uint32_t*)s); |
8436 |
+ v0 = _mm_unpacklo_pi8(a0, a1); |
8437 |
+ v1 = _mm_unpacklo_pi8(a2, a3); |
8438 |
+ v2 = _mm_unpacklo_pi8(a4, a5); |
8439 |
+ v3 = _mm_unpacklo_pi8(a6, a7); |
8440 |
+ v0 = _mm_maddubs_pi16(v0, f0); |
8441 |
+ v1 = _mm_maddubs_pi16(v1, f1); |
8442 |
+ v2 = _mm_maddubs_pi16(v2, f2); |
8443 |
+ v3 = _mm_maddubs_pi16(v3, f3); |
8444 |
+ v0 = _mm_add_pi16(v0, v2); |
8445 |
+ v1 = _mm_add_pi16(v1, v3); |
8446 |
+ v0 = _mm_add_pi16(v0, c64); |
8447 |
+ v0 = _mm_adds_pi16(v0, v1); |
8448 |
+ v0 = _mm_srai_pi16(v0, 7); |
8449 |
+ v0 = _mm_packs_pu16(v0, v0); |
8450 |
+ if (avg) { |
8451 |
+ v1 = _mm_cvtsi32_si64(*(uint32_t*)d); |
8452 |
+ v0 = _mm_avg_pu8(v0, v1); |
8453 |
+ } |
8454 |
+ *(uint32_t*)d = _mm_cvtsi64_si32(v0); |
8455 |
+ s += src_stride; |
8456 |
+ d += dst_stride; |
8457 |
+ a0 = a1; a1 = a2; a2 = a3; a3 = a4; |
8458 |
+ a4 = a5; a5 = a6; a6 = a7; |
8459 |
+ } |
8460 |
+ } |
8461 |
+} |
8462 |
+ |
8463 |
+static av_always_inline void do_8tap_1d_h_e2k(uint8_t *dst, ptrdiff_t dst_stride, |
8464 |
+ const uint8_t *src, ptrdiff_t src_stride, |
8465 |
+ int w, int h, const int16_t *filter, int avg) |
8466 |
+{ |
8467 |
+ int x, y; |
8468 |
+ |
8469 |
+ if (w >= 8) { |
8470 |
+ __m64 a0, a1, a2, a3; |
8471 |
+ __m128i v0, v1, v2, v3; |
8472 |
+ __m128i f0, f1, f2, f3, c64; __m64 h0, h1; |
8473 |
+ f0 = _mm_set1_epi16((filter[0] & 255) | filter[1] << 8); |
8474 |
+ f1 = _mm_set1_epi16((filter[2] & 255) | filter[3] << 8); |
8475 |
+ f2 = _mm_set1_epi16((filter[4] & 255) | filter[5] << 8); |
8476 |
+ f3 = _mm_set1_epi16((filter[6] & 255) | filter[7] << 8); |
8477 |
+ c64 = _mm_set1_epi16(64); |
8478 |
+ |
8479 |
+ for (y = 0; y < h; y++) { |
8480 |
+ PRAGMA_E2K("ivdep") |
8481 |
+ for (x = 0; x < w; x += 8) { |
8482 |
+ a0 = *(__m64*)(src + x - 3); |
8483 |
+ a1 = *(__m64*)(src + x - 3 + 7); |
8484 |
+ a0 = _mm_slli_si64(a0, 8); |
8485 |
+ a2 = _mm_alignr_pi8(a1, a0, 1); |
8486 |
+ a3 = _mm_alignr_pi8(a1, a0, 2); |
8487 |
+ v0 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3)); |
8488 |
+ a2 = _mm_alignr_pi8(a1, a0, 3); |
8489 |
+ a3 = _mm_alignr_pi8(a1, a0, 4); |
8490 |
+ v1 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3)); |
8491 |
+ a2 = _mm_alignr_pi8(a1, a0, 5); |
8492 |
+ a3 = _mm_alignr_pi8(a1, a0, 6); |
8493 |
+ v2 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3)); |
8494 |
+ a2 = _mm_alignr_pi8(a1, a0, 7); |
8495 |
+ v3 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a1)); |
8496 |
+ |
8497 |
+ v0 = _mm_maddubs_epi16(v0, f0); |
8498 |
+ v1 = _mm_maddubs_epi16(v1, f1); |
8499 |
+ v2 = _mm_maddubs_epi16(v2, f2); |
8500 |
+ v3 = _mm_maddubs_epi16(v3, f3); |
8501 |
+ v0 = _mm_add_epi16(v0, v2); |
8502 |
+ v1 = _mm_add_epi16(v1, v3); |
8503 |
+ v0 = _mm_add_epi16(v0, c64); |
8504 |
+ v0 = _mm_adds_epi16(v0, v1); |
8505 |
+ v0 = _mm_srai_epi16(v0, 7); |
8506 |
+ v0 = _mm_packus_epi16(v0, v0); |
8507 |
+ h0 = _mm_movepi64_pi64(v0); |
8508 |
+ if (avg) { |
8509 |
+ h1 = *(__m64*)(dst + x); |
8510 |
+ h0 = _mm_avg_pu8(h0, h1); |
8511 |
+ } |
8512 |
+ *(__m64*)(dst + x) = h0; |
8513 |
+ } |
8514 |
+ src += src_stride; |
8515 |
+ dst += dst_stride; |
8516 |
+ } |
8517 |
+ } else { |
8518 |
+ __m64 a0, a1, a2, a3; |
8519 |
+ __m64 v0, v1, v2, v3; |
8520 |
+ __m64 f0, f1, f2, f3, c64; |
8521 |
+ f0 = _mm_set1_pi16((filter[0] & 255) | filter[1] << 8); |
8522 |
+ f1 = _mm_set1_pi16((filter[2] & 255) | filter[3] << 8); |
8523 |
+ f2 = _mm_set1_pi16((filter[4] & 255) | filter[5] << 8); |
8524 |
+ f3 = _mm_set1_pi16((filter[6] & 255) | filter[7] << 8); |
8525 |
+ c64 = _mm_set1_pi16(64); |
8526 |
+ |
8527 |
+ PRAGMA_E2K("ivdep") |
8528 |
+ for (y = 0; y < h; y++) { |
8529 |
+ a0 = *(__m64*)(src - 3); |
8530 |
+ a1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 3 + 7)); |
8531 |
+ a0 = _mm_slli_si64(a0, 8); |
8532 |
+ a2 = _mm_alignr_pi8(a1, a0, 1); |
8533 |
+ a3 = _mm_alignr_pi8(a1, a0, 2); |
8534 |
+ v0 = _mm_unpacklo_pi8(a2, a3); |
8535 |
+ a2 = _mm_alignr_pi8(a1, a0, 3); |
8536 |
+ a3 = _mm_alignr_pi8(a1, a0, 4); |
8537 |
+ v1 = _mm_unpacklo_pi8(a2, a3); |
8538 |
+ a2 = _mm_alignr_pi8(a1, a0, 5); |
8539 |
+ a3 = _mm_alignr_pi8(a1, a0, 6); |
8540 |
+ v2 = _mm_unpacklo_pi8(a2, a3); |
8541 |
+ a2 = _mm_alignr_pi8(a1, a0, 7); |
8542 |
+ v3 = _mm_unpacklo_pi8(a2, a1); |
8543 |
+ |
8544 |
+ v0 = _mm_maddubs_pi16(v0, f0); |
8545 |
+ v1 = _mm_maddubs_pi16(v1, f1); |
8546 |
+ v2 = _mm_maddubs_pi16(v2, f2); |
8547 |
+ v3 = _mm_maddubs_pi16(v3, f3); |
8548 |
+ v0 = _mm_add_pi16(v0, v2); |
8549 |
+ v1 = _mm_add_pi16(v1, v3); |
8550 |
+ v0 = _mm_add_pi16(v0, c64); |
8551 |
+ v0 = _mm_adds_pi16(v0, v1); |
8552 |
+ v0 = _mm_srai_pi16(v0, 7); |
8553 |
+ v0 = _mm_packs_pu16(v0, v0); |
8554 |
+ if (avg) { |
8555 |
+ v1 = _mm_cvtsi32_si64(*(uint32_t*)dst); |
8556 |
+ v0 = _mm_avg_pu8(v0, v1); |
8557 |
+ } |
8558 |
+ *(uint32_t*)dst = _mm_cvtsi64_si32(v0); |
8559 |
+ src += src_stride; |
8560 |
+ dst += dst_stride; |
8561 |
+ } |
8562 |
+ } |
8563 |
+} |
8564 |
+ |
8565 |
+#define filter_8tap_1d_fn(opn, opa) \ |
8566 |
+static av_noinline void opn##_8tap_1d_v_e2k(uint8_t *dst, ptrdiff_t dst_stride, \ |
8567 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
8568 |
+ int w, int h, const int16_t *filter) \ |
8569 |
+{ \ |
8570 |
+ do_8tap_1d_v_e2k(dst, dst_stride, src, src_stride, w, h, filter, opa); \ |
8571 |
+} \ |
8572 |
+static av_noinline void opn##_8tap_1d_h_e2k(uint8_t *dst, ptrdiff_t dst_stride, \ |
8573 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
8574 |
+ int w, int h, const int16_t *filter) \ |
8575 |
+{ \ |
8576 |
+ do_8tap_1d_h_e2k(dst, dst_stride, src, src_stride, w, h, filter, opa); \ |
8577 |
+} |
8578 |
+ |
8579 |
+filter_8tap_1d_fn(put, 0) |
8580 |
+filter_8tap_1d_fn(avg, 1) |
8581 |
+ |
8582 |
+#undef filter_8tap_1d_fn |
8583 |
+ |
8584 |
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \ |
8585 |
+static void avg##_8tap_##type##_##sz##dir##_e2k(uint8_t *dst, ptrdiff_t dst_stride, \ |
8586 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
8587 |
+ int h, int mx, int my) \ |
8588 |
+{ \ |
8589 |
+ avg##_8tap_1d_##dir##_e2k(dst, dst_stride, src, src_stride, sz, h, \ |
8590 |
+ ff_vp9_subpel_filters[type_idx][dir_m]); \ |
8591 |
+} |
8592 |
+ |
8593 |
+#define put_opa 0 |
8594 |
+#define avg_opa 1 |
8595 |
+#define filter_fn_2d(sz, type, type_idx, avg) \ |
8596 |
+static void avg##_8tap_##type##_##sz##hv_e2k(uint8_t *dst, ptrdiff_t dst_stride, \ |
8597 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
8598 |
+ int h, int mx, int my) \ |
8599 |
+{ \ |
8600 |
+ int w = sz; \ |
8601 |
+ pixel tmp[sz * (64 + 7)]; \ |
8602 |
+ src -= src_stride * 3; \ |
8603 |
+ do_8tap_1d_h_e2k(tmp, sz, src, src_stride, w, h + 7, ff_vp9_subpel_filters[type_idx][mx], 0); \ |
8604 |
+ do_8tap_1d_v_e2k(dst, dst_stride, tmp + sz * 3, sz, w, h, ff_vp9_subpel_filters[type_idx][my], avg##_opa); \ |
8605 |
+} |
8606 |
+ |
8607 |
+#define bilinf_fn_1d(sz, dir, dir_m, avg) |
8608 |
+#define bilinf_fn_2d(sz, avg) |
8609 |
+ |
8610 |
+#define filter_fn(sz, avg) \ |
8611 |
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \ |
8612 |
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \ |
8613 |
+filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg) \ |
8614 |
+filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \ |
8615 |
+filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \ |
8616 |
+filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ |
8617 |
+filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \ |
8618 |
+filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \ |
8619 |
+filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \ |
8620 |
+bilinf_fn_1d(sz, h, mx, avg) \ |
8621 |
+bilinf_fn_1d(sz, v, my, avg) \ |
8622 |
+bilinf_fn_2d(sz, avg) |
8623 |
+ |
8624 |
+#define filter_fn_set(avg) \ |
8625 |
+filter_fn(64, avg) \ |
8626 |
+filter_fn(32, avg) \ |
8627 |
+filter_fn(16, avg) \ |
8628 |
+filter_fn(8, avg) \ |
8629 |
+filter_fn(4, avg) |
8630 |
+ |
8631 |
+filter_fn_set(put) |
8632 |
+filter_fn_set(avg) |
8633 |
+ |
8634 |
+#undef filter_fn |
8635 |
+#undef filter_fn_set |
8636 |
+#undef filter_fn_1d |
8637 |
+#undef filter_fn_2d |
8638 |
+#undef bilinf_fn_1d |
8639 |
+#undef bilinf_fn_2d |
8640 |
+ |
8641 |
+static av_cold void ff_vp9dsp_mc_init_8_e2k(VP9DSPContext *dsp) |
8642 |
+{ |
8643 |
+#define init_fpel(idx1, idx2, sz, type) \ |
8644 |
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_e2k; \ |
8645 |
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_e2k; \ |
8646 |
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = type##sz##_e2k |
8647 |
+ |
8648 |
+#define init_copy_avg(idx, sz) \ |
8649 |
+ init_fpel(idx, 0, sz, copy); \ |
8650 |
+ init_fpel(idx, 1, sz, avg) |
8651 |
+ |
8652 |
+ init_copy_avg(0, 64); |
8653 |
+ init_copy_avg(1, 32); |
8654 |
+ init_copy_avg(2, 16); |
8655 |
+ init_copy_avg(3, 8); |
8656 |
+ init_copy_avg(4, 4); |
8657 |
+ |
8658 |
+#undef init_copy_avg |
8659 |
+#undef init_fpel |
8660 |
+ |
8661 |
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \ |
8662 |
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_e2k; \ |
8663 |
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_e2k; \ |
8664 |
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_e2k |
8665 |
+ |
8666 |
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \ |
8667 |
+ init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) |
8668 |
+ |
8669 |
+#define init_subpel2(idx, idxh, idxv, dir, type) \ |
8670 |
+ init_subpel1(0, idx, idxh, idxv, 64, dir, type); \ |
8671 |
+ init_subpel1(1, idx, idxh, idxv, 32, dir, type); \ |
8672 |
+ init_subpel1(2, idx, idxh, idxv, 16, dir, type); \ |
8673 |
+ init_subpel1(3, idx, idxh, idxv, 8, dir, type); \ |
8674 |
+ init_subpel1(4, idx, idxh, idxv, 4, dir, type) |
8675 |
+ |
8676 |
+#define init_subpel3(idx, type) \ |
8677 |
+ init_subpel2(idx, 1, 1, hv, type); \ |
8678 |
+ init_subpel2(idx, 0, 1, v, type); \ |
8679 |
+ init_subpel2(idx, 1, 0, h, type) |
8680 |
+ |
8681 |
+ init_subpel3(0, put); |
8682 |
+ init_subpel3(1, avg); |
8683 |
+ |
8684 |
+#undef init_subpel1 |
8685 |
+#undef init_subpel2 |
8686 |
+#undef init_subpel3 |
8687 |
+#undef init_subpel1_bd_aware |
8688 |
+} |
8689 |
+ |
8690 |
+av_cold void ff_vp9dsp_init_e2k(VP9DSPContext *dsp, int bpp, int bitexact) |
8691 |
+{ |
8692 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
8693 |
+ return; |
8694 |
+ |
8695 |
+ // checkasm |
8696 |
+ // doesn't check all cases for loopfilter |
8697 |
+ if (bpp == 8) { |
8698 |
+ ff_vp9dsp_itxfm_init_8_e2k(dsp); |
8699 |
+ ff_vp9dsp_loopfilter_init_8_e2k(dsp); |
8700 |
+ ff_vp9dsp_mc_init_8_e2k(dsp); |
8701 |
+ } |
8702 |
+} |
8703 |
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c |
8704 |
index b9c2c86..69e6302 100644 |
8705 |
--- a/libavcodec/fdctdsp.c |
8706 |
+++ b/libavcodec/fdctdsp.c |
8707 |
@@ -45,6 +45,8 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx) |
8708 |
|
8709 |
if (ARCH_PPC) |
8710 |
ff_fdctdsp_init_ppc(c, avctx, high_bit_depth); |
8711 |
+ if (ARCH_E2K) |
8712 |
+ ff_fdctdsp_init_e2k(c, avctx, high_bit_depth); |
8713 |
if (ARCH_X86) |
8714 |
ff_fdctdsp_init_x86(c, avctx, high_bit_depth); |
8715 |
} |
8716 |
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h |
8717 |
index 3e1f683..75407d4 100644 |
8718 |
--- a/libavcodec/fdctdsp.h |
8719 |
+++ b/libavcodec/fdctdsp.h |
8720 |
@@ -31,6 +31,8 @@ typedef struct FDCTDSPContext { |
8721 |
void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx); |
8722 |
void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx, |
8723 |
unsigned high_bit_depth); |
8724 |
+void ff_fdctdsp_init_e2k(FDCTDSPContext *c, AVCodecContext *avctx, |
8725 |
+ unsigned high_bit_depth); |
8726 |
void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx, |
8727 |
unsigned high_bit_depth); |
8728 |
|
8729 |
diff --git a/libavcodec/fft.h b/libavcodec/fft.h |
8730 |
index c858570..6a30f7c 100644 |
8731 |
--- a/libavcodec/fft.h |
8732 |
+++ b/libavcodec/fft.h |
8733 |
@@ -161,6 +161,7 @@ void ff_fft_init_x86(FFTContext *s); |
8734 |
void ff_fft_init_arm(FFTContext *s); |
8735 |
void ff_fft_init_mips(FFTContext *s); |
8736 |
void ff_fft_init_ppc(FFTContext *s); |
8737 |
+void ff_fft_init_e2k(FFTContext *s); |
8738 |
|
8739 |
void ff_fft_fixed_init_arm(FFTContext *s); |
8740 |
|
8741 |
diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c |
8742 |
index 20a62e4..d565a71 100644 |
8743 |
--- a/libavcodec/fft_template.c |
8744 |
+++ b/libavcodec/fft_template.c |
8745 |
@@ -245,6 +245,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) |
8746 |
if (ARCH_AARCH64) ff_fft_init_aarch64(s); |
8747 |
if (ARCH_ARM) ff_fft_init_arm(s); |
8748 |
if (ARCH_PPC) ff_fft_init_ppc(s); |
8749 |
+ if (ARCH_E2K) ff_fft_init_e2k(s); |
8750 |
if (ARCH_X86) ff_fft_init_x86(s); |
8751 |
if (CONFIG_MDCT) s->mdct_calcw = s->mdct_calc; |
8752 |
if (HAVE_MIPSFPU) ff_fft_init_mips(s); |
8753 |
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c |
8754 |
index 3b33af6..141ffc0 100644 |
8755 |
--- a/libavcodec/fmtconvert.c |
8756 |
+++ b/libavcodec/fmtconvert.c |
8757 |
@@ -61,6 +61,8 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) |
8758 |
ff_fmt_convert_init_arm(c, avctx); |
8759 |
if (ARCH_PPC) |
8760 |
ff_fmt_convert_init_ppc(c, avctx); |
8761 |
+ if (ARCH_E2K) |
8762 |
+ ff_fmt_convert_init_e2k(c, avctx); |
8763 |
if (ARCH_X86) |
8764 |
ff_fmt_convert_init_x86(c, avctx); |
8765 |
if (HAVE_MIPSFPU) |
8766 |
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h |
8767 |
index a1b17e4..1053e0f 100644 |
8768 |
--- a/libavcodec/fmtconvert.h |
8769 |
+++ b/libavcodec/fmtconvert.h |
8770 |
@@ -71,6 +71,7 @@ void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); |
8771 |
void ff_fmt_convert_init_aarch64(FmtConvertContext *c, AVCodecContext *avctx); |
8772 |
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); |
8773 |
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx); |
8774 |
+void ff_fmt_convert_init_e2k(FmtConvertContext *c, AVCodecContext *avctx); |
8775 |
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); |
8776 |
void ff_fmt_convert_init_mips(FmtConvertContext *c); |
8777 |
|
8778 |
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c |
8779 |
index c2f1f30..ea6196f 100644 |
8780 |
--- a/libavcodec/h264chroma.c |
8781 |
+++ b/libavcodec/h264chroma.c |
8782 |
@@ -52,6 +52,8 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth) |
8783 |
ff_h264chroma_init_arm(c, bit_depth); |
8784 |
if (ARCH_PPC) |
8785 |
ff_h264chroma_init_ppc(c, bit_depth); |
8786 |
+ if (ARCH_E2K) |
8787 |
+ ff_h264chroma_init_e2k(c, bit_depth); |
8788 |
if (ARCH_X86) |
8789 |
ff_h264chroma_init_x86(c, bit_depth); |
8790 |
if (ARCH_MIPS) |
8791 |
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h |
8792 |
index 5c89fd1..0ec192c 100644 |
8793 |
--- a/libavcodec/h264chroma.h |
8794 |
+++ b/libavcodec/h264chroma.h |
8795 |
@@ -34,6 +34,7 @@ void ff_h264chroma_init(H264ChromaContext *c, int bit_depth); |
8796 |
void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth); |
8797 |
void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth); |
8798 |
void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth); |
8799 |
+void ff_h264chroma_init_e2k(H264ChromaContext *c, int bit_depth); |
8800 |
void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth); |
8801 |
void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth); |
8802 |
|
8803 |
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c |
8804 |
index d26f552..64c19a9 100644 |
8805 |
--- a/libavcodec/h264dsp.c |
8806 |
+++ b/libavcodec/h264dsp.c |
8807 |
@@ -156,6 +156,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, |
8808 |
if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc); |
8809 |
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); |
8810 |
if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc); |
8811 |
+ if (ARCH_E2K) ff_h264dsp_init_e2k(c, bit_depth, chroma_format_idc); |
8812 |
if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc); |
8813 |
if (ARCH_MIPS) ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc); |
8814 |
} |
8815 |
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h |
8816 |
index cbea317..ff781e1 100644 |
8817 |
--- a/libavcodec/h264dsp.h |
8818 |
+++ b/libavcodec/h264dsp.h |
8819 |
@@ -125,6 +125,8 @@ void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, |
8820 |
const int chroma_format_idc); |
8821 |
void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, |
8822 |
const int chroma_format_idc); |
8823 |
+void ff_h264dsp_init_e2k(H264DSPContext *c, const int bit_depth, |
8824 |
+ const int chroma_format_idc); |
8825 |
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, |
8826 |
const int chroma_format_idc); |
8827 |
void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth, |
8828 |
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c |
8829 |
index 50e82e2..5955069 100644 |
8830 |
--- a/libavcodec/h264qpel.c |
8831 |
+++ b/libavcodec/h264qpel.c |
8832 |
@@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth) |
8833 |
ff_h264qpel_init_arm(c, bit_depth); |
8834 |
if (ARCH_PPC) |
8835 |
ff_h264qpel_init_ppc(c, bit_depth); |
8836 |
+ if (ARCH_E2K) |
8837 |
+ ff_h264qpel_init_e2k(c, bit_depth); |
8838 |
if (ARCH_X86) |
8839 |
ff_h264qpel_init_x86(c, bit_depth); |
8840 |
if (ARCH_MIPS) |
8841 |
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h |
8842 |
index 7c57ad0..7880b86 100644 |
8843 |
--- a/libavcodec/h264qpel.h |
8844 |
+++ b/libavcodec/h264qpel.h |
8845 |
@@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth); |
8846 |
void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth); |
8847 |
void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth); |
8848 |
void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth); |
8849 |
+void ff_h264qpel_init_e2k(H264QpelContext *c, int bit_depth); |
8850 |
void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth); |
8851 |
void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth); |
8852 |
|
8853 |
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c |
8854 |
index 957e40d..a7d0d5d 100644 |
8855 |
--- a/libavcodec/hevcdsp.c |
8856 |
+++ b/libavcodec/hevcdsp.c |
8857 |
@@ -261,6 +261,8 @@ int i = 0; |
8858 |
ff_hevc_dsp_init_arm(hevcdsp, bit_depth); |
8859 |
if (ARCH_PPC) |
8860 |
ff_hevc_dsp_init_ppc(hevcdsp, bit_depth); |
8861 |
+ if (ARCH_E2K) |
8862 |
+ ff_hevc_dsp_init_e2k(hevcdsp, bit_depth); |
8863 |
if (ARCH_X86) |
8864 |
ff_hevc_dsp_init_x86(hevcdsp, bit_depth); |
8865 |
if (ARCH_MIPS) |
8866 |
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h |
8867 |
index 0ae67cb..3221a2a 100644 |
8868 |
--- a/libavcodec/hevcdsp.h |
8869 |
+++ b/libavcodec/hevcdsp.h |
8870 |
@@ -129,6 +129,7 @@ extern const int8_t ff_hevc_qpel_filters[3][16]; |
8871 |
|
8872 |
void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth); |
8873 |
void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth); |
8874 |
+void ff_hevc_dsp_init_e2k(HEVCDSPContext *c, const int bit_depth); |
8875 |
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth); |
8876 |
void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth); |
8877 |
|
8878 |
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c |
8879 |
index 8e2fd8f..dd4ef87 100644 |
8880 |
--- a/libavcodec/hpeldsp.c |
8881 |
+++ b/libavcodec/hpeldsp.c |
8882 |
@@ -363,6 +363,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags) |
8883 |
ff_hpeldsp_init_arm(c, flags); |
8884 |
if (ARCH_PPC) |
8885 |
ff_hpeldsp_init_ppc(c, flags); |
8886 |
+ if (ARCH_E2K) |
8887 |
+ ff_hpeldsp_init_e2k(c, flags); |
8888 |
if (ARCH_X86) |
8889 |
ff_hpeldsp_init_x86(c, flags); |
8890 |
if (ARCH_MIPS) |
8891 |
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h |
8892 |
index 768139b..6d0c293 100644 |
8893 |
--- a/libavcodec/hpeldsp.h |
8894 |
+++ b/libavcodec/hpeldsp.h |
8895 |
@@ -100,6 +100,7 @@ void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags); |
8896 |
void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags); |
8897 |
void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags); |
8898 |
void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags); |
8899 |
+void ff_hpeldsp_init_e2k(HpelDSPContext *c, int flags); |
8900 |
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags); |
8901 |
void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags); |
8902 |
|
8903 |
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c |
8904 |
index 846ed0b..a2554aa 100644 |
8905 |
--- a/libavcodec/idctdsp.c |
8906 |
+++ b/libavcodec/idctdsp.c |
8907 |
@@ -311,6 +311,8 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx) |
8908 |
ff_idctdsp_init_arm(c, avctx, high_bit_depth); |
8909 |
if (ARCH_PPC) |
8910 |
ff_idctdsp_init_ppc(c, avctx, high_bit_depth); |
8911 |
+ if (ARCH_E2K) |
8912 |
+ ff_idctdsp_init_e2k(c, avctx, high_bit_depth); |
8913 |
if (ARCH_X86) |
8914 |
ff_idctdsp_init_x86(c, avctx, high_bit_depth); |
8915 |
if (ARCH_MIPS) |
8916 |
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h |
8917 |
index ca21a31..1204bff 100644 |
8918 |
--- a/libavcodec/idctdsp.h |
8919 |
+++ b/libavcodec/idctdsp.h |
8920 |
@@ -114,6 +114,8 @@ void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, |
8921 |
unsigned high_bit_depth); |
8922 |
void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx, |
8923 |
unsigned high_bit_depth); |
8924 |
+void ff_idctdsp_init_e2k(IDCTDSPContext *c, AVCodecContext *avctx, |
8925 |
+ unsigned high_bit_depth); |
8926 |
void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, |
8927 |
unsigned high_bit_depth); |
8928 |
void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx, |
8929 |
diff --git a/libavcodec/lossless_audiodsp.c b/libavcodec/lossless_audiodsp.c |
8930 |
index 3781659..979e0c9 100644 |
8931 |
--- a/libavcodec/lossless_audiodsp.c |
8932 |
+++ b/libavcodec/lossless_audiodsp.c |
8933 |
@@ -62,6 +62,8 @@ av_cold void ff_llauddsp_init(LLAudDSPContext *c) |
8934 |
ff_llauddsp_init_arm(c); |
8935 |
if (ARCH_PPC) |
8936 |
ff_llauddsp_init_ppc(c); |
8937 |
+ if (ARCH_E2K) |
8938 |
+ ff_llauddsp_init_e2k(c); |
8939 |
if (ARCH_X86) |
8940 |
ff_llauddsp_init_x86(c); |
8941 |
} |
8942 |
diff --git a/libavcodec/lossless_audiodsp.h b/libavcodec/lossless_audiodsp.h |
8943 |
index eea5d49..3de02d5 100644 |
8944 |
--- a/libavcodec/lossless_audiodsp.h |
8945 |
+++ b/libavcodec/lossless_audiodsp.h |
8946 |
@@ -46,6 +46,7 @@ typedef struct LLAudDSPContext { |
8947 |
void ff_llauddsp_init(LLAudDSPContext *c); |
8948 |
void ff_llauddsp_init_arm(LLAudDSPContext *c); |
8949 |
void ff_llauddsp_init_ppc(LLAudDSPContext *c); |
8950 |
+void ff_llauddsp_init_e2k(LLAudDSPContext *c); |
8951 |
void ff_llauddsp_init_x86(LLAudDSPContext *c); |
8952 |
|
8953 |
#endif /* AVCODEC_LOSSLESS_AUDIODSP_H */ |
8954 |
diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c |
8955 |
index cff94c2..34b1db5 100644 |
8956 |
--- a/libavcodec/lossless_videodsp.c |
8957 |
+++ b/libavcodec/lossless_videodsp.c |
8958 |
@@ -120,6 +120,8 @@ void ff_llviddsp_init(LLVidDSPContext *c) |
8959 |
|
8960 |
if (ARCH_PPC) |
8961 |
ff_llviddsp_init_ppc(c); |
8962 |
+ if (ARCH_E2K) |
8963 |
+ ff_llviddsp_init_e2k(c); |
8964 |
if (ARCH_X86) |
8965 |
ff_llviddsp_init_x86(c); |
8966 |
} |
8967 |
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h |
8968 |
index 8077898..917afb6 100644 |
8969 |
--- a/libavcodec/lossless_videodsp.h |
8970 |
+++ b/libavcodec/lossless_videodsp.h |
8971 |
@@ -45,5 +45,6 @@ typedef struct LLVidDSPContext { |
8972 |
void ff_llviddsp_init(LLVidDSPContext *llviddsp); |
8973 |
void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp); |
8974 |
void ff_llviddsp_init_ppc(LLVidDSPContext *llviddsp); |
8975 |
+void ff_llviddsp_init_e2k(LLVidDSPContext *llviddsp); |
8976 |
|
8977 |
#endif //AVCODEC_LOSSLESS_VIDEODSP_H |
8978 |
diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c |
8979 |
index 6f35059..bced874 100644 |
8980 |
--- a/libavcodec/mdct15.c |
8981 |
+++ b/libavcodec/mdct15.c |
8982 |
@@ -318,6 +318,8 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale) |
8983 |
|
8984 |
if (ARCH_X86) |
8985 |
ff_mdct15_init_x86(s); |
8986 |
+ if (ARCH_E2K) |
8987 |
+ ff_mdct15_init_e2k(s); |
8988 |
|
8989 |
*ps = s; |
8990 |
|
8991 |
diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h |
8992 |
index 42e60f3..c9d515b 100644 |
8993 |
--- a/libavcodec/mdct15.h |
8994 |
+++ b/libavcodec/mdct15.h |
8995 |
@@ -58,5 +58,6 @@ int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale); |
8996 |
void ff_mdct15_uninit(MDCT15Context **ps); |
8997 |
|
8998 |
void ff_mdct15_init_x86(MDCT15Context *s); |
8999 |
+void ff_mdct15_init_e2k(MDCT15Context *s); |
9000 |
|
9001 |
#endif /* AVCODEC_MDCT15_H */ |
9002 |
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c |
9003 |
index ae248c5..cb967a4 100644 |
9004 |
--- a/libavcodec/me_cmp.c |
9005 |
+++ b/libavcodec/me_cmp.c |
9006 |
@@ -1088,6 +1088,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx) |
9007 |
ff_me_cmp_init_arm(c, avctx); |
9008 |
if (ARCH_PPC) |
9009 |
ff_me_cmp_init_ppc(c, avctx); |
9010 |
+ if (ARCH_E2K) |
9011 |
+ ff_me_cmp_init_e2k(c, avctx); |
9012 |
if (ARCH_X86) |
9013 |
ff_me_cmp_init_x86(c, avctx); |
9014 |
if (ARCH_MIPS) |
9015 |
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h |
9016 |
index 0a589e3..8989a1c 100644 |
9017 |
--- a/libavcodec/me_cmp.h |
9018 |
+++ b/libavcodec/me_cmp.h |
9019 |
@@ -85,6 +85,7 @@ void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx); |
9020 |
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx); |
9021 |
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx); |
9022 |
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx); |
9023 |
+void ff_me_cmp_init_e2k(MECmpContext *c, AVCodecContext *avctx); |
9024 |
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx); |
9025 |
void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx); |
9026 |
|
9027 |
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c |
9028 |
index 3cafca2..0514bd4 100644 |
9029 |
--- a/libavcodec/mpegaudiodsp.c |
9030 |
+++ b/libavcodec/mpegaudiodsp.c |
9031 |
@@ -48,6 +48,7 @@ av_cold void ff_mpadsp_init(MPADSPContext *s) |
9032 |
if (ARCH_AARCH64) ff_mpadsp_init_aarch64(s); |
9033 |
if (ARCH_ARM) ff_mpadsp_init_arm(s); |
9034 |
if (ARCH_PPC) ff_mpadsp_init_ppc(s); |
9035 |
+ if (ARCH_E2K) ff_mpadsp_init_e2k(s); |
9036 |
if (ARCH_X86) ff_mpadsp_init_x86(s); |
9037 |
if (HAVE_MIPSFPU) ff_mpadsp_init_mipsfpu(s); |
9038 |
if (HAVE_MIPSDSP) ff_mpadsp_init_mipsdsp(s); |
9039 |
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h |
9040 |
index 0e4352d..4514564 100644 |
9041 |
--- a/libavcodec/mpegaudiodsp.h |
9042 |
+++ b/libavcodec/mpegaudiodsp.h |
9043 |
@@ -62,6 +62,7 @@ void ff_mpa_synth_filter_float(MPADSPContext *s, |
9044 |
void ff_mpadsp_init_aarch64(MPADSPContext *s); |
9045 |
void ff_mpadsp_init_arm(MPADSPContext *s); |
9046 |
void ff_mpadsp_init_ppc(MPADSPContext *s); |
9047 |
+void ff_mpadsp_init_e2k(MPADSPContext *s); |
9048 |
void ff_mpadsp_init_x86(MPADSPContext *s); |
9049 |
void ff_mpadsp_init_mipsfpu(MPADSPContext *s); |
9050 |
void ff_mpadsp_init_mipsdsp(MPADSPContext *s); |
9051 |
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c |
9052 |
index 49fd1c9..49c4476 100644 |
9053 |
--- a/libavcodec/mpegvideo.c |
9054 |
+++ b/libavcodec/mpegvideo.c |
9055 |
@@ -318,6 +318,8 @@ static av_cold int dct_init(MpegEncContext *s) |
9056 |
ff_mpv_common_init_arm(s); |
9057 |
if (ARCH_PPC) |
9058 |
ff_mpv_common_init_ppc(s); |
9059 |
+ if (ARCH_E2K) |
9060 |
+ ff_mpv_common_init_e2k(s); |
9061 |
if (ARCH_X86) |
9062 |
ff_mpv_common_init_x86(s); |
9063 |
if (ARCH_MIPS) |
9064 |
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h |
9065 |
index 29e692f..85fddd2 100644 |
9066 |
--- a/libavcodec/mpegvideo.h |
9067 |
+++ b/libavcodec/mpegvideo.h |
9068 |
@@ -685,6 +685,7 @@ void ff_mpv_common_init_arm(MpegEncContext *s); |
9069 |
void ff_mpv_common_init_axp(MpegEncContext *s); |
9070 |
void ff_mpv_common_init_neon(MpegEncContext *s); |
9071 |
void ff_mpv_common_init_ppc(MpegEncContext *s); |
9072 |
+void ff_mpv_common_init_e2k(MpegEncContext *s); |
9073 |
void ff_mpv_common_init_x86(MpegEncContext *s); |
9074 |
void ff_mpv_common_init_mips(MpegEncContext *s); |
9075 |
|
9076 |
diff --git a/libavcodec/mpegvideodsp.c b/libavcodec/mpegvideodsp.c |
9077 |
index a58e45a..2d7aa96 100644 |
9078 |
--- a/libavcodec/mpegvideodsp.c |
9079 |
+++ b/libavcodec/mpegvideodsp.c |
9080 |
@@ -114,6 +114,8 @@ av_cold void ff_mpegvideodsp_init(MpegVideoDSPContext *c) |
9081 |
|
9082 |
if (ARCH_PPC) |
9083 |
ff_mpegvideodsp_init_ppc(c); |
9084 |
+ if (ARCH_E2K) |
9085 |
+ ff_mpegvideodsp_init_e2k(c); |
9086 |
if (ARCH_X86) |
9087 |
ff_mpegvideodsp_init_x86(c); |
9088 |
} |
9089 |
diff --git a/libavcodec/mpegvideodsp.h b/libavcodec/mpegvideodsp.h |
9090 |
index 293e254..56a0bf5 100644 |
9091 |
--- a/libavcodec/mpegvideodsp.h |
9092 |
+++ b/libavcodec/mpegvideodsp.h |
9093 |
@@ -42,6 +42,7 @@ typedef struct MpegVideoDSPContext { |
9094 |
|
9095 |
void ff_mpegvideodsp_init(MpegVideoDSPContext *c); |
9096 |
void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c); |
9097 |
+void ff_mpegvideodsp_init_e2k(MpegVideoDSPContext *c); |
9098 |
void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c); |
9099 |
|
9100 |
#endif /* AVCODEC_MPEGVIDEODSP_H */ |
9101 |
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c |
9102 |
index a34ab35..f6ba6b4 100644 |
9103 |
--- a/libavcodec/mpegvideoencdsp.c |
9104 |
+++ b/libavcodec/mpegvideoencdsp.c |
9105 |
@@ -249,6 +249,8 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, |
9106 |
ff_mpegvideoencdsp_init_arm(c, avctx); |
9107 |
if (ARCH_PPC) |
9108 |
ff_mpegvideoencdsp_init_ppc(c, avctx); |
9109 |
+ if (ARCH_E2K) |
9110 |
+ ff_mpegvideoencdsp_init_e2k(c, avctx); |
9111 |
if (ARCH_X86) |
9112 |
ff_mpegvideoencdsp_init_x86(c, avctx); |
9113 |
if (ARCH_MIPS) |
9114 |
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h |
9115 |
index 33f0282..2f2b191 100644 |
9116 |
--- a/libavcodec/mpegvideoencdsp.h |
9117 |
+++ b/libavcodec/mpegvideoencdsp.h |
9118 |
@@ -50,6 +50,8 @@ void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, |
9119 |
AVCodecContext *avctx); |
9120 |
void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, |
9121 |
AVCodecContext *avctx); |
9122 |
+void ff_mpegvideoencdsp_init_e2k(MpegvideoEncDSPContext *c, |
9123 |
+ AVCodecContext *avctx); |
9124 |
void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, |
9125 |
AVCodecContext *avctx); |
9126 |
void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c, |
9127 |
diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c |
9128 |
index 67393b9..8383663 100644 |
9129 |
--- a/libavcodec/pixblockdsp.c |
9130 |
+++ b/libavcodec/pixblockdsp.c |
9131 |
@@ -109,6 +109,8 @@ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx) |
9132 |
ff_pixblockdsp_init_arm(c, avctx, high_bit_depth); |
9133 |
if (ARCH_PPC) |
9134 |
ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth); |
9135 |
+ if (ARCH_E2K) |
9136 |
+ ff_pixblockdsp_init_e2k(c, avctx, high_bit_depth); |
9137 |
if (ARCH_X86) |
9138 |
ff_pixblockdsp_init_x86(c, avctx, high_bit_depth); |
9139 |
if (ARCH_MIPS) |
9140 |
diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h |
9141 |
index 07c2ec4..49e2b25 100644 |
9142 |
--- a/libavcodec/pixblockdsp.h |
9143 |
+++ b/libavcodec/pixblockdsp.h |
9144 |
@@ -52,6 +52,8 @@ void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx, |
9145 |
unsigned high_bit_depth); |
9146 |
void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx, |
9147 |
unsigned high_bit_depth); |
9148 |
+void ff_pixblockdsp_init_e2k(PixblockDSPContext *c, AVCodecContext *avctx, |
9149 |
+ unsigned high_bit_depth); |
9150 |
void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx, |
9151 |
unsigned high_bit_depth); |
9152 |
void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx, |
9153 |
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c |
9154 |
index cb215c2..d55fcd3 100644 |
9155 |
--- a/libavcodec/svq1enc.c |
9156 |
+++ b/libavcodec/svq1enc.c |
9157 |
@@ -570,6 +570,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) |
9158 |
|
9159 |
if (ARCH_PPC) |
9160 |
ff_svq1enc_init_ppc(s); |
9161 |
+ if (ARCH_E2K) |
9162 |
+ ff_svq1enc_init_e2k(s); |
9163 |
if (ARCH_X86) |
9164 |
ff_svq1enc_init_x86(s); |
9165 |
|
9166 |
diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h |
9167 |
index b4ef763..c070d80 100644 |
9168 |
--- a/libavcodec/svq1enc.h |
9169 |
+++ b/libavcodec/svq1enc.h |
9170 |
@@ -80,6 +80,7 @@ typedef struct SVQ1EncContext { |
9171 |
} SVQ1EncContext; |
9172 |
|
9173 |
void ff_svq1enc_init_ppc(SVQ1EncContext *c); |
9174 |
+void ff_svq1enc_init_e2k(SVQ1EncContext *c); |
9175 |
void ff_svq1enc_init_x86(SVQ1EncContext *c); |
9176 |
|
9177 |
#endif /* AVCODEC_SVQ1ENC_H */ |
9178 |
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c |
9179 |
index 2ca8039..755734c 100644 |
9180 |
--- a/libavcodec/tests/dct.c |
9181 |
+++ b/libavcodec/tests/dct.c |
9182 |
@@ -100,6 +100,8 @@ static const struct algo idct_tab[] = { |
9183 |
#include "arm/dct.c" |
9184 |
#elif ARCH_PPC |
9185 |
#include "ppc/dct.c" |
9186 |
+#elif ARCH_E2K |
9187 |
+#include "e2k/dct.c" |
9188 |
#elif ARCH_X86 |
9189 |
#include "x86/dct.c" |
9190 |
#else |
9191 |
diff --git a/libavcodec/tests/e2k/dct.c b/libavcodec/tests/e2k/dct.c |
9192 |
new file mode 100644 |
9193 |
index 0000000..7c15b25 |
9194 |
--- /dev/null |
9195 |
+++ b/libavcodec/tests/e2k/dct.c |
9196 |
@@ -0,0 +1,31 @@ |
9197 |
+/* |
9198 |
+ * This file is part of FFmpeg. |
9199 |
+ * |
9200 |
+ * FFmpeg is free software; you can redistribute it and/or |
9201 |
+ * modify it under the terms of the GNU Lesser General Public |
9202 |
+ * License as published by the Free Software Foundation; either |
9203 |
+ * version 2.1 of the License, or (at your option) any later version. |
9204 |
+ * |
9205 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9206 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9207 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9208 |
+ * Lesser General Public License for more details. |
9209 |
+ * |
9210 |
+ * You should have received a copy of the GNU Lesser General Public |
9211 |
+ * License along with FFmpeg; if not, write to the Free Software |
9212 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9213 |
+ */ |
9214 |
+ |
9215 |
+#include "config.h" |
9216 |
+ |
9217 |
+#include "libavcodec/e2k/dctdsp.h" |
9218 |
+ |
9219 |
+static const struct algo fdct_tab_arch[] = { |
9220 |
+ { "FDCT-E2K", ff_fdct_e2k, FF_IDCT_PERM_NONE, AV_CPU_FLAG_E2K }, |
9221 |
+ { 0 } |
9222 |
+}; |
9223 |
+ |
9224 |
+static const struct algo idct_tab_arch[] = { |
9225 |
+ { "IDCT-E2K", ff_idct_e2k, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_E2K }, |
9226 |
+ { 0 } |
9227 |
+}; |
9228 |
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c |
9229 |
index c25a6f3..2953403 100644 |
9230 |
--- a/libavcodec/vc1dsp.c |
9231 |
+++ b/libavcodec/vc1dsp.c |
9232 |
@@ -1035,6 +1035,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) |
9233 |
ff_vc1dsp_init_arm(dsp); |
9234 |
if (ARCH_PPC) |
9235 |
ff_vc1dsp_init_ppc(dsp); |
9236 |
+ if (ARCH_E2K) |
9237 |
+ ff_vc1dsp_init_e2k(dsp); |
9238 |
if (ARCH_X86) |
9239 |
ff_vc1dsp_init_x86(dsp); |
9240 |
if (ARCH_MIPS) |
9241 |
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h |
9242 |
index 75db62b..eabf2c8 100644 |
9243 |
--- a/libavcodec/vc1dsp.h |
9244 |
+++ b/libavcodec/vc1dsp.h |
9245 |
@@ -86,6 +86,7 @@ void ff_vc1dsp_init(VC1DSPContext* c); |
9246 |
void ff_vc1dsp_init_aarch64(VC1DSPContext* dsp); |
9247 |
void ff_vc1dsp_init_arm(VC1DSPContext* dsp); |
9248 |
void ff_vc1dsp_init_ppc(VC1DSPContext *c); |
9249 |
+void ff_vc1dsp_init_e2k(VC1DSPContext *c); |
9250 |
void ff_vc1dsp_init_x86(VC1DSPContext* dsp); |
9251 |
void ff_vc1dsp_init_mips(VC1DSPContext* dsp); |
9252 |
|
9253 |
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c |
9254 |
index ce9e9eb..087614a 100644 |
9255 |
--- a/libavcodec/videodsp.c |
9256 |
+++ b/libavcodec/videodsp.c |
9257 |
@@ -50,6 +50,8 @@ av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc) |
9258 |
ff_videodsp_init_arm(ctx, bpc); |
9259 |
if (ARCH_PPC) |
9260 |
ff_videodsp_init_ppc(ctx, bpc); |
9261 |
+ if (ARCH_E2K) |
9262 |
+ ff_videodsp_init_e2k(ctx, bpc); |
9263 |
if (ARCH_X86) |
9264 |
ff_videodsp_init_x86(ctx, bpc); |
9265 |
if (ARCH_MIPS) |
9266 |
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h |
9267 |
index c0545f2..566296f 100644 |
9268 |
--- a/libavcodec/videodsp.h |
9269 |
+++ b/libavcodec/videodsp.h |
9270 |
@@ -82,6 +82,7 @@ void ff_videodsp_init(VideoDSPContext *ctx, int bpc); |
9271 |
void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc); |
9272 |
void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc); |
9273 |
void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc); |
9274 |
+void ff_videodsp_init_e2k(VideoDSPContext *ctx, int bpc); |
9275 |
void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc); |
9276 |
void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc); |
9277 |
|
9278 |
diff --git a/libavcodec/vorbisdsp.c b/libavcodec/vorbisdsp.c |
9279 |
index 362a276..e0ecda3 100644 |
9280 |
--- a/libavcodec/vorbisdsp.c |
9281 |
+++ b/libavcodec/vorbisdsp.c |
9282 |
@@ -31,6 +31,8 @@ av_cold void ff_vorbisdsp_init(VorbisDSPContext *dsp) |
9283 |
ff_vorbisdsp_init_arm(dsp); |
9284 |
if (ARCH_PPC) |
9285 |
ff_vorbisdsp_init_ppc(dsp); |
9286 |
+ if (ARCH_E2K) |
9287 |
+ ff_vorbisdsp_init_e2k(dsp); |
9288 |
if (ARCH_X86) |
9289 |
ff_vorbisdsp_init_x86(dsp); |
9290 |
} |
9291 |
diff --git a/libavcodec/vorbisdsp.h b/libavcodec/vorbisdsp.h |
9292 |
index 7abec4e..001151f 100644 |
9293 |
--- a/libavcodec/vorbisdsp.h |
9294 |
+++ b/libavcodec/vorbisdsp.h |
9295 |
@@ -34,5 +34,6 @@ void ff_vorbisdsp_init_aarch64(VorbisDSPContext *dsp); |
9296 |
void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp); |
9297 |
void ff_vorbisdsp_init_arm(VorbisDSPContext *dsp); |
9298 |
void ff_vorbisdsp_init_ppc(VorbisDSPContext *dsp); |
9299 |
+void ff_vorbisdsp_init_e2k(VorbisDSPContext *dsp); |
9300 |
|
9301 |
#endif /* AVCODEC_VORBISDSP_H */ |
9302 |
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c |
9303 |
index f485fba..c2ce815 100644 |
9304 |
--- a/libavcodec/vp3dsp.c |
9305 |
+++ b/libavcodec/vp3dsp.c |
9306 |
@@ -456,6 +456,8 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags) |
9307 |
ff_vp3dsp_init_arm(c, flags); |
9308 |
if (ARCH_PPC) |
9309 |
ff_vp3dsp_init_ppc(c, flags); |
9310 |
+ if (ARCH_E2K) |
9311 |
+ ff_vp3dsp_init_e2k(c, flags); |
9312 |
if (ARCH_X86) |
9313 |
ff_vp3dsp_init_x86(c, flags); |
9314 |
if (ARCH_MIPS) |
9315 |
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h |
9316 |
index 3b849ec..a01bfd4 100644 |
9317 |
--- a/libavcodec/vp3dsp.h |
9318 |
+++ b/libavcodec/vp3dsp.h |
9319 |
@@ -56,6 +56,7 @@ void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block); |
9320 |
void ff_vp3dsp_init(VP3DSPContext *c, int flags); |
9321 |
void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags); |
9322 |
void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags); |
9323 |
+void ff_vp3dsp_init_e2k(VP3DSPContext *c, int flags); |
9324 |
void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags); |
9325 |
void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags); |
9326 |
|
9327 |
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c |
9328 |
index 4ff63d0..23c9db0 100644 |
9329 |
--- a/libavcodec/vp8dsp.c |
9330 |
+++ b/libavcodec/vp8dsp.c |
9331 |
@@ -679,6 +679,8 @@ av_cold void ff_vp78dsp_init(VP8DSPContext *dsp) |
9332 |
ff_vp78dsp_init_arm(dsp); |
9333 |
if (ARCH_PPC) |
9334 |
ff_vp78dsp_init_ppc(dsp); |
9335 |
+ if (ARCH_E2K) |
9336 |
+ ff_vp78dsp_init_e2k(dsp); |
9337 |
if (ARCH_X86) |
9338 |
ff_vp78dsp_init_x86(dsp); |
9339 |
} |
9340 |
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h |
9341 |
index cfe1524..be4f412 100644 |
9342 |
--- a/libavcodec/vp8dsp.h |
9343 |
+++ b/libavcodec/vp8dsp.h |
9344 |
@@ -94,6 +94,7 @@ void ff_vp78dsp_init(VP8DSPContext *c); |
9345 |
void ff_vp78dsp_init_aarch64(VP8DSPContext *c); |
9346 |
void ff_vp78dsp_init_arm(VP8DSPContext *c); |
9347 |
void ff_vp78dsp_init_ppc(VP8DSPContext *c); |
9348 |
+void ff_vp78dsp_init_e2k(VP8DSPContext *c); |
9349 |
void ff_vp78dsp_init_x86(VP8DSPContext *c); |
9350 |
|
9351 |
void ff_vp8dsp_init(VP8DSPContext *c); |
9352 |
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c |
9353 |
index f6d73f7..03df9c0 100644 |
9354 |
--- a/libavcodec/vp9dsp.c |
9355 |
+++ b/libavcodec/vp9dsp.c |
9356 |
@@ -95,5 +95,6 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact) |
9357 |
if (ARCH_AARCH64) ff_vp9dsp_init_aarch64(dsp, bpp); |
9358 |
if (ARCH_ARM) ff_vp9dsp_init_arm(dsp, bpp); |
9359 |
if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact); |
9360 |
+ if (ARCH_E2K) ff_vp9dsp_init_e2k(dsp, bpp, bitexact); |
9361 |
if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp); |
9362 |
} |
9363 |
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h |
9364 |
index e225631..f6dfe01 100644 |
9365 |
--- a/libavcodec/vp9dsp.h |
9366 |
+++ b/libavcodec/vp9dsp.h |
9367 |
@@ -131,6 +131,7 @@ void ff_vp9dsp_init_12(VP9DSPContext *dsp); |
9368 |
void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp); |
9369 |
void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp); |
9370 |
void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact); |
9371 |
+void ff_vp9dsp_init_e2k(VP9DSPContext *dsp, int bpp, int bitexact); |
9372 |
void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp); |
9373 |
|
9374 |
#endif /* AVCODEC_VP9DSP_H */ |
9375 |
diff --git a/libavutil/cpu.c b/libavutil/cpu.c |
9376 |
index 6548cc3..78e3f79 100644 |
9377 |
--- a/libavutil/cpu.c |
9378 |
+++ b/libavutil/cpu.c |
9379 |
@@ -57,6 +57,8 @@ static int get_cpu_flags(void) |
9380 |
return ff_get_cpu_flags_arm(); |
9381 |
if (ARCH_PPC) |
9382 |
return ff_get_cpu_flags_ppc(); |
9383 |
+ if (ARCH_E2K) |
9384 |
+ return ff_get_cpu_flags_e2k(); |
9385 |
if (ARCH_X86) |
9386 |
return ff_get_cpu_flags_x86(); |
9387 |
return 0; |
9388 |
@@ -132,6 +134,8 @@ int av_parse_cpu_flags(const char *s) |
9389 |
{ "flags" , NULL, 0, AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT64_MIN, INT64_MAX, .unit = "flags" }, |
9390 |
#if ARCH_PPC |
9391 |
{ "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ALTIVEC }, .unit = "flags" }, |
9392 |
+#elif ARCH_E2K |
9393 |
+ { "e2k" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_E2K }, .unit = "flags" }, |
9394 |
#elif ARCH_X86 |
9395 |
{ "mmx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX }, .unit = "flags" }, |
9396 |
{ "mmxext" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_MMXEXT }, .unit = "flags" }, |
9397 |
@@ -194,6 +198,8 @@ int av_parse_cpu_caps(unsigned *flags, const char *s) |
9398 |
{ "flags" , NULL, 0, AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT64_MIN, INT64_MAX, .unit = "flags" }, |
9399 |
#if ARCH_PPC |
9400 |
{ "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ALTIVEC }, .unit = "flags" }, |
9401 |
+#elif ARCH_E2K |
9402 |
+ { "e2k" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_E2K }, .unit = "flags" }, |
9403 |
#elif ARCH_X86 |
9404 |
{ "mmx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX }, .unit = "flags" }, |
9405 |
{ "mmx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX2 }, .unit = "flags" }, |
9406 |
@@ -314,6 +320,8 @@ size_t av_cpu_max_align(void) |
9407 |
return ff_get_cpu_max_align_arm(); |
9408 |
if (ARCH_PPC) |
9409 |
return ff_get_cpu_max_align_ppc(); |
9410 |
+ if (ARCH_E2K) |
9411 |
+ return ff_get_cpu_max_align_e2k(); |
9412 |
if (ARCH_X86) |
9413 |
return ff_get_cpu_max_align_x86(); |
9414 |
|
9415 |
diff --git a/libavutil/cpu.h b/libavutil/cpu.h |
9416 |
index 8bb9eb6..537c6db 100644 |
9417 |
--- a/libavutil/cpu.h |
9418 |
+++ b/libavutil/cpu.h |
9419 |
@@ -61,6 +61,8 @@ |
9420 |
#define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 |
9421 |
#define AV_CPU_FLAG_POWER8 0x0004 ///< ISA 2.07 |
9422 |
|
9423 |
+#define AV_CPU_FLAG_E2K 0x0001 |
9424 |
+ |
9425 |
#define AV_CPU_FLAG_ARMV5TE (1 << 0) |
9426 |
#define AV_CPU_FLAG_ARMV6 (1 << 1) |
9427 |
#define AV_CPU_FLAG_ARMV6T2 (1 << 2) |
9428 |
diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h |
9429 |
index 37122d1..d40e28d 100644 |
9430 |
--- a/libavutil/cpu_internal.h |
9431 |
+++ b/libavutil/cpu_internal.h |
9432 |
@@ -44,11 +44,13 @@ |
9433 |
int ff_get_cpu_flags_aarch64(void); |
9434 |
int ff_get_cpu_flags_arm(void); |
9435 |
int ff_get_cpu_flags_ppc(void); |
9436 |
+int ff_get_cpu_flags_e2k(void); |
9437 |
int ff_get_cpu_flags_x86(void); |
9438 |
|
9439 |
size_t ff_get_cpu_max_align_aarch64(void); |
9440 |
size_t ff_get_cpu_max_align_arm(void); |
9441 |
size_t ff_get_cpu_max_align_ppc(void); |
9442 |
+size_t ff_get_cpu_max_align_e2k(void); |
9443 |
size_t ff_get_cpu_max_align_x86(void); |
9444 |
|
9445 |
#endif /* AVUTIL_CPU_INTERNAL_H */ |
9446 |
diff --git a/libavutil/e2k/Makefile b/libavutil/e2k/Makefile |
9447 |
new file mode 100644 |
9448 |
index 0000000..67892b4 |
9449 |
--- /dev/null |
9450 |
+++ b/libavutil/e2k/Makefile |
9451 |
@@ -0,0 +1,2 @@ |
9452 |
+OBJS += e2k/cpu.o \ |
9453 |
+ e2k/float_dsp.o |
9454 |
diff --git a/libavutil/e2k/cpu.c b/libavutil/e2k/cpu.c |
9455 |
new file mode 100644 |
9456 |
index 0000000..6e52faa |
9457 |
--- /dev/null |
9458 |
+++ b/libavutil/e2k/cpu.c |
9459 |
@@ -0,0 +1,41 @@ |
9460 |
+/* |
9461 |
+ * This file is part of FFmpeg. |
9462 |
+ * |
9463 |
+ * FFmpeg is free software; you can redistribute it and/or |
9464 |
+ * modify it under the terms of the GNU Lesser General Public |
9465 |
+ * License as published by the Free Software Foundation; either |
9466 |
+ * version 2.1 of the License, or (at your option) any later version. |
9467 |
+ * |
9468 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9469 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9470 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9471 |
+ * Lesser General Public License for more details. |
9472 |
+ * |
9473 |
+ * You should have received a copy of the GNU Lesser General Public |
9474 |
+ * License along with FFmpeg; if not, write to the Free Software |
9475 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9476 |
+ */ |
9477 |
+ |
9478 |
+#include "config.h" |
9479 |
+#include "libavutil/avassert.h" |
9480 |
+#include "libavutil/cpu.h" |
9481 |
+#include "libavutil/cpu_internal.h" |
9482 |
+ |
9483 |
+int ff_get_cpu_flags_e2k(void) |
9484 |
+{ |
9485 |
+#if HAVE_E2K |
9486 |
+ return AV_CPU_FLAG_E2K; |
9487 |
+#else |
9488 |
+ return 0; |
9489 |
+#endif |
9490 |
+} |
9491 |
+ |
9492 |
+size_t ff_get_cpu_max_align_e2k(void) |
9493 |
+{ |
9494 |
+ int flags = av_get_cpu_flags(); |
9495 |
+ |
9496 |
+ if (flags & AV_CPU_FLAG_E2K) |
9497 |
+ return 16; |
9498 |
+ |
9499 |
+ return 8; |
9500 |
+} |
9501 |
diff --git a/libavutil/e2k/cpu.h b/libavutil/e2k/cpu.h |
9502 |
new file mode 100644 |
9503 |
index 0000000..e9a3d66 |
9504 |
--- /dev/null |
9505 |
+++ b/libavutil/e2k/cpu.h |
9506 |
@@ -0,0 +1,27 @@ |
9507 |
+/* |
9508 |
+ * This file is part of FFmpeg. |
9509 |
+ * |
9510 |
+ * FFmpeg is free software; you can redistribute it and/or |
9511 |
+ * modify it under the terms of the GNU Lesser General Public |
9512 |
+ * License as published by the Free Software Foundation; either |
9513 |
+ * version 2.1 of the License, or (at your option) any later version. |
9514 |
+ * |
9515 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9516 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9517 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9518 |
+ * Lesser General Public License for more details. |
9519 |
+ * |
9520 |
+ * You should have received a copy of the GNU Lesser General Public |
9521 |
+ * License along with FFmpeg; if not, write to the Free Software |
9522 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9523 |
+ */ |
9524 |
+ |
9525 |
+#ifndef AVUTIL_E2K_CPU_H |
9526 |
+#define AVUTIL_E2K_CPU_H |
9527 |
+ |
9528 |
+#include "libavutil/cpu.h" |
9529 |
+#include "libavutil/cpu_internal.h" |
9530 |
+ |
9531 |
+#define E2K_BASE(flags) CPUEXT(flags, E2K) |
9532 |
+ |
9533 |
+#endif /* AVUTIL_E2K_CPU_H */ |
9534 |
diff --git a/libavutil/e2k/float_dsp.c b/libavutil/e2k/float_dsp.c |
9535 |
new file mode 100644 |
9536 |
index 0000000..dfecdab |
9537 |
--- /dev/null |
9538 |
+++ b/libavutil/e2k/float_dsp.c |
9539 |
@@ -0,0 +1,188 @@ |
9540 |
+/* |
9541 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
9542 |
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> |
9543 |
+ * |
9544 |
+ * This file is part of FFmpeg. |
9545 |
+ * |
9546 |
+ * FFmpeg is free software; you can redistribute it and/or |
9547 |
+ * modify it under the terms of the GNU Lesser General Public |
9548 |
+ * License as published by the Free Software Foundation; either |
9549 |
+ * version 2.1 of the License, or (at your option) any later version. |
9550 |
+ * |
9551 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9552 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9553 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9554 |
+ * Lesser General Public License for more details. |
9555 |
+ * |
9556 |
+ * You should have received a copy of the GNU Lesser General Public |
9557 |
+ * License along with FFmpeg; if not, write to the Free Software |
9558 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9559 |
+ */ |
9560 |
+ |
9561 |
+#include "config.h" |
9562 |
+#include "libavutil/attributes.h" |
9563 |
+#include "libavutil/cpu.h" |
9564 |
+#include "libavutil/float_dsp.h" |
9565 |
+#include "libavutil/e2k/cpu.h" |
9566 |
+#include "util_e2k.h" |
9567 |
+ |
9568 |
+static void ff_vector_fmul_e2k(float *dst, |
9569 |
+ const float *src0, const float *src1, |
9570 |
+ int len) |
9571 |
+{ |
9572 |
+ int i = 0; |
9573 |
+ __m128 d0, d1; |
9574 |
+ |
9575 |
+ PRAGMA_E2K("ivdep") |
9576 |
+ for (; i < len; i += 8) { |
9577 |
+ d0 = _mm_load_ps(src0 + i); |
9578 |
+ d1 = _mm_load_ps(src0 + i + 4); |
9579 |
+ d0 = _mm_mul_ps(d0, _mm_load_ps(src1 + i)); |
9580 |
+ d1 = _mm_mul_ps(d1, _mm_load_ps(src1 + i + 4)); |
9581 |
+ _mm_store_ps(dst + i, d0); |
9582 |
+ _mm_store_ps(dst + i + 4, d1); |
9583 |
+ } |
9584 |
+} |
9585 |
+ |
9586 |
+static void vector_fmac_scalar_e2k(float *dst, const float *src, float mul, |
9587 |
+ int len) |
9588 |
+{ |
9589 |
+ int i = 0; |
9590 |
+ __m128 d0, d1, d2 = _mm_set1_ps(mul); |
9591 |
+ |
9592 |
+ PRAGMA_E2K("ivdep") |
9593 |
+ for (; i < len; i += 8) { |
9594 |
+ d0 = _mm_load_ps(src + i); |
9595 |
+ d1 = _mm_load_ps(src + i + 4); |
9596 |
+ d0 = _mm_mul_ps(d0, d2); |
9597 |
+ d1 = _mm_mul_ps(d1, d2); |
9598 |
+ d0 = _mm_add_ps(d0, _mm_load_ps(dst + i)); |
9599 |
+ d1 = _mm_add_ps(d1, _mm_load_ps(dst + i + 4)); |
9600 |
+ _mm_store_ps(dst + i, d0); |
9601 |
+ _mm_store_ps(dst + i + 4, d1); |
9602 |
+ } |
9603 |
+} |
9604 |
+ |
9605 |
+static void vector_fmul_scalar_e2k(float *dst, const float *src, float mul, |
9606 |
+ int len) |
9607 |
+{ |
9608 |
+ int i = 0; |
9609 |
+ __m128 d0, d1, d2 = _mm_set1_ps(mul); |
9610 |
+ |
9611 |
+ PRAGMA_E2K("ivdep") |
9612 |
+ for (; i < len - 4; i += 8) { |
9613 |
+ d0 = _mm_load_ps(src + i); |
9614 |
+ d1 = _mm_load_ps(src + i + 4); |
9615 |
+ d0 = _mm_mul_ps(d0, d2); |
9616 |
+ d1 = _mm_mul_ps(d1, d2); |
9617 |
+ _mm_store_ps(dst + i, d0); |
9618 |
+ _mm_store_ps(dst + i + 4, d1); |
9619 |
+ } |
9620 |
+ if (i < len) { |
9621 |
+ d0 = _mm_load_ps(src + i); |
9622 |
+ d0 = _mm_mul_ps(d0, d2); |
9623 |
+ _mm_store_ps(dst + i, d0); |
9624 |
+ } |
9625 |
+} |
9626 |
+ |
9627 |
+static void ff_vector_fmul_window_e2k(float *dst, const float *src0, |
9628 |
+ const float *src1, const float *win, |
9629 |
+ int len) |
9630 |
+{ |
9631 |
+ __m128 t0, t1, s0, s1, wi, wj; |
9632 |
+ int i, j; |
9633 |
+ |
9634 |
+ dst += len; |
9635 |
+ win += len; |
9636 |
+ src0 += len; |
9637 |
+ |
9638 |
+ PRAGMA_E2K("ivdep") |
9639 |
+ for (i = -len, j = len - 4; i < 0; i += 4, j -= 4) { |
9640 |
+ s0 = _mm_load_ps(src0 + i); |
9641 |
+ s1 = _mm_load_ps(src1 + j); |
9642 |
+ wi = _mm_load_ps(win + i); |
9643 |
+ wj = _mm_load_ps(win + j); |
9644 |
+ |
9645 |
+ s1 = _mm_shuffle_ps(s1, s1, 0x1b); |
9646 |
+ wj = _mm_shuffle_ps(wj, wj, 0x1b); |
9647 |
+ |
9648 |
+ t0 = _mm_mul_ps(s0, wj); |
9649 |
+ t1 = _mm_mul_ps(s0, wi); |
9650 |
+ t0 = _mm_sub_ps(t0, _mm_mul_ps(s1, wi)); |
9651 |
+ t1 = _mm_add_ps(t1, _mm_mul_ps(s1, wj)); |
9652 |
+ t1 = _mm_shuffle_ps(t1, t1, 0x1b); |
9653 |
+ |
9654 |
+ _mm_store_ps(dst + i, t0); |
9655 |
+ _mm_store_ps(dst + j, t1); |
9656 |
+ } |
9657 |
+} |
9658 |
+ |
9659 |
+static void ff_vector_fmul_add_e2k(float *dst, const float *src0, |
9660 |
+ const float *src1, const float *src2, |
9661 |
+ int len) |
9662 |
+{ |
9663 |
+ int i; |
9664 |
+ __m128 d, s0, s1, s2; |
9665 |
+ |
9666 |
+ PRAGMA_E2K("ivdep") |
9667 |
+ for (i = 0; i < len; i += 4) { |
9668 |
+ s0 = _mm_load_ps(src0 + i); |
9669 |
+ s1 = _mm_load_ps(src1 + i); |
9670 |
+ s2 = _mm_load_ps(src2 + i); |
9671 |
+ d = _mm_add_ps(_mm_mul_ps(s0, s1), s2); |
9672 |
+ _mm_store_ps(dst + i, d); |
9673 |
+ } |
9674 |
+} |
9675 |
+ |
9676 |
+static void ff_vector_fmul_reverse_e2k(float *dst, const float *src0, |
9677 |
+ const float *src1, int len) |
9678 |
+{ |
9679 |
+ int i; |
9680 |
+ __m128 s0, s1, s2, s3; |
9681 |
+ src1 += len - 4; |
9682 |
+ |
9683 |
+ PRAGMA_E2K("ivdep") |
9684 |
+ for (i = 0; i < len; i += 8) { |
9685 |
+ s1 = _mm_load_ps(src1 - i); |
9686 |
+ s0 = _mm_load_ps(src0 + i); |
9687 |
+ s3 = _mm_load_ps(src1 - i - 4); |
9688 |
+ s2 = _mm_load_ps(src0 + i + 4); |
9689 |
+ s1 = _mm_shuffle_ps(s1, s1, 0x1b); |
9690 |
+ s3 = _mm_shuffle_ps(s3, s3, 0x1b); |
9691 |
+ s0 = _mm_mul_ps(s0, s1); |
9692 |
+ s2 = _mm_mul_ps(s2, s3); |
9693 |
+ _mm_store_ps(dst + i, s0); |
9694 |
+ _mm_store_ps(dst + i + 4, s2); |
9695 |
+ } |
9696 |
+} |
9697 |
+ |
9698 |
+static void butterflies_float_e2k(float *av_restrict src0, |
9699 |
+ float *av_restrict src1, int len) |
9700 |
+{ |
9701 |
+ int i; |
9702 |
+ __m128 s0, s1, s2; |
9703 |
+ |
9704 |
+ PRAGMA_E2K("ivdep") |
9705 |
+ for (i = 0; i < len; i += 4) { |
9706 |
+ s0 = _mm_load_ps(src0 + i); |
9707 |
+ s1 = _mm_load_ps(src1 + i); |
9708 |
+ s2 = _mm_sub_ps(s0, s1); |
9709 |
+ s0 = _mm_add_ps(s0, s1); |
9710 |
+ _mm_store_ps(src1 + i, s2); |
9711 |
+ _mm_store_ps(src0 + i, s0); |
9712 |
+ } |
9713 |
+} |
9714 |
+ |
9715 |
+av_cold void ff_float_dsp_init_e2k(AVFloatDSPContext *fdsp, int bit_exact) |
9716 |
+{ |
9717 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
9718 |
+ return; |
9719 |
+ |
9720 |
+ fdsp->vector_fmul = ff_vector_fmul_e2k; |
9721 |
+ fdsp->vector_fmac_scalar = vector_fmac_scalar_e2k; |
9722 |
+ fdsp->vector_fmul_scalar = vector_fmul_scalar_e2k; |
9723 |
+ fdsp->vector_fmul_window = ff_vector_fmul_window_e2k; |
9724 |
+ fdsp->vector_fmul_add = ff_vector_fmul_add_e2k; |
9725 |
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_e2k; |
9726 |
+ fdsp->butterflies_float = butterflies_float_e2k; |
9727 |
+} |
9728 |
diff --git a/libavutil/e2k/intreadwrite.h b/libavutil/e2k/intreadwrite.h |
9729 |
new file mode 100644 |
9730 |
index 0000000..0387475 |
9731 |
--- /dev/null |
9732 |
+++ b/libavutil/e2k/intreadwrite.h |
9733 |
@@ -0,0 +1,54 @@ |
9734 |
+/* |
9735 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
9736 |
+ * |
9737 |
+ * This file is part of FFmpeg. |
9738 |
+ * |
9739 |
+ * FFmpeg is free software; you can redistribute it and/or |
9740 |
+ * modify it under the terms of the GNU Lesser General Public |
9741 |
+ * License as published by the Free Software Foundation; either |
9742 |
+ * version 2.1 of the License, or (at your option) any later version. |
9743 |
+ * |
9744 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9745 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9746 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9747 |
+ * Lesser General Public License for more details. |
9748 |
+ * |
9749 |
+ * You should have received a copy of the GNU Lesser General Public |
9750 |
+ * License along with FFmpeg; if not, write to the Free Software |
9751 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9752 |
+ */ |
9753 |
+ |
9754 |
+#ifndef AVUTIL_E2K_INTREADWRITE_H |
9755 |
+#define AVUTIL_E2K_INTREADWRITE_H |
9756 |
+ |
9757 |
+#include <stdint.h> |
9758 |
+#include "config.h" |
9759 |
+ |
9760 |
+#include <x86intrin.h> |
9761 |
+ |
9762 |
+#define AV_RB32 av_read_bswap32 |
9763 |
+#define AV_WB32 av_write_bswap32 |
9764 |
+#define AV_RB64 av_read_bswap64 |
9765 |
+#define AV_WB64 av_write_bswap64 |
9766 |
+ |
9767 |
+static av_always_inline uint32_t av_read_bswap32(const void *p) |
9768 |
+{ |
9769 |
+ return _bswap(*(const uint32_t*)p); |
9770 |
+} |
9771 |
+ |
9772 |
+static av_always_inline void av_write_bswap32(void *p, uint32_t v) |
9773 |
+{ |
9774 |
+ *(uint32_t*)p = _bswap(v); |
9775 |
+} |
9776 |
+ |
9777 |
+static av_always_inline uint64_t av_read_bswap64(const void *p) |
9778 |
+{ |
9779 |
+ return _bswap64(*(const uint64_t*)p); |
9780 |
+} |
9781 |
+ |
9782 |
+static av_always_inline void av_write_bswap64(void *p, uint64_t v) |
9783 |
+{ |
9784 |
+ *(uint64_t*)p = _bswap64(v); |
9785 |
+} |
9786 |
+ |
9787 |
+#endif /* AVUTIL_E2K_INTREADWRITE_H */ |
9788 |
diff --git a/libavutil/e2k/timer.h b/libavutil/e2k/timer.h |
9789 |
new file mode 100644 |
9790 |
index 0000000..ea78175 |
9791 |
--- /dev/null |
9792 |
+++ b/libavutil/e2k/timer.h |
9793 |
@@ -0,0 +1,35 @@ |
9794 |
+/* |
9795 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
9796 |
+ * |
9797 |
+ * This file is part of FFmpeg. |
9798 |
+ * |
9799 |
+ * FFmpeg is free software; you can redistribute it and/or |
9800 |
+ * modify it under the terms of the GNU Lesser General Public |
9801 |
+ * License as published by the Free Software Foundation; either |
9802 |
+ * version 2.1 of the License, or (at your option) any later version. |
9803 |
+ * |
9804 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9805 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9806 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9807 |
+ * Lesser General Public License for more details. |
9808 |
+ * |
9809 |
+ * You should have received a copy of the GNU Lesser General Public |
9810 |
+ * License along with FFmpeg; if not, write to the Free Software |
9811 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9812 |
+ */ |
9813 |
+ |
9814 |
+#ifndef AVUTIL_E2K_TIMER_H |
9815 |
+#define AVUTIL_E2K_TIMER_H |
9816 |
+ |
9817 |
+#include <stdint.h> |
9818 |
+#include <x86intrin.h> |
9819 |
+ |
9820 |
+#define AV_READ_TIME read_time |
9821 |
+ |
9822 |
+static inline uint64_t read_time(void) |
9823 |
+{ |
9824 |
+ unsigned aux; |
9825 |
+ return __rdtscp(&aux); |
9826 |
+} |
9827 |
+ |
9828 |
+#endif /* AVUTIL_E2K_TIMER_H */ |
9829 |
diff --git a/libavutil/e2k/util_e2k.h b/libavutil/e2k/util_e2k.h |
9830 |
new file mode 100644 |
9831 |
index 0000000..f5cea7c |
9832 |
--- /dev/null |
9833 |
+++ b/libavutil/e2k/util_e2k.h |
9834 |
@@ -0,0 +1,146 @@ |
9835 |
+/* |
9836 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
9837 |
+ * |
9838 |
+ * This file is part of FFmpeg. |
9839 |
+ * |
9840 |
+ * FFmpeg is free software; you can redistribute it and/or |
9841 |
+ * modify it under the terms of the GNU Lesser General Public |
9842 |
+ * License as published by the Free Software Foundation; either |
9843 |
+ * version 2.1 of the License, or (at your option) any later version. |
9844 |
+ * |
9845 |
+ * FFmpeg is distributed in the hope that it will be useful, |
9846 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9847 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
9848 |
+ * Lesser General Public License for more details. |
9849 |
+ * |
9850 |
+ * You should have received a copy of the GNU Lesser General Public |
9851 |
+ * License along with FFmpeg; if not, write to the Free Software |
9852 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9853 |
+ */ |
9854 |
+ |
9855 |
+/** |
9856 |
+ * @file |
9857 |
+ * Contains misc utility macros and inline functions |
9858 |
+ */ |
9859 |
+ |
9860 |
+#ifndef AVUTIL_E2K_UTIL_E2K_H |
9861 |
+#define AVUTIL_E2K_UTIL_E2K_H |
9862 |
+ |
9863 |
+#include <stdint.h> |
9864 |
+#include "config.h" |
9865 |
+#include <smmintrin.h> /* SSE4.1 */ |
9866 |
+ |
9867 |
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline |
9868 |
+#define ALIGNED(n) __attribute__((aligned(n))) |
9869 |
+ |
9870 |
+#ifdef __e2k__ |
9871 |
+#define PRAGMA_E2K _Pragma |
9872 |
+#define _mm_shuffle2_pi8(a, b, c) \ |
9873 |
+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c))) |
9874 |
+#define _mm_shuffle2_epi8(a, b, c) \ |
9875 |
+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c))) |
9876 |
+#define _mm_blendv_pi8(a, b, c) \ |
9877 |
+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c))) |
9878 |
+#else |
9879 |
+#define PRAGMA_E2K(x) |
9880 |
+#define _mm_shuffle2_pi8(a, b, c) \ |
9881 |
+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \ |
9882 |
+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c))) |
9883 |
+#define _mm_shuffle2_epi8(a, b, c) \ |
9884 |
+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \ |
9885 |
+ _mm_slli_epi16(c, 3)) |
9886 |
+#define _mm_blendv_pi8(a, b, c) \ |
9887 |
+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \ |
9888 |
+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c))) |
9889 |
+ |
9890 |
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) { |
9891 |
+ int n = b & 63; |
9892 |
+ a = a >> n | a << (64 - n); |
9893 |
+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63))); |
9894 |
+} |
9895 |
+#endif |
9896 |
+ |
9897 |
+#define _mm_extract_pi32(a, b) _mm_extract_epi32(_mm_movpi64_epi64(a), b) |
9898 |
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) |
9899 |
+ |
9900 |
+#define _mm_unpacklo_ps2(a, b) _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(a), _mm_castps_pd(b))); |
9901 |
+#define _mm_unpackhi_ps2(a, b) _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(a), _mm_castps_pd(b))); |
9902 |
+#define _mm_alignr_ps(a, b, n) _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), (n) * 4)) |
9903 |
+#define _mm_bsrli_ps(a, n) _mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(a), (n) * 4)) |
9904 |
+ |
9905 |
+/*********************************************************************** |
9906 |
+ * Vector types |
9907 |
+ **********************************************************************/ |
9908 |
+#define vec_u8 __m128i |
9909 |
+#define vec_s8 __m128i |
9910 |
+#define vec_u16 __m128i |
9911 |
+#define vec_s16 __m128i |
9912 |
+#define vec_u32 __m128i |
9913 |
+#define vec_s32 __m128i |
9914 |
+#define vec_f __m128 |
9915 |
+ |
9916 |
+/*********************************************************************** |
9917 |
+ * Null vector |
9918 |
+ **********************************************************************/ |
9919 |
+#define LOAD_ZERO const __m128i zerov = _mm_setzero_si128() |
9920 |
+ |
9921 |
+// Transpose 8x8 matrix of 16-bit elements (in-place) |
9922 |
+#define TRANSPOSE8(a0, a1, a2, a3, a4, a5, a6, a7) \ |
9923 |
+do { \ |
9924 |
+ vec_s16 _b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7; \ |
9925 |
+ vec_s16 _c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7; \ |
9926 |
+ _b0 = _mm_unpacklo_epi16(a0, a2); \ |
9927 |
+ _b1 = _mm_unpackhi_epi16(a0, a2); \ |
9928 |
+ _b2 = _mm_unpacklo_epi16(a1, a3); \ |
9929 |
+ _b3 = _mm_unpackhi_epi16(a1, a3); \ |
9930 |
+ _b4 = _mm_unpacklo_epi16(a4, a6); \ |
9931 |
+ _b5 = _mm_unpackhi_epi16(a4, a6); \ |
9932 |
+ _b6 = _mm_unpacklo_epi16(a5, a7); \ |
9933 |
+ _b7 = _mm_unpackhi_epi16(a5, a7); \ |
9934 |
+ \ |
9935 |
+ _c0 = _mm_unpacklo_epi16(_b0, _b2); \ |
9936 |
+ _c1 = _mm_unpackhi_epi16(_b0, _b2); \ |
9937 |
+ _c2 = _mm_unpacklo_epi16(_b1, _b3); \ |
9938 |
+ _c3 = _mm_unpackhi_epi16(_b1, _b3); \ |
9939 |
+ _c4 = _mm_unpacklo_epi16(_b4, _b6); \ |
9940 |
+ _c5 = _mm_unpackhi_epi16(_b4, _b6); \ |
9941 |
+ _c6 = _mm_unpacklo_epi16(_b5, _b7); \ |
9942 |
+ _c7 = _mm_unpackhi_epi16(_b5, _b7); \ |
9943 |
+ \ |
9944 |
+ a0 = _mm_unpacklo_epi64(_c0, _c4); \ |
9945 |
+ a1 = _mm_unpackhi_epi64(_c0, _c4); \ |
9946 |
+ a2 = _mm_unpacklo_epi64(_c1, _c5); \ |
9947 |
+ a3 = _mm_unpackhi_epi64(_c1, _c5); \ |
9948 |
+ a4 = _mm_unpacklo_epi64(_c2, _c6); \ |
9949 |
+ a5 = _mm_unpackhi_epi64(_c2, _c6); \ |
9950 |
+ a6 = _mm_unpacklo_epi64(_c3, _c7); \ |
9951 |
+ a7 = _mm_unpackhi_epi64(_c3, _c7); \ |
9952 |
+} while (0) |
9953 |
+ |
9954 |
+#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) |
9955 |
+#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) |
9956 |
+#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) |
9957 |
+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) |
9958 |
+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); |
9959 |
+ |
9960 |
+#define VEC_SPLAT16(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) |
9961 |
+ |
9962 |
+#if !defined(__iset__) || __iset__ < 5 |
9963 |
+#define NEED_ALIGN8 |
9964 |
+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; |
9965 |
+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; |
9966 |
+#define ALIGN8_START(ptr, src) \ |
9967 |
+ src_shr = (intptr_t)(ptr - 1) & 7; \ |
9968 |
+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ |
9969 |
+ src##_next = src##_ptr[src_shr == 7]; \ |
9970 |
+ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ |
9971 |
+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); |
9972 |
+#define ALIGN8_READ16(v0, src, i) \ |
9973 |
+ src_tmp1 = src##_ptr[i * 2 + 1]; \ |
9974 |
+ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ |
9975 |
+ src##_next = src##_ptr[i * 2 + 2]; \ |
9976 |
+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ |
9977 |
+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); |
9978 |
+#endif |
9979 |
+ |
9980 |
+#endif /* AVUTIL_E2K_UTIL_E2K_H */ |
9981 |
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c |
9982 |
index 6e28d71..5241c3e 100644 |
9983 |
--- a/libavutil/float_dsp.c |
9984 |
+++ b/libavutil/float_dsp.c |
9985 |
@@ -156,6 +156,8 @@ av_cold AVFloatDSPContext *avpriv_float_dsp_alloc(int bit_exact) |
9986 |
ff_float_dsp_init_arm(fdsp); |
9987 |
if (ARCH_PPC) |
9988 |
ff_float_dsp_init_ppc(fdsp, bit_exact); |
9989 |
+ if (ARCH_E2K) |
9990 |
+ ff_float_dsp_init_e2k(fdsp, bit_exact); |
9991 |
if (ARCH_X86) |
9992 |
ff_float_dsp_init_x86(fdsp); |
9993 |
if (ARCH_MIPS) |
9994 |
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h |
9995 |
index 9c66459..97d9d79 100644 |
9996 |
--- a/libavutil/float_dsp.h |
9997 |
+++ b/libavutil/float_dsp.h |
9998 |
@@ -205,6 +205,7 @@ float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len); |
9999 |
void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp); |
10000 |
void ff_float_dsp_init_arm(AVFloatDSPContext *fdsp); |
10001 |
void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict); |
10002 |
+void ff_float_dsp_init_e2k(AVFloatDSPContext *fdsp, int strict); |
10003 |
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp); |
10004 |
void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp); |
10005 |
|
10006 |
diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h |
10007 |
index 4c8413a..b8a698e 100644 |
10008 |
--- a/libavutil/intreadwrite.h |
10009 |
+++ b/libavutil/intreadwrite.h |
10010 |
@@ -72,6 +72,8 @@ typedef union { |
10011 |
# include "mips/intreadwrite.h" |
10012 |
#elif ARCH_PPC |
10013 |
# include "ppc/intreadwrite.h" |
10014 |
+#elif ARCH_E2K |
10015 |
+# include "e2k/intreadwrite.h" |
10016 |
#elif ARCH_TOMI |
10017 |
# include "tomi/intreadwrite.h" |
10018 |
#elif ARCH_X86 |
10019 |
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c |
10020 |
index ce45b71..21c30cf 100644 |
10021 |
--- a/libavutil/tests/cpu.c |
10022 |
+++ b/libavutil/tests/cpu.c |
10023 |
@@ -49,6 +49,8 @@ static const struct { |
10024 |
{ AV_CPU_FLAG_SETEND, "setend" }, |
10025 |
#elif ARCH_PPC |
10026 |
{ AV_CPU_FLAG_ALTIVEC, "altivec" }, |
10027 |
+#elif ARCH_E2K |
10028 |
+ { AV_CPU_FLAG_E2K, "e2k" }, |
10029 |
#elif ARCH_X86 |
10030 |
{ AV_CPU_FLAG_MMX, "mmx" }, |
10031 |
{ AV_CPU_FLAG_MMXEXT, "mmxext" }, |
10032 |
diff --git a/libavutil/timer.h b/libavutil/timer.h |
10033 |
index 0bb353c..cc0c282 100644 |
10034 |
--- a/libavutil/timer.h |
10035 |
+++ b/libavutil/timer.h |
10036 |
@@ -54,6 +54,8 @@ |
10037 |
# include "arm/timer.h" |
10038 |
#elif ARCH_PPC |
10039 |
# include "ppc/timer.h" |
10040 |
+#elif ARCH_E2K |
10041 |
+# include "e2k/timer.h" |
10042 |
#elif ARCH_X86 |
10043 |
# include "x86/timer.h" |
10044 |
#endif |
10045 |
diff --git a/libswresample/audioconvert.c b/libswresample/audioconvert.c |
10046 |
index d21fc8e..96cd701 100644 |
10047 |
--- a/libswresample/audioconvert.c |
10048 |
+++ b/libswresample/audioconvert.c |
10049 |
@@ -179,6 +179,7 @@ AudioConvert *swri_audio_convert_alloc(enum AVSampleFormat out_fmt, |
10050 |
if(HAVE_X86ASM && HAVE_MMX) swri_audio_convert_init_x86(ctx, out_fmt, in_fmt, channels); |
10051 |
if(ARCH_ARM) swri_audio_convert_init_arm(ctx, out_fmt, in_fmt, channels); |
10052 |
if(ARCH_AARCH64) swri_audio_convert_init_aarch64(ctx, out_fmt, in_fmt, channels); |
10053 |
+ if(HAVE_E2K) swri_audio_convert_init_e2k(ctx, out_fmt, in_fmt, channels); |
10054 |
|
10055 |
return ctx; |
10056 |
} |
10057 |
diff --git a/libswresample/e2k/Makefile b/libswresample/e2k/Makefile |
10058 |
new file mode 100644 |
10059 |
index 0000000..a90ab9e |
10060 |
--- /dev/null |
10061 |
+++ b/libswresample/e2k/Makefile |
10062 |
@@ -0,0 +1 @@ |
10063 |
+OBJS += e2k/audio_convert.o |
10064 |
diff --git a/libswresample/e2k/audio_convert.c b/libswresample/e2k/audio_convert.c |
10065 |
new file mode 100644 |
10066 |
index 0000000..d3577c8 |
10067 |
--- /dev/null |
10068 |
+++ b/libswresample/e2k/audio_convert.c |
10069 |
@@ -0,0 +1,110 @@ |
10070 |
+/* |
10071 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
10072 |
+ * |
10073 |
+ * This file is part of FFmpeg. |
10074 |
+ * |
10075 |
+ * FFmpeg is free software; you can redistribute it and/or |
10076 |
+ * modify it under the terms of the GNU Lesser General Public |
10077 |
+ * License as published by the Free Software Foundation; either |
10078 |
+ * version 2.1 of the License, or (at your option) any later version. |
10079 |
+ * |
10080 |
+ * FFmpeg is distributed in the hope that it will be useful, |
10081 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10082 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
10083 |
+ * Lesser General Public License for more details. |
10084 |
+ * |
10085 |
+ * You should have received a copy of the GNU Lesser General Public |
10086 |
+ * License along with FFmpeg; if not, write to the Free Software |
10087 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
10088 |
+ */ |
10089 |
+ |
10090 |
+#include "config.h" |
10091 |
+#include "libavutil/cpu.h" |
10092 |
+#include "libavutil/e2k/cpu.h" |
10093 |
+#include "libavutil/e2k/util_e2k.h" |
10094 |
+ |
10095 |
+#include "libswresample/swresample_internal.h" |
10096 |
+#include "libswresample/audioconvert.h" |
10097 |
+ |
10098 |
+// length is aligned by 16 in "audioconvert.c" |
10099 |
+ |
10100 |
+static void conv_flt_to_s16_e2k(uint8_t **_dst, const uint8_t **_src, int len) { |
10101 |
+ const float *src = (const float*)_src[0]; |
10102 |
+ int16_t *dst = (int16_t*)_dst[0]; |
10103 |
+ int i = 0; |
10104 |
+ __m128 f0, f1, c1 = _mm_set1_ps(1 << 15); |
10105 |
+ __m128i v0, v1; |
10106 |
+ |
10107 |
+ PRAGMA_E2K("ivdep") |
10108 |
+ for (; i + 7 < len; i += 8) { |
10109 |
+ f0 = _mm_loadu_ps(src); |
10110 |
+ f1 = _mm_loadu_ps(src + 4); |
10111 |
+ v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1)); |
10112 |
+ v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1)); |
10113 |
+ v0 = _mm_packs_epi32(v0, v1); |
10114 |
+ VEC_ST(dst, v0); |
10115 |
+ src += 8; dst += 8; |
10116 |
+ } |
10117 |
+/* |
10118 |
+ PRAGMA_E2K("ivdep") |
10119 |
+ for (; i < len; i++) |
10120 |
+ *dst++ = av_clip_int16(lrintf(*src++ * (1 << 15))); |
10121 |
+*/ |
10122 |
+} |
10123 |
+ |
10124 |
+static void conv_fltp_to_s16_2ch_e2k(uint8_t **_dst, const uint8_t **_src, int len) { |
10125 |
+ const float *src0 = (const float*)_src[0]; |
10126 |
+ const float *src1 = (const float*)_src[1]; |
10127 |
+ int16_t *dst = (int16_t*)_dst[0]; |
10128 |
+ int i = 0; |
10129 |
+ __m128 f0, f1, c1 = _mm_set1_ps(1 << 15); |
10130 |
+ __m128i v0, v1, v2, v3; |
10131 |
+ |
10132 |
+ PRAGMA_E2K("ivdep") |
10133 |
+ for (; i + 7 < len; i += 8) { |
10134 |
+ f0 = _mm_loadu_ps(src0); |
10135 |
+ f1 = _mm_loadu_ps(src0 + 4); |
10136 |
+ v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1)); |
10137 |
+ v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1)); |
10138 |
+ v2 = _mm_packs_epi32(v0, v1); |
10139 |
+ f0 = _mm_loadu_ps(src1); |
10140 |
+ f1 = _mm_loadu_ps(src1 + 4); |
10141 |
+ v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1)); |
10142 |
+ v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1)); |
10143 |
+ v3 = _mm_packs_epi32(v0, v1); |
10144 |
+ v0 = _mm_unpacklo_epi16(v2, v3); |
10145 |
+ v1 = _mm_unpackhi_epi16(v2, v3); |
10146 |
+ VEC_ST(dst, v0); |
10147 |
+ VEC_ST(dst + 8, v1); |
10148 |
+ src0 += 8; src1 += 8; dst += 16; |
10149 |
+ } |
10150 |
+/* |
10151 |
+ PRAGMA_E2K("ivdep") |
10152 |
+ for (; i < len; i++) { |
10153 |
+ dst[0] = av_clip_int16(lrintf(*src0++ * (1 << 15))); |
10154 |
+ dst[1] = av_clip_int16(lrintf(*src1++ * (1 << 15))); |
10155 |
+ dst += 2; |
10156 |
+ } |
10157 |
+*/ |
10158 |
+} |
10159 |
+ |
10160 |
+av_cold void swri_audio_convert_init_e2k(struct AudioConvert *ac, |
10161 |
+ enum AVSampleFormat out_fmt, |
10162 |
+ enum AVSampleFormat in_fmt, |
10163 |
+ int channels){ |
10164 |
+ |
10165 |
+ if (!E2K_BASE(av_get_cpu_flags())) |
10166 |
+ return; |
10167 |
+ |
10168 |
+ ac->simd_f = NULL; |
10169 |
+ |
10170 |
+ if (out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT || |
10171 |
+ out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP) |
10172 |
+ ac->simd_f = conv_flt_to_s16_e2k; |
10173 |
+ |
10174 |
+ if (out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLTP && channels == 2) |
10175 |
+ ac->simd_f = conv_fltp_to_s16_2ch_e2k; |
10176 |
+ |
10177 |
+ // if(ac->simd_f) ac->in_simd_align_mask = 7; |
10178 |
+} |
10179 |
+ |
10180 |
diff --git a/libswresample/swresample_internal.h b/libswresample/swresample_internal.h |
10181 |
index f2ea5a2..b7e8501 100644 |
10182 |
--- a/libswresample/swresample_internal.h |
10183 |
+++ b/libswresample/swresample_internal.h |
10184 |
@@ -218,5 +218,9 @@ void swri_audio_convert_init_x86(struct AudioConvert *ac, |
10185 |
enum AVSampleFormat out_fmt, |
10186 |
enum AVSampleFormat in_fmt, |
10187 |
int channels); |
10188 |
+void swri_audio_convert_init_e2k(struct AudioConvert *ac, |
10189 |
+ enum AVSampleFormat out_fmt, |
10190 |
+ enum AVSampleFormat in_fmt, |
10191 |
+ int channels); |
10192 |
|
10193 |
#endif |
10194 |
diff --git a/libswscale/e2k/Makefile b/libswscale/e2k/Makefile |
10195 |
new file mode 100644 |
10196 |
index 0000000..f35371d |
10197 |
--- /dev/null |
10198 |
+++ b/libswscale/e2k/Makefile |
10199 |
@@ -0,0 +1,3 @@ |
10200 |
+OBJS += e2k/swscale.o \ |
10201 |
+ e2k/yuv2rgb.o \ |
10202 |
+ e2k/yuv2yuv.o |
10203 |
diff --git a/libswscale/e2k/swscale.c b/libswscale/e2k/swscale.c |
10204 |
new file mode 100644 |
10205 |
index 0000000..24a857f |
10206 |
--- /dev/null |
10207 |
+++ b/libswscale/e2k/swscale.c |
10208 |
@@ -0,0 +1,2046 @@ |
10209 |
+/* |
10210 |
+ * Elbrus-enhanced yuv2yuvX |
10211 |
+ * |
10212 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
10213 |
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> |
10214 |
+ * based on the equivalent C code in swscale.c |
10215 |
+ * |
10216 |
+ * This file is part of FFmpeg. |
10217 |
+ * |
10218 |
+ * FFmpeg is free software; you can redistribute it and/or |
10219 |
+ * modify it under the terms of the GNU Lesser General Public |
10220 |
+ * License as published by the Free Software Foundation; either |
10221 |
+ * version 2.1 of the License, or (at your option) any later version. |
10222 |
+ * |
10223 |
+ * FFmpeg is distributed in the hope that it will be useful, |
10224 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10225 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
10226 |
+ * Lesser General Public License for more details. |
10227 |
+ * |
10228 |
+ * You should have received a copy of the GNU Lesser General Public |
10229 |
+ * License along with FFmpeg; if not, write to the Free Software |
10230 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
10231 |
+ */ |
10232 |
+ |
10233 |
+#include <inttypes.h> |
10234 |
+ |
10235 |
+#include "config.h" |
10236 |
+#include "libswscale/swscale.h" |
10237 |
+#include "libswscale/swscale_internal.h" |
10238 |
+#include "libavutil/attributes.h" |
10239 |
+#include "libavutil/cpu.h" |
10240 |
+#include "libavutil/e2k/util_e2k.h" |
10241 |
+ |
10242 |
+#include "yuv2rgb.h" |
10243 |
+ |
10244 |
+static void yuv2planeX_8_e2k(const int16_t *filter, int filterSize, |
10245 |
+ const int16_t **src, uint8_t *dest, int dstW, |
10246 |
+ const uint8_t *dither, int offset) |
10247 |
+{ |
10248 |
+ int i = 0, j; |
10249 |
+ __m64 h0; |
10250 |
+ __m128i d0, d1, zerov = _mm_setzero_si128(); |
10251 |
+ h0 = (__m64)__builtin_e2k_insfd(*(uint64_t*)dither, ((offset + i) & 7) * 8, 0); |
10252 |
+ d1 = _mm_unpacklo_epi8(_mm_movpi64_epi64(h0), zerov); |
10253 |
+ d0 = _mm_unpacklo_epi16(d1, zerov); |
10254 |
+ d1 = _mm_unpackhi_epi16(d1, zerov); |
10255 |
+ d0 = _mm_slli_epi32(d0, 12); |
10256 |
+ d1 = _mm_slli_epi32(d1, 12); |
10257 |
+ |
10258 |
+ for (; i < dstW - 15; i += 16) { |
10259 |
+ __m128i r0, r1, r2, r3, v0, v1, v2, v3; |
10260 |
+ |
10261 |
+ r2 = r0 = d0; |
10262 |
+ r3 = r1 = d1; |
10263 |
+ for (j = 0; j < filterSize; j++) { |
10264 |
+ v1 = _mm_set1_epi16(filter[j]); |
10265 |
+ |
10266 |
+ v0 = VEC_LD(src[j] + i); |
10267 |
+ v2 = _mm_mullo_epi16(v0, v1); |
10268 |
+ v3 = _mm_mulhi_epi16(v0, v1); |
10269 |
+ v0 = _mm_unpacklo_epi16(v2, v3); |
10270 |
+ v3 = _mm_unpackhi_epi16(v2, v3); |
10271 |
+ r0 = _mm_add_epi32(r0, v0); |
10272 |
+ r1 = _mm_add_epi32(r1, v3); |
10273 |
+ |
10274 |
+ v0 = VEC_LD(src[j] + i + 8); |
10275 |
+ v2 = _mm_mullo_epi16(v0, v1); |
10276 |
+ v3 = _mm_mulhi_epi16(v0, v1); |
10277 |
+ v0 = _mm_unpacklo_epi16(v2, v3); |
10278 |
+ v3 = _mm_unpackhi_epi16(v2, v3); |
10279 |
+ r2 = _mm_add_epi32(r2, v0); |
10280 |
+ r3 = _mm_add_epi32(r3, v3); |
10281 |
+ } |
10282 |
+ r0 = _mm_srai_epi32(r0, 19); |
10283 |
+ r1 = _mm_srai_epi32(r1, 19); |
10284 |
+ r2 = _mm_srai_epi32(r2, 19); |
10285 |
+ r3 = _mm_srai_epi32(r3, 19); |
10286 |
+ r0 = _mm_packs_epi32(r0, r1); |
10287 |
+ r2 = _mm_packs_epi32(r2, r3); |
10288 |
+ r0 = _mm_packus_epi16(r0, r2); |
10289 |
+ VEC_ST(dest + i, r0); |
10290 |
+ } |
10291 |
+ |
10292 |
+ for (; i < dstW; i++) { |
10293 |
+ int val = dither[(i + offset) & 7] << 12; |
10294 |
+ for (j = 0; j < filterSize; j++) |
10295 |
+ val += src[j][i] * filter[j]; |
10296 |
+ dest[i] = av_clip_uint8(val >> 19); |
10297 |
+ } |
10298 |
+} |
10299 |
+ |
10300 |
+static void hScale_real_e2k(SwsContext *c, int16_t *dst, int dstW, |
10301 |
+ const uint8_t *src, const int16_t *filter, |
10302 |
+ const int32_t *filterPos, int filterSize) |
10303 |
+{ |
10304 |
+ int i; |
10305 |
+ LOAD_ZERO; |
10306 |
+ switch (filterSize) { |
10307 |
+ |
10308 |
+ case 1: |
10309 |
+ PRAGMA_E2K("ivdep") |
10310 |
+ for (i = 0; i < dstW; i++) { |
10311 |
+ int val, srcPos = filterPos[i]; |
10312 |
+ val = (int)src[srcPos] * filter[filterSize * i]; |
10313 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10314 |
+ } |
10315 |
+ break; |
10316 |
+ |
10317 |
+ case 2: |
10318 |
+ PRAGMA_E2K("ivdep") |
10319 |
+ for (i = 0; i < dstW; i++) { |
10320 |
+ int val, srcPos = filterPos[i]; |
10321 |
+ val = (int)src[srcPos] * filter[filterSize * i]; |
10322 |
+ val += (int)src[srcPos + 1] * filter[filterSize * i + 1]; |
10323 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10324 |
+ } |
10325 |
+ break; |
10326 |
+ |
10327 |
+ case 4: { |
10328 |
+ __m64 zerov = _mm_setzero_si64(); |
10329 |
+ PRAGMA_E2K("ivdep") |
10330 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
10331 |
+ int val; |
10332 |
+ __m64 v0, v2, accv; |
10333 |
+ const uint8_t *srci = src + filterPos[i]; |
10334 |
+ |
10335 |
+ v0 = _mm_cvtsi32_si64(*(uint32_t*)srci); |
10336 |
+ v0 = _mm_unpacklo_pi8(v0, zerov); |
10337 |
+ v2 = *(__m64*)filter; |
10338 |
+ accv = _mm_madd_pi16(v0, v2); |
10339 |
+ val = _mm_extract_pi32(accv, 0) + _mm_extract_pi32(accv, 1); |
10340 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10341 |
+ } |
10342 |
+ } |
10343 |
+ break; |
10344 |
+ |
10345 |
+ case 8: |
10346 |
+ PRAGMA_E2K("ivdep") |
10347 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
10348 |
+ int val, j = 0; |
10349 |
+ __m128i v0, v2, accv; |
10350 |
+ const uint8_t *srci = src + filterPos[i]; |
10351 |
+ |
10352 |
+ v0 = VEC_LD8(srci + j); |
10353 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
10354 |
+ v2 = VEC_LD(filter + j); |
10355 |
+ accv = _mm_madd_epi16(v0, v2); |
10356 |
+ accv = _mm_hadd_epi32(accv, accv); |
10357 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
10358 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10359 |
+ } |
10360 |
+ break; |
10361 |
+ |
10362 |
+ case 16: |
10363 |
+ PRAGMA_E2K("ivdep") |
10364 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
10365 |
+ int val, j = 0; |
10366 |
+ __m128i v0, v1, v2, v3, accv; |
10367 |
+ const uint8_t *srci = src + filterPos[i]; |
10368 |
+ |
10369 |
+ v1 = VEC_LD(srci + j); |
10370 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); |
10371 |
+ v1 = _mm_unpackhi_epi8(v1, zerov); |
10372 |
+ v2 = VEC_LD(filter + j); |
10373 |
+ v3 = VEC_LD(filter + j + 8); |
10374 |
+ accv = _mm_madd_epi16(v0, v2); |
10375 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v1, v3)); |
10376 |
+ accv = _mm_hadd_epi32(accv, accv); |
10377 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
10378 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10379 |
+ } |
10380 |
+ break; |
10381 |
+ |
10382 |
+ default: |
10383 |
+ av_assert0((filterSize & 7) == 0); |
10384 |
+ |
10385 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
10386 |
+ int val, j = 0; |
10387 |
+ __m128i v0, v1, v2, v3, accv = zerov; |
10388 |
+ const uint8_t *srci = src + filterPos[i]; |
10389 |
+ |
10390 |
+ for (; j < filterSize - 15; j += 16) { |
10391 |
+ v1 = VEC_LD(srci + j); |
10392 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); |
10393 |
+ v1 = _mm_unpackhi_epi8(v1, zerov); |
10394 |
+ v2 = VEC_LD(filter + j); |
10395 |
+ v3 = VEC_LD(filter + j + 8); |
10396 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v2)); |
10397 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v1, v3)); |
10398 |
+ } |
10399 |
+ if (filterSize & 8) { |
10400 |
+ v1 = VEC_LD8(srci + j); |
10401 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); |
10402 |
+ v2 = VEC_LD(filter + j); |
10403 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v2)); |
10404 |
+ } |
10405 |
+ accv = _mm_hadd_epi32(accv, accv); |
10406 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
10407 |
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); |
10408 |
+ } |
10409 |
+ } |
10410 |
+} |
10411 |
+ |
10412 |
+static void yuv2plane1_floatLE_e2k(const int16_t *_src, uint8_t *_dest, |
10413 |
+ int dstW, const uint8_t *dither, int offset) |
10414 |
+{ |
10415 |
+ const int32_t *src = (const int32_t*)_src; |
10416 |
+ float *dest = (float*)_dest; |
10417 |
+ int shift = 3; |
10418 |
+ int add = (1 << shift) >> 1; |
10419 |
+ int clip = (1 << 16) - 1; |
10420 |
+ float fmult = 1.0f / 65535.0f; |
10421 |
+ LOAD_ZERO; |
10422 |
+ vec_u32 vadd = _mm_set1_epi32(add); |
10423 |
+ vec_u32 vlargest = _mm_set1_epi32(clip); |
10424 |
+ vec_f vmul = _mm_set1_ps(fmult); |
10425 |
+ vec_u32 v0; |
10426 |
+ vec_f v1; |
10427 |
+ int i = 0; |
10428 |
+ |
10429 |
+ PRAGMA_E2K("ivdep") |
10430 |
+ for (; i < dstW - 3; i += 4) { |
10431 |
+ v0 = VEC_LD(src + i); |
10432 |
+ v0 = _mm_add_epi32(v0, vadd); |
10433 |
+ v0 = _mm_srai_epi32(v0, shift); |
10434 |
+ v0 = _mm_max_epi32(v0, zerov); |
10435 |
+ v0 = _mm_min_epi32(v0, vlargest); |
10436 |
+ v1 = _mm_mul_ps(_mm_cvtepi32_ps(v0), vmul); |
10437 |
+ _mm_storeu_ps(dest + i, v1); |
10438 |
+ } |
10439 |
+ |
10440 |
+ PRAGMA_E2K("ivdep") |
10441 |
+ for (; i < dstW; ++i){ |
10442 |
+ int val = src[i] + add; |
10443 |
+ val = av_clip_uint16(val >> shift); |
10444 |
+ dest[i] = fmult * (float)val; |
10445 |
+ } |
10446 |
+} |
10447 |
+ |
10448 |
+static void yuv2plane1_floatBE_e2k(const int16_t *_src, uint8_t *_dest, |
10449 |
+ int dstW, const uint8_t *dither, int offset) |
10450 |
+{ |
10451 |
+ const int32_t *src = (const int32_t*)_src; |
10452 |
+ uint32_t *dest = (uint32_t*)_dest; |
10453 |
+ int shift = 3; |
10454 |
+ int add = (1 << shift) >> 1; |
10455 |
+ int clip = (1 << 16) - 1; |
10456 |
+ float fmult = 1.0f / 65535.0f; |
10457 |
+ LOAD_ZERO; |
10458 |
+ vec_u32 vadd = _mm_set1_epi32(add); |
10459 |
+ vec_u32 vlargest = _mm_set1_epi32(clip); |
10460 |
+ vec_f vmul = _mm_set1_ps(fmult); |
10461 |
+ vec_u8 vswap = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); |
10462 |
+ vec_u32 v0; |
10463 |
+ vec_f v1; |
10464 |
+ int i = 0; |
10465 |
+ |
10466 |
+ PRAGMA_E2K("ivdep") |
10467 |
+ for (; i < dstW - 3; i += 4) { |
10468 |
+ v0 = VEC_LD(src + i); |
10469 |
+ v0 = _mm_add_epi32(v0, vadd); |
10470 |
+ v0 = _mm_srai_epi32(v0, shift); |
10471 |
+ v0 = _mm_max_epi32(v0, zerov); |
10472 |
+ v0 = _mm_min_epi32(v0, vlargest); |
10473 |
+ v1 = _mm_mul_ps(_mm_cvtepi32_ps(v0), vmul); |
10474 |
+ v0 = _mm_shuffle_epi8(_mm_castps_si128(v1), vswap); |
10475 |
+ VEC_ST(dest + i, v0); |
10476 |
+ } |
10477 |
+ |
10478 |
+ PRAGMA_E2K("ivdep") |
10479 |
+ for (; i < dstW; i++) { |
10480 |
+ int val = src[i] + add; |
10481 |
+ val = av_clip_uint16(val >> shift); |
10482 |
+ dest[i] = av_bswap32(av_float2int(fmult * (float)val)); |
10483 |
+ } |
10484 |
+} |
10485 |
+ |
10486 |
+static void yuv2plane1_8_e2k(const int16_t *src, uint8_t *dest, int dstW, |
10487 |
+ const uint8_t *dither, int offset) |
10488 |
+{ |
10489 |
+ int i = 0; |
10490 |
+ __m128i v0, v1, ditherv; |
10491 |
+ LOAD_ZERO; |
10492 |
+ __m64 h0; |
10493 |
+ h0 = (__m64)__builtin_e2k_insfd(*(uint64_t*)dither, ((offset + i) & 7) * 8, 0); |
10494 |
+ ditherv = _mm_unpacklo_epi8(_mm_movpi64_epi64(h0), zerov); |
10495 |
+ |
10496 |
+ PRAGMA_E2K("ivdep") |
10497 |
+ for (; i < dstW - 15; i += 16) { |
10498 |
+ v0 = VEC_LD(src + i); |
10499 |
+ v1 = VEC_LD(src + i + 8); |
10500 |
+ v0 = _mm_adds_epi16(v0, ditherv); |
10501 |
+ v1 = _mm_adds_epi16(v1, ditherv); |
10502 |
+ v0 = _mm_srai_epi16(v0, 7); |
10503 |
+ v1 = _mm_srai_epi16(v1, 7); |
10504 |
+ v0 = _mm_packus_epi16(v0, v1); |
10505 |
+ VEC_ST(dest + i, v0); |
10506 |
+ } |
10507 |
+ |
10508 |
+ PRAGMA_E2K("ivdep") |
10509 |
+ for (; i < dstW; i++) { |
10510 |
+ int val = (src[i] + dither[(i + offset) & 7]) >> 7; |
10511 |
+ dest[i] = av_clip_uint8(val); |
10512 |
+ } |
10513 |
+} |
10514 |
+ |
10515 |
+#define output_pixel(pos, val) \ |
10516 |
+ if (big_endian) { \ |
10517 |
+ AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ |
10518 |
+ } else { \ |
10519 |
+ AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ |
10520 |
+ } |
10521 |
+ |
10522 |
+static av_always_inline |
10523 |
+void yuv2plane1_10_e2k(const int16_t *src, uint16_t *dest, int dstW, |
10524 |
+ const int big_endian, const int output_bits) |
10525 |
+{ |
10526 |
+ int shift = 15 - output_bits; |
10527 |
+ int add = 1 << (shift - 1); |
10528 |
+ int clip = (1 << output_bits) - 1; |
10529 |
+ vec_u16 vadd = _mm_set1_epi16(add); |
10530 |
+ vec_u16 vlargest = _mm_set1_epi16(clip); |
10531 |
+ vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
10532 |
+ LOAD_ZERO; |
10533 |
+ __m128i v0; |
10534 |
+ int i = 0; |
10535 |
+ |
10536 |
+ PRAGMA_E2K("ivdep") |
10537 |
+ for (; i < dstW - 7; i += 8) { |
10538 |
+ v0 = VEC_LD(src + i); |
10539 |
+ v0 = _mm_adds_epi16(v0, vadd); |
10540 |
+ v0 = _mm_srai_epi16(v0, shift); |
10541 |
+ v0 = _mm_max_epi16(v0, zerov); |
10542 |
+ v0 = _mm_min_epu16(v0, vlargest); |
10543 |
+ if (big_endian) { |
10544 |
+ v0 = _mm_shuffle_epi8(v0, vswap); |
10545 |
+ } |
10546 |
+ VEC_ST(dest + i, v0); |
10547 |
+ } |
10548 |
+ |
10549 |
+ PRAGMA_E2K("ivdep") |
10550 |
+ for (; i < dstW; i++) { |
10551 |
+ int val = src[i] + add; |
10552 |
+ output_pixel(&dest[i], val); |
10553 |
+ } |
10554 |
+} |
10555 |
+ |
10556 |
+static av_always_inline |
10557 |
+void yuv2planeX_10_e2k(const int16_t *filter, int filterSize, |
10558 |
+ const int16_t **src, uint16_t *dest, int dstW, |
10559 |
+ int big_endian, int output_bits) |
10560 |
+{ |
10561 |
+ int shift = 11 + 16 - output_bits; |
10562 |
+ int add = 1 << (shift - 1); |
10563 |
+ int clip = (1 << output_bits) - 1; |
10564 |
+ vec_u16 vadd = _mm_set1_epi32(add); |
10565 |
+ vec_u16 vlargest = _mm_set1_epi16(clip); |
10566 |
+ vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
10567 |
+ __m128i v0, v1, v2, v3, v4, v5; |
10568 |
+ int i = 0, j; |
10569 |
+ |
10570 |
+ for (; i < dstW - 7; i += 8) { |
10571 |
+ v4 = v5 = vadd; |
10572 |
+ for (j = 0; j < filterSize; j++) { |
10573 |
+ v0 = VEC_LD(src[j] + i); |
10574 |
+ v1 = _mm_set1_epi16(filter[j]); |
10575 |
+ v2 = _mm_mullo_epi16(v0, v1); |
10576 |
+ v3 = _mm_mulhi_epi16(v0, v1); |
10577 |
+ v0 = _mm_unpacklo_epi16(v2, v3); |
10578 |
+ v1 = _mm_unpackhi_epi16(v2, v3); |
10579 |
+ v4 = _mm_add_epi32(v4, v0); |
10580 |
+ v5 = _mm_add_epi32(v5, v1); |
10581 |
+ } |
10582 |
+ v4 = _mm_srai_epi32(v4, shift); |
10583 |
+ v5 = _mm_srai_epi32(v5, shift); |
10584 |
+ v0 = _mm_packus_epi32(v4, v5); |
10585 |
+ v0 = _mm_min_epu16(v0, vlargest); |
10586 |
+ if (big_endian) { |
10587 |
+ v0 = _mm_shuffle_epi8(v0, vswap); |
10588 |
+ } |
10589 |
+ VEC_ST(dest + i, v0); |
10590 |
+ } |
10591 |
+ |
10592 |
+ for (; i < dstW; i++) { |
10593 |
+ int val = 1 << (shift - 1); |
10594 |
+ for (j = 0; j < filterSize; j++) |
10595 |
+ val += src[j][i] * filter[j]; |
10596 |
+ output_pixel(&dest[i], val); |
10597 |
+ } |
10598 |
+} |
10599 |
+ |
10600 |
+#undef output_pixel |
10601 |
+ |
10602 |
+#define output_pixel(pos, val, bias, signedness) \ |
10603 |
+ if (big_endian) { \ |
10604 |
+ AV_WB16(pos, bias + av_clip_##signedness##16(val >> shift)); \ |
10605 |
+ } else { \ |
10606 |
+ AV_WL16(pos, bias + av_clip_##signedness##16(val >> shift)); \ |
10607 |
+ } |
10608 |
+ |
10609 |
+static av_always_inline |
10610 |
+void yuv2plane1_16_e2k(const int32_t *src, uint16_t *dest, int dstW, |
10611 |
+ const int big_endian, int output_bits) |
10612 |
+{ |
10613 |
+ int shift = 3; |
10614 |
+ int add = 1 << (shift - 1); |
10615 |
+ vec_u32 vadd = _mm_set1_epi32(add); |
10616 |
+ vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
10617 |
+ __m128i v0, v1; |
10618 |
+ int i = 0; |
10619 |
+ |
10620 |
+ PRAGMA_E2K("ivdep") |
10621 |
+ for (; i < dstW - 7; i += 8) { |
10622 |
+ v0 = VEC_LD(src + i); |
10623 |
+ v1 = VEC_LD(src + i + 4); |
10624 |
+ v0 = _mm_add_epi32(v0, vadd); |
10625 |
+ v1 = _mm_add_epi32(v1, vadd); |
10626 |
+ v0 = _mm_srai_epi32(v0, shift); |
10627 |
+ v1 = _mm_srai_epi32(v1, shift); |
10628 |
+ v0 = _mm_packus_epi32(v0, v1); |
10629 |
+ if (big_endian) { |
10630 |
+ v0 = _mm_shuffle_epi8(v0, vswap); |
10631 |
+ } |
10632 |
+ VEC_ST(dest + i, v0); |
10633 |
+ } |
10634 |
+ |
10635 |
+ PRAGMA_E2K("ivdep") |
10636 |
+ for (; i < dstW; i++) { |
10637 |
+ int val = src[i] + add; |
10638 |
+ output_pixel(&dest[i], val, 0, uint); |
10639 |
+ } |
10640 |
+} |
10641 |
+ |
10642 |
+/* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline |
10643 |
+ * filters (or anything with negative coeffs, the range can be slightly |
10644 |
+ * wider in both directions. To account for this overflow, we subtract |
10645 |
+ * a constant so it always fits in the signed range (assuming a |
10646 |
+ * reasonable filterSize), and re-add that at the end. */ |
10647 |
+ |
10648 |
+static av_always_inline |
10649 |
+void yuv2planeX_16_e2k(const int16_t *filter, int filterSize, |
10650 |
+ const int32_t **src, uint16_t *dest, int dstW, |
10651 |
+ int big_endian, int output_bits) |
10652 |
+{ |
10653 |
+ int shift = 15, bias = 0x8000; |
10654 |
+ int add = (1 << (shift - 1)) - 0x40000000; |
10655 |
+ vec_u32 vadd = _mm_set1_epi32(add); |
10656 |
+ vec_u16 vbias = _mm_set1_epi16(bias); |
10657 |
+ vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
10658 |
+ __m128i v0, v1, v2, v4, v5; |
10659 |
+ int i = 0, j; |
10660 |
+ |
10661 |
+ for (; i < dstW - 7; i += 8) { |
10662 |
+ v4 = v5 = vadd; |
10663 |
+ for (j = 0; j < filterSize; j++) { |
10664 |
+ v0 = VEC_LD(src[j] + i); |
10665 |
+ v1 = VEC_LD(src[j] + i + 4); |
10666 |
+ v2 = _mm_set1_epi32(filter[j]); |
10667 |
+ v4 = _mm_add_epi32(v4, _mm_mullo_epi32(v0, v2)); |
10668 |
+ v5 = _mm_add_epi32(v5, _mm_mullo_epi32(v1, v2)); |
10669 |
+ } |
10670 |
+ v4 = _mm_srai_epi32(v4, shift); |
10671 |
+ v5 = _mm_srai_epi32(v5, shift); |
10672 |
+ v0 = _mm_packs_epi32(v4, v5); |
10673 |
+ v0 = _mm_add_epi16(v0, vbias); |
10674 |
+ if (big_endian) { |
10675 |
+ v0 = _mm_shuffle_epi8(v0, vswap); |
10676 |
+ } |
10677 |
+ VEC_ST(dest + i, v0); |
10678 |
+ } |
10679 |
+ |
10680 |
+ for (; i < dstW; i++) { |
10681 |
+ int val = add; |
10682 |
+ for (j = 0; j < filterSize; j++) |
10683 |
+ val += src[j][i] * (unsigned)filter[j]; |
10684 |
+ output_pixel(&dest[i], val, bias, int); |
10685 |
+ } |
10686 |
+} |
10687 |
+ |
10688 |
+#undef output_pixel |
10689 |
+ |
10690 |
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \ |
10691 |
+static void yuv2plane1_##bits##BE_LE##_e2k(const int16_t *src, \ |
10692 |
+ uint8_t *dest, int dstW, \ |
10693 |
+ const uint8_t *dither, int offset) \ |
10694 |
+{ \ |
10695 |
+ yuv2plane1_##template_size##_e2k((const typeX_t *) src, \ |
10696 |
+ (uint16_t *) dest, dstW, is_be, bits); \ |
10697 |
+} \ |
10698 |
+static void yuv2planeX_##bits##BE_LE##_e2k(const int16_t *filter, int filterSize, \ |
10699 |
+ const int16_t **src, uint8_t *dest, int dstW, \ |
10700 |
+ const uint8_t *dither, int offset)\ |
10701 |
+{ \ |
10702 |
+ yuv2planeX_##template_size##_e2k(filter, \ |
10703 |
+ filterSize, (const typeX_t **) src, \ |
10704 |
+ (uint16_t *) dest, dstW, is_be, bits); \ |
10705 |
+} |
10706 |
+ |
10707 |
+yuv2NBPS( 9, BE, 1, 10, int16_t) |
10708 |
+yuv2NBPS( 9, LE, 0, 10, int16_t) |
10709 |
+yuv2NBPS(10, BE, 1, 10, int16_t) |
10710 |
+yuv2NBPS(10, LE, 0, 10, int16_t) |
10711 |
+yuv2NBPS(12, BE, 1, 10, int16_t) |
10712 |
+yuv2NBPS(12, LE, 0, 10, int16_t) |
10713 |
+yuv2NBPS(14, BE, 1, 10, int16_t) |
10714 |
+yuv2NBPS(14, LE, 0, 10, int16_t) |
10715 |
+yuv2NBPS(16, BE, 1, 16, int32_t) |
10716 |
+yuv2NBPS(16, LE, 0, 16, int32_t) |
10717 |
+ |
10718 |
+#define INIT_RGB(R, B) \ |
10719 |
+ __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \ |
10720 |
+ __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \ |
10721 |
+ __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15); |
10722 |
+ |
10723 |
+#define INIT_RGBX(R, B) \ |
10724 |
+ __m128i A_h = _mm_set1_epi16(-256); |
10725 |
+ |
10726 |
+#define INIT_XRGB(R, B) \ |
10727 |
+ __m128i A_l = _mm_set1_epi16(255); |
10728 |
+ |
10729 |
+#define WRITE_RGB(R, B) \ |
10730 |
+ v0 = _mm_srai_epi32(R##_l, 22); \ |
10731 |
+ v1 = _mm_srai_epi32(R##_h, 22); \ |
10732 |
+ v4 = _mm_srai_epi32(G_l, 22 - 16); \ |
10733 |
+ v5 = _mm_srai_epi32(G_h, 22 - 16); \ |
10734 |
+ v2 = _mm_srai_epi32(B##_l, 22 - 8); \ |
10735 |
+ v3 = _mm_srai_epi32(B##_h, 22 - 8); \ |
10736 |
+ v0 = _mm_blend_epi16(v0, v4, 0xaa); \ |
10737 |
+ v1 = _mm_blend_epi16(v1, v5, 0xaa); \ |
10738 |
+ v2 = _mm_packus_epi32(v2, v3); \ |
10739 |
+ v0 = _mm_packus_epi16(v0, v1); \ |
10740 |
+ v1 = _mm_unpacklo_epi16(v0, v2); \ |
10741 |
+ v2 = _mm_unpackhi_epi16(v0, v2); \ |
10742 |
+ { \ |
10743 |
+ union { __m128i v; __m64 d[2]; } a = { v1 }, b = { v2 }; \ |
10744 |
+ __m64 *p = (__m64*)dest; \ |
10745 |
+ p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ |
10746 |
+ p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ |
10747 |
+ p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ |
10748 |
+ dest += 24; \ |
10749 |
+ } |
10750 |
+ |
10751 |
+#define WRITE_RGBX(R, B) \ |
10752 |
+ v0 = _mm_srai_epi32(R##_l, 22); \ |
10753 |
+ v1 = _mm_srai_epi32(R##_h, 22); \ |
10754 |
+ v4 = _mm_srai_epi32(G_l, 22 - 16); \ |
10755 |
+ v5 = _mm_srai_epi32(G_h, 22 - 16); \ |
10756 |
+ v2 = _mm_srai_epi32(B##_l, 22); \ |
10757 |
+ v3 = _mm_srai_epi32(B##_h, 22); \ |
10758 |
+ v0 = _mm_blend_epi16(v0, v4, 0xaa); \ |
10759 |
+ v1 = _mm_blend_epi16(v1, v5, 0xaa); \ |
10760 |
+ v0 = _mm_packus_epi16(v0, v1); \ |
10761 |
+ v2 = _mm_packus_epi16(v2, v3); \ |
10762 |
+ v2 = _mm_or_si128(v2, A_h); \ |
10763 |
+ v1 = _mm_unpacklo_epi16(v0, v2); \ |
10764 |
+ v3 = _mm_unpackhi_epi16(v0, v2); \ |
10765 |
+ VEC_ST(dest, v1); \ |
10766 |
+ VEC_ST(dest + 16, v3); \ |
10767 |
+ dest += 32; |
10768 |
+ |
10769 |
+#define WRITE_XRGB(R, B) \ |
10770 |
+ v0 = _mm_srai_epi32(R##_l, 22 - 16); \ |
10771 |
+ v1 = _mm_srai_epi32(R##_h, 22 - 16); \ |
10772 |
+ v4 = _mm_srai_epi32(G_l, 22); \ |
10773 |
+ v5 = _mm_srai_epi32(G_h, 22); \ |
10774 |
+ v2 = _mm_srai_epi32(B##_l, 22 - 16); \ |
10775 |
+ v3 = _mm_srai_epi32(B##_h, 22 - 16); \ |
10776 |
+ v2 = _mm_blend_epi16(v4, v2, 0xaa); \ |
10777 |
+ v3 = _mm_blend_epi16(v5, v3, 0xaa); \ |
10778 |
+ v0 = _mm_packus_epi16(v0, v1); \ |
10779 |
+ v2 = _mm_packus_epi16(v2, v3); \ |
10780 |
+ v0 = _mm_or_si128(v0, A_l); \ |
10781 |
+ v1 = _mm_unpacklo_epi16(v0, v2); \ |
10782 |
+ v3 = _mm_unpackhi_epi16(v0, v2); \ |
10783 |
+ VEC_ST(dest, v1); \ |
10784 |
+ VEC_ST(dest + 16, v3); \ |
10785 |
+ dest += 32; |
10786 |
+ |
10787 |
+#define CALC_RGB \ |
10788 |
+ vy_l = _mm_add_epi32(_mm_mullo_epi32(vy_l, y_coeff), y_add); \ |
10789 |
+ vy_h = _mm_add_epi32(_mm_mullo_epi32(vy_h, y_coeff), y_add); \ |
10790 |
+ \ |
10791 |
+ v0 = _mm_mullo_epi32(vv_l, v2g_coeff); \ |
10792 |
+ v1 = _mm_mullo_epi32(vu_l, u2g_coeff); \ |
10793 |
+ v2 = _mm_mullo_epi32(vv_h, v2g_coeff); \ |
10794 |
+ v3 = _mm_mullo_epi32(vu_h, u2g_coeff); \ |
10795 |
+ G_l = _mm_add_epi32(_mm_add_epi32(v0, vy_l), v1); \ |
10796 |
+ G_h = _mm_add_epi32(_mm_add_epi32(v2, vy_h), v3); \ |
10797 |
+ \ |
10798 |
+ R_l = _mm_add_epi32(vy_l, _mm_mullo_epi32(vv_l, v2r_coeff)); \ |
10799 |
+ R_h = _mm_add_epi32(vy_h, _mm_mullo_epi32(vv_h, v2r_coeff)); \ |
10800 |
+ B_l = _mm_add_epi32(vy_l, _mm_mullo_epi32(vu_l, u2b_coeff)); \ |
10801 |
+ B_h = _mm_add_epi32(vy_h, _mm_mullo_epi32(vu_h, u2b_coeff)); |
10802 |
+ |
10803 |
+#define WITH_ALPHA(...) __VA_ARGS__ |
10804 |
+#define NO_ALPHA(...) |
10805 |
+ |
10806 |
+#define YUV2RGBWRAPPERXF(ext, fmt, R, B, hasAlpha) \ |
10807 |
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \ |
10808 |
+ const int16_t **lumSrc, int lumFilterSize, \ |
10809 |
+ const int16_t *chrFilter, const int16_t **chrUSrc, \ |
10810 |
+ const int16_t **chrVSrc, int chrFilterSize, \ |
10811 |
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \ |
10812 |
+ int y) \ |
10813 |
+{ \ |
10814 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \ |
10815 |
+ vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \ |
10816 |
+ vec_s32 ystart = _mm_set1_epi32(1 << 9); \ |
10817 |
+ vec_s32 uvstart = _mm_set1_epi32((1 << 9) - (128 << 19)); \ |
10818 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \ |
10819 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
10820 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \ |
10821 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \ |
10822 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \ |
10823 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \ |
10824 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
10825 |
+ int i, j; \ |
10826 |
+ INIT_##fmt(R, B) \ |
10827 |
+ \ |
10828 |
+ for (i = 0; i < dstW; i += 8) { \ |
10829 |
+ vy_l = vy_h = ystart; \ |
10830 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
10831 |
+ v0 = VEC_LD(lumSrc[j] + i); \ |
10832 |
+ v1 = _mm_set1_epi16(lumFilter[j]); \ |
10833 |
+ v2 = _mm_mullo_epi16(v0, v1); \ |
10834 |
+ v3 = _mm_mulhi_epi16(v0, v1); \ |
10835 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
10836 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
10837 |
+ vy_l = _mm_add_epi32(vy_l, v0); \ |
10838 |
+ vy_h = _mm_add_epi32(vy_h, v1); \ |
10839 |
+ } \ |
10840 |
+ vy_l = _mm_srai_epi32(vy_l, 10); \ |
10841 |
+ vy_h = _mm_srai_epi32(vy_h, 10); \ |
10842 |
+ \ |
10843 |
+ vu_l = vu_h = vv_l = vv_h = uvstart; \ |
10844 |
+ for (j = 0; j < chrFilterSize; j++) { \ |
10845 |
+ v0 = VEC_LD(chrUSrc[j] + i); \ |
10846 |
+ v1 = VEC_LD(chrVSrc[j] + i); \ |
10847 |
+ v5 = _mm_set1_epi16(chrFilter[j]); \ |
10848 |
+ v2 = _mm_mullo_epi16(v0, v5); \ |
10849 |
+ v3 = _mm_mulhi_epi16(v0, v5); \ |
10850 |
+ v4 = _mm_mullo_epi16(v1, v5); \ |
10851 |
+ v5 = _mm_mulhi_epi16(v1, v5); \ |
10852 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
10853 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
10854 |
+ v2 = _mm_unpacklo_epi16(v4, v5); \ |
10855 |
+ v3 = _mm_unpackhi_epi16(v4, v5); \ |
10856 |
+ vu_l = _mm_add_epi32(vu_l, v0); \ |
10857 |
+ vu_h = _mm_add_epi32(vu_h, v1); \ |
10858 |
+ vv_l = _mm_add_epi32(vv_l, v2); \ |
10859 |
+ vv_h = _mm_add_epi32(vv_h, v3); \ |
10860 |
+ } \ |
10861 |
+ vu_l = _mm_srai_epi32(vu_l, 10); \ |
10862 |
+ vu_h = _mm_srai_epi32(vu_h, 10); \ |
10863 |
+ vv_l = _mm_srai_epi32(vv_l, 10); \ |
10864 |
+ vv_h = _mm_srai_epi32(vv_h, 10); \ |
10865 |
+ \ |
10866 |
+ CALC_RGB \ |
10867 |
+ WRITE_##fmt(R, B) \ |
10868 |
+ } \ |
10869 |
+} |
10870 |
+ |
10871 |
+#define SETUP(buf, i, alpha, r0, r1) { \ |
10872 |
+ v0 = VEC_LD(buf##0 + i); \ |
10873 |
+ v1 = VEC_LD(buf##1 + i); \ |
10874 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
10875 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
10876 |
+ r0 = _mm_madd_epi16(v2, alpha); \ |
10877 |
+ r1 = _mm_madd_epi16(v3, alpha); \ |
10878 |
+} |
10879 |
+ |
10880 |
+#define YUV2RGBWRAPPER2F(ext, fmt, R, B, hasAlpha) \ |
10881 |
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \ |
10882 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
10883 |
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \ |
10884 |
+ int yalpha, int uvalpha, int y) \ |
10885 |
+{ \ |
10886 |
+ const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \ |
10887 |
+ *buf0 = buf[0], *buf1 = buf[1], \ |
10888 |
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \ |
10889 |
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \ |
10890 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \ |
10891 |
+ vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \ |
10892 |
+ vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \ |
10893 |
+ vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \ |
10894 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \ |
10895 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
10896 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \ |
10897 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \ |
10898 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \ |
10899 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \ |
10900 |
+ vec_s32 dec128 = _mm_set1_epi32(128 << 19); \ |
10901 |
+ hasAlpha(vec_s32 add18 = _mm_set1_epi32(1 << 18);) \ |
10902 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
10903 |
+ int i; \ |
10904 |
+ INIT_##fmt(R, B) \ |
10905 |
+ \ |
10906 |
+ av_assert2(yalpha <= 4096U); \ |
10907 |
+ av_assert2(uvalpha <= 4096U); \ |
10908 |
+ \ |
10909 |
+ for (i = 0; i < dstW; i += 8) { \ |
10910 |
+ SETUP(buf, i, vyalpha, v0, v1); \ |
10911 |
+ vy_l = _mm_srai_epi32(v0, 10); \ |
10912 |
+ vy_h = _mm_srai_epi32(v1, 10); \ |
10913 |
+ \ |
10914 |
+ SETUP(ubuf, i, vuvalpha, v0, v1); \ |
10915 |
+ vu_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \ |
10916 |
+ vu_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \ |
10917 |
+ \ |
10918 |
+ SETUP(vbuf, i, vuvalpha, v0, v1); \ |
10919 |
+ vv_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \ |
10920 |
+ vv_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \ |
10921 |
+ \ |
10922 |
+ hasAlpha( \ |
10923 |
+ SETUP(abuf, i, vyalpha, v0, v1); \ |
10924 |
+ A_l = _mm_add_epi32(v0, add18); \ |
10925 |
+ A_h = _mm_add_epi32(v1, add18); \ |
10926 |
+ ) \ |
10927 |
+ \ |
10928 |
+ CALC_RGB \ |
10929 |
+ WRITE_##fmt(R, B) \ |
10930 |
+ } \ |
10931 |
+} |
10932 |
+ |
10933 |
+#define YUV2RGBWRAPPER1F(ext, fmt, R, B, hasAlpha) \ |
10934 |
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \ |
10935 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
10936 |
+ const int16_t *abuf0, uint8_t *dest, int dstW, \ |
10937 |
+ int uvalpha, int y) \ |
10938 |
+{ \ |
10939 |
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \ |
10940 |
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \ |
10941 |
+ int uvshl = uvalpha < 2048 ? 2 : 1; \ |
10942 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \ |
10943 |
+ vec_s32 R_l, R_h, G_l, G_h, B_l, B_h; \ |
10944 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff * 4); \ |
10945 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
10946 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff << uvshl); \ |
10947 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff << uvshl); \ |
10948 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff << uvshl); \ |
10949 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff << uvshl); \ |
10950 |
+ vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \ |
10951 |
+ hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \ |
10952 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
10953 |
+ int i; \ |
10954 |
+ INIT_##fmt(R, B) \ |
10955 |
+ \ |
10956 |
+ for (i = 0; i < dstW; i += 8) { \ |
10957 |
+ v0 = VEC_LD(buf0 + i); \ |
10958 |
+ v2 = _mm_unpacklo_epi16(v0, v0); \ |
10959 |
+ v3 = _mm_unpackhi_epi16(v0, v0); \ |
10960 |
+ vy_l = _mm_srai_epi32(v2, 16); \ |
10961 |
+ vy_h = _mm_srai_epi32(v3, 16); \ |
10962 |
+ \ |
10963 |
+ v0 = VEC_LD(ubuf0 + i); \ |
10964 |
+ v1 = VEC_LD(vbuf0 + i); \ |
10965 |
+ if (uvalpha >= 2048) { \ |
10966 |
+ v2 = VEC_LD(ubuf1 + i); \ |
10967 |
+ v3 = VEC_LD(vbuf1 + i); \ |
10968 |
+ v0 = _mm_add_epi16(v0, v2); \ |
10969 |
+ v1 = _mm_add_epi16(v1, v3); \ |
10970 |
+ } \ |
10971 |
+ v0 = _mm_sub_epi16(v0, uvsub); \ |
10972 |
+ v1 = _mm_sub_epi16(v1, uvsub); \ |
10973 |
+ v2 = _mm_unpacklo_epi16(v0, v0); \ |
10974 |
+ v3 = _mm_unpackhi_epi16(v0, v0); \ |
10975 |
+ vu_l = _mm_srai_epi32(v2, 16); \ |
10976 |
+ vu_h = _mm_srai_epi32(v3, 16); \ |
10977 |
+ v2 = _mm_unpacklo_epi16(v1, v1); \ |
10978 |
+ v3 = _mm_unpackhi_epi16(v1, v1); \ |
10979 |
+ vv_l = _mm_srai_epi32(v2, 16); \ |
10980 |
+ vv_h = _mm_srai_epi32(v3, 16); \ |
10981 |
+ \ |
10982 |
+ hasAlpha( \ |
10983 |
+ A = VEC_LD(abuf0 + i); \ |
10984 |
+ A = _mm_add_epi16(A, add64); \ |
10985 |
+ A = _mm_srai_epi16(A, 7); \ |
10986 |
+ ) \ |
10987 |
+ \ |
10988 |
+ CALC_RGB \ |
10989 |
+ WRITE_##fmt(R, B) \ |
10990 |
+ } \ |
10991 |
+} |
10992 |
+ |
10993 |
+YUV2RGBWRAPPERXF(rgbx32_full, RGBX, R, B, NO_ALPHA) |
10994 |
+YUV2RGBWRAPPERXF(bgrx32_full, RGBX, B, R, NO_ALPHA) |
10995 |
+YUV2RGBWRAPPERXF(xrgb32_full, XRGB, R, B, NO_ALPHA) |
10996 |
+YUV2RGBWRAPPERXF(xbgr32_full, XRGB, B, R, NO_ALPHA) |
10997 |
+YUV2RGBWRAPPERXF(rgb24_full, RGB, R, B, NO_ALPHA) |
10998 |
+YUV2RGBWRAPPERXF(bgr24_full, RGB, B, R, NO_ALPHA) |
10999 |
+ |
11000 |
+YUV2RGBWRAPPER2F(rgbx32_full, RGBX, R, B, NO_ALPHA) |
11001 |
+YUV2RGBWRAPPER2F(bgrx32_full, RGBX, B, R, NO_ALPHA) |
11002 |
+YUV2RGBWRAPPER2F(xrgb32_full, XRGB, R, B, NO_ALPHA) |
11003 |
+YUV2RGBWRAPPER2F(xbgr32_full, XRGB, B, R, NO_ALPHA) |
11004 |
+YUV2RGBWRAPPER2F(rgb24_full, RGB, R, B, NO_ALPHA) |
11005 |
+YUV2RGBWRAPPER2F(bgr24_full, RGB, B, R, NO_ALPHA) |
11006 |
+ |
11007 |
+YUV2RGBWRAPPER1F(rgbx32_full, RGBX, R, B, NO_ALPHA) |
11008 |
+YUV2RGBWRAPPER1F(bgrx32_full, RGBX, B, R, NO_ALPHA) |
11009 |
+YUV2RGBWRAPPER1F(xrgb32_full, XRGB, R, B, NO_ALPHA) |
11010 |
+YUV2RGBWRAPPER1F(xbgr32_full, XRGB, B, R, NO_ALPHA) |
11011 |
+YUV2RGBWRAPPER1F(rgb24_full, RGB, R, B, NO_ALPHA) |
11012 |
+YUV2RGBWRAPPER1F(bgr24_full, RGB, B, R, NO_ALPHA) |
11013 |
+ |
11014 |
+#if 1 // performance |
11015 |
+ |
11016 |
+#define INIT2_RGB(R, B) \ |
11017 |
+ __m128i perm_unp8 = _mm_setr_epi8( \ |
11018 |
+ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); \ |
11019 |
+ __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \ |
11020 |
+ __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \ |
11021 |
+ __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15); |
11022 |
+ |
11023 |
+#define INIT2_RGBX(R, B) INIT2_XRGB(R, B) |
11024 |
+#define INIT2_XRGB(R, B) \ |
11025 |
+ __m128i A_l = _mm_set1_epi16(255); \ |
11026 |
+ __m128i perm_unp8 = _mm_setr_epi8( \ |
11027 |
+ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); |
11028 |
+ |
11029 |
+#define WRITE2_RGB(R, B) \ |
11030 |
+ v4 = _mm_packus_epi16(R##_l, G_l); \ |
11031 |
+ v5 = _mm_packus_epi16(B##_l, B##_l); \ |
11032 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
11033 |
+ v1 = _mm_unpacklo_epi8(v5, v5); \ |
11034 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
11035 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
11036 |
+ { \ |
11037 |
+ union { __m128i v; __m64 d[2]; } a = { v2 }, b = { v3 }; \ |
11038 |
+ __m64 *p = (__m64*)dest; \ |
11039 |
+ p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ |
11040 |
+ p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ |
11041 |
+ p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ |
11042 |
+ dest += 24; \ |
11043 |
+ } |
11044 |
+ |
11045 |
+#define WRITE2_RGBX(R, B) \ |
11046 |
+ v4 = _mm_packus_epi16(R##_l, G_l); \ |
11047 |
+ v5 = _mm_packus_epi16(B##_l, A_l); \ |
11048 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
11049 |
+ v1 = _mm_shuffle_epi8(v5, perm_unp8); \ |
11050 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
11051 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
11052 |
+ VEC_ST(dest, v2); \ |
11053 |
+ VEC_ST(dest + 16, v3); \ |
11054 |
+ dest += 32; |
11055 |
+ |
11056 |
+#define WRITE2_XRGB(R, B) \ |
11057 |
+ v4 = _mm_packus_epi16(A_l, R##_l); \ |
11058 |
+ v5 = _mm_packus_epi16(G_l, B##_l); \ |
11059 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
11060 |
+ v1 = _mm_shuffle_epi8(v5, perm_unp8); \ |
11061 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
11062 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
11063 |
+ VEC_ST(dest, v2); \ |
11064 |
+ VEC_ST(dest + 16, v3); \ |
11065 |
+ dest += 32; |
11066 |
+ |
11067 |
+#define CALC2_RGB \ |
11068 |
+ vy_l = _mm_mulhrs_epi16(_mm_sub_epi16(vy_l, y_sub), y_coeff); \ |
11069 |
+ \ |
11070 |
+ v0 = _mm_mulhrs_epi16(vv_l, v2g_coeff); \ |
11071 |
+ v1 = _mm_mulhrs_epi16(vu_l, u2g_coeff); \ |
11072 |
+ G_l = _mm_add_epi16(_mm_add_epi16(v0, vy_l), v1); \ |
11073 |
+ \ |
11074 |
+ R_l = _mm_add_epi16(vy_l, _mm_mulhrs_epi16(vv_l, v2r_coeff)); \ |
11075 |
+ B_l = _mm_add_epi16(vy_l, _mm_mulhrs_epi16(vu_l, u2b_coeff)); |
11076 |
+ |
11077 |
+#define YUV2RGBWRAPPERX(ext, fmt, R, B, hasAlpha) \ |
11078 |
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \ |
11079 |
+ const int16_t **lumSrc, int lumFilterSize, \ |
11080 |
+ const int16_t *chrFilter, const int16_t **chrUSrc, \ |
11081 |
+ const int16_t **chrVSrc, int chrFilterSize, \ |
11082 |
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \ |
11083 |
+ int y) \ |
11084 |
+{ \ |
11085 |
+ vec_s32 vy_l, vy_h, vu_l, vv_l, vu2_l, vu2_h, vv2_l, vv2_h; \ |
11086 |
+ vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \ |
11087 |
+ vec_s32 ystart = _mm_set1_epi32(0); \ |
11088 |
+ vec_s32 uvstart = _mm_set1_epi32(-(128 << 19)); \ |
11089 |
+ vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \ |
11090 |
+ vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \ |
11091 |
+ vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \ |
11092 |
+ vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \ |
11093 |
+ vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \ |
11094 |
+ vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \ |
11095 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11096 |
+ int i, j; \ |
11097 |
+ INIT2_##fmt(R, B) \ |
11098 |
+ \ |
11099 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11100 |
+ vy_l = vy_h = ystart; \ |
11101 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
11102 |
+ v0 = VEC_LD(lumSrc[j] + i * 2); \ |
11103 |
+ v1 = _mm_set1_epi16(lumFilter[j]); \ |
11104 |
+ v2 = _mm_mullo_epi16(v0, v1); \ |
11105 |
+ v3 = _mm_mulhi_epi16(v0, v1); \ |
11106 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11107 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11108 |
+ vy_l = _mm_add_epi32(vy_l, v0); \ |
11109 |
+ vy_h = _mm_add_epi32(vy_h, v1); \ |
11110 |
+ } \ |
11111 |
+ vy_l = _mm_srai_epi32(vy_l, 17); \ |
11112 |
+ vy_h = _mm_srai_epi32(vy_h, 17); \ |
11113 |
+ \ |
11114 |
+ vu2_l = vu2_h = vv2_l = vv2_h = uvstart; \ |
11115 |
+ for (j = 0; j < chrFilterSize; j++) { \ |
11116 |
+ v0 = VEC_LD(chrUSrc[j] + i); \ |
11117 |
+ v1 = VEC_LD(chrVSrc[j] + i); \ |
11118 |
+ v5 = _mm_set1_epi16(chrFilter[j]); \ |
11119 |
+ v2 = _mm_mullo_epi16(v0, v5); \ |
11120 |
+ v3 = _mm_mulhi_epi16(v0, v5); \ |
11121 |
+ v4 = _mm_mullo_epi16(v1, v5); \ |
11122 |
+ v5 = _mm_mulhi_epi16(v1, v5); \ |
11123 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11124 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11125 |
+ v2 = _mm_unpacklo_epi16(v4, v5); \ |
11126 |
+ v3 = _mm_unpackhi_epi16(v4, v5); \ |
11127 |
+ vu2_l = _mm_add_epi32(vu2_l, v0); \ |
11128 |
+ vu2_h = _mm_add_epi32(vu2_h, v1); \ |
11129 |
+ vv2_l = _mm_add_epi32(vv2_l, v2); \ |
11130 |
+ vv2_h = _mm_add_epi32(vv2_h, v3); \ |
11131 |
+ } \ |
11132 |
+ vu2_l = _mm_srai_epi32(vu2_l, 17); \ |
11133 |
+ vu2_h = _mm_srai_epi32(vu2_h, 17); \ |
11134 |
+ vv2_l = _mm_srai_epi32(vv2_l, 17); \ |
11135 |
+ vv2_h = _mm_srai_epi32(vv2_h, 17); \ |
11136 |
+ vu2_l = _mm_packs_epi32(vu2_l, vu2_h); \ |
11137 |
+ vv2_l = _mm_packs_epi32(vv2_l, vv2_h); \ |
11138 |
+ \ |
11139 |
+ vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \ |
11140 |
+ vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \ |
11141 |
+ vy_l = _mm_packs_epi32(vy_l, vy_h); \ |
11142 |
+ \ |
11143 |
+ CALC2_RGB \ |
11144 |
+ WRITE2_##fmt(R, B) \ |
11145 |
+ \ |
11146 |
+ vy_l = vy_h = ystart; \ |
11147 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
11148 |
+ v0 = VEC_LD(lumSrc[j] + i * 2 + 8); \ |
11149 |
+ v1 = _mm_set1_epi16(lumFilter[j]); \ |
11150 |
+ v2 = _mm_mullo_epi16(v0, v1); \ |
11151 |
+ v3 = _mm_mulhi_epi16(v0, v1); \ |
11152 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11153 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11154 |
+ vy_l = _mm_add_epi32(vy_l, v0); \ |
11155 |
+ vy_h = _mm_add_epi32(vy_h, v1); \ |
11156 |
+ } \ |
11157 |
+ vy_l = _mm_srai_epi32(vy_l, 17); \ |
11158 |
+ vy_h = _mm_srai_epi32(vy_h, 17); \ |
11159 |
+ \ |
11160 |
+ vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \ |
11161 |
+ vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \ |
11162 |
+ vy_l = _mm_packs_epi32(vy_l, vy_h); \ |
11163 |
+ \ |
11164 |
+ CALC2_RGB \ |
11165 |
+ WRITE2_##fmt(R, B) \ |
11166 |
+ } \ |
11167 |
+} |
11168 |
+ |
11169 |
+#define SETUP2(buf, i, alpha, r0) { \ |
11170 |
+ v0 = VEC_LD(buf##0 + i); \ |
11171 |
+ v1 = VEC_LD(buf##1 + i); \ |
11172 |
+ v1 = _mm_subs_epi16(v0, v1); \ |
11173 |
+ v1 = _mm_mulhrs_epi16(v1, alpha); \ |
11174 |
+ r0 = _mm_add_epi16(v0, v1); \ |
11175 |
+} |
11176 |
+ |
11177 |
+#define YUV2RGBWRAPPER2(ext, fmt, R, B, hasAlpha) \ |
11178 |
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \ |
11179 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11180 |
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \ |
11181 |
+ int yalpha, int uvalpha, int y) \ |
11182 |
+{ \ |
11183 |
+ const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \ |
11184 |
+ *buf0 = buf[0], *buf1 = buf[1], \ |
11185 |
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \ |
11186 |
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \ |
11187 |
+ vec_s32 vy_l, vu_l, vv_l, vu2_l, vv2_l; \ |
11188 |
+ vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \ |
11189 |
+ vec_s16 vyalpha = _mm_set1_epi16(-yalpha << 3); \ |
11190 |
+ vec_s16 vuvalpha = _mm_set1_epi16(-uvalpha << 3); \ |
11191 |
+ vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \ |
11192 |
+ vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \ |
11193 |
+ vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \ |
11194 |
+ vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \ |
11195 |
+ vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \ |
11196 |
+ vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \ |
11197 |
+ vec_s32 dec128 = _mm_set1_epi16(128 << 2); \ |
11198 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11199 |
+ int i; \ |
11200 |
+ INIT2_##fmt(R, B) \ |
11201 |
+ \ |
11202 |
+ av_assert2(yalpha <= 4096U); \ |
11203 |
+ av_assert2(uvalpha <= 4096U); \ |
11204 |
+ \ |
11205 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11206 |
+ SETUP2(buf, i * 2, vyalpha, vy_l); \ |
11207 |
+ vy_l = _mm_srai_epi16(vy_l, 5); \ |
11208 |
+ \ |
11209 |
+ SETUP2(ubuf, i, vuvalpha, vu2_l); \ |
11210 |
+ vu2_l = _mm_srai_epi16(vu2_l, 5); \ |
11211 |
+ \ |
11212 |
+ SETUP2(vbuf, i, vuvalpha, vv2_l); \ |
11213 |
+ vv2_l = _mm_srai_epi16(vv2_l, 5); \ |
11214 |
+ \ |
11215 |
+ vu2_l = _mm_sub_epi16(vu2_l, dec128); \ |
11216 |
+ vv2_l = _mm_sub_epi16(vv2_l, dec128); \ |
11217 |
+ \ |
11218 |
+ hasAlpha( \ |
11219 |
+ SETUP2(abuf, i * 2, vyalpha, A_l); \ |
11220 |
+ ) \ |
11221 |
+ \ |
11222 |
+ vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \ |
11223 |
+ vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \ |
11224 |
+ \ |
11225 |
+ CALC2_RGB \ |
11226 |
+ WRITE2_##fmt(R, B) \ |
11227 |
+ \ |
11228 |
+ SETUP2(buf, i * 2 + 8, vyalpha, vy_l); \ |
11229 |
+ vy_l = _mm_srai_epi16(vy_l, 5); \ |
11230 |
+ \ |
11231 |
+ hasAlpha( \ |
11232 |
+ SETUP2(abuf, i * 2 + 8, vyalpha, A_l); \ |
11233 |
+ ) \ |
11234 |
+ \ |
11235 |
+ vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \ |
11236 |
+ vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \ |
11237 |
+ \ |
11238 |
+ CALC2_RGB \ |
11239 |
+ WRITE2_##fmt(R, B) \ |
11240 |
+ } \ |
11241 |
+} |
11242 |
+ |
11243 |
+#define YUV2RGBWRAPPER1(ext, fmt, R, B, hasAlpha) \ |
11244 |
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \ |
11245 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11246 |
+ const int16_t *abuf0, uint8_t *dest, int dstW, \ |
11247 |
+ int uvalpha, int y) \ |
11248 |
+{ \ |
11249 |
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \ |
11250 |
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \ |
11251 |
+ int uvshr = uvalpha < 2048 ? 5 : 6; \ |
11252 |
+ vec_s32 vy_l, vu_l, vv_l, vu2_l, vv2_l; \ |
11253 |
+ vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \ |
11254 |
+ vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \ |
11255 |
+ vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \ |
11256 |
+ vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \ |
11257 |
+ vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \ |
11258 |
+ vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \ |
11259 |
+ vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \ |
11260 |
+ vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \ |
11261 |
+ hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \ |
11262 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11263 |
+ int i; \ |
11264 |
+ INIT2_##fmt(R, B) \ |
11265 |
+ \ |
11266 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11267 |
+ v0 = VEC_LD(buf0 + i * 2); \ |
11268 |
+ vy_l = _mm_srai_epi16(v0, 5); \ |
11269 |
+ \ |
11270 |
+ v0 = VEC_LD(ubuf0 + i); \ |
11271 |
+ v1 = VEC_LD(vbuf0 + i); \ |
11272 |
+ if (uvalpha >= 2048) { \ |
11273 |
+ v2 = VEC_LD(ubuf1 + i); \ |
11274 |
+ v3 = VEC_LD(vbuf1 + i); \ |
11275 |
+ v0 = _mm_add_epi16(v0, v2); \ |
11276 |
+ v1 = _mm_add_epi16(v1, v3); \ |
11277 |
+ } \ |
11278 |
+ v0 = _mm_sub_epi16(v0, uvsub); \ |
11279 |
+ v1 = _mm_sub_epi16(v1, uvsub); \ |
11280 |
+ vu2_l = _mm_srai_epi16(v0, uvshr); \ |
11281 |
+ vv2_l = _mm_srai_epi16(v1, uvshr); \ |
11282 |
+ \ |
11283 |
+ hasAlpha( \ |
11284 |
+ A_l = VEC_LD(abuf0 + i * 2); \ |
11285 |
+ A_l = _mm_add_epi16(A_l, add64); \ |
11286 |
+ A_l = _mm_srai_epi16(A_l, 7); \ |
11287 |
+ ) \ |
11288 |
+ \ |
11289 |
+ vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \ |
11290 |
+ vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \ |
11291 |
+ \ |
11292 |
+ CALC2_RGB \ |
11293 |
+ WRITE2_##fmt(R, B) \ |
11294 |
+ \ |
11295 |
+ v0 = VEC_LD(buf0 + i * 2 + 8); \ |
11296 |
+ vy_l = _mm_srai_epi16(v0, 5); \ |
11297 |
+ \ |
11298 |
+ hasAlpha( \ |
11299 |
+ A_l = VEC_LD(abuf0 + i * 2 + 8); \ |
11300 |
+ A_l = _mm_add_epi16(A_l, add64); \ |
11301 |
+ A_l = _mm_srai_epi16(A_l, 7); \ |
11302 |
+ ) \ |
11303 |
+ \ |
11304 |
+ vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \ |
11305 |
+ vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \ |
11306 |
+ \ |
11307 |
+ CALC2_RGB \ |
11308 |
+ WRITE2_##fmt(R, B) \ |
11309 |
+ } \ |
11310 |
+} |
11311 |
+ |
11312 |
+#else // quality |
11313 |
+ |
11314 |
+#define YUV2RGBWRAPPERX(ext, fmt, R, B, hasAlpha) \ |
11315 |
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \ |
11316 |
+ const int16_t **lumSrc, int lumFilterSize, \ |
11317 |
+ const int16_t *chrFilter, const int16_t **chrUSrc, \ |
11318 |
+ const int16_t **chrVSrc, int chrFilterSize, \ |
11319 |
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \ |
11320 |
+ int y) \ |
11321 |
+{ \ |
11322 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \ |
11323 |
+ vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \ |
11324 |
+ vec_s32 ystart = _mm_set1_epi32(1 << 9); \ |
11325 |
+ vec_s32 uvstart = _mm_set1_epi32((1 << 9) - (128 << 19)); \ |
11326 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \ |
11327 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
11328 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \ |
11329 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \ |
11330 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \ |
11331 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \ |
11332 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11333 |
+ int i, j; \ |
11334 |
+ INIT_##fmt(R, B) \ |
11335 |
+ \ |
11336 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11337 |
+ vy_l = vy_h = ystart; \ |
11338 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
11339 |
+ v0 = VEC_LD(lumSrc[j] + i * 2); \ |
11340 |
+ v1 = _mm_set1_epi16(lumFilter[j]); \ |
11341 |
+ v2 = _mm_mullo_epi16(v0, v1); \ |
11342 |
+ v3 = _mm_mulhi_epi16(v0, v1); \ |
11343 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11344 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11345 |
+ vy_l = _mm_add_epi32(vy_l, v0); \ |
11346 |
+ vy_h = _mm_add_epi32(vy_h, v1); \ |
11347 |
+ } \ |
11348 |
+ vy_l = _mm_srai_epi32(vy_l, 10); \ |
11349 |
+ vy_h = _mm_srai_epi32(vy_h, 10); \ |
11350 |
+ \ |
11351 |
+ vu2_l = vu2_h = vv2_l = vv2_h = uvstart; \ |
11352 |
+ for (j = 0; j < chrFilterSize; j++) { \ |
11353 |
+ v0 = VEC_LD(chrUSrc[j] + i); \ |
11354 |
+ v1 = VEC_LD(chrVSrc[j] + i); \ |
11355 |
+ v5 = _mm_set1_epi16(chrFilter[j]); \ |
11356 |
+ v2 = _mm_mullo_epi16(v0, v5); \ |
11357 |
+ v3 = _mm_mulhi_epi16(v0, v5); \ |
11358 |
+ v4 = _mm_mullo_epi16(v1, v5); \ |
11359 |
+ v5 = _mm_mulhi_epi16(v1, v5); \ |
11360 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11361 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11362 |
+ v2 = _mm_unpacklo_epi16(v4, v5); \ |
11363 |
+ v3 = _mm_unpackhi_epi16(v4, v5); \ |
11364 |
+ vu2_l = _mm_add_epi32(vu2_l, v0); \ |
11365 |
+ vu2_h = _mm_add_epi32(vu2_h, v1); \ |
11366 |
+ vv2_l = _mm_add_epi32(vv2_l, v2); \ |
11367 |
+ vv2_h = _mm_add_epi32(vv2_h, v3); \ |
11368 |
+ } \ |
11369 |
+ vu2_l = _mm_srai_epi32(vu2_l, 10); \ |
11370 |
+ vu2_h = _mm_srai_epi32(vu2_h, 10); \ |
11371 |
+ vv2_l = _mm_srai_epi32(vv2_l, 10); \ |
11372 |
+ vv2_h = _mm_srai_epi32(vv2_h, 10); \ |
11373 |
+ \ |
11374 |
+ vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \ |
11375 |
+ vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \ |
11376 |
+ vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \ |
11377 |
+ vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \ |
11378 |
+ \ |
11379 |
+ CALC_RGB \ |
11380 |
+ WRITE_##fmt(R, B) \ |
11381 |
+ \ |
11382 |
+ vy_l = vy_h = ystart; \ |
11383 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
11384 |
+ v0 = VEC_LD(lumSrc[j] + i * 2 + 8); \ |
11385 |
+ v1 = _mm_set1_epi16(lumFilter[j]); \ |
11386 |
+ v2 = _mm_mullo_epi16(v0, v1); \ |
11387 |
+ v3 = _mm_mulhi_epi16(v0, v1); \ |
11388 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11389 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11390 |
+ vy_l = _mm_add_epi32(vy_l, v0); \ |
11391 |
+ vy_h = _mm_add_epi32(vy_h, v1); \ |
11392 |
+ } \ |
11393 |
+ vy_l = _mm_srai_epi32(vy_l, 10); \ |
11394 |
+ vy_h = _mm_srai_epi32(vy_h, 10); \ |
11395 |
+ \ |
11396 |
+ vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \ |
11397 |
+ vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \ |
11398 |
+ vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \ |
11399 |
+ vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \ |
11400 |
+ \ |
11401 |
+ CALC_RGB \ |
11402 |
+ WRITE_##fmt(R, B) \ |
11403 |
+ } \ |
11404 |
+} |
11405 |
+ |
11406 |
+#define YUV2RGBWRAPPER2(ext, fmt, R, B, hasAlpha) \ |
11407 |
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \ |
11408 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11409 |
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \ |
11410 |
+ int yalpha, int uvalpha, int y) \ |
11411 |
+{ \ |
11412 |
+ const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \ |
11413 |
+ *buf0 = buf[0], *buf1 = buf[1], \ |
11414 |
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \ |
11415 |
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \ |
11416 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \ |
11417 |
+ vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \ |
11418 |
+ vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \ |
11419 |
+ vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \ |
11420 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \ |
11421 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
11422 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \ |
11423 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \ |
11424 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \ |
11425 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \ |
11426 |
+ vec_s32 dec128 = _mm_set1_epi32(128 << 19); \ |
11427 |
+ hasAlpha(vec_s32 add18 = _mm_set1_epi32(1 << 18);) \ |
11428 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11429 |
+ int i; \ |
11430 |
+ INIT_##fmt(R, B) \ |
11431 |
+ \ |
11432 |
+ av_assert2(yalpha <= 4096U); \ |
11433 |
+ av_assert2(uvalpha <= 4096U); \ |
11434 |
+ \ |
11435 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11436 |
+ SETUP(buf, i * 2, vyalpha, v0, v1); \ |
11437 |
+ vy_l = _mm_srai_epi32(v0, 10); \ |
11438 |
+ vy_h = _mm_srai_epi32(v1, 10); \ |
11439 |
+ \ |
11440 |
+ SETUP(ubuf, i, vuvalpha, v0, v1); \ |
11441 |
+ vu2_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \ |
11442 |
+ vu2_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \ |
11443 |
+ \ |
11444 |
+ SETUP(vbuf, i, vuvalpha, v0, v1); \ |
11445 |
+ vv2_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \ |
11446 |
+ vv2_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \ |
11447 |
+ \ |
11448 |
+ hasAlpha( \ |
11449 |
+ SETUP(abuf, i * 2, vyalpha, v0, v1); \ |
11450 |
+ A_l = _mm_add_epi32(v0, add18); \ |
11451 |
+ A_h = _mm_add_epi32(v1, add18); \ |
11452 |
+ ) \ |
11453 |
+ \ |
11454 |
+ vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \ |
11455 |
+ vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \ |
11456 |
+ vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \ |
11457 |
+ vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \ |
11458 |
+ \ |
11459 |
+ CALC_RGB \ |
11460 |
+ WRITE_##fmt(R, B) \ |
11461 |
+ \ |
11462 |
+ SETUP(buf, i * 2 + 8, vyalpha, v0, v1); \ |
11463 |
+ vy_l = _mm_srai_epi32(v0, 10); \ |
11464 |
+ vy_h = _mm_srai_epi32(v1, 10); \ |
11465 |
+ \ |
11466 |
+ hasAlpha( \ |
11467 |
+ SETUP(abuf, i * 2 + 8, vyalpha, v0, v1); \ |
11468 |
+ A_l = _mm_add_epi32(v0, add18); \ |
11469 |
+ A_h = _mm_add_epi32(v1, add18); \ |
11470 |
+ ) \ |
11471 |
+ \ |
11472 |
+ vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \ |
11473 |
+ vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \ |
11474 |
+ vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \ |
11475 |
+ vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \ |
11476 |
+ \ |
11477 |
+ CALC_RGB \ |
11478 |
+ WRITE_##fmt(R, B) \ |
11479 |
+ } \ |
11480 |
+} |
11481 |
+ |
11482 |
+#define YUV2RGBWRAPPER1(ext, fmt, R, B, hasAlpha) \ |
11483 |
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \ |
11484 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11485 |
+ const int16_t *abuf0, uint8_t *dest, int dstW, \ |
11486 |
+ int uvalpha, int y) \ |
11487 |
+{ \ |
11488 |
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \ |
11489 |
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \ |
11490 |
+ int uvshl = uvalpha < 2048 ? 2 : 1; \ |
11491 |
+ vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \ |
11492 |
+ vec_s32 R_l, R_h, G_l, G_h, B_l, B_h; \ |
11493 |
+ vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff * 4); \ |
11494 |
+ vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \ |
11495 |
+ vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff << uvshl); \ |
11496 |
+ vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff << uvshl); \ |
11497 |
+ vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff << uvshl); \ |
11498 |
+ vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff << uvshl); \ |
11499 |
+ vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \ |
11500 |
+ hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \ |
11501 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11502 |
+ int i; \ |
11503 |
+ INIT_##fmt(R, B) \ |
11504 |
+ \ |
11505 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11506 |
+ v0 = VEC_LD(buf0 + i * 2); \ |
11507 |
+ v2 = _mm_unpacklo_epi16(v0, v0); \ |
11508 |
+ v3 = _mm_unpackhi_epi16(v0, v0); \ |
11509 |
+ vy_l = _mm_srai_epi32(v2, 16); \ |
11510 |
+ vy_h = _mm_srai_epi32(v3, 16); \ |
11511 |
+ \ |
11512 |
+ v0 = VEC_LD(ubuf0 + i); \ |
11513 |
+ v1 = VEC_LD(vbuf0 + i); \ |
11514 |
+ if (uvalpha >= 2048) { \ |
11515 |
+ v2 = VEC_LD(ubuf1 + i); \ |
11516 |
+ v3 = VEC_LD(vbuf1 + i); \ |
11517 |
+ v0 = _mm_add_epi16(v0, v2); \ |
11518 |
+ v1 = _mm_add_epi16(v1, v3); \ |
11519 |
+ } \ |
11520 |
+ v0 = _mm_sub_epi16(v0, uvsub); \ |
11521 |
+ v1 = _mm_sub_epi16(v1, uvsub); \ |
11522 |
+ v2 = _mm_unpacklo_epi16(v0, v0); \ |
11523 |
+ v3 = _mm_unpackhi_epi16(v0, v0); \ |
11524 |
+ vu2_l = _mm_srai_epi32(v2, 16); \ |
11525 |
+ vu2_h = _mm_srai_epi32(v3, 16); \ |
11526 |
+ v2 = _mm_unpacklo_epi16(v1, v1); \ |
11527 |
+ v3 = _mm_unpackhi_epi16(v1, v1); \ |
11528 |
+ vv2_l = _mm_srai_epi32(v2, 16); \ |
11529 |
+ vv2_h = _mm_srai_epi32(v3, 16); \ |
11530 |
+ \ |
11531 |
+ hasAlpha( \ |
11532 |
+ A_l = VEC_LD(abuf0 + i * 2); \ |
11533 |
+ A_l = _mm_add_epi16(A_l, add64); \ |
11534 |
+ A_l = _mm_srai_epi16(A_l, 7); \ |
11535 |
+ ) \ |
11536 |
+ \ |
11537 |
+ vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \ |
11538 |
+ vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \ |
11539 |
+ vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \ |
11540 |
+ vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \ |
11541 |
+ \ |
11542 |
+ CALC_RGB \ |
11543 |
+ WRITE_##fmt(R, B) \ |
11544 |
+ \ |
11545 |
+ v0 = VEC_LD(buf0 + i * 2 + 8); \ |
11546 |
+ v2 = _mm_unpacklo_epi16(v0, v0); \ |
11547 |
+ v3 = _mm_unpackhi_epi16(v0, v0); \ |
11548 |
+ vy_l = _mm_srai_epi32(v2, 16); \ |
11549 |
+ vy_h = _mm_srai_epi32(v3, 16); \ |
11550 |
+ \ |
11551 |
+ hasAlpha( \ |
11552 |
+ A_l = VEC_LD(abuf0 + i * 2 + 8); \ |
11553 |
+ A_l = _mm_add_epi16(A_l, add64); \ |
11554 |
+ A_l = _mm_srai_epi16(A_l, 7); \ |
11555 |
+ ) \ |
11556 |
+ \ |
11557 |
+ vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \ |
11558 |
+ vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \ |
11559 |
+ vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \ |
11560 |
+ vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \ |
11561 |
+ \ |
11562 |
+ CALC_RGB \ |
11563 |
+ WRITE_##fmt(R, B) \ |
11564 |
+ } \ |
11565 |
+} |
11566 |
+ |
11567 |
+#endif |
11568 |
+ |
11569 |
+YUV2RGBWRAPPERX(rgbx32, RGBX, R, B, NO_ALPHA) |
11570 |
+YUV2RGBWRAPPERX(bgrx32, RGBX, B, R, NO_ALPHA) |
11571 |
+YUV2RGBWRAPPERX(xrgb32, XRGB, R, B, NO_ALPHA) |
11572 |
+YUV2RGBWRAPPERX(xbgr32, XRGB, B, R, NO_ALPHA) |
11573 |
+YUV2RGBWRAPPERX(rgb24, RGB, R, B, NO_ALPHA) |
11574 |
+YUV2RGBWRAPPERX(bgr24, RGB, B, R, NO_ALPHA) |
11575 |
+ |
11576 |
+YUV2RGBWRAPPER2(rgbx32, RGBX, R, B, NO_ALPHA) |
11577 |
+YUV2RGBWRAPPER2(bgrx32, RGBX, B, R, NO_ALPHA) |
11578 |
+YUV2RGBWRAPPER2(xrgb32, XRGB, R, B, NO_ALPHA) |
11579 |
+YUV2RGBWRAPPER2(xbgr32, XRGB, B, R, NO_ALPHA) |
11580 |
+YUV2RGBWRAPPER2(rgb24, RGB, R, B, NO_ALPHA) |
11581 |
+YUV2RGBWRAPPER2(bgr24, RGB, B, R, NO_ALPHA) |
11582 |
+ |
11583 |
+YUV2RGBWRAPPER1(rgbx32, RGBX, R, B, NO_ALPHA) |
11584 |
+YUV2RGBWRAPPER1(bgrx32, RGBX, B, R, NO_ALPHA) |
11585 |
+YUV2RGBWRAPPER1(xrgb32, XRGB, R, B, NO_ALPHA) |
11586 |
+YUV2RGBWRAPPER1(xbgr32, XRGB, B, R, NO_ALPHA) |
11587 |
+YUV2RGBWRAPPER1(rgb24, RGB, R, B, NO_ALPHA) |
11588 |
+YUV2RGBWRAPPER1(bgr24, RGB, B, R, NO_ALPHA) |
11589 |
+ |
11590 |
+#define WRITE_422(vu, vv, x0, x1) \ |
11591 |
+ vy0 = _mm_srai_epi32(vy0, 19); \ |
11592 |
+ vy1 = _mm_srai_epi32(vy1, 19); \ |
11593 |
+ vy2 = _mm_srai_epi32(vy2, 19); \ |
11594 |
+ vy3 = _mm_srai_epi32(vy3, 19); \ |
11595 |
+ vu##0 = _mm_srai_epi32(vu##0, 19); \ |
11596 |
+ vu##1 = _mm_srai_epi32(vu##1, 19); \ |
11597 |
+ vv##0 = _mm_srai_epi32(vv##0, 19 - 16); \ |
11598 |
+ vv##1 = _mm_srai_epi32(vv##1, 19 - 16); \ |
11599 |
+ v0 = _mm_packs_epi32(vy0, vy1); \ |
11600 |
+ v1 = _mm_packs_epi32(vy2, vy3); \ |
11601 |
+ v2 = _mm_blend_epi16(vu##0, vv##0, 0xaa); \ |
11602 |
+ v3 = _mm_blend_epi16(vu##1, vv##1, 0xaa); \ |
11603 |
+ v4 = _mm_packus_epi16(v0, v1); \ |
11604 |
+ v5 = _mm_packus_epi16(v2, v3); \ |
11605 |
+ v0 = _mm_unpacklo_epi8(x0, x1); \ |
11606 |
+ v1 = _mm_unpackhi_epi8(x0, x1); \ |
11607 |
+ VEC_ST(dest, v0); \ |
11608 |
+ VEC_ST(dest + 16, v1); \ |
11609 |
+ dest += 32; |
11610 |
+ |
11611 |
+#define WRITE_YUYV422 WRITE_422(vu, vv, v4, v5) |
11612 |
+#define WRITE_YVYU422 WRITE_422(vv, vu, v4, v5) |
11613 |
+#define WRITE_UYVY422 WRITE_422(vu, vv, v5, v4) |
11614 |
+ |
11615 |
+#define YUV2PACKEDWRAPPERX(ext, fmt) \ |
11616 |
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \ |
11617 |
+ const int16_t **lumSrc, int lumFilterSize, \ |
11618 |
+ const int16_t *chrFilter, const int16_t **chrUSrc, \ |
11619 |
+ const int16_t **chrVSrc, int chrFilterSize, \ |
11620 |
+ const int16_t **alpSrc, uint8_t *dest, int dstW, \ |
11621 |
+ int y) \ |
11622 |
+{ \ |
11623 |
+ int i, j; \ |
11624 |
+ __m128i vy0, vy1, vy2, vy3, vu0, vu1, vv0, vv1; \ |
11625 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11626 |
+ vec_s32 start = _mm_set1_epi32(1 << 18); \ |
11627 |
+ \ |
11628 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11629 |
+ vy0 = vy1 = vy2 = vy3 = start; \ |
11630 |
+ for (j = 0; j < lumFilterSize; j++) { \ |
11631 |
+ v0 = VEC_LD(lumSrc[j] + i * 2); \ |
11632 |
+ v1 = VEC_LD(lumSrc[j] + i * 2 + 8); \ |
11633 |
+ v5 = _mm_set1_epi16(lumFilter[j]); \ |
11634 |
+ v2 = _mm_mullo_epi16(v0, v5); \ |
11635 |
+ v3 = _mm_mulhi_epi16(v0, v5); \ |
11636 |
+ v4 = _mm_mullo_epi16(v1, v5); \ |
11637 |
+ v5 = _mm_mulhi_epi16(v1, v5); \ |
11638 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11639 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11640 |
+ v2 = _mm_unpacklo_epi16(v4, v5); \ |
11641 |
+ v3 = _mm_unpackhi_epi16(v4, v5); \ |
11642 |
+ vy0 = _mm_add_epi32(vy0, v0); \ |
11643 |
+ vy1 = _mm_add_epi32(vy1, v1); \ |
11644 |
+ vy2 = _mm_add_epi32(vy2, v2); \ |
11645 |
+ vy3 = _mm_add_epi32(vy3, v3); \ |
11646 |
+ } \ |
11647 |
+ \ |
11648 |
+ vu0 = vu1 = vv0 = vv1 = start; \ |
11649 |
+ for (j = 0; j < chrFilterSize; j++) { \ |
11650 |
+ v0 = VEC_LD(chrUSrc[j] + i); \ |
11651 |
+ v1 = VEC_LD(chrVSrc[j] + i); \ |
11652 |
+ v5 = _mm_set1_epi16(chrFilter[j]); \ |
11653 |
+ v2 = _mm_mullo_epi16(v0, v5); \ |
11654 |
+ v3 = _mm_mulhi_epi16(v0, v5); \ |
11655 |
+ v4 = _mm_mullo_epi16(v1, v5); \ |
11656 |
+ v5 = _mm_mulhi_epi16(v1, v5); \ |
11657 |
+ v0 = _mm_unpacklo_epi16(v2, v3); \ |
11658 |
+ v1 = _mm_unpackhi_epi16(v2, v3); \ |
11659 |
+ v2 = _mm_unpacklo_epi16(v4, v5); \ |
11660 |
+ v3 = _mm_unpackhi_epi16(v4, v5); \ |
11661 |
+ vu0 = _mm_add_epi32(vu0, v0); \ |
11662 |
+ vu1 = _mm_add_epi32(vu1, v1); \ |
11663 |
+ vv0 = _mm_add_epi32(vv0, v2); \ |
11664 |
+ vv1 = _mm_add_epi32(vv1, v3); \ |
11665 |
+ } \ |
11666 |
+ \ |
11667 |
+ WRITE_##fmt##422 \ |
11668 |
+ } \ |
11669 |
+} |
11670 |
+ |
11671 |
+#define YUV2PACKEDWRAPPER2(ext, fmt) \ |
11672 |
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \ |
11673 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11674 |
+ const int16_t *abuf[2], uint8_t *dest, int dstW, \ |
11675 |
+ int yalpha, int uvalpha, int y) \ |
11676 |
+{ \ |
11677 |
+ const int16_t *buf0 = buf[0], *buf1 = buf[1], \ |
11678 |
+ *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \ |
11679 |
+ *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \ |
11680 |
+ vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \ |
11681 |
+ vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \ |
11682 |
+ __m128i vy0, vy1, vy2, vy3, vu0, vu1, vv0, vv1; \ |
11683 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11684 |
+ int i; \ |
11685 |
+ av_assert2(yalpha <= 4096U); \ |
11686 |
+ av_assert2(uvalpha <= 4096U); \ |
11687 |
+ \ |
11688 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11689 |
+ SETUP(buf, i * 2, vyalpha, vy0, vy1); \ |
11690 |
+ SETUP(buf, i * 2 + 8, vyalpha, vy2, vy3); \ |
11691 |
+ SETUP(ubuf, i, vuvalpha, vu0, vu1); \ |
11692 |
+ SETUP(vbuf, i, vuvalpha, vv0, vv1); \ |
11693 |
+ \ |
11694 |
+ WRITE_##fmt##422 \ |
11695 |
+ } \ |
11696 |
+} |
11697 |
+ |
11698 |
+#define INIT1_422 __m128i blenduv = _mm_set1_epi16(255); |
11699 |
+ |
11700 |
+#define WRITE1_422(vu, vv, x0, x1) \ |
11701 |
+ v5 = _mm_srli_epi16(vu, 8); \ |
11702 |
+ v4 = _mm_packus_epi16(vy0, vy1); \ |
11703 |
+ v5 = _mm_blendv_epi8(vv, v5, blenduv); \ |
11704 |
+ v0 = _mm_unpacklo_epi8(x0, x1); \ |
11705 |
+ v1 = _mm_unpackhi_epi8(x0, x1); \ |
11706 |
+ VEC_ST(dest, v0); \ |
11707 |
+ VEC_ST(dest + 16, v1); \ |
11708 |
+ dest += 32; |
11709 |
+ |
11710 |
+#define WRITE1_YUYV422 WRITE1_422(vu, vv, v4, v5) |
11711 |
+#define WRITE1_YVYU422 WRITE1_422(vv, vu, v4, v5) |
11712 |
+#define WRITE1_UYVY422 WRITE1_422(vu, vv, v5, v4) |
11713 |
+ |
11714 |
+#define YUV2PACKEDWRAPPER1(ext, fmt) \ |
11715 |
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \ |
11716 |
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \ |
11717 |
+ const int16_t *abuf0, uint8_t *dest, int dstW, \ |
11718 |
+ int uvalpha, int y) \ |
11719 |
+{ \ |
11720 |
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \ |
11721 |
+ vec_s16 vy0, vy1, vu, vv; \ |
11722 |
+ vec_s16 add64 = _mm_set1_epi16(64); \ |
11723 |
+ int i; \ |
11724 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
11725 |
+ LOAD_ZERO; \ |
11726 |
+ INIT1_422 \ |
11727 |
+ \ |
11728 |
+ if (uvalpha < 2048) { \ |
11729 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11730 |
+ vy0 = VEC_LD(buf0 + i * 2); \ |
11731 |
+ vy1 = VEC_LD(buf0 + i * 2 + 8); \ |
11732 |
+ vu = VEC_LD(ubuf0 + i); \ |
11733 |
+ vv = VEC_LD(vbuf0 + i); \ |
11734 |
+ vy0 = _mm_adds_epi16(vy0, add64); \ |
11735 |
+ vy1 = _mm_adds_epi16(vy1, add64); \ |
11736 |
+ vu = _mm_max_epi16(vu, zerov); \ |
11737 |
+ vv = _mm_max_epi16(vv, zerov); \ |
11738 |
+ vy0 = _mm_srai_epi16(vy0, 7); \ |
11739 |
+ vy1 = _mm_srai_epi16(vy1, 7); \ |
11740 |
+ vu = _mm_add_epi16(vu, add64); \ |
11741 |
+ vv = _mm_add_epi16(vv, add64); \ |
11742 |
+ vu = _mm_adds_epu16(vu, vu); \ |
11743 |
+ vv = _mm_adds_epu16(vv, vv); \ |
11744 |
+ \ |
11745 |
+ WRITE1_##fmt##422 \ |
11746 |
+ } \ |
11747 |
+ } else { \ |
11748 |
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \ |
11749 |
+ vec_s16 add128 = _mm_add_epi16(add64, add64); \ |
11750 |
+ for (i = 0; i < (dstW + 1) >> 1; i += 8) { \ |
11751 |
+ vy0 = VEC_LD(buf0 + i * 2); \ |
11752 |
+ vy1 = VEC_LD(buf0 + i * 2 + 8); \ |
11753 |
+ v0 = VEC_LD(ubuf0 + i); \ |
11754 |
+ v1 = VEC_LD(vbuf0 + i); \ |
11755 |
+ v2 = VEC_LD(ubuf1 + i); \ |
11756 |
+ v3 = VEC_LD(vbuf1 + i); \ |
11757 |
+ vy0 = _mm_adds_epi16(vy0, add64); \ |
11758 |
+ vy1 = _mm_adds_epi16(vy1, add64); \ |
11759 |
+ v0 = _mm_max_epi16(v0, zerov); \ |
11760 |
+ v1 = _mm_max_epi16(v1, zerov); \ |
11761 |
+ v2 = _mm_max_epi16(v2, zerov); \ |
11762 |
+ v3 = _mm_max_epi16(v3, zerov); \ |
11763 |
+ v0 = _mm_add_epi16(v0, add128); \ |
11764 |
+ v1 = _mm_add_epi16(v1, add128); \ |
11765 |
+ vy0 = _mm_srai_epi16(vy0, 7); \ |
11766 |
+ vy1 = _mm_srai_epi16(vy1, 7); \ |
11767 |
+ vu = _mm_adds_epu16(v0, v2); \ |
11768 |
+ vv = _mm_adds_epu16(v1, v3); \ |
11769 |
+ \ |
11770 |
+ WRITE1_##fmt##422 \ |
11771 |
+ } \ |
11772 |
+ } \ |
11773 |
+} |
11774 |
+ |
11775 |
+YUV2PACKEDWRAPPERX(yuyv422, YUYV) |
11776 |
+YUV2PACKEDWRAPPERX(yvyu422, YVYU) |
11777 |
+YUV2PACKEDWRAPPERX(uyvy422, UYVY) |
11778 |
+ |
11779 |
+YUV2PACKEDWRAPPER2(yuyv422, YUYV) |
11780 |
+YUV2PACKEDWRAPPER2(yvyu422, YVYU) |
11781 |
+YUV2PACKEDWRAPPER2(uyvy422, UYVY) |
11782 |
+ |
11783 |
+YUV2PACKEDWRAPPER1(yuyv422, YUYV) |
11784 |
+YUV2PACKEDWRAPPER1(yvyu422, YVYU) |
11785 |
+YUV2PACKEDWRAPPER1(uyvy422, UYVY) |
11786 |
+ |
11787 |
+#define HSCALE_INIT() \ |
11788 |
+ __m128i v0, v1, v2, v3, v4, v5, v6; \ |
11789 |
+ vec_u32 vadd = _mm_setr_epi32(0, xInc, xInc * 2, xInc * 3); \ |
11790 |
+ vec_u16 vadd16 = _mm_setr_epi16(0, xInc, xInc * 2, xInc * 3, \ |
11791 |
+ xInc * 4, xInc * 5, xInc * 6, xInc * 7) |
11792 |
+ |
11793 |
+#define HSCALE1() \ |
11794 |
+ v4 = _mm_set1_epi16(xpos); \ |
11795 |
+ v5 = _mm_set1_epi16(xpos + xInc * 8); \ |
11796 |
+ v4 = _mm_add_epi16(v4, vadd16); \ |
11797 |
+ v5 = _mm_add_epi16(v5, vadd16); \ |
11798 |
+ v4 = _mm_srli_epi16(v4, 9); \ |
11799 |
+ v5 = _mm_srli_epi16(v5, 9); \ |
11800 |
+ \ |
11801 |
+ v0 = _mm_set1_epi32(xpos & 0xffff); \ |
11802 |
+ v1 = _mm_set1_epi32((xpos & 0xffff) + xInc * 4); \ |
11803 |
+ v2 = _mm_set1_epi32((xpos & 0xffff) + xInc * 8); \ |
11804 |
+ v3 = _mm_set1_epi32((xpos & 0xffff) + xInc * 12); \ |
11805 |
+ v0 = _mm_add_epi32(v0, vadd); \ |
11806 |
+ v1 = _mm_add_epi32(v1, vadd); \ |
11807 |
+ v2 = _mm_add_epi32(v2, vadd); \ |
11808 |
+ v3 = _mm_add_epi32(v3, vadd); \ |
11809 |
+ v0 = _mm_srli_epi32(v0, 16); \ |
11810 |
+ v1 = _mm_srli_epi32(v1, 16); \ |
11811 |
+ v2 = _mm_srli_epi32(v2, 16); \ |
11812 |
+ v3 = _mm_srli_epi32(v3, 16); \ |
11813 |
+ v0 = _mm_packs_epi32(v0, v1); \ |
11814 |
+ v2 = _mm_packs_epi32(v2, v3); \ |
11815 |
+ v6 = _mm_packus_epi16(v0, v2); \ |
11816 |
+ \ |
11817 |
+ xx = xpos >> 16 |
11818 |
+ |
11819 |
+static void hyscale_fast_e2k(SwsContext *c, int16_t *dst, int dstWidth, |
11820 |
+ const uint8_t *src, int srcW, int xInc) |
11821 |
+{ |
11822 |
+ int i, xpos = 0, xx, a1; |
11823 |
+ LOAD_ZERO; |
11824 |
+ HSCALE_INIT(); |
11825 |
+ |
11826 |
+ for (i = 0; i < dstWidth; i += 16) { |
11827 |
+ HSCALE1(); |
11828 |
+ |
11829 |
+ v1 = VEC_LD(src + xx); |
11830 |
+ v3 = VEC_LD(src + xx + 1); |
11831 |
+ |
11832 |
+ v1 = _mm_shuffle_epi8(v1, v6); |
11833 |
+ v3 = _mm_shuffle_epi8(v3, v6); |
11834 |
+ v0 = _mm_unpacklo_epi8(v1, zerov); |
11835 |
+ v1 = _mm_unpackhi_epi8(v1, zerov); |
11836 |
+ v2 = _mm_unpacklo_epi8(v3, zerov); |
11837 |
+ v3 = _mm_unpackhi_epi8(v3, zerov); |
11838 |
+ v2 = _mm_sub_epi16(v2, v0); |
11839 |
+ v3 = _mm_sub_epi16(v3, v1); |
11840 |
+ v0 = _mm_slli_epi16(v0, 7); |
11841 |
+ v1 = _mm_slli_epi16(v1, 7); |
11842 |
+ v2 = _mm_mullo_epi16(v2, v4); |
11843 |
+ v3 = _mm_mullo_epi16(v3, v5); |
11844 |
+ v0 = _mm_add_epi16(v0, v2); |
11845 |
+ v1 = _mm_add_epi16(v1, v3); |
11846 |
+ |
11847 |
+ VEC_ST(dst + i, v0); |
11848 |
+ VEC_ST(dst + i + 8, v1); |
11849 |
+ xpos += xInc * 16; |
11850 |
+ } |
11851 |
+ |
11852 |
+ a1 = src[srcW - 1] * 128; |
11853 |
+ for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) |
11854 |
+ dst[i] = a1; |
11855 |
+} |
11856 |
+ |
11857 |
+#define HSCALE2() \ |
11858 |
+ v0 = _mm_shuffle_epi8(v0, v6); \ |
11859 |
+ v1 = _mm_shuffle_epi8(v1, v6); \ |
11860 |
+ v2 = _mm_unpacklo_epi8(v0, v1); \ |
11861 |
+ v3 = _mm_unpackhi_epi8(v0, v1); \ |
11862 |
+ v2 = _mm_maddubs_epi16(v2, v4); \ |
11863 |
+ v3 = _mm_maddubs_epi16(v3, v5) |
11864 |
+ |
11865 |
+static void hcscale_fast_e2k(SwsContext *c, int16_t *dst1, int16_t *dst2, |
11866 |
+ int dstWidth, const uint8_t *src1, |
11867 |
+ const uint8_t *src2, int srcW, int xInc) |
11868 |
+{ |
11869 |
+ int i, xpos = 0, xx, a1, a2; |
11870 |
+ HSCALE_INIT(); |
11871 |
+ __m128i xorv = _mm_set1_epi8(127); |
11872 |
+ |
11873 |
+ for (i = 0; i < dstWidth; i += 16) { |
11874 |
+ HSCALE1(); |
11875 |
+ |
11876 |
+ v0 = _mm_packus_epi16(v4, v5); |
11877 |
+ v1 = _mm_xor_si128(v0, xorv); |
11878 |
+ v4 = _mm_unpacklo_epi8(v1, v0); |
11879 |
+ v5 = _mm_unpackhi_epi8(v1, v0); |
11880 |
+ |
11881 |
+ v0 = VEC_LD(src1 + xx); |
11882 |
+ v1 = VEC_LD(src1 + xx + 1); |
11883 |
+ HSCALE2(); |
11884 |
+ v0 = VEC_LD(src2 + xx); |
11885 |
+ v1 = VEC_LD(src2 + xx + 1); |
11886 |
+ VEC_ST(dst1 + i, v2); |
11887 |
+ VEC_ST(dst1 + i + 8, v3); |
11888 |
+ HSCALE2(); |
11889 |
+ VEC_ST(dst2 + i, v2); |
11890 |
+ VEC_ST(dst2 + i + 8, v3); |
11891 |
+ xpos += xInc * 16; |
11892 |
+ } |
11893 |
+ |
11894 |
+ a1 = src1[srcW - 1] * 128; |
11895 |
+ a2 = src2[srcW - 1] * 128; |
11896 |
+ for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) { |
11897 |
+ dst1[i] = a1; |
11898 |
+ dst2[i] = a2; |
11899 |
+ } |
11900 |
+} |
11901 |
+ |
11902 |
+static void hScale8To19_e2k(SwsContext *c, int16_t *_dst, int dstW, |
11903 |
+ const uint8_t *src, const int16_t *filter, |
11904 |
+ const int32_t *filterPos, int filterSize) |
11905 |
+{ |
11906 |
+ int i, j; |
11907 |
+ int32_t *dst = (int32_t*)_dst; |
11908 |
+ LOAD_ZERO; |
11909 |
+ __m128i v0, v1, accv; |
11910 |
+ |
11911 |
+ if (filterSize == 1) { |
11912 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
11913 |
+ int val = 0, srcPos = filterPos[i]; |
11914 |
+ for (j = 0; j < filterSize; j++) { |
11915 |
+ val += (int)src[srcPos + j] * filter[j]; |
11916 |
+ } |
11917 |
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ... |
11918 |
+ } |
11919 |
+ } else { |
11920 |
+ __m64 h0, maskv; |
11921 |
+ uint64_t mask = ~0ll; |
11922 |
+ mask >>= (-filterSize & 7) * 8; |
11923 |
+ maskv = (__m64)mask; // 8, 1, 2, 3, 4, 5, 6, 7 |
11924 |
+ |
11925 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
11926 |
+ int val; |
11927 |
+ const uint8_t *srci = src + filterPos[i]; |
11928 |
+ accv = zerov; |
11929 |
+ for (j = 0; j + 7 < filterSize; j += 8) { |
11930 |
+ v0 = VEC_LD8(srci + j); |
11931 |
+ v1 = VEC_LD(filter + j); |
11932 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
11933 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
11934 |
+ } |
11935 |
+ if (filterSize & 7) { |
11936 |
+ h0 = *(__m64*)(srci + j); |
11937 |
+ // Remove the unused elements on the last round |
11938 |
+ h0 = _mm_and_si64(h0, maskv); |
11939 |
+ v0 = _mm_movpi64_epi64(h0); |
11940 |
+ v1 = VEC_LD(filter + j); |
11941 |
+ v0 = _mm_unpacklo_epi8(v0, zerov); |
11942 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
11943 |
+ } |
11944 |
+ accv = _mm_hadd_epi32(accv, accv); |
11945 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
11946 |
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); |
11947 |
+ } |
11948 |
+ } |
11949 |
+} |
11950 |
+ |
11951 |
+static void hScale16To19_e2k(SwsContext *c, int16_t *_dst, int dstW, |
11952 |
+ const uint8_t *_src, const int16_t *filter, |
11953 |
+ const int32_t *filterPos, int filterSize) |
11954 |
+{ |
11955 |
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat); |
11956 |
+ int i, j; |
11957 |
+ int32_t *dst = (int32_t*)_dst; |
11958 |
+ const uint16_t *src = (const uint16_t*)_src; |
11959 |
+ int bits = desc->comp[0].depth - 1; |
11960 |
+ int sh = bits - 4; |
11961 |
+ LOAD_ZERO; |
11962 |
+ __m128i v0, v1, accv; |
11963 |
+ |
11964 |
+ if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8) && desc->comp[0].depth < 16) { |
11965 |
+ sh = 9; |
11966 |
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */ |
11967 |
+ sh = 16 - 1 - 4; |
11968 |
+ } |
11969 |
+ |
11970 |
+ if (filterSize == 1) { |
11971 |
+ for (i = 0; i < dstW; i++) { |
11972 |
+ int val = 0, srcPos = filterPos[i]; |
11973 |
+ for (j = 0; j < filterSize; j++) { |
11974 |
+ val += (int)src[srcPos + j] * filter[filterSize * i + j]; |
11975 |
+ } |
11976 |
+ // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit |
11977 |
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1); |
11978 |
+ } |
11979 |
+ } else { |
11980 |
+ __m128i maskv, signv = _mm_set1_epi16(-0x8000), initv = zerov; |
11981 |
+ uint64_t mask = ~0ll; |
11982 |
+ mask >>= (-filterSize & 7) * 8; |
11983 |
+ maskv = _mm_movpi64_epi64((__m64)mask); |
11984 |
+ maskv = _mm_unpacklo_epi8(maskv, maskv); |
11985 |
+ |
11986 |
+ for (j = 0; j + 7 < filterSize; j += 8) { |
11987 |
+ v1 = VEC_LD(filter + j); |
11988 |
+ initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1)); |
11989 |
+ } |
11990 |
+ if (filterSize & 7) { |
11991 |
+ v1 = VEC_LD(filter + j); |
11992 |
+ v1 = _mm_and_si128(v1, maskv); |
11993 |
+ initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1)); |
11994 |
+ } |
11995 |
+ |
11996 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
11997 |
+ int val; |
11998 |
+ const int16_t *srci = src + filterPos[i]; |
11999 |
+ accv = initv; |
12000 |
+ for (j = 0; j + 7 < filterSize; j += 8) { |
12001 |
+ v0 = VEC_LD(srci + j); |
12002 |
+ v0 = _mm_xor_si128(v0, signv); |
12003 |
+ v1 = VEC_LD(filter + j); |
12004 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
12005 |
+ } |
12006 |
+ if (filterSize & 7) { |
12007 |
+ v0 = VEC_LD(srci + j); |
12008 |
+ v0 = _mm_xor_si128(v0, signv); |
12009 |
+ v1 = VEC_LD(filter + j); |
12010 |
+ // Remove the unused elements on the last round |
12011 |
+ v1 = _mm_and_si128(v1, maskv); |
12012 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
12013 |
+ } |
12014 |
+ accv = _mm_hadd_epi32(accv, accv); |
12015 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
12016 |
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1); |
12017 |
+ } |
12018 |
+ } |
12019 |
+} |
12020 |
+ |
12021 |
+static void hScale16To15_e2k(SwsContext *c, int16_t *dst, int dstW, |
12022 |
+ const uint8_t *_src, const int16_t *filter, |
12023 |
+ const int32_t *filterPos, int filterSize) |
12024 |
+{ |
12025 |
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat); |
12026 |
+ int i, j; |
12027 |
+ const uint16_t *src = (const uint16_t*)_src; |
12028 |
+ int sh = desc->comp[0].depth - 1; |
12029 |
+ LOAD_ZERO; |
12030 |
+ __m128i v0, v1, accv; |
12031 |
+ |
12032 |
+ if (sh < 15) { |
12033 |
+ sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1); |
12034 |
+ } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */ |
12035 |
+ sh = 16 - 1; |
12036 |
+ } |
12037 |
+ |
12038 |
+ if (filterSize == 1) { |
12039 |
+ for (i = 0; i < dstW; i++) { |
12040 |
+ int val = 0, srcPos = filterPos[i]; |
12041 |
+ for (j = 0; j < filterSize; j++) { |
12042 |
+ val += (int)src[srcPos + j] * filter[filterSize * i + j]; |
12043 |
+ } |
12044 |
+ // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit |
12045 |
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1); |
12046 |
+ } |
12047 |
+ } else { |
12048 |
+ __m128i maskv, signv = _mm_set1_epi16(-0x8000), initv = zerov; |
12049 |
+ uint64_t mask = ~0ll; |
12050 |
+ mask >>= (-filterSize & 7) * 8; |
12051 |
+ maskv = _mm_movpi64_epi64((__m64)mask); |
12052 |
+ maskv = _mm_unpacklo_epi8(maskv, maskv); |
12053 |
+ |
12054 |
+ for (j = 0; j + 7 < filterSize; j += 8) { |
12055 |
+ v1 = VEC_LD(filter + j); |
12056 |
+ initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1)); |
12057 |
+ } |
12058 |
+ if (filterSize & 7) { |
12059 |
+ v1 = VEC_LD(filter + j); |
12060 |
+ v1 = _mm_and_si128(v1, maskv); |
12061 |
+ initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1)); |
12062 |
+ } |
12063 |
+ |
12064 |
+ for (i = 0; i < dstW; i++, filter += filterSize) { |
12065 |
+ int val; |
12066 |
+ const int16_t *srci = src + filterPos[i]; |
12067 |
+ accv = initv; |
12068 |
+ for (j = 0; j + 7 < filterSize; j += 8) { |
12069 |
+ v0 = VEC_LD(srci + j); |
12070 |
+ v0 = _mm_xor_si128(v0, signv); |
12071 |
+ v1 = VEC_LD(filter + j); |
12072 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
12073 |
+ } |
12074 |
+ if (filterSize & 7) { |
12075 |
+ v0 = VEC_LD(srci + j); |
12076 |
+ v0 = _mm_xor_si128(v0, signv); |
12077 |
+ // Remove the unused elements on the last round |
12078 |
+ v1 = VEC_LD(filter + j); |
12079 |
+ v1 = _mm_and_si128(v1, maskv); |
12080 |
+ accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1)); |
12081 |
+ } |
12082 |
+ accv = _mm_hadd_epi32(accv, accv); |
12083 |
+ val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1); |
12084 |
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1); |
12085 |
+ } |
12086 |
+ } |
12087 |
+} |
12088 |
+ |
12089 |
+av_cold void ff_sws_init_swscale_e2k(SwsContext *c) |
12090 |
+{ |
12091 |
+ enum AVPixelFormat dstFormat = c->dstFormat; |
12092 |
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat); |
12093 |
+ |
12094 |
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K)) |
12095 |
+ return; |
12096 |
+ |
12097 |
+ if (dstFormat == AV_PIX_FMT_P010LE || dstFormat == AV_PIX_FMT_P010BE) { |
12098 |
+ // c->yuv2plane1 = isBE(dstFormat) ? yuv2p010l1_BE_e2k : yuv2p010l1_LE_e2k; |
12099 |
+ // c->yuv2planeX = isBE(dstFormat) ? yuv2p010lX_BE_e2k : yuv2p010lX_LE_e2k; |
12100 |
+ // c->yuv2nv12cX = yuv2p010cX_e2k; |
12101 |
+ } else if (is16BPS(dstFormat)) { |
12102 |
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_e2k : yuv2planeX_16LE_e2k; |
12103 |
+ c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_e2k : yuv2plane1_16LE_e2k; |
12104 |
+ if (dstFormat == AV_PIX_FMT_P016LE || dstFormat == AV_PIX_FMT_P016BE) { |
12105 |
+ // c->yuv2nv12cX = yuv2p016cX_e2k; |
12106 |
+ } |
12107 |
+ } else if (isNBPS(dstFormat)) { |
12108 |
+ if (desc->comp[0].depth == 9) { |
12109 |
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_e2k : yuv2planeX_9LE_e2k; |
12110 |
+ c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_e2k : yuv2plane1_9LE_e2k; |
12111 |
+ } else if (desc->comp[0].depth == 10) { |
12112 |
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_e2k : yuv2planeX_10LE_e2k; |
12113 |
+ c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_e2k : yuv2plane1_10LE_e2k; |
12114 |
+ } else if (desc->comp[0].depth == 12) { |
12115 |
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_e2k : yuv2planeX_12LE_e2k; |
12116 |
+ c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_e2k : yuv2plane1_12LE_e2k; |
12117 |
+ } else if (desc->comp[0].depth == 14) { |
12118 |
+ c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_e2k : yuv2planeX_14LE_e2k; |
12119 |
+ c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_e2k : yuv2plane1_14LE_e2k; |
12120 |
+ } else |
12121 |
+ av_assert0(0); |
12122 |
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) { |
12123 |
+ // c->yuv2planeX = yuv2planeX_floatBE_e2k; |
12124 |
+ c->yuv2plane1 = yuv2plane1_floatBE_e2k; |
12125 |
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) { |
12126 |
+ // c->yuv2planeX = yuv2planeX_floatLE_e2k; |
12127 |
+ c->yuv2plane1 = yuv2plane1_floatLE_e2k; |
12128 |
+ } else { |
12129 |
+ c->yuv2plane1 = yuv2plane1_8_e2k; |
12130 |
+ c->yuv2planeX = yuv2planeX_8_e2k; |
12131 |
+#if 0 |
12132 |
+ if (dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21 || |
12133 |
+ dstFormat == AV_PIX_FMT_NV24 || dstFormat == AV_PIX_FMT_NV42) |
12134 |
+ c->yuv2nv12cX = yuv2nv12cX_e2k; |
12135 |
+#endif |
12136 |
+ } |
12137 |
+ |
12138 |
+ if (c->srcBpc == 8) { |
12139 |
+ if (c->dstBpc <= 14) { |
12140 |
+ c->hyScale = c->hcScale = hScale_real_e2k; |
12141 |
+ if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) { |
12142 |
+ c->hyscale_fast = hyscale_fast_e2k; |
12143 |
+ c->hcscale_fast = hcscale_fast_e2k; |
12144 |
+ } |
12145 |
+ } else { |
12146 |
+ c->hyScale = c->hcScale = hScale8To19_e2k; |
12147 |
+ } |
12148 |
+ } else { |
12149 |
+ c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_e2k |
12150 |
+ : hScale16To15_e2k; |
12151 |
+ } |
12152 |
+ |
12153 |
+ if (c->flags & SWS_FULL_CHR_H_INT) { |
12154 |
+ switch (dstFormat) { |
12155 |
+ case AV_PIX_FMT_RGB24: |
12156 |
+ c->yuv2packed1 = yuv2rgb24_full_1_e2k; |
12157 |
+ c->yuv2packed2 = yuv2rgb24_full_2_e2k; |
12158 |
+ c->yuv2packedX = yuv2rgb24_full_X_e2k; |
12159 |
+ break; |
12160 |
+ case AV_PIX_FMT_BGR24: |
12161 |
+ c->yuv2packed1 = yuv2bgr24_full_1_e2k; |
12162 |
+ c->yuv2packed2 = yuv2bgr24_full_2_e2k; |
12163 |
+ c->yuv2packedX = yuv2bgr24_full_X_e2k; |
12164 |
+ break; |
12165 |
+ case AV_PIX_FMT_BGRA: |
12166 |
+ if (!c->needAlpha) { |
12167 |
+ c->yuv2packed1 = yuv2bgrx32_full_1_e2k; |
12168 |
+ c->yuv2packed2 = yuv2bgrx32_full_2_e2k; |
12169 |
+ c->yuv2packedX = yuv2bgrx32_full_X_e2k; |
12170 |
+ } |
12171 |
+ break; |
12172 |
+ case AV_PIX_FMT_RGBA: |
12173 |
+ if (!c->needAlpha) { |
12174 |
+ c->yuv2packed1 = yuv2rgbx32_full_1_e2k; |
12175 |
+ c->yuv2packed2 = yuv2rgbx32_full_2_e2k; |
12176 |
+ c->yuv2packedX = yuv2rgbx32_full_X_e2k; |
12177 |
+ } |
12178 |
+ break; |
12179 |
+ case AV_PIX_FMT_ARGB: |
12180 |
+ if (!c->needAlpha) { |
12181 |
+ c->yuv2packed1 = yuv2xrgb32_full_1_e2k; |
12182 |
+ c->yuv2packed2 = yuv2xrgb32_full_2_e2k; |
12183 |
+ c->yuv2packedX = yuv2xrgb32_full_X_e2k; |
12184 |
+ } |
12185 |
+ break; |
12186 |
+ case AV_PIX_FMT_ABGR: |
12187 |
+ if (!c->needAlpha) { |
12188 |
+ c->yuv2packed1 = yuv2xbgr32_full_1_e2k; |
12189 |
+ c->yuv2packed2 = yuv2xbgr32_full_2_e2k; |
12190 |
+ c->yuv2packedX = yuv2xbgr32_full_X_e2k; |
12191 |
+ } |
12192 |
+ break; |
12193 |
+ } |
12194 |
+ } else if (!(c->flags & SWS_BITEXACT)) { /* !SWS_FULL_CHR_H_INT */ |
12195 |
+ switch (dstFormat) { |
12196 |
+ case AV_PIX_FMT_RGB24: |
12197 |
+ c->yuv2packed1 = yuv2rgb24_1_e2k; |
12198 |
+ c->yuv2packed2 = yuv2rgb24_2_e2k; |
12199 |
+ c->yuv2packedX = yuv2rgb24_X_e2k; |
12200 |
+ break; |
12201 |
+ case AV_PIX_FMT_BGR24: |
12202 |
+ c->yuv2packed1 = yuv2bgr24_1_e2k; |
12203 |
+ c->yuv2packed2 = yuv2bgr24_2_e2k; |
12204 |
+ c->yuv2packedX = yuv2bgr24_X_e2k; |
12205 |
+ break; |
12206 |
+ case AV_PIX_FMT_BGRA: |
12207 |
+ if (!c->needAlpha) { |
12208 |
+ c->yuv2packed1 = yuv2bgrx32_1_e2k; |
12209 |
+ c->yuv2packed2 = yuv2bgrx32_2_e2k; |
12210 |
+ c->yuv2packedX = yuv2bgrx32_X_e2k; |
12211 |
+ } |
12212 |
+ break; |
12213 |
+ case AV_PIX_FMT_RGBA: |
12214 |
+ if (!c->needAlpha) { |
12215 |
+ c->yuv2packed1 = yuv2rgbx32_1_e2k; |
12216 |
+ c->yuv2packed2 = yuv2rgbx32_2_e2k; |
12217 |
+ c->yuv2packedX = yuv2rgbx32_X_e2k; |
12218 |
+ } |
12219 |
+ break; |
12220 |
+ case AV_PIX_FMT_ARGB: |
12221 |
+ if (!c->needAlpha) { |
12222 |
+ c->yuv2packed1 = yuv2xrgb32_1_e2k; |
12223 |
+ c->yuv2packed2 = yuv2xrgb32_2_e2k; |
12224 |
+ c->yuv2packedX = yuv2xrgb32_X_e2k; |
12225 |
+ } |
12226 |
+ break; |
12227 |
+ case AV_PIX_FMT_ABGR: |
12228 |
+ if (!c->needAlpha) { |
12229 |
+ c->yuv2packed1 = yuv2xbgr32_1_e2k; |
12230 |
+ c->yuv2packed2 = yuv2xbgr32_2_e2k; |
12231 |
+ c->yuv2packedX = yuv2xbgr32_X_e2k; |
12232 |
+ } |
12233 |
+ break; |
12234 |
+ } |
12235 |
+ } |
12236 |
+ |
12237 |
+ switch (dstFormat) { |
12238 |
+ case AV_PIX_FMT_YUYV422: |
12239 |
+ c->yuv2packed1 = yuv2yuyv422_1_e2k; |
12240 |
+ c->yuv2packed2 = yuv2yuyv422_2_e2k; |
12241 |
+ c->yuv2packedX = yuv2yuyv422_X_e2k; |
12242 |
+ break; |
12243 |
+ case AV_PIX_FMT_YVYU422: |
12244 |
+ c->yuv2packed1 = yuv2yvyu422_1_e2k; |
12245 |
+ c->yuv2packed2 = yuv2yvyu422_2_e2k; |
12246 |
+ c->yuv2packedX = yuv2yvyu422_X_e2k; |
12247 |
+ break; |
12248 |
+ case AV_PIX_FMT_UYVY422: |
12249 |
+ c->yuv2packed1 = yuv2uyvy422_1_e2k; |
12250 |
+ c->yuv2packed2 = yuv2uyvy422_2_e2k; |
12251 |
+ c->yuv2packedX = yuv2uyvy422_X_e2k; |
12252 |
+ break; |
12253 |
+ } |
12254 |
+} |
12255 |
diff --git a/libswscale/e2k/yuv2rgb.c b/libswscale/e2k/yuv2rgb.c |
12256 |
new file mode 100644 |
12257 |
index 0000000..92f153f |
12258 |
--- /dev/null |
12259 |
+++ b/libswscale/e2k/yuv2rgb.c |
12260 |
@@ -0,0 +1,248 @@ |
12261 |
+/* |
12262 |
+ * Elbrus acceleration for colorspace conversion |
12263 |
+ * |
12264 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
12265 |
+ * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> |
12266 |
+ * |
12267 |
+ * This file is part of FFmpeg. |
12268 |
+ * |
12269 |
+ * FFmpeg is free software; you can redistribute it and/or |
12270 |
+ * modify it under the terms of the GNU Lesser General Public |
12271 |
+ * License as published by the Free Software Foundation; either |
12272 |
+ * version 2.1 of the License, or (at your option) any later version. |
12273 |
+ * |
12274 |
+ * FFmpeg is distributed in the hope that it will be useful, |
12275 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12276 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12277 |
+ * Lesser General Public License for more details. |
12278 |
+ * |
12279 |
+ * You should have received a copy of the GNU Lesser General Public |
12280 |
+ * License along with FFmpeg; if not, write to the Free Software |
12281 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
12282 |
+ */ |
12283 |
+ |
12284 |
+#include <stdio.h> |
12285 |
+#include <stdlib.h> |
12286 |
+#include <string.h> |
12287 |
+#include <inttypes.h> |
12288 |
+ |
12289 |
+#include "config.h" |
12290 |
+#include "libswscale/rgb2rgb.h" |
12291 |
+#include "libswscale/swscale.h" |
12292 |
+#include "libswscale/swscale_internal.h" |
12293 |
+#include "libavutil/attributes.h" |
12294 |
+#include "libavutil/cpu.h" |
12295 |
+#include "libavutil/e2k/util_e2k.h" |
12296 |
+#include "libavutil/pixdesc.h" |
12297 |
+#include "yuv2rgb.h" |
12298 |
+ |
12299 |
+/* |
12300 |
+ * ------------------------------------------------------------------------------ |
12301 |
+ * CS converters |
12302 |
+ * ------------------------------------------------------------------------------ |
12303 |
+ */ |
12304 |
+ |
12305 |
+#define INIT2_RGB(R, B) \ |
12306 |
+ __m128i perm_unp8 = _mm_setr_epi8( \ |
12307 |
+ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); \ |
12308 |
+ __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \ |
12309 |
+ __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \ |
12310 |
+ __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15); |
12311 |
+ |
12312 |
+#define INIT2_RGBX(R, B) INIT2_XRGB(R, B) |
12313 |
+#define INIT2_XRGB(R, B) \ |
12314 |
+ __m128i A_l = _mm_set1_epi16(255); \ |
12315 |
+ __m128i perm_unp8 = _mm_setr_epi8( \ |
12316 |
+ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); |
12317 |
+ |
12318 |
+#define WRITE2_RGB(dest, R, B) \ |
12319 |
+ v4 = _mm_packus_epi16(R##_l, G_l); \ |
12320 |
+ v5 = _mm_packus_epi16(B##_l, B##_l); \ |
12321 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
12322 |
+ v1 = _mm_unpacklo_epi8(v5, v5); \ |
12323 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
12324 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
12325 |
+ { \ |
12326 |
+ union { __m128i v; __m64 d[2]; } a = { v2 }, b = { v3 }; \ |
12327 |
+ __m64 *p = (__m64*)dest; \ |
12328 |
+ p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ |
12329 |
+ p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ |
12330 |
+ p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ |
12331 |
+ dest += 24; \ |
12332 |
+ } |
12333 |
+ |
12334 |
+#define WRITE2_RGBX(dest, R, B) \ |
12335 |
+ v4 = _mm_packus_epi16(R##_l, G_l); \ |
12336 |
+ v5 = _mm_packus_epi16(B##_l, A_l); \ |
12337 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
12338 |
+ v1 = _mm_shuffle_epi8(v5, perm_unp8); \ |
12339 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
12340 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
12341 |
+ VEC_ST(dest, v2); \ |
12342 |
+ VEC_ST(dest + 16, v3); \ |
12343 |
+ dest += 32; |
12344 |
+ |
12345 |
+#define WRITE2_XRGB(dest, R, B) \ |
12346 |
+ v4 = _mm_packus_epi16(A_l, R##_l); \ |
12347 |
+ v5 = _mm_packus_epi16(G_l, B##_l); \ |
12348 |
+ v0 = _mm_shuffle_epi8(v4, perm_unp8); \ |
12349 |
+ v1 = _mm_shuffle_epi8(v5, perm_unp8); \ |
12350 |
+ v2 = _mm_unpacklo_epi16(v0, v1); \ |
12351 |
+ v3 = _mm_unpackhi_epi16(v0, v1); \ |
12352 |
+ VEC_ST(dest, v2); \ |
12353 |
+ VEC_ST(dest + 16, v3); \ |
12354 |
+ dest += 32; |
12355 |
+ |
12356 |
+#define DEFCSP420_CVT(name, fmt, R, B) \ |
12357 |
+static int yuv2##name##_e2k(SwsContext *c, const unsigned char **in, \ |
12358 |
+ int *instrides, int srcSliceY, int srcSliceH, \ |
12359 |
+ unsigned char **oplanes, int *outstrides) \ |
12360 |
+{ \ |
12361 |
+ vec_s32 R_l, G_l, B_l; \ |
12362 |
+ vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \ |
12363 |
+ vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \ |
12364 |
+ vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \ |
12365 |
+ vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \ |
12366 |
+ vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \ |
12367 |
+ vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \ |
12368 |
+ vec_s32 dec128 = _mm_set1_epi16(128); \ |
12369 |
+ __m128i v0, v1, v2, v3, v4, v5; \ |
12370 |
+ LOAD_ZERO; \ |
12371 |
+ INIT2_##fmt(R, B) \ |
12372 |
+ int i, j, w = c->dstW & -16, h = srcSliceH & -2; \ |
12373 |
+ vec_s16 Y0, Y1, Y2, Y3, U, V; \ |
12374 |
+ vec_s16 vx, ux, uvx, vx0, ux0, uvx0, vx1, ux1, uvx1; \ |
12375 |
+ const uint8_t *y1i = in[0]; \ |
12376 |
+ const uint8_t *y2i = in[0] + instrides[0]; \ |
12377 |
+ const uint8_t *ui = in[1], *vi = in[2]; \ |
12378 |
+ uint8_t *out0, *out1; \ |
12379 |
+ int vshift = c->srcFormat == AV_PIX_FMT_YUV422P; \ |
12380 |
+ int instrides0 = instrides[0] * 2 - w; \ |
12381 |
+ int instrides1 = (instrides[1] << vshift) - w / 2; \ |
12382 |
+ int instrides2 = (instrides[2] << vshift) - w / 2; \ |
12383 |
+ \ |
12384 |
+ for (i = 0; i < h; i += 2) { \ |
12385 |
+ out0 = oplanes[0] + (i + srcSliceY) * outstrides[0]; \ |
12386 |
+ out1 = out0 + outstrides[0]; \ |
12387 |
+ for (j = 0; j < w >> 4; j++) { \ |
12388 |
+ Y1 = VEC_LD(y1i); \ |
12389 |
+ Y3 = VEC_LD(y2i); \ |
12390 |
+ U = VEC_LD8(ui); \ |
12391 |
+ V = VEC_LD8(vi); \ |
12392 |
+ U = _mm_unpacklo_epi8(U, zerov); \ |
12393 |
+ V = _mm_unpacklo_epi8(V, zerov); \ |
12394 |
+ Y0 = _mm_unpacklo_epi8(Y1, zerov); \ |
12395 |
+ Y1 = _mm_unpackhi_epi8(Y1, zerov); \ |
12396 |
+ Y2 = _mm_unpacklo_epi8(Y3, zerov); \ |
12397 |
+ Y3 = _mm_unpackhi_epi8(Y3, zerov); \ |
12398 |
+ U = _mm_sub_epi16(U, dec128); \ |
12399 |
+ V = _mm_sub_epi16(V, dec128); \ |
12400 |
+ U = _mm_slli_epi16(U, 2); \ |
12401 |
+ V = _mm_slli_epi16(V, 2); \ |
12402 |
+ Y0 = _mm_slli_epi16(Y0, 2); \ |
12403 |
+ Y1 = _mm_slli_epi16(Y1, 2); \ |
12404 |
+ Y2 = _mm_slli_epi16(Y2, 2); \ |
12405 |
+ Y3 = _mm_slli_epi16(Y3, 2); \ |
12406 |
+ \ |
12407 |
+ Y0 = _mm_mulhrs_epi16(_mm_sub_epi16(Y0, y_sub), y_coeff); \ |
12408 |
+ Y1 = _mm_mulhrs_epi16(_mm_sub_epi16(Y1, y_sub), y_coeff); \ |
12409 |
+ Y2 = _mm_mulhrs_epi16(_mm_sub_epi16(Y2, y_sub), y_coeff); \ |
12410 |
+ Y3 = _mm_mulhrs_epi16(_mm_sub_epi16(Y3, y_sub), y_coeff); \ |
12411 |
+ \ |
12412 |
+ ux = _mm_mulhrs_epi16(U, u2b_coeff); \ |
12413 |
+ vx = _mm_mulhrs_epi16(V, v2r_coeff); \ |
12414 |
+ ux0 = _mm_unpacklo_epi16(ux, ux); \ |
12415 |
+ ux1 = _mm_unpackhi_epi16(ux, ux); \ |
12416 |
+ vx0 = _mm_unpacklo_epi16(vx, vx); \ |
12417 |
+ vx1 = _mm_unpackhi_epi16(vx, vx); \ |
12418 |
+ \ |
12419 |
+ uvx = _mm_mulhrs_epi16(U, u2g_coeff); \ |
12420 |
+ uvx = _mm_add_epi16(_mm_mulhrs_epi16(V, v2g_coeff), uvx); \ |
12421 |
+ uvx0 = _mm_unpacklo_epi16(uvx, uvx); \ |
12422 |
+ uvx1 = _mm_unpackhi_epi16(uvx, uvx); \ |
12423 |
+ \ |
12424 |
+ R_l = _mm_add_epi16(Y0, vx0); \ |
12425 |
+ G_l = _mm_add_epi16(Y0, uvx0); \ |
12426 |
+ B_l = _mm_add_epi16(Y0, ux0); \ |
12427 |
+ \ |
12428 |
+ WRITE2_##fmt(out0, R, B) \ |
12429 |
+ \ |
12430 |
+ R_l = _mm_add_epi16(Y1, vx1); \ |
12431 |
+ G_l = _mm_add_epi16(Y1, uvx1); \ |
12432 |
+ B_l = _mm_add_epi16(Y1, ux1); \ |
12433 |
+ \ |
12434 |
+ WRITE2_##fmt(out0, R, B) \ |
12435 |
+ \ |
12436 |
+ R_l = _mm_add_epi16(Y2, vx0); \ |
12437 |
+ G_l = _mm_add_epi16(Y2, uvx0); \ |
12438 |
+ B_l = _mm_add_epi16(Y2, ux0); \ |
12439 |
+ \ |
12440 |
+ WRITE2_##fmt(out1, R, B) \ |
12441 |
+ \ |
12442 |
+ R_l = _mm_add_epi16(Y3, vx1); \ |
12443 |
+ G_l = _mm_add_epi16(Y3, uvx1); \ |
12444 |
+ B_l = _mm_add_epi16(Y3, ux1); \ |
12445 |
+ \ |
12446 |
+ WRITE2_##fmt(out1, R, B) \ |
12447 |
+ \ |
12448 |
+ y1i += 16; ui += 8; \ |
12449 |
+ y2i += 16; vi += 8; \ |
12450 |
+ } \ |
12451 |
+ y1i += instrides0; ui += instrides1; \ |
12452 |
+ y2i += instrides0; vi += instrides2; \ |
12453 |
+ } \ |
12454 |
+ return srcSliceH; \ |
12455 |
+} |
12456 |
+ |
12457 |
+DEFCSP420_CVT(rgbx32, RGBX, R, B) |
12458 |
+DEFCSP420_CVT(bgrx32, RGBX, B, R) |
12459 |
+DEFCSP420_CVT(xrgb32, XRGB, R, B) |
12460 |
+DEFCSP420_CVT(xbgr32, XRGB, B, R) |
12461 |
+DEFCSP420_CVT(rgb24, RGB, R, B) |
12462 |
+DEFCSP420_CVT(bgr24, RGB, B, R) |
12463 |
+ |
12464 |
+/* Ok currently the acceleration routine only supports |
12465 |
+ * inputs of widths a multiple of 16 |
12466 |
+ * and heights a multiple 2 |
12467 |
+ * |
12468 |
+ * So we just fall back to the C codes for this. |
12469 |
+ */ |
12470 |
+av_cold SwsFunc ff_yuv2rgb_init_e2k(SwsContext *c) |
12471 |
+{ |
12472 |
+ SwsFunc ret; |
12473 |
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K)) |
12474 |
+ return NULL; |
12475 |
+ |
12476 |
+ if (c->flags & SWS_BITEXACT || c->needAlpha) |
12477 |
+ return NULL; |
12478 |
+ |
12479 |
+ ret = NULL; |
12480 |
+ switch (c->srcFormat) { |
12481 |
+ case AV_PIX_FMT_YUV422P: |
12482 |
+ case AV_PIX_FMT_YUV420P: |
12483 |
+ if (c->dstW & 15 || c->dstH & 1) break; |
12484 |
+ switch (c->dstFormat) { |
12485 |
+ case AV_PIX_FMT_RGB24: |
12486 |
+ ret = yuv2rgb24_e2k; break; |
12487 |
+ case AV_PIX_FMT_BGR24: |
12488 |
+ ret = yuv2bgr24_e2k; break; |
12489 |
+ case AV_PIX_FMT_ARGB: |
12490 |
+ ret = yuv2xrgb32_e2k; break; |
12491 |
+ case AV_PIX_FMT_ABGR: |
12492 |
+ ret = yuv2xbgr32_e2k; break; |
12493 |
+ case AV_PIX_FMT_RGBA: |
12494 |
+ ret = yuv2rgbx32_e2k; break; |
12495 |
+ case AV_PIX_FMT_BGRA: |
12496 |
+ ret = yuv2bgrx32_e2k; break; |
12497 |
+ default: break; |
12498 |
+ } |
12499 |
+ break; |
12500 |
+ } |
12501 |
+ if (ret) { |
12502 |
+ av_log(c, AV_LOG_WARNING, "E2K: yuv2rgb(%s, %s)\n", |
12503 |
+ av_get_pix_fmt_name(c->srcFormat), |
12504 |
+ av_get_pix_fmt_name(c->dstFormat)); |
12505 |
+ } |
12506 |
+ return ret; |
12507 |
+} |
12508 |
+ |
12509 |
diff --git a/libswscale/e2k/yuv2rgb.h b/libswscale/e2k/yuv2rgb.h |
12510 |
new file mode 100644 |
12511 |
index 0000000..59637bc |
12512 |
--- /dev/null |
12513 |
+++ b/libswscale/e2k/yuv2rgb.h |
12514 |
@@ -0,0 +1,52 @@ |
12515 |
+/* |
12516 |
+ * Elbrus-enhanced yuv2yuvX |
12517 |
+ * |
12518 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
12519 |
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> |
12520 |
+ * based on the equivalent C code in swscale.c |
12521 |
+ * |
12522 |
+ * This file is part of FFmpeg. |
12523 |
+ * |
12524 |
+ * FFmpeg is free software; you can redistribute it and/or |
12525 |
+ * modify it under the terms of the GNU Lesser General Public |
12526 |
+ * License as published by the Free Software Foundation; either |
12527 |
+ * version 2.1 of the License, or (at your option) any later version. |
12528 |
+ * |
12529 |
+ * FFmpeg is distributed in the hope that it will be useful, |
12530 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12531 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12532 |
+ * Lesser General Public License for more details. |
12533 |
+ * |
12534 |
+ * You should have received a copy of the GNU Lesser General Public |
12535 |
+ * License along with FFmpeg; if not, write to the Free Software |
12536 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
12537 |
+ */ |
12538 |
+ |
12539 |
+#ifndef SWSCALE_E2K_YUV2RGB_H |
12540 |
+#define SWSCALE_E2K_YUV2RGB_H |
12541 |
+ |
12542 |
+#include <stdint.h> |
12543 |
+ |
12544 |
+#include "libswscale/swscale_internal.h" |
12545 |
+ |
12546 |
+#define YUV2PACKEDX_HEADER(suffix) \ |
12547 |
+ void ff_yuv2##suffix##_X_e2k(SwsContext *c, \ |
12548 |
+ const int16_t *lumFilter, \ |
12549 |
+ const int16_t **lumSrc, \ |
12550 |
+ int lumFilterSize, \ |
12551 |
+ const int16_t *chrFilter, \ |
12552 |
+ const int16_t **chrUSrc, \ |
12553 |
+ const int16_t **chrVSrc, \ |
12554 |
+ int chrFilterSize, \ |
12555 |
+ const int16_t **alpSrc, \ |
12556 |
+ uint8_t *dest, \ |
12557 |
+ int dstW, int dstY); |
12558 |
+ |
12559 |
+YUV2PACKEDX_HEADER(abgr); |
12560 |
+YUV2PACKEDX_HEADER(bgra); |
12561 |
+YUV2PACKEDX_HEADER(argb); |
12562 |
+YUV2PACKEDX_HEADER(rgba); |
12563 |
+YUV2PACKEDX_HEADER(rgb24); |
12564 |
+YUV2PACKEDX_HEADER(bgr24); |
12565 |
+ |
12566 |
+#endif /* SWSCALE_E2K_YUV2RGB_H */ |
12567 |
diff --git a/libswscale/e2k/yuv2yuv.c b/libswscale/e2k/yuv2yuv.c |
12568 |
new file mode 100644 |
12569 |
index 0000000..7423fa8 |
12570 |
--- /dev/null |
12571 |
+++ b/libswscale/e2k/yuv2yuv.c |
12572 |
@@ -0,0 +1,146 @@ |
12573 |
+/* |
12574 |
+ * Elbrus-enhanced yuv-to-yuv conversion routines. |
12575 |
+ * |
12576 |
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd |
12577 |
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> |
12578 |
+ * based on the equivalent C code in swscale.c |
12579 |
+ * |
12580 |
+ * This file is part of FFmpeg. |
12581 |
+ * |
12582 |
+ * FFmpeg is free software; you can redistribute it and/or |
12583 |
+ * modify it under the terms of the GNU Lesser General Public |
12584 |
+ * License as published by the Free Software Foundation; either |
12585 |
+ * version 2.1 of the License, or (at your option) any later version. |
12586 |
+ * |
12587 |
+ * FFmpeg is distributed in the hope that it will be useful, |
12588 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12589 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12590 |
+ * Lesser General Public License for more details. |
12591 |
+ * |
12592 |
+ * You should have received a copy of the GNU Lesser General Public |
12593 |
+ * License along with FFmpeg; if not, write to the Free Software |
12594 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
12595 |
+ */ |
12596 |
+ |
12597 |
+#include <inttypes.h> |
12598 |
+ |
12599 |
+#include "config.h" |
12600 |
+#include "libavutil/attributes.h" |
12601 |
+#include "libavutil/cpu.h" |
12602 |
+#include "libswscale/swscale.h" |
12603 |
+#include "libswscale/swscale_internal.h" |
12604 |
+#include "libavutil/e2k/util_e2k.h" |
12605 |
+ |
12606 |
+/* This code assumes: |
12607 |
+ * |
12608 |
+ * 1) dst is 16 bytes-aligned |
12609 |
+ * 2) dstStride is a multiple of 16 |
12610 |
+ * 3) width is a multiple of 16 |
12611 |
+ * 4) lum & chrom stride are multiples of 8 |
12612 |
+ */ |
12613 |
+ |
12614 |
+static int yv12toyuy2_unscaled_e2k(SwsContext *c, const uint8_t *src[], |
12615 |
+ int srcStride[], int srcSliceY, |
12616 |
+ int srcSliceH, uint8_t *dstParam[], |
12617 |
+ int dstStride_a[]) |
12618 |
+{ |
12619 |
+ const uint8_t *ysrc = src[0], *usrc = src[1], *vsrc = src[2]; |
12620 |
+ int dstStride = dstStride_a[0]; |
12621 |
+ uint8_t *dst = dstParam[0] + dstStride * srcSliceY; |
12622 |
+ int width = (c->dstW + 1) >> 1, height = srcSliceH; |
12623 |
+ int lumStride = srcStride[0]; |
12624 |
+ int chromStride = srcStride[1]; |
12625 |
+ int y, i; |
12626 |
+ |
12627 |
+ for (y = 0; y < height; y++) { |
12628 |
+ PRAGMA_E2K("ivdep") |
12629 |
+ for (i = 0; i < width - 7; i += 8) { |
12630 |
+ __m128i v0, v1, v2, v3; |
12631 |
+ v0 = VEC_LD(ysrc + i * 2); |
12632 |
+ v2 = VEC_LD8(usrc + i); |
12633 |
+ v3 = VEC_LD8(vsrc + i); |
12634 |
+ v1 = _mm_unpacklo_epi8(v2, v3); |
12635 |
+ VEC_ST(dst + i * 4, _mm_unpacklo_epi8(v0, v1)); |
12636 |
+ VEC_ST(dst + i * 4 + 16, _mm_unpackhi_epi8(v0, v1)); |
12637 |
+ } |
12638 |
+ |
12639 |
+ PRAGMA_E2K("ivdep") |
12640 |
+ for (; i < width; i++) { |
12641 |
+ *(uint32_t*)(dst + i * 4) = |
12642 |
+ ysrc[i * 2] | usrc[i] << 8 | |
12643 |
+ ysrc[i * 2 + 1] << 16 | vsrc[i] << 24; |
12644 |
+ } |
12645 |
+ |
12646 |
+ if (y & 1) { |
12647 |
+ usrc += chromStride; |
12648 |
+ vsrc += chromStride; |
12649 |
+ } |
12650 |
+ ysrc += lumStride; |
12651 |
+ dst += dstStride; |
12652 |
+ } |
12653 |
+ |
12654 |
+ return srcSliceH; |
12655 |
+} |
12656 |
+ |
12657 |
+static int yv12touyvy_unscaled_e2k(SwsContext *c, const uint8_t *src[], |
12658 |
+ int srcStride[], int srcSliceY, |
12659 |
+ int srcSliceH, uint8_t *dstParam[], |
12660 |
+ int dstStride_a[]) |
12661 |
+{ |
12662 |
+ const uint8_t *ysrc = src[0], *usrc = src[1], *vsrc = src[2]; |
12663 |
+ int dstStride = dstStride_a[0]; |
12664 |
+ uint8_t *dst = dstParam[0] + dstStride * srcSliceY; |
12665 |
+ int width = (c->dstW + 1) >> 1, height = srcSliceH; |
12666 |
+ int lumStride = srcStride[0]; |
12667 |
+ int chromStride = srcStride[1]; |
12668 |
+ int y, i; |
12669 |
+ |
12670 |
+ for (y = 0; y < height; y++) { |
12671 |
+ PRAGMA_E2K("ivdep") |
12672 |
+ for (i = 0; i < width - 7; i += 8) { |
12673 |
+ __m128i v0, v1, v2, v3; |
12674 |
+ v0 = VEC_LD(ysrc + i * 2); |
12675 |
+ v2 = VEC_LD8(usrc + i); |
12676 |
+ v3 = VEC_LD8(vsrc + i); |
12677 |
+ v1 = _mm_unpacklo_epi8(v2, v3); |
12678 |
+ VEC_ST(dst + i * 4, _mm_unpacklo_epi8(v1, v0)); |
12679 |
+ VEC_ST(dst + i * 4 + 16, _mm_unpackhi_epi8(v1, v0)); |
12680 |
+ } |
12681 |
+ |
12682 |
+ PRAGMA_E2K("ivdep") |
12683 |
+ for (; i < width; i++) { |
12684 |
+ *(uint32_t*)(dst + i * 4) = |
12685 |
+ usrc[i] | ysrc[i * 2] << 8 | |
12686 |
+ vsrc[i] << 16 | ysrc[i * 2 + 1] << 24; |
12687 |
+ } |
12688 |
+ |
12689 |
+ if (y & 1) { |
12690 |
+ usrc += chromStride; |
12691 |
+ vsrc += chromStride; |
12692 |
+ } |
12693 |
+ ysrc += lumStride; |
12694 |
+ dst += dstStride; |
12695 |
+ } |
12696 |
+ return srcSliceH; |
12697 |
+} |
12698 |
+ |
12699 |
+av_cold void ff_get_unscaled_swscale_e2k(SwsContext *c) |
12700 |
+{ |
12701 |
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K)) |
12702 |
+ return; |
12703 |
+ |
12704 |
+ if (c->flags & SWS_BITEXACT) |
12705 |
+ return; |
12706 |
+ |
12707 |
+ if (c->srcFormat == AV_PIX_FMT_YUV420P) { |
12708 |
+ enum AVPixelFormat dstFormat = c->dstFormat; |
12709 |
+ switch (dstFormat) { |
12710 |
+ case AV_PIX_FMT_YUYV422: |
12711 |
+ c->swscale = yv12toyuy2_unscaled_e2k; |
12712 |
+ break; |
12713 |
+ case AV_PIX_FMT_UYVY422: |
12714 |
+ c->swscale = yv12touyvy_unscaled_e2k; |
12715 |
+ break; |
12716 |
+ } |
12717 |
+ } |
12718 |
+} |
12719 |
diff --git a/libswscale/swscale.c b/libswscale/swscale.c |
12720 |
index 9cb7e8f..7760885 100644 |
12721 |
--- a/libswscale/swscale.c |
12722 |
+++ b/libswscale/swscale.c |
12723 |
@@ -586,6 +586,8 @@ SwsFunc ff_getSwsFunc(SwsContext *c) |
12724 |
|
12725 |
if (ARCH_PPC) |
12726 |
ff_sws_init_swscale_ppc(c); |
12727 |
+ if (ARCH_E2K) |
12728 |
+ ff_sws_init_swscale_e2k(c); |
12729 |
if (ARCH_X86) |
12730 |
ff_sws_init_swscale_x86(c); |
12731 |
if (ARCH_AARCH64) |
12732 |
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h |
12733 |
index ee46092..8dcffb2 100644 |
12734 |
--- a/libswscale/swscale_internal.h |
12735 |
+++ b/libswscale/swscale_internal.h |
12736 |
@@ -31,7 +31,9 @@ |
12737 |
#include "libavutil/log.h" |
12738 |
#include "libavutil/pixfmt.h" |
12739 |
#include "libavutil/pixdesc.h" |
12740 |
+#if HAVE_ALTIVEC |
12741 |
#include "libavutil/ppc/util_altivec.h" |
12742 |
+#endif |
12743 |
|
12744 |
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long |
12745 |
|
12746 |
@@ -639,6 +641,7 @@ av_cold void ff_sws_init_range_convert(SwsContext *c); |
12747 |
|
12748 |
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c); |
12749 |
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c); |
12750 |
+SwsFunc ff_yuv2rgb_init_e2k(SwsContext *c); |
12751 |
|
12752 |
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt) |
12753 |
{ |
12754 |
@@ -853,6 +856,7 @@ extern const AVClass ff_sws_context_class; |
12755 |
*/ |
12756 |
void ff_get_unscaled_swscale(SwsContext *c); |
12757 |
void ff_get_unscaled_swscale_ppc(SwsContext *c); |
12758 |
+void ff_get_unscaled_swscale_e2k(SwsContext *c); |
12759 |
void ff_get_unscaled_swscale_arm(SwsContext *c); |
12760 |
void ff_get_unscaled_swscale_aarch64(SwsContext *c); |
12761 |
|
12762 |
@@ -873,6 +877,7 @@ void ff_sws_init_output_funcs(SwsContext *c, |
12763 |
yuv2anyX_fn *yuv2anyX); |
12764 |
void ff_sws_init_swscale_ppc(SwsContext *c); |
12765 |
void ff_sws_init_swscale_vsx(SwsContext *c); |
12766 |
+void ff_sws_init_swscale_e2k(SwsContext *c); |
12767 |
void ff_sws_init_swscale_x86(SwsContext *c); |
12768 |
void ff_sws_init_swscale_aarch64(SwsContext *c); |
12769 |
void ff_sws_init_swscale_arm(SwsContext *c); |
12770 |
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c |
12771 |
index 5fb572b..cacb11c 100644 |
12772 |
--- a/libswscale/swscale_unscaled.c |
12773 |
+++ b/libswscale/swscale_unscaled.c |
12774 |
@@ -2172,6 +2172,8 @@ void ff_get_unscaled_swscale(SwsContext *c) |
12775 |
|
12776 |
if (ARCH_PPC) |
12777 |
ff_get_unscaled_swscale_ppc(c); |
12778 |
+ if (ARCH_E2K) |
12779 |
+ ff_get_unscaled_swscale_e2k(c); |
12780 |
if (ARCH_ARM) |
12781 |
ff_get_unscaled_swscale_arm(c); |
12782 |
if (ARCH_AARCH64) |
12783 |
diff --git a/libswscale/utils.c b/libswscale/utils.c |
12784 |
index 111062e..db58be1 100644 |
12785 |
--- a/libswscale/utils.c |
12786 |
+++ b/libswscale/utils.c |
12787 |
@@ -51,6 +51,7 @@ |
12788 |
#include "libavutil/pixdesc.h" |
12789 |
#include "libavutil/aarch64/cpu.h" |
12790 |
#include "libavutil/ppc/cpu.h" |
12791 |
+#include "libavutil/e2k/cpu.h" |
12792 |
#include "libavutil/x86/asm.h" |
12793 |
#include "libavutil/x86/cpu.h" |
12794 |
|
12795 |
@@ -600,6 +601,14 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos, |
12796 |
filterAlign = 1; |
12797 |
} |
12798 |
|
12799 |
+ if (E2K_BASE(cpu_flags)) { |
12800 |
+ if (minFilterSize < 5) |
12801 |
+ filterAlign = 4; |
12802 |
+ |
12803 |
+ if (minFilterSize < 3) |
12804 |
+ filterAlign = 1; |
12805 |
+ } |
12806 |
+ |
12807 |
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) { |
12808 |
// special case for unscaled vertical filtering |
12809 |
if (minFilterSize == 1 && filterAlign == 2) |
12810 |
@@ -1679,6 +1688,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, |
12811 |
{ |
12812 |
const int filterAlign = X86_MMX(cpu_flags) ? 4 : |
12813 |
PPC_ALTIVEC(cpu_flags) ? 8 : |
12814 |
+ E2K_BASE(cpu_flags) ? 8 : |
12815 |
have_neon(cpu_flags) ? 8 : 1; |
12816 |
|
12817 |
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos, |
12818 |
@@ -1706,6 +1716,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, |
12819 |
{ |
12820 |
const int filterAlign = X86_MMX(cpu_flags) ? 2 : |
12821 |
PPC_ALTIVEC(cpu_flags) ? 8 : |
12822 |
+ E2K_BASE(cpu_flags) ? 8 : |
12823 |
have_neon(cpu_flags) ? 2 : 1; |
12824 |
|
12825 |
if ((ret = initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, |
12826 |
@@ -1790,6 +1801,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, |
12827 |
cpucaps = "MMX"; |
12828 |
else if (PPC_ALTIVEC(cpu_flags)) |
12829 |
cpucaps = "AltiVec"; |
12830 |
+ else if (E2K_BASE(cpu_flags)) |
12831 |
+ cpucaps = "Elbrus"; |
12832 |
else |
12833 |
cpucaps = "C"; |
12834 |
|
12835 |
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c |
12836 |
index 5884625..26d41fe 100644 |
12837 |
--- a/libswscale/yuv2rgb.c |
12838 |
+++ b/libswscale/yuv2rgb.c |
12839 |
@@ -682,6 +682,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) |
12840 |
|
12841 |
if (ARCH_PPC) |
12842 |
t = ff_yuv2rgb_init_ppc(c); |
12843 |
+ if (ARCH_E2K) |
12844 |
+ t = ff_yuv2rgb_init_e2k(c); |
12845 |
if (ARCH_X86) |
12846 |
t = ff_yuv2rgb_init_x86(c); |
12847 |
|
12848 |
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c |
12849 |
index 899f68b..3768647 100644 |
12850 |
--- a/tests/checkasm/checkasm.c |
12851 |
+++ b/tests/checkasm/checkasm.c |
12852 |
@@ -213,6 +213,8 @@ static const struct { |
12853 |
{ "ALTIVEC", "altivec", AV_CPU_FLAG_ALTIVEC }, |
12854 |
{ "VSX", "vsx", AV_CPU_FLAG_VSX }, |
12855 |
{ "POWER8", "power8", AV_CPU_FLAG_POWER8 }, |
12856 |
+#elif ARCH_E2K |
12857 |
+ { "E2K", "e2k", AV_CPU_FLAG_E2K }, |
12858 |
#elif ARCH_X86 |
12859 |
{ "MMX", "mmx", AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV }, |
12860 |
{ "MMXEXT", "mmxext", AV_CPU_FLAG_MMXEXT }, |
12861 |
diff --git a/tests/checkasm/huffyuvdsp.c b/tests/checkasm/huffyuvdsp.c |
12862 |
index 8392022..d5eba83 100644 |
12863 |
--- a/tests/checkasm/huffyuvdsp.c |
12864 |
+++ b/tests/checkasm/huffyuvdsp.c |
12865 |
@@ -24,10 +24,14 @@ |
12866 |
#include "libavutil/intreadwrite.h" |
12867 |
#include "libavutil/mem.h" |
12868 |
|
12869 |
-#include "libavcodec/huffyuvdsp.h" |
12870 |
- |
12871 |
#include "checkasm.h" |
12872 |
|
12873 |
+/* Short defines (B,G,R,A) in "huffyuvdsp.h" cause problems for Elbrus (e2k) |
12874 |
+ * system includes, so this header file must be included after "checkasm.h". |
12875 |
+ * Ilya Kurdyukov <jpegqs@gmail.com> |
12876 |
+ */ |
12877 |
+#include "libavcodec/huffyuvdsp.h" |
12878 |
+ |
12879 |
#define randomize_buffers(buf, size) \ |
12880 |
do { \ |
12881 |
int j; \ |
12882 |
-- |
12883 |
2.17.1 |
12884 |
|