
a/.gear/ffmpeg-e2k-simd.patch (+12884 lines)
From 2368b59a81003cb87869a8f01615e28bbf8326bf Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Fri, 26 Mar 2021 14:57:57 +0700
Subject: [PATCH] ffmpeg-4.3.1 e2k support

---
 configure                            |   20 +-
 libavcodec/audiodsp.c                |    2 +
 libavcodec/audiodsp.h                |    1 +
 libavcodec/blockdsp.c                |    2 +
 libavcodec/blockdsp.h                |    1 +
 libavcodec/e2k/Makefile              |   31 +
 libavcodec/e2k/audiodsp.c            |   62 +
 libavcodec/e2k/blockdsp.c            |   72 +
 libavcodec/e2k/dctdsp.h              |   27 +
 libavcodec/e2k/fdctdsp.c             |  389 +++++
 libavcodec/e2k/fft.c                 | 1043 +++++++++++++
 libavcodec/e2k/fft.h                 |   29 +
 libavcodec/e2k/fft_init.c            |  152 ++
 libavcodec/e2k/fmtconvert.c          |   55 +
 libavcodec/e2k/h264chroma.c          |   63 +
 libavcodec/e2k/h264chroma_template.c |  113 ++
 libavcodec/e2k/h264dsp.c             |  820 +++++++++++
 libavcodec/e2k/h264qpel.c            |  255 ++++
 libavcodec/e2k/h264qpel_template.c   |  354 +++++
 libavcodec/e2k/hevcdsp.c             |   94 ++
 libavcodec/e2k/hpeldsp.c             |  302 ++++
 libavcodec/e2k/hpeldsp.h             |   30 +
 libavcodec/e2k/idctdsp.c             |  237 +++
 libavcodec/e2k/lossless_audiodsp.c   |   75 +
 libavcodec/e2k/lossless_videodsp.c   |   59 +
 libavcodec/e2k/mdct15.c              |  187 +++
 libavcodec/e2k/me_cmp.c              |  461 ++++++
 libavcodec/e2k/mpegaudiodsp.c        |  142 ++
 libavcodec/e2k/mpegvideo.c           |  100 ++
 libavcodec/e2k/mpegvideodsp.c        |   86 ++
 libavcodec/e2k/mpegvideoencdsp.c     |   75 +
 libavcodec/e2k/pixblockdsp.c         |   83 ++
 libavcodec/e2k/svq1enc.c             |   68 +
 libavcodec/e2k/vc1dsp.c              |  303 ++++
 libavcodec/e2k/videodsp.c            |   36 +
 libavcodec/e2k/vorbisdsp.c           |   62 +
 libavcodec/e2k/vp3dsp.c              |  169 +++
 libavcodec/e2k/vp8dsp.c              |  428 ++++++
 libavcodec/e2k/vp9dsp.c              | 1740 ++++++++++++++++++++++
 libavcodec/fdctdsp.c                 |    2 +
 libavcodec/fdctdsp.h                 |    2 +
 libavcodec/fft.h                     |    1 +
 libavcodec/fft_template.c            |    1 +
 libavcodec/fmtconvert.c              |    2 +
 libavcodec/fmtconvert.h              |    1 +
 libavcodec/h264chroma.c              |    2 +
 libavcodec/h264chroma.h              |    1 +
 libavcodec/h264dsp.c                 |    1 +
 libavcodec/h264dsp.h                 |    2 +
 libavcodec/h264qpel.c                |    2 +
 libavcodec/h264qpel.h                |    1 +
 libavcodec/hevcdsp.c                 |    2 +
 libavcodec/hevcdsp.h                 |    1 +
 libavcodec/hpeldsp.c                 |    2 +
 libavcodec/hpeldsp.h                 |    1 +
 libavcodec/idctdsp.c                 |    2 +
 libavcodec/idctdsp.h                 |    2 +
 libavcodec/lossless_audiodsp.c       |    2 +
 libavcodec/lossless_audiodsp.h       |    1 +
 libavcodec/lossless_videodsp.c       |    2 +
 libavcodec/lossless_videodsp.h       |    1 +
 libavcodec/mdct15.c                  |    2 +
 libavcodec/mdct15.h                  |    1 +
 libavcodec/me_cmp.c                  |    2 +
 libavcodec/me_cmp.h                  |    1 +
 libavcodec/mpegaudiodsp.c            |    1 +
 libavcodec/mpegaudiodsp.h            |    1 +
 libavcodec/mpegvideo.c               |    2 +
 libavcodec/mpegvideo.h               |    1 +
 libavcodec/mpegvideodsp.c            |    2 +
 libavcodec/mpegvideodsp.h            |    1 +
 libavcodec/mpegvideoencdsp.c         |    2 +
 libavcodec/mpegvideoencdsp.h         |    2 +
 libavcodec/pixblockdsp.c             |    2 +
 libavcodec/pixblockdsp.h             |    2 +
 libavcodec/svq1enc.c                 |    2 +
 libavcodec/svq1enc.h                 |    1 +
 libavcodec/tests/dct.c               |    2 +
 libavcodec/tests/e2k/dct.c           |   31 +
 libavcodec/vc1dsp.c                  |    2 +
 libavcodec/vc1dsp.h                  |    1 +
 libavcodec/videodsp.c                |    2 +
 libavcodec/videodsp.h                |    1 +
 libavcodec/vorbisdsp.c               |    2 +
 libavcodec/vorbisdsp.h               |    1 +
 libavcodec/vp3dsp.c                  |    2 +
 libavcodec/vp3dsp.h                  |    1 +
 libavcodec/vp8dsp.c                  |    2 +
 libavcodec/vp8dsp.h                  |    1 +
 libavcodec/vp9dsp.c                  |    1 +
 libavcodec/vp9dsp.h                  |    1 +
 libavutil/cpu.c                      |    8 +
 libavutil/cpu.h                      |    2 +
 libavutil/cpu_internal.h             |    2 +
 libavutil/e2k/Makefile               |    2 +
 libavutil/e2k/cpu.c                  |   41 +
 libavutil/e2k/cpu.h                  |   27 +
 libavutil/e2k/float_dsp.c            |  188 +++
 libavutil/e2k/intreadwrite.h         |   54 +
 libavutil/e2k/timer.h                |   35 +
 libavutil/e2k/util_e2k.h             |  146 ++
 libavutil/float_dsp.c                |    2 +
 libavutil/float_dsp.h                |    1 +
 libavutil/intreadwrite.h             |    2 +
 libavutil/tests/cpu.c                |    2 +
 libavutil/timer.h                    |    2 +
 libswresample/audioconvert.c         |    1 +
 libswresample/e2k/Makefile           |    1 +
 libswresample/e2k/audio_convert.c    |  110 ++
 libswresample/swresample_internal.h  |    4 +
 libswscale/e2k/Makefile              |    3 +
 libswscale/e2k/swscale.c             | 2046 ++++++++++++++++++++++++
 libswscale/e2k/yuv2rgb.c             |  248 ++++
 libswscale/e2k/yuv2rgb.h             |   52 +
 libswscale/e2k/yuv2yuv.c             |  146 ++
 libswscale/swscale.c                 |    2 +
 libswscale/swscale_internal.h        |    5 +
 libswscale/swscale_unscaled.c        |    2 +
 libswscale/utils.c                   |   13 +
 libswscale/yuv2rgb.c                 |    2 +
 tests/checkasm/checkasm.c            |    2 +
 tests/checkasm/huffyuvdsp.c          |    8 +-
 122 files changed, 11491 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/e2k/Makefile
 create mode 100644 libavcodec/e2k/audiodsp.c
 create mode 100644 libavcodec/e2k/blockdsp.c
 create mode 100644 libavcodec/e2k/dctdsp.h
 create mode 100644 libavcodec/e2k/fdctdsp.c
 create mode 100644 libavcodec/e2k/fft.c
 create mode 100644 libavcodec/e2k/fft.h
 create mode 100644 libavcodec/e2k/fft_init.c
 create mode 100644 libavcodec/e2k/fmtconvert.c
 create mode 100644 libavcodec/e2k/h264chroma.c
 create mode 100644 libavcodec/e2k/h264chroma_template.c
 create mode 100644 libavcodec/e2k/h264dsp.c
 create mode 100644 libavcodec/e2k/h264qpel.c
 create mode 100644 libavcodec/e2k/h264qpel_template.c
 create mode 100644 libavcodec/e2k/hevcdsp.c
 create mode 100644 libavcodec/e2k/hpeldsp.c
 create mode 100644 libavcodec/e2k/hpeldsp.h
 create mode 100644 libavcodec/e2k/idctdsp.c
 create mode 100644 libavcodec/e2k/lossless_audiodsp.c
 create mode 100644 libavcodec/e2k/lossless_videodsp.c
 create mode 100644 libavcodec/e2k/mdct15.c
 create mode 100644 libavcodec/e2k/me_cmp.c
 create mode 100644 libavcodec/e2k/mpegaudiodsp.c
 create mode 100644 libavcodec/e2k/mpegvideo.c
 create mode 100644 libavcodec/e2k/mpegvideodsp.c
 create mode 100644 libavcodec/e2k/mpegvideoencdsp.c
 create mode 100644 libavcodec/e2k/pixblockdsp.c
 create mode 100644 libavcodec/e2k/svq1enc.c
 create mode 100644 libavcodec/e2k/vc1dsp.c
 create mode 100644 libavcodec/e2k/videodsp.c
 create mode 100644 libavcodec/e2k/vorbisdsp.c
 create mode 100644 libavcodec/e2k/vp3dsp.c
 create mode 100644 libavcodec/e2k/vp8dsp.c
 create mode 100644 libavcodec/e2k/vp9dsp.c
 create mode 100644 libavcodec/tests/e2k/dct.c
 create mode 100644 libavutil/e2k/Makefile
 create mode 100644 libavutil/e2k/cpu.c
 create mode 100644 libavutil/e2k/cpu.h
 create mode 100644 libavutil/e2k/float_dsp.c
 create mode 100644 libavutil/e2k/intreadwrite.h
 create mode 100644 libavutil/e2k/timer.h
 create mode 100644 libavutil/e2k/util_e2k.h
 create mode 100644 libswresample/e2k/Makefile
 create mode 100644 libswresample/e2k/audio_convert.c
 create mode 100644 libswscale/e2k/Makefile
 create mode 100644 libswscale/e2k/swscale.c
 create mode 100644 libswscale/e2k/yuv2rgb.c
 create mode 100644 libswscale/e2k/yuv2rgb.h
 create mode 100644 libswscale/e2k/yuv2yuv.c

diff --git a/configure b/configure
181
index 19c1865..3c2a9ab 100755
182
--- a/configure
183
+++ b/configure
184
@@ -1989,6 +1989,7 @@ ARCH_LIST="
185
     parisc
186
     ppc
187
     ppc64
188
+    e2k
189
     s390
190
     sh4
191
     sparc
192
@@ -2060,6 +2061,10 @@ ARCH_EXT_LIST_PPC="
193
     vsx
194
 "
195
 
196
+ARCH_EXT_LIST_E2K="
197
+    e2k
198
+"
199
+
200
 ARCH_EXT_LIST_X86="
201
     $ARCH_EXT_LIST_X86_SIMD
202
     cpunop
203
@@ -2069,6 +2074,7 @@ ARCH_EXT_LIST_X86="
204
 ARCH_EXT_LIST="
205
     $ARCH_EXT_LIST_ARM
206
     $ARCH_EXT_LIST_PPC
207
+    $ARCH_EXT_LIST_E2K
208
     $ARCH_EXT_LIST_X86
209
     $ARCH_EXT_LIST_MIPS
210
     $ARCH_EXT_LIST_LOONGSON
211
@@ -2594,10 +2600,10 @@ for ext in $(filter_out mmx $ARCH_EXT_LIST_X86_SIMD); do
212
 done
213
 
214
 aligned_stack_if_any="aarch64 ppc x86"
215
-fast_64bit_if_any="aarch64 alpha ia64 mips64 parisc64 ppc64 sparc64 x86_64"
216
-fast_clz_if_any="aarch64 alpha avr32 mips ppc x86"
217
+fast_64bit_if_any="aarch64 alpha ia64 mips64 parisc64 ppc64 e2k sparc64 x86_64"
218
+fast_clz_if_any="aarch64 alpha avr32 mips ppc e2k x86"
219
 fast_unaligned_if_any="aarch64 ppc x86"
220
-simd_align_16_if_any="altivec neon sse"
221
+simd_align_16_if_any="altivec e2k neon sse"
222
 simd_align_32_if_any="avx"
223
 simd_align_64_if_any="avx512"
224
 
225
@@ -4889,6 +4895,9 @@ case "$arch" in
226
     "Power Macintosh"|ppc*|powerpc*)
227
         arch="ppc"
228
     ;;
229
+    e2k|elbrus)
230
+        arch="e2k"
231
+    ;;
232
     s390|s390x)
233
         arch="s390"
234
     ;;
235
@@ -5177,6 +5186,11 @@ elif enabled ppc; then
236
         ;;
237
     esac
238
 
239
+elif enabled e2k; then
240
+
241
+    cpu="e2k"
242
+    cpuflags="-msse4.1 -mno-avx"
243
+
244
 elif enabled sparc; then
245
 
246
     case $cpu in
247
diff --git a/libavcodec/audiodsp.c b/libavcodec/audiodsp.c
248
index efcb0a8..36b8528 100644
249
--- a/libavcodec/audiodsp.c
250
+++ b/libavcodec/audiodsp.c
251
@@ -113,6 +113,8 @@ av_cold void ff_audiodsp_init(AudioDSPContext *c)
252
         ff_audiodsp_init_arm(c);
253
     if (ARCH_PPC)
254
         ff_audiodsp_init_ppc(c);
255
+    if (ARCH_E2K)
256
+        ff_audiodsp_init_e2k(c);
257
     if (ARCH_X86)
258
         ff_audiodsp_init_x86(c);
259
 }
260
diff --git a/libavcodec/audiodsp.h b/libavcodec/audiodsp.h
261
index aa6fa78..9c05e28 100644
262
--- a/libavcodec/audiodsp.h
263
+++ b/libavcodec/audiodsp.h
264
@@ -55,6 +55,7 @@ typedef struct AudioDSPContext {
265
 void ff_audiodsp_init(AudioDSPContext *c);
266
 void ff_audiodsp_init_arm(AudioDSPContext *c);
267
 void ff_audiodsp_init_ppc(AudioDSPContext *c);
268
+void ff_audiodsp_init_e2k(AudioDSPContext *c);
269
 void ff_audiodsp_init_x86(AudioDSPContext *c);
270
 
271
 #endif /* AVCODEC_AUDIODSP_H */
272
diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c
273
index c7efe7e..704c723 100644
274
--- a/libavcodec/blockdsp.c
275
+++ b/libavcodec/blockdsp.c
276
@@ -71,6 +71,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx)
277
         ff_blockdsp_init_arm(c);
278
     if (ARCH_PPC)
279
         ff_blockdsp_init_ppc(c);
280
+    if (ARCH_E2K)
281
+        ff_blockdsp_init_e2k(c);
282
     if (ARCH_X86)
283
         ff_blockdsp_init_x86(c, avctx);
284
     if (ARCH_MIPS)
285
diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h
286
index 26fc2ea..8eccb47 100644
287
--- a/libavcodec/blockdsp.h
288
+++ b/libavcodec/blockdsp.h
289
@@ -44,6 +44,7 @@ void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx);
290
 void ff_blockdsp_init_alpha(BlockDSPContext *c);
291
 void ff_blockdsp_init_arm(BlockDSPContext *c);
292
 void ff_blockdsp_init_ppc(BlockDSPContext *c);
293
+void ff_blockdsp_init_e2k(BlockDSPContext *c);
294
 void ff_blockdsp_init_x86(BlockDSPContext *c, AVCodecContext *avctx);
295
 void ff_blockdsp_init_mips(BlockDSPContext *c);
296
 
297
diff --git a/libavcodec/e2k/Makefile b/libavcodec/e2k/Makefile
298
new file mode 100644
299
index 0000000..3564b97
300
--- /dev/null
301
+++ b/libavcodec/e2k/Makefile
302
@@ -0,0 +1,31 @@
303
+# subsystems
304
+OBJS-$(CONFIG_AUDIODSP)                += e2k/audiodsp.o
305
+OBJS-$(CONFIG_BLOCKDSP)                += e2k/blockdsp.o
306
+OBJS-$(CONFIG_FFT)                     += e2k/fft_init.o e2k/fft.o
307
+OBJS-$(CONFIG_FDCTDSP)                 += e2k/fdctdsp.o
308
+OBJS-$(CONFIG_FMTCONVERT)              += e2k/fmtconvert.o
309
+OBJS-$(CONFIG_H264CHROMA)              += e2k/h264chroma.o
310
+OBJS-$(CONFIG_H264DSP)                 += e2k/h264dsp.o e2k/hpeldsp.o
311
+OBJS-$(CONFIG_H264QPEL)                += e2k/h264qpel.o
312
+OBJS-$(CONFIG_HPELDSP)                 += e2k/hpeldsp.o
313
+OBJS-$(CONFIG_IDCTDSP)                 += e2k/idctdsp.o
314
+OBJS-$(CONFIG_LLVIDDSP)                += e2k/lossless_videodsp.o
315
+OBJS-$(CONFIG_MDCT15)                  += e2k/mdct15.o
316
+OBJS-$(CONFIG_ME_CMP)                  += e2k/me_cmp.o
317
+OBJS-$(CONFIG_MPEGAUDIODSP)            += e2k/mpegaudiodsp.o
318
+OBJS-$(CONFIG_MPEGVIDEO)               += e2k/mpegvideo.o e2k/mpegvideodsp.o
319
+OBJS-$(CONFIG_MPEGVIDEOENC)            += e2k/mpegvideoencdsp.o
320
+OBJS-$(CONFIG_PIXBLOCKDSP)             += e2k/pixblockdsp.o
321
+OBJS-$(CONFIG_VC1DSP)                  += e2k/vc1dsp.o
322
+OBJS-$(CONFIG_VIDEODSP)                += e2k/videodsp.o
323
+OBJS-$(CONFIG_VP3DSP)                  += e2k/vp3dsp.o
324
+OBJS-$(CONFIG_VP8DSP)                  += e2k/vp8dsp.o
325
+
326
+# decoders/encoders
327
+OBJS-$(CONFIG_HEVC_DECODER)            += e2k/hevcdsp.o
328
+OBJS-$(CONFIG_LLAUDDSP)                += e2k/lossless_audiodsp.o
329
+OBJS-$(CONFIG_SVQ1_ENCODER)            += e2k/svq1enc.o
330
+OBJS-$(CONFIG_VORBIS_DECODER)          += e2k/vorbisdsp.o
331
+OBJS-$(CONFIG_VP7_DECODER)             += e2k/vp8dsp.o
332
+OBJS-$(CONFIG_VP9_DECODER)             += e2k/vp9dsp.o
333
+
334
diff --git a/libavcodec/e2k/audiodsp.c b/libavcodec/e2k/audiodsp.c
335
new file mode 100644
336
index 0000000..c2e4433
337
--- /dev/null
338
+++ b/libavcodec/e2k/audiodsp.c
339
@@ -0,0 +1,62 @@
340
+/*
341
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
342
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
343
+ *
344
+ * This file is part of FFmpeg.
345
+ *
346
+ * FFmpeg is free software; you can redistribute it and/or
347
+ * modify it under the terms of the GNU Lesser General Public
348
+ * License as published by the Free Software Foundation; either
349
+ * version 2.1 of the License, or (at your option) any later version.
350
+ *
351
+ * FFmpeg is distributed in the hope that it will be useful,
352
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
353
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
354
+ * Lesser General Public License for more details.
355
+ *
356
+ * You should have received a copy of the GNU Lesser General Public
357
+ * License along with FFmpeg; if not, write to the Free Software
358
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
359
+ */
360
+
361
+/**
362
+ * @file
363
+ * miscellaneous audio operations
364
+ */
365
+
366
+#include "config.h"
367
+
368
+#include "libavutil/attributes.h"
369
+#include "libavutil/cpu.h"
370
+#include "libavutil/e2k/cpu.h"
371
+#include "libavutil/e2k/util_e2k.h"
372
+
373
+#include "libavcodec/audiodsp.h"
374
+
375
+static int32_t scalarproduct_int16_e2k(const int16_t *v1, const int16_t *v2, int order)
376
+{
377
+    int i;
378
+    vec_s16 vec1, vec2;
379
+    vec_s32 res = _mm_setzero_si128(), tmp;
380
+
381
+    PRAGMA_E2K("ivdep")
382
+    for (i = 0; i < order; i += 8) {
383
+        vec1 = VEC_LD(v1);
384
+        vec2 = VEC_LD(v2);
385
+        tmp = _mm_madd_epi16(vec1, vec2);
386
+        res = _mm_add_epi32(res, tmp);
387
+        v1 += 8;
388
+        v2 += 8;
389
+    }
390
+
391
+    res = _mm_hadd_epi32(res, res);
392
+    return _mm_extract_epi32(res, 0) + _mm_extract_epi32(res, 1);
393
+}
394
+
395
+av_cold void ff_audiodsp_init_e2k(AudioDSPContext *c)
396
+{
397
+    if (!E2K_BASE(av_get_cpu_flags()))
398
+        return;
399
+
400
+    c->scalarproduct_int16 = scalarproduct_int16_e2k;
401
+}
402
diff --git a/libavcodec/e2k/blockdsp.c b/libavcodec/e2k/blockdsp.c
403
new file mode 100644
404
index 0000000..f85dce1
405
--- /dev/null
406
+++ b/libavcodec/e2k/blockdsp.c
407
@@ -0,0 +1,72 @@
408
+/*
409
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
410
+ * Copyright (c) 2002 Brian Foley
411
+ * Copyright (c) 2002 Dieter Shirley
412
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
413
+ *
414
+ * This file is part of FFmpeg.
415
+ *
416
+ * FFmpeg is free software; you can redistribute it and/or
417
+ * modify it under the terms of the GNU Lesser General Public
418
+ * License as published by the Free Software Foundation; either
419
+ * version 2.1 of the License, or (at your option) any later version.
420
+ *
421
+ * FFmpeg is distributed in the hope that it will be useful,
422
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
423
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
424
+ * Lesser General Public License for more details.
425
+ *
426
+ * You should have received a copy of the GNU Lesser General Public
427
+ * License along with FFmpeg; if not, write to the Free Software
428
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
429
+ */
430
+
431
+#include "config.h"
432
+
433
+#include <string.h>
434
+
435
+#include "libavutil/attributes.h"
436
+#include "libavutil/cpu.h"
437
+#include "libavutil/mem.h"
438
+#include "libavutil/e2k/cpu.h"
439
+#include "libavutil/e2k/util_e2k.h"
440
+
441
+#include "libavcodec/blockdsp.h"
442
+
443
+static void clear_block_e2k(int16_t *block)
444
+{
445
+    LOAD_ZERO;
446
+    VEC_ST(block, zerov);
447
+    VEC_ST(block + 8, zerov);
448
+    VEC_ST(block + 8 * 2, zerov);
449
+    VEC_ST(block + 8 * 3, zerov);
450
+    VEC_ST(block + 8 * 4, zerov);
451
+    VEC_ST(block + 8 * 5, zerov);
452
+    VEC_ST(block + 8 * 6, zerov);
453
+    VEC_ST(block + 8 * 7, zerov);
454
+}
455
+
456
+static void clear_blocks_e2k(int16_t *blocks)
457
+{
458
+    int i;
459
+    LOAD_ZERO;
460
+    for (i = 0; i < 6; i++, blocks += 64) {
461
+        VEC_ST(blocks, zerov);
462
+        VEC_ST(blocks + 8, zerov);
463
+        VEC_ST(blocks + 8 * 2, zerov);
464
+        VEC_ST(blocks + 8 * 3, zerov);
465
+        VEC_ST(blocks + 8 * 4, zerov);
466
+        VEC_ST(blocks + 8 * 5, zerov);
467
+        VEC_ST(blocks + 8 * 6, zerov);
468
+        VEC_ST(blocks + 8 * 7, zerov);
469
+    }
470
+}
471
+
472
+av_cold void ff_blockdsp_init_e2k(BlockDSPContext *c)
473
+{
474
+    if (!E2K_BASE(av_get_cpu_flags()))
475
+        return;
476
+
477
+    c->clear_block = clear_block_e2k;
478
+    c->clear_blocks = clear_blocks_e2k;
479
+}
480
diff --git a/libavcodec/e2k/dctdsp.h b/libavcodec/e2k/dctdsp.h
481
new file mode 100644
482
index 0000000..1281dc7
483
--- /dev/null
484
+++ b/libavcodec/e2k/dctdsp.h
485
@@ -0,0 +1,27 @@
486
+/*
487
+ * This file is part of FFmpeg.
488
+ *
489
+ * FFmpeg is free software; you can redistribute it and/or
490
+ * modify it under the terms of the GNU Lesser General Public
491
+ * License as published by the Free Software Foundation; either
492
+ * version 2.1 of the License, or (at your option) any later version.
493
+ *
494
+ * FFmpeg is distributed in the hope that it will be useful,
495
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
496
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
497
+ * Lesser General Public License for more details.
498
+ *
499
+ * You should have received a copy of the GNU Lesser General Public
500
+ * License along with FFmpeg; if not, write to the Free Software
501
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
502
+ */
503
+
504
+#ifndef AVCODEC_E2K_DCTDSP_H
505
+#define AVCODEC_E2K_DCTDSP_H
506
+
507
+#include <stdint.h>
508
+
509
+void ff_fdct_e2k(int16_t *block);
510
+void ff_idct_e2k(int16_t *block);
511
+
512
+#endif /* AVCODEC_E2K_DCTDSP_H */
513
diff --git a/libavcodec/e2k/fdctdsp.c b/libavcodec/e2k/fdctdsp.c
514
new file mode 100644
515
index 0000000..568a67f
516
--- /dev/null
517
+++ b/libavcodec/e2k/fdctdsp.c
518
@@ -0,0 +1,389 @@
519
+/*
520
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
521
+ * Copyright (C) 2003  James Klicman <james@klicman.org>
522
+ *
523
+ * This file is part of FFmpeg.
524
+ *
525
+ * FFmpeg is free software; you can redistribute it and/or
526
+ * modify it under the terms of the GNU Lesser General Public
527
+ * License as published by the Free Software Foundation; either
528
+ * version 2.1 of the License, or (at your option) any later version.
529
+ *
530
+ * FFmpeg is distributed in the hope that it will be useful,
531
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
532
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
533
+ * Lesser General Public License for more details.
534
+ *
535
+ * You should have received a copy of the GNU Lesser General Public
536
+ * License along with FFmpeg; if not, write to the Free Software
537
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
538
+ */
539
+
540
+#include "config.h"
541
+
542
+#include "libavutil/attributes.h"
543
+#include "libavutil/cpu.h"
544
+#include "libavutil/e2k/cpu.h"
545
+#include "libavutil/e2k/util_e2k.h"
546
+
547
+#include "libavcodec/fdctdsp.h"
548
+
549
+#include "dctdsp.h"
550
+
551
+#define C1     0.98078528040323044912618224 /* cos(1 * PI / 16) */
552
+#define C2     0.92387953251128675612818319 /* cos(2 * PI / 16) */
553
+#define C3     0.83146961230254523707878838 /* cos(3 * PI / 16) */
554
+#define C4     0.70710678118654752440084436 /* cos(4 * PI / 16) */
555
+#define C5     0.55557023301960222474283081 /* cos(5 * PI / 16) */
556
+#define C6     0.38268343236508977172845998 /* cos(6 * PI / 16) */
557
+#define C7     0.19509032201612826784828487 /* cos(7 * PI / 16) */
558
+
559
+#define W0 -(2 * C2)
560
+#define W1  (2 * C6)
561
+#define W2 (M_SQRT2 * C6)
562
+#define W3 (M_SQRT2 * C3)
563
+#define W4 (M_SQRT2 * (-C1 + C3 + C5 - C7))
564
+#define W5 (M_SQRT2 *  (C1 + C3 - C5 + C7))
565
+#define W6 (M_SQRT2 *  (C1 + C3 + C5 - C7))
566
+#define W7 (M_SQRT2 *  (C1 + C3 - C5 - C7))
567
+#define W8 (M_SQRT2 *  (C7 - C3))
568
+#define W9 (M_SQRT2 * (-C1 - C3))
569
+#define WA (M_SQRT2 * (-C3 - C5))
570
+#define WB (M_SQRT2 *  (C5 - C3))
571
+
572
+#define LD_W0 _mm_set1_ps(W0)
573
+#define LD_W1 _mm_set1_ps(W1)
574
+#define LD_W2 _mm_set1_ps(W2)
575
+#define LD_W3 _mm_set1_ps(W3)
576
+#define LD_W4 _mm_set1_ps(W4)
577
+#define LD_W5 _mm_set1_ps(W5)
578
+#define LD_W6 _mm_set1_ps(W6)
579
+#define LD_W7 _mm_set1_ps(W7)
580
+#define LD_W8 _mm_set1_ps(W8)
581
+#define LD_W9 _mm_set1_ps(W9)
582
+#define LD_WA _mm_set1_ps(WA)
583
+#define LD_WB _mm_set1_ps(WB)
584
+
585
+#define _mm_madd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
586
+
587
+#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */           \
588
+    x0 = _mm_add_ps(b0, b7);            /* x0 = b0 + b7; */         \
589
+    x7 = _mm_sub_ps(b0, b7);            /* x7 = b0 - b7; */         \
590
+    x1 = _mm_add_ps(b1, b6);            /* x1 = b1 + b6; */         \
591
+    x6 = _mm_sub_ps(b1, b6);            /* x6 = b1 - b6; */         \
592
+    x2 = _mm_add_ps(b2, b5);            /* x2 = b2 + b5; */         \
593
+    x5 = _mm_sub_ps(b2, b5);            /* x5 = b2 - b5; */         \
594
+    x3 = _mm_add_ps(b3, b4);            /* x3 = b3 + b4; */         \
595
+    x4 = _mm_sub_ps(b3, b4);            /* x4 = b3 - b4; */         \
596
+                                                                    \
597
+    b7 = _mm_add_ps(x0, x3);            /* b7 = x0 + x3; */         \
598
+    b1 = _mm_add_ps(x1, x2);            /* b1 = x1 + x2; */         \
599
+    b0 = _mm_add_ps(b7, b1);            /* b0 = b7 + b1; */         \
600
+    b4 = _mm_sub_ps(b7, b1);            /* b4 = b7 - b1; */         \
601
+                                                                    \
602
+    b2   = _mm_sub_ps(x0, x3);          /* b2 = x0 - x3; */         \
603
+    b6   = _mm_sub_ps(x1, x2);          /* b6 = x1 - x2; */         \
604
+    b5   = _mm_add_ps(b6, b2);          /* b5 = b6 + b2; */         \
605
+    cnst = LD_W2;                                                   \
606
+    b5   = _mm_mul_ps(cnst, b5);        /* b5 = b5 * W2; */         \
607
+    cnst = LD_W1;                                                   \
608
+    b2   = _mm_madd_ps(cnst, b2, b5);   /* b2 = b5 + b2 * W1; */    \
609
+    cnst = LD_W0;                                                   \
610
+    b6   = _mm_madd_ps(cnst, b6, b5);   /* b6 = b5 + b6 * W0; */    \
611
+                                                                    \
612
+    x0   = _mm_add_ps(x4, x7);          /* x0 = x4 + x7; */         \
613
+    x1   = _mm_add_ps(x5, x6);          /* x1 = x5 + x6; */         \
614
+    x2   = _mm_add_ps(x4, x6);          /* x2 = x4 + x6; */         \
615
+    x3   = _mm_add_ps(x5, x7);          /* x3 = x5 + x7; */         \
616
+    x8   = _mm_add_ps(x2, x3);          /* x8 = x2 + x3; */         \
617
+    cnst = LD_W3;                                                   \
618
+    x8   = _mm_mul_ps(cnst, x8);        /* x8 = x8 * W3; */         \
619
+                                                                    \
620
+    cnst = LD_W8;                                                   \
621
+    x0   = _mm_mul_ps(cnst, x0);        /* x0 *= W8; */             \
622
+    cnst = LD_W9;                                                   \
623
+    x1   = _mm_mul_ps(cnst, x1);        /* x1 *= W9; */             \
624
+    cnst = LD_WA;                                                   \
625
+    x2   = _mm_madd_ps(cnst, x2, x8);   /* x2 = x2 * WA + x8; */    \
626
+    cnst = LD_WB;                                                   \
627
+    x3   = _mm_madd_ps(cnst, x3, x8);   /* x3 = x3 * WB + x8; */    \
628
+                                                                    \
629
+    cnst = LD_W4;                                                   \
630
+    b7   = _mm_madd_ps(cnst, x4, x0);   /* b7 = x4 * W4 + x0; */    \
631
+    cnst = LD_W5;                                                   \
632
+    b5   = _mm_madd_ps(cnst, x5, x1);   /* b5 = x5 * W5 + x1; */    \
633
+    cnst = LD_W6;                                                   \
634
+    b3   = _mm_madd_ps(cnst, x6, x1);   /* b3 = x6 * W6 + x1; */    \
635
+    cnst = LD_W7;                                                   \
636
+    b1   = _mm_madd_ps(cnst, x7, x0);   /* b1 = x7 * W7 + x0; */    \
637
+                                                                    \
638
+    b7 = _mm_add_ps(b7, x2);            /* b7 = b7 + x2; */         \
639
+    b5 = _mm_add_ps(b5, x3);            /* b5 = b5 + x3; */         \
640
+    b3 = _mm_add_ps(b3, x2);            /* b3 = b3 + x2; */         \
641
+    b1 = _mm_add_ps(b1, x3)             /* b1 = b1 + x3; */         \
642
+    /* }}} */
643
+
644
+#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */           \
645
+    x0 = _mm_add_ps(b0, b7);            /* x0 = b0 + b7; */         \
646
+    x7 = _mm_sub_ps(b0, b7);            /* x7 = b0 - b7; */         \
647
+    x1 = _mm_add_ps(b1, b6);            /* x1 = b1 + b6; */         \
648
+    x6 = _mm_sub_ps(b1, b6);            /* x6 = b1 - b6; */         \
649
+    x2 = _mm_add_ps(b2, b5);            /* x2 = b2 + b5; */         \
650
+    x5 = _mm_sub_ps(b2, b5);            /* x5 = b2 - b5; */         \
651
+    x3 = _mm_add_ps(b3, b4);            /* x3 = b3 + b4; */         \
652
+    x4 = _mm_sub_ps(b3, b4);            /* x4 = b3 - b4; */         \
653
+                                                                    \
654
+    b7 = _mm_add_ps(x0, x3);            /* b7 = x0 + x3; */         \
655
+    b1 = _mm_add_ps(x1, x2);            /* b1 = x1 + x2; */         \
656
+    b0 = _mm_add_ps(b7, b1);            /* b0 = b7 + b1; */         \
657
+    b4 = _mm_sub_ps(b7, b1);            /* b4 = b7 - b1; */         \
658
+                                                                    \
659
+    b2   = _mm_sub_ps(x0, x3);          /* b2 = x0 - x3; */         \
660
+    b6   = _mm_sub_ps(x1, x2);          /* b6 = x1 - x2; */         \
661
+    b5   = _mm_add_ps(b6, b2);          /* b5 = b6 + b2; */         \
662
+    cnst = LD_W2;                                                   \
663
+    b5   = _mm_mul_ps(cnst, b5);        /* b5 = b5 * W2; */         \
664
+    cnst = LD_W1;                                                   \
665
+    b2   = _mm_madd_ps(cnst, b2, b5);   /* b2 = b5 + b2 * W1; */    \
666
+    cnst = LD_W0;                                                   \
667
+    b6   = _mm_madd_ps(cnst, b6, b5);   /* b6 = b5 + b6 * W0; */    \
668
+                                                                    \
669
+    x0   = _mm_add_ps(x4, x7);          /* x0 = x4 + x7; */         \
670
+    x1   = _mm_add_ps(x5, x6);          /* x1 = x5 + x6; */         \
671
+    x2   = _mm_add_ps(x4, x6);          /* x2 = x4 + x6; */         \
672
+    x3   = _mm_add_ps(x5, x7);          /* x3 = x5 + x7; */         \
673
+    x8   = _mm_add_ps(x2, x3);          /* x8 = x2 + x3; */         \
674
+    cnst = LD_W3;                                                   \
675
+    x8   = _mm_mul_ps(cnst, x8);        /* x8 = x8 * W3; */         \
676
+                                                                    \
677
+    cnst = LD_W8;                                                   \
678
+    x0   = _mm_mul_ps(cnst, x0);        /* x0 *= W8; */             \
679
+    cnst = LD_W9;                                                   \
680
+    x1   = _mm_mul_ps(cnst, x1);        /* x1 *= W9; */             \
681
+    cnst = LD_WA;                                                   \
682
+    x2   = _mm_madd_ps(cnst, x2, x8);   /* x2 = x2 * WA + x8; */    \
683
+    cnst = LD_WB;                                                   \
684
+    x3   = _mm_madd_ps(cnst, x3, x8);   /* x3 = x3 * WB + x8; */    \
685
+                                                                    \
686
+    cnst = LD_W4;                                                   \
687
+    b7   = _mm_madd_ps(cnst, x4, x0);   /* b7 = x4 * W4 + x0; */    \
688
+    cnst = LD_W5;                                                   \
689
+    b5   = _mm_madd_ps(cnst, x5, x1);   /* b5 = x5 * W5 + x1; */    \
690
+    cnst = LD_W6;                                                   \
691
+    b3   = _mm_madd_ps(cnst, x6, x1);   /* b3 = x6 * W6 + x1; */    \
692
+    cnst = LD_W7;                                                   \
693
+    b1   = _mm_madd_ps(cnst, x7, x0);   /* b1 = x7 * W7 + x0; */    \
694
+                                                                    \
695
+    b7 = _mm_add_ps(b7, x2);            /* b7 += x2; */             \
696
+    b5 = _mm_add_ps(b5, x3);            /* b5 += x3; */             \
697
+    b3 = _mm_add_ps(b3, x2);            /* b3 += x2; */             \
698
+    b1 = _mm_add_ps(b1, x3)             /* b1 += x3; */             \
699
+    /* }}} */
700
+
701
+/* two dimensional discrete cosine transform */
702
+void ff_fdct_e2k(int16_t *block)
703
+{
704
+    vec_f b00, b10, b20, b30, b40, b50, b60, b70;
705
+    vec_f b01, b11, b21, b31, b41, b51, b61, b71;
706
+    vec_f cnst;
707
+    vec_f x0, x1, x2, x3, x4, x5, x6, x7, x8;
708
+    vec_s16 a0, a1, a2, a3, a4, a5, a6, a7;
709
+    vec_s16 z0, z1, z2, z3, z4, z5, z6, z7;
710
+
711
+    a0 = VEC_LD(block + 8 * 0);
712
+    a4 = VEC_LD(block + 8 * 4);
713
+    a1 = VEC_LD(block + 8 * 1);
714
+    a5 = VEC_LD(block + 8 * 5);
715
+    a2 = VEC_LD(block + 8 * 2);
716
+    a6 = VEC_LD(block + 8 * 6);
717
+    a3 = VEC_LD(block + 8 * 3);
718
+    a7 = VEC_LD(block + 8 * 7);
719
+
720
+    TRANSPOSE8(a0, a1, a2, a3, a4, a5, a6, a7);
721
+
722
+    /* Some of the initial calculations can be done as vector short
723
+     * before conversion to vec_f.  The following code section
724
+     * takes advantage of this. */
725
+
726
+    /* fdct rows {{{ */
727
+    z0 = _mm_add_epi16(a0, a7);
728
+    z7 = _mm_sub_epi16(a0, a7);
729
+    z1 = _mm_add_epi16(a1, a6);
730
+    z6 = _mm_sub_epi16(a1, a6);
731
+    z2 = _mm_add_epi16(a2, a5);
732
+    z5 = _mm_sub_epi16(a2, a5);
733
+    z3 = _mm_add_epi16(a3, a4);
734
+    z4 = _mm_sub_epi16(a3, a4);
735
+
736
+#define CTF0(n) \
737
+    b##n##0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a##n, a##n), 16));\
738
+    b##n##1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a##n, a##n), 16));
739
+
740
+    a7 = _mm_add_epi16(z0, z3);
741
+    a1 = _mm_add_epi16(z1, z2);
742
+    a0 = _mm_add_epi16(a7, a1);
743
+    a4 = _mm_sub_epi16(a7, a1);
744
+    CTF0(0);
745
+    CTF0(4);
746
+
747
+    a2 = _mm_sub_epi16(z0, z3);
748
+    a6 = _mm_sub_epi16(z1, z2);
749
+    CTF0(2);
750
+    CTF0(6);
751
+
752
+#undef CTF0
753
+
754
+    x0 = _mm_add_ps(b60, b20);
755
+    x1 = _mm_add_ps(b61, b21);
756
+
757
+    cnst = LD_W2;
758
+    x0   = _mm_mul_ps(cnst, x0);
759
+    x1   = _mm_mul_ps(cnst, x1);
760
+    cnst = LD_W1;
761
+    b20  = _mm_madd_ps(cnst, b20, x0);
762
+    b21  = _mm_madd_ps(cnst, b21, x1);
763
+    cnst = LD_W0;
764
+    b60  = _mm_madd_ps(cnst, b60, x0);
765
+    b61  = _mm_madd_ps(cnst, b61, x1);
766
+
767
+#define CTFX(x, b) \
768
+    b##0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16)); \
769
+    b##1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16));
770
+
771
+    CTFX(z4, b7);
772
+    CTFX(z5, b5);
773
+    CTFX(z6, b3);
774
+    CTFX(z7, b1);
775
+
776
+#undef CTFX
777
+
778
+    x0   = _mm_add_ps(b70, b10);
779
+    x1   = _mm_add_ps(b50, b30);
780
+    x2   = _mm_add_ps(b70, b30);
781
+    x3   = _mm_add_ps(b50, b10);
782
+    x8   = _mm_add_ps(x2, x3);
783
+    cnst = LD_W3;
784
+    x8   = _mm_mul_ps(cnst, x8);
785
+
786
+    cnst = LD_W8;
787
+    x0   = _mm_mul_ps(cnst, x0);
788
+    cnst = LD_W9;
789
+    x1   = _mm_mul_ps(cnst, x1);
790
+    cnst = LD_WA;
791
+    x2   = _mm_madd_ps(cnst, x2, x8);
792
+    cnst = LD_WB;
793
+    x3   = _mm_madd_ps(cnst, x3, x8);
794
+
795
+    cnst = LD_W4;
796
+    b70  = _mm_madd_ps(cnst, b70, x0);
797
+    cnst = LD_W5;
798
+    b50  = _mm_madd_ps(cnst, b50, x1);
799
+    cnst = LD_W6;
800
+    b30  = _mm_madd_ps(cnst, b30, x1);
801
+    cnst = LD_W7;
802
+    b10  = _mm_madd_ps(cnst, b10, x0);
803
+
804
+    b70 = _mm_add_ps(b70, x2);
805
+    b50 = _mm_add_ps(b50, x3);
806
+    b30 = _mm_add_ps(b30, x2);
807
+    b10 = _mm_add_ps(b10, x3);
808
+
809
+    x0   = _mm_add_ps(b71, b11);
810
+    x1   = _mm_add_ps(b51, b31);
811
+    x2   = _mm_add_ps(b71, b31);
812
+    x3   = _mm_add_ps(b51, b11);
813
+    x8   = _mm_add_ps(x2, x3);
814
+    cnst = LD_W3;
815
+    x8   = _mm_mul_ps(cnst, x8);
816
+
817
+    cnst = LD_W8;
818
+    x0   = _mm_mul_ps(cnst, x0);
819
+    cnst = LD_W9;
820
+    x1   = _mm_mul_ps(cnst, x1);
821
+    cnst = LD_WA;
822
+    x2   = _mm_madd_ps(cnst, x2, x8);
823
+    cnst = LD_WB;
824
+    x3   = _mm_madd_ps(cnst, x3, x8);
825
+
826
+    cnst = LD_W4;
827
+    b71  = _mm_madd_ps(cnst, b71, x0);
828
+    cnst = LD_W5;
829
+    b51  = _mm_madd_ps(cnst, b51, x1);
830
+    cnst = LD_W6;
831
+    b31  = _mm_madd_ps(cnst, b31, x1);
832
+    cnst = LD_W7;
833
+    b11  = _mm_madd_ps(cnst, b11, x0);
834
+
835
+    b71 = _mm_add_ps(b71, x2);
836
+    b51 = _mm_add_ps(b51, x3);
837
+    b31 = _mm_add_ps(b31, x2);
838
+    b11 = _mm_add_ps(b11, x3);
839
+    /* }}} */
840
+
841
+    /* 8x8 matrix transpose (vec_f[8][2]) {{{ */
842
+    x0 = _mm_unpacklo_ps(b00, b10);
843
+    x1 = _mm_unpackhi_ps(b00, b10);
844
+    x2 = _mm_unpacklo_ps(b20, b30);
845
+    x3 = _mm_unpackhi_ps(b20, b30);
846
+    b00 = _mm_unpacklo_ps2(x0, x2);
847
+    b10 = _mm_unpackhi_ps2(x0, x2);
848
+    b20 = _mm_unpacklo_ps2(x1, x3);
849
+    b30 = _mm_unpackhi_ps2(x1, x3);
850
+
851
+    x4 = _mm_unpacklo_ps(b41, b51);
852
+    x5 = _mm_unpackhi_ps(b41, b51);
853
+    x6 = _mm_unpacklo_ps(b61, b71);
854
+    x7 = _mm_unpackhi_ps(b61, b71);
855
+    b41 = _mm_unpacklo_ps2(x4, x6);
856
+    b51 = _mm_unpackhi_ps2(x4, x6);
857
+    b61 = _mm_unpacklo_ps2(x5, x7);
858
+    b71 = _mm_unpackhi_ps2(x5, x7);
859
+
860
+    x0 = _mm_unpacklo_ps(b01, b11);
861
+    x1 = _mm_unpackhi_ps(b01, b11);
862
+    x2 = _mm_unpacklo_ps(b21, b31);
863
+    x3 = _mm_unpackhi_ps(b21, b31);
864
+    x4 = _mm_unpacklo_ps(b40, b50);
865
+    x5 = _mm_unpackhi_ps(b40, b50);
866
+    x6 = _mm_unpacklo_ps(b60, b70);
867
+    x7 = _mm_unpackhi_ps(b60, b70);
868
+    b40 = _mm_unpacklo_ps2(x0, x2);
869
+    b50 = _mm_unpackhi_ps2(x0, x2);
870
+    b60 = _mm_unpacklo_ps2(x1, x3);
871
+    b70 = _mm_unpackhi_ps2(x1, x3);
872
+    b01 = _mm_unpacklo_ps2(x4, x6);
873
+    b11 = _mm_unpackhi_ps2(x4, x6);
874
+    b21 = _mm_unpacklo_ps2(x5, x7);
875
+    b31 = _mm_unpackhi_ps2(x5, x7);
876
+    /* }}} */
877
+
878
+    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
879
+    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
880
+
881
+    /* round, convert back to short */
882
+#define CTS(n) \
883
+    a##n = _mm_packs_epi32(_mm_cvtps_epi32(b##n##0), _mm_cvtps_epi32(b##n##1)); \
884
+    VEC_ST(block + 8 * n, a##n)
885
+
886
+    CTS(0); CTS(1); CTS(2); CTS(3);
887
+    CTS(4); CTS(5); CTS(6); CTS(7);
888
+
889
+#undef CTS
890
+}
891
+
892
+av_cold void ff_fdctdsp_init_e2k(FDCTDSPContext *c, AVCodecContext *avctx,
893
+                                 unsigned high_bit_depth)
894
+{
895
+    if (!E2K_BASE(av_get_cpu_flags()))
896
+        return;
897
+
898
+    // !checkasm
899
+    // libavcodec/tests/dct
900
+
901
+    if (!high_bit_depth) {
902
+        if (avctx->dct_algo == FF_DCT_AUTO ||
903
+            avctx->dct_algo == FF_DCT_ALTIVEC) {
904
+            c->fdct = ff_fdct_e2k;
905
+        }
906
+    }
907
+}
908
diff --git a/libavcodec/e2k/fft.c b/libavcodec/e2k/fft.c
909
new file mode 100644
910
index 0000000..5b58202
911
--- /dev/null
912
+++ b/libavcodec/e2k/fft.c
913
@@ -0,0 +1,1043 @@
914
+/*
915
+ * FFT transform
916
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
917
+ * Copyright (c) 2014 Rong Yan
918
+ * Copyright (c) 2009 Loren Merritt
919
+ *
920
+ * This algorithm (though not any of the implementation details) is
921
+ * based on libdjbfft by D. J. Bernstein.
922
+ *
923
+ * This file is part of FFmpeg.
924
+ *
925
+ * FFmpeg is free software; you can redistribute it and/or
926
+ * modify it under the terms of the GNU Lesser General Public
927
+ * License as published by the Free Software Foundation; either
928
+ * version 2.1 of the License, or (at your option) any later version.
929
+ *
930
+ * FFmpeg is distributed in the hope that it will be useful,
931
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
932
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
933
+ * Lesser General Public License for more details.
934
+ *
935
+ * You should have received a copy of the GNU Lesser General Public
936
+ * License along with FFmpeg; if not, write to the Free Software
937
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
938
+ */
939
+
940
+
941
+#include "config.h"
942
+#include "libavutil/cpu.h"
943
+#include "libavutil/e2k/util_e2k.h"
944
+#include "libavcodec/fft.h"
945
+#include "libavcodec/fft-internal.h"
946
+#include "fft.h"
947
+
948
+#define _mm_madd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
949
+#define _mm_msub_ps(a, b, c) _mm_sub_ps(_mm_mul_ps(a, b), c)
950
+#define _mm_nmsub_ps(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
951
+
952
+static av_always_inline
953
+void pass_e2k_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
954
+{
955
+    int i1 = n * 4, i2 = n * 8, i3 = n * 12;
956
+    FFTSample *out = (FFTSample*)z;
957
+    const FFTSample *wim = wre + n * 2;
958
+    vec_f vz0, vzo1, vzo2, vzo3;
959
+    vec_f x0, x1, x2, x3;
960
+    vec_f x4, x5, x6, x7;
961
+    vec_f x8, x9, x10, x11;
962
+    vec_f x12, x13, x14, x15;
963
+    vec_f x16, x17, x18, x19;
964
+    vec_f x20, x21, x22, x23;
965
+    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
966
+    vec_f y0, y1, y2, y3;
967
+    vec_f y4, y5, y8, y9;
968
+    vec_f y10, y13, y14, y15;
969
+    vec_f y16, y17, y18, y19;
970
+    vec_f y20, y21, y22, y23;
971
+    vec_f wr1, wi1, wr0, wi0;
972
+    vec_f wr2, wi2, wr3, wi3;
973
+    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
974
+
975
+    n = n - 2;
976
+    vzo2 = _mm_loadu_ps(out + i2);  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
977
+    vzo2plus1 = _mm_loadu_ps(out + i2 + 4);
978
+    vzo3 = _mm_loadu_ps(out + i3);  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
979
+    vzo3plus1 = _mm_loadu_ps(out + i3 + 4);
980
+    vz0 = _mm_loadu_ps(out);    // z0.r  z0.i  z1.r  z1.i
981
+    vz0plus1 = _mm_loadu_ps(out + 4);
982
+    vzo1 = _mm_loadu_ps(out + i1);  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
983
+    vzo1plus1 = _mm_loadu_ps(out + i1 + 4);
984
+
985
+    x0 = _mm_add_ps(vzo2, vzo3);
986
+    x1 = _mm_sub_ps(vzo2, vzo3);
987
+    y0 = _mm_add_ps(vzo2plus1, vzo3plus1);
988
+    y1 = _mm_sub_ps(vzo2plus1, vzo3plus1);
989
+
990
+    wr1 = _mm_set1_ps(wre[1]);
991
+    wi1 = _mm_set1_ps(wim[-1]);
992
+    wi2 = _mm_set1_ps(wim[-2]);
993
+    wi3 = _mm_set1_ps(wim[-3]);
994
+    wr2 = _mm_set1_ps(wre[2]);
995
+    wr3 = _mm_set1_ps(wre[3]);
996
+
997
+    x2 = _mm_unpackhi_ps(x0, x1);
998
+    x3 = _mm_shuffle_ps(x2, x2, 0x1b);
999
+
1000
+    y2 = _mm_unpacklo_ps(y0, y1);
1001
+    y3 = _mm_unpackhi_ps(y0, y1);
1002
+    y4 = _mm_shuffle_ps(y2, y2, 0x1b);
1003
+    y5 = _mm_shuffle_ps(y3, y3, 0x1b);
1004
+
1005
+    ymulwi2 = _mm_mul_ps(y4, wi2);
1006
+    ymulwi3 = _mm_mul_ps(y5, wi3);
1007
+    x4 = _mm_mul_ps(x2, wr1);
1008
+    x5 = _mm_mul_ps(x3, wi1);
1009
+    y8 = _mm_madd_ps(y2, wr2, ymulwi2);
1010
+    y9 = _mm_msub_ps(y2, wr2, ymulwi2);
1011
+    x6 = _mm_add_ps(x4, x5);
1012
+    x7 = _mm_sub_ps(x4, x5);
1013
+    y13 = _mm_madd_ps(y3, wr3, ymulwi3);
1014
+    y14 = _mm_msub_ps(y3, wr3, ymulwi3);
1015
+
1016
+    x8 = _mm_shuffle_ps(x6, x7, 0xe4);
1017
+    y10 = _mm_shuffle_ps(y8, y9, 0xe4);
1018
+    y15 = _mm_shuffle_ps(y13, y14, 0xe4);
1019
+
1020
+    x9 = _mm_shuffle_ps(x0, x8, 0x84);
1021
+    x10 = _mm_shuffle_ps(x1, x8, 0x71);
1022
+
1023
+    y16 = _mm_shuffle_ps(y10, y15, 0x88);
1024
+    y17 = _mm_shuffle_ps(y10, y15, 0x77);
1025
+
1026
+    x11 = _mm_add_ps(vz0, x9);
1027
+    x12 = _mm_sub_ps(vz0, x9);
1028
+    x13 = _mm_add_ps(vzo1, x10);
1029
+    x14 = _mm_sub_ps(vzo1, x10);
1030
+
1031
+    y18 = _mm_add_ps(vz0plus1, y16);
1032
+    y19 = _mm_sub_ps(vz0plus1, y16);
1033
+    y20 = _mm_add_ps(vzo1plus1, y17);
1034
+    y21 = _mm_sub_ps(vzo1plus1, y17);
1035
+
1036
+    x15 = _mm_blend_ps(x13, x14, 0xa);
1037
+    x16 = _mm_blend_ps(x14, x13, 0xa);
1038
+    y22 = _mm_blend_ps(y20, y21, 0xa);
1039
+    y23 = _mm_blend_ps(y21, y20, 0xa);
1040
+
1041
+    _mm_storeu_ps(out, x11);
1042
+    _mm_storeu_ps(out + 4, y18);
1043
+    _mm_storeu_ps(out + i1, x15);
1044
+    _mm_storeu_ps(out + i1 + 4, y22);
1045
+    _mm_storeu_ps(out + i2, x12);
1046
+    _mm_storeu_ps(out + i2 + 4, y19);
1047
+    _mm_storeu_ps(out + i3, x16);
1048
+    _mm_storeu_ps(out + i3 + 4, y23);
1049
+
1050
+    do {
1051
+        out += 8;
1052
+        wre += 4;
1053
+        wim -= 4;
1054
+        wr0 = _mm_set1_ps(wre[0]);
1055
+        wr1 = _mm_set1_ps(wre[1]);
1056
+        wi0 = _mm_set1_ps(wim[0]);
1057
+        wi1 = _mm_set1_ps(wim[-1]);
1058
+
1059
+        wr2 = _mm_set1_ps(wre[2]);
1060
+        wr3 = _mm_set1_ps(wre[3]);
1061
+        wi2 = _mm_set1_ps(wim[-2]);
1062
+        wi3 = _mm_set1_ps(wim[-3]);
1063
+
1064
+        vzo2 = _mm_loadu_ps(out + i2);  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
1065
+        vzo2plus1 = _mm_loadu_ps(out + i2 + 4);
1066
+        vzo3 = _mm_loadu_ps(out + i3);  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
1067
+        vzo3plus1 = _mm_loadu_ps(out + i3 + 4);
1068
+        vz0 = _mm_loadu_ps(out + 0);    // z0.r  z0.i  z1.r  z1.i
1069
+        vz0plus1 = _mm_loadu_ps(out + 4);
1070
+        vzo1 = _mm_loadu_ps(out + i1); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
1071
+        vzo1plus1 = _mm_loadu_ps(out + i1 + 4);
1072
+
1073
+        x0 = _mm_add_ps(vzo2, vzo3);
1074
+        x1 = _mm_sub_ps(vzo2, vzo3);
1075
+
1076
+        y0 = _mm_add_ps(vzo2plus1, vzo3plus1);
1077
+        y1 = _mm_sub_ps(vzo2plus1, vzo3plus1);
1078
+
1079
+        x2 = _mm_unpacklo_ps(x0, x1);
1080
+        x3 = _mm_unpackhi_ps(x0, x1);
1081
+        x4 = _mm_shuffle_ps(x2, x2, 0x1b);
1082
+        x5 = _mm_shuffle_ps(x3, x3, 0x1b);
1083
+
1084
+        y2 = _mm_unpacklo_ps(y0, y1);
1085
+        y3 = _mm_unpackhi_ps(y0, y1);
1086
+        y4 = _mm_shuffle_ps(y2, y2, 0x1b);
1087
+        y5 = _mm_shuffle_ps(y3, y3, 0x1b);
1088
+
1089
+        xmulwi0 = _mm_mul_ps(x4, wi0);
1090
+        xmulwi1 = _mm_mul_ps(x5, wi1);
1091
+        x8 = _mm_madd_ps(x2, wr0, xmulwi0);
1092
+        x9 = _mm_msub_ps(x2, wr0, xmulwi0);
1093
+
1094
+        ymulwi2 = _mm_mul_ps(y4, wi2);
1095
+        ymulwi3 = _mm_mul_ps(y5, wi3);
1096
+        x13 = _mm_madd_ps(x3, wr1, xmulwi1);
1097
+        x14 = _mm_msub_ps(x3, wr1, xmulwi1);
1098
+
1099
+        y8 = _mm_madd_ps(y2, wr2, ymulwi2);
1100
+        y9 = _mm_msub_ps(y2, wr2, ymulwi2);
1101
+        y13 = _mm_madd_ps(y3, wr3, ymulwi3);
1102
+        y14 = _mm_msub_ps(y3, wr3, ymulwi3);
1103
+
1104
+        x10 = _mm_shuffle_ps(x8, x9, 0xe4);
1105
+        x15 = _mm_shuffle_ps(x13, x14, 0xe4);
1106
+
1107
+        y10 = _mm_shuffle_ps(y8, y9, 0xe4);
1108
+        y15 = _mm_shuffle_ps(y13, y14, 0xe4);
1109
+
1110
+        x16 = _mm_shuffle_ps(x10, x15, 0x88);
1111
+        x17 = _mm_shuffle_ps(x10, x15, 0x77);
1112
+
1113
+        y16 = _mm_shuffle_ps(y10, y15, 0x88);
1114
+        y17 = _mm_shuffle_ps(y10, y15, 0x77);
1115
+
1116
+        x18 = _mm_add_ps(vz0, x16);
1117
+        x19 = _mm_sub_ps(vz0, x16);
1118
+        x20 = _mm_add_ps(vzo1, x17);
1119
+        x21 = _mm_sub_ps(vzo1, x17);
1120
+
1121
+        y18 = _mm_add_ps(vz0plus1, y16);
1122
+        y19 = _mm_sub_ps(vz0plus1, y16);
1123
+        y20 = _mm_add_ps(vzo1plus1, y17);
1124
+        y21 = _mm_sub_ps(vzo1plus1, y17);
1125
+
1126
+        x22 = _mm_blend_ps(x20, x21, 0xa);
1127
+        x23 = _mm_blend_ps(x21, x20, 0xa);
1128
+        y22 = _mm_blend_ps(y20, y21, 0xa);
1129
+        y23 = _mm_blend_ps(y21, y20, 0xa);
1130
+
1131
+        _mm_storeu_ps(out, x18);
1132
+        _mm_storeu_ps(out + 4, y18);
1133
+        _mm_storeu_ps(out + i1, x22);
1134
+        _mm_storeu_ps(out + i1 + 4, y22);
1135
+        _mm_storeu_ps(out + i2, x19);
1136
+        _mm_storeu_ps(out + i2 + 4, y19);
1137
+        _mm_storeu_ps(out + i3, x23);
1138
+        _mm_storeu_ps(out + i3 + 4, y23);
1139
+    } while (n -= 2);
1140
+}
1141
+
1142
+static av_always_inline
1143
+void fft2_e2k_interleave(FFTComplex *z)
1144
+{
1145
+#if 1
1146
+    vec_f a, b, c;
1147
+    float *out = (float*)z;
1148
+    vec_f sign = _mm_castsi128_ps(_mm_set1_epi32(-1 << 31));
1149
+
1150
+    a = _mm_loadu_ps(out);
1151
+    b = _mm_unpacklo_ps2(a, a);
1152
+    c = _mm_unpackhi_ps2(a, _mm_xor_ps(a, sign));
1153
+    a = _mm_add_ps(b, c);
1154
+    _mm_storeu_ps(out, a);
1155
+#else
1156
+    FFTSample r0, i0, r1, i1;
1157
+    r0 = z[0].re; i0 = z[0].im;
1158
+    r1 = z[1].re; i1 = z[1].im;
1159
+    z[0].re = r0 + r1;
1160
+    z[0].im = i0 + i1;
1161
+    z[1].re = r0 - r1;
1162
+    z[1].im = i0 - i1;
1163
+#endif
1164
+}
1165
+
1166
+static av_always_inline
1167
+void fft4_e2k_interleave(FFTComplex *z)
1168
+{
1169
+    vec_f a, b, c, d;
1170
+    float *out = (float*)z;
1171
+    a = _mm_loadu_ps(out);
1172
+    b = _mm_loadu_ps(out + 4);
1173
+
1174
+    c = _mm_shuffle_ps(a, b, 0x64);
1175
+    d = _mm_shuffle_ps(a, b, 0xce);
1176
+    a = _mm_add_ps(c, d);
1177
+    b = _mm_sub_ps(c, d);
1178
+
1179
+    c = _mm_unpacklo_ps2(a, b);
1180
+    d = _mm_shuffle_ps(a, b, 0xbe);
1181
+
1182
+    a = _mm_add_ps(c, d);
1183
+    b = _mm_sub_ps(c, d);
1184
+    _mm_storeu_ps(out, a);
1185
+    _mm_storeu_ps(out + 4, b);
1186
+}
1187
+
1188
+static av_always_inline
1189
+void fft8_e2k_interleave(FFTComplex *z)
1190
+{
1191
+    vec_f vz0, vz1, vz2, vz3;
1192
+    vec_f x0, x1, x2, x3;
1193
+    vec_f x4, x5, x6, x7;
1194
+    vec_f x8, x9, x10, x11;
1195
+    vec_f x12, x13, x14, x15;
1196
+    vec_f x16, x17, x18, x19;
1197
+    vec_f x20, x21, x22, x23;
1198
+    vec_f x24, x25, x26, x27;
1199
+    vec_f x28, x29, x30, x31;
1200
+    vec_f x32, x33, x34;
1201
+
1202
+    float *out = (float*)z;
1203
+    vec_f vc1 = _mm_set1_ps(sqrthalf);
1204
+
1205
+    vz0 = _mm_loadu_ps(out);
1206
+    vz1 = _mm_loadu_ps(out + 4);
1207
+    vz2 = _mm_loadu_ps(out + 8);
1208
+    vz3 = _mm_loadu_ps(out + 12);
1209
+
1210
+    x0 = _mm_shuffle_ps(vz0, vz1, 0x64);
1211
+    x1 = _mm_shuffle_ps(vz0, vz1, 0xce);
1212
+    x2 = _mm_shuffle_ps(vz2, vz3, 0x46);
1213
+    x3 = _mm_shuffle_ps(vz2, vz3, 0xec);
1214
+
1215
+    x4 = _mm_add_ps(x0, x1);
1216
+    x5 = _mm_sub_ps(x0, x1);
1217
+    x6 = _mm_add_ps(x2, x3);
1218
+    x7 = _mm_sub_ps(x2, x3);
1219
+
1220
+    x8 = _mm_unpacklo_ps2(x4, x5);
1221
+    x9 = _mm_shuffle_ps(x4, x5, 0xbe);
1222
+    x10 = _mm_shuffle_ps(x6, x7, 0x66);
1223
+    x11 = _mm_shuffle_ps(x6, x7, 0xcc);
1224
+
1225
+    x12 = _mm_add_ps(x8, x9);
1226
+    x13 = _mm_sub_ps(x8, x9);
1227
+    x14 = _mm_add_ps(x10, x11);
1228
+    x15 = _mm_sub_ps(x10, x11);
1229
+    x16 = _mm_unpacklo_ps(x12, x13);
1230
+    x17 = _mm_unpacklo_ps(x14, x15);
1231
+    x18 = _mm_shuffle_ps(x17, x17, 0x6c);
1232
+    x19 = _mm_add_ps(x16, x18); // z0.r  z2.r  z0.i  z2.i
1233
+    x20 = _mm_sub_ps(x16, x18); // z4.r  z6.r  z4.i  z6.i
1234
+
1235
+    x21 = _mm_unpackhi_ps(x12, x13);
1236
+    x22 = _mm_unpackhi_ps(x14, x15);
1237
+    x23 = _mm_unpackhi_ps2(x22, x22);
1238
+    x24 = _mm_add_ps(x22, x23);
1239
+    x25 = _mm_sub_ps(x22, x23);
1240
+
1241
+    x26 = _mm_unpacklo_ps2(x24, x25);
1242
+    x26 = _mm_mul_ps(_mm_shuffle_ps(x26, x26, 0x8d), vc1); // 1,s1,0,s0
1243
+
1244
+    x27 = _mm_add_ps(x21, x26); // z1.r  z7.r  z1.i  z3.i
1245
+    x28 = _mm_sub_ps(x21, x26); // z5.r  z3.r  z5.i  z7.i
1246
+
1247
+    x29 = _mm_shuffle_ps(x19, x27, 0x88); // z0.r  z0.i  z1.r  z1.i
1248
+    x30 = _mm_shuffle_ps(x19, x27, 0xdd); // z2.r  z2.i  z7.r  z3.i
1249
+    x31 = _mm_shuffle_ps(x20, x28, 0x88); // z4.r  z4.i  z5.r  z5.i
1250
+    x32 = _mm_shuffle_ps(x20, x28, 0xdd); // z6.r  z6.i  z3.r  z7.i
1251
+    x33 = _mm_blend_ps(x30, x32, 0x4); // z2.r  z2.i  z3.r  z3.i
1252
+    x34 = _mm_blend_ps(x32, x30, 0x4); // z6.r  z6.i  z7.r  z7.i
1253
+
1254
+    _mm_storeu_ps(out, x29);
1255
+    _mm_storeu_ps(out + 4, x33);
1256
+    _mm_storeu_ps(out + 8, x31);
1257
+    _mm_storeu_ps(out + 12, x34);
1258
+}
1259
+
1260
+static av_always_inline
1261
+void fft16_e2k_interleave(FFTComplex *z)
1262
+{
1263
+    float *out = (float*)z;
1264
+    vec_f vc0 = _mm_set1_ps(sqrthalf);
1265
+    vec_f vc1 = _mm_set1_ps(ff_cos_16[1]);
1266
+    vec_f vc2 = _mm_set1_ps(ff_cos_16[3]);
1267
+    vec_f vz0, vz1, vz2, vz3;
1268
+    vec_f vz4, vz5, vz6, vz7;
1269
+    vec_f x0, x1, x2, x3;
1270
+    vec_f x4, x5, x6, x7;
1271
+    vec_f x8, x9, x10, x11;
1272
+    vec_f x12, x13, x14, x15;
1273
+    vec_f x16, x17, x18, x19;
1274
+    vec_f x20, x21, x22, x23;
1275
+    vec_f x24, x25, x26, x27;
1276
+    vec_f x28, x29, x30, x31;
1277
+    vec_f x32, x33, x34, x35;
1278
+    vec_f x36, x37, x38, x39;
1279
+    vec_f x40, x41, x42, x43;
1280
+    vec_f x44, x45, x46, x47;
1281
+    vec_f x48, x49, x50, x51;
1282
+    vec_f x52, x53, x54, x55;
1283
+    vec_f x56, x57, x58, x59;
1284
+    vec_f x60, x61, x62, x63;
1285
+    vec_f x64, x65, x66, x67;
1286
+    vec_f x68, x69, x70, x71;
1287
+    vec_f x72, x73, x74, x75;
1288
+    vec_f x76, x77, x78, x79;
1289
+    vec_f x80, x81, x82, x83;
1290
+    vec_f x84, x85, x86;
1291
+
1292
+    vz0 = _mm_loadu_ps(out);
1293
+    vz1 = _mm_loadu_ps(out + 4);
1294
+    vz2 = _mm_loadu_ps(out + 8);
1295
+    vz3 = _mm_loadu_ps(out + 12);
1296
+    vz4 = _mm_loadu_ps(out + 16);
1297
+    vz5 = _mm_loadu_ps(out + 20);
1298
+    vz6 = _mm_loadu_ps(out + 24);
1299
+    vz7 = _mm_loadu_ps(out + 28);
1300
+
1301
+    x0 = _mm_shuffle_ps(vz0, vz1, 0x64);
1302
+    x1 = _mm_shuffle_ps(vz0, vz1, 0xce);
1303
+    x2 = _mm_unpacklo_ps2(vz2, vz3);
1304
+    x3 = _mm_unpackhi_ps2(vz2, vz3);
1305
+
1306
+    x4 = _mm_shuffle_ps(vz4, vz5, 0x64);
1307
+    x5 = _mm_shuffle_ps(vz4, vz5, 0xce);
1308
+    x6 = _mm_shuffle_ps(vz6, vz7, 0x64);
1309
+    x7 = _mm_shuffle_ps(vz6, vz7, 0xce);
1310
+
1311
+    x8 = _mm_add_ps(x0, x1);
1312
+    x9 = _mm_sub_ps(x0, x1);
1313
+    x10 = _mm_add_ps(x2, x3);
1314
+    x11 = _mm_sub_ps(x2, x3);
1315
+
1316
+    x12 = _mm_add_ps(x4, x5);
1317
+    x13 = _mm_sub_ps(x4, x5);
1318
+    x14 = _mm_add_ps(x6, x7);
1319
+    x15 = _mm_sub_ps(x6, x7);
1320
+
1321
+    x16 = _mm_unpacklo_ps2(x8, x9);
1322
+    x17 = _mm_shuffle_ps(x8, x9, 0xbe);
1323
+    x18 = _mm_shuffle_ps(x10, x11, 0x96);
1324
+    x19 = _mm_shuffle_ps(x10, x11, 0xcc);
1325
+    x20 = _mm_unpacklo_ps2(x12, x14);
1326
+    x21 = _mm_unpackhi_ps2(x12, x14);
1327
+    x22 = _mm_unpacklo_ps2(x13, x15);
1328
+    x23 = _mm_shuffle_ps(x13, x15, 0xbb);
1329
+
1330
+    x24 = _mm_add_ps(x16, x17);
1331
+    x25 = _mm_sub_ps(x16, x17);
1332
+    x26 = _mm_add_ps(x18, x19);
1333
+    x27 = _mm_sub_ps(x18, x19);
1334
+    x28 = _mm_add_ps(x20, x21);
1335
+    x29 = _mm_sub_ps(x20, x21);
1336
+    x30 = _mm_add_ps(x22, x23);
1337
+    x31 = _mm_sub_ps(x22, x23);
1338
+
1339
+    x32 = _mm_add_ps(x24, x26);
1340
+    x33 = _mm_sub_ps(x24, x26);
1341
+    x34 = _mm_unpacklo_ps2(x32, x33);
1342
+
1343
+    x35 = _mm_shuffle_ps(x28, x29, 0x96);
1344
+    x36 = _mm_shuffle_ps(x28, x29, 0xcc);
1345
+    x37 = _mm_add_ps(x35, x36);
1346
+    x38 = _mm_sub_ps(x35, x36);
1347
+    x39 = _mm_shuffle_ps(x37, x38, 0x14);
1348
+
1349
+    x40 = _mm_shuffle_ps(x27, x38, 0xeb);
1350
+    x41 = _mm_shuffle_ps(x26, x37, 0xbe);
1351
+    x42 = _mm_add_ps(x40, x41);
1352
+    x43 = _mm_sub_ps(x40, x41);
1353
+    x44 = _mm_mul_ps(x42, vc0);
1354
+    x45 = _mm_mul_ps(x43, vc0);
1355
+
1356
+    x46 = _mm_add_ps(x34, x39);  // z0.r  z0.i  z4.r  z4.i
1357
+    x47 = _mm_sub_ps(x34, x39);  // z8.r  z8.i  z12.r  z12.i
1358
+
1359
+    x48 = _mm_shuffle_ps(x30, x31, 0x96);
1360
+    x49 = _mm_shuffle_ps(x30, x31, 0x3c);
1361
+    x50 = _mm_add_ps(x48, x49);
1362
+    x51 = _mm_sub_ps(x48, x49);
1363
+    x52 = _mm_mul_ps(x50, vc1);
1364
+    x53 = _mm_mul_ps(x50, vc2);
1365
+    x54 = _mm_mul_ps(x51, vc1);
1366
+    x55 = _mm_mul_ps(x51, vc2);
1367
+
1368
+    x56 = _mm_unpackhi_ps2(x24, x25);
1369
+    x57 = _mm_shuffle_ps(x44, x45, 0x14);
1370
+    x58 = _mm_add_ps(x56, x57);
1371
+    x59 = _mm_sub_ps(x56, x57);
1372
+
1373
+    x60 = _mm_shuffle_ps(x54, x54, 0xb1);
1374
+    x61 = _mm_shuffle_ps(x55, x55, 0xb1);
1375
+    x62 = _mm_add_ps(x52, x61);
1376
+    x63 = _mm_sub_ps(x52, x61);
1377
+    x64 = _mm_add_ps(x60, x53);
1378
+    x65 = _mm_sub_ps(x60, x53);
1379
+    x66 = _mm_shuffle_ps(x62, x64, 0xb4);
1380
+    x67 = _mm_shuffle_ps(x65, x63, 0xb4);
1381
+
1382
+    x68 = _mm_add_ps(x58, x66); // z1.r    z1.i  z3.r    z3.i
1383
+    x69 = _mm_sub_ps(x58, x66); // z9.r    z9.i  z11.r  z11.i
1384
+    x70 = _mm_add_ps(x59, x67); // z5.r    z5.i  z15.r  z15.i
1385
+    x71 = _mm_sub_ps(x59, x67); // z13.r  z13.i z7.r   z7.i
1386
+
1387
+    x72 = _mm_shuffle_ps(x27, x27, 0xe1);
1388
+    x73 = _mm_add_ps(x25, x72);
1389
+    x74 = _mm_sub_ps(x25, x72);
1390
+    x75 = _mm_unpacklo_ps2(x73, x74);
1391
+    x76 = _mm_shuffle_ps(x44, x45, 0xeb);
1392
+    x77 = _mm_add_ps(x75, x76); // z2.r   z2.i    z6.r    z6.i
1393
+    x78 = _mm_sub_ps(x75, x76); // z10.r  z10.i  z14.r  z14.i
1394
+
1395
+    x79 = _mm_unpacklo_ps2(x46, x68); // z0.r  z0.i  z1.r  z1.i
1396
+    x80 = _mm_shuffle_ps(x77, x68, 0xe4); // z2.r  z2.i  z3.r  z3.i
1397
+    x81 = _mm_shuffle_ps(x46, x70, 0x4e); // z4.r  z4.i  z5.r  z5.i
1398
+    x82 = _mm_unpackhi_ps2(x77, x71); // z6.r  z6.i  z7.r  z7.i
1399
+    _mm_storeu_ps(out, x79);
1400
+    _mm_storeu_ps(out + 4, x80);
1401
+    _mm_storeu_ps(out + 8, x81);
1402
+    _mm_storeu_ps(out + 12, x82);
1403
+    x83 = _mm_unpacklo_ps2(x47, x69); // z8.r  z8.i  z9.r  z9.i
1404
+    x84 = _mm_shuffle_ps(x78, x69, 0xe4); // z10.r  z10.i  z11.r  z11.i
1405
+    x85 = _mm_shuffle_ps(x47, x71, 0x4e); // z12.r  z12.i  z13.r  z13.i
1406
+    x86 = _mm_unpackhi_ps2(x78, x70); // z14.r  z14.i  z15.r  z15.i
1407
+    _mm_storeu_ps(out + 16, x83);
1408
+    _mm_storeu_ps(out + 20, x84);
1409
+    _mm_storeu_ps(out + 24, x85);
1410
+    _mm_storeu_ps(out + 28, x86);
1411
+}
1412
+
1413
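+/*
+ * In-place 4-point FFT on interleaved {re,im} floats: the two loads cover
+ * z[0..1] and z[2..3], the shuffle/add/sub pairs form the radix-2 butterflies,
+ * and the final unpacks return the results to natural order.
+ */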
+static av_always_inline
1414
+void fft4_e2k(FFTComplex *z)
1415
+{
1416
+    vec_f a, b, c, d;
1417
+    float *out = (float*)z;
1418
+    a = _mm_loadu_ps(out);
1419
+    b = _mm_loadu_ps(out + 4);
1420
+
1421
+    c = _mm_shuffle_ps(a, b, 0x64);
1422
+    d = _mm_shuffle_ps(a, b, 0xce);
1423
+    a = _mm_add_ps(c, d);
1424
+    b = _mm_sub_ps(c, d);
1425
+
1426
+    c = _mm_unpacklo_ps(a, b);
1427
+    d = _mm_unpackhi_ps(a, _mm_shuffle_ps(b, b, 0xb1));
1428
+
1429
+    a = _mm_add_ps(c, d);
1430
+    b = _mm_sub_ps(c, d);
1431
+
1432
+    c = _mm_unpacklo_ps2(a, b);
1433
+    d = _mm_unpackhi_ps2(a, b);
1434
+
1435
+    _mm_storeu_ps(out, c);
1436
+    _mm_storeu_ps(out + 4, d);
1437
+}
1438
+
1439
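+/*
+ * In-place 8-point FFT over interleaved {re,im} floats; vc1/vc2 hold the
+ * +-sqrt(1/2) twiddle factors used when the two halves are combined.
+ */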
+static av_always_inline
1440
+void fft8_e2k(FFTComplex *z)
1441
+{
1442
+    vec_f vz0, vz1, vz2, vz3;
1443
+    vec_f vz4, vz5, vz6, vz7, vz8;
1444
+
1445
+    float *out = (float*)z;
1446
+    vec_f vc0 = _mm_setzero_ps();
1447
+    vec_f vc1 = _mm_setr_ps(-sqrthalf, sqrthalf, sqrthalf, -sqrthalf);
1448
+    vec_f vc2 = _mm_set1_ps(sqrthalf);
1449
+
1450
+    vz0 = _mm_loadu_ps(out);
1451
+    vz1 = _mm_loadu_ps(out + 4);
1452
+    vz2 = _mm_loadu_ps(out + 8);
1453
+    vz3 = _mm_loadu_ps(out + 12);
1454
+
1455
+    vz6 = _mm_unpacklo_ps(vz2, vz3);
1456
+    vz7 = _mm_unpackhi_ps(vz2, vz3);
1457
+    vz4 = _mm_shuffle_ps(vz0, vz1, 0x64);
1458
+    vz5 = _mm_shuffle_ps(vz0, vz1, 0xce);
1459
+
1460
+    vz2 = _mm_add_ps(vz6, vz7);
1461
+    vz3 = _mm_sub_ps(vz6, vz7);
1462
+    vz8 = _mm_shuffle_ps(vz3, vz3, 0x4e);
1463
+
1464
+    vz0 = _mm_add_ps(vz4, vz5);
1465
+    vz1 = _mm_sub_ps(vz4, vz5);
1466
+
1467
+    vz3 = _mm_madd_ps(vz3, vc1, vc0);
1468
+    vz3 = _mm_madd_ps(vz8, vc2, vz3);
1469
+
1470
+    vz4 = _mm_unpacklo_ps(vz0, vz1);
1471
+    vz5 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1));
1472
+    vz6 = _mm_shuffle_ps(vz2, vz3, 0x39);
1473
+    vz7 = _mm_shuffle_ps(vz2, vz3, 0x6c);
1474
+
1475
+    vz0 = _mm_add_ps(vz4, vz5);
1476
+    vz1 = _mm_sub_ps(vz4, vz5);
1477
+    vz2 = _mm_add_ps(vz6, vz7);
1478
+    vz3 = _mm_sub_ps(vz6, vz7);
1479
+
1480
+    vz4 = _mm_unpacklo_ps2(vz0, vz1);
1481
+    vz5 = _mm_unpackhi_ps2(vz0, vz1);
1482
+    vz6 = _mm_shuffle_ps(vz2, vz3, 0xd8);
1483
+    vz7 = _mm_shuffle_ps(vz2, vz3, 0x8d);
1484
+
1485
+    vz2 = _mm_sub_ps(vz4, vz6);
1486
+    vz3 = _mm_sub_ps(vz5, vz7);
1487
+
1488
+    vz0 = _mm_add_ps(vz4, vz6);
1489
+    vz1 = _mm_add_ps(vz5, vz7);
1490
+
1491
+    _mm_storeu_ps(out, vz0);
1492
+    _mm_storeu_ps(out + 4, vz1);
1493
+    _mm_storeu_ps(out + 8, vz2);
1494
+    _mm_storeu_ps(out + 12, vz3);
1495
+}
1496
+
1497
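+/*
+ * In-place 16-point FFT: the second half of z gets two 4-point passes, the
+ * first half an 8-point pass, and the halves are then combined with the
+ * cos/sin(k*pi/8) twiddles in vc3..vc5 (0.92387953 = cos(pi/8),
+ * 0.38268343 = sin(pi/8)).
+ */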
+static av_always_inline
1498
+void fft16_e2k(FFTComplex *z)
1499
+{
1500
+    float *out = (float*)z;
1501
+    vec_f vc0 = _mm_setzero_ps();
1502
+    vec_f vc1 = _mm_setr_ps(-sqrthalf, sqrthalf, sqrthalf, -sqrthalf);
1503
+    vec_f vc2 = _mm_set1_ps(sqrthalf);
1504
+    vec_f vc3 = _mm_setr_ps(1.0, 0.92387953, sqrthalf, 0.38268343);
1505
+    vec_f vc4 = _mm_setr_ps(0.0, 0.38268343, sqrthalf, 0.92387953);
1506
+    vec_f vc5 = _mm_setr_ps(-0.0, -0.38268343, -sqrthalf, -0.92387953);
1507
+
1508
+    vec_f vz0, vz1, vz2, vz3;
1509
+    vec_f vz4, vz5, vz6, vz7;
1510
+    vec_f vz8, vz9, vz10, vz11;
1511
+    vec_f vz12, vz13;
1512
+
1513
+    vz0 = _mm_loadu_ps(out + 16);
1514
+    vz1 = _mm_loadu_ps(out + 20);
1515
+    vz2 = _mm_loadu_ps(out + 24);
1516
+    vz3 = _mm_loadu_ps(out + 28);
1517
+
1518
+    vz4 = _mm_shuffle_ps(vz0, vz1, 0x64);
1519
+    vz5 = _mm_shuffle_ps(vz0, vz1, 0xce);
1520
+    vz6 = _mm_shuffle_ps(vz2, vz3, 0x64);
1521
+    vz7 = _mm_shuffle_ps(vz2, vz3, 0xce);
1522
+
1523
+    vz0 = _mm_add_ps(vz4, vz5);
1524
+    vz1 = _mm_sub_ps(vz4, vz5);
1525
+    vz2 = _mm_add_ps(vz6, vz7);
1526
+    vz3 = _mm_sub_ps(vz6, vz7);
1527
+
1528
+    vz4 = _mm_unpacklo_ps(vz0, vz1);
1529
+    vz5 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1));
1530
+    vz6 = _mm_unpacklo_ps(vz2, vz3);
1531
+    vz7 = _mm_unpackhi_ps(vz2, _mm_shuffle_ps(vz3, vz3, 0xb1));
1532
+
1533
+    vz0 = _mm_add_ps(vz4, vz5);
1534
+    vz1 = _mm_sub_ps(vz4, vz5);
1535
+    vz2 = _mm_add_ps(vz6, vz7);
1536
+    vz3 = _mm_sub_ps(vz6, vz7);
1537
+
1538
+    vz4 = _mm_unpacklo_ps2(vz0, vz1);
1539
+    vz5 = _mm_unpackhi_ps2(vz0, vz1);
1540
+
1541
+    vz6 = _mm_unpacklo_ps2(vz2, vz3);
1542
+    vz7 = _mm_unpackhi_ps2(vz2, vz3);
1543
+
1544
+    vz0 = _mm_loadu_ps(out);
1545
+    vz1 = _mm_loadu_ps(out + 4);
1546
+    vz2 = _mm_loadu_ps(out + 8);
1547
+    vz3 = _mm_loadu_ps(out + 12);
1548
+    vz10 = _mm_unpacklo_ps(vz2, vz3);
1549
+    vz11 = _mm_unpackhi_ps(vz2, vz3);
1550
+    vz8 = _mm_shuffle_ps(vz0, vz1, 0x64);
1551
+    vz9 = _mm_shuffle_ps(vz0, vz1, 0xce);
1552
+
1553
+    vz2 = _mm_add_ps(vz10, vz11);
1554
+    vz3 = _mm_sub_ps(vz10, vz11);
1555
+    vz12 = _mm_shuffle_ps(vz3, vz3, 0x4e);
1556
+    vz0 = _mm_add_ps(vz8, vz9);
1557
+    vz1 = _mm_sub_ps(vz8, vz9);
1558
+
1559
+    vz3 = _mm_madd_ps(vz3, vc1, vc0);
1560
+    vz3 = _mm_madd_ps(vz12, vc2, vz3);
1561
+    vz8 = _mm_unpacklo_ps(vz0, vz1);
1562
+    vz9 = _mm_unpackhi_ps(vz0, _mm_shuffle_ps(vz1, vz1, 0xb1));
1563
+    vz10 = _mm_shuffle_ps(vz2, vz3, 0x39);
1564
+    vz11 = _mm_shuffle_ps(vz2, vz3, 0x6c);
1565
+
1566
+    vz0 = _mm_add_ps(vz8, vz9);
1567
+    vz1 = _mm_sub_ps(vz8, vz9);
1568
+    vz2 = _mm_add_ps(vz10, vz11);
1569
+    vz3 = _mm_sub_ps(vz10, vz11);
1570
+
1571
+    vz8 = _mm_unpacklo_ps2(vz0, vz1);
1572
+    vz9 = _mm_unpackhi_ps2(vz0, vz1);
1573
+    vz10 = _mm_shuffle_ps(vz2, vz3, 0xd8);
1574
+    vz11 = _mm_shuffle_ps(vz2, vz3, 0x8d);
1575
+
1576
+    vz2 = _mm_sub_ps(vz8, vz10);
1577
+    vz3 = _mm_sub_ps(vz9, vz11);
1578
+    vz0 = _mm_add_ps(vz8, vz10);
1579
+    vz1 = _mm_add_ps(vz9, vz11);
1580
+
1581
+    vz8 = _mm_madd_ps(vz4, vc3, vc0);
1582
+    vz9 = _mm_madd_ps(vz5, vc3, vc0);
1583
+    vz10 = _mm_madd_ps(vz6, vc3, vc0);
1584
+    vz11 = _mm_madd_ps(vz7, vc3, vc0);
1585
+
1586
+    vz8 = _mm_madd_ps(vz5, vc4, vz8);
1587
+    vz9 = _mm_madd_ps(vz4, vc5, vz9);
1588
+    vz10 = _mm_madd_ps(vz7, vc5, vz10);
1589
+    vz11 = _mm_madd_ps(vz6, vc4, vz11);
1590
+
1591
+    vz12 = _mm_sub_ps(vz10, vz8);
1592
+    vz10 = _mm_add_ps(vz10, vz8);
1593
+
1594
+    vz13 = _mm_sub_ps(vz9, vz11);
1595
+    vz11 = _mm_add_ps(vz9, vz11);
1596
+
1597
+    vz4 = _mm_sub_ps(vz0, vz10);
1598
+    vz0 = _mm_add_ps(vz0, vz10);
1599
+
1600
+    vz7 = _mm_sub_ps(vz3, vz12);
1601
+    vz3 = _mm_add_ps(vz3, vz12);
1602
+
1603
+    vz5 = _mm_sub_ps(vz1, vz11);
1604
+    vz1 = _mm_add_ps(vz1, vz11);
1605
+
1606
+    vz6 = _mm_sub_ps(vz2, vz13);
1607
+    vz2 = _mm_add_ps(vz2, vz13);
1608
+
1609
+    _mm_storeu_ps(out, vz0);
1610
+    _mm_storeu_ps(out + 4, vz1);
1611
+    _mm_storeu_ps(out + 8, vz2);
1612
+    _mm_storeu_ps(out + 12, vz3);
1613
+    _mm_storeu_ps(out + 16, vz4);
1614
+    _mm_storeu_ps(out + 20, vz5);
1615
+    _mm_storeu_ps(out + 24, vz6);
1616
+    _mm_storeu_ps(out + 28, vz7);
1617
+}
1618
+
1619
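+/*
+ * Split-radix combine pass: merges one half-size sub-transform (at z[0]) with
+ * the two quarter-size sub-transforms that follow it, using the twiddle
+ * tables wre (cosines, walked forwards) and wim (sines, walked backwards).
+ * The first group of butterflies is unrolled ahead of the do/while loop;
+ * every iteration after that handles four complex values per quarter, which
+ * is why n is decremented by two at a time.
+ */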
+static av_always_inline
1620
+void pass_e2k(FFTComplex *z, const FFTSample *wre, unsigned int n)
1621
+{
1622
+    int i1 = n * 4, i2 = n * 8, i3 = n * 12;
1623
+    FFTSample *out = (FFTSample*)z;
1624
+    const FFTSample *wim = wre + n * 2;
1625
+    vec_f v0, v1, v2, v3;
1626
+    vec_f v4, v5, v6, v7;
1627
+    vec_f v8, v9, v10, v11;
1628
+    vec_f v12, v13;
1629
+
1630
+    n = n - 2;
1631
+
1632
+    v8 = _mm_loadu_ps(wre);
1633
+#if 0
1634
+    v9 = _mm_loadu_ps(wim - 3);
1635
+#else
1636
+    v10 = _mm_loadu_ps(wim);
1637
+    v9 = _mm_loadu_ps(wim - 4);
1638
+    v9 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(v10), _mm_castps_si128(v9), 4));
1639
+#endif
1640
+    v9 = _mm_shuffle_ps(v9, v9, 0x1b);
1641
+
1642
+    v4 = _mm_loadu_ps(out + i2);
1643
+    v5 = _mm_loadu_ps(out + i2 + 4);
1644
+    v6 = _mm_loadu_ps(out + i3);
1645
+    v7 = _mm_loadu_ps(out + i3 + 4);
1646
+    v10 = _mm_mul_ps(v4, v8); // r2*wre
1647
+    v11 = _mm_mul_ps(v5, v8); // i2*wre
1648
+    v12 = _mm_mul_ps(v6, v8); // r3*wre
1649
+    v13 = _mm_mul_ps(v7, v8); // i3*wre
1650
+
1651
+    v0 = _mm_loadu_ps(out); // r0
1652
+    v3 = _mm_loadu_ps(out + i1 + 4); // i1
1653
+    v10 = _mm_madd_ps(v5, v9, v10); // r2*wim
1654
+    v11 = _mm_nmsub_ps(v4, v9, v11); // i2*wim
1655
+    v12 = _mm_nmsub_ps(v7, v9, v12); // r3*wim
1656
+    v13 = _mm_madd_ps(v6, v9, v13); // i3*wim
1657
+
1658
+    v1 = _mm_loadu_ps(out + 4); // i0
1659
+    v2 = _mm_loadu_ps(out + i1); // r1
1660
+    v8 = _mm_sub_ps(v12, v10);
1661
+    v12 = _mm_add_ps(v12, v10);
1662
+    v9 = _mm_sub_ps(v11, v13);
1663
+    v13 = _mm_add_ps(v11, v13);
1664
+    v4 = _mm_sub_ps(v0, v12);
1665
+    v0 = _mm_add_ps(v0, v12);
1666
+    v7 = _mm_sub_ps(v3, v8);
1667
+    v3 = _mm_add_ps(v3, v8);
1668
+
1669
+    _mm_storeu_ps(out, v0); // r0
1670
+    _mm_storeu_ps(out + i1 + 4, v3); // i1
1671
+    _mm_storeu_ps(out + i2, v4); // r2
1672
+    _mm_storeu_ps(out + i3 + 4, v7);// i3
1673
+
1674
+    v5 = _mm_sub_ps(v1, v13);
1675
+    v1 = _mm_add_ps(v1, v13);
1676
+    v6 = _mm_sub_ps(v2, v9);
1677
+    v2 = _mm_add_ps(v2, v9);
1678
+
1679
+    _mm_storeu_ps(out + 4, v1); // i0
1680
+    _mm_storeu_ps(out + i1, v2); // r1
1681
+    _mm_storeu_ps(out + i2 + 4, v5); // i2
1682
+    _mm_storeu_ps(out + i3, v6); // r3
1683
+
1684
+    do {
1685
+        out += 8;
1686
+        wre += 4;
1687
+        wim -= 4;
1688
+
1689
+        v8 = _mm_loadu_ps(wre);
1690
+#if 0
1691
+        v9 = _mm_loadu_ps(wim - 3);
1692
+#else
1693
+        v10 = _mm_loadu_ps(wim);
1694
+        v9 = _mm_loadu_ps(wim - 4);
1695
+        v9 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(v10), _mm_castps_si128(v9), 4));
1696
+#endif
1697
+        v9 = _mm_shuffle_ps(v9, v9, 0x1b);
1698
+
1699
+        v4 = _mm_loadu_ps(out + i2); // r2
1700
+        v5 = _mm_loadu_ps(out + i2 + 4); // i2
1701
+        v6 = _mm_loadu_ps(out + i3); // r3
1702
+        v7 = _mm_loadu_ps(out + i3 + 4);// i3
1703
+        v10 = _mm_mul_ps(v4, v8); // r2*wre
1704
+        v11 = _mm_mul_ps(v5, v8); // i2*wre
1705
+        v12 = _mm_mul_ps(v6, v8); // r3*wre
1706
+        v13 = _mm_mul_ps(v7, v8); // i3*wre
1707
+
1708
+        v0 = _mm_loadu_ps(out); // r0
1709
+        v3 = _mm_loadu_ps(out + i1 + 4); // i1
1710
+        v10 = _mm_madd_ps(v5, v9, v10); // r2*wim
1711
+        v11 = _mm_nmsub_ps(v4, v9, v11); // i2*wim
1712
+        v12 = _mm_nmsub_ps(v7, v9, v12); // r3*wim
1713
+        v13 = _mm_madd_ps(v6, v9, v13); // i3*wim
1714
+
1715
+        v1 = _mm_loadu_ps(out + 4); // i0
1716
+        v2 = _mm_loadu_ps(out + i1); // r1
1717
+        v8 = _mm_sub_ps(v12, v10);
1718
+        v12 = _mm_add_ps(v12, v10);
1719
+        v9 = _mm_sub_ps(v11, v13);
1720
+        v13 = _mm_add_ps(v11, v13);
1721
+        v4 = _mm_sub_ps(v0, v12);
1722
+        v0 = _mm_add_ps(v0, v12);
1723
+        v7 = _mm_sub_ps(v3, v8);
1724
+        v3 = _mm_add_ps(v3, v8);
1725
+
1726
+        _mm_storeu_ps(out, v0); // r0
1727
+        _mm_storeu_ps(out + i1 + 4, v3); // i1
1728
+        _mm_storeu_ps(out + i2, v4); // r2
1729
+        _mm_storeu_ps(out + i3 + 4, v7); // i3
1730
+
1731
+        v5 = _mm_sub_ps(v1, v13);
1732
+        v1 = _mm_add_ps(v1, v13);
1733
+        v6 = _mm_sub_ps(v2, v9);
1734
+        v2 = _mm_add_ps(v2, v9);
1735
+
1736
+        _mm_storeu_ps(out + 4, v1); // i0
1737
+        _mm_storeu_ps(out + i1, v2); // r1
1738
+        _mm_storeu_ps(out + i2 + 4, v5); // i2
1739
+        _mm_storeu_ps(out + i3, v6); // r3
1740
+    } while (n -= 2);
1741
+}
1742
+
1743
+static void fft32_e2k_interleave(FFTComplex *z)
1744
+{
1745
+    fft16_e2k_interleave(z);
1746
+    fft8_e2k_interleave(z+16);
1747
+    fft8_e2k_interleave(z+24);
1748
+    pass_e2k_interleave(z,ff_cos_32,4);
1749
+}
1750
+
1751
+static void fft64_e2k_interleave(FFTComplex *z)
1752
+{
1753
+    fft32_e2k_interleave(z);
1754
+    fft16_e2k_interleave(z+32);
1755
+    fft16_e2k_interleave(z+48);
1756
+    pass_e2k_interleave(z,ff_cos_64, 8);
1757
+}
1758
+
1759
+static void fft128_e2k_interleave(FFTComplex *z)
1760
+{
1761
+    fft64_e2k_interleave(z);
1762
+    fft32_e2k_interleave(z+64);
1763
+    fft32_e2k_interleave(z+96);
1764
+    pass_e2k_interleave(z,ff_cos_128,16);
1765
+}
1766
+
1767
+static void fft256_e2k_interleave(FFTComplex *z)
1768
+{
1769
+    fft128_e2k_interleave(z);
1770
+    fft64_e2k_interleave(z+128);
1771
+    fft64_e2k_interleave(z+192);
1772
+    pass_e2k_interleave(z,ff_cos_256,32);
1773
+}
1774
+
1775
+static void fft512_e2k_interleave(FFTComplex *z)
1776
+{
1777
+    fft256_e2k_interleave(z);
1778
+    fft128_e2k_interleave(z+256);
1779
+    fft128_e2k_interleave(z+384);
1780
+    pass_e2k_interleave(z,ff_cos_512,64);
1781
+}
1782
+
1783
+static void fft1024_e2k_interleave(FFTComplex *z)
1784
+{
1785
+    fft512_e2k_interleave(z);
1786
+    fft256_e2k_interleave(z+512);
1787
+    fft256_e2k_interleave(z+768);
1788
+    pass_e2k_interleave(z,ff_cos_1024,128);
1789
+}
1790
+
1791
+static void fft2048_e2k_interleave(FFTComplex *z)
1792
+{
1793
+    fft1024_e2k_interleave(z);
1794
+    fft512_e2k_interleave(z+1024);
1795
+    fft512_e2k_interleave(z+1536);
1796
+    pass_e2k_interleave(z,ff_cos_2048,256);
1797
+}
1798
+
1799
+static void fft4096_e2k_interleave(FFTComplex *z)
1800
+{
1801
+    fft2048_e2k_interleave(z);
1802
+    fft1024_e2k_interleave(z+2048);
1803
+    fft1024_e2k_interleave(z+3072);
1804
+    pass_e2k_interleave(z,ff_cos_4096, 512);
1805
+}
1806
+
1807
+static void fft8192_e2k_interleave(FFTComplex *z)
1808
+{
1809
+    fft4096_e2k_interleave(z);
1810
+    fft2048_e2k_interleave(z+4096);
1811
+    fft2048_e2k_interleave(z+6144);
1812
+    pass_e2k_interleave(z,ff_cos_8192,1024);
1813
+}
1814
+
1815
+static void fft16384_e2k_interleave(FFTComplex *z)
1816
+{
1817
+    fft8192_e2k_interleave(z);
1818
+    fft4096_e2k_interleave(z+8192);
1819
+    fft4096_e2k_interleave(z+12288);
1820
+    pass_e2k_interleave(z,ff_cos_16384,2048);
1821
+}
1822
+
1823
+static void fft32768_e2k_interleave(FFTComplex *z)
1824
+{
1825
+    fft16384_e2k_interleave(z);
1826
+    fft8192_e2k_interleave(z+16384);
1827
+    fft8192_e2k_interleave(z+24576);
1828
+    pass_e2k_interleave(z,ff_cos_32768,4096);
1829
+}
1830
+
1831
+static void fft65536_e2k_interleave(FFTComplex *z)
1832
+{
1833
+    fft32768_e2k_interleave(z);
1834
+    fft16384_e2k_interleave(z+32768);
1835
+    fft16384_e2k_interleave(z+49152);
1836
+    pass_e2k_interleave(z,ff_cos_65536,8192);
1837
+}
1838
+
1839
+static void fft32_e2k(FFTComplex *z)
1840
+{
1841
+    fft16_e2k(z);
1842
+    fft8_e2k(z+16);
1843
+    fft8_e2k(z+24);
1844
+    pass_e2k(z,ff_cos_32,4);
1845
+}
1846
+
1847
+static void fft64_e2k(FFTComplex *z)
1848
+{
1849
+    fft32_e2k(z);
1850
+    fft16_e2k(z+32);
1851
+    fft16_e2k(z+48);
1852
+    pass_e2k(z,ff_cos_64, 8);
1853
+}
1854
+
1855
+static void fft128_e2k(FFTComplex *z)
1856
+{
1857
+    fft64_e2k(z);
1858
+    fft32_e2k(z+64);
1859
+    fft32_e2k(z+96);
1860
+    pass_e2k(z,ff_cos_128,16);
1861
+}
1862
+
1863
+static void fft256_e2k(FFTComplex *z)
1864
+{
1865
+    fft128_e2k(z);
1866
+    fft64_e2k(z+128);
1867
+    fft64_e2k(z+192);
1868
+    pass_e2k(z,ff_cos_256,32);
1869
+}
1870
+
1871
+static void fft512_e2k(FFTComplex *z)
1872
+{
1873
+    fft256_e2k(z);
1874
+    fft128_e2k(z+256);
1875
+    fft128_e2k(z+384);
1876
+    pass_e2k(z,ff_cos_512,64);
1877
+}
1878
+
1879
+static void fft1024_e2k(FFTComplex *z)
1880
+{
1881
+    fft512_e2k(z);
1882
+    fft256_e2k(z+512);
1883
+    fft256_e2k(z+768);
1884
+    pass_e2k(z,ff_cos_1024,128);
1885
+}
1887
+
1888
+static void fft2048_e2k(FFTComplex *z)
1889
+{
1890
+    fft1024_e2k(z);
1891
+    fft512_e2k(z+1024);
1892
+    fft512_e2k(z+1536);
1893
+    pass_e2k(z,ff_cos_2048,256);
1894
+}
1895
+
1896
+static void fft4096_e2k(FFTComplex *z)
1897
+{
1898
+    fft2048_e2k(z);
1899
+    fft1024_e2k(z+2048);
1900
+    fft1024_e2k(z+3072);
1901
+    pass_e2k(z,ff_cos_4096, 512);
1902
+}
1903
+
1904
+static void fft8192_e2k(FFTComplex *z)
1905
+{
1906
+    fft4096_e2k(z);
1907
+    fft2048_e2k(z+4096);
1908
+    fft2048_e2k(z+6144);
1909
+    pass_e2k(z,ff_cos_8192,1024);
1910
+}
1911
+
1912
+static void fft16384_e2k(FFTComplex *z)
1913
+{
1914
+    fft8192_e2k(z);
1915
+    fft4096_e2k(z+8192);
1916
+    fft4096_e2k(z+12288);
1917
+    pass_e2k(z,ff_cos_16384,2048);
1918
+}
1919
+
1920
+static void fft32768_e2k(FFTComplex *z)
1921
+{
1922
+    fft16384_e2k(z);
1923
+    fft8192_e2k(z+16384);
1924
+    fft8192_e2k(z+24576);
1925
+    pass_e2k(z,ff_cos_32768,4096);
1926
+}
1927
+
1928
+static void fft65536_e2k(FFTComplex *z)
1929
+{
1930
+    fft32768_e2k(z);
1931
+    fft16384_e2k(z+32768);
1932
+    fft16384_e2k(z+49152);
1933
+    pass_e2k(z,ff_cos_65536,8192);
1934
+}
1935
+
1936
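+/* Dispatch tables indexed by nbits - 2: entry 0 is the 4-point transform
+   (nbits == 2), entry 14 the 65536-point transform (nbits == 16). */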
+static void (* const fft_dispatch_e2k[])(FFTComplex*) = {
1937
+    fft4_e2k, fft8_e2k, fft16_e2k, fft32_e2k, fft64_e2k, fft128_e2k, fft256_e2k, fft512_e2k, fft1024_e2k,
1938
+    fft2048_e2k, fft4096_e2k, fft8192_e2k, fft16384_e2k, fft32768_e2k, fft65536_e2k,
1939
+};
1940
+
1941
+static void (* const fft_dispatch_e2k_interleave[])(FFTComplex*) = {
1942
+    fft4_e2k_interleave, fft8_e2k_interleave, fft16_e2k_interleave, fft32_e2k_interleave, fft64_e2k_interleave,
1943
+    fft128_e2k_interleave, fft256_e2k_interleave, fft512_e2k_interleave, fft1024_e2k_interleave,
1944
+    fft2048_e2k_interleave, fft4096_e2k_interleave, fft8192_e2k_interleave, fft16384_e2k_interleave, fft32768_e2k_interleave, fft65536_e2k_interleave,
1945
+};
1946
+
1947
+void ff_fft_calc_interleave_e2k(FFTContext *s, FFTComplex *z)
1948
+{
1949
+    fft_dispatch_e2k_interleave[s->nbits-2](z);
1950
+}
1951
+
1952
+void ff_fft_calc_e2k(FFTContext *s, FFTComplex *z)
1953
+{
1954
+    fft_dispatch_e2k[s->nbits-2](z);
1955
+}
1956
+
1957
diff --git a/libavcodec/e2k/fft.h b/libavcodec/e2k/fft.h
1958
new file mode 100644
1959
index 0000000..62ae2f3
1960
--- /dev/null
1961
+++ b/libavcodec/e2k/fft.h
1962
@@ -0,0 +1,29 @@
1963
+/*
1964
+ * This file is part of FFmpeg.
1965
+ *
1966
+ * FFmpeg is free software; you can redistribute it and/or
1967
+ * modify it under the terms of the GNU Lesser General Public
1968
+ * License as published by the Free Software Foundation; either
1969
+ * version 2.1 of the License, or (at your option) any later version.
1970
+ *
1971
+ * FFmpeg is distributed in the hope that it will be useful,
1972
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1973
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1974
+ * Lesser General Public License for more details.
1975
+ *
1976
+ * You should have received a copy of the GNU Lesser General Public
1977
+ * License along with FFmpeg; if not, write to the Free Software
1978
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1979
+ */
1980
+
1981
+#ifndef AVCODEC_E2K_FFT_H
1982
+#define AVCODEC_E2K_FFT_H
1983
+
1984
+#include "config.h"
1985
+#include "libavcodec/fft.h"
1986
+#include "libavcodec/fft-internal.h"
1987
+
1988
+void ff_fft_calc_interleave_e2k(FFTContext *s, FFTComplex *z);
1989
+void ff_fft_calc_e2k(FFTContext *s, FFTComplex *z);
1990
+
1991
+#endif /* AVCODEC_E2K_FFT_H */
1992
diff --git a/libavcodec/e2k/fft_init.c b/libavcodec/e2k/fft_init.c
1993
new file mode 100644
1994
index 0000000..116236d
1995
--- /dev/null
1996
+++ b/libavcodec/e2k/fft_init.c
1997
@@ -0,0 +1,152 @@
1998
+/*
1999
+ * FFT/IFFT transforms
2000
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
2001
+ * Copyright (c) 2009 Loren Merritt
2002
+ *
2003
+ * This file is part of FFmpeg.
2004
+ *
2005
+ * FFmpeg is free software; you can redistribute it and/or
2006
+ * modify it under the terms of the GNU Lesser General Public
2007
+ * License as published by the Free Software Foundation; either
2008
+ * version 2.1 of the License, or (at your option) any later version.
2009
+ *
2010
+ * FFmpeg is distributed in the hope that it will be useful,
2011
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2012
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2013
+ * Lesser General Public License for more details.
2014
+ *
2015
+ * You should have received a copy of the GNU Lesser General Public
2016
+ * License along with FFmpeg; if not, write to the Free Software
2017
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2018
+ */
2019
+
2020
+#include "config.h"
2021
+#include "libavutil/cpu.h"
2022
+#include "libavutil/e2k/cpu.h"
2023
+#include "libavutil/e2k/util_e2k.h"
2024
+#include "libavcodec/fft.h"
2025
+
2026
+#include "fft.h"
2027
+
2028
+/**
2029
+ * Do a complex FFT with the parameters defined in ff_fft_init().
2030
+ * The input data must be permuted before with s->revtab table.
2031
+ * No 1.0 / sqrt(n) normalization is done.
2032
+ *
2033
+ * This code assumes that the 'z' pointer is 16-byte aligned.
2034
+ * It also assumes all FFTComplex are 8-byte-aligned pairs of floats.
2035
+ */
2036
+
2037
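+/*
+ * IMDCT half: pre-rotation of the input by tcos/tsin with the output order
+ * taken from s->revtab, an in-place FFT, then post-rotation and reordering.
+ */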
+static void imdct_half_e2k(FFTContext *s, FFTSample *output, const FFTSample *input)
2038
+{
2039
+    int j, k;
2040
+    int n = 1 << s->mdct_bits;
2041
+    int n4 = n >> 2;
2042
+    int n8 = n >> 3;
2043
+    int n32 = n >> 5;
2044
+    const uint16_t *revtabj = s->revtab;
2045
+    const uint16_t *revtabk = s->revtab + n4;
2046
+    const vec_f *tcos = (const vec_f*)(s->tcos + n8);
2047
+    const vec_f *tsin = (const vec_f*)(s->tsin + n8);
2048
+    const vec_f *pin = (const vec_f*)(input + n4);
2049
+    vec_f *pout = (vec_f*)(output + n4);
2050
+
2051
+    /* pre rotation */
2052
+    k = n32 - 1;
2053
+    do {
2054
+        vec_f cos, sin, cos0, sin0, cos1, sin1;
2055
+        vec_f re, im, r0, i0, r1, i1, a, b;
2056
+#define CMULA(p, perm) \
2057
+    a = pin[ k*2+p];                 /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */ \
2058
+    b = pin[-k*2-p-1];               /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */ \
2059
+    re = _mm_shuffle_ps(a, b, 0x88); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */ \
2060
+    im = _mm_shuffle_ps(b, a, 0x77); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */ \
2061
+    cos = _mm_shuffle_ps(cos0, cos1, perm);  /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */ \
2062
+    sin = _mm_shuffle_ps(sin0, sin1, perm); \
2063
+    r##p = _mm_sub_ps(_mm_mul_ps(im, cos), _mm_mul_ps(re, sin)); \
2064
+    i##p = _mm_add_ps(_mm_mul_ps(re, cos), _mm_mul_ps(im, sin));
2065
+#define STORE2(L, v, dst) \
2066
+    VEC_ST##L(output + dst * 2, _mm_castps_si128(v));
2067
+#define STORE8(p) \
2068
+    a = _mm_unpacklo_ps(r##p, i##p); \
2069
+    b = _mm_unpackhi_ps(r##p, i##p); \
2070
+    STORE2(L, a, revtabk[ p*2-4]); \
2071
+    STORE2(H, a, revtabk[ p*2-3]); \
2072
+    STORE2(L, b, revtabj[-p*2+2]); \
2073
+    STORE2(H, b, revtabj[-p*2+3]);
2074
+
2075
+        cos0 = tcos[k];
2076
+        sin0 = tsin[k];
2077
+        cos1 = tcos[-k-1];
2078
+        sin1 = tsin[-k-1];
2079
+        CMULA(0, 0xe4);
2080
+        CMULA(1, 0x4e);
2081
+        STORE8(0);
2082
+        STORE8(1);
2083
+        revtabj += 4;
2084
+        revtabk -= 4;
2085
+        k--;
2086
+    } while (k >= 0);
2087
+
2088
+    ff_fft_calc_e2k(s, (FFTComplex*)output);
2089
+
2090
+    /* post rotation + reordering */
2091
+    j = -n32;
2092
+    k = n32 - 1;
2093
+    do {
2094
+        vec_f cos, sin, re, im, a, b, c, d;
2095
+#define CMULB(d0, d1, o)                  \
2096
+    re = pout[o*2]; im = pout[o*2+1]; \
2097
+    cos = tcos[o];  sin = tsin[o];    \
2098
+    d0 = _mm_sub_ps(_mm_mul_ps(im, sin), _mm_mul_ps(re, cos)); \
2099
+    d1 = _mm_add_ps(_mm_mul_ps(re, sin), _mm_mul_ps(im, cos));
2100
+
2101
+        CMULB(a, b, j);
2102
+        CMULB(c, d, k);
2103
+        d = _mm_shuffle_ps(d, d, 0x1b);
2104
+        b = _mm_shuffle_ps(b, b, 0x1b);
2105
+        pout[2*j]   = _mm_unpacklo_ps(a, d);
2106
+        pout[2*j+1] = _mm_unpackhi_ps(a, d);
2107
+        pout[2*k]   = _mm_unpacklo_ps(c, b);
2108
+        pout[2*k+1] = _mm_unpackhi_ps(c, b);
2109
+        j++;
2110
+        k--;
2111
+    } while (k >= 0);
2112
+}
2113
+
2114
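+/*
+ * Full IMDCT built from the half transform: imdct_half writes the middle n/2
+ * samples, and the loop below mirrors them into the first and last quarters,
+ * flipping the sign of the first quarter ('sign' is the float sign-bit mask).
+ */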
+static void imdct_calc_e2k(FFTContext *s, FFTSample *output, const FFTSample *input)
2115
+{
2116
+    int k;
2117
+    int n = 1 << s->mdct_bits;
2118
+    int n4 = n >> 2;
2119
+    int n16 = n >> 4;
2120
+    vec_u32 sign = _mm_set1_epi32(-1 << 31);
2121
+    vec_u32 *p0 = (vec_u32*)(output + n4);
2122
+    vec_u32 *p1 = (vec_u32*)(output + n4 * 3);
2123
+
2124
+    imdct_half_e2k(s, output + n4, input);
2125
+
2126
+    for (k = 0; k < n16; k++) {
2127
+        vec_u32 a = p0[k] ^ sign;
2128
+        vec_u32 b = p1[-1 - k];
2129
+        p0[-1 - k] = _mm_shuffle_epi32(a, 0x1b);
2130
+        p1[k]      = _mm_shuffle_epi32(b, 0x1b);
2131
+    }
2132
+}
2133
+
2134
+av_cold void ff_fft_init_e2k(FFTContext *s)
2135
+{
2136
+    if (!E2K_BASE(av_get_cpu_flags()))
2137
+        return;
2138
+
2139
+    // !checkasm
2140
+    // libavcodec/tests/fft -n 2..14 [-i]
2141
+    // libavcodec/tests/fft -{m|d|r} -n 4..14 [-i]
2142
+
2143
+    s->fft_calc = ff_fft_calc_interleave_e2k;
2144
+
2145
+    if (s->mdct_bits >= 5) {
2146
+        s->imdct_calc = imdct_calc_e2k;
2147
+        s->imdct_half = imdct_half_e2k;
2148
+    }
2149
+}
2150
diff --git a/libavcodec/e2k/fmtconvert.c b/libavcodec/e2k/fmtconvert.c
2151
new file mode 100644
2152
index 0000000..bfd9cb5
2153
--- /dev/null
2154
+++ b/libavcodec/e2k/fmtconvert.c
2155
@@ -0,0 +1,55 @@
2156
+/*
2157
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
2158
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
2159
+ *
2160
+ * This file is part of FFmpeg.
2161
+ *
2162
+ * FFmpeg is free software; you can redistribute it and/or
2163
+ * modify it under the terms of the GNU Lesser General Public
2164
+ * License as published by the Free Software Foundation; either
2165
+ * version 2.1 of the License, or (at your option) any later version.
2166
+ *
2167
+ * FFmpeg is distributed in the hope that it will be useful,
2168
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2169
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2170
+ * Lesser General Public License for more details.
2171
+ *
2172
+ * You should have received a copy of the GNU Lesser General Public
2173
+ * License along with FFmpeg; if not, write to the Free Software
2174
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2175
+ */
2176
+
2177
+#include "config.h"
2178
+#include "libavutil/attributes.h"
2179
+#include "libavutil/cpu.h"
2180
+#include "libavutil/mem.h"
2181
+#include "libavutil/e2k/cpu.h"
2182
+#include "libavutil/e2k/util_e2k.h"
2183
+#include "libavcodec/fmtconvert.h"
2184
+
2185
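+/*
+ * Converts len int32 samples to float and scales them by mul, eight samples
+ * per iteration.  Like the other SIMD versions this relies on len being a
+ * multiple of 8, since there is no scalar tail loop.
+ */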
+static void int32_to_float_fmul_scalar_e2k(float *dst, const int32_t *src,
2186
+                                           float mul, int len)
2187
+{
2188
+    int i;
2189
+    __m128 src1, src2, dst1, dst2, mul_v;
2190
+    mul_v = _mm_set1_ps(mul);
2191
+
2192
+    PRAGMA_E2K("ivdep")
2193
+    for (i = 0; i < len; i += 8) {
2194
+        src1 = _mm_cvtepi32_ps(VEC_LD(src + i));
2195
+        src2 = _mm_cvtepi32_ps(VEC_LD(src + i + 4));
2196
+        dst1 = _mm_mul_ps(src1, mul_v);
2197
+        dst2 = _mm_mul_ps(src2, mul_v);
2198
+        _mm_storeu_ps(dst + i, dst1);
2199
+        _mm_storeu_ps(dst + i + 4, dst2);
2200
+    }
2201
+}
2202
+
2203
+av_cold void ff_fmt_convert_init_e2k(FmtConvertContext *c,
2204
+                                     AVCodecContext *avctx)
2205
+{
2206
+    if (!E2K_BASE(av_get_cpu_flags()))
2207
+        return;
2208
+
2209
+    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_e2k;
2210
+}
2211
diff --git a/libavcodec/e2k/h264chroma.c b/libavcodec/e2k/h264chroma.c
2212
new file mode 100644
2213
index 0000000..802a26c
2214
--- /dev/null
2215
+++ b/libavcodec/e2k/h264chroma.c
2216
@@ -0,0 +1,63 @@
2217
+/*
2218
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
2219
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
2220
+ *
2221
+ * This file is part of FFmpeg.
2222
+ *
2223
+ * FFmpeg is free software; you can redistribute it and/or
2224
+ * modify it under the terms of the GNU Lesser General Public
2225
+ * License as published by the Free Software Foundation; either
2226
+ * version 2.1 of the License, or (at your option) any later version.
2227
+ *
2228
+ * FFmpeg is distributed in the hope that it will be useful,
2229
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2230
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2231
+ * Lesser General Public License for more details.
2232
+ *
2233
+ * You should have received a copy of the GNU Lesser General Public
2234
+ * License along with FFmpeg; if not, write to the Free Software
2235
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2236
+ */
2237
+
2238
+#include "config.h"
2239
+
2240
+#include "libavutil/attributes.h"
2241
+#include "libavutil/cpu.h"
2242
+#include "libavutil/intreadwrite.h"
2243
+#include "libavutil/e2k/cpu.h"
2244
+#include "libavutil/e2k/util_e2k.h"
2245
+
2246
+#include "libavcodec/h264chroma.h"
2247
+
2248
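+/* PUT stores the interpolated result directly; AVG replaces it with the
+   rounded average of the result and the pixels already at dst (_mm_avg_epu8). */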
+#define PUT_OP_U8_E2K(d, s, dst) d = s
2249
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s)
2250
+
2251
+#define OP_U8_E2K                          PUT_OP_U8_E2K
2252
+#define PREFIX_h264_chroma_mc8_e2k         put_h264_chroma_mc8_e2k
2253
+#define PREFIX_h264_chroma_mc8_num         e2k_put_h264_chroma_mc8_num
2254
+#include "h264chroma_template.c"
2255
+#undef OP_U8_E2K
2256
+#undef PREFIX_h264_chroma_mc8_e2k
2257
+#undef PREFIX_h264_chroma_mc8_num
2258
+
2259
+#define OP_U8_E2K                          AVG_OP_U8_E2K
2260
+#define PREFIX_h264_chroma_mc8_e2k         avg_h264_chroma_mc8_e2k
2261
+#define PREFIX_h264_chroma_mc8_num         e2k_avg_h264_chroma_mc8_num
2262
+#include "h264chroma_template.c"
2263
+#undef OP_U8_E2K
2264
+#undef PREFIX_h264_chroma_mc8_e2k
2265
+#undef PREFIX_h264_chroma_mc8_num
2266
+
2267
+av_cold void ff_h264chroma_init_e2k(H264ChromaContext *c, int bit_depth)
2268
+{
2269
+    const int high_bit_depth = bit_depth > 8;
2270
+
2271
+    if (!E2K_BASE(av_get_cpu_flags()))
2272
+        return;
2273
+
2274
+    // !checkasm
2275
+    if (!high_bit_depth) {
2276
+        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_e2k;
2277
+        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_e2k;
2278
+    }
2279
+}
2280
diff --git a/libavcodec/e2k/h264chroma_template.c b/libavcodec/e2k/h264chroma_template.c
2281
new file mode 100644
2282
index 0000000..623e0b2
2283
--- /dev/null
2284
+++ b/libavcodec/e2k/h264chroma_template.c
2285
@@ -0,0 +1,113 @@
2286
+/*
2287
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
2288
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
2289
+ *
2290
+ * This file is part of FFmpeg.
2291
+ *
2292
+ * FFmpeg is free software; you can redistribute it and/or
2293
+ * modify it under the terms of the GNU Lesser General Public
2294
+ * License as published by the Free Software Foundation; either
2295
+ * version 2.1 of the License, or (at your option) any later version.
2296
+ *
2297
+ * FFmpeg is distributed in the hope that it will be useful,
2298
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2299
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2300
+ * Lesser General Public License for more details.
2301
+ *
2302
+ * You should have received a copy of the GNU Lesser General Public
2303
+ * License along with FFmpeg; if not, write to the Free Software
2304
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2305
+ */
2306
+
2307
+#include "libavutil/mem.h"
2308
+#include "libavutil/e2k/util_e2k.h"
2309
+
2310
+/* this code assumes that stride % 16 == 0 */
2311
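+/*
+ * Bilinear chroma interpolation: with fractional offsets x, y in 0..7 each
+ * output pixel is
+ *   ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
+ * where A..D are the four neighbouring source pixels.  The horizontal weights
+ * are packed as byte pairs ((8 - x) | x << 8), so one _mm_maddubs_epi16 per
+ * source row applies both horizontal taps at once.
+ */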
+
2312
+#define CHROMA_MC8_E2K_CORE() \
2313
+    v1 = _mm_add_epi16(_mm_maddubs_epi16(v0, vA), bias); \
2314
+    v1 = _mm_add_epi16(_mm_maddubs_epi16(v2, vB), v1); \
2315
+    v1 = _mm_srai_epi16(v1, 6); \
2316
+    v1 = _mm_packus_epi16(v1, v1); \
2317
+    OP_U8_E2K(v1, v1, VEC_LD8(dst)); \
2318
+    VEC_STL(dst, v1); dst += stride; \
2319
+    v0 = v2;
2320
+
2321
+#define CHROMA_MC8_E2K_CORE_SIMPLE(v1) \
2322
+    v1 = _mm_add_epi16(_mm_maddubs_epi16(v1, vA), bias); \
2323
+    v1 = _mm_srai_epi16(v1, 6); \
2324
+    v1 = _mm_packus_epi16(v1, v1); \
2325
+    OP_U8_E2K(v1, v1, VEC_LD8(dst)); \
2326
+    VEC_STL(dst, v1); dst += stride;
2327
+
2328
+#define GET_VSRC1(v0) v0 = VEC_LD8(src); src += stride;
2329
+#define GET_VSRC2(v0, v1) \
2330
+    v0 = VEC_LD8(src); \
2331
+    v1 = VEC_LD8(src + 1); \
2332
+    v0 = _mm_unpacklo_epi8(v0, v1); src += stride;
2333
+
2334
+#ifdef PREFIX_h264_chroma_mc8_e2k
2335
+static void PREFIX_h264_chroma_mc8_e2k(uint8_t *dst, uint8_t *src,
2336
+                                       ptrdiff_t stride, int h,
2337
+                                       int x, int y)
2338
+{
2339
+    int i, xm = (8 - x) | x << 8;
2340
+    __m128i vA, vB, bias = _mm_set1_epi16(32);
2341
+    __m128i v0, v1, v2, v3;
2342
+
2343
+    if (y) {
2344
+        if (x) {
2345
+            vA = _mm_set1_epi16(xm * (8 - y));
2346
+            vB = _mm_set1_epi16(xm * y);
2347
+            GET_VSRC2(v0, v1);
2348
+            PRAGMA_E2K("ivdep")
2349
+            PRAGMA_E2K("unroll(2)")
2350
+            for (i = 0; i < h; i++) {
2351
+                GET_VSRC2(v2, v3);
2352
+                CHROMA_MC8_E2K_CORE();
2353
+            }
2354
+        } else {
2355
+            vA = _mm_set1_epi16(((8 - y) | y << 8) * 8);
2356
+            GET_VSRC1(v0);
2357
+            PRAGMA_E2K("ivdep")
2358
+            PRAGMA_E2K("unroll(2)")
2359
+            for (i = 0; i < h; i++) {
2360
+                GET_VSRC1(v2);
2361
+                v1 = _mm_unpacklo_epi8(v0, v2);
2362
+                CHROMA_MC8_E2K_CORE_SIMPLE(v1);
2363
+                v0 = v2;
2364
+            }
2365
+        }
2366
+    } else {
2367
+        vA = _mm_set1_epi16(xm * 8);
2368
+        PRAGMA_E2K("ivdep")
2369
+        PRAGMA_E2K("unroll(2)")
2370
+        for (i = 0; i < h; i++) {
2371
+            GET_VSRC2(v0, v1);
2372
+            CHROMA_MC8_E2K_CORE_SIMPLE(v0);
2373
+        }
2374
+    }
2375
+}
2376
+#endif
2377
+
2378
+#ifdef PREFIX_no_rnd_vc1_chroma_mc8_e2k
2379
+static void PREFIX_no_rnd_vc1_chroma_mc8_e2k(uint8_t *dst, uint8_t *src,
2380
+                                             ptrdiff_t stride, int h,
2381
+                                             int x, int y)
2382
+{
2383
+    int i, xm = (8 - x) | x << 8;
2384
+    __m128i vA = _mm_set1_epi16(xm * (8 - y));
2385
+    __m128i vB = _mm_set1_epi16(xm * y);
2386
+    __m128i bias = _mm_set1_epi16(28);
2387
+    __m128i v0, v1, v2, v3;
2388
+
2389
+    GET_VSRC2(v0, v1);
2390
+    PRAGMA_E2K("ivdep")
2391
+    PRAGMA_E2K("unroll(2)")
2392
+    for (i = 0; i < h; i++) {
2393
+         GET_VSRC2(v2, v3);
2394
+         CHROMA_MC8_E2K_CORE();
2395
+    }
2396
+}
2397
+#endif
2398
+
2399
diff --git a/libavcodec/e2k/h264dsp.c b/libavcodec/e2k/h264dsp.c
2400
new file mode 100644
2401
index 0000000..ff7be7d
2402
--- /dev/null
2403
+++ b/libavcodec/e2k/h264dsp.c
2404
@@ -0,0 +1,820 @@
2405
+/*
2406
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
2407
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
2408
+ *
2409
+ * This file is part of FFmpeg.
2410
+ *
2411
+ * FFmpeg is free software; you can redistribute it and/or
2412
+ * modify it under the terms of the GNU Lesser General Public
2413
+ * License as published by the Free Software Foundation; either
2414
+ * version 2.1 of the License, or (at your option) any later version.
2415
+ *
2416
+ * FFmpeg is distributed in the hope that it will be useful,
2417
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2418
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2419
+ * Lesser General Public License for more details.
2420
+ *
2421
+ * You should have received a copy of the GNU Lesser General Public
2422
+ * License along with FFmpeg; if not, write to the Free Software
2423
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2424
+ */
2425
+
2426
+#include "config.h"
2427
+
2428
+#include <stdint.h>
2429
+#include <string.h>
2430
+
2431
+#include "libavutil/attributes.h"
2432
+#include "libavutil/cpu.h"
2433
+#include "libavutil/intreadwrite.h"
2434
+#include "libavutil/mem.h"
2435
+#include "libavutil/e2k/cpu.h"
2436
+#include "libavutil/e2k/util_e2k.h"
2437
+
2438
+#include "libavcodec/h264dec.h"
2439
+#include "libavcodec/h264dsp.h"
2440
+
2441
+/****************************************************************************
2442
+ * IDCT transform:
2443
+ ****************************************************************************/
2444
+
2445
+#define VEC_1D_DCT(vb0, vb1, vb2, vb3, va0, va1, va2, va3)         \
2446
+    /* 1st stage */                                                \
2447
+    vz0 = _mm_add_epi16(vb0, vb2); /* temp[0] = Y[0] + Y[2] */     \
2448
+    vz1 = _mm_sub_epi16(vb0, vb2); /* temp[1] = Y[0] - Y[2] */     \
2449
+    vz2 = _mm_srai_epi16(vb1, 1);                                  \
2450
+    vz2 = _mm_sub_epi16(vz2, vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
2451
+    vz3 = _mm_srai_epi16(vb3, 1);                                  \
2452
+    vz3 = _mm_add_epi16(vb1, vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
2453
+    /* 2nd stage: output */                                        \
2454
+    va0 = _mm_add_epi16(vz0, vz3); /* x[0] = temp[0] + temp[3] */  \
2455
+    va1 = _mm_add_epi16(vz1, vz2); /* x[1] = temp[1] + temp[2] */  \
2456
+    va2 = _mm_sub_epi16(vz1, vz2); /* x[2] = temp[1] - temp[2] */  \
2457
+    va3 = _mm_sub_epi16(vz0, vz3)  /* x[3] = temp[0] - temp[3] */
2458
+
2459
+#define VEC_TRANSPOSE_4(a0, a1, a2, a3, b0, b1, b2, b3) \
2460
+    b0 = _mm_unpacklo_epi16(a0, a0); \
2461
+    b1 = _mm_unpacklo_epi16(a1, a0); \
2462
+    b2 = _mm_unpacklo_epi16(a2, a0); \
2463
+    b3 = _mm_unpacklo_epi16(a3, a0); \
2464
+    a0 = _mm_unpacklo_epi16(b0, b2); \
2465
+    a1 = _mm_unpackhi_epi16(b0, b2); \
2466
+    a2 = _mm_unpacklo_epi16(b1, b3); \
2467
+    a3 = _mm_unpackhi_epi16(b1, b3); \
2468
+    b0 = _mm_unpacklo_epi16(a0, a2); \
2469
+    b1 = _mm_unpackhi_epi16(a0, a2); \
2470
+    b2 = _mm_unpacklo_epi16(a1, a3); \
2471
+    b3 = _mm_unpackhi_epi16(a1, a3)
2472
+
2473
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)               \
2474
+    va = _mm_srai_epi16(va, 6);                        \
2475
+    vdst = _mm_cvtsi32_si128(*(uint32_t*)dst);         \
2476
+    vdst = _mm_unpacklo_epi8(vdst, zerov);             \
2477
+    va = _mm_add_epi16(va, vdst);                      \
2478
+    va = _mm_packus_epi16(va, va);                     \
2479
+    *(uint32_t*)dst = _mm_extract_epi32(va, 0);        \
2480
+    dst += stride;
2481
+
2482
+static void h264_idct_add_e2k(uint8_t *dst, int16_t *block, int stride)
2483
+{
2484
+    vec_s16 va0, va1, va2, va3;
2485
+    vec_s16 vz0, vz1, vz2, vz3;
2486
+    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
2487
+    vec_u8 vdst;
2488
+    LOAD_ZERO;
2489
+
2490
+    block[0] += 32;  /* add 32 as a DC-level for rounding */
2491
+
2492
+    vtmp0 = VEC_LD(block);
2493
+    vtmp1 = VEC_ALIGNR8(vtmp0, vtmp0);
2494
+    vtmp2 = VEC_LD(block + 8);
2495
+    vtmp3 = VEC_ALIGNR8(vtmp2, vtmp2);
2496
+    VEC_ST(block, zerov);
2497
+    VEC_ST(block + 8, zerov);
2498
+
2499
+    VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3);
2500
+    VEC_TRANSPOSE_4(va0, va1, va2, va3, vtmp0, vtmp1, vtmp2, vtmp3);
2501
+    VEC_1D_DCT(vtmp0, vtmp1, vtmp2, vtmp3, va0, va1, va2, va3);
2502
+
2503
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
2504
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
2505
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
2506
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
2507
+}
2508
+
2509
+#define IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) { \
2510
+    /* a0 = SRC(0) + SRC(4); */      \
2511
+    /* a2 = SRC(0) - SRC(4); */      \
2512
+    /* a4 = (SRC(2)>>1) - SRC(6); */ \
2513
+    /* a6 = (SRC(6)>>1) + SRC(2); */ \
2514
+    vec_s16 a0v = _mm_add_epi16(s0, s4); \
2515
+    vec_s16 a2v = _mm_sub_epi16(s0, s4); \
2516
+    vec_s16 a4v = _mm_sub_epi16(_mm_srai_epi16(s2, 1), s6); \
2517
+    vec_s16 a6v = _mm_add_epi16(_mm_srai_epi16(s6, 1), s2); \
2518
+    /* b0 = a0 + a6; */ \
2519
+    /* b2 = a2 + a4; */ \
2520
+    /* b4 = a2 - a4; */ \
2521
+    /* b6 = a0 - a6; */ \
2522
+    vec_s16 b0v = _mm_add_epi16(a0v, a6v);  \
2523
+    vec_s16 b2v = _mm_add_epi16(a2v, a4v);  \
2524
+    vec_s16 b4v = _mm_sub_epi16(a2v, a4v);  \
2525
+    vec_s16 b6v = _mm_sub_epi16(a0v, a6v);  \
2526
+    /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
2527
+    /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
2528
+    /* a5 = (SRC(7)-SRC(1)) + (SRC(5) + (SRC(5)>>1)); */ \
2529
+    /* a7 = (SRC(5)+SRC(3)) + (SRC(1) + (SRC(1)>>1)); */ \
2530
+    vec_s16 a1v = _mm_sub_epi16(_mm_sub_epi16(s5, s3), _mm_add_epi16(s7, _mm_srai_epi16(s7, 1))); \
2531
+    vec_s16 a3v = _mm_sub_epi16(_mm_add_epi16(s7, s1), _mm_add_epi16(s3, _mm_srai_epi16(s3, 1))); \
2532
+    vec_s16 a5v = _mm_add_epi16(_mm_sub_epi16(s7, s1), _mm_add_epi16(s5, _mm_srai_epi16(s5, 1))); \
2533
+    vec_s16 a7v = _mm_add_epi16(_mm_add_epi16(s5, s3), _mm_add_epi16(s1, _mm_srai_epi16(s1, 1))); \
2534
+    /* b1 = (a7>>2) + a1; */ \
2535
+    /* b3 = a3 + (a5>>2); */ \
2536
+    /* b5 = (a3>>2) - a5; */ \
2537
+    /* b7 = a7 - (a1>>2); */ \
2538
+    vec_s16 b1v = _mm_add_epi16(_mm_srai_epi16(a7v, 2), a1v); \
2539
+    vec_s16 b3v = _mm_add_epi16(a3v, _mm_srai_epi16(a5v, 2)); \
2540
+    vec_s16 b5v = _mm_sub_epi16(_mm_srai_epi16(a3v, 2), a5v); \
2541
+    vec_s16 b7v = _mm_sub_epi16(a7v, _mm_srai_epi16(a1v, 2)); \
2542
+    /* DST(0, b0 + b7); */ \
2543
+    /* DST(1, b2 + b5); */ \
2544
+    /* DST(2, b4 + b3); */ \
2545
+    /* DST(3, b6 + b1); */ \
2546
+    /* DST(4, b6 - b1); */ \
2547
+    /* DST(5, b4 - b3); */ \
2548
+    /* DST(6, b2 - b5); */ \
2549
+    /* DST(7, b0 - b7); */ \
2550
+    d0 = _mm_add_epi16(b0v, b7v); \
2551
+    d1 = _mm_add_epi16(b2v, b5v); \
2552
+    d2 = _mm_add_epi16(b4v, b3v); \
2553
+    d3 = _mm_add_epi16(b6v, b1v); \
2554
+    d4 = _mm_sub_epi16(b6v, b1v); \
2555
+    d5 = _mm_sub_epi16(b4v, b3v); \
2556
+    d6 = _mm_sub_epi16(b2v, b5v); \
2557
+    d7 = _mm_sub_epi16(b0v, b7v); \
2558
+}
2559
+
2560
+#define E2K_STORE_SUM_CLIP(dest, idctv) {     \
2561
+    /* unaligned load */                      \
2562
+    __m128i dstv = VEC_LD8(dest);             \
2563
+    dstv = _mm_unpacklo_epi8(dstv, zerov);    \
2564
+    idctv = _mm_srai_epi16(idctv, 6);         \
2565
+    dstv = _mm_add_epi16(dstv, idctv);        \
2566
+    dstv = _mm_packus_epi16(dstv, dstv);      \
2567
+    /* unaligned store */                     \
2568
+    VEC_STL(dest, dstv);                      \
2569
+}
2570
+
2571
+static void h264_idct8_add_e2k(uint8_t *dst, int16_t *dct, int stride)
2572
+{
2573
+    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
2574
+    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
2575
+    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
2576
+
2577
+    LOAD_ZERO;
2578
+
2579
+    dct[0] += 32; // rounding for the >>6 at the end
2580
+
2581
+    s0 = VEC_LD(dct + 8 * 0);
2582
+    s1 = VEC_LD(dct + 8 * 1);
2583
+    s2 = VEC_LD(dct + 8 * 2);
2584
+    s3 = VEC_LD(dct + 8 * 3);
2585
+    s4 = VEC_LD(dct + 8 * 4);
2586
+    s5 = VEC_LD(dct + 8 * 5);
2587
+    s6 = VEC_LD(dct + 8 * 6);
2588
+    s7 = VEC_LD(dct + 8 * 7);
2589
+    VEC_ST(dct + 8 * 0, zerov);
2590
+    VEC_ST(dct + 8 * 1, zerov);
2591
+    VEC_ST(dct + 8 * 2, zerov);
2592
+    VEC_ST(dct + 8 * 3, zerov);
2593
+    VEC_ST(dct + 8 * 4, zerov);
2594
+    VEC_ST(dct + 8 * 5, zerov);
2595
+    VEC_ST(dct + 8 * 6, zerov);
2596
+    VEC_ST(dct + 8 * 7, zerov);
2597
+
2598
+    IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7,
2599
+                     d0, d1, d2, d3, d4, d5, d6, d7);
2600
+
2601
+    TRANSPOSE8(d0, d1, d2, d3, d4, d5, d6, d7);
2602
+
2603
+    IDCT8_1D_E2K(d0, d1, d2, d3, d4, d5, d6, d7,
2604
+                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
2605
+
2606
+    E2K_STORE_SUM_CLIP(&dst[0*stride], idct0);
2607
+    E2K_STORE_SUM_CLIP(&dst[1*stride], idct1);
2608
+    E2K_STORE_SUM_CLIP(&dst[2*stride], idct2);
2609
+    E2K_STORE_SUM_CLIP(&dst[3*stride], idct3);
2610
+    E2K_STORE_SUM_CLIP(&dst[4*stride], idct4);
2611
+    E2K_STORE_SUM_CLIP(&dst[5*stride], idct5);
2612
+    E2K_STORE_SUM_CLIP(&dst[6*stride], idct6);
2613
+    E2K_STORE_SUM_CLIP(&dst[7*stride], idct7);
2614
+}
2615
+
2616
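+/*
+ * Adds the rounded DC term ((block[0] + 32) >> 6) to a 4x4 block.  A possibly
+ * negative dc is handled with unsigned saturation: dcplus holds max(dc, 0)
+ * and dcminus holds max(-dc, 0), so adding one and subtracting the other
+ * clips the result correctly in both directions.
+ */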
+static void h264_idct_dc_add_e2k(uint8_t *dst, int16_t *block, int stride)
2617
+{
2618
+    __m64 dc16, zerov = _mm_setzero_si64();
2619
+    __m64 dcplus, dcminus, v0, v1, v2, v3;
2620
+    int i, dc;
2621
+
2622
+    dc = (block[0] + 32) >> 6;
2623
+    block[0] = 0;
2624
+    dc16 = _mm_set1_pi16(dc);
2625
+    dcplus = _mm_packs_pu16(dc16, dc16);
2626
+    dc16 = _mm_sub_pi16(zerov, dc16);
2627
+    dcminus = _mm_packs_pu16(dc16, dc16);
2628
+
2629
+    PRAGMA_E2K("ivdep")
2630
+    for (i = 0; i < 4; i += 4) {
2631
+        v0 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 0 * stride));
2632
+        v1 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 1 * stride));
2633
+        v2 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 2 * stride));
2634
+        v3 = _mm_cvtsi32_si64(*(uint32_t*)(dst + 3 * stride));
2635
+
2636
+        v0 = _mm_unpacklo_pi32(v0, v1);
2637
+        v2 = _mm_unpacklo_pi32(v2, v3);
2638
+        v0 = _mm_adds_pu8(v0, dcplus);
2639
+        v2 = _mm_adds_pu8(v2, dcplus);
2640
+        v0 = _mm_subs_pu8(v0, dcminus);
2641
+        v2 = _mm_subs_pu8(v2, dcminus);
2642
+
2643
+        *(uint32_t*)(dst + 0 * stride) = _mm_extract_pi32(v0, 0);
2644
+        *(uint32_t*)(dst + 1 * stride) = _mm_extract_pi32(v0, 1);
2645
+        *(uint32_t*)(dst + 2 * stride) = _mm_extract_pi32(v2, 0);
2646
+        *(uint32_t*)(dst + 3 * stride) = _mm_extract_pi32(v2, 1);
2647
+        dst += 4 * stride;
2648
+    }
2649
+}
2650
+
2651
+static void h264_idct8_dc_add_e2k(uint8_t *dst, int16_t *block, int stride)
2652
+{
2653
+    vec_s16 dc16;
2654
+    vec_u8 dcplus, dcminus, v0, v1, v2, v3;
2655
+    LOAD_ZERO;
2656
+    int i, dc;
2657
+
2658
+    dc = (block[0] + 32) >> 6;
2659
+    block[0] = 0;
2660
+    dc16 = _mm_set1_epi16(dc);
2661
+    dcplus = _mm_packus_epi16(dc16, dc16);
2662
+    dc16 = _mm_sub_epi16(zerov, dc16);
2663
+    dcminus = _mm_packus_epi16(dc16, dc16);
2664
+
2665
+    PRAGMA_E2K("ivdep")
2666
+    for (i = 0; i < 8; i += 4) {
2667
+        v0 = VEC_LD8(dst + 0 * stride);
2668
+        v1 = VEC_LD8(dst + 1 * stride);
2669
+        v2 = VEC_LD8(dst + 2 * stride);
2670
+        v3 = VEC_LD8(dst + 3 * stride);
2671
+
2672
+        v0 = _mm_unpacklo_epi64(v0, v1);
2673
+        v2 = _mm_unpacklo_epi64(v2, v3);
2674
+        v0 = _mm_adds_epu8(v0, dcplus);
2675
+        v2 = _mm_adds_epu8(v2, dcplus);
2676
+        v0 = _mm_subs_epu8(v0, dcminus);
2677
+        v2 = _mm_subs_epu8(v2, dcminus);
2678
+
2679
+        VEC_STL(dst + 0 * stride, v0);
2680
+        VEC_STH(dst + 1 * stride, v0);
2681
+        VEC_STL(dst + 2 * stride, v2);
2682
+        VEC_STH(dst + 3 * stride, v2);
2683
+        dst += 4 * stride;
2684
+    }
2685
+}
2686
+
2687
+static void h264_idct_add16_e2k(uint8_t *dst, const int *block_offset,
2688
+                                    int16_t *block, int stride,
2689
+                                    const uint8_t nnzc[15 * 8])
2690
+{
2691
+    int i;
2692
+    for (i = 0; i < 16; i++) {
2693
+        int nnz = nnzc[scan8[i]];
2694
+        if (nnz) {
2695
+            if (nnz == 1 && block[i * 16])
2696
+                h264_idct_dc_add_e2k(dst + block_offset[i], block + i * 16, stride);
2697
+            else
2698
+                h264_idct_add_e2k(dst + block_offset[i], block + i * 16, stride);
2699
+        }
2700
+    }
2701
+}
2702
+
2703
+static void h264_idct_add16intra_e2k(uint8_t *dst, const int *block_offset,
2704
+                                         int16_t *block, int stride,
2705
+                                         const uint8_t nnzc[15 * 8])
2706
+{
2707
+    int i;
2708
+    for (i = 0; i < 16; i++) {
2709
+        if (nnzc[scan8[i]])
2710
+            h264_idct_add_e2k(dst + block_offset[i], block + i * 16, stride);
2711
+        else if(block[i * 16])
2712
+            h264_idct_dc_add_e2k(dst + block_offset[i], block + i * 16, stride);
2713
+    }
2714
+}
2715
+
2716
+static void h264_idct8_add4_e2k(uint8_t *dst, const int *block_offset,
2717
+                                    int16_t *block, int stride,
2718
+                                    const uint8_t nnzc[15 * 8])
2719
+{
2720
+    int i;
2721
+    for (i = 0; i < 16; i += 4){
2722
+        int nnz = nnzc[scan8[i]];
2723
+        if (nnz) {
2724
+            if (nnz == 1 && block[i * 16])
2725
+                h264_idct8_dc_add_e2k(dst + block_offset[i], block + i * 16, stride);
2726
+            else
2727
+                h264_idct8_add_e2k(dst + block_offset[i], block + i * 16, stride);
2728
+        }
2729
+    }
2730
+}
2731
+
2732
+static void h264_idct_add8_e2k(uint8_t **dest, const int *block_offset,
2733
+                                   int16_t *block, int stride,
2734
+                                   const uint8_t nnzc[15 * 8])
2735
+{
2736
+    int i, j;
2737
+    for (j = 1; j < 3; j++) {
2738
+        for (i = j * 16; i < j * 16 + 4; i++) {
2739
+            if (nnzc[scan8[i]])
2740
+                h264_idct_add_e2k(dest[j - 1] + block_offset[i], block + i * 16, stride);
2741
+            else if (block[i * 16])
2742
+                h264_idct_dc_add_e2k(dest[j - 1] + block_offset[i], block + i * 16, stride);
2743
+        }
2744
+    }
2745
+}
2746
+
2747
+#define transpose4x16(r0, r1, r2, r3) {             \
2748
+    vec_u8 r4, r5, r6, r7;                          \
2749
+                                                    \
2750
+    r4 = _mm_unpacklo_epi8(r0, r2);  /*0, 2 set 0*/ \
2751
+    r5 = _mm_unpackhi_epi8(r0, r2);  /*0, 2 set 1*/ \
2752
+    r6 = _mm_unpacklo_epi8(r1, r3);  /*1, 3 set 0*/ \
2753
+    r7 = _mm_unpackhi_epi8(r1, r3);  /*1, 3 set 1*/ \
2754
+                                                    \
2755
+    r0 = _mm_unpacklo_epi8(r4, r6);  /*all set 0*/  \
2756
+    r1 = _mm_unpackhi_epi8(r4, r6);  /*all set 1*/  \
2757
+    r2 = _mm_unpacklo_epi8(r5, r7);  /*all set 2*/  \
2758
+    r3 = _mm_unpackhi_epi8(r5, r7);  /*all set 3*/  \
2759
+}
2760
+
2761
+#define WRITE4(i, j) ((uint32_t*)dst)[(i * 4 + j) * (dst_stride >> 2)] = _mm_extract_epi32(r##i, j);
2762
+
2763
+static av_always_inline void write16x4(uint8_t *dst, int dst_stride,
2764
+                                       vec_u8 r0, vec_u8 r1,
2765
+                                       vec_u8 r2, vec_u8 r3) {
2766
+
2767
+    WRITE4(0, 0) WRITE4(0, 1) WRITE4(0, 2) WRITE4(0, 3)
2768
+    WRITE4(1, 0) WRITE4(1, 1) WRITE4(1, 2) WRITE4(1, 3)
2769
+    WRITE4(2, 0) WRITE4(2, 1) WRITE4(2, 2) WRITE4(2, 3)
2770
+    WRITE4(3, 0) WRITE4(3, 1) WRITE4(3, 2) WRITE4(3, 3)
2771
+}
2772
+
2773
+/* reads 16 rows of 8 bytes from src and transposes them, leaving columns 0..5 in r8..r13 */
2774
+#define read_and_transpose16x6(src, st, r8, r9, r10, r11, r12, r13) { \
2775
+    vec_u8 r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;                  \
2776
+    r0 = VEC_LD8(src);                                                \
2777
+    r1 = VEC_LD8(src + st);                                           \
2778
+    r2 = VEC_LD8(src + st * 2);                                       \
2779
+    r3 = VEC_LD8(src + st * 3);                                       \
2780
+    r4 = VEC_LD8(src + st * 4);                                       \
2781
+    r5 = VEC_LD8(src + st * 5);                                       \
2782
+    r6 = VEC_LD8(src + st * 6);                                       \
2783
+    r7 = VEC_LD8(src + st * 7);                                       \
2784
+    r8 = VEC_LD8(src + st * 8);                                       \
2785
+    r9 = VEC_LD8(src + st * 9);                                       \
2786
+    r10 = VEC_LD8(src + st * 10);                                     \
2787
+    r11 = VEC_LD8(src + st * 11);                                     \
2788
+    r12 = VEC_LD8(src + st * 12);                                     \
2789
+    r13 = VEC_LD8(src + st * 13);                                     \
2790
+    r14 = VEC_LD8(src + st * 14);                                     \
2791
+    r15 = VEC_LD8(src + st * 15);                                     \
2792
+                                                                      \
2793
+    /* Merge first pairs */                                           \
2794
+    r0 = _mm_unpacklo_epi8(r0, r4);    /*  0, 4 */                    \
2795
+    r1 = _mm_unpacklo_epi8(r1, r5);    /*  1, 5 */                    \
2796
+    r2 = _mm_unpacklo_epi8(r2, r6);    /*  2, 6 */                    \
2797
+    r3 = _mm_unpacklo_epi8(r3, r7);    /*  3, 7 */                    \
2798
+    r4 = _mm_unpacklo_epi8(r8, r12);   /*  8,12 */                    \
2799
+    r5 = _mm_unpacklo_epi8(r9, r13);   /*  9,13 */                    \
2800
+    r6 = _mm_unpacklo_epi8(r10, r14);  /* 10,14 */                    \
2801
+    r7 = _mm_unpacklo_epi8(r11, r15);  /* 11,15 */                    \
2802
+                                                                      \
2803
+    /* Merge second pairs */                                          \
2804
+    r8  = _mm_unpacklo_epi8(r0, r2);   /* 0, 2, 4, 6 set 0 */         \
2805
+    r9  = _mm_unpackhi_epi8(r0, r2);   /* 0, 2, 4, 6 set 1 */         \
2806
+    r10 = _mm_unpacklo_epi8(r1, r3);   /* 1, 3, 5, 7 set 0 */         \
2807
+    r11 = _mm_unpackhi_epi8(r1, r3);   /* 1, 3, 5, 7 set 1 */         \
2808
+    r12 = _mm_unpacklo_epi8(r4, r6);   /* 8,10,12,14 set 0 */         \
2809
+    r13 = _mm_unpackhi_epi8(r4, r6);   /* 8,10,12,14 set 1 */         \
2810
+    r14 = _mm_unpacklo_epi8(r5, r7);   /* 9,11,13,15 set 0 */         \
2811
+    r15 = _mm_unpackhi_epi8(r5, r7);   /* 9,11,13,15 set 1 */         \
2812
+                                                                      \
2813
+    /* Third merge */                                                 \
2814
+    r0 = _mm_unpacklo_epi8(r8, r10);   /* 0..7 set 0  */              \
2815
+    r1 = _mm_unpackhi_epi8(r8, r10);   /* 0..7 set 1  */              \
2816
+    r2 = _mm_unpacklo_epi8(r9, r11);   /* 0..7 set 2  */              \
2817
+    r4 = _mm_unpacklo_epi8(r12, r14);  /* 8..15 set 0 */              \
2818
+    r5 = _mm_unpackhi_epi8(r12, r14);  /* 8..15 set 1 */              \
2819
+    r6 = _mm_unpacklo_epi8(r13, r15);  /* 8..15 set 2 */              \
2820
+    /* Don't need to compute 3 and 7*/                                \
2821
+                                                                      \
2822
+    /* Final merge */                                                 \
2823
+    r8  = _mm_unpacklo_epi64(r0, r4);   /* all set 0 */               \
2824
+    r9  = _mm_unpackhi_epi64(r0, r4);   /* all set 1 */               \
2825
+    r10 = _mm_unpacklo_epi64(r1, r5);   /* all set 2 */               \
2826
+    r11 = _mm_unpackhi_epi64(r1, r5);   /* all set 3 */               \
2827
+    r12 = _mm_unpacklo_epi64(r2, r6);   /* all set 4 */               \
2828
+    r13 = _mm_unpackhi_epi64(r2, r6);   /* all set 5 */               \
2829
+    /* Don't need to compute 14 and 15 */                             \
2830
+}
2831
+
2832
+#define read_and_transpose8x4(src, st, r8, r9, r10, r11) {            \
2833
+    __m64 r0, r1, r2, r3, r4, r5, r6, r7;                             \
2834
+    r0 = _mm_cvtsi32_si64(*(uint32_t*)(src));                         \
2835
+    r1 = _mm_cvtsi32_si64(*(uint32_t*)(src + st));                    \
2836
+    r2 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 2));                \
2837
+    r3 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 3));                \
2838
+    r4 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 4));                \
2839
+    r5 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 5));                \
2840
+    r6 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 6));                \
2841
+    r7 = _mm_cvtsi32_si64(*(uint32_t*)(src + st * 7));                \
2842
+                                                                      \
2843
+    r0 = _mm_unpacklo_pi8(r0, r4);                                    \
2844
+    r1 = _mm_unpacklo_pi8(r1, r5);                                    \
2845
+    r2 = _mm_unpacklo_pi8(r2, r6);                                    \
2846
+    r3 = _mm_unpacklo_pi8(r3, r7);                                    \
2847
+                                                                      \
2848
+    r4 = _mm_unpacklo_pi8(r0, r2);                                    \
2849
+    r5 = _mm_unpackhi_pi8(r0, r2);                                    \
2850
+    r6 = _mm_unpacklo_pi8(r1, r3);                                    \
2851
+    r7 = _mm_unpackhi_pi8(r1, r3);                                    \
2852
+                                                                      \
2853
+    r8 = _mm_unpacklo_pi8(r4, r6);                                    \
2854
+    r9 = _mm_unpackhi_pi8(r4, r6);                                    \
2855
+    r10 = _mm_unpacklo_pi8(r5, r7);                                   \
2856
+    r11 = _mm_unpackhi_pi8(r5, r7);                                   \
2857
+}
2858
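read_and_transpose8x4 loads eight 4-byte rows and transposes them with three rounds of byte interleaves, so each of the four output registers ends up holding one source column across all eight rows. A scalar model of the result (names here are illustrative, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* out[c] receives column c of an 8-row x 4-column block, row by row */
    static void transpose8x4_scalar(uint8_t out[4][8], const uint8_t *src,
                                    ptrdiff_t st)
    {
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 4; c++)
                out[c][r] = src[r * st + c];
    }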
+
2859
+#define DEF_HELPERS(n, vec, p, si) \
2860
+/* out: o = |x-y| < a */ \
2861
+static av_always_inline vec diff_lt##n(vec x, vec y, vec a) { \
2862
+    vec o = _mm_or_##si(_mm_subs_##p##u8(x, y), _mm_subs_##p##u8(y, x)); /* |x-y| */ \
2863
+    return _mm_cmpgt_##p##i8(a, _mm_xor_##si(o, _mm_set1_##p##i8(-128))); \
2864
+} \
2865
+static av_always_inline vec deblock_mask##n(vec p0, vec p1, vec q0, \
2866
+                                            vec q1, vec alpha, vec beta) { \
2867
+    vec mask, tempmask; \
2868
+    mask = diff_lt##n(p0, q0, alpha); \
2869
+    tempmask = diff_lt##n(p1, p0, beta); \
2870
+    mask = _mm_and_##si(mask, tempmask); \
2871
+    tempmask = diff_lt##n(q1, q0, beta); \
2872
+    return _mm_and_##si(mask, tempmask); \
2873
+}
2874
+
2875
+DEF_HELPERS(16, __m128i, ep, si128)
2876
+DEF_HELPERS(8, __m64, p, si64)
2877
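The diff_lt helpers work around the absence of an unsigned byte compare: |x - y| is built from two saturating subtractions OR-ed together, and both sides of the signed compare are biased by 128 (the alpha/beta thresholds are biased once, where alpha - 128 and beta - 128 are splatted further down). A scalar sketch of the same tests, with illustrative names and unbiased thresholds:

    #include <stdint.h>
    #include <stdlib.h>

    /* scalar model of diff_lt16/diff_lt8: true when |x - y| < a */
    static int diff_lt_scalar(uint8_t x, uint8_t y, uint8_t a)
    {
        int d = abs((int)x - (int)y);            /* |x - y|, 0..255 */
        /* the vector code compares (d ^ 0x80) against the pre-biased
           threshold (a - 128) with a signed byte compare */
        return (int8_t)(d ^ 0x80) < (int8_t)(a - 128);
    }

    /* scalar model of deblock_mask: the H.264 filterSamplesFlag test */
    static int deblock_mask_scalar(uint8_t p0, uint8_t p1, uint8_t q0,
                                   uint8_t q1, uint8_t alpha, uint8_t beta)
    {
        return diff_lt_scalar(p0, q0, alpha) &&
               diff_lt_scalar(p1, p0, beta) &&
               diff_lt_scalar(q1, q0, beta);
    }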
+
2878
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
2879
+static av_always_inline vec_u8 h264_deblock_q1(vec_u8 p0, vec_u8 p1, vec_u8 p2,
2880
+                                               vec_u8 q0, vec_u8 tc0) {
2881
+
2882
+    vec_u8 average = _mm_avg_epu8(p0, q0);
2883
+    vec_u8 temp, unclipped;
2884
+    vec_u8 ones = _mm_set1_epi8(1), maxv, minv;
2885
+
2886
+    temp = _mm_xor_si128(average, p2);
2887
+    average = _mm_avg_epu8(average, p2);     /* avg(p2, avg(p0, q0))   */
2888
+    temp = _mm_and_si128(temp, ones);        /* (p2^avg(p0, q0)) & 1   */
2889
+    unclipped = _mm_subs_epu8(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */
2890
+    maxv = _mm_adds_epu8(p1, tc0);
2891
+    minv = _mm_subs_epu8(p1, tc0);
2892
+    return _mm_min_epu8(maxv, _mm_max_epu8(minv, unclipped));
2893
+}
2894
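h264_deblock_q1 stays entirely in unsigned bytes: the averages round up, so the stray rounding bit (p2 ^ avg(p0, q0)) & 1 is subtracted again to recover the truncating shifts of the reference formula, and the result is clamped to [p1 - tc0, p1 + tc0] with saturating add/sub and min/max. A scalar sketch (illustrative name):

    #include <stdint.h>

    /* newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0),
     * with the bounds saturated to [0, 255] exactly as the byte code does */
    static uint8_t deblock_q1_scalar(uint8_t p0, uint8_t p1, uint8_t p2,
                                     uint8_t q0, uint8_t tc0)
    {
        int avg0 = (p0 + q0 + 1) >> 1;               /* _mm_avg_epu8(p0, q0) */
        int val  = (p2 + avg0) >> 1;                 /* avg minus the odd bit */
        int lo   = p1 - tc0 < 0   ? 0   : p1 - tc0;  /* _mm_subs_epu8 */
        int hi   = p1 + tc0 > 255 ? 255 : p1 + tc0;  /* _mm_adds_epu8 */
        return val < lo ? lo : val > hi ? hi : val;
    }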
+
2895
+#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked, vec, x, si) {                                 \
2896
+    vec pq0bit = _mm_xor_##si(p0, q0);                                                              \
2897
+    vec q1minus, p0minus, stage1, stage2, c127 = _mm_set1_##x##i8(127);                             \
2898
+    vec vec160 = _mm_set1_##x##i8(160), delta, deltaneg, notv = _mm_set1_##x##i8(-1);               \
2899
+                                                                                                    \
2900
+    q1minus = _mm_xor_##si(q1, notv);            /* 255 - q1 */                                     \
2901
+    stage1 = _mm_avg_##x##u8(p1, q1minus);       /* (p1 - q1 + 256)>>1 */                           \
2902
+    stage2 = _mm_srli_##x##i16(stage1, 1);                                                          \
2903
+    stage2 = _mm_and_##si(stage2, c127);         /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
2904
+    p0minus = _mm_xor_##si(p0, notv);            /* 255 - p0 */                                     \
2905
+    stage1 = _mm_avg_##x##u8(q0, p0minus);       /* (q0 - p0 + 256)>>1 */                           \
2906
+    pq0bit = _mm_and_##si(pq0bit, _mm_set1_##x##i8(1));                                             \
2907
+    stage2 = _mm_avg_##x##u8(stage2, pq0bit);    /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
2908
+    stage2 = _mm_adds_##x##u8(stage2, stage1);   /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
2909
+    deltaneg = _mm_subs_##x##u8(vec160, stage2); /* -d */                                           \
2910
+    delta = _mm_subs_##x##u8(stage2, vec160);    /*  d */                                           \
2911
+    deltaneg = _mm_min_##x##u8(tc0masked, deltaneg);                                                \
2912
+    delta = _mm_min_##x##u8(tc0masked, delta);                                                      \
2913
+    p0 = _mm_subs_##x##u8(p0, deltaneg);                                                            \
2914
+    q0 = _mm_subs_##x##u8(q0, delta);                                                               \
2915
+    p0 = _mm_adds_##x##u8(p0, delta);                                                               \
2916
+    q0 = _mm_adds_##x##u8(q0, deltaneg);                                                            \
2917
+}
2918
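h264_deblock_p0_q0 keeps the whole delta computation in unsigned bytes: the complement-and-average steps build an estimate biased around 160, the positive and negative halves of the delta are each clipped against tc0 with a min, and the edge pixels are corrected with saturating add/sub. The scalar reference this byte-domain staging is meant to reproduce is the normal (bS < 4) H.264 edge filter; helper names below are illustrative:

    #include <stdint.h>

    static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }
    static uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* delta = clip3((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc) */
    static void deblock_p0_q0_scalar(uint8_t *p0, uint8_t p1,
                                     uint8_t *q0, uint8_t q1, int tc)
    {
        int delta = clip3((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = clip_u8(*p0 + delta);
        *q0 = clip_u8(*q0 - delta);
    }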
+
2919
+#define h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {                \
2920
+    vec_u8 alphavec, betavec, mask, p1mask, q1mask;                                          \
2921
+    vec_s8 tc0vec;                                                                           \
2922
+    vec_u8 finaltc0, tc0masked, newp1, newq1;                                                \
2923
+                                                                                             \
2924
+    betavec = _mm_set1_epi8(beta - 128);                                                     \
2925
+    alphavec = _mm_set1_epi8(alpha - 128);                                                   \
2926
+    mask = deblock_mask16(p0, p1, q0, q1, alphavec, betavec); /* if in block */              \
2927
+                                                                                             \
2928
+    tc0vec = _mm_cvtsi32_si128(*(uint32_t*)tc0);                                             \
2929
+    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec);                                              \
2930
+    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec);                                              \
2931
+    mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec);                               \
2932
+    finaltc0 = _mm_and_si128(tc0vec, mask);                /* if (tc0[i] >= 0) tc = tc0 */   \
2933
+                                                                                             \
2934
+    p1mask = diff_lt16(p2, p0, betavec);                                                     \
2935
+    p1mask = _mm_and_si128(p1mask, mask);                       /* if(|p2 - p0| < beta) */   \
2936
+    tc0masked = _mm_and_si128(p1mask, tc0vec);                                               \
2937
+    finaltc0 = _mm_sub_epi8(finaltc0, p1mask);                  /* tc++ */                   \
2938
+    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
2939
+    /*end if*/                                                                               \
2940
+                                                                                             \
2941
+    q1mask = diff_lt16(q2, q0, betavec);                                                     \
2942
+    q1mask = _mm_and_si128(q1mask, mask);                       /* if(|q2 - q0| < beta) */   \
2943
+    tc0masked = _mm_and_si128(q1mask, tc0vec);                                               \
2944
+    finaltc0 = _mm_sub_epi8(finaltc0, q1mask);                  /* tc++ */                   \
2945
+    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
2946
+    /*end if*/                                                                               \
2947
+                                                                                             \
2948
+    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m128i, ep, si128);                        \
2949
+    p1 = newp1;                                                                              \
2950
+    q1 = newq1;                                                                              \
2951
+}
2952
+
2953
+#define h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0) {                     \
2954
+    __m64 alphavec, betavec, mask, tc0vec, finaltc0;                                         \
2955
+                                                                                             \
2956
+    betavec = _mm_set1_pi8(beta - 128);                                                      \
2957
+    alphavec = _mm_set1_pi8(alpha - 128);                                                    \
2958
+    mask = deblock_mask8(p0, p1, q0, q1, alphavec, betavec); /* if in block */               \
2959
+                                                                                             \
2960
+    tc0vec = _mm_cvtsi32_si64(*(uint32_t*)tc0);                                              \
2961
+    tc0vec = _mm_unpacklo_pi8(tc0vec, tc0vec);                                               \
2962
+    mask = _mm_blendv_pi8(mask, _mm_setzero_si64(), tc0vec);                                 \
2963
+    finaltc0 = _mm_and_si64(tc0vec, mask);                /* if (tc0[i] >= 0) tc = tc0 */    \
2964
+                                                                                             \
2965
+    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m64, p, si64);                            \
2966
+}
2967
+
2968
+#define h264_loop_filter_chroma4_e2k(p1, p0, q0, q1, alpha, beta, tc0) {                     \
2969
+    __m128i alphavec, betavec, mask, tc0vec, finaltc0;                                       \
2970
+                                                                                             \
2971
+    betavec = _mm_set1_epi8(beta - 128);                                                     \
2972
+    alphavec = _mm_set1_epi8(alpha - 128);                                                   \
2973
+    mask = deblock_mask16(p0, p1, q0, q1, alphavec, betavec); /* if in block */              \
2974
+                                                                                             \
2975
+    tc0vec = _mm_cvtsi32_si128(*(uint32_t*)tc0);                                             \
2976
+    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec);                                              \
2977
+    tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec);                                              \
2978
+    mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec);                               \
2979
+    finaltc0 = _mm_and_si128(tc0vec, mask);                /* if (tc0[i] >= 0) tc = tc0 */   \
2980
+                                                                                             \
2981
+    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, __m128i, ep, si128);                        \
2982
+}
2983
+
2984
+static void h264_v_loop_filter_luma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
2985
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
2986
+        vec_u8 p2, p1, p0, q0, q1, q2;
2987
+        p2 = VEC_LD(pix - 3 * stride);
2988
+        p1 = VEC_LD(pix - 2 * stride);
2989
+        p0 = VEC_LD(pix - stride);
2990
+        q0 = VEC_LD(pix);
2991
+        q1 = VEC_LD(pix + stride);
2992
+        q2 = VEC_LD(pix + 2 * stride);
2993
+        h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
2994
+        VEC_ST(pix - 2 * stride, p1);
2995
+        VEC_ST(pix - 1 * stride, p0);
2996
+        VEC_ST(pix, q0);
2997
+        VEC_ST(pix + stride, q1);
2998
+    }
2999
+}
3000
+
3001
+static void h264_v_loop_filter_chroma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
3002
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
3003
+        __m64 p1, p0, q0, q1;
3004
+        p1 = *(__m64*)(pix - 2 * stride);
3005
+        p0 = *(__m64*)(pix - stride);
3006
+        q0 = *(__m64*)pix;
3007
+        q1 = *(__m64*)(pix + stride);
3008
+        h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0);
3009
+        *(__m64*)(pix - 1 * stride) = p0;
3010
+        *(__m64*)pix = q0;
3011
+    }
3012
+}
3013
+
3014
+static void h264_h_loop_filter_luma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
3015
+    vec_u8 p2, p1, p0, q0, q1, q2;
3016
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
3017
+    read_and_transpose16x6(pix - 3, stride, p2, p1, p0, q0, q1, q2);
3018
+    h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
3019
+    transpose4x16(p1, p0, q0, q1);
3020
+    write16x4(pix - 2, stride, p1, p0, q0, q1);
3021
+}
3022
+
3023
+static void h264_h_loop_filter_chroma_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
3024
+    __m64 p1, p0, q0, q1;
3025
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
3026
+    read_and_transpose8x4(pix - 2, stride, p1, p0, q0, q1);
3027
+    h264_loop_filter_chroma2_e2k(p1, p0, q0, q1, alpha, beta, tc0);
3028
+    p1 = _mm_unpacklo_pi8(p0, q0);
3029
+    q1 = _mm_unpackhi_pi8(p0, q0);
3030
+#define WRITE2(v, i) *(uint16_t*)(pix + i * stride) = _mm_extract_pi16(v, i);
3031
+    pix--;
3032
+    WRITE2(p1, 0)
3033
+    WRITE2(p1, 1)
3034
+    WRITE2(p1, 2)
3035
+    WRITE2(p1, 3)
3036
+    pix += stride * 4;
3037
+    WRITE2(q1, 0)
3038
+    WRITE2(q1, 1)
3039
+    WRITE2(q1, 2)
3040
+    WRITE2(q1, 3)
3041
+#undef WRITE2
3042
+}
3043
+
3044
+static void h264_h_loop_filter_chroma422_e2k(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0) {
3045
+    __m128i p1, p0, q0, q1;
3046
+    __m64 p1l, p0l, q0l, q1l, p1h, p0h, q0h, q1h;
3047
+    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
3048
+    read_and_transpose8x4(pix - 2, stride, p1l, p0l, q0l, q1l);
3049
+    read_and_transpose8x4(pix - 2 + stride * 8, stride, p1h, p0h, q0h, q1h);
3050
+    p1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(p1l), _mm_movpi64_epi64(p1h));
3051
+    p0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(p0l), _mm_movpi64_epi64(p0h));
3052
+    q0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(q0l), _mm_movpi64_epi64(q0h));
3053
+    q1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(q1l), _mm_movpi64_epi64(q1h));
3054
+    h264_loop_filter_chroma4_e2k(p1, p0, q0, q1, alpha, beta, tc0);
3055
+    transpose4x16(p1, p0, q0, q1);
3056
+    write16x4(pix - 2, stride, p1, p0, q0, q1);
3057
+}
3058
+
3059
+static void weight_h264_pixels16_e2k(uint8_t *block, ptrdiff_t stride, int height,
3060
+                                     int log2_denom, int weight, int offset)
3061
+{
3062
+    int y;
3063
+    vec_u8 vblock;
3064
+    vec_s16 vweight, voffset, v0, v1;
3065
+    LOAD_ZERO;
3066
+
3067
+    offset <<= log2_denom;
3068
+    if (log2_denom) offset += 1 << (log2_denom - 1);
3069
+
3070
+    vweight = _mm_set1_epi16(weight);
3071
+    voffset = _mm_set1_epi16(offset);
3072
+
3073
+    PRAGMA_E2K("ivdep")
3074
+    for (y = 0; y < height; y++) {
3075
+        vblock = VEC_LD(block);
3076
+        v0 = _mm_unpacklo_epi8(vblock, zerov);
3077
+        v1 = _mm_unpackhi_epi8(vblock, zerov);
3078
+
3079
+        v0 = _mm_mullo_epi16(v0, vweight);
3080
+        v1 = _mm_mullo_epi16(v1, vweight);
3081
+        v0 = _mm_adds_epi16(v0, voffset);
3082
+        v1 = _mm_adds_epi16(v1, voffset);
3083
+        v0 = _mm_srai_epi16(v0, log2_denom);
3084
+        v1 = _mm_srai_epi16(v1, log2_denom);
3085
+
3086
+        vblock = _mm_packus_epi16(v0, v1);
3087
+        VEC_ST(block, vblock);
3088
+        block += stride;
3089
+    }
3090
+}
3091
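weight_h264_pixels16/8 implement explicit weighted prediction: samples are widened to 16 bits, multiplied by the weight, the offset (pre-scaled by log2_denom, with the rounding term folded in) is added with saturation, and the result is shifted back down and packed with unsigned saturation. A per-sample scalar model (illustrative name; the 16-bit saturation of the adds is ignored here):

    #include <stdint.h>

    static uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    static uint8_t weight_sample(uint8_t px, int log2_denom, int weight, int offset)
    {
        /* offset is pre-scaled once per call in the vector code */
        int off = (offset << log2_denom) + (log2_denom ? 1 << (log2_denom - 1) : 0);
        return clip_u8((px * weight + off) >> log2_denom);
    }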
+
3092
+static void weight_h264_pixels8_e2k(uint8_t *block, ptrdiff_t stride, int height,
3093
+                                    int log2_denom, int weight, int offset)
3094
+{
3095
+    int y;
3096
+    vec_u8 vblock;
3097
+    vec_s16 vweight, voffset, v0;
3098
+    LOAD_ZERO;
3099
+
3100
+    offset <<= log2_denom;
3101
+    if (log2_denom) offset += 1 << (log2_denom - 1);
3102
+
3103
+    vweight = _mm_set1_epi16(weight);
3104
+    voffset = _mm_set1_epi16(offset);
3105
+
3106
+    PRAGMA_E2K("ivdep")
3107
+    for (y = 0; y < height; y++) {
3108
+        vblock = VEC_LD8(block);
3109
+        v0 = _mm_unpacklo_epi8(vblock, zerov);
3110
+
3111
+        v0 = _mm_mullo_epi16(v0, vweight);
3112
+        v0 = _mm_adds_epi16(v0, voffset);
3113
+        v0 = _mm_srai_epi16(v0, log2_denom);
3114
+
3115
+        vblock = _mm_packus_epi16(v0, v0);
3116
+        VEC_STL(block, vblock);
3117
+        block += stride;
3118
+    }
3119
+}
3120
+
3121
+static void biweight_h264_pixels16_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height,
3122
+                                       int log2_denom, int weightd, int weights, int offset)
3123
+{
3124
+    int y;
3125
+    vec_u8 vsrc, vdst;
3126
+    vec_s16 vweights, vweightd, voffset, v0, v1, v2, v3;
3127
+    LOAD_ZERO;
3128
+
3129
+    offset = ((offset + 1) | 1) << log2_denom;
3130
+    vweights = _mm_set1_epi16(weights);
3131
+    vweightd = _mm_set1_epi16(weightd);
3132
+    voffset = _mm_set1_epi16(offset);
3133
+
3134
+    PRAGMA_E2K("ivdep")
3135
+    for (y = 0; y < height; y++) {
3136
+        vdst = VEC_LD(dst);
3137
+        vsrc = VEC_LD(src);
3138
+        v0 = _mm_unpacklo_epi8(vdst, zerov);
3139
+        v1 = _mm_unpackhi_epi8(vdst, zerov);
3140
+        v2 = _mm_unpacklo_epi8(vsrc, zerov);
3141
+        v3 = _mm_unpackhi_epi8(vsrc, zerov);
3142
+
3143
+        v0 = _mm_mullo_epi16(v0, vweightd);
3144
+        v1 = _mm_mullo_epi16(v1, vweightd);
3145
+        v2 = _mm_mullo_epi16(v2, vweights);
3146
+        v3 = _mm_mullo_epi16(v3, vweights);
3147
+        v0 = _mm_adds_epi16(v0, voffset);
3148
+        v1 = _mm_adds_epi16(v1, voffset);
3149
+        v0 = _mm_adds_epi16(v0, v2);
3150
+        v1 = _mm_adds_epi16(v1, v3);
3151
+        v0 = _mm_srai_epi16(v0, log2_denom + 1);
3152
+        v1 = _mm_srai_epi16(v1, log2_denom + 1);
3153
+
3154
+        vdst = _mm_packus_epi16(v0, v1);
3155
+        VEC_ST(dst, vdst);
3156
+        dst += stride;
3157
+        src += stride;
3158
+    }
3159
+}
3160
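The biweight variants blend two sources the same way: both samples are weighted in 16-bit lanes, a single constant ((offset + 1) | 1) << log2_denom carries the combined offset and rounding, and the sum is shifted by log2_denom + 1. Per-sample scalar model (illustrative name; 16-bit saturation of the adds ignored):

    #include <stdint.h>

    static uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    static uint8_t biweight_sample(uint8_t dst, uint8_t src, int log2_denom,
                                   int weightd, int weights, int offset)
    {
        int off = ((offset + 1) | 1) << log2_denom;   /* offset + rounding */
        return clip_u8((dst * weightd + src * weights + off) >> (log2_denom + 1));
    }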
+
3161
+static void biweight_h264_pixels8_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height,
3162
+                                      int log2_denom, int weightd, int weights, int offset)
3163
+{
3164
+    int y;
3165
+    vec_u8 vsrc, vdst;
3166
+    vec_s16 vweights, vweightd, voffset, v0, v2;
3167
+    LOAD_ZERO;
3168
+
3169
+    offset = ((offset + 1) | 1) << log2_denom;
3170
+    vweights = _mm_set1_epi16(weights);
3171
+    vweightd = _mm_set1_epi16(weightd);
3172
+    voffset = _mm_set1_epi16(offset);
3173
+
3174
+    PRAGMA_E2K("ivdep")
3175
+    for (y = 0; y < height; y++) {
3176
+        vdst = VEC_LD8(dst);
3177
+        vsrc = VEC_LD8(src);
3178
+        v0 = _mm_unpacklo_epi8(vdst, zerov);
3179
+        v2 = _mm_unpacklo_epi8(vsrc, zerov);
3180
+
3181
+        v0 = _mm_mullo_epi16(v0, vweightd);
3182
+        v2 = _mm_mullo_epi16(v2, vweights);
3183
+        v0 = _mm_adds_epi16(v0, voffset);
3184
+        v0 = _mm_adds_epi16(v0, v2);
3185
+        v0 = _mm_srai_epi16(v0, log2_denom + 1);
3186
+
3187
+        vdst = _mm_packus_epi16(v0, v0);
3188
+        VEC_STL(dst, vdst);
3189
+        dst += stride;
3190
+        src += stride;
3191
+    }
3192
+}
3193
+
3194
+av_cold void ff_h264dsp_init_e2k(H264DSPContext *c, const int bit_depth,
3195
+                                 const int chroma_format_idc)
3196
+{
3197
+    if (!E2K_BASE(av_get_cpu_flags()))
3198
+        return;
3199
+
3200
+    if (bit_depth == 8) {
3201
+        c->h264_idct_add = h264_idct_add_e2k;
3202
+        if (chroma_format_idc <= 1)
3203
+            c->h264_idct_add8 = h264_idct_add8_e2k; // !checkasm
3204
+
3205
+        c->h264_idct_add16      = h264_idct_add16_e2k;
3206
+        c->h264_idct_add16intra = h264_idct_add16intra_e2k;
3207
+        c->h264_idct_dc_add     = h264_idct_dc_add_e2k;
3208
+        c->h264_idct8_dc_add    = h264_idct8_dc_add_e2k;
3209
+        c->h264_idct8_add       = h264_idct8_add_e2k;
3210
+        c->h264_idct8_add4      = h264_idct8_add4_e2k;
3211
+        c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_e2k;
3212
+        c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_e2k;
3213
+        c->h264_v_loop_filter_chroma = h264_v_loop_filter_chroma_e2k;
3214
+        if (chroma_format_idc <= 1) {
3215
+            c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma_e2k;
3216
+        } else {
3217
+            c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma422_e2k;
3218
+        }
3219
+        c->weight_h264_pixels_tab[0]   = weight_h264_pixels16_e2k;   // !checkasm
3220
+        c->weight_h264_pixels_tab[1]   = weight_h264_pixels8_e2k;    // !checkasm
3221
+        c->biweight_h264_pixels_tab[0] = biweight_h264_pixels16_e2k; // !checkasm
3222
+        c->biweight_h264_pixels_tab[1] = biweight_h264_pixels8_e2k;  // !checkasm
3223
+    }
3224
+}
3225
diff --git a/libavcodec/e2k/h264qpel.c b/libavcodec/e2k/h264qpel.c
3226
new file mode 100644
3227
index 0000000..f8fe094
3228
--- /dev/null
3229
+++ b/libavcodec/e2k/h264qpel.c
3230
@@ -0,0 +1,255 @@
3231
+/*
3232
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
3233
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3234
+ *
3235
+ * This file is part of FFmpeg.
3236
+ *
3237
+ * FFmpeg is free software; you can redistribute it and/or
3238
+ * modify it under the terms of the GNU Lesser General Public
3239
+ * License as published by the Free Software Foundation; either
3240
+ * version 2.1 of the License, or (at your option) any later version.
3241
+ *
3242
+ * FFmpeg is distributed in the hope that it will be useful,
3243
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3244
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3245
+ * Lesser General Public License for more details.
3246
+ *
3247
+ * You should have received a copy of the GNU Lesser General Public
3248
+ * License along with FFmpeg; if not, write to the Free Software
3249
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3250
+ */
3251
+
3252
+#include "config.h"
3253
+
3254
+#include "libavutil/attributes.h"
3255
+#include "libavutil/cpu.h"
3256
+#include "libavutil/intreadwrite.h"
3257
+#include "libavutil/e2k/cpu.h"
3258
+#include "libavutil/e2k/util_e2k.h"
3259
+
3260
+#include "libavcodec/h264qpel.h"
3261
+
3262
+#include "hpeldsp.h"
3263
+
3264
+#define PUT_OP_U8_E2K(d, s, dst) d = s
3265
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s)
3266
+
3267
+#define OP_U8_E2K                          PUT_OP_U8_E2K
3268
+#define PREFIX_h264_qpel16_h_lowpass_e2k   put_h264_qpel16_h_lowpass_e2k
3269
+#define PREFIX_h264_qpel16_v_lowpass_e2k   put_h264_qpel16_v_lowpass_e2k
3270
+#define PREFIX_h264_qpel16_hv_lowpass_e2k  put_h264_qpel16_hv_lowpass_e2k
3271
+#include "h264qpel_template.c"
3272
+#undef OP_U8_E2K
3273
+#undef PREFIX_h264_qpel16_h_lowpass_e2k
3274
+#undef PREFIX_h264_qpel16_v_lowpass_e2k
3275
+#undef PREFIX_h264_qpel16_hv_lowpass_e2k
3276
+
3277
+#define OP_U8_E2K                          AVG_OP_U8_E2K
3278
+#define PREFIX_h264_qpel16_h_lowpass_e2k   avg_h264_qpel16_h_lowpass_e2k
3279
+#define PREFIX_h264_qpel16_v_lowpass_e2k   avg_h264_qpel16_v_lowpass_e2k
3280
+#define PREFIX_h264_qpel16_hv_lowpass_e2k  avg_h264_qpel16_hv_lowpass_e2k
3281
+#include "h264qpel_template.c"
3282
+#undef OP_U8_E2K
3283
+#undef PREFIX_h264_qpel16_h_lowpass_e2k
3284
+#undef PREFIX_h264_qpel16_v_lowpass_e2k
3285
+#undef PREFIX_h264_qpel16_hv_lowpass_e2k
3286
+
3287
+#define H264_MC(OPNAME, SIZE, CODETYPE) \
3288
+static void OPNAME##h264_qpel##SIZE##_mc00_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3289
+{\
3290
+    ff_##OPNAME##pixels##SIZE##_##CODETYPE(dst, src, stride, SIZE);\
3291
+}\
3292
+\
3293
+static void OPNAME##h264_qpel##SIZE##_mc10_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3294
+{ \
3295
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
3296
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(half, src, SIZE, stride);\
3297
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src, half, stride, stride, SIZE);\
3298
+}\
3299
+\
3300
+static void OPNAME##h264_qpel##SIZE##_mc20_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3301
+{\
3302
+    OPNAME##h264_qpel##SIZE##_h_lowpass_##CODETYPE(dst, src, stride, stride);\
3303
+}\
3304
+\
3305
+static void OPNAME##h264_qpel##SIZE##_mc30_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3306
+{\
3307
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
3308
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(half, src, SIZE, stride);\
3309
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src+1, half, stride, stride, SIZE);\
3310
+}\
3311
+\
3312
+static void OPNAME##h264_qpel##SIZE##_mc01_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3313
+{\
3314
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
3315
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(half, src, SIZE, stride);\
3316
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src, half, stride, stride, SIZE);\
3317
+}\
3318
+\
3319
+static void OPNAME##h264_qpel##SIZE##_mc02_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3320
+{\
3321
+    OPNAME##h264_qpel##SIZE##_v_lowpass_##CODETYPE(dst, src, stride, stride);\
3322
+}\
3323
+\
3324
+static void OPNAME##h264_qpel##SIZE##_mc03_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3325
+{\
3326
+    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
3327
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(half, src, SIZE, stride);\
3328
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
3329
+}\
3330
+\
3331
+static void OPNAME##h264_qpel##SIZE##_mc11_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3332
+{\
3333
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3334
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3335
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\
3336
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\
3337
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
3338
+}\
3339
+\
3340
+static void OPNAME##h264_qpel##SIZE##_mc31_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3341
+{\
3342
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3343
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3344
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\
3345
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\
3346
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
3347
+}\
3348
+\
3349
+static void OPNAME##h264_qpel##SIZE##_mc13_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3350
+{\
3351
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3352
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3353
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\
3354
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\
3355
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
3356
+}\
3357
+\
3358
+static void OPNAME##h264_qpel##SIZE##_mc33_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3359
+{\
3360
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3361
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3362
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\
3363
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\
3364
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
3365
+}\
3366
+\
3367
+static void OPNAME##h264_qpel##SIZE##_mc22_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3368
+{\
3369
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
3370
+    OPNAME##h264_qpel##SIZE##_hv_lowpass_##CODETYPE(dst, tmp, src, stride, SIZE, stride);\
3371
+}\
3372
+\
3373
+static void OPNAME##h264_qpel##SIZE##_mc21_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3374
+{\
3375
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3376
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
3377
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
3378
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src, SIZE, stride);\
3379
+    put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
3380
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
3381
+}\
3382
+\
3383
+static void OPNAME##h264_qpel##SIZE##_mc23_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3384
+{\
3385
+    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
3386
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
3387
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
3388
+    put_h264_qpel##SIZE##_h_lowpass_##CODETYPE(halfH, src + stride, SIZE, stride);\
3389
+    put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
3390
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
3391
+}\
3392
+\
3393
+static void OPNAME##h264_qpel##SIZE##_mc12_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3394
+{\
3395
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3396
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
3397
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
3398
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src, SIZE, stride);\
3399
+    put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
3400
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
3401
+}\
3402
+\
3403
+static void OPNAME##h264_qpel##SIZE##_mc32_##CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
3404
+{\
3405
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
3406
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
3407
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
3408
+    put_h264_qpel##SIZE##_v_lowpass_##CODETYPE(halfV, src+1, SIZE, stride);\
3409
+    put_h264_qpel##SIZE##_hv_lowpass_##CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
3410
+    OPNAME##pixels##SIZE##_l2_##CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
3411
+}\
3412
+
3413
+#if 1
3414
+static av_always_inline void put_pixels16_l2_e2k(uint8_t *dst, const uint8_t *src1,
3415
+                                                 const uint8_t *src2, int dst_stride,
3416
+                                                 int src_stride1, int h)
3417
+{
3418
+    int i;
3419
+    vec_u8 a, b, d;
3420
+
3421
+    for (i = 0; i < h; i++) {
3422
+        a = VEC_LD(src1 + i * src_stride1);
3423
+        b = VEC_LD(src2 + i * 16);
3424
+        d = _mm_avg_epu8(a, b);
3425
+        VEC_ST(dst, d);
3426
+        dst += dst_stride;
3427
+    }
3428
+}
3429
+
3430
+static av_always_inline void avg_pixels16_l2_e2k(uint8_t *dst, const uint8_t *src1,
3431
+                                                 const uint8_t *src2, int dst_stride,
3432
+                                                 int src_stride1, int h)
3433
+{
3434
+    int i;
3435
+    vec_u8 a, b, d;
3436
+
3437
+    for (i = 0; i < h; i++) {
3438
+        a = VEC_LD(src1 + i * src_stride1);
3439
+        b = VEC_LD(src2 + i * 16);
3440
+        d = _mm_avg_epu8(a, b);
3441
+        a = _mm_avg_epu8(VEC_LD(dst), d);
3442
+        VEC_ST(dst, a);
3443
+        dst += dst_stride;
3444
+    }
3445
+}
3446
+
3447
+#else // Implemented but could be faster
3448
+#define put_pixels16_l2_e2k(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
3449
+#define avg_pixels16_l2_e2k(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
3450
+#endif
3451
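The _l2 helpers blend the full-pel source with a precomputed half-pel plane using the rounded byte average (a + b + 1) >> 1; the avg variant then averages once more with the existing destination. Per-sample scalar model (illustrative names):

    #include <stdint.h>

    static uint8_t avg_u8(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } /* pavgb */

    static uint8_t put_l2_sample(uint8_t s1, uint8_t s2)
    {
        return avg_u8(s1, s2);
    }

    static uint8_t avg_l2_sample(uint8_t dst, uint8_t s1, uint8_t s2)
    {
        return avg_u8(dst, avg_u8(s1, s2));
    }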
+
3452
+H264_MC(put_, 16, e2k)
3453
+H264_MC(avg_, 16, e2k)
3454
+
3455
+av_cold void ff_h264qpel_init_e2k(H264QpelContext *c, int bit_depth)
3456
+{
3457
+    const int high_bit_depth = bit_depth > 8;
3458
+
3459
+    if (!E2K_BASE(av_get_cpu_flags()))
3460
+        return;
3461
+
3462
+    if (!high_bit_depth) {
3463
+#define dspfunc(PFX, IDX, NUM) \
3464
+        c->PFX##_pixels_tab[IDX][ 0] = PFX##NUM##_mc00_e2k; \
3465
+        c->PFX##_pixels_tab[IDX][ 1] = PFX##NUM##_mc10_e2k; \
3466
+        c->PFX##_pixels_tab[IDX][ 2] = PFX##NUM##_mc20_e2k; \
3467
+        c->PFX##_pixels_tab[IDX][ 3] = PFX##NUM##_mc30_e2k; \
3468
+        c->PFX##_pixels_tab[IDX][ 4] = PFX##NUM##_mc01_e2k; \
3469
+        c->PFX##_pixels_tab[IDX][ 5] = PFX##NUM##_mc11_e2k; \
3470
+        c->PFX##_pixels_tab[IDX][ 6] = PFX##NUM##_mc21_e2k; \
3471
+        c->PFX##_pixels_tab[IDX][ 7] = PFX##NUM##_mc31_e2k; \
3472
+        c->PFX##_pixels_tab[IDX][ 8] = PFX##NUM##_mc02_e2k; \
3473
+        c->PFX##_pixels_tab[IDX][ 9] = PFX##NUM##_mc12_e2k; \
3474
+        c->PFX##_pixels_tab[IDX][10] = PFX##NUM##_mc22_e2k; \
3475
+        c->PFX##_pixels_tab[IDX][11] = PFX##NUM##_mc32_e2k; \
3476
+        c->PFX##_pixels_tab[IDX][12] = PFX##NUM##_mc03_e2k; \
3477
+        c->PFX##_pixels_tab[IDX][13] = PFX##NUM##_mc13_e2k; \
3478
+        c->PFX##_pixels_tab[IDX][14] = PFX##NUM##_mc23_e2k; \
3479
+        c->PFX##_pixels_tab[IDX][15] = PFX##NUM##_mc33_e2k
3480
+
3481
+        dspfunc(put_h264_qpel, 0, 16);
3482
+        dspfunc(avg_h264_qpel, 0, 16);
3483
+#undef dspfunc
3484
+    }
3485
+}
3486
diff --git a/libavcodec/e2k/h264qpel_template.c b/libavcodec/e2k/h264qpel_template.c
3487
new file mode 100644
3488
index 0000000..bbd6516
3489
--- /dev/null
3490
+++ b/libavcodec/e2k/h264qpel_template.c
3491
@@ -0,0 +1,354 @@
3492
+/*
3493
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
3494
+ * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3495
+ *
3496
+ * This file is part of FFmpeg.
3497
+ *
3498
+ * FFmpeg is free software; you can redistribute it and/or
3499
+ * modify it under the terms of the GNU Lesser General Public
3500
+ * License as published by the Free Software Foundation; either
3501
+ * version 2.1 of the License, or (at your option) any later version.
3502
+ *
3503
+ * FFmpeg is distributed in the hope that it will be useful,
3504
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3505
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3506
+ * Lesser General Public License for more details.
3507
+ *
3508
+ * You should have received a copy of the GNU Lesser General Public
3509
+ * License along with FFmpeg; if not, write to the Free Software
3510
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3511
+ */
3512
+
3513
+#include "config.h"
3514
+#if HAVE_UNISTD_H
3515
+#include <unistd.h>
3516
+#endif
3517
+
3518
+#include "libavutil/avassert.h"
3519
+#include "libavutil/mem.h"
3520
+#include "libavutil/e2k/util_e2k.h"
3521
+
3522
+#define load_alignment() { \
3523
+    srcM2 = VEC_LD(src - 2); \
3524
+    srcM1 = VEC_LD(src - 1); \
3525
+    srcP0 = VEC_LD(src); \
3526
+    srcP1 = VEC_LD(src + 1); \
3527
+    srcP2 = VEC_LD(src + 2); \
3528
+    srcP3 = VEC_LD(src + 3); \
3529
+}
3530
+
3531
+/* this code assumes stride % 16 == 0 */
3532
+#ifdef PREFIX_h264_qpel16_h_lowpass_e2k
3533
+static void PREFIX_h264_qpel16_h_lowpass_e2k(uint8_t *dst,
3534
+                                             const uint8_t *src,
3535
+                                             int dstStride, int srcStride)
3536
+{
3537
+    int i;
3538
+
3539
+    LOAD_ZERO;
3540
+    const vec_s16 v5ss = _mm_set1_epi16(5);
3541
+    const vec_s16 v20ss = _mm_set1_epi16(20);
3542
+    const vec_s16 v16ss = _mm_set1_epi16(16);
3543
+
3544
+    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
3545
+    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
3546
+              srcP2A, srcP2B, srcP3A, srcP3B,
3547
+              srcM1A, srcM1B, srcM2A, srcM2B,
3548
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
3549
+              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
3550
+              sumA, sumB;
3551
+    vec_u8 sum, fsum;
3552
+
3553
+    PRAGMA_E2K("ivdep")
3554
+    for (i = 0; i < 16; i++) {
3555
+        load_alignment();
3556
+
3557
+        srcP0A = _mm_unpacklo_epi8(srcP0, zerov);
3558
+        srcP0B = _mm_unpackhi_epi8(srcP0, zerov);
3559
+        srcP1A = _mm_unpacklo_epi8(srcP1, zerov);
3560
+        srcP1B = _mm_unpackhi_epi8(srcP1, zerov);
3561
+
3562
+        srcP2A = _mm_unpacklo_epi8(srcP2, zerov);
3563
+        srcP2B = _mm_unpackhi_epi8(srcP2, zerov);
3564
+        srcP3A = _mm_unpacklo_epi8(srcP3, zerov);
3565
+        srcP3B = _mm_unpackhi_epi8(srcP3, zerov);
3566
+
3567
+        srcM1A = _mm_unpacklo_epi8(srcM1, zerov);
3568
+        srcM1B = _mm_unpackhi_epi8(srcM1, zerov);
3569
+        srcM2A = _mm_unpacklo_epi8(srcM2, zerov);
3570
+        srcM2B = _mm_unpackhi_epi8(srcM2, zerov);
3571
+
3572
+        sum1A = _mm_adds_epi16(srcP0A, srcP1A);
3573
+        sum1B = _mm_adds_epi16(srcP0B, srcP1B);
3574
+        sum2A = _mm_adds_epi16(srcM1A, srcP2A);
3575
+        sum2B = _mm_adds_epi16(srcM1B, srcP2B);
3576
+        sum3A = _mm_adds_epi16(srcM2A, srcP3A);
3577
+        sum3B = _mm_adds_epi16(srcM2B, srcP3B);
3578
+
3579
+        pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), v16ss);
3580
+        pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), v16ss);
3581
+        pp2A = _mm_mullo_epi16(sum2A, v5ss);
3582
+        pp2B = _mm_mullo_epi16(sum2B, v5ss);
3583
+        pp3A = _mm_add_epi16(sum3A, pp1A);
3584
+        pp3B = _mm_add_epi16(sum3B, pp1B);
3585
+        sumA = _mm_sub_epi16(pp3A, pp2A);
3586
+        sumB = _mm_sub_epi16(pp3B, pp2B);
3587
+        sumA = _mm_srai_epi16(sumA, 5);
3588
+        sumB = _mm_srai_epi16(sumB, 5);
3589
+        sum = _mm_packus_epi16(sumA, sumB);
3590
+
3591
+        OP_U8_E2K(fsum, sum, VEC_LD(dst));
3592
+        VEC_ST(dst, fsum);
3593
+
3594
+        src += srcStride;
3595
+        dst += dstStride;
3596
+    }
3597
+}
3598
+#endif /* PREFIX_h264_qpel16_h_lowpass_e2k */
3599
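The h/v lowpass routines are the H.264 6-tap half-pel filter with taps (1, -5, 20, 20, -5, 1): neighbours are summed pairwise, the pairs are scaled by 20 and 5 in 16-bit lanes, and the result is rounded by 16 and shifted right by 5 before packing with unsigned saturation (the vertical pass below applies the same taps along the stride). One output sample in scalar form (illustrative name):

    #include <stdint.h>

    static uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* step = 1 for the horizontal pass, step = srcStride for the vertical one */
    static uint8_t qpel_lowpass_sample(const uint8_t *src, int step)
    {
        int sum1 = src[0]         + src[step];        /* taps 20, 20 */
        int sum2 = src[-step]     + src[2 * step];    /* taps -5, -5 */
        int sum3 = src[-2 * step] + src[3 * step];    /* taps  1,  1 */
        return clip_u8((20 * sum1 - 5 * sum2 + sum3 + 16) >> 5);
    }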
+
3600
+/* this code assumes stride % 16 == 0 */
3601
+#ifdef PREFIX_h264_qpel16_v_lowpass_e2k
3602
+static void PREFIX_h264_qpel16_v_lowpass_e2k(uint8_t *dst,
3603
+                                             const uint8_t *src,
3604
+                                             int dstStride, int srcStride)
3605
+{
3606
+    int i;
3607
+
3608
+    LOAD_ZERO;
3609
+    const vec_s16 v20ss = _mm_set1_epi16(20);
3610
+    const vec_s16 v5ss = _mm_set1_epi16(5);
3611
+    const vec_s16 v16ss = _mm_set1_epi16(16);
3612
+
3613
+    const vec_u8 srcM2 = VEC_LD(src - srcStride * 2);
3614
+    const vec_u8 srcM1 = VEC_LD(src - srcStride);
3615
+    const vec_u8 srcP0 = VEC_LD(src);
3616
+    const vec_u8 srcP1 = VEC_LD(src + srcStride);
3617
+    const vec_u8 srcP2 = VEC_LD(src + srcStride * 2);
3618
+
3619
+    vec_s16 srcM2ssA = _mm_unpacklo_epi8(srcM2, zerov);
3620
+    vec_s16 srcM2ssB = _mm_unpackhi_epi8(srcM2, zerov);
3621
+    vec_s16 srcM1ssA = _mm_unpacklo_epi8(srcM1, zerov);
3622
+    vec_s16 srcM1ssB = _mm_unpackhi_epi8(srcM1, zerov);
3623
+    vec_s16 srcP0ssA = _mm_unpacklo_epi8(srcP0, zerov);
3624
+    vec_s16 srcP0ssB = _mm_unpackhi_epi8(srcP0, zerov);
3625
+    vec_s16 srcP1ssA = _mm_unpacklo_epi8(srcP1, zerov);
3626
+    vec_s16 srcP1ssB = _mm_unpackhi_epi8(srcP1, zerov);
3627
+    vec_s16 srcP2ssA = _mm_unpacklo_epi8(srcP2, zerov);
3628
+    vec_s16 srcP2ssB = _mm_unpackhi_epi8(srcP2, zerov);
3629
+
3630
+    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
3631
+              sumA, sumB, srcP3ssA, srcP3ssB,
3632
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
3633
+    vec_u8 sum, fsum, srcP3;
3634
+
3635
+    src += srcStride * 3;
3636
+    PRAGMA_E2K("ivdep")
3637
+    for (i = 0; i < 16; i++) {
3638
+        srcP3 = VEC_LD(src);
3639
+        src += srcStride;
3640
+
3641
+        srcP3ssA = _mm_unpacklo_epi8(srcP3, zerov);
3642
+        srcP3ssB = _mm_unpackhi_epi8(srcP3, zerov);
3643
+
3644
+        sum1A = _mm_adds_epi16(srcP0ssA, srcP1ssA);
3645
+        sum1B = _mm_adds_epi16(srcP0ssB, srcP1ssB);
3646
+        sum2A = _mm_adds_epi16(srcM1ssA, srcP2ssA);
3647
+        sum2B = _mm_adds_epi16(srcM1ssB, srcP2ssB);
3648
+        sum3A = _mm_adds_epi16(srcM2ssA, srcP3ssA);
3649
+        sum3B = _mm_adds_epi16(srcM2ssB, srcP3ssB);
3650
+
3651
+        srcM2ssA = srcM1ssA;
3652
+        srcM2ssB = srcM1ssB;
3653
+        srcM1ssA = srcP0ssA;
3654
+        srcM1ssB = srcP0ssB;
3655
+        srcP0ssA = srcP1ssA;
3656
+        srcP0ssB = srcP1ssB;
3657
+        srcP1ssA = srcP2ssA;
3658
+        srcP1ssB = srcP2ssB;
3659
+        srcP2ssA = srcP3ssA;
3660
+        srcP2ssB = srcP3ssB;
3661
+
3662
+        pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), v16ss);
3663
+        pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), v16ss);
3664
+        pp2A = _mm_mullo_epi16(sum2A, v5ss);
3665
+        pp2B = _mm_mullo_epi16(sum2B, v5ss);
3666
+        pp3A = _mm_add_epi16(sum3A, pp1A);
3667
+        pp3B = _mm_add_epi16(sum3B, pp1B);
3668
+        sumA = _mm_sub_epi16(pp3A, pp2A);
3669
+        sumB = _mm_sub_epi16(pp3B, pp2B);
3670
+        sumA = _mm_srai_epi16(sumA, 5);
3671
+        sumB = _mm_srai_epi16(sumB, 5);
3672
+        sum = _mm_packus_epi16(sumA, sumB);
3673
+
3674
+        OP_U8_E2K(fsum, sum, VEC_LD(dst));
3675
+        VEC_ST(dst, fsum);
3676
+        dst += dstStride;
3677
+    }
3678
+}
3679
+#endif /* PREFIX_h264_qpel16_v_lowpass_e2k */
3680
+
3681
+/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
3682
+#ifdef PREFIX_h264_qpel16_hv_lowpass_e2k
3683
+static void PREFIX_h264_qpel16_hv_lowpass_e2k(uint8_t *dst, int16_t *tmp,
3684
+                                              const uint8_t *src,
3685
+                                              int dstStride, int tmpStride,
3686
+                                              int srcStride)
3687
+{
3688
+    int i;
3689
+    LOAD_ZERO;
3690
+    const vec_s16 v20ss = _mm_set1_epi16(20);
3691
+    const vec_s16 v5ss = _mm_set1_epi16(5);
3692
+    const vec_s32 v512si = _mm_set1_epi32(512);
3693
+
3694
+    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
3695
+              srcP2A, srcP2B, srcP3A, srcP3B,
3696
+              srcM1A, srcM1B, srcM2A, srcM2B,
3697
+              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
3698
+              pp1A, pp1B, pp2A, pp2B, sumA, sumB;
3699
+    int16_t *tmpbis = tmp;
3700
+
3701
+    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
3702
+              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
3703
+              tmpP2ssA, tmpP2ssB;
3704
+
3705
+    vec_s32 pp1Al, pp1Ah, pp1Bl, pp1Bh, pp2Al, pp2Ah, pp2Bl, pp2Bh,
3706
+              pp3Al, pp3Ah, pp3Bl, pp3Bh, sumAl, sumAh, sumBl, sumBh;
3707
+    vec_u8 fsum, sum;
3708
+
3709
+    src -= 2 * srcStride;
3710
+    PRAGMA_E2K("ivdep")
3711
+    for (i = 0; i < 21; i++) {
3712
+        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
3713
+
3714
+        load_alignment();
3715
+
3716
+        srcP0A = _mm_unpacklo_epi8(srcP0, zerov);
3717
+        srcP0B = _mm_unpackhi_epi8(srcP0, zerov);
3718
+        srcP1A = _mm_unpacklo_epi8(srcP1, zerov);
3719
+        srcP1B = _mm_unpackhi_epi8(srcP1, zerov);
3720
+
3721
+        srcP2A = _mm_unpacklo_epi8(srcP2, zerov);
3722
+        srcP2B = _mm_unpackhi_epi8(srcP2, zerov);
3723
+        srcP3A = _mm_unpacklo_epi8(srcP3, zerov);
3724
+        srcP3B = _mm_unpackhi_epi8(srcP3, zerov);
3725
+
3726
+        srcM1A = _mm_unpacklo_epi8(srcM1, zerov);
3727
+        srcM1B = _mm_unpackhi_epi8(srcM1, zerov);
3728
+        srcM2A = _mm_unpacklo_epi8(srcM2, zerov);
3729
+        srcM2B = _mm_unpackhi_epi8(srcM2, zerov);
3730
+
3731
+        sum1A = _mm_adds_epi16(srcP0A, srcP1A);
3732
+        sum1B = _mm_adds_epi16(srcP0B, srcP1B);
3733
+        sum2A = _mm_adds_epi16(srcM1A, srcP2A);
3734
+        sum2B = _mm_adds_epi16(srcM1B, srcP2B);
3735
+        sum3A = _mm_adds_epi16(srcM2A, srcP3A);
3736
+        sum3B = _mm_adds_epi16(srcM2B, srcP3B);
3737
+
3738
+        pp1A = _mm_add_epi16(_mm_mullo_epi16(sum1A, v20ss), sum3A);
3739
+        pp1B = _mm_add_epi16(_mm_mullo_epi16(sum1B, v20ss), sum3B);
3740
+        pp2A = _mm_mullo_epi16(sum2A, v5ss);
3741
+        pp2B = _mm_mullo_epi16(sum2B, v5ss);
3742
+        sumA = _mm_sub_epi16(pp1A, pp2A);
3743
+        sumB = _mm_sub_epi16(pp1B, pp2B);
3744
+
3745
+        VEC_ST(tmp, sumA);
3746
+        VEC_ST(tmp + 8, sumB);
3747
+
3748
+        src += srcStride;
3749
+        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
3750
+    }
3751
+
3752
+    tmpM2ssA = VEC_LD(tmpbis);
3753
+    tmpM2ssB = VEC_LD(tmpbis + 8);
3754
+    tmpbis += tmpStride;
3755
+    tmpM1ssA = VEC_LD(tmpbis);
3756
+    tmpM1ssB = VEC_LD(tmpbis + 8);
3757
+    tmpbis += tmpStride;
3758
+    tmpP0ssA = VEC_LD(tmpbis);
3759
+    tmpP0ssB = VEC_LD(tmpbis + 8);
3760
+    tmpbis += tmpStride;
3761
+    tmpP1ssA = VEC_LD(tmpbis);
3762
+    tmpP1ssB = VEC_LD(tmpbis + 8);
3763
+    tmpbis += tmpStride;
3764
+    tmpP2ssA = VEC_LD(tmpbis);
3765
+    tmpP2ssB = VEC_LD(tmpbis + 8);
3766
+    tmpbis += tmpStride;
3767
+
3768
+    PRAGMA_E2K("ivdep")
3769
+    for (i = 0; i < 16; i++) {
3770
+        vec_s16 tmp0, tmp1;
3771
+        const vec_s16 tmpP3ssA = VEC_LD(tmpbis);
3772
+        const vec_s16 tmpP3ssB = VEC_LD(tmpbis + 8);
3773
+
3774
+        const vec_s16 sum1A = _mm_adds_epi16(tmpP0ssA, tmpP1ssA);
3775
+        const vec_s16 sum1B = _mm_adds_epi16(tmpP0ssB, tmpP1ssB);
3776
+        const vec_s16 sum2A = _mm_adds_epi16(tmpM1ssA, tmpP2ssA);
3777
+        const vec_s16 sum2B = _mm_adds_epi16(tmpM1ssB, tmpP2ssB);
3778
+        vec_s16 sum3A = _mm_adds_epi16(tmpM2ssA, tmpP3ssA);
3779
+        vec_s16 sum3B = _mm_adds_epi16(tmpM2ssB, tmpP3ssB);
3780
+
3781
+        tmpbis += tmpStride;
3782
+
3783
+        tmpM2ssA = tmpM1ssA;
3784
+        tmpM2ssB = tmpM1ssB;
3785
+        tmpM1ssA = tmpP0ssA;
3786
+        tmpM1ssB = tmpP0ssB;
3787
+        tmpP0ssA = tmpP1ssA;
3788
+        tmpP0ssB = tmpP1ssB;
3789
+        tmpP1ssA = tmpP2ssA;
3790
+        tmpP1ssB = tmpP2ssB;
3791
+        tmpP2ssA = tmpP3ssA;
3792
+        tmpP2ssB = tmpP3ssB;
3793
+
3794
+        tmp0 = _mm_mullo_epi16(sum1A, v20ss);
3795
+        tmp1 = _mm_mulhi_epi16(sum1A, v20ss);
3796
+        pp1Al = _mm_unpacklo_epi16(tmp0, tmp1);
3797
+        pp1Ah = _mm_unpackhi_epi16(tmp0, tmp1);
3798
+        tmp0 = _mm_mullo_epi16(sum1B, v20ss);
3799
+        tmp1 = _mm_mulhi_epi16(sum1B, v20ss);
3800
+        pp1Bl = _mm_unpacklo_epi16(tmp0, tmp1);
3801
+        pp1Bh = _mm_unpackhi_epi16(tmp0, tmp1);
3802
+
3803
+        pp1Al = _mm_add_epi32(pp1Al, v512si);
3804
+        pp1Ah = _mm_add_epi32(pp1Ah, v512si);
3805
+        pp1Bl = _mm_add_epi32(pp1Bl, v512si);
3806
+        pp1Bh = _mm_add_epi32(pp1Bh, v512si);
3807
+
3808
+        tmp0 = _mm_mullo_epi16(sum2A, v5ss);
3809
+        tmp1 = _mm_mulhi_epi16(sum2A, v5ss);
3810
+        pp2Al = _mm_unpacklo_epi16(tmp0, tmp1);
3811
+        pp2Ah = _mm_unpackhi_epi16(tmp0, tmp1);
3812
+        tmp0 = _mm_mullo_epi16(sum2B, v5ss);
3813
+        tmp1 = _mm_mulhi_epi16(sum2B, v5ss);
3814
+        pp2Bl = _mm_unpacklo_epi16(tmp0, tmp1);
3815
+        pp2Bh = _mm_unpackhi_epi16(tmp0, tmp1);
3816
+
3817
+        tmp0 = _mm_srai_epi32(_mm_unpacklo_epi16(sum3A, sum3A), 16);
3818
+        tmp1 = _mm_srai_epi32(_mm_unpackhi_epi16(sum3A, sum3A), 16);
3819
+        pp3Al = _mm_add_epi32(tmp0, pp1Al);
3820
+        pp3Ah = _mm_add_epi32(tmp1, pp1Ah);
3821
+        tmp0 = _mm_srai_epi32(_mm_unpacklo_epi16(sum3B, sum3B), 16);
3822
+        tmp1 = _mm_srai_epi32(_mm_unpackhi_epi16(sum3B, sum3B), 16);
3823
+        pp3Bl = _mm_add_epi32(tmp0, pp1Bl);
3824
+        pp3Bh = _mm_add_epi32(tmp1, pp1Bh);
3825
+
3826
+        sumAl = _mm_sub_epi32(pp3Al, pp2Al);
3827
+        sumAh = _mm_sub_epi32(pp3Ah, pp2Ah);
3828
+        sumBl = _mm_sub_epi32(pp3Bl, pp2Bl);
3829
+        sumBh = _mm_sub_epi32(pp3Bh, pp2Bh);
3830
+
3831
+        sumAl = _mm_srai_epi32(sumAl, 10);
3832
+        sumAh = _mm_srai_epi32(sumAh, 10);
3833
+        sumBl = _mm_srai_epi32(sumBl, 10);
3834
+        sumBh = _mm_srai_epi32(sumBh, 10);
3835
+
3836
+        sumA = _mm_packs_epi32(sumAl, sumAh);
3837
+        sumB = _mm_packs_epi32(sumBl, sumBh);
3838
+        sum = _mm_packus_epi16(sumA, sumB);
3839
+
3840
+        OP_U8_E2K(fsum, sum, VEC_LD(dst));
3841
+        VEC_ST(dst, fsum);
3842
+        dst += dstStride;
3843
+    }
3844
+}
3845
+#endif /* PREFIX_h264_qpel16_hv_lowpass_e2k */
3846
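The hv pass runs the horizontal filter into unshifted 16-bit temporaries, then applies the same 6-tap filter vertically in 32-bit lanes, rounding by 512 and shifting by 10 so both passes are normalized at once. Scalar model of the vertical half for one output sample (illustrative name; tmp stride fixed at 16 as in the caller, 16-bit saturation of the pair sums ignored):

    #include <stdint.h>

    /* tmp[] holds the unshifted 16-bit results of the horizontal pass */
    static uint8_t qpel_hv_sample(const int16_t *tmp)
    {
        int sum1 = tmp[0]       + tmp[16];        /* taps 20, 20 */
        int sum2 = tmp[-16]     + tmp[2 * 16];    /* taps -5, -5 */
        int sum3 = tmp[-2 * 16] + tmp[3 * 16];    /* taps  1,  1 */
        int x = (20 * sum1 - 5 * sum2 + sum3 + 512) >> 10;
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }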
diff --git a/libavcodec/e2k/hevcdsp.c b/libavcodec/e2k/hevcdsp.c
3847
new file mode 100644
3848
index 0000000..74004d7
3849
--- /dev/null
3850
+++ b/libavcodec/e2k/hevcdsp.c
3851
@@ -0,0 +1,94 @@
3852
+/*
3853
+ * SIMD-optimized IDCT functions for HEVC decoding
3854
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
3855
+ * Copyright (c) Alexandra Hajkova
3856
+ *
3857
+ * This file is part of FFmpeg.
3858
+ *
3859
+ * FFmpeg is free software; you can redistribute it and/or
3860
+ * modify it under the terms of the GNU Lesser General Public
3861
+ * License as published by the Free Software Foundation; either
3862
+ * version 2.1 of the License, or (at your option) any later version.
3863
+ *
3864
+ * FFmpeg is distributed in the hope that it will be useful,
3865
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3866
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3867
+ * Lesser General Public License for more details.
3868
+ *
3869
+ * You should have received a copy of the GNU Lesser General Public
3870
+ * License along with FFmpeg; if not, write to the Free Software
3871
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3872
+ */
3873
+
3874
+#include "config.h"
3875
+
3876
+#include "libavutil/attributes.h"
3877
+#include "libavutil/cpu.h"
3878
+#include "libavutil/e2k/cpu.h"
3879
+#include "libavutil/e2k/util_e2k.h"
3880
+
3881
+#include "libavcodec/hevcdsp.h"
3882
+
3883
+#define transform4x4(shift) { \
3884
+    vec_s16 src_02, src_13; \
3885
+    vec_s32 e0, o0, e1, o1, add; \
3886
+    src_02 = _mm_unpacklo_epi16(src_01, src_23); \
3887
+    src_13 = _mm_unpackhi_epi16(src_01, src_23); \
3888
+    e0 = _mm_madd_epi16(src_02, trans0); \
3889
+    o0 = _mm_madd_epi16(src_13, trans1); \
3890
+    e1 = _mm_madd_epi16(src_02, trans2); \
3891
+    o1 = _mm_madd_epi16(src_13, trans3); \
3892
+    add = _mm_set1_epi32(1 << (shift - 1)); \
3893
+    e0 = _mm_add_epi32(e0, add); \
3894
+    e1 = _mm_add_epi32(e1, add); \
3895
+    res0 = _mm_add_epi32(e0, o0); \
3896
+    res1 = _mm_add_epi32(e1, o1); \
3897
+    res2 = _mm_sub_epi32(e1, o1); \
3898
+    res3 = _mm_sub_epi32(e0, o0); \
3899
+    res0 = _mm_srai_epi32(res0, shift); \
3900
+    res1 = _mm_srai_epi32(res1, shift); \
3901
+    res2 = _mm_srai_epi32(res2, shift); \
3902
+    res3 = _mm_srai_epi32(res3, shift); \
3903
+    packed0 = _mm_packs_epi32(res0, res1); \
3904
+    packed1 = _mm_packs_epi32(res2, res3); \
3905
+    \
3906
+    res0 = _mm_unpacklo_epi16(packed0, packed1); \
3907
+    res1 = _mm_unpackhi_epi16(packed0, packed1); \
3908
+    src_01 = _mm_unpacklo_epi16(res0, res1); \
3909
+    src_23 = _mm_unpackhi_epi16(res0, res1); \
3910
+}
3911
+
3912
+#define HEVC_IDCT4X4_E2K(depth) \
3913
+static void ff_hevc_idct_4x4##_##depth##_e2k(int16_t *coeffs, int col_limit) \
3914
+{ \
3915
+    const int shift = 7; \
3916
+    const int shift2 = 20 - depth; \
3917
+    vec_s16 src_01, src_23; \
3918
+    vec_s32 res0, res1, res2, res3; \
3919
+    vec_s16 packed0, packed1; \
3920
+    vec_s16 trans0 = _mm_set1_epi32(64 | 64 << 16); \
3921
+    vec_s16 trans1 = _mm_set1_epi32(83 | 36 << 16); \
3922
+    vec_s16 trans2 = _mm_set1_epi32(64 | -64 << 16); \
3923
+    vec_s16 trans3 = _mm_set1_epi32(36 | -83 << 16); \
3924
+    \
3925
+    src_01 = VEC_LD(coeffs); \
3926
+    src_23 = VEC_LD(coeffs + 8); \
3927
+    transform4x4(shift); \
3928
+    transform4x4(shift2); \
3929
+    VEC_ST(coeffs, src_01); \
3930
+    VEC_ST(coeffs + 8, src_23); \
3931
+}
3932
+
3933
+HEVC_IDCT4X4_E2K(8)
3934
+HEVC_IDCT4X4_E2K(10)
3935
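The 4x4 inverse transform runs two identical butterfly passes (shift 7, then 20 - bit depth) using the 64/83/36 constants of the HEVC 4-point core transform, with the transpose folded into the unpack/pack at the end of each pass. Scalar sketch of one pass over a single column (illustrative name; _mm_packs_epi32 provides the saturation):

    #include <stdint.h>

    static int16_t clip16(int x)
    {
        return x < -32768 ? -32768 : x > 32767 ? 32767 : (int16_t)x;
    }

    /* one transform4x4() pass over one column s[0..3] */
    static void hevc_idct4_col(int16_t *s, int shift)
    {
        int add = 1 << (shift - 1);
        int e0 = 64 * (s[0] + s[2]) + add;   /* even part, rounding folded in */
        int e1 = 64 * (s[0] - s[2]) + add;
        int o0 = 83 * s[1] + 36 * s[3];      /* odd part */
        int o1 = 36 * s[1] - 83 * s[3];
        s[0] = clip16((e0 + o0) >> shift);
        s[1] = clip16((e1 + o1) >> shift);
        s[2] = clip16((e1 - o1) >> shift);
        s[3] = clip16((e0 - o0) >> shift);
    }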
+
3936
+av_cold void ff_hevc_dsp_init_e2k(HEVCDSPContext *c, const int bit_depth)
3937
+{
3938
+    if (!E2K_BASE(av_get_cpu_flags()))
3939
+        return;
3940
+
3941
+    if (bit_depth == 8)
3942
+        c->idct[0] = ff_hevc_idct_4x4_8_e2k;
3943
+    if (bit_depth == 10)
3944
+        c->idct[0] = ff_hevc_idct_4x4_10_e2k;
3945
+}
3946
diff --git a/libavcodec/e2k/hpeldsp.c b/libavcodec/e2k/hpeldsp.c
3947
new file mode 100644
3948
index 0000000..9ff59bb
3949
--- /dev/null
3950
+++ b/libavcodec/e2k/hpeldsp.c
3951
@@ -0,0 +1,302 @@
3952
+/*
3953
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
3954
+ * Copyright (c) 2002 Brian Foley
3955
+ * Copyright (c) 2002 Dieter Shirley
3956
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
3957
+ *
3958
+ * This file is part of FFmpeg.
3959
+ *
3960
+ * FFmpeg is free software; you can redistribute it and/or
3961
+ * modify it under the terms of the GNU Lesser General Public
3962
+ * License as published by the Free Software Foundation; either
3963
+ * version 2.1 of the License, or (at your option) any later version.
3964
+ *
3965
+ * FFmpeg is distributed in the hope that it will be useful,
3966
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3967
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3968
+ * Lesser General Public License for more details.
3969
+ *
3970
+ * You should have received a copy of the GNU Lesser General Public
3971
+ * License along with FFmpeg; if not, write to the Free Software
3972
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3973
+ */
3974
+
3975
+#include "config.h"
3976
+
3977
+#include "libavutil/attributes.h"
3978
+#include "libavutil/cpu.h"
3979
+#include "libavutil/e2k/cpu.h"
3980
+#include "libavutil/e2k/util_e2k.h"
3981
+
3982
+#include "libavcodec/hpeldsp.h"
3983
+
3984
+#include "hpeldsp.h"
3985
+
3986
+/* next one assumes that ((line_size % 16) == 0) */
3987
+void ff_put_pixels16_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
3988
+{
3989
+    vec_u8 v0, v1, v2, v3;
3990
+    int i;
3991
+
3992
+    PRAGMA_E2K("ivdep")
3993
+    for (i = 0; i < h; i += 4) {
3994
+        v0 = VEC_LD(pixels);
3995
+        v1 = VEC_LD(pixels + line_size);
3996
+        v2 = VEC_LD(pixels + line_size * 2);
3997
+        v3 = VEC_LD(pixels + line_size * 3);
3998
+        VEC_ST(block, v0);
3999
+        VEC_ST(block + line_size, v1);
4000
+        VEC_ST(block + line_size * 2, v2);
4001
+        VEC_ST(block + line_size * 3, v3);
4002
+        pixels += line_size * 4;
4003
+        block += line_size * 4;
4004
+    }
4005
+}
4006
+
4007
+/* next one assumes that ((line_size % 16) == 0) */
4008
+#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
4009
+void ff_avg_pixels16_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
4010
+{
4011
+    vec_u8 pixelsv, blockv;
4012
+    int i;
4013
+
4014
+    PRAGMA_E2K("ivdep")
4015
+    for (i = 0; i < h; i++) {
4016
+        blockv = VEC_LD(block);
4017
+        pixelsv = VEC_LD(pixels);
4018
+        blockv = _mm_avg_epu8(blockv, pixelsv);
4019
+        VEC_ST(block, blockv);
4020
+        pixels += line_size;
4021
+        block += line_size;
4022
+    }
4023
+}
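For reference, the op_avg() bit trick defined above and the _mm_avg_epu8() call used here compute the same thing: a per-byte average rounded up, (a + b + 1) >> 1. A minimal scalar sketch of one averaged row, for illustration only (not part of the patch):

    /* Scalar reference: average one 16-pixel row with rounding,
     * matching what _mm_avg_epu8 (and op_avg) does per byte. */
    static void avg_row16_ref(uint8_t *block, const uint8_t *pixels)
    {
        for (int i = 0; i < 16; i++)
            block[i] = (block[i] + pixels[i] + 1) >> 1;
    }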
4024
+
4025
+/* next one assumes that ((line_size % 8) == 0) */
4026
+static void avg_pixels8_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
4027
+{
4028
+    __m64 pixelsv, blockv;
4029
+    int i;
4030
+
4031
+    PRAGMA_E2K("ivdep")
4032
+    for (i = 0; i < h; i++) {
4033
+        blockv = *(__m64*)block;
4034
+        pixelsv = *(__m64*)pixels;
4035
+        blockv = _mm_avg_pu8(blockv, pixelsv);
4036
+        *(__m64*)block = blockv;
4037
+        pixels += line_size;
4038
+        block += line_size;
4039
+    }
4040
+}
4041
+
4042
+/* next one assumes that ((line_size % 8) == 0) */
4043
+static void put_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
4044
+{
4045
+    int i;
4046
+    vec_u8 pixelsv1, pixelsv2, blockv;
4047
+    vec_u16 pixelssum1, pixelssum2, temp3;
4048
+    LOAD_ZERO;
4049
+    const vec_u16 vctwo = _mm_set1_epi16(2);
4050
+
4051
+    pixelsv1 = VEC_LD8(pixels);
4052
+    pixelsv2 = VEC_LD8(pixels + 1);
4053
+    pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4054
+    pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4055
+    pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2);
4056
+    pixelssum1 = _mm_add_epi16(pixelssum1, vctwo);
4057
+
4058
+    PRAGMA_E2K("ivdep")
4059
+    for (i = 0; i < h; i++) {
4060
+        pixels += line_size;
4061
+        blockv = VEC_LD8(block);
4062
+        pixelsv1 = VEC_LD8(pixels);
4063
+        pixelsv2 = VEC_LD8(pixels + 1);
4064
+        pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4065
+        pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4066
+        pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2);
4067
+        temp3 = _mm_add_epi16(pixelssum1, pixelssum2);
4068
+        temp3 = _mm_srai_epi16(temp3, 2);
4069
+        pixelssum1 = _mm_add_epi16(pixelssum2, vctwo);
4070
+
4071
+        blockv = _mm_packus_epi16(temp3, temp3);
4072
+        VEC_STL(block, blockv);
4073
+        block += line_size;
4074
+    }
4075
+}
4076
+
4077
+/* next one assumes that ((line_size % 8) == 0) */
4078
+static void put_no_rnd_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
4079
+{
4080
+    int i;
4081
+    vec_u8 pixelsv1, pixelsv2, blockv;
4082
+    vec_u16 pixelssum1, pixelssum2, temp3;
4083
+    LOAD_ZERO;
4084
+    const vec_u16 vcone = _mm_set1_epi16(1);
4085
+
4086
+    pixelsv1 = VEC_LD8(pixels);
4087
+    pixelsv2 = VEC_LD8(pixels + 1);
4088
+    pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4089
+    pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4090
+    pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2);
4091
+    pixelssum1 = _mm_add_epi16(pixelssum1, vcone);
4092
+
4093
+    PRAGMA_E2K("ivdep")
4094
+    for (i = 0; i < h; i++) {
4095
+        pixels += line_size;
4096
+        blockv = VEC_LD8(block);
4097
+        pixelsv1 = VEC_LD8(pixels);
4098
+        pixelsv2 = VEC_LD8(pixels + 1);
4099
+        pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4100
+        pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4101
+        pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2);
4102
+        temp3 = _mm_add_epi16(pixelssum1, pixelssum2);
4103
+        temp3 = _mm_srai_epi16(temp3, 2);
4104
+        pixelssum1 = _mm_add_epi16(pixelssum2, vcone);
4105
+
4106
+        blockv = _mm_packus_epi16(temp3, temp3);
4107
+        VEC_STL(block, blockv);
4108
+        block += line_size;
4109
+    }
4110
+}
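For reference, the two xy2 routines above are 2x2 bilinear half-pel interpolators: the rounded variant adds 2 (vctwo) before the shift by 2, the no-rnd variant adds 1 (vcone). A scalar sketch of the rounded 8-pixel case, for illustration only (not part of the patch):

    /* Scalar reference for put_pixels8_xy2: each output pixel is the
     * rounded average of a 2x2 source neighbourhood; the no_rnd variant
     * would add 1 instead of 2. */
    static void put_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (pixels[x] + pixels[x + 1] +
                            pixels[x + line_size] + pixels[x + line_size + 1] + 2) >> 2;
            pixels += line_size;
            block  += line_size;
        }
    }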
4111
+
4112
+/* next one assumes that ((line_size % 16) == 0) */
4113
+static void put_pixels16_xy2_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
4114
+{
4115
+    int i;
4116
+    vec_u8 pixelsv1, pixelsv2, pixelsv3, pixelsv4, blockv;
4117
+    vec_u16 temp3, temp4, pixelssum1, pixelssum2, pixelssum3, pixelssum4;
4118
+    LOAD_ZERO;
4119
+    const vec_u16 vctwo = _mm_set1_epi16(2);
4120
+
4121
+    pixelsv1 = VEC_LD(pixels);
4122
+    pixelsv2 = VEC_LD(pixels + 1);
4123
+    pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov);
4124
+    pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov);
4125
+    pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4126
+    pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4127
+    pixelssum3 = _mm_add_epi16(pixelsv3, pixelsv4);
4128
+    pixelssum3 = _mm_add_epi16(pixelssum3, vctwo);
4129
+    pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2);
4130
+    pixelssum1 = _mm_add_epi16(pixelssum1, vctwo);
4131
+
4132
+    PRAGMA_E2K("ivdep")
4133
+    for (i = 0; i < h; i++) {
4134
+        pixels += line_size;
4135
+        blockv = VEC_LD(block);
4136
+        pixelsv1 = VEC_LD(pixels);
4137
+        pixelsv2 = VEC_LD(pixels + 1);
4138
+        pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov);
4139
+        pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov);
4140
+        pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4141
+        pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4142
+        pixelssum4 = _mm_add_epi16(pixelsv3, pixelsv4);
4143
+        pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2);
4144
+        temp4 = _mm_add_epi16(pixelssum3, pixelssum4);
4145
+        temp4 = _mm_srai_epi16(temp4, 2);
4146
+        temp3 = _mm_add_epi16(pixelssum1, pixelssum2);
4147
+        temp3 = _mm_srai_epi16(temp3, 2);
4148
+        pixelssum3 = _mm_add_epi16(pixelssum4, vctwo);
4149
+        pixelssum1 = _mm_add_epi16(pixelssum2, vctwo);
4150
+        blockv = _mm_packus_epi16(temp3, temp4);
4151
+        VEC_ST(block, blockv);
4152
+        block += line_size;
4153
+    }
4154
+}
4155
+
4156
+/* next one assumes that ((line_size % 16) == 0) */
4157
+static void put_no_rnd_pixels16_xy2_e2k(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
4158
+{
4159
+    int i;
4160
+    vec_u8 pixelsv1, pixelsv2, pixelsv3, pixelsv4, blockv;
4161
+    vec_u16 temp3, temp4, pixelssum1, pixelssum2, pixelssum3, pixelssum4;
4162
+    LOAD_ZERO;
4163
+    const vec_u16 vcone = _mm_set1_epi16(1);
4164
+
4165
+    pixelsv1 = VEC_LD(pixels);
4166
+    pixelsv2 = VEC_LD(pixels + 1);
4167
+    pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov);
4168
+    pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov);
4169
+    pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4170
+    pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4171
+    pixelssum3 = _mm_add_epi16(pixelsv3, pixelsv4);
4172
+    pixelssum3 = _mm_add_epi16(pixelssum3, vcone);
4173
+    pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2);
4174
+    pixelssum1 = _mm_add_epi16(pixelssum1, vcone);
4175
+
4176
+    PRAGMA_E2K("ivdep")
4177
+    for (i = 0; i < h; i++) {
4178
+        pixels += line_size;
4179
+        blockv = VEC_LD(block);
4180
+        pixelsv1 = VEC_LD(pixels);
4181
+        pixelsv2 = VEC_LD(pixels + 1);
4182
+        pixelsv3 = _mm_unpackhi_epi8(pixelsv1, zerov);
4183
+        pixelsv4 = _mm_unpackhi_epi8(pixelsv2, zerov);
4184
+        pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4185
+        pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4186
+        pixelssum4 = _mm_add_epi16(pixelsv3, pixelsv4);
4187
+        pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2);
4188
+        temp4 = _mm_add_epi16(pixelssum3, pixelssum4);
4189
+        temp4 = _mm_srai_epi16(temp4, 2);
4190
+        temp3 = _mm_add_epi16(pixelssum1, pixelssum2);
4191
+        temp3 = _mm_srai_epi16(temp3, 2);
4192
+        pixelssum3 = _mm_add_epi16(pixelssum4, vcone);
4193
+        pixelssum1 = _mm_add_epi16(pixelssum2, vcone);
4194
+        blockv = _mm_packus_epi16(temp3, temp4);
4195
+        VEC_ST(block, blockv);
4196
+        block += line_size;
4197
+    }
4198
+}
4199
+
4200
+/* next one assumes that ((line_size % 8) == 0) */
4201
+static void avg_pixels8_xy2_e2k(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
4202
+{
4203
+    int i;
4204
+    vec_u8 pixelsv1, pixelsv2, blockv, blocktemp;
4205
+    vec_u16 pixelssum1, pixelssum2, temp3;
4206
+    LOAD_ZERO;
4207
+    const vec_u16 vctwo = _mm_set1_epi16(2);
4208
+
4209
+    pixelsv1 = VEC_LD8(pixels);
4210
+    pixelsv2 = VEC_LD8(pixels + 1);
4211
+    pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4212
+    pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4213
+    pixelssum1 = _mm_add_epi16(pixelsv1, pixelsv2);
4214
+    pixelssum1 = _mm_add_epi16(pixelssum1, vctwo);
4215
+
4216
+    PRAGMA_E2K("ivdep")
4217
+    for (i = 0; i < h; i++) {
4218
+        pixels += line_size;
4219
+        blockv = VEC_LD8(block);
4220
+        pixelsv1 = VEC_LD8(pixels);
4221
+        pixelsv2 = VEC_LD8(pixels + 1);
4222
+        pixelsv1 = _mm_unpacklo_epi8(pixelsv1, zerov);
4223
+        pixelsv2 = _mm_unpacklo_epi8(pixelsv2, zerov);
4224
+        pixelssum2 = _mm_add_epi16(pixelsv1, pixelsv2);
4225
+        temp3 = _mm_add_epi16(pixelssum1, pixelssum2);
4226
+        temp3 = _mm_srai_epi16(temp3, 2);
4227
+        pixelssum1 = _mm_add_epi16(pixelssum2, vctwo);
4228
+        blocktemp = _mm_packus_epi16(temp3, temp3);
4229
+        blockv = _mm_avg_epu8(blocktemp, blockv);
4230
+        VEC_STL(block, blockv);
4231
+        block += line_size;
4232
+    }
4233
+}
4234
+
4235
+av_cold void ff_hpeldsp_init_e2k(HpelDSPContext *c, int flags)
4236
+{
4237
+    if (!E2K_BASE(av_get_cpu_flags()))
4238
+        return;
4239
+
4240
+    // !checkasm
4241
+
4242
+    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_e2k;
4243
+    c->avg_pixels_tab[1][0]        = avg_pixels8_e2k;
4244
+    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_e2k; // fate vsynth1-mpeg2-422
4245
+
4246
+    c->put_pixels_tab[0][0]        = ff_put_pixels16_e2k;
4247
+    c->put_pixels_tab[1][3]        = put_pixels8_xy2_e2k;
4248
+    c->put_pixels_tab[0][3]        = put_pixels16_xy2_e2k;
4249
+
4250
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_e2k;
4251
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_e2k;
4252
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_e2k;
4253
+}
4254
diff --git a/libavcodec/e2k/hpeldsp.h b/libavcodec/e2k/hpeldsp.h
4255
new file mode 100644
4256
index 0000000..0ade264
4257
--- /dev/null
4258
+++ b/libavcodec/e2k/hpeldsp.h
4259
@@ -0,0 +1,30 @@
4260
+/*
4261
+ * This file is part of FFmpeg.
4262
+ *
4263
+ * FFmpeg is free software; you can redistribute it and/or
4264
+ * modify it under the terms of the GNU Lesser General Public
4265
+ * License as published by the Free Software Foundation; either
4266
+ * version 2.1 of the License, or (at your option) any later version.
4267
+ *
4268
+ * FFmpeg is distributed in the hope that it will be useful,
4269
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4270
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4271
+ * Lesser General Public License for more details.
4272
+ *
4273
+ * You should have received a copy of the GNU Lesser General Public
4274
+ * License along with FFmpeg; if not, write to the Free Software
4275
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4276
+ */
4277
+
4278
+#ifndef AVCODEC_E2K_HPELDSP_H
4279
+#define AVCODEC_E2K_HPELDSP_H
4280
+
4281
+#include <stddef.h>
4282
+#include <stdint.h>
4283
+
4284
+void ff_avg_pixels16_e2k(uint8_t *block, const uint8_t *pixels,
4285
+                         ptrdiff_t line_size, int h);
4286
+void ff_put_pixels16_e2k(uint8_t *block, const uint8_t *pixels,
4287
+                         ptrdiff_t line_size, int h);
4288
+
4289
+#endif /* AVCODEC_E2K_HPELDSP_H */
4290
diff --git a/libavcodec/e2k/idctdsp.c b/libavcodec/e2k/idctdsp.c
4291
new file mode 100644
4292
index 0000000..db9d2ca
4293
--- /dev/null
4294
+++ b/libavcodec/e2k/idctdsp.c
4295
@@ -0,0 +1,237 @@
4296
+/*
4297
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
4298
+ * Copyright (c) 2001 Michel Lespinasse
4299
+ *
4300
+ * This file is part of FFmpeg.
4301
+ *
4302
+ * FFmpeg is free software; you can redistribute it and/or
4303
+ * modify it under the terms of the GNU Lesser General Public
4304
+ * License as published by the Free Software Foundation; either
4305
+ * version 2.1 of the License, or (at your option) any later version.
4306
+ *
4307
+ * FFmpeg is distributed in the hope that it will be useful,
4308
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4309
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4310
+ * Lesser General Public License for more details.
4311
+ *
4312
+ * You should have received a copy of the GNU Lesser General Public
4313
+ * License along with FFmpeg; if not, write to the Free Software
4314
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4315
+ */
4316
+
4317
+/* NOTE: This code is based on GPL code from the libmpeg2 project.  The
4318
+ * author, Michel Lespinasse, has given explicit permission to release
4319
+ * under LGPL as part of FFmpeg.
4320
+ *
4321
+ * FFmpeg integration by Dieter Shirley
4322
+ *
4323
+ * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
4324
+ * project.  I've deleted all of the libmpeg2-specific code, renamed the
4325
+ * functions and reordered the function parameters.  The only change to the
4326
+ * IDCT function itself was to factor out the partial transposition, and to
4327
+ * perform a full transpose at the end of the function. */
4328
+
4329
+#include "config.h"
4330
+
4331
+#include <stdlib.h>
4332
+#include <string.h>
4333
+
4334
+#include "libavutil/attributes.h"
4335
+#include "libavutil/cpu.h"
4336
+#include "libavutil/e2k/cpu.h"
4337
+#include "libavutil/e2k/util_e2k.h"
4338
+
4339
+#include "libavcodec/idctdsp.h"
4340
+
4341
+#include "dctdsp.h"
4342
+
4343
+#define IDCT_HALF                                         \
4344
+    /* 1st stage */                                       \
4345
+    t1 = _mm_adds_epi16(_mm_mulhrs_epi16(a1, vx7), vx1);  \
4346
+    t8 = _mm_adds_epi16(_mm_mulhrs_epi16(a1, vx1),        \
4347
+                        _mm_subs_epi16(zero, vx7));       \
4348
+    t7 = _mm_adds_epi16(_mm_mulhrs_epi16(a2, vx5), vx3);  \
4349
+    t3 = _mm_adds_epi16(_mm_mulhrs_epi16(ma2, vx3), vx5); \
4350
+                                                          \
4351
+    /* 2nd stage */                                       \
4352
+    t5 = _mm_adds_epi16(vx0, vx4);                        \
4353
+    t0 = _mm_subs_epi16(vx0, vx4);                        \
4354
+    t2 = _mm_adds_epi16(_mm_mulhrs_epi16(a0, vx6), vx2);  \
4355
+    t4 = _mm_adds_epi16(_mm_mulhrs_epi16(a0, vx2),        \
4356
+                        _mm_subs_epi16(zero, vx6));       \
4357
+    t6 = _mm_adds_epi16(t8, t3);                          \
4358
+    t3 = _mm_subs_epi16(t8, t3);                          \
4359
+    t8 = _mm_subs_epi16(t1, t7);                          \
4360
+    t1 = _mm_adds_epi16(t1, t7);                          \
4361
+                                                          \
4362
+    /* 3rd stage */                                       \
4363
+    t7 = _mm_adds_epi16(t5, t2);                          \
4364
+    t2 = _mm_subs_epi16(t5, t2);                          \
4365
+    t5 = _mm_adds_epi16(t0, t4);                          \
4366
+    t0 = _mm_subs_epi16(t0, t4);                          \
4367
+    t4 = _mm_subs_epi16(t8, t3);                          \
4368
+    t3 = _mm_adds_epi16(t8, t3);                          \
4369
+                                                          \
4370
+    /* 4th stage */                                       \
4371
+    vy0 = _mm_adds_epi16(t7, t1);                         \
4372
+    vy7 = _mm_subs_epi16(t7, t1);                         \
4373
+    vy1 = _mm_adds_epi16(_mm_mulhrs_epi16(c4, t3), t5);   \
4374
+    vy6 = _mm_adds_epi16(_mm_mulhrs_epi16(mc4, t3), t5);  \
4375
+    vy2 = _mm_adds_epi16(_mm_mulhrs_epi16(c4, t4), t0);   \
4376
+    vy5 = _mm_adds_epi16(_mm_mulhrs_epi16(mc4, t4), t0);  \
4377
+    vy3 = _mm_adds_epi16(t2, t6);                         \
4378
+    vy4 = _mm_subs_epi16(t2, t6)
4379
+
4380
+#define IDCT                                                                \
4381
+    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                         \
4382
+    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                         \
4383
+    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                             \
4384
+                                                                            \
4385
+    vec_s16 c4   = _mm_set1_epi16(23170);                                   \
4386
+    vec_s16 a0   = _mm_set1_epi16(13573);                                   \
4387
+    vec_s16 a1   = _mm_set1_epi16(6518);                                    \
4388
+    vec_s16 a2   = _mm_set1_epi16(21895);                                   \
4389
+    vec_s16 mc4  = _mm_set1_epi16(-23170);                                  \
4390
+    vec_s16 ma2  = _mm_set1_epi16(-21895);                                  \
4391
+    vec_s16 bias = _mm_set1_epi32(32 | 31 << 16);                           \
4392
+                                                                            \
4393
+    vec_s16 zero  = _mm_setzero_si128();                                    \
4394
+                                                                            \
4395
+    t0 = VEC_LD(constants[0]);                                              \
4396
+    t1 = VEC_LD(constants[1]);                                              \
4397
+    t2 = VEC_LD(constants[2]);                                              \
4398
+    t3 = VEC_LD(constants[3]);                                              \
4399
+                                                                            \
4400
+    vx0 = _mm_mulhrs_epi16(_mm_slli_epi16(block[0], 4), t0);                \
4401
+    vx1 = _mm_mulhrs_epi16(_mm_slli_epi16(block[1], 4), t1);                \
4402
+    vx2 = _mm_mulhrs_epi16(_mm_slli_epi16(block[2], 4), t2);                \
4403
+    vx3 = _mm_mulhrs_epi16(_mm_slli_epi16(block[3], 4), t3);                \
4404
+    vx4 = _mm_mulhrs_epi16(_mm_slli_epi16(block[4], 4), t0);                \
4405
+    vx5 = _mm_mulhrs_epi16(_mm_slli_epi16(block[5], 4), t3);                \
4406
+    vx6 = _mm_mulhrs_epi16(_mm_slli_epi16(block[6], 4), t2);                \
4407
+    vx7 = _mm_mulhrs_epi16(_mm_slli_epi16(block[7], 4), t1);                \
4408
+                                                                            \
4409
+    IDCT_HALF;                                                              \
4410
+                                                                            \
4411
+    vx0 = _mm_unpacklo_epi16(vy0, vy4);                                     \
4412
+    vx1 = _mm_unpackhi_epi16(vy0, vy4);                                     \
4413
+    vx2 = _mm_unpacklo_epi16(vy1, vy5);                                     \
4414
+    vx3 = _mm_unpackhi_epi16(vy1, vy5);                                     \
4415
+    vx4 = _mm_unpacklo_epi16(vy2, vy6);                                     \
4416
+    vx5 = _mm_unpackhi_epi16(vy2, vy6);                                     \
4417
+    vx6 = _mm_unpacklo_epi16(vy3, vy7);                                     \
4418
+    vx7 = _mm_unpackhi_epi16(vy3, vy7);                                     \
4419
+                                                                            \
4420
+    vy0 = _mm_unpacklo_epi16(vx0, vx4);                                     \
4421
+    vy1 = _mm_unpackhi_epi16(vx0, vx4);                                     \
4422
+    vy2 = _mm_unpacklo_epi16(vx1, vx5);                                     \
4423
+    vy3 = _mm_unpackhi_epi16(vx1, vx5);                                     \
4424
+    vy4 = _mm_unpacklo_epi16(vx2, vx6);                                     \
4425
+    vy5 = _mm_unpackhi_epi16(vx2, vx6);                                     \
4426
+    vy6 = _mm_unpacklo_epi16(vx3, vx7);                                     \
4427
+    vy7 = _mm_unpackhi_epi16(vx3, vx7);                                     \
4428
+                                                                            \
4429
+    vx0 = _mm_adds_epi16(_mm_unpacklo_epi16(vy0, vy4), bias);               \
4430
+    vx1 = _mm_unpackhi_epi16(vy0, vy4);                                     \
4431
+    vx2 = _mm_unpacklo_epi16(vy1, vy5);                                     \
4432
+    vx3 = _mm_unpackhi_epi16(vy1, vy5);                                     \
4433
+    vx4 = _mm_unpacklo_epi16(vy2, vy6);                                     \
4434
+    vx5 = _mm_unpackhi_epi16(vy2, vy6);                                     \
4435
+    vx6 = _mm_unpacklo_epi16(vy3, vy7);                                     \
4436
+    vx7 = _mm_unpackhi_epi16(vy3, vy7);                                     \
4437
+                                                                            \
4438
+    IDCT_HALF;                                                              \
4439
+                                                                            \
4440
+    vx0 = _mm_srai_epi16(vy0, 6);                                           \
4441
+    vx1 = _mm_srai_epi16(vy1, 6);                                           \
4442
+    vx2 = _mm_srai_epi16(vy2, 6);                                           \
4443
+    vx3 = _mm_srai_epi16(vy3, 6);                                           \
4444
+    vx4 = _mm_srai_epi16(vy4, 6);                                           \
4445
+    vx5 = _mm_srai_epi16(vy5, 6);                                           \
4446
+    vx6 = _mm_srai_epi16(vy6, 6);                                           \
4447
+    vx7 = _mm_srai_epi16(vy7, 6)
4448
+
4449
+static const int16_t ALIGNED(16) constants[4][8] = {
4450
+    { 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 },
4451
+    { 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 },
4452
+    { 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 },
4453
+    { 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 }
4454
+};
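As far as I can tell (my reading, not stated in the patch), this prescale table and the a0/a1/a2/c4 factors defined in the IDCT macro follow the usual libmpeg2 AltiVec derivation in Q15/Q14 fixed point:

    c4 = 23170 ~ round(2^15 * sqrt(1/2))
    a0 = 13573 ~ round(2^15 * tan(pi/8))
    a1 =  6518 ~ round(2^15 * tan(pi/16))
    a2 = 21895 ~ round(2^15 * tan(3*pi/16))
    constants[0][k] ~ round(2^14 * sqrt(2) * cos(k*pi/16)) for k = 1..3, and 2^14 for k = 0, 4
    constants[i][j] ~ round(constants[0][i] * constants[0][j] / 2^14) for i = 1..3

For example, 22725 * 22725 / 2^14 ~ 31520.6, matching constants[1][1] = 31521.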
4455
+
4456
+void ff_idct_e2k(int16_t *blk)
4457
+{
4458
+    vec_s16 *block = (vec_s16*)blk;
4459
+
4460
+    IDCT;
4461
+
4462
+    block[0] = vx0;
4463
+    block[1] = vx1;
4464
+    block[2] = vx2;
4465
+    block[3] = vx3;
4466
+    block[4] = vx4;
4467
+    block[5] = vx5;
4468
+    block[6] = vx6;
4469
+    block[7] = vx7;
4470
+}
4471
+
4472
+#define COPY(vx0, vx1, i)                 \
4473
+    tmp = _mm_packus_epi16(vx0, vx1);     \
4474
+    VEC_STL(dest, tmp); dest += stride;   \
4475
+    VEC_STH(dest, tmp); dest += stride
4476
+
4477
+static void idct_put_e2k(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
4478
+{
4479
+    vec_s16 *block = (vec_s16*)blk;
4480
+    vec_u8 tmp;
4481
+
4482
+    IDCT;
4483
+
4484
+    COPY(vx0, vx1, 0);
4485
+    COPY(vx2, vx3, 2);
4486
+    COPY(vx4, vx5, 4);
4487
+    COPY(vx6, vx7, 6);
4488
+}
4489
+
4490
+#define ADD(vx0, vx1, i)                  \
4491
+    tmp = VEC_LD8(dest);                  \
4492
+    t0 = _mm_unpacklo_epi8(tmp, zero);    \
4493
+    tmp = VEC_LD8(dest + stride);         \
4494
+    t1 = _mm_unpacklo_epi8(tmp, zero);    \
4495
+    t0 = _mm_adds_epi16(t0, vx0);         \
4496
+    t1 = _mm_adds_epi16(t1, vx1);         \
4497
+    tmp = _mm_packus_epi16(t0, t1);       \
4498
+    VEC_STL(dest, tmp); dest += stride;   \
4499
+    VEC_STH(dest, tmp); dest += stride
4500
+
4501
+static void idct_add_e2k(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
4502
+{
4503
+    vec_s16 *block = (vec_s16*)blk;
4504
+    vec_u8 tmp;
4505
+
4506
+    IDCT;
4507
+
4508
+    ADD(vx0, vx1, 0);
4509
+    ADD(vx2, vx3, 2);
4510
+    ADD(vx4, vx5, 4);
4511
+    ADD(vx6, vx7, 6);
4512
+}
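For reference, COPY and ADD each emit two rows: _mm_packus_epi16 clamps a pair of 16-bit row vectors to [0, 255], and VEC_STL/VEC_STH store the low and high 8 bytes to consecutive destination lines. Per pixel this reduces to the usual put/add semantics; a one-line scalar sketch of the ADD path, for illustration only ("coef" is a name used here for the already right-shifted row values, av_clip_uint8 is the existing libavutil helper):

    /* idct_put stores av_clip_uint8(coef[y][x]); idct_add stores this: */
    dest[x] = av_clip_uint8(dest[x] + coef[y][x]);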
4513
+
4514
+av_cold void ff_idctdsp_init_e2k(IDCTDSPContext *c, AVCodecContext *avctx,
4515
+                                 unsigned high_bit_depth)
4516
+{
4517
+    if (!E2K_BASE(av_get_cpu_flags()))
4518
+        return;
4519
+
4520
+    // !checkasm
4521
+    // libavcodec/tests/dct -i
4522
+
4523
+    if (!high_bit_depth && avctx->lowres == 0) {
4524
+        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
4525
+            (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
4526
+            c->idct      = ff_idct_e2k;
4527
+            c->idct_add  = idct_add_e2k; // untested
4528
+            c->idct_put  = idct_put_e2k; // untested
4529
+            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
4530
+        }
4531
+    }
4532
+}
4533
diff --git a/libavcodec/e2k/lossless_audiodsp.c b/libavcodec/e2k/lossless_audiodsp.c
4534
new file mode 100644
4535
index 0000000..1bb7c45
4536
--- /dev/null
4537
+++ b/libavcodec/e2k/lossless_audiodsp.c
4538
@@ -0,0 +1,75 @@
4539
+/*
4540
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
4541
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
4542
+ *
4543
+ * This file is part of FFmpeg.
4544
+ *
4545
+ * FFmpeg is free software; you can redistribute it and/or
4546
+ * modify it under the terms of the GNU Lesser General Public
4547
+ * License as published by the Free Software Foundation; either
4548
+ * version 2.1 of the License, or (at your option) any later version.
4549
+ *
4550
+ * FFmpeg is distributed in the hope that it will be useful,
4551
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4552
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4553
+ * Lesser General Public License for more details.
4554
+ *
4555
+ * You should have received a copy of the GNU Lesser General Public
4556
+ * License along with FFmpeg; if not, write to the Free Software
4557
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4558
+ */
4559
+
4560
+#include "config.h"
4561
+
4562
+#include "libavutil/attributes.h"
4563
+#include "libavutil/cpu.h"
4564
+#include "libavutil/e2k/cpu.h"
4565
+#include "libavutil/e2k/util_e2k.h"
4566
+
4567
+#include "libavcodec/lossless_audiodsp.h"
4568
+
4569
+#define GET_T(tt0, tt1, src, a, b) { \
4570
+    tt0 = VEC_LD(src);               \
4571
+    tt1 = VEC_LD(src + 8);           \
4572
+}
4573
+
4574
+static int32_t scalarproduct_and_madd_int16_e2k(int16_t *v1,
4575
+                                                const int16_t *v2,
4576
+                                                const int16_t *v3,
4577
+                                                int order, int mul)
4578
+{
4579
+    int i;
4580
+    LOAD_ZERO;
4581
+    vec_s16 *pv1 = (vec_s16*)v1;
4582
+    vec_s16 muls = _mm_set1_epi16(mul);
4583
+    vec_s16 t0, t1, i0, i1;
4584
+    vec_s32 res = zerov;
4585
+
4586
+    PRAGMA_E2K("ivdep")
4587
+    for (i = 0; i < order; i += 16) {
4588
+        GET_T(t0, t1, v2, i1, i2);
4589
+        i0 = pv1[0];
4590
+        i1 = pv1[1];
4591
+        t0 = _mm_madd_epi16(t0, i0);
4592
+        t1 = _mm_madd_epi16(t1, i1);
4593
+        res = _mm_add_epi32(res, _mm_add_epi32(t0, t1));
4594
+        GET_T(t0, t1, v3, i4, i3);
4595
+        pv1[0] = _mm_add_epi16(_mm_mullo_epi16(t0, muls), i0);
4596
+        pv1[1] = _mm_add_epi16(_mm_mullo_epi16(t1, muls), i1);
4597
+        pv1 += 2;
4598
+        v2 += 16;
4599
+        v3 += 16;
4600
+    }
4601
+
4602
+    res = _mm_hadd_epi32(res, res);
4603
+    return _mm_extract_epi32(res, 0) + _mm_extract_epi32(res, 1);
4604
+}
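For reference, the scalar contract implemented above (order is a multiple of 16 for this DSP hook): return the dot product of v1 and v2 taken before the update, while rewriting v1 as v1 + v3 * mul in wrapping int16 arithmetic. A sketch for illustration only (not part of the patch):

    static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3, int order, int mul)
    {
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res   += v1[i] * v2[i];   /* uses the old v1 value */
            v1[i] += v3[i] * mul;     /* then update in place  */
        }
        return res;
    }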
4605
+
4606
+av_cold void ff_llauddsp_init_e2k(LLAudDSPContext *c)
4607
+{
4608
+    if (!E2K_BASE(av_get_cpu_flags()))
4609
+        return;
4610
+
4611
+    // !checkasm
4612
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_e2k;
4613
+}
4614
diff --git a/libavcodec/e2k/lossless_videodsp.c b/libavcodec/e2k/lossless_videodsp.c
4615
new file mode 100644
4616
index 0000000..a055ac7
4617
--- /dev/null
4618
+++ b/libavcodec/e2k/lossless_videodsp.c
4619
@@ -0,0 +1,59 @@
4620
+/*
4621
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
4622
+ * Copyright (c) 2002 Brian Foley
4623
+ * Copyright (c) 2002 Dieter Shirley
4624
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
4625
+ *
4626
+ * This file is part of FFmpeg.
4627
+ *
4628
+ * FFmpeg is free software; you can redistribute it and/or
4629
+ * modify it under the terms of the GNU Lesser General Public
4630
+ * License as published by the Free Software Foundation; either
4631
+ * version 2.1 of the License, or (at your option) any later version.
4632
+ *
4633
+ * FFmpeg is distributed in the hope that it will be useful,
4634
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4635
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4636
+ * Lesser General Public License for more details.
4637
+ *
4638
+ * You should have received a copy of the GNU Lesser General Public
4639
+ * License along with FFmpeg; if not, write to the Free Software
4640
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4641
+ */
4642
+
4643
+#include "config.h"
4644
+
4645
+#include "libavutil/attributes.h"
4646
+#include "libavutil/cpu.h"
4647
+#include "libavutil/e2k/cpu.h"
4648
+#include "libavutil/e2k/util_e2k.h"
4649
+
4650
+#include "libavcodec/lossless_videodsp.h"
4651
+
4652
+static void add_bytes_e2k(uint8_t *dst, uint8_t *src, ptrdiff_t w)
4653
+{
4654
+    int i;
4655
+    __m128i vdst, vsrc;
4656
+
4657
+    /* dst and src are 16-byte aligned (guaranteed). */
4658
+    PRAGMA_E2K("ivdep")
4659
+    for (i = 0; i + 15 < w; i += 16) {
4660
+        vdst = _mm_load_si128((const __m128i*)(dst + i));
4661
+        vsrc = _mm_load_si128((const __m128i*)(src + i));
4662
+        vdst = _mm_add_epi8(vsrc, vdst);
4663
+        _mm_store_si128((__m128i*)(dst + i), vdst);
4664
+    }
4665
+    /* If w is not a multiple of 16. */
4666
+    PRAGMA_E2K("ivdep")
4667
+    for (; i < w; i++)
4668
+        dst[i] = dst[i] + src[i];
4669
+}
4670
+
4671
+av_cold void ff_llviddsp_init_e2k(LLVidDSPContext *c)
4672
+{
4673
+    if (!E2K_BASE(av_get_cpu_flags()))
4674
+        return;
4675
+
4676
+    // checkasm
4677
+    c->add_bytes = add_bytes_e2k;
4678
+}
4679
diff --git a/libavcodec/e2k/mdct15.c b/libavcodec/e2k/mdct15.c
4680
new file mode 100644
4681
index 0000000..9b3c809
4682
--- /dev/null
4683
+++ b/libavcodec/e2k/mdct15.c
4684
@@ -0,0 +1,187 @@
4685
+/*
4686
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
4687
+ * Copyright (c) 2013-2014 Mozilla Corporation
4688
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
4689
+ *
4690
+ * This file is part of FFmpeg.
4691
+ *
4692
+ * FFmpeg is free software; you can redistribute it and/or
4693
+ * modify it under the terms of the GNU Lesser General Public
4694
+ * License as published by the Free Software Foundation; either
4695
+ * version 2.1 of the License, or (at your option) any later version.
4696
+ *
4697
+ * FFmpeg is distributed in the hope that it will be useful,
4698
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4699
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4700
+ * Lesser General Public License for more details.
4701
+ *
4702
+ * You should have received a copy of the GNU Lesser General Public
4703
+ * License along with FFmpeg; if not, write to the Free Software
4704
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4705
+ */
4706
+
4707
+#include "config.h"
4708
+#include "libavutil/cpu.h"
4709
+#include "libavutil/e2k/cpu.h"
4710
+#include "libavutil/e2k/util_e2k.h"
4711
+
4712
+#include "libavutil/attributes.h"
4713
+#include "libavutil/common.h"
4714
+
4715
+#include "libavcodec/mdct15.h"
4716
+
4717
+#define CMUL(dre, dim, are, aim, bre, bim) do { \
4718
+        (dre) = (are) * (bre) - (aim) * (bim);  \
4719
+        (dim) = (are) * (bim) + (aim) * (bre);  \
4720
+    } while (0)
4721
+
4722
+#define CMUL3(c, a, b) CMUL((c).re, (c).im, (a).re, (a).im, (b).re, (b).im)
4723
+
4724
+static av_always_inline void fft5(float *out, float *in, FFTComplex exptab[2])
4725
+{
4726
+    __m128 z0r, z1r, z2r, z3r, z0i, z1i, z2i, z3i;
4727
+    __m128 t0r, t1r, t2r, t3r, t4r, t5r;
4728
+    __m128 t0i, t1i, t2i, t3i, t4i, t5i;
4729
+    __m128 i0r, i1r, i2r, i3r, i4r;
4730
+    __m128 i0i, i1i, i2i, i3i, i4i;
4731
+    __m128 e0r = _mm_set1_ps(exptab[0].re);
4732
+    __m128 e0i = _mm_set1_ps(exptab[0].im);
4733
+    __m128 e1r = _mm_set1_ps(exptab[1].re);
4734
+    __m128 e1i = _mm_set1_ps(exptab[1].im);
4735
+
4736
+    i0r = _mm_load_ps(in);
4737
+    i0i = _mm_load_ps(in + 4);
4738
+    i1r = _mm_load_ps(in + 8);
4739
+    i1i = _mm_load_ps(in + 12);
4740
+    i2r = _mm_load_ps(in + 16);
4741
+    i2i = _mm_load_ps(in + 20);
4742
+    i3r = _mm_load_ps(in + 24);
4743
+    i3i = _mm_load_ps(in + 28);
4744
+    i4r = _mm_load_ps(in + 32);
4745
+    i4i = _mm_load_ps(in + 36);
4746
+
4747
+    t0r = _mm_add_ps(i1r, i4r);
4748
+    t0i = _mm_add_ps(i1i, i4i);
4749
+    t1i = _mm_sub_ps(i1r, i4r);
4750
+    t1r = _mm_sub_ps(i1i, i4i);
4751
+    t2r = _mm_add_ps(i2r, i3r);
4752
+    t2i = _mm_add_ps(i2i, i3i);
4753
+    t3i = _mm_sub_ps(i2r, i3r);
4754
+    t3r = _mm_sub_ps(i2i, i3i);
4755
+
4756
+    t4r = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(i0r, i1r), i2r), i3r), i4r);
4757
+    t4i = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(i0i, i1i), i2i), i3i), i4i);
4758
+    _mm_store_ps(out + 0, t4r);
4759
+    _mm_store_ps(out + 4, t4i);
4760
+
4761
+    t4r = _mm_sub_ps(_mm_mul_ps(e0r, t2r), _mm_mul_ps(e1r, t0r));
4762
+    t4i = _mm_sub_ps(_mm_mul_ps(e0r, t2i), _mm_mul_ps(e1r, t0i));
4763
+    t0r = _mm_sub_ps(_mm_mul_ps(e0r, t0r), _mm_mul_ps(e1r, t2r));
4764
+    t0i = _mm_sub_ps(_mm_mul_ps(e0r, t0i), _mm_mul_ps(e1r, t2i));
4765
+    t5r = _mm_sub_ps(_mm_mul_ps(e0i, t3r), _mm_mul_ps(e1i, t1r));
4766
+    t5i = _mm_sub_ps(_mm_mul_ps(e0i, t3i), _mm_mul_ps(e1i, t1i));
4767
+    t1r = _mm_add_ps(_mm_mul_ps(e0i, t1r), _mm_mul_ps(e1i, t3r));
4768
+    t1i = _mm_add_ps(_mm_mul_ps(e0i, t1i), _mm_mul_ps(e1i, t3i));
4769
+
4770
+    z0r = _mm_sub_ps(t0r, t1r);
4771
+    z0i = _mm_sub_ps(t0i, t1i);
4772
+    z1r = _mm_add_ps(t4r, t5r);
4773
+    z1i = _mm_add_ps(t4i, t5i);
4774
+
4775
+    z2r = _mm_sub_ps(t4r, t5r);
4776
+    z2i = _mm_sub_ps(t4i, t5i);
4777
+    z3r = _mm_add_ps(t0r, t1r);
4778
+    z3i = _mm_add_ps(t0i, t1i);
4779
+
4780
+    _mm_store_ps(out +  8, _mm_add_ps(i0r, z3r));
4781
+    _mm_store_ps(out + 12, _mm_add_ps(i0i, z0i));
4782
+    _mm_store_ps(out + 16, _mm_add_ps(i0r, z2r));
4783
+    _mm_store_ps(out + 20, _mm_add_ps(i0i, z1i));
4784
+    _mm_store_ps(out + 24, _mm_add_ps(i0r, z1r));
4785
+    _mm_store_ps(out + 28, _mm_add_ps(i0i, z2i));
4786
+    _mm_store_ps(out + 32, _mm_add_ps(i0r, z0r));
4787
+    _mm_store_ps(out + 36, _mm_add_ps(i0i, z3i));
4788
+}
4789
+
4790
+#define CMUL4(c, a, b) CMUL((c).re, (c).im, tmp[k * 8 + a], tmp[k * 8 + 4 + a], (b).re, (b).im)
4791
+
4792
+static void fft15_e2k(FFTComplex *out, float *in, FFTComplex *exptab, ptrdiff_t stride)
4793
+{
4794
+    int k;
4795
+    DECLARE_ALIGNED(16, float, tmp)[5 * 8];
4796
+
4797
+    fft5(tmp, in, exptab + 19);
4798
+
4799
+    PRAGMA_E2K("ivdep")
4800
+    for (k = 0; k < 5; k++) {
4801
+        FFTComplex t[2];
4802
+
4803
+        CMUL4(t[0], 2, exptab[k]);
4804
+        CMUL4(t[1], 3, exptab[2 * k]);
4805
+        out[stride*k].re = tmp[k * 8]     + t[0].re + t[1].re;
4806
+        out[stride*k].im = tmp[k * 8 + 4] + t[0].im + t[1].im;
4807
+
4808
+        CMUL4(t[0], 2, exptab[k + 5]);
4809
+        CMUL4(t[1], 3, exptab[2 * (k + 5)]);
4810
+        out[stride*(k + 5)].re = tmp[k * 8]     + t[0].re + t[1].re;
4811
+        out[stride*(k + 5)].im = tmp[k * 8 + 4] + t[0].im + t[1].im;
4812
+
4813
+        CMUL4(t[0], 2, exptab[k + 10]);
4814
+        CMUL4(t[1], 3, exptab[2 * k + 5]);
4815
+        out[stride*(k + 10)].re = tmp[k * 8]     + t[0].re + t[1].re;
4816
+        out[stride*(k + 10)].im = tmp[k * 8 + 4] + t[0].im + t[1].im;
4817
+    }
4818
+}
4819
+
4820
+static void imdct15_half_e2k(MDCT15Context *s, float *dst, const float *src,
4821
+                             ptrdiff_t stride)
4822
+{
4823
+    DECLARE_ALIGNED(16, float, fft15in)[5 * 8];
4824
+    FFTComplex *z = (FFTComplex *)dst;
4825
+    int i, j, k, len8 = s->len4 >> 1, l_ptwo = 1 << s->ptwo_fft.nbits;
4826
+    const float *in1 = src, *in2 = src + (s->len2 - 1) * stride;
4827
+
4828
+    /* Reindex input, putting it into a buffer and doing an Nx15 FFT */
4829
+    for (i = 0; i < l_ptwo; i++) {
4830
+        PRAGMA_E2K("ivdep")
4831
+        for (k = j = 0; j < 15; j += 3, k += 8) {
4832
+            int k0 = s->pfa_prereindex[i * 15 + j];
4833
+            int k1 = s->pfa_prereindex[i * 15 + j + 1];
4834
+            int k2 = s->pfa_prereindex[i * 15 + j + 2];
4835
+            float are, aim; FFTComplex b;
4836
+
4837
+            are = in2[-k0 * stride]; aim = in1[k0 * stride];
4838
+            b = s->twiddle_exptab[k0 >> 1];
4839
+            fft15in[k    ] = are * b.re - aim * b.im;
4840
+            fft15in[k + 4] = are * b.im + aim * b.re;
4841
+            fft15in[k + 1] = 0;
4842
+            fft15in[k + 5] = 0;
4843
+
4844
+            are = in2[-k1 * stride]; aim = in1[k1 * stride];
4845
+            b = s->twiddle_exptab[k1 >> 1];
4846
+            fft15in[k + 2] = are * b.re - aim * b.im;
4847
+            fft15in[k + 6] = are * b.im + aim * b.re;
4848
+
4849
+            are = in2[-k2 * stride]; aim = in1[k2 * stride];
4850
+            b = s->twiddle_exptab[k2 >> 1];
4851
+            fft15in[k + 3] = are * b.re - aim * b.im;
4852
+            fft15in[k + 7] = are * b.im + aim * b.re;
4853
+        }
4854
+        fft15_e2k(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
4855
+    }
4856
+
4857
+    /* Then a 15xN FFT (where N is a power of two) */
4858
+    for (i = 0; i < 15; i++)
4859
+        s->ptwo_fft.fft_calc(&s->ptwo_fft, s->tmp + l_ptwo*i);
4860
+
4861
+    /* Reindex again, apply twiddles and output */
4862
+    s->postreindex(z, s->tmp, s->twiddle_exptab, s->pfa_postreindex, len8);
4863
+}
4864
+
4865
+av_cold void ff_mdct15_init_e2k(MDCT15Context *s)
4866
+{
4867
+    if (!E2K_BASE(av_get_cpu_flags()))
4868
+        return;
4869
+
4870
+    s->imdct_half  = imdct15_half_e2k;
4871
+}
4872
diff --git a/libavcodec/e2k/me_cmp.c b/libavcodec/e2k/me_cmp.c
4873
new file mode 100644
4874
index 0000000..e6eda38
4875
--- /dev/null
4876
+++ b/libavcodec/e2k/me_cmp.c
4877
@@ -0,0 +1,461 @@
4878
+/*
4879
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
4880
+ * Copyright (c) 2002 Brian Foley
4881
+ * Copyright (c) 2002 Dieter Shirley
4882
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
4883
+ *
4884
+ * This file is part of FFmpeg.
4885
+ *
4886
+ * FFmpeg is free software; you can redistribute it and/or
4887
+ * modify it under the terms of the GNU Lesser General Public
4888
+ * License as published by the Free Software Foundation; either
4889
+ * version 2.1 of the License, or (at your option) any later version.
4890
+ *
4891
+ * FFmpeg is distributed in the hope that it will be useful,
4892
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4893
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4894
+ * Lesser General Public License for more details.
4895
+ *
4896
+ * You should have received a copy of the GNU Lesser General Public
4897
+ * License along with FFmpeg; if not, write to the Free Software
4898
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4899
+ */
4900
+
4901
+#include "config.h"
4902
+
4903
+#include "libavutil/attributes.h"
4904
+#include "libavutil/cpu.h"
4905
+#include "libavutil/e2k/cpu.h"
4906
+#include "libavutil/e2k/util_e2k.h"
4907
+
4908
+#include "libavcodec/avcodec.h"
4909
+#include "libavcodec/mpegvideo.h"
4910
+#include "libavcodec/me_cmp.h"
4911
+
4912
+#define LOAD_PIX(v1, v2, pix) { \
4913
+    v1 = VEC_LD(pix);           \
4914
+    v2 = VEC_LD(pix + 1);       \
4915
+}
4916
+
4917
+static int sad16_x2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
4918
+                        ptrdiff_t stride, int h)
4919
+{
4920
+    int i;
4921
+    __m128i v0, v1, v2, sum = _mm_setzero_si128();
4922
+
4923
+    PRAGMA_E2K("ivdep")
4924
+    for (i = 0; i < h; i++) {
4925
+        LOAD_PIX(v1, v2, pix2);
4926
+        v0 = VEC_LD(pix1);
4927
+        v1 = _mm_avg_epu8(v1, v2);
4928
+        sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1));
4929
+
4930
+        pix1 += stride;
4931
+        pix2 += stride;
4932
+    }
4933
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2);
4934
+}
4935
+
4936
+static int sad8_x2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
4937
+                        ptrdiff_t stride, int h)
4938
+{
4939
+    int i;
4940
+    __m64 v0, v1, v2, sum = _mm_setzero_si64();
4941
+
4942
+    PRAGMA_E2K("ivdep")
4943
+    for (i = 0; i < h; i++) {
4944
+        v1 = *(__m64*)pix2;
4945
+        v2 = *(__m64*)(pix2 + 1);
4946
+        v0 = *(__m64*)pix1;
4947
+        v1 = _mm_avg_pu8(v1, v2);
4948
+        sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1));
4949
+
4950
+        pix1 += stride;
4951
+        pix2 += stride;
4952
+    }
4953
+    return _mm_extract_pi32(sum, 0);
4954
+}
4955
+
4956
+static int sad16_y2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
4957
+                        ptrdiff_t stride, int h)
4958
+{
4959
+    int i;
4960
+    __m128i v0, v1, v2, sum = _mm_setzero_si128();
4961
+
4962
+    v2 = VEC_LD(pix2);
4963
+    pix2 += stride;
4964
+
4965
+    PRAGMA_E2K("ivdep")
4966
+    for (i = 0; i < h; i++) {
4967
+        v1 = v2;
4968
+        v2 = VEC_LD(pix2);
4969
+        v0 = VEC_LD(pix1);
4970
+        v1 = _mm_avg_epu8(v1, v2);
4971
+        sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1));
4972
+        pix1 += stride;
4973
+        pix2 += stride;
4974
+    }
4975
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2);
4976
+}
4977
+
4978
+static int sad8_y2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
4979
+                        ptrdiff_t stride, int h)
4980
+{
4981
+    int i;
4982
+    __m64 v0, v1, v2, sum = _mm_setzero_si64();
4983
+
4984
+    v2 = *(__m64*)pix2;
4985
+    pix2 += stride;
4986
+
4987
+    PRAGMA_E2K("ivdep")
4988
+    for (i = 0; i < h; i++) {
4989
+        v1 = v2;
4990
+        v2 = *(__m64*)pix2;
4991
+        v0 = *(__m64*)pix1;
4992
+        v1 = _mm_avg_pu8(v1, v2);
4993
+        sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1));
4994
+        pix1 += stride;
4995
+        pix2 += stride;
4996
+    }
4997
+    return _mm_extract_pi32(sum, 0);
4998
+}
4999
+
5000
+static int sad16_xy2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5001
+                         ptrdiff_t stride, int h)
5002
+{
5003
+    int i;
5004
+    LOAD_ZERO;
5005
+    __m128i v0, v1, v2, sum = zerov;
5006
+    __m128i t0, t1, t2, t3, t4, t5;
5007
+    __m128i c2 = _mm_set1_epi16(2);
5008
+
5009
+    LOAD_PIX(v1, v2, pix2);
5010
+    t2 = _mm_unpacklo_epi8(v1, zerov);
5011
+    t3 = _mm_unpackhi_epi8(v1, zerov);
5012
+    t4 = _mm_unpacklo_epi8(v2, zerov);
5013
+    t5 = _mm_unpackhi_epi8(v2, zerov);
5014
+    t2 = _mm_add_epi16(t2, t4);
5015
+    t3 = _mm_add_epi16(t3, t5);
5016
+    pix2 += stride;
5017
+
5018
+    PRAGMA_E2K("ivdep")
5019
+    for (i = 0; i < h; i++) {
5020
+        t0 = t2; t1 = t3;
5021
+        LOAD_PIX(v1, v2, pix2);
5022
+        v0 = VEC_LD(pix1);
5023
+        t2 = _mm_unpacklo_epi8(v1, zerov);
5024
+        t3 = _mm_unpackhi_epi8(v1, zerov);
5025
+        t4 = _mm_unpacklo_epi8(v2, zerov);
5026
+        t5 = _mm_unpackhi_epi8(v2, zerov);
5027
+        t2 = _mm_add_epi16(t2, t4);
5028
+        t3 = _mm_add_epi16(t3, t5);
5029
+
5030
+        v1 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t2), c2), 2);
5031
+        v2 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t3), c2), 2);
5032
+        v1 = _mm_packus_epi16(v1, v2);
5033
+
5034
+        sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1));
5035
+        pix1 += stride;
5036
+        pix2 += stride;
5037
+    }
5038
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2);
5039
+}
5040
+
5041
+static int sad8_xy2_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5042
+                         ptrdiff_t stride, int h)
5043
+{
5044
+    int i;
5045
+    LOAD_ZERO;
5046
+    __m64 v0, sum = _mm_movepi64_pi64(zerov);
5047
+    __m128i v1, v2, t0, t1, t2, c2 = _mm_set1_epi16(2);
5048
+
5049
+    v1 = VEC_LD8(pix2);
5050
+    v2 = VEC_LD8(pix2 + 1);
5051
+    t1 = _mm_unpacklo_epi8(v1, zerov);
5052
+    t2 = _mm_unpacklo_epi8(v2, zerov);
5053
+    t1 = _mm_add_epi16(t1, t2);
5054
+    pix2 += stride;
5055
+
5056
+    PRAGMA_E2K("ivdep")
5057
+    for (i = 0; i < h; i++) {
5058
+        t0 = t1;
5059
+        v1 = VEC_LD8(pix2);
5060
+        v2 = VEC_LD8(pix2 + 1);
5061
+        v0 = *(__m64*)pix1;
5062
+        t1 = _mm_unpacklo_epi8(v1, zerov);
5063
+        t2 = _mm_unpacklo_epi8(v2, zerov);
5064
+        t1 = _mm_add_epi16(t1, t2);
5065
+
5066
+        v1 = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t1), c2), 2);
5067
+        v1 = _mm_packus_epi16(v1, v1);
5068
+
5069
+        sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, _mm_movepi64_pi64(v1)));
5070
+        pix1 += stride;
5071
+        pix2 += stride;
5072
+    }
5073
+    return _mm_extract_pi32(sum, 0);
5074
+}
5075
+
5076
+static int sad16_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5077
+                     ptrdiff_t stride, int h)
5078
+{
5079
+    int i;
5080
+    __m128i v0, v1, sum = _mm_setzero_si128();
5081
+
5082
+    PRAGMA_E2K("ivdep")
5083
+    for (i = 0; i < h; i++) {
5084
+        v0 = VEC_LD(pix1);
5085
+        v1 = VEC_LD(pix2);
5086
+        sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, v1));
5087
+        pix1 += stride;
5088
+        pix2 += stride;
5089
+    }
5090
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2);
5091
+}
5092
+
5093
+static int sad8_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5094
+                    ptrdiff_t stride, int h)
5095
+{
5096
+    int i;
5097
+    __m64 v0, v1, sum = _mm_setzero_si64();
5098
+
5099
+    PRAGMA_E2K("ivdep")
5100
+    for (i = 0; i < h; i++) {
5101
+        v0 = *(__m64*)pix1;
5102
+        v1 = *(__m64*)pix2;
5103
+        sum = _mm_add_pi32(sum, _mm_sad_pu8(v0, v1));
5104
+        pix1 += stride;
5105
+        pix2 += stride;
5106
+    }
5107
+    return _mm_extract_pi32(sum, 0);
5108
+}
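For reference, _mm_sad_epu8 yields two 64-bit partial sums (one per 8-byte lane), which is why the 16-pixel SAD routines above fold the lanes with _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2), while the 8-pixel MMX-style ones read a single lane. The scalar contract, as an illustration only (not part of the patch):

    /* Scalar reference: sum of absolute differences over a w x h block. */
    static int sad_ref(const uint8_t *pix1, const uint8_t *pix2,
                       ptrdiff_t stride, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int d = pix1[x] - pix2[x];
                sum += d < 0 ? -d : d;
            }
            pix1 += stride;
            pix2 += stride;
        }
        return sum;
    }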
5109
+
5110
+/* Sum of Squared Errors for an 8x8 block. */
5111
+static int sse8_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5112
+                    ptrdiff_t stride, int h)
5113
+{
5114
+    int i;
5115
+    LOAD_ZERO;
5116
+    __m128i v0, v1, sum = zerov;
5117
+
5118
+    PRAGMA_E2K("ivdep")
5119
+    for (i = 0; i < h; i++) {
5120
+        v0 = VEC_LD8(pix1);
5121
+        v1 = VEC_LD8(pix2);
5122
+        v0 = _mm_unpacklo_epi8(v0, zerov);
5123
+        v1 = _mm_unpacklo_epi8(v1, zerov);
5124
+        v0 = _mm_sub_epi16(v0, v1);
5125
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0));
5126
+        pix1 += stride;
5127
+        pix2 += stride;
5128
+    }
5129
+    sum = _mm_hadd_epi32(sum, sum);
5130
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5131
+}
5132
+
5133
+/* Sum of Squared Errors for a 16x16 block. */
5134
+static int sse16_e2k(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
5135
+                     ptrdiff_t stride, int h)
5136
+{
5137
+    int i;
5138
+    LOAD_ZERO;
5139
+    __m128i v0, v1, v2, v3, sum = zerov;
5140
+
5141
+    PRAGMA_E2K("ivdep")
5142
+    for (i = 0; i < h; i++) {
5143
+        v2 = VEC_LD(pix1);
5144
+        v3 = VEC_LD(pix2);
5145
+        v0 = _mm_unpacklo_epi8(v2, zerov);
5146
+        v1 = _mm_unpacklo_epi8(v3, zerov);
5147
+        v2 = _mm_unpackhi_epi8(v2, zerov);
5148
+        v3 = _mm_unpackhi_epi8(v3, zerov);
5149
+        v0 = _mm_sub_epi16(v0, v1);
5150
+        v2 = _mm_sub_epi16(v2, v3);
5151
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0));
5152
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v2, v2));
5153
+        pix1 += stride;
5154
+        pix2 += stride;
5155
+    }
5156
+    sum = _mm_hadd_epi32(sum, sum);
5157
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5158
+}
5159
+
5160
+#define HADAMARD8_FIN(t, sum) \
5161
+    v0 = _mm_add_epi16(t##0, t##1); \
5162
+    v1 = _mm_sub_epi16(t##0, t##1); \
5163
+    v2 = _mm_add_epi16(t##2, t##3); \
5164
+    v3 = _mm_sub_epi16(t##2, t##3); \
5165
+    v4 = _mm_add_epi16(t##4, t##5); \
5166
+    v5 = _mm_sub_epi16(t##4, t##5); \
5167
+    v6 = _mm_add_epi16(t##6, t##7); \
5168
+    v7 = _mm_sub_epi16(t##6, t##7); \
5169
+    \
5170
+    t0 = _mm_add_epi16(v0, v2); \
5171
+    t2 = _mm_sub_epi16(v0, v2); \
5172
+    t1 = _mm_add_epi16(v1, v3); \
5173
+    t3 = _mm_sub_epi16(v1, v3); \
5174
+    t4 = _mm_add_epi16(v4, v6); \
5175
+    t6 = _mm_sub_epi16(v4, v6); \
5176
+    t5 = _mm_add_epi16(v5, v7); \
5177
+    t7 = _mm_sub_epi16(v5, v7); \
5178
+    \
5179
+    v0 = _mm_add_epi16(t0, t4); \
5180
+    v4 = _mm_sub_epi16(t0, t4); \
5181
+    v1 = _mm_add_epi16(t1, t5); \
5182
+    v5 = _mm_sub_epi16(t1, t5); \
5183
+    v2 = _mm_add_epi16(t2, t6); \
5184
+    v6 = _mm_sub_epi16(t2, t6); \
5185
+    v3 = _mm_add_epi16(t3, t7); \
5186
+    v7 = _mm_sub_epi16(t3, t7); \
5187
+    \
5188
+    v0 = _mm_madd_epi16(_mm_abs_epi16(v0), onev); \
5189
+    v1 = _mm_madd_epi16(_mm_abs_epi16(v1), onev); \
5190
+    v2 = _mm_madd_epi16(_mm_abs_epi16(v2), onev); \
5191
+    v3 = _mm_madd_epi16(_mm_abs_epi16(v3), onev); \
5192
+    v4 = _mm_madd_epi16(_mm_abs_epi16(v4), onev); \
5193
+    v5 = _mm_madd_epi16(_mm_abs_epi16(v5), onev); \
5194
+    v6 = _mm_madd_epi16(_mm_abs_epi16(v6), onev); \
5195
+    v7 = _mm_madd_epi16(_mm_abs_epi16(v7), onev); \
5196
+    \
5197
+    v0 = _mm_add_epi32(v0, v1); \
5198
+    v2 = _mm_add_epi32(v2, v3); \
5199
+    v4 = _mm_add_epi32(v4, v5); \
5200
+    v6 = _mm_add_epi32(v6, v7); \
5201
+    v0 = _mm_add_epi32(v0, v2); \
5202
+    v4 = _mm_add_epi32(v4, v6); \
5203
+    sum = _mm_add_epi32(v0, v4);
5204
+
5205
+static int hadamard8_diff_e2k(MpegEncContext *s, uint8_t *dst,
5206
+                              uint8_t *src, ptrdiff_t stride, int h)
5207
+{
5208
+    LOAD_ZERO;
5209
+    vec_s16 v0, v1, v2, v3, v4, v5, v6, v7;
5210
+    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, sum;
5211
+    const vec_s16 onev = _mm_set1_epi16(1);
5212
+    const vec_s16 vprod1 = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1);
5213
+    const vec_s16 vprod2 = _mm_setr_epi16(1, 1, -1, -1, 1, 1, -1, -1);
5214
+    const vec_s16 vprod3 = _mm_setr_epi16(1, 1, 1, 1, -1, -1, -1, -1);
5215
+    const vec_u8 perm1 = _mm_setr_epi8(
5216
+          0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
5217
+          0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
5218
+
5219
+#define ITER(i) {                                         \
5220
+    v0 = VEC_LD8(src + stride * i);                       \
5221
+    v1 = VEC_LD8(dst + stride * i);                       \
5222
+    v0 = _mm_unpacklo_epi8(v0, zerov);                    \
5223
+    v1 = _mm_unpacklo_epi8(v1, zerov);                    \
5224
+    v0 = _mm_sub_epi16(v0, v1);                           \
5225
+    v1 = _mm_shuffle_epi8(v0, perm1);                     \
5226
+    v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod1), v1);   \
5227
+    v1 = _mm_shuffle_epi32(v0, 0xb1);                     \
5228
+    v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod2), v1);   \
5229
+    v1 = _mm_shuffle_epi32(v0, 0x4e);                     \
5230
+    t##i = _mm_add_epi16(_mm_sign_epi16(v0, vprod3), v1); \
5231
+}
5232
+    ITER(0); ITER(1); ITER(2); ITER(3);
5233
+    ITER(4); ITER(5); ITER(6); ITER(7);
5234
+#undef ITER
5235
+
5236
+    HADAMARD8_FIN(t, sum)
5237
+
5238
+    sum = _mm_hadd_epi32(sum, sum);
5239
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5240
+}
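For reference, this appears to be the usual SATD measure: the 8x8 Hadamard transform of the difference block, then the sum of absolute transformed values (the abs/madd-with-onev step in HADAMARD8_FIN). The shuffle and butterfly order above differs from a textbook transform, but the sum of absolute values should come out the same. A plain scalar sketch, for illustration only (not part of the patch):

    /* In-place 8-point Walsh-Hadamard transform (unnormalized). */
    static void wht8_ref(int *v)
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int j = 0; j < 8; j += step << 1)
                for (int k = j; k < j + step; k++) {
                    int a = v[k], b = v[k + step];
                    v[k]        = a + b;
                    v[k + step] = a - b;
                }
    }

    /* Scalar reference for hadamard8_diff: SATD of (src - dst). */
    static int hadamard8_diff_ref(const uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
    {
        int d[8][8], col[8], sum = 0;
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                d[y][x] = src[y * stride + x] - dst[y * stride + x];
        for (int y = 0; y < 8; y++)        /* transform rows */
            wht8_ref(d[y]);
        for (int x = 0; x < 8; x++) {      /* transform columns, accumulate |.| */
            for (int y = 0; y < 8; y++) col[y] = d[y][x];
            wht8_ref(col);
            for (int y = 0; y < 8; y++) sum += col[y] < 0 ? -col[y] : col[y];
        }
        return sum;
    }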
5241
+
5242
+#if 1
5243
+static int hadamard8_diff16_e2k(MpegEncContext *s, uint8_t *dst,
5244
+                                uint8_t *src, ptrdiff_t stride, int h)
5245
+{
5246
+    LOAD_ZERO;
5247
+    vec_s16 v0, v1, v2, v3, v4, v5, v6, v7;
5248
+    vec_s16 x0, x1, x2, x3, x4, x5, x6, x7;
5249
+    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, sum = zerov;
5250
+    const vec_s16 onev = _mm_set1_epi16(1);
5251
+    const vec_s16 vprod1 = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1);
5252
+    const vec_s16 vprod2 = _mm_setr_epi16(1, 1, -1, -1, 1, 1, -1, -1);
5253
+    const vec_s16 vprod3 = _mm_setr_epi16(1, 1, 1, 1, -1, -1, -1, -1);
5254
+    const vec_u8 perm1 = _mm_setr_epi8(
5255
+          0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
5256
+          0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
5257
+    int i;
5258
+
5259
+    PRAGMA_E2K("ivdep")
5260
+    for (i = 0; i < h; i += 8) {
5261
+
5262
+#define ITER(i) {                                         \
5263
+    v2 = VEC_LD(src + stride * i);                        \
5264
+    v3 = VEC_LD(dst + stride * i);                        \
5265
+    v0 = _mm_unpacklo_epi8(v2, zerov);                    \
5266
+    v1 = _mm_unpacklo_epi8(v3, zerov);                    \
5267
+    v2 = _mm_unpackhi_epi8(v2, zerov);                    \
5268
+    v3 = _mm_unpackhi_epi8(v3, zerov);                    \
5269
+    v0 = _mm_sub_epi16(v0, v1);                           \
5270
+    v2 = _mm_sub_epi16(v2, v3);                           \
5271
+    v1 = _mm_shuffle_epi8(v0, perm1);                     \
5272
+    v3 = _mm_shuffle_epi8(v2, perm1);                     \
5273
+    v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod1), v1);   \
5274
+    v2 = _mm_add_epi16(_mm_sign_epi16(v2, vprod1), v3);   \
5275
+    v1 = _mm_shuffle_epi32(v0, 0xb1);                     \
5276
+    v3 = _mm_shuffle_epi32(v2, 0xb1);                     \
5277
+    v0 = _mm_add_epi16(_mm_sign_epi16(v0, vprod2), v1);   \
5278
+    v2 = _mm_add_epi16(_mm_sign_epi16(v2, vprod2), v3);   \
5279
+    v1 = _mm_shuffle_epi32(v0, 0x4e);                     \
5280
+    v3 = _mm_shuffle_epi32(v2, 0x4e);                     \
5281
+    t##i = _mm_add_epi16(_mm_sign_epi16(v0, vprod3), v1); \
5282
+    x##i = _mm_add_epi16(_mm_sign_epi16(v2, vprod3), v3); \
5283
+}
5284
+      ITER(0); ITER(1); ITER(2); ITER(3);
5285
+      ITER(4); ITER(5); ITER(6); ITER(7);
5286
+#undef ITER
5287
+
5288
+      HADAMARD8_FIN(t, v0)
5289
+      sum = _mm_add_epi32(sum, v0);
5290
+      HADAMARD8_FIN(x, v0)
5291
+      sum = _mm_add_epi32(sum, v0);
5292
+      dst += 8 * stride;
5293
+      src += 8 * stride;
5294
+    }
5295
+    sum = _mm_hadd_epi32(sum, sum);
5296
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5297
+}
5298
+#else
5299
+static int hadamard8_diff16_e2k(MpegEncContext *s, uint8_t *dst,
5300
+                                uint8_t *src, ptrdiff_t stride, int h)
5301
+{
5302
+    int i, score = 0;
5303
+    for (i = 0; i < h; i += 8) {
5304
+        score += hadamard8_diff_e2k(s, dst, src, stride, 8);
5305
+        score += hadamard8_diff_e2k(s, dst + 8, src + 8, stride, 8);
5306
+        dst += 8 * stride;
5307
+        src += 8 * stride;
5308
+    }
5309
+    return score;
5310
+}
5311
+#endif
5312
+
5313
+av_cold void ff_me_cmp_init_e2k(MECmpContext *c, AVCodecContext *avctx)
5314
+{
5315
+    if (!E2K_BASE(av_get_cpu_flags()))
5316
+        return;
5317
+
5318
+    // !checkasm
5319
+
5320
+    // fate lavf-mxf
5321
+    c->pix_abs[0][0] = sad16_e2k;
5322
+    c->pix_abs[0][1] = sad16_x2_e2k;
5323
+    c->pix_abs[0][2] = sad16_y2_e2k;
5324
+    c->pix_abs[0][3] = sad16_xy2_e2k;
5325
+    c->pix_abs[1][0] = sad8_e2k;
5326
+    c->pix_abs[1][1] = sad8_x2_e2k;
5327
+    c->pix_abs[1][2] = sad8_y2_e2k;
5328
+    c->pix_abs[1][3] = sad8_xy2_e2k;
5329
+
5330
+    c->sad[0] = sad16_e2k;
5331
+    c->sad[1] = sad8_e2k;
5332
+    c->sse[0] = sse16_e2k;
5333
+    c->sse[1] = sse8_e2k;
5334
+
5335
+    // fate vsynth1-mpeg4-qprd
5336
+    c->hadamard8_diff[0] = hadamard8_diff16_e2k;
5337
+    c->hadamard8_diff[1] = hadamard8_diff_e2k;
5338
+}
5339
diff --git a/libavcodec/e2k/mpegaudiodsp.c b/libavcodec/e2k/mpegaudiodsp.c
5340
new file mode 100644
5341
index 0000000..2751453
5342
--- /dev/null
5343
+++ b/libavcodec/e2k/mpegaudiodsp.c
5344
@@ -0,0 +1,142 @@
5345
+/*
5346
+ * Elbrus optimized MP3 decoding functions
5347
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5348
+ * Copyright (c) 2010 Vitor Sessak
5349
+ *
5350
+ * This file is part of FFmpeg.
5351
+ *
5352
+ * FFmpeg is free software; you can redistribute it and/or
5353
+ * modify it under the terms of the GNU Lesser General Public
5354
+ * License as published by the Free Software Foundation; either
5355
+ * version 2.1 of the License, or (at your option) any later version.
5356
+ *
5357
+ * FFmpeg is distributed in the hope that it will be useful,
5358
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5359
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5360
+ * Lesser General Public License for more details.
5361
+ *
5362
+ * You should have received a copy of the GNU Lesser General Public
5363
+ * License along with FFmpeg; if not, write to the Free Software
5364
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5365
+ */
5366
+
5367
+#include "config.h"
5368
+#include "libavutil/attributes.h"
5369
+#include "libavutil/cpu.h"
5370
+#include "libavutil/internal.h"
5371
+#include "libavutil/e2k/cpu.h"
5372
+#include "libavutil/e2k/util_e2k.h"
5373
+#include "libavcodec/mpegaudiodsp.h"
5374
+#include "libavcodec/mpegaudio.h"
5375
+
5376
+#define MACS(rt, ra, rb) rt += (ra) * (rb)
5377
+#define MLSS(rt, ra, rb) rt -= (ra) * (rb)
5378
+
5379
+#define SUM8(op, sum, w, p) {             \
5380
+    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
5381
+    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
5382
+    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
5383
+    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
5384
+    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
5385
+    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
5386
+    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
5387
+    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
5388
+}
5389
+
5390
+static av_always_inline
5391
+void apply_window(const float *buf, const float *win1,
5392
+                  const float *win2, float *sum1, float *sum2)
5393
+{
5394
+    vec_f v0, v1, v2, v3, v4, v5;
5395
+    int i;
5396
+
5397
+#define MULT(j)                                   \
5398
+    v1 = _mm_load_ps(win1 + j * 64);              \
5399
+    v2 = _mm_load_ps(win2 + j * 16);              \
5400
+    v3 = _mm_load_ps(buf + j * 64);               \
5401
+    v0 = _mm_sub_ps(v0, _mm_mul_ps(v3, v1));      \
5402
+    v4 = _mm_sub_ps(v4, _mm_mul_ps(v2, v3))
5403
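+    /* Each MULT(j) step accumulates four lanes of -buf[]*win1[] into v0 and
+     * -buf[]*win2[] into v4; win1 and buf advance 64 floats per tap (the same
+     * layout as SUM8 above), while win2 advances 16 floats per tap. */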
+
5404
+    v0 = v4 = _mm_setzero_ps();
5405
+    MULT(0); MULT(1); MULT(2); MULT(3);
5406
+    MULT(4); MULT(5); MULT(6); MULT(7);
5407
+
5408
+    PRAGMA_E2K("ivdep")
5409
+    PRAGMA_E2K("unroll(3)")
5410
+    for (i = 4; i < 16; i += 4) {
5411
+        win1 += 4; win2 += 4; buf += 4;
5412
+        _mm_store_ps(sum1, v0); v5 = v4;
5413
+
5414
+        v0 = v4 = _mm_setzero_ps();
5415
+        MULT(0); MULT(1); MULT(2); MULT(3);
5416
+        MULT(4); MULT(5); MULT(6); MULT(7);
5417
+        _mm_store_ps(sum2, _mm_alignr_ps(v4, v5, 1));
5418
+        sum1 += 4; sum2 += 4;
5419
+    }
5420
+    _mm_store_ps(sum1, v0);
5421
+    _mm_store_ps(sum2, _mm_bsrli_ps(v4, 1));
5422
+
5423
+#undef MULT
5424
+}
5425
+
5426
+static void apply_window_e2k(float *in, float *win, int *unused, float *out,
5427
+                             ptrdiff_t incr)
5428
+{
5429
+    float ALIGNED(16) suma[16];
5430
+    float ALIGNED(16) sumb[16];
5431
+    float ALIGNED(16) sumc[16];
5432
+    float ALIGNED(16) sumd[16];
5433
+    float sum;
5434
+
5435
+    /* copy to avoid wrap */
5436
+    memcpy(in + 512, in, 32 * sizeof(*in));
5437
+
5438
+    apply_window(in + 16, win     , win + 512, suma, sumc);
5439
+    apply_window(in + 32, win + 48, win + 640, sumb, sumd);
5440
+
5441
+    sum = suma[0];
5442
+    SUM8(MACS, sum, win + 32, in + 48);
5443
+    suma[0] = sum;
5444
+
5445
+#define SUMS(a, b)                              \
5446
+    v0 = _mm_load_ps(sumd + b);                 \
5447
+    v1 = _mm_load_ps(sumc + a);                 \
5448
+    v0 = _mm_shuffle_ps(v0, v0, 0x1b);          \
5449
+    v1 = _mm_shuffle_ps(v1, v1, 0x1b);          \
5450
+    v0 = _mm_sub_ps(v0, _mm_load_ps(suma + a)); \
5451
+    v1 = _mm_add_ps(v1, _mm_load_ps(sumb + b)); \
5452
+    _mm_storeu_ps(out + a, v0);                 \
5453
+    _mm_storeu_ps(out + b + 16, v1)
5454
+
5455
+    if (incr == 1) {
5456
+        vec_f v0, v1;
5457
+        SUMS(0, 12); SUMS(4, 8); SUMS(8, 4); SUMS(12, 0);
5458
+        out += 16 * incr;
5459
+    } else {
5460
+        int j;
5461
+        float *out2 = out + 32 * incr;
5462
+        out[0] = -suma[0];
5463
+        out += incr;
5464
+        out2 -= incr;
5465
+        PRAGMA_E2K("ivdep")
5466
+        for (j = 1; j < 16; j++) {
5467
+            *out  = sumd[15 - j] - suma[j];
5468
+            *out2 = sumb[16 - j] + sumc[j - 1];
5469
+            out  += incr;
5470
+            out2 -= incr;
5471
+        }
5472
+    }
5473
+
5474
+    sum = 0;
5475
+    SUM8(MLSS, sum, win + 16 + 32, in + 32);
5476
+    *out = sum;
5477
+}
5478
+
5479
+av_cold void ff_mpadsp_init_e2k(MPADSPContext *s)
5480
+{
5481
+    if (!E2K_BASE(av_get_cpu_flags()))
5482
+        return;
5483
+
5484
+    // !checkasm
5485
+    s->apply_window_float = apply_window_e2k; // fate audiomatch-square-mp3
5486
+}
5487
diff --git a/libavcodec/e2k/mpegvideo.c b/libavcodec/e2k/mpegvideo.c
5488
new file mode 100644
5489
index 0000000..36bf975
5490
--- /dev/null
5491
+++ b/libavcodec/e2k/mpegvideo.c
5492
@@ -0,0 +1,100 @@
5493
+/*
5494
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5495
+ * Copyright (c) 2002 Dieter Shirley
5496
+ *
5497
+ * This file is part of FFmpeg.
5498
+ *
5499
+ * FFmpeg is free software; you can redistribute it and/or
5500
+ * modify it under the terms of the GNU Lesser General Public
5501
+ * License as published by the Free Software Foundation; either
5502
+ * version 2.1 of the License, or (at your option) any later version.
5503
+ *
5504
+ * FFmpeg is distributed in the hope that it will be useful,
5505
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5506
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5507
+ * Lesser General Public License for more details.
5508
+ *
5509
+ * You should have received a copy of the GNU Lesser General Public
5510
+ * License along with FFmpeg; if not, write to the Free Software
5511
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5512
+ */
5513
+
5514
+#include <stdlib.h>
5515
+#include <stdio.h>
5516
+
5517
+#include "config.h"
5518
+
5519
+#include "libavutil/attributes.h"
5520
+#include "libavutil/cpu.h"
5521
+#include "libavutil/e2k/cpu.h"
5522
+#include "libavutil/e2k/util_e2k.h"
5523
+
5524
+#include "libavcodec/mpegvideo.h"
5525
+
5526
+/* this code assumes `block' is 16-byte aligned */
5527
+static void dct_unquantize_h263_intra_e2k(MpegEncContext *s,
5528
+                                          int16_t *block, int n, int qscale)
5529
+{
5530
+    int level, qmul, qadd = 0, nCoeffs = 63, j;
5531
+    __m128i qmulv, qaddv, v0, v1;
5532
+
5533
+    qmul = qscale << 1;
5534
+    level = block[0];
5535
+
5536
+    if (!s->h263_aic) {
5537
+        level *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
5538
+        qadd = (qscale - 1) | 1;
5539
+    } else {
5540
+        av_assert2(s->block_last_index[n] >= 0);
5541
+    }
5542
+    if (!s->ac_pred) {
5543
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
5544
+    }
5545
+
5546
+    qmulv = _mm_set1_epi16(qmul);
5547
+    qaddv = _mm_set1_epi16(qadd);
5548
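+    /* Vector form of the scalar unquantize step
+     *   block[j] = block[j] * qmul + (block[j] > 0 ? qadd : block[j] < 0 ? -qadd : 0);
+     * _mm_sign_epi16(qaddv, v0) copies each coefficient's sign onto qadd and
+     * zeroes the lanes where the coefficient is zero. */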
+    PRAGMA_E2K("ivdep")
5549
+    for (j = 0; j <= nCoeffs; j += 8) {
5550
+        v0 = _mm_load_si128((const __m128i*)(block + j));
5551
+        v1 = _mm_mullo_epi16(v0, qmulv);
5552
+        v1 = _mm_add_epi16(v1, _mm_sign_epi16(qaddv, v0));
5553
+        _mm_store_si128((__m128i*)(block + j), v1);
5554
+    }
5555
+
5556
+    block[0] = level;
5557
+}
5558
+
5559
+static void dct_unquantize_h263_inter_e2k(MpegEncContext *s,
5560
+                                          int16_t *block, int n, int qscale)
5561
+{
5562
+    int qmul, qadd, nCoeffs, j;
5563
+    __m128i qmulv, qaddv, v0, v1;
5564
+
5565
+    qmul = qscale << 1;
5566
+    qadd = (qscale - 1) | 1;
5567
+
5568
+    av_assert2(s->block_last_index[n] >= 0 || s->h263_aic);
5569
+    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
5570
+
5571
+    qmulv = _mm_set1_epi16(qmul);
5572
+    qaddv = _mm_set1_epi16(qadd);
5573
+    PRAGMA_E2K("ivdep")
5574
+    for (j = 0; j <= nCoeffs; j += 8) {
5575
+        v0 = _mm_load_si128((const __m128i*)(block + j));
5576
+        v1 = _mm_mullo_epi16(v0, qmulv);
5577
+        v1 = _mm_add_epi16(v1, _mm_sign_epi16(qaddv, v0));
5578
+        _mm_store_si128((__m128i*)(block + j), v1);
5579
+    }
5580
+}
5581
+
5582
+av_cold void ff_mpv_common_init_e2k(MpegEncContext *s)
5583
+{
5584
+    if (!E2K_BASE(av_get_cpu_flags()))
5585
+        return;
5586
+
5587
+    // !checkasm
5588
+    // fate flv-add_keyframe_index
5589
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_e2k;
5590
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_e2k;
5591
+}
5592
+
5593
diff --git a/libavcodec/e2k/mpegvideodsp.c b/libavcodec/e2k/mpegvideodsp.c
5594
new file mode 100644
5595
index 0000000..3d44735
5596
--- /dev/null
5597
+++ b/libavcodec/e2k/mpegvideodsp.c
5598
@@ -0,0 +1,86 @@
5599
+/*
5600
+ * GMC (Global Motion Compensation)
5601
+ *
5602
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5603
+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
5604
+ *
5605
+ * This file is part of FFmpeg.
5606
+ *
5607
+ * FFmpeg is free software; you can redistribute it and/or
5608
+ * modify it under the terms of the GNU Lesser General Public
5609
+ * License as published by the Free Software Foundation; either
5610
+ * version 2.1 of the License, or (at your option) any later version.
5611
+ *
5612
+ * FFmpeg is distributed in the hope that it will be useful,
5613
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5614
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5615
+ * Lesser General Public License for more details.
5616
+ *
5617
+ * You should have received a copy of the GNU Lesser General Public
5618
+ * License along with FFmpeg; if not, write to the Free Software
5619
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5620
+ */
5621
+
5622
+#include "libavutil/cpu.h"
5623
+#include "libavutil/mem.h"
5624
+#include "libavutil/e2k/cpu.h"
5625
+#include "libavutil/e2k/util_e2k.h"
5626
+
5627
+#include "libavcodec/mpegvideodsp.h"
5628
+
5629
+/* ATM this code assumes stride is a multiple of 8
5630
+ * to preserve proper dst alignment. */
5631
+static void gmc1_e2k(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */,
5632
+                     int stride, int h, int x16, int y16, int rounder)
5633
+{
5634
+    int i;
5635
+    LOAD_ZERO;
5636
+    vec_u8 dstv, srcvA, srcvB;
5637
+    vec_u16 t0, t1, t2, t3;
5638
+
5639
+    vec_u16 Av = _mm_set1_epi16((16 - x16) * (16 - y16));
5640
+    vec_u16 Bv = _mm_set1_epi16(      x16  * (16 - y16));
5641
+    vec_u16 Cv = _mm_set1_epi16((16 - x16) * y16);
5642
+    vec_u16 Dv = _mm_set1_epi16(      x16  * y16);
5643
+    vec_u16 rounderV = _mm_set1_epi16(rounder);
5644
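+    /* Bilinear GMC: each output pixel is
+     *   (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + rounder) >> 8
+     * with the A..D weights built from the 1/16-pel fractions x16 and y16. */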
+
5645
+    vec_u8 srcvC = VEC_LD8(src);
5646
+    vec_u8 srcvD = VEC_LD8(src + 1);
5647
+    srcvC = _mm_unpacklo_epi8(srcvC, zerov);
5648
+    srcvD = _mm_unpacklo_epi8(srcvD, zerov);
5649
+
5650
+    PRAGMA_E2K("ivdep")
5651
+    for (i = 0; i < h; i++) {
5652
+        src += stride;
5653
+
5654
+        srcvA = srcvC;
5655
+        srcvB = srcvD;
5656
+        srcvC = VEC_LD8(src);
5657
+        srcvD = VEC_LD8(src + 1);
5658
+        srcvC = _mm_unpacklo_epi8(srcvC, zerov);
5659
+        srcvD = _mm_unpacklo_epi8(srcvD, zerov);
5660
+
5661
+        t0 = _mm_mullo_epi16(srcvA, Av);
5662
+        t1 = _mm_mullo_epi16(srcvB, Bv);
5663
+        t0 = _mm_add_epi16(t0, t1);
5664
+        t2 = _mm_mullo_epi16(srcvC, Cv);
5665
+        t3 = _mm_mullo_epi16(srcvD, Dv);
5666
+        t0 = _mm_add_epi16(t0, rounderV);
5667
+        t2 = _mm_add_epi16(t2, t3);
5668
+        t0 = _mm_add_epi16(t0, t2);
5669
+        t0 = _mm_srli_epi16(t0, 8);
5670
+        dstv = _mm_packus_epi16(t0, t0);
5671
+
5672
+        VEC_STL(dst, dstv);
5673
+        dst += stride;
5674
+    }
5675
+}
5676
+
5677
+av_cold void ff_mpegvideodsp_init_e2k(MpegVideoDSPContext *c)
5678
+{
5679
+    if (!E2K_BASE(av_get_cpu_flags()))
5680
+        return;
5681
+
5682
+    // !checkasm
5683
+    c->gmc1 = gmc1_e2k;
5684
+}
5685
diff --git a/libavcodec/e2k/mpegvideoencdsp.c b/libavcodec/e2k/mpegvideoencdsp.c
5686
new file mode 100644
5687
index 0000000..c5d3e4d
5688
--- /dev/null
5689
+++ b/libavcodec/e2k/mpegvideoencdsp.c
5690
@@ -0,0 +1,75 @@
5691
+/*
5692
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5693
+ *
5694
+ * This file is part of FFmpeg.
5695
+ *
5696
+ * FFmpeg is free software; you can redistribute it and/or
5697
+ * modify it under the terms of the GNU Lesser General Public
5698
+ * License as published by the Free Software Foundation; either
5699
+ * version 2.1 of the License, or (at your option) any later version.
5700
+ *
5701
+ * FFmpeg is distributed in the hope that it will be useful,
5702
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5703
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5704
+ * Lesser General Public License for more details.
5705
+ *
5706
+ * You should have received a copy of the GNU Lesser General Public
5707
+ * License along with FFmpeg; if not, write to the Free Software
5708
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5709
+ */
5710
+
5711
+#include "config.h"
5712
+
5713
+#include <stdint.h>
5714
+
5715
+#include "libavutil/attributes.h"
5716
+#include "libavutil/cpu.h"
5717
+#include "libavutil/e2k/cpu.h"
5718
+#include "libavutil/e2k/util_e2k.h"
5719
+
5720
+#include "libavcodec/mpegvideoencdsp.h"
5721
+
5722
+static int pix_norm1_e2k(uint8_t *pix, int line_size)
5723
+{
5724
+    int i;
5725
+    LOAD_ZERO;
5726
+    __m128i v0, v1, sum = zerov;
5727
+
5728
+    PRAGMA_E2K("ivdep")
5729
+    for (i = 0; i < 16; i++) {
5730
+        v1 = VEC_LD(pix);
5731
+        v0 = _mm_unpacklo_epi8(v1, zerov);
5732
+        v1 = _mm_unpackhi_epi8(v1, zerov);
5733
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0));
5734
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v1, v1));
5735
+        pix += line_size;
5736
+    }
5737
+    sum = _mm_hadd_epi32(sum, sum);
5738
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5739
+}
5740
+
5741
+static int pix_sum_e2k(uint8_t *pix, int line_size)
5742
+{
5743
+    int i;
5744
+    LOAD_ZERO;
5745
+    __m128i v0, sum = zerov;
5746
+
5747
+    PRAGMA_E2K("ivdep")
5748
+    for (i = 0; i < 16; i++) {
5749
+        v0 = VEC_LD(pix);
5750
+        sum = _mm_add_epi32(sum, _mm_sad_epu8(v0, zerov));
5751
+        pix += line_size;
5752
+    }
5753
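+    /* _mm_sad_epu8 against zero leaves one 16-bit partial sum per 64-bit half,
+     * so the two halves sit in 32-bit lanes 0 and 2. */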
+    return _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 2);
5754
+}
5755
+
5756
+av_cold void ff_mpegvideoencdsp_init_e2k(MpegvideoEncDSPContext *c,
5757
+                                         AVCodecContext *avctx)
5758
+{
5759
+    if (!E2K_BASE(av_get_cpu_flags()))
5760
+        return;
5761
+
5762
+    // !checkasm
5763
+    c->pix_norm1 = pix_norm1_e2k;
5764
+    c->pix_sum   = pix_sum_e2k;
5765
+}
5766
diff --git a/libavcodec/e2k/pixblockdsp.c b/libavcodec/e2k/pixblockdsp.c
5767
new file mode 100644
5768
index 0000000..f5a5060
5769
--- /dev/null
5770
+++ b/libavcodec/e2k/pixblockdsp.c
5771
@@ -0,0 +1,83 @@
5772
+/*
5773
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5774
+ * Copyright (c) 2002 Brian Foley
5775
+ * Copyright (c) 2002 Dieter Shirley
5776
+ * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5777
+ *
5778
+ * This file is part of FFmpeg.
5779
+ *
5780
+ * FFmpeg is free software; you can redistribute it and/or
5781
+ * modify it under the terms of the GNU Lesser General Public
5782
+ * License as published by the Free Software Foundation; either
5783
+ * version 2.1 of the License, or (at your option) any later version.
5784
+ *
5785
+ * FFmpeg is distributed in the hope that it will be useful,
5786
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5787
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5788
+ * Lesser General Public License for more details.
5789
+ *
5790
+ * You should have received a copy of the GNU Lesser General Public
5791
+ * License along with FFmpeg; if not, write to the Free Software
5792
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5793
+ */
5794
+
5795
+#include "config.h"
5796
+
5797
+#include "libavutil/attributes.h"
5798
+#include "libavutil/cpu.h"
5799
+#include "libavutil/e2k/cpu.h"
5800
+#include "libavutil/e2k/util_e2k.h"
5801
+
5802
+#include "libavcodec/avcodec.h"
5803
+#include "libavcodec/pixblockdsp.h"
5804
+
5805
+static void get_pixels_e2k(int16_t * restrict block, const uint8_t *pixels,
5806
+                           ptrdiff_t stride)
5807
+{
5808
+    LOAD_ZERO;
5809
+    __m128i v0;
5810
+    int i;
5811
+
5812
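+    /* Each iteration widens one row of 8 pixels from u8 to s16 and stores it
+     * as one row of the 8x8 coefficient block. */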
+    PRAGMA_E2K("ivdep")
5813
+    for (i = 0; i < 8; i++) {
5814
+        v0 = VEC_LD8(pixels);
5815
+        v0 = _mm_unpacklo_epi8(v0, zerov);
5816
+        VEC_ST(block + i * 8, v0);
5817
+        pixels += stride;
5818
+    }
5819
+}
5820
+
5821
+static void diff_pixels_e2k(int16_t * restrict block, const uint8_t *s1,
5822
+                            const uint8_t *s2, ptrdiff_t stride)
5823
+{
5824
+    LOAD_ZERO;
5825
+    __m128i v0, v1;
5826
+    int i;
5827
+
5828
+    PRAGMA_E2K("ivdep")
5829
+    for (i = 0; i < 8; i++) {
5830
+        v0 = VEC_LD8(s1);
5831
+        v1 = VEC_LD8(s2);
5832
+        v0 = _mm_unpacklo_epi8(v0, zerov);
5833
+        v1 = _mm_unpacklo_epi8(v1, zerov);
5834
+        v0 = _mm_sub_epi16(v0, v1);
5835
+        VEC_ST(block + i * 8, v0);
5836
+        s1 += stride;
5837
+        s2 += stride;
5838
+    }
5839
+}
5840
+
5841
+av_cold void ff_pixblockdsp_init_e2k(PixblockDSPContext *c,
5842
+                                     AVCodecContext *avctx,
5843
+                                     unsigned high_bit_depth)
5844
+{
5845
+    if (!E2K_BASE(av_get_cpu_flags()))
5846
+        return;
5847
+
5848
+    // checkasm
5849
+
5850
+    c->diff_pixels = diff_pixels_e2k;
5851
+
5852
+    if (!high_bit_depth)
5853
+        c->get_pixels = get_pixels_e2k;
5854
+}
5855
diff --git a/libavcodec/e2k/svq1enc.c b/libavcodec/e2k/svq1enc.c
5856
new file mode 100644
5857
index 0000000..263ac60
5858
--- /dev/null
5859
+++ b/libavcodec/e2k/svq1enc.c
5860
@@ -0,0 +1,68 @@
5861
+/*
5862
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5863
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
5864
+ *
5865
+ * This file is part of FFmpeg.
5866
+ *
5867
+ * FFmpeg is free software; you can redistribute it and/or
5868
+ * modify it under the terms of the GNU Lesser General Public
5869
+ * License as published by the Free Software Foundation; either
5870
+ * version 2.1 of the License, or (at your option) any later version.
5871
+ *
5872
+ * FFmpeg is distributed in the hope that it will be useful,
5873
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5874
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5875
+ * Lesser General Public License for more details.
5876
+ *
5877
+ * You should have received a copy of the GNU Lesser General Public
5878
+ * License along with FFmpeg; if not, write to the Free Software
5879
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5880
+ */
5881
+
5882
+#include "config.h"
5883
+
5884
+#include <stdint.h>
5885
+
5886
+#include "libavutil/attributes.h"
5887
+#include "libavutil/cpu.h"
5888
+#include "libavutil/e2k/cpu.h"
5889
+#include "libavutil/e2k/util_e2k.h"
5890
+
5891
+#include "libavcodec/svq1enc.h"
5892
+
5893
+static int ssd_int8_vs_int16_e2k(const int8_t *pix1, const int16_t *pix2,
5894
+                                 intptr_t size)
5895
+{
5896
+    int i, res;
5897
+    __m128i v0, v1, v2, v3, sum = _mm_setzero_si128();
5898
+
5899
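+    /* Duplicating each byte with unpacklo/unpackhi and shifting right
+     * arithmetically by 8 is a cheap int8 -> int16 sign extension. */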
+    for (i = 0; i + 15 < size; i += 16) {
5900
+        v1 = VEC_LD(pix1);
5901
+        v0 = _mm_srai_epi16(_mm_unpacklo_epi8(v1, v1), 8);
5902
+        v1 = _mm_srai_epi16(_mm_unpackhi_epi8(v1, v1), 8);
5903
+        v2 = VEC_LD(pix2);
5904
+        v3 = VEC_LD(pix2 + 8);
5905
+        v0 = _mm_sub_epi16(v0, v2);
5906
+        v1 = _mm_sub_epi16(v1, v3);
5907
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v0, v0));
5908
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(v1, v1));
5909
+        pix1 += 16;
5910
+        pix2 += 16;
5911
+    }
5912
+    sum = _mm_hadd_epi32(sum, sum);
5913
+    res = _mm_extract_epi32(sum, 0) + _mm_extract_epi32(sum, 1);
5914
+
5915
+    for (; i < size; i++)
5916
+        res += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
5917
+
5918
+    return res;
5919
+}
5920
+
5921
+av_cold void ff_svq1enc_init_e2k(SVQ1EncContext *c)
5922
+{
5923
+    if (!E2K_BASE(av_get_cpu_flags()))
5924
+        return;
5925
+
5926
+    // !checkasm
5927
+    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_e2k;
5928
+}
5929
diff --git a/libavcodec/e2k/vc1dsp.c b/libavcodec/e2k/vc1dsp.c
5930
new file mode 100644
5931
index 0000000..91307a9
5932
--- /dev/null
5933
+++ b/libavcodec/e2k/vc1dsp.c
5934
@@ -0,0 +1,303 @@
5935
+/*
5936
+ * VC-1 and WMV3 decoder - DSP functions
5937
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
5938
+ * Copyright (c) 2006 Konstantin Shishkov
5939
+ *
5940
+ * This file is part of FFmpeg.
5941
+ *
5942
+ * FFmpeg is free software; you can redistribute it and/or
5943
+ * modify it under the terms of the GNU Lesser General Public
5944
+ * License as published by the Free Software Foundation; either
5945
+ * version 2.1 of the License, or (at your option) any later version.
5946
+ *
5947
+ * FFmpeg is distributed in the hope that it will be useful,
5948
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5949
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5950
+ * Lesser General Public License for more details.
5951
+ *
5952
+ * You should have received a copy of the GNU Lesser General Public
5953
+ * License along with FFmpeg; if not, write to the Free Software
5954
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5955
+ */
5956
+
5957
+#include "config.h"
5958
+
5959
+#include "libavutil/attributes.h"
5960
+#include "libavutil/cpu.h"
5961
+#include "libavutil/e2k/cpu.h"
5962
+#include "libavutil/e2k/util_e2k.h"
5963
+
5964
+#include "libavcodec/vc1dsp.h"
5965
+
5966
+// main steps of 8x8 transform
5967
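+// The even half multiplies by 12, 16 and 6, the odd half by 16, 15, 9 and 4,
+// all built from shifts and adds instead of multiplies.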
+#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) do { \
5968
+    t0 = _mm_slli_epi32(_mm_add_epi32(s0, s4), 2); \
5969
+    t0 = _mm_add_epi32(_mm_slli_epi32(t0, 1), t0); \
5970
+    t0 = _mm_add_epi32(t0, vec_rnd); \
5971
+    t1 = _mm_slli_epi32(_mm_sub_epi32(s0, s4), 2); \
5972
+    t1 = _mm_add_epi32(_mm_slli_epi32(t1, 1), t1); \
5973
+    t1 = _mm_add_epi32(t1, vec_rnd); \
5974
+    t2 = _mm_add_epi32(_mm_slli_epi32(s6, 2), _mm_slli_epi32(s6, 1)); \
5975
+    t2 = _mm_add_epi32(t2, _mm_slli_epi32(s2, 4)); \
5976
+    t3 = _mm_add_epi32(_mm_slli_epi32(s2, 2), _mm_slli_epi32(s2, 1)); \
5977
+    t3 = _mm_sub_epi32(t3, _mm_slli_epi32(s6, 4)); \
5978
+    t4 = _mm_add_epi32(t0, t2); \
5979
+    t5 = _mm_add_epi32(t1, t3); \
5980
+    t6 = _mm_sub_epi32(t1, t3); \
5981
+    t7 = _mm_sub_epi32(t0, t2); \
5982
+\
5983
+    t0 = _mm_slli_epi32(_mm_add_epi32(s1, s3), 4); \
5984
+    t0 = _mm_add_epi32(t0, _mm_slli_epi32(s5, 3)); \
5985
+    t0 = _mm_add_epi32(t0, _mm_slli_epi32(s7, 2)); \
5986
+    t0 = _mm_add_epi32(t0, _mm_sub_epi32(s5, s3)); \
5987
+\
5988
+    t1 = _mm_slli_epi32(_mm_sub_epi32(s1, s5), 4); \
5989
+    t1 = _mm_sub_epi32(t1, _mm_slli_epi32(s7, 3)); \
5990
+    t1 = _mm_sub_epi32(t1, _mm_slli_epi32(s3, 2)); \
5991
+    t1 = _mm_sub_epi32(t1, _mm_add_epi32(s1, s7)); \
5992
+\
5993
+    t2 = _mm_slli_epi32(_mm_sub_epi32(s7, s3), 4); \
5994
+    t2 = _mm_add_epi32(t2, _mm_slli_epi32(s1, 3)); \
5995
+    t2 = _mm_add_epi32(t2, _mm_slli_epi32(s5, 2)); \
5996
+    t2 = _mm_add_epi32(t2, _mm_sub_epi32(s1, s7)); \
5997
+\
5998
+    t3 = _mm_slli_epi32(_mm_sub_epi32(s5, s7), 4); \
5999
+    t3 = _mm_sub_epi32(t3, _mm_slli_epi32(s3, 3)); \
6000
+    t3 = _mm_add_epi32(t3, _mm_slli_epi32(s1, 2)); \
6001
+    t3 = _mm_sub_epi32(t3, _mm_add_epi32(s3, s5)); \
6002
+\
6003
+    s0 = _mm_add_epi32(t4, t0); \
6004
+    s1 = _mm_add_epi32(t5, t1); \
6005
+    s2 = _mm_add_epi32(t6, t2); \
6006
+    s3 = _mm_add_epi32(t7, t3); \
6007
+    s4 = _mm_sub_epi32(t7, t3); \
6008
+    s5 = _mm_sub_epi32(t6, t2); \
6009
+    s6 = _mm_sub_epi32(t5, t1); \
6010
+    s7 = _mm_sub_epi32(t4, t0); \
6011
+}while(0)
6012
+
6013
+#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) do { \
6014
+    s0 = _mm_srai_epi32(s0, 3); \
6015
+    s1 = _mm_srai_epi32(s1, 3); \
6016
+    s2 = _mm_srai_epi32(s2, 3); \
6017
+    s3 = _mm_srai_epi32(s3, 3); \
6018
+    s4 = _mm_srai_epi32(s4, 3); \
6019
+    s5 = _mm_srai_epi32(s5, 3); \
6020
+    s6 = _mm_srai_epi32(s6, 3); \
6021
+    s7 = _mm_srai_epi32(s7, 3); \
6022
+} while(0)
6023
+
6024
+#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) do { \
6025
+    s0 = _mm_srai_epi32(s0, 7); \
6026
+    s1 = _mm_srai_epi32(s1, 7); \
6027
+    s2 = _mm_srai_epi32(s2, 7); \
6028
+    s3 = _mm_srai_epi32(s3, 7); \
6029
+    s4 = _mm_srai_epi32(_mm_add_epi32(s4, c1), 7); \
6030
+    s5 = _mm_srai_epi32(_mm_add_epi32(s5, c1), 7); \
6031
+    s6 = _mm_srai_epi32(_mm_add_epi32(s6, c1), 7); \
6032
+    s7 = _mm_srai_epi32(_mm_add_epi32(s7, c1), 7); \
6033
+} while(0)
6034
+
6035
+/* main steps of 4x4 transform */
6036
+#define STEP4(s0, s1, s2, s3, vec_rnd) do { \
6037
+    t1 = _mm_add_epi32(_mm_slli_epi32(s0, 4), s0); \
6038
+    t1 = _mm_add_epi32(t1, vec_rnd); \
6039
+    t2 = _mm_add_epi32(_mm_slli_epi32(s2, 4), s2); \
6040
+    t0 = _mm_add_epi32(t1, t2); \
6041
+    t1 = _mm_sub_epi32(t1, t2); \
6042
+    t3 = _mm_slli_epi32(_mm_sub_epi32(s3, s1), 1); \
6043
+    t3 = _mm_add_epi32(t3, _mm_slli_epi32(t3, 2)); \
6044
+    t2 = _mm_add_epi32(t3, _mm_slli_epi32(s1, 5)); \
6045
+    t3 = _mm_add_epi32(t3, _mm_slli_epi32(s3, 3)); \
6046
+    t3 = _mm_add_epi32(t3, _mm_slli_epi32(s3, 2)); \
6047
+    s0 = _mm_add_epi32(t0, t2); \
6048
+    s1 = _mm_sub_epi32(t1, t3); \
6049
+    s2 = _mm_add_epi32(t1, t3); \
6050
+    s3 = _mm_sub_epi32(t0, t2); \
6051
+} while (0)
6052
+
6053
+#define SHIFT_HOR4(s0, s1, s2, s3) \
6054
+    s0 = _mm_srai_epi32(s0, 3); \
6055
+    s1 = _mm_srai_epi32(s1, 3); \
6056
+    s2 = _mm_srai_epi32(s2, 3); \
6057
+    s3 = _mm_srai_epi32(s3, 3)
6058
+
6059
+#define SHIFT_VERT4(s0, s1, s2, s3) \
6060
+    s0 = _mm_srai_epi32(s0, 7); \
6061
+    s1 = _mm_srai_epi32(s1, 7); \
6062
+    s2 = _mm_srai_epi32(s2, 7); \
6063
+    s3 = _mm_srai_epi32(s3, 7)
6064
+
6065
+#define _mm_unpacklo1_epi16(v) _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16)
6066
+#define _mm_unpackhi1_epi16(v) _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16)
6067
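+/* unpack + arithmetic shift by 16 sign-extends the low/high four int16 lanes
+ * of a vector to int32 */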
+
6068
+#define TRANSPOSE4_32(s0, s1, s2, s3) \
6069
+    t0 = _mm_unpacklo_epi32(s0, s1); \
6070
+    t1 = _mm_unpackhi_epi32(s0, s1); \
6071
+    t2 = _mm_unpacklo_epi32(s2, s3); \
6072
+    t3 = _mm_unpackhi_epi32(s2, s3); \
6073
+    s0 = _mm_unpacklo_epi64(t0, t2); \
6074
+    s1 = _mm_unpackhi_epi64(t0, t2); \
6075
+    s2 = _mm_unpacklo_epi64(t1, t3); \
6076
+    s3 = _mm_unpackhi_epi64(t1, t3);
6077
+
6078
+/* Do inverse transform on 8x8 block */
6079
+static void vc1_inv_trans_8x8_e2k(int16_t block[64])
6080
+{
6081
+    vec_s16 src0, src1, src2, src3, src4, src5, src6, src7;
6082
+    vec_s32 s0, s1, s2, s3, s4, s5, s6, s7;
6083
+    vec_s32 s8, s9, sA, sB, sC, sD, sE, sF;
6084
+    vec_s32 t0, t1, t2, t3, t4, t5, t6, t7;
6085
+    const vec_s32 c64 = _mm_set1_epi32(64);
6086
+    const vec_s32 c4 = _mm_set1_epi32(4);
6087
+    const vec_s32 c1 = _mm_set1_epi32(1);
6088
+
6089
+    src0 = VEC_LD(block + 8 * 0);
6090
+    src1 = VEC_LD(block + 8 * 1);
6091
+    src2 = VEC_LD(block + 8 * 2);
6092
+    src3 = VEC_LD(block + 8 * 3);
6093
+    src4 = VEC_LD(block + 8 * 4);
6094
+    src5 = VEC_LD(block + 8 * 5);
6095
+    src6 = VEC_LD(block + 8 * 6);
6096
+    src7 = VEC_LD(block + 8 * 7);
6097
+
6098
+    s0 = _mm_unpacklo1_epi16(src0);
6099
+    s1 = _mm_unpacklo1_epi16(src1);
6100
+    s2 = _mm_unpacklo1_epi16(src2);
6101
+    s3 = _mm_unpacklo1_epi16(src3);
6102
+    s4 = _mm_unpacklo1_epi16(src4);
6103
+    s5 = _mm_unpacklo1_epi16(src5);
6104
+    s6 = _mm_unpacklo1_epi16(src6);
6105
+    s7 = _mm_unpacklo1_epi16(src7);
6106
+    s8 = _mm_unpackhi1_epi16(src0);
6107
+    s9 = _mm_unpackhi1_epi16(src1);
6108
+    sA = _mm_unpackhi1_epi16(src2);
6109
+    sB = _mm_unpackhi1_epi16(src3);
6110
+    sC = _mm_unpackhi1_epi16(src4);
6111
+    sD = _mm_unpackhi1_epi16(src5);
6112
+    sE = _mm_unpackhi1_epi16(src6);
6113
+    sF = _mm_unpackhi1_epi16(src7);
6114
+    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, c4);
6115
+    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
6116
+    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, c4);
6117
+    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
6118
+
6119
+    TRANSPOSE4_32(s0, s1, s2, s3)
6120
+    TRANSPOSE4_32(s4, s5, s6, s7)
6121
+    TRANSPOSE4_32(s8, s9, sA, sB)
6122
+    TRANSPOSE4_32(sC, sD, sE, sF)
6123
+
6124
+    STEP8(s0, s1, s2, s3, s8, s9, sA, sB, c64);
6125
+    SHIFT_VERT8(s0, s1, s2, s3, s8, s9, sA, sB);
6126
+    STEP8(s4, s5, s6, s7, sC, sD, sE, sF, c64);
6127
+    SHIFT_VERT8(s4, s5, s6, s7, sC, sD, sE, sF);
6128
+    src0 = _mm_packs_epi32(s0, s4);
6129
+    src1 = _mm_packs_epi32(s1, s5);
6130
+    src2 = _mm_packs_epi32(s2, s6);
6131
+    src3 = _mm_packs_epi32(s3, s7);
6132
+    src4 = _mm_packs_epi32(s8, sC);
6133
+    src5 = _mm_packs_epi32(s9, sD);
6134
+    src6 = _mm_packs_epi32(sA, sE);
6135
+    src7 = _mm_packs_epi32(sB, sF);
6136
+
6137
+    VEC_ST(block + 8 * 0, src0);
6138
+    VEC_ST(block + 8 * 1, src1);
6139
+    VEC_ST(block + 8 * 2, src2);
6140
+    VEC_ST(block + 8 * 3, src3);
6141
+    VEC_ST(block + 8 * 4, src4);
6142
+    VEC_ST(block + 8 * 5, src5);
6143
+    VEC_ST(block + 8 * 6, src6);
6144
+    VEC_ST(block + 8 * 7, src7);
6145
+}
6146
+
6147
+/* Do inverse transform on 8x4 part of block */
6148
+static void vc1_inv_trans_8x4_e2k(uint8_t *dest, ptrdiff_t stride,
6149
+                                  int16_t *block)
6150
+{
6151
+    LOAD_ZERO;
6152
+    vec_s16 src0, src1, src2, src3;
6153
+    vec_s32 s0, s1, s2, s3, s4, s5, s6, s7;
6154
+    vec_s32 t0, t1, t2, t3, t4, t5, t6, t7;
6155
+    const vec_s32 c64 = _mm_set1_epi32(64);
6156
+    const vec_s32 c4 = _mm_set1_epi32(4);
6157
+    __m128i tmp;
6158
+
6159
+    src0 = VEC_LD(block + 8 * 0);
6160
+    src1 = VEC_LD(block + 8 * 1);
6161
+    src2 = VEC_LD(block + 8 * 2);
6162
+    src3 = VEC_LD(block + 8 * 3);
6163
+
6164
+    t0 = _mm_unpacklo_epi16(src0, src1);
6165
+    t1 = _mm_unpackhi_epi16(src0, src1);
6166
+    t2 = _mm_unpacklo_epi16(src2, src3);
6167
+    t3 = _mm_unpackhi_epi16(src2, src3);
6168
+
6169
+    t4 = _mm_unpacklo_epi32(t0, t2);
6170
+    t5 = _mm_unpackhi_epi32(t0, t2);
6171
+    t6 = _mm_unpacklo_epi32(t1, t3);
6172
+    t7 = _mm_unpackhi_epi32(t1, t3);
6173
+
6174
+    s0 = _mm_unpacklo1_epi16(t4);
6175
+    s1 = _mm_unpackhi1_epi16(t4);
6176
+    s2 = _mm_unpacklo1_epi16(t5);
6177
+    s3 = _mm_unpackhi1_epi16(t5);
6178
+    s4 = _mm_unpacklo1_epi16(t6);
6179
+    s5 = _mm_unpackhi1_epi16(t6);
6180
+    s6 = _mm_unpacklo1_epi16(t7);
6181
+    s7 = _mm_unpackhi1_epi16(t7);
6182
+
6183
+    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, c4);
6184
+    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
6185
+
6186
+    TRANSPOSE4_32(s0, s1, s2, s3)
6187
+    TRANSPOSE4_32(s4, s5, s6, s7)
6188
+
6189
+    STEP4(s0, s1, s2, s3, c64);
6190
+    SHIFT_VERT4(s0, s1, s2, s3);
6191
+    STEP4(s4, s5, s6, s7, c64);
6192
+    SHIFT_VERT4(s4, s5, s6, s7);
6193
+    src0 = _mm_packs_epi32(s0, s4);
6194
+    src1 = _mm_packs_epi32(s1, s5);
6195
+    src2 = _mm_packs_epi32(s2, s6);
6196
+    src3 = _mm_packs_epi32(s3, s7);
6197
+
6198
+#define ADD(dest, src)                   \
6199
+    tmp = VEC_LD8(dest);                 \
6200
+    tmp = _mm_unpacklo_epi8(tmp, zerov); \
6201
+    tmp = _mm_adds_epi16(tmp, src);      \
6202
+    tmp = _mm_packus_epi16(tmp, tmp);    \
6203
+    VEC_STL(dest, tmp)
6204
+
6205
+    ADD(dest, src0); dest += stride;
6206
+    ADD(dest, src1); dest += stride;
6207
+    ADD(dest, src2); dest += stride;
6208
+    ADD(dest, src3);
6209
+}
6210
+
6211
+#define PUT_OP_U8_E2K(d, s, dst) d = s
6212
+#define AVG_OP_U8_E2K(d, s, dst) d = _mm_avg_epu8(dst, s)
6213
+
6214
+#define OP_U8_E2K                          PUT_OP_U8_E2K
6215
+#define PREFIX_no_rnd_vc1_chroma_mc8_e2k   put_no_rnd_vc1_chroma_mc8_e2k
6216
+#include "h264chroma_template.c"
6217
+#undef OP_U8_E2K
6218
+#undef PREFIX_no_rnd_vc1_chroma_mc8_e2k
6219
+
6220
+#define OP_U8_E2K                          AVG_OP_U8_E2K
6221
+#define PREFIX_no_rnd_vc1_chroma_mc8_e2k   avg_no_rnd_vc1_chroma_mc8_e2k
6222
+#include "h264chroma_template.c"
6223
+#undef OP_U8_E2K
6224
+#undef PREFIX_no_rnd_vc1_chroma_mc8_e2k
6225
+
6226
+av_cold void ff_vc1dsp_init_e2k(VC1DSPContext *dsp)
6227
+{
6228
+    if (!E2K_BASE(av_get_cpu_flags()))
6229
+        return;
6230
+
6231
+    // !checkasm
6232
+
6233
+    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_e2k; // fate mss2-wmv
6234
+    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_e2k; // fate wmv3-drm-dec
6235
+    dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_e2k;
6236
+    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_e2k;
6237
+}
6238
diff --git a/libavcodec/e2k/videodsp.c b/libavcodec/e2k/videodsp.c
6239
new file mode 100644
6240
index 0000000..d831d68
6241
--- /dev/null
6242
+++ b/libavcodec/e2k/videodsp.c
6243
@@ -0,0 +1,36 @@
6244
+/*
6245
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
6246
+ * Copyright (c) 2003-2004 Romain Dolbeau
6247
+ *
6248
+ * This file is part of FFmpeg.
6249
+ *
6250
+ * FFmpeg is free software; you can redistribute it and/or
6251
+ * modify it under the terms of the GNU Lesser General Public
6252
+ * License as published by the Free Software Foundation; either
6253
+ * version 2.1 of the License, or (at your option) any later version.
6254
+ *
6255
+ * FFmpeg is distributed in the hope that it will be useful,
6256
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6257
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6258
+ * Lesser General Public License for more details.
6259
+ *
6260
+ * You should have received a copy of the GNU Lesser General Public
6261
+ * License along with FFmpeg; if not, write to the Free Software
6262
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6263
+ */
6264
+
6265
+#include "libavutil/attributes.h"
6266
+#include "libavcodec/videodsp.h"
6267
+
6268
+static void prefetch_e2k(uint8_t *mem, ptrdiff_t stride, int h)
6269
+{
6270
+    do {
6271
+        __builtin_prefetch(mem);
6272
+        mem += stride;
6273
+    } while (--h);
6274
+}
6275
+
6276
+av_cold void ff_videodsp_init_e2k(VideoDSPContext *ctx, int bpc)
6277
+{
6278
+    ctx->prefetch = prefetch_e2k;
6279
+}
6280
diff --git a/libavcodec/e2k/vorbisdsp.c b/libavcodec/e2k/vorbisdsp.c
6281
new file mode 100644
6282
index 0000000..7a7619e
6283
--- /dev/null
6284
+++ b/libavcodec/e2k/vorbisdsp.c
6285
@@ -0,0 +1,62 @@
6286
+/*
6287
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
6288
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
6289
+ *
6290
+ * This file is part of FFmpeg.
6291
+ *
6292
+ * FFmpeg is free software; you can redistribute it and/or
6293
+ * modify it under the terms of the GNU Lesser General Public
6294
+ * License as published by the Free Software Foundation; either
6295
+ * version 2.1 of the License, or (at your option) any later version.
6296
+ *
6297
+ * FFmpeg is distributed in the hope that it will be useful,
6298
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6299
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6300
+ * Lesser General Public License for more details.
6301
+ *
6302
+ * You should have received a copy of the GNU Lesser General Public
6303
+ * License along with FFmpeg; if not, write to the Free Software
6304
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6305
+ */
6306
+
6307
+#include "config.h"
6308
+
6309
+#include "libavutil/attributes.h"
6310
+#include "libavutil/cpu.h"
6311
+#include "libavutil/e2k/cpu.h"
6312
+#include "libavutil/e2k/util_e2k.h"
6313
+
6314
+#include "libavcodec/vorbisdsp.h"
6315
+
6316
+static void vorbis_inverse_coupling_e2k(float *mag, float *ang,
6317
+                                        intptr_t blocksize)
6318
+{
6319
+    int i;
6320
+    vec_f m, a, t0, t1, zerov = _mm_setzero_ps();
6321
+    vec_f sign = _mm_castsi128_ps(_mm_set1_epi32(1 << 31));
6322
+
6323
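+    /* Branchless form of the scalar coupling rule: with a' = (m > 0) ? a : -a,
+     *   new ang = m - (a > 0 ? a' : 0);
+     *   new mag = m + (a > 0 ? 0 : a'); */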
+    PRAGMA_E2K("ivdep")
6324
+    for (i = 0; i < blocksize; i += 4) {
6325
+        m = _mm_load_ps(mag + i);
6326
+        a = _mm_load_ps(ang + i);
6327
+        t0 = _mm_cmple_ps(m, zerov);
6328
+        t1 = _mm_cmple_ps(a, zerov);
6329
+        a = _mm_xor_ps(a, _mm_and_ps(t0, sign));
6330
+        t0 = _mm_andnot_ps(t1, a);
6331
+        t1 = _mm_and_ps(t1, a);
6332
+        a = _mm_sub_ps(m, t0);
6333
+        m = _mm_add_ps(m, t1);
6334
+        _mm_store_ps(ang + i, a);
6335
+        _mm_store_ps(mag + i, m);
6336
+    }
6337
+}
6338
+
6339
+av_cold void ff_vorbisdsp_init_e2k(VorbisDSPContext *c)
6340
+{
6341
+    if (!E2K_BASE(av_get_cpu_flags()))
6342
+        return;
6343
+
6344
+    // !checkasm
6345
+    // fate vorbis-encode
6346
+    c->vorbis_inverse_coupling = vorbis_inverse_coupling_e2k;
6347
+}
6348
diff --git a/libavcodec/e2k/vp3dsp.c b/libavcodec/e2k/vp3dsp.c
6349
new file mode 100644
6350
index 0000000..f086096
6351
--- /dev/null
6352
+++ b/libavcodec/e2k/vp3dsp.c
6353
@@ -0,0 +1,169 @@
6354
+/*
6355
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
6356
+ * Copyright (C) 2009 David Conrad
6357
+ *
6358
+ * This file is part of FFmpeg.
6359
+ *
6360
+ * FFmpeg is free software; you can redistribute it and/or
6361
+ * modify it under the terms of the GNU Lesser General Public
6362
+ * License as published by the Free Software Foundation; either
6363
+ * version 2.1 of the License, or (at your option) any later version.
6364
+ *
6365
+ * FFmpeg is distributed in the hope that it will be useful,
6366
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6367
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6368
+ * Lesser General Public License for more details.
6369
+ *
6370
+ * You should have received a copy of the GNU Lesser General Public
6371
+ * License along with FFmpeg; if not, write to the Free Software
6372
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6373
+ */
6374
+
6375
+#include <string.h>
6376
+
6377
+#include "config.h"
6378
+
6379
+#include "libavutil/attributes.h"
6380
+#include "libavutil/cpu.h"
6381
+#include "libavutil/e2k/cpu.h"
6382
+#include "libavutil/e2k/util_e2k.h"
6383
+
6384
+#include "libavcodec/vp3dsp.h"
6385
+
6386
+#define IDCT_START(extra) \
6387
+    vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; \
6388
+    vec_s16 Ed, Gd, Add, Bdd, Fd, Hd; \
6389
+    vec_s16 addv = _mm_set1_epi16(extra + 8); \
6390
+    \
6391
+    vec_s16 C1 = _mm_set1_epi16(64277); \
6392
+    vec_s16 C2 = _mm_set1_epi16(60547); \
6393
+    vec_s16 C3 = _mm_set1_epi16(54491); \
6394
+    vec_s16 C4 = _mm_set1_epi16(46341); \
6395
+    vec_s16 C5 = _mm_set1_epi16(36410); \
6396
+    vec_s16 C6 = _mm_set1_epi16(25080); \
6397
+    vec_s16 C7 = _mm_set1_epi16(12785); \
6398
+    \
6399
+    vec_s16 b0 = VEC_LD(block + 8 * 0); \
6400
+    vec_s16 b1 = VEC_LD(block + 8 * 1); \
6401
+    vec_s16 b2 = VEC_LD(block + 8 * 2); \
6402
+    vec_s16 b3 = VEC_LD(block + 8 * 3); \
6403
+    vec_s16 b4 = VEC_LD(block + 8 * 4); \
6404
+    vec_s16 b5 = VEC_LD(block + 8 * 5); \
6405
+    vec_s16 b6 = VEC_LD(block + 8 * 6); \
6406
+    vec_s16 b7 = VEC_LD(block + 8 * 7);
6407
+
6408
+// these functions do (a*C)>>16
6409
+// things are tricky because a is signed, but C is unsigned.
6410
+// M15 is used if C fits in 15 bits unsigned (C6, C7)
6411
+// M16 is used if C requires 16 bits unsigned
6412
+#define M15(a, C) _mm_mulhi_epi16(a, C)
6413
+#define M16(a, C) _mm_add_epi16(a, M15(a, C))
6414
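+// e.g. C1 = 64277 does not fit in int16, so it is stored as 64277 - 65536 and
+// mulhi alone gives (a*C)>>16 - a; adding a back in M16 restores (a*C)>>16.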
+
6415
+#define IDCT_1D(ADD, SHIFT)\
6416
+    A = _mm_add_epi16(M16(b1, C1), M15(b7, C7)); \
6417
+    B = _mm_sub_epi16(M15(b1, C7), M16(b7, C1)); \
6418
+    C = _mm_add_epi16(M16(b3, C3), M16(b5, C5)); \
6419
+    D = _mm_sub_epi16(M16(b5, C3), M16(b3, C5)); \
6420
+    \
6421
+    Ad = M16(_mm_sub_epi16(A, C), C4); \
6422
+    Bd = M16(_mm_sub_epi16(B, D), C4); \
6423
+    \
6424
+    Cd = _mm_add_epi16(A, C); \
6425
+    Dd = _mm_add_epi16(B, D); \
6426
+    \
6427
+    E = ADD(M16(_mm_add_epi16(b0, b4), C4)); \
6428
+    F = ADD(M16(_mm_sub_epi16(b0, b4), C4)); \
6429
+    \
6430
+    G = _mm_add_epi16(M16(b2, C2), M15(b6, C6)); \
6431
+    H = _mm_sub_epi16(M15(b2, C6), M16(b6, C2)); \
6432
+    \
6433
+    Ed = _mm_sub_epi16(E, G); \
6434
+    Gd = _mm_add_epi16(E, G); \
6435
+    \
6436
+    Add = _mm_add_epi16(F, Ad); \
6437
+    Bdd = _mm_sub_epi16(Bd, H); \
6438
+    \
6439
+    Fd = _mm_sub_epi16(F, Ad); \
6440
+    Hd = _mm_add_epi16(Bd, H); \
6441
+    \
6442
+    b0 = SHIFT(_mm_add_epi16(Gd, Cd)); \
6443
+    b7 = SHIFT(_mm_sub_epi16(Gd, Cd)); \
6444
+    \
6445
+    b1 = SHIFT(_mm_add_epi16(Add, Hd)); \
6446
+    b2 = SHIFT(_mm_sub_epi16(Add, Hd)); \
6447
+    \
6448
+    b3 = SHIFT(_mm_add_epi16(Ed, Dd)); \
6449
+    b4 = SHIFT(_mm_sub_epi16(Ed, Dd)); \
6450
+    \
6451
+    b5 = SHIFT(_mm_add_epi16(Fd, Bdd)); \
6452
+    b6 = SHIFT(_mm_sub_epi16(Fd, Bdd));
6453
+
6454
+#define NOP(a) a
6455
+#define ADD8(a) _mm_add_epi16(a, addv)
6456
+#define SHIFT4(a) _mm_srai_epi16(a, 4)
6457
+
6458
+static void vp3_idct_put_e2k(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
6459
+{
6460
+    vec_u8 vdst;
6461
+    IDCT_START(2048)
6462
+
6463
+    IDCT_1D(NOP, NOP)
6464
+    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
6465
+    IDCT_1D(ADD8, SHIFT4)
6466
+
6467
+#define PUT(a) \
6468
+    vdst = _mm_packus_epi16(a, a); \
6469
+    VEC_STL(dst, vdst);
6470
+
6471
+    PUT(b0)     dst += stride;
6472
+    PUT(b1)     dst += stride;
6473
+    PUT(b2)     dst += stride;
6474
+    PUT(b3)     dst += stride;
6475
+    PUT(b4)     dst += stride;
6476
+    PUT(b5)     dst += stride;
6477
+    PUT(b6)     dst += stride;
6478
+    PUT(b7)
6479
+    memset(block, 0, sizeof(*block) * 64);
6480
+}
6481
+
6482
+static void vp3_idct_add_e2k(uint8_t *dst, ptrdiff_t stride, int16_t block[64])
6483
+{
6484
+    LOAD_ZERO;
6485
+    vec_u8 vdst;
6486
+    vec_s16 vdst_16;
6487
+
6488
+    IDCT_START(0)
6489
+
6490
+    IDCT_1D(NOP, NOP)
6491
+    TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
6492
+    IDCT_1D(ADD8, SHIFT4)
6493
+
6494
+#define ADD(a) \
6495
+    vdst = VEC_LD8(dst); \
6496
+    vdst_16 = _mm_unpacklo_epi8(vdst, zerov); \
6497
+    vdst_16 = _mm_adds_epi16(a, vdst_16); \
6498
+    vdst = _mm_packus_epi16(vdst_16, vdst_16); \
6499
+    VEC_STL(dst, vdst);
6500
+
6501
+    ADD(b0)     dst += stride;
6502
+    ADD(b1)     dst += stride;
6503
+    ADD(b2)     dst += stride;
6504
+    ADD(b3)     dst += stride;
6505
+    ADD(b4)     dst += stride;
6506
+    ADD(b5)     dst += stride;
6507
+    ADD(b6)     dst += stride;
6508
+    ADD(b7)
6509
+    memset(block, 0, sizeof(*block) * 64);
6510
+}
6511
+
6512
+av_cold void ff_vp3dsp_init_e2k(VP3DSPContext *c, int flags)
6513
+{
6514
+    if (!E2K_BASE(av_get_cpu_flags()))
6515
+        return;
6516
+
6517
+    // !checkasm
6518
+    // fate theora-coeff-level64
6519
+
6520
+    c->idct_put = vp3_idct_put_e2k;
6521
+    c->idct_add = vp3_idct_add_e2k;
6522
+}
6523
diff --git a/libavcodec/e2k/vp8dsp.c b/libavcodec/e2k/vp8dsp.c
6524
new file mode 100644
6525
index 0000000..61b46b3
6526
--- /dev/null
6527
+++ b/libavcodec/e2k/vp8dsp.c
6528
@@ -0,0 +1,428 @@
6529
+/*
6530
+ * VP8 compatible video decoder
6531
+ *
6532
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
6533
+ * Copyright (C) 2010 David Conrad
6534
+ *
6535
+ * This file is part of FFmpeg.
6536
+ *
6537
+ * FFmpeg is free software; you can redistribute it and/or
6538
+ * modify it under the terms of the GNU Lesser General Public
6539
+ * License as published by the Free Software Foundation; either
6540
+ * version 2.1 of the License, or (at your option) any later version.
6541
+ *
6542
+ * FFmpeg is distributed in the hope that it will be useful,
6543
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6544
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6545
+ * Lesser General Public License for more details.
6546
+ *
6547
+ * You should have received a copy of the GNU Lesser General Public
6548
+ * License along with FFmpeg; if not, write to the Free Software
6549
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6550
+ */
6551
+
6552
+#include "config.h"
6553
+
6554
+#include "libavutil/cpu.h"
6555
+#include "libavutil/mem.h"
6556
+#include "libavutil/e2k/cpu.h"
6557
+#include "libavutil/e2k/util_e2k.h"
6558
+
6559
+#include "libavcodec/vp8dsp.h"
6560
+
6561
+#include "hpeldsp.h"
6562
+
6563
+#define REPT4(a, b, c, d) { a, b, c, d,  a, b, c, d,  a, b, c, d,  a, b, c, d }
6564
+
6565
+// h subpel filter uses maddubs+hadds to multiply+add 4 pixel taps at once
6566
+static const uint8_t ALIGNED(16) h_subpel_filters_inner[7][16] =
6567
+{
6568
+    REPT4( -6, 123,  12,  -1),
6569
+    REPT4(-11, 108,  36,  -8),
6570
+    REPT4( -9,  93,  50,  -6),
6571
+    REPT4(-16,  77,  77, -16),
6572
+    REPT4( -6,  50,  93,  -9),
6573
+    REPT4( -8,  36, 108, -11),
6574
+    REPT4( -1,  12, 123,  -6)
6575
+};
6576
+
6577
+// for 6tap filters, these are the outer two taps
6578
+// The zeros mask off pixels 4-7 when filtering 0-3
6579
+// and vice-versa
6580
+static const uint8_t ALIGNED(16) h_subpel_filters_outer[3][16] =
6581
+{
6582
+    REPT4(2, 1, 2, 1),
6583
+    REPT4(3, 3, 3, 3),
6584
+    REPT4(1, 2, 1, 2)
6585
+};
6586
+
6587
+#define INNER_PERM(x) x, x+1, x+2, x+3, x+1, x+2, x+3, x+4
6588
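+// INNER_PERM(x) gathers the 4 source bytes for the output pixel at x and the
+// 4 bytes for the pixel at x+1; maddubs multiplies and adds byte pairs, and
+// hadds then folds the pairs so each lane holds the full 4-tap inner sum.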
+
6589
+#define INIT_H_SUBPEL_FILTER(j, n, is6tap) \
6590
+    vec_s8 filter_inner = *(__m128i*)h_subpel_filters_inner[j]; \
6591
+    is6tap( \
6592
+        vec_s8 filter_outer = *(__m128i*)h_subpel_filters_outer[(j) >> 1]; \
6593
+        vec_u8 perm_outer = _mm_setr_epi8(0,5, 1,6, 2,7, 3,8, 4,9, 5,10, 6,11, 7,12); \
6594
+    ) \
6595
+    vec_s32 c64 = _mm_set1_epi16(64); \
6596
+    vec_u8 perm_inner_l = _mm_setr_epi8(INNER_PERM(n), INNER_PERM(n + 2)); \
6597
+    vec_u8 perm_inner_h = _mm_setr_epi8(INNER_PERM(n + 4), INNER_PERM(n + 6)); \
6598
+    __m128i v0, v1; \
6599
+    int i
6600
+
6601
+#define FILTER_H(a, is6tap) \
6602
+    v0 = _mm_shuffle_epi8(a, perm_inner_l); \
6603
+    v1 = _mm_shuffle_epi8(a, perm_inner_h); \
6604
+    v0 = _mm_maddubs_epi16(v0, filter_inner); \
6605
+    v1 = _mm_maddubs_epi16(v1, filter_inner); \
6606
+    v0 = _mm_hadds_epi16(v0, v1); \
6607
+    is6tap( \
6608
+        a = _mm_shuffle_epi8(a, perm_outer); \
6609
+        v0 = _mm_adds_epi16(v0, _mm_maddubs_epi16(a, filter_outer)); \
6610
+    ) \
6611
+    v0 = _mm_adds_epi16(v0, c64); \
6612
+    a = _mm_srai_epi16(v0, 7)
6613
+
6614
+#define INIT_H_SUBPEL_FILTER4(j, n, is6tap) \
6615
+    __m64 filter_inner = *(__m64*)h_subpel_filters_inner[j]; \
6616
+    is6tap( \
6617
+        __m64 filter_outer = *(__m64*)h_subpel_filters_outer[(j) >> 1]; \
6618
+        __m64 perm_outer = _mm_setr_pi8(0,5, 1,6, 2,7, 3,8); \
6619
+        __m64 a1; \
6620
+    ) \
6621
+    __m64 c64 = _mm_set1_pi16(64); \
6622
+    __m64 perm_inner_l = _mm_setr_pi8(INNER_PERM(n)); \
6623
+    __m64 perm_inner_h = _mm_setr_pi8(INNER_PERM(n + 2)); \
6624
+    __m64 v0, v1, a0; \
6625
+    int i
6626
+
6627
+#define FILTER_H4(is6tap) \
6628
+    v0 = _mm_shuffle_pi8(a0, perm_inner_l); \
6629
+    v1 = _mm_shuffle_pi8(a0, perm_inner_h); \
6630
+    v0 = _mm_maddubs_pi16(v0, filter_inner); \
6631
+    v1 = _mm_maddubs_pi16(v1, filter_inner); \
6632
+    v0 = _mm_hadds_pi16(v0, v1); \
6633
+    is6tap( \
6634
+        a0 = _mm_shuffle2_pi8(a0, a1, perm_outer); \
6635
+        v0 = _mm_adds_pi16(v0, _mm_maddubs_pi16(a0, filter_outer)); \
6636
+    ) \
6637
+    v0 = _mm_adds_pi16(v0, c64); \
6638
+    a0 = _mm_srai_pi16(v0, 7); \
6639
+    a0 = _mm_packs_pu16(a0, a0); \
6640
+    *(uint32_t*)dst = _mm_cvtsi64_si32(a0)
6641
+
6642
+#define COPY(code) code
6643
+#define NOP(code)
6644
+#define IF6TAP(code) code
6645
+
6646
+static void put_vp8_epel16_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6647
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6648
+{
6649
+    INIT_H_SUBPEL_FILTER(mx - 1, 1, IF6TAP);
6650
+    __m128i a0, a1;
6651
+
6652
+    PRAGMA_E2K("ivdep")
6653
+    for (i = 0; i < h; i++) {
6654
+        a0 = VEC_LD(src - 2);
6655
+        a1 = VEC_LD(src - 2 + 8);
6656
+        FILTER_H(a0, IF6TAP);
6657
+        FILTER_H(a1, IF6TAP);
6658
+        a0 = _mm_packus_epi16(a0, a1);
6659
+        VEC_ST(dst, a0);
6660
+        src += src_stride;
6661
+        dst += dst_stride;
6662
+    }
6663
+}
6664
+
6665
+static void put_vp8_epel8_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6666
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6667
+{
6668
+    INIT_H_SUBPEL_FILTER(mx - 1, 1, IF6TAP);
6669
+    __m128i a0;
6670
+
6671
+    PRAGMA_E2K("ivdep")
6672
+    for (i = 0; i < h; i++) {
6673
+        a0 = VEC_LD(src - 2);
6674
+        FILTER_H(a0, IF6TAP);
6675
+        a0 = _mm_packus_epi16(a0, a0);
6676
+        VEC_STL(dst, a0);
6677
+        src += src_stride;
6678
+        dst += dst_stride;
6679
+    }
6680
+}
6681
+
6682
+static void put_vp8_epel8_h4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6683
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6684
+{
6685
+    INIT_H_SUBPEL_FILTER(mx - 1, 0, NOP);
6686
+    __m128i a0;
6687
+
6688
+    PRAGMA_E2K("ivdep")
6689
+    for (i = 0; i < h; i++) {
6690
+        a0 = VEC_LD(src - 1);
6691
+        FILTER_H(a0, NOP);
6692
+        a0 = _mm_packus_epi16(a0, a0);
6693
+        VEC_STL(dst, a0);
6694
+        src += src_stride;
6695
+        dst += dst_stride;
6696
+    }
6697
+}
6698
+
6699
+static void put_vp8_epel4_h6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6700
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6701
+{
6702
+    INIT_H_SUBPEL_FILTER4(mx - 1, 1, IF6TAP);
6703
+
6704
+    PRAGMA_E2K("ivdep")
6705
+    for (i = 0; i < h; i++) {
6706
+        a0 = *(__m64*)(src - 2);
6707
+        a1 = _mm_cvtsi32_si64(src[8 - 2]);
6708
+        FILTER_H4(IF6TAP);
6709
+        src += src_stride;
6710
+        dst += dst_stride;
6711
+    }
6712
+}
6713
+
6714
+static void put_vp8_epel4_h4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6715
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6716
+{
6717
+    INIT_H_SUBPEL_FILTER4(mx - 1, 0, NOP);
6718
+
6719
+    PRAGMA_E2K("ivdep")
6720
+    for (i = 0; i < h; i++) {
6721
+        a0 = *(__m64*)(src - 1);
6722
+        FILTER_H4(NOP);
6723
+        src += src_stride;
6724
+        dst += dst_stride;
6725
+    }
6726
+}
6727
+
6728
+#define PAIR_8X2(a, b) (((a) & 255) | ((b) * 256))
6729
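+// Packs two signed 8-bit taps into one int16 so _mm_set1_epi16/_mm_set1_pi16
+// replicate the (even, odd) tap pair expected by maddubs.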
+static const int16_t v_subpel_filters[7][3] =
6730
+{
6731
+    { PAIR_8X2( -6, 123), PAIR_8X2( 12,  -1), PAIR_8X2(0, 0) },
6732
+    { PAIR_8X2(-11, 108), PAIR_8X2( 36,  -8), PAIR_8X2(2, 1) },
6733
+    { PAIR_8X2( -9,  93), PAIR_8X2( 50,  -6), PAIR_8X2(0, 0) },
6734
+    { PAIR_8X2(-16,  77), PAIR_8X2( 77, -16), PAIR_8X2(3, 3) },
6735
+    { PAIR_8X2( -6,  50), PAIR_8X2( 93,  -9), PAIR_8X2(0, 0) },
6736
+    { PAIR_8X2( -8,  36), PAIR_8X2(108, -11), PAIR_8X2(1, 2) },
6737
+    { PAIR_8X2( -1,  12), PAIR_8X2(123,  -6), PAIR_8X2(0, 0) }
6738
+};
6739
+
6740
+#define INIT_V_SUBPEL_FILTER(p, type, j, is6tap) \
6741
+    type v0, v1, r0; \
6742
+    type c64 = _mm_set1_##p(64); \
6743
+    type f0 = _mm_set1_##p(v_subpel_filters[j][0]); \
6744
+    type f1 = _mm_set1_##p(v_subpel_filters[j][1]); \
6745
+    is6tap(type f2 = _mm_set1_##p(v_subpel_filters[j][2]);) \
6746
+    int i
6747
+
6748
+#define FILTER_V(p, dstv, lo, CVT, is6tap) \
6749
+    v0 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s1), CVT(s2)), f0); \
6750
+    v1 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s3), CVT(s4)), f1); \
6751
+    v0 = _mm_adds_##p(v0, v1); \
6752
+    is6tap( \
6753
+        v1 = _mm_maddubs_##p(_mm_unpack##lo(CVT(s0), CVT(s5)), f2); \
6754
+        v0 = _mm_adds_##p(v0, v1); \
6755
+    ) \
6756
+    v0 = _mm_adds_##p(v0, c64); \
6757
+    dstv = _mm_srai_##p(v0, 7)
6758
+
6759
+static void put_vp8_epel16_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6760
+                                  ptrdiff_t src_stride, int h, int mx, int my)
6761
+{
6762
+    INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, IF6TAP);
6763
+    __m128i s0, s1, s2, s3, s4, s5;
6764
+
6765
+    s0 = VEC_LD(src - 2 * src_stride);
6766
+    s1 = VEC_LD(src - 1 * src_stride);
6767
+    s2 = VEC_LD(src);
6768
+    s3 = VEC_LD(src + 1 * src_stride);
6769
+    s4 = VEC_LD(src + 2 * src_stride);
6770
+    src += src_stride * 3;
6771
+
6772
+    PRAGMA_E2K("ivdep")
6773
+    for (i = 0; i < h; i++) {
6774
+        s5 = VEC_LD(src);
6775
+        FILTER_V(epi16, r0, lo_epi8, COPY, IF6TAP);
6776
+        FILTER_V(epi16, v0, hi_epi8, COPY, IF6TAP);
6777
+        r0 = _mm_packus_epi16(r0, v0);
6778
+        VEC_ST(dst, r0);
6779
+        s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5;
6780
+        dst += dst_stride;
6781
+        src += src_stride;
6782
+    }
6783
+}
6784
+
6785
+static void put_vp8_epel8_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6786
+                                 ptrdiff_t src_stride, int h, int mx, int my)
6787
+{
6788
+    INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, IF6TAP);
6789
+    __m64 s0, s1, s2, s3, s4, s5;
6790
+
6791
+    s0 = *(__m64*)(src - 2 * src_stride);
6792
+    s1 = *(__m64*)(src - 1 * src_stride);
6793
+    s2 = *(__m64*)src;
6794
+    s3 = *(__m64*)(src + 1 * src_stride);
6795
+    s4 = *(__m64*)(src + 2 * src_stride);
6796
+    src += src_stride * 3;
6797
+
6798
+    PRAGMA_E2K("ivdep")
6799
+    for (i = 0; i < h; i++) {
6800
+        s5 = *(__m64*)src;
6801
+        FILTER_V(epi16, r0, lo_epi8, _mm_movpi64_epi64, IF6TAP);
6802
+        r0 = _mm_packus_epi16(r0, r0);
6803
+        VEC_STL(dst, r0);
6804
+        s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5;
6805
+        dst += dst_stride;
6806
+        src += src_stride;
6807
+    }
6808
+}
6809
+
6810
+static void put_vp8_epel4_v6_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6811
+                                 ptrdiff_t src_stride, int h, int mx, int my)
6812
+{
6813
+    INIT_V_SUBPEL_FILTER(pi16, __m64, my - 1, IF6TAP);
6814
+    __m64 s0, s1, s2, s3, s4, s5;
6815
+
6816
+    s0 = _mm_cvtsi32_si64(*(uint32_t*)(src - 2 * src_stride));
6817
+    s1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride));
6818
+    s2 = _mm_cvtsi32_si64(*(uint32_t*)src);
6819
+    s3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride));
6820
+    s4 = _mm_cvtsi32_si64(*(uint32_t*)(src + 2 * src_stride));
6821
+    src += src_stride * 3;
6822
+
6823
+    PRAGMA_E2K("ivdep")
6824
+    for (i = 0; i < h; i++) {
6825
+        s5 = _mm_cvtsi32_si64(*(uint32_t*)src);
6826
+        FILTER_V(pi16, r0, lo_pi8, COPY, IF6TAP);
6827
+        r0 = _mm_packs_pu16(r0, r0);
6828
+        *(uint32_t*)dst = _mm_cvtsi64_si32(r0);
6829
+        s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5;
6830
+        dst += dst_stride;
6831
+        src += src_stride;
6832
+    }
6833
+}
6834
+
6835
+static void put_vp8_epel8_v4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6836
+                                 ptrdiff_t src_stride, int h, int mx, int my)
6837
+{
6838
+    INIT_V_SUBPEL_FILTER(epi16, __m128i, my - 1, NOP);
6839
+    __m64 s1, s2, s3, s4;
6840
+
6841
+    s1 = *(__m64*)(src - 1 * src_stride);
6842
+    s2 = *(__m64*)src;
6843
+    s3 = *(__m64*)(src + 1 * src_stride);
6844
+    src += src_stride * 2;
6845
+
6846
+    PRAGMA_E2K("ivdep")
6847
+    for (i = 0; i < h; i++) {
6848
+        s4 = *(__m64*)src;
6849
+        FILTER_V(epi16, r0, lo_epi8, _mm_movpi64_epi64, NOP);
6850
+        r0 = _mm_packus_epi16(r0, r0);
6851
+        VEC_STL(dst, r0);
6852
+        s1 = s2; s2 = s3; s3 = s4;
6853
+        dst += dst_stride;
6854
+        src += src_stride;
6855
+    }
6856
+}
6857
+
6858
+static void put_vp8_epel4_v4_e2k(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src,
6859
+                                 ptrdiff_t src_stride, int h, int mx, int my)
6860
+{
6861
+    INIT_V_SUBPEL_FILTER(pi16, __m64, my - 1, NOP);
6862
+    __m64 s1, s2, s3, s4;
6863
+
6864
+    s1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride));
6865
+    s2 = _mm_cvtsi32_si64(*(uint32_t*)src);
6866
+    s3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride));
6867
+    src += src_stride * 2;
6868
+
6869
+    PRAGMA_E2K("ivdep")
6870
+    for (i = 0; i < h; i++) {
6871
+        s4 = _mm_cvtsi32_si64(*(uint32_t*)src);
6872
+        FILTER_V(pi16, r0, lo_pi8, COPY, NOP);
6873
+        r0 = _mm_packs_pu16(r0, r0);
6874
+        *(uint32_t*)dst = _mm_cvtsi64_si32(r0);
6875
+        s1 = s2; s2 = s3; s3 = s4;
6876
+        dst += dst_stride;
6877
+        src += src_stride;
6878
+    }
6879
+}
6880
+
6881
+#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
6882
+static void put_vp8_epel##WIDTH##_h##HTAPS##v##VTAPS##_e2k(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
6883
+{ \
6884
+    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
6885
+    if (VTAPS == 6) { \
6886
+        put_vp8_epel##WIDTH##_h##HTAPS##_e2k(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
6887
+        put_vp8_epel##WIDTH##_v##VTAPS##_e2k(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
6888
+    } else { \
6889
+        put_vp8_epel##WIDTH##_h##HTAPS##_e2k(tmp, 16,      src-sstride, sstride, h+4, mx, my); \
6890
+        put_vp8_epel##WIDTH##_v##VTAPS##_e2k(dst, dstride, tmp+16,      16,      h,   mx, my); \
6891
+    } \
6892
+}
6893
+
6894
+EPEL_HV(16, 6,6)
6895
+EPEL_HV(8,  6,6)
6896
+EPEL_HV(8,  4,6)
6897
+EPEL_HV(8,  6,4)
6898
+EPEL_HV(8,  4,4)
6899
+EPEL_HV(4,  6,6)
6900
+EPEL_HV(4,  4,6)
6901
+EPEL_HV(4,  6,4)
6902
+EPEL_HV(4,  4,4)
6903
+
6904
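EPEL_HV builds the two-dimensional cases out of the one-dimensional passes defined above: the horizontal filter writes into a 16-byte-stride temporary buffer with the extra rows the vertical tap span needs (h+5 rows starting two rows up for the 6-tap case, h+4 rows starting one row up otherwise), and the vertical filter then consumes that buffer. A scalar sketch of the same separable h6v6 structure, reusing the conventions from the sketch earlier (names are illustrative, w and h at most 16):

    #include <stdint.h>
    #include "libavutil/common.h"

    /* Separable 6-tap/6-tap subpel filter: horizontal pass into tmp (stride 16),
     * then vertical pass; hf[] and vf[] are the horizontal/vertical tap sets. */
    static void epel_h6v6_ref(uint8_t *dst, ptrdiff_t dstride,
                              const uint8_t *src, ptrdiff_t sstride,
                              int w, int h, const int hf[6], const int vf[6])
    {
        uint8_t tmp[(2 * 16 + 5) * 16];       /* sized like the WIDTH=16 case above */
        int x, y, k, sum;

        for (y = 0; y < h + 5; y++)           /* source rows -2 .. h+2 */
            for (x = 0; x < w; x++) {
                sum = 64;
                for (k = 0; k < 6; k++)
                    sum += hf[k] * src[(y - 2) * sstride + x + k - 2];
                tmp[y * 16 + x] = av_clip_uint8(sum >> 7);
            }
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++) {
                sum = 64;
                for (k = 0; k < 6; k++)
                    sum += vf[k] * tmp[(y + k) * 16 + x];
                dst[y * dstride + x] = av_clip_uint8(sum >> 7);
            }
    }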
+static void put_vp8_pixels16_e2k(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
6905
+                                 ptrdiff_t sstride, int h, int mx, int my)
6906
+{
6907
+    __m128i v0, v1, v2, v3;
6908
+    int i;
6909
+
6910
+    PRAGMA_E2K("ivdep")
6911
+    for (i = 0; i < h; i += 4) {
6912
+        v0 = VEC_LD(src);
6913
+        v1 = VEC_LD(src + sstride);
6914
+        v2 = VEC_LD(src + sstride * 2);
6915
+        v3 = VEC_LD(src + sstride * 3);
6916
+        VEC_ST(dst, v0);
6917
+        VEC_ST(dst + dstride, v1);
6918
+        VEC_ST(dst + dstride * 2, v2);
6919
+        VEC_ST(dst + dstride * 3, v3);
6920
+        src += sstride * 4;
6921
+        dst += dstride * 4;
6922
+    }
6923
+}
6924
+
6925
+
6926
+av_cold void ff_vp78dsp_init_e2k(VP8DSPContext *c)
6927
+{
6928
+    if (!E2K_BASE(av_get_cpu_flags()))
6929
+        return;
6930
+
6931
+    // checkasm
6932
+    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_e2k;
6933
+    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_e2k;
6934
+    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_e2k;
6935
+    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_e2k;
6936
+
6937
+    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_e2k;
6938
+    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_e2k;
6939
+    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_e2k;
6940
+    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_e2k;
6941
+
6942
+    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_e2k;
6943
+    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_e2k;
6944
+    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_e2k;
6945
+    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_e2k;
6946
+
6947
+    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_e2k;
6948
+    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_e2k;
6949
+    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_e2k;
6950
+    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_e2k;
6951
+
6952
+    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_e2k;
6953
+    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_e2k;
6954
+    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_e2k;
6955
+    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_e2k;
6956
+}
6957
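The assignments follow FFmpeg's put_vp8_epel_pixels_tab layout as used here: the first index selects the block width (0 = 16, 1 = 8, 2 = 4), the second the vertical filter (0 = none, 1 = 4-tap, 2 = 6-tap) and the third the horizontal filter. A hypothetical lookup for an 8-wide block that needs a 4-tap horizontal and a 6-tap vertical filter (the header path is the standard one, the wrapper itself is illustrative):

    #include "libavcodec/vp8dsp.h"

    /* Picks put_vp8_epel8_h4v6_e2k from the table filled above. */
    static void mc_block8_h4v6(VP8DSPContext *c, uint8_t *dst, ptrdiff_t dstride,
                               uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
    {
        c->put_vp8_epel_pixels_tab[1][2][1](dst, dstride, src, sstride, h, mx, my);
    }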
diff --git a/libavcodec/e2k/vp9dsp.c b/libavcodec/e2k/vp9dsp.c
6958
new file mode 100644
6959
index 0000000..5b80070
6960
--- /dev/null
6961
+++ b/libavcodec/e2k/vp9dsp.c
6962
@@ -0,0 +1,1740 @@
6963
+/*
6964
+ * VP9 compatible video decoder
6965
+ *
6966
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
6967
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
6968
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
6969
+ *
6970
+ * This file is part of FFmpeg.
6971
+ *
6972
+ * FFmpeg is free software; you can redistribute it and/or
6973
+ * modify it under the terms of the GNU Lesser General Public
6974
+ * License as published by the Free Software Foundation; either
6975
+ * version 2.1 of the License, or (at your option) any later version.
6976
+ *
6977
+ * FFmpeg is distributed in the hope that it will be useful,
6978
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6979
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6980
+ * Lesser General Public License for more details.
6981
+ *
6982
+ * You should have received a copy of the GNU Lesser General Public
6983
+ * License along with FFmpeg; if not, write to the Free Software
6984
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6985
+ */
6986
+
6987
+#define BIT_DEPTH 8
6988
+
6989
+#include "config.h"
6990
+#include "libavutil/cpu.h"
6991
+#include "libavutil/e2k/cpu.h"
6992
+#include "libavutil/e2k/util_e2k.h"
6993
+
6994
+#include "libavutil/common.h"
6995
+#include "libavutil/intreadwrite.h"
6996
+#include "libavcodec/vp9dsp.h"
6997
+
6998
+#define pixel   uint8_t
6999
+
7000
+#define itxfm_wrapper4(type_a, type_b, sz, bits, has_dconly) \
7001
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \
7002
+                                                      ptrdiff_t stride, \
7003
+                                                      int16_t *block, int eob) \
7004
+{ \
7005
+    int j; \
7006
+    int16_t tmp[sz * sz], out[sz * sz]; \
7007
+    __m64 h0, h1, h2, round, zerov = _mm_setzero_si64(); \
7008
+\
7009
+    if (has_dconly && eob == 1) { \
7010
+        int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \
7011
+                                 * 11585 + (1 << 13)) >> 14; \
7012
+        block[0] = 0; \
7013
+        t = bits ? (t + (1 << (bits - 1))) >> bits : t; \
7014
+        h1 = _mm_set1_pi16(t); \
7015
+        h2 = _mm_set1_pi16(-t); \
7016
+        h1 = _mm_packs_pu16(h1, h1); \
7017
+        h2 = _mm_packs_pu16(h2, h2); \
7018
+        \
7019
+        PRAGMA_E2K("ivdep") \
7020
+        for (j = 0; j < sz; j++, dst += stride) { \
7021
+            h0 = _mm_cvtsi32_si64(*(uint32_t*)dst); \
7022
+            h0 = _mm_subs_pu8(_mm_adds_pu8(h0, h1), h2); \
7023
+            *(uint32_t*)dst = _mm_cvtsi64_si32(h0); \
7024
+        } \
7025
+        return; \
7026
+    } \
7027
+    \
7028
+    type_a##sz##_1d(block, tmp, 0); \
7029
+    memset(block, 0, sz * sz * sizeof(*block)); \
7030
+    type_b##sz##_1d(tmp, out, 1); \
7031
+    round = _mm_set1_pi16((1 << bits) >> 1); \
7032
+    PRAGMA_E2K("ivdep") \
7033
+    for (j = 0; j < sz; j++, dst += stride) { \
7034
+         h1 = *(__m64*)(out + j * sz); \
7035
+         h1 = _mm_srai_pi16(_mm_add_pi16(h1, round), bits); \
7036
+         h0 = _mm_cvtsi32_si64(*(uint32_t*)dst); \
7037
+         h0 = _mm_unpacklo_pi8(h0, zerov); \
7038
+         h0 = _mm_add_pi16(h0, h1); \
7039
+         h0 = _mm_packs_pu16(h0, h0); \
7040
+         *(uint32_t*)dst = _mm_cvtsi64_si32(h0); \
7041
+    } \
7042
+}
7043
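Each wrapper starts with the DC-only shortcut: when eob == 1 only block[0] is set, every output pixel receives the same correction t (computed as in the scalar wrapper), and the two 1-D passes are skipped. Since t may be negative while the pixels are unsigned, it is applied as a saturating add of max(t, 0) followed by a saturating subtract of max(-t, 0); that pair is equivalent to a clipped signed add, as this scalar restatement (illustrative helper) shows:

    #include <stdint.h>

    /* Clipped add of a signed constant using two unsigned saturating steps,
     * the identity behind the _mm_adds_pu8 / _mm_subs_pu8 pair above. */
    static uint8_t add_dc_clip(uint8_t px, int t)
    {
        unsigned up = t > 0 ? (unsigned)  t : 0;   /* corresponds to h1 */
        unsigned dn = t < 0 ? (unsigned) -t : 0;   /* corresponds to h2 */
        unsigned v  = px + up;
        if (v > 255) v = 255;                      /* saturating add      */
        v = v > dn ? v - dn : 0;                   /* saturating subtract */
        return v;                                  /* equals a clipped px + t */
    }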
+
7044
+#define itxfm_wrapper8(type_a, type_b, sz, bits, has_dconly) \
7045
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \
7046
+                                                      ptrdiff_t stride, \
7047
+                                                      int16_t *block, int eob) \
7048
+{ \
7049
+    int j; \
7050
+    int16_t tmp[sz * sz], out[sz * sz]; \
7051
+    __m128i v0, v1, round; \
7052
+    LOAD_ZERO; \
7053
+\
7054
+    if (has_dconly && eob == 1) { \
7055
+        __m64 h0, h1, h2; \
7056
+        int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \
7057
+                                 * 11585 + (1 << 13)) >> 14; \
7058
+        block[0] = 0; \
7059
+        t = bits ? (t + (1 << (bits - 1))) >> bits : t; \
7060
+        h1 = _mm_set1_pi16(t); \
7061
+        h2 = _mm_set1_pi16(-t); \
7062
+        h1 = _mm_packs_pu16(h1, h1); \
7063
+        h2 = _mm_packs_pu16(h2, h2); \
7064
+        \
7065
+        PRAGMA_E2K("ivdep") \
7066
+        for (j = 0; j < sz; j++, dst += stride) { \
7067
+            h0 = *(__m64*)dst; \
7068
+            h0 = _mm_subs_pu8(_mm_adds_pu8(h0, h1), h2); \
7069
+            *(__m64*)dst = h0; \
7070
+        } \
7071
+        return; \
7072
+    } \
7073
+    \
7074
+    type_a##sz##_1d(block, tmp, 0); \
7075
+    memset(block, 0, sz * sz * sizeof(*block)); \
7076
+    type_b##sz##_1d(tmp, out, 1); \
7077
+    round = _mm_set1_epi16((1 << bits) >> 1); \
7078
+    PRAGMA_E2K("ivdep") \
7079
+    for (j = 0; j < sz; j++, dst += stride) { \
7080
+         v1 = VEC_LD(out + j * sz); \
7081
+         v1 = _mm_srai_epi16(_mm_add_epi16(v1, round), bits); \
7082
+         v0 = VEC_LD8(dst); \
7083
+         v0 = _mm_unpacklo_epi8(v0, zerov); \
7084
+         v0 = _mm_add_epi16(v0, v1); \
7085
+         v0 = _mm_packus_epi16(v0, v0); \
7086
+         VEC_STL(dst, v0); \
7087
+    } \
7088
+}
7089
+
7090
+#define itxfm_wrapper16(type_a, type_b, sz, bits, has_dconly) \
7091
+static void type_a##_##type_b##_##sz##x##sz##_add_e2k(uint8_t *dst, \
7092
+                                                      ptrdiff_t stride, \
7093
+                                                      int16_t *block, int eob) \
7094
+{ \
7095
+    int i, j; \
7096
+    int16_t tmp[sz * sz], out[sz * sz]; \
7097
+    __m128i v0, v1, v2, v3, round; \
7098
+    LOAD_ZERO; \
7099
+\
7100
+    if (has_dconly && eob == 1) { \
7101
+        int t = ((((int)block[0] * 11585 + (1 << 13)) >> 14) \
7102
+                                 * 11585 + (1 << 13)) >> 14; \
7103
+        block[0] = 0; \
7104
+        t = bits ? (t + (1 << (bits - 1))) >> bits : t; \
7105
+        v1 = _mm_set1_epi16(t); \
7106
+        v2 = _mm_set1_epi16(-t); \
7107
+        v1 = _mm_packus_epi16(v1, v1); \
7108
+        v2 = _mm_packus_epi16(v2, v2); \
7109
+        \
7110
+        for (j = 0; j < sz; j++, dst += stride) \
7111
+        PRAGMA_E2K("ivdep") \
7112
+        for (i = 0; i < sz; i += 16) { \
7113
+            v0 = VEC_LD(dst + i); \
7114
+            v0 = _mm_subs_epu8(_mm_adds_epu8(v0, v1), v2); \
7115
+            VEC_ST(dst + i, v0); \
7116
+        } \
7117
+        return; \
7118
+    } \
7119
+    \
7120
+    type_a##sz##_1d(block, tmp, 0); \
7121
+    memset(block, 0, sz * sz * sizeof(*block)); \
7122
+    type_b##sz##_1d(tmp, out, 1); \
7123
+    round = _mm_set1_epi16((1 << bits) >> 1); \
7124
+    for (j = 0; j < sz; j++, dst += stride) \
7125
+    PRAGMA_E2K("ivdep") \
7126
+    for (i = 0; i < sz; i += 16) { \
7127
+         v2 = VEC_LD(out + j * sz + i); \
7128
+         v3 = VEC_LD(out + j * sz + i + 8); \
7129
+         v2 = _mm_srai_epi16(_mm_add_epi16(v2, round), bits); \
7130
+         v3 = _mm_srai_epi16(_mm_add_epi16(v3, round), bits); \
7131
+         v1 = VEC_LD(dst + i); \
7132
+         v0 = _mm_unpacklo_epi8(v1, zerov); \
7133
+         v1 = _mm_unpackhi_epi8(v1, zerov); \
7134
+         v0 = _mm_add_epi16(v0, v2); \
7135
+         v1 = _mm_add_epi16(v1, v3); \
7136
+         v0 = _mm_packus_epi16(v0, v1); \
7137
+         VEC_ST(dst + i, v0); \
7138
+    } \
7139
+}
7140
+
7141
+#define IN(x) VEC_LD8(in + (x) * sz)
7142
+
7143
+#define X1(x, a, b) \
7144
+    __m128i x = _mm_set1_epi32((a & 0xffff) | b << 16);
7145
+
7146
+#define X2(x, y, i0, i1) \
7147
+    v1 = _mm_unpacklo_epi16(IN(i0), IN(i1)); \
7148
+    v0 = _mm_madd_epi16(v1, f##x); \
7149
+    v1 = _mm_madd_epi16(v1, f##y); \
7150
+    t##x##a = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \
7151
+    t##y##a = _mm_srai_epi32(_mm_add_epi32(v1, round), 14);
7152
+
7153
+#define X3(x, y, i0, i1) \
7154
+    v0 = _mm_mullo_epi32(_mm_sub_epi32(i0, i1), c11585); \
7155
+    v1 = _mm_mullo_epi32(_mm_add_epi32(i0, i1), c11585); \
7156
+    x = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \
7157
+    y = _mm_srai_epi32(_mm_add_epi32(v1, round), 14);
7158
+
7159
+#define X4(x, y, i0, i1, m0, m1) \
7160
+    v0 = _mm_add_epi32(_mm_mullo_epi32(i0, m0), _mm_mullo_epi32(i1, m1)); \
7161
+    v1 = _mm_sub_epi32(_mm_mullo_epi32(i0, m1), _mm_mullo_epi32(i1, m0)); \
7162
+    x = _mm_srai_epi32(_mm_add_epi32(v0, round), 14); \
7163
+    y = _mm_srai_epi32(_mm_add_epi32(v1, round), 14);
7164
+
7165
+#define X5(d, add, a0, a1, a2, a3, b0, b1, b2, b3) \
7166
+    v0 = _mm_##add##_epi32(a0, b0); \
7167
+    v1 = _mm_##add##_epi32(a1, b1); \
7168
+    v2 = _mm_##add##_epi32(a2, b2); \
7169
+    v3 = _mm_##add##_epi32(a3, b3); \
7170
+    v0 = _mm_packs_epi32(v0, v1); \
7171
+    v1 = _mm_packs_epi32(v2, v3); \
7172
+    v2 = _mm_unpacklo_epi16(v0, v1); \
7173
+    v3 = _mm_unpackhi_epi16(v0, v1); \
7174
+    v0 = _mm_unpacklo_epi16(v2, v3); \
7175
+    v1 = _mm_unpackhi_epi16(v2, v3); \
7176
+    VEC_STL(out + d + sz * 0, v0); \
7177
+    VEC_STH(out + d + sz * 1, v0); \
7178
+    VEC_STL(out + d + sz * 2, v1); \
7179
+    VEC_STH(out + d + sz * 3, v1);
7180
+
7181
+#define X6(d, add, a0, a1, a2, a3, b0, b1, b2, b3) \
7182
+    v0 = _mm_##add##_epi32(a0, b0); \
7183
+    v1 = _mm_##add##_epi32(a1, b1); \
7184
+    v2 = _mm_##add##_epi32(a2, b2); \
7185
+    v3 = _mm_##add##_epi32(a3, b3); \
7186
+    v0 = _mm_packs_epi32(v0, v1); \
7187
+    v1 = _mm_packs_epi32(v2, v3); \
7188
+    VEC_STL(out + sz * (d + 0), v0); \
7189
+    VEC_STH(out + sz * (d + 1), v0); \
7190
+    VEC_STL(out + sz * (d + 2), v1); \
7191
+    VEC_STH(out + sz * (d + 3), v1);
7192
+
7193
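X1 replicates a pair of 16-bit constants (the VP9 cosine table scaled by 2^14, e.g. 11585 is roughly cos(pi/4) * 2^14) into every 32-bit lane, and X2 feeds interleaved input pairs through _mm_madd_epi16 so each butterfly output costs one multiply-add. Per lane, X2 computes the following (illustrative scalar form):

    /* One lane of X2: two rotated outputs from two inputs, with the 14-bit
     * fixed-point rounding used throughout these transforms. */
    static void butterfly_ref(int in0, int in1,
                              int c00, int c01,   /* constants packed in f##x */
                              int c10, int c11,   /* constants packed in f##y */
                              int *out0, int *out1)
    {
        *out0 = (in0 * c00 + in1 * c01 + (1 << 13)) >> 14;
        *out1 = (in0 * c10 + in1 * c11 + (1 << 13)) >> 14;
    }

X3 and X4 are the same kind of rotation kept in 32-bit precision, while X5 and X6 pack four 32-bit result vectors back to int16 and store four rows, X5 additionally transposing so that the second pass can read its input rows contiguously.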
+static av_always_inline void idct4_1d(const int16_t *in,
7194
+                                      int16_t *out, int pass)
7195
+{
7196
+    __m128i v0, v1, v2, v3;
7197
+    __m128i t0a, t1a, t2a, t3a;
7198
+    __m128i round = _mm_set1_epi32(1 << 13);
7199
+    int sz = 4;
7200
+
7201
+    X1(f0, 11585, 11585)
7202
+    X1(f1, 11585, -11585)
7203
+    X1(f2, 6270, -15137)
7204
+    X1(f3, 15137, 6270)
7205
+
7206
+    X2(0, 1, 0, 2)
7207
+    X2(2, 3, 1, 3)
7208
+
7209
+    v0 = _mm_add_epi32(t0a, t3a);
7210
+    v1 = _mm_add_epi32(t1a, t2a);
7211
+    v2 = _mm_sub_epi32(t1a, t2a);
7212
+    v3 = _mm_sub_epi32(t0a, t3a);
7213
+    v0 = _mm_packs_epi32(v0, v1);
7214
+    v1 = _mm_packs_epi32(v2, v3);
7215
+    if (!pass) {
7216
+        v2 = _mm_unpacklo_epi16(v0, v1);
7217
+        v3 = _mm_unpackhi_epi16(v0, v1);
7218
+        v0 = _mm_unpacklo_epi16(v2, v3);
7219
+        v1 = _mm_unpackhi_epi16(v2, v3);
7220
+    }
7221
+    VEC_STL(out + sz * 0, v0);
7222
+    VEC_STH(out + sz * 1, v0);
7223
+    VEC_STL(out + sz * 2, v1);
7224
+    VEC_STH(out + sz * 3, v1);
7225
+}
7226
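For comparison, the scalar 4-point IDCT that each 32-bit lane above evaluates; when pass == 0 the result is stored transposed so the second pass can run the same row-wise code.

    #include <stdint.h>

    /* Scalar equivalent of one lane of idct4_1d (constants scaled by 2^14). */
    static void idct4_ref(const int16_t in[4], int out[4])
    {
        int t0 = ((in[0] + in[2]) * 11585 + (1 << 13)) >> 14;
        int t1 = ((in[0] - in[2]) * 11585 + (1 << 13)) >> 14;
        int t2 = (in[1] *  6270 - in[3] * 15137 + (1 << 13)) >> 14;
        int t3 = (in[1] * 15137 + in[3] *  6270 + (1 << 13)) >> 14;

        out[0] = t0 + t3;
        out[1] = t1 + t2;
        out[2] = t1 - t2;
        out[3] = t0 - t3;
    }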
+
7227
+static av_always_inline void idct8_1d(const int16_t *in,
7228
+                                      int16_t *out, int pass)
7229
+{
7230
+    __m128i v0, v1, v2, v3;
7231
+    __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
7232
+    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
7233
+    __m128i round = _mm_set1_epi32(1 << 13);
7234
+    __m128i c11585 = _mm_set1_epi32(11585);
7235
+
7236
+    X1(f0, 11585, 11585)
7237
+    X1(f1, 11585, -11585)
7238
+    X1(f2, 6270, -15137)
7239
+    X1(f3, 15137, 6270)
7240
+
7241
+    X1(f4, 3196, -16069)
7242
+    X1(f7, 16069, 3196)
7243
+    X1(f5, 13623, -9102)
7244
+    X1(f6, 9102, 13623)
7245
+
7246
+    int i, sz = 8;
7247
+    PRAGMA_E2K("ivdep")
7248
+    for (i = 0; i < sz; i += 4, in += 4) {
7249
+        X2(0, 1, 0, 4)
7250
+        X2(2, 3, 2, 6)
7251
+        X2(4, 7, 1, 7)
7252
+        X2(5, 6, 5, 3)
7253
+
7254
+        t0  = _mm_add_epi32(t0a, t3a);
7255
+        t1  = _mm_add_epi32(t1a, t2a);
7256
+        t2  = _mm_sub_epi32(t1a, t2a);
7257
+        t3  = _mm_sub_epi32(t0a, t3a);
7258
+        t4  = _mm_add_epi32(t4a, t5a);
7259
+        t5a = _mm_sub_epi32(t4a, t5a);
7260
+        t7  = _mm_add_epi32(t7a, t6a);
7261
+        t6a = _mm_sub_epi32(t7a, t6a);
7262
+
7263
+        X3(t5, t6, t6a, t5a)
7264
+
7265
+        if (!pass) {
7266
+            X5(0, add, t0, t1, t2, t3, t7, t6, t5, t4)
7267
+            X5(4, sub, t3, t2, t1, t0, t4, t5, t6, t7)
7268
+            out += 4 * sz;
7269
+        } else {
7270
+            X6(0, add, t0, t1, t2, t3, t7, t6, t5, t4)
7271
+            X6(4, sub, t3, t2, t1, t0, t4, t5, t6, t7)
7272
+            out += 4;
7273
+        }
7274
+    }
7275
+}
7276
+
7277
+static av_always_inline void idct16_1d(const int16_t *in,
7278
+                                       int16_t *out, int pass)
7279
+{
7280
+    __m128i v0, v1, v2, v3;
7281
+    __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
7282
+    __m128i t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
7283
+    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
7284
+    __m128i t8, t9, t10, t11, t12, t13, t14, t15;
7285
+    __m128i round = _mm_set1_epi32(1 << 13);
7286
+    __m128i c11585 = _mm_set1_epi32(11585);
7287
+    __m128i m15137 = _mm_set1_epi32(-15137), c6270 = _mm_set1_epi32(6270);
7288
+
7289
+    X1(f0, 11585, 11585)
7290
+    X1(f1, 11585, -11585)
7291
+    X1(f2, 6270, -15137)
7292
+    X1(f3, 15137, 6270)
7293
+
7294
+    X1(f4, 3196, -16069)
7295
+    X1(f7, 16069, 3196)
7296
+    X1(f5, 13623, -9102)
7297
+    X1(f6, 9102, 13623)
7298
+
7299
+    X1(f8, 1606, -16305)
7300
+    X1(f15, 16305, 1606)
7301
+    X1(f9, 12665, -10394)
7302
+    X1(f14, 10394, 12665)
7303
+    X1(f10, 7723, -14449)
7304
+    X1(f13, 14449, 7723)
7305
+    X1(f11, 15679, -4756)
7306
+    X1(f12, 4756, 15679)
7307
+
7308
+    int i, sz = 16;
7309
+    PRAGMA_E2K("ivdep")
7310
+    for (i = 0; i < sz; i += 4, in += 4) {
7311
+        X2( 0,  1,  0,  8)
7312
+        X2( 2,  3,  4, 12)
7313
+        X2( 4,  7,  2, 14)
7314
+        X2( 5,  6, 10,  6)
7315
+        X2( 8, 15,  1, 15)
7316
+        X2( 9, 14,  9,  7)
7317
+        X2(10, 13,  5, 11)
7318
+        X2(11, 12, 13,  3)
7319
+
7320
+        t0  = _mm_add_epi32(t0a, t3a);
7321
+        t1  = _mm_add_epi32(t1a, t2a);
7322
+        t2  = _mm_sub_epi32(t1a, t2a);
7323
+        t3  = _mm_sub_epi32(t0a, t3a);
7324
+        t4  = _mm_add_epi32(t4a, t5a);
7325
+        t5  = _mm_sub_epi32(t4a, t5a);
7326
+        t6  = _mm_sub_epi32(t7a, t6a);
7327
+        t7  = _mm_add_epi32(t7a, t6a);
7328
+        t8  = _mm_add_epi32(t8a, t9a);
7329
+        t9  = _mm_sub_epi32(t8a, t9a);
7330
+        t10 = _mm_sub_epi32(t11a, t10a);
7331
+        t11 = _mm_add_epi32(t11a, t10a);
7332
+        t12 = _mm_add_epi32(t12a, t13a);
7333
+        t13 = _mm_sub_epi32(t12a, t13a);
7334
+        t14 = _mm_sub_epi32(t15a, t14a);
7335
+        t15 = _mm_add_epi32(t15a, t14a);
7336
+
7337
+        X3( t5a,  t6a, t6,  t5)
7338
+        X4( t9a, t14a, t9,  t14, m15137, c6270)
7339
+        X4(t13a, t10a, t13, t10, c6270, m15137)
7340
+
7341
+        t0a  = _mm_add_epi32(t0, t7);
7342
+        t1a  = _mm_add_epi32(t1, t6a);
7343
+        t2a  = _mm_add_epi32(t2, t5a);
7344
+        t3a  = _mm_add_epi32(t3, t4);
7345
+        t4   = _mm_sub_epi32(t3, t4);
7346
+        t5   = _mm_sub_epi32(t2, t5a);
7347
+        t6   = _mm_sub_epi32(t1, t6a);
7348
+        t7   = _mm_sub_epi32(t0, t7);
7349
+        t8a  = _mm_add_epi32(t8, t11);
7350
+        t9   = _mm_add_epi32(t9a, t10a);
7351
+        t10  = _mm_sub_epi32(t9a, t10a);
7352
+        t11a = _mm_sub_epi32(t8, t11);
7353
+        t12a = _mm_sub_epi32(t15, t12);
7354
+        t13  = _mm_sub_epi32(t14a, t13a);
7355
+        t14  = _mm_add_epi32(t14a, t13a);
7356
+        t15a = _mm_add_epi32(t15, t12);
7357
+
7358
+        X3(t10a, t13a, t13, t10)
7359
+        X3(t11, t12, t12a, t11a)
7360
+
7361
+        if (!pass) {
7362
+            X5( 0, add, t0a, t1a, t2a, t3a, t15a, t14, t13a, t12)
7363
+            X5( 4, add, t4,  t5,  t6,  t7,  t11, t10a, t9, t8a)
7364
+            X5( 8, sub, t7,  t6,  t5,  t4,  t8a, t9, t10a, t11)
7365
+            X5(12, sub, t3a, t2a, t1a, t0a, t12, t13a, t14, t15a)
7366
+            out += 4 * sz;
7367
+        } else {
7368
+            X6( 0, add, t0a, t1a, t2a, t3a, t15a, t14, t13a, t12)
7369
+            X6( 4, add, t4,  t5,  t6,  t7,  t11, t10a, t9, t8a)
7370
+            X6( 8, sub, t7,  t6,  t5,  t4,  t8a, t9, t10a, t11)
7371
+            X6(12, sub, t3a, t2a, t1a, t0a, t12, t13a, t14, t15a)
7372
+            out += 4;
7373
+        }
7374
+    }
7375
+}
7376
+
7377
+static av_always_inline void idct32_1d(const int16_t *in,
7378
+                                       int16_t *out, int pass)
7379
+{
7380
+    __m128i v0, v1, v2, v3;
7381
+    __m128i t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
7382
+    __m128i t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
7383
+    __m128i t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
7384
+    __m128i t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
7385
+    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
7386
+    __m128i t8, t9, t10, t11, t12, t13, t14, t15;
7387
+    __m128i t16, t17, t18, t19, t20, t21, t22, t23;
7388
+    __m128i t24, t25, t26, t27, t28, t29, t30, t31;
7389
+    __m128i round = _mm_set1_epi32(1 << 13);
7390
+    __m128i c11585 = _mm_set1_epi32(11585);
7391
+    __m128i m15137 = _mm_set1_epi32(-15137), c6270 = _mm_set1_epi32(6270);
7392
+    __m128i m16069 = _mm_set1_epi32(-16069), c3196 = _mm_set1_epi32(3196);
7393
+    __m128i m9102 = _mm_set1_epi32(-9102), c13623 = _mm_set1_epi32(13623);
7394
+
7395
+    X1(f0, 11585, 11585)
7396
+    X1(f1, 11585, -11585)
7397
+    X1(f2, 6270, -15137)
7398
+    X1(f3, 15137, 6270)
7399
+
7400
+    X1(f4, 3196, -16069)
7401
+    X1(f7, 16069, 3196)
7402
+    X1(f5, 13623, -9102)
7403
+    X1(f6, 9102, 13623)
7404
+
7405
+    X1(f8, 1606, -16305)
7406
+    X1(f15, 16305, 1606)
7407
+    X1(f9, 12665, -10394)
7408
+    X1(f14, 10394, 12665)
7409
+    X1(f10, 7723, -14449)
7410
+    X1(f13, 14449, 7723)
7411
+    X1(f11, 15679, -4756)
7412
+    X1(f12, 4756, 15679)
7413
+
7414
+    X1(f16, 804, -16364)
7415
+    X1(f31, 16364, 804)
7416
+    X1(f17, 12140, -11003)
7417
+    X1(f30, 11003, 12140)
7418
+    X1(f18, 7005, -14811)
7419
+    X1(f29, 14811, 7005)
7420
+    X1(f19, 15426, -5520)
7421
+    X1(f28, 5520, 15426)
7422
+    X1(f20, 3981, -15893)
7423
+    X1(f27, 15893, 3981)
7424
+    X1(f21, 14053, -8423U)
7425
+    X1(f26, 8423U, 14053)
7426
+    X1(f22, 9760, -13160)
7427
+    X1(f25, 13160, 9760)
7428
+    X1(f23, 16207, -2404)
7429
+    X1(f24, 2404, 16207)
7430
+
7431
+    int i, sz = 32;
7432
+    PRAGMA_E2K("ivdep")
7433
+    for (i = 0; i < sz; i += 4, in += 4) {
7434
+        X2( 0,  1,  0, 16)
7435
+        X2( 2,  3,  8, 24)
7436
+        X2( 4,  7,  4, 28)
7437
+        X2( 5,  6, 20, 12)
7438
+        X2( 8, 15,  2, 30)
7439
+        X2( 9, 14, 18, 14)
7440
+        X2(10, 13, 10, 22)
7441
+        X2(11, 12, 26,  6)
7442
+        X2(16, 31,  1, 31)
7443
+        X2(17, 30, 17, 15)
7444
+        X2(18, 29,  9, 23)
7445
+        X2(19, 28, 25,  7)
7446
+        X2(20, 27,  5, 27)
7447
+        X2(21, 26, 21, 11)
7448
+        X2(22, 25, 13, 19)
7449
+        X2(23, 24, 29,  3)
7450
+
7451
+        t0  = _mm_add_epi32(t0a, t3a);
7452
+        t1  = _mm_add_epi32(t1a, t2a);
7453
+        t2  = _mm_sub_epi32(t1a, t2a);
7454
+        t3  = _mm_sub_epi32(t0a, t3a);
7455
+        t4  = _mm_add_epi32(t4a, t5a);
7456
+        t5  = _mm_sub_epi32(t4a, t5a);
7457
+        t6  = _mm_sub_epi32(t7a, t6a);
7458
+        t7  = _mm_add_epi32(t7a, t6a);
7459
+        t8  = _mm_add_epi32(t8a, t9a);
7460
+        t9  = _mm_sub_epi32(t8a, t9a);
7461
+        t10 = _mm_sub_epi32(t11a, t10a);
7462
+        t11 = _mm_add_epi32(t11a, t10a);
7463
+        t12 = _mm_add_epi32(t12a, t13a);
7464
+        t13 = _mm_sub_epi32(t12a, t13a);
7465
+        t14 = _mm_sub_epi32(t15a, t14a);
7466
+        t15 = _mm_add_epi32(t15a, t14a);
7467
+        t16 = _mm_add_epi32(t16a, t17a);
7468
+        t17 = _mm_sub_epi32(t16a, t17a);
7469
+        t18 = _mm_sub_epi32(t19a, t18a);
7470
+        t19 = _mm_add_epi32(t19a, t18a);
7471
+        t20 = _mm_add_epi32(t20a, t21a);
7472
+        t21 = _mm_sub_epi32(t20a, t21a);
7473
+        t22 = _mm_sub_epi32(t23a, t22a);
7474
+        t23 = _mm_add_epi32(t23a, t22a);
7475
+        t24 = _mm_add_epi32(t24a, t25a);
7476
+        t25 = _mm_sub_epi32(t24a, t25a);
7477
+        t26 = _mm_sub_epi32(t27a, t26a);
7478
+        t27 = _mm_add_epi32(t27a, t26a);
7479
+        t28 = _mm_add_epi32(t28a, t29a);
7480
+        t29 = _mm_sub_epi32(t28a, t29a);
7481
+        t30 = _mm_sub_epi32(t31a, t30a);
7482
+        t31 = _mm_add_epi32(t31a, t30a);
7483
+
7484
+        X3( t5a,  t6a, t6,  t5)
7485
+        X4( t9a, t14a, t9,  t14, m15137, c6270)
7486
+        X4(t13a, t10a, t13, t10, c6270, m15137)
7487
+        X4(t17a, t30a, t17, t30, m16069, c3196)
7488
+        X4(t29a, t18a, t29, t18, c3196, m16069)
7489
+        X4(t21a, t26a, t21, t26, m9102, c13623)
7490
+        X4(t25a, t22a, t25, t22, c13623, m9102)
7491
+
7492
+        t0a  = _mm_add_epi32(t0, t7);
7493
+        t1a  = _mm_add_epi32(t1, t6a);
7494
+        t2a  = _mm_add_epi32(t2, t5a);
7495
+        t3a  = _mm_add_epi32(t3, t4);
7496
+        t4a  = _mm_sub_epi32(t3, t4);
7497
+        t5   = _mm_sub_epi32(t2, t5a);
7498
+        t6   = _mm_sub_epi32(t1, t6a);
7499
+        t7a  = _mm_sub_epi32(t0, t7);
7500
+        t8a  = _mm_add_epi32(t8, t11);
7501
+        t9   = _mm_add_epi32(t9a, t10a);
7502
+        t10  = _mm_sub_epi32(t9a, t10a);
7503
+        t11a = _mm_sub_epi32(t8, t11);
7504
+        t12a = _mm_sub_epi32(t15, t12);
7505
+        t13  = _mm_sub_epi32(t14a, t13a);
7506
+        t14  = _mm_add_epi32(t14a, t13a);
7507
+        t15a = _mm_add_epi32(t15, t12);
7508
+        t16a = _mm_add_epi32(t16, t19);
7509
+        t17  = _mm_add_epi32(t17a, t18a);
7510
+        t18  = _mm_sub_epi32(t17a, t18a);
7511
+        t19a = _mm_sub_epi32(t16, t19);
7512
+        t20a = _mm_sub_epi32(t23, t20);
7513
+        t21  = _mm_sub_epi32(t22a, t21a);
7514
+        t22  = _mm_add_epi32(t22a, t21a);
7515
+        t23a = _mm_add_epi32(t23, t20);
7516
+        t24a = _mm_add_epi32(t24, t27);
7517
+        t25  = _mm_add_epi32(t25a, t26a);
7518
+        t26  = _mm_sub_epi32(t25a, t26a);
7519
+        t27a = _mm_sub_epi32(t24, t27);
7520
+        t28a = _mm_sub_epi32(t31, t28);
7521
+        t29  = _mm_sub_epi32(t30a, t29a);
7522
+        t30  = _mm_add_epi32(t30a, t29a);
7523
+        t31a = _mm_add_epi32(t31, t28);
7524
+
7525
+        X3(t10a, t13a, t13, t10)
7526
+        X3(t11, t12, t12a, t11a)
7527
+        X4(t18a, t29a, t18, t29, m15137, c6270)
7528
+        X4(t19, t28, t19a, t28a, m15137, c6270)
7529
+        X4(t27, t20, t27a, t20a, c6270, m15137)
7530
+        X4(t26a, t21a, t26, t21, c6270, m15137)
7531
+
7532
+        t0   = _mm_add_epi32(t0a, t15a);
7533
+        t1   = _mm_add_epi32(t1a, t14);
7534
+        t2   = _mm_add_epi32(t2a, t13a);
7535
+        t3   = _mm_add_epi32(t3a, t12);
7536
+        t4   = _mm_add_epi32(t4a, t11);
7537
+        t5a  = _mm_add_epi32(t5, t10a);
7538
+        t6a  = _mm_add_epi32(t6, t9);
7539
+        t7   = _mm_add_epi32(t7a, t8a);
7540
+        t8   = _mm_sub_epi32(t7a, t8a);
7541
+        t9a  = _mm_sub_epi32(t6, t9);
7542
+        t10  = _mm_sub_epi32(t5, t10a);
7543
+        t11a = _mm_sub_epi32(t4a, t11);
7544
+        t12a = _mm_sub_epi32(t3a, t12);
7545
+        t13  = _mm_sub_epi32(t2a, t13a);
7546
+        t14a = _mm_sub_epi32(t1a, t14);
7547
+        t15  = _mm_sub_epi32(t0a, t15a);
7548
+        t16  = _mm_add_epi32(t16a, t23a);
7549
+        t17a = _mm_add_epi32(t17, t22);
7550
+        t18  = _mm_add_epi32(t18a, t21a);
7551
+        t19a = _mm_add_epi32(t19, t20);
7552
+        t20a = _mm_sub_epi32(t19, t20);
7553
+        t21  = _mm_sub_epi32(t18a, t21a);
7554
+        t22a = _mm_sub_epi32(t17, t22);
7555
+        t23  = _mm_sub_epi32(t16a, t23a);
7556
+        t24  = _mm_sub_epi32(t31a, t24a);
7557
+        t25a = _mm_sub_epi32(t30, t25);
7558
+        t26  = _mm_sub_epi32(t29a, t26a);
7559
+        t27a = _mm_sub_epi32(t28, t27);
7560
+        t28a = _mm_add_epi32(t28, t27);
7561
+        t29  = _mm_add_epi32(t29a, t26a);
7562
+        t30a = _mm_add_epi32(t30, t25);
7563
+        t31  = _mm_add_epi32(t31a, t24a);
7564
+
7565
+        X3(t20, t27, t27a, t20a)
7566
+        X3(t21a, t26a, t26, t21)
7567
+        X3(t22, t25, t25a, t22a)
7568
+        X3(t23a, t24a, t24, t23)
7569
+
7570
+        if (!pass) {
7571
+            X5( 0, add, t0,   t1,  t2,   t3,   t31, t30a, t29, t28a)
7572
+            X5( 4, add, t4,   t5a, t6a,  t7,   t27, t26a, t25, t24a)
7573
+            X5( 8, add, t8,   t9a, t10,  t11a, t23a, t22, t21a, t20)
7574
+            X5(12, add, t12a, t13, t14a, t15,  t19a, t18, t17a, t16)
7575
+            X5(16, sub, t15,  t14a, t13, t12a, t16, t17a, t18, t19a)
7576
+            X5(20, sub, t11a, t10,  t9a, t8,   t20, t21a, t22, t23a)
7577
+            X5(24, sub, t7,   t6a,  t5a, t4,   t24a, t25, t26a, t27)
7578
+            X5(28, sub, t3,   t2,   t1,  t0,   t28a, t29, t30a, t31)
7579
+            out += 4 * sz;
7580
+        } else {
7581
+            X6( 0, add, t0,   t1,  t2,   t3,   t31, t30a, t29, t28a)
7582
+            X6( 4, add, t4,   t5a, t6a,  t7,   t27, t26a, t25, t24a)
7583
+            X6( 8, add, t8,   t9a, t10,  t11a, t23a, t22, t21a, t20)
7584
+            X6(12, add, t12a, t13, t14a, t15,  t19a, t18, t17a, t16)
7585
+            X6(16, sub, t15,  t14a, t13, t12a, t16, t17a, t18, t19a)
7586
+            X6(20, sub, t11a, t10,  t9a, t8,   t20, t21a, t22, t23a)
7587
+            X6(24, sub, t7,   t6a,  t5a, t4,   t24a, t25, t26a, t27)
7588
+            X6(28, sub, t3,   t2,   t1,  t0,   t28a, t29, t30a, t31)
7589
+            out += 4;
7590
+        }
7591
+    }
7592
+}
7593
+
7594
+#undef IN
7595
+#undef X1
7596
+#undef X2
7597
+#undef X3
7598
+#undef X4
7599
+#undef X5
7600
+#undef X6
7601
+
7602
+itxfm_wrapper4(idct, idct, 4, 4, 1)
7603
+itxfm_wrapper8(idct, idct, 8, 5, 1)
7604
+itxfm_wrapper16(idct, idct, 16, 6, 1)
7605
+itxfm_wrapper16(idct, idct, 32, 6, 1)
7606
+
7607
+#undef itxfm_wrapper4
7608
+#undef itxfm_wrapper8
7609
+#undef itxfm_wrapper16
7610
+
7611
+static av_cold void ff_vp9dsp_itxfm_init_8_e2k(VP9DSPContext *dsp)
7612
+{
7613
+
7614
+#define init_idct(tx, nm) \
7615
+    dsp->itxfm_add[tx][DCT_DCT]   = \
7616
+    dsp->itxfm_add[tx][ADST_DCT]  = \
7617
+    dsp->itxfm_add[tx][DCT_ADST]  = \
7618
+    dsp->itxfm_add[tx][ADST_ADST] = nm##_add_e2k
7619
+
7620
+    dsp->itxfm_add[TX_4X4][DCT_DCT] = idct_idct_4x4_add_e2k;
7621
+    dsp->itxfm_add[TX_8X8][DCT_DCT] = idct_idct_8x8_add_e2k;
7622
+    dsp->itxfm_add[TX_16X16][DCT_DCT] = idct_idct_16x16_add_e2k;
7623
+
7624
+    init_idct(TX_32X32, idct_idct_32x32);
7625
+
7626
+#undef init_idct
7627
+}
7628
+
7629
+#define LOAD_TRANSPOSE8(dst, a0, a1, a2, a3, a4, a5, a6, a7) \
7630
+    t0 = VEC_LD8(dst + stride * 0); \
7631
+    t1 = VEC_LD8(dst + stride * 1); \
7632
+    t2 = VEC_LD8(dst + stride * 2); \
7633
+    t3 = VEC_LD8(dst + stride * 3); \
7634
+    t4 = VEC_LD8(dst + stride * 4); \
7635
+    t5 = VEC_LD8(dst + stride * 5); \
7636
+    t6 = VEC_LD8(dst + stride * 6); \
7637
+    t7 = VEC_LD8(dst + stride * 7); \
7638
+    t0 = _mm_unpacklo_epi8(t0, t4); \
7639
+    t1 = _mm_unpacklo_epi8(t1, t5); \
7640
+    t2 = _mm_unpacklo_epi8(t2, t6); \
7641
+    t3 = _mm_unpacklo_epi8(t3, t7); \
7642
+    t4 = _mm_unpacklo_epi8(t0, t2); \
7643
+    t5 = _mm_unpackhi_epi8(t0, t2); \
7644
+    t6 = _mm_unpacklo_epi8(t1, t3); \
7645
+    t7 = _mm_unpackhi_epi8(t1, t3); \
7646
+    t0 = _mm_unpacklo_epi8(t4, t6); \
7647
+    t1 = _mm_unpackhi_epi8(t4, t6); \
7648
+    t2 = _mm_unpacklo_epi8(t5, t7); \
7649
+    t3 = _mm_unpackhi_epi8(t5, t7); \
7650
+    a0 = _mm_unpacklo_epi8(t0, zerov); \
7651
+    a1 = _mm_unpackhi_epi8(t0, zerov); \
7652
+    a2 = _mm_unpacklo_epi8(t1, zerov); \
7653
+    a3 = _mm_unpackhi_epi8(t1, zerov); \
7654
+    a4 = _mm_unpacklo_epi8(t2, zerov); \
7655
+    a5 = _mm_unpackhi_epi8(t2, zerov); \
7656
+    a6 = _mm_unpacklo_epi8(t3, zerov); \
7657
+    a7 = _mm_unpackhi_epi8(t3, zerov)
7658
+
7659
+#define STORE_TRANSPOSE8(dst, a0, a1, a2, a3, a4, a5, a6, a7) \
7660
+    t0 = _mm_packus_epi16(a0, a1); \
7661
+    t1 = _mm_packus_epi16(a2, a3); \
7662
+    t2 = _mm_packus_epi16(a4, a5); \
7663
+    t3 = _mm_packus_epi16(a6, a7); \
7664
+    t4 = _mm_unpacklo_epi8(t0, t2); \
7665
+    t5 = _mm_unpackhi_epi8(t0, t2); \
7666
+    t6 = _mm_unpacklo_epi8(t1, t3); \
7667
+    t7 = _mm_unpackhi_epi8(t1, t3); \
7668
+    t0 = _mm_unpacklo_epi8(t4, t6); \
7669
+    t1 = _mm_unpackhi_epi8(t4, t6); \
7670
+    t2 = _mm_unpacklo_epi8(t5, t7); \
7671
+    t3 = _mm_unpackhi_epi8(t5, t7); \
7672
+    t4 = _mm_unpacklo_epi8(t0, t2); \
7673
+    t5 = _mm_unpackhi_epi8(t0, t2); \
7674
+    t6 = _mm_unpacklo_epi8(t1, t3); \
7675
+    t7 = _mm_unpackhi_epi8(t1, t3); \
7676
+    VEC_STL(dst + stride * 0, t4); \
7677
+    VEC_STH(dst + stride * 1, t4); \
7678
+    VEC_STL(dst + stride * 2, t5); \
7679
+    VEC_STH(dst + stride * 3, t5); \
7680
+    VEC_STL(dst + stride * 4, t6); \
7681
+    VEC_STH(dst + stride * 5, t6); \
7682
+    VEC_STL(dst + stride * 6, t7); \
7683
+    VEC_STH(dst + stride * 7, t7)
7684
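LOAD_TRANSPOSE8 reads eight rows of eight pixels around a vertical edge and transposes them with three rounds of byte interleaves (the usual log2(8)-stage unpack network), then widens each former column to 16-bit lanes; STORE_TRANSPOSE8 packs and applies the inverse permutation on the way out. The net data movement is a plain 8x8 transpose (illustrative reference):

    #include <stdint.h>

    /* What the unpacklo/unpackhi rounds accomplish: out[c][r] = in[r][c],
     * so each filtered "row" in the SIMD code is really one pixel column. */
    static void transpose8x8_ref(const uint8_t in[8][8], uint8_t out[8][8])
    {
        int r, c;
        for (r = 0; r < 8; r++)
            for (c = 0; c < 8; c++)
                out[c][r] = in[r][c];
    }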
+
7685
+
7686
+static av_always_inline void loop_filter_h(uint8_t *dst, int E, int I, int H,
7687
+                                           ptrdiff_t stride, int wd)
7688
+{
7689
+    int F = 1;
7690
+    LOAD_ZERO;
7691
+    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
7692
+    __m128i vfm, vflat8out, vflat8in, vhev;
7693
+    __m128i p2a, p1a, p0a, q0a, q1a, q2a;
7694
+    __m128i p6a, p5a, p4a, p3a, q3a, q4a, q5a, q6a;
7695
+
7696
+    __m128i vF = _mm_set1_epi16(F);
7697
+    __m128i vI = _mm_set1_epi16(I);
7698
+    __m128i vE = _mm_set1_epi16(E);
7699
+    __m128i vH = _mm_set1_epi16(H);
7700
+
7701
+    __m128i p7, p6, p5, p4, p3, p2, p1, p0;
7702
+    __m128i q0, q1, q2, q3, q4, q5, q6, q7;
7703
+
7704
+    if (wd >= 16) {
7705
+        LOAD_TRANSPOSE8(dst - 8, p7, p6, p5, p4, p3, p2, p1, p0);
7706
+        LOAD_TRANSPOSE8(dst    , q0, q1, q2, q3, q4, q5, q6, q7);
7707
+        p6a = p6; p5a = p5;
7708
+        p4a = p4; p3a = p3;
7709
+        q3a = q3; q4a = q4;
7710
+        q5a = q5; q6a = q6;
7711
+    } else {
7712
+        LOAD_TRANSPOSE8(dst - 4, p3, p2, p1, p0, q0, q1, q2, q3);
7713
+    }
7714
+
7715
+    t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p2));
7716
+    t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p1));
7717
+    t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0));
7718
+    t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0));
7719
+    t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q1));
7720
+    t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q2));
7721
+    vhev = _mm_or_si128(_mm_cmpgt_epi16(t2, vH), _mm_cmpgt_epi16(t3, vH));
7722
+    t0 = _mm_cmpgt_epi16(t0, vI);
7723
+    t1 = _mm_cmpgt_epi16(t1, vI);
7724
+    t2 = _mm_cmpgt_epi16(t2, vI);
7725
+    t3 = _mm_cmpgt_epi16(t3, vI);
7726
+    t4 = _mm_cmpgt_epi16(t4, vI);
7727
+    t5 = _mm_cmpgt_epi16(t5, vI);
7728
+
7729
+    t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2);
7730
+    t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5);
7731
+    t0 = _mm_or_si128(t0, t3);
7732
+
7733
+    t6 = _mm_abs_epi16(_mm_sub_epi16(p0, q0));
7734
+    t7 = _mm_abs_epi16(_mm_sub_epi16(p1, q1));
7735
+    t6 = _mm_add_epi16(_mm_slli_epi16(t6, 1), _mm_srai_epi16(t7, 1));
7736
+    t6 = _mm_cmpgt_epi16(t6, vE);
7737
+    vfm = _mm_or_si128(t0, t6); // !fm
7738
+
7739
+    if (_mm_movemask_epi8(vfm) == 0xffff) return;
7740
+
7741
+    if (wd >= 8) {
7742
+        t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p0));
7743
+        t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p0));
7744
+        t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0));
7745
+        t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0));
7746
+        t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q0));
7747
+        t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q0));
7748
+        t0 = _mm_cmpgt_epi16(t0, vF);
7749
+        t1 = _mm_cmpgt_epi16(t1, vF);
7750
+        t2 = _mm_cmpgt_epi16(t2, vF);
7751
+        t3 = _mm_cmpgt_epi16(t3, vF);
7752
+        t4 = _mm_cmpgt_epi16(t4, vF);
7753
+        t5 = _mm_cmpgt_epi16(t5, vF);
7754
+
7755
+        t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2);
7756
+        t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5);
7757
+        vflat8in = _mm_or_si128(_mm_or_si128(t0, t3), vfm);
7758
+    }
7759
+
7760
+    {
7761
+        __m128i c1 = _mm_set1_epi16(1);
7762
+        __m128i c127 = _mm_set1_epi16(127);
7763
+        __m128i m128 = _mm_set1_epi16(-128);
7764
+        __m128i c43 = _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3);
7765
+        t0 = _mm_and_si128(vhev, _mm_sub_epi16(p1, q1));
7766
+        t1 = _mm_sub_epi16(q0, p0);
7767
+        t0 = _mm_min_epi16(_mm_max_epi16(t0, m128), c127);
7768
+        // f = av_clip_intp2(p1 - q1, 7) & hev;
7769
+        t1 = _mm_add_epi16(_mm_add_epi16(t1, t0), _mm_add_epi16(t1, t1));
7770
+        t1 = _mm_andnot_si128(vfm, t1);
7771
+        t1 = _mm_packs_epi16(t1, t1);
7772
+        // f = av_clip_intp2(3 * (q0 - p0) + f, 7);
7773
+        t3 = _mm_adds_epi8(t1, c43);
7774
+        t2 = _mm_srai_epi16(_mm_unpacklo_epi8(t3, t3), 8 + 3);
7775
+        // f1 = FFMIN(f + 4, 0x7f) >> 3;
7776
+        t3 = _mm_srai_epi16(_mm_unpackhi_epi8(t3, t3), 8 + 3);
7777
+        // f2 = FFMIN(f + 3, 0x7f) >> 3;
7778
+        t4 = _mm_srai_epi16(_mm_add_epi16(t2, c1), 1);
7779
+        t4 = _mm_andnot_si128(vhev, t4); // f3 = ((f1 + 1) >> 1) & ~hev;
7780
+        p1a = _mm_add_epi16(p1, t4); // av_clip_uint8(p1 + f3);
7781
+        p0a = _mm_add_epi16(p0, t3); // av_clip_uint8(p0 + f2);
7782
+        q0a = _mm_sub_epi16(q0, t2); // av_clip_uint8(q0 - f1);
7783
+        q1a = _mm_sub_epi16(q1, t4); // av_clip_uint8(q1 - f3);
7784
+    }
7785
+
7786
+    p2a = p2; q2a = q2;
7787
+
7788
+    if (wd >= 8 && (_mm_movemask_epi8(vflat8in) != 0xffff)) {
7789
+        __m128i c4 = _mm_set1_epi16(4);
7790
+        t0 = _mm_add_epi16(_mm_slli_epi16(p3, 2), c4);
7791
+        t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2));
7792
+        t0 = _mm_sub_epi16(t0, p3);
7793
+        t0 = _mm_add_epi16(t0, t1);
7794
+        t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 3);
7795
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p3));
7796
+        t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 3);
7797
+        p2a = _mm_blendv_epi8(t1, p2, vflat8in);
7798
+        p1a = _mm_blendv_epi8(t2, p1a, vflat8in);
7799
+
7800
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p3));
7801
+        t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 3);
7802
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p3));
7803
+        t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 3);
7804
+        p0a = _mm_blendv_epi8(t1, p0a, vflat8in);
7805
+        q0a = _mm_blendv_epi8(t2, q0a, vflat8in);
7806
+
7807
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p2));
7808
+        t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 3);
7809
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p1));
7810
+        t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 3);
7811
+        q1a = _mm_blendv_epi8(t1, q1a, vflat8in);
7812
+        q2a = _mm_blendv_epi8(t2, q2, vflat8in);
7813
+
7814
+        if (wd >= 16) {
7815
+            t0 = _mm_abs_epi16(_mm_sub_epi16(p7, p0));
7816
+            t1 = _mm_abs_epi16(_mm_sub_epi16(p6, p0));
7817
+            t2 = _mm_abs_epi16(_mm_sub_epi16(p5, p0));
7818
+            t3 = _mm_abs_epi16(_mm_sub_epi16(p4, p0));
7819
+            t4 = _mm_abs_epi16(_mm_sub_epi16(q4, q0));
7820
+            t5 = _mm_abs_epi16(_mm_sub_epi16(q5, q0));
7821
+            t6 = _mm_abs_epi16(_mm_sub_epi16(q6, q0));
7822
+            t7 = _mm_abs_epi16(_mm_sub_epi16(q7, q0));
7823
+
7824
+            t0 = _mm_cmpgt_epi16(t0, vF);
7825
+            t1 = _mm_cmpgt_epi16(t1, vF);
7826
+            t2 = _mm_cmpgt_epi16(t2, vF);
7827
+            t3 = _mm_cmpgt_epi16(t3, vF);
7828
+            t4 = _mm_cmpgt_epi16(t4, vF);
7829
+            t5 = _mm_cmpgt_epi16(t5, vF);
7830
+            t6 = _mm_cmpgt_epi16(t6, vF);
7831
+            t7 = _mm_cmpgt_epi16(t7, vF);
7832
+
7833
+            t0 = _mm_or_si128(_mm_or_si128(t0, t1), _mm_or_si128(t2, t3));
7834
+            t4 = _mm_or_si128(_mm_or_si128(t4, t5), _mm_or_si128(t6, t7));
7835
+            vflat8out = _mm_or_si128(t0, t4);
7836
+            vflat8out = _mm_or_si128(vflat8out, vflat8in);
7837
+
7838
+            if (_mm_movemask_epi8(vflat8out) != 0xffff) {
7839
+                __m128i c8 = _mm_set1_epi16(8);
7840
+                t0 = _mm_add_epi16(_mm_slli_epi16(p7, 3), c8);
7841
+                t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2));
7842
+                t2 = _mm_add_epi16(_mm_add_epi16(p3, p4), _mm_add_epi16(p5, p6));
7843
+                t0 = _mm_sub_epi16(t0, p7);
7844
+                t0 = _mm_add_epi16(t0, t1);
7845
+                t0 = _mm_add_epi16(t0, t2);
7846
+                t1 = _mm_srai_epi16(_mm_add_epi16(p6, t0), 4);
7847
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p7));
7848
+                t2 = _mm_srai_epi16(_mm_add_epi16(p5, t0), 4);
7849
+                p6a = _mm_blendv_epi8(t1, p6, vflat8out);
7850
+                p5a = _mm_blendv_epi8(t2, p5, vflat8out);
7851
+
7852
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p7));
7853
+                t1 = _mm_srai_epi16(_mm_add_epi16(p4, t0), 4);
7854
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p7));
7855
+                t2 = _mm_srai_epi16(_mm_add_epi16(p3, t0), 4);
7856
+                p4a = _mm_blendv_epi8(t1, p4, vflat8out);
7857
+                p3a = _mm_blendv_epi8(t2, p3, vflat8out);
7858
+
7859
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q4, p7));
7860
+                t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 4);
7861
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q5, p7));
7862
+                t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 4);
7863
+                p2a = _mm_blendv_epi8(t1, p2a, vflat8out);
7864
+                p1a = _mm_blendv_epi8(t2, p1a, vflat8out);
7865
+
7866
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q6, p7));
7867
+                t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 4);
7868
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p7));
7869
+                t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 4);
7870
+                p0a = _mm_blendv_epi8(t1, p0a, vflat8out);
7871
+                q0a = _mm_blendv_epi8(t2, q0a, vflat8out);
7872
+
7873
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p6));
7874
+                t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 4);
7875
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p5));
7876
+                t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 4);
7877
+                q1a = _mm_blendv_epi8(t1, q1a, vflat8out);
7878
+                q2a = _mm_blendv_epi8(t2, q2a, vflat8out);
7879
+
7880
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p4));
7881
+                t1 = _mm_srai_epi16(_mm_add_epi16(q3, t0), 4);
7882
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p3));
7883
+                t2 = _mm_srai_epi16(_mm_add_epi16(q4, t0), 4);
7884
+                q3a = _mm_blendv_epi8(t1, q3, vflat8out);
7885
+                q4a = _mm_blendv_epi8(t2, q4, vflat8out);
7886
+
7887
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p2));
7888
+                t1 = _mm_srai_epi16(_mm_add_epi16(q5, t0), 4);
7889
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p1));
7890
+                t2 = _mm_srai_epi16(_mm_add_epi16(q6, t0), 4);
7891
+                q5a = _mm_blendv_epi8(t1, q5, vflat8out);
7892
+                q6a = _mm_blendv_epi8(t2, q6, vflat8out);
7893
+            }
7894
+        }
7895
+    }
7896
+
7897
+    if (wd >= 16) {
7898
+        STORE_TRANSPOSE8(dst - 8, p7,  p6a, p5a, p4a, p3a, p2a, p1a, p0a);
7899
+        STORE_TRANSPOSE8(dst    , q0a, q1a, q2a, q3a, q4a, q5a, q6a, q7 );
7900
+    } else if (wd >= 8) {
7901
+        STORE_TRANSPOSE8(dst - 4, p3,  p2a, p1a, p0a, q0a, q1a, q2a, q3 );
7902
+    } else {
7903
+        t0 = _mm_packus_epi16(p1a, p0a);
7904
+        t1 = _mm_packus_epi16(q0a, q1a);
7905
+        t2 = _mm_unpacklo_epi8(t0, t1);
7906
+        t3 = _mm_unpackhi_epi8(t0, t1);
7907
+        t0 = _mm_unpacklo_epi8(t2, t3);
7908
+        t1 = _mm_unpackhi_epi8(t2, t3);
7909
+        *(uint32_t*)(dst - 2 + stride * 0) = _mm_extract_epi32(t0, 0);
7910
+        *(uint32_t*)(dst - 2 + stride * 1) = _mm_extract_epi32(t0, 1);
7911
+        *(uint32_t*)(dst - 2 + stride * 2) = _mm_extract_epi32(t0, 2);
7912
+        *(uint32_t*)(dst - 2 + stride * 3) = _mm_extract_epi32(t0, 3);
7913
+        *(uint32_t*)(dst - 2 + stride * 4) = _mm_extract_epi32(t1, 0);
7914
+        *(uint32_t*)(dst - 2 + stride * 5) = _mm_extract_epi32(t1, 1);
7915
+        *(uint32_t*)(dst - 2 + stride * 6) = _mm_extract_epi32(t1, 2);
7916
+        *(uint32_t*)(dst - 2 + stride * 7) = _mm_extract_epi32(t1, 3);
7917
+    }
7918
+}
7919
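loop_filter_h (and loop_filter_v below) evaluate the standard VP9 edge masks for all eight pixels of the edge at once: fm gates filtering entirely, hev selects the narrow high-variance path, and flat8in / flat8out (with F = 1) pick the 7-tap and 15-tap flat filters for wd >= 8 and wd >= 16. The vector code keeps the masks inverted, so a movemask of 0xffff means nothing needs filtering and allows the early return. The scalar form of the same tests, following the generic VP9 loop-filter definitions (helper and names are illustrative):

    #include "libavutil/common.h"

    /* p[0..3] = p0..p3 on one side of the edge, q[0..3] = q0..q3 on the other. */
    static void lf_masks_ref(const uint8_t *p, const uint8_t *q,
                             int E, int I, int H, int F,
                             int *fm, int *hev, int *flat8in)
    {
        *fm  = FFABS(p[3] - p[2]) <= I && FFABS(p[2] - p[1]) <= I &&
               FFABS(p[1] - p[0]) <= I && FFABS(q[1] - q[0]) <= I &&
               FFABS(q[2] - q[1]) <= I && FFABS(q[3] - q[2]) <= I &&
               FFABS(p[0] - q[0]) * 2 + (FFABS(p[1] - q[1]) >> 1) <= E;
        *hev = FFABS(p[1] - p[0]) > H || FFABS(q[1] - q[0]) > H;
        /* only meaningful where fm holds; the SIMD code folds that in by
         * OR-ing the inverted fm mask into vflat8in */
        *flat8in = FFABS(p[3] - p[0]) <= F && FFABS(p[2] - p[0]) <= F &&
                   FFABS(p[1] - p[0]) <= F && FFABS(q[1] - q[0]) <= F &&
                   FFABS(q[2] - q[0]) <= F && FFABS(q[3] - q[0]) <= F;
    }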
+
7920
+#undef LOAD_TRANSPOSE8
7921
+#undef STORE_TRANSPOSE8
7922
+
7923
+static av_always_inline void loop_filter_v(uint8_t *dst, int E, int I, int H,
7924
+                                           ptrdiff_t stride, int wd)
7925
+{
7926
+    int F = 1;
7927
+    LOAD_ZERO;
7928
+    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
7929
+    __m128i vfm, vflat8out, vflat8in, vhev;
7930
+    __m128i p2a, p1a, p0a, q0a, q1a, q2a;
7931
+
7932
+    __m128i vF = _mm_set1_epi16(F);
7933
+    __m128i vI = _mm_set1_epi16(I);
7934
+    __m128i vE = _mm_set1_epi16(E);
7935
+    __m128i vH = _mm_set1_epi16(H);
7936
+
7937
+    __m128i p7, p6, p5, p4;
7938
+    __m128i p3 = VEC_LD8(dst - stride * 4), p2 = VEC_LD8(dst - stride * 3);
7939
+    __m128i p1 = VEC_LD8(dst - stride * 2), p0 = VEC_LD8(dst - stride * 1);
7940
+    __m128i q0 = VEC_LD8(dst + stride * 0), q1 = VEC_LD8(dst + stride * 1);
7941
+    __m128i q2 = VEC_LD8(dst + stride * 2), q3 = VEC_LD8(dst + stride * 3);
7942
+    __m128i q4, q5, q6, q7;
7943
+
7944
+    p3 = _mm_unpacklo_epi8(p3, zerov);
7945
+    p2 = _mm_unpacklo_epi8(p2, zerov);
7946
+    p1 = _mm_unpacklo_epi8(p1, zerov);
7947
+    p0 = _mm_unpacklo_epi8(p0, zerov);
7948
+    q0 = _mm_unpacklo_epi8(q0, zerov);
7949
+    q1 = _mm_unpacklo_epi8(q1, zerov);
7950
+    q2 = _mm_unpacklo_epi8(q2, zerov);
7951
+    q3 = _mm_unpacklo_epi8(q3, zerov);
7952
+
7953
+    t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p2));
7954
+    t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p1));
7955
+    t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0));
7956
+    t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0));
7957
+    t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q1));
7958
+    t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q2));
7959
+    vhev = _mm_or_si128(_mm_cmpgt_epi16(t2, vH), _mm_cmpgt_epi16(t3, vH));
7960
+    t0 = _mm_cmpgt_epi16(t0, vI);
7961
+    t1 = _mm_cmpgt_epi16(t1, vI);
7962
+    t2 = _mm_cmpgt_epi16(t2, vI);
7963
+    t3 = _mm_cmpgt_epi16(t3, vI);
7964
+    t4 = _mm_cmpgt_epi16(t4, vI);
7965
+    t5 = _mm_cmpgt_epi16(t5, vI);
7966
+
7967
+    t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2);
7968
+    t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5);
7969
+    t0 = _mm_or_si128(t0, t3);
7970
+
7971
+    t6 = _mm_abs_epi16(_mm_sub_epi16(p0, q0));
7972
+    t7 = _mm_abs_epi16(_mm_sub_epi16(p1, q1));
7973
+    t6 = _mm_add_epi16(_mm_slli_epi16(t6, 1), _mm_srai_epi16(t7, 1));
7974
+    t6 = _mm_cmpgt_epi16(t6, vE);
7975
+    vfm = _mm_or_si128(t0, t6); // !fm
7976
+
7977
+    if (_mm_movemask_epi8(vfm) == 0xffff) return;
7978
+
7979
+    if (wd >= 8) {
7980
+        t0 = _mm_abs_epi16(_mm_sub_epi16(p3, p0));
7981
+        t1 = _mm_abs_epi16(_mm_sub_epi16(p2, p0));
7982
+        t2 = _mm_abs_epi16(_mm_sub_epi16(p1, p0));
7983
+        t3 = _mm_abs_epi16(_mm_sub_epi16(q1, q0));
7984
+        t4 = _mm_abs_epi16(_mm_sub_epi16(q2, q0));
7985
+        t5 = _mm_abs_epi16(_mm_sub_epi16(q3, q0));
7986
+        t0 = _mm_cmpgt_epi16(t0, vF);
7987
+        t1 = _mm_cmpgt_epi16(t1, vF);
7988
+        t2 = _mm_cmpgt_epi16(t2, vF);
7989
+        t3 = _mm_cmpgt_epi16(t3, vF);
7990
+        t4 = _mm_cmpgt_epi16(t4, vF);
7991
+        t5 = _mm_cmpgt_epi16(t5, vF);
7992
+
7993
+        t0 = _mm_or_si128(_mm_or_si128(t0, t1), t2);
7994
+        t3 = _mm_or_si128(_mm_or_si128(t3, t4), t5);
7995
+        vflat8in = _mm_or_si128(_mm_or_si128(t0, t3), vfm);
7996
+    }
7997
+
7998
+    {
7999
+        __m128i c1 = _mm_set1_epi16(1);
8000
+        __m128i c127 = _mm_set1_epi16(127);
8001
+        __m128i m128 = _mm_set1_epi16(-128);
8002
+        __m128i c43 = _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3);
8003
+        t0 = _mm_and_si128(vhev, _mm_sub_epi16(p1, q1));
8004
+        t1 = _mm_sub_epi16(q0, p0);
8005
+        t0 = _mm_min_epi16(_mm_max_epi16(t0, m128), c127);
8006
+        // f = av_clip_intp2(p1 - q1, 7) & hev;
8007
+        t1 = _mm_add_epi16(_mm_add_epi16(t1, t0), _mm_add_epi16(t1, t1));
8008
+        t1 = _mm_andnot_si128(vfm, t1);
8009
+        t1 = _mm_packs_epi16(t1, t1);
8010
+        // f = av_clip_intp2(3 * (q0 - p0) + f, 7);
8011
+        t3 = _mm_adds_epi8(t1, c43);
8012
+        t2 = _mm_srai_epi16(_mm_unpacklo_epi8(t3, t3), 8 + 3);
8013
+        // f1 = FFMIN(f + 4, 0x7f) >> 3;
8014
+        t3 = _mm_srai_epi16(_mm_unpackhi_epi8(t3, t3), 8 + 3);
8015
+        // f2 = FFMIN(f + 3, 0x7f) >> 3;
8016
+        t4 = _mm_srai_epi16(_mm_add_epi16(t2, c1), 1);
8017
+        t4 = _mm_andnot_si128(vhev, t4); // f3 = ((f1 + 1) >> 1) & ~hev;
8018
+        p1a = _mm_add_epi16(p1, t4); // av_clip_uint8(p1 + f3);
8019
+        p0a = _mm_add_epi16(p0, t3); // av_clip_uint8(p0 + f2);
8020
+        q0a = _mm_sub_epi16(q0, t2); // av_clip_uint8(q0 - f1);
8021
+        q1a = _mm_sub_epi16(q1, t4); // av_clip_uint8(q1 - f3);
8022
+    }
8023
+
8024
+    if (wd >= 8 && _mm_movemask_epi8(vflat8in) != 0xffff) {
8025
+        __m128i c4 = _mm_set1_epi16(4);
8026
+        t0 = _mm_add_epi16(_mm_slli_epi16(p3, 2), c4);
8027
+        t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2));
8028
+        t0 = _mm_sub_epi16(t0, p3);
8029
+        t0 = _mm_add_epi16(t0, t1);
8030
+        t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 3);
8031
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p3));
8032
+        t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 3);
8033
+        p2a = _mm_blendv_epi8(t1, p2, vflat8in);
8034
+        p1a = _mm_blendv_epi8(t2, p1a, vflat8in);
8035
+
8036
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p3));
8037
+        t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 3);
8038
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p3));
8039
+        t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 3);
8040
+        p0a = _mm_blendv_epi8(t1, p0a, vflat8in);
8041
+        q0a = _mm_blendv_epi8(t2, q0a, vflat8in);
8042
+
8043
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p2));
8044
+        t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 3);
8045
+        t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p1));
8046
+        t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 3);
8047
+        q1a = _mm_blendv_epi8(t1, q1a, vflat8in);
8048
+        q2a = _mm_blendv_epi8(t2, q2, vflat8in);
8049
+
8050
+        if (wd >= 16) {
8051
+            p7 = VEC_LD8(dst - stride * 8);
8052
+            p6 = VEC_LD8(dst - stride * 7);
8053
+            p5 = VEC_LD8(dst - stride * 6);
8054
+            p4 = VEC_LD8(dst - stride * 5);
8055
+            q4 = VEC_LD8(dst + stride * 4);
8056
+            q5 = VEC_LD8(dst + stride * 5);
8057
+            q6 = VEC_LD8(dst + stride * 6);
8058
+            q7 = VEC_LD8(dst + stride * 7);
8059
+
8060
+            p7 = _mm_unpacklo_epi8(p7, zerov);
8061
+            p6 = _mm_unpacklo_epi8(p6, zerov);
8062
+            p5 = _mm_unpacklo_epi8(p5, zerov);
8063
+            p4 = _mm_unpacklo_epi8(p4, zerov);
8064
+            q4 = _mm_unpacklo_epi8(q4, zerov);
8065
+            q5 = _mm_unpacklo_epi8(q5, zerov);
8066
+            q6 = _mm_unpacklo_epi8(q6, zerov);
8067
+            q7 = _mm_unpacklo_epi8(q7, zerov);
8068
+
8069
+            t0 = _mm_abs_epi16(_mm_sub_epi16(p7, p0));
8070
+            t1 = _mm_abs_epi16(_mm_sub_epi16(p6, p0));
8071
+            t2 = _mm_abs_epi16(_mm_sub_epi16(p5, p0));
8072
+            t3 = _mm_abs_epi16(_mm_sub_epi16(p4, p0));
8073
+            t4 = _mm_abs_epi16(_mm_sub_epi16(q4, q0));
8074
+            t5 = _mm_abs_epi16(_mm_sub_epi16(q5, q0));
8075
+            t6 = _mm_abs_epi16(_mm_sub_epi16(q6, q0));
8076
+            t7 = _mm_abs_epi16(_mm_sub_epi16(q7, q0));
8077
+
8078
+            t0 = _mm_cmpgt_epi16(t0, vF);
8079
+            t1 = _mm_cmpgt_epi16(t1, vF);
8080
+            t2 = _mm_cmpgt_epi16(t2, vF);
8081
+            t3 = _mm_cmpgt_epi16(t3, vF);
8082
+            t4 = _mm_cmpgt_epi16(t4, vF);
8083
+            t5 = _mm_cmpgt_epi16(t5, vF);
8084
+            t6 = _mm_cmpgt_epi16(t6, vF);
8085
+            t7 = _mm_cmpgt_epi16(t7, vF);
8086
+
8087
+            t0 = _mm_or_si128(_mm_or_si128(t0, t1), _mm_or_si128(t2, t3));
8088
+            t4 = _mm_or_si128(_mm_or_si128(t4, t5), _mm_or_si128(t6, t7));
8089
+            vflat8out = _mm_or_si128(t0, t4);
8090
+
8091
+            vflat8out = _mm_or_si128(vflat8out, vflat8in);
8092
+            if (_mm_movemask_epi8(vflat8out) != 0xffff) {
8093
+                __m128i c8 = _mm_set1_epi16(8);
8094
+                t0 = _mm_add_epi16(_mm_slli_epi16(p7, 3), c8);
8095
+                t1 = _mm_add_epi16(_mm_add_epi16(q0, p0), _mm_add_epi16(p1, p2));
8096
+                t2 = _mm_add_epi16(_mm_add_epi16(p3, p4), _mm_add_epi16(p5, p6));
8097
+                t0 = _mm_sub_epi16(t0, p7);
8098
+                t0 = _mm_add_epi16(t0, t1);
8099
+                t0 = _mm_add_epi16(t0, t2);
8100
+                t1 = _mm_srai_epi16(_mm_add_epi16(p6, t0), 4);
8101
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q1, p7));
8102
+                t2 = _mm_srai_epi16(_mm_add_epi16(p5, t0), 4);
8103
+                t1 = _mm_blendv_epi8(t1, p6, vflat8out);
8104
+                t2 = _mm_blendv_epi8(t2, p5, vflat8out);
8105
+                t1 = _mm_packus_epi16(t1, t2);
8106
+                VEC_STL(dst - stride * 7, t1);
8107
+                VEC_STH(dst - stride * 6, t1);
8108
+
8109
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q2, p7));
8110
+                t1 = _mm_srai_epi16(_mm_add_epi16(p4, t0), 4);
8111
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q3, p7));
8112
+                t2 = _mm_srai_epi16(_mm_add_epi16(p3, t0), 4);
8113
+                t1 = _mm_blendv_epi8(t1, p4, vflat8out);
8114
+                t2 = _mm_blendv_epi8(t2, p3, vflat8out);
8115
+                t1 = _mm_packus_epi16(t1, t2);
8116
+                VEC_STL(dst - stride * 5, t1);
8117
+                VEC_STH(dst - stride * 4, t1);
8118
+
8119
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q4, p7));
8120
+                t1 = _mm_srai_epi16(_mm_add_epi16(p2, t0), 4);
8121
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q5, p7));
8122
+                t2 = _mm_srai_epi16(_mm_add_epi16(p1, t0), 4);
8123
+                p2a = _mm_blendv_epi8(t1, p2a, vflat8out);
8124
+                p1a = _mm_blendv_epi8(t2, p1a, vflat8out);
8125
+
8126
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q6, p7));
8127
+                t1 = _mm_srai_epi16(_mm_add_epi16(p0, t0), 4);
8128
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p7));
8129
+                t2 = _mm_srai_epi16(_mm_add_epi16(q0, t0), 4);
8130
+                p0a = _mm_blendv_epi8(t1, p0a, vflat8out);
8131
+                q0a = _mm_blendv_epi8(t2, q0a, vflat8out);
8132
+
8133
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p6));
8134
+                t1 = _mm_srai_epi16(_mm_add_epi16(q1, t0), 4);
8135
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p5));
8136
+                t2 = _mm_srai_epi16(_mm_add_epi16(q2, t0), 4);
8137
+                q1a = _mm_blendv_epi8(t1, q1a, vflat8out);
8138
+                q2a = _mm_blendv_epi8(t2, q2a, vflat8out);
8139
+
8140
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p4));
8141
+                t1 = _mm_srai_epi16(_mm_add_epi16(q3, t0), 4);
8142
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p3));
8143
+                t2 = _mm_srai_epi16(_mm_add_epi16(q4, t0), 4);
8144
+                t1 = _mm_blendv_epi8(t1, q3, vflat8out);
8145
+                t2 = _mm_blendv_epi8(t2, q4, vflat8out);
8146
+                t1 = _mm_packus_epi16(t1, t2);
8147
+                VEC_STL(dst + stride * 3, t1);
8148
+                VEC_STH(dst + stride * 4, t1);
8149
+
8150
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p2));
8151
+                t1 = _mm_srai_epi16(_mm_add_epi16(q5, t0), 4);
8152
+                t0 = _mm_add_epi16(t0, _mm_sub_epi16(q7, p1));
8153
+                t2 = _mm_srai_epi16(_mm_add_epi16(q6, t0), 4);
8154
+                t1 = _mm_blendv_epi8(t1, q5, vflat8out);
8155
+                t2 = _mm_blendv_epi8(t2, q6, vflat8out);
8156
+                t1 = _mm_packus_epi16(t1, t2);
8157
+                VEC_STL(dst + stride * 5, t1);
8158
+                VEC_STH(dst + stride * 6, t1);
8159
+            }
8160
+        }
8161
+        t1 = _mm_packus_epi16(p2a, q2a);
8162
+        VEC_STL(dst - stride * 3, t1);
8163
+        VEC_STH(dst + stride * 2, t1);
8164
+    }
8165
+
8166
+    t0 = _mm_packus_epi16(p1a, p0a);
8167
+    t1 = _mm_packus_epi16(q0a, q1a);
8168
+    VEC_STL(dst - stride * 2, t0);
8169
+    VEC_STH(dst - stride * 1, t0);
8170
+    VEC_STL(dst + stride * 0, t1);
8171
+    VEC_STH(dst + stride * 1, t1);
8172
+}
8173
+
8174
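+/* Thin wrappers that bind the generic loop_filter_h/loop_filter_v helpers
+   to a fixed filter width (4, 8 or 16 pixels). */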
+#define lf_8_fns(wd) \
8175
+static void loop_filter_h_##wd##_8_e2k(uint8_t *dst, \
8176
+                                       ptrdiff_t stride, \
8177
+                                       int E, int I, int H) \
8178
+{ \
8179
+    loop_filter_h(dst, E, I, H, stride, wd); \
8180
+} \
8181
+static void loop_filter_v_##wd##_8_e2k(uint8_t *dst, \
8182
+                                       ptrdiff_t stride, \
8183
+                                       int E, int I, int H) \
8184
+{ \
8185
+    loop_filter_v(dst, E, I, H, stride, wd); \
8186
+}
8187
+
8188
+lf_8_fns(4)
8189
+lf_8_fns(8)
8190
+lf_8_fns(16)
8191
+
8192
+#undef lf_8_fn
8193
+#undef lf_8_fns
8194
+
8195
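+/* A 16x16 edge is filtered as two 8-pixel halves, the second half offset by
+   8 positions along the edge. */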
+#define lf_16_fn(dir, stridea) \
8196
+static void loop_filter_##dir##_16_16_e2k(uint8_t *dst, \
8197
+                                          ptrdiff_t stride, \
8198
+                                          int E, int I, int H) \
8199
+{ \
8200
+    loop_filter_##dir##_16_8_e2k(dst, stride, E, I, H); \
8201
+    loop_filter_##dir##_16_8_e2k(dst + 8 * stridea, stride, E, I, H); \
8202
+}
8203
+
8204
+lf_16_fn(h, stride)
8205
+lf_16_fn(v, sizeof(pixel))
8206
+
8207
+#undef lf_16_fn
8208
+
8209
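+/* Mixed-width filters handle two adjacent 8-pixel blocks in one call; the
+   E/I/H thresholds are packed with the low byte for the first block and the
+   high byte for the second. */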
+#define lf_mix_fn(dir, wd1, wd2, stridea) \
8210
+static void loop_filter_##dir##_##wd1##wd2##_16_e2k(uint8_t *dst, \
8211
+                                                    ptrdiff_t stride, \
8212
+                                                    int E, int I, int H) \
8213
+{ \
8214
+    loop_filter_##dir##_##wd1##_8_e2k(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
8215
+    loop_filter_##dir##_##wd2##_8_e2k(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
8216
+}
8217
+
8218
+#define lf_mix_fns(wd1, wd2) \
8219
+lf_mix_fn(h, wd1, wd2, stride) \
8220
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
8221
+
8222
+lf_mix_fns(4, 4)
8223
+lf_mix_fns(4, 8)
8224
+lf_mix_fns(8, 4)
8225
+lf_mix_fns(8, 8)
8226
+
8227
+#undef lf_mix_fn
8228
+#undef lf_mix_fns
8229
+
8230
+static av_cold void ff_vp9dsp_loopfilter_init_8_e2k(VP9DSPContext *dsp)
8231
+{
8232
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_e2k;
8233
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_e2k;
8234
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_e2k;
8235
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_e2k;
8236
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_e2k;
8237
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_e2k;
8238
+
8239
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_e2k;
8240
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_e2k;
8241
+
8242
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_e2k;
8243
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_e2k;
8244
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_e2k;
8245
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_e2k;
8246
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_e2k;
8247
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_e2k;
8248
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_e2k;
8249
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_e2k;
8250
+}
8251
+
8252
+#if BIT_DEPTH != 12
8253
+
8254
+static av_always_inline void copy_e2k(uint8_t *dst, ptrdiff_t dst_stride,
8255
+                                      const uint8_t *src, ptrdiff_t src_stride,
8256
+                                      int w, int h)
8257
+{
8258
+    do {
8259
+        memcpy(dst, src, w);
8260
+        dst += dst_stride;
8261
+        src += src_stride;
8262
+    } while (--h);
8263
+}
8264
+
8265
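+/* Rounding average of the block in src into dst; the loop is specialised on
+   the block width so the widest available vector load/store is used. */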
+static av_always_inline void avg_e2k(uint8_t *dst, ptrdiff_t dst_stride,
8266
+                                     const uint8_t *src, ptrdiff_t src_stride,
8267
+                                     int w, int h)
8268
+{
8269
+    int y;
8270
+    __m128i v0, v1, v2, v3; __m64 h0, h1;
8271
+    if (w >= 64) {
8272
+        PRAGMA_E2K("ivdep")
8273
+        for (y = 0; y < h; y++) {
8274
+            v0 = VEC_LD(src);
8275
+            v1 = VEC_LD(src + 16);
8276
+            v2 = VEC_LD(src + 32);
8277
+            v3 = VEC_LD(src + 48);
8278
+            v0 = _mm_avg_epu8(v0, VEC_LD(dst));
8279
+            v1 = _mm_avg_epu8(v1, VEC_LD(dst + 16));
8280
+            v2 = _mm_avg_epu8(v2, VEC_LD(dst + 32));
8281
+            v3 = _mm_avg_epu8(v3, VEC_LD(dst + 48));
8282
+            VEC_ST(dst, v0);
8283
+            VEC_ST(dst + 16, v1);
8284
+            VEC_ST(dst + 32, v2);
8285
+            VEC_ST(dst + 48, v3);
8286
+            src += src_stride;
8287
+            dst += dst_stride;
8288
+        }
8289
+    } else if (w >= 32) {
8290
+        PRAGMA_E2K("ivdep")
8291
+        for (y = 0; y < h; y++) {
8292
+            v0 = VEC_LD(src);
8293
+            v1 = VEC_LD(src + 16);
8294
+            v0 = _mm_avg_epu8(v0, VEC_LD(dst));
8295
+            v1 = _mm_avg_epu8(v1, VEC_LD(dst + 16));
8296
+            VEC_ST(dst, v0);
8297
+            VEC_ST(dst + 16, v1);
8298
+            src += src_stride;
8299
+            dst += dst_stride;
8300
+        }
8301
+    } else if (w >= 16) {
8302
+        PRAGMA_E2K("ivdep")
8303
+        for (y = 0; y < h; y++) {
8304
+            v0 = VEC_LD(src);
8305
+            v0 = _mm_avg_epu8(v0, VEC_LD(dst));
8306
+            VEC_ST(dst, v0);
8307
+            src += src_stride;
8308
+            dst += dst_stride;
8309
+        }
8310
+    } else if (w >= 8) {
8311
+        PRAGMA_E2K("ivdep")
8312
+        for (y = 0; y < h; y++) {
8313
+            h0 = *(__m64*)src;
8314
+            h1 = *(__m64*)dst;
8315
+            h0 = _mm_avg_pu8(h0, h1);
8316
+            *(__m64*)dst = h0;
8317
+            src += src_stride;
8318
+            dst += dst_stride;
8319
+        }
8320
+    } else {
8321
+        PRAGMA_E2K("ivdep")
8322
+        for (y = 0; y < h; y++) {
8323
+            h0 = _mm_cvtsi32_si64(*(uint32_t*)src);
8324
+            h1 = _mm_cvtsi32_si64(*(uint32_t*)dst);
8325
+            h0 = _mm_avg_pu8(h0, h1);
8326
+            *(uint32_t*)dst = _mm_cvtsi64_si32(h0);
8327
+            src += src_stride;
8328
+            dst += dst_stride;
8329
+        }
8330
+    }
8331
+}
8332
+
8333
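+/* Full-pel wrappers exposing copy/avg at fixed block sizes through the
+   standard motion-compensation prototype (mx/my are unused here). */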
+#define fpel_fn(type, sz) \
8334
+static void type##sz##_e2k(uint8_t *dst, ptrdiff_t dst_stride, \
8335
+                           const uint8_t *src, ptrdiff_t src_stride, \
8336
+                           int h, int mx, int my) \
8337
+{ \
8338
+    type##_e2k(dst, dst_stride, src, src_stride, sz, h); \
8339
+}
8340
+
8341
+#define copy_avg_fn(sz) \
8342
+fpel_fn(copy, sz) \
8343
+fpel_fn(avg,  sz)
8344
+
8345
+copy_avg_fn(64)
8346
+copy_avg_fn(32)
8347
+copy_avg_fn(16)
8348
+copy_avg_fn(8)
8349
+copy_avg_fn(4)
8350
+
8351
+#undef fpel_fn
8352
+#undef copy_avg_fn
8353
+
8354
+#endif /* BIT_DEPTH != 12 */
8355
+
8356
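+/* Vertical 8-tap filter: adjacent taps are packed into byte pairs so every
+   _mm_maddubs step applies two taps at once; the accumulated sum is rounded
+   with +64 and shifted right by 7 before packing back to bytes. */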
+static av_always_inline void do_8tap_1d_v_e2k(uint8_t *dst, ptrdiff_t dst_stride,
8357
+                                              const uint8_t *src, ptrdiff_t src_stride,
8358
+                                              int w, int h, const int16_t *filter, int avg)
8359
+{
8360
+    int x, y;
8361
+    const uint8_t *s; uint8_t *d;
8362
+
8363
+    if (w >= 8) {
8364
+        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
8365
+        __m128i v0, v1, v2, v3;
8366
+        __m128i f0, f1, f2, f3, c64; __m64 h0, h1;
8367
+        f0 = _mm_set1_epi16((filter[0] & 255) | filter[1] << 8);
8368
+        f1 = _mm_set1_epi16((filter[2] & 255) | filter[3] << 8);
8369
+        f2 = _mm_set1_epi16((filter[4] & 255) | filter[5] << 8);
8370
+        f3 = _mm_set1_epi16((filter[6] & 255) | filter[7] << 8);
8371
+        c64 = _mm_set1_epi16(64);
8372
+
8373
+        for (x = 0; x < w; x += 8) {
8374
+            a0 = VEC_LD8(&src[x - 3 * src_stride]);
8375
+            a1 = VEC_LD8(&src[x - 2 * src_stride]);
8376
+            a2 = VEC_LD8(&src[x - 1 * src_stride]);
8377
+            a3 = VEC_LD8(&src[x + 0 * src_stride]);
8378
+            a4 = VEC_LD8(&src[x + 1 * src_stride]);
8379
+            a5 = VEC_LD8(&src[x + 2 * src_stride]);
8380
+            a6 = VEC_LD8(&src[x + 3 * src_stride]);
8381
+            s = src + x + 4 * src_stride;
8382
+            d = dst + x;
8383
+
8384
+            PRAGMA_E2K("ivdep")
8385
+            for (y = 0; y < h; y++) {
8386
+                a7 = VEC_LD8(s);
8387
+                v0 = _mm_unpacklo_epi8(a0, a1);
8388
+                v1 = _mm_unpacklo_epi8(a2, a3);
8389
+                v2 = _mm_unpacklo_epi8(a4, a5);
8390
+                v3 = _mm_unpacklo_epi8(a6, a7);
8391
+                v0 = _mm_maddubs_epi16(v0, f0);
8392
+                v1 = _mm_maddubs_epi16(v1, f1);
8393
+                v2 = _mm_maddubs_epi16(v2, f2);
8394
+                v3 = _mm_maddubs_epi16(v3, f3);
8395
+                v0 = _mm_add_epi16(v0, v2);
8396
+                v1 = _mm_add_epi16(v1, v3);
8397
+                v0 = _mm_add_epi16(v0, c64);
8398
+                v0 = _mm_adds_epi16(v0, v1);
8399
+                v0 = _mm_srai_epi16(v0, 7);
8400
+                v0 = _mm_packus_epi16(v0, v0);
8401
+                h0 = _mm_movepi64_pi64(v0);
8402
+                if (avg) {
8403
+                    h1 = *(__m64*)d;
8404
+                    h0 = _mm_avg_pu8(h0, h1);
8405
+                }
8406
+                *(__m64*)d = h0;
8407
+                s += src_stride;
8408
+                d += dst_stride;
8409
+                a0 = a1; a1 = a2; a2 = a3; a3 = a4;
8410
+                a4 = a5; a5 = a6; a6 = a7;
8411
+            }
8412
+        }
8413
+    } else {
8414
+        __m64 a0, a1, a2, a3, a4, a5, a6, a7;
8415
+        __m64 v0, v1, v2, v3;
8416
+        __m64 f0, f1, f2, f3, c64;
8417
+        f0 = _mm_set1_pi16((filter[0] & 255) | filter[1] << 8);
8418
+        f1 = _mm_set1_pi16((filter[2] & 255) | filter[3] << 8);
8419
+        f2 = _mm_set1_pi16((filter[4] & 255) | filter[5] << 8);
8420
+        f3 = _mm_set1_pi16((filter[6] & 255) | filter[7] << 8);
8421
+        c64 = _mm_set1_pi16(64);
8422
+
8423
+        a0 = _mm_cvtsi32_si64(*(uint32_t*)(src - 3 * src_stride));
8424
+        a1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 2 * src_stride));
8425
+        a2 = _mm_cvtsi32_si64(*(uint32_t*)(src - 1 * src_stride));
8426
+        a3 = _mm_cvtsi32_si64(*(uint32_t*)(src + 0 * src_stride));
8427
+        a4 = _mm_cvtsi32_si64(*(uint32_t*)(src + 1 * src_stride));
8428
+        a5 = _mm_cvtsi32_si64(*(uint32_t*)(src + 2 * src_stride));
8429
+        a6 = _mm_cvtsi32_si64(*(uint32_t*)(src + 3 * src_stride));
8430
+        s = src + 4 * src_stride;
8431
+        d = dst;
8432
+
8433
+        PRAGMA_E2K("ivdep")
8434
+        for (y = 0; y < h; y++) {
8435
+            a7 = _mm_cvtsi32_si64(*(uint32_t*)s);
8436
+            v0 = _mm_unpacklo_pi8(a0, a1);
8437
+            v1 = _mm_unpacklo_pi8(a2, a3);
8438
+            v2 = _mm_unpacklo_pi8(a4, a5);
8439
+            v3 = _mm_unpacklo_pi8(a6, a7);
8440
+            v0 = _mm_maddubs_pi16(v0, f0);
8441
+            v1 = _mm_maddubs_pi16(v1, f1);
8442
+            v2 = _mm_maddubs_pi16(v2, f2);
8443
+            v3 = _mm_maddubs_pi16(v3, f3);
8444
+            v0 = _mm_add_pi16(v0, v2);
8445
+            v1 = _mm_add_pi16(v1, v3);
8446
+            v0 = _mm_add_pi16(v0, c64);
8447
+            v0 = _mm_adds_pi16(v0, v1);
8448
+            v0 = _mm_srai_pi16(v0, 7);
8449
+            v0 = _mm_packs_pu16(v0, v0);
8450
+            if (avg) {
8451
+                v1 = _mm_cvtsi32_si64(*(uint32_t*)d);
8452
+                v0 = _mm_avg_pu8(v0, v1);
8453
+            }
8454
+            *(uint32_t*)d = _mm_cvtsi64_si32(v0);
8455
+            s += src_stride;
8456
+            d += dst_stride;
8457
+            a0 = a1; a1 = a2; a2 = a3; a3 = a4;
8458
+            a4 = a5; a5 = a6; a6 = a7;
8459
+        }
8460
+    }
8461
+}
8462
+
8463
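+/* Horizontal 8-tap filter: overlapping loads around each group of output
+   pixels are shifted into the eight tap windows with alignr, then put
+   through the same maddubs/round/shift sequence as the vertical pass. */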
+static av_always_inline void do_8tap_1d_h_e2k(uint8_t *dst, ptrdiff_t dst_stride,
8464
+                                              const uint8_t *src, ptrdiff_t src_stride,
8465
+                                              int w, int h, const int16_t *filter, int avg)
8466
+{
8467
+    int x, y;
8468
+
8469
+    if (w >= 8) {
8470
+        __m64 a0, a1, a2, a3;
8471
+        __m128i v0, v1, v2, v3;
8472
+        __m128i f0, f1, f2, f3, c64; __m64 h0, h1;
8473
+        f0 = _mm_set1_epi16((filter[0] & 255) | filter[1] << 8);
8474
+        f1 = _mm_set1_epi16((filter[2] & 255) | filter[3] << 8);
8475
+        f2 = _mm_set1_epi16((filter[4] & 255) | filter[5] << 8);
8476
+        f3 = _mm_set1_epi16((filter[6] & 255) | filter[7] << 8);
8477
+        c64 = _mm_set1_epi16(64);
8478
+
8479
+        for (y = 0; y < h; y++) {
8480
+            PRAGMA_E2K("ivdep")
8481
+            for (x = 0; x < w; x += 8) {
8482
+                a0 = *(__m64*)(src + x - 3);
8483
+                a1 = *(__m64*)(src + x - 3 + 7);
8484
+                a0 = _mm_slli_si64(a0, 8);
8485
+                a2 = _mm_alignr_pi8(a1, a0, 1);
8486
+                a3 = _mm_alignr_pi8(a1, a0, 2);
8487
+                v0 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3));
8488
+                a2 = _mm_alignr_pi8(a1, a0, 3);
8489
+                a3 = _mm_alignr_pi8(a1, a0, 4);
8490
+                v1 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3));
8491
+                a2 = _mm_alignr_pi8(a1, a0, 5);
8492
+                a3 = _mm_alignr_pi8(a1, a0, 6);
8493
+                v2 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a3));
8494
+                a2 = _mm_alignr_pi8(a1, a0, 7);
8495
+                v3 = _mm_unpacklo_epi8(_mm_movpi64_epi64(a2), _mm_movpi64_epi64(a1));
8496
+
8497
+                v0 = _mm_maddubs_epi16(v0, f0);
8498
+                v1 = _mm_maddubs_epi16(v1, f1);
8499
+                v2 = _mm_maddubs_epi16(v2, f2);
8500
+                v3 = _mm_maddubs_epi16(v3, f3);
8501
+                v0 = _mm_add_epi16(v0, v2);
8502
+                v1 = _mm_add_epi16(v1, v3);
8503
+                v0 = _mm_add_epi16(v0, c64);
8504
+                v0 = _mm_adds_epi16(v0, v1);
8505
+                v0 = _mm_srai_epi16(v0, 7);
8506
+                v0 = _mm_packus_epi16(v0, v0);
8507
+                h0 = _mm_movepi64_pi64(v0);
8508
+                if (avg) {
8509
+                    h1 = *(__m64*)(dst + x);
8510
+                    h0 = _mm_avg_pu8(h0, h1);
8511
+                }
8512
+                *(__m64*)(dst + x) = h0;
8513
+            }
8514
+            src += src_stride;
8515
+            dst += dst_stride;
8516
+        }
8517
+    } else {
8518
+        __m64 a0, a1, a2, a3;
8519
+        __m64 v0, v1, v2, v3;
8520
+        __m64 f0, f1, f2, f3, c64;
8521
+        f0 = _mm_set1_pi16((filter[0] & 255) | filter[1] << 8);
8522
+        f1 = _mm_set1_pi16((filter[2] & 255) | filter[3] << 8);
8523
+        f2 = _mm_set1_pi16((filter[4] & 255) | filter[5] << 8);
8524
+        f3 = _mm_set1_pi16((filter[6] & 255) | filter[7] << 8);
8525
+        c64 = _mm_set1_pi16(64);
8526
+
8527
+        PRAGMA_E2K("ivdep")
8528
+        for (y = 0; y < h; y++) {
8529
+            a0 = *(__m64*)(src - 3);
8530
+            a1 = _mm_cvtsi32_si64(*(uint32_t*)(src - 3 + 7));
8531
+            a0 = _mm_slli_si64(a0, 8);
8532
+            a2 = _mm_alignr_pi8(a1, a0, 1);
8533
+            a3 = _mm_alignr_pi8(a1, a0, 2);
8534
+            v0 = _mm_unpacklo_pi8(a2, a3);
8535
+            a2 = _mm_alignr_pi8(a1, a0, 3);
8536
+            a3 = _mm_alignr_pi8(a1, a0, 4);
8537
+            v1 = _mm_unpacklo_pi8(a2, a3);
8538
+            a2 = _mm_alignr_pi8(a1, a0, 5);
8539
+            a3 = _mm_alignr_pi8(a1, a0, 6);
8540
+            v2 = _mm_unpacklo_pi8(a2, a3);
8541
+            a2 = _mm_alignr_pi8(a1, a0, 7);
8542
+            v3 = _mm_unpacklo_pi8(a2, a1);
8543
+
8544
+            v0 = _mm_maddubs_pi16(v0, f0);
8545
+            v1 = _mm_maddubs_pi16(v1, f1);
8546
+            v2 = _mm_maddubs_pi16(v2, f2);
8547
+            v3 = _mm_maddubs_pi16(v3, f3);
8548
+            v0 = _mm_add_pi16(v0, v2);
8549
+            v1 = _mm_add_pi16(v1, v3);
8550
+            v0 = _mm_add_pi16(v0, c64);
8551
+            v0 = _mm_adds_pi16(v0, v1);
8552
+            v0 = _mm_srai_pi16(v0, 7);
8553
+            v0 = _mm_packs_pu16(v0, v0);
8554
+            if (avg) {
8555
+                v1 = _mm_cvtsi32_si64(*(uint32_t*)dst);
8556
+                v0 = _mm_avg_pu8(v0, v1);
8557
+            }
8558
+            *(uint32_t*)dst = _mm_cvtsi64_si32(v0);
8559
+            src += src_stride;
8560
+            dst += dst_stride;
8561
+        }
8562
+    }
8563
+}
8564
+
8565
+#define filter_8tap_1d_fn(opn, opa) \
8566
+static av_noinline void opn##_8tap_1d_v_e2k(uint8_t *dst, ptrdiff_t dst_stride, \
8567
+                                            const uint8_t *src, ptrdiff_t src_stride, \
8568
+                                            int w, int h, const int16_t *filter) \
8569
+{ \
8570
+    do_8tap_1d_v_e2k(dst, dst_stride, src, src_stride, w, h, filter, opa); \
8571
+} \
8572
+static av_noinline void opn##_8tap_1d_h_e2k(uint8_t *dst, ptrdiff_t dst_stride, \
8573
+                                            const uint8_t *src, ptrdiff_t src_stride, \
8574
+                                            int w, int h, const int16_t *filter) \
8575
+{ \
8576
+    do_8tap_1d_h_e2k(dst, dst_stride, src, src_stride, w, h, filter, opa); \
8577
+}
8578
+
8579
+filter_8tap_1d_fn(put, 0)
8580
+filter_8tap_1d_fn(avg, 1)
8581
+
8582
+#undef filter_8tap_1d_fn
8583
+
8584
+#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
8585
+static void avg##_8tap_##type##_##sz##dir##_e2k(uint8_t *dst, ptrdiff_t dst_stride, \
8586
+                                                const uint8_t *src, ptrdiff_t src_stride, \
8587
+                                                int h, int mx, int my) \
8588
+{ \
8589
+    avg##_8tap_1d_##dir##_e2k(dst, dst_stride, src, src_stride, sz, h, \
8590
+                              ff_vp9_subpel_filters[type_idx][dir_m]); \
8591
+}
8592
+
8593
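+/* The 2-D (hv) path first filters horizontally into an on-stack buffer of
+   h + 7 rows, then filters that buffer vertically into dst. */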
+#define put_opa 0
8594
+#define avg_opa 1
8595
+#define filter_fn_2d(sz, type, type_idx, avg) \
8596
+static void avg##_8tap_##type##_##sz##hv_e2k(uint8_t *dst, ptrdiff_t dst_stride, \
8597
+                                             const uint8_t *src, ptrdiff_t src_stride, \
8598
+                                             int h, int mx, int my) \
8599
+{ \
8600
+    int w = sz; \
8601
+    pixel tmp[sz * (64 + 7)]; \
8602
+    src -= src_stride * 3; \
8603
+    do_8tap_1d_h_e2k(tmp, sz, src, src_stride, w, h + 7, ff_vp9_subpel_filters[type_idx][mx], 0); \
8604
+    do_8tap_1d_v_e2k(dst, dst_stride, tmp + sz * 3, sz, w, h, ff_vp9_subpel_filters[type_idx][my], avg##_opa); \
8605
+}
8606
+
8607
+#define bilinf_fn_1d(sz, dir, dir_m, avg)
8608
+#define bilinf_fn_2d(sz, avg)
8609
+
8610
+#define filter_fn(sz, avg) \
8611
+filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
8612
+filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
8613
+filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
8614
+filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
8615
+filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
8616
+filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
8617
+filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
8618
+filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
8619
+filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
8620
+bilinf_fn_1d(sz, h, mx,                               avg) \
8621
+bilinf_fn_1d(sz, v, my,                               avg) \
8622
+bilinf_fn_2d(sz,                                      avg)
8623
+
8624
+#define filter_fn_set(avg) \
8625
+filter_fn(64, avg) \
8626
+filter_fn(32, avg) \
8627
+filter_fn(16, avg) \
8628
+filter_fn(8,  avg) \
8629
+filter_fn(4,  avg)
8630
+
8631
+filter_fn_set(put)
8632
+filter_fn_set(avg)
8633
+
8634
+#undef filter_fn
8635
+#undef filter_fn_set
8636
+#undef filter_fn_1d
8637
+#undef filter_fn_2d
8638
+#undef bilinf_fn_1d
8639
+#undef bilinf_fn_2d
8640
+
8641
+static av_cold void ff_vp9dsp_mc_init_8_e2k(VP9DSPContext *dsp)
8642
+{
8643
+#define init_fpel(idx1, idx2, sz, type) \
8644
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_e2k; \
8645
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_e2k; \
8646
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_e2k
8647
+
8648
+#define init_copy_avg(idx, sz) \
8649
+    init_fpel(idx, 0, sz, copy); \
8650
+    init_fpel(idx, 1, sz, avg)
8651
+
8652
+    init_copy_avg(0, 64);
8653
+    init_copy_avg(1, 32);
8654
+    init_copy_avg(2, 16);
8655
+    init_copy_avg(3,  8);
8656
+    init_copy_avg(4,  4);
8657
+
8658
+#undef init_copy_avg
8659
+#undef init_fpel
8660
+
8661
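+/* Sub-pel table layout: mc[size][filter][put/avg][sub-x][sub-y]; only the
+   three 8-tap filter types are set here; bilinear entries are not overridden. */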
+#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
8662
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_e2k; \
8663
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_e2k; \
8664
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_e2k
8665
+
8666
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
8667
+    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type)
8668
+
8669
+#define init_subpel2(idx, idxh, idxv, dir, type) \
8670
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
8671
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
8672
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
8673
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
8674
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type)
8675
+
8676
+#define init_subpel3(idx, type) \
8677
+    init_subpel2(idx, 1, 1, hv, type); \
8678
+    init_subpel2(idx, 0, 1, v, type); \
8679
+    init_subpel2(idx, 1, 0, h, type)
8680
+
8681
+    init_subpel3(0, put);
8682
+    init_subpel3(1, avg);
8683
+
8684
+#undef init_subpel1
8685
+#undef init_subpel2
8686
+#undef init_subpel3
8687
+#undef init_subpel1_bd_aware
8688
+}
8689
+
8690
+av_cold void ff_vp9dsp_init_e2k(VP9DSPContext *dsp, int bpp, int bitexact)
8691
+{
8692
+    if (!E2K_BASE(av_get_cpu_flags()))
8693
+        return;
8694
+
8695
+    // checkasm doesn't cover all cases for the loop filter
8696
8697
+    if (bpp == 8) {
8698
+        ff_vp9dsp_itxfm_init_8_e2k(dsp);
8699
+        ff_vp9dsp_loopfilter_init_8_e2k(dsp);
8700
+        ff_vp9dsp_mc_init_8_e2k(dsp);
8701
+    }
8702
+}
8703
diff --git a/libavcodec/fdctdsp.c b/libavcodec/fdctdsp.c
8704
index b9c2c86..69e6302 100644
8705
--- a/libavcodec/fdctdsp.c
8706
+++ b/libavcodec/fdctdsp.c
8707
@@ -45,6 +45,8 @@ av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx)
8708
 
8709
     if (ARCH_PPC)
8710
         ff_fdctdsp_init_ppc(c, avctx, high_bit_depth);
8711
+    if (ARCH_E2K)
8712
+        ff_fdctdsp_init_e2k(c, avctx, high_bit_depth);
8713
     if (ARCH_X86)
8714
         ff_fdctdsp_init_x86(c, avctx, high_bit_depth);
8715
 }
8716
diff --git a/libavcodec/fdctdsp.h b/libavcodec/fdctdsp.h
8717
index 3e1f683..75407d4 100644
8718
--- a/libavcodec/fdctdsp.h
8719
+++ b/libavcodec/fdctdsp.h
8720
@@ -31,6 +31,8 @@ typedef struct FDCTDSPContext {
8721
 void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx);
8722
 void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
8723
                          unsigned high_bit_depth);
8724
+void ff_fdctdsp_init_e2k(FDCTDSPContext *c, AVCodecContext *avctx,
8725
+                         unsigned high_bit_depth);
8726
 void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
8727
                          unsigned high_bit_depth);
8728
 
8729
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
8730
index c858570..6a30f7c 100644
8731
--- a/libavcodec/fft.h
8732
+++ b/libavcodec/fft.h
8733
@@ -161,6 +161,7 @@ void ff_fft_init_x86(FFTContext *s);
8734
 void ff_fft_init_arm(FFTContext *s);
8735
 void ff_fft_init_mips(FFTContext *s);
8736
 void ff_fft_init_ppc(FFTContext *s);
8737
+void ff_fft_init_e2k(FFTContext *s);
8738
 
8739
 void ff_fft_fixed_init_arm(FFTContext *s);
8740
 
8741
diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c
8742
index 20a62e4..d565a71 100644
8743
--- a/libavcodec/fft_template.c
8744
+++ b/libavcodec/fft_template.c
8745
@@ -245,6 +245,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
8746
     if (ARCH_AARCH64) ff_fft_init_aarch64(s);
8747
     if (ARCH_ARM)     ff_fft_init_arm(s);
8748
     if (ARCH_PPC)     ff_fft_init_ppc(s);
8749
+    if (ARCH_E2K)     ff_fft_init_e2k(s);
8750
     if (ARCH_X86)     ff_fft_init_x86(s);
8751
     if (CONFIG_MDCT)  s->mdct_calcw = s->mdct_calc;
8752
     if (HAVE_MIPSFPU) ff_fft_init_mips(s);
8753
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
8754
index 3b33af6..141ffc0 100644
8755
--- a/libavcodec/fmtconvert.c
8756
+++ b/libavcodec/fmtconvert.c
8757
@@ -61,6 +61,8 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
8758
         ff_fmt_convert_init_arm(c, avctx);
8759
     if (ARCH_PPC)
8760
         ff_fmt_convert_init_ppc(c, avctx);
8761
+    if (ARCH_E2K)
8762
+        ff_fmt_convert_init_e2k(c, avctx);
8763
     if (ARCH_X86)
8764
         ff_fmt_convert_init_x86(c, avctx);
8765
     if (HAVE_MIPSFPU)
8766
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
8767
index a1b17e4..1053e0f 100644
8768
--- a/libavcodec/fmtconvert.h
8769
+++ b/libavcodec/fmtconvert.h
8770
@@ -71,6 +71,7 @@ void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
8771
 void ff_fmt_convert_init_aarch64(FmtConvertContext *c, AVCodecContext *avctx);
8772
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
8773
 void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
8774
+void ff_fmt_convert_init_e2k(FmtConvertContext *c, AVCodecContext *avctx);
8775
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
8776
 void ff_fmt_convert_init_mips(FmtConvertContext *c);
8777
 
8778
diff --git a/libavcodec/h264chroma.c b/libavcodec/h264chroma.c
8779
index c2f1f30..ea6196f 100644
8780
--- a/libavcodec/h264chroma.c
8781
+++ b/libavcodec/h264chroma.c
8782
@@ -52,6 +52,8 @@ av_cold void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
8783
         ff_h264chroma_init_arm(c, bit_depth);
8784
     if (ARCH_PPC)
8785
         ff_h264chroma_init_ppc(c, bit_depth);
8786
+    if (ARCH_E2K)
8787
+        ff_h264chroma_init_e2k(c, bit_depth);
8788
     if (ARCH_X86)
8789
         ff_h264chroma_init_x86(c, bit_depth);
8790
     if (ARCH_MIPS)
8791
diff --git a/libavcodec/h264chroma.h b/libavcodec/h264chroma.h
8792
index 5c89fd1..0ec192c 100644
8793
--- a/libavcodec/h264chroma.h
8794
+++ b/libavcodec/h264chroma.h
8795
@@ -34,6 +34,7 @@ void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
8796
 void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth);
8797
 void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
8798
 void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
8799
+void ff_h264chroma_init_e2k(H264ChromaContext *c, int bit_depth);
8800
 void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
8801
 void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth);
8802
 
8803
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
8804
index d26f552..64c19a9 100644
8805
--- a/libavcodec/h264dsp.c
8806
+++ b/libavcodec/h264dsp.c
8807
@@ -156,6 +156,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
8808
     if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc);
8809
     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
8810
     if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
8811
+    if (ARCH_E2K) ff_h264dsp_init_e2k(c, bit_depth, chroma_format_idc);
8812
     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
8813
     if (ARCH_MIPS) ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc);
8814
 }
8815
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
8816
index cbea317..ff781e1 100644
8817
--- a/libavcodec/h264dsp.h
8818
+++ b/libavcodec/h264dsp.h
8819
@@ -125,6 +125,8 @@ void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
8820
                          const int chroma_format_idc);
8821
 void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
8822
                          const int chroma_format_idc);
8823
+void ff_h264dsp_init_e2k(H264DSPContext *c, const int bit_depth,
8824
+                         const int chroma_format_idc);
8825
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
8826
                          const int chroma_format_idc);
8827
 void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,
8828
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
8829
index 50e82e2..5955069 100644
8830
--- a/libavcodec/h264qpel.c
8831
+++ b/libavcodec/h264qpel.c
8832
@@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
8833
         ff_h264qpel_init_arm(c, bit_depth);
8834
     if (ARCH_PPC)
8835
         ff_h264qpel_init_ppc(c, bit_depth);
8836
+    if (ARCH_E2K)
8837
+        ff_h264qpel_init_e2k(c, bit_depth);
8838
     if (ARCH_X86)
8839
         ff_h264qpel_init_x86(c, bit_depth);
8840
     if (ARCH_MIPS)
8841
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
8842
index 7c57ad0..7880b86 100644
8843
--- a/libavcodec/h264qpel.h
8844
+++ b/libavcodec/h264qpel.h
8845
@@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth);
8846
 void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
8847
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
8848
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
8849
+void ff_h264qpel_init_e2k(H264QpelContext *c, int bit_depth);
8850
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);
8851
 void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth);
8852
 
8853
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
8854
index 957e40d..a7d0d5d 100644
8855
--- a/libavcodec/hevcdsp.c
8856
+++ b/libavcodec/hevcdsp.c
8857
@@ -261,6 +261,8 @@ int i = 0;
8858
         ff_hevc_dsp_init_arm(hevcdsp, bit_depth);
8859
     if (ARCH_PPC)
8860
         ff_hevc_dsp_init_ppc(hevcdsp, bit_depth);
8861
+    if (ARCH_E2K)
8862
+        ff_hevc_dsp_init_e2k(hevcdsp, bit_depth);
8863
     if (ARCH_X86)
8864
         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
8865
     if (ARCH_MIPS)
8866
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
8867
index 0ae67cb..3221a2a 100644
8868
--- a/libavcodec/hevcdsp.h
8869
+++ b/libavcodec/hevcdsp.h
8870
@@ -129,6 +129,7 @@ extern const int8_t ff_hevc_qpel_filters[3][16];
8871
 
8872
 void ff_hevc_dsp_init_arm(HEVCDSPContext *c, const int bit_depth);
8873
 void ff_hevc_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
8874
+void ff_hevc_dsp_init_e2k(HEVCDSPContext *c, const int bit_depth);
8875
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
8876
 void ff_hevc_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
8877
 
8878
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
8879
index 8e2fd8f..dd4ef87 100644
8880
--- a/libavcodec/hpeldsp.c
8881
+++ b/libavcodec/hpeldsp.c
8882
@@ -363,6 +363,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
8883
         ff_hpeldsp_init_arm(c, flags);
8884
     if (ARCH_PPC)
8885
         ff_hpeldsp_init_ppc(c, flags);
8886
+    if (ARCH_E2K)
8887
+        ff_hpeldsp_init_e2k(c, flags);
8888
     if (ARCH_X86)
8889
         ff_hpeldsp_init_x86(c, flags);
8890
     if (ARCH_MIPS)
8891
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
8892
index 768139b..6d0c293 100644
8893
--- a/libavcodec/hpeldsp.h
8894
+++ b/libavcodec/hpeldsp.h
8895
@@ -100,6 +100,7 @@ void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
8896
 void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
8897
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
8898
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
8899
+void ff_hpeldsp_init_e2k(HpelDSPContext *c, int flags);
8900
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
8901
 void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags);
8902
 
8903
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
8904
index 846ed0b..a2554aa 100644
8905
--- a/libavcodec/idctdsp.c
8906
+++ b/libavcodec/idctdsp.c
8907
@@ -311,6 +311,8 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
8908
         ff_idctdsp_init_arm(c, avctx, high_bit_depth);
8909
     if (ARCH_PPC)
8910
         ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
8911
+    if (ARCH_E2K)
8912
+        ff_idctdsp_init_e2k(c, avctx, high_bit_depth);
8913
     if (ARCH_X86)
8914
         ff_idctdsp_init_x86(c, avctx, high_bit_depth);
8915
     if (ARCH_MIPS)
8916
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
8917
index ca21a31..1204bff 100644
8918
--- a/libavcodec/idctdsp.h
8919
+++ b/libavcodec/idctdsp.h
8920
@@ -114,6 +114,8 @@ void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
8921
                          unsigned high_bit_depth);
8922
 void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
8923
                          unsigned high_bit_depth);
8924
+void ff_idctdsp_init_e2k(IDCTDSPContext *c, AVCodecContext *avctx,
8925
+                         unsigned high_bit_depth);
8926
 void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
8927
                          unsigned high_bit_depth);
8928
 void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
8929
diff --git a/libavcodec/lossless_audiodsp.c b/libavcodec/lossless_audiodsp.c
8930
index 3781659..979e0c9 100644
8931
--- a/libavcodec/lossless_audiodsp.c
8932
+++ b/libavcodec/lossless_audiodsp.c
8933
@@ -62,6 +62,8 @@ av_cold void ff_llauddsp_init(LLAudDSPContext *c)
8934
         ff_llauddsp_init_arm(c);
8935
     if (ARCH_PPC)
8936
         ff_llauddsp_init_ppc(c);
8937
+    if (ARCH_E2K)
8938
+        ff_llauddsp_init_e2k(c);
8939
     if (ARCH_X86)
8940
         ff_llauddsp_init_x86(c);
8941
 }
8942
diff --git a/libavcodec/lossless_audiodsp.h b/libavcodec/lossless_audiodsp.h
8943
index eea5d49..3de02d5 100644
8944
--- a/libavcodec/lossless_audiodsp.h
8945
+++ b/libavcodec/lossless_audiodsp.h
8946
@@ -46,6 +46,7 @@ typedef struct LLAudDSPContext {
8947
 void ff_llauddsp_init(LLAudDSPContext *c);
8948
 void ff_llauddsp_init_arm(LLAudDSPContext *c);
8949
 void ff_llauddsp_init_ppc(LLAudDSPContext *c);
8950
+void ff_llauddsp_init_e2k(LLAudDSPContext *c);
8951
 void ff_llauddsp_init_x86(LLAudDSPContext *c);
8952
 
8953
 #endif /* AVCODEC_LOSSLESS_AUDIODSP_H */
8954
diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
8955
index cff94c2..34b1db5 100644
8956
--- a/libavcodec/lossless_videodsp.c
8957
+++ b/libavcodec/lossless_videodsp.c
8958
@@ -120,6 +120,8 @@ void ff_llviddsp_init(LLVidDSPContext *c)
8959
 
8960
     if (ARCH_PPC)
8961
         ff_llviddsp_init_ppc(c);
8962
+    if (ARCH_E2K)
8963
+        ff_llviddsp_init_e2k(c);
8964
     if (ARCH_X86)
8965
         ff_llviddsp_init_x86(c);
8966
 }
8967
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
8968
index 8077898..917afb6 100644
8969
--- a/libavcodec/lossless_videodsp.h
8970
+++ b/libavcodec/lossless_videodsp.h
8971
@@ -45,5 +45,6 @@ typedef struct LLVidDSPContext {
8972
 void ff_llviddsp_init(LLVidDSPContext *llviddsp);
8973
 void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp);
8974
 void ff_llviddsp_init_ppc(LLVidDSPContext *llviddsp);
8975
+void ff_llviddsp_init_e2k(LLVidDSPContext *llviddsp);
8976
 
8977
 #endif //AVCODEC_LOSSLESS_VIDEODSP_H
8978
diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
8979
index 6f35059..bced874 100644
8980
--- a/libavcodec/mdct15.c
8981
+++ b/libavcodec/mdct15.c
8982
@@ -318,6 +318,8 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
8983
 
8984
     if (ARCH_X86)
8985
         ff_mdct15_init_x86(s);
8986
+    if (ARCH_E2K)
8987
+        ff_mdct15_init_e2k(s);
8988
 
8989
     *ps = s;
8990
 
8991
diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
8992
index 42e60f3..c9d515b 100644
8993
--- a/libavcodec/mdct15.h
8994
+++ b/libavcodec/mdct15.h
8995
@@ -58,5 +58,6 @@ int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
8996
 void ff_mdct15_uninit(MDCT15Context **ps);
8997
 
8998
 void ff_mdct15_init_x86(MDCT15Context *s);
8999
+void ff_mdct15_init_e2k(MDCT15Context *s);
9000
 
9001
 #endif /* AVCODEC_MDCT15_H */
9002
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
9003
index ae248c5..cb967a4 100644
9004
--- a/libavcodec/me_cmp.c
9005
+++ b/libavcodec/me_cmp.c
9006
@@ -1088,6 +1088,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
9007
         ff_me_cmp_init_arm(c, avctx);
9008
     if (ARCH_PPC)
9009
         ff_me_cmp_init_ppc(c, avctx);
9010
+    if (ARCH_E2K)
9011
+        ff_me_cmp_init_e2k(c, avctx);
9012
     if (ARCH_X86)
9013
         ff_me_cmp_init_x86(c, avctx);
9014
     if (ARCH_MIPS)
9015
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
9016
index 0a589e3..8989a1c 100644
9017
--- a/libavcodec/me_cmp.h
9018
+++ b/libavcodec/me_cmp.h
9019
@@ -85,6 +85,7 @@ void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
9020
 void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
9021
 void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
9022
 void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
9023
+void ff_me_cmp_init_e2k(MECmpContext *c, AVCodecContext *avctx);
9024
 void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
9025
 void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
9026
 
9027
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
9028
index 3cafca2..0514bd4 100644
9029
--- a/libavcodec/mpegaudiodsp.c
9030
+++ b/libavcodec/mpegaudiodsp.c
9031
@@ -48,6 +48,7 @@ av_cold void ff_mpadsp_init(MPADSPContext *s)
9032
     if (ARCH_AARCH64) ff_mpadsp_init_aarch64(s);
9033
     if (ARCH_ARM)     ff_mpadsp_init_arm(s);
9034
     if (ARCH_PPC)     ff_mpadsp_init_ppc(s);
9035
+    if (ARCH_E2K)     ff_mpadsp_init_e2k(s);
9036
     if (ARCH_X86)     ff_mpadsp_init_x86(s);
9037
     if (HAVE_MIPSFPU)   ff_mpadsp_init_mipsfpu(s);
9038
     if (HAVE_MIPSDSP) ff_mpadsp_init_mipsdsp(s);
9039
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
9040
index 0e4352d..4514564 100644
9041
--- a/libavcodec/mpegaudiodsp.h
9042
+++ b/libavcodec/mpegaudiodsp.h
9043
@@ -62,6 +62,7 @@ void ff_mpa_synth_filter_float(MPADSPContext *s,
9044
 void ff_mpadsp_init_aarch64(MPADSPContext *s);
9045
 void ff_mpadsp_init_arm(MPADSPContext *s);
9046
 void ff_mpadsp_init_ppc(MPADSPContext *s);
9047
+void ff_mpadsp_init_e2k(MPADSPContext *s);
9048
 void ff_mpadsp_init_x86(MPADSPContext *s);
9049
 void ff_mpadsp_init_mipsfpu(MPADSPContext *s);
9050
 void ff_mpadsp_init_mipsdsp(MPADSPContext *s);
9051
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
9052
index 49fd1c9..49c4476 100644
9053
--- a/libavcodec/mpegvideo.c
9054
+++ b/libavcodec/mpegvideo.c
9055
@@ -318,6 +318,8 @@ static av_cold int dct_init(MpegEncContext *s)
9056
         ff_mpv_common_init_arm(s);
9057
     if (ARCH_PPC)
9058
         ff_mpv_common_init_ppc(s);
9059
+    if (ARCH_E2K)
9060
+        ff_mpv_common_init_e2k(s);
9061
     if (ARCH_X86)
9062
         ff_mpv_common_init_x86(s);
9063
     if (ARCH_MIPS)
9064
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
9065
index 29e692f..85fddd2 100644
9066
--- a/libavcodec/mpegvideo.h
9067
+++ b/libavcodec/mpegvideo.h
9068
@@ -685,6 +685,7 @@ void ff_mpv_common_init_arm(MpegEncContext *s);
9069
 void ff_mpv_common_init_axp(MpegEncContext *s);
9070
 void ff_mpv_common_init_neon(MpegEncContext *s);
9071
 void ff_mpv_common_init_ppc(MpegEncContext *s);
9072
+void ff_mpv_common_init_e2k(MpegEncContext *s);
9073
 void ff_mpv_common_init_x86(MpegEncContext *s);
9074
 void ff_mpv_common_init_mips(MpegEncContext *s);
9075
 
9076
diff --git a/libavcodec/mpegvideodsp.c b/libavcodec/mpegvideodsp.c
9077
index a58e45a..2d7aa96 100644
9078
--- a/libavcodec/mpegvideodsp.c
9079
+++ b/libavcodec/mpegvideodsp.c
9080
@@ -114,6 +114,8 @@ av_cold void ff_mpegvideodsp_init(MpegVideoDSPContext *c)
9081
 
9082
     if (ARCH_PPC)
9083
         ff_mpegvideodsp_init_ppc(c);
9084
+    if (ARCH_E2K)
9085
+        ff_mpegvideodsp_init_e2k(c);
9086
     if (ARCH_X86)
9087
         ff_mpegvideodsp_init_x86(c);
9088
 }
9089
diff --git a/libavcodec/mpegvideodsp.h b/libavcodec/mpegvideodsp.h
9090
index 293e254..56a0bf5 100644
9091
--- a/libavcodec/mpegvideodsp.h
9092
+++ b/libavcodec/mpegvideodsp.h
9093
@@ -42,6 +42,7 @@ typedef struct MpegVideoDSPContext {
9094
 
9095
 void ff_mpegvideodsp_init(MpegVideoDSPContext *c);
9096
 void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c);
9097
+void ff_mpegvideodsp_init_e2k(MpegVideoDSPContext *c);
9098
 void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c);
9099
 
9100
 #endif /* AVCODEC_MPEGVIDEODSP_H */
9101
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
9102
index a34ab35..f6ba6b4 100644
9103
--- a/libavcodec/mpegvideoencdsp.c
9104
+++ b/libavcodec/mpegvideoencdsp.c
9105
@@ -249,6 +249,8 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
9106
         ff_mpegvideoencdsp_init_arm(c, avctx);
9107
     if (ARCH_PPC)
9108
         ff_mpegvideoencdsp_init_ppc(c, avctx);
9109
+    if (ARCH_E2K)
9110
+        ff_mpegvideoencdsp_init_e2k(c, avctx);
9111
     if (ARCH_X86)
9112
         ff_mpegvideoencdsp_init_x86(c, avctx);
9113
     if (ARCH_MIPS)
9114
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
9115
index 33f0282..2f2b191 100644
9116
--- a/libavcodec/mpegvideoencdsp.h
9117
+++ b/libavcodec/mpegvideoencdsp.h
9118
@@ -50,6 +50,8 @@ void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
9119
                                  AVCodecContext *avctx);
9120
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
9121
                                  AVCodecContext *avctx);
9122
+void ff_mpegvideoencdsp_init_e2k(MpegvideoEncDSPContext *c,
9123
+                                 AVCodecContext *avctx);
9124
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
9125
                                  AVCodecContext *avctx);
9126
 void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
9127
diff --git a/libavcodec/pixblockdsp.c b/libavcodec/pixblockdsp.c
9128
index 67393b9..8383663 100644
9129
--- a/libavcodec/pixblockdsp.c
9130
+++ b/libavcodec/pixblockdsp.c
9131
@@ -109,6 +109,8 @@ av_cold void ff_pixblockdsp_init(PixblockDSPContext *c, AVCodecContext *avctx)
9132
         ff_pixblockdsp_init_arm(c, avctx, high_bit_depth);
9133
     if (ARCH_PPC)
9134
         ff_pixblockdsp_init_ppc(c, avctx, high_bit_depth);
9135
+    if (ARCH_E2K)
9136
+        ff_pixblockdsp_init_e2k(c, avctx, high_bit_depth);
9137
     if (ARCH_X86)
9138
         ff_pixblockdsp_init_x86(c, avctx, high_bit_depth);
9139
     if (ARCH_MIPS)
9140
diff --git a/libavcodec/pixblockdsp.h b/libavcodec/pixblockdsp.h
9141
index 07c2ec4..49e2b25 100644
9142
--- a/libavcodec/pixblockdsp.h
9143
+++ b/libavcodec/pixblockdsp.h
9144
@@ -52,6 +52,8 @@ void ff_pixblockdsp_init_arm(PixblockDSPContext *c, AVCodecContext *avctx,
9145
                              unsigned high_bit_depth);
9146
 void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, AVCodecContext *avctx,
9147
                              unsigned high_bit_depth);
9148
+void ff_pixblockdsp_init_e2k(PixblockDSPContext *c, AVCodecContext *avctx,
9149
+                             unsigned high_bit_depth);
9150
 void ff_pixblockdsp_init_x86(PixblockDSPContext *c, AVCodecContext *avctx,
9151
                              unsigned high_bit_depth);
9152
 void ff_pixblockdsp_init_mips(PixblockDSPContext *c, AVCodecContext *avctx,
9153
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
9154
index cb215c2..d55fcd3 100644
9155
--- a/libavcodec/svq1enc.c
9156
+++ b/libavcodec/svq1enc.c
9157
@@ -570,6 +570,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
9158
 
9159
     if (ARCH_PPC)
9160
         ff_svq1enc_init_ppc(s);
9161
+    if (ARCH_E2K)
9162
+        ff_svq1enc_init_e2k(s);
9163
     if (ARCH_X86)
9164
         ff_svq1enc_init_x86(s);
9165
 
9166
diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h
9167
index b4ef763..c070d80 100644
9168
--- a/libavcodec/svq1enc.h
9169
+++ b/libavcodec/svq1enc.h
9170
@@ -80,6 +80,7 @@ typedef struct SVQ1EncContext {
9171
 } SVQ1EncContext;
9172
 
9173
 void ff_svq1enc_init_ppc(SVQ1EncContext *c);
9174
+void ff_svq1enc_init_e2k(SVQ1EncContext *c);
9175
 void ff_svq1enc_init_x86(SVQ1EncContext *c);
9176
 
9177
 #endif /* AVCODEC_SVQ1ENC_H */
9178
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
9179
index 2ca8039..755734c 100644
9180
--- a/libavcodec/tests/dct.c
9181
+++ b/libavcodec/tests/dct.c
9182
@@ -100,6 +100,8 @@ static const struct algo idct_tab[] = {
9183
 #include "arm/dct.c"
9184
 #elif ARCH_PPC
9185
 #include "ppc/dct.c"
9186
+#elif ARCH_E2K
9187
+#include "e2k/dct.c"
9188
 #elif ARCH_X86
9189
 #include "x86/dct.c"
9190
 #else
9191
diff --git a/libavcodec/tests/e2k/dct.c b/libavcodec/tests/e2k/dct.c
9192
new file mode 100644
9193
index 0000000..7c15b25
9194
--- /dev/null
9195
+++ b/libavcodec/tests/e2k/dct.c
9196
@@ -0,0 +1,31 @@
9197
+/*
9198
+ * This file is part of FFmpeg.
9199
+ *
9200
+ * FFmpeg is free software; you can redistribute it and/or
9201
+ * modify it under the terms of the GNU Lesser General Public
9202
+ * License as published by the Free Software Foundation; either
9203
+ * version 2.1 of the License, or (at your option) any later version.
9204
+ *
9205
+ * FFmpeg is distributed in the hope that it will be useful,
9206
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9207
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9208
+ * Lesser General Public License for more details.
9209
+ *
9210
+ * You should have received a copy of the GNU Lesser General Public
9211
+ * License along with FFmpeg; if not, write to the Free Software
9212
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9213
+ */
9214
+
9215
+#include "config.h"
9216
+
9217
+#include "libavcodec/e2k/dctdsp.h"
9218
+
9219
+static const struct algo fdct_tab_arch[] = {
9220
+    { "FDCT-E2K", ff_fdct_e2k, FF_IDCT_PERM_NONE, AV_CPU_FLAG_E2K },
9221
+    { 0 }
9222
+};
9223
+
9224
+static const struct algo idct_tab_arch[] = {
9225
+    { "IDCT-E2K", ff_idct_e2k, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_E2K },
9226
+    { 0 }
9227
+};
9228
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
9229
index c25a6f3..2953403 100644
9230
--- a/libavcodec/vc1dsp.c
9231
+++ b/libavcodec/vc1dsp.c
9232
@@ -1035,6 +1035,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
9233
         ff_vc1dsp_init_arm(dsp);
9234
     if (ARCH_PPC)
9235
         ff_vc1dsp_init_ppc(dsp);
9236
+    if (ARCH_E2K)
9237
+        ff_vc1dsp_init_e2k(dsp);
9238
     if (ARCH_X86)
9239
         ff_vc1dsp_init_x86(dsp);
9240
     if (ARCH_MIPS)
9241
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
9242
index 75db62b..eabf2c8 100644
9243
--- a/libavcodec/vc1dsp.h
9244
+++ b/libavcodec/vc1dsp.h
9245
@@ -86,6 +86,7 @@ void ff_vc1dsp_init(VC1DSPContext* c);
9246
 void ff_vc1dsp_init_aarch64(VC1DSPContext* dsp);
9247
 void ff_vc1dsp_init_arm(VC1DSPContext* dsp);
9248
 void ff_vc1dsp_init_ppc(VC1DSPContext *c);
9249
+void ff_vc1dsp_init_e2k(VC1DSPContext *c);
9250
 void ff_vc1dsp_init_x86(VC1DSPContext* dsp);
9251
 void ff_vc1dsp_init_mips(VC1DSPContext* dsp);
9252
 
9253
diff --git a/libavcodec/videodsp.c b/libavcodec/videodsp.c
9254
index ce9e9eb..087614a 100644
9255
--- a/libavcodec/videodsp.c
9256
+++ b/libavcodec/videodsp.c
9257
@@ -50,6 +50,8 @@ av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc)
9258
         ff_videodsp_init_arm(ctx, bpc);
9259
     if (ARCH_PPC)
9260
         ff_videodsp_init_ppc(ctx, bpc);
9261
+    if (ARCH_E2K)
9262
+        ff_videodsp_init_e2k(ctx, bpc);
9263
     if (ARCH_X86)
9264
         ff_videodsp_init_x86(ctx, bpc);
9265
     if (ARCH_MIPS)
9266
diff --git a/libavcodec/videodsp.h b/libavcodec/videodsp.h
9267
index c0545f2..566296f 100644
9268
--- a/libavcodec/videodsp.h
9269
+++ b/libavcodec/videodsp.h
9270
@@ -82,6 +82,7 @@ void ff_videodsp_init(VideoDSPContext *ctx, int bpc);
9271
 void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc);
9272
 void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
9273
 void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
9274
+void ff_videodsp_init_e2k(VideoDSPContext *ctx, int bpc);
9275
 void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
9276
 void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc);
9277
 
9278
diff --git a/libavcodec/vorbisdsp.c b/libavcodec/vorbisdsp.c
9279
index 362a276..e0ecda3 100644
9280
--- a/libavcodec/vorbisdsp.c
9281
+++ b/libavcodec/vorbisdsp.c
9282
@@ -31,6 +31,8 @@ av_cold void ff_vorbisdsp_init(VorbisDSPContext *dsp)
9283
         ff_vorbisdsp_init_arm(dsp);
9284
     if (ARCH_PPC)
9285
         ff_vorbisdsp_init_ppc(dsp);
9286
+    if (ARCH_E2K)
9287
+        ff_vorbisdsp_init_e2k(dsp);
9288
     if (ARCH_X86)
9289
         ff_vorbisdsp_init_x86(dsp);
9290
 }
9291
diff --git a/libavcodec/vorbisdsp.h b/libavcodec/vorbisdsp.h
9292
index 7abec4e..001151f 100644
9293
--- a/libavcodec/vorbisdsp.h
9294
+++ b/libavcodec/vorbisdsp.h
9295
@@ -34,5 +34,6 @@ void ff_vorbisdsp_init_aarch64(VorbisDSPContext *dsp);
9296
 void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp);
9297
 void ff_vorbisdsp_init_arm(VorbisDSPContext *dsp);
9298
 void ff_vorbisdsp_init_ppc(VorbisDSPContext *dsp);
9299
+void ff_vorbisdsp_init_e2k(VorbisDSPContext *dsp);
9300
 
9301
 #endif /* AVCODEC_VORBISDSP_H */
9302
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
9303
index f485fba..c2ce815 100644
9304
--- a/libavcodec/vp3dsp.c
9305
+++ b/libavcodec/vp3dsp.c
9306
@@ -456,6 +456,8 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
9307
         ff_vp3dsp_init_arm(c, flags);
9308
     if (ARCH_PPC)
9309
         ff_vp3dsp_init_ppc(c, flags);
9310
+    if (ARCH_E2K)
9311
+        ff_vp3dsp_init_e2k(c, flags);
9312
     if (ARCH_X86)
9313
         ff_vp3dsp_init_x86(c, flags);
9314
     if (ARCH_MIPS)
9315
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
9316
index 3b849ec..a01bfd4 100644
9317
--- a/libavcodec/vp3dsp.h
9318
+++ b/libavcodec/vp3dsp.h
9319
@@ -56,6 +56,7 @@ void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, int16_t *block);
9320
 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
9321
 void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
9322
 void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
9323
+void ff_vp3dsp_init_e2k(VP3DSPContext *c, int flags);
9324
 void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);
9325
 void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags);
9326
 
9327
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
9328
index 4ff63d0..23c9db0 100644
9329
--- a/libavcodec/vp8dsp.c
9330
+++ b/libavcodec/vp8dsp.c
9331
@@ -679,6 +679,8 @@ av_cold void ff_vp78dsp_init(VP8DSPContext *dsp)
9332
         ff_vp78dsp_init_arm(dsp);
9333
     if (ARCH_PPC)
9334
         ff_vp78dsp_init_ppc(dsp);
9335
+    if (ARCH_E2K)
9336
+        ff_vp78dsp_init_e2k(dsp);
9337
     if (ARCH_X86)
9338
         ff_vp78dsp_init_x86(dsp);
9339
 }
9340
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
9341
index cfe1524..be4f412 100644
9342
--- a/libavcodec/vp8dsp.h
9343
+++ b/libavcodec/vp8dsp.h
9344
@@ -94,6 +94,7 @@ void ff_vp78dsp_init(VP8DSPContext *c);
9345
 void ff_vp78dsp_init_aarch64(VP8DSPContext *c);
9346
 void ff_vp78dsp_init_arm(VP8DSPContext *c);
9347
 void ff_vp78dsp_init_ppc(VP8DSPContext *c);
9348
+void ff_vp78dsp_init_e2k(VP8DSPContext *c);
9349
 void ff_vp78dsp_init_x86(VP8DSPContext *c);
9350
 
9351
 void ff_vp8dsp_init(VP8DSPContext *c);
9352
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
9353
index f6d73f7..03df9c0 100644
9354
--- a/libavcodec/vp9dsp.c
9355
+++ b/libavcodec/vp9dsp.c
9356
@@ -95,5 +95,6 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
9357
     if (ARCH_AARCH64) ff_vp9dsp_init_aarch64(dsp, bpp);
9358
     if (ARCH_ARM) ff_vp9dsp_init_arm(dsp, bpp);
9359
     if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact);
9360
+    if (ARCH_E2K) ff_vp9dsp_init_e2k(dsp, bpp, bitexact);
9361
     if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp);
9362
 }
9363
diff --git a/libavcodec/vp9dsp.h b/libavcodec/vp9dsp.h
9364
index e225631..f6dfe01 100644
9365
--- a/libavcodec/vp9dsp.h
9366
+++ b/libavcodec/vp9dsp.h
9367
@@ -131,6 +131,7 @@ void ff_vp9dsp_init_12(VP9DSPContext *dsp);
9368
 void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp);
9369
 void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp);
9370
 void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact);
9371
+void ff_vp9dsp_init_e2k(VP9DSPContext *dsp, int bpp, int bitexact);
9372
 void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp);
9373
 
9374
 #endif /* AVCODEC_VP9DSP_H */
9375
diff --git a/libavutil/cpu.c b/libavutil/cpu.c
9376
index 6548cc3..78e3f79 100644
9377
--- a/libavutil/cpu.c
9378
+++ b/libavutil/cpu.c
9379
@@ -57,6 +57,8 @@ static int get_cpu_flags(void)
9380
         return ff_get_cpu_flags_arm();
9381
     if (ARCH_PPC)
9382
         return ff_get_cpu_flags_ppc();
9383
+    if (ARCH_E2K)
9384
+        return ff_get_cpu_flags_e2k();
9385
     if (ARCH_X86)
9386
         return ff_get_cpu_flags_x86();
9387
     return 0;
9388
@@ -132,6 +134,8 @@ int av_parse_cpu_flags(const char *s)
9389
         { "flags"   , NULL, 0, AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT64_MIN, INT64_MAX, .unit = "flags" },
9390
 #if   ARCH_PPC
9391
         { "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ALTIVEC  },    .unit = "flags" },
9392
+#elif ARCH_E2K
9393
+        { "e2k"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_E2K      },    .unit = "flags" },
9394
 #elif ARCH_X86
9395
         { "mmx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX      },    .unit = "flags" },
9396
         { "mmxext"  , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_MMXEXT       },    .unit = "flags" },
9397
@@ -194,6 +198,8 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
9398
         { "flags"   , NULL, 0, AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT64_MIN, INT64_MAX, .unit = "flags" },
9399
 #if   ARCH_PPC
9400
         { "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ALTIVEC  },    .unit = "flags" },
9401
+#elif ARCH_E2K
9402
+        { "e2k"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_E2K      },    .unit = "flags" },
9403
 #elif ARCH_X86
9404
         { "mmx"     , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX      },    .unit = "flags" },
9405
         { "mmx2"    , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_MMX2     },    .unit = "flags" },
9406
@@ -314,6 +320,8 @@ size_t av_cpu_max_align(void)
9407
         return ff_get_cpu_max_align_arm();
9408
     if (ARCH_PPC)
9409
         return ff_get_cpu_max_align_ppc();
9410
+    if (ARCH_E2K)
9411
+        return ff_get_cpu_max_align_e2k();
9412
     if (ARCH_X86)
9413
         return ff_get_cpu_max_align_x86();
9414
 
9415
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
9416
index 8bb9eb6..537c6db 100644
9417
--- a/libavutil/cpu.h
9418
+++ b/libavutil/cpu.h
9419
@@ -61,6 +61,8 @@
9420
 #define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
9421
 #define AV_CPU_FLAG_POWER8       0x0004 ///< ISA 2.07
9422
 
9423
+#define AV_CPU_FLAG_E2K          0x0001
9424
+
9425
 #define AV_CPU_FLAG_ARMV5TE      (1 << 0)
9426
 #define AV_CPU_FLAG_ARMV6        (1 << 1)
9427
 #define AV_CPU_FLAG_ARMV6T2      (1 << 2)
9428
diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h
9429
index 37122d1..d40e28d 100644
9430
--- a/libavutil/cpu_internal.h
9431
+++ b/libavutil/cpu_internal.h
9432
@@ -44,11 +44,13 @@
9433
 int ff_get_cpu_flags_aarch64(void);
9434
 int ff_get_cpu_flags_arm(void);
9435
 int ff_get_cpu_flags_ppc(void);
9436
+int ff_get_cpu_flags_e2k(void);
9437
 int ff_get_cpu_flags_x86(void);
9438
 
9439
 size_t ff_get_cpu_max_align_aarch64(void);
9440
 size_t ff_get_cpu_max_align_arm(void);
9441
 size_t ff_get_cpu_max_align_ppc(void);
9442
+size_t ff_get_cpu_max_align_e2k(void);
9443
 size_t ff_get_cpu_max_align_x86(void);
9444
 
9445
 #endif /* AVUTIL_CPU_INTERNAL_H */
9446
diff --git a/libavutil/e2k/Makefile b/libavutil/e2k/Makefile
9447
new file mode 100644
9448
index 0000000..67892b4
9449
--- /dev/null
9450
+++ b/libavutil/e2k/Makefile
9451
@@ -0,0 +1,2 @@
9452
+OBJS += e2k/cpu.o \
9453
+        e2k/float_dsp.o
9454
diff --git a/libavutil/e2k/cpu.c b/libavutil/e2k/cpu.c
9455
new file mode 100644
9456
index 0000000..6e52faa
9457
--- /dev/null
9458
+++ b/libavutil/e2k/cpu.c
9459
@@ -0,0 +1,41 @@
9460
+/*
9461
+ * This file is part of FFmpeg.
9462
+ *
9463
+ * FFmpeg is free software; you can redistribute it and/or
9464
+ * modify it under the terms of the GNU Lesser General Public
9465
+ * License as published by the Free Software Foundation; either
9466
+ * version 2.1 of the License, or (at your option) any later version.
9467
+ *
9468
+ * FFmpeg is distributed in the hope that it will be useful,
9469
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9470
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9471
+ * Lesser General Public License for more details.
9472
+ *
9473
+ * You should have received a copy of the GNU Lesser General Public
9474
+ * License along with FFmpeg; if not, write to the Free Software
9475
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9476
+ */
9477
+
9478
+#include "config.h"
9479
+#include "libavutil/avassert.h"
9480
+#include "libavutil/cpu.h"
9481
+#include "libavutil/cpu_internal.h"
9482
+
9483
+int ff_get_cpu_flags_e2k(void)
9484
+{
9485
+#if HAVE_E2K
9486
+    return AV_CPU_FLAG_E2K;
9487
+#else
9488
+    return 0;
9489
+#endif
9490
+}
9491
+
9492
+size_t ff_get_cpu_max_align_e2k(void)
9493
+{
9494
+    int flags = av_get_cpu_flags();
9495
+
9496
+    if (flags & AV_CPU_FLAG_E2K)
9497
+        return 16;
9498
+
9499
+    return 8;
9500
+}
9501
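Note (not part of the patch; illustrative only): ff_get_cpu_max_align_e2k() above reports 16 bytes when the flag is set, and callers reach it through the public av_cpu_max_align(). A minimal allocation sketch under that assumption — alloc_dsp_buffer is a made-up name, aligned_alloc is plain C11:

    #include <stdlib.h>
    #include "libavutil/cpu.h"

    /* Allocate a buffer aligned for the 128-bit loads used by the e2k code. */
    static float *alloc_dsp_buffer(size_t nfloats)
    {
        size_t align = av_cpu_max_align();   /* 16 on e2k per the code above */
        size_t size  = ((nfloats * sizeof(float) + align - 1) / align) * align;
        return aligned_alloc(align, size);
    }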
diff --git a/libavutil/e2k/cpu.h b/libavutil/e2k/cpu.h
9502
new file mode 100644
9503
index 0000000..e9a3d66
9504
--- /dev/null
9505
+++ b/libavutil/e2k/cpu.h
9506
@@ -0,0 +1,27 @@
9507
+/*
9508
+ * This file is part of FFmpeg.
9509
+ *
9510
+ * FFmpeg is free software; you can redistribute it and/or
9511
+ * modify it under the terms of the GNU Lesser General Public
9512
+ * License as published by the Free Software Foundation; either
9513
+ * version 2.1 of the License, or (at your option) any later version.
9514
+ *
9515
+ * FFmpeg is distributed in the hope that it will be useful,
9516
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9517
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9518
+ * Lesser General Public License for more details.
9519
+ *
9520
+ * You should have received a copy of the GNU Lesser General Public
9521
+ * License along with FFmpeg; if not, write to the Free Software
9522
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9523
+ */
9524
+
9525
+#ifndef AVUTIL_E2K_CPU_H
9526
+#define AVUTIL_E2K_CPU_H
9527
+
9528
+#include "libavutil/cpu.h"
9529
+#include "libavutil/cpu_internal.h"
9530
+
9531
+#define E2K_BASE(flags) CPUEXT(flags, E2K)
9532
+
9533
+#endif /* AVUTIL_E2K_CPU_H */
9534
diff --git a/libavutil/e2k/float_dsp.c b/libavutil/e2k/float_dsp.c
9535
new file mode 100644
9536
index 0000000..dfecdab
9537
--- /dev/null
9538
+++ b/libavutil/e2k/float_dsp.c
9539
@@ -0,0 +1,188 @@
9540
+/*
9541
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
9542
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
9543
+ *
9544
+ * This file is part of FFmpeg.
9545
+ *
9546
+ * FFmpeg is free software; you can redistribute it and/or
9547
+ * modify it under the terms of the GNU Lesser General Public
9548
+ * License as published by the Free Software Foundation; either
9549
+ * version 2.1 of the License, or (at your option) any later version.
9550
+ *
9551
+ * FFmpeg is distributed in the hope that it will be useful,
9552
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9553
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9554
+ * Lesser General Public License for more details.
9555
+ *
9556
+ * You should have received a copy of the GNU Lesser General Public
9557
+ * License along with FFmpeg; if not, write to the Free Software
9558
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9559
+ */
9560
+
9561
+#include "config.h"
9562
+#include "libavutil/attributes.h"
9563
+#include "libavutil/cpu.h"
9564
+#include "libavutil/float_dsp.h"
9565
+#include "libavutil/e2k/cpu.h"
9566
+#include "util_e2k.h"
9567
+
9568
+static void ff_vector_fmul_e2k(float *dst,
9569
+                               const float *src0, const float *src1,
9570
+                               int len)
9571
+{
9572
+    int i = 0;
9573
+    __m128 d0, d1;
9574
+
9575
+    PRAGMA_E2K("ivdep")
9576
+    for (; i < len; i += 8) {
9577
+        d0 = _mm_load_ps(src0 + i);
9578
+        d1 = _mm_load_ps(src0 + i + 4);
9579
+        d0 = _mm_mul_ps(d0, _mm_load_ps(src1 + i));
9580
+        d1 = _mm_mul_ps(d1, _mm_load_ps(src1 + i + 4));
9581
+        _mm_store_ps(dst + i, d0);
9582
+        _mm_store_ps(dst + i + 4, d1);
9583
+    }
9584
+}
9585
+
9586
+static void vector_fmac_scalar_e2k(float *dst, const float *src, float mul,
9587
+                                   int len)
9588
+{
9589
+    int i = 0;
9590
+    __m128 d0, d1, d2 = _mm_set1_ps(mul);
9591
+
9592
+    PRAGMA_E2K("ivdep")
9593
+    for (; i < len; i += 8) {
9594
+        d0 = _mm_load_ps(src + i);
9595
+        d1 = _mm_load_ps(src + i + 4);
9596
+        d0 = _mm_mul_ps(d0, d2);
9597
+        d1 = _mm_mul_ps(d1, d2);
9598
+        d0 = _mm_add_ps(d0, _mm_load_ps(dst + i));
9599
+        d1 = _mm_add_ps(d1, _mm_load_ps(dst + i + 4));
9600
+        _mm_store_ps(dst + i, d0);
9601
+        _mm_store_ps(dst + i + 4, d1);
9602
+    }
9603
+}
9604
+
9605
+static void vector_fmul_scalar_e2k(float *dst, const float *src, float mul,
9606
+                                   int len)
9607
+{
9608
+    int i = 0;
9609
+    __m128 d0, d1, d2 = _mm_set1_ps(mul);
9610
+
9611
+    PRAGMA_E2K("ivdep")
9612
+    for (; i < len - 4; i += 8) {
9613
+        d0 = _mm_load_ps(src + i);
9614
+        d1 = _mm_load_ps(src + i + 4);
9615
+        d0 = _mm_mul_ps(d0, d2);
9616
+        d1 = _mm_mul_ps(d1, d2);
9617
+        _mm_store_ps(dst + i, d0);
9618
+        _mm_store_ps(dst + i + 4, d1);
9619
+    }
9620
+    if (i < len) {
9621
+        d0 = _mm_load_ps(src + i);
9622
+        d0 = _mm_mul_ps(d0, d2);
9623
+        _mm_store_ps(dst + i, d0);
9624
+    }
9625
+}
9626
+
9627
+static void ff_vector_fmul_window_e2k(float *dst, const float *src0,
9628
+                                      const float *src1, const float *win,
9629
+                                      int len)
9630
+{
9631
+    __m128 t0, t1, s0, s1, wi, wj;
9632
+    int i, j;
9633
+
9634
+    dst  += len;
9635
+    win  += len;
9636
+    src0 += len;
9637
+
9638
+    PRAGMA_E2K("ivdep")
9639
+    for (i = -len, j = len - 4; i < 0; i += 4, j -= 4) {
9640
+        s0 = _mm_load_ps(src0 + i);
9641
+        s1 = _mm_load_ps(src1 + j);
9642
+        wi = _mm_load_ps(win + i);
9643
+        wj = _mm_load_ps(win + j);
9644
+
9645
+        s1 = _mm_shuffle_ps(s1, s1, 0x1b);
9646
+        wj = _mm_shuffle_ps(wj, wj, 0x1b);
9647
+
9648
+        t0 = _mm_mul_ps(s0, wj);
9649
+        t1 = _mm_mul_ps(s0, wi);
9650
+        t0 = _mm_sub_ps(t0, _mm_mul_ps(s1, wi));
9651
+        t1 = _mm_add_ps(t1, _mm_mul_ps(s1, wj));
9652
+        t1 = _mm_shuffle_ps(t1, t1, 0x1b);
9653
+
9654
+        _mm_store_ps(dst + i, t0);
9655
+        _mm_store_ps(dst + j, t1);
9656
+    }
9657
+}
9658
+
9659
+static void ff_vector_fmul_add_e2k(float *dst, const float *src0,
9660
+                                   const float *src1, const float *src2,
9661
+                                   int len)
9662
+{
9663
+    int i;
9664
+    __m128 d, s0, s1, s2;
9665
+
9666
+    PRAGMA_E2K("ivdep")
9667
+    for (i = 0; i < len; i += 4) {
9668
+        s0 = _mm_load_ps(src0 + i);
9669
+        s1 = _mm_load_ps(src1 + i);
9670
+        s2 = _mm_load_ps(src2 + i);
9671
+        d = _mm_add_ps(_mm_mul_ps(s0, s1), s2);
9672
+        _mm_store_ps(dst + i, d);
9673
+    }
9674
+}
9675
+
9676
+static void ff_vector_fmul_reverse_e2k(float *dst, const float *src0,
9677
+                                       const float *src1, int len)
9678
+{
9679
+    int i;
9680
+    __m128 s0, s1, s2, s3;
9681
+    src1 += len - 4;
9682
+
9683
+    PRAGMA_E2K("ivdep")
9684
+    for (i = 0; i < len; i += 8) {
9685
+        s1 = _mm_load_ps(src1 - i);
9686
+        s0 = _mm_load_ps(src0 + i);
9687
+        s3 = _mm_load_ps(src1 - i - 4);
9688
+        s2 = _mm_load_ps(src0 + i + 4);
9689
+        s1 = _mm_shuffle_ps(s1, s1, 0x1b);
9690
+        s3 = _mm_shuffle_ps(s3, s3, 0x1b);
9691
+        s0 = _mm_mul_ps(s0, s1);
9692
+        s2 = _mm_mul_ps(s2, s3);
9693
+        _mm_store_ps(dst + i, s0);
9694
+        _mm_store_ps(dst + i + 4, s2);
9695
+    }
9696
+}
9697
+
9698
+static void butterflies_float_e2k(float *av_restrict src0,
9699
+                                  float *av_restrict src1, int len)
9700
+{
9701
+    int i;
9702
+    __m128 s0, s1, s2;
9703
+
9704
+    PRAGMA_E2K("ivdep")
9705
+    for (i = 0; i < len; i += 4) {
9706
+        s0 = _mm_load_ps(src0 + i);
9707
+        s1 = _mm_load_ps(src1 + i);
9708
+        s2 = _mm_sub_ps(s0, s1);
9709
+        s0 = _mm_add_ps(s0, s1);
9710
+        _mm_store_ps(src1 + i, s2);
9711
+        _mm_store_ps(src0 + i, s0);
9712
+    }
9713
+}
9714
+
9715
+av_cold void ff_float_dsp_init_e2k(AVFloatDSPContext *fdsp, int bit_exact)
9716
+{
9717
+    if (!E2K_BASE(av_get_cpu_flags()))
9718
+        return;
9719
+
9720
+    fdsp->vector_fmul = ff_vector_fmul_e2k;
9721
+    fdsp->vector_fmac_scalar = vector_fmac_scalar_e2k;
9722
+    fdsp->vector_fmul_scalar = vector_fmul_scalar_e2k;
9723
+    fdsp->vector_fmul_window = ff_vector_fmul_window_e2k;
9724
+    fdsp->vector_fmul_add = ff_vector_fmul_add_e2k;
9725
+    fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_e2k;
9726
+    fdsp->butterflies_float = butterflies_float_e2k;
9727
+}
9728
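Note (not part of the patch; illustrative only): the windowed overlap-add computed by ff_vector_fmul_window_e2k above mirrors the generic C version in libavutil/float_dsp.c; the reversed stores explain the 0x1b shuffles. A scalar sketch for reference — vector_fmul_window_ref is a made-up name:

    static void vector_fmul_window_ref(float *dst, const float *src0,
                                       const float *src1, const float *win, int len)
    {
        int i, j;
        dst  += len;
        win  += len;
        src0 += len;
        for (i = -len, j = len - 1; i < 0; i++, j--) {
            float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
            dst[i] = s0 * wj - s1 * wi;   /* corresponds to t0 in the SIMD loop   */
            dst[j] = s0 * wi + s1 * wj;   /* corresponds to t1, stored reversed   */
        }
    }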
diff --git a/libavutil/e2k/intreadwrite.h b/libavutil/e2k/intreadwrite.h
9729
new file mode 100644
9730
index 0000000..0387475
9731
--- /dev/null
9732
+++ b/libavutil/e2k/intreadwrite.h
9733
@@ -0,0 +1,54 @@
9734
+/*
9735
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
9736
+ *
9737
+ * This file is part of FFmpeg.
9738
+ *
9739
+ * FFmpeg is free software; you can redistribute it and/or
9740
+ * modify it under the terms of the GNU Lesser General Public
9741
+ * License as published by the Free Software Foundation; either
9742
+ * version 2.1 of the License, or (at your option) any later version.
9743
+ *
9744
+ * FFmpeg is distributed in the hope that it will be useful,
9745
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9746
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9747
+ * Lesser General Public License for more details.
9748
+ *
9749
+ * You should have received a copy of the GNU Lesser General Public
9750
+ * License along with FFmpeg; if not, write to the Free Software
9751
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9752
+ */
9753
+
9754
+#ifndef AVUTIL_E2K_INTREADWRITE_H
9755
+#define AVUTIL_E2K_INTREADWRITE_H
9756
+
9757
+#include <stdint.h>
9758
+#include "config.h"
9759
+
9760
+#include <x86intrin.h>
9761
+
9762
+#define AV_RB32 av_read_bswap32
9763
+#define AV_WB32 av_write_bswap32
9764
+#define AV_RB64 av_read_bswap64
9765
+#define AV_WB64 av_write_bswap64
9766
+
9767
+static av_always_inline uint32_t av_read_bswap32(const void *p)
9768
+{
9769
+    return _bswap(*(const uint32_t*)p);
9770
+}
9771
+
9772
+static av_always_inline void av_write_bswap32(void *p, uint32_t v)
9773
+{
9774
+    *(uint32_t*)p = _bswap(v);
9775
+}
9776
+
9777
+static av_always_inline uint64_t av_read_bswap64(const void *p)
9778
+{
9779
+    return _bswap64(*(const uint64_t*)p);
9780
+}
9781
+
9782
+static av_always_inline void av_write_bswap64(void *p, uint64_t v)
9783
+{
9784
+    *(uint64_t*)p = _bswap64(v);
9785
+}
9786
+
9787
+#endif /* AVUTIL_E2K_INTREADWRITE_H */
9788
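Note (not part of the patch; illustrative only): the overrides above assume the e2k toolchain provides _bswap/_bswap64 through its x86intrin.h compatibility header; functionally they are byte-swapped loads and stores. For AV_RB32 the scalar equivalent is — rb32_ref is a made-up name:

    #include <stdint.h>

    static inline uint32_t rb32_ref(const void *p)
    {
        const uint8_t *b = p;
        return (uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
               (uint32_t)b[2] <<  8 |  b[3];
    }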
diff --git a/libavutil/e2k/timer.h b/libavutil/e2k/timer.h
9789
new file mode 100644
9790
index 0000000..ea78175
9791
--- /dev/null
9792
+++ b/libavutil/e2k/timer.h
9793
@@ -0,0 +1,35 @@
9794
+/*
9795
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
9796
+ *
9797
+ * This file is part of FFmpeg.
9798
+ *
9799
+ * FFmpeg is free software; you can redistribute it and/or
9800
+ * modify it under the terms of the GNU Lesser General Public
9801
+ * License as published by the Free Software Foundation; either
9802
+ * version 2.1 of the License, or (at your option) any later version.
9803
+ *
9804
+ * FFmpeg is distributed in the hope that it will be useful,
9805
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9806
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9807
+ * Lesser General Public License for more details.
9808
+ *
9809
+ * You should have received a copy of the GNU Lesser General Public
9810
+ * License along with FFmpeg; if not, write to the Free Software
9811
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9812
+ */
9813
+
9814
+#ifndef AVUTIL_E2K_TIMER_H
9815
+#define AVUTIL_E2K_TIMER_H
9816
+
9817
+#include <stdint.h>
9818
+#include <x86intrin.h>
9819
+
9820
+#define AV_READ_TIME read_time
9821
+
9822
+static inline uint64_t read_time(void)
9823
+{
9824
+    unsigned aux;
9825
+    return __rdtscp(&aux);
9826
+}
9827
+
9828
+#endif /* AVUTIL_E2K_TIMER_H */
9829
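Note (not part of the patch; illustrative only): AV_READ_TIME defined above is what the generic timing helpers in libavutil/timer.h consume; it can also be used directly. A minimal sketch — time_call is a made-up name; __rdtscp is assumed to be available via the x86intrin.h compatibility layer mentioned above:

    #include <inttypes.h>
    #include <stdio.h>
    #include "libavutil/timer.h"

    /* Time one call of fn() with the cycle counter exposed above. */
    static void time_call(void (*fn)(void))
    {
        uint64_t t0 = AV_READ_TIME();   /* expands to read_time() -> __rdtscp() */
        fn();
        uint64_t t1 = AV_READ_TIME();
        printf("%"PRIu64" cycles\n", t1 - t0);
    }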
diff --git a/libavutil/e2k/util_e2k.h b/libavutil/e2k/util_e2k.h
9830
new file mode 100644
9831
index 0000000..f5cea7c
9832
--- /dev/null
9833
+++ b/libavutil/e2k/util_e2k.h
9834
@@ -0,0 +1,146 @@
9835
+/*
9836
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
9837
+ *
9838
+ * This file is part of FFmpeg.
9839
+ *
9840
+ * FFmpeg is free software; you can redistribute it and/or
9841
+ * modify it under the terms of the GNU Lesser General Public
9842
+ * License as published by the Free Software Foundation; either
9843
+ * version 2.1 of the License, or (at your option) any later version.
9844
+ *
9845
+ * FFmpeg is distributed in the hope that it will be useful,
9846
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9847
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
9848
+ * Lesser General Public License for more details.
9849
+ *
9850
+ * You should have received a copy of the GNU Lesser General Public
9851
+ * License along with FFmpeg; if not, write to the Free Software
9852
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9853
+ */
9854
+
9855
+/**
9856
+ * @file
9857
+ * Contains misc utility macros and inline functions
9858
+ */
9859
+
9860
+#ifndef AVUTIL_E2K_UTIL_E2K_H
9861
+#define AVUTIL_E2K_UTIL_E2K_H
9862
+
9863
+#include <stdint.h>
9864
+#include "config.h"
9865
+#include <smmintrin.h> /* SSE4.1 */
9866
+
9867
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
9868
+#define ALIGNED(n) __attribute__((aligned(n)))
9869
+
9870
+#ifdef __e2k__
9871
+#define PRAGMA_E2K _Pragma
9872
+#define _mm_shuffle2_pi8(a, b, c) \
9873
+    ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
9874
+#define _mm_shuffle2_epi8(a, b, c) \
9875
+    ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
9876
+#define _mm_blendv_pi8(a, b, c) \
9877
+    ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
9878
+#else
9879
+#define PRAGMA_E2K(x)
9880
+#define _mm_shuffle2_pi8(a, b, c) \
9881
+    _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
9882
+        _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
9883
+#define _mm_shuffle2_epi8(a, b, c) \
9884
+    _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
9885
+        _mm_slli_epi16(c, 3))
9886
+#define _mm_blendv_pi8(a, b, c) \
9887
+    _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
9888
+        _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
9889
+
9890
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
9891
+  int n = b & 63;
9892
+  if (n) a = a >> n | a << (64 - n); /* shifting by 64 when n == 0 would be undefined */
9893
+  return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
9894
+}
9895
+#endif
9896
+
9897
+#define _mm_extract_pi32(a, b) _mm_extract_epi32(_mm_movpi64_epi64(a), b)
9898
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1))
9899
+
9900
+#define _mm_unpacklo_ps2(a, b) _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(a), _mm_castps_pd(b)))
9901
+#define _mm_unpackhi_ps2(a, b) _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(a), _mm_castps_pd(b)))
9902
+#define _mm_alignr_ps(a, b, n) _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), (n) * 4))
9903
+#define _mm_bsrli_ps(a, n) _mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(a), (n) * 4))
9904
+
9905
+/***********************************************************************
9906
+ * Vector types
9907
+ **********************************************************************/
9908
+#define vec_u8  __m128i
9909
+#define vec_s8  __m128i
9910
+#define vec_u16 __m128i
9911
+#define vec_s16 __m128i
9912
+#define vec_u32 __m128i
9913
+#define vec_s32 __m128i
9914
+#define vec_f   __m128
9915
+
9916
+/***********************************************************************
9917
+ * Null vector
9918
+ **********************************************************************/
9919
+#define LOAD_ZERO const __m128i zerov = _mm_setzero_si128()
9920
+
9921
+// Transpose 8x8 matrix of 16-bit elements (in-place)
9922
+#define TRANSPOSE8(a0, a1, a2, a3, a4, a5, a6, a7) \
9923
+do { \
9924
+    vec_s16 _b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7; \
9925
+    vec_s16 _c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7; \
9926
+    _b0 = _mm_unpacklo_epi16(a0, a2); \
9927
+    _b1 = _mm_unpackhi_epi16(a0, a2); \
9928
+    _b2 = _mm_unpacklo_epi16(a1, a3); \
9929
+    _b3 = _mm_unpackhi_epi16(a1, a3); \
9930
+    _b4 = _mm_unpacklo_epi16(a4, a6); \
9931
+    _b5 = _mm_unpackhi_epi16(a4, a6); \
9932
+    _b6 = _mm_unpacklo_epi16(a5, a7); \
9933
+    _b7 = _mm_unpackhi_epi16(a5, a7); \
9934
+    \
9935
+    _c0 = _mm_unpacklo_epi16(_b0, _b2); \
9936
+    _c1 = _mm_unpackhi_epi16(_b0, _b2); \
9937
+    _c2 = _mm_unpacklo_epi16(_b1, _b3); \
9938
+    _c3 = _mm_unpackhi_epi16(_b1, _b3); \
9939
+    _c4 = _mm_unpacklo_epi16(_b4, _b6); \
9940
+    _c5 = _mm_unpackhi_epi16(_b4, _b6); \
9941
+    _c6 = _mm_unpacklo_epi16(_b5, _b7); \
9942
+    _c7 = _mm_unpackhi_epi16(_b5, _b7); \
9943
+    \
9944
+    a0 = _mm_unpacklo_epi64(_c0, _c4); \
9945
+    a1 = _mm_unpackhi_epi64(_c0, _c4); \
9946
+    a2 = _mm_unpacklo_epi64(_c1, _c5); \
9947
+    a3 = _mm_unpackhi_epi64(_c1, _c5); \
9948
+    a4 = _mm_unpacklo_epi64(_c2, _c6); \
9949
+    a5 = _mm_unpackhi_epi64(_c2, _c6); \
9950
+    a6 = _mm_unpacklo_epi64(_c3, _c7); \
9951
+    a7 = _mm_unpackhi_epi64(_c3, _c7); \
9952
+} while (0)
9953
+
9954
+#define VEC_LD(a)     _mm_loadu_si128((const __m128i*)(a))
9955
+#define VEC_ST(a, b)  _mm_storeu_si128((__m128i*)(a), b)
9956
+#define VEC_LD8(a)    _mm_loadl_epi64((const __m128i*)(a))
9957
+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b)
9958
+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b))
9959
+
9960
+#define VEC_SPLAT16(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8))
9961
+
9962
+#if !defined(__iset__) || __iset__ < 5
9963
+#define NEED_ALIGN8
9964
+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1;
9965
+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index;
9966
+#define ALIGN8_START(ptr, src) \
9967
+  src_shr = (intptr_t)(ptr - 1) & 7; \
9968
+  src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \
9969
+  src##_next = src##_ptr[src_shr == 7]; \
9970
+  src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \
9971
+                            _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8));
9972
+#define ALIGN8_READ16(v0, src, i) \
9973
+  src_tmp1 = src##_ptr[i * 2 + 1]; \
9974
+  src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \
9975
+  src##_next = src##_ptr[i * 2 + 2]; \
9976
+  src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \
9977
+  v0 = _mm_setr_epi64(src_tmp0, src_tmp1);
9978
+#endif
9979
+
9980
+#endif /* AVUTIL_E2K_UTIL_E2K_H */
9981
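Note (not part of the patch; illustrative only): _mm_shuffle2_epi8 above is a two-source byte permute; each result byte selects from the 32-byte concatenation {a, b} by the low 5 bits of the index, which is what the SSE fallback (pshufb + blendv on index bit 4) reproduces. A scalar model, ignoring pshufb's bit-7 zeroing — shuffle2_epi8_ref is a made-up name:

    #include <stdint.h>

    static inline void shuffle2_epi8_ref(uint8_t dst[16], const uint8_t a[16],
                                         const uint8_t b[16], const uint8_t idx[16])
    {
        for (int i = 0; i < 16; i++) {
            unsigned k = idx[i] & 31;        /* select from the 32-byte pair {a, b} */
            dst[i] = k < 16 ? a[k] : b[k - 16];
        }
    }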
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
9982
index 6e28d71..5241c3e 100644
9983
--- a/libavutil/float_dsp.c
9984
+++ b/libavutil/float_dsp.c
9985
@@ -156,6 +156,8 @@ av_cold AVFloatDSPContext *avpriv_float_dsp_alloc(int bit_exact)
9986
         ff_float_dsp_init_arm(fdsp);
9987
     if (ARCH_PPC)
9988
         ff_float_dsp_init_ppc(fdsp, bit_exact);
9989
+    if (ARCH_E2K)
9990
+        ff_float_dsp_init_e2k(fdsp, bit_exact);
9991
     if (ARCH_X86)
9992
         ff_float_dsp_init_x86(fdsp);
9993
     if (ARCH_MIPS)
9994
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
9995
index 9c66459..97d9d79 100644
9996
--- a/libavutil/float_dsp.h
9997
+++ b/libavutil/float_dsp.h
9998
@@ -205,6 +205,7 @@ float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len);
9999
 void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp);
10000
 void ff_float_dsp_init_arm(AVFloatDSPContext *fdsp);
10001
 void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict);
10002
+void ff_float_dsp_init_e2k(AVFloatDSPContext *fdsp, int strict);
10003
 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp);
10004
 void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp);
10005
 
10006
diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
10007
index 4c8413a..b8a698e 100644
10008
--- a/libavutil/intreadwrite.h
10009
+++ b/libavutil/intreadwrite.h
10010
@@ -72,6 +72,8 @@ typedef union {
10011
 #   include "mips/intreadwrite.h"
10012
 #elif ARCH_PPC
10013
 #   include "ppc/intreadwrite.h"
10014
+#elif ARCH_E2K
10015
+#   include "e2k/intreadwrite.h"
10016
 #elif ARCH_TOMI
10017
 #   include "tomi/intreadwrite.h"
10018
 #elif ARCH_X86
10019
diff --git a/libavutil/tests/cpu.c b/libavutil/tests/cpu.c
10020
index ce45b71..21c30cf 100644
10021
--- a/libavutil/tests/cpu.c
10022
+++ b/libavutil/tests/cpu.c
10023
@@ -49,6 +49,8 @@ static const struct {
10024
     { AV_CPU_FLAG_SETEND,    "setend"     },
10025
 #elif ARCH_PPC
10026
     { AV_CPU_FLAG_ALTIVEC,   "altivec"    },
10027
+#elif ARCH_E2K
10028
+    { AV_CPU_FLAG_E2K,       "e2k"        },
10029
 #elif ARCH_X86
10030
     { AV_CPU_FLAG_MMX,       "mmx"        },
10031
     { AV_CPU_FLAG_MMXEXT,    "mmxext"     },
10032
diff --git a/libavutil/timer.h b/libavutil/timer.h
10033
index 0bb353c..cc0c282 100644
10034
--- a/libavutil/timer.h
10035
+++ b/libavutil/timer.h
10036
@@ -54,6 +54,8 @@
10037
 #   include "arm/timer.h"
10038
 #elif ARCH_PPC
10039
 #   include "ppc/timer.h"
10040
+#elif ARCH_E2K
10041
+#   include "e2k/timer.h"
10042
 #elif ARCH_X86
10043
 #   include "x86/timer.h"
10044
 #endif
10045
diff --git a/libswresample/audioconvert.c b/libswresample/audioconvert.c
10046
index d21fc8e..96cd701 100644
10047
--- a/libswresample/audioconvert.c
10048
+++ b/libswresample/audioconvert.c
10049
@@ -179,6 +179,7 @@ AudioConvert *swri_audio_convert_alloc(enum AVSampleFormat out_fmt,
10050
     if(HAVE_X86ASM && HAVE_MMX) swri_audio_convert_init_x86(ctx, out_fmt, in_fmt, channels);
10051
     if(ARCH_ARM)              swri_audio_convert_init_arm(ctx, out_fmt, in_fmt, channels);
10052
     if(ARCH_AARCH64)          swri_audio_convert_init_aarch64(ctx, out_fmt, in_fmt, channels);
10053
+    if(HAVE_E2K)              swri_audio_convert_init_e2k(ctx, out_fmt, in_fmt, channels);
10054
 
10055
     return ctx;
10056
 }
10057
diff --git a/libswresample/e2k/Makefile b/libswresample/e2k/Makefile
10058
new file mode 100644
10059
index 0000000..a90ab9e
10060
--- /dev/null
10061
+++ b/libswresample/e2k/Makefile
10062
@@ -0,0 +1 @@
10063
+OBJS += e2k/audio_convert.o
10064
diff --git a/libswresample/e2k/audio_convert.c b/libswresample/e2k/audio_convert.c
10065
new file mode 100644
10066
index 0000000..d3577c8
10067
--- /dev/null
10068
+++ b/libswresample/e2k/audio_convert.c
10069
@@ -0,0 +1,110 @@
10070
+/*
10071
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
10072
+ *
10073
+ * This file is part of FFmpeg.
10074
+ *
10075
+ * FFmpeg is free software; you can redistribute it and/or
10076
+ * modify it under the terms of the GNU Lesser General Public
10077
+ * License as published by the Free Software Foundation; either
10078
+ * version 2.1 of the License, or (at your option) any later version.
10079
+ *
10080
+ * FFmpeg is distributed in the hope that it will be useful,
10081
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10082
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10083
+ * Lesser General Public License for more details.
10084
+ *
10085
+ * You should have received a copy of the GNU Lesser General Public
10086
+ * License along with FFmpeg; if not, write to the Free Software
10087
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
10088
+ */
10089
+
10090
+#include "config.h"
10091
+#include "libavutil/cpu.h"
10092
+#include "libavutil/e2k/cpu.h"
10093
+#include "libavutil/e2k/util_e2k.h"
10094
+
10095
+#include "libswresample/swresample_internal.h"
10096
+#include "libswresample/audioconvert.h"
10097
+
10098
+// the length is always a multiple of 16, ensured in "audioconvert.c"
10099
+
10100
+static void conv_flt_to_s16_e2k(uint8_t **_dst, const uint8_t **_src, int len) {
10101
+    const float *src = (const float*)_src[0];
10102
+    int16_t *dst = (int16_t*)_dst[0];
10103
+    int i = 0;
10104
+    __m128 f0, f1, c1 = _mm_set1_ps(1 << 15);
10105
+    __m128i v0, v1;
10106
+
10107
+    PRAGMA_E2K("ivdep")
10108
+    for (; i + 7 < len; i += 8) {
10109
+        f0 = _mm_loadu_ps(src);
10110
+        f1 = _mm_loadu_ps(src + 4);
10111
+        v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1));
10112
+        v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1));
10113
+        v0 = _mm_packs_epi32(v0, v1);
10114
+        VEC_ST(dst, v0);
10115
+        src += 8; dst += 8;
10116
+    }
10117
+/*
10118
+    PRAGMA_E2K("ivdep")
10119
+    for (; i < len; i++)
10120
+        *dst++ = av_clip_int16(lrintf(*src++ * (1 << 15)));
10121
+*/
10122
+}
10123
+
10124
+static void conv_fltp_to_s16_2ch_e2k(uint8_t **_dst, const uint8_t **_src, int len) {
10125
+    const float *src0 = (const float*)_src[0];
10126
+    const float *src1 = (const float*)_src[1];
10127
+    int16_t *dst = (int16_t*)_dst[0];
10128
+    int i = 0;
10129
+    __m128 f0, f1, c1 = _mm_set1_ps(1 << 15);
10130
+    __m128i v0, v1, v2, v3;
10131
+
10132
+    PRAGMA_E2K("ivdep")
10133
+    for (; i + 7 < len; i += 8) {
10134
+        f0 = _mm_loadu_ps(src0);
10135
+        f1 = _mm_loadu_ps(src0 + 4);
10136
+        v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1));
10137
+        v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1));
10138
+        v2 = _mm_packs_epi32(v0, v1);
10139
+        f0 = _mm_loadu_ps(src1);
10140
+        f1 = _mm_loadu_ps(src1 + 4);
10141
+        v0 = _mm_cvtps_epi32(_mm_mul_ps(f0, c1));
10142
+        v1 = _mm_cvtps_epi32(_mm_mul_ps(f1, c1));
10143
+        v3 = _mm_packs_epi32(v0, v1);
10144
+        v0 = _mm_unpacklo_epi16(v2, v3);
10145
+        v1 = _mm_unpackhi_epi16(v2, v3);
10146
+        VEC_ST(dst, v0);
10147
+        VEC_ST(dst + 8, v1);
10148
+        src0 += 8; src1 += 8; dst += 16;
10149
+    }
10150
+/*
10151
+    PRAGMA_E2K("ivdep")
10152
+    for (; i < len; i++) {
10153
+        dst[0] = av_clip_int16(lrintf(*src0++ * (1 << 15)));
10154
+        dst[1] = av_clip_int16(lrintf(*src1++ * (1 << 15)));
10155
+        dst += 2;
10156
+    }
10157
+*/
10158
+}
10159
+
10160
+av_cold void swri_audio_convert_init_e2k(struct AudioConvert *ac,
10161
+                                         enum AVSampleFormat out_fmt,
10162
+                                         enum AVSampleFormat in_fmt,
10163
+                                         int channels){
10164
+
10165
+    if (!E2K_BASE(av_get_cpu_flags()))
10166
+        return;
10167
+
10168
+    ac->simd_f = NULL;
10169
+
10170
+    if ((out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_FLT) ||
10171
+        (out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP))
10172
+        ac->simd_f = conv_flt_to_s16_e2k;
10173
+
10174
+    if (out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLTP && channels == 2)
10175
+        ac->simd_f = conv_fltp_to_s16_2ch_e2k;
10176
+
10177
+    // if(ac->simd_f) ac->in_simd_align_mask = 7;
10178
+}
10179
+
10180
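Note (not part of the patch; illustrative only): per sample, the SIMD loops above scale by 2^15, round to nearest (_mm_cvtps_epi32) and saturate when packing, which matches the commented-out scalar tail. A reference sketch — conv_flt_to_s16_ref is a made-up name; av_clip_int16() is the existing libavutil helper:

    #include <math.h>
    #include "libavutil/common.h"

    static void conv_flt_to_s16_ref(int16_t *dst, const float *src, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = av_clip_int16(lrintf(src[i] * (1 << 15)));
    }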
diff --git a/libswresample/swresample_internal.h b/libswresample/swresample_internal.h
10181
index f2ea5a2..b7e8501 100644
10182
--- a/libswresample/swresample_internal.h
10183
+++ b/libswresample/swresample_internal.h
10184
@@ -218,5 +218,9 @@ void swri_audio_convert_init_x86(struct AudioConvert *ac,
10185
                                  enum AVSampleFormat out_fmt,
10186
                                  enum AVSampleFormat in_fmt,
10187
                                  int channels);
10188
+void swri_audio_convert_init_e2k(struct AudioConvert *ac,
10189
+                                 enum AVSampleFormat out_fmt,
10190
+                                 enum AVSampleFormat in_fmt,
10191
+                                 int channels);
10192
 
10193
 #endif
10194
diff --git a/libswscale/e2k/Makefile b/libswscale/e2k/Makefile
10195
new file mode 100644
10196
index 0000000..f35371d
10197
--- /dev/null
10198
+++ b/libswscale/e2k/Makefile
10199
@@ -0,0 +1,3 @@
10200
+OBJS += e2k/swscale.o \
10201
+        e2k/yuv2rgb.o \
10202
+        e2k/yuv2yuv.o
10203
diff --git a/libswscale/e2k/swscale.c b/libswscale/e2k/swscale.c
10204
new file mode 100644
10205
index 0000000..24a857f
10206
--- /dev/null
10207
+++ b/libswscale/e2k/swscale.c
10208
@@ -0,0 +1,2046 @@
10209
+/*
10210
+ * Elbrus-enhanced yuv2yuvX
10211
+ *
10212
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
10213
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
10214
+ * based on the equivalent C code in swscale.c
10215
+ *
10216
+ * This file is part of FFmpeg.
10217
+ *
10218
+ * FFmpeg is free software; you can redistribute it and/or
10219
+ * modify it under the terms of the GNU Lesser General Public
10220
+ * License as published by the Free Software Foundation; either
10221
+ * version 2.1 of the License, or (at your option) any later version.
10222
+ *
10223
+ * FFmpeg is distributed in the hope that it will be useful,
10224
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10225
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10226
+ * Lesser General Public License for more details.
10227
+ *
10228
+ * You should have received a copy of the GNU Lesser General Public
10229
+ * License along with FFmpeg; if not, write to the Free Software
10230
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
10231
+ */
10232
+
10233
+#include <inttypes.h>
10234
+
10235
+#include "config.h"
10236
+#include "libswscale/swscale.h"
10237
+#include "libswscale/swscale_internal.h"
10238
+#include "libavutil/attributes.h"
10239
+#include "libavutil/cpu.h"
10240
+#include "libavutil/e2k/util_e2k.h"
10241
+
10242
+#include "yuv2rgb.h"
10243
+
10244
+static void yuv2planeX_8_e2k(const int16_t *filter, int filterSize,
10245
+                           const int16_t **src, uint8_t *dest, int dstW,
10246
+                           const uint8_t *dither, int offset)
10247
+{
10248
+    int i = 0, j;
10249
+    __m64 h0;
10250
+    __m128i d0, d1, zerov = _mm_setzero_si128();
10251
+    h0 = (__m64)__builtin_e2k_insfd(*(uint64_t*)dither, ((offset + i) & 7) * 8, 0);
10252
+    d1 = _mm_unpacklo_epi8(_mm_movpi64_epi64(h0), zerov);
10253
+    d0 = _mm_unpacklo_epi16(d1, zerov);
10254
+    d1 = _mm_unpackhi_epi16(d1, zerov);
10255
+    d0 = _mm_slli_epi32(d0, 12);
10256
+    d1 = _mm_slli_epi32(d1, 12);
10257
+
10258
+    for (; i < dstW - 15; i += 16) {
10259
+        __m128i r0, r1, r2, r3, v0, v1, v2, v3;
10260
+
10261
+        r2 = r0 = d0;
10262
+        r3 = r1 = d1;
10263
+        for (j = 0; j < filterSize; j++) {
10264
+            v1 = _mm_set1_epi16(filter[j]);
10265
+
10266
+            v0 = VEC_LD(src[j] + i);
10267
+            v2 = _mm_mullo_epi16(v0, v1);
10268
+            v3 = _mm_mulhi_epi16(v0, v1);
10269
+            v0 = _mm_unpacklo_epi16(v2, v3);
10270
+            v3 = _mm_unpackhi_epi16(v2, v3);
10271
+            r0 = _mm_add_epi32(r0, v0);
10272
+            r1 = _mm_add_epi32(r1, v3);
10273
+
10274
+            v0 = VEC_LD(src[j] + i + 8);
10275
+            v2 = _mm_mullo_epi16(v0, v1);
10276
+            v3 = _mm_mulhi_epi16(v0, v1);
10277
+            v0 = _mm_unpacklo_epi16(v2, v3);
10278
+            v3 = _mm_unpackhi_epi16(v2, v3);
10279
+            r2 = _mm_add_epi32(r2, v0);
10280
+            r3 = _mm_add_epi32(r3, v3);
10281
+        }
10282
+        r0 = _mm_srai_epi32(r0, 19);
10283
+        r1 = _mm_srai_epi32(r1, 19);
10284
+        r2 = _mm_srai_epi32(r2, 19);
10285
+        r3 = _mm_srai_epi32(r3, 19);
10286
+        r0 = _mm_packs_epi32(r0, r1);
10287
+        r2 = _mm_packs_epi32(r2, r3);
10288
+        r0 = _mm_packus_epi16(r0, r2);
10289
+        VEC_ST(dest + i, r0);
10290
+    }
10291
+
10292
+    for (; i < dstW; i++) {
10293
+        int val = dither[(i + offset) & 7] << 12;
10294
+        for (j = 0; j < filterSize; j++)
10295
+            val += src[j][i] * filter[j];
10296
+        dest[i] = av_clip_uint8(val >> 19);
10297
+    }
10298
+}
10299
+
10300
+static void hScale_real_e2k(SwsContext *c, int16_t *dst, int dstW,
10301
+                            const uint8_t *src, const int16_t *filter,
10302
+                            const int32_t *filterPos, int filterSize)
10303
+{
10304
+    int i;
10305
+    LOAD_ZERO;
10306
+    switch (filterSize) {
10307
+
10308
+    case 1:
10309
+        PRAGMA_E2K("ivdep")
10310
+        for (i = 0; i < dstW; i++) {
10311
+            int val, srcPos = filterPos[i];
10312
+            val = (int)src[srcPos] * filter[filterSize * i];
10313
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10314
+        }
10315
+    break;
10316
+
10317
+    case 2:
10318
+        PRAGMA_E2K("ivdep")
10319
+        for (i = 0; i < dstW; i++) {
10320
+            int val, srcPos = filterPos[i];
10321
+            val = (int)src[srcPos] * filter[filterSize * i];
10322
+            val += (int)src[srcPos + 1] * filter[filterSize * i + 1];
10323
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10324
+        }
10325
+    break;
10326
+
10327
+    case 4: {
10328
+        __m64 zerov = _mm_setzero_si64();
10329
+        PRAGMA_E2K("ivdep")
10330
+        for (i = 0; i < dstW; i++, filter += filterSize) {
10331
+            int val;
10332
+            __m64 v0, v2, accv;
10333
+            const uint8_t *srci = src + filterPos[i];
10334
+
10335
+            v0 = _mm_cvtsi32_si64(*(uint32_t*)srci);
10336
+            v0 = _mm_unpacklo_pi8(v0, zerov);
10337
+            v2 = *(__m64*)filter;
10338
+            accv = _mm_madd_pi16(v0, v2);
10339
+            val = _mm_extract_pi32(accv, 0) + _mm_extract_pi32(accv, 1);
10340
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10341
+        }
10342
+    }
10343
+    break;
10344
+
10345
+    case 8:
10346
+        PRAGMA_E2K("ivdep")
10347
+        for (i = 0; i < dstW; i++, filter += filterSize) {
10348
+            int val, j = 0;
10349
+            __m128i v0, v2, accv;
10350
+            const uint8_t *srci = src + filterPos[i];
10351
+
10352
+            v0 = VEC_LD8(srci + j);
10353
+            v0 = _mm_unpacklo_epi8(v0, zerov);
10354
+            v2 = VEC_LD(filter + j);
10355
+            accv = _mm_madd_epi16(v0, v2);
10356
+            accv = _mm_hadd_epi32(accv, accv);
10357
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
10358
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10359
+        }
10360
+    break;
10361
+
10362
+    case 16:
10363
+        PRAGMA_E2K("ivdep")
10364
+        for (i = 0; i < dstW; i++, filter += filterSize) {
10365
+            int val, j = 0;
10366
+            __m128i v0, v1, v2, v3, accv;
10367
+            const uint8_t *srci = src + filterPos[i];
10368
+
10369
+            v1 = VEC_LD(srci + j);
10370
+            v0 = _mm_unpacklo_epi8(v1, zerov);
10371
+            v1 = _mm_unpackhi_epi8(v1, zerov);
10372
+            v2 = VEC_LD(filter + j);
10373
+            v3 = VEC_LD(filter + j + 8);
10374
+            accv = _mm_madd_epi16(v0, v2);
10375
+            accv = _mm_add_epi32(accv, _mm_madd_epi16(v1, v3));
10376
+            accv = _mm_hadd_epi32(accv, accv);
10377
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
10378
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10379
+        }
10380
+    break;
10381
+
10382
+    default:
10383
+        av_assert0((filterSize & 7) == 0);
10384
+
10385
+        for (i = 0; i < dstW; i++, filter += filterSize) {
10386
+            int val, j = 0;
10387
+            __m128i v0, v1, v2, v3, accv = zerov;
10388
+            const uint8_t *srci = src + filterPos[i];
10389
+
10390
+            for (; j < filterSize - 15; j += 16) {
10391
+                v1 = VEC_LD(srci + j);
10392
+                v0 = _mm_unpacklo_epi8(v1, zerov);
10393
+                v1 = _mm_unpackhi_epi8(v1, zerov);
10394
+                v2 = VEC_LD(filter + j);
10395
+                v3 = VEC_LD(filter + j + 8);
10396
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v2));
10397
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v1, v3));
10398
+            }
10399
+            if (filterSize & 8) {
10400
+                v1 = VEC_LD8(srci + j);
10401
+                v0 = _mm_unpacklo_epi8(v1, zerov);
10402
+                v2 = VEC_LD(filter + j);
10403
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v2));
10404
+            }
10405
+            accv = _mm_hadd_epi32(accv, accv);
10406
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
10407
+            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
10408
+        }
10409
+    }
10410
+}
10411
+
10412
+static void yuv2plane1_floatLE_e2k(const int16_t *_src, uint8_t *_dest,
10413
+                                   int dstW, const uint8_t *dither, int offset)
10414
+{
10415
+    const int32_t *src = (const int32_t*)_src;
10416
+    float *dest = (float*)_dest;
10417
+    int shift = 3;
10418
+    int add = (1 << shift) >> 1;
10419
+    int clip = (1 << 16) - 1;
10420
+    float fmult = 1.0f / 65535.0f;
10421
+    LOAD_ZERO;
10422
+    vec_u32 vadd = _mm_set1_epi32(add);
10423
+    vec_u32 vlargest = _mm_set1_epi32(clip);
10424
+    vec_f vmul = _mm_set1_ps(fmult);
10425
+    vec_u32 v0;
10426
+    vec_f v1;
10427
+    int i = 0;
10428
+
10429
+    PRAGMA_E2K("ivdep")
10430
+    for (; i < dstW - 3; i += 4) {
10431
+        v0 = VEC_LD(src + i);
10432
+        v0 = _mm_add_epi32(v0, vadd);
10433
+        v0 = _mm_srai_epi32(v0, shift);
10434
+        v0 = _mm_max_epi32(v0, zerov);
10435
+        v0 = _mm_min_epi32(v0, vlargest);
10436
+        v1 = _mm_mul_ps(_mm_cvtepi32_ps(v0), vmul);
10437
+        _mm_storeu_ps(dest + i, v1);
10438
+    }
10439
+
10440
+    PRAGMA_E2K("ivdep")
10441
+    for (; i < dstW; ++i){
10442
+        int val = src[i] + add;
10443
+        val = av_clip_uint16(val >> shift);
10444
+        dest[i] = fmult * (float)val;
10445
+    }
10446
+}
10447
+
10448
+static void yuv2plane1_floatBE_e2k(const int16_t *_src, uint8_t *_dest,
10449
+                                   int dstW, const uint8_t *dither, int offset)
10450
+{
10451
+    const int32_t *src = (const int32_t*)_src;
10452
+    uint32_t *dest = (uint32_t*)_dest;
10453
+    int shift = 3;
10454
+    int add = (1 << shift) >> 1;
10455
+    int clip = (1 << 16) - 1;
10456
+    float fmult = 1.0f / 65535.0f;
10457
+    LOAD_ZERO;
10458
+    vec_u32 vadd = _mm_set1_epi32(add);
10459
+    vec_u32 vlargest = _mm_set1_epi32(clip);
10460
+    vec_f vmul = _mm_set1_ps(fmult);
10461
+    vec_u8 vswap = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
10462
+    vec_u32 v0;
10463
+    vec_f v1;
10464
+    int i = 0;
10465
+
10466
+    PRAGMA_E2K("ivdep")
10467
+    for (; i < dstW - 3; i += 4) {
10468
+        v0 = VEC_LD(src + i);
10469
+        v0 = _mm_add_epi32(v0, vadd);
10470
+        v0 = _mm_srai_epi32(v0, shift);
10471
+        v0 = _mm_max_epi32(v0, zerov);
10472
+        v0 = _mm_min_epi32(v0, vlargest);
10473
+        v1 = _mm_mul_ps(_mm_cvtepi32_ps(v0), vmul);
10474
+        v0 = _mm_shuffle_epi8(_mm_castps_si128(v1), vswap);
10475
+        VEC_ST(dest + i, v0);
10476
+    }
10477
+
10478
+    PRAGMA_E2K("ivdep")
10479
+    for (; i < dstW; i++) {
10480
+        int val = src[i] + add;
10481
+        val = av_clip_uint16(val >> shift);
10482
+        dest[i] = av_bswap32(av_float2int(fmult * (float)val));
10483
+    }
10484
+}
10485
+
10486
+static void yuv2plane1_8_e2k(const int16_t *src, uint8_t *dest, int dstW,
10487
+                             const uint8_t *dither, int offset)
10488
+{
10489
+    int i = 0;
10490
+    __m128i v0, v1, ditherv;
10491
+    LOAD_ZERO;
10492
+    __m64 h0;
10493
+    h0 = (__m64)__builtin_e2k_insfd(*(uint64_t*)dither, ((offset + i) & 7) * 8, 0);
10494
+    ditherv = _mm_unpacklo_epi8(_mm_movpi64_epi64(h0), zerov);
10495
+
10496
+    PRAGMA_E2K("ivdep")
10497
+    for (; i < dstW - 15; i += 16) {
10498
+        v0 = VEC_LD(src + i);
10499
+        v1 = VEC_LD(src + i + 8);
10500
+        v0 = _mm_adds_epi16(v0, ditherv);
10501
+        v1 = _mm_adds_epi16(v1, ditherv);
10502
+        v0 = _mm_srai_epi16(v0, 7);
10503
+        v1 = _mm_srai_epi16(v1, 7);
10504
+        v0 = _mm_packus_epi16(v0, v1);
10505
+        VEC_ST(dest + i, v0);
10506
+    }
10507
+
10508
+    PRAGMA_E2K("ivdep")
10509
+    for (; i < dstW; i++) {
10510
+        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
10511
+        dest[i] = av_clip_uint8(val);
10512
+    }
10513
+}
10514
+
10515
+#define output_pixel(pos, val) \
10516
+    if (big_endian) { \
10517
+        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
10518
+    } else { \
10519
+        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
10520
+    }
10521
+
10522
+static av_always_inline
10523
+void yuv2plane1_10_e2k(const int16_t *src, uint16_t *dest, int dstW,
10524
+                       const int big_endian, const int output_bits)
10525
+{
10526
+    int shift = 15 - output_bits;
10527
+    int add = 1 << (shift - 1);
10528
+    int clip = (1 << output_bits) - 1;
10529
+    vec_u16 vadd = _mm_set1_epi16(add);
10530
+    vec_u16 vlargest = _mm_set1_epi16(clip);
10531
+    vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
10532
+    LOAD_ZERO;
10533
+    __m128i v0;
10534
+    int i = 0;
10535
+
10536
+    PRAGMA_E2K("ivdep")
10537
+    for (; i < dstW - 7; i += 8) {
10538
+        v0 = VEC_LD(src + i);
10539
+        v0 = _mm_adds_epi16(v0, vadd);
10540
+        v0 = _mm_srai_epi16(v0, shift);
10541
+        v0 = _mm_max_epi16(v0, zerov);
10542
+        v0 = _mm_min_epu16(v0, vlargest);
10543
+        if (big_endian) {
10544
+            v0 = _mm_shuffle_epi8(v0, vswap);
10545
+        }
10546
+        VEC_ST(dest + i, v0);
10547
+    }
10548
+
10549
+    PRAGMA_E2K("ivdep")
10550
+    for (; i < dstW; i++) {
10551
+        int val = src[i] + add;
10552
+        output_pixel(&dest[i], val);
10553
+    }
10554
+}
10555
+
10556
+static av_always_inline
10557
+void yuv2planeX_10_e2k(const int16_t *filter, int filterSize,
10558
+                       const int16_t **src, uint16_t *dest, int dstW,
10559
+                       int big_endian, int output_bits)
10560
+{
10561
+    int shift = 11 + 16 - output_bits;
10562
+    int add = 1 << (shift - 1);
10563
+    int clip = (1 << output_bits) - 1;
10564
+    vec_u32 vadd = _mm_set1_epi32(add);
10565
+    vec_u16 vlargest = _mm_set1_epi16(clip);
10566
+    vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
10567
+    __m128i v0, v1, v2, v3, v4, v5;
10568
+    int i = 0, j;
10569
+
10570
+    for (; i < dstW - 7; i += 8) {
10571
+        v4 = v5 = vadd;
10572
+        for (j = 0; j < filterSize; j++) {
10573
+            v0 = VEC_LD(src[j] + i);
10574
+            v1 = _mm_set1_epi16(filter[j]);
10575
+            v2 = _mm_mullo_epi16(v0, v1);
10576
+            v3 = _mm_mulhi_epi16(v0, v1);
10577
+            v0 = _mm_unpacklo_epi16(v2, v3);
10578
+            v1 = _mm_unpackhi_epi16(v2, v3);
10579
+            v4 = _mm_add_epi32(v4, v0);
10580
+            v5 = _mm_add_epi32(v5, v1);
10581
+        }
10582
+        v4 = _mm_srai_epi32(v4, shift);
10583
+        v5 = _mm_srai_epi32(v5, shift);
10584
+        v0 = _mm_packus_epi32(v4, v5);
10585
+        v0 = _mm_min_epu16(v0, vlargest);
10586
+        if (big_endian) {
10587
+            v0 = _mm_shuffle_epi8(v0, vswap);
10588
+        }
10589
+        VEC_ST(dest + i, v0);
10590
+    }
10591
+
10592
+    for (; i < dstW; i++) {
10593
+        int val = 1 << (shift - 1);
10594
+        for (j = 0; j < filterSize; j++)
10595
+            val += src[j][i] * filter[j];
10596
+        output_pixel(&dest[i], val);
10597
+    }
10598
+}
10599
+
10600
+#undef output_pixel
10601
+
10602
+#define output_pixel(pos, val, bias, signedness) \
10603
+    if (big_endian) { \
10604
+        AV_WB16(pos, bias + av_clip_##signedness##16(val >> shift)); \
10605
+    } else { \
10606
+        AV_WL16(pos, bias + av_clip_##signedness##16(val >> shift)); \
10607
+    }
10608
+
10609
+static av_always_inline
10610
+void yuv2plane1_16_e2k(const int32_t *src, uint16_t *dest, int dstW,
10611
+                       const int big_endian, int output_bits)
10612
+{
10613
+    int shift = 3;
10614
+    int add = 1 << (shift - 1);
10615
+    vec_u32 vadd = _mm_set1_epi32(add);
10616
+    vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
10617
+    __m128i v0, v1;
10618
+    int i = 0;
10619
+
10620
+    PRAGMA_E2K("ivdep")
10621
+    for (; i < dstW - 7; i += 8) {
10622
+        v0 = VEC_LD(src + i);
10623
+        v1 = VEC_LD(src + i + 4);
10624
+        v0 = _mm_add_epi32(v0, vadd);
10625
+        v1 = _mm_add_epi32(v1, vadd);
10626
+        v0 = _mm_srai_epi32(v0, shift);
10627
+        v1 = _mm_srai_epi32(v1, shift);
10628
+        v0 = _mm_packus_epi32(v0, v1);
10629
+        if (big_endian) {
10630
+            v0 = _mm_shuffle_epi8(v0, vswap);
10631
+        }
10632
+        VEC_ST(dest + i, v0);
10633
+    }
10634
+
10635
+    PRAGMA_E2K("ivdep")
10636
+    for (; i < dstW; i++) {
10637
+        int val = src[i] + add;
10638
+        output_pixel(&dest[i], val, 0, uint);
10639
+    }
10640
+}
10641
+
10642
+/* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
10643
+ * filters (or anything with negative coeffs), the range can be slightly
10644
+ * wider in both directions. To account for this overflow, we subtract
10645
+ * a constant so it always fits in the signed range (assuming a
10646
+ * reasonable filterSize), and re-add that at the end. */
10647
+
10648
+static av_always_inline
10649
+void yuv2planeX_16_e2k(const int16_t *filter, int filterSize,
10650
+                       const int32_t **src, uint16_t *dest, int dstW,
10651
+                       int big_endian, int output_bits)
10652
+{
10653
+    int shift = 15, bias = 0x8000;
10654
+    int add = (1 << (shift - 1)) - 0x40000000;
10655
+    vec_u32 vadd = _mm_set1_epi32(add);
10656
+    vec_u16 vbias = _mm_set1_epi16(bias);
10657
+    vec_u8 vswap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
10658
+    __m128i v0, v1, v2, v4, v5;
10659
+    int i = 0, j;
10660
+
10661
+    for (; i < dstW - 7; i += 8) {
10662
+        v4 = v5 = vadd;
10663
+        for (j = 0; j < filterSize; j++) {
10664
+            v0 = VEC_LD(src[j] + i);
10665
+            v1 = VEC_LD(src[j] + i + 4);
10666
+            v2 = _mm_set1_epi32(filter[j]);
10667
+            v4 = _mm_add_epi32(v4, _mm_mullo_epi32(v0, v2));
10668
+            v5 = _mm_add_epi32(v5, _mm_mullo_epi32(v1, v2));
10669
+        }
10670
+        v4 = _mm_srai_epi32(v4, shift);
10671
+        v5 = _mm_srai_epi32(v5, shift);
10672
+        v0 = _mm_packs_epi32(v4, v5);
10673
+        v0 = _mm_add_epi16(v0, vbias);
10674
+        if (big_endian) {
10675
+            v0 = _mm_shuffle_epi8(v0, vswap);
10676
+        }
10677
+        VEC_ST(dest + i, v0);
10678
+    }
10679
+
10680
+    for (; i < dstW; i++) {
10681
+        int val = add;
10682
+        for (j = 0; j < filterSize; j++)
10683
+            val += src[j][i] * (unsigned)filter[j];
10684
+        output_pixel(&dest[i], val, bias, int);
10685
+    }
10686
+}
10687
+
10688
+#undef output_pixel
10689
+
10690
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
10691
+static void yuv2plane1_##bits##BE_LE##_e2k(const int16_t *src, \
10692
+                             uint8_t *dest, int dstW, \
10693
+                             const uint8_t *dither, int offset) \
10694
+{ \
10695
+    yuv2plane1_##template_size##_e2k((const typeX_t *) src, \
10696
+                         (uint16_t *) dest, dstW, is_be, bits); \
10697
+} \
10698
+static void yuv2planeX_##bits##BE_LE##_e2k(const int16_t *filter, int filterSize, \
10699
+                              const int16_t **src, uint8_t *dest, int dstW, \
10700
+                              const uint8_t *dither, int offset)\
10701
+{ \
10702
+    yuv2planeX_##template_size##_e2k(filter, \
10703
+                         filterSize, (const typeX_t **) src, \
10704
+                         (uint16_t *) dest, dstW, is_be, bits); \
10705
+}
10706
+
10707
+yuv2NBPS( 9, BE, 1, 10, int16_t)
10708
+yuv2NBPS( 9, LE, 0, 10, int16_t)
10709
+yuv2NBPS(10, BE, 1, 10, int16_t)
10710
+yuv2NBPS(10, LE, 0, 10, int16_t)
10711
+yuv2NBPS(12, BE, 1, 10, int16_t)
10712
+yuv2NBPS(12, LE, 0, 10, int16_t)
10713
+yuv2NBPS(14, BE, 1, 10, int16_t)
10714
+yuv2NBPS(14, LE, 0, 10, int16_t)
10715
+yuv2NBPS(16, BE, 1, 16, int32_t)
10716
+yuv2NBPS(16, LE, 0, 16, int32_t)
10717
+
10718
+#define INIT_RGB(R, B) \
10719
+    __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \
10720
+    __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \
10721
+    __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15);
10722
+
10723
+#define INIT_RGBX(R, B) \
10724
+    __m128i A_h = _mm_set1_epi16(-256);
10725
+
10726
+#define INIT_XRGB(R, B) \
10727
+    __m128i A_l = _mm_set1_epi16(255);
10728
+
10729
+#define WRITE_RGB(R, B) \
10730
+    v0 = _mm_srai_epi32(R##_l, 22); \
10731
+    v1 = _mm_srai_epi32(R##_h, 22); \
10732
+    v4 = _mm_srai_epi32(G_l, 22 - 16); \
10733
+    v5 = _mm_srai_epi32(G_h, 22 - 16); \
10734
+    v2 = _mm_srai_epi32(B##_l, 22 - 8); \
10735
+    v3 = _mm_srai_epi32(B##_h, 22 - 8); \
10736
+    v0 = _mm_blend_epi16(v0, v4, 0xaa); \
10737
+    v1 = _mm_blend_epi16(v1, v5, 0xaa); \
10738
+    v2 = _mm_packus_epi32(v2, v3); \
10739
+    v0 = _mm_packus_epi16(v0, v1); \
10740
+    v1 = _mm_unpacklo_epi16(v0, v2); \
10741
+    v2 = _mm_unpackhi_epi16(v0, v2); \
10742
+    { \
10743
+        union { __m128i v; __m64 d[2]; } a = { v1 }, b = { v2 }; \
10744
+        __m64 *p = (__m64*)dest; \
10745
+        p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \
10746
+        p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \
10747
+        p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \
10748
+        dest += 24; \
10749
+    }
10750
+
10751
+#define WRITE_RGBX(R, B) \
10752
+    v0 = _mm_srai_epi32(R##_l, 22); \
10753
+    v1 = _mm_srai_epi32(R##_h, 22); \
10754
+    v4 = _mm_srai_epi32(G_l, 22 - 16); \
10755
+    v5 = _mm_srai_epi32(G_h, 22 - 16); \
10756
+    v2 = _mm_srai_epi32(B##_l, 22); \
10757
+    v3 = _mm_srai_epi32(B##_h, 22); \
10758
+    v0 = _mm_blend_epi16(v0, v4, 0xaa); \
10759
+    v1 = _mm_blend_epi16(v1, v5, 0xaa); \
10760
+    v0 = _mm_packus_epi16(v0, v1); \
10761
+    v2 = _mm_packus_epi16(v2, v3); \
10762
+    v2 = _mm_or_si128(v2, A_h); \
10763
+    v1 = _mm_unpacklo_epi16(v0, v2); \
10764
+    v3 = _mm_unpackhi_epi16(v0, v2); \
10765
+    VEC_ST(dest, v1); \
10766
+    VEC_ST(dest + 16, v3); \
10767
+    dest += 32;
10768
+
10769
+#define WRITE_XRGB(R, B) \
10770
+    v0 = _mm_srai_epi32(R##_l, 22 - 16); \
10771
+    v1 = _mm_srai_epi32(R##_h, 22 - 16); \
10772
+    v4 = _mm_srai_epi32(G_l, 22); \
10773
+    v5 = _mm_srai_epi32(G_h, 22); \
10774
+    v2 = _mm_srai_epi32(B##_l, 22 - 16); \
10775
+    v3 = _mm_srai_epi32(B##_h, 22 - 16); \
10776
+    v2 = _mm_blend_epi16(v4, v2, 0xaa); \
10777
+    v3 = _mm_blend_epi16(v5, v3, 0xaa); \
10778
+    v0 = _mm_packus_epi16(v0, v1); \
10779
+    v2 = _mm_packus_epi16(v2, v3); \
10780
+    v0 = _mm_or_si128(v0, A_l); \
10781
+    v1 = _mm_unpacklo_epi16(v0, v2); \
10782
+    v3 = _mm_unpackhi_epi16(v0, v2); \
10783
+    VEC_ST(dest, v1); \
10784
+    VEC_ST(dest + 16, v3); \
10785
+    dest += 32;
10786
+
10787
+#define CALC_RGB \
10788
+    vy_l = _mm_add_epi32(_mm_mullo_epi32(vy_l, y_coeff), y_add); \
10789
+    vy_h = _mm_add_epi32(_mm_mullo_epi32(vy_h, y_coeff), y_add); \
10790
+    \
10791
+    v0 = _mm_mullo_epi32(vv_l, v2g_coeff); \
10792
+    v1 = _mm_mullo_epi32(vu_l, u2g_coeff); \
10793
+    v2 = _mm_mullo_epi32(vv_h, v2g_coeff); \
10794
+    v3 = _mm_mullo_epi32(vu_h, u2g_coeff); \
10795
+    G_l = _mm_add_epi32(_mm_add_epi32(v0, vy_l), v1); \
10796
+    G_h = _mm_add_epi32(_mm_add_epi32(v2, vy_h), v3); \
10797
+    \
10798
+    R_l = _mm_add_epi32(vy_l, _mm_mullo_epi32(vv_l, v2r_coeff)); \
10799
+    R_h = _mm_add_epi32(vy_h, _mm_mullo_epi32(vv_h, v2r_coeff)); \
10800
+    B_l = _mm_add_epi32(vy_l, _mm_mullo_epi32(vu_l, u2b_coeff)); \
10801
+    B_h = _mm_add_epi32(vy_h, _mm_mullo_epi32(vu_h, u2b_coeff));
10802
+
10803
+#define WITH_ALPHA(...) __VA_ARGS__
10804
+#define NO_ALPHA(...)
10805
+
10806
+#define YUV2RGBWRAPPERXF(ext, fmt, R, B, hasAlpha) \
10807
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \
10808
+                              const int16_t **lumSrc, int lumFilterSize, \
10809
+                              const int16_t *chrFilter, const int16_t **chrUSrc, \
10810
+                              const int16_t **chrVSrc, int chrFilterSize, \
10811
+                              const int16_t **alpSrc, uint8_t *dest, int dstW, \
10812
+                              int y) \
10813
+{ \
10814
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \
10815
+    vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \
10816
+    vec_s32 ystart = _mm_set1_epi32(1 << 9); \
10817
+    vec_s32 uvstart = _mm_set1_epi32((1 << 9) - (128 << 19)); \
10818
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \
10819
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
10820
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \
10821
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \
10822
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \
10823
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \
10824
+    __m128i v0, v1, v2, v3, v4, v5; \
10825
+    int i, j; \
10826
+    INIT_##fmt(R, B) \
10827
+    \
10828
+    for (i = 0; i < dstW; i += 8) { \
10829
+        vy_l = vy_h = ystart; \
10830
+        for (j = 0; j < lumFilterSize; j++) { \
10831
+            v0 = VEC_LD(lumSrc[j] + i); \
10832
+            v1 = _mm_set1_epi16(lumFilter[j]); \
10833
+            v2 = _mm_mullo_epi16(v0, v1); \
10834
+            v3 = _mm_mulhi_epi16(v0, v1); \
10835
+            v0 = _mm_unpacklo_epi16(v2, v3); \
10836
+            v1 = _mm_unpackhi_epi16(v2, v3); \
10837
+            vy_l = _mm_add_epi32(vy_l, v0); \
10838
+            vy_h = _mm_add_epi32(vy_h, v1); \
10839
+        } \
10840
+        vy_l = _mm_srai_epi32(vy_l, 10); \
10841
+        vy_h = _mm_srai_epi32(vy_h, 10); \
10842
+        \
10843
+        vu_l = vu_h = vv_l = vv_h = uvstart; \
10844
+        for (j = 0; j < chrFilterSize; j++) { \
10845
+            v0 = VEC_LD(chrUSrc[j] + i); \
10846
+            v1 = VEC_LD(chrVSrc[j] + i); \
10847
+            v5 = _mm_set1_epi16(chrFilter[j]); \
10848
+            v2 = _mm_mullo_epi16(v0, v5); \
10849
+            v3 = _mm_mulhi_epi16(v0, v5); \
10850
+            v4 = _mm_mullo_epi16(v1, v5); \
10851
+            v5 = _mm_mulhi_epi16(v1, v5); \
10852
+            v0 = _mm_unpacklo_epi16(v2, v3); \
10853
+            v1 = _mm_unpackhi_epi16(v2, v3); \
10854
+            v2 = _mm_unpacklo_epi16(v4, v5); \
10855
+            v3 = _mm_unpackhi_epi16(v4, v5); \
10856
+            vu_l = _mm_add_epi32(vu_l, v0); \
10857
+            vu_h = _mm_add_epi32(vu_h, v1); \
10858
+            vv_l = _mm_add_epi32(vv_l, v2); \
10859
+            vv_h = _mm_add_epi32(vv_h, v3); \
10860
+        } \
10861
+        vu_l = _mm_srai_epi32(vu_l, 10); \
10862
+        vu_h = _mm_srai_epi32(vu_h, 10); \
10863
+        vv_l = _mm_srai_epi32(vv_l, 10); \
10864
+        vv_h = _mm_srai_epi32(vv_h, 10); \
10865
+        \
10866
+        CALC_RGB \
10867
+        WRITE_##fmt(R, B) \
10868
+    } \
10869
+}
10870
+
10871
+#define SETUP(buf, i, alpha, r0, r1) { \
10872
+    v0 = VEC_LD(buf##0 + i); \
10873
+    v1 = VEC_LD(buf##1 + i); \
10874
+    v2 = _mm_unpacklo_epi16(v0, v1); \
10875
+    v3 = _mm_unpackhi_epi16(v0, v1); \
10876
+    r0 = _mm_madd_epi16(v2, alpha); \
10877
+    r1 = _mm_madd_epi16(v3, alpha); \
10878
+}
10879
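+
+/*
+ * SETUP() is the vertical blend between the two source lines: the alpha
+ * constant built by the callers packs (4096 - yalpha, yalpha) into each
+ * 32-bit lane, so the _mm_madd_epi16 of the interleaved inputs gives,
+ * roughly, per output value:
+ *
+ *     r[i] = buf0[i] * (4096 - yalpha) + buf1[i] * yalpha;
+ */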
+
10880
+#define YUV2RGBWRAPPER2F(ext, fmt, R, B, hasAlpha) \
10881
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \
10882
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
10883
+                              const int16_t *abuf[2], uint8_t *dest, int dstW, \
10884
+                              int yalpha, int uvalpha, int y) \
10885
+{ \
10886
+    const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \
10887
+                  *buf0 = buf[0], *buf1 = buf[1], \
10888
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \
10889
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \
10890
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \
10891
+    vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \
10892
+    vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \
10893
+    vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \
10894
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \
10895
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
10896
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \
10897
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \
10898
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \
10899
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \
10900
+    vec_s32 dec128 = _mm_set1_epi32(128 << 19); \
10901
+    hasAlpha(vec_s32 add18 = _mm_set1_epi32(1 << 18);) \
10902
+    __m128i v0, v1, v2, v3, v4, v5; \
10903
+    int i; \
10904
+    INIT_##fmt(R, B) \
10905
+    \
10906
+    av_assert2(yalpha <= 4096U); \
10907
+    av_assert2(uvalpha <= 4096U); \
10908
+    \
10909
+    for (i = 0; i < dstW; i += 8) { \
10910
+        SETUP(buf, i, vyalpha, v0, v1); \
10911
+        vy_l = _mm_srai_epi32(v0, 10); \
10912
+        vy_h = _mm_srai_epi32(v1, 10); \
10913
+        \
10914
+        SETUP(ubuf, i, vuvalpha, v0, v1); \
10915
+        vu_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \
10916
+        vu_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \
10917
+        \
10918
+        SETUP(vbuf, i, vuvalpha, v0, v1); \
10919
+        vv_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \
10920
+        vv_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \
10921
+        \
10922
+        hasAlpha( \
10923
+            SETUP(abuf, i, vyalpha, v0, v1); \
10924
+            A_l = _mm_add_epi32(v0, add18); \
10925
+            A_h = _mm_add_epi32(v1, add18); \
10926
+        ) \
10927
+        \
10928
+        CALC_RGB \
10929
+        WRITE_##fmt(R, B) \
10930
+    } \
10931
+}
10932
+
10933
+#define YUV2RGBWRAPPER1F(ext, fmt, R, B, hasAlpha) \
10934
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \
10935
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
10936
+                              const int16_t *abuf0, uint8_t *dest, int dstW, \
10937
+                              int uvalpha, int y) \
10938
+{ \
10939
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \
10940
+    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \
10941
+    int uvshl = uvalpha < 2048 ? 2 : 1; \
10942
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h; \
10943
+    vec_s32 R_l, R_h, G_l, G_h, B_l, B_h; \
10944
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff * 4); \
10945
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
10946
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff << uvshl); \
10947
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff << uvshl); \
10948
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff << uvshl); \
10949
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff << uvshl); \
10950
+    vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \
10951
+    hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \
10952
+    __m128i v0, v1, v2, v3, v4, v5; \
10953
+    int i; \
10954
+    INIT_##fmt(R, B) \
10955
+    \
10956
+    for (i = 0; i < dstW; i += 8) { \
10957
+        v0 = VEC_LD(buf0 + i); \
10958
+        v2 = _mm_unpacklo_epi16(v0, v0); \
10959
+        v3 = _mm_unpackhi_epi16(v0, v0); \
10960
+        vy_l = _mm_srai_epi32(v2, 16); \
10961
+        vy_h = _mm_srai_epi32(v3, 16); \
10962
+        \
10963
+        v0 = VEC_LD(ubuf0 + i); \
10964
+        v1 = VEC_LD(vbuf0 + i); \
10965
+        if (uvalpha >= 2048) { \
10966
+            v2 = VEC_LD(ubuf1 + i); \
10967
+            v3 = VEC_LD(vbuf1 + i); \
10968
+            v0 = _mm_add_epi16(v0, v2); \
10969
+            v1 = _mm_add_epi16(v1, v3); \
10970
+        } \
10971
+        v0 = _mm_sub_epi16(v0, uvsub); \
10972
+        v1 = _mm_sub_epi16(v1, uvsub); \
10973
+        v2 = _mm_unpacklo_epi16(v0, v0); \
10974
+        v3 = _mm_unpackhi_epi16(v0, v0); \
10975
+        vu_l = _mm_srai_epi32(v2, 16); \
10976
+        vu_h = _mm_srai_epi32(v3, 16); \
10977
+        v2 = _mm_unpacklo_epi16(v1, v1); \
10978
+        v3 = _mm_unpackhi_epi16(v1, v1); \
10979
+        vv_l = _mm_srai_epi32(v2, 16); \
10980
+        vv_h = _mm_srai_epi32(v3, 16); \
10981
+        \
10982
+        hasAlpha( \
10983
+            A = VEC_LD(abuf0 + i); \
10984
+            A = _mm_add_epi16(A, add64); \
10985
+            A = _mm_srai_epi16(A, 7); \
10986
+        ) \
10987
+        \
10988
+        CALC_RGB \
10989
+        WRITE_##fmt(R, B) \
10990
+    } \
10991
+}
10992
+
10993
+YUV2RGBWRAPPERXF(rgbx32_full, RGBX, R, B, NO_ALPHA)
10994
+YUV2RGBWRAPPERXF(bgrx32_full, RGBX, B, R, NO_ALPHA)
10995
+YUV2RGBWRAPPERXF(xrgb32_full, XRGB, R, B, NO_ALPHA)
10996
+YUV2RGBWRAPPERXF(xbgr32_full, XRGB, B, R, NO_ALPHA)
10997
+YUV2RGBWRAPPERXF(rgb24_full, RGB, R, B, NO_ALPHA)
10998
+YUV2RGBWRAPPERXF(bgr24_full, RGB, B, R, NO_ALPHA)
10999
+
11000
+YUV2RGBWRAPPER2F(rgbx32_full, RGBX, R, B, NO_ALPHA)
11001
+YUV2RGBWRAPPER2F(bgrx32_full, RGBX, B, R, NO_ALPHA)
11002
+YUV2RGBWRAPPER2F(xrgb32_full, XRGB, R, B, NO_ALPHA)
11003
+YUV2RGBWRAPPER2F(xbgr32_full, XRGB, B, R, NO_ALPHA)
11004
+YUV2RGBWRAPPER2F(rgb24_full, RGB, R, B, NO_ALPHA)
11005
+YUV2RGBWRAPPER2F(bgr24_full, RGB, B, R, NO_ALPHA)
11006
+
11007
+YUV2RGBWRAPPER1F(rgbx32_full, RGBX, R, B, NO_ALPHA)
11008
+YUV2RGBWRAPPER1F(bgrx32_full, RGBX, B, R, NO_ALPHA)
11009
+YUV2RGBWRAPPER1F(xrgb32_full, XRGB, R, B, NO_ALPHA)
11010
+YUV2RGBWRAPPER1F(xbgr32_full, XRGB, B, R, NO_ALPHA)
11011
+YUV2RGBWRAPPER1F(rgb24_full, RGB, R, B, NO_ALPHA)
11012
+YUV2RGBWRAPPER1F(bgr24_full, RGB, B, R, NO_ALPHA)
11013
+
11014
+#if 1 // performance
11015
+
11016
+#define INIT2_RGB(R, B) \
11017
+    __m128i perm_unp8 = _mm_setr_epi8( \
11018
+        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); \
11019
+    __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \
11020
+    __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \
11021
+    __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15);
11022
+
11023
+#define INIT2_RGBX(R, B) INIT2_XRGB(R, B)
11024
+#define INIT2_XRGB(R, B) \
11025
+    __m128i A_l = _mm_set1_epi16(255); \
11026
+    __m128i perm_unp8 = _mm_setr_epi8( \
11027
+        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
11028
+
11029
+#define WRITE2_RGB(R, B) \
11030
+    v4 = _mm_packus_epi16(R##_l, G_l); \
11031
+    v5 = _mm_packus_epi16(B##_l, B##_l); \
11032
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
11033
+    v1 = _mm_unpacklo_epi8(v5, v5); \
11034
+    v2 = _mm_unpacklo_epi16(v0, v1); \
11035
+    v3 = _mm_unpackhi_epi16(v0, v1); \
11036
+    { \
11037
+        union { __m128i v; __m64 d[2]; } a = { v2 }, b = { v3 }; \
11038
+        __m64 *p = (__m64*)dest; \
11039
+        p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \
11040
+        p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \
11041
+        p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \
11042
+        dest += 24; \
11043
+    }
11044
+
11045
+#define WRITE2_RGBX(R, B) \
11046
+    v4 = _mm_packus_epi16(R##_l, G_l); \
11047
+    v5 = _mm_packus_epi16(B##_l, A_l); \
11048
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
11049
+    v1 = _mm_shuffle_epi8(v5, perm_unp8); \
11050
+    v2 = _mm_unpacklo_epi16(v0, v1); \
11051
+    v3 = _mm_unpackhi_epi16(v0, v1); \
11052
+    VEC_ST(dest, v2); \
11053
+    VEC_ST(dest + 16, v3); \
11054
+    dest += 32;
11055
+
11056
+#define WRITE2_XRGB(R, B) \
11057
+    v4 = _mm_packus_epi16(A_l, R##_l); \
11058
+    v5 = _mm_packus_epi16(G_l, B##_l); \
11059
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
11060
+    v1 = _mm_shuffle_epi8(v5, perm_unp8); \
11061
+    v2 = _mm_unpacklo_epi16(v0, v1); \
11062
+    v3 = _mm_unpackhi_epi16(v0, v1); \
11063
+    VEC_ST(dest, v2); \
11064
+    VEC_ST(dest + 16, v3); \
11065
+    dest += 32;
11066
+
11067
+#define CALC2_RGB \
11068
+    vy_l = _mm_mulhrs_epi16(_mm_sub_epi16(vy_l, y_sub), y_coeff); \
11069
+    \
11070
+    v0 = _mm_mulhrs_epi16(vv_l, v2g_coeff); \
11071
+    v1 = _mm_mulhrs_epi16(vu_l, u2g_coeff); \
11072
+    G_l = _mm_add_epi16(_mm_add_epi16(v0, vy_l), v1); \
11073
+    \
11074
+    R_l = _mm_add_epi16(vy_l, _mm_mulhrs_epi16(vv_l, v2r_coeff)); \
11075
+    B_l = _mm_add_epi16(vy_l, _mm_mulhrs_epi16(vu_l, u2b_coeff));
11076
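+
+/*
+ * CALC2_RGB is the 16-bit "performance" variant.  _mm_mulhrs_epi16(a, b)
+ * computes (a * b + 0x4000) >> 15 per lane, so a rough scalar sketch of one
+ * pixel (Y/U/V being the 16-bit values prepared by the wrappers) is:
+ *
+ *     int y = ((Y - y_sub) * y_coeff + 0x4000) >> 15;
+ *     int R = y + ((V * v2r_coeff + 0x4000) >> 15);
+ *     int G = y + ((V * v2g_coeff + 0x4000) >> 15)
+ *               + ((U * u2g_coeff + 0x4000) >> 15);
+ *     int B = y + ((U * u2b_coeff + 0x4000) >> 15);
+ */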
+
11077
+#define YUV2RGBWRAPPERX(ext, fmt, R, B, hasAlpha) \
11078
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \
11079
+                              const int16_t **lumSrc, int lumFilterSize, \
11080
+                              const int16_t *chrFilter, const int16_t **chrUSrc, \
11081
+                              const int16_t **chrVSrc, int chrFilterSize, \
11082
+                              const int16_t **alpSrc, uint8_t *dest, int dstW, \
11083
+                              int y) \
11084
+{ \
11085
+    vec_s32 vy_l, vy_h, vu_l, vv_l, vu2_l, vu2_h, vv2_l, vv2_h; \
11086
+    vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \
11087
+    vec_s32 ystart = _mm_set1_epi32(0); \
11088
+    vec_s32 uvstart = _mm_set1_epi32(-(128 << 19)); \
11089
+    vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \
11090
+    vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \
11091
+    vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \
11092
+    vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \
11093
+    vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \
11094
+    vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \
11095
+    __m128i v0, v1, v2, v3, v4, v5; \
11096
+    int i, j; \
11097
+    INIT2_##fmt(R, B) \
11098
+    \
11099
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11100
+        vy_l = vy_h = ystart; \
11101
+        for (j = 0; j < lumFilterSize; j++) { \
11102
+            v0 = VEC_LD(lumSrc[j] + i * 2); \
11103
+            v1 = _mm_set1_epi16(lumFilter[j]); \
11104
+            v2 = _mm_mullo_epi16(v0, v1); \
11105
+            v3 = _mm_mulhi_epi16(v0, v1); \
11106
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11107
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11108
+            vy_l = _mm_add_epi32(vy_l, v0); \
11109
+            vy_h = _mm_add_epi32(vy_h, v1); \
11110
+        } \
11111
+        vy_l = _mm_srai_epi32(vy_l, 17); \
11112
+        vy_h = _mm_srai_epi32(vy_h, 17); \
11113
+        \
11114
+        vu2_l = vu2_h = vv2_l = vv2_h = uvstart; \
11115
+        for (j = 0; j < chrFilterSize; j++) { \
11116
+            v0 = VEC_LD(chrUSrc[j] + i); \
11117
+            v1 = VEC_LD(chrVSrc[j] + i); \
11118
+            v5 = _mm_set1_epi16(chrFilter[j]); \
11119
+            v2 = _mm_mullo_epi16(v0, v5); \
11120
+            v3 = _mm_mulhi_epi16(v0, v5); \
11121
+            v4 = _mm_mullo_epi16(v1, v5); \
11122
+            v5 = _mm_mulhi_epi16(v1, v5); \
11123
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11124
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11125
+            v2 = _mm_unpacklo_epi16(v4, v5); \
11126
+            v3 = _mm_unpackhi_epi16(v4, v5); \
11127
+            vu2_l = _mm_add_epi32(vu2_l, v0); \
11128
+            vu2_h = _mm_add_epi32(vu2_h, v1); \
11129
+            vv2_l = _mm_add_epi32(vv2_l, v2); \
11130
+            vv2_h = _mm_add_epi32(vv2_h, v3); \
11131
+        } \
11132
+        vu2_l = _mm_srai_epi32(vu2_l, 17); \
11133
+        vu2_h = _mm_srai_epi32(vu2_h, 17); \
11134
+        vv2_l = _mm_srai_epi32(vv2_l, 17); \
11135
+        vv2_h = _mm_srai_epi32(vv2_h, 17); \
11136
+        vu2_l = _mm_packs_epi32(vu2_l, vu2_h); \
11137
+        vv2_l = _mm_packs_epi32(vv2_l, vv2_h); \
11138
+        \
11139
+        vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \
11140
+        vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \
11141
+        vy_l = _mm_packs_epi32(vy_l, vy_h); \
11142
+        \
11143
+        CALC2_RGB \
11144
+        WRITE2_##fmt(R, B) \
11145
+        \
11146
+        vy_l = vy_h = ystart; \
11147
+        for (j = 0; j < lumFilterSize; j++) { \
11148
+            v0 = VEC_LD(lumSrc[j] + i * 2 + 8); \
11149
+            v1 = _mm_set1_epi16(lumFilter[j]); \
11150
+            v2 = _mm_mullo_epi16(v0, v1); \
11151
+            v3 = _mm_mulhi_epi16(v0, v1); \
11152
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11153
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11154
+            vy_l = _mm_add_epi32(vy_l, v0); \
11155
+            vy_h = _mm_add_epi32(vy_h, v1); \
11156
+        } \
11157
+        vy_l = _mm_srai_epi32(vy_l, 17); \
11158
+        vy_h = _mm_srai_epi32(vy_h, 17); \
11159
+        \
11160
+        vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \
11161
+        vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \
11162
+        vy_l = _mm_packs_epi32(vy_l, vy_h); \
11163
+        \
11164
+        CALC2_RGB \
11165
+        WRITE2_##fmt(R, B) \
11166
+    } \
11167
+}
11168
+
11169
+#define SETUP2(buf, i, alpha, r0) { \
11170
+    v0 = VEC_LD(buf##0 + i); \
11171
+    v1 = VEC_LD(buf##1 + i); \
11172
+    v1 = _mm_subs_epi16(v0, v1); \
11173
+    v1 = _mm_mulhrs_epi16(v1, alpha); \
11174
+    r0 = _mm_add_epi16(v0, v1); \
11175
+}
11176
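+
+/*
+ * SETUP2() is the 16-bit version of the same two-line blend.  With
+ * alpha = -(yalpha << 3), the _mm_mulhrs_epi16 effectively divides by 4096
+ * with rounding, so roughly:
+ *
+ *     r[i] = buf0[i] - ((buf0[i] - buf1[i]) * yalpha) / 4096;
+ */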
+
11177
+#define YUV2RGBWRAPPER2(ext, fmt, R, B, hasAlpha) \
11178
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \
11179
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11180
+                              const int16_t *abuf[2], uint8_t *dest, int dstW, \
11181
+                              int yalpha, int uvalpha, int y) \
11182
+{ \
11183
+    const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \
11184
+                  *buf0 = buf[0], *buf1 = buf[1], \
11185
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \
11186
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \
11187
+    vec_s32 vy_l, vu_l, vv_l, vu2_l, vv2_l; \
11188
+    vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \
11189
+    vec_s16 vyalpha = _mm_set1_epi16(-yalpha << 3); \
11190
+    vec_s16 vuvalpha = _mm_set1_epi16(-uvalpha << 3); \
11191
+    vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \
11192
+    vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \
11193
+    vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \
11194
+    vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \
11195
+    vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \
11196
+    vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \
11197
+    vec_s32 dec128 = _mm_set1_epi16(128 << 2); \
11198
+    __m128i v0, v1, v2, v3, v4, v5; \
11199
+    int i; \
11200
+    INIT2_##fmt(R, B) \
11201
+    \
11202
+    av_assert2(yalpha <= 4096U); \
11203
+    av_assert2(uvalpha <= 4096U); \
11204
+    \
11205
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11206
+        SETUP2(buf, i * 2, vyalpha, vy_l); \
11207
+        vy_l = _mm_srai_epi16(vy_l, 5); \
11208
+        \
11209
+        SETUP2(ubuf, i, vuvalpha, vu2_l); \
11210
+        vu2_l = _mm_srai_epi16(vu2_l, 5); \
11211
+        \
11212
+        SETUP2(vbuf, i, vuvalpha, vv2_l); \
11213
+        vv2_l = _mm_srai_epi16(vv2_l, 5); \
11214
+        \
11215
+        vu2_l = _mm_sub_epi16(vu2_l, dec128); \
11216
+        vv2_l = _mm_sub_epi16(vv2_l, dec128); \
11217
+        \
11218
+        hasAlpha( \
11219
+            SETUP2(abuf, i * 2, vyalpha, A_l); \
11220
+        ) \
11221
+        \
11222
+        vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \
11223
+        vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \
11224
+        \
11225
+        CALC2_RGB \
11226
+        WRITE2_##fmt(R, B) \
11227
+        \
11228
+        SETUP2(buf, i * 2 + 8, vyalpha, vy_l); \
11229
+        vy_l = _mm_srai_epi16(vy_l, 5); \
11230
+        \
11231
+        hasAlpha( \
11232
+            SETUP2(abuf, i * 2 + 8, vyalpha, A_l); \
11233
+        ) \
11234
+        \
11235
+        vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \
11236
+        vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \
11237
+        \
11238
+        CALC2_RGB \
11239
+        WRITE2_##fmt(R, B) \
11240
+    } \
11241
+}
11242
+
11243
+#define YUV2RGBWRAPPER1(ext, fmt, R, B, hasAlpha) \
11244
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \
11245
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11246
+                              const int16_t *abuf0, uint8_t *dest, int dstW, \
11247
+                              int uvalpha, int y) \
11248
+{ \
11249
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \
11250
+    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \
11251
+    int uvshr = uvalpha < 2048 ? 5 : 6; \
11252
+    vec_s32 vy_l, vu_l, vv_l, vu2_l, vv2_l; \
11253
+    vec_s32 hasAlpha(A_l,) R_l, G_l, B_l; \
11254
+    vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \
11255
+    vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \
11256
+    vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \
11257
+    vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \
11258
+    vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \
11259
+    vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \
11260
+    vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \
11261
+    hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \
11262
+    __m128i v0, v1, v2, v3, v4, v5; \
11263
+    int i; \
11264
+    INIT2_##fmt(R, B) \
11265
+    \
11266
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11267
+        v0 = VEC_LD(buf0 + i * 2); \
11268
+        vy_l = _mm_srai_epi16(v0, 5); \
11269
+        \
11270
+        v0 = VEC_LD(ubuf0 + i); \
11271
+        v1 = VEC_LD(vbuf0 + i); \
11272
+        if (uvalpha >= 2048) { \
11273
+            v2 = VEC_LD(ubuf1 + i); \
11274
+            v3 = VEC_LD(vbuf1 + i); \
11275
+            v0 = _mm_add_epi16(v0, v2); \
11276
+            v1 = _mm_add_epi16(v1, v3); \
11277
+        } \
11278
+        v0 = _mm_sub_epi16(v0, uvsub); \
11279
+        v1 = _mm_sub_epi16(v1, uvsub); \
11280
+        vu2_l = _mm_srai_epi16(v0, uvshr); \
11281
+        vv2_l = _mm_srai_epi16(v1, uvshr); \
11282
+        \
11283
+        hasAlpha( \
11284
+            A_l = VEC_LD(abuf0 + i * 2); \
11285
+            A_l = _mm_add_epi16(A_l, add64); \
11286
+            A_l = _mm_srai_epi16(A_l, 7); \
11287
+        ) \
11288
+        \
11289
+        vu_l = _mm_unpacklo_epi16(vu2_l, vu2_l); \
11290
+        vv_l = _mm_unpacklo_epi16(vv2_l, vv2_l); \
11291
+        \
11292
+        CALC2_RGB \
11293
+        WRITE2_##fmt(R, B) \
11294
+        \
11295
+        v0 = VEC_LD(buf0 + i * 2 + 8); \
11296
+        vy_l = _mm_srai_epi16(v0, 5); \
11297
+        \
11298
+        hasAlpha( \
11299
+            A_l = VEC_LD(abuf0 + i * 2 + 8); \
11300
+            A_l = _mm_add_epi16(A_l, add64); \
11301
+            A_l = _mm_srai_epi16(A_l, 7); \
11302
+        ) \
11303
+        \
11304
+        vu_l = _mm_unpackhi_epi16(vu2_l, vu2_l); \
11305
+        vv_l = _mm_unpackhi_epi16(vv2_l, vv2_l); \
11306
+        \
11307
+        CALC2_RGB \
11308
+        WRITE2_##fmt(R, B) \
11309
+    } \
11310
+}
11311
+
11312
+#else // quality
11313
+
11314
+#define YUV2RGBWRAPPERX(ext, fmt, R, B, hasAlpha) \
11315
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \
11316
+                              const int16_t **lumSrc, int lumFilterSize, \
11317
+                              const int16_t *chrFilter, const int16_t **chrUSrc, \
11318
+                              const int16_t **chrVSrc, int chrFilterSize, \
11319
+                              const int16_t **alpSrc, uint8_t *dest, int dstW, \
11320
+                              int y) \
11321
+{ \
11322
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \
11323
+    vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \
11324
+    vec_s32 ystart = _mm_set1_epi32(1 << 9); \
11325
+    vec_s32 uvstart = _mm_set1_epi32((1 << 9) - (128 << 19)); \
11326
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \
11327
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
11328
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \
11329
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \
11330
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \
11331
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \
11332
+    __m128i v0, v1, v2, v3, v4, v5; \
11333
+    int i, j; \
11334
+    INIT_##fmt(R, B) \
11335
+    \
11336
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11337
+        vy_l = vy_h = ystart; \
11338
+        for (j = 0; j < lumFilterSize; j++) { \
11339
+            v0 = VEC_LD(lumSrc[j] + i * 2); \
11340
+            v1 = _mm_set1_epi16(lumFilter[j]); \
11341
+            v2 = _mm_mullo_epi16(v0, v1); \
11342
+            v3 = _mm_mulhi_epi16(v0, v1); \
11343
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11344
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11345
+            vy_l = _mm_add_epi32(vy_l, v0); \
11346
+            vy_h = _mm_add_epi32(vy_h, v1); \
11347
+        } \
11348
+        vy_l = _mm_srai_epi32(vy_l, 10); \
11349
+        vy_h = _mm_srai_epi32(vy_h, 10); \
11350
+        \
11351
+        vu2_l = vu2_h = vv2_l = vv2_h = uvstart; \
11352
+        for (j = 0; j < chrFilterSize; j++) { \
11353
+            v0 = VEC_LD(chrUSrc[j] + i); \
11354
+            v1 = VEC_LD(chrVSrc[j] + i); \
11355
+            v5 = _mm_set1_epi16(chrFilter[j]); \
11356
+            v2 = _mm_mullo_epi16(v0, v5); \
11357
+            v3 = _mm_mulhi_epi16(v0, v5); \
11358
+            v4 = _mm_mullo_epi16(v1, v5); \
11359
+            v5 = _mm_mulhi_epi16(v1, v5); \
11360
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11361
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11362
+            v2 = _mm_unpacklo_epi16(v4, v5); \
11363
+            v3 = _mm_unpackhi_epi16(v4, v5); \
11364
+            vu2_l = _mm_add_epi32(vu2_l, v0); \
11365
+            vu2_h = _mm_add_epi32(vu2_h, v1); \
11366
+            vv2_l = _mm_add_epi32(vv2_l, v2); \
11367
+            vv2_h = _mm_add_epi32(vv2_h, v3); \
11368
+        } \
11369
+        vu2_l = _mm_srai_epi32(vu2_l, 10); \
11370
+        vu2_h = _mm_srai_epi32(vu2_h, 10); \
11371
+        vv2_l = _mm_srai_epi32(vv2_l, 10); \
11372
+        vv2_h = _mm_srai_epi32(vv2_h, 10); \
11373
+        \
11374
+        vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \
11375
+        vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \
11376
+        vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \
11377
+        vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \
11378
+        \
11379
+        CALC_RGB \
11380
+        WRITE_##fmt(R, B) \
11381
+        \
11382
+        vy_l = vy_h = ystart; \
11383
+        for (j = 0; j < lumFilterSize; j++) { \
11384
+            v0 = VEC_LD(lumSrc[j] + i * 2 + 8); \
11385
+            v1 = _mm_set1_epi16(lumFilter[j]); \
11386
+            v2 = _mm_mullo_epi16(v0, v1); \
11387
+            v3 = _mm_mulhi_epi16(v0, v1); \
11388
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11389
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11390
+            vy_l = _mm_add_epi32(vy_l, v0); \
11391
+            vy_h = _mm_add_epi32(vy_h, v1); \
11392
+        } \
11393
+        vy_l = _mm_srai_epi32(vy_l, 10); \
11394
+        vy_h = _mm_srai_epi32(vy_h, 10); \
11395
+        \
11396
+        vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \
11397
+        vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \
11398
+        vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \
11399
+        vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \
11400
+        \
11401
+        CALC_RGB \
11402
+        WRITE_##fmt(R, B) \
11403
+    } \
11404
+}
11405
+
11406
+#define YUV2RGBWRAPPER2(ext, fmt, R, B, hasAlpha) \
11407
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \
11408
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11409
+                              const int16_t *abuf[2], uint8_t *dest, int dstW, \
11410
+                              int yalpha, int uvalpha, int y) \
11411
+{ \
11412
+    const int16_t hasAlpha(*abuf0 = abuf[0], *abuf1 = abuf[1],) \
11413
+                  *buf0 = buf[0], *buf1 = buf[1], \
11414
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \
11415
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \
11416
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \
11417
+    vec_s32 hasAlpha(A_l, A_h,) R_l, R_h, G_l, G_h, B_l, B_h; \
11418
+    vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \
11419
+    vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \
11420
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff); \
11421
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
11422
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff); \
11423
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff); \
11424
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff); \
11425
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff); \
11426
+    vec_s32 dec128 = _mm_set1_epi32(128 << 19); \
11427
+    hasAlpha(vec_s32 add18 = _mm_set1_epi32(1 << 18);) \
11428
+    __m128i v0, v1, v2, v3, v4, v5; \
11429
+    int i; \
11430
+    INIT_##fmt(R, B) \
11431
+    \
11432
+    av_assert2(yalpha <= 4096U); \
11433
+    av_assert2(uvalpha <= 4096U); \
11434
+    \
11435
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11436
+        SETUP(buf, i * 2, vyalpha, v0, v1); \
11437
+        vy_l = _mm_srai_epi32(v0, 10); \
11438
+        vy_h = _mm_srai_epi32(v1, 10); \
11439
+        \
11440
+        SETUP(ubuf, i, vuvalpha, v0, v1); \
11441
+        vu2_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \
11442
+        vu2_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \
11443
+        \
11444
+        SETUP(vbuf, i, vuvalpha, v0, v1); \
11445
+        vv2_l = _mm_srai_epi32(_mm_sub_epi32(v0, dec128), 10); \
11446
+        vv2_h = _mm_srai_epi32(_mm_sub_epi32(v1, dec128), 10); \
11447
+        \
11448
+        hasAlpha( \
11449
+            SETUP(abuf, i * 2, vyalpha, v0, v1); \
11450
+            A_l = _mm_add_epi32(v0, add18); \
11451
+            A_h = _mm_add_epi32(v1, add18); \
11452
+        ) \
11453
+        \
11454
+        vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \
11455
+        vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \
11456
+        vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \
11457
+        vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \
11458
+        \
11459
+        CALC_RGB \
11460
+        WRITE_##fmt(R, B) \
11461
+        \
11462
+        SETUP(buf, i * 2 + 8, vyalpha, v0, v1); \
11463
+        vy_l = _mm_srai_epi32(v0, 10); \
11464
+        vy_h = _mm_srai_epi32(v1, 10); \
11465
+        \
11466
+        hasAlpha( \
11467
+            SETUP(abuf, i * 2 + 8, vyalpha, v0, v1); \
11468
+            A_l = _mm_add_epi32(v0, add18); \
11469
+            A_h = _mm_add_epi32(v1, add18); \
11470
+        ) \
11471
+        \
11472
+        vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \
11473
+        vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \
11474
+        vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \
11475
+        vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \
11476
+        \
11477
+        CALC_RGB \
11478
+        WRITE_##fmt(R, B) \
11479
+    } \
11480
+}
11481
+
11482
+#define YUV2RGBWRAPPER1(ext, fmt, R, B, hasAlpha) \
11483
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \
11484
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11485
+                              const int16_t *abuf0, uint8_t *dest, int dstW, \
11486
+                              int uvalpha, int y) \
11487
+{ \
11488
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \
11489
+    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \
11490
+    int uvshl = uvalpha < 2048 ? 2 : 1; \
11491
+    vec_s32 vy_l, vy_h, vu_l, vu_h, vv_l, vv_h, vu2_l, vu2_h, vv2_l, vv2_h; \
11492
+    vec_s32 R_l, R_h, G_l, G_h, B_l, B_h; \
11493
+    vec_s32 y_coeff = _mm_set1_epi32(c->yuv2rgb_y_coeff * 4); \
11494
+    vec_s32 y_add = _mm_set1_epi32((1 << 21) - c->yuv2rgb_y_offset * c->yuv2rgb_y_coeff); \
11495
+    vec_s32 v2r_coeff = _mm_set1_epi32(c->yuv2rgb_v2r_coeff << uvshl); \
11496
+    vec_s32 v2g_coeff = _mm_set1_epi32(c->yuv2rgb_v2g_coeff << uvshl); \
11497
+    vec_s32 u2g_coeff = _mm_set1_epi32(c->yuv2rgb_u2g_coeff << uvshl); \
11498
+    vec_s32 u2b_coeff = _mm_set1_epi32(c->yuv2rgb_u2b_coeff << uvshl); \
11499
+    vec_u16 uvsub = _mm_set1_epi16(uvalpha < 2048 ? 128 << 7 : 128 << 8); \
11500
+    hasAlpha(vec_s16 A, add64 = _mm_set1_epi16(64);) \
11501
+    __m128i v0, v1, v2, v3, v4, v5; \
11502
+    int i; \
11503
+    INIT_##fmt(R, B) \
11504
+    \
11505
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11506
+        v0 = VEC_LD(buf0 + i * 2); \
11507
+        v2 = _mm_unpacklo_epi16(v0, v0); \
11508
+        v3 = _mm_unpackhi_epi16(v0, v0); \
11509
+        vy_l = _mm_srai_epi32(v2, 16); \
11510
+        vy_h = _mm_srai_epi32(v3, 16); \
11511
+        \
11512
+        v0 = VEC_LD(ubuf0 + i); \
11513
+        v1 = VEC_LD(vbuf0 + i); \
11514
+        if (uvalpha >= 2048) { \
11515
+            v2 = VEC_LD(ubuf1 + i); \
11516
+            v3 = VEC_LD(vbuf1 + i); \
11517
+            v0 = _mm_add_epi16(v0, v2); \
11518
+            v1 = _mm_add_epi16(v1, v3); \
11519
+        } \
11520
+        v0 = _mm_sub_epi16(v0, uvsub); \
11521
+        v1 = _mm_sub_epi16(v1, uvsub); \
11522
+        v2 = _mm_unpacklo_epi16(v0, v0); \
11523
+        v3 = _mm_unpackhi_epi16(v0, v0); \
11524
+        vu2_l = _mm_srai_epi32(v2, 16); \
11525
+        vu2_h = _mm_srai_epi32(v3, 16); \
11526
+        v2 = _mm_unpacklo_epi16(v1, v1); \
11527
+        v3 = _mm_unpackhi_epi16(v1, v1); \
11528
+        vv2_l = _mm_srai_epi32(v2, 16); \
11529
+        vv2_h = _mm_srai_epi32(v3, 16); \
11530
+        \
11531
+        hasAlpha( \
11532
+            A_l = VEC_LD(abuf0 + i * 2); \
11533
+            A_l = _mm_add_epi16(A_l, add64); \
11534
+            A_l = _mm_srai_epi16(A_l, 7); \
11535
+        ) \
11536
+        \
11537
+        vu_l = _mm_unpacklo_epi32(vu2_l, vu2_l); \
11538
+        vu_h = _mm_unpackhi_epi32(vu2_l, vu2_l); \
11539
+        vv_l = _mm_unpacklo_epi32(vv2_l, vv2_l); \
11540
+        vv_h = _mm_unpackhi_epi32(vv2_l, vv2_l); \
11541
+        \
11542
+        CALC_RGB \
11543
+        WRITE_##fmt(R, B) \
11544
+        \
11545
+        v0 = VEC_LD(buf0 + i * 2 + 8); \
11546
+        v2 = _mm_unpacklo_epi16(v0, v0); \
11547
+        v3 = _mm_unpackhi_epi16(v0, v0); \
11548
+        vy_l = _mm_srai_epi32(v2, 16); \
11549
+        vy_h = _mm_srai_epi32(v3, 16); \
11550
+        \
11551
+        hasAlpha( \
11552
+            A_l = VEC_LD(abuf0 + i * 2 + 8); \
11553
+            A_l = _mm_add_epi16(A_l, add64); \
11554
+            A_l = _mm_srai_epi16(A_l, 7); \
11555
+        ) \
11556
+        \
11557
+        vu_l = _mm_unpacklo_epi32(vu2_h, vu2_h); \
11558
+        vu_h = _mm_unpackhi_epi32(vu2_h, vu2_h); \
11559
+        vv_l = _mm_unpacklo_epi32(vv2_h, vv2_h); \
11560
+        vv_h = _mm_unpackhi_epi32(vv2_h, vv2_h); \
11561
+        \
11562
+        CALC_RGB \
11563
+        WRITE_##fmt(R, B) \
11564
+    } \
11565
+}
11566
+
11567
+#endif
11568
+
11569
+YUV2RGBWRAPPERX(rgbx32, RGBX, R, B, NO_ALPHA)
11570
+YUV2RGBWRAPPERX(bgrx32, RGBX, B, R, NO_ALPHA)
11571
+YUV2RGBWRAPPERX(xrgb32, XRGB, R, B, NO_ALPHA)
11572
+YUV2RGBWRAPPERX(xbgr32, XRGB, B, R, NO_ALPHA)
11573
+YUV2RGBWRAPPERX(rgb24, RGB, R, B, NO_ALPHA)
11574
+YUV2RGBWRAPPERX(bgr24, RGB, B, R, NO_ALPHA)
11575
+
11576
+YUV2RGBWRAPPER2(rgbx32, RGBX, R, B, NO_ALPHA)
11577
+YUV2RGBWRAPPER2(bgrx32, RGBX, B, R, NO_ALPHA)
11578
+YUV2RGBWRAPPER2(xrgb32, XRGB, R, B, NO_ALPHA)
11579
+YUV2RGBWRAPPER2(xbgr32, XRGB, B, R, NO_ALPHA)
11580
+YUV2RGBWRAPPER2(rgb24, RGB, R, B, NO_ALPHA)
11581
+YUV2RGBWRAPPER2(bgr24, RGB, B, R, NO_ALPHA)
11582
+
11583
+YUV2RGBWRAPPER1(rgbx32, RGBX, R, B, NO_ALPHA)
11584
+YUV2RGBWRAPPER1(bgrx32, RGBX, B, R, NO_ALPHA)
11585
+YUV2RGBWRAPPER1(xrgb32, XRGB, R, B, NO_ALPHA)
11586
+YUV2RGBWRAPPER1(xbgr32, XRGB, B, R, NO_ALPHA)
11587
+YUV2RGBWRAPPER1(rgb24, RGB, R, B, NO_ALPHA)
11588
+YUV2RGBWRAPPER1(bgr24, RGB, B, R, NO_ALPHA)
11589
+
11590
+#define WRITE_422(vu, vv, x0, x1) \
11591
+    vy0 = _mm_srai_epi32(vy0, 19); \
11592
+    vy1 = _mm_srai_epi32(vy1, 19); \
11593
+    vy2 = _mm_srai_epi32(vy2, 19); \
11594
+    vy3 = _mm_srai_epi32(vy3, 19); \
11595
+    vu##0 = _mm_srai_epi32(vu##0, 19); \
11596
+    vu##1 = _mm_srai_epi32(vu##1, 19); \
11597
+    vv##0 = _mm_srai_epi32(vv##0, 19 - 16); \
11598
+    vv##1 = _mm_srai_epi32(vv##1, 19 - 16); \
11599
+    v0 = _mm_packs_epi32(vy0, vy1); \
11600
+    v1 = _mm_packs_epi32(vy2, vy3); \
11601
+    v2 = _mm_blend_epi16(vu##0, vv##0, 0xaa); \
11602
+    v3 = _mm_blend_epi16(vu##1, vv##1, 0xaa); \
11603
+    v4 = _mm_packus_epi16(v0, v1); \
11604
+    v5 = _mm_packus_epi16(v2, v3); \
11605
+    v0 = _mm_unpacklo_epi8(x0, x1); \
11606
+    v1 = _mm_unpackhi_epi8(x0, x1); \
11607
+    VEC_ST(dest, v0); \
11608
+    VEC_ST(dest + 16, v1); \
11609
+    dest += 32;
11610
+
11611
+#define WRITE_YUYV422 WRITE_422(vu, vv, v4, v5)
11612
+#define WRITE_YVYU422 WRITE_422(vv, vu, v4, v5)
11613
+#define WRITE_UYVY422 WRITE_422(vu, vv, v5, v4)
11614
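+
+/*
+ * Layout note for WRITE_422: v4 ends up holding 16 packed luma bytes and v5
+ * the chroma bytes already interleaved as U0 V0 U1 V1 ..., so a byte-wise
+ * unpack of (v4, v5) emits Y0 U0 Y1 V0 ... (YUYV), swapping the unpack
+ * operands gives UYVY, and passing (vv, vu) into the macro swaps the chroma
+ * planes to produce YVYU.
+ */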
+
11615
+#define YUV2PACKEDWRAPPERX(ext, fmt) \
11616
+static void yuv2##ext##_X_e2k(SwsContext *c, const int16_t *lumFilter, \
11617
+                              const int16_t **lumSrc, int lumFilterSize, \
11618
+                              const int16_t *chrFilter, const int16_t **chrUSrc, \
11619
+                              const int16_t **chrVSrc, int chrFilterSize, \
11620
+                              const int16_t **alpSrc, uint8_t *dest, int dstW, \
11621
+                              int y) \
11622
+{ \
11623
+    int i, j; \
11624
+    __m128i vy0, vy1, vy2, vy3, vu0, vu1, vv0, vv1; \
11625
+    __m128i v0, v1, v2, v3, v4, v5; \
11626
+    vec_s32 start = _mm_set1_epi32(1 << 18); \
11627
+    \
11628
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11629
+        vy0 = vy1 = vy2 = vy3 = start; \
11630
+        for (j = 0; j < lumFilterSize; j++) { \
11631
+            v0 = VEC_LD(lumSrc[j] + i * 2); \
11632
+            v1 = VEC_LD(lumSrc[j] + i * 2 + 8); \
11633
+            v5 = _mm_set1_epi16(lumFilter[j]); \
11634
+            v2 = _mm_mullo_epi16(v0, v5); \
11635
+            v3 = _mm_mulhi_epi16(v0, v5); \
11636
+            v4 = _mm_mullo_epi16(v1, v5); \
11637
+            v5 = _mm_mulhi_epi16(v1, v5); \
11638
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11639
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11640
+            v2 = _mm_unpacklo_epi16(v4, v5); \
11641
+            v3 = _mm_unpackhi_epi16(v4, v5); \
11642
+            vy0 = _mm_add_epi32(vy0, v0); \
11643
+            vy1 = _mm_add_epi32(vy1, v1); \
11644
+            vy2 = _mm_add_epi32(vy2, v2); \
11645
+            vy3 = _mm_add_epi32(vy3, v3); \
11646
+        } \
11647
+        \
11648
+        vu0 = vu1 = vv0 = vv1 = start; \
11649
+        for (j = 0; j < chrFilterSize; j++) { \
11650
+            v0 = VEC_LD(chrUSrc[j] + i); \
11651
+            v1 = VEC_LD(chrVSrc[j] + i); \
11652
+            v5 = _mm_set1_epi16(chrFilter[j]); \
11653
+            v2 = _mm_mullo_epi16(v0, v5); \
11654
+            v3 = _mm_mulhi_epi16(v0, v5); \
11655
+            v4 = _mm_mullo_epi16(v1, v5); \
11656
+            v5 = _mm_mulhi_epi16(v1, v5); \
11657
+            v0 = _mm_unpacklo_epi16(v2, v3); \
11658
+            v1 = _mm_unpackhi_epi16(v2, v3); \
11659
+            v2 = _mm_unpacklo_epi16(v4, v5); \
11660
+            v3 = _mm_unpackhi_epi16(v4, v5); \
11661
+            vu0 = _mm_add_epi32(vu0, v0); \
11662
+            vu1 = _mm_add_epi32(vu1, v1); \
11663
+            vv0 = _mm_add_epi32(vv0, v2); \
11664
+            vv1 = _mm_add_epi32(vv1, v3); \
11665
+        } \
11666
+        \
11667
+        WRITE_##fmt##422 \
11668
+    } \
11669
+}
11670
+
11671
+#define YUV2PACKEDWRAPPER2(ext, fmt) \
11672
+static void yuv2##ext##_2_e2k(SwsContext *c, const int16_t *buf[2], \
11673
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11674
+                              const int16_t *abuf[2], uint8_t *dest, int dstW, \
11675
+                              int yalpha, int uvalpha, int y) \
11676
+{ \
11677
+    const int16_t *buf0 = buf[0], *buf1 = buf[1], \
11678
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], \
11679
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; \
11680
+    vec_s16 vyalpha = _mm_set1_epi32(4096 + yalpha * 0xffff); \
11681
+    vec_s16 vuvalpha = _mm_set1_epi32(4096 + uvalpha * 0xffff); \
11682
+    __m128i vy0, vy1, vy2, vy3, vu0, vu1, vv0, vv1; \
11683
+    __m128i v0, v1, v2, v3, v4, v5; \
11684
+    int i; \
11685
+    av_assert2(yalpha <= 4096U); \
11686
+    av_assert2(uvalpha <= 4096U); \
11687
+    \
11688
+    for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11689
+        SETUP(buf, i * 2, vyalpha, vy0, vy1); \
11690
+        SETUP(buf, i * 2 + 8, vyalpha, vy2, vy3); \
11691
+        SETUP(ubuf, i, vuvalpha, vu0, vu1); \
11692
+        SETUP(vbuf, i, vuvalpha, vv0, vv1); \
11693
+        \
11694
+        WRITE_##fmt##422 \
11695
+    } \
11696
+}
11697
+
11698
+#define INIT1_422 __m128i blenduv = _mm_set1_epi16(255);
11699
+
11700
+#define WRITE1_422(vu, vv, x0, x1) \
11701
+    v5 = _mm_srli_epi16(vu, 8); \
11702
+    v4 = _mm_packus_epi16(vy0, vy1); \
11703
+    v5 = _mm_blendv_epi8(vv, v5, blenduv); \
11704
+    v0 = _mm_unpacklo_epi8(x0, x1); \
11705
+    v1 = _mm_unpackhi_epi8(x0, x1); \
11706
+    VEC_ST(dest, v0); \
11707
+    VEC_ST(dest + 16, v1); \
11708
+    dest += 32;
11709
+
11710
+#define WRITE1_YUYV422 WRITE1_422(vu, vv, v4, v5)
11711
+#define WRITE1_YVYU422 WRITE1_422(vv, vu, v4, v5)
11712
+#define WRITE1_UYVY422 WRITE1_422(vu, vv, v5, v4)
11713
+
11714
+#define YUV2PACKEDWRAPPER1(ext, fmt) \
11715
+static void yuv2##ext##_1_e2k(SwsContext *c, const int16_t *buf0, \
11716
+                              const int16_t *ubuf[2], const int16_t *vbuf[2], \
11717
+                              const int16_t *abuf0, uint8_t *dest, int dstW, \
11718
+                              int uvalpha, int y) \
11719
+{ \
11720
+    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0]; \
11721
+    vec_s16 vy0, vy1, vu, vv; \
11722
+    vec_s16 add64 = _mm_set1_epi16(64); \
11723
+    int i; \
11724
+    __m128i v0, v1, v2, v3, v4, v5; \
11725
+    LOAD_ZERO; \
11726
+    INIT1_422 \
11727
+    \
11728
+    if (uvalpha < 2048) { \
11729
+        for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11730
+            vy0 = VEC_LD(buf0 + i * 2); \
11731
+            vy1 = VEC_LD(buf0 + i * 2 + 8); \
11732
+            vu = VEC_LD(ubuf0 + i); \
11733
+            vv = VEC_LD(vbuf0 + i); \
11734
+            vy0 = _mm_adds_epi16(vy0, add64); \
11735
+            vy1 = _mm_adds_epi16(vy1, add64); \
11736
+            vu = _mm_max_epi16(vu, zerov); \
11737
+            vv = _mm_max_epi16(vv, zerov); \
11738
+            vy0 = _mm_srai_epi16(vy0, 7); \
11739
+            vy1 = _mm_srai_epi16(vy1, 7); \
11740
+            vu = _mm_add_epi16(vu, add64); \
11741
+            vv = _mm_add_epi16(vv, add64); \
11742
+            vu = _mm_adds_epu16(vu, vu); \
11743
+            vv = _mm_adds_epu16(vv, vv); \
11744
+            \
11745
+            WRITE1_##fmt##422 \
11746
+        } \
11747
+    } else { \
11748
+        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1]; \
11749
+        vec_s16 add128 = _mm_add_epi16(add64, add64); \
11750
+        for (i = 0; i < (dstW + 1) >> 1; i += 8) { \
11751
+            vy0 = VEC_LD(buf0 + i * 2); \
11752
+            vy1 = VEC_LD(buf0 + i * 2 + 8); \
11753
+            v0 = VEC_LD(ubuf0 + i); \
11754
+            v1 = VEC_LD(vbuf0 + i); \
11755
+            v2 = VEC_LD(ubuf1 + i); \
11756
+            v3 = VEC_LD(vbuf1 + i); \
11757
+            vy0 = _mm_adds_epi16(vy0, add64); \
11758
+            vy1 = _mm_adds_epi16(vy1, add64); \
11759
+            v0 = _mm_max_epi16(v0, zerov); \
11760
+            v1 = _mm_max_epi16(v1, zerov); \
11761
+            v2 = _mm_max_epi16(v2, zerov); \
11762
+            v3 = _mm_max_epi16(v3, zerov); \
11763
+            v0 = _mm_add_epi16(v0, add128); \
11764
+            v1 = _mm_add_epi16(v1, add128); \
11765
+            vy0 = _mm_srai_epi16(vy0, 7); \
11766
+            vy1 = _mm_srai_epi16(vy1, 7); \
11767
+            vu = _mm_adds_epu16(v0, v2); \
11768
+            vv = _mm_adds_epu16(v1, v3); \
11769
+            \
11770
+            WRITE1_##fmt##422 \
11771
+        } \
11772
+    } \
11773
+}
11774
+
11775
+YUV2PACKEDWRAPPERX(yuyv422, YUYV)
11776
+YUV2PACKEDWRAPPERX(yvyu422, YVYU)
11777
+YUV2PACKEDWRAPPERX(uyvy422, UYVY)
11778
+
11779
+YUV2PACKEDWRAPPER2(yuyv422, YUYV)
11780
+YUV2PACKEDWRAPPER2(yvyu422, YVYU)
11781
+YUV2PACKEDWRAPPER2(uyvy422, UYVY)
11782
+
11783
+YUV2PACKEDWRAPPER1(yuyv422, YUYV)
11784
+YUV2PACKEDWRAPPER1(yvyu422, YVYU)
11785
+YUV2PACKEDWRAPPER1(uyvy422, UYVY)
11786
+
11787
+#define HSCALE_INIT() \
11788
+    __m128i v0, v1, v2, v3, v4, v5, v6; \
11789
+    vec_u32 vadd = _mm_setr_epi32(0, xInc, xInc * 2, xInc * 3); \
11790
+    vec_u16 vadd16 = _mm_setr_epi16(0, xInc, xInc * 2, xInc * 3, \
11791
+        xInc * 4, xInc * 5, xInc * 6, xInc * 7)
11792
+
11793
+#define HSCALE1() \
11794
+    v4 = _mm_set1_epi16(xpos); \
11795
+    v5 = _mm_set1_epi16(xpos + xInc * 8); \
11796
+    v4 = _mm_add_epi16(v4, vadd16); \
11797
+    v5 = _mm_add_epi16(v5, vadd16); \
11798
+    v4 = _mm_srli_epi16(v4, 9); \
11799
+    v5 = _mm_srli_epi16(v5, 9); \
11800
+    \
11801
+    v0 = _mm_set1_epi32(xpos & 0xffff); \
11802
+    v1 = _mm_set1_epi32((xpos & 0xffff) + xInc * 4); \
11803
+    v2 = _mm_set1_epi32((xpos & 0xffff) + xInc * 8); \
11804
+    v3 = _mm_set1_epi32((xpos & 0xffff) + xInc * 12); \
11805
+    v0 = _mm_add_epi32(v0, vadd); \
11806
+    v1 = _mm_add_epi32(v1, vadd); \
11807
+    v2 = _mm_add_epi32(v2, vadd); \
11808
+    v3 = _mm_add_epi32(v3, vadd); \
11809
+    v0 = _mm_srli_epi32(v0, 16); \
11810
+    v1 = _mm_srli_epi32(v1, 16); \
11811
+    v2 = _mm_srli_epi32(v2, 16); \
11812
+    v3 = _mm_srli_epi32(v3, 16); \
11813
+    v0 = _mm_packs_epi32(v0, v1); \
11814
+    v2 = _mm_packs_epi32(v2, v3); \
11815
+    v6 = _mm_packus_epi16(v0, v2); \
11816
+    \
11817
+    xx = xpos >> 16
11818
+
11819
+static void hyscale_fast_e2k(SwsContext *c, int16_t *dst, int dstWidth,
11820
+                           const uint8_t *src, int srcW, int xInc)
11821
+{
11822
+    int i, xpos = 0, xx, a1;
11823
+    LOAD_ZERO;
11824
+    HSCALE_INIT();
11825
+
11826
+    for (i = 0; i < dstWidth; i += 16) {
11827
+        HSCALE1();
11828
+
11829
+        v1 = VEC_LD(src + xx);
11830
+        v3 = VEC_LD(src + xx + 1);
11831
+
11832
+        v1 = _mm_shuffle_epi8(v1, v6);
11833
+        v3 = _mm_shuffle_epi8(v3, v6);
11834
+        v0 = _mm_unpacklo_epi8(v1, zerov);
11835
+        v1 = _mm_unpackhi_epi8(v1, zerov);
11836
+        v2 = _mm_unpacklo_epi8(v3, zerov);
11837
+        v3 = _mm_unpackhi_epi8(v3, zerov);
11838
+        v2 = _mm_sub_epi16(v2, v0);
11839
+        v3 = _mm_sub_epi16(v3, v1);
11840
+        v0 = _mm_slli_epi16(v0, 7);
11841
+        v1 = _mm_slli_epi16(v1, 7);
11842
+        v2 = _mm_mullo_epi16(v2, v4);
11843
+        v3 = _mm_mullo_epi16(v3, v5);
11844
+        v0 = _mm_add_epi16(v0, v2);
11845
+        v1 = _mm_add_epi16(v1, v3);
11846
+
11847
+        VEC_ST(dst + i, v0);
11848
+        VEC_ST(dst + i + 8, v1);
11849
+        xpos += xInc * 16;
11850
+    }
11851
+
11852
+    a1 = src[srcW - 1] * 128;
11853
+    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
11854
+        dst[i] = a1;
11855
+}
11856
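+
+/*
+ * Scalar reference for the vector loop above (essentially what the generic
+ * hyscale_fast_c in libswscale does, here done 16 pixels per iteration):
+ *
+ *     for (i = 0; i < dstWidth; i++) {
+ *         int xx   = xpos >> 16;
+ *         int frac = (xpos & 0xffff) >> 9;
+ *         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * frac;
+ *         xpos  += xInc;
+ *     }
+ */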
+
11857
+#define HSCALE2() \
11858
+    v0 = _mm_shuffle_epi8(v0, v6); \
11859
+    v1 = _mm_shuffle_epi8(v1, v6); \
11860
+    v2 = _mm_unpacklo_epi8(v0, v1); \
11861
+    v3 = _mm_unpackhi_epi8(v0, v1); \
11862
+    v2 = _mm_maddubs_epi16(v2, v4); \
11863
+    v3 = _mm_maddubs_epi16(v3, v5)
11864
+
11865
+static void hcscale_fast_e2k(SwsContext *c, int16_t *dst1, int16_t *dst2,
11866
+                           int dstWidth, const uint8_t *src1,
11867
+                           const uint8_t *src2, int srcW, int xInc)
11868
+{
11869
+    int i, xpos = 0, xx, a1, a2;
11870
+    HSCALE_INIT();
11871
+    __m128i xorv = _mm_set1_epi8(127);
11872
+
11873
+    for (i = 0; i < dstWidth; i += 16) {
11874
+        HSCALE1();
11875
+
11876
+        v0 = _mm_packus_epi16(v4, v5);
11877
+        v1 = _mm_xor_si128(v0, xorv);
11878
+        v4 = _mm_unpacklo_epi8(v1, v0);
11879
+        v5 = _mm_unpackhi_epi8(v1, v0);
11880
+
11881
+        v0 = VEC_LD(src1 + xx);
11882
+        v1 = VEC_LD(src1 + xx + 1);
11883
+        HSCALE2();
11884
+        v0 = VEC_LD(src2 + xx);
11885
+        v1 = VEC_LD(src2 + xx + 1);
11886
+        VEC_ST(dst1 + i, v2);
11887
+        VEC_ST(dst1 + i + 8, v3);
11888
+        HSCALE2();
11889
+        VEC_ST(dst2 + i, v2);
11890
+        VEC_ST(dst2 + i + 8, v3);
11891
+        xpos += xInc * 16;
11892
+    }
11893
+
11894
+    a1 = src1[srcW - 1] * 128;
11895
+    a2 = src2[srcW - 1] * 128;
11896
+    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
11897
+        dst1[i] = a1;
11898
+        dst2[i] = a2;
11899
+    }
11900
+}
11901
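+
+/*
+ * The chroma variant packs the per-pixel weights as byte pairs
+ * (frac ^ 127, frac) so one _mm_maddubs_epi16 evaluates both filter taps;
+ * per pixel this should match the generic hcscale_fast_c:
+ *
+ *     dst1[i] = src1[xx] * (frac ^ 127) + src1[xx + 1] * frac;
+ *     dst2[i] = src2[xx] * (frac ^ 127) + src2[xx + 1] * frac;
+ */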
+
11902
+static void hScale8To19_e2k(SwsContext *c, int16_t *_dst, int dstW,
11903
+                            const uint8_t *src, const int16_t *filter,
11904
+                            const int32_t *filterPos, int filterSize)
11905
+{
11906
+    int i, j;
11907
+    int32_t *dst = (int32_t*)_dst;
11908
+    LOAD_ZERO;
11909
+    __m128i v0, v1, accv;
11910
+
11911
+    if (filterSize == 1) {
11912
+        for (i = 0; i < dstW; i++, filter += filterSize) {
11913
+            int val = 0, srcPos = filterPos[i];
11914
+            for (j = 0; j < filterSize; j++) {
11915
+                val += (int)src[srcPos + j] * filter[j];
11916
+            }
11917
+            dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
11918
+        }
11919
+    } else {
11920
+        __m64 h0, maskv;
11921
+        uint64_t mask = ~0ll;
11922
+        mask >>= (-filterSize & 7) * 8;
11923
+        maskv = (__m64)mask; // bytes kept for filterSize % 8 == 0..7: 8, 1, 2, 3, 4, 5, 6, 7
11924
+
11925
+        for (i = 0; i < dstW; i++, filter += filterSize) {
11926
+            int val;
11927
+            const uint8_t *srci = src + filterPos[i];
11928
+            accv = zerov;
11929
+            for (j = 0; j + 7 < filterSize; j += 8) {
11930
+                v0 = VEC_LD8(srci + j);
11931
+                v1 = VEC_LD(filter + j);
11932
+                v0 = _mm_unpacklo_epi8(v0, zerov);
11933
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
11934
+            }
11935
+            if (filterSize & 7) {
11936
+                h0 = *(__m64*)(srci + j);
11937
+                // Remove the unused elements on the last round
11938
+                h0 = _mm_and_si64(h0, maskv);
11939
+                v0 = _mm_movpi64_epi64(h0);
11940
+                v1 = VEC_LD(filter + j);
11941
+                v0 = _mm_unpacklo_epi8(v0, zerov);
11942
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
11943
+            }
11944
+            accv = _mm_hadd_epi32(accv, accv);
11945
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
11946
+            dst[i] = FFMIN(val >> 3, (1 << 19) - 1);
11947
+        }
11948
+    }
11949
+}
11950
+
11951
+static void hScale16To19_e2k(SwsContext *c, int16_t *_dst, int dstW,
11952
+                             const uint8_t *_src, const int16_t *filter,
11953
+                             const int32_t *filterPos, int filterSize)
11954
+{
11955
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
11956
+    int i, j;
11957
+    int32_t *dst = (int32_t*)_dst;
11958
+    const uint16_t *src = (const uint16_t*)_src;
11959
+    int bits = desc->comp[0].depth - 1;
11960
+    int sh = bits - 4;
11961
+    LOAD_ZERO;
11962
+    __m128i v0, v1, accv;
11963
+
11964
+    if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8) && desc->comp[0].depth < 16) {
11965
+        sh = 9;
11966
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
11967
+        sh = 16 - 1 - 4;
11968
+    }
11969
+
11970
+    if (filterSize == 1) {
11971
+        for (i = 0; i < dstW; i++) {
11972
+            int val = 0, srcPos = filterPos[i];
11973
+            for (j = 0; j < filterSize; j++) {
11974
+                val += (int)src[srcPos + j] * filter[filterSize * i + j];
11975
+            }
11976
+            // filter=14 bit, input=16 bit, output=30 bit; >> sh (11 for 16-bit input) makes 19 bit
11977
+            dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
11978
+        }
11979
+    } else {
11980
+        __m128i maskv, signv = _mm_set1_epi16(-0x8000), initv = zerov;
11981
+        uint64_t mask = ~0ll;
11982
+        mask >>= (-filterSize & 7) * 8;
11983
+        maskv = _mm_movpi64_epi64((__m64)mask);
11984
+        maskv = _mm_unpacklo_epi8(maskv, maskv);
11985
+
11986
+        for (j = 0; j + 7 < filterSize; j += 8) {
11987
+            v1 = VEC_LD(filter + j);
11988
+            initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1));
11989
+        }
11990
+        if (filterSize & 7) {
11991
+            v1 = VEC_LD(filter + j);
11992
+            v1 = _mm_and_si128(v1, maskv);
11993
+            initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1));
11994
+        }
11995
+
11996
+        for (i = 0; i < dstW; i++, filter += filterSize) {
11997
+            int val;
11998
+            const int16_t *srci = src + filterPos[i];
11999
+            accv = initv;
12000
+            for (j = 0; j + 7 < filterSize; j += 8) {
12001
+                v0 = VEC_LD(srci + j);
12002
+                v0 = _mm_xor_si128(v0, signv);
12003
+                v1 = VEC_LD(filter + j);
12004
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
12005
+            }
12006
+            if (filterSize & 7) {
12007
+                v0 = VEC_LD(srci + j);
12008
+                v0 = _mm_xor_si128(v0, signv);
12009
+                v1 = VEC_LD(filter + j);
12010
+                // Remove the unused elements on the last round
12011
+                v1 = _mm_and_si128(v1, maskv);
12012
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
12013
+            }
12014
+            accv = _mm_hadd_epi32(accv, accv);
12015
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
12016
+            dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
12017
+        }
12018
+    }
12019
+}
12020
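+
+/*
+ * hScale16To19_e2k above and hScale16To15_e2k below feed unsigned 16-bit
+ * samples to the signed _mm_madd_epi16 by flipping the sign bit and
+ * compensating once per filter:
+ *
+ *     src * f == ((src ^ 0x8000) * f) + 0x8000 * f
+ *
+ * initv therefore starts the accumulator at 0x8000 * sum(filter taps), and
+ * each sample only costs one XOR.  A rough scalar check of one output value:
+ *
+ *     int val = 32768 * filter_sum;            // filter_sum = sum of taps
+ *     for (j = 0; j < filterSize; j++)
+ *         val += ((int)src[srcPos + j] - 32768) * filter[j];
+ *     // val now equals the plain sum of src[srcPos + j] * filter[j]
+ */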
+
12021
+static void hScale16To15_e2k(SwsContext *c, int16_t *dst, int dstW,
12022
+                             const uint8_t *_src, const int16_t *filter,
12023
+                             const int32_t *filterPos, int filterSize)
12024
+{
12025
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
12026
+    int i, j;
12027
+    const uint16_t *src = (const uint16_t*)_src;
12028
+    int sh = desc->comp[0].depth - 1;
12029
+    LOAD_ZERO;
12030
+    __m128i v0, v1, accv;
12031
+
12032
+    if (sh < 15) {
12033
+        sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
12034
+    } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
12035
+        sh = 16 - 1;
12036
+    }
12037
+
12038
+    if (filterSize == 1) {
12039
+        for (i = 0; i < dstW; i++) {
12040
+            int val = 0, srcPos = filterPos[i];
12041
+            for (j = 0; j < filterSize; j++) {
12042
+                val += (int)src[srcPos + j] * filter[filterSize * i + j];
12043
+            }
12044
+            // filter=14 bit, input=16 bit, output=30 bit; >> sh (15 for 16-bit input) makes 15 bit
12045
+            dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
12046
+        }
12047
+    } else {
12048
+        __m128i maskv, signv = _mm_set1_epi16(-0x8000), initv = zerov;
12049
+        uint64_t mask = ~0ll;
12050
+        mask >>= (-filterSize & 7) * 8;
12051
+        maskv = _mm_movpi64_epi64((__m64)mask);
12052
+        maskv = _mm_unpacklo_epi8(maskv, maskv);
12053
+
12054
+        for (j = 0; j + 7 < filterSize; j += 8) {
12055
+            v1 = VEC_LD(filter + j);
12056
+            initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1));
12057
+        }
12058
+        if (filterSize & 7) {
12059
+            v1 = VEC_LD(filter + j);
12060
+            v1 = _mm_and_si128(v1, maskv);
12061
+            initv = _mm_sub_epi32(initv, _mm_madd_epi16(signv, v1));
12062
+        }
12063
+
12064
+        for (i = 0; i < dstW; i++, filter += filterSize) {
12065
+            int val;
12066
+            const int16_t *srci = src + filterPos[i];
12067
+            accv = initv;
12068
+            for (j = 0; j + 7 < filterSize; j += 8) {
12069
+                v0 = VEC_LD(srci + j);
12070
+                v0 = _mm_xor_si128(v0, signv);
12071
+                v1 = VEC_LD(filter + j);
12072
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
12073
+            }
12074
+            if (filterSize & 7) {
12075
+                v0 = VEC_LD(srci + j);
12076
+                v0 = _mm_xor_si128(v0, signv);
12077
+                // Remove the unused elements on the last round
12078
+                v1 = VEC_LD(filter + j);
12079
+                v1 = _mm_and_si128(v1, maskv);
12080
+                accv = _mm_add_epi32(accv, _mm_madd_epi16(v0, v1));
12081
+            }
12082
+            accv = _mm_hadd_epi32(accv, accv);
12083
+            val = _mm_extract_epi32(accv, 0) + _mm_extract_epi32(accv, 1);
12084
+            dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
12085
+        }
12086
+    }
12087
+}
12088
+
12089
+av_cold void ff_sws_init_swscale_e2k(SwsContext *c)
12090
+{
12091
+    enum AVPixelFormat dstFormat = c->dstFormat;
12092
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
12093
+
12094
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K))
12095
+        return;
12096
+
12097
+    if (dstFormat == AV_PIX_FMT_P010LE || dstFormat == AV_PIX_FMT_P010BE) {
12098
+        // c->yuv2plane1 = isBE(dstFormat) ? yuv2p010l1_BE_e2k : yuv2p010l1_LE_e2k;
12099
+        // c->yuv2planeX = isBE(dstFormat) ? yuv2p010lX_BE_e2k : yuv2p010lX_LE_e2k;
12100
+        // c->yuv2nv12cX = yuv2p010cX_e2k;
12101
+    } else if (is16BPS(dstFormat)) {
12102
+        c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_e2k  : yuv2planeX_16LE_e2k;
12103
+        c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_e2k  : yuv2plane1_16LE_e2k;
12104
+        if (dstFormat == AV_PIX_FMT_P016LE || dstFormat == AV_PIX_FMT_P016BE) {
12105
+          // c->yuv2nv12cX = yuv2p016cX_e2k;
12106
+        }
12107
+    } else if (isNBPS(dstFormat)) {
12108
+        if (desc->comp[0].depth == 9) {
12109
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_e2k  : yuv2planeX_9LE_e2k;
12110
+            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_e2k  : yuv2plane1_9LE_e2k;
12111
+        } else if (desc->comp[0].depth == 10) {
12112
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_e2k  : yuv2planeX_10LE_e2k;
12113
+            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_e2k  : yuv2plane1_10LE_e2k;
12114
+        } else if (desc->comp[0].depth == 12) {
12115
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_e2k  : yuv2planeX_12LE_e2k;
12116
+            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_e2k  : yuv2plane1_12LE_e2k;
12117
+        } else if (desc->comp[0].depth == 14) {
12118
+            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_e2k  : yuv2planeX_14LE_e2k;
12119
+            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_e2k  : yuv2plane1_14LE_e2k;
12120
+        } else
12121
+            av_assert0(0);
12122
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
12123
+        // c->yuv2planeX = yuv2planeX_floatBE_e2k;
12124
+        c->yuv2plane1 = yuv2plane1_floatBE_e2k;
12125
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
12126
+        // c->yuv2planeX = yuv2planeX_floatLE_e2k;
12127
+        c->yuv2plane1 = yuv2plane1_floatLE_e2k;
12128
+    } else {
12129
+        c->yuv2plane1 = yuv2plane1_8_e2k;
12130
+        c->yuv2planeX = yuv2planeX_8_e2k;
12131
+#if 0
12132
+        if (dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21 ||
12133
+            dstFormat == AV_PIX_FMT_NV24 || dstFormat == AV_PIX_FMT_NV42)
12134
+            c->yuv2nv12cX = yuv2nv12cX_e2k;
12135
+#endif
12136
+    }
12137
+
12138
+    if (c->srcBpc == 8) {
12139
+        if (c->dstBpc <= 14) {
12140
+            c->hyScale = c->hcScale = hScale_real_e2k;
12141
+            if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
12142
+                c->hyscale_fast = hyscale_fast_e2k;
12143
+                c->hcscale_fast = hcscale_fast_e2k;
12144
+            }
12145
+        } else {
12146
+            c->hyScale = c->hcScale = hScale8To19_e2k;
12147
+        }
12148
+    } else {
12149
+            c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_e2k
12150
+                                                     : hScale16To15_e2k;
12151
+    }
12152
+
12153
+    if (c->flags & SWS_FULL_CHR_H_INT) {
12154
+        switch (dstFormat) {
12155
+            case AV_PIX_FMT_RGB24:
12156
+                c->yuv2packed1 = yuv2rgb24_full_1_e2k;
12157
+                c->yuv2packed2 = yuv2rgb24_full_2_e2k;
12158
+                c->yuv2packedX = yuv2rgb24_full_X_e2k;
12159
+            break;
12160
+            case AV_PIX_FMT_BGR24:
12161
+                c->yuv2packed1 = yuv2bgr24_full_1_e2k;
12162
+                c->yuv2packed2 = yuv2bgr24_full_2_e2k;
12163
+                c->yuv2packedX = yuv2bgr24_full_X_e2k;
12164
+            break;
12165
+            case AV_PIX_FMT_BGRA:
12166
+                if (!c->needAlpha) {
12167
+                    c->yuv2packed1 = yuv2bgrx32_full_1_e2k;
12168
+                    c->yuv2packed2 = yuv2bgrx32_full_2_e2k;
12169
+                    c->yuv2packedX = yuv2bgrx32_full_X_e2k;
12170
+                }
12171
+            break;
12172
+            case AV_PIX_FMT_RGBA:
12173
+                if (!c->needAlpha) {
12174
+                    c->yuv2packed1 = yuv2rgbx32_full_1_e2k;
12175
+                    c->yuv2packed2 = yuv2rgbx32_full_2_e2k;
12176
+                    c->yuv2packedX = yuv2rgbx32_full_X_e2k;
12177
+                }
12178
+            break;
12179
+            case AV_PIX_FMT_ARGB:
12180
+                if (!c->needAlpha) {
12181
+                    c->yuv2packed1 = yuv2xrgb32_full_1_e2k;
12182
+                    c->yuv2packed2 = yuv2xrgb32_full_2_e2k;
12183
+                    c->yuv2packedX = yuv2xrgb32_full_X_e2k;
12184
+                }
12185
+            break;
12186
+            case AV_PIX_FMT_ABGR:
12187
+                if (!c->needAlpha) {
12188
+                    c->yuv2packed1 = yuv2xbgr32_full_1_e2k;
12189
+                    c->yuv2packed2 = yuv2xbgr32_full_2_e2k;
12190
+                    c->yuv2packedX = yuv2xbgr32_full_X_e2k;
12191
+                }
12192
+            break;
12193
+        }
12194
+    } else if (!(c->flags & SWS_BITEXACT)) { /* !SWS_FULL_CHR_H_INT */
12195
+        switch (dstFormat) {
12196
+            case AV_PIX_FMT_RGB24:
12197
+                c->yuv2packed1 = yuv2rgb24_1_e2k;
12198
+                c->yuv2packed2 = yuv2rgb24_2_e2k;
12199
+                c->yuv2packedX = yuv2rgb24_X_e2k;
12200
+            break;
12201
+            case AV_PIX_FMT_BGR24:
12202
+                c->yuv2packed1 = yuv2bgr24_1_e2k;
12203
+                c->yuv2packed2 = yuv2bgr24_2_e2k;
12204
+                c->yuv2packedX = yuv2bgr24_X_e2k;
12205
+            break;
12206
+            case AV_PIX_FMT_BGRA:
12207
+                if (!c->needAlpha) {
12208
+                    c->yuv2packed1 = yuv2bgrx32_1_e2k;
12209
+                    c->yuv2packed2 = yuv2bgrx32_2_e2k;
12210
+                    c->yuv2packedX = yuv2bgrx32_X_e2k;
12211
+                }
12212
+            break;
12213
+            case AV_PIX_FMT_RGBA:
12214
+                if (!c->needAlpha) {
12215
+                    c->yuv2packed1 = yuv2rgbx32_1_e2k;
12216
+                    c->yuv2packed2 = yuv2rgbx32_2_e2k;
12217
+                    c->yuv2packedX = yuv2rgbx32_X_e2k;
12218
+                }
12219
+            break;
12220
+            case AV_PIX_FMT_ARGB:
12221
+                if (!c->needAlpha) {
12222
+                    c->yuv2packed1 = yuv2xrgb32_1_e2k;
12223
+                    c->yuv2packed2 = yuv2xrgb32_2_e2k;
12224
+                    c->yuv2packedX = yuv2xrgb32_X_e2k;
12225
+                }
12226
+            break;
12227
+            case AV_PIX_FMT_ABGR:
12228
+                if (!c->needAlpha) {
12229
+                    c->yuv2packed1 = yuv2xbgr32_1_e2k;
12230
+                    c->yuv2packed2 = yuv2xbgr32_2_e2k;
12231
+                    c->yuv2packedX = yuv2xbgr32_X_e2k;
12232
+                }
12233
+            break;
12234
+        }
12235
+    }
12236
+
12237
+    switch (dstFormat) {
12238
+        case AV_PIX_FMT_YUYV422:
12239
+            c->yuv2packed1 = yuv2yuyv422_1_e2k;
12240
+            c->yuv2packed2 = yuv2yuyv422_2_e2k;
12241
+            c->yuv2packedX = yuv2yuyv422_X_e2k;
12242
+        break;
12243
+        case AV_PIX_FMT_YVYU422:
12244
+            c->yuv2packed1 = yuv2yvyu422_1_e2k;
12245
+            c->yuv2packed2 = yuv2yvyu422_2_e2k;
12246
+            c->yuv2packedX = yuv2yvyu422_X_e2k;
12247
+        break;
12248
+        case AV_PIX_FMT_UYVY422:
12249
+            c->yuv2packed1 = yuv2uyvy422_1_e2k;
12250
+            c->yuv2packed2 = yuv2uyvy422_2_e2k;
12251
+            c->yuv2packedX = yuv2uyvy422_X_e2k;
12252
+        break;
12253
+    }
12254
+}
12255
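
For context only, not part of the patch: the function pointers installed above are reached through the ordinary libswscale API. A rough usage sketch, assuming an Elbrus build where av_get_cpu_flags() reports AV_CPU_FLAG_E2K; a genuinely scaled conversion exercises the hScale and yuv2plane/yuv2packed selections made here, while same-size conversions may instead take the unscaled fast paths added further below.

#include <libswscale/swscale.h>

/* Sketch: downscale one YUV420P frame to RGB24 (buffers come from the caller). */
static int convert_frame(const uint8_t *const src[4], const int srcStride[4],
                         uint8_t *const dst[4], const int dstStride[4],
                         int srcW, int srcH, int dstW, int dstH)
{
    struct SwsContext *sws = sws_getContext(srcW, srcH, AV_PIX_FMT_YUV420P,
                                            dstW, dstH, AV_PIX_FMT_RGB24,
                                            SWS_BILINEAR, NULL, NULL, NULL);
    if (!sws)
        return -1;
    sws_scale(sws, src, srcStride, 0, srcH, dst, dstStride);
    sws_freeContext(sws);
    return 0;
}
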
diff --git a/libswscale/e2k/yuv2rgb.c b/libswscale/e2k/yuv2rgb.c
12256
new file mode 100644
12257
index 0000000..92f153f
12258
--- /dev/null
12259
+++ b/libswscale/e2k/yuv2rgb.c
12260
@@ -0,0 +1,248 @@
12261
+/*
12262
+ * Elbrus acceleration for colorspace conversion
12263
+ *
12264
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
12265
+ * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
12266
+ *
12267
+ * This file is part of FFmpeg.
12268
+ *
12269
+ * FFmpeg is free software; you can redistribute it and/or
12270
+ * modify it under the terms of the GNU Lesser General Public
12271
+ * License as published by the Free Software Foundation; either
12272
+ * version 2.1 of the License, or (at your option) any later version.
12273
+ *
12274
+ * FFmpeg is distributed in the hope that it will be useful,
12275
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12276
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12277
+ * Lesser General Public License for more details.
12278
+ *
12279
+ * You should have received a copy of the GNU Lesser General Public
12280
+ * License along with FFmpeg; if not, write to the Free Software
12281
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
12282
+ */
12283
+
12284
+#include <stdio.h>
12285
+#include <stdlib.h>
12286
+#include <string.h>
12287
+#include <inttypes.h>
12288
+
12289
+#include "config.h"
12290
+#include "libswscale/rgb2rgb.h"
12291
+#include "libswscale/swscale.h"
12292
+#include "libswscale/swscale_internal.h"
12293
+#include "libavutil/attributes.h"
12294
+#include "libavutil/cpu.h"
12295
+#include "libavutil/e2k/util_e2k.h"
12296
+#include "libavutil/pixdesc.h"
12297
+#include "yuv2rgb.h"
12298
+
12299
+/*
12300
+ * ------------------------------------------------------------------------------
12301
+ * CS converters
12302
+ * ------------------------------------------------------------------------------
12303
+ */
12304
+
12305
+#define INIT2_RGB(R, B) \
12306
+    __m128i perm_unp8 = _mm_setr_epi8( \
12307
+        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); \
12308
+    __m64 rgb_index0 = _mm_setr_pi8(0, 1, 3, 4, 5, 7, 8, 9); \
12309
+    __m64 rgb_index1 = _mm_setr_pi8(3, 4, 5, 7, 8, 9, 11, 12); \
12310
+    __m64 rgb_index2 = _mm_setr_pi8(5, 7, 8, 9, 11, 12, 13, 15);
12311
+
12312
+#define INIT2_RGBX(R, B) INIT2_XRGB(R, B)
12313
+#define INIT2_XRGB(R, B) \
12314
+    __m128i A_l = _mm_set1_epi16(255); \
12315
+    __m128i perm_unp8 = _mm_setr_epi8( \
12316
+        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
12317
+
12318
+#define WRITE2_RGB(dest, R, B) \
12319
+    v4 = _mm_packus_epi16(R##_l, G_l); \
12320
+    v5 = _mm_packus_epi16(B##_l, B##_l); \
12321
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
12322
+    v1 = _mm_unpacklo_epi8(v5, v5); \
12323
+    v2 = _mm_unpacklo_epi16(v0, v1); \
12324
+    v3 = _mm_unpackhi_epi16(v0, v1); \
12325
+    { \
12326
+        union { __m128i v; __m64 d[2]; } a = { v2 }, b = { v3 }; \
12327
+        __m64 *p = (__m64*)dest; \
12328
+        p[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \
12329
+        p[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \
12330
+        p[2] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \
12331
+        dest += 24; \
12332
+    }
12333
+
12334
+#define WRITE2_RGBX(dest, R, B) \
12335
+    v4 = _mm_packus_epi16(R##_l, G_l); \
12336
+    v5 = _mm_packus_epi16(B##_l, A_l); \
12337
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
12338
+    v1 = _mm_shuffle_epi8(v5, perm_unp8); \
12339
+    v2 = _mm_unpacklo_epi16(v0, v1); \
12340
+    v3 = _mm_unpackhi_epi16(v0, v1); \
12341
+    VEC_ST(dest, v2); \
12342
+    VEC_ST(dest + 16, v3); \
12343
+    dest += 32;
12344
+
12345
+#define WRITE2_XRGB(dest, R, B) \
12346
+    v4 = _mm_packus_epi16(A_l, R##_l); \
12347
+    v5 = _mm_packus_epi16(G_l, B##_l); \
12348
+    v0 = _mm_shuffle_epi8(v4, perm_unp8); \
12349
+    v1 = _mm_shuffle_epi8(v5, perm_unp8); \
12350
+    v2 = _mm_unpacklo_epi16(v0, v1); \
12351
+    v3 = _mm_unpackhi_epi16(v0, v1); \
12352
+    VEC_ST(dest, v2); \
12353
+    VEC_ST(dest + 16, v3); \
12354
+    dest += 32;
12355
+
12356
+#define DEFCSP420_CVT(name, fmt, R, B) \
12357
+static int yuv2##name##_e2k(SwsContext *c, const unsigned char **in, \
12358
+                      int *instrides, int srcSliceY, int srcSliceH, \
12359
+                      unsigned char **oplanes, int *outstrides) \
12360
+{ \
12361
+    vec_s32 R_l, G_l, B_l; \
12362
+    vec_s32 y_coeff = _mm_set1_epi16(c->yuv2rgb_y_coeff); \
12363
+    vec_s32 y_sub = _mm_set1_epi16((c->yuv2rgb_y_offset + 64) >> 7); \
12364
+    vec_s32 v2r_coeff = _mm_set1_epi16(c->yuv2rgb_v2r_coeff); \
12365
+    vec_s32 v2g_coeff = _mm_set1_epi16(c->yuv2rgb_v2g_coeff); \
12366
+    vec_s32 u2g_coeff = _mm_set1_epi16(c->yuv2rgb_u2g_coeff); \
12367
+    vec_s32 u2b_coeff = _mm_set1_epi16(c->yuv2rgb_u2b_coeff); \
12368
+    vec_s32 dec128 = _mm_set1_epi16(128); \
12369
+    __m128i v0, v1, v2, v3, v4, v5; \
12370
+    LOAD_ZERO; \
12371
+    INIT2_##fmt(R, B) \
12372
+    int i, j, w = c->dstW & -16, h = srcSliceH & -2; \
12373
+    vec_s16 Y0, Y1, Y2, Y3, U, V; \
12374
+    vec_s16 vx, ux, uvx, vx0, ux0, uvx0, vx1, ux1, uvx1; \
12375
+    const uint8_t *y1i = in[0]; \
12376
+    const uint8_t *y2i = in[0] + instrides[0]; \
12377
+    const uint8_t *ui = in[1], *vi = in[2]; \
12378
+    uint8_t *out0, *out1; \
12379
+    int vshift = c->srcFormat == AV_PIX_FMT_YUV422P; \
12380
+    int instrides0 = instrides[0] * 2 - w; \
12381
+    int instrides1 = (instrides[1] << vshift) - w / 2; \
12382
+    int instrides2 = (instrides[2] << vshift) - w / 2; \
12383
+    \
12384
+    for (i = 0; i < h; i += 2) { \
12385
+        out0 = oplanes[0] + (i + srcSliceY) * outstrides[0]; \
12386
+        out1 = out0 + outstrides[0]; \
12387
+        for (j = 0; j < w >> 4; j++) { \
12388
+            Y1 = VEC_LD(y1i); \
12389
+            Y3 = VEC_LD(y2i); \
12390
+            U = VEC_LD8(ui); \
12391
+            V = VEC_LD8(vi); \
12392
+            U = _mm_unpacklo_epi8(U, zerov); \
12393
+            V = _mm_unpacklo_epi8(V, zerov); \
12394
+            Y0 = _mm_unpacklo_epi8(Y1, zerov); \
12395
+            Y1 = _mm_unpackhi_epi8(Y1, zerov); \
12396
+            Y2 = _mm_unpacklo_epi8(Y3, zerov); \
12397
+            Y3 = _mm_unpackhi_epi8(Y3, zerov); \
12398
+            U = _mm_sub_epi16(U, dec128); \
12399
+            V = _mm_sub_epi16(V, dec128); \
12400
+            U = _mm_slli_epi16(U, 2); \
12401
+            V = _mm_slli_epi16(V, 2); \
12402
+            Y0 = _mm_slli_epi16(Y0, 2); \
12403
+            Y1 = _mm_slli_epi16(Y1, 2); \
12404
+            Y2 = _mm_slli_epi16(Y2, 2); \
12405
+            Y3 = _mm_slli_epi16(Y3, 2); \
12406
+            \
12407
+            Y0 = _mm_mulhrs_epi16(_mm_sub_epi16(Y0, y_sub), y_coeff); \
12408
+            Y1 = _mm_mulhrs_epi16(_mm_sub_epi16(Y1, y_sub), y_coeff); \
12409
+            Y2 = _mm_mulhrs_epi16(_mm_sub_epi16(Y2, y_sub), y_coeff); \
12410
+            Y3 = _mm_mulhrs_epi16(_mm_sub_epi16(Y3, y_sub), y_coeff); \
12411
+            \
12412
+            ux = _mm_mulhrs_epi16(U, u2b_coeff); \
12413
+            vx = _mm_mulhrs_epi16(V, v2r_coeff); \
12414
+            ux0 = _mm_unpacklo_epi16(ux, ux); \
12415
+            ux1 = _mm_unpackhi_epi16(ux, ux); \
12416
+            vx0 = _mm_unpacklo_epi16(vx, vx); \
12417
+            vx1 = _mm_unpackhi_epi16(vx, vx); \
12418
+            \
12419
+            uvx = _mm_mulhrs_epi16(U, u2g_coeff); \
12420
+            uvx = _mm_add_epi16(_mm_mulhrs_epi16(V, v2g_coeff), uvx); \
12421
+            uvx0 = _mm_unpacklo_epi16(uvx, uvx); \
12422
+            uvx1 = _mm_unpackhi_epi16(uvx, uvx); \
12423
+            \
12424
+            R_l = _mm_add_epi16(Y0, vx0); \
12425
+            G_l = _mm_add_epi16(Y0, uvx0); \
12426
+            B_l = _mm_add_epi16(Y0, ux0); \
12427
+            \
12428
+            WRITE2_##fmt(out0, R, B) \
12429
+            \
12430
+            R_l = _mm_add_epi16(Y1, vx1); \
12431
+            G_l = _mm_add_epi16(Y1, uvx1); \
12432
+            B_l = _mm_add_epi16(Y1, ux1); \
12433
+            \
12434
+            WRITE2_##fmt(out0, R, B) \
12435
+            \
12436
+            R_l = _mm_add_epi16(Y2, vx0); \
12437
+            G_l = _mm_add_epi16(Y2, uvx0); \
12438
+            B_l = _mm_add_epi16(Y2, ux0); \
12439
+            \
12440
+            WRITE2_##fmt(out1, R, B) \
12441
+            \
12442
+            R_l = _mm_add_epi16(Y3, vx1); \
12443
+            G_l = _mm_add_epi16(Y3, uvx1); \
12444
+            B_l = _mm_add_epi16(Y3, ux1); \
12445
+            \
12446
+            WRITE2_##fmt(out1, R, B) \
12447
+            \
12448
+            y1i += 16; ui += 8; \
12449
+            y2i += 16; vi += 8; \
12450
+        } \
12451
+        y1i += instrides0; ui += instrides1; \
12452
+        y2i += instrides0; vi += instrides2; \
12453
+    } \
12454
+    return srcSliceH; \
12455
+}
12456
+
12457
+DEFCSP420_CVT(rgbx32, RGBX, R, B)
12458
+DEFCSP420_CVT(bgrx32, RGBX, B, R)
12459
+DEFCSP420_CVT(xrgb32, XRGB, R, B)
12460
+DEFCSP420_CVT(xbgr32, XRGB, B, R)
12461
+DEFCSP420_CVT(rgb24, RGB, R, B)
12462
+DEFCSP420_CVT(bgr24, RGB, B, R)
12463
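
A note on the arithmetic in the macro above, not part of the patch: _mm_mulhrs_epi16 computes (a * b + 0x4000) >> 15 per lane, and the coefficients are the fixed-point yuv2rgb table already stored in SwsContext, so per pixel the conversion amounts to the sketch below (saturation to 8 bits is done afterwards by the WRITE2_* macros).

/* Rounding Q15 multiply, i.e. what _mm_mulhrs_epi16 does per lane. */
static inline int mulhrs(int a, int b) { return (a * b + 0x4000) >> 15; }

/* One pixel of the DEFCSP420_CVT math (illustrative sketch only). */
static inline void yuv2rgb_pixel_ref(const SwsContext *c, int y, int u, int v,
                                     int *r, int *g, int *b)
{
    int Y = mulhrs((y << 2) - ((c->yuv2rgb_y_offset + 64) >> 7), c->yuv2rgb_y_coeff);
    int U = (u - 128) << 2;
    int V = (v - 128) << 2;
    *r = Y + mulhrs(V, c->yuv2rgb_v2r_coeff);
    *g = Y + mulhrs(U, c->yuv2rgb_u2g_coeff) + mulhrs(V, c->yuv2rgb_v2g_coeff);
    *b = Y + mulhrs(U, c->yuv2rgb_u2b_coeff);
}
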
+
12464
+/* Currently the acceleration routine only supports
12465
+ * inputs of widths a multiple of 16
12466
+ * and heights a multiple of 2
12467
+ *
12468
+ * So we just fall back to the C code for this.
12469
+ */
12470
+av_cold SwsFunc ff_yuv2rgb_init_e2k(SwsContext *c)
12471
+{
12472
+    SwsFunc ret;
12473
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K))
12474
+        return NULL;
12475
+
12476
+    if (c->flags & SWS_BITEXACT || c->needAlpha)
12477
+        return NULL;
12478
+
12479
+    ret = NULL;
12480
+    switch (c->srcFormat) {
12481
+    case AV_PIX_FMT_YUV422P:
12482
+    case AV_PIX_FMT_YUV420P:
12483
+        if (c->dstW & 15 || c->dstH & 1) break;
12484
+        switch (c->dstFormat) {
12485
+        case AV_PIX_FMT_RGB24:
12486
+            ret = yuv2rgb24_e2k; break;
12487
+        case AV_PIX_FMT_BGR24:
12488
+            ret = yuv2bgr24_e2k; break;
12489
+        case AV_PIX_FMT_ARGB:
12490
+            ret = yuv2xrgb32_e2k; break;
12491
+        case AV_PIX_FMT_ABGR:
12492
+            ret = yuv2xbgr32_e2k; break;
12493
+        case AV_PIX_FMT_RGBA:
12494
+            ret = yuv2rgbx32_e2k; break;
12495
+        case AV_PIX_FMT_BGRA:
12496
+            ret = yuv2bgrx32_e2k; break;
12497
+        default: break;
12498
+        }
12499
+        break;
12500
+    }
12501
+    if (ret) {
12502
+        av_log(c, AV_LOG_WARNING, "E2K: yuv2rgb(%s, %s)\n",
12503
+                av_get_pix_fmt_name(c->srcFormat),
12504
+                av_get_pix_fmt_name(c->dstFormat));
12505
+    }
12506
+    return ret;
12507
+}
12508
+
12509
diff --git a/libswscale/e2k/yuv2rgb.h b/libswscale/e2k/yuv2rgb.h
12510
new file mode 100644
12511
index 0000000..59637bc
12512
--- /dev/null
12513
+++ b/libswscale/e2k/yuv2rgb.h
12514
@@ -0,0 +1,52 @@
12515
+/*
12516
+ * Elbrus-enhanced yuv2rgb
12517
+ *
12518
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
12519
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
12520
+ * based on the equivalent C code in swscale.c
12521
+ *
12522
+ * This file is part of FFmpeg.
12523
+ *
12524
+ * FFmpeg is free software; you can redistribute it and/or
12525
+ * modify it under the terms of the GNU Lesser General Public
12526
+ * License as published by the Free Software Foundation; either
12527
+ * version 2.1 of the License, or (at your option) any later version.
12528
+ *
12529
+ * FFmpeg is distributed in the hope that it will be useful,
12530
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12531
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12532
+ * Lesser General Public License for more details.
12533
+ *
12534
+ * You should have received a copy of the GNU Lesser General Public
12535
+ * License along with FFmpeg; if not, write to the Free Software
12536
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
12537
+ */
12538
+
12539
+#ifndef SWSCALE_E2K_YUV2RGB_H
12540
+#define SWSCALE_E2K_YUV2RGB_H
12541
+
12542
+#include <stdint.h>
12543
+
12544
+#include "libswscale/swscale_internal.h"
12545
+
12546
+#define YUV2PACKEDX_HEADER(suffix)                              \
12547
+    void ff_yuv2##suffix##_X_e2k(SwsContext *c,             \
12548
+                                     const int16_t *lumFilter,  \
12549
+                                     const int16_t **lumSrc,    \
12550
+                                     int lumFilterSize,         \
12551
+                                     const int16_t *chrFilter,  \
12552
+                                     const int16_t **chrUSrc,   \
12553
+                                     const int16_t **chrVSrc,   \
12554
+                                     int chrFilterSize,         \
12555
+                                     const int16_t **alpSrc,    \
12556
+                                     uint8_t *dest,             \
12557
+                                     int dstW, int dstY);
12558
+
12559
+YUV2PACKEDX_HEADER(abgr);
12560
+YUV2PACKEDX_HEADER(bgra);
12561
+YUV2PACKEDX_HEADER(argb);
12562
+YUV2PACKEDX_HEADER(rgba);
12563
+YUV2PACKEDX_HEADER(rgb24);
12564
+YUV2PACKEDX_HEADER(bgr24);
12565
+
12566
+#endif /* SWSCALE_E2K_YUV2RGB_H */
12567
diff --git a/libswscale/e2k/yuv2yuv.c b/libswscale/e2k/yuv2yuv.c
12568
new file mode 100644
12569
index 0000000..7423fa8
12570
--- /dev/null
12571
+++ b/libswscale/e2k/yuv2yuv.c
12572
@@ -0,0 +1,146 @@
12573
+/*
12574
+ * Elbrus-enhanced yuv-to-yuv conversion routines.
12575
+ *
12576
+ * Copyright (C) 2021 Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
12577
+ * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
12578
+ * based on the equivalent C code in swscale.c
12579
+ *
12580
+ * This file is part of FFmpeg.
12581
+ *
12582
+ * FFmpeg is free software; you can redistribute it and/or
12583
+ * modify it under the terms of the GNU Lesser General Public
12584
+ * License as published by the Free Software Foundation; either
12585
+ * version 2.1 of the License, or (at your option) any later version.
12586
+ *
12587
+ * FFmpeg is distributed in the hope that it will be useful,
12588
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12589
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12590
+ * Lesser General Public License for more details.
12591
+ *
12592
+ * You should have received a copy of the GNU Lesser General Public
12593
+ * License along with FFmpeg; if not, write to the Free Software
12594
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
12595
+ */
12596
+
12597
+#include <inttypes.h>
12598
+
12599
+#include "config.h"
12600
+#include "libavutil/attributes.h"
12601
+#include "libavutil/cpu.h"
12602
+#include "libswscale/swscale.h"
12603
+#include "libswscale/swscale_internal.h"
12604
+#include "libavutil/e2k/util_e2k.h"
12605
+
12606
+/* This code assumes:
12607
+ *
12608
+ * 1) dst is 16-byte aligned
12609
+ * 2) dstStride is a multiple of 16
12610
+ * 3) width is a multiple of 16
12611
+ * 4) lum & chrom strides are multiples of 8
12612
+ */
12613
+
12614
+static int yv12toyuy2_unscaled_e2k(SwsContext *c, const uint8_t *src[],
12615
+                                   int srcStride[], int srcSliceY,
12616
+                                   int srcSliceH, uint8_t *dstParam[],
12617
+                                   int dstStride_a[])
12618
+{
12619
+    const uint8_t *ysrc = src[0], *usrc = src[1], *vsrc = src[2];
12620
+    int dstStride = dstStride_a[0];
12621
+    uint8_t *dst = dstParam[0] + dstStride * srcSliceY;
12622
+    int width = (c->dstW + 1) >> 1, height = srcSliceH;
12623
+    int lumStride = srcStride[0];
12624
+    int chromStride = srcStride[1];
12625
+    int y, i;
12626
+
12627
+    for (y = 0; y < height; y++) {
12628
+        PRAGMA_E2K("ivdep")
12629
+        for (i = 0; i < width - 7; i += 8) {
12630
+            __m128i v0, v1, v2, v3;
12631
+            v0 = VEC_LD(ysrc + i * 2);
12632
+            v2 = VEC_LD8(usrc + i);
12633
+            v3 = VEC_LD8(vsrc + i);
12634
+            v1 = _mm_unpacklo_epi8(v2, v3);
12635
+            VEC_ST(dst + i * 4, _mm_unpacklo_epi8(v0, v1));
12636
+            VEC_ST(dst + i * 4 + 16, _mm_unpackhi_epi8(v0, v1));
12637
+        }
12638
+
12639
+        PRAGMA_E2K("ivdep")
12640
+        for (; i < width; i++) {
12641
+            *(uint32_t*)(dst + i * 4) =
12642
+                ysrc[i * 2] | usrc[i] << 8 |
12643
+                ysrc[i * 2 + 1] << 16 | vsrc[i] << 24;
12644
+        }
12645
+
12646
+        if (y & 1) {
12647
+            usrc += chromStride;
12648
+            vsrc += chromStride;
12649
+        }
12650
+        ysrc += lumStride;
12651
+        dst += dstStride;
12652
+    }
12653
+
12654
+    return srcSliceH;
12655
+}
12656
+
12657
+static int yv12touyvy_unscaled_e2k(SwsContext *c, const uint8_t *src[],
12658
+                                   int srcStride[], int srcSliceY,
12659
+                                   int srcSliceH, uint8_t *dstParam[],
12660
+                                   int dstStride_a[])
12661
+{
12662
+    const uint8_t *ysrc = src[0], *usrc = src[1], *vsrc = src[2];
12663
+    int dstStride = dstStride_a[0];
12664
+    uint8_t *dst = dstParam[0] + dstStride * srcSliceY;
12665
+    int width = (c->dstW + 1) >> 1, height = srcSliceH;
12666
+    int lumStride = srcStride[0];
12667
+    int chromStride = srcStride[1];
12668
+    int y, i;
12669
+
12670
+    for (y = 0; y < height; y++) {
12671
+        PRAGMA_E2K("ivdep")
12672
+        for (i = 0; i < width - 7; i += 8) {
12673
+            __m128i v0, v1, v2, v3;
12674
+            v0 = VEC_LD(ysrc + i * 2);
12675
+            v2 = VEC_LD8(usrc + i);
12676
+            v3 = VEC_LD8(vsrc + i);
12677
+            v1 = _mm_unpacklo_epi8(v2, v3);
12678
+            VEC_ST(dst + i * 4, _mm_unpacklo_epi8(v1, v0));
12679
+            VEC_ST(dst + i * 4 + 16, _mm_unpackhi_epi8(v1, v0));
12680
+        }
12681
+
12682
+        PRAGMA_E2K("ivdep")
12683
+        for (; i < width; i++) {
12684
+            *(uint32_t*)(dst + i * 4) =
12685
+                usrc[i] | ysrc[i * 2] << 8 |
12686
+                vsrc[i] << 16 | ysrc[i * 2 + 1] << 24;
12687
+        }
12688
+
12689
+        if (y & 1) {
12690
+            usrc += chromStride;
12691
+            vsrc += chromStride;
12692
+        }
12693
+        ysrc += lumStride;
12694
+        dst += dstStride;
12695
+    }
12696
+    return srcSliceH;
12697
+}
12698
+
12699
+av_cold void ff_get_unscaled_swscale_e2k(SwsContext *c)
12700
+{
12701
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_E2K))
12702
+        return;
12703
+
12704
+    if (c->flags & SWS_BITEXACT)
12705
+        return;
12706
+
12707
+    if (c->srcFormat == AV_PIX_FMT_YUV420P) {
12708
+        enum AVPixelFormat dstFormat = c->dstFormat;
12709
+        switch (dstFormat) {
12710
+        case AV_PIX_FMT_YUYV422:
12711
+            c->swscale = yv12toyuy2_unscaled_e2k;
12712
+            break;
12713
+        case AV_PIX_FMT_UYVY422:
12714
+            c->swscale = yv12touyvy_unscaled_e2k;
12715
+            break;
12716
+        }
12717
+    }
12718
+}
12719
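
For reference, not part of the patch: the two unscaled converters above differ only in the byte order emitted per pair of luma samples; on the little-endian e2k target the 32-bit stores in the scalar tail loops spell the layouts out.

/* Per two horizontal luma samples (sharing one U and one V sample):
 *   yv12toyuy2 (YUYV422): output bytes are Y0 U Y1 V
 *   yv12touyvy (UYVY422): output bytes are U Y0 V Y1
 * The vector path builds the same pattern: _mm_unpacklo_epi8(U, V) yields
 * U V U V ..., and interleaving that with the 16 luma bytes (luma first for
 * YUYV, chroma first for UYVY) produces 32 packed output bytes per iteration. */
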
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
12720
index 9cb7e8f..7760885 100644
12721
--- a/libswscale/swscale.c
12722
+++ b/libswscale/swscale.c
12723
@@ -586,6 +586,8 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
12724
 
12725
     if (ARCH_PPC)
12726
         ff_sws_init_swscale_ppc(c);
12727
+    if (ARCH_E2K)
12728
+        ff_sws_init_swscale_e2k(c);
12729
     if (ARCH_X86)
12730
         ff_sws_init_swscale_x86(c);
12731
     if (ARCH_AARCH64)
12732
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
12733
index ee46092..8dcffb2 100644
12734
--- a/libswscale/swscale_internal.h
12735
+++ b/libswscale/swscale_internal.h
12736
@@ -31,7 +31,9 @@
12737
 #include "libavutil/log.h"
12738
 #include "libavutil/pixfmt.h"
12739
 #include "libavutil/pixdesc.h"
12740
+#if HAVE_ALTIVEC
12741
 #include "libavutil/ppc/util_altivec.h"
12742
+#endif
12743
 
12744
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
12745
 
12746
@@ -639,6 +641,7 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);
12747
 
12748
 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
12749
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
12750
+SwsFunc ff_yuv2rgb_init_e2k(SwsContext *c);
12751
 
12752
 static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
12753
 {
12754
@@ -853,6 +856,7 @@ extern const AVClass ff_sws_context_class;
12755
  */
12756
 void ff_get_unscaled_swscale(SwsContext *c);
12757
 void ff_get_unscaled_swscale_ppc(SwsContext *c);
12758
+void ff_get_unscaled_swscale_e2k(SwsContext *c);
12759
 void ff_get_unscaled_swscale_arm(SwsContext *c);
12760
 void ff_get_unscaled_swscale_aarch64(SwsContext *c);
12761
 
12762
@@ -873,6 +877,7 @@ void ff_sws_init_output_funcs(SwsContext *c,
12763
                               yuv2anyX_fn *yuv2anyX);
12764
 void ff_sws_init_swscale_ppc(SwsContext *c);
12765
 void ff_sws_init_swscale_vsx(SwsContext *c);
12766
+void ff_sws_init_swscale_e2k(SwsContext *c);
12767
 void ff_sws_init_swscale_x86(SwsContext *c);
12768
 void ff_sws_init_swscale_aarch64(SwsContext *c);
12769
 void ff_sws_init_swscale_arm(SwsContext *c);
12770
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
12771
index 5fb572b..cacb11c 100644
12772
--- a/libswscale/swscale_unscaled.c
12773
+++ b/libswscale/swscale_unscaled.c
12774
@@ -2172,6 +2172,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
12775
 
12776
     if (ARCH_PPC)
12777
         ff_get_unscaled_swscale_ppc(c);
12778
+    if (ARCH_E2K)
12779
+        ff_get_unscaled_swscale_e2k(c);
12780
      if (ARCH_ARM)
12781
          ff_get_unscaled_swscale_arm(c);
12782
     if (ARCH_AARCH64)
12783
diff --git a/libswscale/utils.c b/libswscale/utils.c
12784
index 111062e..db58be1 100644
12785
--- a/libswscale/utils.c
12786
+++ b/libswscale/utils.c
12787
@@ -51,6 +51,7 @@
12788
 #include "libavutil/pixdesc.h"
12789
 #include "libavutil/aarch64/cpu.h"
12790
 #include "libavutil/ppc/cpu.h"
12791
+#include "libavutil/e2k/cpu.h"
12792
 #include "libavutil/x86/asm.h"
12793
 #include "libavutil/x86/cpu.h"
12794
 
12795
@@ -600,6 +601,14 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
12796
             filterAlign = 1;
12797
     }
12798
 
12799
+    if (E2K_BASE(cpu_flags)) {
12800
+        if (minFilterSize < 5)
12801
+            filterAlign = 4;
12802
+
12803
+        if (minFilterSize < 3)
12804
+            filterAlign = 1;
12805
+    }
12806
+
12807
     if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
12808
         // special case for unscaled vertical filtering
12809
         if (minFilterSize == 1 && filterAlign == 2)
12810
@@ -1679,6 +1688,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
12811
         {
12812
             const int filterAlign = X86_MMX(cpu_flags)     ? 4 :
12813
                                     PPC_ALTIVEC(cpu_flags) ? 8 :
12814
+                                    E2K_BASE(cpu_flags)    ? 8 :
12815
                                     have_neon(cpu_flags)   ? 8 : 1;
12816
 
12817
             if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
12818
@@ -1706,6 +1716,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
12819
     {
12820
         const int filterAlign = X86_MMX(cpu_flags)     ? 2 :
12821
                                 PPC_ALTIVEC(cpu_flags) ? 8 :
12822
+                                E2K_BASE(cpu_flags)    ? 8 :
12823
                                 have_neon(cpu_flags)   ? 2 : 1;
12824
 
12825
         if ((ret = initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,
12826
@@ -1790,6 +1801,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
12827
             cpucaps = "MMX";
12828
         else if (PPC_ALTIVEC(cpu_flags))
12829
             cpucaps = "AltiVec";
12830
+        else if (E2K_BASE(cpu_flags))
12831
+            cpucaps = "Elbrus";
12832
         else
12833
             cpucaps = "C";
12834
 
12835
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
12836
index 5884625..26d41fe 100644
12837
--- a/libswscale/yuv2rgb.c
12838
+++ b/libswscale/yuv2rgb.c
12839
@@ -682,6 +682,8 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
12840
 
12841
     if (ARCH_PPC)
12842
         t = ff_yuv2rgb_init_ppc(c);
12843
+    if (ARCH_E2K)
12844
+        t = ff_yuv2rgb_init_e2k(c);
12845
     if (ARCH_X86)
12846
         t = ff_yuv2rgb_init_x86(c);
12847
 
12848
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
12849
index 899f68b..3768647 100644
12850
--- a/tests/checkasm/checkasm.c
12851
+++ b/tests/checkasm/checkasm.c
12852
@@ -213,6 +213,8 @@ static const struct {
12853
     { "ALTIVEC",  "altivec",  AV_CPU_FLAG_ALTIVEC },
12854
     { "VSX",      "vsx",      AV_CPU_FLAG_VSX },
12855
     { "POWER8",   "power8",   AV_CPU_FLAG_POWER8 },
12856
+#elif ARCH_E2K
12857
+    { "E2K",      "e2k",      AV_CPU_FLAG_E2K },
12858
 #elif ARCH_X86
12859
     { "MMX",      "mmx",      AV_CPU_FLAG_MMX|AV_CPU_FLAG_CMOV },
12860
     { "MMXEXT",   "mmxext",   AV_CPU_FLAG_MMXEXT },
12861
diff --git a/tests/checkasm/huffyuvdsp.c b/tests/checkasm/huffyuvdsp.c
12862
index 8392022..d5eba83 100644
12863
--- a/tests/checkasm/huffyuvdsp.c
12864
+++ b/tests/checkasm/huffyuvdsp.c
12865
@@ -24,10 +24,14 @@
12866
 #include "libavutil/intreadwrite.h"
12867
 #include "libavutil/mem.h"
12868
 
12869
-#include "libavcodec/huffyuvdsp.h"
12870
-
12871
 #include "checkasm.h"
12872
 
12873
+/* Short defines (B,G,R,A) in "huffyuvdsp.h" cause problems for Elbrus (e2k)
12874
+ * system includes, so this header file must be included after "checkasm.h".
12875
+ * Ilya Kurdyukov <jpegqs@gmail.com>
12876
+ */
12877
+#include "libavcodec/huffyuvdsp.h"
12878
+
12879
 #define randomize_buffers(buf, size)     \
12880
     do {                                 \
12881
         int j;                           \
12882
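
A purely hypothetical illustration of the clash described in the comment above, not part of the patch (the values and the example declaration are invented; only the mechanism matters).

/* huffyuvdsp.h defines single-letter macros along these lines: */
#define B 0
#define G 1
#define R 2
#define A 3
/* Any system or intrinsics header included after that which uses one of
 * these letters as a parameter or field name, e.g.
 *     static int do_something(int A, int B);
 * is rewritten by the preprocessor to "int 3, int 0" and fails to compile.
 * Including checkasm.h (and the system headers it pulls in) first keeps the
 * short macros from leaking into them. */
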
-- 
12883
2.17.1
12884
(-)a/.gear/ffmpeg.spec (-2 / +14 lines)
Lines 36-42 Link Here
36
%def_enable libcdio
36
%def_enable libcdio
37
%def_enable libcodec2
37
%def_enable libcodec2
38
%def_enable libdav1d
38
%def_enable libdav1d
39
%ifarch %e2k
40
%def_disable libdc1394
41
%else
39
%def_enable libdc1394
42
%def_enable libdc1394
43
%endif
40
%def_enable libdrm
44
%def_enable libdrm
41
%def_disable libflite
45
%def_disable libflite
42
%def_enable libfontconfig
46
%def_enable libfontconfig
Lines 127-133 Link Here
127
131
128
# nvidia cuda doesn't support arm
132
# nvidia cuda doesn't support arm
129
# https://developer.nvidia.com/nvidia-video-codec-sdk/download
133
# https://developer.nvidia.com/nvidia-video-codec-sdk/download
130
%ifarch %arm
134
%ifarch %arm %e2k
131
%def_disable cuvid
135
%def_disable cuvid
132
%else
136
%else
133
%def_enable cuvid
137
%def_enable cuvid
Lines 146-152 Link Here
146
Name:		ffmpeg
150
Name:		ffmpeg
147
Epoch:		2
151
Epoch:		2
148
Version:	4.3.2
152
Version:	4.3.2
149
Release:	alt1
153
Release:	alt2
150
154
151
Summary:	A command line toolbox to manipulate, convert and stream multimedia content
155
Summary:	A command line toolbox to manipulate, convert and stream multimedia content
152
License:	GPLv3
156
License:	GPLv3
Lines 157-162 Url: http://ffmpeg.org Link Here
157
# https://git.ffmpeg.org/ffmpeg.git
161
# https://git.ffmpeg.org/ffmpeg.git
158
Source:		%name-%version.tar
162
Source:		%name-%version.tar
159
Patch:		%name-%version-%release.patch
163
Patch:		%name-%version-%release.patch
164
Patch2000: %name-e2k-simd.patch
160
BuildRequires:	libX11-devel libXext-devel libXvMC-devel libXfixes-devel
165
BuildRequires:	libX11-devel libXext-devel libXvMC-devel libXfixes-devel
161
BuildRequires:	libalsa-devel
166
BuildRequires:	libalsa-devel
162
%ifarch %ix86 x86_64
167
%ifarch %ix86 x86_64
Lines 558-563 This package contains static development files for libswscale. Link Here
558
%prep
563
%prep
559
%setup
564
%setup
560
%patch -p1
565
%patch -p1
566
%ifarch %e2k
567
%patch2000 -p1
568
%endif
561
569
562
%build
570
%build
563
xz Changelog
571
xz Changelog
Lines 863-868 xz Changelog Link Here
863
%endif
871
%endif
864
872
865
%changelog
873
%changelog
874
* Tue Apr 13 2021 Ilya Kurdyukov <ilyakurdyukov@altlinux.org> 2:4.3.2-alt2
875
- added SIMD patch for Elbrus
876
- disable cuvid and libdc1394 on Elbrus
877
866
* Sun Mar 28 2021 Anton Farygin <rider@altlinux.org> 2:4.3.2-alt1
878
* Sun Mar 28 2021 Anton Farygin <rider@altlinux.org> 2:4.3.2-alt1
867
- 4.3.2
879
- 4.3.2
868
880
(-)a/.gear/rules (-1 / +1 lines)
Lines 1-3 Link Here
1
tar: n@version@:.
1
tar: n@version@:.
2
diff: n@version@:. .
2
diff: n@version@:. .
3
copy?: .gear/*.patch
3
spec: .gear/ffmpeg.spec
4
spec: .gear/ffmpeg.spec
4
- 
