diff --git a/configure b/configure index d7a3f507e8..9b7435ec79 100755 --- a/configure +++ b/configure @@ -207,6 +207,7 @@ External library support: --disable-bzlib disable bzlib [autodetect] --disable-coreimage disable Apple CoreImage framework [autodetect] --enable-chromaprint enable audio fingerprinting with chromaprint [no] + --disable-epoxy disable epoxy [autodetect] --enable-frei0r enable frei0r video filtering [no] --enable-gcrypt enable gcrypt, needed for rtmp(t)e support if openssl, librtmp or gmp is not used [no] @@ -279,6 +280,7 @@ External library support: if openssl, gnutls or mbedtls is not used [no] --enable-libtwolame enable MP2 encoding via libtwolame [no] --enable-libuavs3d enable AVS3 decoding via libuavs3d [no] + --disable-libudev disable libudev [autodetect] --enable-libv4l2 enable libv4l2/v4l-utils [no] --enable-libvidstab enable video stabilization using vid.stab [no] --enable-libvmaf enable vmaf filter via libvmaf [no] @@ -340,12 +342,17 @@ External library support: --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] --enable-libnpp enable Nvidia Performance Primitives-based code [no] --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] + --enable-rpi enable other rpi specific stuff [no] + --enable-sand enable sand video formats [rpi] + --enable-vout-drm enable the vout_drm module - for internal testing only [no] + --enable-vout-egl enable the vout_egl module - for internal testing only [no] --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] --disable-nvenc disable Nvidia video encoding code [autodetect] --enable-omx enable OpenMAX IL code [no] --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] --enable-rkmpp enable Rockchip Media Process Platform code [no] --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] + --enable-v4l2-request enable V4L2 request API code [no] --disable-vaapi disable Video Acceleration API (mainly 
Unix/Intel) code [autodetect] --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] --disable-videotoolbox disable VideoToolbox code [autodetect] @@ -1703,7 +1710,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST=" avfoundation bzlib coreimage + epoxy iconv + libudev libxcb libxcb_shm libxcb_shape @@ -1868,7 +1877,10 @@ HWACCEL_LIBRARY_LIST=" mmal omx opencl + v4l2_request vulkan + rpi4_8 + rpi4_10 " DOCUMENT_LIST=" @@ -1884,12 +1896,16 @@ FEATURE_LIST=" gray hardcoded_tables omx_rpi + rpi runtime_cpudetect safe_bitstream_reader + sand shared small static swscale_alpha + vout_drm + vout_egl " # this list should be kept in linking order @@ -1930,6 +1946,7 @@ SUBSYSTEM_LIST=" pixelutils network rdft + rpi " # COMPONENT_LIST needs to come last to ensure correct dependency checking @@ -2416,9 +2433,11 @@ CONFIG_EXTRA=" rangecoder riffdec riffenc + rpi rtpdec rtpenc_chain rv34dsp + sand scene_sad sinewin snappy @@ -2750,6 +2769,8 @@ hap_decoder_select="snappy texturedsp" hap_encoder_deps="libsnappy" hap_encoder_select="texturedspenc" hevc_decoder_select="atsc_a53 bswapdsp cabac golomb hevcparse videodsp" +hevc_rpi_decoder_deps="rpi" +hevc_rpi_decoder_select="hevc_decoder sand" huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" hymt_decoder_select="huffyuv_decoder" @@ -2919,6 +2940,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" ffnvcodec_deps_any="libdl LoadLibrary" nvdec_deps="ffnvcodec" +v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" vaapi_x11_deps="xlib" videotoolbox_hwaccel_deps="videotoolbox pthreads" videotoolbox_hwaccel_extralibs="-framework QuartzCore" @@ -2960,6 +2982,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" hevc_dxva2_hwaccel_select="hevc_decoder" hevc_nvdec_hwaccel_deps="nvdec" hevc_nvdec_hwaccel_select="hevc_decoder" 
+hevc_v4l2request_hwaccel_deps="v4l2_request" +hevc_v4l2request_hwaccel_select="hevc_decoder" +hevc_rpi4_10_hwaccel_deps="rpi" +hevc_rpi4_10_hwaccel_select="hevc_decoder" +hevc_rpi4_8_hwaccel_deps="rpi" +hevc_rpi4_8_hwaccel_select="hevc_decoder" hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" hevc_vaapi_hwaccel_select="hevc_decoder" hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" @@ -3437,8 +3465,13 @@ sndio_indev_deps="sndio" sndio_outdev_deps="sndio" v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_indev_suggest="libv4l2" +v4l2_outdev_deps="libdrm" v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" v4l2_outdev_suggest="libv4l2" +vout_drm_outdev_deps="libdrm" +vout_egl_outdev_deps="xlib epoxy" +vout_rpi_outdev_deps="rpi" +vout_rpi_outdev_select="sand" vfwcap_indev_deps="vfw32 vfwcap_defines" xcbgrab_indev_deps="libxcb" xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" @@ -3657,6 +3690,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" tonemap_opencl_filter_deps="opencl const_nan" transpose_opencl_filter_deps="opencl" transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" +unsand_filter_select="sand" unsharp_opencl_filter_deps="opencl" uspp_filter_deps="gpl avcodec" vaguedenoiser_filter_deps="gpl" @@ -6154,6 +6188,12 @@ check_func_headers glob.h glob enabled xlib && check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext +enabled libudev && + check_pkg_config libudev libudev libudev.h udev_new + +enabled epoxy && + check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version + check_headers direct.h check_headers dirent.h check_headers dxgidebug.h @@ -6491,11 +6531,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || die "ERROR: mbedTLS not found"; } enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } 
-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || +( enabled rpi || + enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && add_ldflags -L/opt/vc/lib/ && - check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || + check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || die "ERROR: mmal not found" && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do @@ -6536,8 +6577,16 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r { enabled libdrm || die "ERROR: rkmpp requires --enable-libdrm"; } } +enabled v4l2_request && { enabled libdrm || + die "ERROR: v4l2-request requires --enable-libdrm"; } && + { enabled libudev || + die "ERROR: v4l2-request requires libudev"; } enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init +enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; } + +enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } && + { enabled xlib || die "ERROR: vout_egl requires xlib"; } if enabled gcrypt; then GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" @@ -6617,6 +6666,8 @@ if enabled v4l2_m2m; then check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" fi +check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns +check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" check_headers 
sys/videoio.h test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c index 46bb014de8..0502ff71b8 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -2186,8 +2186,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) ifilter->channel_layout != frame->channel_layout; break; case AVMEDIA_TYPE_VIDEO: - need_reinit |= ifilter->width != frame->width || - ifilter->height != frame->height; + need_reinit |= ifilter->width != av_frame_cropped_width(frame) || + ifilter->height != av_frame_cropped_height(frame); break; } @@ -2198,6 +2198,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data)) need_reinit = 1; + if (no_cvt_hw && fg->graph) + need_reinit = 0; + if (need_reinit) { ret = ifilter_parameters_from_frame(ifilter, frame); if (ret < 0) @@ -2466,8 +2469,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ decoded_frame->top_field_first = ist->top_field_first; ist->frames_decoded++; - - if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); if (err < 0) goto fail; @@ -2671,7 +2673,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo case AVMEDIA_TYPE_VIDEO: ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt, &decode_failed); - if (!repeating || !pkt || got_output) { + // Pi: Do not inc dts if no_cvt_hw set + // V4L2 H264 decode has long latency and sometimes spits out a long + // stream of output without input. In this case incrementing DTS is wrong. 
+ // There may be cases where the condition as written is correct so only + // "fix" in the cases which cause problems + if (!repeating || !pkt || (got_output && !no_cvt_hw)) { if (pkt && pkt->duration) { duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q); } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) { @@ -2895,6 +2902,16 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat } else { const HWAccel *hwaccel = NULL; int i; + + if (no_cvt_hw) { + config = avcodec_get_hw_config(s->codec, 0); + if (config && config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) { /* config is NULL when the codec exposes no hw config at index 0 */ + av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p); + ist->hwaccel_pix_fmt = *p; + break; + } + } + for (i = 0; hwaccels[i].name; i++) { if (hwaccels[i].pix_fmt == *p) { hwaccel = &hwaccels[i]; @@ -2990,6 +3007,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) return ret; } +#if CONFIG_HEVC_RPI_DECODER + ret = -1; + if (strcmp(codec->name, "hevc_rpi") == 0 && + (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { + ist->dec = codec = avcodec_find_decoder_by_name("hevc"); + av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); + } + if (ret < 0) +#endif if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { if (ret == AVERROR_EXPERIMENTAL) abort_codec_experimental(codec, 0); diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index 606f2afe0c..448cd2e009 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -61,6 +61,7 @@ enum HWAccelID { HWACCEL_GENERIC, HWACCEL_VIDEOTOOLBOX, HWACCEL_QSV, + HWACCEL_RPI, }; typedef struct HWAccel { @@ -611,6 +612,7 @@ extern int video_sync_method; extern float frame_drop_threshold; extern int do_benchmark; extern int do_benchmark_all; +extern int no_cvt_hw; extern int do_deinterlace; extern int do_hex_dump; extern int do_pkt_dump; diff --git a/fftools/ffmpeg_filter.c
b/fftools/ffmpeg_filter.c index 4ab769c07b..5cdc3a7b6c 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1160,8 +1160,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) ifilter->format = frame->format; - ifilter->width = frame->width; - ifilter->height = frame->height; + ifilter->width = av_frame_cropped_width(frame); + ifilter->height = av_frame_cropped_height(frame); ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; ifilter->sample_rate = frame->sample_rate; diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c index fc4a5d31d6..cc69dce40e 100644 --- a/fftools/ffmpeg_hw.c +++ b/fftools/ffmpeg_hw.c @@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type) char *name; size_t index_pos; int index, index_limit = 1000; + if (!type_name) + return NULL; index_pos = strlen(type_name); name = av_malloc(index_pos + 4); if (!name) diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c index 807e783422..456d4f349b 100644 --- a/fftools/ffmpeg_opt.c +++ b/fftools/ffmpeg_opt.c @@ -133,12 +133,22 @@ static const char *const opt_name_enc_time_bases[] = {"enc_time_base" }\ } +#if CONFIG_RPI +static int rpi_init(AVCodecContext *avctx) { + return 0; +} +#endif + const HWAccel hwaccels[] = { #if CONFIG_VIDEOTOOLBOX { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, #endif #if CONFIG_LIBMFX { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV }, +#endif +#if CONFIG_RPI + { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, + { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, #endif { 0 }, }; @@ -158,6 +168,7 @@ float frame_drop_threshold = 0; int do_deinterlace = 0; int do_benchmark = 0; int do_benchmark_all = 0; +int no_cvt_hw = 0; int do_hex_dump = 0; int do_pkt_dump = 0; int copy_ts = 0; @@ -3499,6 +3510,8 @@ const OptionDef options[] = { "add timings for benchmarking" }, { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, "add timings for each task" 
}, + { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, + "do not auto-convert hw frames to sw" }, { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, "write program-readable progress information", "url" }, { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 33a280cf69..ef22d26dc1 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ mediacodec.h \ packet.h \ qsv.h \ + rpi_zc.h \ vaapi.h \ vdpau.h \ version.h \ @@ -140,6 +141,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o OBJS-$(CONFIG_QSVENC) += qsvenc.o OBJS-$(CONFIG_RANGECODER) += rangecoder.o OBJS-$(CONFIG_RDFT) += rdft.o +OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o OBJS-$(CONFIG_RV34DSP) += rv34dsp.o OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o OBJS-$(CONFIG_SINEWIN) += sinewin.o @@ -154,7 +156,10 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o OBJS-$(CONFIG_VP3DSP) += vp3dsp.o OBJS-$(CONFIG_VP56DSP) += vp56dsp.o OBJS-$(CONFIG_VP8DSP) += vp8dsp.o -OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o +OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\ + weak_link.o +OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\ + v4l2_req_devscan.o weak_link.o OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o @@ -403,6 +408,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) += qsvdec.o OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \ hevc_data.o OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o +OBJS-$(CONFIG_RPI) += rpi_mem.o \ + rpi_mailbox.o rpi_zc.o +OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ + rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ + rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ + rpi_hevc_shader.o rpi_hevc_shader_template.o \ + rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ + rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o 
OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o @@ -941,6 +954,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o +OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o +OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\ + v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o @@ -1297,3 +1314,31 @@ $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h endif + +ifdef CONFIG_HEVC_RPI_DECODER +QASM_PY := ../local/bin/qasm.py +VASMVIDCORE := ../local/bin/vasmvidcore_std + +ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm + $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ + +$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm + $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ +endif + +ifneq ("$(wildcard $(VASMVIDCORE))","") +$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s + $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ +$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s + $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ + +$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin + python pi-util/make_array.py $< +$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin + python pi-util/make_array.py $< +endif + +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h 
+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_hevc_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h # object names must match those listed in OBJS-$(CONFIG_HEVC_RPI_DECODER) +endif diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 954461f81d..7078dc6089 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -35,6 +35,8 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o # subsystems NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o +NEON-OBJS-$(CONFIG_BLOCKDSP) += aarch64/blockdsp_init_aarch64.o \ + aarch64/blockdsp_neon.o NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o @@ -44,10 +46,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o -NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o +NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ + aarch64/simple_idct_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o +NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders diff --git a/libavcodec/aarch64/blockdsp_init_aarch64.c b/libavcodec/aarch64/blockdsp_init_aarch64.c new file mode 100644 index 0000000000..9f3280f007 --- /dev/null +++ b/libavcodec/aarch64/blockdsp_init_aarch64.c @@ -0,0 +1,42 @@ +/* + * AArch64 NEON optimised block operations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> /* int16_t used by the prototypes below */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/blockdsp.h" + +void ff_clear_block_neon(int16_t *block); +void ff_clear_blocks_neon(int16_t *blocks); + +av_cold void ff_blockdsp_init_aarch64(BlockDSPContext *c) /* installs the NEON block-clear routines into c when the CPU flags report NEON */ +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + c->clear_block = ff_clear_block_neon; + c->clear_blocks = ff_clear_blocks_neon; + } +} diff --git a/libavcodec/aarch64/blockdsp_neon.S b/libavcodec/aarch64/blockdsp_neon.S new file mode 100644 index 0000000000..e4a4959ccc --- /dev/null +++ b/libavcodec/aarch64/blockdsp_neon.S @@ -0,0 +1,43 @@ +/* + * AArch64 NEON optimised block operations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +function ff_clear_block_neon, export=1 + movi v0.16b, #0 + movi v1.16b, #0 + st1 {v0.16b, v1.16b}, [x0], #32 + st1 {v0.16b, v1.16b}, [x0], #32 + st1 {v0.16b, v1.16b}, [x0], #32 + st1 {v0.16b, v1.16b}, [x0] + ret +endfunc + +function ff_clear_blocks_neon, export=1 + movi v0.16b, #0 + movi v1.16b, #0 + .rept 23 + st1 {v0.16b, v1.16b}, [x0], #32 + .endr + st1 {v0.16b, v1.16b}, [x0] + ret +endfunc diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c index 742a3372e3..eec21aa5a2 100644 --- a/libavcodec/aarch64/idctdsp_init_aarch64.c +++ b/libavcodec/aarch64/idctdsp_init_aarch64.c @@ -27,19 +27,29 @@ #include "libavcodec/idctdsp.h" #include "idct.h" +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); - if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEAUTO || - avctx->idct_algo == FF_IDCT_SIMPLENEON) { - c->idct_put = ff_simple_idct_put_neon; - c->idct_add = ff_simple_idct_add_neon; - c->idct = ff_simple_idct_neon; - c->perm_type = FF_IDCT_PERM_PARTTRANS; + if (have_neon(cpu_flags)) { + if (!avctx->lowres && !high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = 
ff_simple_idct_neon; + c->perm_type = FF_IDCT_PERM_PARTTRANS; + } } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; } } diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S new file mode 100644 index 0000000000..7f47611206 --- /dev/null +++ b/libavcodec/aarch64/idctdsp_neon.S @@ -0,0 +1,130 @@ +/* + * IDCT AArch64 NEON optimisations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// Clamp 16-bit signed block coefficients to unsigned 8-bit +// On entry: +// x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit results +// x2 = row stride for results, bytes +function ff_put_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + st1 {v0.8b}, [x1], x2 + sqxtun v0.8b, v5.8h + st1 {v1.8b}, [x1], x2 + sqxtun v1.8b, v6.8h + st1 {v2.8b}, [x1], x2 + sqxtun v2.8b, v7.8h + st1 {v3.8b}, [x1], x2 + st1 {v4.8b}, [x1], x2 + st1 {v0.8b}, [x1], x2 + st1 {v1.8b}, [x1], x2 + st1 {v2.8b}, [x1] + ret +endfunc + +// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) +// On entry: +// x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit results +// x2 = row stride for results, bytes +function ff_put_signed_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + movi v4.8b, #128 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + sqxtn v5.8b, v16.8h + add v0.8b, v0.8b, v4.8b + sqxtn v6.8b, v17.8h + add v1.8b, v1.8b, v4.8b + sqxtn v7.8b, v18.8h + add v2.8b, v2.8b, v4.8b + sqxtn v16.8b, v19.8h + add v3.8b, v3.8b, v4.8b + st1 {v0.8b}, [x1], x2 + add v0.8b, v5.8b, v4.8b + st1 {v1.8b}, [x1], x2 + add v1.8b, v6.8b, v4.8b + st1 {v2.8b}, [x1], x2 + add v2.8b, v7.8b, v4.8b + st1 {v3.8b}, [x1], x2 + add v3.8b, v16.8b, v4.8b + st1 {v0.8b}, [x1], x2 + st1 {v1.8b}, [x1], x2 + st1 {v2.8b}, [x1], x2 + st1 {v3.8b}, [x1] + ret +endfunc + +// Add 16-bit signed block coefficients to unsigned 8-bit +// On entry: +// 
x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit input and results +// x2 = row stride for 8-bit input and results, bytes +function ff_add_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + mov x3, x1 + ld1 {v4.8b}, [x1], x2 + ld1 {v5.8b}, [x1], x2 + ld1 {v6.8b}, [x1], x2 + ld1 {v7.8b}, [x1], x2 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] + uaddw v0.8h, v0.8h, v4.8b + uaddw v1.8h, v1.8h, v5.8b + uaddw v2.8h, v2.8h, v6.8b + ld1 {v4.8b}, [x1], x2 + uaddw v3.8h, v3.8h, v7.8b + ld1 {v5.8b}, [x1], x2 + sqxtun v0.8b, v0.8h + ld1 {v6.8b}, [x1], x2 + sqxtun v1.8b, v1.8h + ld1 {v7.8b}, [x1] + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + uaddw v4.8h, v16.8h, v4.8b + st1 {v0.8b}, [x3], x2 + uaddw v0.8h, v17.8h, v5.8b + st1 {v1.8b}, [x3], x2 + uaddw v1.8h, v18.8h, v6.8b + st1 {v2.8b}, [x3], x2 + uaddw v2.8h, v19.8h, v7.8b + sqxtun v4.8b, v4.8h + sqxtun v0.8b, v0.8h + st1 {v3.8b}, [x3], x2 + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + st1 {v4.8b}, [x3], x2 + st1 {v0.8b}, [x3], x2 + st1 {v1.8b}, [x3], x2 + st1 {v2.8b}, [x3] + ret +endfunc diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c index 13dfd74940..161d5a972b 100644 --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -21,10 +21,28 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/aarch64/cpu.h" +#include "libavutil/intreadwrite.h" #include "libavcodec/vc1dsp.h" #include "config.h" +void ff_vc1_inv_trans_8x8_neon(int16_t *block); +void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + +void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void 
ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, @@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); + +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) +{ + /* Dealing with starting and stopping, and removing escape bytes, are + * comparatively less time-sensitive, so are more clearly expressed using + * a C wrapper around the assembly inner loop. Note that we assume a + * little-endian machine that supports unaligned loads. 
*/ + int dsize = 0; + while (size >= 4) + { + int found = 0; + while (!found && (((uintptr_t) dst) & 7) && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + if (!found) + { + int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); + dst += skip; + src += skip; + size -= skip; + dsize += skip; + while (!found && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + } + if (found) + { + *dst++ = *src++; + *dst++ = *src++; + ++src; + size -= 3; + dsize += 2; + } + } + while (size > 0) + { + *dst++ = *src++; + --size; + ++dsize; + } + return dsize; +} + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) { int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { + dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; + dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; + dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; + dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; + dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; + dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; + dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; + dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; + dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; + dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; + + dsp->vc1_unescape_buffer = 
vc1_unescape_buffer_neon; } } diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S new file mode 100644 index 0000000000..529c21d285 --- /dev/null +++ b/libavcodec/aarch64/vc1dsp_neon.S @@ -0,0 +1,1552 @@ +/* + * VC1 AArch64 NEON optimisations + * + * Copyright (c) 2022 Ben Avison + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// VC-1 8x8 inverse transform +// On entry: +// x0 -> array of 16-bit inverse transform coefficients, in column-major order +// On exit: +// array at x0 updated to hold transformed block; also now held in row-major order +function ff_vc1_inv_trans_8x8_neon, export=1 + ld1 {v1.16b, v2.16b}, [x0], #32 + ld1 {v3.16b, v4.16b}, [x0], #32 + ld1 {v5.16b, v6.16b}, [x0], #32 + shl v1.8h, v1.8h, #2 // 8/2 * src[0] + sub x1, x0, #3*32 + ld1 {v16.16b, v17.16b}, [x0] + shl v7.8h, v2.8h, #4 // 16 * src[8] + shl v18.8h, v2.8h, #2 // 4 * src[8] + shl v19.8h, v4.8h, #4 // 16 * src[24] + ldr d0, .Lcoeffs_it8 + shl v5.8h, v5.8h, #2 // 8/2 * src[32] + shl v20.8h, v6.8h, #4 // 16 * src[40] + shl v21.8h, v6.8h, #2 // 4 * src[40] + shl v22.8h, v17.8h, #4 // 16 * src[56] + ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] + mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] + sub 
v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] + ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] + sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] + shl v3.8h, v3.8h, #3 // 16/2 * src[16] + mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] + ssra v1.8h, v1.8h, #1 // 12/2 * src[0] + ssra v5.8h, v5.8h, #1 // 12/2 * src[32] + mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] + shl v21.8h, v16.8h, #3 // 16/2 * src[48] + mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] + sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] + mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] + add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] + sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] + mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] + mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] + add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 + sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 + mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] + add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 + add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 + mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] + sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 + add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 + mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] + sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 + sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 + neg v3.8h, v7.8h // -t1 + neg v4.8h, v20.8h // +t2 + neg v6.8h, v19.8h // +t3 + ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 + ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 + neg v7.8h, v18.8h // +t4 + ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 + ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 + ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 + ssra 
v17.8h, v7.8h, #1 // (t8 + t4) >> 1 + ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 + ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 + srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 + srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 + srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 + srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 + srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3 + srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 + srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 + srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 + trn2 v17.8h, v3.8h, v4.8h + trn2 v18.8h, v5.8h, v6.8h + trn2 v19.8h, v2.8h, v1.8h + trn2 v20.8h, v7.8h, v16.8h + trn1 v21.4s, v17.4s, v18.4s + trn2 v17.4s, v17.4s, v18.4s + trn1 v18.4s, v19.4s, v20.4s + trn2 v19.4s, v19.4s, v20.4s + trn1 v3.8h, v3.8h, v4.8h + trn2 v4.2d, v21.2d, v18.2d + trn1 v20.2d, v17.2d, v19.2d + trn1 v5.8h, v5.8h, v6.8h + trn1 v1.8h, v2.8h, v1.8h + trn1 v2.8h, v7.8h, v16.8h + trn1 v6.2d, v21.2d, v18.2d + trn2 v7.2d, v17.2d, v19.2d + shl v16.8h, v20.8h, #4 // 16 * src[24] + shl v17.8h, v4.8h, #4 // 16 * src[40] + trn1 v18.4s, v3.4s, v5.4s + trn1 v19.4s, v1.4s, v2.4s + shl v21.8h, v7.8h, #4 // 16 * src[56] + shl v22.8h, v6.8h, #2 // 4 * src[8] + shl v23.8h, v4.8h, #2 // 4 * src[40] + trn2 v3.4s, v3.4s, v5.4s + trn2 v1.4s, v1.4s, v2.4s + shl v2.8h, v6.8h, #4 // 16 * src[8] + sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] + ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] + sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] + trn1 v22.2d, v18.2d, v19.2d + trn2 v18.2d, v18.2d, v19.2d + trn1 v19.2d, v3.2d, v1.2d + ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] + mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] + shl v21.8h, v22.8h, #2 // 8/2 * src[0] + shl v18.8h, v18.8h, #2 // 8/2 * src[32] + mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] + shl v6.8h, v19.8h, #3 // 16/2 * src[16] + trn2 v1.2d, v3.2d, v1.2d + mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * 
src[56] + ssra v21.8h, v21.8h, #1 // 12/2 * src[0] + ssra v18.8h, v18.8h, #1 // 12/2 * src[32] + mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] + shl v19.8h, v1.8h, #3 // 16/2 * src[48] + mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] + add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] + mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] + sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] + sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] + mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] + mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] + add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 + add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 + mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] + sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 + neg v21.8h, v17.8h // +t2 + mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] + sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 + neg v4.8h, v5.8h // +t3 + sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 + sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 + neg v24.8h, v16.8h // +t4 + add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 + add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 + ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 + neg v3.8h, v2.8h // -t1 + ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 + ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 + ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 + srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 + srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 + srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 + srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 + srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 + srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7 + srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 + srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 + srshr v16.8h, v23.8h, #6 // (t8 
- t4 + 65) >> 7 + srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 + st1 {v2.16b, v3.16b}, [x1], #32 + srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 + srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 + st1 {v4.16b, v5.16b}, [x1], #32 + st1 {v16.16b, v17.16b}, [x1], #32 + st1 {v0.16b, v1.16b}, [x1] + ret +endfunc + +// VC-1 8x4 inverse transform +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> array of 16-bit inverse transform coefficients, in row-major order +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_8x4_neon, export=1 + ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 + mov x3, x0 + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] + ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector + ld1 {v5.8b}, [x0], x1 + trn2 v6.4h, v1.4h, v3.4h + trn2 v7.4h, v2.4h, v4.4h + trn1 v1.4h, v1.4h, v3.4h + trn1 v2.4h, v2.4h, v4.4h + trn2 v3.4h, v16.4h, v18.4h + trn2 v4.4h, v17.4h, v19.4h + trn1 v16.4h, v16.4h, v18.4h + trn1 v17.4h, v17.4h, v19.4h + ld1 {v18.8b}, [x0], x1 + trn1 v19.2s, v6.2s, v3.2s + trn2 v3.2s, v6.2s, v3.2s + trn1 v6.2s, v7.2s, v4.2s + trn2 v4.2s, v7.2s, v4.2s + trn1 v7.2s, v1.2s, v16.2s + trn1 v20.2s, v2.2s, v17.2s + shl v21.4h, v19.4h, #4 // 16 * src[1] + trn2 v1.2s, v1.2s, v16.2s + shl v16.4h, v3.4h, #4 // 16 * src[3] + trn2 v2.2s, v2.2s, v17.2s + shl v17.4h, v6.4h, #4 // 16 * src[5] + ld1 {v22.8b}, [x0], x1 + shl v23.4h, v4.4h, #4 // 16 * src[7] + mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] + ld1 {v25.8b}, [x0] + shl v26.4h, v19.4h, #2 // 4 * src[1] + shl v27.4h, v6.4h, #2 // 4 * src[5] + ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] + ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] + sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] + sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] + shl v7.4h, v7.4h, #2 // 8/2 * src[0] + shl v20.4h, v20.4h, #2 // 8/2 * src[4] + mla v21.4h, v3.4h, 
v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] + shl v1.4h, v1.4h, #3 // 16/2 * src[2] + mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] + ssra v7.4h, v7.4h, #1 // 12/2 * src[0] + mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] + ssra v20.4h, v20.4h, #1 // 12/2 * src[4] + mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] + shl v3.4h, v2.4h, #3 // 16/2 * src[6] + mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] + mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] + mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] + sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] + mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] + add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] + mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] + sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] + neg v6.4h, v21.4h // -t1 + add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 + sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 + add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 + sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 + add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 + add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 + sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 + sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 + neg v3.4h, v17.4h // +t2 + neg v4.4h, v16.4h // +t3 + neg v28.4h, v23.4h // +t4 + ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 + ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 + ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 + ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 + ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 + ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 + ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 + ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 + trn1 v1.2d, v7.2d, v1.2d + trn1 v2.2d, v20.2d, v2.2d + trn1 v3.2d, v24.2d, v27.2d + trn1 
v4.2d, v19.2d, v26.2d + srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 + srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 + srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 + srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 + trn2 v6.8h, v1.8h, v2.8h + trn1 v1.8h, v1.8h, v2.8h + trn2 v2.8h, v3.8h, v4.8h + trn1 v3.8h, v3.8h, v4.8h + trn2 v4.4s, v6.4s, v2.4s + trn1 v7.4s, v1.4s, v3.4s + trn2 v1.4s, v1.4s, v3.4s + mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] + trn1 v2.4s, v6.4s, v2.4s + mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] + mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] + mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] + mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] + mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] + add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] + sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] + neg v2.8h, v3.8h // -t4/2 + neg v6.8h, v4.8h // -t3/2 + ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 + ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 + ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 + ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 + srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 + srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 + srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 + srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 + uaddw v0.8h, v0.8h, v5.8b + uaddw v1.8h, v1.8h, v18.8b + uaddw v2.8h, v2.8h, v22.8b + uaddw v3.8h, v3.8h, v25.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.8b}, [x3], x1 + st1 {v1.8b}, [x3], x1 + st1 {v2.8b}, [x3], x1 + st1 {v3.8b}, [x3] + ret +endfunc + +// VC-1 4x8 inverse transform +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) +// On exit: +// array at x0 updated by saturated addition of (narrowed) 
transformed block +function ff_vc1_inv_trans_4x8_neon, export=1 + mov x3, #16 + ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector + mov x4, x0 + ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 + ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 + ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 + ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 + ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 + ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 + ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 + ld1 {v4.d}[1], [x2] // 70 71 72 73 + ld1 {v5.s}[0], [x0], x1 + ld1 {v6.s}[0], [x0], x1 + ld1 {v7.s}[0], [x0], x1 + trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 + trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 + trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 + trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72 + ld1 {v4.s}[0], [x0], x1 + trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 + trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 + trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 + mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] + ld1 {v5.s}[1], [x0], x1 + mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] + ld1 {v6.s}[1], [x0], x1 + trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 + mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] + ld1 {v7.s}[1], [x0], x1 + mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] + ld1 {v4.s}[1], [x0] + mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] + mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] + add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] + sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] + neg v3.8h, v16.8h // -t3/2 + ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 + neg v18.8h, v17.8h // -t4/2 + ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 + ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 + ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 + srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3 + srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3 + srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3 + srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4)
>> 3 + trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 + trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 + trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 + trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 + trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 + trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 + trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 + mov d18, v3.d[1] // 50 51 52 53 + shl v19.4h, v3.4h, #4 // 16 * src[8] + mov d20, v16.d[1] // 70 71 72 73 + shl v21.4h, v16.4h, #4 // 16 * src[24] + mov d22, v17.d[1] // 40 41 42 43 + shl v23.4h, v3.4h, #2 // 4 * src[8] + shl v24.4h, v18.4h, #4 // 16 * src[40] + shl v25.4h, v20.4h, #4 // 16 * src[56] + shl v26.4h, v18.4h, #2 // 4 * src[40] + trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 + ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] + sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] + shl v17.4h, v17.4h, #2 // 8/2 * src[0] + sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] + shl v22.4h, v22.4h, #2 // 8/2 * src[32] + mov d23, v1.d[1] // 60 61 62 63 + ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] + mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] + shl v1.4h, v1.4h, #3 // 16/2 * src[16] + mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] + ssra v17.4h, v17.4h, #1 // 12/2 * src[0] + mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] + ssra v22.4h, v22.4h, #1 // 12/2 * src[32] + mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] + shl v3.4h, v23.4h, #3 // 16/2 * src[48] + mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] + mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] + mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] + add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] + sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] + sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * 
src[0] - 12/2 * src[32] + mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] + mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] + add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 + mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] + sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 + add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 + sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 + neg v23.4h, v24.4h // +t2 + sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 + add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 + neg v17.4h, v21.4h // +t3 + sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 + add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 + neg v16.4h, v19.4h // -t1 + neg v27.4h, v2.4h // +t4 + ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 + srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 + ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 + srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 + ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 + srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 + ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 + srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 + trn1 v0.2d, v20.2d, v0.2d + trn1 v2.2d, v18.2d, v22.2d + trn1 v3.2d, v25.2d, v3.2d + trn1 v1.2d, v26.2d, v1.2d + srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 + srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 + srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 + srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 + uaddw v0.8h, v0.8h, v5.8b + uaddw v2.8h, v2.8h, v6.8b + uaddw v3.8h, v3.8h, v7.8b + uaddw v1.8h, v1.8h, v4.8b + sqxtun v0.8b, v0.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v1.8b, v1.8h + st1 {v0.s}[0], [x4], x1 + st1 {v2.s}[0], [x4], x1 + st1 {v3.s}[0], [x4], x1 + st1 {v1.s}[0], [x4], x1 + st1 {v0.s}[1], [x4], x1 + st1 {v2.s}[1], [x4], x1 + st1 {v3.s}[1], [x4], x1 + st1 {v1.s}[1], [x4] + ret +endfunc + +// 
VC-1 4x4 inverse transform +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_4x4_neon, export=1 + mov x3, #16 + ldr d0, .Lcoeffs_it4 + mov x4, x0 + ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 + ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 + ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 + ld1 {v4.d}[0], [x2] // 30 31 32 33 + ld1 {v5.s}[0], [x0], x1 + ld1 {v5.s}[1], [x0], x1 + ld1 {v6.s}[0], [x0], x1 + trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 + trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 + ld1 {v6.s}[1], [x0] + trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 + trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 + trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 + trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 + trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 + trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 + mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] + mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] + mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] + mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] + mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] + mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] + add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] + sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] + neg v7.4h, v3.4h // -t3/2 + neg v16.4h, v4.4h // -t4/2 + ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 + ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 + ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 + ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 + srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3 + srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3 + srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3 + srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3 + trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 + trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 + trn2 v3.4h, v2.4h, v4.4h // 12
13 32 33 + trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 + trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 + trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 + trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 + trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 + mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] + mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] + mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] + mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] + mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] + mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] + add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] + sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] + neg v3.4h, v2.4h // -t4/2 + neg v7.4h, v4.4h // -t3/2 + ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 + ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 + ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 + ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 + trn1 v0.2d, v4.2d, v3.2d + trn1 v1.2d, v2.2d, v7.2d + srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 + srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 + uaddw v0.8h, v0.8h, v5.8b + uaddw v1.8h, v1.8h, v6.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + st1 {v0.s}[0], [x4], x1 + st1 {v0.s}[1], [x4], x1 + st1 {v1.s}[0], [x4], x1 + st1 {v1.s}[1], [x4] + ret +endfunc + +// VC-1 8x8 inverse transform, DC case +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> 16-bit inverse transform DC coefficient +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_8x8_dc_neon, export=1 + ldrsh w2, [x2] + mov x3, x0 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + add w2, w2, w2, lsl #1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.8b}, [x0], x1 + add w2, w2, #1 + ld1 {v5.8b}, [x0], x1 + asr w2, w2, #1 + ld1 {v6.8b}, [x0], x1 + add w2, w2, w2, lsl #1 + ld1 {v7.8b}, [x0] + add w0, w2, #16 + asr w0, w0, #5 + dup v16.8h, w0 + uaddw 
v0.8h, v16.8h, v0.8b + uaddw v1.8h, v16.8h, v1.8b + uaddw v2.8h, v16.8h, v2.8b + uaddw v3.8h, v16.8h, v3.8b + uaddw v4.8h, v16.8h, v4.8b + uaddw v5.8h, v16.8h, v5.8b + sqxtun v0.8b, v0.8h + uaddw v6.8h, v16.8h, v6.8b + sqxtun v1.8b, v1.8h + uaddw v7.8h, v16.8h, v7.8b + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + st1 {v0.8b}, [x3], x1 + sqxtun v0.8b, v5.8h + st1 {v1.8b}, [x3], x1 + sqxtun v1.8b, v6.8h + st1 {v2.8b}, [x3], x1 + sqxtun v2.8b, v7.8h + st1 {v3.8b}, [x3], x1 + st1 {v4.8b}, [x3], x1 + st1 {v0.8b}, [x3], x1 + st1 {v1.8b}, [x3], x1 + st1 {v2.8b}, [x3] + ret +endfunc + +// VC-1 8x4 inverse transform, DC case +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> 16-bit inverse transform DC coefficient +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_8x4_dc_neon, export=1 + ldrsh w2, [x2] + mov x3, x0 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + add w2, w2, w2, lsl #1 + ld1 {v3.8b}, [x0] + add w0, w2, #1 + asr w0, w0, #1 + add w0, w0, w0, lsl #4 + add w0, w0, #64 + asr w0, w0, #7 + dup v4.8h, w0 + uaddw v0.8h, v4.8h, v0.8b + uaddw v1.8h, v4.8h, v1.8b + uaddw v2.8h, v4.8h, v2.8b + uaddw v3.8h, v4.8h, v3.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.8b}, [x3], x1 + st1 {v1.8b}, [x3], x1 + st1 {v2.8b}, [x3], x1 + st1 {v3.8b}, [x3] + ret +endfunc + +// VC-1 4x8 inverse transform, DC case +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> 16-bit inverse transform DC coefficient +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_4x8_dc_neon, export=1 + ldrsh w2, [x2] + mov x3, x0 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 + ld1 {v2.s}[0], [x0], x1 + add w2, w2, w2, lsl #4 + ld1 
{v3.s}[0], [x0], x1 + add w2, w2, #4 + asr w2, w2, #3 + add w2, w2, w2, lsl #1 + ld1 {v0.s}[1], [x0], x1 + add w2, w2, #16 + asr w2, w2, #5 + dup v4.8h, w2 + ld1 {v1.s}[1], [x0], x1 + ld1 {v2.s}[1], [x0], x1 + ld1 {v3.s}[1], [x0] + uaddw v0.8h, v4.8h, v0.8b + uaddw v1.8h, v4.8h, v1.8b + uaddw v2.8h, v4.8h, v2.8b + uaddw v3.8h, v4.8h, v3.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.s}[0], [x3], x1 + st1 {v1.s}[0], [x3], x1 + st1 {v2.s}[0], [x3], x1 + st1 {v3.s}[0], [x3], x1 + st1 {v0.s}[1], [x3], x1 + st1 {v1.s}[1], [x3], x1 + st1 {v2.s}[1], [x3], x1 + st1 {v3.s}[1], [x3] + ret +endfunc + +// VC-1 4x4 inverse transform, DC case +// On entry: +// x0 -> array of 8-bit samples, in row-major order +// x1 = row stride for 8-bit sample array +// x2 -> 16-bit inverse transform DC coefficient +// On exit: +// array at x0 updated by saturated addition of (narrowed) transformed block +function ff_vc1_inv_trans_4x4_dc_neon, export=1 + ldrsh w2, [x2] + mov x3, x0 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 + ld1 {v0.s}[1], [x0], x1 + add w2, w2, w2, lsl #4 + ld1 {v1.s}[1], [x0] + add w0, w2, #4 + asr w0, w0, #3 + add w0, w0, w0, lsl #4 + add w0, w0, #64 + asr w0, w0, #7 + dup v2.8h, w0 + uaddw v0.8h, v2.8h, v0.8b + uaddw v1.8h, v2.8h, v1.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + st1 {v0.s}[0], [x3], x1 + st1 {v1.s}[0], [x3], x1 + st1 {v0.s}[1], [x3], x1 + st1 {v1.s}[1], [x3] + ret +endfunc + +.align 5 +.Lcoeffs_it8: +.quad 0x000F00090003 +.Lcoeffs_it4: +.quad 0x0011000B0005 +.Lcoeffs: +.quad 0x00050002 + +// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter4_neon, export=1 + sub x3, x0, w1, sxtw #2 + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.s}[0], [x0], x1 // P5 + ld1 {v2.s}[0], [x3], 
x1 // P1 + ld1 {v3.s}[0], [x3], x1 // P2 + ld1 {v4.s}[0], [x0], x1 // P6 + ld1 {v5.s}[0], [x3], x1 // P3 + ld1 {v6.s}[0], [x0], x1 // P7 + ld1 {v7.s}[0], [x3] // P4 + ld1 {v16.s}[0], [x0] // P8 + ushll v17.8h, v1.8b, #1 // 2*P5 + dup v18.8h, w2 // pq + ushll v2.8h, v2.8b, #1 // 2*P1 + uxtl v3.8h, v3.8b // P2 + uxtl v4.8h, v4.8b // P6 + uxtl v19.8h, v5.8b // P3 + mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 + uxtl v3.8h, v6.8b // P7 + mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 + ushll v5.8h, v5.8b, #1 // 2*P3 + uxtl v6.8h, v7.8b // P4 + mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 + uxtl v3.8h, v16.8b // P8 + mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 + uxtl v1.8h, v1.8b // P5 + mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 + mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 + sub v3.4h, v6.4h, v1.4h // P4-P5 + mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 + mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 + mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 + abs v4.4h, v3.4h + srshr v7.4h, v17.4h, #3 + srshr v2.4h, v2.4h, #3 + sshr v4.4h, v4.4h, #1 // clip + srshr v5.4h, v5.4h, #3 + abs v7.4h, v7.4h // a2 + sshr v3.4h, v3.4h, #8 // clip_sign + abs v2.4h, v2.4h // a1 + cmeq v16.4h, v4.4h, #0 // test clip == 0 + abs v17.4h, v5.4h // a0 + sshr v5.4h, v5.4h, #8 // a0_sign + cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 + cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq + sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign + bsl v19.8b, v7.8b, v2.8b // a3 + orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq + uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 + mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 + orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 + mov w0, v5.s[1] // move to gp reg + ushr v0.4h, v0.4h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 + cmhs v5.4h, v0.4h, v4.4h + tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered + bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) + bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + sqxtun v0.8b, v6.8h + sqxtun v1.8b, v1.8h + st1 {v0.s}[0], [x3], x1 + st1 {v1.s}[0], [x3] +1: ret +endfunc + +// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter4_neon, export=1 + sub x3, x0, #4 // where to start reading + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 + sub x0, x0, #1 // where to start writing + ld1 {v2.8b}, [x3], x1 + ld1 {v3.8b}, [x3], x1 + ld1 {v4.8b}, [x3] + dup v5.8h, w2 // pq + trn1 v6.8b, v1.8b, v2.8b + trn2 v1.8b, v1.8b, v2.8b + trn1 v2.8b, v3.8b, v4.8b + trn2 v3.8b, v3.8b, v4.8b + trn1 v4.4h, v6.4h, v2.4h // P1, P5 + trn1 v7.4h, v1.4h, v3.4h // P2, P6 + trn2 v2.4h, v6.4h, v2.4h // P3, P7 + trn2 v1.4h, v1.4h, v3.4h // P4, P8 + ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 + uxtl v6.8h, v7.8b // P2, P6 + uxtl v7.8h, v2.8b // P3, P7 + uxtl v1.8h, v1.8b // P4, P8 + mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 + ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 + uxtl v4.8h, v4.8b // P1, P5 + mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 + mov d6, v6.d[1] // P6 + mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 + mov d4, v4.d[1] // P5 + mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 + mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 
+ sub v7.4h, v1.4h, v4.4h // P4-P5 + mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 + srshr v3.8h, v3.8h, #3 + abs v6.4h, v7.4h + sshr v7.4h, v7.4h, #8 // clip_sign + srshr v2.4h, v2.4h, #3 + abs v3.8h, v3.8h // a1, a2 + sshr v6.4h, v6.4h, #1 // clip + mov d16, v3.d[1] // a2 + abs v17.4h, v2.4h // a0 + cmeq v18.4h, v6.4h, #0 // test clip == 0 + sshr v2.4h, v2.4h, #8 // a0_sign + cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 + cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq + sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign + bsl v19.8b, v16.8b, v3.8b // a3 + orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq + uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 + mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 + orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 + mov w2, v5.s[1] // move to gp reg + ushr v0.4h, v0.4h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 + cmhs v5.4h, v0.4h, v6.4h + tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered + bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) + bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + sqxtun v3.8b, v4.8h + sqxtun v2.8b, v1.8h + st2 {v2.b, v3.b}[0], [x0], x1 + st2 {v2.b, v3.b}[1], [x0], x1 + st2 {v2.b, v3.b}[2], [x0], x1 + st2 {v2.b, v3.b}[3], [x0] +1: ret +endfunc + +// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter8_neon, export=1 + sub x3, x0, w1, sxtw #2 + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x0], x1 // P5 + movi v2.2d, #0x0000ffff00000000 + ld1 {v3.8b}, [x3], x1 // P1 + ld1 {v4.8b}, [x3], x1 // P2 + ld1 {v5.8b}, [x0], x1 // P6 + ld1 {v6.8b}, [x3], x1 // P3 + ld1 {v7.8b}, [x0], x1 // P7 + ushll v16.8h, v1.8b, #1 // 2*P5 + ushll v3.8h, v3.8b, #1 // 2*P1 + ld1 {v17.8b}, [x3] // P4 + uxtl v4.8h, v4.8b // P2 + ld1 {v18.8b}, [x0] // P8 + uxtl v5.8h, v5.8b // P6 + dup v19.8h, w2 // pq + uxtl v20.8h, v6.8b // P3 + mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 + uxtl v4.8h, v7.8b // P7 + ushll v6.8h, v6.8b, #1 // 2*P3 + mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 + uxtl v7.8h, v17.8b // P4 + uxtl v17.8h, v18.8b // P8 + mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 + uxtl v1.8h, v1.8b // P5 + mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 + sub v4.8h, v7.8h, v1.8h // P4-P5 + mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 + mls v16.8h, v17.8h, v0.h[0] // 
2*P5-5*P6+5*P7-2*P8 + abs v17.8h, v4.8h // abs(P4-P5) + sshr v4.8h, v4.8h, #8 // clip_sign + mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 + sshr v17.8h, v17.8h, #1 // clip + mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 + srshr v16.8h, v16.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3 + mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 + cmeq v5.8h, v17.8h, #0 // test clip == 0 + srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3 + abs v16.8h, v16.8h // a2 + abs v3.8h, v3.8h // a1 + srshr v6.8h, v6.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3 + cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 + abs v20.8h, v6.8h // a0 + sshr v6.8h, v6.8h, #8 // a0_sign + bsl v18.16b, v16.16b, v3.16b // a3 + cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq + sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign + uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 + orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq + mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 + orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 + cmtst v2.2d, v5.2d, v2.2d // if pair [2] (the 3rd) of each group of 4 is not filtered, then none of the others in the group should be either + mov w0, v5.s[1] // move to gp reg + ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 + mov w2, v5.s[3] + orr v2.16b, v3.16b, v2.16b + cmhs v3.8h, v0.8h, v17.8h + and w0, w0, w2 + bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) + tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case + bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered + mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + sqxtun v0.8b, v7.8h + sqxtun v1.8b, v1.8h + st1 {v0.8b}, [x3], x1 + st1 {v1.8b}, [x3] +1: ret +endfunc + +// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter8_neon, export=1 + sub x3, x0, #4 // where to start reading + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... + sub x0, x0, #1 // where to start writing + ld1 {v2.8b}, [x3], x1 + add x4, x0, x1, lsl #2 + ld1 {v3.8b}, [x3], x1 + ld1 {v4.8b}, [x3], x1 + ld1 {v5.8b}, [x3], x1 + ld1 {v6.8b}, [x3], x1 + ld1 {v7.8b}, [x3], x1 + trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... + ld1 {v17.8b}, [x3] + trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... + trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... + trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... + dup v4.8h, w2 // pq + trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... + trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... + trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... + trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... + trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... + trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... + trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... 
+ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... + trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... + trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... + trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... + trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... + trn1 v7.2s, v6.2s, v3.2s // P1 + trn1 v18.2s, v19.2s, v16.2s // P2 + trn2 v3.2s, v6.2s, v3.2s // P5 + trn2 v6.2s, v19.2s, v16.2s // P6 + trn1 v16.2s, v2.2s, v17.2s // P3 + trn2 v2.2s, v2.2s, v17.2s // P7 + ushll v7.8h, v7.8b, #1 // 2*P1 + trn1 v17.2s, v1.2s, v5.2s // P4 + ushll v19.8h, v3.8b, #1 // 2*P5 + trn2 v1.2s, v1.2s, v5.2s // P8 + uxtl v5.8h, v18.8b // P2 + uxtl v6.8h, v6.8b // P6 + uxtl v18.8h, v16.8b // P3 + mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 + uxtl v2.8h, v2.8b // P7 + ushll v5.8h, v16.8b, #1 // 2*P3 + mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 + uxtl v16.8h, v17.8b // P4 + uxtl v1.8h, v1.8b // P8 + mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 + uxtl v2.8h, v3.8b // P5 + mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 + sub v3.8h, v16.8h, v2.8h // P4-P5 + mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 + mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 + abs v1.8h, v3.8h + sshr v3.8h, v3.8h, #8 // clip_sign + mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 + sshr v1.8h, v1.8h, #1 // clip + mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 + srshr v17.8h, v19.8h, #3 + mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 + cmeq v6.8h, v1.8h, #0 // test clip == 0 + srshr v7.8h, v7.8h, #3 + abs v17.8h, v17.8h // a2 + abs v7.8h, v7.8h // a1 + srshr v5.8h, v5.8h, #3 + cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 + abs v19.8h, v5.8h // a0 + sshr v5.8h, v5.8h, #8 // a0_sign + bsl v18.16b, v17.16b, v7.16b // a3 + cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq + sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign + uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 + orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq + mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 + orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 + mov w2, v5.s[1] // move to gp reg + ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 + mov w3, v5.s[3] + cmhs v5.8h, v0.8h, v1.8h + and w5, w2, w3 + bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) + tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case + bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + sqxtun v1.8b, v2.8h + sqxtun v0.8b, v16.8h + tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so + st2 {v0.b, v1.b}[0], [x0], x1 + st2 {v0.b, v1.b}[1], [x0], x1 + st2 {v0.b, v1.b}[2], [x0], x1 + st2 {v0.b, v1.b}[3], [x0] +1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so + st2 {v0.b, v1.b}[4], [x4], x1 + st2 {v0.b, v1.b}[5], [x4], x1 + st2 {v0.b, v1.b}[6], [x4], x1 + st2 {v0.b, v1.b}[7], [x4] +2: ret +endfunc + +// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks +// On entry: +// x0 -> top-left pel of lower block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter16_neon, export=1 + sub x3, x0, w1, sxtw #2 + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.16b}, [x0], x1 // P5 + movi v2.2d, #0x0000ffff00000000 + ld1 {v3.16b}, [x3], x1 // P1 + ld1 
{v4.16b}, [x3], x1 // P2 + ld1 {v5.16b}, [x0], x1 // P6 + ld1 {v6.16b}, [x3], x1 // P3 + ld1 {v7.16b}, [x0], x1 // P7 + ushll v16.8h, v1.8b, #1 // 2*P5[0..7] + ushll v17.8h, v3.8b, #1 // 2*P1[0..7] + ld1 {v18.16b}, [x3] // P4 + uxtl v19.8h, v4.8b // P2[0..7] + ld1 {v20.16b}, [x0] // P8 + uxtl v21.8h, v5.8b // P6[0..7] + dup v22.8h, w2 // pq + ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] + mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] + ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] + uxtl2 v4.8h, v4.16b // P2[8..15] + mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] + uxtl2 v5.8h, v5.16b // P6[8..15] + uxtl v23.8h, v6.8b // P3[0..7] + uxtl v24.8h, v7.8b // P7[0..7] + mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] + ushll v4.8h, v6.8b, #1 // 2*P3[0..7] + uxtl v25.8h, v18.8b // P4[0..7] + mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] + uxtl2 v26.8h, v6.16b // P3[8..15] + mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] + uxtl2 v7.8h, v7.16b // P7[8..15] + ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] + mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] + uxtl2 v18.8h, v18.16b // P4[8..15] + uxtl v23.8h, v20.8b // P8[0..7] + mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] + uxtl v24.8h, v1.8b // P5[0..7] + uxtl2 v20.8h, v20.16b // P8[8..15] + mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] + uxtl2 v1.8h, v1.16b // P5[8..15] + sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] + mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] + sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] + mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] + abs v27.8h, v26.8h + sshr v26.8h, v26.8h, #8 // clip_sign[0..7] + mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] + abs v28.8h, v7.8h + sshr v27.8h, v27.8h, #1 // clip[0..7] + mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] + sshr v7.8h, v7.8h, #8 // clip_sign[8..15] + sshr v23.8h, v28.8h, #1 // clip[8..15] + mla 
v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] + cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 + srshr v17.8h, v17.8h, #3 + mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] + cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 + srshr v16.8h, v16.8h, #3 + mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] + abs v17.8h, v17.8h // a1[0..7] + mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] + srshr v3.8h, v3.8h, #3 + mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] + abs v16.8h, v16.8h // a2[0..7] + srshr v19.8h, v19.8h, #3 + mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] + cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] + abs v3.8h, v3.8h // a1[8..15] + srshr v4.8h, v4.8h, #3 + abs v19.8h, v19.8h // a2[8..15] + bsl v5.16b, v16.16b, v17.16b // a3[0..7] + srshr v6.8h, v6.8h, #3 + cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15] + abs v17.8h, v4.8h // a0[0..7] + sshr v4.8h, v4.8h, #8 // a0_sign[0..7] + bsl v16.16b, v19.16b, v3.16b // a3[8..15] + uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + abs v19.8h, v6.8h // a0[8..15] + cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq + cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] + sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] + sshr v6.8h, v6.8h, #8 // a0_sign[8..15] + mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 + uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq + cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq + cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] + mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 + sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] + orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] + ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 + orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq + cmtst v17.2d, v5.2d, v2.2d // if pair [2] (the 3rd) of each group of 4 is not filtered, then none of the others in the group should be either + mov w0, v5.s[1] // move to gp reg + cmhs v19.8h, v3.8h, v27.8h // test d[0..7] >= clip[0..7] + ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 + mov w2, v5.s[3] + orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] + orr v16.16b, v20.16b, v17.16b + bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) + cmtst v2.2d, v5.2d, v2.2d // same per-group test for pairs [8..15] + cmhs v3.8h, v0.8h, v23.8h // test d[8..15] >= clip[8..15] + mov w4, v5.s[1] + mov w5, v5.s[3] + and w0, w0, w2 + bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) + orr v2.16b, v7.16b, v2.16b + bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) + mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] + and w2, w4, w5 + bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) + mla v24.8h, v5.8h, v4.8h // invert 
d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] + and w0, w0, w2 + mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] + sqxtun v2.8b, v25.8h + tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case + mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] + sqxtun v0.8b, v24.8h + sqxtun2 v2.16b, v18.8h + sqxtun2 v0.16b, v1.8h + st1 {v2.16b}, [x3], x1 + st1 {v0.16b}, [x3] +1: ret +endfunc + +// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks +// On entry: +// x0 -> top-left pel of right block +// w1 = row stride, bytes +// w2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter16_neon, export=1 + sub x3, x0, #4 // where to start reading + sxtw x1, w1 // technically, stride is signed int + ldr d0, .Lcoeffs + ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... + sub x0, x0, #1 // where to start writing + ld1 {v2.8b}, [x3], x1 + add x4, x0, x1, lsl #3 + ld1 {v3.8b}, [x3], x1 + add x5, x0, x1, lsl #2 + ld1 {v4.8b}, [x3], x1 + add x6, x4, x1, lsl #2 + ld1 {v5.8b}, [x3], x1 + ld1 {v6.8b}, [x3], x1 + ld1 {v7.8b}, [x3], x1 + trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... + ld1 {v17.8b}, [x3], x1 + trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... + ld1 {v2.8b}, [x3], x1 + trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... + ld1 {v19.8b}, [x3], x1 + trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... + ld1 {v4.8b}, [x3], x1 + trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... + ld1 {v21.8b}, [x3], x1 + trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... + ld1 {v6.8b}, [x3], x1 + trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... + ld1 {v23.8b}, [x3], x1 + trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... 
+ ld1 {v17.8b}, [x3], x1 + trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... + ld1 {v25.8b}, [x3] + trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... + trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... + trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... + trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... + trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... + trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... + trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... + trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... + trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... + trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... + trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... + trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... + trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... + trn1 v31.2s, v19.2s, v27.2s // P1[0..7] + trn2 v19.2s, v19.2s, v27.2s // P5[0..7] + trn1 v27.2s, v21.2s, v23.2s // P2[0..7] + trn2 v21.2s, v21.2s, v23.2s // P6[0..7] + trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... + trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... + trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... + trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... + trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... + trn1 v24.2s, v29.2s, v23.2s // P1[8..15] + trn2 v23.2s, v29.2s, v23.2s // P5[8..15] + trn1 v26.2s, v25.2s, v18.2s // P2[8..15] + trn2 v18.2s, v25.2s, v18.2s // P6[8..15] + trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... + trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... + trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... + trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... + trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... 
+ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] + ushll v6.8h, v19.8b, #1 // 2*P5[0..7] + trn1 v7.2s, v16.2s, v20.2s // P3[0..7] + uxtl v17.8h, v27.8b // P2[0..7] + trn2 v16.2s, v16.2s, v20.2s // P7[0..7] + uxtl v20.8h, v21.8b // P6[0..7] + trn1 v21.2s, v22.2s, v25.2s // P3[8..15] + ushll v24.8h, v24.8b, #1 // 2*P1[8..15] + trn2 v22.2s, v22.2s, v25.2s // P7[8..15] + ushll v25.8h, v23.8b, #1 // 2*P5[8..15] + trn1 v27.2s, v1.2s, v3.2s // P4[0..7] + uxtl v26.8h, v26.8b // P2[8..15] + mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] + uxtl v17.8h, v18.8b // P6[8..15] + mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] + trn1 v18.2s, v2.2s, v4.2s // P4[8..15] + uxtl v28.8h, v7.8b // P3[0..7] + mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] + uxtl v16.8h, v16.8b // P7[0..7] + uxtl v26.8h, v21.8b // P3[8..15] + mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] + uxtl v22.8h, v22.8b // P7[8..15] + ushll v7.8h, v7.8b, #1 // 2*P3[0..7] + uxtl v27.8h, v27.8b // P4[0..7] + trn2 v1.2s, v1.2s, v3.2s // P8[0..7] + ushll v3.8h, v21.8b, #1 // 2*P3[8..15] + trn2 v2.2s, v2.2s, v4.2s // P8[8..15] + uxtl v4.8h, v18.8b // P4[8..15] + mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] + uxtl v1.8h, v1.8b // P8[0..7] + mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] + uxtl v2.8h, v2.8b // P8[8..15] + uxtl v16.8h, v19.8b // P5[0..7] + mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] + uxtl v18.8h, v23.8b // P5[8..15] + dup v19.8h, w2 // pq + mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] + sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] + sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] + mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] + abs v23.8h, v21.8h + mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] + abs v26.8h, v22.8h + sshr v21.8h, v21.8h, #8 // clip_sign[0..7] + mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] + sshr v23.8h, v23.8h, #1 // clip[0..7] + sshr v26.8h, v26.8h, 
#1 // clip[8..15] + mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] + sshr v1.8h, v22.8h, #8 // clip_sign[8..15] + cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 + mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] + cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 + srshr v5.8h, v5.8h, #3 + mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] + srshr v2.8h, v6.8h, #3 + mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] + srshr v6.8h, v24.8h, #3 + mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] + abs v5.8h, v5.8h // a1[0..7] + srshr v24.8h, v25.8h, #3 + mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] + abs v2.8h, v2.8h // a2[0..7] + abs v6.8h, v6.8h // a1[8..15] + mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] + abs v17.8h, v24.8h // a2[8..15] + cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] + srshr v3.8h, v3.8h, #3 + cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8..15] + srshr v7.8h, v7.8h, #3 + bsl v20.16b, v2.16b, v5.16b // a3[0..7] + abs v2.8h, v3.8h // a0[8..15] + sshr v3.8h, v3.8h, #8 // a0_sign[8..15] + bsl v24.16b, v17.16b, v6.16b // a3[8..15] + abs v5.8h, v7.8h // a0[0..7] + sshr v6.8h, v7.8h, #8 // a0_sign[0..7] + cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq + sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] + uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] + uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? 
a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq + orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq + sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] + mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 + cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] + orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq + mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 + orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] + orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] + ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 + mov w7, v2.s[1] + mov w8, v2.s[3] + ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 + mov w2, v5.s[1] // move to gp reg + cmhs v2.8h, v3.8h, v26.8h + mov w3, v5.s[3] + cmhs v5.8h, v0.8h, v23.8h + bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) + and w9, w7, w8 + bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) + and w10, w2, w3 + bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) + and w9, w10, w9 + bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) + mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 + tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case + mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 + mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 + sqxtun v2.8b, v4.8h + mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 + sqxtun v0.8b, v27.8h + sqxtun v1.8b, v16.8h + sqxtun v3.8b, v18.8h + tbnz w2, #0, 1f + st2 {v0.b, v1.b}[0], [x0], x1 + st2 {v0.b, v1.b}[1], [x0], x1 + st2 {v0.b, v1.b}[2], [x0], x1 + st2 {v0.b, v1.b}[3], [x0] +1: tbnz w3, #0, 2f + st2 {v0.b, v1.b}[4], [x5], x1 + st2 {v0.b, v1.b}[5], [x5], x1 + st2 {v0.b, v1.b}[6], [x5], x1 + st2 {v0.b, v1.b}[7], [x5] +2: tbnz w7, #0, 3f + st2 {v2.b, v3.b}[0], [x4], x1 + st2 {v2.b, v3.b}[1], [x4], x1 + st2 {v2.b, v3.b}[2], [x4], x1 + st2 {v2.b, v3.b}[3], [x4] +3: tbnz w8, #0, 4f + st2 {v2.b, v3.b}[4], [x6], x1 + st2 {v2.b, v3.b}[5], [x6], x1 + st2 {v2.b, v3.b}[6], [x6], x1 + st2 {v2.b, v3.b}[7], [x6] +4: ret +endfunc + 
+// Copy at most the specified number of bytes from source to destination buffer, +// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence +// On entry: +// x0 -> source buffer +// w1 = max number of bytes to copy +// x2 -> destination buffer, optimally 8-byte aligned +// On exit: +// w0 = number of bytes not copied +function ff_vc1_unescape_buffer_helper_neon, export=1 + // Offset by 80 to screen out cases that are too short for us to handle, + // and also make it easy to test for loop termination, or to determine + // whether we need an odd number of half-iterations of the loop. + subs w1, w1, #80 + b.mi 90f + + // Set up useful constants + movi v20.4s, #3, lsl #24 + movi v21.4s, #3, lsl #16 + + tst w1, #32 + b.ne 1f + + ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48 + ext v25.16b, v0.16b, v1.16b, #1 + ext v26.16b, v0.16b, v1.16b, #2 + ext v27.16b, v0.16b, v1.16b, #3 + ext v29.16b, v1.16b, v2.16b, #1 + ext v30.16b, v1.16b, v2.16b, #2 + ext v31.16b, v1.16b, v2.16b, #3 + bic v24.16b, v0.16b, v20.16b + bic v25.16b, v25.16b, v20.16b + bic v26.16b, v26.16b, v20.16b + bic v27.16b, v27.16b, v20.16b + bic v28.16b, v1.16b, v20.16b + bic v29.16b, v29.16b, v20.16b + bic v30.16b, v30.16b, v20.16b + bic v31.16b, v31.16b, v20.16b + eor v24.16b, v24.16b, v21.16b + eor v25.16b, v25.16b, v21.16b + eor v26.16b, v26.16b, v21.16b + eor v27.16b, v27.16b, v21.16b + eor v28.16b, v28.16b, v21.16b + eor v29.16b, v29.16b, v21.16b + eor v30.16b, v30.16b, v21.16b + eor v31.16b, v31.16b, v21.16b + cmeq v24.4s, v24.4s, #0 + cmeq v25.4s, v25.4s, #0 + cmeq v26.4s, v26.4s, #0 + cmeq v27.4s, v27.4s, #0 + add w1, w1, #32 + b 3f + +1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48 + ext v25.16b, v3.16b, v4.16b, #1 + ext v26.16b, v3.16b, v4.16b, #2 + ext v27.16b, v3.16b, v4.16b, #3 + ext v29.16b, v4.16b, v5.16b, #1 + ext v30.16b, v4.16b, v5.16b, #2 + ext v31.16b, v4.16b, v5.16b, #3 + bic v24.16b, v3.16b, v20.16b + bic v25.16b, v25.16b, v20.16b + bic v26.16b, v26.16b, v20.16b + bic 
v27.16b, v27.16b, v20.16b + bic v28.16b, v4.16b, v20.16b + bic v29.16b, v29.16b, v20.16b + bic v30.16b, v30.16b, v20.16b + bic v31.16b, v31.16b, v20.16b + eor v24.16b, v24.16b, v21.16b + eor v25.16b, v25.16b, v21.16b + eor v26.16b, v26.16b, v21.16b + eor v27.16b, v27.16b, v21.16b + eor v28.16b, v28.16b, v21.16b + eor v29.16b, v29.16b, v21.16b + eor v30.16b, v30.16b, v21.16b + eor v31.16b, v31.16b, v21.16b + cmeq v24.4s, v24.4s, #0 + cmeq v25.4s, v25.4s, #0 + cmeq v26.4s, v26.4s, #0 + cmeq v27.4s, v27.4s, #0 + // Drop through... +2: mov v0.16b, v5.16b + ld1 {v1.16b, v2.16b}, [x0], #32 + cmeq v28.4s, v28.4s, #0 + cmeq v29.4s, v29.4s, #0 + cmeq v30.4s, v30.4s, #0 + cmeq v31.4s, v31.4s, #0 + orr v24.16b, v24.16b, v25.16b + orr v26.16b, v26.16b, v27.16b + orr v28.16b, v28.16b, v29.16b + orr v30.16b, v30.16b, v31.16b + ext v25.16b, v0.16b, v1.16b, #1 + orr v22.16b, v24.16b, v26.16b + ext v26.16b, v0.16b, v1.16b, #2 + ext v27.16b, v0.16b, v1.16b, #3 + ext v29.16b, v1.16b, v2.16b, #1 + orr v23.16b, v28.16b, v30.16b + ext v30.16b, v1.16b, v2.16b, #2 + ext v31.16b, v1.16b, v2.16b, #3 + bic v24.16b, v0.16b, v20.16b + bic v25.16b, v25.16b, v20.16b + bic v26.16b, v26.16b, v20.16b + orr v22.16b, v22.16b, v23.16b + bic v27.16b, v27.16b, v20.16b + bic v28.16b, v1.16b, v20.16b + bic v29.16b, v29.16b, v20.16b + bic v30.16b, v30.16b, v20.16b + bic v31.16b, v31.16b, v20.16b + addv s22, v22.4s + eor v24.16b, v24.16b, v21.16b + eor v25.16b, v25.16b, v21.16b + eor v26.16b, v26.16b, v21.16b + eor v27.16b, v27.16b, v21.16b + eor v28.16b, v28.16b, v21.16b + mov w3, v22.s[0] + eor v29.16b, v29.16b, v21.16b + eor v30.16b, v30.16b, v21.16b + eor v31.16b, v31.16b, v21.16b + cmeq v24.4s, v24.4s, #0 + cmeq v25.4s, v25.4s, #0 + cmeq v26.4s, v26.4s, #0 + cmeq v27.4s, v27.4s, #0 + cbnz w3, 90f + st1 {v3.16b, v4.16b}, [x2], #32 +3: mov v3.16b, v2.16b + ld1 {v4.16b, v5.16b}, [x0], #32 + cmeq v28.4s, v28.4s, #0 + cmeq v29.4s, v29.4s, #0 + cmeq v30.4s, v30.4s, #0 + cmeq v31.4s, v31.4s, #0 + orr v24.16b, 
v24.16b, v25.16b + orr v26.16b, v26.16b, v27.16b + orr v28.16b, v28.16b, v29.16b + orr v30.16b, v30.16b, v31.16b + ext v25.16b, v3.16b, v4.16b, #1 + orr v22.16b, v24.16b, v26.16b + ext v26.16b, v3.16b, v4.16b, #2 + ext v27.16b, v3.16b, v4.16b, #3 + ext v29.16b, v4.16b, v5.16b, #1 + orr v23.16b, v28.16b, v30.16b + ext v30.16b, v4.16b, v5.16b, #2 + ext v31.16b, v4.16b, v5.16b, #3 + bic v24.16b, v3.16b, v20.16b + bic v25.16b, v25.16b, v20.16b + bic v26.16b, v26.16b, v20.16b + orr v22.16b, v22.16b, v23.16b + bic v27.16b, v27.16b, v20.16b + bic v28.16b, v4.16b, v20.16b + bic v29.16b, v29.16b, v20.16b + bic v30.16b, v30.16b, v20.16b + bic v31.16b, v31.16b, v20.16b + addv s22, v22.4s + eor v24.16b, v24.16b, v21.16b + eor v25.16b, v25.16b, v21.16b + eor v26.16b, v26.16b, v21.16b + eor v27.16b, v27.16b, v21.16b + eor v28.16b, v28.16b, v21.16b + mov w3, v22.s[0] + eor v29.16b, v29.16b, v21.16b + eor v30.16b, v30.16b, v21.16b + eor v31.16b, v31.16b, v21.16b + cmeq v24.4s, v24.4s, #0 + cmeq v25.4s, v25.4s, #0 + cmeq v26.4s, v26.4s, #0 + cmeq v27.4s, v27.4s, #0 + cbnz w3, 91f + st1 {v0.16b, v1.16b}, [x2], #32 + subs w1, w1, #64 + b.pl 2b + +90: add w0, w1, #80 + ret + +91: sub w1, w1, #32 + b 90b +endfunc diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 2e9a3581de..d9571b437f 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c @@ -153,6 +153,7 @@ extern AVCodec ff_hap_decoder; extern AVCodec ff_hevc_decoder; extern AVCodec ff_hevc_qsv_decoder; extern AVCodec ff_hevc_rkmpp_decoder; +extern AVCodec ff_hevc_rpi_decoder; extern AVCodec ff_hevc_v4l2m2m_decoder; extern AVCodec ff_hnm4_video_decoder; extern AVCodec ff_hq_hqa_decoder; @@ -917,6 +918,48 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id) } } +/* Return 1 if p publishes no pix_fmts list or lists fmt, else 0. */ +static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) +{ + const enum AVPixelFormat *pf = p->pix_fmts; + + // Assume good if we lack info + if (pf == NULL) + return 1; + if (fmt == AV_PIX_FMT_NONE) + 
return 0; + + for (; *pf != AV_PIX_FMT_NONE; ++pf) { + if (*pf == fmt) + return 1; + } + return 0; +} + +/** + * Find a registered decoder for codec id that supports pixel format fmt. + * + * Decoders that publish no pix_fmts list are assumed to support any format. + * The first experimental match is only remembered as a fallback; any other + * match is returned immediately. Returns NULL if nothing matches. + */ +AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) +{ + const AVCodec *p, *experimental = NULL; + void *i = 0; + + id= remap_deprecated_codec_id(id); + while ((p = av_codec_iterate(&i))) { + if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { + if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { + experimental = p; + } else + return (AVCodec *)p; + } + } + return (AVCodec *)experimental; +} + static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) { const AVCodec *p, *experimental = NULL; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index c4ab93aeeb..cd926f7b33 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -39,6 +39,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ arm/sbrdsp_init_arm.o OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o +OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ + arm/rpi_hevcpred_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o @@ -137,10 +139,23 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ arm/hevcdsp_deblock_neon.o \ arm/hevcdsp_idct_neon.o \ arm/hevcdsp_qpel_neon.o \ arm/hevcdsp_sao_neon.o +NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ + arm/rpi_hevc_misc_neon.o \ + arm/rpi_hevcdsp_deblock_neon.o \ + arm/rpi_hevcdsp_idct_neon.o \ + arm/rpi_hevcdsp_res8_neon.o \ + arm/rpi_hevcdsp_res16_neon.o \ + arm/rpi_hevcdsp_sao_neon.o \ + 
arm/rpi_hevcpred_init_neon.o \ + arm/rpi_hevcpred_intra_angular_neon.o \ + arm/rpi_hevcpred_intra_dc_neon.o \ + arm/rpi_hevcpred_intra_filter_neon.o \ + arm/rpi_hevcpred_intra_hv_neon.o \ + arm/rpi_hevcpred_intra_planar_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h index fdbf86b45e..4755f20e2e 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,83 +26,209 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" + #define get_cabac_inline get_cabac_inline_arm static av_always_inline int get_cabac_inline_arm(CABACContext *c, - uint8_t *const state) + uint8_t *state) { - int bit; - void *reg_b, *reg_c, *tmp; + const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; + int bit, ptr, low, tmp1, tmp2; + __asm__ volatile ( + "ldr %[bit], [%[c], %[range_off]] \n\t" + "ldrb %[ptr], [%[state]] \n\t" + "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" + "and %[tmp2], %[bit], #0xc0 \n\t" + "add %[tmp1], %[tmp1], %[ptr] \n\t" + "ldr %[low], [%[c], %[low_off]] \n\t" + "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" + "sub %[bit], %[bit], %[tmp2] \n\t" + "mov %[tmp1], %[bit] \n\t" + "cmp %[low], %[bit], lsl #17 \n\t" + "itt ge \n\t" + "movge %[tmp1], %[tmp2] \n\t" + "mvnge %[ptr], %[ptr] \n\t" + "clz %[tmp2], %[tmp1] \n\t" + "it ge \n\t" + "subge %[low], %[low], %[bit], lsl #17 \n\t" + "sub %[tmp2], %[tmp2], #23 \n\t" + "and %[bit], %[ptr], #1 \n\t" + "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" + "lsl %[low], %[low], %[tmp2] \n\t" + "lsls %[ptr], %[low], #16 \n\t" + "bne 1f \n\t" + "ldr %[ptr], [%[c], %[ptr_off]] \n\t" + "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" +#if UNCHECKED_BITSTREAM_READER + "strb %[mlps_tables], [%[state]] \n\t" + "rbit %[state], %[low] \n\t" + "ldrh %[tmp1], [%[ptr]], #2 \n\t" +#else + "ldr %[tmp1], [%[c], %[end_off]] \n\t" + "strb %[mlps_tables], 
[%[state]] \n\t" + "rbit %[state], %[low] \n\t" + "cmp %[tmp1], %[ptr] \n\t" +#if CONFIG_THUMB + "it cs \n\t" + "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" +#else + "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" +#endif +#endif + "clz %[state], %[state] \n\t" + "movw %[mlps_tables], #0xffff \n\t" + "sub %[state], %[state], #16 \n\t" + "str %[tmp2], [%[c], %[range_off]] \n\t" + "rev %[tmp1], %[tmp1] \n\t" + "str %[ptr], [%[c], %[ptr_off]] \n\t" + "lsr %[tmp1], %[tmp1], #15 \n\t" + "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" +#if CONFIG_THUMB + "lsl %[tmp1], %[tmp1], %[state] \n\t" + "add %[low], %[low], %[tmp1] \n\t" +#else + "add %[low], %[low], %[tmp1], lsl %[state] \n\t" +#endif + "str %[low], [%[c], %[low_off]] \n\t" + "b 2f \n\t" + "1: \n\t" + "strb %[mlps_tables], [%[state]] \n\t" + "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" + "str %[low], [%[c], %[low_off]] \n\t" + "str %[tmp1], [%[c], %[range_off]] \n\t" + "2: \n\t" + : // Outputs + [state]"+r"(state), + [mlps_tables]"+r"(mlps_tables), + [bit]"=&r"(bit), + [ptr]"=&r"(ptr), + [low]"=&r"(low), + [tmp1]"=&r"(tmp1), + [tmp2]"=&r"(tmp2) + : // Inputs + [c]"r"(c), + [low_off]"J"(offsetof(CABACContext, low)), + [range_off]"J"(offsetof(CABACContext, range)), + [ptr_off]"J"(offsetof(CABACContext, bytestream)), + [end_off]"J"(offsetof(CABACContext, bytestream_end)), + [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) + : // Clobbers + "cc", "memory" + ); + return bit; +} - __asm__ volatile( - "ldrb %[bit] , [%[state]] \n\t" - "add %[r_b] , %[tables] , %[lps_off] \n\t" - "mov %[tmp] , %[range] \n\t" - "and %[range] , %[range] , #0xC0 \n\t" - "add %[r_b] , %[r_b] , %[bit] \n\t" - "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" - "add %[r_b] , %[tables] , %[norm_off] \n\t" - "sub %[r_c] , %[tmp] , %[range] \n\t" - "lsl %[tmp] , %[r_c] , #17 \n\t" - "cmp %[tmp] , %[low] \n\t" - "it gt \n\t" - "movgt %[range] , %[r_c] \n\t" - "itt cc \n\t" - "mvncc %[bit] , %[bit] \n\t" - "subcc %[low] , %[low] , %[tmp] \n\t" - "add %[r_c] , 
%[tables] , %[mlps_off] \n\t" - "ldrb %[tmp] , [%[r_b], %[range]] \n\t" - "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" - "lsl %[low] , %[low] , %[tmp] \n\t" - "lsl %[range] , %[range] , %[tmp] \n\t" - "uxth %[r_c] , %[low] \n\t" - "strb %[r_b] , [%[state]] \n\t" - "tst %[r_c] , %[r_c] \n\t" - "bne 2f \n\t" - "ldr %[r_c] , [%[c], %[byte]] \n\t" +#define get_cabac_bypass get_cabac_bypass_arm +static inline int get_cabac_bypass_arm(CABACContext * const c) +{ + uint32_t low = c->low, range, ptr, tmp; + int rv; + __asm volatile ( + "ldr %[range] , [%[c], %[range_off]] \n\t" + "mov %[rv] , #0 \n\t" + "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" + "lsl %[low] , #1 \n\t" +#if !UNCHECKED_BITSTREAM_READER + "ldr %[tmp] , [%[c], %[end_off]] \n\t" +#endif + "cmp %[low] , %[range], lsl #17 \n\t" + "itt cs \n\t" + "subcs %[low] , %[low], %[range], lsl #17 \n\t" + "movcs %[rv] , #1 \n\t" #if UNCHECKED_BITSTREAM_READER - "ldrh %[tmp] , [%[r_c]] \n\t" - "add %[r_c] , %[r_c] , #2 \n\t" - "str %[r_c] , [%[c], %[byte]] \n\t" + "ldrh %[tmp] , [%[ptr]], #2 \n\t" +#else + "cmp %[tmp] , %[ptr] \n\t" +#if CONFIG_THUMB + "it cs \n\t" + "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" #else - "ldr %[r_b] , [%[c], %[end]] \n\t" - "ldrh %[tmp] , [%[r_c]] \n\t" - "cmp %[r_c] , %[r_b] \n\t" - "itt lt \n\t" - "addlt %[r_c] , %[r_c] , #2 \n\t" - "strlt %[r_c] , [%[c], %[byte]] \n\t" + "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" +#endif #endif - "sub %[r_c] , %[low] , #1 \n\t" - "add %[r_b] , %[tables] , %[norm_off] \n\t" - "eor %[r_c] , %[low] , %[r_c] \n\t" - "rev %[tmp] , %[tmp] \n\t" - "lsr %[r_c] , %[r_c] , #15 \n\t" - "lsr %[tmp] , %[tmp] , #15 \n\t" - "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" - "movw %[r_b] , #0xFFFF \n\t" - "sub %[tmp] , %[tmp] , %[r_b] \n\t" - "rsb %[r_c] , %[r_c] , #7 \n\t" - "lsl %[tmp] , %[tmp] , %[r_c] \n\t" - "add %[low] , %[low] , %[tmp] \n\t" - "2: \n\t" - : [bit]"=&r"(bit), - [low]"+&r"(c->low), - [range]"+&r"(c->range), - [r_b]"=&r"(reg_b), - [r_c]"=&r"(reg_c), - [tmp]"=&r"(tmp) - : [c]"r"(c), - 
[state]"r"(state), - [tables]"r"(ff_h264_cabac_tables), - [byte]"M"(offsetof(CABACContext, bytestream)), - [end]"M"(offsetof(CABACContext, bytestream_end)), - [norm_off]"I"(H264_NORM_SHIFT_OFFSET), - [lps_off]"I"(H264_LPS_RANGE_OFFSET), - [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) - : "memory", "cc" - ); + "lsls %[range] , %[low], #16 \n\t" + "bne 1f \n\t" - return bit & 1; + "str %[ptr] , [%[c], %[ptr_off]] \n\t" + "rev %[tmp] , %[tmp] \n\t" + "add %[low] , %[low], %[tmp], lsr #15 \n\t" + "movw %[tmp] , 0xFFFF \n\t" + "sub %[low] , %[tmp] \n\t" + "1: \n\t" + "str %[low] , [%[c], %[low_off]] \n\t" + : // Outputs + [rv]"=&r"(rv), + [low]"+r"(low), + [range]"=&r"(range), + [ptr]"=&r"(ptr), + [tmp]"=&r"(tmp) + : // Inputs + [c]"r"(c), + [low_off]"J"(offsetof(CABACContext, low)), + [range_off]"J"(offsetof(CABACContext, range)), + [ptr_off]"J"(offsetof(CABACContext, bytestream)), + [end_off]"J"(offsetof(CABACContext, bytestream_end)) + : // Clobbers + "memory", "cc" + ); + return rv; } + + +#define get_cabac_bypass_sign get_cabac_bypass_sign_arm +static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) +{ + uint32_t low = c->low, range, ptr, tmp; + __asm volatile ( + "ldr %[range] , [%[c], %[range_off]] \n\t" + "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" + "lsl %[low] , #1 \n\t" +#if !UNCHECKED_BITSTREAM_READER + "ldr %[tmp] , [%[c], %[end_off]] \n\t" +#endif + "cmp %[low] , %[range], lsl #17 \n\t" + "it cs \n\t" + "subcs %[low] , %[low], %[range], lsl #17 \n\t" + "it cc \n\t" + "rsbcc %[rv] , %[rv], #0 \n\t" +#if UNCHECKED_BITSTREAM_READER + "ldrh %[tmp] , [%[ptr]], #2 \n\t" +#else + "cmp %[tmp] , %[ptr] \n\t" +#if CONFIG_THUMB + "it cs \n\t" + "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" +#else + "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" +#endif +#endif + "lsls %[range] , %[low], #16 \n\t" + "bne 1f \n\t" + + "str %[ptr] , [%[c], %[ptr_off]] \n\t" + "rev %[tmp] , %[tmp] \n\t" + "add %[low] , %[low], %[tmp], lsr #15 \n\t" + "movw %[tmp] , 0xFFFF \n\t" + "sub %[low] , 
%[tmp] \n\t" + "1: \n\t" + "str %[low] , [%[c], %[low_off]] \n\t" + : // Outputs + [rv]"+r"(rv), + [low]"+r"(low), + [range]"=&r"(range), + [ptr]"=&r"(ptr), + [tmp]"=&r"(tmp) + : // Inputs + [c]"r"(c), + [low_off]"J"(offsetof(CABACContext, low)), + [range_off]"J"(offsetof(CABACContext, range)), + [ptr_off]"J"(offsetof(CABACContext, bytestream)), + [end_off]"J"(offsetof(CABACContext, bytestream_end)) + : // Clobbers + "memory", "cc" + ); + return rv; +} + #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h new file mode 100644 index 0000000000..c88dec6eff --- /dev/null +++ b/libavcodec/arm/rpi_hevc_cabac.h @@ -0,0 +1,607 @@ +/* + * This file is part of FFmpeg. + * + * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_HEVC_CABAC_H +#define AVCODEC_ARM_HEVC_CABAC_H + +#include "config.h" +#if HAVE_ARMV6T2_INLINE + +#define hevc_mem_bits32 hevc_mem_bits32_arm +static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) +{ /* Loads the 32-bit word at byte offset (bits >> 3) from p, byte-reverses it with rev, then shifts left by (bits & 7) */ + unsigned int n; + __asm__ ( + "rev %[n], %[x] \n\t" + : [n]"=r"(n) + : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) + : + ); + return n << (bits & 7); +} + + +// --------------------------------------------------------------------------- +// +// Helper fns - little bits of code where ARM has an instruction that the +// compiler doesn't know about / use + +#define trans_scale_sat trans_scale_sat_arm +static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ /* t = ((level * scale * scale_m) >> shift) + 1; ssat then saturates (t >> 1) to a signed 16-bit value */ + int rv; + int t = ((level * (int)(scale * scale_m)) >> shift) + 1; + + __asm__ ( + "ssat %[rv], #16, %[t], ASR #1 \n\t" + : [rv]"=r"(rv) + : [t]"r"(t) + : + ); + return rv; +} + +#define update_rice update_rice_arm +static inline void update_rice_arm(uint8_t * const stat_coeff, + const unsigned int last_coeff_abs_level_remaining, + const unsigned int c_rice_param) +{ /* With t = (2 * last_coeff_abs_level_remaining) >> c_rice_param: *stat_coeff is decremented when t == 0, incremented when t >= 6, and the result is clamped to 0..255 by usat */ + int t = last_coeff_abs_level_remaining << 1; + __asm__ ( + "lsrs %[t], %[t], %[shift] \n\t" + + "it eq \n\t" + "subeq %[stat], %[stat], #1 \n\t" + "cmp %[t], #6 \n\t" + "adc %[stat], %[stat], #0 \n\t" + "usat %[stat], #8, %[stat] \n\t" + : [stat]"+r"(*stat_coeff), + [t]"+r"(t) + : [shift]"r"(c_rice_param) + : "cc" + ); +} + +// --------------------------------------------------------------------------- +// +// CABAC get loops +// +// Where the loop is simple enough we can normally do 10-30% better than the +// compiler + +// Get the residual greater than 1 bits + +#define 
get_cabac_greater1_bits get_cabac_greater1_bits_arm +static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, + uint8_t * const state0) +{ + unsigned int i, reg_b, st, tmp, bit, rv; + __asm__ ( + "mov %[i] , #0 \n\t" + "mov %[rv] , #0 \n\t" + "1: \n\t" + "add %[i] , %[i] , #1 \n\t" + "cmp %[rv] , #0 \n\t" + "ite eq \n\t" + "usateq %[st] , #2 , %[i] \n\t" + "movne %[st] , #0 \n\t" + "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" + "and %[tmp] , %[range] , #0xC0 \n\t" + + "ldrb %[bit] , [%[state0], %[st]] \n\t" + "add %[r_b] , %[r_b] , %[bit] \n\t" + "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" + "sub %[range] , %[range] , %[tmp] \n\t" + + "cmp %[low] , %[range], lsl #17 \n\t" + "ittt ge \n\t" + "subge %[low] , %[low] , %[range], lsl #17 \n\t" + "movge %[range] , %[tmp] \n\t" + "mvnge %[bit] , %[bit] \n\t" + + "clz %[tmp] , %[range] \n\t" + "sub %[tmp] , #23 \n\t" + "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" + "and %[bit] , %[bit] , #1 \n\t" + "strb %[r_b] , [%[state0], %[st]] \n\t" + "lsl %[low] , %[low] , %[tmp] \n\t" + "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" + "lsl %[range] , %[range] , %[tmp] \n\t" + +// There is a small speed gain from combining both conditions, using a single +// branch and then working out what that meant later + "lsls %[tmp] , %[low] , #16 \n\t" + "it ne \n\t" + "cmpne %[n] , %[i] \n\t" + "bne 1b \n\t" + +// If reload is not required then we must have run out of flags to decode + "tst %[tmp] , %[tmp] \n\t" + "bne 2f \n\t" + +// Do reload + "ldrh %[tmp] , [%[bptr]] , #2 \n\t" + "rbit %[bit] , %[low] \n\t" + "movw %[r_b] , #0xFFFF \n\t" + "clz %[bit] , %[bit] \n\t" + "rev %[tmp] , %[tmp] \n\t" + "sub %[bit] , %[bit] , #16 \n\t" + "cmp %[n] , %[i] \n\t" + "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" + +#if CONFIG_THUMB + "lsl %[tmp] , %[tmp] , %[bit] \n\t" + "add %[low] , %[low] , %[tmp] \n\t" +#else + "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" +#endif + + "bne 1b \n\t" + "2: \n\t" + : 
[bit]"=&r"(bit), + [low]"+r"(c->low), + [range]"+r"(c->range), + [r_b]"=&r"(reg_b), + [bptr]"+r"(c->bytestream), + [i]"=&r"(i), + [tmp]"=&r"(tmp), + [st]"=&r"(st), + [rv]"=&r"(rv) + : [state0]"r"(state0), + [n]"r"(n), + [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), + [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) + : "memory", "cc" + ); + return rv; +} + + +// n must be > 0 on entry +#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm +static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, + unsigned int n, + const uint8_t * ctx_map, + uint8_t * p) +{ + unsigned int reg_b, tmp, st, bit; + __asm__ ( +// Get bin from map +#if CONFIG_THUMB + "add %[ctx_map] , %[n] \n\t" + "ldrb %[st] , [%[ctx_map]] \n\t" +#else + "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" +#endif + "1: \n\t" + +// Load state & ranges + "ldrb %[bit] , [%[state0], %[st]] \n\t" + "and %[tmp] , %[range] , #0xC0 \n\t" + "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" + "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" + "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" + "sub %[range] , %[range] , %[tmp] \n\t" + + "cmp %[low] , %[range], lsl #17 \n\t" + "ittt ge \n\t" + "mvnge %[bit] , %[bit] \n\t" + "subge %[low] , %[low] , %[range], lsl #17 \n\t" + "movge %[range] , %[tmp] \n\t" + +// Renorm + "clz %[tmp] , %[range] \n\t" + "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" + "sub %[tmp] , #23 \n\t" + "strb %[r_b] , [%[state0], %[st]] \n\t" + "tst %[bit] , #1 \n\t" + "ldrb %[st] , [%[ctx_map], #-1]! 
\n\t" + "lsl %[low] , %[low] , %[tmp] \n\t" +// GCC asm seems to need strbne written differently for thumb and arm +#if CONFIG_THUMB + "it ne \n\t" + "strbne %[n] , [%[idx]] , #1 \n\t" +#else + "strneb %[n] , [%[idx]] , #1 \n\t" +#endif + +// There is a small speed gain from combining both conditions, using a single +// branch and then working out what that meant later + "subs %[n] , %[n] , #1 \n\t" + "lsl %[range] , %[range] , %[tmp] \n\t" +#if CONFIG_THUMB + "itt ne \n\t" + "lslsne %[tmp] , %[low] , #16 \n\t" +#else + "lslnes %[tmp] , %[low] , #16 \n\t" +#endif + "bne 1b \n\t" + +// If we have bits left then n must be 0 so give up now + "lsls %[tmp] , %[low] , #16 \n\t" + "bne 2f \n\t" + +// Do reload + "ldrh %[tmp] , [%[bptr]] , #2 \n\t" + "rbit %[bit] , %[low] \n\t" + "movw %[r_b] , #0xFFFF \n\t" + "clz %[bit] , %[bit] \n\t" + "cmp %[n] , #0 \n\t" + "rev %[tmp] , %[tmp] \n\t" + "sub %[bit] , %[bit] , #16 \n\t" + "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" + +#if CONFIG_THUMB + "lsl %[tmp] , %[tmp] , %[bit] \n\t" + "add %[low] , %[low] , %[tmp] \n\t" +#else + "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" +#endif + +// Check to see if we still have more to do + "bne 1b \n\t" + "2: \n\t" + : [bit]"=&r"(bit), + [low]"+r"(c->low), + [range]"+r"(c->range), + [r_b]"=&r"(reg_b), + [bptr]"+r"(c->bytestream), + [idx]"+r"(p), + [n]"+r"(n), + [tmp]"=&r"(tmp), + [st]"=&r"(st), + [ctx_map]"+r"(ctx_map) + : [state0]"r"(state0), + [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), + [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) + : "memory", "cc" + ); + + return p; +} + +// --------------------------------------------------------------------------- +// +// CABAC_BY22 functions + + +#define get_cabac_by22_start get_cabac_by22_start_arm +static inline void get_cabac_by22_start_arm(CABACContext * const c) +{ + const uint8_t *ptr = c->bytestream; + register uint32_t low __asm__("r1"), range __asm__("r2"); + uint32_t m, range8, bits; 
+#if !USE_BY22_DIV + uintptr_t inv; +#endif + + av_assert2(offsetof (CABACContext, low) == 0); + av_assert2(offsetof (CABACContext, range) == 4); + av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); + __asm__ volatile ( + "ldmia %[c], {%[low], %[range]} \n\t" + : // Outputs + [low]"=r"(low), + [range]"=r"(range) + : // Inputs + [c]"r"(c) + : // Clobbers + ); +#if !USE_BY22_DIV + inv = (uintptr_t)cabac_by22_inv_range; +#endif + __asm__ volatile ( + "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" +#if !USE_BY22_DIV + "uxtb %[range8], %[range] \n\t" +#endif + "rbit %[bits], %[low] \n\t" + "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" + "clz %[bits], %[bits] \n\t" + "str %[ptr], [%[c], %[ptr_off]] \n\t" + "rev %[m], %[m] \n\t" + "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" + "eor %[m], %[m], #0x80000000 \n\t" +#if !USE_BY22_DIV + "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" + "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" + "str %[range], [%[c], %[bits_off]] \n\t" +#else + "strh %[bits], [%[c], %[bits_off]] \n\t" +#endif +#if CONFIG_THUMB + "lsr %[m], %[ptr] \n\t" + "eor %[range], %[low], %[m] \n\t" +#else + "eor %[range], %[low], %[m], lsr %[ptr] \n\t" +#endif + : // Outputs + [ptr]"+&r"(ptr), + [low]"+&r"(low), + [range]"+&r"(range), +#if !USE_BY22_DIV + [inv]"+&r"(inv), +#endif + [m]"=&r"(m), + [range8]"=&r"(range8), + [bits]"=&r"(bits) + : // Inputs + [c]"r"(c), + [bits_off]"J"(offsetof (CABACContext, by22.bits)), + [ptr_off]"J"(offsetof (CABACContext, bytestream)) + : // Clobbers + "memory" + ); + c->low = range; +#if !USE_BY22_DIV + c->range = inv; +#endif +} + +#define get_cabac_by22_peek get_cabac_by22_peek_arm +static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) +{ + uint32_t rv = c->low &~ 1, tmp; + __asm__ ( + "cmp %[inv] , #0 \n\t" + "it ne \n\t" + "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" + : // Outputs + [rv]"+r"(rv), + [tmp]"=r"(tmp) + : // 
Inputs + [inv]"r"(c->range) + : // Clobbers + "cc" + ); + return rv << 1; +} + +#define get_cabac_by22_flush get_cabac_by22_flush_arm +static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) +{ + uint32_t bits, ptr, tmp1, tmp2; + __asm__ volatile ( + "ldrh %[bits], [%[cc], %[bits_off]] \n\t" + "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" + "rsb %[tmp1], %[n], #32 \n\t" + "add %[bits], %[bits], %[n] \n\t" + "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" + "lsr %[tmp1], %[val], %[tmp1] \n\t" + "ldr %[val], [%[cc], %[low_off]] \n\t" +#if CONFIG_THUMB + "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" + "ldr %[ptr], [%[ptr]] \n\t" +#else + "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" +#endif + "mul %[tmp1], %[tmp2], %[tmp1] \n\t" + "and %[tmp2], %[bits], #7 \n\t" + "strh %[bits], [%[cc], %[bits_off]] \n\t" + "rev %[ptr], %[ptr] \n\t" + "lsl %[tmp1], %[tmp1], #23 \n\t" +#if CONFIG_THUMB + "lsl %[val], %[n] \n\t" + "sub %[val], %[tmp1] \n\t" +#else + "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" +#endif + "lsl %[ptr], %[ptr], %[tmp2] \n\t" + "orr %[val], %[val], %[ptr], lsr #9 \n\t" + "str %[val], [%[cc], %[low_off]] \n\t" + : // Outputs + [val]"+r"(val), + [bits]"=&r"(bits), + [ptr]"=&r"(ptr), + [tmp1]"=&r"(tmp1), + [tmp2]"=&r"(tmp2) + : // Inputs + [cc]"r"(c), + [n]"r"(n), + [bits_off]"J"(offsetof(CABACContext, by22.bits)), + [ptr_off]"J"(offsetof(CABACContext, bytestream)), + [range_off]"J"(offsetof(CABACContext, by22.range)), + [low_off]"J"(offsetof(CABACContext, low)) + : // Clobbers + "memory" + ); +} + +#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm +static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) +{ + uint32_t last_coeff_abs_level_remaining; + uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; + __asm__ volatile ( + "ldr %[remain], [%[cc], %[low_off]] \n\t" + "ldr %[prefix], [%[cc], %[range_off]] \n\t" + "bic %[remain], %[remain], 
#1 \n\t" + "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" + "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" + "cmp %[prefix], #0 \n\t" + "it ne \n\t" + "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" + "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" + "lsl %[remain], %[remain], #1 \n\t" + "mvn %[prefix], %[remain] \n\t" + "clz %[prefix], %[prefix] \n\t" + "rsbs %[n1], %[prefix], #2 \n\t" + "bcc 1f \n\t" + "adc %[n1], %[rice], %[prefix] \n\t" + "add %[tmp2], %[tmp2], %[n1] \n\t" + "rsb %[n2], %[n1], #32 \n\t" + "and %[tmp1], %[tmp2], #7 \n\t" + "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" + "lsr %[tmp2], %[tmp2], #3 \n\t" + "lsr %[n2], %[remain], %[n2] \n\t" + "mul %[n2], %[range], %[n2] \n\t" + "ldr %[range], [%[cc], %[low_off]] \n\t" + "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" + "rsb %[tmp2], %[rice], #31 \n\t" + "lsl %[remain], %[remain], %[prefix] \n\t" + "lsl %[n2], %[n2], #23 \n\t" +#if CONFIG_THUMB + "lsl %[range], %[n1] \n\t" + "sub %[range], %[n2] \n\t" +#else + "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" +#endif + "rev %[ptr], %[ptr] \n\t" + "lsl %[n2], %[prefix], %[rice] \n\t" +#if CONFIG_THUMB + "lsr %[remain], %[tmp2] \n\t" + "add %[remain], %[n2] \n\t" +#else + "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" +#endif + "b 3f \n\t" + "1: \n\t" + "add %[n2], %[rice], %[prefix], lsl #1 \n\t" + "cmp %[n2], %[peek_bits_plus_2] \n\t" + "bhi 2f \n\t" + "sub %[n1], %[n2], #2 \n\t" + "add %[tmp2], %[tmp2], %[n1] \n\t" + "rsb %[n2], %[n1], #32 \n\t" + "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" + "lsr %[tmp1], %[tmp2], #3 \n\t" + "lsr %[n2], %[remain], %[n2] \n\t" + "mul %[n2], %[range], %[n2] \n\t" + "rsb %[range], %[rice], #34 \n\t" + "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" + "and %[tmp1], %[tmp2], #7 \n\t" + "lsl %[remain], %[remain], %[prefix] \n\t" + "ldr %[tmp2], [%[cc], %[low_off]] \n\t" + "rsb %[prefix], %[prefix], %[range] \n\t" + "orr %[remain], %[remain], #0x80000000 \n\t" + "rev %[ptr], %[ptr] \n\t" + "lsl %[n2], %[n2], #23 \n\t" + "mov 
%[range], #2 \n\t" +#if CONFIG_THUMB + "lsl %[tmp2], %[n1] \n\t" + "sub %[tmp2], %[n2] \n\t" +#else + "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" +#endif + "lsl %[ptr], %[ptr], %[tmp1] \n\t" + "lsl %[rice], %[range], %[rice] \n\t" + "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" +#if CONFIG_THUMB + "lsr %[remain], %[prefix] \n\t" + "add %[remain], %[rice] \n\t" +#else + "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" +#endif + "b 4f \n\t" + "2: \n\t" + "add %[n1], %[tmp2], %[prefix] \n\t" +#if CONFIG_THUMB + "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" + "ldr %[tmp2], [%[tmp2]] \n\t" +#else + "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" +#endif + "rsb %[tmp1], %[prefix], #32 \n\t" + "push {%[rice]} \n\t" + "and %[rice], %[n1], #7 \n\t" + "lsr %[tmp1], %[remain], %[tmp1] \n\t" + "ldr %[ptr], [%[cc], %[low_off]] \n\t" + "mul %[remain], %[range], %[tmp1] \n\t" + "rev %[tmp2], %[tmp2] \n\t" + "rsb %[n2], %[prefix], %[n2] \n\t" + "ldr %[tmp1], [%[cc], %[range_off]] \n\t" + "lsl %[rice], %[tmp2], %[rice] \n\t" + "sub %[tmp2], %[n2], #2 \n\t" + "lsl %[remain], %[remain], #23 \n\t" +#if CONFIG_THUMB + "lsl %[ptr], %[prefix] \n\t" + "rsb %[remain], %[ptr] \n\t" +#else + "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" +#endif + "orr %[remain], %[remain], %[rice], lsr #9 \n\t" + "add %[prefix], %[n1], %[tmp2] \n\t" + "bic %[n1], %[remain], #1 \n\t" + "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" + "cmp %[tmp1], #0 \n\t" + "rsb %[rice], %[tmp2], #32 \n\t" + "it ne \n\t" + "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" + "and %[tmp1], %[prefix], #7 \n\t" +#if CONFIG_THUMB + "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" + "ldr %[ptr], [%[ptr]] \n\t" +#else + "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" +#endif + "lsl %[n1], %[n1], #1 \n\t" + "lsr %[rice], %[n1], %[rice] \n\t" + "rsb %[n2], %[n2], #34 \n\t" + "mul %[range], %[range], %[rice] \n\t" + "pop {%[rice]} \n\t" + "rev %[ptr], %[ptr] \n\t" + "orr %[n1], %[n1], #0x80000000 \n\t" + "strh %[prefix], [%[cc], 
%[by22_bits_off]] \n\t" + "mov %[prefix], #2 \n\t" + "lsl %[range], %[range], #23 \n\t" +#if CONFIG_THUMB + "lsl %[remain], %[tmp2] \n\t" + "rsb %[range], %[remain] \n\t" +#else + "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" +#endif + "lsl %[remain], %[prefix], %[rice] \n\t" +#if CONFIG_THUMB + "lsr %[n1], %[n2] \n\t" + "add %[remain], %[n1] \n\t" +#else + "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" +#endif + "3: \n\t" + "lsl %[ptr], %[ptr], %[tmp1] \n\t" + "orr %[range], %[range], %[ptr], lsr #9 \n\t" + "4: \n\t" + "str %[range], [%[cc], %[low_off]] \n\t" + : // Outputs + [remain]"=&r"(last_coeff_abs_level_remaining), + [rice]"+r"(rice_param), + [prefix]"=&r"(prefix), + [n1]"=&r"(n1), + [range]"=&r"(range), + [n2]"=&r"(n2), + [ptr]"=&r"(ptr), + [tmp1]"=&r"(tmp1), + [tmp2]"=&r"(tmp2) + : // Inputs + [cc]"r"(c), + [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), + [low_off]"J"(offsetof(CABACContext, low)), + [range_off]"J"(offsetof(CABACContext, range)), + [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), + [by22_range_off]"J"(offsetof(CABACContext, by22.range)), + [ptr_off]"J"(offsetof(CABACContext, bytestream)) + : // Clobbers + "cc", "memory" + ); + return last_coeff_abs_level_remaining; +} + +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S new file mode 100644 index 0000000000..978b7b6947 --- /dev/null +++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S @@ -0,0 +1,183 @@ +/* + * ARM NEON optimised IDCT functions for HEVC decoding + * Copyright (c) 2014 Seppo Tomperi + * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +@ Included multiple times from hevc_idct_neon.S +@ Macros defined there + +#define DC_SHIFT (15 - BIT_DEPTH) +#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) +#define TRN_SHIFT (20 - BIT_DEPTH) + +function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r1, #DC_ADD + asr r1, #DC_SHIFT + vdup.16 q0, r1 + vdup.16 q1, r1 + vst1.16 {q0, q1}, [r0] + bx lr +endfunc + +function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r2, r0, #32 + mov r3, #64 + add r1, #DC_ADD + asr r1, #DC_SHIFT + vdup.16 q8, r1 + vdup.16 q9, r1 + vst1.16 {q8, q9}, [r0], r3 + vst1.16 {q8, q9}, [r2], r3 + vst1.16 {q8, q9}, [r0] + vst1.16 {q8, q9}, [r2] + bx lr +endfunc + +function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r2, r0, #32 + mov r3, #64 + add r1, #DC_ADD + mov ip, #16*16 + asr r1, #DC_SHIFT + vdup.16 q8, r1 + vdup.16 q9, r1 +1: vst1.16 {q8, q9}, [r0], r3 + subs ip, ip, #32 + vst1.16 {q8, q9}, [r2], r3 + bhi 1b + bx lr +endfunc + +function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r2, r0, #32 + mov r3, #64 + add r1, #DC_ADD + mov ip, #32*32 + asr r1, #DC_SHIFT + vdup.16 q8, r1 + vdup.16 q9, r1 +1: vst1.16 {q8, q9}, [r0], r3 + subs ip, ip, #32 + vst1.16 {q8, q9}, [r2], r3 + bhi 1b + bx lr +endfunc + + +function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 + vldr.i32 s0, =0x00240053 // 36 and 83 + vld1.16 {q14, q15}, [r0 :256] // coeffs + + tr4_shift #7 + + vzip.16 
d28, d29 + vzip.16 d30, d31 + vzip.32 q14, q15 + + tr4_shift #TRN_SHIFT + + vst4.16 {q14, q15}, [r0 :256] + bx lr + + .ltorg +endfunc + + + +function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 + vmov.i32 d0, #0x4a // 74 + vld1.16 {q14, q15}, [r0 :256] // coeffs + vmov.i32 d1, #0x1d // 29 + vmov.i32 d2, #0x37 // 55 + + tr4_luma_shift #7 + + vzip.16 d28, d29 + vzip.16 d30, d31 + vzip.32 q14, q15 + + tr4_luma_shift #TRN_SHIFT + + vst4.16 {q14, q15}, [r0 :256] + bx lr +endfunc + +function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 + add r2, r0, #16 + adr r3, tr4f + vpush {d8-d15} + vld1.16 {d0, d1}, [r3] + mov r3, #32 + + tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ + "sub r0, r0, #128-8", \ + "sub r2, r2, #128-8", \ + "cmp r1, #4" + ble 2f + + tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ + "sub r0, r0, #128+8", \ + "sub r2, r2, #128+8+16-32", \ + "mov r3, #64" + + vzip.16 d16, d17 + vzip.16 d18, d19 + + vzip.16 d20, d21 + vzip.16 d22, d23 + vzip.16 d28, d29 + vzip.16 d30, d31 + vzip.32 q10, q11 + vzip.32 q14, q15 +1: + vzip.16 d24, d25 + vzip.16 d26, d27 + vzip.32 q8, q9 + vzip.32 q12, q13 + + tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT + tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT + + vpop {d8-d15} + bx lr + +2: vmov.i64 q10, #0 + sub r0, r0, #8 + vmov.i64 q11, #0 + sub r2, r2, #8+16-32 + vmov.i64 q14, #0 + mov r3, #64 + vmov.i64 q15, #0 + + vzip.16 d16, d17 + vzip.16 d18, d19 + + b 1b + +endfunc + +#undef DC_SHIFT +#undef DC_ADD +#undef TRN_SHIFT + diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S new file mode 100644 index 0000000000..161bb0d7c9 --- /dev/null +++ b/libavcodec/arm/rpi_hevc_misc_neon.S @@ -0,0 +1,267 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Written by John Cox, Ben Avison +*/ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +@ rpi_zap_coeff_vals_neon( +@ uint16_t * buf, [r0] +@ unsigned int log_n_m2) [r1] + +function rpi_zap_coeff_vals_neon, export=1 + mov ip, #1 + vmov.i64 q0, #0 + teq r1, #0 + vmov.i64 q1, #0 + beq 2f + + lsl ip, r1 @ 2, 4 or 8 + add r2, r0, #32 + lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero + mov r3, #64 +1: vst1.8 {q0,q1}, [r0:256], r3 + subs ip, #2 + vst1.8 {q0,q1}, [r2:256], r3 + bne 1b + bx lr + +2: vst1.8 {q0,q1}, [r0:256] + bx lr +endfunc + +@ PIC jump tables are more expensive than absolute for A32 code +.set jent_pic, CONFIG_PIC || CONFIG_THUMB + +@ Jump table entry - if in Thumb mode the bottom bit must be set +@ ? There is probably a real asm instruction to do this but I haven't found it +.macro jent lab +.if jent_pic +T .short ((0 + \lab) - (0 + 98b)) / 2 +A .short (0 + \lab) - (4 + 98b) +.else +T .word 1 + \lab +A .word \lab +.endif +.endm + +.set expected_next, 0 + +.macro cpy_compound val, p1, p2, drop_thru=0 +.if \p1 + \p2 != \val +.error "Bad addition! 
\p1 + \p2 != \val" +.endif +.if expected_next != 0 && expected_next != \val +.error "Drop thru failure" +.endif +\val\(): + push {r0-r3} + bl 100\p1\()b + pop {r0-r3} + add r0, #\p1 + add r2, #\p1 +.if \drop_thru == 0 + b \p2\()b +.set expected_next, 0 +.else +.set expected_next, \p2 +.endif +.endm + +@ ff_hevc_rpi_cpy_blks8x4_neon( +@ dst [r0] +@ dst_stride [r1] +@ src [r2] +@ src_stride [r3] +@ width [sp, #0] (bytes) +@ height) [sp, #4] +@ +@ Power of 2 widths are directly coded, all others are done in stripes +@ We expect the vast majority of calls to be power of 2 +@ +@ Currently has min width of 8, but we could make that 4 without issue +@ Min height is 4 + +function ff_hevc_rpi_cpy_blks8x4_neon, export=1 + ldr r12, [sp, #0] + push {r11, lr} +.if jent_pic +A adr lr, 98f - 2 +.else +A adr lr, 98f - 4 +.endif + lsr r12, #3 + ldr r11, [sp, #(8 + 4)] +.if jent_pic +A lsl r12, #1 +A ldrsh lr, [lr, r12] +A add pc, lr +T tbh [pc, r12, lsl #1] +.else + @ A32 only, Thumb is always PIC + ldr pc, [lr, r12, lsl #2] +.endif + +98: +T .short 0 @ unused + jent 8f + jent 16f + jent 24f + jent 32f + jent 40f + jent 48f + jent 56f + jent 64f + jent 72f + jent 80f + jent 88f + jent 96f + jent 104f + jent 112f + jent 120f + jent 128f + +1008: + push {r11, lr} +8: + add lr, r2, r3 + lsl r3, #1 + add r12, r0, r1 + lsl r1, #1 +1: + vld1.32 {d0 }, [r2], r3 + vld1.32 {d1 }, [lr], r3 + vld1.32 {d2 }, [r2], r3 + vld1.32 {d3 }, [lr], r3 + subs r11, #4 + vst1.32 {d0 }, [r0], r1 + vst1.32 {d1 }, [r12], r1 + vst1.32 {d2 }, [r0], r1 + vst1.32 {d3 }, [r12], r1 + bgt 1b + pop {r11, pc} + +10016: + push {r11, lr} +16: + add lr, r2, r3 + lsl r3, #1 + add r12, r0, r1 + lsl r1, #1 +1: + vld1.32 {q0 }, [r2], r3 + vld1.32 {q1 }, [lr], r3 + vld1.32 {q2 }, [r2], r3 + vld1.32 {q3 }, [lr], r3 + subs r11, #4 + vst1.32 {q0 }, [r0], r1 + vst1.32 {q1 }, [r12], r1 + vst1.32 {q2 }, [r0], r1 + vst1.32 {q3 }, [r12], r1 + bgt 1b + pop {r11, pc} + +10032: + push {r11, lr} +32: + add lr, r2, r3 + lsl r3, #1 + add r12, 
r0, r1 + lsl r1, #1 +1: + vld1.32 {q8, q9 }, [r2], r3 + vld1.32 {q10, q11}, [lr], r3 + vld1.32 {q12, q13}, [r2], r3 + vld1.32 {q14, q15}, [lr], r3 + subs r11, #4 + vst1.32 {q8, q9 }, [r0], r1 + vst1.32 {q10, q11}, [r12], r1 + vst1.32 {q12, q13}, [r0], r1 + vst1.32 {q14, q15}, [r12], r1 + bgt 1b + pop {r11, pc} + +10064: + push {r11, lr} +64: + add lr, r2, #32 + add r12, r0, #32 +1: + vld1.32 {q8, q9 }, [r2], r3 + vld1.32 {q10, q11}, [lr], r3 + vld1.32 {q12, q13}, [r2], r3 + vld1.32 {q14, q15}, [lr], r3 + subs r11, #2 + vst1.32 {q8, q9 }, [r0], r1 + vst1.32 {q10, q11}, [r12], r1 + vst1.32 {q12, q13}, [r0], r1 + vst1.32 {q14, q15}, [r12], r1 + bgt 1b + pop {r11, pc} + +128: + push {r4, r5} + @ We could do this with fewer registers if we jump around but I + @ have a primitive urge to load sequentially + mov r4, #64 + add lr, r2, #32 + add r12, r0, #32 + sub r3, r4 + sub r1, r4 +1: + vld1.32 {q8, q9 }, [r2], r4 + vld1.32 {q10, q11}, [lr], r4 + vld1.32 {q12, q13}, [r2], r3 + vld1.32 {q14, q15}, [lr], r3 + subs r11, #1 + vst1.32 {q8, q9 }, [r0], r4 + vst1.32 {q10, q11}, [r12], r4 + vst1.32 {q12, q13}, [r0], r1 + vst1.32 {q14, q15}, [r12], r1 + bgt 1b + pop {r4, r5, r11, pc} + +@ Use drop_thru where we can +cpy_compound 104, 64, 40, 1 +cpy_compound 40, 32, 8 + +cpy_compound 112, 64, 48, 1 +cpy_compound 48, 32, 16 + +cpy_compound 120, 64, 56, 1 +cpy_compound 56, 32, 24, 1 +cpy_compound 24, 16, 8 + +cpy_compound 72, 64, 8 +cpy_compound 80, 64, 16 +cpy_compound 88, 64, 24 +cpy_compound 96, 64, 32 + + +endfunc + diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h new file mode 100644 index 0000000000..9d21f6a882 --- /dev/null +++ b/libavcodec/arm/rpi_hevc_misc_neon.h @@ -0,0 +1,438 @@ +/* + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H +#define AVCODEC_ARM_RPI_HEVC_MISC_H + +#include "config.h" +#if HAVE_NEON_INLINE && !CONFIG_THUMB + +static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + ptrdiff_t stride_src) +{ + const uint8_t *src2 = src + stride_src; + stride_src <<= 1; + switch (pixel_shift) + { + case 2: + __asm__ volatile ( + "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" + "subs %[height], #4 \n\t" + "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" + "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" + "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" + "subs %[height], #4 \n\t" + "vst1.32 {q0}, [%[dst]]! \n\t" + "beq 3f \n\t" + "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" + "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "subs %[height], #4 \n\t" + "vst1.32 {q1}, [%[dst]]! 
\n\t" + "bne 1b \n\t" + "2: \n\t" + "vst1.32 {q0}, [%[dst]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vst1.32 {q1}, [%[dst]] \n\t" + "4: \n\t" + : // Outputs + [src]"+r"(src), + [src2]"+r"(src2), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src) + : // Clobbers (the asm writes NEON d0-d3, i.e. q0-q1) + "cc", "memory", "q0", "q1" + ); + break; + case 1: + __asm__ volatile ( + "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" + "subs %[height], #4 \n\t" + "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" + "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" + "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" + "vzip.16 d0, d1 \n\t" + "subs %[height], #4 \n\t" + "vst1.16 {d0}, [%[dst]]! \n\t" + "beq 3f \n\t" + "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" + "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "vzip.16 d2, d3 \n\t" + "subs %[height], #4 \n\t" + "vst1.16 {d2}, [%[dst]]! 
\n\t" + "bne 1b \n\t" + "2: \n\t" + "vzip.16 d0, d1 \n\t" + "vst1.16 {d0}, [%[dst]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vzip.16 d2, d3 \n\t" + "vst1.16 {d2}, [%[dst]] \n\t" + "4: \n\t" + : // Outputs + [src]"+r"(src), + [src2]"+r"(src2), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src) + : // Clobbers (the asm writes NEON d0-d3, i.e. q0-q1) + "cc", "memory", "q0", "q1" + ); + break; + default: + __asm__ volatile ( + "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" + "subs %[height], #8 \n\t" + "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" + "vzip.8 d0, d1 \n\t" + "subs %[height], #8 \n\t" + "vst1.8 {d0}, [%[dst]]! \n\t" + "beq 3f \n\t" + "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" + "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" + "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" + "vzip.8 d2, d3 \n\t" + "subs %[height], #8 \n\t" + "vst1.8 {d2}, [%[dst]]! 
\n\t" + "bne 1b \n\t" + "2: \n\t" + "vzip.8 d0, d1 \n\t" + "vst1.8 {d0}, [%[dst]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vzip.8 d2, d3 \n\t" + "vst1.8 {d2}, [%[dst]] \n\t" + "4: \n\t" + : // Outputs + [src]"+r"(src), + [src2]"+r"(src2), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src) + : // Clobbers (the asm writes NEON d0-d3, i.e. q0-q1) + "cc", "memory", "q0", "q1" + ); + break; + } +} + +static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + ptrdiff_t stride_dst) +{ + uint8_t *dst2 = dst + stride_dst; + stride_dst <<= 1; + switch (pixel_shift) + { + case 2: + __asm__ volatile ( + "subs %[height], #4 \n\t" + "vld1.32 {q0}, [%[src]]! \n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.32 {q1}, [%[src]]! \n\t" + "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #4 \n\t" + "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" + "beq 3f \n\t" + "vld1.32 {q0}, [%[src]]! \n\t" + "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #4 \n\t" + "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" + "bne 1b \n\t" + "2: \n\t" + "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.32 {d1[0]}, [%[dst]] \n\t" + "vst1.32 {d1[1]}, [%[dst2]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.32 {d3[0]}, [%[dst]] \n\t" + "vst1.32 {d3[1]}, [%[dst2]] \n\t" + "4: \n\t" + : // Outputs + [dst]"+r"(dst), + [dst2]"+r"(dst2), + [src]"+r"(src), + [height]"+r"(height) + : // Inputs + [stride_dst]"r"(stride_dst) + : // Clobbers (the asm loads into NEON q0-q1) + "cc", "memory", "q0", "q1" + ); + break; + case 1: + __asm__ volatile ( + "subs %[height], #4 \n\t" + "vld1.16 {d0}, [%[src]]! 
\n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.16 {d2}, [%[src]]! \n\t" + "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #4 \n\t" + "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" + "beq 3f \n\t" + "vld1.16 {d0}, [%[src]]! \n\t" + "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #4 \n\t" + "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" + "bne 1b \n\t" + "2: \n\t" + "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.16 {d0[2]}, [%[dst]] \n\t" + "vst1.16 {d0[3]}, [%[dst2]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.16 {d2[2]}, [%[dst]] \n\t" + "vst1.16 {d2[3]}, [%[dst2]] \n\t" + "4: \n\t" + : // Outputs + [dst]"+r"(dst), + [dst2]"+r"(dst2), + [src]"+r"(src), + [height]"+r"(height) + : // Inputs + [stride_dst]"r"(stride_dst) + : // Clobbers (the asm loads into NEON d0 and d2) + "cc", "memory", "d0", "d2" + ); + break; + default: + __asm__ volatile ( + "subs %[height], #8 \n\t" + "vld1.8 {d0}, [%[src]]! \n\t" + "beq 2f \n\t" + "1: \n\t" + "vld1.8 {d2}, [%[src]]! \n\t" + "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #8 \n\t" + "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" + "beq 3f \n\t" + "vld1.8 {d0}, [%[src]]! 
\n\t" + "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" + "subs %[height], #8 \n\t" + "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" + "bne 1b \n\t" + "2: \n\t" + "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d0[6]}, [%[dst]] \n\t" + "vst1.8 {d0[7]}, [%[dst2]] \n\t" + "b 4f \n\t" + "3: \n\t" + "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" + "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" + "vst1.8 {d2[6]}, [%[dst]] \n\t" + "vst1.8 {d2[7]}, [%[dst2]] \n\t" + "4: \n\t" + : // Outputs + [dst]"+r"(dst), + [dst2]"+r"(dst2), + [src]"+r"(src), + [height]"+r"(height) + : // Inputs + [stride_dst]"r"(stride_dst) + : // Clobbers (the asm loads into NEON d0 and d2) + "cc", "memory", "d0", "d2" + ); + break; + } +} + +static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + ptrdiff_t stride_dst, ptrdiff_t stride_src) +{ + int x, y; + switch (pixel_shift) + { + case 2: + __asm__ volatile ( + "ldr %[x], [%[src]], %[stride_src] \n\t" + "ldr %[y], [%[src]], %[stride_src] \n\t" + "str %[x], [%[dst]], %[stride_dst] \n\t" + "sub %[height], #2 \n\t" + "1: \n\t" + "ldr %[x], [%[src]], %[stride_src] \n\t" + "str %[y], [%[dst]], %[stride_dst] \n\t" + "ldr %[y], [%[src]], %[stride_src] \n\t" + "subs %[height], #2 \n\t" + "str %[x], 
[%[dst]], %[stride_dst] \n\t" + "bne 1b \n\t" + "str %[y], [%[dst]] \n\t" + : // Outputs + [x]"=&r"(x), + [y]"=&r"(y), + [src]"+r"(src), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src), + [stride_dst]"r"(stride_dst) + : // Clobbers + "cc", "memory" + ); + break; + case 1: + __asm__ volatile ( + "ldrh %[x], [%[src]], %[stride_src] \n\t" + "ldrh %[y], [%[src]], %[stride_src] \n\t" + "strh %[x], [%[dst]], %[stride_dst] \n\t" + "sub %[height], #2 \n\t" + "1: \n\t" + "ldrh %[x], [%[src]], %[stride_src] \n\t" + "strh %[y], [%[dst]], %[stride_dst] \n\t" + "ldrh %[y], [%[src]], %[stride_src] \n\t" + "subs %[height], #2 \n\t" + "strh %[x], [%[dst]], %[stride_dst] \n\t" + "bne 1b \n\t" + "strh %[y], [%[dst]] \n\t" + : // Outputs + [x]"=&r"(x), + [y]"=&r"(y), + [src]"+r"(src), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src), + [stride_dst]"r"(stride_dst) + : // Clobbers + "cc", "memory" + ); + break; + default: + __asm__ volatile ( + "ldrb %[x], [%[src]], %[stride_src] \n\t" + "ldrb %[y], [%[src]], %[stride_src] \n\t" + "strb %[x], [%[dst]], %[stride_dst] \n\t" + "sub %[height], #2 \n\t" + "1: \n\t" + "ldrb %[x], [%[src]], %[stride_src] \n\t" + "strb %[y], [%[dst]], %[stride_dst] \n\t" + "ldrb %[y], [%[src]], %[stride_src] \n\t" + "subs %[height], #2 \n\t" + "strb %[x], [%[dst]], %[stride_dst] \n\t" + "bne 1b \n\t" + "strb %[y], [%[dst]] \n\t" + : // Outputs + [x]"=&r"(x), + [y]"=&r"(y), + [src]"+r"(src), + [dst]"+r"(dst), + [height]"+r"(height) + : // Inputs + [stride_src]"r"(stride_src), + [stride_dst]"r"(stride_dst) + : // Clobbers + "cc", "memory" + ); + break; + } +} + +#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon +static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + ptrdiff_t stride_dst, ptrdiff_t stride_src) +{ + if (stride_dst == 1 << pixel_shift) + ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); + 
else if (stride_src == 1 << pixel_shift) + ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); + else + ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); +} + +#endif /* HAVE_NEON_INLINE */ + +#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h new file mode 100644 index 0000000000..325c26a49b --- /dev/null +++ b/libavcodec/arm/rpi_hevc_mv_arm.h @@ -0,0 +1,93 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Written by John Cox, Ben Avison +*/ + +#ifndef AVCODEC_ARM_RPI_HEVC_MV_H +#define AVCODEC_ARM_RPI_HEVC_MV_H + +#if HAVE_ARMV6T2_INLINE +static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) +{ + MvXY r; + __asm__ ( + "sadd16 %[r], %[a], %[b] \n\t" + : [r]"=r"(r) + : [a]"r"(a), + [b]"r"(b) + : + ); + return r; +} +#define mvxy_add mvxy_add_arm +#endif + +#if HAVE_ARMV6T2_INLINE +#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) +static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) +{ + int t; + __asm__ ( + "ssat %[td], #8, %[td] \n\t" + "ssat %[tb], #8, %[tb] \n\t" + "eor %[t], %[td], %[td], asr #31 \n\t" + "adds %[t], %[t], %[td], lsr #31 \n\t" + "asr %[t], #1 \n\t" + "add %[t], #0x4000 \n\t" + "it ne \n\t" + "sdivne %[t], %[t], %[td] \n\t" + "mov %[td], #32 \n\t" + "smlabb %[td], %[t], %[tb], %[td] \n\t" + "ssat %[td], #13, %[td], asr #6 \n\t" + "mov %[tb], #127 \n\t" + "smlatb %[t], %[xy], %[td], %[tb] \n\t" + "smlabb %[tb], %[xy], %[td], %[tb] \n\t" +// This takes the sign of x & y for rounding at the "wrong" point +// (i.e. 
after adding 127) but for the range of values (-1,-127) +// where it does the wrong thing you get the right answer (0) anyway + "add %[t], %[t], %[t], lsr #31 \n\t" + "add %[xy], %[tb], %[tb], lsr #31 \n\t" + "ssat %[t], #16, %[t], asr #8 \n\t" + "ssat %[xy], #16, %[xy], asr #8 \n\t" + "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" + : + [t]"=&r"(t), + [xy]"+r"(xy), + [td]"+r"(td), + [tb]"+r"(tb) + : + : + "cc" + ); + return xy; +} +#define mv_scale_xy mv_scale_xy_arm +#endif +#endif + +#endif // AVCODEC_ARM_RPI_HEVC_MV_H + diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h new file mode 100644 index 0000000000..62b9326532 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_arm.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_RPI_HEVCDSP_ARM_H +#define AVCODEC_ARM_RPI_HEVCDSP_ARM_H + +#include "libavcodec/rpi_hevcdsp.h" + +void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); + +#endif /* AVCODEC_ARM_RPI_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 index 0000000000..88a3b4e5e7 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S @@ -0,0 +1,1634 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 + */ + + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 + vsubl.u8 q0, \Q0a, \P0a + vsubl.u8 q1, \P1a, \Q1a + vdup.16 d4, r2 + \I1 + vshl.i16 q0, #2 + \I2 + vadd.i16 q0, q1 + \I3 + vmovl.u8 q2, d4 + \I4 + vneg.s16 q1, q2 + \I5 + vrshr.s16 q0, #3 + \I6 + \I7 + \I8 + vmin.s16 q0, q2 + vmovl.u8 q2, \Q0a + vmax.s16 q0, q1 + vaddw.u8 q1, q0, \P0a + vsub.i16 q0, q2, q0 + vqmovun.s16 \P0a, q1 + vqmovun.s16 \Q0a, q0 +.endm + + +.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 + vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a + lsr r12, r2, #16 + vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b + vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a + vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b + vshl.i16 q0, #2 @ (q0a - p0a) * 4 + vshl.i16 q1, #2 @ (q0b - p0b) * 4 + vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a + vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b + vdup.16 d4, r2 @ tc0a, tc0b + vdup.16 d6, r12 @ tc1a, tc1b + vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 + \I1 + vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 + \I2 + vmovl.u8 q2, d4 @ tc0a, tc0b + \I3 + vmovl.u8 q3, d6 @ tc1a, tc1b + \I4 + vmin.s16 q0, q2 + \I5 + vneg.s16 q2, q2 @ -tc0a, -tc0b + \I6 + vmin.s16 q1, q3 + \I7 + vneg.s16 q3, q3 @ -tc1a, -tc1b + vmax.s16 q0, q2 @ delta0a + vmovl.u8 q2, \Q0a + vmax.s16 q1, q3 @ delta0b + vaddw.u8 q3, q0, \P0a @ p0a + delta0a + vsub.i16 q0, q2, q0 @ q0a - delta0a + vmovl.u8 q2, \Q0b + vsub.i16 q2, q1 @ q0b - delta0b + vaddw.u8 q1, \P0b @ p0b + delta0b + vqmovun.s16 \Q0a, q0 + vqmovun.s16 \P0a, q3 + vqmovun.s16 \Q0b, q2 + vqmovun.s16 \P0b, q1 +.endm + + +@ Preserves r12 +@ Clobbers r2 +@ P0a et al all contain UVUVUVUV +@ r2 (tc4) contains +@ 
[0..7] tc U a +@ [8..15] tc V a + +.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 + vsub.i16 q0, \Q0a, \P0a + vsub.i16 q1, \P1a, \Q1a + vdup.16 d4, r2 + \I1 + vshl.i16 q0, #2 + \I2 + vadd.i16 q0, q1 + \I3 + vshll.u8 q2, d4, #\bit_depth - 8 + \I4 + vneg.s16 q1, q2 + \I5 + vrshr.s16 q0, #3 + \I6 + \I7 + \I8 + vmin.s16 q0, q2 + vmov.i16 q2, #0 + vmax.s16 q0, q1 + vadd.i16 \P0a, q0 + vsub.i16 \Q0a, q0 + vmov.i16 q1, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 + vmin.s16 \P0a, q1 + vmin.s16 \Q0a, q1 +.endm + +@ Clobbers r2, r12 +@ P0a et al all contain UVUVUVUV +@ r2 (tc4) contains +@ [0..7] tc U a +@ [8..15] tc V a +@ [16..23] tc U b +@ [24..31] tc V b + +.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 + vsub.i16 q0, \Q0a, \P0a @ q0a - p0a + lsr r12, r2, #16 + vsub.i16 q1, \Q0b, \P0b @ q0b - p0b + vsub.i16 q2, \P1a, \Q1a @ p1a - q1a + vsub.i16 q3, \P1b, \Q1b @ p1b - q1b + vshl.i16 q0, #2 @ (q0a - p0a) * 4 + vshl.i16 q1, #2 @ (q0b - p0b) * 4 + vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a + vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b + vdup.16 d4, r2 @ tc0a, tc0b + vdup.16 d6, r12 @ tc1a, tc1b + vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 + \I1 + vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 + \I2 + vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b + \I3 + vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b + \I4 + vmin.s16 q0, q2 + \I5 + vneg.s16 q2, q2 @ -tc0a, -tc0b + \I6 + vmin.s16 q1, q3 + \I7 + vneg.s16 q3, q3 @ -tc1a, -tc1b + vmax.s16 q0, q2 @ delta0a + vadd.i16 \P0a, q0 @ p0a + delta0a + vsub.i16 \Q0a, q0 @ q0a - delta0a + vmax.s16 q1, q3 @ delta0b + vadd.i16 \P0b, q1 @ p0b + delta0b + vsub.i16 \Q0b, q1 @ q0b - delta0b + vmov.i16 q2, #0 + vmov.i16 q3, #(1 << \bit_depth) - 1 + vmax.s16 \P0a, q2 + vmax.s16 \Q0a, q2 + vmax.s16 \P0b, q2 + vmax.s16 \Q0b, q2 + vmin.s16 \P0a, q3 + vmin.s16 \Q0a, q3 + vmin.s16 
\P0b, q3 + vmin.s16 \Q0b, q3 +.endm + + + +@ uint8_t *_no_p, [sp+0] +@ uint8_t *_no_q) [sp+4] + +.macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] + orrs r3, r12, r3, lsl #16 + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes + ldrd r4, r5, [sp, #32] @ &_no_p + ldrb r4, [r4] + ldrb r5, [r5] + movs r10, r4 + it ne + movne r10, #1 + cmp r5, #0 + it ne + orrne r10, #2 +.endm + +@ Input: +@ r2 beta (raw: needs shift for bitdepth > 8) +@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) +@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) +@ +@ Input & output +@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) +@ 16-bit: q8-q15 +@ +@ r1 -r1 +@ r10 b1->C, b0->N (r10 junk) +@ +@ Junks: +@ r5, r6, r7, r8, r9 + +.macro m_filter_luma bit_depth, Q11, Q15 +.if \bit_depth == 8 + vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 + vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 + vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 + vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 + vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 + vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 +.endif + vadd.i16 q0, q9, \Q11 @ P2 + P0 +.if \bit_depth > 8 + lsl r3, r3, #(\bit_depth - 8) +.endif + vadd.i16 q1, q14, q12 @ Q2 + Q0 +.if \bit_depth > 8 + lsl r2, r2, #(\bit_depth - 8) +.endif + vsub.i16 q0, q10 @ P2 - P1 + P0 + lsr r5, r3, #16 + vsub.i16 q1, q13 @ Q2 - Q1 + Q0 +.if \bit_depth == 8 + vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 + vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3 +.endif + vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) + vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) + vmov.i64 q2, #0xffffffff0000 + vbic q0, q2 @ only dp0(') and dp3(') + vbic q1, q2 @ only dq0(') and dq3(') + vsra.u64 q0, #16 + vsra.u64 q1, #16 + vdup.16 q3, r2 @ beta + vdup.16 d14, r3 @ tC[0] + vdup.16 d15, r5 @ tC[1] + vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... 
P3'-P0' TP3-TP0 ... P3-P0) + vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 + vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 + vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 + vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) + vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) + vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 + vshl.s16 q6, q7, #2 @ tC[] * 4 + vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 + vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) + vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) + vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 + cmp r7, #0 + beq .Lbypasswrite + + vcgt.s16 q5, q6, q5 @ if < tc25 + vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) + vand q4, q5 + vbic d8, d4 + vbic d9, d4 + vshr.s16 q3, #2 @ beta_2 = beta >> 2 + vsra.u64 q4, #16 + vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 + vshl.i16 q7, #1 @ tc2 = tC[] << 1 + vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc + vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half + vand d6, d8 @ && beta_2 tests, prime in ms half + vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 + vneg.s16 q6, q7 @ -tc2 + vmovn.i32 d8, q3 + vshrn.i32 d6, q3, #16 + vand d6, d8 + vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 + vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) + vadd.i16 q0, \Q11, q12 @ p0 + q0 + ands r9, r7, r8 + beq 1f + + vadd.i16 q2, q0, q10 @ p1 + p0 + q0 + vadd.i16 q3, q0, q13 @ p0 + q0 + q1 + lsr r3, r9, #16 + vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) + vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) + vadd.i16 q0, q8, q9 @ p3 + p2 + vadd.i16 q5, \Q15, q14 @ q2 + q3 + vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 + vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 + vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 + vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new 
P2 before clipping) + vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) + vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) + vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) + vrshr.s16 q0, #3 @ scale, with rounding + vrshr.s16 q5, #3 + vrshr.s16 q1, #2 + vrshr.s16 q4, #2 + vrshr.s16 q2, #3 + vrshr.s16 q3, #3 + vsub.i16 q0, q9 @ find difference + vsub.i16 q5, q14 + vsub.i16 q1, q10 + vsub.i16 q4, q13 + vsub.i16 q2, \Q11 + vsub.i16 q3, q12 + vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2 + vmax.s16 q5, q6 + vmax.s16 q1, q6 + vmax.s16 q4, q6 + vmax.s16 q2, q6 + vmax.s16 q3, q6 + vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure + vdup.16 d13, r3 + vmin.s16 q0, q7 + vmin.s16 q5, q7 + vmin.s16 q1, q7 + vmin.s16 q4, q7 + vmin.s16 q2, q7 + vmin.s16 q3, q7 + vadd.i16 q0, q9 @ apply difference + vadd.i16 q5, q14 + vadd.i16 q1, q10 + vadd.i16 q4, q13 + vadd.i16 q2, \Q11 + vadd.i16 q3, q12 + vbit q9, q0, q6 @ apply filtered values according to mask + vbit q14, q5, q6 + vbit q10, q1, q6 + vbit q13, q4, q6 + vbit \Q11, q2, q6 + vbit q12, q3, q6 + vneg.s16 q6, q7 @ restore -tc2 + +1: + bics r9, r7, r8 + beq 2f + + vsub.i16 q0, q12, \Q11 @ q0 - p0 + vsub.i16 q1, q13, q10 @ q1 - p1 + lsr r3, r9, #16 + vshl.i16 q2, q0, #3 + lsr r7, r5, #16 + vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) + lsr r8, r6, #16 + vshl.i16 q2, q1, #1 + vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) + vshr.s16 q6, #1 @ -tc = -tc2 >> 1 + vsub.i16 q5, q3, q4 + vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 + vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 + vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 + vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 + vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 + vmax.s16 q6, q5 @ + vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 + vdup.16 q0, r2 @ beta + vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] + vshr.s16 q4, #1 @ tc_2 = tc >> 1 + vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + 
delta0) >> 1 + vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 + vshr.s16 q2, q0, #1 @ beta >> 1 + vadd.i16 q2, q0 @ beta + (beta >> 1) + vneg.s16 q0, q4 @ -tc_2 + vabs.s16 q5, q5 @ abs(original delta0) + vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 + vmax.s16 q1, q0 + vmax.s16 q3, q0 + vshl.s16 q0, q7, #2 @ 8 * tc + vadd.i16 q7, q0 @ 10 * tc + vdup.16 d0, r9 + vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering + vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) + vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) + vdup.16 d8, r5 @ dp0 + dp3 + vdup.16 d9, r7 @ dp0' + dp3' + vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) + vdup.16 d10, r6 @ dq0 + dq3 + vdup.16 d11, r8 @ dq0' + dq3' + vand q7, q0 @ AND block and line masks + vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) + vadd.i16 q0, q1, q10 @ p1 + deltap1 + vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. 
if (nd_q > 1) + vadd.i16 q3, q3, q13 @ q1 + deltaq1 + vadd.i16 q1, \Q11, q6 @ p0 + delta0 + vsub.i16 q2, q12, q6 @ q0 - delta0 + vand q4, q7 @ AND nd_p test with block/line masks + vand q5, q7 @ AND nd_q test with block/line masks + vbit q10, q0, q4 + vbit \Q11, q1, q7 + vbit q12, q2, q7 + vbit q13, q3, q5 + +2: +.if \bit_depth == 8 + vmovn.i16 d16, q8 + vmovn.i16 d23, \Q15 + neg r1, r1 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, \Q11 + lsls r10, #31 + vqmovun.s16 d20, q12 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 +.else + vmov.i16 q0, #0 + vmov.i16 q1, #(1 << \bit_depth - 1) + @ q8 & q15 should be unaltered and so don't require clipping + neg r1, r1 + vmax.s16 q9, q0 + vmax.s16 q10, q0 + vmax.s16 q11, q0 + vmax.s16 q12, q0 + vmax.s16 q13, q0 + vmax.s16 q14, q0 + lsls r10, #31 + vmin.s16 q9, q1 + vmin.s16 q10, q1 + vmin.s16 q11, q1 + vmin.s16 q12, q1 + vmin.s16 q13, q1 + vmin.s16 q14, q1 +.endif + bx lr +.endm + +function hevc_loop_filter_luma_body + m_filter_luma 8, q15, q11 +endfunc + +@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( +@ uint8_t *_pix, [r0] +@ ptrdiff_t _stride, [r1] +@ int _beta, [r2] +@ int *_tc, [r3] +@ uint8_t *_no_p, [sp+0] +@ uint8_t *_no_q) [sp+4] + +function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + + sub r4, r0, #4 + b .Lv_loop_luma_common +endfunc + +@ void ff_hevc_rpi_v_loop_filter2_luma_neon( +@ uint8_t * pix_r, [r0] +@ ptrdiff_t _stride, [r1] +@ int _beta, [r2] +@ int tc2, [r3] +@ int no_f, [sp+0] +@ uint8_t * pix_l) [sp+4] + +function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 + cmp r3, #0 + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes + ldr r4, [sp, #36] + ldr r10, [sp, #32] + +.Lv_loop_luma_common: + vpush {d8-d15} + + @ It's slightly faster to do unlaned loads and transpose in the + @ 8-bit case, even though it needs more instructions, because + @ VLD4.8 is a really slow way to read from memory. 
+ vld1.32 {d16[0]}, [r4:32], r1 + vld1.32 {d20[0]}, [r0:32], r1 + vld1.32 {d16[1]}, [r4:32], r1 + vld1.32 {d20[1]}, [r0:32], r1 + vld1.32 {d17[0]}, [r4:32], r1 + vld1.32 {d21[0]}, [r0:32], r1 + vld1.32 {d17[1]}, [r4:32], r1 + vld1.32 {d21[1]}, [r0:32], r1 + vld1.32 {d18[0]}, [r4:32], r1 + vld1.32 {d22[0]}, [r0:32], r1 + vld1.32 {d18[1]}, [r4:32], r1 + vld1.32 {d22[1]}, [r0:32], r1 + vld1.32 {d19[0]}, [r4:32], r1 + vld1.32 {d23[0]}, [r0:32], r1 + vld1.32 {d19[1]}, [r4:32] + vld1.32 {d23[1]}, [r0:32] + vuzp.16 q8, q9 + vuzp.16 q10, q11 + vuzp.8 q8, q9 + vuzp.8 q10, q11 + vswp d17, d18 + vswp d21, d22 + + bl hevc_loop_filter_luma_body + + add r6, r4, r1 + add r2, r0, r1 + lsl r1, #1 + + vpop {d8-d15} + + @ no_p[1] + bmi 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 + vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 + + vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 + vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 + vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 + vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] +1: + @ no_q[1] + bcs 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 + vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 + + vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 + vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 + vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 + vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] +1: + pop {r4-r10,pc} + +.Lbypasswrite: + vpop {d8-d15} + pop {r4-r10,pc} +endfunc + +.macro m_filter_v_luma_16 bit_depth + vpush {d8-d15} + + @ Uses slightly fewer instructions to do laned loads than unlaned + @ and transpose. 
This also means that we can use the same code for + @ both split & unsplit deblock + vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 + vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 + + vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 + vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 + + vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 + vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 + + vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 + vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 + + vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 + vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 + + vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 + vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 + + vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 + vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 + + vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] + vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] + + bl hevc_loop_filter_luma_body_\bit_depth + + add r6, r4, r1 + add r2, r0, r1 + lsl r1, #1 + + vpop {d8-d15} + + @ p[1] + bmi 1f + vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 + vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 + vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 + vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 + vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 + vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 + vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 + vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] +1: + @ q[1] + bcs 1f + vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 + vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 + vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 + vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 + vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 + vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 + vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 + vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] +1: + pop {r4-r10,pc} +.endm + + + + +@ void 
(*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] +@ int32_t *tc, [r3] +@ uint8_t *no_p, sp[0] +@ uint8_t *no_q); sp[4] +@ +@ Src should always be on 8 byte boundary & all in the same slice + +function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 + hevc_loop_filter_luma_start + b .Lh_loop_filter_luma_common_8 +endfunc + +function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 + cmp r3, #0 + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes + ldr r10, [sp, #32] + +.Lh_loop_filter_luma_common_8: + sub r4, r0, r1, lsl #2 + add r0, r4, r1 + lsl r1, #1 + vpush {d8-d15} + + vld1.8 {d16}, [r4], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r4], r1 + vld1.8 {d19}, [r0], r1 + vld1.8 {d20}, [r4], r1 + vld1.8 {d21}, [r0], r1 + vld1.8 {d22}, [r4] + vld1.8 {d23}, [r0] + + bl hevc_loop_filter_luma_body + + add r0, r0, r1, lsl #1 + add r2, r4, r1, lsl #1 + add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ Q2-Q0 + bcs 1f + vst1.8 {d22}, [r4], r1 + vst1.8 {d21}, [r6] + vst1.8 {d20}, [r4] +1: + @ P0-P2 + bmi 1f + vst1.8 {d19}, [r0], r1 + vst1.8 {d18}, [r2] + vst1.8 {d17}, [r0] +1: + pop {r4-r10,pc} +endfunc + + +.macro m_filter_h_luma_16 bit_depth + sub r4, r0, r1, lsl #2 + add r0, r4, r1 + lsl r1, #1 + vpush {d8-d15} + + vld1.16 { q8}, [r4], r1 + vld1.16 { q9}, [r0], r1 + vld1.16 {q10}, [r4], r1 + vld1.16 {q11}, [r0], r1 + vld1.16 {q12}, [r4], r1 + vld1.16 {q13}, [r0], r1 + vld1.16 {q14}, [r4] + vld1.16 {q15}, [r0] + + bl hevc_loop_filter_luma_body_\bit_depth + + add r0, r0, r1, lsl #1 + add r2, r4, r1, lsl #1 + add r6, r4, r1, asr #1 + vpop {d8-d15} + + @ Q2-Q0 + bcs 1f + vst1.16 {q14}, [r4], r1 + vst1.16 {q13}, [r6] + vst1.16 {q12}, [r4] +1: + bmi 1f + vst1.16 {q11}, [r0], r1 + vst1.16 {q10}, [r2] + vst1.16 { q9}, [r0] +1: + pop {r4-r10,pc} +.endm + + +@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ unsigned int no_f); // r3 +@ +@ no_f +@ 0 tl P0 +@ 1 tr P1 +@ 2 bl Q0 +@ 3 br
Q1 +@ +@ Probably not worth having the P/Qa only special case in this direction +@ Given layout we won't save any memory reads or avoid any cache dirtying +@ We would save a bit of computation but I expect the partials to be less +@ common in the H direction than V due to how we arrange deblock. + +function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 + sub r12, r0, r1 + cmp r2, #0 + it eq + bxeq lr + vld1.8 {d26,d27}, [r0] + lsl r1, #1 + sub r0, r1 + vld1.8 {d18,d19}, [r12], r1 + vld1.8 {d16,d17}, [r0], r1 + vld1.8 {d28,d29}, [r12] + + hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ + "sub r12, r0, r1, asr #1" + + lsls r3, #29 @ b2 -> N, b3 -> C + it pl + vstrpl d26, [r0, #0] + it cc + vstrcc d27, [r0, #8] + lsls r3, #2 @ b0 -> N, b1 -> C + it pl + vstrpl d18, [r12, #0] + it cc + vstrcc d19, [r12, #8] + bx lr + +endfunc + + +@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ unsigned int no_f); // r3 +@ +@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] +@ +@ Macro here actual function near bottom + +.macro m_filter_h_uv_16 bit_depth + sub r12, r0, r1 + cmp r2, #0 + it eq + bxeq lr + vld1.16 {q12, q13}, [r0] + lsl r1, #1 + sub r0, r1 + vld1.16 {q10, q11}, [r12], r1 + vld1.16 {q8, q9 }, [r0], r1 + vld1.16 {q14, q15}, [r12] + + hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ + "sub r12, r0, r1, asr #1", \ + "cmp r3, #0" + + bne 1f + vst1.16 {q10, q11}, [r12] + vst1.16 {q12, q13}, [r0] + bx lr + + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: + lsls r3, #29 @ b2 -> N, b3 -> C + itt pl + vstrpl d24, [r0, #0] + vstrpl d25, [r0, #8] + itt cc + vstrcc d26, [r0, #16] + vstrcc d27, [r0, #24] + lsls r3, #2 @ b0 -> N, b1 -> C + itt pl + vstrpl d20, [r12, #0] + vstrpl d21, [r12, #8] + itt cc + vstrcc d22, [r12, #16] + vstrcc d23, [r12, #24] + bx lr +.endm + + +@ void 
ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ +@ no_f: +@ 0 tl P0 +@ 1 tr Q0 +@ 2 bl P1 +@ 3 br Q1 + +function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + cmp r2, #0 + it eq + bxeq lr + push {lr} + vld2.16 {d16[0], d18[0]}, [r3], r1 + vld2.16 {d20[0], d22[0]}, [r0], r1 + + cmp r2, #0x10000 + vld2.16 {d16[1], d18[1]}, [r3], r1 + vld2.16 {d20[1], d22[1]}, [r0], r1 + + vld2.16 {d16[2], d18[2]}, [r3], r1 + vld2.16 {d20[2], d22[2]}, [r0], r1 + + vld2.16 {d16[3], d18[3]}, [r3], r1 + vld2.16 {d20[3], d22[3]}, [r0], r1 + blo 10f + + vld2.16 {d17[0], d19[0]}, [r3], r1 + vld2.16 {d21[0], d23[0]}, [r0], r1 + + sub ip, r0, r3 + vld2.16 {d17[1], d19[1]}, [r3], r1 + vld2.16 {d21[1], d23[1]}, [r0], r1 + + cmp ip, #4 + vld2.16 {d17[2], d19[2]}, [r3], r1 + vld2.16 {d21[2], d23[2]}, [r0], r1 + + vld2.16 {d17[3], d19[3]}, [r3] + vld2.16 {d21[3], d23[3]}, [r0] + + hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ + "ldr lr, [sp, #4]", \ + "neg r1, r1", \ + "it eq; cmpeq lr, #0", \ + "add r3, #2", \ + "add ip, r3, r1", \ + "add r2, r0, r1", \ + "lsl r1, #1" + + bne 1f + +@ Much/most of the time r0 == r3 + 4 and no_f == 0 +@ so it is worth having this special case + vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b + vst2.16 {d19[2], d21[2]}, [ip], r1 + vst2.16 {d19[1], d21[1]}, [r3], r1 + vst2.16 {d19[0], d21[0]}, [ip], r1 + vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a + vst2.16 {d18[2], d20[2]}, [ip], r1 + vst2.16 {d18[1], d20[1]}, [r3] + vst2.16 {d18[0], d20[0]}, [ip] + pop {pc} + +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 + ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f + @ Q0b + vst1.16 {d21[3]}, [r0], r1 + vst1.16 {d21[2]}, [r2], r1 + vst1.16 {d21[1]}, [r0], r1 + vst1.16 {d21[0]}, [r2], r1 +1: + ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, 
ip, r1, lsl #1 + bmi 1f + @ P0b + vst1.16 {d19[3]}, [r3], r1 + vst1.16 {d19[2]}, [ip], r1 + vst1.16 {d19[1]}, [r3], r1 + vst1.16 {d19[0]}, [ip], r1 +1: + lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 + bcs 1f + @ Q0a + vst1.16 {d20[3]}, [r0], r1 + vst1.16 {d20[2]}, [r2], r1 + vst1.16 {d20[1]}, [r0] + vst1.16 {d20[0]}, [r2] +1: + it mi + popmi {pc} + @ P0a + vst1.16 {d18[3]}, [r3], r1 + vst1.16 {d18[2]}, [ip], r1 + vst1.16 {d18[1]}, [r3] + vst1.16 {d18[0]}, [ip] + pop {pc} + +@ Single lump (rather than double) +10: + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) + @ b0 (P0a) -> N, b1 (Q0a) -> C + + hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ + "ldr lr, [sp, #4]", \ + "add r3, #2", \ + "sub r0, r0, r1, lsl #2", \ + "sub r3, r3, r1, lsl #2", \ + "lsls lr, #31", \ + "add r2, r0, r1", \ + "add ip, r3, r1", \ + "lsl r1, #1" + + bcs 3f + @ Q0a + vst1.16 {d20[0]}, [r0], r1 + vst1.16 {d20[1]}, [r2], r1 + vst1.16 {d20[2]}, [r0] + vst1.16 {d20[3]}, [r2] +3: + it mi + popmi {pc} + @ P0a + vst1.16 {d18[0]}, [r3], r1 + vst1.16 {d18[1]}, [ip], r1 + vst1.16 {d18[2]}, [r3] + vst1.16 {d18[3]}, [ip] + pop {pc} + +endfunc + + +@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ + +@ no_f +@ 0 tl P0a +@ 1 tr Q0a +@ 2 bl P0b +@ 3 br Q0b + +@ P1: q8, q12 +@ P0: q9, q13 +@ Q0: q10, q14 +@ Q1: q11, q15 + +.macro m_filter_v_uv2_16 bit_depth + cmp r2, #0 + it eq + bxeq lr + push {lr} + vld2.32 {d16[0], d18[0]}, [r3], r1 + vld2.32 {d20[0], d22[0]}, [r0], r1 + + cmp r2, #0x10000 + vld2.32 {d16[1], d18[1]}, [r3], r1 + vld2.32 {d20[1], d22[1]}, [r0], r1 + + vld2.32 {d17[0], d19[0]}, [r3], r1 + vld2.32 {d21[0], d23[0]}, [r0], r1 + + vld2.32 {d17[1], d19[1]}, [r3], r1 + vld2.32 {d21[1], d23[1]}, [r0], r1 + blo 10f + + vld2.32 {d24[0], d26[0]}, [r3], r1 + vld2.32 {d28[0], 
d30[0]}, [r0], r1 + + sub ip, r0, r3 + vld2.32 {d24[1], d26[1]}, [r3], r1 + vld2.32 {d28[1], d30[1]}, [r0], r1 + + cmp ip, #8 + vld2.32 {d25[0], d27[0]}, [r3], r1 + vld2.32 {d29[0], d31[0]}, [r0], r1 + + vld2.32 {d25[1], d27[1]}, [r3] + vld2.32 {d29[1], d31[1]}, [r0] + + hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ + "ldr lr, [sp, #4]", \ + "neg r1, r1", \ + "it eq; cmpeq lr, #0", \ + "add r3, #4", \ + "add ip, r3, r1", \ + "add r2, r0, r1", \ + "lsl r1, #1" + + bne 1f + +@ Much/most of the time r0 == r3 + 8 and no_f == 0 +@ so it is worth having this special case + vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b + vst2.32 {d27[0], d29[0]}, [ip], r1 + vst2.32 {d26[1], d28[1]}, [r3], r1 + vst2.32 {d26[0], d28[0]}, [ip], r1 + vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a + vst2.32 {d19[0], d21[0]}, [ip], r1 + vst2.32 {d18[1], d20[1]}, [r3] + vst2.32 {d18[0], d20[0]}, [ip] + pop {pc} + +@ Either split or partial +1: + lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 + ittt cs + addcs r0, r0, r1, lsl #1 + addcs r2, r2, r1, lsl #1 + bcs 1f + @ Q0b + vst1.32 {d29[1]}, [r0], r1 + vst1.32 {d29[0]}, [r2], r1 + vst1.32 {d28[1]}, [r0], r1 + vst1.32 {d28[0]}, [r2], r1 +1: + ittt mi + addmi r3, r3, r1, lsl #1 + addmi ip, ip, r1, lsl #1 + bmi 1f + @ P0b + vst1.32 {d27[1]}, [r3], r1 + vst1.32 {d27[0]}, [ip], r1 + vst1.32 {d26[1]}, [r3], r1 + vst1.32 {d26[0]}, [ip], r1 +1: + lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 + bcs 1f + @ Q0a + vst1.32 {d21[1]}, [r0], r1 + vst1.32 {d21[0]}, [r2], r1 + vst1.32 {d20[1]}, [r0] + vst1.32 {d20[0]}, [r2] +1: + it mi + popmi {pc} + @ P0a + vst1.32 {d19[1]}, [r3], r1 + vst1.32 {d19[0]}, [ip], r1 + vst1.32 {d18[1]}, [r3] + vst1.32 {d18[0]}, [ip] + pop {pc} + +@ Single lump (rather than double) +10: + @ As we have post inced r0/r3 in the load the easiest thing to do is + @ to subtract and write forwards, rather than backwards (as above) + @ b0 (P0a) -> N, b1 (Q0a) -> C + + 
hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ + "ldr lr, [sp, #4]", \ + "add r3, #4", \ + "sub r0, r0, r1, lsl #2", \ + "sub r3, r3, r1, lsl #2", \ + "lsls lr, #31", \ + "add r2, r0, r1", \ + "add ip, r3, r1", \ + "lsl r1, #1" + + bcs 3f + @ Q0a + vst1.32 {d20[0]}, [r0], r1 + vst1.32 {d20[1]}, [r2], r1 + vst1.32 {d21[0]}, [r0] + vst1.32 {d21[1]}, [r2] +3: + it mi + popmi {pc} + @ P0a + vst1.32 {d18[0]}, [r3], r1 + vst1.32 {d18[1]}, [ip], r1 + vst1.32 {d19[0]}, [r3] + vst1.32 {d19[1]}, [ip] + pop {pc} +.endm + + +@ The NEON version is faster under ideal circumstances (i.e. everything in L1) +@ But in real world testing it is ~20% slower, presumably due to code size + +#if 0 // NEON version + +/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + * int in_inc0, int in_inc1) + */ +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + mov ip, sp + push {a1-a3,v1-v8,lr} + ldm ip, {v1-v6} + cmp a1, #2 + bls 2f + vpush {d8-d13} + sub v5, v5, #10 + sub v6, v6, #10 +1: + vld2.32 {d0[0], d2[0]}, [a3]! + vld2.32 {d4[0], d6[0]}, [a4]! + vmov.u8 q12, #0 + ldrb a2, [a3], #1 + ldrb ip, [a4], #1 + ldrb v8, [a3], #1 + ldrb lr, [a4], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d24[0]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d25[0]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d16[0]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d20[0]}, [ip] + vld1.32 {d18[0]}, [v8] + vld1.32 {d22[0]}, [lr] + + vld2.32 {d0[1], d2[1]}, [a3]! + vld2.32 {d4[1], d6[1]}, [a4]! 
+ ldrb a2, [a3], #1 + vmov.u16 d12, #1 + ldrb ip, [a4], #1 + vmov.u16 d13, #2 + ldrb v8, [a3], #1 + vmov.u16 d27, #4 + ldrb lr, [a4], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d24[2]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d25[2]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d16[1]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d20[1]}, [ip] + vld1.32 {d18[1]}, [v8] + vld1.32 {d22[1]}, [lr] + + vld2.32 {d1[0], d3[0]}, [a3]! + vld2.32 {d5[0], d7[0]}, [a4]! + ldrb a2, [a3], #1 + ldrb ip, [a4], #1 + ldrb lr, [a4], #1 + ldrb v8, [a3], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d24[4]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d25[4]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d17[0]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d21[0]}, [ip] + vld1.32 {d19[0]}, [v8] + vld1.32 {d23[0]}, [lr] + + vld2.32 {d1[1], d3[1]}, [a3]! + vld2.32 {d5[1], d7[1]}, [a4]! + ldrb a2, [a3], #1 + ldrb ip, [a4], #1 + ldrb v8, [a3], #1 + ldrb lr, [a4], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d24[6]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d25[6]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d17[1]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d21[1]}, [ip] + vld1.32 {d19[1]}, [v8] + vld1.32 {d23[1]}, [lr] + + @ So now we have: + @ q0.32[i] = curr[i].mv[0] + @ q1.32[i] = curr[i].mv[1] + @ q2.32[i] = neigh[i].mv[0] + @ q3.32[i] = neigh[i].mv[1] + @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] + @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] + @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] + @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] + @ d24.16[i] = curr[i].pred_flag + @ d25.16[i] = neigh[i].pred_flag + + vtst.16 d28, d24, d12 + vtst.16 d29, d24, d13 + vadd.i16 d8, d24, d12 + vadd.i16 d9, d25, d12 + vtst.16 d30, d25, d12 + vtst.16 d31, d25, d13 + veor d26, d8, d9 + ldr lr, [sp, 6*8 + 1*4] + vmovl.s16 q4, d28 + vmovl.s16 q5, d29 + teq lr, #1 + vmovl.s16 q14, d30 + it ne + lslne v1, lr, #1 + vmovl.s16 q15, d31 + it ne + rsbne v2, v1, #32 + vbif q0, q1, q4 + vbif q2, q3, q14 + vbif q1, q0, q5 + vbif q3, 
q2, q15 + vabd.s16 q12, q0, q2 + vabd.s16 q2, q1 + vabd.s16 q0, q3 + vabd.s16 q1, q3 + vbif q8, q9, q4 + vbif q10, q11, q14 + vbif q9, q8, q5 + vbif q11, q10, q15 + vclt.u16 d6, d24, d27 + vclt.u16 d8, d2, d27 + vclt.u16 d7, d25, d27 + vclt.u16 d9, d3, d27 + vclt.u16 d2, d0, d27 + vclt.u16 d0, d4, d27 + vclt.u16 d3, d1, d27 + vclt.u16 d1, d5, d27 + vceq.i32 q12, q10, q8 + vceq.i32 q10, q9 + vceq.i32 q8, q11 + vceq.i32 q9, q11 + vshrn.i32 d6, q3, #8 + vshrn.i32 d7, q4, #8 + vshrn.i32 d8, q1, #8 + vshrn.i32 d9, q0, #8 + vmovn.i32 d4, q12 + vmovn.i32 d2, q10 + vmovn.i32 d3, q8 + vmovn.i32 d5, q9 + vand q2, q3 + vrev16.8 q3, q3 + vand q2, q3 + vand q1, q4 + vrev16.8 q4, q4 + vand q1, q4 + vand d4, d5 + vand d2, d3 + vbic d0, d12, d4 + vshr.u16 d26, #2 + vbic d0, d2 + vmov.i16 d1, #0x5555 + vorr d0, d26 + bne 10f + + @ Merge results into result word, no duplicates + vmov a2, s0 + vmov v8, s1 + vmov.u16 ip, d0[1] + vmov.u16 lr, d0[3] + lsl a2, #30 + lsl v8, #30 + lsl ip, #30 + lsl lr, #30 + orr a2, ip, a2, lsr #2 + orr v8, lr, v8, lsr #2 + orr a2, v8, a2, lsr #4 + subs a1, #4 + orr v7, a2, v7, lsr #8 + bhi 1b + + mov a1, #32 + ldr a3, [sp, #6*8] + vpop {d8-d13} + sub a1, a1, a3, lsl #1 + mov a1, v7, lsr a1 + pop {a2-a4,v1-v8,pc} +10: + @ Merge results into result word, with duplicates + vmul.i16 d0, d1 + vmov a2, s0 + vmov v8, s1 + vmov.u16 ip, d0[1] + vmov.u16 lr, d0[3] + lsl a2, v2 + subs a1, #4 + lsl v8, v2 + lsl ip, v2 + lsl lr, v2 + ldr v2, [sp, #6*8 + 12*4 + 1*4] +T lsr a2, v1 +T orr a2, ip, a2 +A orr a2, ip, a2, lsr v1 + lsl ip, v1, #1 +T lsr v8, v1 +T orr v8, lr, v8 +A orr v8, lr, v8, lsr v1 + lsl lr, v1, #2 +T lsr a2, ip +T orr a2, v8, a2 +A orr a2, v8, a2, lsr ip + ldr v1, [sp, #6*8 + 12*4] +T lsr v7, lr +T orr v7, a2, v7 +A orr v7, a2, v7, lsr lr + bhi 1b + + mov a1, #32 + ldrd a3, a4, [sp, #6*8] + vpop {d8-d13} + mls a1, a3, a4, a1 + mls a1, a3, a4, a1 + mov a1, v7, lsr a1 + pop {a2-a4,v1-v8,pc} + + +2: + sub v5, v5, #10 + sub v6, v6, #10 + vmov.u8 d16, #0 + 
blo 3f + vld2.32 {d0[0], d1[0]}, [a3]! + vld2.32 {d2[0], d3[0]}, [a4]! + ldrb a2, [a3], #1 + ldrb ip, [a4], #1 + ldrb lr, [a4], #1 + ldrb v8, [a3], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d16[0]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d16[4]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d4[0]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d5[0]}, [ip] + vld1.32 {d6[0]}, [v8] + vld1.32 {d7[0]}, [lr] + +3: + vld2.32 {d0[1], d1[1]}, [a3]! + vld2.32 {d2[1], d3[1]}, [a4]! + ldrb a2, [a3], #1 + vmov.u16 d17, #1 + ldrb ip, [a4], #1 + vmov.u16 d18, #2 + ldrb v8, [a3], #1 + vmov.u16 d19, #4 + ldrb lr, [a4], #1 + add a2, v1, a2, lsl #2 + vld1.8 {d16[2]}, [a3], v5 + add ip, v3, ip, lsl #2 + vld1.8 {d16[6]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d4[1]}, [a2] + add lr, v4, lr, lsl #2 + vld1.32 {d5[1]}, [ip] + vld1.32 {d6[1]}, [v8] + vld1.32 {d7[1]}, [lr] + + @ So now we have: + @ d0.32[i] = curr[i].mv[0] + @ d1.32[i] = curr[i].mv[1] + @ d2.32[i] = neigh[i].mv[0] + @ d3.32[i] = neigh[i].mv[1] + @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] + @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] + @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] + @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] + @ d16.16[i] = curr[i].pred_flag + @ d16.16[2+i] = neigh[i].pred_flag + + vtst.16 d20, d16, d17 + vtst.16 d22, d16, d18 + vadd.i16 d30, d16, d17 + vswp d2, d3 + ldr lr, [sp, #1*4] + vmovl.s16 q10, d20 + teq lr, #1 + vmovl.s16 q11, d22 + it ne + lslne v1, lr, #1 + vbif d0, d1, d20 + vbif d4, d6, d20 + vbif d3, d2, d21 + vbif d5, d7, d21 + vbif d1, d0, d22 + vbif d6, d4, d22 + vbif d2, d3, d23 + vbif d7, d5, d23 + vshr.u16 d30, #2 + vabd.s16 d24, d0, d3 + vabd.s16 d25, d1, d2 + vabd.s16 q0, q0, q1 + vceq.i32 d2, d4, d5 + vceq.i32 d20, d5, d6 + vceq.i32 d21, d4, d7 + vceq.i32 d3, d6, d7 + vclt.u16 d6, d24, d19 + vclt.u16 d7, d25, d19 + vclt.u16 d22, d1, d19 + vclt.u16 d23, d0, d19 + vshrn.i32 d6, q3, #8 + vmovn.i32 d2, q1 + vshrn.i32 d7, q11, #8 + vmovn.i32 d3, q10 + vand q0, q3, q1 + it ne + rsbne v2, 
v1, #32 + vrev16.8 q3, q3 + vand q0, q3 + vsra.u64 d30, #32 + vshr.u64 q1, q0, #32 + vand q0, q1 + vbic d0, d17, d0 + vand d30, d30, d17 + vbic d0, d1 + vmov.i16 d1, #0x5555 + vorr d0, d30 + bne 10f + + @ Construct result word, no duplicates + cmp a1, #2 + vmov.u16 a1, d0[1] + vmov.u16 a2, d0[0] + it eq + orreq a1, a2, a1, lsl #2 + pop {a2-a4,v1-v8,pc} +10: + @ Construct result word, with duplicates + cmp a1, #2 + vmul.i16 d0, d1 + vmov a2, s0 + vmov.u16 a1, d0[1] + lsl a2, #16 + pkhbt a1, a1, a1, lsl #16 + lsr a2, v2 + lsr a1, v2 +T itt eq +T lsleq a1, v1 +T orreq a1, a2, a1 +A orreq a1, a2, a1, lsl v1 + pop {a2-a4,v1-v8,pc} +endfunc + + + +#else // non-NEON version + + +/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + * int in_inc0, in_inc1) + */ +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 + push {a2-a4,v1-v8,lr} + mov v6, #32 +1: ldmdb ip, {v1-v4} + ldrsb v5, [a3, #8] @ curr->ref_idx + ldrsb v8, [a3, #9] + ldrsb ip, [a4, #8] @ neigh->ref_idx + ldrsb lr, [a4, #9] + ldr v1, [v1, v5, lsl #2] + ldrb v5, [a3, #10] @ curr->pred_flag + ldr v2, [v2, v8, lsl #2] + ldrb v8, [a4, #10] @ neigh->pred_flag + ldr v3, [v3, ip, lsl #2] + ldr v4, [v4, lr, lsl #2] + teq v5, #3 + beq 20f + teq v8, #3 + beq 90f + + tst v5, #1 + itee ne + ldrne v5, [a3, #0] @ curr->mv[0] + moveq v1, v2 + ldreq v5, [a3, #4] @ curr->mv[1] + tst v8, #1 + itee ne + ldrne v8, [a4, #0] @ neigh->mv[0] + moveq v3, v4 + ldreq v8, [a4, #4] @ neigh->mv[1] + teq v1, v3 + bne 10f + ldr lr, =0xFFFCFFFC + ssub16 ip, v8, v5 + ssub16 v5, v5, v8 + sel v5, v5, ip + ands v5, v5, lr + @ drop through +10: it ne + movne v5, #1<<30 +11: + sub v6, v6, #2 +T mov v7, v7, lsr #2 + subs a2, a2, #1 +A orr v7, v5, v7, lsr #2 +T orr v7, v5, v7 + bhi 11b + + ldrd v3, v4, [sp, #16*4] + ldr a2, [sp] + add ip, sp, 
#16*4 + subs a1, a1, #1 + add a3, a3, v3 + add a4, a4, v4 + bhi 1b + mov a1, v7, lsr v6 + pop {a2-a4,v1-v8,pc} + +20: teq v8, #3 + bne 10b + + teq v1, v3 + it eq + teqeq v2, v4 + bne 40f + teq v1, v2 + bne 30f + + ldrd v1, v2, [a3] @ curr->mv + ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 + ssub16 v5, v1, v3 + sel v5, v5, ip + ands v5, v5, lr + bne 25f + ssub16 ip, v4, v2 + ssub16 v5, v2, v4 + sel v5, v5, ip + ands v5, v5, lr + beq 11b + @ drop through +25: ssub16 ip, v4, v1 + ssub16 v5, v1, v4 + sel v5, v5, ip + ands v5, v5, lr + bne 10b + ssub16 ip, v3, v2 + ssub16 v5, v2, v3 + sel v5, v5, ip + ands v5, v5, lr + b 10b + +30: ldrd v1, v2, [a3] @ curr->mv + ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + ssub16 ip, v3, v1 + ssub16 v5, v1, v3 + sel v5, v5, ip + ands v5, v5, lr + bne 10b + ssub16 ip, v4, v2 + ssub16 v5, v2, v4 + sel v5, v5, ip + ands v5, v5, lr + b 10b + +40: teq v1, v4 + ite eq + teqeq v2, v3 + bne 10b + + ldrd v1, v2, [a3] @ curr->mv + ldrd v3, v4, [a4] @ neigh->mv + ldr lr, =0xFFFCFFFC + b 25b + +90: + mov v5, #1<<30 + b 11b +endfunc + + +#endif + + +@ ============================================================================= +@ +@ 10 bit + +function hevc_loop_filter_luma_body_10 + m_filter_luma 10, q11, q15 +endfunc + +function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 + hevc_loop_filter_luma_start + b .Lh_loop_luma_common_10 +endfunc + +function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 + cmp r3, #0 + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes + ldr r10, [sp, #32] +.Lh_loop_luma_common_10: + m_filter_h_luma_16 10 +endfunc + +function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 + hevc_loop_filter_luma_start + sub r4, r0, #8 + b .Lv_loop_luma_common_10 +endfunc + +function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 + cmp r3, #0 + it eq + bxeq lr + push {r4-r10,lr} @ 32 bytes + ldr r4, [sp, #36] + ldr r10, [sp, #32] + +.Lv_loop_luma_common_10: + m_filter_v_luma_16 10 +endfunc + 
+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 + m_filter_h_uv_16 10 +endfunc + +function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 + m_filter_v_uv2_16 10 +endfunc + diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S new file mode 100644 index 0000000000..7ed5c7dc52 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +/* uses registers q8 - q13 for temp values */ +.macro tr4_luma_shift shift + vaddl.s16 q8, d28, d30 // c0 = src0 + src2 + vaddl.s16 q9, d30, d31 // c1 = src2 + src3 + vsubl.s16 q10, d28, d31 // c2 = src0 - src3 + vaddl.s16 q11, d28, d31 // src0 + src3 + + vmul.i32 q12, q8, d1[0] // 29 * c0 + vmul.i32 q13, q10, d2[0] // 55 * c2 + vmul.i32 q8, q8, d2[0] // 55 * c0 + vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 + + vsubw.s16 q11, q11, d30 // src0 - src2 + src3 + vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 + vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 + vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 + + vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) + vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 + vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 + vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 + + vqrshrn.s32 d28, q12, \shift + vqrshrn.s32 d29, q13, \shift + vqrshrn.s32 d30, q11, \shift + vqrshrn.s32 d31, q8, \shift +.endm + +/* uses registers q8 - q11 for temp values */ +.macro tr4_shift shift + vmull.s16 q9, d29, d0[0] // 83 * src1 + vmull.s16 q8, d29, d0[1] // 36 * src1 + vshll.s16 q14, d28, #6 // 64 * src0 + vshll.s16 q10, d30, #6 // 64 * src2 + vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 + vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 + vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 + vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 + vadd.s32 q14, q11, q9 // e0 + o0 + vadd.s32 q15, q10, q8 // e1 + o1 + vsub.s32 q8, q10, q8 // e1 - o1 + vsub.s32 q9, q11, q9 // e0 - o0 + + vqrshrn.s32 d28, q14, \shift + vqrshrn.s32 d29, q15, \shift + vqrshrn.s32 d30, q8, \shift + vqrshrn.s32 d31, q9, \shift +.endm + +.macro tr8_process d0, d1, d2, d3, 
d4, d5, d6, d7, \ + tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ + tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ + shift, I1, I2, I3 + + vmull.s16 q4, \d1, d1[1] // 89 * src1 + \I1 + vmull.s16 q5, \d1, d1[0] // 75 * src1 + \I2 + vmull.s16 q6, \d1, d1[3] // 50 * src1 + \I3 + vmull.s16 q7, \d1, d1[2] // 18 * src1 + vmlal.s16 q4, \d3, d1[0] // 75 * src3 + vmlsl.s16 q5, \d3, d1[2] //-18 * src3 + vmlsl.s16 q6, \d3, d1[1] //-89 * src3 + vmlsl.s16 q7, \d3, d1[3] //-50 * src3 + + // tr4 + vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) + vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) + + vmlal.s16 q4, \d5, d1[3] // 50 * src5 + vmlsl.s16 q5, \d5, d1[1] //-89 * src5 + vmlal.s16 q6, \d5, d1[2] // 18 * src5 + vmlal.s16 q7, \d5, d1[0] // 75 * src5 + + vshll.s16 q3, \d0, #6 // 64 * src(0*2) + vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) + vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 + vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 + vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0 + vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 + + vmlal.s16 q4, \d7, d1[2] // 18 * src7 + vmlsl.s16 q5, \d7, d1[3] //-50 * src7 + vmlal.s16 q6, \d7, d1[0] // 75 * src7 + vmlsl.s16 q7, \d7, d1[1] //-89 * src7 + + vsub.i32 q3, \tmp1, q1 // e0 - o0 + vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 + vadd.i32 q1, \tmp0, q2 // e1 + o1 + vsub.i32 q2, \tmp0, q2 // e1 - o1 + + vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] + vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] + vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] + vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] + vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] + vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] + vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] + vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] + vqrshrn.s32 \d0, \tmp0, #\shift + vqrshrn.s32 \d4, \tmp1, #\shift + vqrshrn.s32 \d1, q3, #\shift + vqrshrn.s32 \d5, q1, #\shift + vqrshrn.s32 \d2, q6, #\shift + vqrshrn.s32 \d6, q5, #\shift 
+        vqrshrn.s32 \d3, q7, #\shift
+        vqrshrn.s32 \d7, q4, #\shift
+.endm
+// tr8_vert: load the eight register arguments alternately from [r0] and [r2] (each post-incremented by r3), then run tr8_process with a fixed shift of 7
+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
+        vld1.16     {\d0}, [r0 :64], r3
+        vld1.16     {\d1}, [r2 :64], r3
+        vld1.16     {\d2}, [r0 :64], r3
+        vld1.16     {\d3}, [r2 :64], r3
+        vld1.16     {\d4}, [r0 :64], r3
+        vld1.16     {\d5}, [r2 :64], r3
+        vld1.16     {\d6}, [r0 :64], r3
+        vld1.16     {\d7}, [r2 :64], r3
+
+        tr8_process \
+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
+            \q01, \q23, 7, "\I1", "\I2", "\I3"
+.endm
+// tr8_horiz: run tr8_process with the caller's shift (no interleaved I1-I3), vzip.16 the first four results with the last four, then vst4.16 them to [r0] and [r2] (each post-incremented by r3)
+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
+        tr8_process \
+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
+            \q01, \q23, \shift
+
+        vzip.16     \d0, \d4
+        vzip.16     \d1, \d5
+        vzip.16     \d2, \d6
+        vzip.16     \d3, \d7
+        vst4.16     {\d0-\d3}, [r0 :128], r3
+        vst4.16     {\d4-\d7}, [r2 :128], r3
+.endm
+// rpi_hevc_idct_fn_neon.S is included twice: once here with BIT_DEPTH 8 and once after the tables with BIT_DEPTH 10
+#define BIT_DEPTH 8
+#include "rpi_hevc_idct_fn_neon.S"
+
+.text
+
+.align 4
+tr4f:
+.word 0x00240053 // d0[1] = 36 and d0[0] = 83 (lanes as used by tr8_process)
+.word 0x00000000
+tr8f:
+.word 0x0059004b // d1[1] = 89, d1[0] = 75 (lanes as used by tr8_process)
+.word 0x00320012 // d1[3] = 50, d1[2] = 18
+tr16:
+.word 0x005a0057 // 90, d2[0] = 87
+.word 0x00500046 // 80, d2[2] = 70
+.word 0x0039002b // 57, d2[0] = 43 - NOTE(review): label repeats d2[0]; presumably the next register (d3[0]) - confirm against the load in rpi_hevc_idct_fn_neon.S
+.word 0x00190009 // 25, d2[2] = 9 - NOTE(review): presumably d3[2] - confirm
+
+#undef BIT_DEPTH
+#define BIT_DEPTH 10
+#include "rpi_hevc_idct_fn_neon.S"
+
diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c
new file mode 100644
index 0000000000..109fa98c29
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/rpi_hevcdsp.h"
+#include "rpi_hevcdsp_arm.h"
+/* ARM entry point: hand c and bit_depth to the NEON initialiser, but only when the CPU flags report NEON. */
+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+    const int flags = av_get_cpu_flags();
+    if (!have_neon(flags))
+        return; /* no NEON: this function installs nothing */
+    ff_hevcdsp_rpi_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
new file mode 100644
index 0000000000..9294ab8010
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/rpi_hevcdsp.h" +#include "rpi_hevcdsp_arm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/bit_depth_template.c" + +// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but +// have been removed from head as we never use them. + +void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + +void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); +void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, + uint8_t * _pix_l); +void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, + unsigned int no_f); +void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); + +void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); +void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, + uint8_t * _pix_l); 
+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, + unsigned int no_f); +void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); + +void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); +void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); +void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); +void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); +void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); +void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); +void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); + +void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); +void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); +void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); +void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); +void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); +void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); +void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); + +void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); + +void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); + + +void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void 
ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride); + +void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); + + +void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); + + +void 
ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_v); +void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); +void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); +void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); + +void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + +void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t 
*_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + +void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); +void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); +void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); + +void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); +void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); +void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); + +void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); +void 
ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); +void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + +void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); +void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); +void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + +void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); +void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); +void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); +void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int 
sao_left_class, int width, int height);
+
+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
+
+
+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
+                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
+                                                int in_inc0, int in_inc1);
+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
+
+/* 48-wide SAO edge helpers: call the 32-wide routine, then the 16-wide routine with both pointers advanced by 32 bytes (_8) or 64 bytes (_10). The width argument is not read. */
+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+    ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
+}
+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+    ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
+}
+/* 48-wide SAO band helpers: same 32 + 16 split and the same 32/64 byte pointer advance as the edge helpers. */
+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                      int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+    ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+    ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                      int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+    ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+    ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+}
+/* 24-wide helpers, compiled only when SAO_FILTER_N == 6: 16-wide call, then 8-wide call with pointers advanced by 16 bytes (_8) or 32 bytes (_10). */
+#if SAO_FILTER_N == 6
+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+    ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
+}
+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+    ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
+}
+
+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                      int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+    ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+    ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                      int16_t *sao_offset_val, int sao_left_class, int width, int height)
+{
+    ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+    ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+}
+/* _c variants of the 24-wide helpers: same 16 + 8 split, but the second call advances the pointers by 32 bytes (_8) or 64 bytes (_10). */
+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+                                  int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+    ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+                                  int eo, int width, int height)
+{
+    ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+    ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+}
+
+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+                                  int width, int height)
+{
+    ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+    ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
+                                  ptrdiff_t stride_dst, ptrdiff_t
stride_src,
+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+                                  int width, int height)
+{
+    ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+    ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+}
+#endif
+
+
+/* Build-time guard: per the message below, the .S code uses an SAO edge source stride of 160. */
+#if RPI_HEVC_SAO_BUF_STRIDE != 160
+#error SAO edge src stride not 160 - value used in .S
+#endif
+/* Fill c with the NEON routines for bit_depth 8 or 10; any other depth only gets the two depth-independent hooks set at the end. */
+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+    if (bit_depth == 8) {
+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_8;
+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_8;
+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_8;
+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_8;
+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_8;
+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_8;
+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_8;
+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_8;
+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_8;
+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_8;
+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_8;
+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_8;
+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_8;
+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_8;
+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_8;
+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_8;
+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_8;
+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_8;
+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_8;
+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_8;
+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_8;
+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_8;
+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_8;
+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_8;
+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_8;
+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_8;
+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_8;
+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_8;
+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_8;
+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_8;
+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_8;
+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_8;
+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_8;
+#if SAO_FILTER_N == 6
+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_8;
+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_8;
+#endif
+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_8;
+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_8;
+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_8;
+        /* Only slots 0-2 (plus 5 below) of the _c SAO tables are written here; slots 3 and 4 are not touched by this function. */
+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_8;
+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_8;
+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_8;
+
+#if SAO_FILTER_N == 6
+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_8;
+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_8;
+#endif
+    }
+    else if (bit_depth == 10) {
+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_10;
+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_10;
+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_10;
+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_10;
+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_10;
+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_10;
+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_10;
+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_10;
+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_10;
+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_10;
+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_10;
+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_10;
+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_10;
+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_10;
+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_10;
+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_10;
+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_10;
+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_10;
+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_10;
+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_10;
+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_10;
+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_10;
+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_10;
+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_10;
+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_10;
+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_10;
+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_10;
+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_10;
+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_10;
+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_10;
+
+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_10;
+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_10;
+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_10;
+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_10;
+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_10;
+#if SAO_FILTER_N == 6
+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_10;
+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_10;
+#endif
+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_10;
+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_10;
+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_10;
+        /* As in the 8-bit branch, _c slots 3 and 4 are not written here. */
+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_10;
+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_10;
+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_10;
+
+#if SAO_FILTER_N == 6
+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_10;
+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_10;
+#endif
+    }
+    /* NOTE(review): presumably the boundary-strength assembly depends on these HEVCRpiMvField offsets - confirm against the .S source. Set for every bit depth. */
+    assert(offsetof(HEVCRpiMvField, mv) == 0);
+    assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
+    assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
+    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
+    c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
+}
diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
new file mode 100644
index 0000000000..93876d14c0
--- /dev/null
+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
@@ -0,0 +1,620 @@
+/*
+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +Authors: John Cox, Ben Avison +*/ + +#include "libavutil/arm/asm.S" +#include "neon.S" + + .arch_extension mp @ enable PLDW + +#define BIT_DEPTH 10 + +.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX + vmax.s16 \Q0, \Q_MIN + vmax.s16 \Q1, \Q_MIN + vmax.s16 \Q2, \Q_MIN + vmax.s16 \Q3, \Q_MIN + vmin.s16 \Q0, \Q_MAX + vmin.s16 \Q1, \Q_MAX + vmin.s16 \Q2, \Q_MAX + vmin.s16 \Q3, \Q_MAX +.endm + +@ add_residual4x4( +@ uint16_t *_dst, [r0] +@ int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 + add ip, r0, r2 @ ip = row 1; r0 and ip each step two rows at a time + vld1.16 {q10, q11}, [r1] + lsl r2, #1 + vld1.16 {d0}, [r0 :64], r2 + vld1.16 {d1}, [ip :64], r2 + vld1.16 {d2}, [r0 :64] + vld1.16 {d3}, [ip :64] + sub r0, r2 @ rewind r0 to row 0 for the stores + vqadd.s16 q0, q10 + sub ip, r2 @ rewind ip to row 1 + vqadd.s16 q1, q11 + vmov.i16 q8, #0 @ lower clip bound + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound (max sample value) + vmax.s16 q0, q0, q8 + vmax.s16 q1, q1, q8 + vmin.s16 q0, q0, q9 + vmin.s16 q1, q1, q9 + vst1.16 {d0}, [r0 :64], r2 + vst1.16 {d1}, [ip :64], r2 + vst1.16 {d2}, [r0 :64] + vst1.16 {d3}, [ip :64] + bx lr + +endfunc + +@ add_residual4x4_dc( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc) [r2] + +function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 + add ip, r0, r1 + vdup.16 q15, r2 + lsl r1, #1 + vld1.16 {d0}, [r0 :64], r1 + vld1.16 {d1}, [ip :64], r1 + vld1.16 {d2}, [r0 :64] + vld1.16 {d3}, [ip :64] + sub r0, r1 + vqadd.s16 q0, q15 + sub ip, r1 + vqadd.s16 q1, q15 + vmov.i16 q8, #0 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + vmax.s16 q0, q0, q8 + vmax.s16 q1, q1, q8 + vmin.s16 q0, q0, q9 + vmin.s16 q1, q1, q9 + vst1.16 {d0}, [r0 :64], r1 + vst1.16 {d1}, [ip :64], r1 + vst1.16 {d2}, [r0 :64] + vst1.16 {d3}, [ip :64] + bx lr + +endfunc + + +@ add_residual8x8( +@ uint16_t *_dst, [r0] +@ int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 + mov r3, #8 @ rows remaining; the loop handles 4 rows per pass + vmov.i64 q8, #0 + add ip, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + lsl r2, #1 +1: + vldm
r1!, {q10-q13} + vld1.16 {q0}, [r0 :128], r2 + vld1.16 {q1}, [ip :128], r2 + vld1.16 {q2}, [r0 :128] + vld1.16 {q3}, [ip :128] + sub r0, r2 + vqadd.s16 q0, q10 + sub ip, r2 + vqadd.s16 q1, q11 + subs r3, #4 + vqadd.s16 q2, q12 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0}, [r0 :128], r2 + vst1.16 {q1}, [ip :128], r2 + vst1.16 {q2}, [r0 :128], r2 + vst1.16 {q3}, [ip :128], r2 + bne 1b + bx lr + +endfunc + +@ add_residual4x4_dc_c( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc_uv) [r2] + +function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 + mov r3, #4 + vdup.32 q15, r2 @ r2 is used as a pair of 16-bit DCs, replicated pairwise + b 9f @ join the 8x8 DC code in the next function at 9: +endfunc + +@ add_residual8x8_dc( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc) [r2] + +function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r2 + mov r3, #8 +9: + vmov.i16 q8, #0 + add ip, r0, r1 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + lsl r1, #1 +1: + vld1.16 {q0}, [r0 :128], r1 + vld1.16 {q1}, [ip :128], r1 + vld1.16 {q2}, [r0 :128] + vld1.16 {q3}, [ip :128] + sub r0, r1 + vqadd.s16 q0, q15 + sub ip, r1 + vqadd.s16 q1, q15 + subs r3, #4 + vqadd.s16 q2, q15 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0}, [r0 :128], r1 + vst1.16 {q1}, [ip :128], r1 + vst1.16 {q2}, [r0 :128], r1 + vst1.16 {q3}, [ip :128], r1 + bne 1b + bx lr + +endfunc + +@ add_residual16x16( +@ uint16_t *_dst, [r0] +@ int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 + add ip, r0, r2 + vmov.i16 q8, #0 + lsl r2, #1 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + mov r3, #16 +1: + vldm r1!, {q10-q13} + @ For RPI Sand we could guarantee :256 but not for general + @ non-RPI allocation.
:128 is as good as we can claim + vld1.16 {q0, q1}, [r0 :128] + subs r3, #2 + vld1.16 {q2, q3}, [ip :128] + vqadd.s16 q0, q10 + vqadd.s16 q1, q11 + vqadd.s16 q2, q12 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0, q1}, [r0 :128], r2 + vst1.16 {q2, q3}, [ip :128], r2 + bne 1b + bx lr +endfunc + +@ add_residual8x8_dc_c( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc_uv) [r2] + +function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 + mov r3, #8 + vdup.32 q15, r2 + b 9f @ join the 16x16 DC code in the next function at 9: +endfunc + +@ add_residual16x16_dc( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc) [r2] + +function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 + vdup.i16 q15, r2 + mov r3, #16 +9: + vmov.i16 q8, #0 + add ip, r0, r1 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + lsl r1, #1 +1: + @ For RPI Sand we could guarantee :256 but not for general + @ non-RPI allocation. :128 is as good as we can claim + vld1.16 {q0, q1}, [r0 :128] + subs r3, #2 + vqadd.s16 q0, q15 + vqadd.s16 q1, q15 + vld1.16 {q2, q3}, [ip :128] + vqadd.s16 q2, q15 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0, q1}, [r0 :128], r1 + vst1.16 {q2, q3}, [ip :128], r1 + bne 1b + bx lr + +endfunc + + +@ add_residual32x32( +@ uint16_t *_dst, [r0] +@ int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 + push {lr} + mov r3, #32 + vmov.i16 q8, #0 + add lr, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + add ip, r0, #32 +1: + vldm r1!, {q10-q13} + vldm r0, {q0-q3} + vqadd.s16 q0, q10 + pldw [lr] + vqadd.s16 q1, q11 + add lr, r2 + vqadd.s16 q2, q12 + subs r3, #1 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0-q1}, [r0], r2 + vst1.16 {q2-q3}, [ip], r2 + bne 1b + pop {pc} + +endfunc + +@ add_residual16x16_dc_c( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc_uv) [r2] + +function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 + mov r3,
#16 + vdup.32 q15, r2 + b 9f +endfunc + +@ add_residual32x32_dc( +@ uint16_t *_dst, [r0] +@ ptrdiff_t stride, [r1] +@ int dc) [r2] + +function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r2 + mov r3, #32 +9: + vmov.i16 q8, #0 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + add ip, r0, #32 +1: + vldm r0, {q0-q3} + vqadd.s16 q0, q15 + subs r3, #1 + vqadd.s16 q1, q15 + vqadd.s16 q2, q15 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + vst1.16 {q0-q1}, [r0], r1 + vst1.16 {q2-q3}, [ip], r1 + bne 1b + bx lr + +endfunc + +@ ============================================================================ +@ U add + +@ add_residual4x4_u( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the V (odd-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r3 + add ip, r0, r2 + vld1.16 {q10, q11}, [r1 :256] + lsl r2, #1 + vld2.16 {d0, d2}, [r0 :128], r2 + vld2.16 {d1, d3}, [ip :128], r2 + vld2.16 {d4, d6}, [r0 :128] + vld2.16 {d5, d7}, [ip :128] + sub r0, r2 + vmov.i16 q8, #0 + sub ip, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + + vqadd.s16 q0, q10 + vqadd.s16 q1, q15 + vqadd.s16 q2, q11 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + + vst2.16 {d0, d2}, [r0 :128], r2 + vst2.16 {d1, d3}, [ip :128], r2 + vst2.16 {d4, d6}, [r0 :128] + vst2.16 {d5, d7}, [ip :128] + bx lr +endfunc + +@ add_residual8x8_u( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the V (odd-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r3 + mov r3, #8 + vmov.i16 q8, #0 + add ip, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + lsl r2, #1 +1: + vld2.16 {q0, q1}, [r0 :256] + subs r3, #2 + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q10, q11}, [r1 :256]!
+ vqadd.s16 q0, q10 + vqadd.s16 q1, q15 + vqadd.s16 q2, q11 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + bx lr +endfunc + +@ add_residual16x16_u( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the V (odd-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 + push {lr} + vdup.16 q15, r3 + mov r3, #16 + vmov.i16 q8, #0 + add lr, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + add ip, r0, #32 +1: + vld2.16 {q0, q1}, [r0 :256] + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q10, q11}, [r1 :256]! + vqadd.s16 q0, q10 + pldw [lr] + vqadd.s16 q1, q15 + add lr, r2 + vqadd.s16 q2, q11 + subs r3, #1 + vqadd.s16 q3, q15 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + pop {pc} +endfunc + +@ ============================================================================ +@ V add + +@ add_residual4x4_v( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the U (even-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r3 + add ip, r0, r2 + vld1.16 {q10, q11}, [r1 :256] + lsl r2, #1 + vld2.16 {d0, d2}, [r0 :128], r2 + vld2.16 {d1, d3}, [ip :128], r2 + vld2.16 {d4, d6}, [r0 :128] + vld2.16 {d5, d7}, [ip :128] + sub r0, r2 + vmov.i16 q8, #0 + sub ip, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + + vqadd.s16 q0, q15 + vqadd.s16 q1, q10 + vqadd.s16 q2, q15 + vqadd.s16 q3, q11 + clip16_4 q0, q1, q2, q3, q8, q9 + + vst2.16 {d0, d2}, [r0 :128], r2 + vst2.16 {d1, d3}, [ip :128], r2 + vst2.16 {d4, d6}, [r0 :128] + vst2.16 {d5, d7}, [ip :128] + bx lr +endfunc + +@ add_residual8x8_v( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the U (even-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 + vdup.16 q15, r3 + mov r3, #8 + vmov.i16 q8, #0 + add ip, r0, r2 + vmov.i16 q9,
#(1 << BIT_DEPTH) - 1 + lsl r2, #1 +1: + vld2.16 {q0, q1}, [r0 :256] + subs r3, #2 + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q10, q11}, [r1 :256]! + vqadd.s16 q0, q15 + vqadd.s16 q1, q10 + vqadd.s16 q2, q15 + vqadd.s16 q3, q11 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + bx lr +endfunc + +@ add_residual16x16_v( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc) [r3] DC added to the U (even-lane) samples + +function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 + push {lr} + vdup.16 q15, r3 + mov r3, #16 + vmov.i16 q8, #0 + add lr, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + add ip, r0, #32 +1: + vld2.16 {q0, q1}, [r0 :256] + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q10, q11}, [r1 :256]! + vqadd.s16 q0, q15 + pldw [lr] + vqadd.s16 q1, q10 + add lr, r2 + vqadd.s16 q2, q15 + subs r3, #1 + vqadd.s16 q3, q11 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + pop {pc} +endfunc + +@ ============================================================================ +@ U & V add + +@ add_residual4x4_c( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 + vmov.i16 q8, #0 + add ip, r0, r2 + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + lsl r2, #1 + vldm r1, {q10-q13} + vld2.16 {d0, d2}, [r0 :128], r2 + vld2.16 {d1, d3}, [ip :128], r2 + vld2.16 {d4, d6}, [r0 :128] + vld2.16 {d5, d7}, [ip :128] + + sub r0, r2 + vqadd.s16 q0, q10 + sub ip, r2 + vqadd.s16 q1, q12 + vqadd.s16 q2, q11 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + + vst2.16 {d0, d2}, [r0 :128], r2 + vst2.16 {d1, d3}, [ip :128], r2 + vst2.16 {d4, d6}, [r0 :128] + vst2.16 {d5, d7}, [ip :128] + bx lr +endfunc + +@ add_residual8x8_c( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH),
export=1 + push {lr} + add ip, r0, r2 + lsl r2, #1 + vmov.i16 q8, #0 + add r3, r1, #(8*8*2) @ Offset to V + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + mov lr, #8 +1: + vld1.16 {q10, q11}, [r1 :256]! + subs lr, #2 + vld2.16 {q0, q1}, [r0 :256] + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q12, q13}, [r3 :256]! + vqadd.s16 q0, q10 + vqadd.s16 q1, q12 + vqadd.s16 q2, q11 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + pop {pc} +endfunc + +@ add_residual16x16_c( +@ uint16_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 + push {r4, lr} + vmov.i16 q8, #0 + add r3, r1, #(16*16*2) @ Offset to V + vmov.i16 q9, #(1 << BIT_DEPTH) - 1 + add ip, r0, #32 + add r4, r0, r2 + mov lr, #16 +1: + vld2.16 {q0, q1}, [r0 :256] + vld2.16 {q2, q3}, [ip :256] + vld1.16 {q10, q11}, [r1 :256]! + vld1.16 {q12, q13}, [r3 :256]! + vqadd.s16 q0, q10 + pldw [r4] + vqadd.s16 q1, q12 + add r4, r2 + vqadd.s16 q2, q11 + subs lr, #1 + vqadd.s16 q3, q13 + clip16_4 q0, q1, q2, q3, q8, q9 + vst2.16 {q0, q1}, [r0 :256], r2 + vst2.16 {q2, q3}, [ip :256], r2 + bne 1b + pop {r4,pc} +endfunc + diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S new file mode 100644 index 0000000000..d9a1d7d98c --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S @@ -0,0 +1,741 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox, Ben Avison +*/ + +#include "libavutil/arm/asm.S" +#include "neon.S" + + .arch_extension mp @ enable PLDW + +@ General notes: +@ +@ Residual is generally only guaranteed to be clipped to 16 bits. +@ This means that we do need to do vmovl, vqadd, vqmovun +@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away +@ with this). +@ +@ There is an exception for the DC case because its transform is guaranteed +@ to be small enough that overflow cannot occur during the first add. 
+ +@ ============================================================================ +@ Y add + +function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 + add ip, r0, r2 + vld1.16 {q0, q1}, [r1] + lsl r2, #1 + vld1.32 d4[0], [r0], r2 + rsb r3, r2, #0 @ r3 = -2*stride: the last two loads rewind r0/ip for the stores + vld1.32 d4[1], [ip], r2 + vld1.32 d5[0], [r0], r3 + vld1.32 d5[1], [ip], r3 + vmovl.u8 q8, d4 + vmovl.u8 q9, d5 + vqadd.s16 q0, q8 + vqadd.s16 q1, q9 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vst1.32 d0[0], [r0], r2 + vst1.32 d0[1], [ip], r2 + vst1.32 d1[0], [r0] + vst1.32 d1[1], [ip] + bx lr +endfunc + +function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 + push {r4, lr} + vld1.16 {q0, q1}, [r1]! + add ip, r0, r2 + vld1.8 {d6}, [r0] + add r4, r0, r2, lsl #1 + vld1.8 {d7}, [ip] + add lr, ip, r2, lsl #1 + lsl r2, #1 + mov r3, #8-2 @ loop counter; the final two rows are stored after the loop + vmovl.u8 q2, d6 + vmovl.u8 q3, d7 + vqadd.s16 q2, q0 + vqadd.s16 q3, q1 +1: + vld1.16 {q0, q1}, [r1]! + subs r3, #2 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vld1.8 {d6}, [r4], r2 + vld1.8 {d7}, [lr], r2 + vst1.8 {d4}, [r0], r2 + vst1.8 {d5}, [ip], r2 + vmovl.u8 q2, d6 + pldw [r4] + vmovl.u8 q3, d7 + vqadd.s16 q2, q0 + vqadd.s16 q3, q1 + bne 1b + + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vst1.8 {d4}, [r0] + vst1.8 {d5}, [ip] + pop {r4, pc} +endfunc + +function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 + vld1.16 {q0, q1}, [r1]! + add ip, r0, r2 + vld1.8 {q3}, [r0] + mov r3, #16-1 @ loop counter; the final row is stored after the loop + vmovl.u8 q2, d6 + vmovl.u8 q3, d7 + vqadd.s16 q2, q0 + vqadd.s16 q3, q1 +1: + vld1.16 {q0, q1}, [r1]!
+ subs r3, #1 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vld1.8 {q3}, [ip], r2 + vst1.8 {q2}, [r0], r2 + vmovl.u8 q2, d6 + pldw [ip] + vmovl.u8 q3, d7 + vqadd.s16 q2, q0 + vqadd.s16 q3, q1 + bne 1b + + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vst1.8 {q2}, [r0] + bx lr +endfunc + +function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 + vldm r1!, {q0-q3} + vld1.8 {q8, q9}, [r0] + add ip, r0, r2 + vmovl.u8 q10, d16 + mov r3, #32-1 + vmovl.u8 q11, d17 + vmovl.u8 q12, d18 + vmovl.u8 q13, d19 + vqadd.s16 q10, q0 + vqadd.s16 q11, q1 + vqadd.s16 q12, q2 + vqadd.s16 q13, q3 +1: + vldm r1!, {q0-q3} + vqmovun.s16 d20, q10 + vqmovun.s16 d21, q11 + vqmovun.s16 d22, q12 + vqmovun.s16 d23, q13 + vld1.8 {q8, q9}, [ip], r2 + subs r3, #1 + vst1.8 {q10, q11}, [r0], r2 + vmovl.u8 q10, d16 + pldw [ip] + vmovl.u8 q11, d17 + vmovl.u8 q12, d18 + vmovl.u8 q13, d19 + vqadd.s16 q10, q0 + vqadd.s16 q11, q1 + vqadd.s16 q12, q2 + vqadd.s16 q13, q3 + bne 1b + + vqmovun.s16 d20, q10 + vqmovun.s16 d21, q11 + vqmovun.s16 d22, q12 + vqmovun.s16 d23, q13 + vst1.8 {q10, q11}, [r0] + bx lr +endfunc + + +@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + +function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 + add ip, r0, r1 + vdup.16 q15, r2 + lsl r1, #1 + vld1.32 d4[0], [r0], r1 + rsb r3, r1, #0 + vld1.32 d4[1], [ip], r1 + vld1.32 d5[0], [r0], r3 + vld1.32 d5[1], [ip], r3 + vaddw.u8 q0, q15, d4 + vaddw.u8 q1, q15, d5 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vst1.32 d0[0], [r0], r1 + vst1.32 d0[1], [ip], r1 + vst1.32 d1[0], [r0] + vst1.32 d1[1], [ip] + bx lr +endfunc + +@ ============================================================================ +@ DC Y or C add + +@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] used as a packed pair of 16-bit DC values + +function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 + mov r3, #4-2 + vdup.32 q15, r2 + b 1f @ join the 8x8 DC code in the next function at its first 1: label +endfunc + +@
ff_hevc_rpi_add_residual_8x8_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + +function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #8-2 +1: vld1.8 d16, [r0] + add ip, r0, r1 + push {r4, lr} + vld1.8 d17, [ip] + add r4, r0, r1, lsl #1 + vaddw.u8 q0, q15, d16 + lsl r1, #1 + vaddw.u8 q1, q15, d17 + add lr, ip, r1 +1: + vld1.8 {d16}, [r4], r1 + vld1.8 {d17}, [lr], r1 + subs r3, #2 + vqmovun.s16 d4, q0 + vqmovun.s16 d5, q1 + vaddw.u8 q0, q15, d16 + vaddw.u8 q1, q15, d17 + vst1.8 {d4}, [r0], r1 + vst1.8 {d5}, [ip], r1 + bne 1b + + vqmovun.s16 d4, q0 + vqmovun.s16 d5, q1 + vst1.8 {d4}, [r0] + vst1.8 {d5}, [ip] + pop {r4, pc} +endfunc + + +@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] used as a packed pair of 16-bit DC values + +function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 + mov r3, #8-1 + vdup.32 q15, r2 + b 1f @ join the 16x16 DC code in the next function at its first 1: label +endfunc + +@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + +function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #16-1 +1: vld1.8 {q8}, [r0] + add ip, r0, r1 + vaddw.u8 q0, q15, d16 + vaddw.u8 q1, q15, d17 +1: + vld1.8 {q8}, [ip], r1 + subs r3, #1 + vqmovun.s16 d4, q0 + vqmovun.s16 d5, q1 + vaddw.u8 q0, q15, d16 + vaddw.u8 q1, q15, d17 + vst1.8 {q2}, [r0], r1 + bne 1b + + vqmovun.s16 d4, q0 + vqmovun.s16 d5, q1 + vst1.8 {q2}, [r0] + bx lr +endfunc + + +@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] used as a packed pair of 16-bit DC values + +function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 + mov r3, #16-1 + vdup.32 q15, r2 + b 1f @ join the 32x32 DC code in the next function at its first 1: label +endfunc + +@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + +function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #32-1 +1: vld1.8 {q8, q9}, [r0] + add
ip, r0, r1 + vaddw.u8 q0, q15, d16 + vaddw.u8 q1, q15, d17 + vaddw.u8 q2, q15, d18 + vaddw.u8 q3, q15, d19 +1: + vqmovun.s16 d20, q0 + vqmovun.s16 d21, q1 + vqmovun.s16 d22, q2 + vqmovun.s16 d23, q3 + vld1.8 {q8, q9}, [ip], r1 + subs r3, #1 + vaddw.u8 q0, q15, d16 + vaddw.u8 q1, q15, d17 + vaddw.u8 q2, q15, d18 + vaddw.u8 q3, q15, d19 + vst1.8 {q10, q11}, [r0], r1 + bne 1b + + vqmovun.s16 d20, q0 + vqmovun.s16 d21, q1 + vqmovun.s16 d22, q2 + vqmovun.s16 d23, q3 + vst1.8 {q10, q11}, [r0] + bx lr +endfunc + +@ ============================================================================ +@ U add + +@ add_residual4x4_u( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc_v) [r3] + +function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 + add ip, r0, r2 + vld1.16 {q0, q1}, [r1] + lsl r2, #1 + vld1.8 {d16}, [r0 :64], r2 + vld1.8 {d17}, [ip :64], r2 + vld1.8 {d18}, [r0 :64] + sub r0, r2 + vld1.8 {d19}, [ip :64] + sub ip, r2 + vdup.16 q2, r3 + vdup.16 q3, r3 + vmovl.u8 q10, d16 + vmovl.u8 q11, d17 + vmovl.u8 q12, d18 + vmovl.u8 q13, d19 + vzip.16 q0, q2 + vzip.16 q1, q3 + vqadd.s16 q0, q10 + vqadd.s16 q2, q11 + vqadd.s16 q1, q12 + vqadd.s16 q3, q13 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q2 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q3 + vst1.8 {d0}, [r0 :64], r2 + vst1.8 {d1}, [ip :64], r2 + vst1.8 {d2}, [r0 :64] + vst1.8 {d3}, [ip :64] + bx lr +endfunc + +@ add_residual8x8_u( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc_v) [r3] + +function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 + vdup.16 q15, r3 + add ip, r0, r2 + push {r4, lr} + vld2.8 {d16, d17}, [r0 :128] + lsl r2, #1 + vld2.8 {d18, d19}, [ip :128] + mov r3, #8-2 + vld1.16 {q0, q1}, [r1 :256]!
+ add r4, r0, r2 + vmovl.u8 q10, d16 + add lr, ip, r2 + vmovl.u8 q11, d18 + vqadd.s16 q0, q10 + vaddw.u8 q2, q15, d17 + vqadd.s16 q1, q11 + vaddw.u8 q3, q15, d19 +1: + vqmovun.s16 d20, q0 + vqmovun.s16 d21, q2 + vld2.8 {d16, d17}, [r4 :128], r2 + subs r3, #2 + vqmovun.s16 d22, q1 + vqmovun.s16 d23, q3 + vst2.8 {d20, d21}, [r0 :128], r2 + vld2.8 {d18, d19}, [lr :128], r2 + vst2.8 {d22, d23}, [ip :128], r2 + vld1.16 {q0, q1}, [r1 :256]! + vmovl.u8 q10, d16 + vmovl.u8 q11, d18 + vqadd.s16 q0, q10 + vaddw.u8 q2, q15, d17 + vqadd.s16 q1, q11 + vaddw.u8 q3, q15, d19 + bne 1b + + vqmovun.s16 d20, q0 + vqmovun.s16 d21, q2 + vqmovun.s16 d22, q1 + vqmovun.s16 d23, q3 + vst2.8 {d20, d21}, [r0 :128] + vst2.8 {d22, d23}, [ip :128] + pop {r4, pc} +endfunc + +@ add_residual16x16_u( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2] +@ int dc_v) [r3] + +function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 + vdup.16 q15, r3 + add ip, r0, r2 + vld2.8 {q8, q9}, [r0 :256] + mov r3, #16-1 + vld1.16 {q0, q1}, [r1 :256]! + vmovl.u8 q11, d16 + vmovl.u8 q12, d17 + vqadd.s16 q0, q11 + vaddw.u8 q11, q15, d18 + vqadd.s16 q1, q12 + vaddw.u8 q12, q15, d19 +1: + vld2.8 {q8, q9}, [ip :256], r2 + subs r3, #1 + vqmovun.s16 d20, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d21, q1 + vqmovun.s16 d23, q12 + vld1.16 {q0, q1}, [r1 :256]!
+ vst2.8 {q10, q11}, [r0 :256], r2 + vmovl.u8 q11, d16 + pldw [ip] + vmovl.u8 q12, d17 + vqadd.s16 q0, q11 + vaddw.u8 q11, q15, d18 + vqadd.s16 q1, q12 + vaddw.u8 q12, q15, d19 + bne 1b + + vqmovun.s16 d20, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d21, q1 + vqmovun.s16 d23, q12 + vst2.8 {q10, q11}, [r0 :256] + bx lr +endfunc + +@ ============================================================================ +@ V add + +@ add_residual4x4_v( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2]; int dc_u) [r3] + +function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 + add ip, r0, r2 + vld1.16 {q2, q3}, [r1] + lsl r2, #1 + vld1.8 {d16}, [r0 :64], r2 + vld1.8 {d17}, [ip :64], r2 + vld1.8 {d18}, [r0 :64] + sub r0, r2 + vld1.8 {d19}, [ip :64] + sub ip, r2 + vdup.16 q0, r3 + vdup.16 q1, r3 + vmovl.u8 q10, d16 + vmovl.u8 q11, d17 + vmovl.u8 q12, d18 + vmovl.u8 q13, d19 + vzip.16 q0, q2 + vzip.16 q1, q3 + vqadd.s16 q0, q10 + vqadd.s16 q2, q11 + vqadd.s16 q1, q12 + vqadd.s16 q3, q13 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q2 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q3 + vst1.8 {d0}, [r0 :64], r2 + vst1.8 {d1}, [ip :64], r2 + vst1.8 {d2}, [r0 :64] + vst1.8 {d3}, [ip :64] + bx lr +endfunc + +@ add_residual8x8_v( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2]; int dc_u) [r3] + +function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 + vdup.16 q15, r3 + add ip, r0, r2 + push {r4, lr} + vld2.8 {d16, d17}, [r0 :128] + lsl r2, #1 + vld2.8 {d18, d19}, [ip :128] + mov r3, #8-2 + vld1.16 {q0, q1}, [r1 :256]! + add r4, r0, r2 + vmovl.u8 q10, d17 + add lr, ip, r2 + vmovl.u8 q11, d19 + vqadd.s16 q0, q10 + vaddw.u8 q2, q15, d16 + vqadd.s16 q1, q11 + vaddw.u8 q3, q15, d18 +1: + vqmovun.s16 d20, q2 + vqmovun.s16 d21, q0 + vld2.8 {d16, d17}, [r4 :128], r2 + subs r3, #2 + vqmovun.s16 d22, q3 + vqmovun.s16 d23, q1 + vst2.8 {d20, d21}, [r0 :128], r2 + vld2.8 {d18, d19}, [lr :128], r2 + vst2.8 {d22, d23}, [ip :128], r2 + vld1.16 {q0, q1}, [r1 :256]!
+ vmovl.u8 q10, d17 + vmovl.u8 q11, d19 + vqadd.s16 q0, q10 + vaddw.u8 q2, q15, d16 + vqadd.s16 q1, q11 + vaddw.u8 q3, q15, d18 + bne 1b + + vqmovun.s16 d20, q2 + vqmovun.s16 d21, q0 + vqmovun.s16 d22, q3 + vqmovun.s16 d23, q1 + vst2.8 {d20, d21}, [r0 :128] + vst2.8 {d22, d23}, [ip :128] + pop {r4, pc} +endfunc + +@ add_residual16x16_v( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride, [r2]; int dc_u) [r3] + +function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 + vdup.16 q15, r3 + add ip, r0, r2 + vld2.8 {q8, q9}, [r0 :256] + mov r3, #16-1 + vld1.16 {q0, q1}, [r1 :256]! + vmovl.u8 q11, d18 + vmovl.u8 q12, d19 + vqadd.s16 q0, q11 + vaddw.u8 q11, q15, d16 + vqadd.s16 q1, q12 + vaddw.u8 q12, q15, d17 +1: + vld2.8 {q8, q9}, [ip :256], r2 + subs r3, #1 + vqmovun.s16 d20, q11 + vqmovun.s16 d22, q0 + vqmovun.s16 d21, q12 + vqmovun.s16 d23, q1 + vld1.16 {q0, q1}, [r1 :256]! + vst2.8 {q10, q11}, [r0 :256], r2 + vmovl.u8 q11, d18 + pldw [ip] + vmovl.u8 q12, d19 + vqadd.s16 q0, q11 + vaddw.u8 q11, q15, d16 + vqadd.s16 q1, q12 + vaddw.u8 q12, q15, d17 + bne 1b + + vqmovun.s16 d20, q11 + vqmovun.s16 d22, q0 + vqmovun.s16 d21, q12 + vqmovun.s16 d23, q1 + vst2.8 {q10, q11}, [r0 :256] + bx lr +endfunc + +@ ============================================================================ +@ U & V add + +@ add_residual4x4_c( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 + add ip, r0, r2 + vld1.16 {q0, q1}, [r1]!
@ all of U + lsl r2, #1 + vld1.8 {d16}, [r0 :64], r2 + rsb r3, r2, #0 + vld1.8 {d17}, [ip :64], r2 + vld1.16 {q2, q3}, [r1] @ all of V + vld1.8 {d18}, [r0 :64], r3 + vld1.8 {d19}, [ip :64], r3 + vmovl.u8 q10, d16 + vmovl.u8 q11, d17 + vmovl.u8 q12, d18 + vmovl.u8 q13, d19 + vzip.16 q0, q2 + vzip.16 q1, q3 + vqadd.s16 q0, q10 + vqadd.s16 q2, q11 + vqadd.s16 q1, q12 + vqadd.s16 q3, q13 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q2 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q3 + vst1.8 {d0}, [r0 :64], r2 + vst1.8 {d1}, [ip :64], r2 + vst1.8 {d2}, [r0 :64] + vst1.8 {d3}, [ip :64] + bx lr +endfunc + +@ add_residual8x8_c( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 + vld2.8 {d16, d17}, [r0 :128] + add r3, r1, #(8*8*2) @ Offset to V + vld1.16 {q0}, [r1 :128]! + add ip, r0, r2 + vld1.16 {q1}, [r3 :128]! + vmovl.u8 q10, d16 + push {lr} + vmovl.u8 q8, d17 + mov lr, #8-1 + vqadd.s16 q10, q0 + vqadd.s16 q1, q8 +1: + vld2.8 {d16, d17}, [ip :128], r2 + subs lr, #1 + vld1.16 {q0}, [r1 :128]! + vqmovun.s16 d20, q10 + vqmovun.s16 d21, q1 + vld1.16 {q1}, [r3 :128]! + vst2.8 {d20, d21}, [r0 :128], r2 + vmovl.u8 q10, d16 + pldw [ip] + vmovl.u8 q8, d17 + vqadd.s16 q10, q0 + vqadd.s16 q1, q8 + bne 1b + + vqmovun.s16 d20, q10 + vqmovun.s16 d21, q1 + vst2.8 {d20, d21}, [r0 :128] + pop {pc} +endfunc + +@ add_residual16x16_c( +@ uint8_t *_dst, [r0] +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + +function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 + vld2.8 {q8, q9}, [r0 :256] + add r3, r1, #(16*16*2) @ Offset to V + vld1.16 {q0, q1}, [r1 :256]! + add ip, r0, r2 + vld1.16 {q2, q3}, [r3 :256]!
+ vmovl.u8 q10, d16 + push {lr} + vmovl.u8 q8, d17 + mov lr, #16-1 + vmovl.u8 q11, d18 + vmovl.u8 q9, d19 + vqadd.s16 q0, q10 + vqadd.s16 q1, q8 + vqadd.s16 q2, q11 + vqadd.s16 q3, q9 +1: + vld2.8 {q8, q9}, [ip :256], r2 + subs lr, #1 + vqmovun.s16 d20, q0 + vqmovun.s16 d22, q2 + vqmovun.s16 d21, q1 + vqmovun.s16 d23, q3 + vld1.16 {q0, q1}, [r1 :256]! + vst2.8 {d20-d23}, [r0 :256], r2 + vld1.16 {q2, q3}, [r3 :256]! + vmovl.u8 q10, d16 + pldw [ip] + vmovl.u8 q8, d17 + vmovl.u8 q11, d18 + vmovl.u8 q9, d19 + vqadd.s16 q0, q10 + vqadd.s16 q1, q8 + vqadd.s16 q2, q11 + vqadd.s16 q3, q9 + bne 1b + + vqmovun.s16 d20, q0 + vqmovun.s16 d22, q2 + vqmovun.s16 d21, q1 + vqmovun.s16 d23, q3 + vst2.8 {d20-d23}, [r0 :256] + pop {pc} +endfunc + +@ 32x32 chroma never occurs so NIF + +@ ============================================================================ diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S new file mode 100644 index 0000000000..b56e0f9644 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S @@ -0,0 +1,2245 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * 2017 John Cox (for Raspberry Pi) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.set EDGE_SRC_STRIDE, 160 + +@ PIC jump tables are fractionally more expensive than absolute in our code +.set jent_pic, CONFIG_PIC + + +.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 + vshr.u8 q12, q8, #3 + \I1 + vadd.i8 q8, \Q_K128 + \I2 + vshr.u8 q13, q9, #3 + \I3 + vadd.i8 q9, \Q_K128 + \I4 + vtbl.8 d24, \XLAT0, d24 + vtbl.8 d25, \XLAT0, d25 + vtbl.8 d26, \XLAT1, d26 + vtbl.8 d27, \XLAT1, d27 + + vqadd.s8 q8, q12 + vshr.u8 q12, q10, #3 + vadd.i8 q10, \Q_K128 + vqadd.s8 q9, q13 + vshr.u8 q13, q11, #3 + vadd.i8 q11, \Q_K128 + + vtbl.8 d24, \XLAT0, d24 + vtbl.8 d25, \XLAT0, d25 + vtbl.8 d26, \XLAT1, d26 + vtbl.8 d27, \XLAT1, d27 + vqadd.s8 q10, q12 + vsub.i8 q8, \Q_K128 + vqadd.s8 q11, q13 + vsub.i8 q9, \Q_K128 + vsub.i8 q10, \Q_K128 + vsub.i8 q11, \Q_K128 +.endm + +.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 + \L1 + \L2 + \L3 + \L4 + \L5 + vadd.i8 q12, q8, \Q_K128 + vshr.u8 q8, #3 + vtbl.8 d16, \XLAT0, d16 + vtbl.8 d17, \XLAT1, d17 + vqadd.s8 q12, q8 + bmi 2f +1: \L1 + \L2 + \L3 + \L4 + \L5 + vsub.i8 q13, q12, \Q_K128 + vadd.i8 q12, q8, \Q_K128 + vshr.u8 q8, #3 + \S1 + \S2 + \S3 + \S4 + vtbl.8 d16, \XLAT0, d16 + vtbl.8 d17, \XLAT1, d17 + vqadd.s8 q12, q8 + bpl 1b +2: vsub.i8 q13, q12, \Q_K128 + \S1 + \S2 + \S3 + \S4 +.endm + + +.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX + vmax.s16 \Q0, \Q_MIN + vmax.s16 \Q1, \Q_MIN + vmax.s16 \Q2, \Q_MIN + vmax.s16 \Q3, \Q_MIN + vmin.s16 \Q0, \Q_MAX + vmin.s16 \Q1, \Q_MAX + vmin.s16 \Q2, \Q_MAX + vmin.s16 \Q3, \Q_MAX +.endm + +@ Clobbers q12, q13 +.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 + vshrn.i16 d24, \Q0, #(\bit_depth - 5) + vshrn.i16 d25, \Q1, #(\bit_depth -
5) + vshrn.i16 d26, \Q2, #(\bit_depth - 5) + \I1 + vtbl.8 d24, \XLAT0, d24 + vshrn.i16 d27, \Q3, #(\bit_depth - 5) + vtbl.8 d25, \XLAT1, d25 + \I2 + vtbl.8 d26, \XLAT0, d26 + vtbl.8 d27, \XLAT1, d27 + vaddw.s8 \Q0, d24 + vaddw.s8 \Q1, d25 + vaddw.s8 \Q2, d26 + vaddw.s8 \Q3, d27 + clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX +.endm + +@ Clobbers q10, q11, q12 +.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 + \L1 + \L2 + \L3 + \L4 + \L5 + vshrn.i16 d24, \Q0, #\bit_depth - 5 + vshrn.i16 d25, \Q1, #\bit_depth - 5 + vtbl.8 d24, \XLAT0, d24 + vtbl.8 d25, \XLAT1, d25 + vaddw.s8 q10, \Q0, d24 + vaddw.s8 q11, \Q1, d25 + bmi 2f +1: \L1 + \L2 + \L3 + \L4 + \L5 + vmax.s16 q10, \Q_MIN + vmax.s16 q11, \Q_MIN + vshrn.i16 d24, \Q0, #\bit_depth - 5 + vshrn.i16 d25, \Q1, #\bit_depth - 5 + vmin.s16 q10, \Q_MAX + vmin.s16 q11, \Q_MAX + \S1 + \S2 + \S3 + \S4 + vtbl.8 d24, \XLAT0, d24 + vtbl.8 d25, \XLAT1, d25 + vaddw.s8 q10, \Q0, d24 + vaddw.s8 q11, \Q1, d25 + bpl 1b +2: vmax.s16 q10, \Q_MIN + vmax.s16 q11, \Q_MIN + vmin.s16 q10, \Q_MAX + vmin.s16 q11, \Q_MAX + \S1 + \S2 + \S3 + \S4 +.endm + + +@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) +@ so we are quite safe stuffing it into a byte array +@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma +@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of +@ precision + +@ This, somewhat nasty, bit of code builds the {d0-d3} translation +@ array via the stack +@ Given that sao_left_class > 28 can cause wrap we can't just poke +@ all 4 bytes in at once +@ +@ It also loads other common regs + +@ Beware that the offset read here over-reads by 6 bytes so the source must be sized appropriately +function band_load_y + ldr ip, [sp, #16] @ &sao_offset_val[0] + ldr r4, [sp, #20] @ sao_left_class + vmov.i64 d4, #0 + vmov.i64 q0, #0 + pld [r1] + vld2.8 {q8}, [ip] + sub ip, sp, #8*5 + vmov.i64 q1, #0 + add r4, ip, r4 + vpush {d0-d4} @ Put
zero array on stack + vshr.u64 d16, d16, #8 @ 1st interesting val is [1] + ldr ip, [ip, #8*5 + 28] @ height + vst1.32 {d16[0]}, [r4] + add r4, r1, r3 + vpop {d0-d4} @ Pop modified array + sub ip, ip, #1 + vorr d0, d0, d4 + bx lr +endfunc + +@ Beware that the offset reads here over-read by 6 bytes so the source must be sized appropriately +function band_load_c + ldr ip, [sp, #16] @ &sao_offset_val1[0] + ldr r4, [sp, #20] @ sao_left_class1 + vmov.i64 d24, #0 + vmov.i64 q10, #0 + pld [r1] + vld2.8 {q8}, [ip] + sub ip, sp, #8*5 + vmov.i64 q11, #0 + add r4, ip, r4 + ldr ip, [sp, #24] @ &sao_offset_val2[0] + vpush {d20-d24} @ Put zero array on stack + vld2.8 {q9}, [ip] + vshr.u64 d16, d16, #8 @ 1st interesting val is [1] + ldr ip, [sp, #8*5 + 28] @ sao_left_class2 + vst1.32 {d16[0]}, [r4] + add ip, sp, ip + vshr.u64 d18, d18, #8 @ 1st interesting val is [1] + vldmia sp, {d0-d3} @ Load modified array + vldr d16, [sp, #8*4] + add r4, r1, r3 + vstmia sp, {d20-d24} @ Put zero array on stack (again) + vst1.32 {d18[0]}, [ip] + vorr d0, d0, d16 + vldmia sp, {d4-d7} @ Load modified array + vldr d18, [sp, #8*4] + ldr ip, [sp, #8*5 + 36] @ height + add sp, sp, #8*5 + vorr d4, d4, d18 + sub ip, ip, #1 + bx lr +endfunc + + +@ ff_hevc_rpi_sao_band_64_neon_8 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ ptrdiff_t stride_src, [r3] +@ int16_t *sao_offset_val, [sp, #0] +@ int sao_left_class, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_band_64_neon_8, export=1 + push {r4-r6, lr} + vmov.u8 q15, #128 + bl band_load_y + +1: vldmia r1, {q8-q11} + sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ + "pld [r4]", \ + "subs ip, #1", \ + "it ne; addne r4, r3", \ + "add r1, r3" + vstmia r0, {q8-q11} + add r0, r2 + bpl 1b + + pop {r4-r6, pc} +endfunc + +@ ff_hevc_rpi_sao_band_32_neon_8 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ ptrdiff_t stride_src, [r3] +@ int16_t *sao_offset_val, [sp, #0] +@ int
sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ Two 32-byte rows per iteration: r1/r0 walk one row, r6/r5 the row below,
+@ with both strides doubled.
+function ff_hevc_rpi_sao_band_32_neon_8, export=1
+        push {r4-r6, lr}
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        vmov.u8 q15, #128
+        bl band_load_y
+
+1:      vld1.8 { q8, q9 }, [r1, :128], r3
+        subs ip, #2
+        vld1.8 {q10, q11}, [r6, :128], r3
+
+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
+
+        vst1.8 { q8, q9 }, [r0, :128], r2
+        vst1.8 {q10, q11}, [r5, :128], r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_16_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ Four 16-byte rows per iteration: r1/r0 and r6/r5 each handle alternate
+@ rows, with both strides doubled.
+function ff_hevc_rpi_sao_band_16_neon_8, export=1
+        push {r4-r6, lr}
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        vmov.u8 q15, #128
+        bl band_load_y
+
+1:      vld1.8 { q8}, [r1, :128], r3
+        subs ip, #4
+        vld1.8 { q9}, [r6, :128], r3
+        vld1.8 {q10}, [r1, :128], r3
+        vld1.8 {q11}, [r6, :128], r3
+
+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
+
+        vst1.8 { q8}, [r0, :128], r2
+        vst1.8 { q9}, [r5, :128], r2
+        vst1.8 {q10}, [r0, :128], r2
+        vst1.8 {q11}, [r5, :128], r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_8_neon_8 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ Handles width 8 (two 8-byte rows per iteration) and, at label 4, widths
+@ below 8 (four 4-byte rows per iteration).
+@ The width test is done before band_load_y is called; the blt further down
+@ relies on the flags from that cmp still being intact.
+function ff_hevc_rpi_sao_band_8_neon_8, export=1
+        ldr ip, [sp, #8] @ width
+        push {r4-r6, lr}
+        vmov.u8 q15, #128
+        cmp ip, #8
+        bl band_load_y
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        blt 4f
+
+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
+                "vld1.8 {d16}, [r1, :64], r3", \
+                "subs ip, #2", \
+                "vld1.8 {d17}, [r6, :64], r3", \
+                "", \
+                "", \
+                "vst1.8 {d26}, [r0, :64], r2", \
+                "vst1.8 {d27}, [r5, :64], r2"
+        pop {r4-r6, pc}
+4:
+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
+                "vld1.32 {d16[0]}, [r1, :32], r3", \
+                "subs ip, #4", \
+                "vld1.32 {d16[1]}, [r6, :32], r3", \
+                "vld1.32 {d17[0]}, [r1, :32], r3", \
+                "vld1.32 {d17[1]}, [r6, :32], r3", \
+                "vst1.32 {d26[0]}, [r0, :32], r2", \
+                "vst1.32 {d26[1]}, [r5, :32], r2", \
+                "vst1.32 {d27[0]}, [r0, :32], r2", \
+                "vst1.32 {d27[1]}, [r5, :32], r2"
+        pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_32_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ One 64-byte row of interleaved chroma per iteration, as two 32-byte halves
+@ (r1/r0 and r6/r5 = +32). vld2 de-interleaves so q8/q10 are translated by
+@ {d0-d3} and q9/q11 by {d4-d7}; vst2 re-interleaves on the way out.
+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
+        push {r4-r6, lr}
+        add r5, r0, #32
+        add r6, r1, #32
+        vmov.u8 q15, #128
+        bl band_load_c
+
+1:      vld2.8 { q8, q9 }, [r1, :128], r3
+        subs ip, #1
+        vld2.8 {q10, q11}, [r6, :128], r3
+
+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
+                "pld [r4]", \
+                "it ne; addne r4, r3"
+
+        vst2.8 { q8, q9 }, [r0, :128], r2
+        vst2.8 {q10, q11}, [r5, :128], r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_16_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ Two 32-byte rows of interleaved chroma per iteration (r1/r0 and r6/r5 on
+@ alternate rows, strides doubled).
+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
+        push {r4-r6, lr}
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        vmov.u8 q15, #128
+        bl band_load_c
+
+1:      vld2.8 { q8, q9 }, [r1, :128], r3
+        subs ip, #2
+        vld2.8 {q10, q11}, [r6, :128], r3
+
+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15
+
+        vst2.8 { q8, q9 }, [r0, :128], r2
+        vst2.8 {q10, q11}, [r5, :128], r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+endfunc
+
+@ ff_hevc_rpi_sao_band_c_8_neon_8(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@
uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ Handles width 8 (one 16-byte interleaved row per iteration) and, at label
+@ 4, widths below 8 (two 8-byte rows per iteration, de-interleaved with
+@ vuzp before translation and re-interleaved with vzip before the store).
+@ The blt relies on the flags from the cmp surviving band_load_c.
+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
+        ldr ip, [sp, #16] @ width
+        push {r4-r6, lr}
+        vmov.u8 q15, #128
+        cmp ip, #8
+        bl band_load_c
+        blt 4f
+
+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
+                "vld2.8 {d16-d17}, [r1, :128], r3", \
+                "subs ip, #1", \
+                "", \
+                "", \
+                "", \
+                "vst2.8 {d26-d27}, [r0, :128], r2"
+        pop {r4-r6, pc}
+4:
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
+                "vld1.8 {d16}, [r1, :64], r3", \
+                "subs ip, #2", \
+                "vld1.8 {d17}, [r6, :64], r3", \
+                "vuzp.8 d16, d17", \
+                "", \
+                "vzip.8 d26, d27", \
+                "vst1.8 {d26}, [r0, :64], r2", \
+                "vst1.8 {d27}, [r5, :64], r2"
+        pop {r4-r6, pc}
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_64_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ One 128-byte row (64 x 16-bit samples) per iteration in q4-q11.
+@ q2 = lower clip bound (0), q3 = upper clip bound ((1 << bit_depth) - 1).
+@ q4-q7 are callee-saved so they are pushed/popped around the loop.
+.macro band_64_16 bit_depth
+        push {r4-r6, lr}
+        vmov.i16 q3, #(1 << \bit_depth) - 1
+        bl band_load_y
+        @ q2 must be zeroed AFTER band_load_y: that helper pops the wrapped
+        @ tail of the translation table into d4 (low half of q2) and leaves
+        @ it there, so zeroing q2 before the call gave a non-zero lower clip
+        @ bound whenever the table wrapped (sao_left_class > 28)
+        vmov.i64 q2, #0
+        vpush {q4-q7}
+
+1:      vldm r1, {q4-q11}
+        sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
+                "subs ip, #1", \
+                "add r1, r3"
+        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
+        vstm r0, {q4-q11}
+        add r0, r2
+        bpl 1b
+
+        vpop {q4-q7}
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_64_neon_10, export=1
+        band_64_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_32_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ One 64-byte row (32 x 16-bit samples) per iteration in q8-q11.
+@ q2 = lower clip bound (0), q3 = upper clip bound ((1 << bit_depth) - 1).
+.macro band_32_16 bit_depth
+        push {r4-r6, lr}
+        vmov.i16 q3, #(1 << \bit_depth) - 1
+        bl band_load_y
+        @ Zero q2 only after band_load_y, which leaves the wrapped table
+        @ bytes in d4 (see band_64_16)
+        vmov.i64 q2, #0
+
+1:      vldm r1, {q8-q11}
+        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
+                "subs ip, #1", \
+                "add r1, r3"
+        vstm r0, {q8-q11}
+        add r0, r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_32_neon_10, export=1
+        band_32_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_16_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ Two 32-byte rows per iteration (r1/r0 and r6/r5 on alternate rows, strides
+@ doubled). q14 = lower clip bound (0), q15 = upper clip bound.
+.macro band_16_16 bit_depth
+        push {r4-r6, lr}
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        vmov.i64 q14, #0
+        vmov.i16 q15, #(1 << \bit_depth) - 1
+        bl band_load_y
+
+1:      vld1.16 { q8, q9 }, [r1, :128], r3
+        subs r12, #2
+        vld1.16 {q10, q11}, [r6, :128], r3
+        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
+        vst1.16 { q8, q9 }, [r0, :128], r2
+        vst1.16 {q10, q11}, [r5, :128], r2
+        bpl 1b
+
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_16_neon_10, export=1
+        band_16_16 10
+endfunc
+
+@ ff_hevc_rpi_sao_band_8_neon_10 (
+@ uint8_t *_dst, [r0]
+@ uint8_t *_src, [r1]
+@ ptrdiff_t stride_dst, [r2]
+@ ptrdiff_t stride_src, [r3]
+@ int16_t *sao_offset_val, [sp, #0]
+@ int sao_left_class, [sp, #4]
+@ int width, [sp, #8]
+@ int height) [sp, #12]
+
+@ Handles width 8 (two 16-byte rows per iteration) and, at label 4, widths
+@ below 8 (four 8-byte rows per iteration).
+@ The blt relies on the flags from the cmp surviving band_load_y.
+.macro band_8_16 bit_depth
+        ldr ip, [sp, #8] @ width
+        push {r4-r6, lr}
+        vmov.i64 q14, #0
+        cmp ip, #8
+        vmov.i16 q15, #(1 << \bit_depth) - 1
+        bl band_load_y
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        blt 4f
+
+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
+                "vld1.16 {q8}, [r1, :128], r3", \
+                "subs ip, #2", \
+                "vld1.16 {q9}, [r6, :128], r3", \
+                "", \
+                "", \
+                "vst1.16 {q10}, [r0, :128], r2", \
+                "vst1.16 {q11}, [r5, :128], r2"
+        pop {r4-r6, pc}
+4:
+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
+                "vld1.16 {d16}, [r1, :64], r3", \
+                "subs ip, #4", \
+                "vld1.16 {d17}, [r6, :64], r3", \
+                "vld1.16 {d18}, [r1, :64], r3", \
+                "vld1.16 {d19}, [r6, :64], r3", \
+                "vst1.16 {d20}, [r0, :64], r2", \
+                "vst1.16 {d21}, [r5, :64], r2", \
+                "vst1.16 {d22}, [r0, :64], r2", \
+                "vst1.16 {d23}, [r5, :64], r2"
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_8_neon_10, export=1
+        band_8_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_32_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ One 128-byte row of interleaved 16-bit chroma per iteration, as four
+@ 32-byte pieces: r1/r0 and r6/r5 (= +32) each step by 64 (lr) and then by
+@ the stride less 64 (r2/r3 are pre-adjusted).
+@ q4-q7 are callee-saved so they are pushed/popped around the loop.
+.macro band_c_32_16 bit_depth
+        push {r4-r6, lr}
+        add r5, r0, #32
+        add r6, r1, #32
+        sub r2, #64
+        sub r3, #64
+        vmov.i64 q14, #0
+        vmov.i16 q15, #(1 << \bit_depth) - 1
+        bl band_load_c
+        mov lr, #64
+        vpush {q4-q7}
+
+1:      vld2.16 { q4, q5 }, [r1, :128], lr
+        subs ip, #1
+        vld2.16 { q6, q7 }, [r6, :128], lr
+        vld2.16 { q8, q9 }, [r1, :128], r3
+        vld2.16 {q10, q11}, [r6, :128], r3
+
+        sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+                "pld [r4]", \
+                "it ne; addne r4, r3"
+        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
+
+        vst2.16 { q4, q5 }, [r0, :128], lr
+        vst2.16 { q6, q7 }, [r5, :128], lr
+        vst2.16 { q8, q9 }, [r0, :128], r2
+        vst2.16 {q10, q11}, [r5, :128], r2
+
+        bpl 1b
+
+        vpop {q4-q7}
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
+        band_c_32_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_16_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ One 64-byte row of interleaved 16-bit chroma per iteration, as two 32-byte
+@ halves (r1/r0 and r6/r5 = +32), all held in q8-q11.
+@ Only q8-q11 are processed: q4-q7 hold no data here and are callee-saved,
+@ so they must not be touched (this macro does not save them).
+.macro band_c_16_16 bit_depth
+        push {r4-r6, lr}
+        add r5, r0, #32
+        add r6, r1, #32
+        vmov.i64 q14, #0
+        vmov.i16 q15, #(1 << \bit_depth) - 1
+        bl band_load_c
+
+1:      vld2.16 { q8, q9 }, [r1, :128], r3
+        subs ip, #1
+        vld2.16 {q10, q11}, [r6, :128], r3
+
+        sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
+
+        vst2.16 { q8, q9 }, [r0, :128], r2
+        vst2.16 {q10, q11}, [r5, :128], r2
+
+        bpl 1b
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
+        band_c_16_16 10
+endfunc
+
+
+@ ff_hevc_rpi_sao_band_c_8_neon_10(
+@ uint8_t * dst [r0]
+@ uint8_t * src [r1]
+@ uint32_t dst_stride [r2]
+@ uint32_t src_stride [r3]
+@ const int16_t * table1 sp[0]
+@ uint32_t offset1 sp[4]
+@ const int16_t * table2 sp[8]
+@ uint32_t offset2 sp[12]
+@ int width sp[16]
+@ int height sp[20]
+
+@ Handles width 8 (one 32-byte interleaved row per iteration) and, at label
+@ 4, widths below 8 (two 16-byte rows per iteration).
+@ The blt relies on the flags from the cmp surviving band_load_c.
+.macro band_c_8_16 bit_depth
+        ldr ip, [sp, #16] @ width
+        push {r4-r6, lr}
+        vmov.i64 q14, #0
+        cmp ip, #8
+        vmov.i16 q15, #(1 << \bit_depth) - 1
+        bl band_load_c
+        blt 4f
+
+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+                "vld2.16 {q8,q9}, [r1, :128], r3", \
+                "subs ip, #1", \
+                "", \
+                "", \
+                "", \
+                "vst2.16 {q10,q11}, [r0, :128], r2"
+        pop {r4-r6, pc}
+4:
+        add r5, r0, r2
+        add r6, r1, r3
+        lsl r2, #1
+        lsl r3, #1
+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
+                "vld2.16 {d16,d18}, [r1, :128], r3", \
+                "subs ip, #2", \
+                "vld2.16 {d17,d19}, [r6, :128], r3", \
+                "", \
+                "", \
+                "vst2.16 {d20,d22}, [r0, :128], r2", \
+                "vst2.16 {d21,d23}, [r5, :128], r2"
+        pop {r4-r6, pc}
+.endm
+
+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
+        band_c_8_16 10
+endfunc
+
+
+@ =============================================================================
+@ SAO EDGE
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27.
For Y d26=d27
+
+@ Edge-offset 64 bytes of 8-bit samples: result is returned in q0-q3.
+@ For each byte an index 2 + sign(c-a) + sign(c-b) is built, translated to
+@ an offset through the table(s) at [r5], and added to c with saturation.
+@ c (q4-q7) and b (q8-q11) are left unmodified.
+function edge_64b_body_8
+
+        vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
+        vcgt.u8 q13, q5, q1
+        vcgt.u8 q14, q6, q2
+        vcgt.u8 q15, q7, q3
+
+        vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
+        vcgt.u8 q1, q5
+        vcgt.u8 q2, q6
+        vcgt.u8 q3, q7
+
+        vsub.s8 q0, q12 @ a = sign(c-a)
+        vsub.s8 q1, q13
+        vsub.s8 q2, q14
+        vsub.s8 q3, q15
+
+        vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
+        vcgt.u8 q13, q5, q9
+        vcgt.u8 q14, q6, q10
+        vcgt.u8 q15, q7, q11
+
+        vsub.s8 q0, q12
+        vsub.s8 q1, q13
+        vsub.s8 q2, q14
+        vsub.s8 q3, q15
+
+        vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
+        vcgt.u8 q13, q9, q5
+        vcgt.u8 q14, q10, q6
+        vcgt.u8 q15, q11, q7
+
+        vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
+        vadd.s8 q1, q13
+        vmov.u8 q12, #2
+        vadd.s8 q2, q14
+        vadd.s8 q3, q15
+
+        vadd.s8 q0, q12
+        vadd.s8 q1, q12
+
+        vld1.8 {d26, d27}, [r5] @ Load the two 8-byte translate tables
+
+        vadd.s8 q2, q12
+        vuzp.8 q0, q1 @ Unzip: even bytes -> q0 (table d26), odd bytes -> q1 (table d27)
+        vmov.u8 q15, #128
+        vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
+
+        vtbl.8 d0, {d26}, d0
+        vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
+
+        vtbl.8 d1, {d26}, d1
+        vadd.s8 q14, q5, q15
+
+        vtbl.8 d2, {d27}, d2
+        vuzp.8 q2, q3
+
+        vtbl.8 d3, {d27}, d3
+
+        vtbl.8 d4, {d26}, d4
+        vzip.8 q0, q1 @ Re-interleave the translated offsets
+
+        vtbl.8 d5, {d26}, d5
+        vqadd.s8 q0, q12
+        vqadd.s8 q1, q14
+        vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
+
+        vtbl.8 d6, {d27}, d6
+        vtbl.8 d7, {d27}, d7
+        vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
+        vzip.8 q2, q3
+
+        vsub.s8 q0, q15 @ Remove the -128 bias again
+        vqadd.s8 q2, q12
+        vqadd.s8 q3, q14
+        vsub.s8 q1, q15
+        vsub.s8 q2, q15
+        vsub.s8 q3, q15
+
+        bx lr
+endfunc
+
+@ r0 destination address
+@ r2 stride to post-increment r0 with
+@ r4 upper clip value
+@ [r5] translate values
+@
+@ a <- c <- b
+@ a in q0 - q3
+@ c in q4 - q7
+@ b in q8 - q11
+@
+@ q12-15 used as temp
+@
+@ Can be used for both Y & C as we unzip/zip the deltas and
+@ transform "u/v" separately via d26/d27.
For Y d26=d27 + +function edge_64b_body_16 + + vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 + vcgt.u16 q13, q5, q1 + vcgt.u16 q14, q6, q2 + vcgt.u16 q15, q7, q3 + + vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 + vcgt.u16 q1, q1, q5 + vcgt.u16 q2, q2, q6 + vcgt.u16 q3, q3, q7 + + vsub.s16 q0, q0, q12 // a = sign(c-a) + vsub.s16 q1, q1, q13 + vsub.s16 q2, q2, q14 + vsub.s16 q3, q3, q15 + + vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 + vcgt.u16 q13, q5, q9 + vcgt.u16 q14, q6, q10 + vcgt.u16 q15, q7, q11 + + vsub.s16 q0, q0, q12 + vsub.s16 q1, q1, q13 + vsub.s16 q2, q2, q14 + vsub.s16 q3, q3, q15 + + vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 + vcgt.u16 q13, q9, q5 + vcgt.u16 q14, q10, q6 + vcgt.u16 q15, q11, q7 + + vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) + vadd.s16 q1, q1, q13 + vadd.s16 q2, q2, q14 + vadd.s16 q3, q3, q15 + + vmov.u8 q12, #2 + + vmovn.s16 d0, q0 + vmovn.s16 d1, q1 + vmovn.s16 d2, q2 + vmovn.s16 d3, q3 + + vldr d26, [r5] + + vuzp.8 q0, q1 + + vldr d27, [r5, #8] + + vadd.s8 q0, q0, q12 + vadd.s8 q1, q1, q12 + + vmov.i64 q12, #0 + + vtbl.8 d0, {d26}, d0 + vtbl.8 d1, {d26}, d1 + vtbl.8 d2, {d27}, d2 + vtbl.8 d3, {d27}, d3 + + vdup.i16 q13, r4 + + vzip.8 q0, q1 + + @ Avoid overwrite whilst widening + vaddw.s8 q2, q6, d2 + vaddw.s8 q3, q7, d3 + vaddw.s8 q1, q5, d1 + vaddw.s8 q0, q4, d0 + + @ now clip + clip16_4 q2, q3, q1, q0, q12, q13 + + bx lr +endfunc + + +@ a <- c <- b +@ a in q0 +@ c in q1 +@ b in q2 +@ Temp q3, q9, q10 +@ +@ d16, d17 (q8) xlat U, V +@ q14.u8 #2 +@ q15.u8 #128 + +function edge_16b_body_8 + vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 + vadd.u8 q9, q14, q9 + vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 + vsub.u8 q9, q9, q0 + vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 + vadd.u8 q9, q9, q0 + vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 + vsub.u8 q0, q9, q0 + + vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add + + vuzp.8 d0, d1 + + vtbl.8 d0, {d16}, d0 + vtbl.8 d1, {d17}, d1 + + 
vzip.8 d0, d1 + vqadd.s8 q0, q3 + vsub.s8 q0, q15 + + bx lr +endfunc + +@ a <- c <- b +@ a in q0 +@ c in q1 +@ b in q2 +@ Temp q3 +@ +@ q12, #0 +@ d16, d17 xlat U, V +@ q14.u8 #2 +@ q15.u16 max +function edge_16b_body_16 + vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 + vadd.u16 q9, q14, q9 + vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 + vsub.u16 q9, q9, q0 + vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 + vadd.u16 q9, q9, q0 + vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 + vsub.u16 q0, q9, q0 + + vmovn.s16 d0, q0 + @ d1 will have random contents that we transform but + @ that doesn't matter as we then discard them + vuzp.8 d0, d1 + + vtbl.8 d0, {d16}, d0 + vtbl.8 d1, {d17}, d1 + + vzip.8 d0, d1 + + vaddw.s8 q0, q1, d0 + + @ now clip + vmax.s16 q0, q12 + vmin.s16 q0, q15 + bx lr +endfunc + + +@ ff_hevc_rpi_sao_edge_[c_]xx_neon( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only +@ int eo, [sp, #sp_base + 0] +@ int width, [sp, #sp_base + 4] +@ int height) [sp, #sp_base + 8] + +@ Jumps via jump_tab with +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ EDGE_SRC_STRIDE [r3] +@ (1 << \bit_depth) - 1 [r4] +@ * xlat_table [r5] // setup_64b only +@ int height [r12] +@ +@ 0 [q12] // > 8 bit +@ 2 [q14] +@ 128 [q15] // = 8 bit +@ r4 [q15] // > 8 bit + +.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 + +@ Build translate registers +@ As translate values can only be 0-4 we don't care about junk in the rest +@ of the register +.if \is_chroma + ldr ip, [sp, #0] + push {r4-r6, lr} @ 16 bytes + vld1.8 {d16[2]}, [r3] + add r3, r3, #2 + vld1.8 {d17[2]}, [ip] + add ip, ip, #2 + vld1.8 {d16[0]}, [r3] + add r3, r3, #2 + vld1.8 {d17[0]}, [ip] + add ip, ip, #2 + vld1.8 {d16[1]}, [r3] + add r3, r3, #2 + vld1.8 {d17[1]}, [ip] + add ip, ip, #2 + 
vld1.8 {d16[3]}, [r3] + add r3, r3, #2 + vld1.8 {d17[3]}, [ip] + add ip, ip, #2 + vld1.8 {d16[4]}, [r3] + vld1.8 {d17[4]}, [ip] + movw r3, EDGE_SRC_STRIDE +.set sp_base, 20 +.else + add ip, r3, #4 + vld1.8 {d16[1]}, [r3] + add r3, r3, #2 + vld1.8 {d17[0]}, [ip] + add ip, ip, #2 + vld1.8 {d16[0]}, [r3] + add r3, r3, #6 + vld1.8 {d17[1]}, [ip] + vld1.8 {d16[2]}, [r3] + movw r3, EDGE_SRC_STRIDE + push {r4-r6, lr} @ 16 bytes + vzip.8 d16, d17 + vmov d17, d16 +.set sp_base, 16 +.endif + +@ If setup_64b we need the xlat table on the stack +.if \setup_64b + sub r5, sp, #16 +.endif + +@ Get jump address +@ We have a special case for width 4 as the calling code doesn't detect it +@ If we may have w4 then we add a 2nd jump table after the 1st +.if \check_w4 + ldr r12, [sp, #sp_base + 4] @ width + adr r6, \jump_tab + ldr lr, [sp, #sp_base + 0] @ e0 + cmp r12, #8 + it lt + addlt r6, #16 +.else + ldr lr, [sp, #sp_base + 0] @ e0 + adr r6, \jump_tab +.endif + + ldr r12, [sp, #sp_base + 8] @ height + +.if \bit_depth > 8 + movw r4, (1 << \bit_depth) - 1 +.endif +.if \setup_16b +.if \bit_depth > 8 + vmov.i64 q12, #0 + vdup.16 q15, r4 + vmov.u16 q14, #2 +.else + vmov.u8 q15, #128 + vmov.u8 q14, #2 +.endif +.endif + +@ If setup_64b we need q4-q7 saved. 
+.if \setup_64b + vpush {q4-q8} @ 80 bytes, q8 pushed first +.set sp_base, sp_base + 80 +.endif + + ldr r6, [r6, lr, lsl #2] + +@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes +.if \do2 + push {r0, r1, r6, r12} +.if jent_pic + bl 98f +.else + blx r6 +.endif + pop {r0, r1, r6, r12} + + add r0, #64 + add r1, #64 +.endif + +.if jent_pic + bl 98f +.else + blx r6 +.endif + +@ Tidy up & return +.if \setup_64b + vpop {q4-q8} @ spurious but harmless load of q8 +.endif + pop {r4-r6, pc} + +.if jent_pic && !\xjump +@ Magic label - used as 98b in jent macro +98: + add pc, r6 +.endif +.endm + + +.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab + edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 +.endm + +.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 + edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump +.endm + + +.macro edge_64b_e0, body_fn, pb + sub r1, #8 + mov r6, lr +1: vldm r1, {d7-d16} + // load a + vext.8 q0, q3, q4, #(16 - \pb) + add r1, r3 + vext.8 q1, q4, q5, #(16 - \pb) + subs r12, #1 + vext.8 q2, q5, q6, #(16 - \pb) + vext.8 q3, q6, q7, #(16 - \pb) + pld [r1] + // load b + vext.8 q11, q7, q8, #\pb @ Avoid overwrite + pld [r1, #64] + vext.8 q8, q4, q5, #\pb + vext.8 q9, q5, q6, #\pb + vext.8 q10, q6, q7, #\pb + bl \body_fn + vstm r0, {q0-q3} + add r0, r0, r2 + bgt 1b + bx r6 +.endm + +.macro edge_32bx2_e0, body_fn, pb + add r6, r1, r3 + push {r7,lr} + sub r1, #8 + add r7, r0, r2 + lsl r2, #1 +1: vldmia r1, {d7-d12} + // load a + vext.8 q0, q3, q4, #16 - \pb + add r1, r1, r3, lsl #1 + vext.8 q1, q4, q5, #16 - \pb + subs r12, #2 + // load b + vext.8 q8, q4, q5, #\pb + vext.8 q9, q5, q6, #\pb + vldr d25, [r6, #-8] + vldmia r6, {d12-d15} + vldr d26, [r6, #32] + // load a + vext.8 q2, q12, q6, #16 - \pb + add r6, r6, r3, lsl #1 + vext.8 q3, q6, q7, #16 - \pb + // load b + vext.8 q10, q6, q7, #\pb + vext.8 q11, q7, q13, #\pb + bl \body_fn + vst1.8 
{q0-q1}, [r0, :256], r2 + vst1.8 {q2-q3}, [r7, :256], r2 + bgt 1b + pop {r7,pc} +.endm + +.macro edge_16b_e0, body_fn, pb + sub r1, #8 + mov r6, lr +1: vldmia r1, {d1-d4} + add r1, r3 + subs r12, #1 + vext.8 q0, q0, q1, #16 - \pb + vext.8 q2, q1, q2, #\pb + + bl \body_fn + vst1.8 {q0}, [r0, :128], r2 + bgt 1b + bx r6 +.endm + +.macro edge_8bx2_e0, body_fn, pb + add r6, r1, r3 + push {r7,lr} + sub r1, #8 + add r7, r0, r2 + lsl r2, #1 +1: vldmia r1, {d1-d2} + vldmia r6, {d3-d4} + vldr d6, [r1, #16] + subs r12, #2 + vldr d7, [r6, #-8] + add r1, r1, r3, lsl #1 + vext.8 d0, d1, d2, #8 - \pb + add r6, r6, r3, lsl #1 + vext.8 d5, d3, d4, #\pb + vext.8 d4, d2, d6, #\pb + vext.8 d1, d7, d3, #8 - \pb + + bl \body_fn + vst1.8 {d0}, [r0, :64], r2 + vst1.8 {d1}, [r7, :64], r2 + bgt 1b + pop {r7,pc} +.endm + +.macro edge_4bx4_e0, body_fn, pb + add r6, r1, r3 + push {r7,lr} + add r7, r0, r2 + lsl r2, #1 + + tst r1, #4 + bne 2f +1: // r1 (and assumed r6) are 64-bit aligned + vldr d2, [r1] + vldr d0, [r1, #-8] + add r1, r1, r3, lsl #1 + vldr d20, [r6] + subs r12, #4 + vldr d18, [r6, #-8] + add r6, r6, r3, lsl #1 + vldr d3, [r1] + vshr.u64 d4, d2, #\pb * 8 + vldr d1, [r1, #-8] + add r1, r1, r3, lsl #1 + vldr d21, [r6] + vext.8 d0, d0, d2, #8 - \pb + vldr d19, [r6,#-8] + add r6, r6, r3, lsl #1 + vshr.u64 d22, d20, #\pb * 8 + vext.8 d18, d18, d20, #8 - \pb + vshr.u64 d5, d3, #\pb * 8 + vext.8 d1, d1, d3, #8 - \pb + vshr.u64 d23, d21, #\pb * 8 + vext.8 d19, d19, d21, #8 - \pb + vsli.64 q1, q10, #32 + vsli.64 q2, q11, #32 + vsli.64 q0, q9, #32 + + bl \body_fn + vst1.32 {d0[0]}, [r0, :32], r2 + vst1.32 {d0[1]}, [r7, :32], r2 + vst1.32 {d1[0]}, [r0, :32], r2 + vst1.32 {d1[1]}, [r7, :32], r2 + bgt 1b + pop {r7,pc} + +2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned + vldr d20, [r1, #-4] + vldr d22, [r1, #4] + add r1, r1, r3, lsl #1 + vldr d2, [r6, #-4] + subs r12, #4 + vldr d4, [r6, #4] + add r6, r6, r3, lsl #1 + vldr d21, [r1, #-4] + vshl.i64 d18, d20, #\pb * 8 + vldr d23, [r1, 
#4] + add r1, r1, r3, lsl #1 + vldr d3, [r6, #-4] + vext.8 d22, d20, d22, #\pb + vldr d5, [r6, #4] + add r6, r6, r3, lsl #1 + vshl.i64 d0, d2, #\pb * 8 + vext.8 d4, d2, d4, #\pb + vshl.i64 d19, d21, #\pb * 8 + vext.8 d23, d21, d23, #\pb + vshl.i64 d1, d3, #\pb * 8 + vext.8 d5, d3, d5, #\pb + vsri.64 q1, q10, #32 + vsri.64 q0, q9, #32 + vsri.64 q2, q11, #32 + + bl \body_fn + vst1.32 {d0[0]}, [r0, :32], r2 + vst1.32 {d0[1]}, [r7, :32], r2 + vst1.32 {d1[0]}, [r0, :32], r2 + vst1.32 {d1[1]}, [r7, :32], r2 + bgt 2b + pop {r7,pc} +.endm + + +.macro edge_64b_e1, body_fn + sub r1, r3 + push {lr} + add r6, r1, #32 + // load a + vld1.8 {q0-q1}, [r1, :256], r3 + vld1.8 {q2-q3}, [r6, :256], r3 + // load c + vld1.8 {q4-q5}, [r1, :256], r3 + vld1.8 {q6-q7}, [r6, :256], r3 +1: // load b + vld1.8 {q8-q9}, [r1, :256], r3 + subs r12, #1 + vld1.8 {q10-q11}, [r6, :256], r3 + bl \body_fn + vstm r0, {q0-q3} + // copy c to a + vmov.64 q0, q4 + pld [r1, r3] + vmov.64 q1, q5 + it le + pople {lr} + vmov.64 q2, q6 + it le + bxle lr + vmov.64 q3, q7 + add r0, r0, r2 + // copy b to c + vmov.64 q4, q8 + vmov.64 q5, q9 + vmov.64 q6, q10 + vmov.64 q7, q11 + b 1b +.endm + +.macro edge_32bx2_e1, body_fn + sub r6, r1, r3 + vld1.8 {q2-q3}, [r1, :256], r3 + vld1.8 {q0-q1}, [r6, :256] + mov r6, lr + +1: @ Given the data duplication here we could obviously do better than + @ using the generic body_fn but it almost certainly isn't worth it + vld1.8 {q8-q9}, [r1, :256], r3 + subs r12, #2 + vmov q4, q2 + vmov q5, q3 + vld1.8 {q10-q11}, [r1, :256], r3 + vmov q6, q8 + vmov q7, q9 + + bl \body_fn + + vst1.8 {q0-q1}, [r0, :256], r2 + // copy b to a + vmov q0, q8 + vmov q1, q9 + vst1.8 {q2-q3}, [r0, :256], r2 + vmov q2, q10 + it le + bxle r6 + vmov q3, q11 + b 1b +.endm + +.macro edge_16b_e1, body_fn + sub r6, r1, r3 + // load c + vld1.8 {q1}, [r1, :128], r3 + // load a + vld1.8 {q0}, [r6, :128] + mov r6, lr +1: // load b + vld1.8 {q2}, [r1, :128], r3 + bl \body_fn + vst1.8 {q0}, [r0, :128], r2 + subs r12, #1 + 
// copy c to a + vmov.64 q0, q1 + it le + bxle r6 + // copy b to c + vmov.64 q1, q2 + b 1b +.endm + +.macro edge_8bx2_e1, body_fn + sub r6, r1, r3 + lsl r3, #1 + push {r7, lr} + vld1.8 {d1}, [r1, :64], r3 + vld1.8 {d0}, [r6, :64], r3 + add r7, r0, r2 + lsl r2, #1 +1: @ Given the data duplication here we could obviously do better than + @ using the generic body_fn but it almost certainly isn't worth it + vld1.8 {d4}, [r6, :64], r3 + vmov d2, d1 + vld1.8 {d5}, [r1, :64], r3 + subs r12, #2 + vmov d3, d4 + + bl \body_fn + + vst1.8 {d0}, [r0, :64], r2 + vst1.8 {d1}, [r7, :64], r2 + + // copy b to a + vmov q0, q2 + bgt 1b + pop {r7, pc} +.endm + +.macro edge_4bx4_e1, body_fn + sub r6, r1, r3 + lsl r3, #1 + push {r7, lr} + vld1.32 {d0[1]}, [r1, :32], r3 + add r7, r0, r2 + vld1.32 {d0[0]}, [r6, :32], r3 + lsl r2, #1 + vld1.32 {d4[1]}, [r1, :32], r3 + vld1.32 {d4[0]}, [r6, :32], r3 + vld1.32 {d5[1]}, [r1, :32], r3 + vld1.32 {d5[0]}, [r6, :32], r3 + vmov d1, d4 + vext.32 d2, d0, d4, #1 + subs r12, #4 + vmov d22, d5 + vext.32 d3, d4, d5, #1 + b 2f + +1: vst1.32 {d0[0]}, [r0, :32], r2 + vext.32 d2, d22, d4, #1 + vst1.32 {d0[1]}, [r7, :32], r2 + vmov d0, d22 + vst1.32 {d1[0]}, [r0, :32], r2 + vext.32 d3, d4, d5, #1 + vst1.32 {d1[1]}, [r7, :32], r2 + vmov d1, d4 + vmov d22, d5 +2: @ Given the data duplication here we could probably do better than + @ using the generic body_fn but it almost certainly isn't worth it + bl \body_fn + ble 3f + vld1.32 {d4[0]}, [r6, :32], r3 + subs r12, #4 + vld1.32 {d4[1]}, [r1, :32], r3 + vld1.32 {d5[0]}, [r6, :32], r3 + vld1.32 {d5[1]}, [r1, :32], r3 + b 1b + +3: vst1.32 {d0[0]}, [r0, :32], r2 + vst1.32 {d0[1]}, [r7, :32], r2 + vst1.32 {d1[0]}, [r0, :32] + vst1.32 {d1[1]}, [r7, :32] + pop {r7, pc} +.endm + +.macro edge_64b_e2, body_fn, pb + push {lr} + sub r6, r1, r3 + // load c and a + vld1.8 {q4-q5}, [r1, :128] + vldr d25, [r6, #-8] + vldmia r6, {d16-d23} + vext.8 q0, q12, q8, #16 - \pb + add r6, r1, #32 + vext.8 q1, q8, q9, #16 - \pb + add r1, 
r1, r3 + vext.8 q2, q9, q10, #16 - \pb + vld1.8 {q6-q7}, [r6, :128] + sub r6, r1, r3 + vext.8 q3, q10, q11, #16 - \pb + +1: // load b + vldmia r1, {d16-d24} + vext.8 q8, q8, q9, #\pb + pld [r1, r3] + vext.8 q9, q9, q10, #\pb + subs r12, #1 + vext.8 q10, q10, q11, #\pb + vext.8 q11, q11, q12, #\pb + bl \body_fn + // next a is mostly available in c + vldr d25, [r6, #-8] + vstmia r0, {q0-q3} + vext.8 q3, q6, q7, #16 - \pb + it le + pople {lr} + vext.8 q2, q5, q6, #16 - \pb + it le + bxle lr + vext.8 q1, q4, q5, #16 - \pb + add r6, r6, r3 + vext.8 q0, q12, q4, #16 - \pb + add r0, r0, r2 + // next c is mostly available in b + vldr d8, [r1] + vext.8 d9, d16, d17, #8 - \pb + vext.8 q5, q8, q9, #16 - \pb + add r1, r1, r3 + vext.8 q6, q9, q10, #16 - \pb + pld [r6, #-8] + vext.8 q7, q10, q11, #16 - \pb + b 1b +.endm + +.macro edge_32bx2_e2, body_fn, pb + sub r6, r1, r3 + push {r7, lr} + add r7, r0, r2 + lsl r2, #1 + // load a and first 32b of c + vld1.8 {q4-q5}, [r1, :256] + vldr d25, [r6, #-8] + vld1.8 {q13-q14}, [r6, :256] + vldr d31, [r1, #-8] + add r6, r6, r3, lsl #1 + vext.8 q0, q12, q13, #16 - \pb + add r1, r1, r3, lsl #1 + vext.8 q1, q13, q14, #16 - \pb + vext.8 q2, q15, q4, #16 - \pb + vext.8 q3, q4, q5, #16 - \pb +1: + // load second 32b of c and second 32b of b + vldmia r6, {d12-d16} + vldmia r1, {d20-d24} + // first 32b of b is mostly available in second 32b of c + vext.8 q9, q7, q8, #\pb + subs r12, #2 + vext.8 q8, q6, q7, #\pb + vext.8 q10, q10, q11, #\pb + vext.8 q11, q11, q12, #\pb + + bl \body_fn + + vst1.8 {q0-q1}, [r0, :256], r2 + vst1.8 {q2-q3}, [r7, :256], r2 + ble 2f + + vldr d25, [r6, #-8] + add r6, r6, r3, lsl #1 + vldr d8, [r1] + vext.8 d9, d20, d21, #8 - \pb + vldr d31, [r1, #-8] + add r1, r1, r3, lsl #1 + // first 32b of a is mostly available in second 32b of c + vext.8 q1, q6, q7, #16 - \pb + vext.8 q0, q12, q6, #16 - \pb + // first 32b of c is mostly available in second 32b of b + vext.8 q5, q10, q11, #16 - \pb + // second 32b of a is mostly 
available in first 32b of c + vext.8 q2, q15, q4, #16 - \pb + vext.8 q3, q4, q5, #16 - \pb + b 1b + +2: pop {r7, pc} +.endm + +.macro edge_16b_e2, body_fn, pb + push {lr} + sub r6, r1, r3 + vld1.8 {q1}, [r1, :128], r3 + vldr d19, [r6, #-8] + vld1.8 {q10}, [r6, :128], r3 + +1: vldmia r1, {d4-d6} + vext.8 q0, q9, q10, #16 - \pb + subs r12, #1 + vext.8 q2, q2, q3, #\pb + bl \body_fn + vst1.8 {q0}, [r0, :128], r2 + ble 2f + vmov q10, q1 + vldr d2, [r1] + add r1, r1, r3 + vldr d19, [r6, #-8] + add r6, r6, r3 + vext.8 d3, d4, d5, #8 - \pb + b 1b + +2: pop {pc} +.endm + +.macro edge_8bx2_e2, body_fn, pb + sub r6, r1, r3 + push {r7, lr} + add r7, r0, r2 + lsl r2, #1 + vldr d18, [r6, #-8] + vldr d19, [r6] + add r6, r6, r3, lsl #1 + vldr d20, [r1, #-8] + vldr d2, [r1] + add r1, r1, r3, lsl #1 + vldmia r6, {d3-d4} + vld1.8 {d21-d22}, [r1, :128] + +1: vext.8 d0, d18, d19, #8 - \pb + vext.8 d4, d3, d4, #\pb + vext.8 d1, d20, d2, #8 - \pb + subs r12, #2 + vext.8 d5, d21, d22, #\pb + + bl \body_fn + + vst1.8 {d0}, [r0, :64], r2 + vst1.8 {d1}, [r7, :64], r2 + ble 2f + + vldr d18, [r6, #-8] + add r6, r6, r3, lsl #1 + vldr d20, [r1, #-8] + vmov d19, d3 + vldr d2, [r1] + add r1, r1, r3, lsl #1 + vldmia r6, {d3-d4} + vld1.8 {d21-d22}, [r1, :128] + b 1b + +2: pop {r7, pc} +.endm + +.macro edge_4bx4_e2, body_fn, pb + sub r6, r1, r3 + push {r7-r9, lr} + add r8, r1, r3 + sub r6, r6, #\pb + add r8, r8, #\pb + add r7, r0, r2 + lsl r2, #1 + +1: vld1.32 {d0[0]}, [r6], r3 + subs r12, #4 + vld1.32 {d2[0]}, [r1], r3 + vld1.32 {d4[0]}, [r8], r3 + vld1.32 {d0[1]}, [r6], r3 + vld1.32 {d2[1]}, [r1], r3 + vld1.32 {d4[1]}, [r8], r3 + vld1.32 {d1[0]}, [r6], r3 + vld1.32 {d3[0]}, [r1], r3 + vld1.32 {d5[0]}, [r8], r3 + vld1.32 {d1[1]}, [r6], r3 + vld1.32 {d3[1]}, [r1], r3 + vld1.32 {d5[1]}, [r8], r3 + + bl \body_fn + + vst1.32 {d0[0]}, [r0, :32], r2 + vst1.32 {d0[1]}, [r7, :32], r2 + vst1.32 {d1[0]}, [r0, :32], r2 + vst1.32 {d1[1]}, [r7, :32], r2 + bgt 1b + + pop {r7-r9,pc} +.endm + +.macro edge_64b_e3, 
body_fn, pb + push {lr} + sub r6, r1, r3 + // load c and a + vld1.8 {q4-q5}, [r1, :128] + vldmia r6, {d16-d24} + vext.8 q0, q8, q9, #\pb + add r6, r1, #32 + vext.8 q1, q9, q10, #\pb + add r1, r1, r3 + vext.8 q2, q10, q11, #\pb + vld1.8 {q6-q7}, [r6, :128] + sub r6, r1, r3 + vext.8 q3, q11, q12, #\pb + +1: // load b + vldr d17, [r1, #-8] + vldmia r1, {d18-d25} + vext.8 q8, q8, q9, #16 - \pb + pld [r1, r3] + vext.8 q9, q9, q10, #16 - \pb + subs r12, #1 + vext.8 q10, q10, q11, #16 - \pb + vext.8 q11, q11, q12, #16 - \pb + bl \body_fn + // next a is mostly available in c + vldr d24, [r6, #64] + vstmia r0, {q0-q3} + vext.8 q0, q4, q5, #\pb + it le + pople {lr} + vext.8 q1, q5, q6, #\pb + it le + bxle lr + vext.8 q2, q6, q7, #\pb + add r6, r6, r3 + vext.8 q3, q7, q12, #\pb + add r0, r0, r2 + // next c is mostly available in b + vext.8 d14, d22, d23, #\pb + vldr d15, [r1, #56] + vext.8 q4, q8, q9, #\pb + add r1, r1, r3 + vext.8 q5, q9, q10, #\pb + vext.8 q6, q10, q11, #\pb + b 1b +.endm + +.macro edge_32bx2_e3, body_fn, pb + sub r6, r1, r3 + push {r7, lr} + add r7, r0, r2 + lsl r2, #1 + // load a and first 32b of c + vldmia r1, {d8-d12} + vldmia r6, {d24-d28} + vext.8 q2, q4, q5, #\pb + add r6, r6, r3, lsl #1 + vext.8 q3, q5, q6, #\pb + add r1, r1, r3, lsl #1 + vext.8 q0, q12, q13, #\pb + vext.8 q1, q13, q14, #\pb +1: + // load second 32b of c and second 32b of b + vldr d25, [r6, #-8] + subs r12, #2 + vldmia r6, {d12-d15} + vldr d27, [r1, #-8] + vldmia r1, {d20-d23} + // first 32b of b is mostly available in second 32b of c + vext.8 q8, q12, q6, #16 - \pb + vext.8 q9, q6, q7, #16 - \pb + vext.8 q11, q10, q11, #16 - \pb + vext.8 q10, q13, q10, #16 - \pb + + bl \body_fn + + vst1.8 {q0-q1}, [r0, :256], r2 + vst1.8 {q2-q3}, [r7, :256], r2 + ble 2f + + vldr d24, [r6, #32] + add r6, r6, r3, lsl #1 + vldr d11, [r1, #24] + vext.8 d10, d22, d23, #\pb + vldr d30, [r1, #32] + add r1, r1, r3, lsl #1 + // first 32b of a is mostly available in second 32b of c + vext.8 q0, q6, q7, #\pb 
+ vext.8 q1, q7, q12, #\pb + // first 32b of c is mostly available in second 32b of b + vext.8 q4, q10, q11, #\pb + // second 32b of a is mostly available in first 32b of c + vext.8 q3, q5, q15, #\pb + vext.8 q2, q4, q5, #\pb + b 1b + +2: pop {r7, pc} +.endm + +.macro edge_16b_e3, body_fn, pb + push {lr} + sub r6, r1, r3 + vld1.8 {q1}, [r1, :128], r3 + vldmia r6, {d18-d20} + add r6, r6, r3 + +1: vldr d5, [r1, #-8] + vld1.8 {q3}, [r1, :128] + subs r12, #1 + vext.8 q0, q9, q10, #\pb + vext.8 q2, q2, q3, #16 - \pb + bl \body_fn + vst1.8 {q0}, [r0, :128], r2 + ble 2f + vmov q9, q1 + vldr d3, [r1, #8] + add r1, r1, r3 + vldr d20, [r6, #16] + add r6, r6, r3 + vext.8 d2, d4, d5, #\pb + b 1b + +2: pop {pc} +.endm + +.macro edge_8bx2_e3, body_fn, pb + sub r6, r1, r3 + push {r7, lr} + add r7, r0, r2 + lsl r2, #1 + vld1.8 {d18-d19}, [r6] + add r6, r6, r3, lsl #1 + vldr d20, [r1, #8] + vldr d2, [r1] + add r1, r1, r3, lsl #1 + vldr d4, [r6, #-8] + vldr d3, [r6] + vldr d21, [r1, #-8] + vldr d22, [r1] + +1: vext.8 d0, d18, d19, #\pb + vext.8 d4, d4, d3, #8 - \pb + vext.8 d1, d2, d20, #\pb + subs r12, #2 + vext.8 d5, d21, d22, #8 - \pb + + bl \body_fn + + vst1.8 {d0}, [r0, :64], r2 + vst1.8 {d1}, [r7, :64], r2 + ble 2f + + vldr d19, [r6, #8] + add r6, r6, r3, lsl #1 + vldr d20, [r1, #8] + vmov d18, d3 + vldr d2, [r1] + add r1, r1, r3, lsl #1 + vldr d4, [r6, #-8] + vldr d3, [r6] + vldr d21, [r1, #-8] + vldr d22, [r1] + b 1b + +2: pop {r7, pc} +.endm + +.macro edge_4bx4_e3, body_fn, pb + @ e3 is the same as e2 but with the X offset reversed + edge_4bx4_e2 \body_fn, (-\pb) +.endm + +@ Jump table entry - if in Thumb mode the bottom bit must be set +@ ? 
There is probably a real asm instruction to do this but I haven't found it +.macro jent lab +.if jent_pic +@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is +@ simpler and clearer in the code to stick with .word +T .word (0 + \lab) - (4 + 98b) +A .word (0 + \lab) - (8 + 98b) +.else +T .word 1 + \lab +A .word \lab +.endif +.endm + +.macro edge_64b_bodies, body_fn, pb + jent 0f + jent 10f + jent 20f + jent 30f + +0: edge_64b_e0 \body_fn, \pb +10: edge_64b_e1 \body_fn +20: edge_64b_e2 \body_fn, \pb +30: edge_64b_e3 \body_fn, \pb +.endm + +.macro edge_32bx2_bodies, body_fn, pb + jent 0f + jent 10f + jent 20f + jent 30f + +0: edge_32bx2_e0 \body_fn, \pb +10: edge_32bx2_e1 \body_fn +20: edge_32bx2_e2 \body_fn, \pb +30: edge_32bx2_e3 \body_fn, \pb +.endm + +.macro edge_16b_bodies, body_fn, pb + jent 0f + jent 10f + jent 20f + jent 30f + +0: edge_16b_e0 \body_fn, \pb +10: edge_16b_e1 \body_fn +20: edge_16b_e2 \body_fn, \pb +30: edge_16b_e3 \body_fn, \pb +.endm + +.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb + jent 0f + jent 10f + jent 20f + jent 30f + jent 5f + jent 15f + jent 25f + jent 35f + +0: edge_32bx2_e0 \body_fn_64b, \pb +10: edge_32bx2_e1 \body_fn_64b +20: edge_32bx2_e2 \body_fn_64b, \pb +30: edge_32bx2_e3 \body_fn_64b, \pb +5: edge_16b_e0 \body_fn_16b, \pb +15: edge_16b_e1 \body_fn_16b +25: edge_16b_e2 \body_fn_16b, \pb +35: edge_16b_e3 \body_fn_16b, \pb +.endm + +.macro edge_16b_8bx2_bodies, body_fn, pb + jent 0f + jent 10f + jent 20f + jent 30f + jent 5f + jent 15f + jent 25f + jent 35f + +0: edge_16b_e0 \body_fn, \pb +10: edge_16b_e1 \body_fn +20: edge_16b_e2 \body_fn, \pb +30: edge_16b_e3 \body_fn, \pb +5: edge_8bx2_e0 \body_fn, \pb +15: edge_8bx2_e1 \body_fn +25: edge_8bx2_e2 \body_fn, \pb +35: edge_8bx2_e3 \body_fn, \pb +.endm + +.macro edge_8bx2_4bx4_bodies, body_fn, pb + jent 0f + jent 10f + jent 20f + jent 30f + jent 5f + jent 15f + jent 25f + jent 35f + +0: edge_8bx2_e0 \body_fn, \pb +10: edge_8bx2_e1 \body_fn +20: 
edge_8bx2_e2 \body_fn, \pb +30: edge_8bx2_e3 \body_fn, \pb +5: edge_4bx4_e0 \body_fn, \pb +15: edge_4bx4_e1 \body_fn +25: edge_4bx4_e2 \body_fn, \pb +35: edge_4bx4_e3 \body_fn, \pb +.endm + +@ void ff_hevc_rpi_sao_edge_8_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_8_neon_8, export=1 + edge_16b_init 8, 0, 1, 99f +99: + edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + +@ void ff_hevc_rpi_sao_edge_16_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_16_neon_8, export=1 + edge_16b_init 8, 0, 0, 99f +99: + edge_16b_bodies edge_16b_body_8, 1 +endfunc + +@ void ff_hevc_rpi_sao_edge_32_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_32_neon_8, export=1 + edge_64b_init 8, 0, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + +@ void ff_hevc_rpi_sao_edge_64_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_64_neon_8, export=1 + edge_64b_init 8, 0, 0, 99f +99: + edge_64b_bodies edge_64b_body_8, 1 +endfunc + +@ ff_hevc_rpi_sao_edge_c_8_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 + edge_16b_init 8, 1, 1, 99f +99: + edge_16b_8bx2_bodies edge_16b_body_8, 2 +endfunc + +@ 
ff_hevc_rpi_sao_edge_c_16_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 + edge_64b_init 8, 1, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_8, 2 +endfunc + +@ ff_hevc_rpi_sao_edge_c_32_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 + edge_64b_init 8, 1, 0, 99f +99: + edge_64b_bodies edge_64b_body_8, 2 +endfunc + +@ void ff_hevc_rpi_sao_edge_8_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_8_neon_10, export=1 + edge_16b_init 10, 0, 1, 99f +99: + edge_16b_8bx2_bodies edge_16b_body_16, 2 +endfunc + +@ void ff_hevc_rpi_sao_edge_16_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_16_neon_10, export=1 + edge_64b_init 10, 0, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_16, 2 +endfunc + +@ void ff_hevc_rpi_sao_edge_64_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +@ We simply split the 64 case into 2 vertical stripes +@ and call the fns for w32 +@ +@ Calling code will always have src != dst so we don't have to worry +@ about edge effects + +function ff_hevc_rpi_sao_edge_64_neon_10, export=1 + edge_64b_init 10, 0, 1, 
99f, xjump=1 +endfunc + +@ void ff_hevc_rpi_sao_edge_32_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] +@ int16_t *_sao_offset_val, [r3] +@ int eo, [sp, #0] +@ int width, [sp, #4] +@ int height) [sp, #8] + +function ff_hevc_rpi_sao_edge_32_neon_10, export=1 + edge_64b_init 10, 0, 0, 99f +99: + edge_64b_bodies edge_64b_body_16, 2 +endfunc + +@ ff_hevc_rpi_sao_edge_c_8_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 + edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 +99: + edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 +endfunc + +@ ff_hevc_rpi_sao_edge_c_32_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 + edge_64b_init 10, 1, 1, 99f, xjump=1 +endfunc + + +@ ff_hevc_rpi_sao_edge_c_16_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] +@ const int16_t *_sao_offset_val_u, [r3] +@ const int16_t *_sao_offset_val_v, [sp, #0] +@ int eo, [sp, #4] +@ int width, [sp, #8] +@ int height) [sp, #12] + +function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 + edge_64b_init 10, 1, 0, 99f +99: + edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h new file mode 100644 index 0000000000..36a23a5bf9 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_arm.h @@ -0,0 +1,28 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_HEVCPRED_ARM_H +#define AVCODEC_ARM_HEVCPRED_ARM_H + +#include "libavcodec/rpi_hevcpred.h" + +void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); +void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); + +#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ + diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c new file mode 100644 index 0000000000..80724d4cf3 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_init_arm.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" + +#include "libavcodec/rpi_hevcpred.h" +#include "rpi_hevcpred_arm.h" + +av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + ff_hevc_rpi_pred_init_neon(c, bit_depth); +} + diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c new file mode 100644 index 0000000000..21e7700174 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_init_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rpi_hevcpred_arm.h" + +intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; +intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; +intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; +intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; +intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; +intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; +intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; +intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; + +void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void 
ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); + +void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, 
ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); + +void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); +void 
ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); + +void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); + +void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t 
*src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); +void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); + +void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) +{ + switch (bit_depth) + { + case 8: + c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; + c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; + c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 + c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; + c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; + + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; + c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; + c->pred_angular_c[0] 
= ff_hevc_rpi_pred_angular_c_4_neon_8; + c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; + c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; + + c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; + c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; + c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; + c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; + c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; + c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; + c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; + + c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; + c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; + c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; + c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; + c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; + c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; + c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; + + c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; + c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; + c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; + c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; + c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; + c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; + c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; + + c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; + c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; + c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; + c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; + c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; + c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; + c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; + break; + case 10: + c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; + c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; + c->intra_filter[2] = 
ff_hevc_rpi_intra_filter_16_neon_16; + c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; + c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; + c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; + + c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; + c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; + c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; + c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; + c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; + c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; + c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; + + c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; + c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; + c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; + c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; + c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; + c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; + c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; + + c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; + c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; + c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; + c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; + c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; + c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; + c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; + + c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; + c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; + c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; + c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; + c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; + c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; + c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; + + 
c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; + c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; + c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; + c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; + c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; + c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; + c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; + break; + default: + break; + } +} + diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S new file mode 100644 index 0000000000..fa8f67cf03 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S @@ -0,0 +1,2984 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox, Ben Avison +*/ + +/* + * General angular pred + * + * Horizontal (10) & Vertical (26) cases have their own file + * and are not dealt with properly here (luma filtering is missing) + * + * The inv_angle calculations are annoying - if it wasn't for the +128 + * rounding step then the result would simply be the loop counter :-( + */ + + +#include "libavutil/arm/asm.S" +#include "neon.S" + +.text + +@ Horizontal Patch functions +@ These need a transpose before store so exist as smaller patches +@ Patches can be called repeatedly without any intermediate setup +@ to generate a horizontal block +@ +@ It is almost certainly the case that larger patch fns can be built +@ and they would be a little faster, but we would still need the small +@ fns and code size (or at least instruction cache size) is an issue +@ given how much code we already have here + +@ Generate 8x8 luma 8-bit patch +@ +@ r3 Out stride +@ r4 Angle add +@ r7 Inv angle (_up only) +@ +@ In/Out (updated) +@ r0 Out pointer - on exit points to start of next patch horizontally (i.e.
r0 + patch width) +@ r2 Left ptr - updated +@ r10 Inv angle accumulator (_up only) +@ r12 32 - angle frac (_down) or angle frac (_up) +@ d0 Older reference samples +@ d1=r8+r9 Newer reference samples +@ d2 32 - angle frac +@ d3 Angle frac +@ q2 Partially computed next result (_up only) +@ +@ Temps +@ r5 Loop counter +@ r6 +@ r7 (_down only) +@ r11 (_up only) +@ q2, q8-q11 + +patch_h_down_8x8_8: + ldrd r8, r9, [r2] @ Left + rsb r12, r6, #32 + vmov d0, r8, r9 + vdup.8 d3, r6 + lsr r8, #8 + vdup.8 d2, r12 + orr r8, r8, r9, lsl #24 + ldr r9, [r2, #5]! + vmov d1, r8, r9 + // drop through... +patch_h_down_8x8_8_continue: + mov r5, #8 +1: + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + vext.8 q8, q8, q9, #8 + itt mi + lsrmi r7, r8, #8 + vmovmi d0, r8, r9 + vdup.8 d2, r12 + vext.8 q9, q9, q10, #8 + it mi + orrmi r8, r7, r9, lsl #24 + vext.8 q10, q10, q11, #8 + it mi + ldrmi r9, [r2, #1]! + vmov d22, d23 + vrshrn.u16 d23, q2, #5 + it mi + vmovmi d1, r8, r9 + subs r5, #1 + vdup.8 d3, r6 + bne 1b + // drop through... +store_tran_8x8_8: + vzip.8 d16, d17 + add r6, r0, r3 + vzip.8 d18, d19 + lsl r3, #1 + vzip.8 d20, d21 + add r5, r0, r3 + vzip.8 d22, d23 + vzip.16 q8, q9 + vzip.16 q10, q11 + vzip.32 q8, q10 + vzip.32 q9, q11 + vst1.8 {d16}, [r0]! + vst1.8 {d17}, [r6], r3 + vst1.8 {d20}, [r5], r3 + vst1.8 {d21}, [r6], r3 + vst1.8 {d18}, [r5], r3 + vst1.8 {d19}, [r6], r3 + vst1.8 {d22}, [r5] + asr r3, #1 + vst1.8 {d23}, [r6] + + bx lr + +patch_h_up_8x8_8: + ldrd r8, r9, [r2] + rsb r6, r4, #32 + vmov d0, r8, r9 + vdup.8 d3, r4 + lsr r11, r8, #24 + vdup.8 d2, r6 + ldr r8, [r2, #-1]! 
+ orr r9, r11, r9, lsl #8 + vmov d1, r8, r9 + mov r12, r4 + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 +patch_h_up_8x8_8_continue: + mov r5, #8 +1: + add r12, r4 + mov r11, #0 + cmp r12, #33 + it cs + addcs r10, r7 + vext.8 q8, q8, q9, #8 + itt cs + subcs r12, #32 + tstcs r10, #1<<31 + rsb r6, r12, #32 + it eq + asreq r11, r10, #8 + it cs + vmovcs d0, r8, r9 + vdup.8 d2, r6 + it cs + lsrcs r6, r8, #24 + vext.8 q9, q9, q10, #8 + itt cs + orrcs r9, r6, r9, lsl #8 + ldrbcs r11, [r1, r11] + vdup.8 d3, r12 + vext.8 q10, q10, q11, #8 + it hi + ldrbhi r11, [r2, #-1]! + vmov d22, d23 + vrshrn.u16 d23, q2, #5 + itt cs + orrcs r8, r11, r8, lsl #8 + vmovcs d1, r8, r9 + vmull.u8 q2, d0, d2 + subs r5, #1 + vmlal.u8 q2, d1, d3 + bne 1b + + b store_tran_8x8_8 + + +.macro ADRT reg, val +@ adr in T32 has enough range but not in A32 +A adrl \reg, \val +T adr \reg, \val +.endm + +@ ff_hevc_rpi_pred_angular_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_4_neon_8, export=1 + ldr r12, [sp] + push {r4-r8, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + ldr lr, [r2], #1 @ Left + rsb r12, r6, #32 + vmov s0, lr + vdup.8 d3, r6 + ldr lr, [r2], #1 + vdup.8 d2, r12 + vmov s2, lr + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + itt mi + vmovmi s0, lr + ldrmi lr, [r2], #1 + vdup.8 d2, r12 + it mi + vmovmi s2, lr + vdup.8 d3, r6 + mov r5, #2 +1: + vrshrn.u16 d20, q2, #5 + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + vext.64 q8, q8, q9, #1 + it mi + vmovmi s0, lr + vext.64 q9, q9, q10, #1 + it mi + ldrmi lr, [r2], #1 + vdup.8 d2, r12 + it mi + vmovmi s2,
lr + subs r5, #1 + vdup.8 d3, r6 + bne 1b + + vrshrn.u16 d20, q2, #5 + vmull.u8 q2, d0, d2 + add r12, r0, r3 + vmlal.u8 q2, d1, d3 + lsl r3, #1 + vext.64 q8, q8, q9, #1 + vext.64 q9, q9, q10, #1 + vrshrn.u16 d20, q2, #5 + +98: + vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 + vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 + vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] + vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] + pop {r4-r8, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + rsb r12, r6, #32 + ldr lr, [r2] @ Left + ldrb r2, [r2, #-1] @ Top-left + vmov s0, lr + vdup.8 d2, r12 + vdup.8 d3, r6 + orr lr, r2, lr, lsl #8 + vmov s2, lr + sub r8, r7, #128 + mov r5, #3 +2: + vmull.u8 q2, d0, d2 + subs r12, r4 + vmlal.u8 q2, d1, d3 +T it mi + addmi r12, #32 +T asr r6, r8, #8 +T it mi +T ldrbmi r2, [r1, r6] +A ldrbmi r2, [r1, r8, asr #8] + rsb r6, r12, #32 + vdup.8 d2, r12 + ittt mi + vmovmi s0, lr + orrmi lr, r2, lr, lsl #8 + vmovmi s2, lr + vrshrn.u16 d20, q2, #5 + vdup.8 d3, r6 + it mi + addmi r8, r7 + subs r5, #1 + vext.64 q8, q8, q9, #1 + vext.64 q9, q9, q10, #1 + bne 2b + + vmull.u8 q2, d0, d2 + add r12, r0, r3 + vmlal.u8 q2, d1, d3 + lsl r3, #1 + vrshrn.u16 d20, q2, #5 + b 98b + +@ Left of vertical - works down left +18: + ldrh r7, [r7] + rsb r12, r6, #32 + ldr lr, [r1] @ Top + ldrb r1, [r2, #-1] @ Top-left + vmov s0, lr + vdup.8 d2, r12 + vdup.8 d3, r6 + orr lr, r1, lr, lsl #8 + vmov s2, lr + sub r8, r7, #128 + mov r5, #3 +2: + vmull.u8 q2, d0, d2 + subs r12, r4 + vmlal.u8 q2, d1, d3 +T it mi + addmi r12, #32 +T asr r6, r8, #8 +T it mi +T ldrbmi r1, [r2, r6] +A ldrbmi r1, [r2, r8, asr #8] + rsb r6, r12, #32 + vdup.8 d2, r12 + ittt mi + vmovmi s0, lr + orrmi lr, r1, lr, lsl #8 + vmovmi s2, lr + vrshrn.u16 d4, q2, #5 + vdup.8 d3, r6 + it mi + addmi r8, r7 + subs r5, #1 + vst1.32 {d4[0]}, [r0], r3 + bne 2b + + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vrshrn.u16 d4, q2, #5 + vst1.32 {d4[0]}, [r0] + + pop {r4-r8, pc} + +@ Right of vertical - 
works along top - left unused +26: + ldr lr, [r1], #1 @ Top + rsb r12, r6, #32 + vmov s0, lr + vdup.8 d3, r6 + ldr lr, [r1], #1 + vdup.8 d2, r12 + vmov s2, lr + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + itt mi + vmovmi s0, lr + ldrmi lr, [r1], #1 + vdup.8 d2, r12 + it mi + vmovmi s2, lr + vdup.8 d3, r6 + mov r5, #2 +1: + vrshrn.u16 d6, q2, #5 + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + vst1.32 {d6[0]}, [r0], r3 + itt mi + vmovmi s0, lr + ldrmi lr, [r1], #1 + vdup.8 d2, r12 + it mi + vmovmi s2, lr + subs r5, #1 + vdup.8 d3, r6 + bne 1b + + vrshrn.u16 d6, q2, #5 + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vst1.32 {d6[0]}, [r0], r3 + vrshrn.u16 d6, q2, #5 + vst1.32 {d6[0]}, [r0] + + pop {r4-r8, pc} + +endfunc + + + +@ ff_hevc_rpi_pred_angular_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_8_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + bl patch_h_down_8x8_8 + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + bl patch_h_up_8x8_8 + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + ldrb lr, [r2, #-1] @ Top-left + ldrh r7, [r7] + vmov d0, r8, r9 + lsl r9, r9, #8 + vdup.8 d2, r12 + orr r9, r9, r8, lsr #24 + orr r8, lr, r8, lsl #8 + vmov d1, r8, r9 + sub r1, r7, #128 + mov r5, #7 +1: + vdup.8 d3, r6 + vmull.u8 q2, d0, d2 + subs r12, r12, r4 + vmlal.u8 q2, d1, d3 + ittt mi + addmi lr, r2, r1, asr #8 + addmi r12, r12, #32 + vmovmi d0, r8, r9 + rsb r6, r12, #32 + itt mi + 
lslmi r9, r9, #8 + ldrbmi lr, [lr] + vdup.8 d2, r12 + vrshrn.u16 d4, q2, #5 + itttt mi + orrmi r9, r9, r8, lsr #24 + orrmi r8, lr, r8, lsl #8 + vmovmi d1, r8, r9 + addmi r1, r1, r7 + subs r5, r5, #1 + vst1.8 {d4}, [r0], r3 + bne 1b + + vdup.8 d3, r6 + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vrshrn.u16 d4, q2, #5 + vst1.8 {d4}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + vmov d0, r8, r9 + vdup.8 d3, r6 + mov r5, #7 + lsr r8, #8 + vdup.8 d2, r12 + orr r8, r8, r9, lsl #24 + ldr r9, [r1, #5]! + vmov d1, r8, r9 +1: + vmull.u8 q2, d0, d2 + subs r12, r4 + vmlal.u8 q2, d1, d3 + it mi + addmi r12, #32 + rsb r6, r12, #32 + itt mi + vmovmi d0, r8, r9 + lsrmi r8, #8 + vdup.8 d2, r12 + itt mi + orrmi r8, r8, r9, lsl #24 + ldrmi r9, [r1, #1]! + vrshrn.u16 d6, q2, #5 + it mi + vmovmi d1, r8, r9 + vdup.8 d3, r6 + subs r5, #1 + vst1.8 {d6}, [r0], r3 + bne 1b + + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vrshrn.u16 d6, q2, #5 + vst1.8 {d6}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_16_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r1, r2 @ save r2 - r1 unused by patch_down + + bl patch_h_down_8x8_8 + bl patch_h_down_8x8_8_continue + + add r2, r1, #8 @ restore r2, but 8 rows further down left + sub r0, #16 + mov r6, r4 + add r0, r0, r3, lsl #3 + + bl patch_h_down_8x8_8 + bl patch_h_down_8x8_8_continue + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + + push {r2} + bl patch_h_up_8x8_8 
+ bl patch_h_up_8x8_8_continue + pop {r2} + + sub r0, #16 + mov r10, #-128 + add r2, #8 + add r0, r0, r3, lsl #3 + sub r10, r10, r7, lsl #3 + + bl patch_h_up_8x8_8 + bl patch_h_up_8x8_8_continue + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.8 {q9}, [r1] + sub r1, r2, #1 + rsb r12, r6, #32 + ldrh r7, [r7] + vdup.8 d6, r6 + vext.8 q8, q9, q9, #15 + sub r8, r7, #128 + vld1.8 {d16[0]}, [r1] + vdup.8 d7, r12 + mov r5, #15 +1: + vmull.u8 q0, d18, d7 + subs r12, r4 + vmlal.u8 q0, d16, d6 + it cc + addcc r12, #32 + vmull.u8 q1, d19, d7 + it cc + addcc r1, r2, r8, asr #8 + vmlal.u8 q1, d17, d6 + rsb r6, r12, #32 + vext.8 q10, q8, q8, #15 + sub r5, #1 + vld1.8 {d20[0]}, [r1] + it cc + addcc r8, r7 + vmov q11, q8 + teq r5, #0 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmull.u8 q0, d22, d7 + subs r12, r4 + vmlal.u8 q0, d20, d6 + it cc + addcc r12, #32 + vmull.u8 q1, d23, d7 + it cc + addcc r1, r2, r8, asr #8 + vmlal.u8 q1, d21, d6 + rsb r6, r12, #32 + vext.8 q8, q10, q10, #15 + sub r5, #1 + vld1.8 {d16[0]}, [r1] + it cc + addcc r8, r7 + vmov q9, q10 + teq r5, #0 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmull.u8 q0, d22, d7 + vmlal.u8 q0, d20, d6 + vmull.u8 q1, d23, d7 + vmlal.u8 q1, d21, d6 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmull.u8 q0, d18, d7 + vmlal.u8 q0, d16, d6 + vmull.u8 q1, d19, d7 + vmlal.u8 q1, d17, d6 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + vld1.8 {q9}, [r1]! + rsb r12, r6, #32 + vdup.8 d6, r6 + vdup.8 d7, r12 + vext.8 q8, q9, q9, #1 + vld1.8 {d17[7]}, [r1]! 
+ mov r5, #15 +1: + vmull.u8 q0, d16, d6 + subs r12, r4 + vmlal.u8 q0, d18, d7 + it cc + addcc r12, #32 + vmull.u8 q1, d17, d6 + rsb r6, r12, #32 + vmlal.u8 q1, d19, d7 + sub r5, #1 + vext.8 q10, q8, q8, #1 + teq r5, #0 + vld1.8 {d21[7]}, [r1] + it cc + addcc r1, #1 + vmov q11, q8 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmull.u8 q0, d20, d6 + subs r12, r4 + vmlal.u8 q0, d22, d7 + it cc + addcc r12, #32 + vmull.u8 q1, d21, d6 + rsb r6, r12, #32 + vmlal.u8 q1, d23, d7 + sub r5, #1 + vext.8 q8, q10, q10, #1 + teq r5, #0 + vld1.8 {d17[7]}, [r1] + it cc + addcc r1, #1 + vmov q9, q10 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmull.u8 q0, d20, d6 + vmlal.u8 q0, d22, d7 + vmull.u8 q1, d21, d6 + vmlal.u8 q1, d23, d7 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmull.u8 q0, d16, d6 + vmlal.u8 q0, d18, d7 + vmull.u8 q1, d17, d6 + vmlal.u8 q1, d19, d7 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_32_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_32_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r10, #4 + mov r1, r2 +1: + bl patch_h_down_8x8_8 + bl patch_h_down_8x8_8_continue + bl patch_h_down_8x8_8_continue + bl patch_h_down_8x8_8_continue + + add r2, r1, #8 @ restore r2, but 8 rows further down left + add r1, r1, #8 + mov r6, r4 + sub r0, #32 + subs r10, #1 + add 
r0, r0, r3, lsl #3 + bne 1b + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + vmov.i8 d6, #1<<2 +1: + push {r2,r10} + bl patch_h_up_8x8_8 + bl patch_h_up_8x8_8_continue + bl patch_h_up_8x8_8_continue + bl patch_h_up_8x8_8_continue + pop {r2,r10} + + vmov r8, s12 + sub r0, #32 + add r2, #8 + add r0, r0, r3, lsl #3 + sub r10, r10, r7, lsl #3 + vshr.u8 d6, #1 + teq r8, #0 + bne 1b + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.8 {q0-q1}, [r1] + sub r9, r2, #1 + rsb r12, r6, #32 + ldrh r7, [r7] + mov r8, #-128 + vdup.8 d18, r6 + vdup.8 d19, r12 + mov r5, #32 +1: + vld1.8 {d17[7]}, [r9] + add r8, r7 + vmov q2, q0 + vmov q3, q1 + add r9, r2, r8, asr #8 + vext.8 q1, q0, q1, #15 + vext.8 q0, q8, q0, #15 +2: + vmull.u8 q10, d4, d19 + subs r12, r4 + vmlal.u8 q10, d0, d18 + it cc + addcc r12, #32 + vmull.u8 q11, d5, d19 + rsb r6, r12, #32 + vmlal.u8 q11, d1, d18 + sub r5, #1 + vmull.u8 q12, d6, d19 + teq r5, #0 + vmlal.u8 q12, d2, d18 + vmull.u8 q13, d7, d19 + vmlal.u8 q13, d3, d18 + vdup.8 d18, r6 + vdup.8 d19, r12 + vrshrn.u16 d20, q10, #5 + vrshrn.u16 d21, q11, #5 + vrshrn.u16 d22, q12, #5 + vrshrn.u16 d23, q13, #5 + vst1.8 {q10-q11}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + add r5, r1, #32 + vld1.8 {q0-q1}, [r1]! 
+ rsb r12, r6, #32 + vld1.8 {d16[0]}, [r5] + mov r5, #32 + vdup.8 d18, r6 + vdup.8 d19, r12 +1: + vmov q2, q0 + add r1, #1 + vmov q3, q1 + vext.8 q0, q0, q1, #1 + vext.8 q1, q1, q8, #1 +2: + vmull.u8 q10, d0, d18 + subs r12, r4 + vmlal.u8 q10, d4, d19 + it cc + addcc r12, #32 + vmull.u8 q11, d1, d18 + rsb r6, r12, #32 + vmlal.u8 q11, d5, d19 + sub r5, #1 + vmull.u8 q12, d2, d18 + teq r5, #0 + vmlal.u8 q12, d6, d19 + vmull.u8 q13, d3, d18 + vmlal.u8 q13, d7, d19 + vld1.8 {d16[0]}, [r1] + vdup.8 d18, r6 + vdup.8 d19, r12 + vrshrn.u16 d20, q10, #5 + vrshrn.u16 d21, q11, #5 + vrshrn.u16 d22, q12, #5 + vrshrn.u16 d23, q13, #5 + vst1.8 {q10-q11}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +endfunc + + +@ Chroma 8 bit 4x4 patch fns + .text + +patch_h_down_c_4x4_8: + ldrd r8, r9, [r2] @ Left + rsb r12, r6, #32 + vmov d0, r8, r9 + vdup.8 d3, r6 + lsr r8, #16 + vdup.8 d2, r12 + orr r8, r8, r9, lsl #16 + ldr r9, [r2, #6]! + vmov d1, r8, r9 + // drop through... +patch_h_down_c_4x4_8_continue: + mov r5, #4 +1: + subs r12, r4 + vmull.u8 q2, d0, d2 + it mi + addmi r12, #32 + vmlal.u8 q2, d1, d3 + rsb r6, r12, #32 + vext.8 q8, q8, q9, #8 + it mi + lsrmi r7, r8, #16 + vmov d18, d19 + it mi + vmovmi d0, r8, r9 + vdup.8 d2, r12 + it mi + orrmi r8, r7, r9, lsl #16 + vrshrn.u16 d19, q2, #5 + itt mi + ldrmi r9, [r2, #2]! + vmovmi d1, r8, r9 + subs r5, #1 + vdup.8 d3, r6 + bne 1b + // drop through... +store_tran_c_4x4_8: + vzip.16 d16, d17 + add r6, r0, r3 + vzip.16 d18, d19 + lsl r3, #1 + vzip.32 q8, q9 + add r5, r0, r3 + vst1.16 {d16}, [r0]! + vst1.16 {d17}, [r6], r3 + vst1.16 {d18}, [r5] + asr r3, #1 + vst1.16 {d19}, [r6] + + bx lr + +patch_h_up_c_4x4_8: + ldrd r8, r9, [r2] + rsb r6, r4, #32 + vmov d0, r8, r9 + vdup.8 d3, r4 + lsr r11, r8, #16 + vdup.8 d2, r6 + ldr r8, [r2, #-2]! 
+ orr r9, r11, r9, lsl #16 + vmov d1, r8, r9 + mov r12, r4 + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 +patch_h_up_c_4x4_8_continue: + mov r5, #4 +1: + add r12, r4 + cmp r12, #33 + it cs + addcs r10, r7 + mov r11, #0 + itt cs + subcs r12, #32 + tstcs r10, #1<<31 + rsb r6, r12, #32 + it eq + asreq r11, r10, #7 + it cs + vmovcs d0, r8, r9 + it eq + biceq r11, #1 + vdup.8 d2, r6 + it cs + lsrcs r6, r8, #16 + vdup.8 d3, r12 + vext.8 q8, q8, q9, #8 + itt cs + orrcs r9, r6, r9, lsl #16 + ldrhcs r11, [r1, r11] + vmov d18, d19 + it hi + ldrhhi r11, [r2, #-2]! + vrshrn.u16 d19, q2, #5 + itt cs + orrcs r8, r11, r8, lsl #16 + vmovcs d1, r8, r9 + vmull.u8 q2, d0, d2 + subs r5, #1 + vmlal.u8 q2, d1, d3 + bne 1b + + b store_tran_c_4x4_8 + + +@ ff_hevc_rpi_pred_angular_c_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + bl patch_h_down_c_4x4_8 + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + bl patch_h_up_c_4x4_8 + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + ldrh lr, [r2, #-2] @ Top-left + ldrh r7, [r7] + vmov d0, r8, r9 + lsl r9, r9, #16 + vdup.8 d2, r12 + orr r9, r9, r8, lsr #16 + orr r8, lr, r8, lsl #16 + vmov d1, r8, r9 + sub r1, r7, #128 + mov r5, #3 +1: + vdup.8 d3, r6 + vmull.u8 q2, d0, d2 + subs r12, r12, r4 + vmlal.u8 q2, d1, d3 + itttt mi + addmi lr, r2, r1, asr #7 + bicmi lr, #1 + addmi r12, r12, #32 + vmovmi d0, r8, r9 + rsb r6, r12, #32 + itt mi + lslmi r9, r9, #16 + ldrhmi lr, [lr] + vdup.8 d2, r12 + vrshrn.u16 d4, 
q2, #5 + itttt mi + orrmi r9, r9, r8, lsr #16 + orrmi r8, lr, r8, lsl #16 + vmovmi d1, r8, r9 + addmi r1, r1, r7 + subs r5, r5, #1 + vst1.16 {d4}, [r0], r3 + bne 1b + + vdup.8 d3, r6 + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vrshrn.u16 d4, q2, #5 + vst1.16 {d4}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + vmov d0, r8, r9 + vdup.8 d3, r6 + mov r5, #3 + lsr r8, #16 + vdup.8 d2, r12 + orr r8, r8, r9, lsl #16 + ldr r9, [r1, #6]! + vmov d1, r8, r9 +1: + vmull.u8 q2, d0, d2 + subs r12, r4 + vmlal.u8 q2, d1, d3 + it mi + addmi r12, #32 + rsb r6, r12, #32 + itt mi + vmovmi d0, r8, r9 + lsrmi r8, #16 + vdup.8 d2, r12 + itt mi + orrmi r8, r8, r9, lsl #16 + ldrmi r9, [r1, #2]! + vrshrn.u16 d6, q2, #5 + it mi + vmovmi d1, r8, r9 + vdup.8 d3, r6 + subs r5, #1 + vst1.16 {d6}, [r0], r3 + bne 1b + + vmull.u8 q2, d0, d2 + vmlal.u8 q2, d1, d3 + vrshrn.u16 d6, q2, #5 + vst1.16 {d6}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_c_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r1, r2 @ save r2 - r1 unused by patch_down + + bl patch_h_down_c_4x4_8 + bl patch_h_down_c_4x4_8_continue + + add r2, r1, #4*2 @ restore r2, but 4 rows further down left + sub r0, #16 + mov r6, r4 + add r0, r0, r3, lsl #2 + + bl patch_h_down_c_4x4_8 + bl patch_h_down_c_4x4_8_continue + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + + push {r2} + bl patch_h_up_c_4x4_8 + bl patch_h_up_c_4x4_8_continue + 
pop {r2} + + sub r0, #16 + mov r10, #-128 + add r2, #8 + add r0, r0, r3, lsl #2 + sub r10, r10, r7, lsl #2 + + bl patch_h_up_c_4x4_8 + bl patch_h_up_c_4x4_8_continue + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.8 {q9}, [r1] + sub r1, r2, #2 + rsb r12, r6, #32 + ldrh r7, [r7] + vdup.8 d6, r6 + vext.8 q8, q9, q9, #14 + sub r8, r7, #128 + vld1.16 {d16[0]}, [r1] + vdup.8 d7, r12 + mov r5, #7 +1: + subs r12, r4 + vmull.u8 q0, d18, d7 + it cc + asrcc r1, r8, #8 + vmlal.u8 q0, d16, d6 + it cc + addcc r12, #32 + vmull.u8 q1, d19, d7 + it cc + addcc r1, r2, r1, lsl #1 + vmlal.u8 q1, d17, d6 + rsb r6, r12, #32 + vext.8 q10, q8, q8, #14 + sub r5, #1 + vld1.16 {d20[0]}, [r1] + it cc + addcc r8, r7 + vmov q11, q8 + teq r5, #0 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + subs r12, r4 + vmull.u8 q0, d22, d7 + it cc + asrcc r1, r8, #8 + vmlal.u8 q0, d20, d6 + it cc + addcc r12, #32 + vmull.u8 q1, d23, d7 + it cc + addcc r1, r2, r1, lsl #1 + vmlal.u8 q1, d21, d6 + rsb r6, r12, #32 + vext.8 q8, q10, q10, #14 + sub r5, #1 + vld1.16 {d16[0]}, [r1] + it cc + addcc r8, r7 + vmov q9, q10 + teq r5, #0 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmull.u8 q0, d22, d7 + vmlal.u8 q0, d20, d6 + vmull.u8 q1, d23, d7 + vmlal.u8 q1, d21, d6 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmull.u8 q0, d18, d7 + vmlal.u8 q0, d16, d6 + vmull.u8 q1, d19, d7 + vmlal.u8 q1, d17, d6 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + vld1.8 {q9}, [r1]! + rsb r12, r6, #32 + vdup.8 d6, r6 + vdup.8 d7, r12 + vext.8 q8, q9, q9, #2 + vld1.16 {d17[3]}, [r1]! 
+ mov r5, #7 +1: + vmull.u8 q0, d16, d6 + subs r12, r4 + vmlal.u8 q0, d18, d7 + it cc + addcc r12, #32 + vmull.u8 q1, d17, d6 + rsb r6, r12, #32 + vmlal.u8 q1, d19, d7 + sub r5, #1 + vext.8 q10, q8, q8, #2 + teq r5, #0 + vld1.16 {d21[3]}, [r1] + it cc + addcc r1, #2 + vmov q11, q8 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmull.u8 q0, d20, d6 + subs r12, r4 + vmlal.u8 q0, d22, d7 + it cc + addcc r12, #32 + vmull.u8 q1, d21, d6 + rsb r6, r12, #32 + vmlal.u8 q1, d23, d7 + sub r5, #1 + vext.8 q8, q10, q10, #2 + teq r5, #0 + vld1.16 {d17[3]}, [r1] + it cc + addcc r1, #2 + vmov q9, q10 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vdup.8 d6, r6 + vdup.8 d7, r12 + vst1.8 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmull.u8 q0, d20, d6 + vmlal.u8 q0, d22, d7 + vmull.u8 q1, d21, d6 + vmlal.u8 q1, d23, d7 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmull.u8 q0, d16, d6 + vmlal.u8 q0, d18, d7 + vmull.u8 q1, d17, d6 + vmlal.u8 q1, d19, d7 + vrshrn.u16 d0, q0, #5 + vrshrn.u16 d1, q1, #5 + vst1.8 {q0}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_c_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r10, #4 + mov r1, r2 +1: + bl patch_h_down_c_4x4_8 + bl patch_h_down_c_4x4_8_continue + bl patch_h_down_c_4x4_8_continue + bl patch_h_down_c_4x4_8_continue + + add r2, r1, #4*2 @ restore r2, but 4 rows further down left + add r1, r1, #4*2 + mov r6, r4 + sub 
r0, #32 + subs r10, #1 + add r0, r0, r3, lsl #2 + bne 1b + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + vmov.i8 d6, #1<<2 +1: + push {r2, r10} + bl patch_h_up_c_4x4_8 + bl patch_h_up_c_4x4_8_continue + bl patch_h_up_c_4x4_8_continue + bl patch_h_up_c_4x4_8_continue + pop {r2, r10} + + vmov r8, s12 + sub r0, #32 + add r2, #8 + add r0, r0, r3, lsl #2 + sub r10, r10, r7, lsl #2 + vshr.u8 d6, #1 + teq r8, #0 + bne 1b + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.8 {q0-q1}, [r1] + sub r9, r2, #2 + rsb r12, r6, #32 + ldrh r7, [r7] + mov r8, #-128 + vdup.8 d18, r6 + vdup.8 d19, r12 + mov r5, #16 +1: + vld1.16 {d17[3]}, [r9] + add r8, r7 + vmov q2, q0 + vmov q3, q1 + asr r9, r8, #8 + vext.8 q1, q0, q1, #14 + add r9, r2, r9, lsl #1 + vext.8 q0, q8, q0, #14 +2: + vmull.u8 q10, d4, d19 + subs r12, r4 + vmlal.u8 q10, d0, d18 + it cc + addcc r12, #32 + vmull.u8 q11, d5, d19 + rsb r6, r12, #32 + vmlal.u8 q11, d1, d18 + sub r5, #1 + vmull.u8 q12, d6, d19 + teq r5, #0 + vmlal.u8 q12, d2, d18 + vmull.u8 q13, d7, d19 + vmlal.u8 q13, d3, d18 + vdup.8 d18, r6 + vdup.8 d19, r12 + vrshrn.u16 d20, q10, #5 + vrshrn.u16 d21, q11, #5 + vrshrn.u16 d22, q12, #5 + vrshrn.u16 d23, q13, #5 + vst1.8 {q10-q11}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + add r5, r1, #32 + vld1.8 {q0-q1}, [r1]! 
+ rsb r12, r6, #32 + vld1.16 {d16[0]}, [r5] + mov r5, #16 + vdup.8 d18, r6 + vdup.8 d19, r12 +1: + vmov q2, q0 + add r1, #2 + vmov q3, q1 + vext.8 q0, q0, q1, #2 + vext.8 q1, q1, q8, #2 +2: + vmull.u8 q10, d0, d18 + subs r12, r4 + vmlal.u8 q10, d4, d19 + it cc + addcc r12, #32 + vmull.u8 q11, d1, d18 + rsb r6, r12, #32 + vmlal.u8 q11, d5, d19 + sub r5, #1 + vmull.u8 q12, d2, d18 + teq r5, #0 + vmlal.u8 q12, d6, d19 + vmull.u8 q13, d3, d18 + vmlal.u8 q13, d7, d19 + vld1.16 {d16[0]}, [r1] + vdup.8 d18, r6 + vdup.8 d19, r12 + vrshrn.u16 d20, q10, #5 + vrshrn.u16 d21, q11, #5 + vrshrn.u16 d22, q12, #5 + vrshrn.u16 d23, q13, #5 + vst1.8 {q10-q11}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +endfunc + +@------------------------------------------------------------------------------ +@ Data + + .text + .balign 64 +angle_2: + .byte 32 + .byte 26, 21, 17, 13, 9, 5, 2, 0 + @ Sign inverted from standards table + .byte 2, 5, 9, 13, 17, 21, 26, 32 + .byte 26, 21, 17, 13, 9, 5, 2, 0 + @ Standard sign + .byte 2, 5, 9, 13, 17, 21, 26, 32 + + .balign 2 + + @ Sign inverted from standards table +inv_angle: + .short 4096, 1638, 910, 630, 482, 390, 315 + .short 256 + .short 315, 390, 482, 630, 910, 1638, 4096 + +@------------------------------------------------------------------------------ +@ +@ 10 bit fns +@ Should work for 9 & 11 bit as there is no actual bit-depth specific code +@ but runs out of register width for 12+ bit + + .text + .balign 64 + +patch_h_down_4x4_10: + ldrd r8, r9, [r2] @ Left + rsb r12, r6, #32 + vmov d0, r8, r9 + vdup.16 d3, r6 + lsr r8, #16 + vdup.16 d2, r12 + orr r8, r8, r9, lsl #16 + ldr r9, [r2, #6]! + vmov d1, r8, r9 + // drop through... 
+patch_h_down_4x4_10_continue: + mov r5, #4 +1: + subs r12, r4 + vmul.u16 d4, d0, d2 + it mi + addmi r12, #32 + vmla.u16 d4, d1, d3 + rsb r6, r12, #32 + vext.16 q8, q8, q9, #4 + it mi + lsrmi r7, r8, #16 + vmov d18, d19 + it mi + vmovmi d0, r8, r9 + vdup.16 d2, r12 + it mi + orrmi r8, r7, r9, lsl #16 + vrshr.u16 d19, d4, #5 + itt mi + ldrmi r9, [r2, #2]! + vmovmi d1, r8, r9 + subs r5, #1 + vdup.16 d3, r6 + bne 1b + // drop through... +store_tran_4x4_10: + vzip.16 d16, d17 + add r6, r0, r3 + vzip.16 d18, d19 + lsl r3, #1 + vzip.32 q8, q9 + add r5, r0, r3 + vst1.16 {d16}, [r0]! + vst1.16 {d17}, [r6], r3 + vst1.16 {d18}, [r5] + asr r3, #1 + vst1.16 {d19}, [r6] + + bx lr + +patch_h_up_4x4_10: + ldrd r8, r9, [r2] + rsb r6, r4, #32 + vmov d0, r8, r9 + vdup.16 d3, r4 + lsr r11, r8, #16 + vdup.16 d2, r6 + ldr r8, [r2, #-2]! + orr r9, r11, r9, lsl #16 + vmov d1, r8, r9 + mov r12, r4 + vmul.u16 d4, d0, d2 + vmla.u16 d4, d1, d3 +patch_h_up_4x4_10_continue: + mov r5, #4 +1: + add r12, r4 + cmp r12, #33 + it cs + addcs r10, r7 + mov r11, #0 + itt cs + subcs r12, #32 + tstcs r10, #1<<31 + rsb r6, r12, #32 + it eq + asreq r11, r10, #7 + it cs + vmovcs d0, r8, r9 + it eq + biceq r11, #1 + vdup.16 d2, r6 + it cs + lsrcs r6, r8, #16 + vdup.16 d3, r12 + vext.16 q8, q8, q9, #4 + itt cs + orrcs r9, r6, r9, lsl #16 + ldrhcs r11, [r1, r11] + vmov d18, d19 + it hi + ldrhhi r11, [r2, #-2]! 
+ vrshr.u16 d19, d4, #5 + itt cs + orrcs r8, r11, r8, lsl #16 + vmovcs d1, r8, r9 + vmul.u16 d4, d0, d2 + subs r5, #1 + vmla.u16 d4, d1, d3 + bne 1b + + b store_tran_4x4_10 + + +@ ff_hevc_rpi_pred_angular_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_4_neon_10, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + bl patch_h_down_4x4_10 + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + bl patch_h_up_4x4_10 + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + ldrh lr, [r2, #-2] @ Top-left + ldrh r7, [r7] + vmov d0, r8, r9 + lsl r9, r9, #16 + vdup.16 d2, r12 + orr r9, r9, r8, lsr #16 + orr r8, lr, r8, lsl #16 + vmov d1, r8, r9 + sub r1, r7, #128 + mov r5, #3 +1: + sel lr, lr, lr @ force pipeline 0 on Cortex-A53 + vdup.16 d3, r6 + vmul.u16 d4, d0, d2 + subs r12, r12, r4 + vmla.u16 d4, d1, d3 + itttt mi + addmi lr, r2, r1, asr #7 + bicmi lr, #1 + addmi r12, r12, #32 + vmovmi d0, r8, r9 + rsb r6, r12, #32 + itt mi + lslmi r9, r9, #16 + ldrhmi lr, [lr] + vdup.16 d2, r12 + vrshr.u16 d4, d4, #5 + itttt mi + orrmi r9, r9, r8, lsr #16 + orrmi r8, lr, r8, lsl #16 + vmovmi d1, r8, r9 + addmi r1, r1, r7 + subs r5, r5, #1 + vst1.16 {d4}, [r0], r3 + bne 1b + + vdup.16 d3, r6 + nop @ force next insn into pipeline 0 to enable + vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 + vmla.u16 d4, d1, d3 + vrshr.u16 d4, d4, #5 + vst1.16 {d4}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + ldrd r8, r9, [r1] @ Top + rsb r12, r6, #32 + vmov d0, r8, r9 
+ vdup.16 d3, r6 + lsr r8, #16 + vdup.16 d2, r12 + orr r8, r8, r9, lsl #16 + ldr r9, [r1, #6]! + vmov d1, r8, r9 + mov r5, #3 +1: + vmul.u16 d4, d0, d2 + subs r12, r4 + vmla.u16 d4, d1, d3 + it mi + addmi r12, #32 + rsb r6, r12, #32 + itt mi + vmovmi d0, r8, r9 + lsrmi r8, #16 + vdup.16 d2, r12 + itt mi + orrmi r8, r8, r9, lsl #16 + ldrmi r9, [r1, #2]! + vrshr.u16 d4, d4, #5 + it mi + vmovmi d1, r8, r9 + vdup.16 d3, r6 + subs r5, #1 + vst1.16 {d4}, [r0], r3 + bne 1b + + vmul.u16 d4, d0, d2 + vmla.u16 d4, d1, d3 + vrshr.u16 d4, d4, #5 + vst1.16 {d4}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_8_neon_10, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r1, r2 @ save r2 - r1 unused by patch_down + + bl patch_h_down_4x4_10 + bl patch_h_down_4x4_10_continue + + add r2, r1, #4*2 @ restore r2, but 4 rows further down left + sub r0, #16 + mov r6, r4 + add r0, r0, r3, lsl #2 + + bl patch_h_down_4x4_10 + bl patch_h_down_4x4_10_continue + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + + push {r2} + bl patch_h_up_4x4_10 + bl patch_h_up_4x4_10_continue + pop {r2} + + sub r0, #16 + mov r10, #-128 + add r2, #8 + add r0, r0, r3, lsl #2 + sub r10, r10, r7, lsl #2 + + bl patch_h_up_4x4_10 + bl patch_h_up_4x4_10_continue + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.16 {q9}, [r1] + sub r1, r2, #2 + rsb r12, r6, #32 + ldrh r7, [r7] + vdup.16 q2, r6 + vext.16 q8, q9, q9, #7 + sub r8, r7, #128 + vld1.16 {d16[0]}, [r1] + vdup.16 q3, r12 + mov r5, #7 
+1: + vmul.u16 q0, q9, q3 + subs r12, r4 + vmla.u16 q0, q8, q2 + ittt cc + asrcc r1, r8, #8 + addcc r12, #32 + addcc r1, r2, r1, lsl #1 + vext.16 q10, q8, q8, #7 + rsb r6, r12, #32 + vmov q11, q8 + sub r5, #1 + vrshr.u16 q0, q0, #5 + it cc + addcc r8, r7 + vld1.16 {d20[0]}, [r1] + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmul.u16 q0, q11, q3 + subs r12, r4 + vmla.u16 q0, q10, q2 + ittt cc + asrcc r1, r8, #8 + addcc r12, #32 + addcc r1, r2, r1, lsl #1 + vext.16 q8, q10, q10, #7 + rsb r6, r12, #32 + vmov q9, q10 + sub r5, #1 + vrshr.u16 q0, q0, #5 + it cc + addcc r8, r7 + vld1.16 {d16[0]}, [r1] + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmul.u16 q0, q11, q3 + vmla.u16 q0, q10, q2 + vrshr.u16 q0, q0, #5 + vst1.16 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmul.u16 q0, q9, q3 + vmla.u16 q0, q8, q2 + vrshr.u16 q0, q0, #5 + vst1.16 {q0}, [r0] + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + vld1.16 {q9}, [r1]! + rsb r12, r6, #32 + vdup.16 q2, r6 + vdup.16 q3, r12 + vext.16 q8, q9, q9, #1 + vld1.16 {d17[3]}, [r1]! 
+ mov r5, #7 +1: + vmul.u16 q0, q8, q2 + subs r12, r4 + vmla.u16 q0, q9, q3 + it cc + addcc r12, #32 + vext.16 q10, q8, q8, #1 + rsb r6, r12, #32 + vld1.16 {d21[3]}, [r1] + sub r5, #1 + vmov q11, q8 + teq r5, #0 + vrshr.u16 q0, q0, #5 + it cc + addcc r1, #2 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmul.u16 q0, q10, q2 + subs r12, r4 + vmla.u16 q0, q11, q3 + it cc + addcc r12, #32 + vext.16 q8, q10, q10, #1 + rsb r6, r12, #32 + vld1.16 {d17[3]}, [r1] + sub r5, #1 + vmov q9, q10 + teq r5, #0 + vrshr.u16 q0, q0, #5 + it cc + addcc r1, #2 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmul.u16 q0, q10, q2 + vmla.u16 q0, q11, q3 + vrshr.u16 q0, q0, #5 + vst1.16 {q0}, [r0] + + pop {r4-r11, pc} +4: + bcc 3b +5: + vmul.u16 q0, q8, q2 + vmla.u16 q0, q9, q3 + vrshr.u16 q0, q0, #5 + vst1.16 {q0}, [r0] + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_16_neon_10, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + mov r10, #4 + mov r1, r2 +1: + bl patch_h_down_4x4_10 + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + + add r2, r1, #4*2 @ restore r2, but 4 rows further down left + add r1, r1, #4*2 + mov r6, r4 + sub r0, #32 + subs r10, #1 + add r0, r0, r3, lsl #2 + bne 1b + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r10, #-128 + vmov.i8 d6, #1<<2 +1: + push {r2, r10} + bl patch_h_up_4x4_10 + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + 
bl patch_h_up_4x4_10_continue + pop {r2, r10} + + vmov r8, s12 + sub r0, #32 + add r2, #8 + add r0, r0, r3, lsl #2 + sub r10, r10, r7, lsl #2 + vshr.u8 d6, #1 + teq r8, #0 + bne 1b + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + vld1.16 {q0-q1}, [r1] + sub r9, r2, #2 + rsb r12, r6, #32 + ldrh r7, [r7] + mov r8, #-128 + vdup.16 q9, r6 + vdup.16 q10, r12 + mov r5, #16 +1: + vld1.16 {d17[3]}, [r9] + add r8, r7 + vmov q2, q0 + vmov q3, q1 + asr r9, r8, #8 + vext.16 q1, q0, q1, #7 + add r9, r2, r9, lsl #1 + vext.16 q0, q8, q0, #7 +2: + vmul.u16 q11, q2, q10 + subs r12, r4 + vmla.u16 q11, q0, q9 + it cc + addcc r12, #32 + vmul.u16 q12, q3, q10 + rsb r6, r12, #32 + vmla.u16 q12, q1, q9 + sub r5, #1 + teq r5, #0 + vdup.16 q9, r6 + vdup.16 q10, r12 + vrshr.u16 q11, q11, #5 + vrshr.u16 q12, q12, #5 + vst1.16 {q11-q12}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + add r5, r1, #32 + vld1.16 {q0-q1}, [r1]! + rsb r12, r6, #32 + vld1.16 {d16[0]}, [r5] + mov r5, #16 + vdup.16 q9, r6 + vdup.16 q10, r12 +1: + vmov q2, q0 + add r1, #2 + vmov q3, q1 + vext.16 q0, q0, q1, #1 + vext.16 q1, q1, q8, #1 +2: + vmul.u16 q11, q0, q9 + subs r12, r4 + vmla.u16 q11, q2, q10 + it cc + addcc r12, #32 + vmul.u16 q12, q1, q9 + rsb r6, r12, #32 + vmla.u16 q12, q3, q10 + sub r5, #1 + vld1.16 {d16[0]}, [r1] + teq r5, #0 + vdup.16 q9, r6 + vdup.16 q10, r12 + vrshr.u16 q11, q11, #5 + vrshr.u16 q12, q12, #5 + vst1.16 {q11-q12}, [r0], r3 + bhi 2b + bne 1b + + pop {r4-r11, pc} + +endfunc + + +@ ff_hevc_rpi_pred_angular_32_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_32_neon_10, export=1 + ldr r12, [sp] + push {r4-r11, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #1 + vpush {d8} + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + 
bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + add sp, #8 + mov r10, #8 + mov r1, r2 +1: + bl patch_h_down_4x4_10 + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + bl patch_h_down_4x4_10_continue + + add r2, r1, #4*2 @ restore r2, but 4 rows further down left + add r1, r1, #4*2 + mov r6, r4 + sub r0, #64 + subs r10, #1 + add r0, r0, r3, lsl #2 + bne 1b + + pop {r4-r11, pc} + +@ Up of Horizontal - works down up +10: + add sp, #8 + ldrh r7, [r7] + mov r10, #-128 + vmov.i8 d6, #1<<6 +1: + push {r2, r10} + bl patch_h_up_4x4_10 + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + bl patch_h_up_4x4_10_continue + pop {r2, r10} + + vmov r8, s12 + sub r0, #64 + add r2, #8 + add r0, r0, r3, lsl #2 + sub r10, r10, r7, lsl #2 + vshr.u8 d6, #1 + teq r8, #0 + bne 1b + + pop {r4-r11, pc} + +@ Left of vertical - works down left +18: + add r5, r1, #32 + vld1.16 {q1-q2}, [r1] + rsb r12, r6, r6, lsl #16 + vld1.16 {q3-q4}, [r5] + sub r9, r2, #2 + rsb r4, r12, #0 + rsb r12, r12, #32 << 16 + ldrh r7, [r7] + mov r8, #-128 + vmov d0, d9 + vmov s2, r12 + add r10, r0, #32 + mov r5, #32 +1: + vld1.16 {d1[3]}, [r9] + add r8, r7 + vmov q11, q4 + vmov q10, q3 + asr r9, r8, #8 + vmov q9, q2 + add r9, r2, r9, lsl #1 + vmov q8, q1 + vext.16 q4, q3, q4, #7 + vext.16 q3, q2, q3, #7 + vext.16 q2, q1, q2, #7 + vext.16 q1, q0, q1, #7 +2: + vmul.u16 q12, q8, d1[1] + adds r12, r4 + vmla.u16 q12, q1, d1[0] + it cc + addcc r12, #32 << 16 + vmul.u16 q13, q9, d1[1] + it cc + subcc r12, #32 + vmla.u16 q13, q2, d1[0] + sub r5, #1 + vmul.u16 q14, q10, d1[1] + teq r5, #0 + vmla.u16 q14, q3, d1[0] + vmul.u16 q15, q11, d1[1] + vmla.u16 q15, q4, d1[0] + vmov s2, r12 + 
vrshr.u16 q12, q12, #5 + vrshr.u16 q13, q13, #5 + vrshr.u16 q14, q14, #5 + vrshr.u16 q15, q15, #5 + vst1.16 {q12-q13}, [r0], r3 + vst1.16 {q14-q15}, [r10], r3 + bhi 2b + bne 1b + + vpop {d8} + vmov d9, d0 + pop {r4-r11, pc} + +@ Right of vertical - works along top - left unused +26: + add r5, r1, #32 + vld1.16 {q1-q2}, [r1] + rsb r12, r6, r6, lsl #16 + vld1.16 {q3-q4}, [r5] + add r1, r1, #64 + rsb r4, r12, #0 + rsb r12, r12, #32 << 16 + vmov d1, d9 + vmov s1, r12 + add r10, r0, #32 + mov r5, #32 +1: + vld1.16 {d0[0]}, [r1]! + vmov q8, q1 + vmov q9, q2 + vmov q10, q3 + vmov q11, q4 + vext.16 q1, q1, q2, #1 + vext.16 q2, q2, q3, #1 + vext.16 q3, q3, q4, #1 + vext.16 q4, q4, q0, #1 +2: + vmul.u16 q12, q1, d0[2] + adds r12, r4 + vmla.u16 q12, q8, d0[3] + it cc + addcc r12, #32 << 16 + vmul.u16 q13, q2, d0[2] + it cc + subcc r12, #32 + vmla.u16 q13, q9, d0[3] + sub r5, #1 + vmul.u16 q14, q3, d0[2] + teq r5, #0 + vmla.u16 q14, q10, d0[3] + vmul.u16 q15, q4, d0[2] + vmla.u16 q15, q11, d0[3] + vmov s1, r12 + vrshr.u16 q12, q12, #5 + vrshr.u16 q13, q13, #5 + vrshr.u16 q14, q14, #5 + vrshr.u16 q15, q15, #5 + vst1.16 {q12-q13}, [r0], r3 + vst1.16 {q14-q15}, [r10], r3 + bhi 2b + bne 1b + + vpop {d8} + vmov d9, d1 + pop {r4-r11, pc} + +endfunc + + + +@ Generate 4x4 chroma patch +@ +@ In (const) +@ r1 Up ptr (_up only) +@ r3 Out stride +@ r4 Angle add +@ r7 Inv angle (_up only) +@ +@ In/Out (updated) +@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) +@ r2 Left ptr - updated +@ r6 Angle frac (init to r4 + 32) +@ r8 Inv angle accumulator +@ q2 Cur Line - load before 1st call for down - set by _up +@ q8 Cur Line - load before 1st call for up - set by _down +@ +@ Temps +@ r5 Loop counter +@ r12 +@ d0, q1, q12-q15 + +patch_h_down_c_4x4_10: + vld1.16 {q12}, [r2]! + rsb r12, r6, #32 + vdup.16 q2, r6 + vdup.16 q3, r12 + mov r5, #4 +1: + vmov q13, q12 + vext.16 q12, q12, q12, #2 + vld1.32 {d25[1]}, [r2]! 
+patch_h_down_c_4x4_10_continue: +2: + vmov q8, q9 + subs r12, r4 + vmul.u16 q0, q13, q3 + it cc + addcc r12, #32 + vmla.u16 q0, q12, q2 + rsb r6, r12, #32 + vmov q9, q10 + sub r5, #1 + vmov q10, q11 + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vrshr.u16 q11, q0, #5 + bhi 2b + bne 1b + + bcs 3f + vmov q13, q12 + vext.16 q12, q12, q12, #2 + vld1.32 {d25[1]}, [r2]! +3: + +store_tran_c_4x4_10: +T add r6, r0, r3 + vzip.32 q8, q10 +A add r6, r0, r3 +T lsl r3, #1 + vzip.32 q9, q11 +A add r5, r0, r3, lsl #1 +T add r5, r0, r3 + vst2.32 {d16,d18}, [r0]! +A lsl r3, #1 + vst2.32 {d17,d19}, [r6], r3 + asr r3, #1 + vst2.32 {d20,d22}, [r5] + mov r5, #4 + vst2.32 {d21,d23}, [r6] + bx lr + +patch_h_up_c_4x4_10: + vld1.16 {q1}, [r2] + rsb r12, r6, #32 + vdup.16 q2, r6 + vdup.16 q3, r12 + mov r5, #4 +1: + adds r8, r7 + vmov q12, q1 + it mi + ldrmi r6, [r2, #-4]! + vext.16 q1, q1, q1, #6 + itt pl + asrpl r6, r8, #8 + ldrpl r6, [r1, r6, lsl #2] + vmov s4, r6 +patch_h_up_c_4x4_10_continue: +2: + vmov q8, q9 + subs r12, r4 + vmul.u16 q0, q12, q3 + it cc + addcc r12, #32 + vmla.u16 q0, q1, q2 + rsb r6, r12, #32 + vmov q9, q10 + sub r5, #1 + vmov q10, q11 + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vrshr.u16 q11, q0, #5 + bhi 2b + bne 1b + + bcs store_tran_c_4x4_10 + adds r8, r7 + vmov q12, q1 + it mi + ldrmi r6, [r2, #-4]! 
+ vext.16 q1, q1, q1, #6 + itt pl + asrpl r6, r8, #8 + ldrpl r6, [r1, r6, lsl #2] + vmov s4, r6 + b store_tran_c_4x4_10 + + +@ ff_hevc_rpi_pred_angular_c_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride [r3] +@ unsigned int mode [sp, #0] 2..34 + +function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 + ldr r12, [sp] + push {r4-r8, lr} + ADRT r4, angle_2 - 2 + ADRT r7, inv_angle - 11*2 + add r7, r7, r12, lsl #1 + lsl r3, #2 + ldrsb r6, [r4, r12] + cmp r12, #26 + ldrsb r4, [r4, r12] + bge 26f + cmp r12, #18 + bge 18f + cmp r12, #10 + bge 10f + +@ Down of Horizontal - works down left + bl patch_h_down_c_4x4_10 + pop {r4-r8, pc} + +@ Up of Horizontal - works down up +10: + ldrh r7, [r7] + mov r8, #-128 + sub r8, r7 + bl patch_h_up_c_4x4_10 + pop {r4-r8, pc} + +@ Left of vertical - works down left +18: + vld1.16 {q9}, [r1] + sub r1, r2, #4 + rsb r12, r6, #32 + ldrh r7, [r7] + vdup.16 q2, r6 + vext.16 q8, q9, q9, #6 + sub r8, r7, #128 + vld1.32 {d16[0]}, [r1] + vdup.16 q3, r12 + mov r5, #3 +1: + vmul.u16 q0, q9, q3 + subs r12, r4 + vmla.u16 q0, q8, q2 + ittt cc + asrcc r1, r8, #8 + addcc r12, #32 + addcc r1, r2, r1, lsl #2 + vext.16 q10, q8, q8, #6 + rsb r6, r12, #32 + vmov q11, q8 + sub r5, #1 + vrshr.u16 q0, q0, #5 + it cc + addcc r8, r7 + vld1.32 {d20[0]}, [r1] + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 1b + beq 4f +2: + vmul.u16 q0, q11, q3 + subs r12, r4 + vmla.u16 q0, q10, q2 + ittt cc + asrcc r1, r8, #8 + addcc r12, #32 + addcc r1, r2, r1, lsl #2 + vext.16 q8, q10, q10, #6 + rsb r6, r12, #32 + vmov q9, q10 + sub r5, #1 + vrshr.u16 q0, q0, #5 + it cc + addcc r8, r7 + vld1.32 {d16[0]}, [r1] + teq r5, #0 + vdup.16 q2, r6 + vdup.16 q3, r12 + vst1.16 {q0}, [r0], r3 + bhi 2b + bne 1b + bcc 5f +3: + vmul.u16 q0, q11, q3 + vmla.u16 q0, q10, q2 + vrshr.u16 q0, q0, #5 + vst1.16 {q0}, [r0] + + pop {r4-r8, pc} +4: + bcc 3b +5: + vmul.u16 q0, q9, q3 + vmla.u16 q0, q8, q2 + 
vrshr.u16   q0, q0, #5
+        vst1.16     {q0}, [r0]
+
+        pop         {r4-r8, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+        vld1.16     {q9}, [r1]!
+        rsb         r12, r6, #32
+        vdup.16     q2, r6
+        vdup.16     q3, r12
+        vext.16     q8, q9, q9, #2
+        vld1.32     {d17[1]}, [r1]!
+        mov         r5, #3
+1:
+        vmul.u16    q0, q8, q2
+        subs        r12, r4
+        vmla.u16    q0, q9, q3
+        it          cc
+        addcc       r12, #32
+        vext.16     q10, q8, q8, #2
+        rsb         r6, r12, #32
+        vld1.32     {d21[1]}, [r1]
+        sub         r5, #1
+        vmov        q11, q8
+        teq         r5, #0
+        vrshr.u16   q0, q0, #5
+        it          cc
+        addcc       r1, #4
+        vdup.16     q2, r6
+        vdup.16     q3, r12
+        vst1.16     {q0}, [r0], r3
+        bhi         1b
+        beq         4f
+2:
+        vmul.u16    q0, q10, q2
+        subs        r12, r4
+        vmla.u16    q0, q11, q3
+        it          cc
+        addcc       r12, #32
+        vext.16     q8, q10, q10, #2
+        rsb         r6, r12, #32
+        vld1.32     {d17[1]}, [r1]
+        sub         r5, #1
+        vmov        q9, q10
+        teq         r5, #0
+        vrshr.u16   q0, q0, #5
+        it          cc
+        addcc       r1, #4
+        vdup.16     q2, r6
+        vdup.16     q3, r12
+        vst1.16     {q0}, [r0], r3
+        bhi         2b
+        bne         1b
+        bcc         5f
+3:
+        vmul.u16    q0, q10, q2
+        vmla.u16    q0, q11, q3
+        vrshr.u16   q0, q0, #5
+        vst1.16     {q0}, [r0]
+
+        pop         {r4-r8, pc}
+4:
+        bcc         3b
+5:
+        vmul.u16    q0, q8, q2
+        vmla.u16    q0, q9, q3
+        vrshr.u16   q0, q0, #5
+        vst1.16     {q0}, [r0]
+
+        pop         {r4-r8, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_8_neon_10
+@       uint8_t *_src,          [r0]
+@       const uint8_t *_top,    [r1]
+@       const uint8_t *_left,   [r2]
+@       ptrdiff_t stride        [r3]
+@       unsigned int mode       [sp, #0]  2..34
+@ NOTE: only r4-r8 and lr are saved here, so no path may write r9-r11.
+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
+        ldr         r12, [sp]               @ r12 = mode (read before the push moves sp)
+        push        {r4-r8, lr}
+        ADRT        r4, angle_2 - 2
+        ADRT        r7, inv_angle - 11*2
+        add         r7, r7, r12, lsl #1     @ r7 -> inv_angle[mode - 11]
+        lsl         r3, #2                  @ stride * 4
+        ldrsb       r6, [r4, r12]           @ r6 = angle frac (initial)
+        cmp         r12, #26
+        ldrsb       r4, [r4, r12]           @ r4 = angle add (same table entry)
+        bge         26f                     @ mode >= 26
+        cmp         r12, #18
+        bge         18f                     @ 18 <= mode < 26
+        cmp         r12, #10
+        bge         10f                     @ 10 <= mode < 18
+
+@ Down of Horizontal - works down left
+        mov         r1, r2                  @ save r2 - r1 unused by patch_down
+
+        bl          patch_h_down_c_4x4_10
+        bl          patch_h_down_c_4x4_10_continue
+
+        add         r2, r1, #4*4            @ restore r2, but 4 rows further down left
+        sub         r0, #32
+        mov         r6, r4
+        add         r0, r0, r3, lsl #2
+
+        bl          patch_h_down_c_4x4_10
+        bl          patch_h_down_c_4x4_10_continue
+
+        pop         {r4-r8, pc}
+
+@ Up of Horizontal - works down up
+10:
+        ldrh        r7, [r7]
+        mov         r8, #-128
+        sub         r8, r7
+
+        push        {r2, r8}                @ the patch helpers update r2 and r8
+        bl          patch_h_up_c_4x4_10
+        bl          patch_h_up_c_4x4_10_continue
+        pop         {r2, r8}
+
+        sub         r0, #32
+        mov         r6, r4
+        add         r2, #16
+        sub         r8, r8, r7, lsl #2
+        add         r0, r0, r3, lsl #2
+
+        bl          patch_h_up_c_4x4_10
+        bl          patch_h_up_c_4x4_10_continue
+
+        pop         {r4-r8, pc}
+
+@ Left of vertical - works down left
+18:
+        vld1.16     {q0-q1}, [r1]           @ top row; r1 is not needed as top ptr after this
+        sub         r1, r2, #4              @ r1 = left - 4; reuse r1 (as pred_angular_c_4 does) - r9 is callee-saved and not pushed
+        rsb         r12, r6, #32
+        ldrh        r7, [r7]
+        mov         r8, #-128
+        vdup.16     q9, r6
+        vdup.16     q10, r12
+        mov         r5, #8
+1:
+        vld1.32     {d17[1]}, [r1]          @ next pel pair to shift in on the left
+        add         r8, r7
+        vmov        q2, q0
+        vmov        q3, q1
+        asr         r1, r8, #8
+        vext.16     q1, q0, q1, #6
+        add         r1, r2, r1, lsl #2      @ r1 = left + (r8 >> 8) * 4 for the next shift
+        vext.16     q0, q8, q0, #6
+2:
+        vmul.u16    q11, q2, q10
+        subs        r12, r4
+        vmla.u16    q11, q0, q9
+        it          cc
+        addcc       r12, #32
+        vmul.u16    q12, q3, q10
+        rsb         r6, r12, #32
+        vmla.u16    q12, q1, q9
+        sub         r5, #1
+        teq         r5, #0
+        vdup.16     q9, r6
+        vdup.16     q10, r12
+        vrshr.u16   q11, q11, #5
+        vrshr.u16   q12, q12, #5
+        vst1.16     {q11-q12}, [r0], r3
+        bhi         2b
+        bne         1b
+
+        pop         {r4-r8, pc}
+
+@ Right of vertical - works along top - left unused
+26:
+        add         r5, r1, #32
+        vld1.16     {q0-q1}, [r1]!
+        rsb         r12, r6, #32
+        vld1.32     {d16[0]}, [r5]
+        mov         r5, #8
+        vdup.16     q9, r6
+        vdup.16     q10, r12
+1:
+        vmov        q2, q0
+        add         r1, #4
+        vmov        q3, q1
+        vext.16     q0, q0, q1, #2
+        vext.16     q1, q1, q8, #2
+2:
+        vmul.u16    q11, q0, q9
+        subs        r12, r4
+        vmla.u16    q11, q2, q10
+        it          cc
+        addcc       r12, #32
+        vmul.u16    q12, q1, q9
+        rsb         r6, r12, #32
+        vmla.u16    q12, q3, q10
+        sub         r5, #1
+        vld1.32     {d16[0]}, [r1]
+        teq         r5, #0
+        vdup.16     q9, r6
+        vdup.16     q10, r12
+        vrshr.u16   q11, q11, #5
+        vrshr.u16   q12, q12, #5
+        vst1.16     {q11-q12}, [r0], r3
+        bhi         2b
+        bne         1b
+
+        pop         {r4-r8, pc}
+
+endfunc
+
+
+@ ff_hevc_rpi_pred_angular_c_16_neon_10
+@       uint8_t *_src,          [r0]
+@       const uint8_t *_top,    [r1]
+@       const uint8_t *_left,   [r2]
+@       ptrdiff_t stride        [r3]
+@       unsigned int mode       [sp, #0]  2..34
+
+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
+        ldr         r12, [sp]
+        push        {r4-r10, lr}
+        ADRT        r4, angle_2 - 2
+        ADRT        r7, inv_angle - 11*2
+        add         r7, r7, r12, lsl #1
+        lsl         r3, #2
+        vpush       {d8}
+        ldrsb       r6, [r4, r12]
+        cmp         r12, #26
+        ldrsb       r4, [r4, r12]
+        bge         26f
+        cmp         r12, #18
+        bge         18f
+        cmp         r12, #10
+        bge         10f
+
+@ Down of Horizontal - works down left
+        add         sp, #8
+        mov         r10, #4
+        mov         r1, r2
+1:
+        bl          patch_h_down_c_4x4_10
+        bl          patch_h_down_c_4x4_10_continue
+        bl          patch_h_down_c_4x4_10_continue
+        bl          patch_h_down_c_4x4_10_continue
+
+        add         r2, r1, #4*4            @ restore r2, but 4 rows further down left
+        add         r1, r1, #4*4
+        mov         r6, r4
+        sub         r0, #64
+        subs        r10, #1
+        add         r0, r0, r3, lsl #2
+        bne         1b
+
+        pop         {r4-r10, pc}
+
+@ Up of Horizontal - works down up
+10:
+        add         sp, #8
+        mov         r10, #4
+        ldrh        r7, [r7]
+        mov         r8, #-128
+        sub         r8, r7
+2:
+        push        {r2, r8}
+        bl          patch_h_up_c_4x4_10
+        bl          patch_h_up_c_4x4_10_continue
+        bl          patch_h_up_c_4x4_10_continue
+        bl          patch_h_up_c_4x4_10_continue
+        pop         {r2, r8}
+
+        sub         r0, #64
+        mov         r6, r4
+        add         r2, #16
+        sub         r8, r8, r7, lsl #2
+        add         r0, r0, r3, lsl #2
+        subs        r10, #1
+        bne         2b
+
+        pop         {r4-r10, pc}
+
+@ Left of vertical - works down left
+18:
+        add         r5, r1, #32
+        vld1.16     {q1-q2}, [r1]
+        rsb         r12, r6, r6, lsl #16
+
vld1.16 {q3-q4}, [r5] + sub r9, r2, #4 + rsb r4, r12, #0 + rsb r12, r12, #32 << 16 + ldrh r7, [r7] + mov r8, #-128 + vmov d0, d9 + vmov s2, r12 + add r10, r0, #32 + mov r5, #16 +1: + vld1.32 {d1[1]}, [r9] + add r8, r7 + vmov q11, q4 + vmov q10, q3 + asr r9, r8, #8 + vmov q9, q2 + add r9, r2, r9, lsl #2 + vmov q8, q1 + vext.16 q4, q3, q4, #6 + vext.16 q3, q2, q3, #6 + vext.16 q2, q1, q2, #6 + vext.16 q1, q0, q1, #6 +2: + vmul.u16 q12, q8, d1[1] + adds r12, r4 + vmla.u16 q12, q1, d1[0] + it cc + addcc r12, #32 << 16 + vmul.u16 q13, q9, d1[1] + it cc + subcc r12, #32 + vmla.u16 q13, q2, d1[0] + sub r5, #1 + vmul.u16 q14, q10, d1[1] + teq r5, #0 + vmla.u16 q14, q3, d1[0] + vmul.u16 q15, q11, d1[1] + vmla.u16 q15, q4, d1[0] + vmov s2, r12 + vrshr.u16 q12, q12, #5 + vrshr.u16 q13, q13, #5 + vrshr.u16 q14, q14, #5 + vrshr.u16 q15, q15, #5 + vst1.16 {q12-q13}, [r0], r3 + vst1.16 {q14-q15}, [r10], r3 + bhi 2b + bne 1b + + vpop {d8} + vmov d9, d0 + pop {r4-r10, pc} + +@ Right of vertical - works along top - left unused +26: + add r5, r1, #32 + vld1.16 {q1-q2}, [r1] + rsb r12, r6, r6, lsl #16 + vld1.16 {q3-q4}, [r5] + add r1, r1, #64 + rsb r4, r12, #0 + rsb r12, r12, #32 << 16 + vmov d1, d9 + vmov s1, r12 + add r10, r0, #32 + mov r5, #16 +1: + vld1.32 {d0[0]}, [r1]! 
+ vmov q8, q1 + vmov q9, q2 + vmov q10, q3 + vmov q11, q4 + vext.16 q1, q1, q2, #2 + vext.16 q2, q2, q3, #2 + vext.16 q3, q3, q4, #2 + vext.16 q4, q4, q0, #2 +2: + vmul.u16 q12, q1, d0[2] + adds r12, r4 + vmla.u16 q12, q8, d0[3] + it cc + addcc r12, #32 << 16 + vmul.u16 q13, q2, d0[2] + it cc + subcc r12, #32 + vmla.u16 q13, q9, d0[3] + sub r5, #1 + vmul.u16 q14, q3, d0[2] + teq r5, #0 + vmla.u16 q14, q10, d0[3] + vmul.u16 q15, q4, d0[2] + vmla.u16 q15, q11, d0[3] + vmov s1, r12 + vrshr.u16 q12, q12, #5 + vrshr.u16 q13, q13, #5 + vrshr.u16 q14, q14, #5 + vrshr.u16 q15, q15, #5 + vst1.16 {q12-q13}, [r0], r3 + vst1.16 {q14-q15}, [r10], r3 + bhi 2b + bne 1b + + vpop {d8} + vmov d9, d1 + pop {r4-r10, pc} + +endfunc diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S new file mode 100644 index 0000000000..df8c1c25b9 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S @@ -0,0 +1,705 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox, Ben Avison +*/ + + +#include "libavutil/arm/asm.S" +#include "neon.S" + + +@ ff_hevc_rpi_pred_dc_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_4_neon_8, export=1 + + @ Average the els of top & left + ldr r2, [r2] + vld1.32 {d0[0]}, [r1] + mov r1, #2 + vmov s1, r2 + vmov s2, r2 + vmov.i16 q2, #3 + add r2, r0, r3 + vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] + lsl r3, #1 + vmovl.u8 q0, d0 + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) + vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ top_line[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vmov.i64 d7, #0xff + vpadd.i16 d6, d6 @ 1 (all the same) + vrshr.u16 d6, #3 + vmla.i16 q0, q2, d6[0] + vdup.8 d6, d6[0] + vrshrn.i16 d0, q0, #2 + + @ Store top line + vst1.32 {d0[0]}, [r0], r3 + + @ Store the rest + vshr.u64 d1, d0, #5*8 + vshr.u64 d2, d0, #6*8 + vshr.u64 d3, d0, #7*8 + vbif d1, d6, d7 + vbif d2, d6, d7 + vst1.32 {d1[0]}, [r2], r3 + vbif d3, d6, d7 + vst1.32 {d2[0]}, [r0] + vst1.32 {d3[0]}, [r2] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 + + @ Average the els of top & left + vld1.8 {d0}, [r1] + vld1.8 {d1}, [r2] +A add r2, r0, r3, lsl #1 +A lsl r3, #2 +T lsl r3, #1 +T add r2, r0, r3 +T lsl r3, #1 + vaddl.u8 q0, d0, d1 + vadd.i16 d0, d1 @ d0 has 2 val pairs + vpadd.i32 d2, d0, d0 @ This adds U & V separately + vpadd.i32 d3, d0, d0 + vrshrn.u16 d0, q1, #3 + + @ Store + vst1.8 {d0}, [r0], r3 + vst1.8 {d0}, [r2], r3 + vst1.8 {d0}, [r0] + vst1.8 {d0}, [r2] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_8_neon_8, export=1 + + @ Average the els of top & left + vld1.8 {d0}, [r1] + mov r1, #2 + vld1.8 {d16}, [r2] + vmov.i16 q2, #3 + vmov.i64 d7, #0xffff + vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] + vmovl.u8 q0, d0 + vadd.i16 d6, d2, d3 @ d6 has 4 vals + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ top_line[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vmov.i64 d7, #0xff + vmovl.u8 q1, d16 + vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) + vpadd.i16 d6, d6 @ 1 (all the same) + vrshr.u16 d6, #4 + vmla.i16 q1, q2, d6[0] + vmla.i16 q0, q2, d6[0] + vdup.8 d6, d6[0] + vrshrn.i16 d2, q1, #2 + vrshrn.i16 d0, q0, #2 + + @ Store top line + vst1.8 {d0}, [r0], r3 + + @ Store the rest + vshr.u64 d2, #8 + vbit d6, d2, d7 + vshr.u64 d2, #8 + vst1.8 {d6}, [r0], r3 + mov r1, #6 +1: + vbit d6, d2, d7 + vshr.u64 d2, #8 + vst1.8 {d6}, [r0], r3 + subs r1, #2 + vbit d6, d2, d7 + vshr.u64 d2, #8 + vst1.8 {d6}, [r0], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 + + @ Average the els of top & left + vld1.8 {q0}, [r1] + mov r1, #8 + vld1.8 {q1}, [r2] +T lsl r3, #1 + vaddl.u8 q0, d0, d1 +A add r2, r0, r3, lsl #1 +A lsl r3, #2 +T add r2, r0, r3 +T lsl r3, #1 + vaddl.u8 q1, d2, d3 + vadd.i16 q1, q0 + vadd.i16 d3, d2 @ d3 has 2 val pairs + vpadd.i32 d2, d3, d3 @ This add U & V separately + vpadd.i32 d3, d3, d3 + vrshrn.u16 d0, q1, #4 + vrshrn.u16 d1, q1, #4 + + @ Store +1: + vst1.8 {q0}, [r0], r3 + subs r1, #4 + vst1.8 {q0}, [r2], r3 + vst1.8 {q0}, [r0], r3 + vst1.8 {q0}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_16_neon_8, export=1 + + @ Average the els of top & left + vld1.8 {q8}, [r1] + mov r1, #2 + vld1.8 {q9}, [r2] + vaddl.u8 q10, d16, d17 + vaddl.u8 q11, d16, d18 + vaddl.u8 q0, d18, d19 + vmov.i16 q1, #3 + vadd.i16 q10, q0 + vmovl.u8 q0, d18 + vadd.i16 d20, d21 + vmov.i16 
d2[0], r1 @ 2, 3, 3, 3... + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ top_line[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vmovl.u8 q2, d16 + vmovl.u8 q9, d19 + vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) + vmov.i64 d7, #0xffff + vmovl.u8 q8, d17 + vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] + vmov.i64 d7, #0xff + vpadd.i16 d20, d20 @ 1 (all the same) + vrshr.u16 d21, d20, #5 + vrshr.u16 d20, d20, #5 + vmla.i16 q0, q10, d2[1] + vmla.i16 q9, q10, d2[1] + vmla.i16 q2, q10, q1 + vmla.i16 q8, q10, d2[1] + vdup.8 q1, d20[0] + vrshrn.i16 d0, q0, #2 + vrshrn.i16 d1, q9, #2 + vrshrn.i16 d4, q2, #2 + vrshrn.i16 d5, q8, #2 + vext.8 q0, q0, q0, #1 + + @ Store top line + vst1.8 {q2}, [r0], r3 + + @ Store the rest + mov r1, #15 +1: + vbit d2, d0, d7 + vext.8 q0, q0, q0, #1 + subs r1, #1 + vst1.8 {q1}, [r0], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 + + @ Average the els of top & left + vld1.8 {q0-q1}, [r1] + mov r1, #16 + vld1.8 {q2-q3}, [r2] +T lsl r3, #1 + vaddl.u8 q0, d0, d1 +A add r2, r0, r3, lsl #1 +T add r2, r0, r3 + vaddl.u8 q1, d2, d3 +A lsl r3, #2 +T lsl r3, #1 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q0, q1 + vadd.i16 q2, q3 + vadd.i16 q0, q2 + vadd.i16 d0, d1 @ d0 has 2 val pairs + vpadd.i32 d4, d0, d0 @ This adds U & V separately + vpadd.i32 d5, d0, d0 + vrshrn.u16 d0, q2, #5 + vrshrn.u16 d1, q2, #5 + vrshrn.u16 d2, q2, #5 + vrshrn.u16 d3, q2, #5 + + @ Store +1: + vst1.8 {q0-q1}, [r0], r3 + subs r1, #2 + vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_32_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_32_neon_8, export=1 + + @ Average the els of top & left + 
vld1.8 {q0-q1}, [r1] + mov r1, #32 + vld1.8 {q2-q3}, [r2] + add r2, r0, r3 + vaddl.u8 q0, d0, d1 + lsl r3, #1 + vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q0, q1 + vadd.i16 q2, q3 + vadd.i16 q0, q2 + vadd.i16 d0, d1 @ d0 has 4 vals + vpadd.i16 d0, d0 @ 2 (top & bottom the same) + vpadd.i16 d4, d0, d0 @ 1 (all the same) + vpadd.i16 d5, d0, d0 + vrshrn.u16 d0, q2, #6 + vrshrn.u16 d1, q2, #6 + vrshrn.u16 d2, q2, #6 + vrshrn.u16 d3, q2, #6 + + @ Store +1: + vst1.8 {q0-q1}, [r0], r3 + subs r1, #2 + vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ----------------------------------------------------------------------------- +@ +@ 10 Bit versions +@ +@ There is no actual bit depth dependency in this code except that our +@ intermediate results will overflow the 16 bits they are stored in +@ All there functions are good to 10 bits - with the worst case being +@ in dc_32 where we use all 16 bits. + + +@ ff_hevc_rpi_pred_dc_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_4_neon_10, export=1 + + @ Average the els of top & left + vld1.16 {d0}, [r1] + mov r1, #2 + vld1.16 {d1}, [r2] +T lsl r3, #1 + vmov.i16 q2, #3 +A add r2, r0, r3, lsl #1 +T add r2, r0, r3 + vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] +A lsl r3, #2 +T lsl r3, #1 + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
+ vmov.i64 d7, #0xffff + vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ top_line[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) + vpadd.i16 d6, d6 @ 1 (all the same) + vrshr.u16 d6, #3 + vmla.i16 q0, q2, d6[0] + vrshr.u16 q0, #2 + + @ Store top line + vst1.16 {d0}, [r0], r3 + + @ Store the rest + vshr.u64 d3, d1, #1*16 + vshr.u64 d4, d1, #2*16 + vshr.u64 d5, d1, #3*16 + vbif d3, d6, d7 + vbif d4, d6, d7 + vst1.16 {d3}, [r2], r3 + vbif d5, d6, d7 + vst1.16 {d4}, [r0] + vst1.16 {d5}, [r2] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] (In pels - needs * 4) + +function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 + + @ Average the els of top & left + vld1.8 {q0}, [r1] + vld1.8 {q1}, [r2] +A add r2, r0, r3, lsl #2 +A lsl r3, #3 +T lsl r3, #2 +T add r2, r0, r3 +T lsl r3, #1 + vadd.i16 q0, q1 + vadd.i16 d0, d1 @ d0 has 2 val pairs + vpadd.i32 d2, d0, d0 @ This adds U & V separately + vpadd.i32 d3, d0, d0 + vrshr.u16 q0, q1, #3 + + vst1.16 {q0}, [r0], r3 + vst1.16 {q0}, [r2], r3 + vst1.16 {q0}, [r0] + vst1.16 {q0}, [r2] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_8_neon_10, export=1 + + @ Average the els of top & left + vld1.16 {q0}, [r1] + mov r1, #2 + vld1.16 {q8}, [r2] +T lsl r3, #1 + vmov.i16 q2, #3 +A add r2, r0, r3, lsl #1 +T add r2, r0, r3 + vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] +A lsl r3, #2 +T lsl r3, #1 + vmov.i64 d7, #0xffff + vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
+ vadd.i16 d6, d2, d3 @ d6 has 4 vals + vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ top_line[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) + vpadd.i16 d6, d6 @ 1 (all the same) + vrshr.u16 d6, #4 + vmla.i16 q8, q2, d6[0] + vmla.i16 q0, q2, d6[0] + vdup.16 q2, d6[0] + vdup.16 q9, d6[0] + vrshr.u16 q8, q8, #2 + vrshr.u16 q0, q0, #2 + vext.16 q1, q8, q8, #1 + + @ Store top line + vst1.16 {q0}, [r0], r3 + + @ Store the rest + vbit d18, d2, d7 + vst1.16 {q9}, [r2], r3 + mov r1, #6 +1: + vext.16 q8, q8, q8, #2 + subs r1, #2 + vext.16 q1, q1, q1, #2 + vbit d4, d16, d7 + vst1.16 {q2}, [r0], r3 + vbit d18, d2, d7 + vst1.16 {q9}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] (In pels - needs * 4) + +function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 + + @ Average the els of top & left + vld1.16 {q0-q1}, [r1] + mov r1, #8 + vld1.16 {q2-q3}, [r2] +T lsl r3, #2 + vadd.i16 q1, q0 +A add r2, r0, r3, lsl #2 +A lsl r3, #3 +T add r2, r0, r3 +T lsl r3, #1 + vadd.i16 q2, q3 + vadd.i16 q1, q2 + vadd.i16 d3, d2 @ d3 has 2 val pairs + vpadd.i32 d2, d3, d3 @ This add U & V separately + vpadd.i32 d3, d3, d3 + vrshr.u16 q0, q1, #4 + vrshr.u16 q1, q1, #4 + + @ Store +1: + vst1.8 {q0-q1}, [r0], r3 + subs r1, #2 + vst1.8 {q0-q1}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_dc_16_neon_10, export=1 + + @ Average the els of top & left + vld1.16 {q8-q9}, [r1] + mov r1, #2 + vld1.16 {q10-q11}, [r2] + lsl r3, #1 @ stride given in pels + vadd.i16 q0, q8, q9 + vadd.i16 q1, q10, q11 + vmov.i16 q3, #3 + vadd.i16 q1, q0 + vadd.i16 d0, d16, d20 + 
vmov.i64 d31, #0xffff + vadd.i16 d3, d2 + vmov.16 d6[0], r1 @ 2, 3, 3, 3... + + @ top line gets some smoothing + @ (top[i] + 3*dc + 2) >> 2 + @ as does left + @ topline[0] is extra special + @ (top[0] + left[0] + 2*dc + 2) >> 2 + + vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] + vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) + vpadd.i16 d3, d3 @ 1 (all the same) + vrshr.u16 d2, d3, #5 + vrshr.u16 d3, d3, #5 + vmov q0, q1 + vmla.i16 q10, q1, d6[1] + vmla.i16 q11, q1, d6[1] + vmla.i16 q8, q1, q3 + vmla.i16 q9, q1, d6[1] + vrshr.u16 q2, q10, #2 + vrshr.u16 q3, q11, #2 + vrshr.u16 q8, #2 + vrshr.u16 q9, #2 + vext.16 q2, q2, q2, #1 + mov r1, #7<<29 + + @ Store top line + vst1.16 {q8-q9}, [r0], r3 + + @ Store the rest +1: + vbit d0, d4, d31 + vext.16 q2, q2, q2, #1 + subs r1, #1<<29 + vst1.16 {q0-q1}, [r0], r3 + bne 1b +1: + vbit d0, d6, d31 + vext.16 q3, q3, q3, #1 + subs r1, #1<<29 + vst1.16 {q0-q1}, [r0], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_c_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] (In pels - needs * 4) + +function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 + + @ Average the els of top & left + vldm r1, {q0-q3} + vldm r2, {q8-q11} + vadd.i16 q0, q1 + mov r1, #16 + vadd.i16 q2, q3 + add r2, r0, #32 + vadd.i16 q8, q9 + lsl r3, #2 + vadd.i16 q10, q11 + vadd.u16 q0, q2 + vadd.u16 q8, q10 + vadd.i16 q0, q8 + vadd.i16 d0, d1 @ d0 has 2 val pairs + vpadd.i32 d4, d0, d0 @ This adds U & V separately + vpadd.i32 d5, d0, d0 + vrshr.u16 q0, q2, #5 + vrshr.u16 q1, q2, #5 + + @ Store +1: + vst1.16 {q0-q1}, [r0], r3 + subs r1, #1 + vst1.16 {q0-q1}, [r2], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_dc_32_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] (In pels) + +function ff_hevc_rpi_pred_dc_32_neon_10, export=1 + + @ Average the els of top & left + @ With 10 bits we are (just) safe from 
overflow in i16 + vldm r1, {q0-q3} + vldm r2, {q8-q11} + vadd.i16 q0, q1 + mov r1, #32 + vadd.i16 q2, q3 + add r2, r0, #32 + vadd.i16 q8, q9 + lsl r3, #1 + vadd.i16 q10, q11 + vadd.u16 q0, q2 + vadd.u16 q8, q10 + vadd.i16 q0, q8 + vadd.i16 d0, d1 @ d0 has 4 vals + vpadd.i16 d0, d0 @ 2 (top & bottom the same) + vpadd.i16 d4, d0, d0 @ 1 (all the same) + vpadd.i16 d5, d0, d0 + vrshr.u16 q0, q2, #6 + vrshr.u16 q1, q2, #6 + + @ Store +1: + vst1.16 {q0-q1}, [r0], r3 + subs r1, #1 + vst1.16 {q0-q1}, [r2], r3 + bne 1b + + bx lr +endfunc + + diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S new file mode 100644 index 0000000000..f6969d3591 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S @@ -0,0 +1,881 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox, Ben Avison +*/ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +@ All functions have the call +@ +@ int ff_hevc_rpi_intra_filter_N_neon_PW( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] +@ +@ Assumptions: +@ (that wouldn't apply to all frame layouts but do apply to sand, so beware +@ if reusing this code) +@ +@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for +@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore +@ N==8,PW=8 (chroma always PW>8) but have to cope for larger +@ +@ We always have at least 64 pixel H frame width rounding - this lets us +@ load UR without having to worry about exactly how many pixels are actually +@ within the frame. As partial loads will only occur very occasionally this +@ should be a win in nearly all cases.
+@ +@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters +@ so we do no maths on the contents +@ +@ No filtering in 32bit fns as they are chroma only + + +.equ AVAIL_UR, 1 @ Bit flags shared by req and avail +.equ AVAIL_U, 2 +.equ AVAIL_UL, 4 +.equ AVAIL_L, 8 +.equ AVAIL_DL, 16 + +.equ FILTER_LIGHT, 0x40 +.equ FILTER_STRONG, 0x80 + +.equ AVAIL_S_UR_N_U_C, 32 - 1 @ lsls by this: n = UR, c = U +.equ AVAIL_S_U_N_UL_C, 32 - 2 @ lsls by this: n = U, c = UL +.equ AVAIL_S_UL_N_L_C, 32 - 3 @ lsls by this: n = UL, c = L +.equ AVAIL_S_L_N_DL_C, 32 - 4 @ lsls by this: n = L, c = DL + +.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr + +@ On entry +@ r2 req +@ r3 avail +@ [sp, #sp_offset...] args +@ +@ On Exit: +@ +@ Extend values: +@ d_l scalar contains value for L & DL +@ if DL avail then this is DL[0] so we don't need to load that +@ d_ul scalar containing value for UL +@ d_u scalar containing value for U +@ d_ur scalar containing value for UR +@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... +@ This means that L-light-filter works even if nreq DL (we never filter +@ req-DL without req-L, but we do filter req-L without req-DL) +@ If UR avail then d_ur == a_ur so U-filter good too +@ +@ Data load pointers (only load if req & avail): +@ r4 DL + stride +@ r10 L +@ r6 U +@ r5 UR +@ +@ Others: +@ r2 req +@ r7 req & avail +@ r3 L + stride +@ r8 DL + stride * 2 +@ r9 stride * 2 +@ cs Load U +@ mi Load UR +@ +@ Clobbered: +@ r12 + +.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur + +.equ src_l\@, \sp_offset + 0 +.equ src_u\@, \sp_offset + 4 +.equ src_ur\@, \sp_offset + 8 +.equ stride\@, \sp_offset + 12 +.equ pw\@, (1 << \pw_s) @ pel width in bytes +.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes + +@ r9 stride +@ r7 = ab_ul, r6 = a_u, r5 = a_ur +@ r4 = b_dl, r10 = b_l, r8 = b_u + + ldr r5, [sp, #src_ur\@] + lsl r12, r3, #AVAIL_S_U_DL_CPSR @ avail bits 4..0 moved up to bits 31..27 + ldr r10, [sp, #src_l\@] + ldr r9, [sp, #stride\@] + ldr r6, [sp, #src_u\@] + + @ This is quite a slow instruction but it replaces + @ a decent number of tests that yield a max of 2 flags/op + @ It is
annoying we can't branch on Q! + @ If L navail (ne) then DL must be navail (pl) + msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur + + mov r4, r5 + sub r7, r10, r9 + it vs + movvs r4, r6 + add r8, r6, #b_size\@ - pw\@ + it cs + movcs r4, r7 + ite ne + movne r10, r4 + addeq r4, r7, r9, lsl #\log2_s + it cc + movcc r7, r10 + it mi + addmi r4, r10, r9, lsl #\log2_s + vld1.\d_type {\d_ul}, [r7] + itt vc + movvc r8, r7 + movvc r6, r7 + vld1.\d_type {\d_l }, [r4], r9 + tst r3, #AVAIL_UR @ Q cannot be branched on so test UR explicitly + vld1.\d_type {\d_u }, [r6] + it eq + moveq r5, r8 + and r7, r2, r3 @ r7 = req & avail + add r8, r4, r9 + vld1.\d_type {\d_ur}, [r5] + lsls r12, r7, #AVAIL_S_UR_N_U_C @ mi = load UR, cs = load U + add r3, r10, r9 + lsl r9, #1 @ r9 = stride * 2 from here on +.endm + + + +@ int ff_hevc_rpi_intra_filter_4_neon_8( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 @ bytes pushed by the prologue (8 regs) +.set pw_s, 0 +.set pw, (1 << pw_s) +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_8, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] @ extend values: d0 = L/DL, d31[7] = UL, d1 = U, d2 = UR + + it cs + vldrcs s2, [r6] @ d1 low half = 4 pels of U + ite pl + vmovpl s3, s4 @ UR not loaded: use the UR extend value + vldrmi s3, [r5] @ d1 high half = 4 pels of UR + + lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL + add r12, r0, #-pw @ r12 = one pel before left[] + bpl 1f + + vld1.8 {d0[0]}, [r10], r9 + vld1.8 {d0[1]}, [r3], r9 + vld1.8 {d0[2]}, [r10] + vld1.8 {d0[3]}, [r3] +1: + bcc 1f + vld1.8 {d0[5]}, [r4], r9 @ d0[4] already holds DL[0] from the macro + vld1.8 {d0[6]}, [r8] + vld1.8 {d0[7]}, [r4] +1: + vstr d1, [r1] @ Up + vst1.8 {d31[7]}, [r12] @ Up-left + vstr d0, [r0] @ Left + pop {r4-r10, pc} +endfunc + + +@ int ff_hevc_rpi_intra_filter_4_neon_16( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const
pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set pw_s, 1 +.set pw, (1 << pw_s) +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_16, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] + + it cs + vldrcs d2, [r6] + it mi + vldrmi d3, [r5] + lsls r7, #AVAIL_S_L_N_DL_C + add r12, r0, #-pw + bpl 1f + vld1.16 {d0[0]}, [r10], r9 + vld1.16 {d0[1]}, [r3], r9 + vld1.16 {d0[2]}, [r10] + vld1.16 {d0[3]}, [r3] +1: + bcc 1f + vld1.16 {d1[1]}, [r4], r9 + vld1.16 {d1[2]}, [r8] + vld1.16 {d1[3]}, [r4] +1: + vst1.16 {q1}, [r1] @ Up + vst1.16 {d31[3]}, [r12] + vst1.16 {q0}, [r0] @ Left + pop {r4-r10, pc} +endfunc + + +@ int ff_hevc_rpi_intra_filter_8_neon_8( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set pw_s, 0 +.set pw, (1 << pw_s) +.set log2_s, 3 + +function ff_hevc_rpi_intra_filter_8_neon_8, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] + + it cs + vldrcs d4, [r6] + it mi + vldrmi d5, [r5] + + lsls r7, #AVAIL_S_L_N_DL_C + bpl 1f + vld1.8 {d0[0]}, [r10], r9 + vld1.8 {d0[1]}, [r3], r9 + vld1.8 {d0[2]}, [r10], r9 + vld1.8 {d0[3]}, [r3], r9 + vld1.8 {d0[4]}, [r10], r9 + vld1.8 {d0[5]}, [r3], r9 + vld1.8 {d0[6]}, [r10] + vld1.8 {d0[7]}, [r3] +1: + bcc 1f + vld1.8 {d1[1]}, [r4], r9 + vld1.8 {d1[2]}, [r8], r9 + vld1.8 {d1[3]}, [r4], r9 + vld1.8 {d1[4]}, [r8], r9 + vld1.8 {d1[5]}, [r4], r9 + vld1.8 {d1[6]}, [r8] + vld1.8 {d1[7]}, [r4] +1: + tst r2, #FILTER_LIGHT + 
add r12, r0, #-pw + beq 10f + + @ Luma light filter + vext.8 q8, q15, q2, #15 + vext.8 q12, q15, q0, #15 + vaddl.u8 q9, d17, d5 + vaddl.u8 q8, d16, d4 + vaddl.u8 q13, d25, d1 + vaddl.u8 q12, d24, d0 + vmov.u8 r3, d5[7] @ Save final pel + vmov.u8 r2, d1[7] @ Save final pel + + vext.16 q2, q8, q9, #1 + vext.16 q3, q9, q9, #1 + vext.16 q0, q12, q13, #1 + vext.16 q1, q13, q13, #1 + vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] + vadd.u16 q2, q8 + vadd.u16 q3, q9 + vadd.u16 q0, q12 + vadd.u16 q1, q13 + + vrshrn.u16 d4, q2, #2 + vrshrn.u16 d5, q3, #2 + vrshrn.u16 d0, q0, #2 + vrshrn.u16 d1, q1, #2 + vrshr.u16 d30, #2 + vmov.u8 d5[7], r3 @ Restore final pel + vmov.u8 d1[7], r2 @ Restore final pel + vdup.u8 d31, d30[0] @ d31[3] = d30[0] + +10: + vst1.8 {q2 }, [r1] @ Up + vst1.8 {d31[7]}, [r12] @ Up-left + vst1.8 {q0 }, [r0] @ Left + pop {r4-r10, pc} +endfunc + + +@ int ff_hevc_rpi_intra_filter_8_neon_16( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set ur_size, sp_base + 16 +.set dl_size, sp_base + 20 +.set pw_s, 1 +.set pw, (1 << pw_s) +.set log2_s, 3 +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_8_neon_16, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" + + it cs + vldmcs r6, {d4, d5} + ldr r12, [sp, #ur_size] + bpl 1f + cmp r12, #4 + vldm r5, {d6, d7} + bgt 1f + vdup.16 d7, d6[3] +1: + lsls r12, r7, #AVAIL_S_L_N_DL_C + vdup.16 q1, d0[0] + bpl 1f + vld1.16 {d0[0]}, [r10], r9 + vld1.16 {d0[1]}, [r3], r9 + vld1.16 {d0[2]}, [r10], r9 + vld1.16 {d0[3]}, [r3], r9 + vld1.16 {d1[0]}, [r10], r9 + vld1.16 {d1[1]}, 
[r3], r9 + vld1.16 {d1[2]}, [r10] + vld1.16 {d1[3]}, [r3] +1: + bcc 1f + ldr r12, [sp, #dl_size] + vld1.16 {d2[1]}, [r4], r9 + cmp r12, #p_size + vld1.16 {d2[2]}, [r8], r9 + vld1.16 {d2[3]}, [r4], r9 + blt 2f + vld1.16 {d3[0]}, [r8], r9 + vld1.16 {d3[1]}, [r4], r9 + vld1.16 {d3[2]}, [r8] + vld1.16 {d3[3]}, [r4] + b 1f +2: + vdup.16 d3, d2[3] +1: + tst r2, #FILTER_LIGHT + add r12, r0, #-pw + beq 10f + + @ Luma light filter + vext.16 q9, q2, q3, #7 + vext.16 q8, q15, q2, #7 + vext.16 q13, q0, q1, #7 + vext.16 q12, q15, q0, #7 + vadd.u16 q9, q3 + vadd.u16 q8, q2 + vadd.u16 q13, q1 + vadd.u16 q12, q0 + vmov.u16 r3, d7[3] @ Save final pel + vmov.u16 r2, d3[3] @ Save final pel + + vext.16 q2, q8, q9, #1 + vext.16 q3, q9, q9, #1 + vext.16 q0, q12, q13, #1 + vext.16 q1, q13, q13, #1 + vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] + vadd.u16 q2, q8 + vadd.u16 q3, q9 + vadd.u16 q0, q12 + vadd.u16 q1, q13 + + vrshr.u16 q2, #2 + vrshr.u16 q3, #2 + vrshr.u16 q0, #2 + vrshr.u16 q1, #2 + vrshr.u16 d30, #2 + vmov.u16 d7[3], r3 @ Restore final pel + vmov.u16 d3[3], r2 @ Restore final pel + vdup.u16 d31, d30[0] @ d31[3] = d30[0] + +10: + vst1.16 {q2, q3}, [r1] @ Up + vst1.16 {d31[3]}, [r12] @ Up-left + vst1.16 {q0, q1}, [r0] @ Left + pop {r4-r10, pc} +endfunc + +@ int ff_hevc_rpi_intra_filter_16_neon_16( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set ur_size, sp_base + 16 +.set dl_size, sp_base + 20 +.set pw_s, 1 +.set pw, (1 << pw_s) +.set log2_s, 4 +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_16, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 16, 
"d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" + + vdup.16 q9, d16[0] + vdup.16 q11, d20[0] + + it cs + vldmcs r6, {d16-d19} + ldr r12, [sp, #ur_size] + bpl 1f + cmp r12, #12 + @ Given chroma frame layout, if UR exists then it is always legit to + @ load all of it even if most of it is outside the frame. + vldm r5, {d20-d23} + bgt 1f + bge 4f + cmp r12, #8 + bge 3f + vdup.16 d21, d20[3] +3: vdup.16 d22, d21[3] +4: vdup.16 d23, d22[3] + +1: + lsls r7, #AVAIL_S_L_N_DL_C + ldr r12, [sp, #dl_size] + vdup.16 q1, d0[0] + vdup.16 q2, d0[0] + vdup.16 q3, d0[0] + bpl 1f + vld1.16 {d0[0]}, [r10], r9 + vld1.16 {d0[1]}, [r3], r9 + vld1.16 {d0[2]}, [r10], r9 + vld1.16 {d0[3]}, [r3], r9 + vld1.16 {d1[0]}, [r10], r9 + vld1.16 {d1[1]}, [r3], r9 + vld1.16 {d1[2]}, [r10], r9 + vld1.16 {d1[3]}, [r3], r9 + vld1.16 {d2[0]}, [r10], r9 + vld1.16 {d2[1]}, [r3], r9 + vld1.16 {d2[2]}, [r10], r9 + vld1.16 {d2[3]}, [r3], r9 + vld1.16 {d3[0]}, [r10], r9 + vld1.16 {d3[1]}, [r3], r9 + vld1.16 {d3[2]}, [r10] + vld1.16 {d3[3]}, [r3] +1: + bcc 1f + vld1.16 {d4[1]}, [r4], r9 + cmp r12, #4 + vld1.16 {d4[2]}, [r8], r9 + vld1.16 {d4[3]}, [r4], r9 + ble 2f + vld1.16 {d5[0]}, [r8], r9 + vld1.16 {d5[1]}, [r4], r9 + cmp r12, #12 + vld1.16 {d5[2]}, [r8], r9 + vld1.16 {d5[3]}, [r4], r9 + blt 3f + vld1.16 {d6[0]}, [r8], r9 + vld1.16 {d6[1]}, [r4], r9 + vld1.16 {d6[2]}, [r8], r9 + vld1.16 {d6[3]}, [r4], r9 + ble 4f + vld1.16 {d7[0]}, [r8], r9 + vld1.16 {d7[1]}, [r4], r9 + vld1.16 {d7[2]}, [r8] + vld1.16 {d7[3]}, [r4] + b 1f +2: vdup.16 d5, d4[3] +3: vdup.16 d6, d5[3] +4: vdup.16 d7, d6[3] +1: + tst r2, #FILTER_LIGHT + add r12, r0, #-pw + beq 10f + + vpush {q5} + @ Luma light filter + @ Left + vext.16 q5, q2, q3, #7 + vext.16 q14, q1, q2, #7 + vext.16 q13, q0, q1, #7 + vext.16 q12, q15, q0, #7 + + vadd.u16 q5, q3 + vadd.u16 q14, q2 + vadd.u16 q13, q1 + vadd.u16 q12, q0 + vmov.u16 r2, d7[3] @ Save final pel + + vext.16 q0, q12, q13, #1 + vext.16 q1, q13, q14, #1 + vext.16 q2, q14, q5, #1 + vext.16 q3, q5, q5, 
#1 + + vmov d30, d24 @ d30[0] = l[0] + ul + vadd.u16 q0, q12 + vadd.u16 q1, q13 + vadd.u16 q2, q14 + vadd.u16 q3, q5 + + vrshr.u16 q0, #2 + vrshr.u16 q1, #2 + vrshr.u16 q2, #2 + vrshr.u16 q3, #2 + + @ Up + vext.16 q5, q10, q11, #7 + vext.16 q14, q9, q10, #7 + vext.16 q13, q8, q9, #7 + vext.16 q12, q15, q8, #7 + + vadd.u16 q5, q11 + vadd.u16 q14, q10 + vadd.u16 q13, q9 + vadd.u16 q12, q8 + vmov.u16 r3, d23[3] @ Save final pel + + vext.16 q8, q12, q13, #1 + vext.16 q9, q13, q14, #1 + vext.16 q10, q14, q5, #1 + vext.16 q11, q5, q5, #1 + + vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] + vadd.u16 q8, q12 + vadd.u16 q9, q13 + vadd.u16 q10, q14 + vadd.u16 q11, q5 + + vrshr.u16 q8, #2 + vrshr.u16 q9, #2 + vrshr.u16 q10, #2 + vrshr.u16 q11, #2 + + @ Misc + vrshr.u16 d30, #2 + vmov.u16 d7[3], r2 @ Restore final pel + vmov.u16 d23[3], r3 @ Restore final pel + vdup.u16 d31, d30[0] @ d31[3] = d30[0] + vpop {q5} + +10: + vstm r1, {d16-d23} @ Up + vst1.16 {d31[3]}, [r12] @ Up-left + vstm r0, { d0-d7 } @ Left + pop {r4-r10, pc} +endfunc + +@ int ff_hevc_rpi_intra_filter_4_neon_32( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set pw_s, 2 +.set pw, (1 << pw_s) +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_32, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" + + it cs + vldmcs r6, {d4, d5} + it mi + vldmmi r5, {d6, d7} + lsls r7, #AVAIL_S_L_N_DL_C + vdup.32 q1, d0[0] + add r12, r0, #-pw + bpl 1f + vld1.32 {d0[0]}, [r10], r9 + vld1.32 {d0[1]}, [r3], r9 + vld1.32 {d1[0]}, [r10] + vld1.32 {d1[1]}, [r3] +1: + bcc 1f + vld1.32 {d2[1]}, [r4], r9 + 
vld1.32 {d3[0]}, [r8] + vld1.32 {d3[1]}, [r4] +1: + vst1.32 {q2, q3 }, [r1] @ Up + vst1.32 {d31[1]}, [r12] + vst1.32 {q0, q1 }, [r0] @ Left + pop {r4-r10, pc} +endfunc + + +@ int ff_hevc_rpi_intra_filter_8_neon_32( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] +@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 +.set ur_size, sp_base + 16 +.set dl_size, sp_base + 20 +.set pw_s, 2 +.set pw, (1 << pw_s) +.set log2_s, 3 +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_8_neon_32, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" + + vdup.32 q9, d16[0] + vdup.32 q11, d20[0] + + it cs + vldmcs r6, {q8, q9 } + ldr r12, [sp, #ur_size] + bpl 1f + cmp r12, #p_size + vldm r5, {q10, q11} + bge 1f + vdup.32 q11, d21[1] +1: + lsls r7, #AVAIL_S_L_N_DL_C + vdup.32 q1, d0[0] + vdup.32 q2, d0[0] + vdup.32 q3, d0[0] + bpl 1f + vld1.32 {d0[0]}, [r10], r9 + vld1.32 {d0[1]}, [r3], r9 + vld1.32 {d1[0]}, [r10], r9 + vld1.32 {d1[1]}, [r3], r9 + vld1.32 {d2[0]}, [r10], r9 + vld1.32 {d2[1]}, [r3], r9 + vld1.32 {d3[0]}, [r10] + vld1.32 {d3[1]}, [r3] +1: + bcc 1f + ldr r12, [sp, #dl_size] + vld1.32 {d4[1]}, [r4], r9 + cmp r12, #p_size + vld1.32 {d5[0]}, [r8], r9 + vld1.32 {d5[1]}, [r4], r9 + blt 2f + vld1.32 {d6[0]}, [r8], r9 + vld1.32 {d6[1]}, [r4], r9 + vld1.32 {d7[0]}, [r8] + vld1.32 {d7[1]}, [r4] + b 1f +2: + vdup.32 q3, d5[1] +1: + add r12, r0, #-pw + vstm r1, { q8-q11} @ Up + vst1.32 {d31[1]}, [r12] + vstm r0, { q0-q3 } @ Left + pop {r4-r10, pc} +endfunc + + +@ int ff_hevc_rpi_intra_filter_16_neon_32( +@ pixel * const left, [r0] +@ pixel * const top, [r1] +@ const unsigned int req, [r2] 
+@ const unsigned int avail, [r3] +@ const pixel * const src_l, [sp, #0] +@ const pixel * const src_u, [sp, #4] +@ const pixel * const src_ur, [sp, #8] +@ const unsigned int stride, [sp, #12] (pels) +@ const unsigned int top_right_size, [sp, #16] +@ const unsigned int down_left_size) [sp, #20] + +.set sp_base, 8*4 @ bytes pushed by the prologue (8 regs) +.set ur_size, sp_base + 16 +.set dl_size, sp_base + 20 +.set pw_s, 2 +.set pw, (1 << pw_s) +.set log2_s, 4 +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_32, export=1 + push {r4-r10, lr} + load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] @ extend values: d30[0] = L/DL, d30[1] = UL, d31[0] = U, d31[1] = UR + + @ Once we get this big we have run out of neon regs to store + @ everything at once so do in pieces + + @ Up (have) + it cs + vldmcs r6, { q0-q3 } + ldr r12, [sp, #ur_size] @ r12 = top_right_size + it mi + vldmmi r5, { q8-q11} @ UR: load all 16 pels, pad below if fewer are valid + it cs + vstmcs r1, { q0-q3 } + bpl 1f + cmp r12, #12 + add lr, r1, #(pw << log2_s) + bgt 2f @ ur_size > 12: nothing to pad + bge 3f @ ur_size == 12: pad q11 only + cmp r12, #8 + bge 4f @ ur_size == 8: pad q10 and q11 + vdup.32 q9, d17[1] @ pels are 32 bits wide: replicate the last valid pel forward +4: vdup.32 q10, d19[1] @ q10 (not d10: d8-d15 are callee-saved and unsaved here) +3: vdup.32 q11, d21[1] +2: vstm lr, { q8-q11} +1: + + @ Left (have) + add lr, r0, #-pw + lsls r12, r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL + vst1.32 {d30[1]}, [lr] @ UL + bpl 1f + vld1.32 { d0[0]}, [r10], r9 + vld1.32 { d0[1]}, [r3], r9 + vld1.32 { d1[0]}, [r10], r9 + vld1.32 { d1[1]}, [r3], r9 + vld1.32 { d2[0]}, [r10], r9 + vld1.32 { d2[1]}, [r3], r9 + vld1.32 { d3[0]}, [r10], r9 + vld1.32 { d3[1]}, [r3], r9 + vld1.32 { d4[0]}, [r10], r9 + vld1.32 { d4[1]}, [r3], r9 + vld1.32 { d5[0]}, [r10], r9 + vld1.32 { d5[1]}, [r3], r9 + vld1.32 { d6[0]}, [r10], r9 + vld1.32 { d6[1]}, [r3], r9 + vld1.32 { d7[0]}, [r10] + vld1.32 { d7[1]}, [r3] + vstm r0, { q0-q3 } +1: + bcc 1f + ldr r12, [sp, #dl_size] @ r12 = down_left_size + vdup.32 d16, d30[0] @ d16[0] = d30[0] + add lr, r0, #(pw << log2_s) + vld1.32 {d16[1]}, [r4], r9 + cmp r12, #4 + vld1.32 {d17[0]}, [r8], r9 + vld1.32 {d17[1]}, [r4], r9 + ble 2f + vld1.32 {d18[0]}, [r8], r9 + vld1.32 {d18[1]}, [r4], r9 + cmp r12, #12 + vld1.32 {d19[0]}, [r8], r9 + vld1.32 {d19[1]}, [r4], r9 + blt 3f +
vld1.32 {d20[0]}, [r8], r9 + vld1.32 {d20[1]}, [r4], r9 + vld1.32 {d21[0]}, [r8], r9 + vld1.32 {d21[1]}, [r4], r9 + ble 4f + vld1.32 {d22[0]}, [r8], r9 + vld1.32 {d22[1]}, [r4], r9 + vld1.32 {d23[0]}, [r8] + vld1.32 {d23[1]}, [r4] + b 5f +2: vdup.32 q9, d17[1] @ pad DL the same way as UR above +3: vdup.32 q10, d19[1] +4: vdup.32 q11, d21[1] +5: vstm lr, { q8-q11} +1: + eors r7, r2 @ r7 = requested but not available + beq 99f @ nothing left to fill from extend values + + lsls r12, r7, #AVAIL_S_UR_N_U_C @ mi = fill UR, cs = fill U + vdup.32 q0, d31[0] @ U extend value + vdup.32 q1, d31[0] + vdup.32 q2, d31[0] + vdup.32 q3, d31[0] + add lr, r1, #(pw << log2_s) + vdup.32 q8, d31[1] @ UR extend value + vdup.32 q9, d31[1] + vdup.32 q10, d31[1] + vdup.32 q11, d31[1] + it cs + vstmcs r1, { q0-q3 } + it mi + vstmmi lr, { q8-q11} + + lsls r7, #AVAIL_S_L_N_DL_C @ mi = fill L, cs = fill DL + vdup.32 q0, d30[0] @ L/DL extend value + vdup.32 q1, d30[0] + vdup.32 q2, d30[0] + vdup.32 q3, d30[0] + add lr, r0, #(pw << log2_s) + it mi + vstmmi r0, { q0-q3 } + it cs + vstmcs lr, { q0-q3 } + +99: + pop {r4-r10, pc} +endfunc + + + diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S new file mode 100644 index 0000000000..56819ae439 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S @@ -0,0 +1,920 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission.
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox, Ben Avison +*/ + +/* + * Horizontal & Vertical special cases of angular intra pred + * + * Split out because: + * Vertical, at least, is relatively common + * Much simpler code than the general angular case + * Luma with size < 32 has extra filtering that doesn't happen anywhere else + * + * *** Currently luma filtering is mandatory where it occurs, but there are + * cases where it should be turned off (rdpcm & an extension sps flag). 
+ * These don't occur in the standard conformance suite for Main Profile + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +@ ff_hevc_rpi_pred_vertical_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.32 {d0[0]}, [r2 :32] @ Left + add r2, r0, r3 @ r2 walks the odd rows, r0 the even rows + vld1.8 {d1[]}, [r1] @ top[0] in every lane + lsl r3, #1 + vdup.8 d4, ip + vmov.i8 d2, #128 + vhsub.u8 d4, d0, d4 @ (left[y] - top_left) >> 1 + veor d1, d2 @ bias by 128 so the signed saturating add clips to 0..255 + vld1.32 {d0[0]}, [r1 :32] @ Top + vqadd.s8 d1, d4 + vmov.i64 d3, #0xff @ mask selecting byte 0 (column 0) + vmov d4, d0 + veor d5, d1, d2 @ remove the bias again + veor d1, d1, d2 + vbit d0, d1, d3 @ row = top with the filtered pel in column 0 + vshr.u64 d5, #8 + vst1.32 {d0[0]}, [r0], r3 + vshr.u64 d1, #16 + vbit d4, d5, d3 + vshr.u64 d5, #16 + vst1.32 {d4[0]}, [r2], r3 + vbit d0, d1, d3 + vst1.32 {d0[0]}, [r0] + vbit d4, d5, d3 + vst1.32 {d4[0]}, [r2] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.8 {d0}, [r2 :64] @ Left + vmov.i8 d1, #128 + vld1.8 {d2[]}, [r1] @ top[0] in every lane + vld1.8 {d3}, [r1 :64] @ Top + vdup.8 d4, ip + vhsub.u8 d4, d0, d4 @ (left[y] - top_left) >> 1 + veor d2, d1 @ bias by 128 so the signed saturating add clips to 0..255 + vmov.i64 d0, #0xff @ mask selecting byte 0 (column 0) + mov r1, #8 @ row counter, two rows stored per iteration + vqadd.s8 d2, d4, d2 + veor d1, d2, d1 @ d1 = filtered column 0, one byte per row +1: + vbit d3, d1, d0 + vshr.u64 d1, #8 @ next row's pel moves down into byte 0 + vst1.8 {d3}, [r0 :64], r3 + subs r1, #2 + vbit d3, d1, d0 + vshr.u64 d1, #8 + vst1.8 {d3}, [r0 :64], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.8 {q0}, [r2 :128] @ Left + vdup.8 q1, ip + vld1.8 {d4[],d5[]}, [r1] @ top[0] in every lane + vhsub.u8 q0, q1 @ (left[y] - top_left) >> 1 + vmov.i8 q1, #128 + veor q2, q1 @ bias by 128 so the signed saturating add clips to 0..255 + vmov.i64 d16, #0xff @ mask selecting byte 0 (column 0) + vqadd.s8 q0, q2 + vld1.8 {q3}, [r1 :128] @ Top + mov r1, #16 @ row counter, two rows stored per iteration + veor
q0, q1 + vmov q1, q3 + vext.8 q2, q0, q0, #1 @ q0 feeds the even rows, q2 the odd rows +1: + vbit d2, d0, d16 + vbit d6, d4, d16 + vext.8 q0, q0, q0, #2 + subs r1, #2 + vst1.8 {q1}, [r0 :128], r3 + vext.8 q2, q2, q2, #2 + vst1.8 {q3}, [r0 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_32_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 + vld1.8 {q0, q1 }, [r1 :128] @ Up + add r2, r0, r3 @ r2 walks the odd rows, r0 the even rows + lsl r3, #1 + mov r1, #16 @ 16 iterations, two rows each +1: + vst1.8 {q0, q1 }, [r0 :128], r3 + subs r1, #1 + vst1.8 {q0, q1 }, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 + vld1.16 {d0 }, [r1 :64] @ Up + add r2, r0, r3, lsl #1 @ second row is 2 * r3 bytes on (presumably stride counts 2-byte UV pels - confirm) + lsl r3, #2 + + vst1.16 {d0 }, [r0 :64], r3 + vst1.16 {d0 }, [r2 :64], r3 + vst1.16 {d0 }, [r0 :64] + vst1.16 {d0 }, [r2 :64] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 + vld1.16 {q0 }, [r1 :128] @ Up + add r2, r0, r3, lsl #1 + lsl r3, #2 + mov r1, #4 @ two iterations, four rows each +1: + vst1.16 {q0 }, [r0 :128], r3 + subs r1, #2 + vst1.16 {q0 }, [r2 :128], r3 + vst1.16 {q0 }, [r0 :128], r3 + vst1.16 {q0 }, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 + vld1.16 {q0, q1 }, [r1 :128] @ Up + add r2, r0, r3, lsl #1 + lsl r3, #2 + mov r1, #8 @ 8 iterations, two rows each +1: + vst1.16 {q0, q1 }, [r0 :128], r3 + subs r1, #1 + vst1.16 {q0, q1 }, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_4_neon_8 +@ uint8_t *_src, [r0] +@
const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +@ ? Might be faster as simple arm + +function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.32 {d0[0]}, [r1 :32] @ Top + add r1, r2, #3 + vld1.8 {d1[]}, [r2]! + vdup.8 d2, ip + vmov.i8 d3, #128 + vhsub.u8 d0, d2 + veor d1, d3 + vld1.8 {d2[]}, [r2]! + add ip, r0, r3 + vqadd.s8 d0, d0, d1 + lsl r3, #1 + vld1.8 {d1[]}, [r2] + vld1.8 {d4[]}, [r1] + veor d0, d3 + vst1.32 {d0[0]}, [r0 :32], r3 + vst1.32 {d2[0]}, [ip :32], r3 + vst1.32 {d1[0]}, [r0 :32] + vst1.32 {d4[0]}, [ip :32] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.8 {d0}, [r1 :64] @ Top + vmov.i8 d1, #128 + vld1.8 {d2[]}, [r2]! + mov r1, #8-2 + vdup.8 d3, ip + vhsub.u8 d0, d3 + veor d2, d1 + vqadd.s8 d0, d2 + vld1.8 {d2[]}, [r2]! + veor d0, d1 + vst1.8 {d0}, [r0], r3 +1: + vld1.8 {d0[]}, [r2]! + subs r1, #2 + vst1.8 {d2}, [r0 :64], r3 + vld1.8 {d2[]}, [r2]! + vst1.8 {d0}, [r0 :64], r3 + bne 1b + + vst1.8 {d2}, [r0 :64] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 + ldrb ip, [r2, #-1] @ Top-left + vld1.8 {q0}, [r1 :64] @ Top + mov r1, #16-2 + vld1.8 {d4[],d5[]}, [r2]! + vdup.8 q3, ip + vhsub.u8 q0, q3 + vmov.i8 q1, #128 + veor q2, q1 + vqadd.s8 q0, q2 + vld1.8 {d4[],d5[]}, [r2]! + veor q0, q1 + vst1.8 {q0}, [r0], r3 +1: + vld1.8 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.8 {q2}, [r0 :64], r3 + vld1.8 {d4[],d5[]}, [r2]! 
+ vst1.8 {q0}, [r0 :64], r3 + bne 1b + + vst1.8 {q2}, [r0 :64] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_32_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 + vld1.8 {d0[],d1[]}, [r2]! + add ip, r0, #16 + mov r1, #32-2 + vld1.8 {d2[],d3[]}, [r2]! + vst1.8 {q0}, [r0 :128], r3 + vst1.8 {q0}, [ip :128], r3 +1: + vld1.8 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.8 {q1}, [r0 :128], r3 + vst1.8 {q1}, [ip :128], r3 + vld1.8 {d2[],d3[]}, [r2]! + vst1.8 {q0}, [r0 :128], r3 + vst1.8 {q0}, [ip :128], r3 + bne 1b + + vst1.8 {q1}, [r0 :128] + vst1.8 {q1}, [ip :128] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 + add r1, r2, #2 + vld1.16 {d0[]}, [r2] + add r2, #4 + vld1.16 {d1[]}, [r1] + add r1, #4 + vld1.16 {d2[]}, [r2] +A add r2, r0, r3, lsl #1 +T lsl r3, #1 +T add r2, r0, r3 + vld1.16 {d3[]}, [r1] +A lsl r3, #2 +T lsl r3, #1 + vst1.16 {d0}, [r0 :64], r3 + vst1.16 {d1}, [r2 :64], r3 + vst1.16 {d2}, [r0 :64] + vst1.16 {d3}, [r2 :64] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 + vld1.16 {d0[],d1[]}, [r2]! + lsl r3, #1 + vld1.16 {d2[],d3[]}, [r2]! + mov r1, #8-2 + vst1.16 {q0}, [r0 :64], r3 +1: + vld1.16 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.16 {q1}, [r0 :64], r3 + vld1.16 {d2[],d3[]}, [r2]! 
+ vst1.16 {q0}, [r0 :64], r3 + bne 1b + + vst1.16 {q1}, [r0 :64] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 + vld1.16 {d0[],d1[]}, [r2]! + lsl r3, #1 + add ip, r0, #16 + mov r1, #16-2 + vld1.16 {d2[],d3[]}, [r2]! + vst1.16 {q0}, [r0 :128], r3 + vst1.16 {q0}, [ip :128], r3 +1: + vld1.16 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.16 {q1}, [r0 :128], r3 + vst1.16 {q1}, [ip :128], r3 + vld1.16 {d2[],d3[]}, [r2]! + vst1.16 {q0}, [r0 :128], r3 + vst1.16 {q0}, [ip :128], r3 + bne 1b + + vst1.16 {q1}, [r0 :128] + vst1.16 {q1}, [ip :128] + bx lr +endfunc + + +@------------------------------------------------------------------------------ +@ +@ 10 Bit +@ Has clipping constants so 10-bit only but could easily be macroed up to +@ 14-bit before we run out of bits + + +@ ff_hevc_rpi_pred_vertical_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {d0}, [r2 :64] @ Left + vmov.i16 d2, #0 + vld1.16 {d1[]}, [r1] +T lsl r3, #1 + vdup.16 d4, ip + vmov.i16 d3, #0x3ff + vld1.16 {d5}, [r1 :64] @ Top + vhsub.u16 d4, d0, d4 + vmov.i64 d0, #0xffff +A add r2, r0, r3, lsl #1 +T add r2, r0, r3 + vadd.i16 d1, d1, d4 + vmov d6, d5 + vmax.s16 d1, d1, d2 + vmin.s16 d2, d1, d3 + vmin.s16 d1, d1, d3 + vbit d5, d1, d0 +A lsl r3, #2 +T lsl r3, #1 + vshr.u64 d2, #16 + vshr.u64 d1, #32 + vbit d6, d2, d0 + vst1.16 {d5}, [r0], r3 + vshr.u64 d2, #32 + vst1.16 {d6}, [r2], r3 + vbit d5, d1, d0 + vst1.16 {d5}, [r0] + vbit d6, d2, d0 + vst1.16 {d6}, [r2] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function 
ff_hevc_rpi_pred_vertical_8_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {q0}, [r2 :128] @ Left + lsl r3, #1 + vdup.16 q1, ip + vld1.16 {d4[],d5[]}, [r1] + vhsub.u16 q0, q0, q1 + vmov.i16 q1, #0 + vadd.i16 q0, q2 + vmov.i16 q2, #0x3ff + vld1.16 {q3}, [r1 :128] @ Top + mov r1, #8 + vmax.s16 q0, q1 + vmov q1, q3 + vmin.s16 q0, q2 + vmov.i64 d16, #0xffff + vext.16 q2, q0, q0, #1 +1: + vbit d2, d0, d16 + vbit d6, d4, d16 + vext.16 q0, q0, q0, #2 + subs r1, #2 + vst1.16 {q1}, [r0 :128], r3 + vext.16 q2, q2, q2, #2 + vst1.16 {q3}, [r0 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {q0-q1}, [r2 :128] @ Left +T lsl r3, #1 + vdup.16 q2, ip +A add r2, r0, r3, lsl #1 +T add r2, r0, r3 + vld1.16 {d6[],d7[]}, [r1] +A lsl r3, #2 +T lsl r3, #1 + vhsub.u16 q0, q2 + vhsub.u16 q1, q2 + vadd.i16 q0, q3 + vadd.i16 q1, q3 + vmov.i16 q2, #0 + vld1.16 {q8-q9}, [r1 :128] @ Top + mov r1, #0 + vmov.i16 q3, #0x3ff + vmax.s16 q0, q2 + vmax.s16 q1, q2 + vmin.s16 q0, q3 + vmin.s16 q1, q3 + vmov q10, q8 + vmov q11, q9 + vext.16 q2, q0, q1, #1 + vext.16 q3, q1, q1, #1 + vmov.i64 d24, #0xffff +1: + vbit d16, d0, d24 + vbit d20, d4, d24 + vext.16 q0, q0, q0, #2 + subs r1, #1<<30 + vst1.16 {q8-q9}, [r0 :128], r3 + vext.16 q2, q2, q2, #2 + vst1.16 {q10-q11}, [r2 :128], r3 + bne 1b +1: + vbit d16, d2, d24 + vbit d20, d6, d24 + vext.16 q1, q1, q1, #2 + subs r1, #1<<30 + vst1.16 {q8-q9}, [r0 :128], r3 + vext.16 q3, q3, q3, #2 + vst1.16 {q10-q11}, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_32_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 + vldm r1, { q0-q3 } @ Up + lsl r3, #1 + mov r1, #32 + 
add r2, r0, #32 +1: + vst1.16 {q0-q1}, [r0 :128], r3 + subs r1, #1 + vst1.16 {q2-q3}, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 + vld1.16 {q0 }, [r1 :128] @ Up + add r2, r0, r3, lsl #2 + lsl r3, #3 + + vst1.16 {q0 }, [r0 :128], r3 + vst1.16 {q0 }, [r2 :128], r3 + vst1.16 {q0 }, [r0 :128] + vst1.16 {q0 }, [r2 :128] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 + vld1.16 {q0, q1 }, [r1 :128] @ Up + add r2, r0, r3, lsl #2 + lsl r3, #3 + mov r1, #4 +1: + vst1.16 {q0, q1 }, [r0 :128], r3 + subs r1, #1 + vst1.16 {q0, q1 }, [r2 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_vertical_c_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 + vldm r1, { q0-q3 } @ Up + lsl r3, #2 + mov r1, #16 + add r2, r0, #32 +1: + vst1.16 {q0-q1}, [r0 :128], r3 + subs r1, #1 + vst1.16 {q2-q3}, [r2 :128], r3 + bne 1b + + bx lr +endfunc + +@ ff_hevc_rpi_pred_horizontal_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {d0}, [r1 :64] @ Top + vmov.i16 d1, #0 + vld1.16 {d2[]}, [r2]! +T lsl r3, #1 + vdup.16 d3, ip + vmov.i16 d4, #0x3ff + vhsub.u16 d0, d3 +A add ip, r0, r3, lsl #1 +T add ip, r0, r3 + vld1.16 {d3[]}, [r2]! +A lsl r3, #2 +T lsl r3, #1 + vadd.i16 d0, d2 + vld1.16 {d2[]}, [r2]! 
+ vmax.s16 d0, d1 + vld1.16 {d1[]}, [r2] + vmin.s16 d0, d4 + vst1.16 {d0}, [r0 :64], r3 + vst1.16 {d3}, [ip :64], r3 + vst1.16 {d2}, [r0 :64] + vst1.16 {d1}, [ip :64] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {q0}, [r1 :128] @ Top + lsl r3, #1 + vdup.16 q1, ip + mov r1, #8-2 + vhsub.u16 q0, q1 + vld1.16 {d2[],d3[]}, [r2]! + vmov.i16 q2, #0 + vadd.i16 q0, q1 + vmov.i16 q1, #0x3ff + vmax.s16 q0, q2 + vld1.16 {d4[],d5[]}, [r2]! + vmin.s16 q0, q1 + vst1.16 {q0}, [r0 :128], r3 +1: + vld1.16 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.16 {q2}, [r0 :128], r3 + vld1.16 {d4[],d5[]}, [r2]! + vst1.16 {q0}, [r0 :128], r3 + bne 1b + + vst1.16 {q2}, [r0 :128] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 + ldrh ip, [r2, #-2] @ Top-left + vld1.16 {q0-q1}, [r1 :128] @ Top + lsl r3, #1 + vdup.16 q2, ip + add ip, r0, r3 + vhsub.u16 q0, q2 + add ip, #16 + vhsub.u16 q1, q2 + mov r1, #16-2 + vld1.16 {d4[],d5[]}, [r2]! + vmov.i16 q3, #0 + vadd.u16 q0, q2 + vadd.i16 q1, q2 + vmov.i16 q2, #0x3ff + vmax.s16 q0, q3 + vmax.s16 q1, q3 + vld1.16 {d6[],d7[]}, [r2]! + vmin.s16 q0, q2 + vmin.s16 q1, q2 + vst1.16 {q0-q1}, [r0 :128], r3 +1: + vld1.16 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.16 {q3}, [r0 :128], r3 + vst1.16 {q3}, [ip :128], r3 + vld1.16 {d6[],d7[]}, [r2]!
+ vst1.16 {q0}, [r0 :128], r3 + vst1.16 {q0}, [ip :128], r3 + bne 1b + + vst1.16 {q3}, [r0 :128] + vst1.16 {q3}, [ip :128] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_32_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 + vld1.16 {d0[],d1[]}, [r2]! + add ip, r0, #16 + push {lr} + mov lr, #32 + vld1.16 {d2[],d3[]}, [r2]! + lsl r3, #1 + vst1.16 {q0}, [r0 :128], lr + sub r3, #32 + vst1.16 {q0}, [ip :128], lr + mov r1, #32-2 + vst1.16 {q0}, [r0 :128], r3 + vst1.16 {q0}, [ip :128], r3 +1: + vld1.16 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.16 {q1}, [r0 :128], lr + vst1.16 {q1}, [ip :128], lr + vst1.16 {q1}, [r0 :128], r3 + vst1.16 {q1}, [ip :128], r3 + vld1.16 {d2[],d3[]}, [r2]! + vst1.16 {q0}, [r0 :128], lr + vst1.16 {q0}, [ip :128], lr + vst1.16 {q0}, [r0 :128], r3 + vst1.16 {q0}, [ip :128], r3 + bne 1b + + vst1.16 {q1}, [r0 :128], lr + vst1.16 {q1}, [ip :128], lr + vst1.16 {q1}, [r0 :128] + vst1.16 {q1}, [ip :128] + pop {pc} +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 + add r1, r2, #4 + vld1.32 {d0[],d1[]}, [r2] + add r2, #8 + vld1.32 {d2[],d3[]}, [r1] + add r1, #8 + vld1.32 {d4[],d5[]}, [r2] +A add r2, r0, r3, lsl #2 +T lsl r3, #2 +T add r2, r0, r3 + vld1.32 {d6[],d7[]}, [r1] +A lsl r3, #3 +T lsl r3, #1 + vst1.32 {q0}, [r0 :128], r3 + vst1.32 {q1}, [r2 :128], r3 + vst1.32 {q2}, [r0 :128] + vst1.32 {q3}, [r2 :128] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 + vld1.32 {d0[],d1[]}, [r2]! + lsl r3, #2 + add ip, r0, #16 + mov r1, #8-2 + vld1.32 {d2[],d3[]}, [r2]! 
+ vst1.32 {q0}, [r0 :128], r3 + vst1.32 {q0}, [ip :128], r3 +1: + vld1.32 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.32 {q1}, [r0 :128], r3 + vst1.32 {q1}, [ip :128], r3 + vld1.32 {d2[],d3[]}, [r2]! + vst1.32 {q0}, [r0 :128], r3 + vst1.32 {q0}, [ip :128], r3 + bne 1b + + vst1.32 {q1}, [r0 :128] + vst1.32 {q1}, [ip :128] + bx lr +endfunc + + +@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 + vld1.32 {d0[],d1[]}, [r2]! + add ip, r0, #16 + push {lr} + mov lr, #32 + vld1.32 {d2[],d3[]}, [r2]! + lsl r3, #2 + vst1.32 {q0}, [r0 :128], lr + sub r3, #32 + vst1.32 {q0}, [ip :128], lr + mov r1, #16-2 + vst1.32 {q0}, [r0 :128], r3 + vst1.32 {q0}, [ip :128], r3 +1: + vld1.32 {d0[],d1[]}, [r2]! + subs r1, #2 + vst1.32 {q1}, [r0 :128], lr + vst1.32 {q1}, [ip :128], lr + vst1.32 {q1}, [r0 :128], r3 + vst1.32 {q1}, [ip :128], r3 + vld1.32 {d2[],d3[]}, [r2]! + vst1.32 {q0}, [r0 :128], lr + vst1.32 {q0}, [ip :128], lr + vst1.32 {q0}, [r0 :128], r3 + vst1.32 {q0}, [ip :128], r3 + bne 1b + + vst1.32 {q1}, [r0 :128], lr + vst1.32 {q1}, [ip :128], lr + vst1.32 {q1}, [r0 :128] + vst1.32 {q1}, [ip :128] + pop {pc} +endfunc + + + diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S new file mode 100644 index 0000000000..af8c4c03f0 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S @@ -0,0 +1,1043 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox, Ben Avison +*/ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +@ Planar intra pred (8.4.4.2.4) +@ +@ predSamples[ x ][ y ] = +@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + +@ ( x + 1 ) * p[ nTbS ][ -1 ] + +@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + +@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) + +@ All 10-bit functions would work with 9 + + +@ ff_hevc_rpi_pred_planar_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_4_neon_8, export=1 + + vld1.8 {d0}, [r1] @ Top + adr ip, nb_3_0_1_4 + vld1.8 {d1}, [r2] @ Left + vmov.i64 d2, #0xffffffff + vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} + add r1, r0, r3 + vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} + vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} + vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} + vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} + vshll.u8 q8, d4, #2 + lsl r3, #1 + vsubl.u8 q2, d5, d4 + vmlal.u8 q8, d0, d3 + vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} + vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} + vshl.s16 q9, q2, #1 + vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} + vadd.i16 d16, d4 + vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} + vadd.i16 d17, d18 + vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} + vadd.i16 q2, q8, q9 + vmlal.u8 q8, d0, d6 + vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} + vmlal.u8 q2, d0, d7 + vrshrn.i16 d0, q8, #3 + vst1.32 d0[0], [r0 :32], r3 + vst1.32 d0[1], [r1 :32], r3 + vrshrn.i16 d0, q2, #3 + vst1.32 d0[0], [r0 :32] + vst1.32 d0[1], [r1 :32] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_planar_4_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_4_neon_10, export=1 + @ Weights are read directly from the 16-bit table nbh_3_0_1_4, so + @ unlike the larger 10-bit sizes no byte-to-halfword widening is needed + vld1.16 {q0}, [r1 :64] @ Top + adr ip, nbh_3_0_1_4 + vldr d2, [r2, #8] @ Left (lower) + vldr d3, [ip,
#8] @ {1,2,3,4} +T lsl r3, #1 + vshl.s16 d4, d0, #2 + vdup.16 d1, d1[0] @ {t4,t4,t4,t4} + vldr d5, [r2] @ Left (upper) + vdup.16 d2, d2[0] @ {l4,l4,l4,l4} + vldr d6, [ip] @ {3,2,1,0} + vmla.i16 d4, d3, d1 @ Acc set up + vsub.i16 d0, d2, d0 @ Add set up + vmov d7, d6 + vdup.16 d2, d5[0] + vdup.16 d3, d5[1] + vdup.16 d16, d5[2] + vadd.i16 d18, d0, d4 + vshl.s16 d0, #1 @ x2 + vadd.i16 d19, d0, d4 + vdup.16 d17, d5[3] + vadd.i16 d4, d0, d18 +A add r1, r0, r3, lsl #1 +T add r1, r0, r3 + vadd.i16 d5, d0, d19 +A lsl r3, #2 +T lsl r3, #1 + vmla.i16 q9, q1, q3 + vmla.i16 q2, q8, q3 + vrshr.u16 q0, q9, #3 + vst1.16 {d0}, [r0], r3 + vrshr.u16 d2, d4, #3 + vst1.16 {d1}, [r1], r3 + vrshr.u16 d3, d5, #3 + vst1.16 {d2}, [r0] + vst1.16 {d3}, [r1] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_planar_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_8_neon_8, export=1 + + vld1.8 {q0}, [r1] @ Top + adr ip, nb_7_0_1_8 + vldr d2, [r2, #8] @ Left (lower) + mov r1, #8 + vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} + vshll.u8 q2, d0, #3 + vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} + vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} + vldr d6, [r2] @ Left (upper) + vmlal.u8 q2, d3, d1 + vsubl.u8 q0, d2, d0 + vldr d7, [ip] @ {7,6,5,4,3,2,1,0} + +@ u8 7..0 [1] d7 +@ u8 left[y] [1] d6 +@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] + + vdup.8 d2, d6[0] + vadd.i16 q2, q0 + vdup.8 d3, d6[1] + vadd.i16 q8, q2, q0 +1: + vmlal.u8 q2, d7, d2 + subs r1, #2 + vadd.i16 q9, q8, q0 + vmlal.u8 q8, d7, d3 + vdup.8 d2, d6[2] + vdup.8 d3, d6[3] + vrshrn.i16 d20, q2, #4 + vshr.u64 d6, #16 + vmov q2, q9 + vst1.8 {d20}, [r0], r3 + vrshrn.i16 d20, q8, #4 + vadd.i16 q8, q2, q0 + vst1.8 {d20}, [r0], r3 + bne 1b + + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t 
*_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_8_neon_10, export=1 + + adr ip, nb_7_0_1_8 + vld1.16 {q0}, [r1 :128]! @ Top (left) + lsl r3, #1 + vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} + add ip, r2, #16 + vld1.16 {d4[],d5[]}, [r1] @ Top (right) + mov r1, #8-2 + vshl.s16 q3, q0, #3 + vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} + vld1.16 {d18[],d19[]}, [ip] @ Left (lower) + vmla.i16 q3, q8, q2 @ Acc set up + vsub.i16 q0, q9, q0 @ Add set up + vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} + vadd.i16 q2, q3, q0 + +@ u16 7..0 [1] q1 +@ u32 left[y] [1] [r2] +@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] + + vld1.16 {d6[],d7[]}, [r2]! + vadd.i16 q8, q2, q0 + vld1.16 {d18[],d19[]}, [r2]! + vmla.i16 q2, q1, q3 + vadd.i16 q3, q8, q0 + vmla.i16 q8, q1, q9 +1: + vrshr.u16 q9, q2, #4 + subs r1, #2 + vmov q2, q3 + vrshr.u16 q10, q8, #4 + vld1.16 {d6[],d7[]}, [r2]! + vst1.16 {q9}, [r0 :128], r3 + vadd.i16 q8, q2, q0 + vld1.16 {d18[],d19[]}, [r2]! 
+ vmla.i16 q2, q1, q3 + vadd.i16 q3, q8, q0 + vmla.i16 q8, q1, q9 + vst1.16 {q10}, [r0 :128], r3 + bne 1b + + vrshr.u16 q9, q2, #4 + add r3, r0 + vrshr.u16 q10, q8, #4 + vst1.16 {q9}, [r0 :128] + vst1.16 {q10}, [r3 :128] + + bx lr +endfunc + + +@------------------------------------------------------------------------------ +@ +@ Data - has to be in two lumps to ensure we can always reach using adr + + .balign 64 + +nb_31_0_1_32: + .byte 31, 30, 29, 28, 27, 26, 25, 24 + .byte 23, 22, 21, 20, 19, 18, 17, 16 +nb_15_0_1_16: + .byte 15, 14, 13, 12, 11, 10, 9, 8 + .byte 7, 6, 5, 4, 3, 2, 1, 0 + .byte 1, 2, 3, 4, 5, 6, 7, 8 + .byte 9, 10, 11, 12, 13, 14, 15, 16 + .byte 17, 18, 19, 20, 21, 22, 23, 24 + .byte 25, 26, 27, 28, 29, 30, 31, 32 + + @ should be back on a 64-byte boundary here + + @ These could be extracted from the above array, but are kept + @ separate for better (16 byte) alignment +nb_3_0_1_4: + .byte 3, 2, 1, 0, 3, 2, 1, 0 + .byte 1, 2, 3, 4, 1, 2, 3, 4 +nb_7_0_1_8: + .byte 7, 6, 5, 4, 3, 2, 1, 0 + .byte 1, 2, 3, 4, 5, 6, 7, 8 +nbh_3_0_1_4: + .short 3, 2, 1, 0, 1, 2, 3, 4 + +@------------------------------------------------------------------------------ + + +@ ff_hevc_rpi_pred_planar_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_16_neon_8, export=1 + + adr ip, nb_15_0_1_16 + 16 + vld1.8 {q0}, [r1 :128]!
@ Top (left) + add r2, #16 + vld1.8 {q1}, [ip: 128] @ {1,2,3...16} + vld1.8 {d4[]}, [r1] @ Top (right) + sub ip, #16 + vshll.u8 q3, d0, #4 + mov r1, #16 + vshll.u8 q8, d1, #4 + vld1.8 {d5[]}, [r2] @ Left (lower) + sub r2, #16 + vmlal.u8 q3, d2, d4 + vmlal.u8 q8, d3, d4 @ Acc set up + vsubl.u8 q1, d5, d0 + vsubl.u8 q0, d5, d1 @ Add set up + vld1.8 {q2}, [ip :128] @ {15,14,13...0} + +@ u8 15..0 [1] q2 +@ u8 left[y] [1] [r2] +@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] + + vadd.i16 q3, q1 + vadd.i16 q8, q0 +1: + vadd.i16 q10, q3, q1 + subs r1, #2 + vld1.8 {d18[]}, [r2]! + vadd.i16 q11, q8, q0 + vld1.8 {d19[]}, [r2]! + vmlal.u8 q3, d4, d18 + vmlal.u8 q8, d5, d18 + vadd.i16 q12, q10, q1 + vmlal.u8 q10, d4, d19 + vadd.i16 q13, q11, q0 + vmlal.u8 q11, d5, d19 + vrshrn.u16 d18, q3, #5 + vrshrn.u16 d19, q8, #5 + vmov q3, q12 + vst1.8 {q9}, [r0 :128], r3 + vrshrn.u16 d18, q10, #5 + vrshrn.u16 d19, q11, #5 + vmov q8, q13 + vst1.8 {q9}, [r0 :128], r3 + bne 1b + + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_16_neon_10, export=1 + + @ Load from bytes & expand later - at the very least this uses less + @ memory than having a short table + adr ip, nb_15_0_1_16 + 16 + vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) + add r2, #32 + vld1.8 {q2}, [ip :128] @ {1,2,3...16} + lsl r3, #1 + vld1.16 {d6[],d7[]}, [r1] @ Top (right) + sub ip, #16 + vmovl.u8 q8, d4 + mov r1, #16 + vshl.i16 q9, q0, #4 + vmovl.u8 q2, d5 + vshl.i16 q10, q1, #4 + vld1.16 {d22[],d23[]}, [r2] @ Left (lower) + sub r2, #32 + vld1.8 {q12}, [ip] @ {15,14,13...0} + vmla.i16 q9, q8, q3 + vmla.i16 q10, q2, q3 @ Acc set up + vsub.i16 q0, q11, q0 + vsub.i16 q1, q11, q1 @ Add set up + vadd.i16 q2, q9, q0 + vadd.i16 q3, q10, q1 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + +@ u16 15..0 [2] q8,q9 +@ u32 left[y] [2] [r2] +@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] + +1: + vadd.i16 q10, q2, q0 + subs r1, #2 + vld1.16 {d24[],d25[]}, [r2]! + vadd.i16 q11, q3, q1 + vld1.16 {d28[],d29[]}, [r2]! + vmla.i16 q2, q8, q12 + vmla.i16 q3, q9, q12 + vadd.i16 q12, q10, q0 + vmla.i16 q10, q8, q14 + vadd.i16 q13, q11, q1 + vmla.i16 q11, q9, q14 + vrshr.u16 q14, q2, #5 + vrshr.u16 q15, q3, #5 + vmov q2, q12 + vst1.16 {q14-q15}, [r0 :128], r3 + vrshr.u16 q14, q10, #5 + vrshr.u16 q15, q11, #5 + vmov q3, q13 + vst1.16 {q14-q15}, [r0 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_planar_32_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_32_neon_8, export=1 + + vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) + adr ip, nb_31_0_1_32 + 32 + vpush {d8-d12} + vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} + add r2, #32 + vld1.8 {d8[]}, [r1] @ Top (right) + sub ip, #32 + vshll.u8 q8, d0, #5 + mov r1, #32 + vld1.8 {d9[]}, [r2] @ Left (lower) + sub r2, #32 + vshll.u8 q9, d1, #5 + vshll.u8 q10, d2, #5 + vshll.u8 q11, d3, #5 + vmlal.u8 q8, d4, d8 + vsubl.u8 q12, d9, d0 + vmlal.u8 q9, d5, d8 + vsubl.u8 q13, d9, d1 + vmlal.u8 q10, d6, d8 + vsubl.u8 q14, d9, d2 + vmlal.u8 q11, d7, d8 @ Acc set up + vsubl.u8 q15, d9, d3 @ Add set up + vadd.i16 q8, q12 + vadd.i16 q9, q13 + vadd.i16 q10, q14 + vadd.i16 q11, q15 + vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} + +@ u8 31..0 [2] q4,q5 +@ u8 left[y] [2] [r2] +@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] + + vld1.8 {d12[]}, [r2]! + vadd.i16 q0, q8, q12 + b 2f +1: + vld1.8 {d12[]}, [r2]! + vrshrn.u16 d3, q1, #6 + vrshrn.u16 d2, q0, #6 + vadd.i16 q0, q8, q12 + vrshrn.u16 d4, q2, #6 + vrshrn.u16 d5, q3, #6 + vst1.8 {q1-q2}, [r0 :128], r3 +2: vadd.i16 q1, q9, q13 + subs r1, #2 + vadd.i16 q2, q10, q14 + vadd.i16 q3, q11, q15 + vmlal.u8 q8, d8, d12 + vmlal.u8 q9, d9, d12 + vmlal.u8 q10, d10, d12 + vmlal.u8 q11, d11, d12 + vld1.8 {d12[]}, [r2]! 
+ vrshrn.u16 d19, q9, #6 + vrshrn.u16 d18, q8, #6 + vadd.i16 q8, q0, q12 + vrshrn.u16 d20, q10, #6 + vrshrn.u16 d21, q11, #6 + vst1.8 {q9-q10}, [r0 :128], r3 + vadd.i16 q9, q1, q13 + vadd.i16 q10, q2, q14 + vadd.i16 q11, q3, q15 + vmlal.u8 q0, d8, d12 + vmlal.u8 q1, d9, d12 + vmlal.u8 q2, d10, d12 + vmlal.u8 q3, d11, d12 + + bne 1b + + vpop {d8-d12} + + vrshrn.u16 d3, q1, #6 + vrshrn.u16 d2, q0, #6 + vrshrn.u16 d4, q2, #6 + vrshrn.u16 d5, q3, #6 + vst1.8 {q1-q2}, [r0 :128] + + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_32_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_32_neon_10, export=1 + + @ Load from bytes & expand later - at the very least this uses less + @ memory than having a short table + vld1.16 {q0-q1}, [r1 :128]! @ Top (left) + adr ip, nb_31_0_1_32 + 32 + vpush {q4-q7} + vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) + add r2, #64 + vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} +T lsl r3, #1 + vld1.16 {d8[],d9[]}, [r1] @ Top (right) + sub ip, #32 + vmovl.u8 q12, d28 + mov r1, #32 + vmovl.u8 q13, d29 + vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} + vmovl.u8 q14, d30 + vmovl.u8 q15, d31 + vld1.16 {d10[],d11[]}, [r2] @ Left (lower) + sub r2, #64 + vshl.i16 q8, q0, #5 + vshl.i16 q9, q1, #5 + vshl.i16 q10, q2, #5 + vshl.i16 q11, q3, #5 + vmla.i16 q8, q12, q4 + vsub.i16 q0, q5, q0 + vmla.i16 q9, q13, q4 + vsub.i16 q1, q5, q1 + vmla.i16 q10, q14, q4 + vmov.u16 ip, d0[0] + vsub.i16 q2, q5, q2 + vmla.i16 q11, q15, q4 @ Acc set up + vsub.i16 q3, q5, q3 @ Add set up + vadd.i16 q8, q0 + vadd.i16 q9, q1 + vadd.i16 q10, q2 + vadd.i16 q11, q3 + vmovl.u8 q4, d12 + vmovl.u8 q5, d13 + vmovl.u8 q6, d14 + vmovl.u8 q7, d15 + +@ u16 31..0 [4] q4-q7 +@ u16 left[y] [4] [r2] +@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] + + vadd.i16 q12, q8, q0 +A sub r0, r0, r3, lsl #1 +T sub r0, r3 +1: + vld1.16 {d0[0]}, [r2]! 
+A add r0, r0, r3, lsl #1 +T add r0, r3 + vadd.i16 q13, q9, q1 + subs r1, #2 + vadd.i16 q14, q10, q2 + vadd.i16 q15, q11, q3 + vmla.i16 q8, q4, d0[0] + vmla.i16 q9, q5, d0[0] + vmla.i16 q10, q6, d0[0] + vmla.i16 q11, q7, d0[0] + vmov.16 d0[0], ip + vrshr.u16 q8, #6 + vrshr.u16 q9, #6 + vrshr.u16 q10, #6 + vrshr.u16 q11, #6 + vstm r0, {q8-q11} + vadd.i16 q8, q12, q0 +A add r0, r0, r3, lsl #1 +T add r0, r3 + vld1.16 {d0[0]}, [r2]! + vadd.i16 q9, q13, q1 + vadd.i16 q10, q14, q2 + vadd.i16 q11, q15, q3 + vmla.i16 q12, q4, d0[0] + vmla.i16 q13, q5, d0[0] + vmla.i16 q14, q6, d0[0] + vmla.i16 q15, q7, d0[0] + vmov.16 d0[0], ip + vrshr.u16 q12, #6 + vrshr.u16 q13, #6 + vrshr.u16 q14, #6 + vrshr.u16 q15, #6 + vstm r0, {q12-q15} + vadd.i16 q12, q8, q0 + bne 1b + + vpop {q4-q7} + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_c_4_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 + + vld1.8 {q0}, [r1] @ Top + adr ip, nbx2_3_0_1_4 + vldr d2, [r2, #8] @ Left (lower) + mov r1, #4 + vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} + lsl r3, #1 + vshll.u8 q2, d0, #2 + vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} + vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} + vldr d6, [r2] @ Left (upper) + vmlal.u8 q2, d3, d1 + vsubl.u8 q0, d2, d0 + vldr d7, [ip] @ {3,3,2,2,1,1,0,0} + +@ u8 3..0 [1] d7 +@ u8 left[y] [1] d6 +@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] + + vdup.16 d2, d6[0] + vadd.i16 q2, q0 + vdup.16 d3, d6[1] + vadd.i16 q8, q2, q0 +1: + vmlal.u8 q2, d7, d2 + subs r1, #2 + vadd.i16 q9, q8, q0 + vmlal.u8 q8, d7, d3 + vdup.16 d2, d6[2] + vdup.16 d3, d6[3] + vrshrn.i16 d20, q2, #3 + vmov q2, q9 + vst1.8 {d20}, [r0], r3 + vrshrn.i16 d20, q8, #3 + vadd.i16 q8, q2, q0 + vst1.8 {d20}, [r0], r3 + bne 1b + + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_c_4_neon_10 +@ uint8_t *_src, [r0] 
+@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 + + adr ip, nbx2_3_0_1_4 + vld1.16 {q0}, [r1 :128]! @ Top (left) + lsl r3, #2 + vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} + add ip, r2, #16 + vld1.32 {d4[],d5[]}, [r1] @ Top (right) + vshl.s16 q3, q0, #2 + vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} + vld1.32 {d18[],d19[]}, [ip] @ Left (lower) + vmla.i16 q3, q8, q2 @ Acc set up + vsub.i16 q0, q9, q0 @ Add set up + vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} + vadd.i16 q2, q3, q0 + +@ u16 3..0 [1] q1 +@ u32 left[y] [1] [r2] +@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially +@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] + + vld1.32 {d6[],d7[]}, [r2]! + vadd.i16 q8, q2, q0 + vld1.32 {d18[],d19[]}, [r2]! + vmla.i16 q2, q1, q3 + vadd.i16 q3, q8, q0 + vmla.i16 q8, q1, q9 + + vrshr.u16 q9, q2, #3 + vmov q2, q3 + vrshr.u16 q10, q8, #3 + vld1.32 {d6[],d7[]}, [r2]! + vst1.16 {q9}, [r0 :128], r3 + vadd.i16 q8, q2, q0 + vld1.32 {d18[],d19[]}, [r2]! + vmla.i16 q2, q1, q3 + vadd.i16 q3, q8, q0 + vmla.i16 q8, q1, q9 + vst1.16 {q10}, [r0 :128], r3 + + vrshr.u16 q9, q2, #3 + add r3, r0 + vrshr.u16 q10, q8, #3 + vst1.16 {q9}, [r0 :128] + vst1.16 {q10}, [r3 :128] + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_planar_c_8_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 + + adr ip, nbx2_7_0_1_8 + 16 + vld1.8 {q0}, [r1 :128]! 
@ Top (left) + add r2, #16 + vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} + lsl r3, #1 + vld1.16 {d4[]}, [r1] @ Top (right) + sub ip, #16 + vshll.u8 q3, d0, #3 + mov r1, #8 + vshll.u8 q8, d1, #3 + vld1.16 {d5[]}, [r2] @ Left (lower) + sub r2, #16 + vmlal.u8 q3, d2, d4 + vmlal.u8 q8, d3, d4 @ Acc set up + vsubl.u8 q1, d5, d0 + vsubl.u8 q0, d5, d1 @ Add set up + vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} + +@ u8 7..0 [1] q2 +@ u8 left[y] [1] [r2] +@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially +@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] + + vadd.i16 q3, q1 + vadd.i16 q8, q0 +1: + vadd.i16 q10, q3, q1 + subs r1, #2 + vld1.16 {d18[]}, [r2]! + vadd.i16 q11, q8, q0 + vld1.16 {d19[]}, [r2]! + vmlal.u8 q3, d4, d18 + vmlal.u8 q8, d5, d18 + vadd.i16 q12, q10, q1 + vmlal.u8 q10, d4, d19 + vadd.i16 q13, q11, q0 + vmlal.u8 q11, d5, d19 + vrshrn.u16 d18, q3, #4 + vrshrn.u16 d19, q8, #4 + vmov q3, q12 + vst1.8 {q9}, [r0 :128], r3 + vrshrn.u16 d18, q10, #4 + vrshrn.u16 d19, q11, #4 + vmov q8, q13 + vst1.8 {q9}, [r0 :128], r3 + bne 1b + + bx lr + +endfunc + + +@------------------------------------------------------------------------------ +@ +@ Data - has to be in two lumps to ensure we can always reach using adr + + .balign 64 + +nbx2_15_0_1_16: + .byte 15, 15, 14, 14, 13, 13, 12, 12 + .byte 11, 11, 10, 10, 9, 9, 8, 8 +nbx2_7_0_1_8: + .byte 7, 7, 6, 6, 5, 5, 4, 4 + .byte 3, 3, 2, 2, 1, 1, 0, 0 + .byte 1, 1, 2, 2, 3, 3, 4, 4 + .byte 5, 5, 6, 6, 7, 7, 8, 8 + .byte 9, 9, 10, 10, 11, 11, 12, 12 + .byte 13, 13, 14, 14, 15, 15, 16, 16 + + @ should be back on a 64-byte boundary here + +nbx2_3_0_1_4: + .byte 3, 3, 2, 2, 1, 1, 0, 0 + .byte 1, 1, 2, 2, 3, 3, 4, 4 + +@------------------------------------------------------------------------------ + + +@ ff_hevc_rpi_pred_planar_c_8_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 + + @ Load from
bytes & expand later - at the very least this uses less + @ memory than having a short table + adr ip, nbx2_7_0_1_8 + 16 + vld1.16 {q0-q1}, [r1 :128]! @ Top (left) + add r2, #32 + vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} + lsl r3, #2 + vld1.32 {d6[],d7[]}, [r1] @ Top (right) + sub ip, #16 + vmovl.u8 q8, d4 + mov r1, #8 + vshl.i16 q9, q0, #3 + vmovl.u8 q2, d5 + vshl.i16 q10, q1, #3 + vld1.32 {d22[],d23[]}, [r2] @ Left (lower) + sub r2, #32 + vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} + vmla.i16 q9, q8, q3 + vmla.i16 q10, q2, q3 @ Acc set up + vsub.i16 q0, q11, q0 + vsub.i16 q1, q11, q1 @ Add set up + vadd.i16 q2, q9, q0 + vadd.i16 q3, q10, q1 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + +@ u16 7..0 [2] q8,q9 +@ u32 left[y] [2] [r2] +@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially +@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] + +1: + vadd.i16 q10, q2, q0 + subs r1, #2 + vld1.32 {d24[],d25[]}, [r2]! + vadd.i16 q11, q3, q1 + vld1.32 {d28[],d29[]}, [r2]! + vmla.i16 q2, q8, q12 + vmla.i16 q3, q9, q12 + vadd.i16 q12, q10, q0 + vmla.i16 q10, q8, q14 + vadd.i16 q13, q11, q1 + vmla.i16 q11, q9, q14 + vrshr.u16 q14, q2, #4 + vrshr.u16 q15, q3, #4 + vmov q2, q12 + vst1.16 {q14-q15}, [r0 :128], r3 + vrshr.u16 q14, q10, #4 + vrshr.u16 q15, q11, #4 + vmov q3, q13 + vst1.16 {q14-q15}, [r0 :128], r3 + bne 1b + + bx lr +endfunc + + +@ ff_hevc_rpi_pred_planar_c_16_neon_8 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 + + vld1.8 {q0-q1}, [r1 :128]!
@ Top (left) + adr ip, nbx2_15_0_1_16 + 32 + vpush {d8-d12} + vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} + add r2, #32 + vld1.16 {d8[]}, [r1] @ Top (right) + sub ip, #32 + vshll.u8 q8, d0, #4 + mov r1, #16 + vld1.16 {d9[]}, [r2] @ Left (lower) + sub r2, #32 + vshll.u8 q9, d1, #4 + lsl r3, #1 + vshll.u8 q10, d2, #4 + vshll.u8 q11, d3, #4 + vmlal.u8 q8, d4, d8 + vsubl.u8 q12, d9, d0 + vmlal.u8 q9, d5, d8 + vsubl.u8 q13, d9, d1 + vmlal.u8 q10, d6, d8 + vsubl.u8 q14, d9, d2 + vmlal.u8 q11, d7, d8 @ Acc set up + vsubl.u8 q15, d9, d3 @ Add set up + vadd.i16 q8, q12 + vadd.i16 q9, q13 + vadd.i16 q10, q14 + vadd.i16 q11, q15 + vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} + +@ u8 15..0 [2] q4,q5 +@ u8 left[y] [2] [r2] +@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially +@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] + + vld1.16 {d12[]}, [r2]! + vadd.i16 q0, q8, q12 + b 2f +1: + vld1.16 {d12[]}, [r2]! + vrshrn.u16 d3, q1, #5 + vrshrn.u16 d2, q0, #5 + vadd.i16 q0, q8, q12 + vrshrn.u16 d4, q2, #5 + vrshrn.u16 d5, q3, #5 + vst1.8 {q1-q2}, [r0 :128], r3 +2: vadd.i16 q1, q9, q13 + subs r1, #2 + vadd.i16 q2, q10, q14 + vadd.i16 q3, q11, q15 + vmlal.u8 q8, d8, d12 + vmlal.u8 q9, d9, d12 + vmlal.u8 q10, d10, d12 + vmlal.u8 q11, d11, d12 + vld1.16 {d12[]}, [r2]!
+ vrshrn.u16 d19, q9, #5 + vrshrn.u16 d18, q8, #5 + vadd.i16 q8, q0, q12 + vrshrn.u16 d20, q10, #5 + vrshrn.u16 d21, q11, #5 + vst1.8 {q9-q10}, [r0 :128], r3 + vadd.i16 q9, q1, q13 + vadd.i16 q10, q2, q14 + vadd.i16 q11, q3, q15 + vmlal.u8 q0, d8, d12 + vmlal.u8 q1, d9, d12 + vmlal.u8 q2, d10, d12 + vmlal.u8 q3, d11, d12 + + bne 1b + + vpop {d8-d12} + + vrshrn.u16 d3, q1, #5 + vrshrn.u16 d2, q0, #5 + vrshrn.u16 d4, q2, #5 + vrshrn.u16 d5, q3, #5 + vst1.8 {q1-q2}, [r0 :128] + + bx lr + +endfunc + + +@ ff_hevc_rpi_pred_planar_c_16_neon_10 +@ uint8_t *_src, [r0] +@ const uint8_t *_top, [r1] +@ const uint8_t *_left, [r2] +@ ptrdiff_t stride) [r3] + +function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 + + @ Load from bytes & expand later - at the very least this uses less + @ memory than having a short table + vld1.16 {q0-q1}, [r1 :128]! @ Top (left) + adr ip, nbx2_15_0_1_16 + 32 + vpush {q4-q7} + vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) + add r2, #64 + vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} +T lsl r3, #2 + vld1.32 {d8[],d9[]}, [r1] @ Top (right) + sub ip, #32 + vmovl.u8 q12, d28 + mov r1, #16 + vmovl.u8 q13, d29 + vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} + vmovl.u8 q14, d30 + vmovl.u8 q15, d31 + vld1.32 {d10[],d11[]}, [r2] @ Left (lower) + sub r2, #64 + vshl.i16 q8, q0, #4 + vshl.i16 q9, q1, #4 + vshl.i16 q10, q2, #4 + vshl.i16 q11, q3, #4 + vmla.i16 q8, q12, q4 + vsub.i16 q0, q5, q0 + vmla.i16 q9, q13, q4 + vpush {q0} + vsub.i16 q1, q5, q1 + vmla.i16 q10, q14, q4 + vsub.i16 q2, q5, q2 + vmla.i16 q11, q15, q4 @ Acc set up + vsub.i16 q3, q5, q3 @ Add set up + vadd.i16 q8, q0 + vadd.i16 q9, q1 + vadd.i16 q10, q2 + vadd.i16 q11, q3 + vmovl.u8 q4, d12 + vmovl.u8 q5, d13 + vmovl.u8 q6, d14 + vmovl.u8 q7, d15 + +@ u16 15..0 [4] q4-q7 +@ u16 left[y] [4] [r2] +@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially +@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] + + vadd.i16 q12, q8, q0 +A sub r0, r0, r3, lsl #2 +T sub r0, r3 +1: +
vld1.32 {d0[],d1[]}, [r2]! +A add r0, r0, r3, lsl #2 +T add r0, r3 + vadd.i16 q13, q9, q1 + subs r1, #2 + vadd.i16 q14, q10, q2 + vadd.i16 q15, q11, q3 + vmla.i16 q8, q4, q0 + vmla.i16 q9, q5, q0 + vmla.i16 q10, q6, q0 + vmla.i16 q11, q7, q0 + vld1.16 {q0}, [sp] + vrshr.u16 q8, #5 + vrshr.u16 q9, #5 + vrshr.u16 q10, #5 + vrshr.u16 q11, #5 + vstm r0, {q8-q11} + vadd.i16 q8, q12, q0 +A add r0, r0, r3, lsl #2 +T add r0, r3 + vld1.32 {d0[],d1[]}, [r2]! + vadd.i16 q9, q13, q1 + vadd.i16 q10, q14, q2 + vadd.i16 q11, q15, q3 + vmla.i16 q12, q4, q0 + vmla.i16 q13, q5, q0 + vmla.i16 q14, q6, q0 + vmla.i16 q15, q7, q0 + vld1.16 {q0}, [sp] + vrshr.u16 q12, #5 + vrshr.u16 q13, #5 + vrshr.u16 q14, #5 + vrshr.u16 q15, #5 + vstm r0, {q12-q15} + vadd.i16 q12, q8, q0 + bne 1b + + vpop {q3-q7} + bx lr + +endfunc diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c index 2cca784f5a..48cb816b70 100644 --- a/libavcodec/arm/vc1dsp_init_neon.c +++ b/libavcodec/arm/vc1dsp_init_neon.c @@ -19,6 +19,7 @@ #include #include "libavutil/attributes.h" +#include "libavutil/intreadwrite.h" #include "libavcodec/vc1dsp.h" #include "vc1dsp.h" @@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int rnd); @@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, 
uint8_t *src, ptrdiff_t stride, void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y); +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); + +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) +{ + /* Dealing with starting and stopping, and removing escape bytes, are + * comparatively less time-sensitive, so are more clearly expressed using + * a C wrapper around the assembly inner loop. Note that we assume a + * little-endian machine that supports unaligned loads. */ + int dsize = 0; + while (size >= 4) + { + int found = 0; + while (!found && (((uintptr_t) dst) & 7) && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + if (!found) + { + int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); + dst += skip; + src += skip; + size -= skip; + dsize += skip; + while (!found && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + } + if (found) + { + *dst++ = *src++; + *dst++ = *src++; + ++src; + size -= 3; + dsize += 2; + } + } + while (size > 0) + { + *dst++ = *src++; + --size; + ++dsize; + } + return dsize; +} + #define FN_ASSIGN(X, Y) \ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon @@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; + dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; + dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; + dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; + dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; + dsp->vc1_h_loop_filter16 = 
ff_vc1_h_loop_filter16_neon; + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; FN_ASSIGN(1, 0); FN_ASSIGN(2, 0); @@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; + + dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; } diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S index 93f043bf08..8e97bc5e58 100644 --- a/libavcodec/arm/vc1dsp_neon.S +++ b/libavcodec/arm/vc1dsp_neon.S @@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 vst1.32 {d1[1]}, [r0,:32] bx lr endfunc + +@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of lower block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter4_neon, export=1 + sub r3, r0, r1, lsl #2 + vldr d0, .Lcoeffs + vld1.32 {d1[0]}, [r0], r1 @ P5 + vld1.32 {d2[0]}, [r3], r1 @ P1 + vld1.32 {d3[0]}, [r3], r1 @ P2 + vld1.32 {d4[0]}, [r0], r1 @ P6 + vld1.32 {d5[0]}, [r3], r1 @ P3 + vld1.32 {d6[0]}, [r0], r1 @ P7 + vld1.32 {d7[0]}, [r3] @ P4 + vld1.32 {d16[0]}, [r0] @ P8 + vshll.u8 q9, d1, #1 @ 2*P5 + vdup.16 d17, r2 @ pq + vshll.u8 q10, d2, #1 @ 2*P1 + vmovl.u8 q11, d3 @ P2 + vmovl.u8 q1, d4 @ P6 + vmovl.u8 q12, d5 @ P3 + vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 + vmovl.u8 q11, d6 @ P7 + vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 + vshll.u8 q2, d5, #1 @ 2*P3 + vmovl.u8 q3, d7 @ P4 + vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 + vmovl.u8 q11, d16 @ P8 + vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 + vmovl.u8 q12, d1 @ P5 + vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 + vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 + vsub.i16 d1, d6, d24 @ P4-P5 + vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 + vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 + vmls.i16 d4, d2, d0[0] 
@ 2*P3-5*P4+5*P5-2*P6 + vabs.s16 d2, d1 + vrshr.s16 d3, d18, #3 + vrshr.s16 d5, d20, #3 + vshr.s16 d2, d2, #1 @ clip + vrshr.s16 d4, d4, #3 + vabs.s16 d3, d3 @ a2 + vshr.s16 d1, d1, #8 @ clip_sign + vabs.s16 d5, d5 @ a1 + vceq.i16 d7, d2, #0 @ test clip == 0 + vabs.s16 d16, d4 @ a0 + vshr.s16 d4, d4, #8 @ a0_sign + vcge.s16 d18, d5, d3 @ test a1 >= a2 + vcge.s16 d17, d16, d17 @ test a0 >= pq + vbsl d18, d3, d5 @ a3 + vsub.i16 d1, d1, d4 @ clip_sign - a0_sign + vorr d3, d7, d17 @ test clip == 0 || a0 >= pq + vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 d5, d18, d16 @ test a3 >= a0 + vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 + vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 + vmov.32 r0, d4[1] @ move to gp reg + vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 + vcge.s16 d4, d0, d2 + tst r0, #1 + bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered + vbsl d4, d2, d0 @ FFMIN(d, clip) + vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + vqmovun.s16 d0, q3 + vqmovun.s16 d1, q12 + vst1.32 {d0[0]}, [r3], r1 + vst1.32 {d1[0]}, [r3] +1: bx lr +endfunc + +@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of right block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter4_neon, export=1 + sub r3, r0, #4 @ where to start reading + vldr d0, .Lcoeffs + vld1.32 {d2}, [r3], r1 + sub r0, r0, #1 @ where to start writing + vld1.32 {d4}, [r3], 
r1 + vld1.32 {d3}, [r3], r1 + vld1.32 {d5}, [r3] + vdup.16 d1, r2 @ pq + vtrn.8 q1, q2 + vtrn.16 d2, d3 @ P1, P5, P3, P7 + vtrn.16 d4, d5 @ P2, P6, P4, P8 + vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 + vmovl.u8 q8, d4 @ P2, P6 + vmovl.u8 q9, d3 @ P3, P7 + vmovl.u8 q2, d5 @ P4, P8 + vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 + vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 + vmovl.u8 q1, d2 @ P1, P5 + vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 + vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 + vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later + vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 + vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 + vsub.i16 d3, d4, d2 @ P4-P5 + vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 + vrshr.s16 q3, q3, #3 + vabs.s16 d5, d3 + vshr.s16 d3, d3, #8 @ clip_sign + vrshr.s16 d16, d20, #3 + vabs.s16 q3, q3 @ a1, a2 + vshr.s16 d5, d5, #1 @ clip + vabs.s16 d17, d16 @ a0 + vceq.i16 d18, d5, #0 @ test clip == 0 + vshr.s16 d16, d16, #8 @ a0_sign + vcge.s16 d19, d6, d7 @ test a1 >= a2 + vcge.s16 d1, d17, d1 @ test a0 >= pq + vsub.i16 d16, d3, d16 @ clip_sign - a0_sign + vbsl d19, d7, d6 @ a3 + vorr d1, d18, d1 @ test clip == 0 || a0 >= pq + vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 d6, d19, d17 @ test a3 >= a0 @ + vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 + vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 + vmov.32 r2, d3[1] @ move to gp reg + vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 + vcge.s16 d3, d0, d5 + tst r2, #1 + bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered + vbsl d3, d5, d0 @ FFMIN(d, clip) + vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + vqmovun.s16 d1, q1 + vqmovun.s16 d0, q2 + vst2.8 {d0[0], d1[0]}, [r0], r1 + vst2.8 {d0[1], d1[1]}, [r0], r1 + vst2.8 {d0[2], d1[2]}, [r0], r1 + vst2.8 {d0[3], d1[3]}, [r0] +1: bx lr +endfunc + +@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of lower block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter8_neon, export=1 + sub r3, r0, r1, lsl #2 + vldr d0, .Lcoeffs + vld1.32 {d1}, [r0], r1 @ P5 + vld1.32 {d2}, [r3], r1 @ P1 + vld1.32 {d3}, [r3], r1 @ P2 + vld1.32 {d4}, [r0], r1 @ P6 + vld1.32 {d5}, [r3], r1 @ P3 + vld1.32 {d6}, [r0], r1 @ P7 + vshll.u8 q8, d1, #1 @ 2*P5 + vshll.u8 q9, d2, #1 @ 2*P1 + vld1.32 {d7}, [r3] @ P4 + vmovl.u8 q1, d3 @ P2 + vld1.32 {d20}, [r0] @ P8 + vmovl.u8 q11, d4 @ P6 + vdup.16 q12, r2 @ pq + vmovl.u8 q13, d5 @ P3 + vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 + vmovl.u8 q1, d6 @ P7 + vshll.u8 q2, d5, #1 @ 2*P3 + vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 + vmovl.u8 q3, d7 @ P4 + vmovl.u8 q10, d20 @ P8 + vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 + vmovl.u8 q1, d1 @ P5 + vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 + vsub.i16 q13, q3, q1 @ P4-P5 + vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 + vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 + vabs.s16 q10, q13 + vshr.s16 q13, q13, #8 @ clip_sign + vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 + vshr.s16 q10, q10, #1 @ clip + vmla.i16 q2, q1, d0[1] @ 
2*P3-5*P4+5*P5 + vrshr.s16 q8, q8, #3 + vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 + vceq.i16 q11, q10, #0 @ test clip == 0 + vrshr.s16 q9, q9, #3 + vabs.s16 q8, q8 @ a2 + vabs.s16 q9, q9 @ a1 + vrshr.s16 q2, q2, #3 + vcge.s16 q14, q9, q8 @ test a1 >= a2 + vabs.s16 q15, q2 @ a0 + vshr.s16 q2, q2, #8 @ a0_sign + vbsl q14, q8, q9 @ a3 + vcge.s16 q8, q15, q12 @ test a0 >= pq + vsub.i16 q2, q13, q2 @ clip_sign - a0_sign + vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 q12, q14, q15 @ test a3 >= a0 + vorr q8, q11, q8 @ test clip == 0 || a0 >= pq + vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 + vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 + vshl.i64 q11, q9, #16 + vmov.32 r0, d18[1] @ move to gp reg + vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 + vmov.32 r2, d19[1] + vshr.s64 q9, q11, #48 + vcge.s16 q11, q0, q10 + vorr q8, q8, q9 + and r0, r0, r2 + vbsl q11, q10, q0 @ FFMIN(d, clip) + tst r0, #1 + bne 1f @ none of the 8 pixel pairs should be updated in this case + vbic q0, q11, q8 @ set each d to zero if it should not be filtered + vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + vqmovun.s16 d0, q3 + vqmovun.s16 d1, q1 + vst1.32 {d0}, [r3], r1 + vst1.32 {d1}, [r3] +1: bx lr +endfunc + +.align 5 +.Lcoeffs: +.quad 0x00050002 + +@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of right block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter8_neon, export=1 + push {lr} + sub r3, r0, #4 @ where to start reading + vldr d0, .Lcoeffs + vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... 
+ sub r0, r0, #1 @ where to start writing + vld1.32 {d4}, [r3], r1 + add r12, r0, r1, lsl #2 + vld1.32 {d3}, [r3], r1 + vld1.32 {d5}, [r3], r1 + vld1.32 {d6}, [r3], r1 + vld1.32 {d16}, [r3], r1 + vld1.32 {d7}, [r3], r1 + vld1.32 {d17}, [r3] + vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... + vdup.16 q9, r2 @ pq + vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... + vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... + vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... + vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... + vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... + vtrn.32 d2, d6 @ P1, P5 + vtrn.32 d4, d16 @ P2, P6 + vtrn.32 d3, d7 @ P3, P7 + vtrn.32 d5, d17 @ P4, P8 + vshll.u8 q10, d2, #1 @ 2*P1 + vshll.u8 q11, d6, #1 @ 2*P5 + vmovl.u8 q12, d4 @ P2 + vmovl.u8 q13, d16 @ P6 + vmovl.u8 q14, d3 @ P3 + vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 + vmovl.u8 q12, d7 @ P7 + vshll.u8 q1, d3, #1 @ 2*P3 + vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 + vmovl.u8 q2, d5 @ P4 + vmovl.u8 q8, d17 @ P8 + vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 + vmovl.u8 q3, d6 @ P5 + vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 + vsub.i16 q12, q2, q3 @ P4-P5 + vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 + vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 + vabs.s16 q8, q12 + vshr.s16 q12, q12, #8 @ clip_sign + vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 + vshr.s16 q8, q8, #1 @ clip + vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 + vrshr.s16 q11, q11, #3 + vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 + vceq.i16 q13, q8, #0 @ test clip == 0 + vrshr.s16 q10, q10, #3 + vabs.s16 q11, q11 @ a2 + vabs.s16 q10, q10 @ a1 + vrshr.s16 q1, q1, #3 + vcge.s16 q14, q10, q11 @ test a1 >= a2 + vabs.s16 q15, q1 @ a0 + vshr.s16 q1, q1, #8 @ a0_sign + vbsl q14, q11,
q10 @ a3 + vcge.s16 q9, q15, q9 @ test a0 >= pq + vsub.i16 q1, q12, q1 @ clip_sign - a0_sign + vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 q11, q14, q15 @ test a3 >= a0 + vorr q9, q13, q9 @ test clip == 0 || a0 >= pq + vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 + vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 + vmov.32 r2, d20[1] @ move to gp reg + vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 + vmov.32 r3, d21[1] + vcge.s16 q10, q0, q8 + and r14, r2, r3 + vbsl q10, q8, q0 @ FFMIN(d, clip) + tst r14, #1 + bne 2f @ none of the 8 pixel pairs should be updated in this case + vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) + vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 + vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 + vqmovun.s16 d1, q3 + vqmovun.s16 d0, q2 + tst r2, #1 + bne 1f @ none of the first 4 pixel pairs should be updated if so + vst2.8 {d0[0], d1[0]}, [r0], r1 + vst2.8 {d0[1], d1[1]}, [r0], r1 + vst2.8 {d0[2], d1[2]}, [r0], r1 + vst2.8 {d0[3], d1[3]}, [r0] +1: tst r3, #1 + bne 2f @ none of the second 4 pixel pairs should be updated if so + vst2.8 {d0[4], d1[4]}, [r12], r1 + vst2.8 {d0[5], d1[5]}, [r12], r1 + vst2.8 {d0[6], d1[6]}, [r12], r1 + vst2.8 {d0[7], d1[7]}, [r12] +2: pop {pc} +endfunc + +@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of lower block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_v_loop_filter16_neon, export=1 + vpush {d8-d15} + sub r3, r0, r1, lsl #2 + vldr d0, .Lcoeffs + vld1.64 {q1}, [r0], r1 @ P5 + vld1.64 {q2}, 
[r3], r1 @ P1 + vld1.64 {q3}, [r3], r1 @ P2 + vld1.64 {q4}, [r0], r1 @ P6 + vld1.64 {q5}, [r3], r1 @ P3 + vld1.64 {q6}, [r0], r1 @ P7 + vshll.u8 q7, d2, #1 @ 2*P5[0..7] + vshll.u8 q8, d4, #1 @ 2*P1[0..7] + vld1.64 {q9}, [r3] @ P4 + vmovl.u8 q10, d6 @ P2[0..7] + vld1.64 {q11}, [r0] @ P8 + vmovl.u8 q12, d8 @ P6[0..7] + vdup.16 q13, r2 @ pq + vshll.u8 q2, d5, #1 @ 2*P1[8..15] + vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] + vshll.u8 q10, d3, #1 @ 2*P5[8..15] + vmovl.u8 q3, d7 @ P2[8..15] + vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] + vmovl.u8 q4, d9 @ P6[8..15] + vmovl.u8 q14, d10 @ P3[0..7] + vmovl.u8 q15, d12 @ P7[0..7] + vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] + vshll.u8 q3, d10, #1 @ 2*P3[0..7] + vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] + vmovl.u8 q6, d13 @ P7[8..15] + vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] + vmovl.u8 q14, d18 @ P4[0..7] + vmovl.u8 q9, d19 @ P4[8..15] + vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] + vmovl.u8 q15, d11 @ P3[8..15] + vshll.u8 q5, d11, #1 @ 2*P3[8..15] + vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] + vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] + vmovl.u8 q15, d22 @ P8[0..7] + vmovl.u8 q11, d23 @ P8[8..15] + vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] + vmovl.u8 q6, d2 @ P5[0..7] + vmovl.u8 q1, d3 @ P5[8..15] + vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] + vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] + vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] + vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7] + vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] + vrshr.s16 q8, q8, #3 + vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] + vrshr.s16 q7, q7, #3 + vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] + vabs.s16 q11, q15 + vabs.s16 q8, q8 @ a1[0..7] + vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] + vshr.s16 q15, q15, #8 @ 
clip_sign[0..7] + vrshr.s16 q2, q2, #3 + vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] + vabs.s16 q7, q7 @ a2[0..7] + vrshr.s16 q10, q10, #3 + vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] + vshr.s16 q11, q11, #1 @ clip[0..7] + vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] + vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] + vabs.s16 q2, q2 @ a1[8..15] + vrshr.s16 q3, q3, #3 + vabs.s16 q10, q10 @ a2[8..15] + vbsl q4, q7, q8 @ a3[0..7] + vabs.s16 q7, q12 + vshr.s16 q8, q12, #8 @ clip_sign[8..15] + vrshr.s16 q5, q5, #3 + vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8..15] + vshr.s16 q7, q7, #1 @ clip[8..15] + vbsl q12, q10, q2 @ a3[8..15] + vabs.s16 q2, q3 @ a0[0..7] + vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 + vshr.s16 q3, q3, #8 @ a0_sign[0..7] + vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] + vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq + vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq + vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] + vabs.s16 q4, q5 @ a0[8..15] + vshr.s16 q5, q5, #8 @ a0_sign[8..15] + vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 + vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq + vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] + vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] + vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 + vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 + vmov.32 r0, d4[1] @ move to gp reg + vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq + vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ?
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vmov.32 r2, d5[1] + vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] + vshl.i64 q2, q2, #16 + vcge.s16 q12, q15, q11 + vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 + vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] + vshr.s64 q2, q2, #48 + and r0, r0, r2 + vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) + vshl.i64 q11, q4, #16 + vmov.32 r2, d8[1] + vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 + vorr q2, q10, q2 + vmov.32 r12, d9[1] + vshr.s64 q4, q11, #48 + vcge.s16 q10, q0, q7 + vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) + vorr q4, q8, q4 + and r2, r2, r12 + vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) + vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] + and r0, r0, r2 + vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) + tst r0, #1 + bne 1f @ none of the 16 pixel pairs should be updated in this case + vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] + vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] + vqmovun.s16 d4, q14 + vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] + vqmovun.s16 d0, q6 + vqmovun.s16 d5, q9 + vqmovun.s16 d1, q1 + vst1.64 {q2}, [r3], r1 + vst1.64 {q0}, [r3] +1: vpop {d8-d15} + bx lr 
+endfunc + +@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks +@ On entry: +@ r0 -> top-left pel of right block +@ r1 = row stride, bytes +@ r2 = PQUANT bitstream parameter +function ff_vc1_h_loop_filter16_neon, export=1 + push {r4-r6,lr} + vpush {d8-d15} + sub r3, r0, #4 @ where to start reading + vldr d0, .Lcoeffs + vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... + sub r0, r0, #1 @ where to start writing + vld1.32 {d3}, [r3], r1 + add r4, r0, r1, lsl #2 + vld1.32 {d10}, [r3], r1 + vld1.32 {d11}, [r3], r1 + vld1.32 {d16}, [r3], r1 + vld1.32 {d4}, [r3], r1 + vld1.32 {d8}, [r3], r1 + vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... + vld1.32 {d14}, [r3], r1 + vld1.32 {d5}, [r3], r1 + vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... + vld1.32 {d6}, [r3], r1 + vld1.32 {d12}, [r3], r1 + vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... + vld1.32 {d13}, [r3], r1 + vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... + vld1.32 {d1}, [r3], r1 + vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... + vld1.32 {d7}, [r3], r1 + vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... + vld1.32 {d9}, [r3], r1 + vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... + vld1.32 {d15}, [r3] + vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... + vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... + vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... + vdup.16 q9, r2 @ pq + vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... + vtrn.32 d2, d16 @ P1[0..7], P5[0..7] + vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... + vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... + vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... 
P2[14], P2[15], P4[14]... + vtrn.32 d3, d4 @ P2[0..7], P6[0..7] + vshll.u8 q10, d2, #1 @ 2*P1[0..7] + vtrn.32 d10, d8 @ P3[0..7], P7[0..7] + vshll.u8 q11, d16, #1 @ 2*P5[0..7] + vtrn.32 d11, d14 @ P4[0..7], P8[0..7] + vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... + vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... + vmovl.u8 q1, d3 @ P2[0..7] + vmovl.u8 q12, d4 @ P6[0..7] + vtrn.32 d5, d1 @ P1[8..15], P5[8..15] + vtrn.32 d6, d7 @ P2[8..15], P6[8..15] + vtrn.32 d12, d9 @ P3[8..15], P7[8..15] + vtrn.32 d13, d15 @ P4[8..15], P8[8..15] + vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] + vmovl.u8 q1, d10 @ P3[0..7] + vshll.u8 q2, d5, #1 @ 2*P1[8..15] + vshll.u8 q13, d1, #1 @ 2*P5[8..15] + vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] + vmovl.u8 q14, d6 @ P2[8..15] + vmovl.u8 q3, d7 @ P6[8..15] + vmovl.u8 q15, d8 @ P7[0..7] + vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] + vmovl.u8 q1, d12 @ P3[8..15] + vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] + vmovl.u8 q4, d9 @ P7[8..15] + vshll.u8 q14, d10, #1 @ 2*P3[0..7] + vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] + vmovl.u8 q5, d11 @ P4[0..7] + vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] + vshll.u8 q15, d12, #1 @ 2*P3[8..15] + vmovl.u8 q6, d13 @ P4[8..15] + vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] + vmovl.u8 q1, d14 @ P8[0..7] + vmovl.u8 q7, d15 @ P8[8..15] + vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] + vmovl.u8 q4, d16 @ P5[0..7] + vmovl.u8 q8, d1 @ P5[8..15] + vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] + vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] + vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] + vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] + vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] + vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] + vrshr.s16 q10, q10, #3 
+ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] + vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] + vrshr.s16 q11, q11, #3 + vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] + vrshr.s16 q2, q2, #3 + vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] + vabs.s16 q10, q10 @ a1[0..7] + vrshr.s16 q13, q13, #3 + vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] + vabs.s16 q3, q11 @ a2[0..7] + vabs.s16 q2, q2 @ a1[8..15] + vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] + vabs.s16 q11, q1 + vabs.s16 q12, q13 @ a2[8..15] + vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] + vshr.s16 q1, q1, #8 @ clip_sign[0..7] + vrshr.s16 q15, q15, #3 + vshr.s16 q11, q11, #1 @ clip[0..7] + vrshr.s16 q14, q14, #3 + vbsl q13, q3, q10 @ a3[0..7] + vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8..15] + vabs.s16 q10, q15 @ a0[8..15] + vshr.s16 q15, q15, #8 @ a0_sign[8..15] + vbsl q3, q12, q2 @ a3[8..15] + vabs.s16 q2, q14 @ a0[0..7] + vabs.s16 q12, q7 + vshr.s16 q7, q7, #8 @ clip_sign[8..15] + vshr.s16 q14, q14, #8 @ a0_sign[0..7] + vshr.s16 q12, q12, #1 @ clip[8..15] + vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] + vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] + vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq + vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq + vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] + vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) + vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] + vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 
5*(a0[8..15]-a3[8..15]) : 0 + vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 + vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 + vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq + vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 + vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 + vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] + vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 + vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq + vcge.s16 q14, q13, q12 + vmov.32 r2, d4[1] @ move to gp reg + vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] + vmov.32 r3, d5[1] + vcge.s16 q2, q0, q11 + vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) + vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) + vmov.32 r5, d6[1] + vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) + vmov.32 r6, d7[1] + and r12, r2, r3 + vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) + vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 + vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 + and r14, r5, r6 + vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 + and r12, r12, r14 + vqmovun.s16 d4, q6 + vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 + tst r12, #1 + bne 4f @ none of the 16 pixel pairs should be updated in this case + vqmovun.s16 d2, q5 + vqmovun.s16 d3, q4 + vqmovun.s16 d5, q8 
+ tst r2, #1 + bne 1f + vst2.8 {d2[0], d3[0]}, [r0], r1 + vst2.8 {d2[1], d3[1]}, [r0], r1 + vst2.8 {d2[2], d3[2]}, [r0], r1 + vst2.8 {d2[3], d3[3]}, [r0] +1: add r0, r4, r1, lsl #2 + tst r3, #1 + bne 2f + vst2.8 {d2[4], d3[4]}, [r4], r1 + vst2.8 {d2[5], d3[5]}, [r4], r1 + vst2.8 {d2[6], d3[6]}, [r4], r1 + vst2.8 {d2[7], d3[7]}, [r4] +2: add r4, r0, r1, lsl #2 + tst r5, #1 + bne 3f + vst2.8 {d4[0], d5[0]}, [r0], r1 + vst2.8 {d4[1], d5[1]}, [r0], r1 + vst2.8 {d4[2], d5[2]}, [r0], r1 + vst2.8 {d4[3], d5[3]}, [r0] +3: tst r6, #1 + bne 4f + vst2.8 {d4[4], d5[4]}, [r4], r1 + vst2.8 {d4[5], d5[5]}, [r4], r1 + vst2.8 {d4[6], d5[6]}, [r4], r1 + vst2.8 {d4[7], d5[7]}, [r4] +4: vpop {d8-d15} + pop {r4-r6,pc} +endfunc + +@ Copy at most the specified number of bytes from source to destination buffer, +@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence +@ On entry: +@ r0 -> source buffer +@ r1 = max number of bytes to copy +@ r2 -> destination buffer, optimally 8-byte aligned +@ On exit: +@ r0 = number of bytes not copied +function ff_vc1_unescape_buffer_helper_neon, export=1 + @ Offset by 48 to screen out cases that are too short for us to handle, + @ and also make it easy to test for loop termination, or to determine + @ whether we need an odd number of half-iterations of the loop. + subs r1, r1, #48 + bmi 90f + + @ Set up useful constants + vmov.i32 q0, #0x3000000 + vmov.i32 q1, #0x30000 + + tst r1, #16 + bne 1f + + vld1.8 {q8, q9}, [r0]! + vbic q12, q8, q0 + vext.8 q13, q8, q9, #1 + vext.8 q14, q8, q9, #2 + vext.8 q15, q8, q9, #3 + veor q12, q12, q1 + vbic q13, q13, q0 + vbic q14, q14, q0 + vbic q15, q15, q0 + vceq.i32 q12, q12, #0 + veor q13, q13, q1 + veor q14, q14, q1 + veor q15, q15, q1 + vceq.i32 q13, q13, #0 + vceq.i32 q14, q14, #0 + vceq.i32 q15, q15, #0 + add r1, r1, #16 + b 3f + +1: vld1.8 {q10, q11}, [r0]! 
+ vbic q12, q10, q0 + vext.8 q13, q10, q11, #1 + vext.8 q14, q10, q11, #2 + vext.8 q15, q10, q11, #3 + veor q12, q12, q1 + vbic q13, q13, q0 + vbic q14, q14, q0 + vbic q15, q15, q0 + vceq.i32 q12, q12, #0 + veor q13, q13, q1 + veor q14, q14, q1 + veor q15, q15, q1 + vceq.i32 q13, q13, #0 + vceq.i32 q14, q14, #0 + vceq.i32 q15, q15, #0 + @ Drop through... +2: vmov q8, q11 + vld1.8 {q9}, [r0]! + vorr q13, q12, q13 + vorr q15, q14, q15 + vbic q12, q8, q0 + vorr q3, q13, q15 + vext.8 q13, q8, q9, #1 + vext.8 q14, q8, q9, #2 + vext.8 q15, q8, q9, #3 + veor q12, q12, q1 + vorr d6, d6, d7 + vbic q13, q13, q0 + vbic q14, q14, q0 + vbic q15, q15, q0 + vceq.i32 q12, q12, #0 + vmov r3, r12, d6 + veor q13, q13, q1 + veor q14, q14, q1 + veor q15, q15, q1 + vceq.i32 q13, q13, #0 + vceq.i32 q14, q14, #0 + vceq.i32 q15, q15, #0 + orrs r3, r3, r12 + bne 90f + vst1.64 {q10}, [r2]! +3: vmov q10, q9 + vld1.8 {q11}, [r0]! + vorr q13, q12, q13 + vorr q15, q14, q15 + vbic q12, q10, q0 + vorr q3, q13, q15 + vext.8 q13, q10, q11, #1 + vext.8 q14, q10, q11, #2 + vext.8 q15, q10, q11, #3 + veor q12, q12, q1 + vorr d6, d6, d7 + vbic q13, q13, q0 + vbic q14, q14, q0 + vbic q15, q15, q0 + vceq.i32 q12, q12, #0 + vmov r3, r12, d6 + veor q13, q13, q1 + veor q14, q14, q1 + veor q15, q15, q1 + vceq.i32 q13, q13, #0 + vceq.i32 q14, q14, #0 + vceq.i32 q15, q15, #0 + orrs r3, r3, r12 + bne 91f + vst1.64 {q8}, [r2]! + subs r1, r1, #32 + bpl 2b + +90: add r0, r1, #48 + bx lr + +91: sub r1, r1, #16 + b 90b +endfunc diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 8a71c04230..53644506e5 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -2595,6 +2595,17 @@ typedef struct AVHWAccel { * that avctx->hwaccel_priv_data is invalid. 
*/ + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); + + /** + * Called if parsing fails + * + * An error has occurred, end_frame will not be called + * start_frame & decode_slice may or may not have been called + * Optional + * + * @param avctx the codec context + */ + void (*abort_frame)(AVCodecContext *avctx); } AVHWAccel; /** diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c index c7efe7e77b..46766244b8 100644 --- a/libavcodec/blockdsp.c +++ b/libavcodec/blockdsp.c @@ -65,6 +65,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) c->fill_block_tab[0] = fill_block16_c; c->fill_block_tab[1] = fill_block8_c; + if (ARCH_AARCH64) + ff_blockdsp_init_aarch64(c); if (ARCH_ALPHA) ff_blockdsp_init_alpha(c); if (ARCH_ARM) diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h index 26fc2ea13b..fe539491da 100644 --- a/libavcodec/blockdsp.h +++ b/libavcodec/blockdsp.h @@ -41,6 +41,7 @@ typedef struct BlockDSPContext { void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); +void ff_blockdsp_init_aarch64(BlockDSPContext *c); void ff_blockdsp_init_alpha(BlockDSPContext *c); void ff_blockdsp_init_arm(BlockDSPContext *c); void ff_blockdsp_init_ppc(BlockDSPContext *c); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 38d06b2842..bbf5d70560 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -44,6 +44,10 @@ typedef struct CABACContext{ const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; + struct { + uint16_t bits; + uint16_t range; + } by22; }CABACContext; int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); diff --git a/libavcodec/codec.h b/libavcodec/codec.h index 50a22f6e3c..5acf572ef4 100644 --- a/libavcodec/codec.h +++ b/libavcodec/codec.h @@ -367,6 +367,17 @@ const AVCodec *av_codec_iterate(void **opaque); */ AVCodec *avcodec_find_decoder(enum AVCodecID id); +/** + * Find a registered decoder with a matching codec ID and 
pix_fmt. + * A decoder with pix_fmt set to NULL will match any fmt. + * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL. + * + * @param id AVCodecID of the requested decoder + * @param fmt AVPixelFormat that must be supported by decoder + * @return A decoder if one was found, NULL otherwise. + */ +AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt); + /** * Find a registered decoder with the specified name. * diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h new file mode 100644 index 0000000000..72cbba0953 --- /dev/null +++ b/libavcodec/hevc-ctrls-v1.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define 
V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 +#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 rps; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 num_active_dpb_entries; + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 num_rps_poc_st_curr_before; + __u8 num_rps_poc_st_curr_after; + __u8 num_rps_poc_lt_curr; + + __u8 padding; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +#endif diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h new file mode 100644 index 0000000000..7cbbbf055f --- /dev/null +++ b/libavcodec/hevc-ctrls-v2.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 +#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) 
+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 +#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 +#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 rps; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA 
(1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 padding[5]; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u8 num_active_dpb_entries; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ +#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) +/* + * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - + * the number of data (in bits) to skip in the + * slice segment header. + * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" + * to before syntax element "slice_temporal_mvp_enabled_flag". + * If IDR, the skipped bits are just "pic_output_flag" + * (separate_colour_plane_flag is not supported). + */ +#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +#endif diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h new file mode 100644 index 0000000000..4e35bd583d --- /dev/null +++ b/libavcodec/hevc-ctrls-v3.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are the HEVC state controls for use with stateless HEVC + * codec drivers. + * + * It turns out that these structs are not stable yet and will undergo + * more changes. 
So keep them private until they are stable and ready to + * become part of the official public API. + */ + +#ifndef _HEVC_CTRLS_H_ +#define _HEVC_CTRLS_H_ + +#include + +/* The pixel format isn't stable at the moment and will likely be renamed. */ +#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ + +#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008) +#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009) +#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010) +#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012) +#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015) +#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016) + +/* enum v4l2_ctrl_type type values */ +#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 +#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 +#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 +#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 +#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124 + +enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, +}; + +enum v4l2_mpeg_video_hevc_start_code { + V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, + V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, +}; + +#define V4L2_HEVC_SLICE_TYPE_B 0 +#define V4L2_HEVC_SLICE_TYPE_P 1 +#define V4L2_HEVC_SLICE_TYPE_I 2 + +#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) +#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) +#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) +#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) +#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) +#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) +#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) +#define 
V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) + +/* The controls are not stable at the moment and will likely be reworked. */ +struct v4l2_ctrl_hevc_sps { + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */ + __u16 pic_width_in_luma_samples; + __u16 pic_height_in_luma_samples; + __u8 bit_depth_luma_minus8; + __u8 bit_depth_chroma_minus8; + __u8 log2_max_pic_order_cnt_lsb_minus4; + __u8 sps_max_dec_pic_buffering_minus1; + __u8 sps_max_num_reorder_pics; + __u8 sps_max_latency_increase_plus1; + __u8 log2_min_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_luma_coding_block_size; + __u8 log2_min_luma_transform_block_size_minus2; + __u8 log2_diff_max_min_luma_transform_block_size; + __u8 max_transform_hierarchy_depth_inter; + __u8 max_transform_hierarchy_depth_intra; + __u8 pcm_sample_bit_depth_luma_minus1; + __u8 pcm_sample_bit_depth_chroma_minus1; + __u8 log2_min_pcm_luma_coding_block_size_minus3; + __u8 log2_diff_max_min_pcm_luma_coding_block_size; + __u8 num_short_term_ref_pic_sets; + __u8 num_long_term_ref_pics_sps; + __u8 chroma_format_idc; + __u8 sps_max_sub_layers_minus1; + + __u64 flags; +}; + +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0) +#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) +#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) +#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) +#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) +#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) +#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) +#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) +#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) +#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) +#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) +#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) +#define 
V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) +#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) +#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) +#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) +#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) +#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19) +#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20) + +struct v4l2_ctrl_hevc_pps { + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ + __u8 num_extra_slice_header_bits; + __u8 num_ref_idx_l0_default_active_minus1; + __u8 num_ref_idx_l1_default_active_minus1; + __s8 init_qp_minus26; + __u8 diff_cu_qp_delta_depth; + __s8 pps_cb_qp_offset; + __s8 pps_cr_qp_offset; + __u8 num_tile_columns_minus1; + __u8 num_tile_rows_minus1; + __u8 column_width_minus1[20]; + __u8 row_height_minus1[22]; + __s8 pps_beta_offset_div2; + __s8 pps_tc_offset_div2; + __u8 log2_parallel_merge_level_minus2; + + __u8 padding[4]; + __u64 flags; +}; + +#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01 + +#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 + +struct v4l2_hevc_dpb_entry { + __u64 timestamp; + __u8 flags; + __u8 field_pic; + __u16 pic_order_cnt[2]; + __u8 padding[2]; +}; + +struct v4l2_hevc_pred_weight_table { + __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; + + __u8 padding[6]; + + __u8 luma_log2_weight_denom; + __s8 
delta_chroma_log2_weight_denom; +}; + +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) +#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) + +struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; + __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 slice_type; + __u8 colour_plane_id; + __u16 slice_pic_order_cnt; + __u8 num_ref_idx_l0_active_minus1; + __u8 num_ref_idx_l1_active_minus1; + __u8 collocated_ref_idx; + __u8 five_minus_max_num_merge_cand; + __s8 slice_qp_delta; + __s8 slice_cb_qp_offset; + __s8 slice_cr_qp_offset; + __s8 slice_act_y_qp_offset; + __s8 slice_act_cb_qp_offset; + __s8 slice_act_cr_qp_offset; + __s8 slice_beta_offset_div2; + __s8 slice_tc_offset_div2; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ + __u8 pic_struct; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + + __u8 padding[5]; + + __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ + struct v4l2_hevc_pred_weight_table pred_weight_table; + + __u64 flags; +}; + +#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1 +#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2 +#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4 + +struct v4l2_ctrl_hevc_decode_params { + __s32 pic_order_cnt_val; + __u8 num_active_dpb_entries; + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 num_poc_st_curr_before; + __u8 num_poc_st_curr_after; + __u8 num_poc_lt_curr; + __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; + __u64 flags; +}; + +struct v4l2_ctrl_hevc_scaling_matrix { + __u8 scaling_list_4x4[6][16]; + __u8 scaling_list_8x8[6][64]; + __u8 scaling_list_16x16[6][64]; + __u8 scaling_list_32x32[2][64]; + __u8 scaling_list_dc_coef_16x16[6]; + __u8 scaling_list_dc_coef_32x32[2]; +}; + +/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */ +#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200) +/* + * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP - + * the number of data (in bits) to skip in the + * slice segment header. + * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag" + * to before syntax element "slice_temporal_mvp_enabled_flag". + * If IDR, the skipped bits are just "pic_output_flag" + * (separate_colour_plane_flag is not supported). 
+ */ +#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0) + +#endif diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c index 463d352055..7feff43c28 100644 --- a/libavcodec/hevc_parser.c +++ b/libavcodec/hevc_parser.c @@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal, avctx->profile = ps->sps->ptl.general_ptl.profile_idc; avctx->level = ps->sps->ptl.general_ptl.level_idc; + if (ps->sps->chroma_format_idc == 1) { /* 4:2:0 */ + avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ? + ps->sps->vui.chroma_sample_loc_type_top_field + 1 : /* VUI loc type n -> AVChromaLocation n + 1 */ + AVCHROMA_LOC_LEFT; + } + else if (ps->sps->chroma_format_idc == 2 || + ps->sps->chroma_format_idc == 3) { /* 4:2:2 or 4:4:4 */ + avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; + } + else { + avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; + } + if (ps->vps->vps_timing_info_present_flag) { num = ps->vps->vps_num_units_in_tick; den = ps->vps->vps_time_scale; diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c index 2231aed259..6d2d66dfdf 100644 --- a/libavcodec/hevcdec.c +++ b/libavcodec/hevcdec.c @@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps) ff_set_sar(avctx, sps->vui.sar); + if (sps->chroma_format_idc == 1) { /* 4:2:0 */ + avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ? + sps->vui.chroma_sample_loc_type_top_field + 1 : /* VUI loc type n -> AVChromaLocation n + 1 */ + AVCHROMA_LOC_LEFT; + } + else if (sps->chroma_format_idc == 2 || + sps->chroma_format_idc == 3) { /* 4:2:2 or 4:4:4 */ + avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; + } + else { + avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED; + } + if (sps->vui.video_signal_type_present_flag) avctx->color_range = sps->vui.video_full_range_flag ? 
AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; @@ -392,14 +405,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ CONFIG_HEVC_NVDEC_HWACCEL + \ + CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ CONFIG_HEVC_VAAPI_HWACCEL + \ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_HEVC_RPI4_8_HWACCEL + \ + CONFIG_HEVC_RPI4_10_HWACCEL + \ CONFIG_HEVC_VDPAU_HWACCEL) enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: +#if CONFIG_HEVC_RPI4_8_HWACCEL + *fmt++ = AV_PIX_FMT_RPI4_8; +#endif #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif @@ -418,9 +437,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; +#endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + *fmt++ = AV_PIX_FMT_DRM_PRIME; #endif break; case AV_PIX_FMT_YUV420P10: +#if CONFIG_HEVC_RPI4_10_HWACCEL + *fmt++ = AV_PIX_FMT_RPI4_10; +#endif #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif @@ -439,6 +464,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) #endif #if CONFIG_HEVC_NVDEC_HWACCEL *fmt++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + *fmt++ = AV_PIX_FMT_DRM_PRIME; #endif break; case AV_PIX_FMT_YUV444P: @@ -3327,7 +3355,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, s->ref = NULL; ret = decode_nal_units(s, avpkt->data, avpkt->size); if (ret < 0) + { + // Ensure that hwaccel knows this frame is over + if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { + s->avctx->hwaccel->abort_frame(s->avctx); + } + return ret; + } if (avctx->hwaccel) { if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { @@ -3697,6 +3732,15 @@ AVCodec ff_hevc_decoder = { #if CONFIG_HEVC_NVDEC_HWACCEL HWACCEL_NVDEC(hevc), #endif +#if 
CONFIG_HEVC_RPI4_8_HWACCEL + HWACCEL_RPI4_8(hevc), +#endif +#if CONFIG_HEVC_RPI4_10_HWACCEL + HWACCEL_RPI4_10(hevc), +#endif +#if CONFIG_HEVC_V4L2REQUEST_HWACCEL + HWACCEL_V4L2REQUEST(hevc), +#endif #if CONFIG_HEVC_VAAPI_HWACCEL HWACCEL_VAAPI(hevc), #endif diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h index 8e54cf73f9..2277aadf75 100644 --- a/libavcodec/hwaccels.h +++ b/libavcodec/hwaccels.h @@ -39,6 +39,9 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; extern const AVHWAccel ff_hevc_dxva2_hwaccel; extern const AVHWAccel ff_hevc_nvdec_hwaccel; +extern const AVHWAccel ff_hevc_rpi4_8_hwaccel; +extern const AVHWAccel ff_hevc_rpi4_10_hwaccel; +extern const AVHWAccel ff_hevc_v4l2request_hwaccel; extern const AVHWAccel ff_hevc_vaapi_hwaccel; extern const AVHWAccel ff_hevc_vdpau_hwaccel; extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h index f421dc909f..f93283b893 100644 --- a/libavcodec/hwconfig.h +++ b/libavcodec/hwconfig.h @@ -24,6 +24,7 @@ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) +#define HWACCEL_CAP_MT_SAFE (1 << 1) typedef struct AVCodecHWConfigInternal { @@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal { HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel) #define HWACCEL_NVDEC(codec) \ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel) +#define HWACCEL_RPI4_8(codec) \ + HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel) +#define HWACCEL_RPI4_10(codec) \ + HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel) +#define HWACCEL_V4L2REQUEST(codec) \ + HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) #define HWACCEL_VAAPI(codec) \ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel) #define HWACCEL_VDPAU(codec) \ diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 
cb15ac072a..f6261db962 100644 --- a/libavcodec/mmaldec.c +++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ * MMAL Video Decoder */ +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" #include #include #include @@ -31,6 +34,7 @@ #include #include #include +#pragma GCC diagnostic pop #include #include "avcodec.h" diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c index 9176027f15..0b0ff03c18 100644 --- a/libavcodec/pthread_frame.c +++ b/libavcodec/pthread_frame.c @@ -209,7 +209,8 @@ FF_ENABLE_DEPRECATION_WARNINGS /* if the previous thread uses hwaccel then we take the lock to ensure * the threads don't run concurrently */ - if (avctx->hwaccel) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { pthread_mutex_lock(&p->parent->hwaccel_mutex); p->hwaccel_serializing = 1; } @@ -636,7 +637,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - if (avctx->hwaccel && !p->hwaccel_serializing) { + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && + !p->hwaccel_serializing) { pthread_mutex_lock(&p->parent->hwaccel_mutex); p->hwaccel_serializing = 1; } diff --git a/libavcodec/raw.c b/libavcodec/raw.c index 079d5c5d10..0781f28615 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c @@ -294,6 +294,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + /* RPI (Might as well define for everything) */ + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, + { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, + { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, + { AV_PIX_FMT_NONE, 0 }, }; diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c index d181b74570..b943dd0379 
100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c @@ -24,6 +24,7 @@ * Raw Video Encoder */ +#include "config.h" #include "avcodec.h" #include "raw.h" #include "internal.h" @@ -31,6 +32,10 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" #include "libavutil/internal.h" +#include "libavutil/avassert.h" +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif static av_cold int raw_encode_init(AVCodecContext *avctx) { @@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } +#if CONFIG_SAND +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3 / 2; + uint8_t * dst; + int ret; + + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); + dst += width * height; + av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, + frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} + +static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3; + uint8_t * dst; + int ret; + + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); + dst += width * height * 2; + av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, 
+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); + return 0; +} + +static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ + const int width = av_frame_cropped_width(frame); + const int height = av_frame_cropped_height(frame); + const int x0 = frame->crop_left; + const int y0 = frame->crop_top; + const int size = width * height * 3; + uint8_t * dst; + int ret; + + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + + av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); + dst += width * height * 2; + av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, + frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); + return 0; +} +#endif + + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *frame, int *got_packet) + const AVFrame *src_frame, int *got_packet) { - int ret = av_image_get_buffer_size(frame->format, - frame->width, frame->height, 1); + int ret; + AVFrame * frame = NULL; - if (ret < 0) +#if CONFIG_SAND + if (av_rpi_is_sand_frame(src_frame)) { + ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) : + av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) : + av_rpi_is_sand30_frame(src_frame) ? 
raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1; + *got_packet = (ret == 0); return ret; + } +#endif + + if ((frame = av_frame_clone(src_frame)) == NULL) { + ret = AVERROR(ENOMEM); + goto fail; + } + + if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0) + goto fail; + + ret = av_image_get_buffer_size(frame->format, + frame->width, frame->height, 1); + if (ret < 0) + goto fail; if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) - return ret; + goto fail; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, (const uint8_t **)frame->data, frame->linesize, frame->format, frame->width, frame->height, 1)) < 0) - return ret; + goto fail; if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 && frame->format == AV_PIX_FMT_YUYV422) { @@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, } } pkt->flags |= AV_PKT_FLAG_KEY; + av_frame_free(&frame); *got_packet = 1; return 0; + +fail: + av_frame_free(&frame); + *got_packet = 0; + return ret; } AVCodec ff_rawvideo_encoder = { diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c new file mode 100644 index 0000000000..58c094c5f8 --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c @@ -0,0 +1,2257 @@ +/* + * HEVC CABAC decoding + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define UNCHECKED_BITSTREAM_READER 1 + +#include "libavutil/attributes.h" +#include "libavutil/common.h" + +#include "cabac_functions.h" +#include "rpi_hevc_data.h" +#include "hevc.h" +#include "rpi_hevcdec.h" +#include "rpi_hevc_cabac_fns.h" + +#include "libavutil/rpi_sand_fns.h" + +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide +// Arm doesn't have divide or general fast 64 bit, but does have the multiply +// * Beware: ARCH_xxx isn't set if configure --disable-asm is used +#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) +// Use native divide if we have a fast one - otherwise use mpy 1/x +// x86 has a fast integer divide - arm doesn't - unsure about other +// architectures +#define USE_BY22_DIV ARCH_X86 + +// Special case blocks with a single significant coeff +// Decreases the complexity of the code for a common case but increases the +// code size. 
+#define USE_N_END_1 1 + +#if !USE_BY22_DIV +// * 1/x @ 32 bits gets us 22 bits of accuracy +#define CABAC_BY22_PEEK_BITS 22 +#else +// A real 32-bit divide gets us another bit +// If we have a 64 bit int & a unit time divider then we should get a lot +// of bits (55) but that is untested and it is unclear if it would give +// us a large advantage +#define CABAC_BY22_PEEK_BITS 23 +#endif + +#define CABAC_MAX_BIN 31 + + +#if USE_BY22 && !USE_BY22_DIV +#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) + +static const uint32_t cabac_by22_inv_range[256] = { + 0, I(257), I(258), I(259), + I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), + I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), + I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), + I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), + I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), + I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), + I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), + I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), + I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), + I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), + I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), + I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), + I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), + I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), + I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), + I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), + I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), 
+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), + I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), + I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), + I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), + I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), + I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), + I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), + I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), + I(510), I(511) +}; +#undef I +#endif // USE_BY22 && !USE_BY22_DIV + +#if ARCH_ARM +#include "arm/rpi_hevc_cabac.h" +#endif + +/** + * Number of bins per SyntaxElement. + */ +static const int8_t num_bins_in_se[] = { + 1, // sao_merge_flag + 1, // sao_type_idx + 0, // sao_eo_class + 0, // sao_band_position + 0, // sao_offset_abs + 0, // sao_offset_sign + 0, // end_of_slice_flag + 3, // split_coding_unit_flag + 1, // cu_transquant_bypass_flag + 3, // skip_flag + 3, // cu_qp_delta + 1, // pred_mode + 4, // part_mode + 0, // pcm_flag + 1, // prev_intra_luma_pred_mode + 0, // mpm_idx + 0, // rem_intra_luma_pred_mode + 2, // intra_chroma_pred_mode + 1, // merge_flag + 1, // merge_idx + 5, // inter_pred_idc + 2, // ref_idx_l0 + 2, // ref_idx_l1 + 2, // abs_mvd_greater0_flag + 2, // abs_mvd_greater1_flag + 0, // abs_mvd_minus2 + 0, // mvd_sign_flag + 1, // mvp_lx_flag + 1, // no_residual_data_flag + 3, // split_transform_flag + 2, // cbf_luma + 4, // cbf_cb, cbf_cr + 2, // transform_skip_flag[][] + 2, // explicit_rdpcm_flag[][] + 2, // explicit_rdpcm_dir_flag[][] + 18, // last_significant_coeff_x_prefix + 18, // last_significant_coeff_y_prefix + 0, // last_significant_coeff_x_suffix + 0, // last_significant_coeff_y_suffix + 4, // significant_coeff_group_flag + 44, // significant_coeff_flag + 24, // coeff_abs_level_greater1_flag + 6, // 
coeff_abs_level_greater2_flag + 0, // coeff_abs_level_remaining + 0, // coeff_sign_flag + 8, // log2_res_scale_abs + 2, // res_scale_sign_flag + 1, // cu_chroma_qp_offset_flag + 1, // cu_chroma_qp_offset_idx +}; + +/** + * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. + */ +static const int elem_offset[sizeof(num_bins_in_se)] = { + 0, // sao_merge_flag + 1, // sao_type_idx + 2, // sao_eo_class + 2, // sao_band_position + 2, // sao_offset_abs + 2, // sao_offset_sign + 2, // end_of_slice_flag + 2, // split_coding_unit_flag + 5, // cu_transquant_bypass_flag + 6, // skip_flag + 9, // cu_qp_delta + 12, // pred_mode + 13, // part_mode + 17, // pcm_flag + 17, // prev_intra_luma_pred_mode + 18, // mpm_idx + 18, // rem_intra_luma_pred_mode + 18, // intra_chroma_pred_mode + 20, // merge_flag + 21, // merge_idx + 22, // inter_pred_idc + 27, // ref_idx_l0 + 29, // ref_idx_l1 + 31, // abs_mvd_greater0_flag + 33, // abs_mvd_greater1_flag + 35, // abs_mvd_minus2 + 35, // mvd_sign_flag + 35, // mvp_lx_flag + 36, // no_residual_data_flag + 37, // split_transform_flag + 40, // cbf_luma + 42, // cbf_cb, cbf_cr + 46, // transform_skip_flag[][] + 48, // explicit_rdpcm_flag[][] + 50, // explicit_rdpcm_dir_flag[][] + 52, // last_significant_coeff_x_prefix + 70, // last_significant_coeff_y_prefix + 88, // last_significant_coeff_x_suffix + 88, // last_significant_coeff_y_suffix + 88, // significant_coeff_group_flag + 92, // significant_coeff_flag + 136, // coeff_abs_level_greater1_flag + 160, // coeff_abs_level_greater2_flag + 166, // coeff_abs_level_remaining + 166, // coeff_sign_flag + 166, // log2_res_scale_abs + 174, // res_scale_sign_flag + 176, // cu_chroma_qp_offset_flag + 177, // cu_chroma_qp_offset_idx +}; + +#define CNU 154 +/** + * Indexed by init_type + */ +static const uint8_t init_values[3][HEVC_CONTEXTS] = { + { // sao_merge_flag + 153, + // sao_type_idx + 200, + // split_coding_unit_flag + 139, 141, 157, + // cu_transquant_bypass_flag + 154, + // 
skip_flag + CNU, CNU, CNU, + // cu_qp_delta + 154, 154, 154, + // pred_mode + CNU, + // part_mode + 184, CNU, CNU, CNU, + // prev_intra_luma_pred_mode + 184, + // intra_chroma_pred_mode + 63, 139, + // merge_flag + CNU, + // merge_idx + CNU, + // inter_pred_idc + CNU, CNU, CNU, CNU, CNU, + // ref_idx_l0 + CNU, CNU, + // ref_idx_l1 + CNU, CNU, + // abs_mvd_greater0_flag + CNU, CNU, + // abs_mvd_greater1_flag + CNU, CNU, + // mvp_lx_flag + CNU, + // no_residual_data_flag + CNU, + // split_transform_flag + 153, 138, 138, + // cbf_luma + 111, 141, + // cbf_cb, cbf_cr + 94, 138, 182, 154, + // transform_skip_flag + 139, 139, + // explicit_rdpcm_flag + 139, 139, + // explicit_rdpcm_dir_flag + 139, 139, + // last_significant_coeff_x_prefix + 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, + 79, 108, 123, 63, + // last_significant_coeff_y_prefix + 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, + 79, 108, 123, 63, + // significant_coeff_group_flag + 91, 171, 134, 141, + // significant_coeff_flag + 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, + 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, + 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, + 141, 111, + // coeff_abs_level_greater1_flag + 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, + 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, + // coeff_abs_level_greater2_flag + 138, 153, 136, 167, 152, 152, + // log2_res_scale_abs + 154, 154, 154, 154, 154, 154, 154, 154, + // res_scale_sign_flag + 154, 154, + // cu_chroma_qp_offset_flag + 154, + // cu_chroma_qp_offset_idx + 154, + }, + { // sao_merge_flag + 153, + // sao_type_idx + 185, + // split_coding_unit_flag + 107, 139, 126, + // cu_transquant_bypass_flag + 154, + // skip_flag + 197, 185, 201, + // cu_qp_delta + 154, 154, 154, + // pred_mode + 149, + // part_mode + 154, 139, 154, 154, + // prev_intra_luma_pred_mode + 154, + // 
intra_chroma_pred_mode + 152, 139, + // merge_flag + 110, + // merge_idx + 122, + // inter_pred_idc + 95, 79, 63, 31, 31, + // ref_idx_l0 + 153, 153, + // ref_idx_l1 + 153, 153, + // abs_mvd_greater0_flag + 140, 198, + // abs_mvd_greater1_flag + 140, 198, + // mvp_lx_flag + 168, + // no_residual_data_flag + 79, + // split_transform_flag + 124, 138, 94, + // cbf_luma + 153, 111, + // cbf_cb, cbf_cr + 149, 107, 167, 154, + // transform_skip_flag + 139, 139, + // explicit_rdpcm_flag + 139, 139, + // explicit_rdpcm_dir_flag + 139, 139, + // last_significant_coeff_x_prefix + 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, + 94, 108, 123, 108, + // last_significant_coeff_y_prefix + 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, + 94, 108, 123, 108, + // significant_coeff_group_flag + 121, 140, 61, 154, + // significant_coeff_flag + 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, + 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, + 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, + 140, 140, + // coeff_abs_level_greater1_flag + 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, + 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, + // coeff_abs_level_greater2_flag + 107, 167, 91, 122, 107, 167, + // log2_res_scale_abs + 154, 154, 154, 154, 154, 154, 154, 154, + // res_scale_sign_flag + 154, 154, + // cu_chroma_qp_offset_flag + 154, + // cu_chroma_qp_offset_idx + 154, + }, + { // sao_merge_flag + 153, + // sao_type_idx + 160, + // split_coding_unit_flag + 107, 139, 126, + // cu_transquant_bypass_flag + 154, + // skip_flag + 197, 185, 201, + // cu_qp_delta + 154, 154, 154, + // pred_mode + 134, + // part_mode + 154, 139, 154, 154, + // prev_intra_luma_pred_mode + 183, + // intra_chroma_pred_mode + 152, 139, + // merge_flag + 154, + // merge_idx + 137, + // inter_pred_idc + 95, 79, 63, 31, 31, + // ref_idx_l0 + 153, 153, + // ref_idx_l1 + 153, 153, + // 
abs_mvd_greater0_flag + 169, 198, + // abs_mvd_greater1_flag + 169, 198, + // mvp_lx_flag + 168, + // no_residual_data_flag + 79, + // split_transform_flag + 224, 167, 122, + // cbf_luma + 153, 111, + // cbf_cb, cbf_cr + 149, 92, 167, 154, + // transform_skip_flag + 139, 139, + // explicit_rdpcm_flag + 139, 139, + // explicit_rdpcm_dir_flag + 139, 139, + // last_significant_coeff_x_prefix + 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, + 79, 108, 123, 93, + // last_significant_coeff_y_prefix + 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, + 79, 108, 123, 93, + // significant_coeff_group_flag + 121, 140, 61, 154, + // significant_coeff_flag + 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, + 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, + 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, + 140, 140, + // coeff_abs_level_greater1_flag + 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, + 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, + // coeff_abs_level_greater2_flag + 107, 167, 91, 107, 107, 167, + // log2_res_scale_abs + 154, 154, 154, 154, 154, 154, 154, 154, + // res_scale_sign_flag + 154, 154, + // cu_chroma_qp_offset_flag + 154, + // cu_chroma_qp_offset_idx + 154, + }, +}; + +static const uint8_t scan_1x1[1] = { + 0, +}; + +static const uint8_t horiz_scan2x2_x[4] = { + 0, 1, 0, 1, +}; + +static const uint8_t horiz_scan2x2_y[4] = { + 0, 0, 1, 1 +}; + +static const uint8_t horiz_scan4x4_x[16] = { + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, +}; + +static const uint8_t horiz_scan4x4_y[16] = { + 0, 0, 0, 0, + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, +}; + +static const uint8_t horiz_scan8x8_inv[8][8] = { + { 0, 1, 2, 3, 16, 17, 18, 19, }, + { 4, 5, 6, 7, 20, 21, 22, 23, }, + { 8, 9, 10, 11, 24, 25, 26, 27, }, + { 12, 13, 14, 15, 28, 29, 30, 31, }, + { 32, 33, 34, 35, 48, 49, 50, 51, }, + { 36, 37, 38, 39, 52, 53, 54, 55, }, + 
{ 40, 41, 42, 43, 56, 57, 58, 59, }, + { 44, 45, 46, 47, 60, 61, 62, 63, }, +}; + +static const uint8_t diag_scan2x2_x[4] = { + 0, 0, 1, 1, +}; + +static const uint8_t diag_scan2x2_y[4] = { + 0, 1, 0, 1, +}; + +static const uint8_t diag_scan2x2_inv[2][2] = { + { 0, 2, }, + { 1, 3, }, +}; + +static const uint8_t diag_scan4x4_inv[4][4] = { + { 0, 2, 5, 9, }, + { 1, 4, 8, 12, }, + { 3, 7, 11, 14, }, + { 6, 10, 13, 15, }, +}; + +static const uint8_t diag_scan8x8_inv[8][8] = { + { 0, 2, 5, 9, 14, 20, 27, 35, }, + { 1, 4, 8, 13, 19, 26, 34, 42, }, + { 3, 7, 12, 18, 25, 33, 41, 48, }, + { 6, 11, 17, 24, 32, 40, 47, 53, }, + { 10, 16, 23, 31, 39, 46, 52, 57, }, + { 15, 22, 30, 38, 45, 51, 56, 60, }, + { 21, 29, 37, 44, 50, 55, 59, 62, }, + { 28, 36, 43, 49, 54, 58, 61, 63, }, +}; + + +typedef struct +{ + uint16_t coeff; + uint16_t scale; +} xy_off_t; + +#define XYT_C(x,y,t) ((x) + ((y) << (t))) +#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t)) +#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) +#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) + +#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} + +#define OFF_DIAG(t) {\ + XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ + XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ + XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ + XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ +} + +#define OFF_HORIZ(t) {\ + XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ + XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ + XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ + XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ +} + +#define OFF_VERT(t) {\ + XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ + XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ + XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ + XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ +} + +static const xy_off_t off_xys[3][4][16] = +{ + {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, + {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, + 
{OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} +}; + + +// Helper fns +#ifndef hevc_mem_bits32 +static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) +{ + return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); +} +#endif + +#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) +#define hevc_clz32 hevc_clz32_builtin +static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) +{ + // __builtin_clz says it works on ints - so adjust if int is >32 bits long + return __builtin_clz(x) - (sizeof(int) * 8 - 32); +} +#endif + +// It is unlikely that we will ever need this but include for completeness +#ifndef hevc_clz32 +static inline unsigned int hevc_clz32(unsigned int x) +{ + unsigned int n = 1; + if ((x & 0xffff0000) == 0) { + n += 16; + x <<= 16; + } + if ((x & 0xff000000) == 0) { + n += 8; + x <<= 8; + } + if ((x & 0xf0000000) == 0) { + n += 4; + x <<= 4; + } + if ((x & 0xc0000000) == 0) { + n += 2; + x <<= 2; + } + return n - ((x >> 31) & 1); +} +#endif + +static inline int cabac_overflow(const CABACContext * const cc) +{ + av_assert0(cc->bytestream >= cc->bytestream_start); + return cc->bytestream >= cc->bytestream_end + 4; +} + +int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) +{ + return cabac_overflow(&lc->cc); +} + +#if !USE_BY22 +// If no by22 then _by22 functions will revert to normal and so _peek/_flush +// will no longer be called but the setup calls will still exist and we want +// to null them out +#define bypass_start(s) +#define bypass_finish(s) +#else +// Use BY22 for residual bypass block + +#define bypass_start(cc) get_cabac_by22_start(cc) +#define bypass_finish(cc) get_cabac_by22_finish(cc) + +// BY22 notes that bypass is simply a divide into the bitstream and so we +// can peek out large quantities of bits at once and treat the result as if +// it was VLC. 
In many cases this will lead to O(1) processing rather than +// O(n) though the setup and teardown is sufficiently expensive that it is +// only worth using if we expect to be dealing with more than a few bits +// The definition of "a few bits" will vary from platform to platform but +// tests on ARM show that it probably isn't worth it for a single coded +// residual, but is for >1 - it also seems likely that if there are +// more residuals then they are likely to be bigger and this will make the +// O(1) nature of the code more worthwhile. + + +// Bypass block start +// Must be called before _by22_peek is used as it sets the CABAC environment +// into the correct state. _by22_finish must be called to return to 'normal' +// (i.e. non-bypass) cabac decoding +// On exit low holds the bypass-mode working value, by22.bits the bit count taken +// from the old low and, when !USE_BY22_DIV, range holds a cabac_by22_inv_range +// entry while the real range is parked in by22.range +#ifndef get_cabac_by22_start +static inline void get_cabac_by22_start(CABACContext * const c) +{ + // Index of the lowest set bit of low + const unsigned int bits = __builtin_ctz(c->low); + const uint32_t m = hevc_mem_bits32(c->bytestream, 0); + uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); +#if !USE_BY22_DIV + const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; +#endif + + // Step the read pointer back by CABAC_BITS / 8 bytes; _by22_finish adds this back + c->bytestream -= (CABAC_BITS / 8); + c->by22.bits = bits; +#if !USE_BY22_DIV + c->by22.range = c->range; + c->range = inv; +#endif + c->low = x; +} +#endif + +// Bypass block finish +// Must be called at the end of the bypass block to return to normal operation +// Advances bytestream by the whole CABAC_BITS units consumed plus the bytes +// stepped back in _by22_start, rebuilds low and, when !USE_BY22_DIV, restores +// range from by22.range +static inline void get_cabac_by22_finish(CABACContext * const c) +{ + unsigned int used = c->by22.bits; + unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); + unsigned int bits_used = used & (CABAC_BITS == 16 ? 
15 : 7); + + c->bytestream += bytes_used + (CABAC_BITS / 8); + c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; +#if !USE_BY22_DIV + c->range = c->by22.range; +#endif +} + +// Peek bypass bits +// _by22_start must be called before _by22_peek is called and _by22_flush +// must be called afterwards to flush any used bits +// The actual number of valid bits returned is +// min(bypass bits available in the coded block, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS +// will be at least 22 which should be long enough for any prefix or suffix +// though probably not long enough for the worst case combination +// The returned bits are left-justified in the 32-bit result +#ifndef get_cabac_by22_peek +static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) +{ +#if USE_BY22_DIV + return ((unsigned int)c->low / (unsigned int)c->range) << 9; +#else + uint32_t x = c->low & ~1U; + // In this build range holds the cabac_by22_inv_range entry installed by _by22_start + const uint32_t inv = c->range; + + // An inv of 0 leaves x unscaled + if (inv != 0) + x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); + + return x << 1; +#endif +} +#endif + +// Flush bypass bits peeked by _by22_peek +// Flush n bypass bits. 
n must be >= 1 to guarantee correct operation +// val is an unmodified copy of whatever _by22_peek returned +#ifndef get_cabac_by22_flush +static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) +{ + // Subtract the bits used & reshift up to the top of the word +#if USE_BY22_DIV + const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); +#else + const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); +#endif + + // and refill lower bits + // We will probably OR over some existing bits but that doesn't matter + c->by22.bits += n; + c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); +} +#endif + +#endif // USE_BY22 + + +// Save lc's 4 Rice statistics and HEVC_CONTEXTS context states into s->cabac_save +void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) +{ + memcpy(s->cabac_save->rice, lc->stat_coeff, 4); + memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); +} + +// Restore lc's Rice statistics and context states from s->cabac_save +static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + memcpy(lc->stat_coeff, s->cabac_save->rice, 4); + memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); +} + +// Skip one bit, byte-align the bit reader and start the CABAC decoder on the bytes that remain +int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) +{ + GetBitContext * const gb = &lc->gb; + skip_bits(gb, 1); + align_get_bits(gb); + return ff_init_cabac_decoder(&lc->cc, + gb->buffer + get_bits_count(gb) / 8, + (get_bits_left(gb) + 7) / 8); +} + +// Set every context state from init_values (row chosen by slice type and cabac_init_flag) +// and the slice QP clipped to 0..51, then zero the Rice statistics +static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + int init_type = 2 - s->sh.slice_type; + int i; + + if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) + init_type ^= 3; + + for (i = 0; i < HEVC_CONTEXTS; i++) { + int init_value = init_values[init_type][i]; + int m = (init_value >> 4) * 5 - 45; + int n = ((init_value & 15) << 3) - 16; + int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; + + pre ^= pre >> 31; + if (pre > 124) + pre = 124 + (pre & 1); + 
lc->cabac_state[i] = pre; + } + + for (i = 0; i < 4; i++) + lc->stat_coeff[i] = 0; +} + +// Re-initialise the contexts (cabac_init_req == 1 or CTB_TS_FLAGS_CIREQ) or reload the saved +// ones (CTB_TS_FLAGS_CLOAD), resetting qPy_pred to the slice QP in both cases; +// cabac_init_req is always cleared +void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) +{ + if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) + { + lc->qPy_pred = s->sh.slice_qp; + cabac_init_state(s, lc); + } + else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) + { + lc->qPy_pred = s->sh.slice_qp; + load_states(s, lc); + } + lc->cabac_init_req = 0; +} + +#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) + +int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) +{ + return get_cabac_inline(c, state); +} + +int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) +{ + return get_cabac_terminate(c); +} + +int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) +{ + if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) + return 0; + + if (!get_cabac_bypass(&lc->cc)) + return SAO_BAND; + return SAO_EDGE; +} + +int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) +{ + int i; + int value = get_cabac_bypass(&lc->cc); + + for (i = 0; i < 4; i++) + value = (value << 1) | get_cabac_bypass(&lc->cc); + return value; +} + +int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + int i = 0; + int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; + + while (i < length && get_cabac_bypass(&lc->cc)) + i++; + return i; +} + +int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) +{ + return get_cabac_bypass(&lc->cc); +} + +int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) +{ + int ret = get_cabac_bypass(&lc->cc) << 1; + ret |= get_cabac_bypass(&lc->cc); + return ret; +} + +int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) +{ + int val = 1; + + if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) + return 0; + + while (val < 5 && + get_cabac(&lc->cc, lc->cabac_state 
+ HEVC_BIN_CU_QP_DELTA + 1) != 0) + val++; + + if (val >= 5) { + unsigned int k = 0; + while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { + val += 1 << k; + k++; + } +// if (k == CABAC_MAX_BIN) +// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); + + while (k--) + val += get_cabac_bypass(&lc->cc) << k; + } + return get_cabac_bypass(&lc->cc) ? -val : val; +} + +// Decode cu_chroma_qp_offset_idx: the number of leading 1 bins, stopping once c_max is reached +int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + // The cap for this truncated-unary code is chroma_qp_offset_list_len_minus1 (at most 5). + // FFMAX(5, ...) always gave 5 for a valid PPS and could consume bins that were never coded + // when the list is shorter, so clamp with FFMIN instead. + int c_max= FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1); + int i = 0; + + while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) + i++; + + return i; +} + +int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) +{ + if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 + return PART_2Nx2N; + if (log2_cb_size == s->ps.sps->log2_min_cb_size) { + if (lc->cu.pred_mode == MODE_INTRA) // 0 + return PART_NxN; + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 + return PART_2NxN; + if (log2_cb_size == 3) // 00 + return PART_Nx2N; + if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 + return PART_Nx2N; + return PART_NxN; // 000 + } + + if (!s->ps.sps->amp_enabled_flag) { + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 + return PART_2NxN; + return PART_Nx2N; + } + + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX + if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 + return PART_2NxN; + if (get_cabac_bypass(&lc->cc)) // 0101 + return PART_2NxnD; + return PART_2NxnU; // 0100 + } + + if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 + return PART_Nx2N; + if (get_cabac_bypass(&lc->cc)) // 0001 + return PART_nRx2N; + return PART_nLx2N; // 0000 +} + +int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) +{ + int i = 0; + while (i < 2 && get_cabac_bypass(&lc->cc)) + i++; + return i; +} + +int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) +{ + int i; + 
int value = get_cabac_bypass(&lc->cc); + + for (i = 0; i < 4; i++) + value = (value << 1) | get_cabac_bypass(&lc->cc); + return value; +} + +int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) +{ + int ret; + if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) + return 4; + + ret = get_cabac_bypass(&lc->cc) << 1; + ret |= get_cabac_bypass(&lc->cc); + return ret; +} + +int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); + + if (i != 0) { + while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) + i++; + } + return i; +} + +int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) +{ + if (nPbW + nPbH == 12) + return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); + if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) + return PRED_BI; + + return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); +} + +int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) +{ + int i = 0; + int max = num_ref_idx_lx - 1; + int max_ctx = FFMIN(max, 2); + + while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) + i++; + if (i == 2) { + while (i < max && get_cabac_bypass(&lc->cc)) + i++; + } + + return i; +} + +static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) +{ + return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); +} + +static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) +{ + return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); +} + +#if !USE_BY22 +static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) +{ + int ret = 2; + int k = 1; + + while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { + ret += 1U << k; + k++; + } + if (k == CABAC_MAX_BIN) { + av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); + return 0; + } + + while (k--) + ret += 
get_cabac_bypass(&lc->cc) << k; + return get_cabac_bypass_sign(&lc->cc, -ret); +} +#endif + +static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) +{ + return get_cabac_bypass_sign(&lc->cc, -1); +} + +static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) +{ + return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); +} + +static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) +{ + return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); +} + +static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) +{ + return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); +} + + +int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { + int i =0; + + while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) + i++; + + return i; +} + +static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, + int log2_size, int *last_scx_prefix, int *last_scy_prefix) +{ + int i = 0; + int max = (log2_size << 1) - 1; + int ctx_offset, ctx_shift; + + if (!c_idx_nz) { + ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); + ctx_shift = (log2_size + 1) >> 2; + } else { + ctx_offset = 15; + ctx_shift = log2_size - 2; + } + while (i < max && + GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) + i++; + *last_scx_prefix = i; + + i = 0; + while (i < max && + GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) + i++; + *last_scy_prefix = i; +} + +static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, + int last_significant_coeff_prefix) +{ + int i; + int length = (last_significant_coeff_prefix >> 1) - 1; + int value = get_cabac_bypass(&lc->cc); + + for (i = 1; i < length; i++) + value = (value << 1) | 
get_cabac_bypass(&lc->cc); + return value; +} + +static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg) +{ + int inc; + + inc = (ctx_cg != 0) + (c_idx_nz << 1); + + return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); +} + +static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) +{ + return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); +} + +#if !USE_BY22 +#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +#endif + + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) +{ + uint32_t y; + unsigned int prefix; + unsigned int last_coeff_abs_level_remaining; + unsigned int n; + + y = get_cabac_by22_peek(c); + prefix = hevc_clz32(~y); + // y << prefix will always have top bit 0 + + if (prefix < 3) { + const unsigned int suffix = (y << prefix) >> (31 - rice_param); + last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; + n = prefix + 1 + rice_param; + } + else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) + { + const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); + + last_coeff_abs_level_remaining = (2 << rice_param) + suffix; + n = prefix * 2 + rice_param - 2; + } + else { + unsigned int suffix; + + get_cabac_by22_flush(c, prefix, y); + y = get_cabac_by22_peek(c); + + suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); + last_coeff_abs_level_remaining = (2 << rice_param) + suffix; + n = prefix + rice_param - 2; + } + + get_cabac_by22_flush(c, n, y); + + return last_coeff_abs_level_remaining; +} +#endif + +static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) +{ + int prefix = 0; + int suffix = 0; + int last_coeff_abs_level_remaining; + int i; + + while (prefix < CABAC_MAX_BIN && 
get_cabac_bypass(c)) + prefix++; + if (prefix == CABAC_MAX_BIN) { +// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); + return 0; + } + + if (prefix < 3) { + for (i = 0; i < rc_rice_param; i++) + suffix = (suffix << 1) | get_cabac_bypass(c); + last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; + } else { + int prefix_minus3 = prefix - 3; + for (i = 0; i < prefix_minus3 + rc_rice_param; i++) + suffix = (suffix << 1) | get_cabac_bypass(c); + last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) + << rc_rice_param) + suffix; + } + + return last_coeff_abs_level_remaining; +} + +#if !USE_BY22 +#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) +{ + unsigned int i; + uint32_t ret = 0; + + for (i = 0; i < nb; i++) + ret = (ret << 1) | get_cabac_bypass(c); + + return ret << (32 - nb); +} +#endif + +#ifndef coeff_sign_flag_decode_bypass +static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) +{ + uint32_t y; + y = get_cabac_by22_peek(c); + get_cabac_by22_flush(c, nb, y); + return y & ~(0xffffffffU >> nb); +} +#endif + + +#ifndef get_cabac_greater1_bits +static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, + uint8_t * const state0) +{ + unsigned int i; + unsigned int rv = 0; + for (i = 0; i != n; ++i) { + const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; + const unsigned int b = get_cabac(c, state0 + idx); + rv = (rv << 1) | b; + } + return rv; +} +#endif + + +// N.B. levels returned are the values assuming coeff_abs_level_remaining +// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +// this version of events. 
+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, + int * const pprev_subset_coded, int * const psum, + const unsigned int idx0_gt1, const unsigned int idx_gt2) +{ + CABACContext * const c = &lc->cc; + uint8_t * const state0 = lc->cabac_state + idx0_gt1; + uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; + unsigned int rv; + unsigned int i; + const unsigned int n = FFMIN(n_end, 8); + + // Really this is i != n but the simple unconditional loop is cheaper + // and faster + for (i = 0; i != 8; ++i) + levels[i] = 1; + + // Decode n greater1 flags; the first decoded flag is the most significant of the n bits + rv = get_cabac_greater1_bits(c, n, state0); + + *pprev_subset_coded = 0; + *psum = n; + + // Left-justify the flags so that bit 31 is the first flag. + // NOTE(review): n == 0 would make this a shift by 32 - presumably callers always + // pass n_end >= 1; confirm against the call site + rv <<= (32 - n); + if (rv != 0) + { + *pprev_subset_coded = 1; + *psum = n + 1; + // i = position of the first set greater1 flag; only that entry gets level 2 + // and a greater2 flag decode + i = hevc_clz32(rv); + levels[i] = 2; + if (get_cabac(c, state_gt2) == 0) + { + // Unset first coded bit + rv &= ~(0x80000000U >> i); + } + } + + // Entries past the first 8 get no greater1 decode: their level is 0 and their + // bits, directly below the top byte of rv, are all set + if (n_end > 8) { + const unsigned int g8 = n_end - 8; + rv |= ((1 << g8) - 1) << (24 - g8); + for (i = 0; i != g8; ++i) { + levels[i + 8] = 0; + } + } + + return rv; +} + +// extended_precision_processing_flag must be false given we are +// putting the result into a 16-bit array +// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) +// scale_m is uint8_t +// +// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) +// or it can be 2 (if we have transquant_bypass) +// shift is set to one less than we really want but would normally be +// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6 +// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) +// to achieve it + +// Returns level * scale * scale_m shifted right by shift, rounded by the final +// add-1-and-halve step and saturated to the int16 range +#ifndef trans_scale_sat +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ + return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); +} +#endif + + +// Adapt *stat_coeff from the value just decoded: increment when +// (last_coeff_abs_level_remaining * 2) >> c_rice_param is at least 6, +// decrement (never below 0) when it is 0 +#ifndef update_rice +static inline void update_rice(uint8_t * const stat_coeff, + const unsigned int last_coeff_abs_level_remaining, + const unsigned int c_rice_param) +{ + const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; + if (x >= 6) + (*stat_coeff)++; + else if (x == 0 && *stat_coeff > 0) + (*stat_coeff)--; +} +#endif + + +// n must be > 0 on entry +// Decodes one flag for each position from n down to 1 using context +// state0[ctx_map[position]] and appends every position whose flag is set to p. +// Returns the advanced p. ctx_map is read only; the qualifier used to be +// written twice (const uint8_t const *), which compilers warn about. +#ifndef get_cabac_sig_coeff_flag_idxs +static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, + unsigned int n, + const uint8_t * ctx_map, + uint8_t * p) +{ + do { + if (get_cabac(c, state0 + ctx_map[n])) + *p++ = n; + } while (--n != 0); + return p; +} +#endif + + +// Wrapper around get_cabac_sig_coeff_flag_idxs that returns the number of +// indices written to flag_idx +static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, + unsigned int n, + const uint8_t * ctx_map, // const ptr here but not in asm + uint8_t * const flag_idx) +{ + int rv; + + rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; + + return rv; +} + +// Initialiser helpers: each takes the same 16 values and emits them as a +// brace-enclosed list in a different order. H4x4 keeps the argument order. +#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ + x0, x1, x2, x3,\ + x4, x5, x6, x7,\ + x8, x9, x10, x11,\ + x12, x13, x14, x15} + +// V4x4 emits the arguments transposed as a 4x4 grid +#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ + x0, x4, x8, x12,\ + x1, x5, x9, x13,\ + x2, x6, x10, x14,\ + x3, x7, x11, x15} + +// D4x4 emits the arguments in the fixed permutation below (presumably the 4x4 diagonal scan order) +#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ + x0, x4, x1, x8,\ + x5, x2, x12, x9,\ + x6, x3, x13, x10,\ + x7, x14, x11, x15} + + 
+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, + uint8_t * const significant_coeff_group_flag, + const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, + int * const pPrev_sig) +{ + while (--i >= 0) { + uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; + const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but + // we use the full Right * 2 + Down when calculating + // significant coeff flags so we obtain it here. + // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits + const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) + { + gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } + } + + return i; +} + +static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, + const unsigned int log2_trafo_size, const unsigned int c_idx, + const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +{ + const AVFrame * const frame = s->frame; + const unsigned int stride = frame_stride1(s->frame, c_idx); + const unsigned int x = x0 >> ctx_hshift(s, c_idx); + const unsigned int y = y0 >> ctx_vshift(s, c_idx); + const int is_sliced = 1; // av_rpi_is_sand_frame(frame); + uint8_t * const dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? 
+ av_rpi_sand_frame_pos_y(frame, x, y) : + av_rpi_sand_frame_pos_c(frame, x, y); + + const unsigned int i = jb->intra.n; + HEVCPredCmd *const pc = jb->intra.cmds + i - 1; + + if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && + pc->ta.dst == dst) + { + av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && + pc->ta.stride == stride); + + pc->type = RPI_PRED_ADD_RESIDUAL_C; + } + else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && + pc->dc.dst == dst) + { + const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits + av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && + pc->dc.stride == stride); + + // Rewrite as add residual - must rewrite all fields as different union member + pc->type = RPI_PRED_ADD_RESIDUAL_V; + pc->ta.buf = coeffs; + pc->ta.dst = dst; + pc->ta.stride = stride; + pc->ta.dc = dc; + } + else + { + HEVCPredCmd * const cmd = pc + 1; + jb->intra.n = i + 1; + + cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); + cmd->size = log2_trafo_size; + cmd->ta.buf = coeffs; + cmd->ta.dst = dst; + cmd->ta.stride = stride; + cmd->ta.dc = 0; + } +} + + +static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const unsigned int log2_trafo_size, const unsigned int c_idx, + const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +{ + const AVFrame * const frame = s->frame; + const unsigned int stride = frame_stride1(s->frame, c_idx); + const unsigned int x = x0 >> ctx_hshift(s, c_idx); + const unsigned int y = y0 >> ctx_vshift(s, c_idx); + const int is_sliced = 1; + uint8_t * const dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? 
+ av_rpi_sand_frame_pos_y(frame, x, y) : + av_rpi_sand_frame_pos_c(frame, x, y); + + const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); + const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); + + const unsigned int i = jb->intra.n; + HEVCPredCmd *const pc = jb->intra.cmds + i - 1; + + if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && + pc->ta.dst == dst) + { + av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && + pc->ta.stride == stride); + + pc->ta.dc = (int16_t)coeff; + } + else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && + pc->dc.dst == dst) + { + av_assert1(pc->size == log2_trafo_size && + pc->c_idx == 1 && + pc->dc.stride == stride && + (pc->dc.dc & ~0xffff) == 0); + + pc->dc.dc |= (coeff << 16); + } + else + { + HEVCPredCmd * const cmd = pc + 1; + jb->intra.n = i + 1; + + cmd->type = RPI_PRED_ADD_DC + c_idx; + cmd->size = log2_trafo_size; + cmd->dc.dst = dst; + cmd->dc.stride = stride; + cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? 
coeff << 16 : coeff & 0xffff; + } +} + + +void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, + const int log2_trafo_size, const enum ScanType scan_idx, + const int c_idx) +{ + int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; + + int last_significant_coeff_x, last_significant_coeff_y; + int num_coeff = 0; + int prev_subset_coded = 0; + + int num_last_subset; + int x_cg_last_sig, y_cg_last_sig; + + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + + int use_vpu; +#if RPI_COMPRESS_COEFFS + int num_nonzero = 0; + int use_compress = 0; + int *coeffs32; +#endif + int use_dc = 0; + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero + int explicit_rdpcm_flag = 0; + int explicit_rdpcm_dir_flag; + + int i; + int shift,scale; + const uint8_t *scale_matrix = NULL; + uint8_t dc_scale; + const int c_idx_nz = (c_idx != 0); + const int pred_mode_intra = c_idx_nz ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + int prev_sig = 0; + int may_hide_sign; + + int16_t dummy_coeffs[16]; + + // Derive QP for dequant + if (!lc->cu.cu_transquant_bypass_flag) { + may_hide_sign = s->ps.pps->sign_data_hiding_flag; + + if (s->ps.pps->transform_skip_enabled_flag && + log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { + int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); + if (transform_skip_flag) { + trans_skip_or_bypass = 1; + if (lc->cu.pred_mode == MODE_INTRA && + s->ps.sps->implicit_rdpcm_enabled_flag && + (pred_mode_intra == 10 || pred_mode_intra == 26)) { + may_hide_sign = 0; + } + } + } + + { + static const uint8_t level_scale[8] = { + 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 + }; + const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; + + // Shift is set to one less than will actually occur as the scale + // and saturate step adds 1 and then shifts right again + scale = level_scale[qp6 & 7]; +// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); + shift = log2_trafo_size - (qp6 >> 3); + + if (shift < 0) { + scale <<= -shift; + shift = 0; + } + } + + if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { + const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? + &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; + const unsigned int matrix_id = + lc->cu.pred_mode != MODE_INTRA ? 
3 + c_idx : c_idx; + + scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; + dc_scale = scale_matrix[0]; + if (log2_trafo_size >= 4) + dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; + } + else + { + static const uint8_t sixteen_scale[64] = { + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16 + }; + scale_matrix = sixteen_scale; + dc_scale = 16; + } + } else { + static const uint8_t unit_scale[64] = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + }; + scale_matrix = unit_scale; + shift = 0; + scale = 2; // We will shift right to kill this + dc_scale = 1; + + may_hide_sign = 0; + } + + + + + if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && + trans_skip_or_bypass) { + explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); + if (explicit_rdpcm_flag) { + may_hide_sign = 0; + explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); + } + } + + last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, + &last_significant_coeff_x, &last_significant_coeff_y); + + if (last_significant_coeff_x > 3) { + int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); + last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * + (2 + (last_significant_coeff_x & 1)) + + suffix; + } + + if (last_significant_coeff_y > 3) { + int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); + last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * + (2 + (last_significant_coeff_y & 1)) + + suffix; + } + + if (scan_idx == SCAN_VERT) + FFSWAP(int, 
last_significant_coeff_x, last_significant_coeff_y); + + x_cg_last_sig = last_significant_coeff_x >> 2; + y_cg_last_sig = last_significant_coeff_y >> 2; + + switch (scan_idx) { + case SCAN_DIAG: { + int last_x_c = last_significant_coeff_x & 3; + int last_y_c = last_significant_coeff_y & 3; + + num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; + + switch (log2_trafo_size) { + case 2: + scan_x_cg = scan_1x1; + scan_y_cg = scan_1x1; + break; + case 3: + num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; + scan_x_cg = diag_scan2x2_x; + scan_y_cg = diag_scan2x2_y; + break; + case 4: + num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; + scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; + scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; + break; + case 5: + default: + num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; + scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; + scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; + break; + } + break; + } + case SCAN_HORIZ: + scan_x_cg = horiz_scan2x2_x; + scan_y_cg = horiz_scan2x2_y; + num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; + break; + default: //SCAN_VERT + scan_x_cg = horiz_scan2x2_y; + scan_y_cg = horiz_scan2x2_x; + num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; + break; + } + num_coeff++; + num_last_subset = (num_coeff - 1) >> 4; + + significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + + { + const unsigned int ccount = 1 << (log2_trafo_size * 2); + const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing + use_vpu = 0; + use_dc = (num_coeff == 1) && !special && + !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + + if (use_dc) { + // Just need a little empty space + coeffs = dummy_coeffs; + // No need to clear + } + else + { + use_vpu = !special && log2_trafo_size >= 4; +#if RPI_COMPRESS_COEFFS + use_compress = use_vpu && 
lc->jb0->coeffs.s[log2_trafo_size - 2].packed; +#endif + coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); +#if RPI_COMPRESS_COEFFS + coeffs32 = (int*)coeffs; + if (!use_compress) +#endif +#if HAVE_NEON + rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); +#else + memset(coeffs, 0, ccount * sizeof(int16_t)); +#endif + } + } + + i = num_last_subset; + do { + int implicit_non_zero_coeff = 0; + int n_end; + + uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; + + if (i == num_last_subset) { + // First time through + int last_scan_pos = num_coeff - (i << 4) - 1; + n_end = last_scan_pos - 1; + significant_coeff_flag_idx[0] = last_scan_pos; + nb_significant_coeff_flag = 1; + } else { + n_end = 15; + implicit_non_zero_coeff = (i != 0); + } + + if (n_end >= 0) { + static const uint8_t ctx_idx_maps_ts2[3][16] = { + D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; + // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 + D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 + D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 + H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 + H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 + V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 + V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } + }; + const uint8_t *ctx_idx_map_p; + int scf_offset = 0; + + if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { + ctx_idx_map_p = ctx_idx_maps[0][3]; + scf_offset = 40 + c_idx_nz; + } else { + if (c_idx_nz != 0) + scf_offset = 27; + + if (log2_trafo_size == 2) { + ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; + } else { + ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; + if (!c_idx_nz) { + if (i != 0) + scf_offset += 3; + + if (log2_trafo_size == 3) { + scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; + } else { + scf_offset += 21; + } + } else { + if (log2_trafo_size == 3) + scf_offset += 9; + else + scf_offset += 12; + } + } + } + + if (n_end > 0) { + int cnt = get_sig_coeff_flag_idxs(&lc->cc, + lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, + n_end, ctx_idx_map_p, + significant_coeff_flag_idx + nb_significant_coeff_flag); + + nb_significant_coeff_flag += cnt; + if (cnt != 0) { + implicit_non_zero_coeff = 0; + } + } + + if (implicit_non_zero_coeff == 0) { + if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { + scf_offset = 42 + c_idx_nz; + } else { + if (i == 0) { + scf_offset = c_idx_nz ? 27 : 0; + } else { + scf_offset = 2 + scf_offset; + } + } + if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { + significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; + nb_significant_coeff_flag++; + } + } else { + significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; + nb_significant_coeff_flag++; + } + } +#if RPI_COMPRESS_COEFFS + if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! + int16_t temp[32*32]; + const unsigned int ccount = 1 << (log2_trafo_size * 2); + lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; + lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer + memcpy(temp, coeffs, sizeof(int)*num_nonzero); + coeffs32 = (int *)temp; + memset(coeffs, 0, ccount * sizeof(int16_t)); + num_nonzero--; + while (num_nonzero >= 0) { + const unsigned int res = coeffs32[num_nonzero]; + const unsigned int offset = res & 0xffff; + coeffs[ offset ] = res >> 16; + num_nonzero--; + } + use_compress = 0; + } +#endif + + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 
2 : 0) | + prev_subset_coded; + const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + + (gt1_idx_delta << 2); + const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + + gt1_idx_delta; + + const unsigned int x_cg = scan_x_cg[i]; + const unsigned int y_cg = scan_y_cg[i]; + int16_t * const blk_coeffs = coeffs + + ((x_cg + (y_cg << log2_trafo_size)) << 2); + // This calculation is 'wrong' for log2_trafo_size == 2 + // but that doesn't matter as in this case x_cg & y_cg + // are always 0 so result is correct (0) anyway + const uint8_t * const blk_scale = scale_matrix + + (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); + + // * The following code block doesn't deal with these flags: + // (nor did the one it replaces) + // + // cabac_bypass_alignment_enabled_flag + // This should be easy but I can't find a test case + // extended_precision_processing_flag + // This can extend the required precision past 16bits + // so is probably tricky - also no example found yet + +#if USE_N_END_1 + if (nb_significant_coeff_flag == 1) { + // There is a small gain to be had from special casing the single + // transform coefficient case. The reduction in complexity + // makes up for the code duplication.
+ + int trans_coeff_level = 1; + int coeff_sign_flag; + int coded_val = 0; + + // initialize first elem of coeff_bas_level_greater1_flag + prev_subset_coded = 0; + + if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { + trans_coeff_level = 2; + prev_subset_coded = 1; + coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); + } + + // Probably not worth the overhead of starting by22 for just one value + coeff_sign_flag = get_cabac_bypass(&lc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { + trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); + } else { + uint8_t * const stat_coeff = + lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); + + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } + + { + const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; + const int k = (int32_t)(coeff_sign_flag << 31) >> 31; + const unsigned int scale_m = blk_scale[xy_off->scale]; + const int res = trans_scale_sat( + (trans_coeff_level ^ k) - k, // Apply sign + scale, + i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, + shift); +#if RPI_COMPRESS_COEFFS + if (use_compress) + coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); + else +#endif + blk_coeffs[xy_off->coeff] = res; + } + } + else +#endif + { + int sign_hidden = may_hide_sign; + int levels[16]; // Should be able to get away with int16_t but that fails some tests + uint32_t coeff_sign_flags; + uint32_t coded_vals = 0; + // Sum(abs(level[])) + // In fact we only need the bottom bit and in some future + // version that may be all we calculate + unsigned int sum_abs; + + coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, + &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); + + if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) + sign_hidden = 0; + + // -- Start bypass block + + bypass_start(&lc->cc); + + coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); + + if (coded_vals != 0) + { + const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; + uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : + lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); + int c_rice_param = !rice_adaptation_enabled ? 
0 : *stat_coeff >> 2; + int * level = levels - 1; + + do { + { + const unsigned int z = hevc_clz32(coded_vals) + 1; + level += z; + coded_vals <<= z; + } + + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; + + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; + + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; + } + } while (coded_vals != 0); + } + + // sign_hidden = 0 or 1 so we can combine the tests + if ((sign_hidden & sum_abs) != 0) { + levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; + } + + bypass_finish(&lc->cc); + + // -- Finish bypass block + + // Scale loop + { + int m = nb_significant_coeff_flag - 1; + + // Deal with DC component (if any) first + if (i == 0 && significant_coeff_flag_idx[m] == 0) + { + const int k = (int32_t)(coeff_sign_flags << m) >> 31; + const int res = trans_scale_sat( + (levels[m] ^ k) - k, scale, dc_scale, shift); +#if RPI_COMPRESS_COEFFS + if (use_compress) + { + coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); + } + else +#endif + { + blk_coeffs[0] = res; + } + --m; + } + +#if !USE_N_END_1 + // If N_END_1 set then m was at least 1 initially + if (m >= 0) +#endif + { + do { + const xy_off_t * const xy_off = scan_xy_off + + significant_coeff_flag_idx[m]; + const int k = (int32_t)(coeff_sign_flags << m) >> 31; + const int res = trans_scale_sat( + (levels[m] ^ k) - k, + scale, + blk_scale[xy_off->scale], + shift); +#if RPI_COMPRESS_COEFFS + if (use_compress) { + coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); + } else +#endif + blk_coeffs[xy_off->coeff] = res; + } while (--m >= 0); + } + } + + } + } + } while ((i = 
next_subset(lc, i, c_idx_nz, + significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && + !cabac_overflow(&lc->cc)); + + if (lc->cu.cu_transquant_bypass_flag) { + if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && + (pred_mode_intra == 10 || pred_mode_intra == 26))) { + int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; + + s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); + } + } else { + if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass + int rot = s->ps.sps->transform_skip_rotation_enabled_flag && + log2_trafo_size == 2 && + lc->cu.pred_mode == MODE_INTRA; + if (rot) { + for (i = 0; i < 8; i++) + FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); + } + + s->hevcdsp.dequant(coeffs, log2_trafo_size); + + if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && + lc->cu.pred_mode == MODE_INTRA && + (pred_mode_intra == 10 || pred_mode_intra == 26))) { + int mode = explicit_rdpcm_flag ? 
explicit_rdpcm_dir_flag : (pred_mode_intra == 26); + + s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); + } + } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { + s->hevcdsp.transform_4x4_luma(coeffs); + } + else if (!use_vpu) + { + int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); + if (max_xy == 0) + { + if (use_dc) + rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); + else + s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); + } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) + col_limit = FFMIN(4, col_limit); + else if (max_xy < 8) + col_limit = FFMIN(8, col_limit); + else if (max_xy < 12) + col_limit = FFMIN(24, col_limit); + s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); + } + } + } + +#if 0 + // Mildly rotted - we support no mode where cross is valid + if (lc->tu.cross_pf) { + int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; + const int ccount = 1 << (log2_trafo_size * 2); + + for (i = 0; i < ccount; i++) { + coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); + } + } +#endif + + if (!use_dc) { +#if RPI_COMPRESS_COEFFS + if (use_compress) { + coeffs32[num_nonzero] = 0; + } +#endif + rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); + } +} + +#if !USE_BY22 +// Stores results to lc +MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) +{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); + + if (x) + x += abs_mvd_greater1_flag_decode(lc); + if (y) + y += abs_mvd_greater1_flag_decode(lc); + + switch (x) { + case 2: x = mvd_decode(lc); break; + case 1: x = mvd_sign_flag_decode(lc); break; + case 0: x = 0; break; + } + + switch (y) { + case 2: y = mvd_decode(lc); break; + case 1: y = mvd_sign_flag_decode(lc); break; + case 0: y = 0; break; + } + return MV_XY(x,y); +} +#else +MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const 
lc) +{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); + + if ((x | y) == 0) + return 0; + + if (x != 0) + x += abs_mvd_greater1_flag_decode(lc); + if (y != 0) + y += abs_mvd_greater1_flag_decode(lc); + + if ((x | y) == 1) + { + // Not worth starting BY22 + if (x != 0) + x = mvd_sign_flag_decode(lc); + if (y != 0) + y = mvd_sign_flag_decode(lc); + } + else + { + CABACContext * const cc = &lc->cc; + uint32_t val; + uint32_t b; + unsigned int n = 0; + + bypass_start(cc); + b = val = get_cabac_by22_peek(cc); + + if (x == 1) { + x = ((int32_t)b >> 31) | 1; + n = 1; + b <<= 1; + } + else if (x == 2) { + // EG1 so we have (leading one bits + 1) of suffix + // This makes prefix & suffix lengths the same + const unsigned int k = hevc_clz32(~b) + 1; + int s; + + av_assert2(k <= 15); + + b <<= k; + n = 2 * k + 1; // Includes suffix & sign + + // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked + // if we are going to do this without a flush + if (k > CABAC_BY22_PEEK_BITS / 2 - 1) + { + // Need too many bits - flush + // n = k + get_cabac_by22_flush(cc, k, val); + b = val = get_cabac_by22_peek(cc); + n = k + 1; + } + + x = (b >> (32 - k)) + (1 << k); + b <<= k; + s = (int32_t)b >> 31; + x = (x ^ s) - s; + b <<= 1; + + // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) + if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) + { + get_cabac_by22_flush(cc, n, val); + b = val = get_cabac_by22_peek(cc); + n = 0; + } + } + + if (y == 1) { + y = ((int32_t)b >> 31) | 1; + ++n; + // don't care about b anymore + } + else if (y == 2) { + const unsigned int k = hevc_clz32(~b) + 1; + int s; + + av_assert2(k <= 15); + + // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked + // if we are going to do this without a flush + b <<= k; + n += 2 * k + 1; + + if (n > CABAC_BY22_PEEK_BITS) + { + // Need too many bits - flush + get_cabac_by22_flush(cc, n - (k + 1), val); + b = val = get_cabac_by22_peek(cc); + n = k + 1; + } + + y = (b >> (32 - k)) + (1 << k); + s = (int32_t)(b << k) >> 31; + y = (y ^ s) - s; + // don't care about b anymore + } + + get_cabac_by22_flush(cc, n, val); + bypass_finish(cc); + } + + return MV_XY(x, y); +} +#endif diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h new file mode 100644 index 0000000000..ca191f00d9 --- /dev/null +++ b/libavcodec/rpi_hevc_cabac_fns.h @@ -0,0 +1,217 @@ +/* + * HEVC CABAC decoding + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2018 John Cox + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H +#define AVCODEC_RPI_HEVC_CABAC_FNS_H + +#include "config.h" +#include "rpi_hevcdec.h" + +void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); +void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); +int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); +int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); +int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); +int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); + +//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); +void 
ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, + const int log2_trafo_size, const enum ScanType scan_idx, + const int c_idx); + +MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); + +#define HEVC_BIN_SAO_MERGE_FLAG 0 +#define HEVC_BIN_SAO_TYPE_IDX 1 +#define HEVC_BIN_SAO_EO_CLASS 2 +#define HEVC_BIN_SAO_BAND_POSITION 2 +#define HEVC_BIN_SAO_OFFSET_ABS 2 +#define HEVC_BIN_SAO_OFFSET_SIGN 2 +#define HEVC_BIN_END_OF_SLICE_FLAG 2 +#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 +#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 +#define HEVC_BIN_SKIP_FLAG 6 +#define HEVC_BIN_CU_QP_DELTA 9 +#define HEVC_BIN_PRED_MODE 12 +#define HEVC_BIN_PART_MODE 13 +#define HEVC_BIN_PCM_FLAG 17 +#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 +#define HEVC_BIN_MPM_IDX 18 +#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 +#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 +#define HEVC_BIN_MERGE_FLAG 20 +#define HEVC_BIN_MERGE_IDX 21 +#define HEVC_BIN_INTER_PRED_IDC 22 +#define HEVC_BIN_REF_IDX_L0 27 +#define HEVC_BIN_REF_IDX_L1 29 +#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 +#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 +#define HEVC_BIN_ABS_MVD_MINUS2 35 +#define HEVC_BIN_MVD_SIGN_FLAG 35 +#define HEVC_BIN_MVP_LX_FLAG 35 +#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 +#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 +#define HEVC_BIN_CBF_LUMA 40 +#define HEVC_BIN_CBF_CB_CR 42 +#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 +#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 +#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 +#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 +#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 +#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 +#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 +#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 +#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 +#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 +#define 
HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 +#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 +#define HEVC_BIN_COEFF_SIGN_FLAG 166 +#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 +#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 +#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 +#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 + + +int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); +int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); + +static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { + const uint8_t *ptr = c->bytestream; + + if (c->low & 0x1) + ptr--; +#if CABAC_BITS == 16 + if (c->low & 0x1FF) + ptr--; +#endif + if ((int) (c->bytestream_end - ptr) < n) + return NULL; + if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) + return NULL; + + return ptr; +} + +static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); +} + +static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); +} + +static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); +} + +static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int ct_depth, + const unsigned int x0, const unsigned int y0) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + + ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + + ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); +} + +static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, const int x_cb, const int y_cb) +{ + return 
ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + + (s->cabac_stash_left[y0 >> 3] & 1) + + (s->cabac_stash_up[x0 >> 3] & 1)); +} + +static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); +} + +static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac_terminate(&lc->cc); +} + +static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); +} + +static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); +} + +static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); +} + +static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); +} + +static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); +} + +static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); +} + +static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); +} + +static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) +{ + return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + 
HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); +} + + + +#endif + diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c new file mode 100644 index 0000000000..341bb77d9d --- /dev/null +++ b/libavcodec/rpi_hevc_data.c @@ -0,0 +1,75 @@ +/* + * HEVC shared tables + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "rpi_hevc_data.h" + +const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { + 0, 0, 1, 0, + 1, 2, 0, 1, + 2, 3, 1, 2, + 3, 2, 3, 3, +}; + +const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { + 0, 1, 0, 2, + 1, 0, 3, 2, + 1, 0, 3, 2, + 1, 3, 2, 3, +}; + +const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { + 0, 0, 1, 0, + 1, 2, 0, 1, + 2, 3, 0, 1, + 2, 3, 4, 0, + 1, 2, 3, 4, + 5, 0, 1, 2, + 3, 4, 5, 6, + 0, 1, 2, 3, + 4, 5, 6, 7, + 1, 2, 3, 4, + 5, 6, 7, 2, + 3, 4, 5, 6, + 7, 3, 4, 5, + 6, 7, 4, 5, + 6, 7, 5, 6, + 7, 6, 7, 7, +}; + +const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { + 0, 1, 0, 2, + 1, 0, 3, 2, + 1, 0, 4, 3, + 2, 1, 0, 5, + 4, 3, 2, 1, + 0, 6, 5, 4, + 3, 2, 1, 0, + 7, 6, 5, 4, + 3, 2, 1, 0, + 7, 6, 5, 4, + 3, 2, 1, 7, + 6, 5, 4, 3, + 2, 7, 6, 5, + 4, 3, 7, 6, + 5, 4, 7, 6, + 5, 7, 6, 7, +}; diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h new file mode 100644 index 0000000000..0aee673d8b ---
/dev/null +++ b/libavcodec/rpi_hevc_data.h @@ -0,0 +1,31 @@ +/* + * HEVC shared data tables + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVC_DATA_H +#define AVCODEC_RPI_HEVC_DATA_H + +#include <stdint.h> + +extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; +extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; +extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; +extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; + +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 index 0000000000..5125d1eb6b --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c @@ -0,0 +1,1210 @@ +/* + * HEVC video decoder + * + * Originally by: + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2013 Seppo Tomperi + * Copyright (C) 2013 Wassim Hamidouche + * + * Substantially rewritten: + * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +//#define DISABLE_SAO +//#define DISABLE_DEBLOCK +//#define DISABLE_STRENGTHS +// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +//#define DISABLE_DEBLOCK_NONREF + +#include "libavutil/common.h" +#include "libavutil/internal.h" + +#include "rpi_hevcdec.h" + +#include "bit_depth_template.c" + +#include "rpi_qpu.h" +#include "rpi_zc.h" +#include "libavutil/rpi_sand_fns.h" + +#define LUMA 0 +#define CB 1 +#define CR 2 + +// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 +// so -12,75 overall +static const uint8_t tctablex[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 + 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 +}; +#define tctable (tctablex + 12 + 6*8) + +static const uint8_t betatablex[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 
28, 30, 32, 34, 36, // QP 19...37 + 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 +}; +#define betatable (betatablex + 12 + 6*8) + +static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, + const int c_idx, const int tc_offset) +{ + return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; +} + +static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, + const unsigned int xBase, const unsigned int yBase) +{ + const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; + const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; + const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; + const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; + const unsigned int min_cb_width = s->ps.sps->min_cb_width; + const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; + const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; + const int qPy_pred = lc->qPy_pred; + + return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : + s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + + ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : + s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; +} + +// * Only called from bitstream decode in foreground +// so should be safe +void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) +{ + const int qp_y = get_qPy_pred(s, lc, xBase, yBase); + + if (lc->tu.cu_qp_delta != 0) { + // ?? I suspect that the -bd_offset here leads to us adding it elsewhere + int off = s->ps.sps->qp_bd_offset; + lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, + 52 + off) - off; + } else + lc->qp_y = qp_y; +} + +static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) +{ + return c_idx != 0 ? 
1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +} + +// "DSP" these? +static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) +{ + switch (pixel_shift) + { + case 2: + *(uint32_t *)dst = *(uint32_t *)src; + break; + case 1: + *(uint16_t *)dst = *(uint16_t *)src; + break; + default: + *dst = *src; + break; + } +} + +static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, + ptrdiff_t stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) +{ + const unsigned int sh = pixel_shift(s, c_idx); + const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); + const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); + + /* copy horizontal edges */ + memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), + src, width << sh); + memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), + src + stride_src * (height - 1), width << sh); + + /* copy vertical edges */ + ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); + + ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); +} + +// N.B. Src & dst are swapped as this is a restore! +// x0 & y0 are in luma coords +// Width & height are in Y/C pels as appropriate +// * Clear scope for optimsation here but not used enough to be worth it +static void restore_tqb_pixels(const HEVCRpiContext * const s, + uint8_t *src1, const uint8_t *dst1, + const ptrdiff_t stride_src, const ptrdiff_t stride_dst, + const unsigned int x0, const unsigned int y0, + const unsigned int width, const int height, + const int c_idx) +{ + if (s->ps.pps->transquant_bypass_enable_flag || + s->ps.sps->pcm.loop_filter_disable_flag) + { + const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; + int blks_y = height >> (c_idx == 0 ? 
3 : 2); + const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand + const unsigned int bheight = (c_idx == 0) ? 8 : 4; + const unsigned int sh = ((x0 >> 3) & 7); + const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1; + + do { + unsigned int m = (*pcm >> sh) & mask; + uint8_t * bd = src1; + const uint8_t * bs = dst1; + while (m != 0) { + if ((m & 1) != 0) { + s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); + } + m >>= 1; + bs += bwidth; + bd += bwidth; + } + src1 += stride_src * bheight; + dst1 += stride_dst * bheight; + pcm += s->ps.sps->pcm_width; + } while (--blks_y > 0); + } +} + +#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) + +static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) +{ +#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +#elif SAO_FILTER_N == 6 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +#else +#error Confused by size of sao fn array +#endif + int c_idx; + int edges[4]; // 0 left 1 top 2 right 3 bottom + int x_ctb = x >> s->ps.sps->log2_ctb_size; + int y_ctb = y >> s->ps.sps->log2_ctb_size; + int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; + int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; + RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); + // flags indicating unfilterable edges + uint8_t vert_edge[] = { 0, 0 }; + uint8_t horiz_edge[] = { 0, 0 }; + uint8_t diag_edge[] = { 0, 0, 0, 0 }; + uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); + uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && + !s->ps.pps->loop_filter_across_tiles_enabled_flag; + uint8_t restore = no_tile_filter || !lfase; + uint8_t left_tile_edge = 0; + uint8_t right_tile_edge = 0; + uint8_t up_tile_edge = 0; + uint8_t bottom_tile_edge = 0; + const int sliced = 1; + 
const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1); + + edges[0] = x_ctb == 0; + edges[1] = y_ctb == 0; + edges[2] = x_ctb == s->ps.sps->ctb_width - 1; + edges[3] = y_ctb == s->ps.sps->ctb_height - 1; + +#ifdef DISABLE_SAO + return; +#endif + + if (restore) { + if (!edges[0]) { + left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; + vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; + } + if (!edges[2]) { + right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; + vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; + } + if (!edges[1]) { + up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; + horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; + } + if (!edges[3]) { + bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; + horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; + } + if (!edges[0] && !edges[1]) { + diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; + } + if (!edges[1] && !edges[2]) { + diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; + } + if (!edges[2] && !edges[3]) { + diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != 
CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; + } + if (!edges[0] && !edges[3]) { + diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; + } + } + + for (c_idx = 0; c_idx < plane_count; c_idx++) { + const unsigned int vshift = ctx_vshift(s, c_idx); + const unsigned int hshift = ctx_hshift(s, c_idx); + const int x0 = x >> hshift; + const int y0 = y >> vshift; + const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); + const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; + const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; + const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); + const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); + int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; + ptrdiff_t stride_dst; + uint8_t *dst; + + const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? + &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? + av_rpi_sand_frame_pos_y(s->frame, x0, y0) : + av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? + av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : + av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : + av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + if (sliced && c_idx > 1) { + break; + } + +// if (c_idx == 1) +// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); + + switch (sao->type_idx[c_idx]) { + case SAO_BAND: + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (s->ps.pps->transquant_bypass_enable_flag || + s->ps.sps->pcm.loop_filter_disable_flag) + { + // Can't use the edge buffer here as it may be in use by the foreground + DECLARE_ALIGNED(64, uint8_t, dstbuf) + [2*MAX_PB_SIZE*MAX_PB_SIZE]; + dst = dstbuf; + stride_dst = 2*MAX_PB_SIZE; + s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], + width, height); + } + else + { + s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, + sao->offset_val[c_idx], sao->band_position[c_idx], + width, height); + } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + } else { + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], + width, height); + } + else + { + s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, + sao->offset_val[c_idx], sao->band_position[c_idx], + width, height); + } + } + sao->type_idx[c_idx] = SAO_APPLIED; + break; + case SAO_EDGE: + { + const int w = s->ps.sps->width >> hshift; + const int h = s->ps.sps->height >> vshift; + int top_edge = edges[1]; + int bottom_edge = edges[3]; + // Can't use the edge buffer here as it may be in use by the foreground + DECLARE_ALIGNED(64, uint8_t, dstbuf) + [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; + + stride_dst = 
RPI_HEVC_SAO_BUF_STRIDE; + dst = dstbuf + stride_dst + 32; + + if (!top_edge) { + uint8_t *dst1; + int src_idx; + const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + + dst1 = dst - stride_dst; + + if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); + } + + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); + + if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); + } + } + if (!bottom_edge) { + uint8_t * const dst1 = dst + height * stride_dst; + int src_idx; + const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); + const unsigned int hoff = height * stride_src; + + if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); + } + + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); + + if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); + copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); + } + } + if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + ff_hevc_rpi_copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { + ff_hevc_rpi_copy_vert(dst - (1 << sh), + src_l, + sh, height, stride_dst, stride_src); + } + } + if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + ff_hevc_rpi_copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { + ff_hevc_rpi_copy_vert(dst + (width << sh), + src_r, + sh, height, stride_dst, stride_src); + } + } + + s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) + s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, + sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], + width, height); + s->hevcdsp.sao_edge_restore_c[restore](src, dst, + stride_src, stride_dst, + sao, + edges, width, + height, c_idx, + vert_edge, + horiz_edge, + diag_edge); + } + else + { + s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], + sao->eo_class[c_idx], width, height); + s->hevcdsp.sao_edge_restore[restore](src, dst, + stride_src, stride_dst, + sao, + edges, width, + height, c_idx, + vert_edge, + horiz_edge, + diag_edge); + } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; + break; + } + } + } + +#if RPI_ZC_SAND_8_IN_10_BUF + if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && + (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) + { + const unsigned int stride1 = 
frame_stride1(s->frame, 1); + const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); + const unsigned int xoff = (x >> 8) * stride2 * stride1; + const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); + const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; + uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; + const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; + uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; + const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); + const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; + +// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); + av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); + av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); + } +#endif +} + +// When bits are delivered to deblock we want them +//#define TL 1 +//#define TR 2 +//#define BL 4 +//#define BR 8 + +// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br +// so we need to rearrange before passing on + +static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; + return (pcm[0] | + (pcm[1] << 8) | + (pcm[s->ps.sps->pcm_width] << 16) | + (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); +} + +static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; + return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); +} + +// We cast away const here as we want this to work for both get and set +static inline uint32_t * bs_ptr32(const uint8_t * bs, const 
unsigned int stride2, const unsigned int x, const unsigned int y) +{ + return (uint32_t *)(bs + +#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 +#warning Unexpected masks + // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes + ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & + (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + +#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 +#error Stride1 < return size +#endif + ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + + (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); +} + +static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) +{ + return (uint8_t *)(bs + + ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & + (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + + ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + + (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); +} + + +// Get block strength +// Given how we call we will always get within the 32bit boundries +static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, + unsigned int xl, unsigned int xr, const unsigned int y) +{ + if (xr <= xl) { + return 0; + } + else + { +#if HAVE_ARMV6T2_INLINE +#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 +#error This case not yet handled in bs_get32 +#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 +#error Stride1 < return size +#endif + uint32_t tmp; + __asm__ ( + "lsr %[tmp], %[xl], %[xl_shift] \n\t" + "rsb %[xr], %[xl], %[xr] \n\t" + "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" + "add %[xr], %[xr], #7 \n\t" + "lsr %[bs], %[y], %[y_shift1] \n\t" + "bic %[xr], %[xr], #7 \n\t" + "ubfx %[xl], %[xl], #1, #5 \n\t" + "lsr %[xr], %[xr], #1 \n\t" + "cmp %[xr], #32 \n\t" + "mvn %[tmp], #0 \n\t" + "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" + "lsl %[tmp], %[tmp], %[xr] \n\t" + "lsr %[xl], %[bs], %[xl] \n\t" + "it ne 
\n\t" + "bicne %[bs], %[xl], %[tmp] \n\t" + : // Outputs + [bs]"+r"(bs), + [stride2]"+r"(stride2), + [xl]"+r"(xl), + [xr]"+r"(xr), + [tmp]"=&r"(tmp) + : // Inputs + [y]"r"(y), + [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), + [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), + [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + : // Clobbers + "cc" + ); + return (uint32_t) bs; +#else + const uint32_t a = *bs_ptr32(bs, stride2, xl, y); + const unsigned int n = ((xr - xl + 7) & ~7) >> 1; + + return n == 32 ? a : + (a >> ((xl >> 1) & 31)) & ~(~0U << n); +#endif + } +} + +static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) +{ + av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); + return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); +} + +static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) +{ + av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); + return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); +} + + +static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) +{ + const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; + const unsigned int ctb_size = (1 << log2_ctb_size); + const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); + const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; + const DBParams * cb_dbp = s->deblock + ctb_n; + const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); + + unsigned int cb_x; + + // Do in CTB-shaped blocks + for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) + { + const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); + const unsigned int bv_l = FFMAX(cb_x, 8); + const unsigned int bh_r = cb_x + ctb_size >= cb_r ? 
cb_r - 8 : cb_x + ctb_size - 9; + const unsigned int bh_l = bv_l - 8; + unsigned int y; + + // Main body + for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) + { + uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); + + const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + + if (vbs != 0) + { + const uint8_t * const tcv = tctable + dbp->tc_offset; + const uint8_t * const betav = betatable + dbp->beta_offset; + unsigned int pcmfa = pcm2(s, bv_l - 1, y); + unsigned int x; + + for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) + { + if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) + { + const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + frame_stride1(s->frame, LUMA), + betav[qp], + ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | + (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), + pcmfa & 3, + av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + } + } + + if (y != 0) + { + uint32_t hbs; + + // H left - mostly separated out so we only need a uint32_t hbs + if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) + { + const unsigned int x = bh_l; + const unsigned int pcmfa = pcm4(s, bh_l, y - 1); + const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const DBParams * const dbph = dbp - 1; + const uint8_t * const tc = tctable + dbph->tc_offset + qp; + + av_assert2(cb_x - bh_l == 8); + + s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + frame_stride1(s->frame, LUMA), + betatable[qp + dbph->beta_offset], + ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | + (((hbs & 0xc) == 0 ? 
0 : tc[(hbs >> 2) & 2]) << 16), + (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); + } + + // H + if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop + { + unsigned int x; + unsigned int pcmfa = pcm4(s, cb_x, y - 1); + + for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) + { + if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) + { + const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const uint8_t * const tc = tctable + dbp->tc_offset + qp; + s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + frame_stride1(s->frame, LUMA), + betatable[qp + dbp->beta_offset], + ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | + (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), + (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); + } + } + } + } + + } + } +} + +static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) +{ + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; + const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; +} + +static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) +{ + const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; + const unsigned int ctb_size = (1 << log2_ctb_size); + const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); + const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; + const DBParams * dbp = s->deblock + ctb_n; + const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); + const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; + const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; + + unsigned int cb_x; + + av_assert1((bounds.x & (ctb_size - 1)) == 0); + av_assert1((bounds.y & (ctb_size - 1)) == 0); + av_assert1(bounds.h <= ctb_size); + + // Do in CTB-shaped blocks + for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { + const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); + const unsigned int bv_l = FFMAX(cb_x, 16); + unsigned int y; + + // V above + if (bounds.y != 0) { + // Deblock V up 8 + // CTB above current + // Top-half only (tc4 & ~0xffff == 0) is special cased in asm + const unsigned int y = bounds.y - 8; + uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; + + if (vbs != 0) + { + unsigned int pcmfa = pcm2(s, bv_l - 1, y); + const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; + unsigned int x; + + for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { + if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) + { + const int qp0 = q2h(s, x, y); + s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), + tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), + av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + pcmfa & 3); + } + } + } + } + + for (y = bounds.y; y < b_b; y += 16) + { + uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | + (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); + + // V + if (vbs != 0) + { + unsigned int x; + unsigned int pcmfa = + (y + 16 > b_b ? 
+ pcm2(s, bv_l - 1, y) | 0xffff0000 : + pcm4(s, bv_l - 1, y)); + const uint8_t * const tc = tctable + 2 + dbp->tc_offset; + + for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) + { + if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) + { + const int qp0 = q2h(s, x, y); + const int qp1 = q2h(s, x, y + 8); + s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), + ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | + ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), + av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); + } + } + } + + // H + if (y != 0) + { + uint32_t hbs; + const unsigned int bh_l = bv_l - 16; + const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; + const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; + const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; + + // H left - mostly separated out so we only need a uint32_t hbs + // Stub is width 8 to the left of bounds, but width 16 internally + if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) + { + unsigned int pcmfa = pcm4(s, bh_l, y - 1); + + // Chop off bits we don't want... + if (bh_l < bounds.x) { + pcmfa |= 0x10001; // TL|BL pre rearrangement + hbs &= ~3; // Make BS 0 + } + + // Double check we still want this + if (hbs != 0 && (~pcmfa & 0x30003) != 0) + { + const unsigned int x = bh_l; + const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; + const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; + + s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), + ((hbs & 2) == 0 ? 
0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | + ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), + (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); + } + } + + // H main + if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) + { + unsigned int x; + unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it + + for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) + { + if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) + { + const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; + const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; + const uint8_t * const tc = tctable + 2 + dbp->tc_offset; + + s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + frame_stride1(s->frame, 1), + ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | + ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), + (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); + } + } + } + } + } + } +} + +static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n) +{ + return x & ~(~0U << log2_n); +} + +static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + av_assert2((y & 7) == 0); + + // This doesn't have the same simultainious update issues that bsf_stash + // does (other threads will have a different y) so we can do it the easy way + if ((bsf &= mask) != 0) + *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); +} + + +static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) +{ + // We arrange this in a slightly odd fashion but it lines up with + // how we are going to use it in the actual deblock code & it is easier + // to do the contortions here than there + // + // Arrange 
(LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},... + + av_assert2((x & 7) == 0); + + if ((bsf &= mask) != 0) + { + uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); + const unsigned int sh = ((x & 8) | (y & 4)) >> 1; + + if (mask <= 0xf) + { + *p |= (bsf << sh); + } + else + { + do { + *p |= (bsf & 0xf) << sh; + p += HEVC_RPI_BS_STRIDE1_BYTES; + } while ((bsf >>= 4) != 0); + } + } +} + +static inline uint32_t bsf_mv(const HEVCRpiContext * const s, + const unsigned int rep, const unsigned int dup, + const unsigned int mvf_stride0, + const unsigned int mvf_stride1, + const RefPicList * const rpl_p, const RefPicList * const rpl_q, + const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) +{ + return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, + mvf_p, mvf_q, + rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, + sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); +} + + +void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, + const HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_trafo_size, + const int is_coded_block) +{ + const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); + const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; + const RefPicList * const rpl = s->refPicList; + // Rep count for bsf_mv when running with min_pu chuncks + const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; + const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; + const unsigned int trafo_size = (1U << log2_trafo_size); + const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; + const uint32_t bsf_cbf = (bsf_mask & 0x55555555); + + // Do we cover a pred split line? 
+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split; + const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split; + + uint32_t bsf_h; + uint32_t bsf_v; + +#ifdef DISABLE_STRENGTHS + return; +#endif + + // We are always on a size boundary + av_assert2((x0 & (trafo_size - 1)) == 0); + av_assert2((y0 & (trafo_size - 1)) == 0); + // log2_trafo_size not really a transform size; we can have to deal + // with size 2^6 blocks + av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6); + + // Retrieve and update coded (b0), intra (b1) bs flags + // + // Store on min width (rather than uint32_t) to avoid possible issues + // with another thread on another core running wpp using the same + // memory (min CTB = 16 pels = 4 bsf els = 8 bits) + // + // In bsf BS=2 is represented by 3 as it is much easier to test & set + // and the actual deblock code tests for 0 and b1 set/not-set so 2 and + // 3 will work the same + { + // Given where we are called from is_cbf_luma & is_intra will be constant over the block + const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? 
bsf_cbf : 0; + uint8_t *const p = s->bsf_stash_up + (x0 >> 4); + uint8_t *const q = s->bsf_stash_left + (y0 >> 4); + + switch (log2_trafo_size) + { + case 2: + case 3: + { + const unsigned int sh_h = (x0 >> 1) & 7; + const unsigned int sh_v = (y0 >> 1) & 7; + bsf_h = *p; + bsf_v = *q; + *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); + *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v); + bsf_h >>= sh_h; + bsf_v >>= sh_v; + break; + } + case 4: + bsf_h = *p; + bsf_v = *q; + *p = bsf0; + *q = bsf0; + break; + case 5: + bsf_h = *(uint16_t *)p; + bsf_v = *(uint16_t *)q; + *(uint16_t *)p = bsf0; + *(uint16_t *)q = bsf0; + break; + case 6: + default: + bsf_h = *(uint32_t *)p; + bsf_v = *(uint32_t *)q; + *(uint32_t *)p = bsf0; + *(uint32_t *)q = bsf0; + break; + } + + bsf_h |= bsf0; + bsf_v |= bsf0; + } + + // Do Horizontal + if ((y0 & 7) == 0) + { + // Boundary upper + if (y0 != 0 && + (off_boundary(y0, s->ps.sps->log2_ctb_size) || + (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)) + { + // Look at MVs (BS=1) if we don't already has a full set of bs bits + if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split)) + { + // If we aren't on the top boundary we must be in the middle + // and in that case we know where mvf can change + const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; + const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? 
+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : + rpl; + + bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl_top, + mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); + } + + // Finally put the results into bs + hbs_set(s, x0, y0, bsf_mask, bsf_h); + } + + // Max of 1 pu internal split - ignore if not on 8pel boundary + if (has_y_split && !off_boundary(lc->cu.y_split, 3)) + { + const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); + // If we have the x split as well then it must be in the middle + const unsigned int log2_rep = has_x_split ? 1 : 0; + + hbs_set(s, x0, lc->cu.y_split, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl, + mvf, mvf - MVF_STASH_WIDTH_PU)); + } + } + + // And again for vertical - same logic as horizontal just in the other direction + if ((x0 & 7) == 0) + { + // Boundary left + if (x0 != 0 && + (off_boundary(x0, s->ps.sps->log2_ctb_size) || + (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) + { + if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) + { + const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; + const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? 
+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : + rpl; + + bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl_left, + mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); + } + + vbs_set(s, x0, y0, bsf_mask, bsf_v); + } + + if (has_x_split && !off_boundary(lc->cu.x_split, 3)) + { + const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); + const unsigned int log2_rep = has_y_split ? 1 : 0; + + vbs_set(s, lc->cu.x_split, y0, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl, + mvf, mvf - 1)); + } + } +} + +#undef LUMA +#undef CB +#undef CR + +static inline unsigned int ussub(const unsigned int a, const unsigned int b) +{ + return a < b ? 
0 : a - b; +} + +static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) +{ + return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; +} + +int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) +{ + const int ctb_size = (1 << s->ps.sps->log2_ctb_size); + int x, y; + + const unsigned int br = bounds.x + bounds.w; + const unsigned int bb = bounds.y + bounds.h; + + const int x_end = (br >= s->ps.sps->width); + const int y_end = (bb >= s->ps.sps->height); + + // Deblock may not touch the edges of the bound as they are still needed + // for Intra pred + // + // Deblock is disabled with a per-slice flag + // Given that bounds may cover multiple slices & we dblock outside bounds + // anyway we can't avoid deblock using that flag - about the only thing we + // could do is have a "no deblock seen yet" flag but it doesn't really + // seem worth the effort + + deblock_y_blk(s, bounds, x_end, y_end); + deblock_uv_blk(s, bounds, x_end, y_end); + + // SAO needs + // (a) CTB alignment + // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel + { + const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); + const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); + const unsigned int yt = ussub(bounds.y, yo); + const unsigned int yb = y_end ? bb : ussub(bb, yo); + const unsigned int xl = ussub(bounds.x, xo); + const unsigned int xr = x_end ? br : ussub(br, xo); + + if (s->ps.sps->sao_enabled) + { + for (y = yt; y < yb; y += ctb_size) { + for (x = xl; x < xr; x += ctb_size) { + sao_filter_CTB(s, x, y); + } + } + } + + // Cache invalidate + y = 0; + if (xr != 0 && yb != 0) + { + const unsigned int llen = + (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); + const unsigned int mask = ~(llen - 1); + const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask; + const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? 
br : (xr - 1) & mask; + const unsigned int it = ussub(yt, 1); + const unsigned int ib = y_end ? bb : yb - 1; + + if (il < ir) { + rpi_cache_buf_t cbuf; + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); + rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, + il, it, ir - il, ib - it, + ctx_vshift(s, 1), 1, 1); + + // If we have to commit the right hand tile boundry due to + // cache boundry considerations then at EoTile we must commit + // that boundry to bottom of tile (bounds) + if (ib != bb && ir == br && eot) { + rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, + br - 1, ib, 1, bb - ib, + ctx_vshift(s, 1), 1, 1); + } + + rpi_cache_flush_finish(rfe); + + if (x_end) + y = y_end ? INT_MAX : ib; + +// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1); + } + } + } + + return y; +} + diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h new file mode 100644 index 0000000000..6b36f5e737 --- /dev/null +++ b/libavcodec/rpi_hevc_mv.h @@ -0,0 +1,71 @@ +#ifndef AVCODEC_RPI_HEVC_MV_H +#define AVCODEC_RPI_HEVC_MV_H + +#include "config.h" + +typedef int32_t MvXY; + +typedef struct HEVCRpiMvField { + MvXY xy[2]; + int8_t ref_idx[2]; + int8_t pred_flag; + int8_t dummy; // To 12 bytes +} HEVCRpiMvField; + + +#define MV_X(xy) (((xy) << 16) >> 16) +#define MV_Y(xy) ((xy) >> 16) +#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16)) + +#if ARCH_ARM +#include "arm/rpi_hevc_mv_arm.h" +#endif + +#ifndef mvxy_add +static inline MvXY mvxy_add(const MvXY a, const MvXY b) +{ + return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); +} +#endif + + +#ifndef mv_scale_xy +static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb) +{ + int tx, scale_factor; + + td = td == 0 ? 
1 : av_clip_int8(td); + tb = av_clip_int8(tb); + tx = (0x4000 + (abs(td) >> 1)) / td; + scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); + return MV_XY( + av_clip_int16((scale_factor * MV_X(src) + 127 + + (scale_factor * MV_X(src) < 0)) >> 8), + av_clip_int16((scale_factor * MV_Y(src) + 127 + + (scale_factor * MV_Y(src) < 0)) >> 8)); +} +#endif + +// 8.3.1 states that the bitstream may not contain poc diffs that do not +// fit in 16 bits, so given that we don't care about the high bits we only +// store the low 16 + LT & Inter flags + +#define COL_POC_INTRA 0 +#define COL_POC_INTER (1 << 16) +#define COL_POC_LT (1 << 17) +#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) +#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) +#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) + +typedef struct ColMv_s { + int32_t poc; + int32_t xy; +} ColMv; + +typedef struct ColMvField_s { + ColMv L[2]; +} ColMvField; + + + +#endif // AVCODEC_RPI_HEVC_MV_H diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 index 0000000000..27a9f69525 --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c @@ -0,0 +1,487 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2013 Anand Meher Kotra + * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "hevc.h" +#include "rpi_hevcdec.h" + +static av_always_inline int +is_eq_mer(const unsigned int plevel, + const unsigned int xN, const unsigned int yN, + const unsigned int xP, const unsigned int yP) +{ + return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; +} + +// check if the mv's and refidx are the same between A and B +static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) +{ + return a->pred_flag == b->pred_flag && + ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && + ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); + return 0; +} + +/* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +static int temporal_luma_motion_vector(const HEVCRpiContext * const s, + const HEVCRpiLocalContext * const lc, const int x0, const int y0, + const int nPbW, const int nPbH, const int refIdxLx, + MvXY * const mvLXCol, const int X) +{ + int x, y; + const ColMv * cmv = NULL; + + HEVCRpiFrame * const col_ref = s->ref->collocated_ref; + const RefPicList * const refPicList = s->refPicList + X; + const int cur_lt = refPicList->isLongTerm[refIdxLx]; + + *mvLXCol = 0; + // Unlikely but we might have a col_ref IDR frame! 
+ if (col_ref->col_mvf == NULL) + return 0; + + ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); + + //bottom right collocated motion vector + x = x0 + nPbW; + y = y0 + nPbH; + + if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) + { + const ColMvField * const col = col_ref->col_mvf + (x >> 4) + + (y >> 4) * s->col_mvf_stride; + + if (col->L[0].poc != COL_POC_INTRA && + (col->L[1].poc == COL_POC_INTRA || + (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) + { + cmv = col->L + 0; + } + else if (col->L[1].poc != COL_POC_INTRA) + { + cmv = col->L + 1; + } + } + + // derive center collocated motion vector + if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) + { + cmv = NULL; + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + + { + const ColMvField * const col = col_ref->col_mvf + (x >> 4) + + (y >> 4) * s->col_mvf_stride; + + if (col->L[0].poc != COL_POC_INTRA && + (col->L[1].poc == COL_POC_INTRA || + (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) + { + cmv = col->L + 0; + } + else if (col->L[1].poc != COL_POC_INTRA) + { + cmv = col->L + 1; + } + } + } + + if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) + return 0; + + { + const int col_poc = col_ref->poc; + const int ref_poc = refPicList->list[refIdxLx]; + + *mvLXCol = (cur_lt || + cmv->poc == col_poc || + COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
+ cmv->xy : + mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); + } + + return cmv != NULL; +} + +static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) +{ + return b != NULL && compare_mv_ref_idx(a, b); +} + + + +/* + * 8.5.3.1.2 Derivation process for spatial merging candidates + */ +static inline const HEVCRpiMvField * +derive_spatial_merge_candidates( + const HEVCRpiContext * const s, + const HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int nPbW, const unsigned int nPbH, + const unsigned int avail, + const unsigned int part_idx, + const unsigned int merge_idx, + HEVCRpiMvField * const mvf_t) +{ + const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); + const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); + + const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); + const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); + const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); + const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; + const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; + const unsigned int part_mode = lc->cu.part_mode; + + const HEVCRpiMvField * perm[4]; + unsigned int nb_merge_cand = 0; + + // singleMCLFlag => part_idx == 0 so no need to test for it + if ((avail & AVAIL_L) == 0 || + (part_idx == 1 && + ((parts_a1 >> part_mode) & 1) != 0 || + is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || + mvf_a1->pred_flag == PF_INTRA) + { + mvf_a1 = NULL; + } + else + { + if (merge_idx == nb_merge_cand) + return mvf_a1; + perm[nb_merge_cand++] = mvf_a1; + } + + if ((avail & AVAIL_U) == 0 || + (part_idx == 1 && + ((parts_b1 >> part_mode) & 1) != 0 || + is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || + mvf_b1->pred_flag == PF_INTRA) + { + mvf_b1 = NULL; + } + else if (!mvf_eq(mvf_b1, mvf_a1)) + { + if (merge_idx 
== nb_merge_cand) + return mvf_b1; + perm[nb_merge_cand++] = mvf_b1; + } + + // above right spatial merge candidate + // Never need mvf_b0 again so don't bother zeroing if navail + if ((avail & AVAIL_UR) != 0 && + !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && + mvf_b0->pred_flag != PF_INTRA && + !mvf_eq(mvf_b0, mvf_b1)) + { + if (merge_idx == nb_merge_cand) + return mvf_b0; + perm[nb_merge_cand++] = mvf_b0; + } + + // left bottom spatial merge candidate + // Never need mvf_a0 again so don't bother zeroing if navail + if ((avail & AVAIL_DL) != 0 && + !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && + mvf_a0->pred_flag != PF_INTRA && + !mvf_eq(mvf_a0, mvf_a1)) + { + if (merge_idx == nb_merge_cand) + return mvf_a0; + perm[nb_merge_cand++] = mvf_a0; + } + + // above left spatial merge candidate + if (nb_merge_cand != 4 && + (avail & AVAIL_UL) != 0 && + !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) + { + const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL + + if (mvf_b2->pred_flag != PF_INTRA && + !mvf_eq(mvf_b2, mvf_a1) && + !mvf_eq(mvf_b2, mvf_b1)) + { + if (merge_idx == nb_merge_cand) + return mvf_b2; + perm[nb_merge_cand++] = mvf_b2; + } + } + + // temporal motion vector candidate + if (s->sh.slice_temporal_mvp_enabled_flag) + { + static const HEVCRpiMvField mvf_z = {{0}}; + + *mvf_t = mvf_z; + + if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, + 0, mvf_t->xy + 0, 0)) + mvf_t->pred_flag = PF_L0; + + if (s->sh.slice_type == HEVC_SLICE_B && + temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, + 0, mvf_t->xy + 1, 1)) + mvf_t->pred_flag |= PF_L1; + + if (mvf_t->pred_flag != 0) + { + if (merge_idx == nb_merge_cand) + return mvf_t; + perm[nb_merge_cand++] = mvf_t; + } + } + + // combined bi-predictive merge candidates (applies for B slices) + if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1) + { + unsigned int comb_idx = 0; + const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); + const RefPicList * const 
refPicList = s->refPicList; + + for (comb_idx = 0; comb_idx < cand_count; comb_idx++) + { + static const uint8_t l0_l1_cand_idx[12][2] = { + { 0, 1, }, + { 1, 0, }, + { 0, 2, }, + { 2, 0, }, + { 1, 2, }, + { 2, 1, }, + { 0, 3, }, + { 3, 0, }, + { 1, 3, }, + { 3, 1, }, + { 2, 3, }, + { 3, 2, }, + }; + + const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; + const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; + const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; + const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; + + if ((mvf_c0->pred_flag & PF_L0) != 0 && + (mvf_c1->pred_flag & PF_L1) != 0 && + (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || + mvf_c0->xy[0] != mvf_c1->xy[1])) + { + if (merge_idx == nb_merge_cand++) + { + // Need to be a bit careful as we will construct mvf_t and we + // may already be using that as one of our condidates + // so build & copy rather than build in place + const HEVCRpiMvField mvf_m = { + .xy = { + mvf_c0->xy[0], + mvf_c1->xy[1]}, + .ref_idx = { + mvf_c0->ref_idx[0], + mvf_c1->ref_idx[1]}, + .pred_flag = PF_BI + }; + *mvf_t = mvf_m; + return mvf_t; + } + } + } + } + + // "append" Zero motion vector candidates + { + const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? + FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; + const unsigned int zero_idx = merge_idx - nb_merge_cand; + + const HEVCRpiMvField mvf_m = { + .xy = {0, 0}, + .ref_idx = { + zero_idx < nb_refs ? zero_idx : 0, + (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, + .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? 
PF_BI : PF_L0 + }; + + *mvf_t = mvf_m; + return mvf_t; + } +} + + +// 8.5.3.1.1 Derivation process of luma Mvs for merge mode +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, + int merge_idx, HEVCRpiMvField * const mv) +{ + const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? + derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, + ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), + 0, merge_idx, mv) : + derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, + ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), + part_idx, merge_idx, mv); + + if (mvf_m != mv) + *mv = *mvf_m; + + if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) + mv->pred_flag = PF_L0; +} + + +static av_always_inline const MvXY * +mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) +{ + if (mvf != NULL) + { + if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) + return mvf->xy + pfi0; + if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) + return mvf->xy + pfi1; + } + return NULL; +} + +static av_always_inline const MvXY * +mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, + const int islt0, const int poc0, const int poc_cur, + MvXY * const mv_t, const HEVCRpiMvField * const mvf) +{ + if (mvf != NULL) + { + if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) + { + const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; + if (islt0 || poc1 == poc0) { + return mvf->xy + pfi0; + } + *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); + return mv_t; + } + if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) + { + 
const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; + if (islt0 || poc1 == poc0) { + return mvf->xy + pfi1; + } + *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); + return mv_t; + } + } + return NULL; +} + +void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int nPbW, const unsigned int nPbH, + const unsigned int avail, + HEVCRpiMvField * const mv, + const unsigned int mvp_lx_flag, const unsigned int LX) +{ + const unsigned int pfi0 = LX; + const unsigned int pfi1 = LX == 0 ? 1 : 0; + const RefPicList * const rpl = s->refPicList; + const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; + const int poc_cur = s->poc; + const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; + + const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); + const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); + const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL + const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); + const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; + const MvXY * mva = NULL; + const MvXY * mvb; + MvXY * const mv_rv = mv->xy + LX; + MvXY mvt_a, mvt_b; + + *mv_rv = 0; + + if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) + mvf_a0 = NULL; + else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) + goto use_mva; + + if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) + mvf_a1 = NULL; + + if (mva == NULL && + (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && + (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) + mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); + + if (mvp_lx_flag == 0 && mva != NULL) + goto use_mva; + + if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) + mvf_b0 = NULL; + if ((avail & AVAIL_U) == 0 || 
mvf_b1->pred_flag == PF_INTRA) + mvf_b1 = NULL; + if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) + mvf_b2 = NULL; + + if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && + (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) + mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); + + if (mvf_a0 == NULL && mvf_a1 == NULL) { + mva = mvb; + if (mvp_lx_flag == 0 && mva != NULL) + goto use_mva; + + if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && + (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) + mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); + } + + if (mva == NULL) { + mva = mvb; + mvb = NULL; + } + + if (mvb != NULL && *mva == *mvb) // If A == B then ignore B + mvb = NULL; + + if (mvp_lx_flag == 0 && mva != NULL) { + goto use_mva; + } + else if (mvp_lx_flag != 0 && mvb != NULL) { + *mv_rv = *mvb; + } + else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { + temporal_luma_motion_vector(s, lc, x0, y0, nPbW, + nPbH, mv->ref_idx[LX], + mv_rv, LX); + } + return; + +use_mva: + *mv_rv = *mva; + return; +} + diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c new file mode 100644 index 0000000000..e58a59ce5e --- /dev/null +++ b/libavcodec/rpi_hevc_parse.c @@ -0,0 +1,143 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "bytestream.h" +#include "h2645_parse.h" +#include "hevc.h" +#include "rpi_hevc_parse.h" + +static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, + HEVCSEIContext *sei, int is_nalff, int nal_length_size, + int err_recognition, int apply_defdispwin, void *logctx) +{ + int i; + int ret = 0; + H2645Packet pkt = { 0 }; + + ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, + nal_length_size, AV_CODEC_ID_HEVC, 1, 0); + if (ret < 0) { + goto done; + } + + for (i = 0; i < pkt.nb_nals; i++) { + H2645NAL *nal = &pkt.nals[i]; + + /* ignore everything except parameter sets and VCL NALUs */ + switch (nal->type) { + case HEVC_NAL_VPS: + ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); + if (ret < 0) + goto done; + break; + case HEVC_NAL_SPS: + ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); + if (ret < 0) + goto done; + break; + case HEVC_NAL_PPS: + ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); + if (ret < 0) + goto done; + break; + case HEVC_NAL_SEI_PREFIX: + case HEVC_NAL_SEI_SUFFIX: + ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); + if (ret < 0) + goto done; + break; + default: + av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); + break; + } + } + +done: + ff_h2645_packet_uninit(&pkt); + if (err_recognition & AV_EF_EXPLODE) + return ret; + + return 0; +} + +int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, + HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, + int err_recognition, int apply_defdispwin, void *logctx) +{ + int ret = 0; + GetByteContext gb; + + bytestream2_init(&gb, data, size); + + if (size > 3 && (data[0] || data[1] || data[2] > 1)) { + 
/* It seems the extradata is encoded as hvcC format. + * Temporarily, we support configurationVersion==0 until 14496-15 3rd + * is finalized. When finalized, configurationVersion will be 1 and we + * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ + int i, j, num_arrays, nal_len_size; + + *is_nalff = 1; + + bytestream2_skip(&gb, 21); + nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; + num_arrays = bytestream2_get_byte(&gb); + + /* nal units in the hvcC always have length coded with 2 bytes, + * so put a fake nal_length_size = 2 while parsing them */ + *nal_length_size = 2; + + /* Decode nal units from hvcC. */ + for (i = 0; i < num_arrays; i++) { + int type = bytestream2_get_byte(&gb) & 0x3f; + int cnt = bytestream2_get_be16(&gb); + + for (j = 0; j < cnt; j++) { + // +2 for the nal size field + int nalsize = bytestream2_peek_be16(&gb) + 2; + if (bytestream2_get_bytes_left(&gb) < nalsize) { + av_log(logctx, AV_LOG_ERROR, + "Invalid NAL unit size in extradata.\n"); + return AVERROR_INVALIDDATA; + } + + ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff, + *nal_length_size, err_recognition, apply_defdispwin, + logctx); + if (ret < 0) { + av_log(logctx, AV_LOG_ERROR, + "Decoding nal unit %d %d from hvcC failed\n", + type, i); + return ret; + } + bytestream2_skip(&gb, nalsize); + } + } + + /* Now store right nal length size, that will be used to parse + * all other nals */ + *nal_length_size = nal_len_size; + } else { + *is_nalff = 0; + ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size, + err_recognition, apply_defdispwin, logctx); + if (ret < 0) + return ret; + } + + return ret; +} diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h new file mode 100644 index 0000000000..4b4d032a16 --- /dev/null +++ b/libavcodec/rpi_hevc_parse.h @@ -0,0 +1,36 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.265 parser code + */ + +#ifndef AVCODEC_RPI_HEVC_PARSE_H +#define AVCODEC_RPI_HEVC_PARSE_H + +#include + +#include "rpi_hevc_ps.h" +#include "rpi_hevc_sei.h" + +int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, + HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, + int err_recognition, int apply_defdispwin, void *logctx); + +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 index 0000000000..f4e31f7d1d --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c @@ -0,0 +1,1938 @@ +/* + * HEVC Parameter Set decoding + * + * Copyright (C) 2012 - 2103 Guillaume Martres + * Copyright (C) 2012 - 2103 Mickael Raulet + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2013 Vittorio Giovara + * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/imgutils.h" +#include "golomb.h" +#include "rpi_hevc_data.h" +#include "rpi_hevc_ps.h" +#include "rpi_hevcdec.h" + +static const uint8_t default_scaling_list_intra[] = { + 16, 16, 16, 16, 17, 18, 21, 24, + 16, 16, 16, 16, 17, 19, 22, 25, + 16, 16, 17, 18, 20, 22, 25, 29, + 16, 16, 18, 21, 24, 27, 31, 36, + 17, 17, 20, 24, 30, 35, 41, 47, + 18, 19, 22, 27, 35, 44, 54, 65, + 21, 22, 25, 31, 41, 54, 70, 88, + 24, 25, 29, 36, 47, 65, 88, 115 +}; + +static const uint8_t default_scaling_list_inter[] = { + 16, 16, 16, 16, 17, 18, 20, 24, + 16, 16, 16, 17, 18, 20, 24, 25, + 16, 16, 17, 18, 20, 24, 25, 28, + 16, 17, 18, 20, 24, 25, 28, 33, + 17, 18, 20, 24, 25, 28, 33, 41, + 18, 20, 24, 25, 28, 33, 41, 54, + 20, 24, 25, 28, 33, 41, 54, 71, + 24, 25, 28, 33, 41, 54, 71, 91 +}; + +static const AVRational vui_sar[] = { + { 0, 1 }, + { 1, 1 }, + { 12, 11 }, + { 10, 11 }, + { 16, 11 }, + { 40, 33 }, + { 24, 11 }, + { 20, 11 }, + { 32, 11 }, + { 80, 33 }, + { 18, 11 }, + { 15, 11 }, + { 64, 33 }, + { 160, 99 }, + { 4, 3 }, + { 3, 2 }, + { 2, 1 }, +}; + + +// pps_cb_qp_offset: -12,+12 +// slice_cb_qp_offset: -12,+12 also +// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." +// cr_qp_offset_list[n]: -12,+12 +// So worst case total offset: -24,+24 + +#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) +#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 
51 : (n)) +#define M(B,n) C(B,(-n)) + +// Sizeof the QP_START_BLOCK +#define QP_OFFSET_0 (8*6 + 12*2) +#define QP_START(B) \ + M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ + M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ + M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ + M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ +\ + M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ + M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ + M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ + M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ + M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ + M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ + M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ + M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) +#define QP_END(B) \ + C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ + C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ + C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) + +#define T1(B)\ +{\ + QP_START(B),\ + C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ + C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ + C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ + C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ + C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ + C(B,44), C(B,45),\ + C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ + QP_END(B)\ +} +#define T0(B)\ +{\ + QP_START(B),\ + C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ + C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ + C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ + C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), 
C(B,39),\ + C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ + C(B,50), C(B,51),\ + C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ + QP_END(B)\ +} + +#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) + +static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; +static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; + +#undef T +#undef C +#undef QP_END + +#define C(B,n) ((n)<0?0:(n)>51?51:(n)) +// We do need a lot of -ve padding to cope with high bit depths that give -ve qps +#define QP_DBLK_OFFSET_0 QP_OFFSET_0 +#define QP_END(B)\ + 51, 51, 51, 51, 51, 51 + +// These don't need all the padding we have here (12 top/bottom would be enough) +static const uint8_t qp_c_dblk_0[] = T0(0); +static const uint8_t qp_c_dblk_1[] = T1(0); + +#undef T +#undef M +#undef C +#undef QP_END +#undef QP_START + + +static void remove_pps(HEVCRpiParamSets * const s, const int id) +{ + if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) + s->pps = NULL; + av_buffer_unref(&s->pps_list[id]); +} + +static void remove_sps(HEVCRpiParamSets * const s, const int id) +{ + int i; + if (s->sps_list[id]) { + if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) + s->sps = NULL; + + /* drop all PPS that depend on this SPS */ + for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) + if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) + remove_pps(s, i); + + av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); + } + av_buffer_unref(&s->sps_list[id]); +} + +static void remove_vps(HEVCRpiParamSets * const s, const int id) +{ + int i; + if (s->vps_list[id]) { + if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) + s->vps = NULL; + + for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) + if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) + remove_sps(s, i); + } + 
av_buffer_unref(&s->vps_list[id]); +} + +int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, + ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) +{ + uint8_t rps_predict = 0; + int delta_poc; + int k0 = 0; + int k1 = 0; + int k = 0; + int i; + + if (rps != sps->st_rps && sps->nb_st_rps) + rps_predict = get_bits1(gb); + + if (rps_predict) { + const ShortTermRPS *rps_ridx; + int delta_rps; + unsigned abs_delta_rps; + uint8_t use_delta_flag = 0; + uint8_t delta_rps_sign; + + if (is_slice_header) { + unsigned int delta_idx = get_ue_golomb_long(gb) + 1; + if (delta_idx > sps->nb_st_rps) { + av_log(avctx, AV_LOG_ERROR, + "Invalid value of delta_idx in slice header RPS: %d > %d.\n", + delta_idx, sps->nb_st_rps); + return AVERROR_INVALIDDATA; + } + rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; + rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; + } else + rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; + + delta_rps_sign = get_bits1(gb); + abs_delta_rps = get_ue_golomb_long(gb) + 1; + if (abs_delta_rps < 1 || abs_delta_rps > 32768) { + av_log(avctx, AV_LOG_ERROR, + "Invalid value of abs_delta_rps: %d\n", + abs_delta_rps); + return AVERROR_INVALIDDATA; + } + delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; + for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { + int used = rps->used[k] = get_bits1(gb); + + if (!used) + use_delta_flag = get_bits1(gb); + + if (used || use_delta_flag) { + if (i < rps_ridx->num_delta_pocs) + delta_poc = delta_rps + rps_ridx->delta_poc[i]; + else + delta_poc = delta_rps; + rps->delta_poc[k] = delta_poc; + if (delta_poc < 0) + k0++; + else + k1++; + k++; + } + } + + if (k >= FF_ARRAY_ELEMS(rps->used)) { + av_log(avctx, AV_LOG_ERROR, + "Invalid num_delta_pocs: %d\n", k); + return AVERROR_INVALIDDATA; + } + + rps->num_delta_pocs = k; + rps->num_negative_pics = k0; + // sort in increasing order (smallest first) + if (rps->num_delta_pocs != 0) { + int used, 
tmp; + for (i = 1; i < rps->num_delta_pocs; i++) { + delta_poc = rps->delta_poc[i]; + used = rps->used[i]; + for (k = i - 1; k >= 0; k--) { + tmp = rps->delta_poc[k]; + if (delta_poc < tmp) { + rps->delta_poc[k + 1] = tmp; + rps->used[k + 1] = rps->used[k]; + rps->delta_poc[k] = delta_poc; + rps->used[k] = used; + } + } + } + } + if ((rps->num_negative_pics >> 1) != 0) { + int used; + k = rps->num_negative_pics - 1; + // flip the negative values to largest first + for (i = 0; i < rps->num_negative_pics >> 1; i++) { + delta_poc = rps->delta_poc[i]; + used = rps->used[i]; + rps->delta_poc[i] = rps->delta_poc[k]; + rps->used[i] = rps->used[k]; + rps->delta_poc[k] = delta_poc; + rps->used[k] = used; + k--; + } + } + } else { + unsigned int prev, nb_positive_pics; + rps->num_negative_pics = get_ue_golomb_long(gb); + nb_positive_pics = get_ue_golomb_long(gb); + + if (rps->num_negative_pics >= HEVC_MAX_REFS || + nb_positive_pics >= HEVC_MAX_REFS) { + av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); + return AVERROR_INVALIDDATA; + } + + rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; + if (rps->num_delta_pocs) { + prev = 0; + for (i = 0; i < rps->num_negative_pics; i++) { + delta_poc = get_ue_golomb_long(gb) + 1; + if (delta_poc < 1 || delta_poc > 32768) { + av_log(avctx, AV_LOG_ERROR, + "Invalid value of delta_poc: %d\n", + delta_poc); + return AVERROR_INVALIDDATA; + } + prev -= delta_poc; + rps->delta_poc[i] = prev; + rps->used[i] = get_bits1(gb); + } + prev = 0; + for (i = 0; i < nb_positive_pics; i++) { + delta_poc = get_ue_golomb_long(gb) + 1; + if (delta_poc < 1 || delta_poc > 32768) { + av_log(avctx, AV_LOG_ERROR, + "Invalid value of delta_poc: %d\n", + delta_poc); + return AVERROR_INVALIDDATA; + } + prev += delta_poc; + rps->delta_poc[rps->num_negative_pics + i] = prev; + rps->used[rps->num_negative_pics + i] = get_bits1(gb); + } + } + } + return 0; +} + + +static int decode_profile_tier_level(GetBitContext * const gb, 
AVCodecContext * const avctx, + PTLCommon * const ptl) +{ + int i; + + if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) + return -1; + + ptl->profile_space = get_bits(gb, 2); + ptl->tier_flag = get_bits1(gb); + ptl->profile_idc = get_bits(gb, 5); + if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) + av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); + else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) + av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); + else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) + av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); + else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) + av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n"); + else + av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); + + for (i = 0; i < 32; i++) { + ptl->profile_compatibility_flag[i] = get_bits1(gb); + + if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) + ptl->profile_idc = i; + } + ptl->progressive_source_flag = get_bits1(gb); + ptl->interlaced_source_flag = get_bits1(gb); + ptl->non_packed_constraint_flag = get_bits1(gb); + ptl->frame_only_constraint_flag = get_bits1(gb); + + skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] + skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] + skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] + + return 0; +} + +static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, + PTL * const ptl, const int max_num_sub_layers) +{ + int i; + if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || + get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { + av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); + return -1; + } + + ptl->general_ptl.level_idc = get_bits(gb, 8); + + for (i = 0; i < max_num_sub_layers - 1; i++) { + ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); + ptl->sub_layer_level_present_flag[i] = get_bits1(gb); + } + + if 
(max_num_sub_layers - 1> 0) + for (i = max_num_sub_layers - 1; i < 8; i++) + skip_bits(gb, 2); // reserved_zero_2bits[i] + for (i = 0; i < max_num_sub_layers - 1; i++) { + if (ptl->sub_layer_profile_present_flag[i] && + decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { + av_log(avctx, AV_LOG_ERROR, + "PTL information for sublayer %i too short\n", i); + return -1; + } + if (ptl->sub_layer_level_present_flag[i]) { + if (get_bits_left(gb) < 8) { + av_log(avctx, AV_LOG_ERROR, + "Not enough data for sublayer %i level_idc\n", i); + return -1; + } else + ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); + } + } + + return 0; +} + +static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, + const int subpic_params_present) +{ + int i; + + for (i = 0; i < nb_cpb; i++) { + get_ue_golomb_long(gb); // bit_rate_value_minus1 + get_ue_golomb_long(gb); // cpb_size_value_minus1 + + if (subpic_params_present) { + get_ue_golomb_long(gb); // cpb_size_du_value_minus1 + get_ue_golomb_long(gb); // bit_rate_du_value_minus1 + } + skip_bits1(gb); // cbr_flag + } +} + +static int decode_hrd(GetBitContext * const gb, const int common_inf_present, + const int max_sublayers) +{ + int nal_params_present = 0, vcl_params_present = 0; + int subpic_params_present = 0; + int i; + + if (common_inf_present) { + nal_params_present = get_bits1(gb); + vcl_params_present = get_bits1(gb); + + if (nal_params_present || vcl_params_present) { + subpic_params_present = get_bits1(gb); + + if (subpic_params_present) { + skip_bits(gb, 8); // tick_divisor_minus2 + skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1 + skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag + skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 + } + + skip_bits(gb, 4); // bit_rate_scale + skip_bits(gb, 4); // cpb_size_scale + + if (subpic_params_present) + skip_bits(gb, 4); // cpb_size_du_scale + + skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 + 
skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 + skip_bits(gb, 5); // dpb_output_delay_length_minus1 + } + } + + for (i = 0; i < max_sublayers; i++) { + int low_delay = 0; + unsigned int nb_cpb = 1; + int fixed_rate = get_bits1(gb); + + if (!fixed_rate) + fixed_rate = get_bits1(gb); + + if (fixed_rate) + get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 + else + low_delay = get_bits1(gb); + + if (!low_delay) { + nb_cpb = get_ue_golomb_long(gb) + 1; + if (nb_cpb < 1 || nb_cpb > 32) { + av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); + return AVERROR_INVALIDDATA; + } + } + + if (nal_params_present) + decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); + if (vcl_params_present) + decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); + } + return 0; +} + +int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, + HEVCRpiParamSets * const ps) +{ + int i,j; + int vps_id = 0; + ptrdiff_t nal_size; + HEVCRpiVPS *vps; + AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); + + if (!vps_buf) + return AVERROR(ENOMEM); + vps = (HEVCRpiVPS*)vps_buf->data; + + av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); + + nal_size = gb->buffer_end - gb->buffer; + if (nal_size > sizeof(vps->data)) { + av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " + "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", + nal_size, sizeof(vps->data)); + vps->data_size = sizeof(vps->data); + } else { + vps->data_size = nal_size; + } + memcpy(vps->data, gb->buffer, vps->data_size); + + vps_id = get_bits(gb, 4); + if (vps_id >= HEVC_MAX_VPS_COUNT) { + av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); + goto err; + } + + if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits + av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); + goto err; + } + + vps->vps_max_layers = get_bits(gb, 6) + 1; + vps->vps_max_sub_layers = get_bits(gb, 3) + 1; + vps->vps_temporal_id_nesting_flag = get_bits1(gb); + + if 
(get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits + av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); + goto err; + } + + if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { + av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", + vps->vps_max_sub_layers); + goto err; + } + + if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) + goto err; + + vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); + + i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1; + for (; i < vps->vps_max_sub_layers; i++) { + vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; + vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); + vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; + + if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { + av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", + vps->vps_max_dec_pic_buffering[i] - 1); + goto err; + } + if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { + av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", + vps->vps_num_reorder_pics[i]); + if (avctx->err_recognition & AV_EF_EXPLODE) + goto err; + } + } + + vps->vps_max_layer_id = get_bits(gb, 6); + vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; + if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || + (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { + av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); + goto err; + } + + for (i = 1; i < vps->vps_num_layer_sets; i++) + for (j = 0; j <= vps->vps_max_layer_id; j++) + skip_bits(gb, 1); // layer_id_included_flag[i][j] + + vps->vps_timing_info_present_flag = get_bits1(gb); + if (vps->vps_timing_info_present_flag) { + vps->vps_num_units_in_tick = get_bits_long(gb, 32); + vps->vps_time_scale = get_bits_long(gb, 32); + 
vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); + if (vps->vps_poc_proportional_to_timing_flag) + vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; + vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); + if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { + av_log(avctx, AV_LOG_ERROR, + "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); + goto err; + } + for (i = 0; i < vps->vps_num_hrd_parameters; i++) { + int common_inf_present = 1; + + get_ue_golomb_long(gb); // hrd_layer_set_idx + if (i) + common_inf_present = get_bits1(gb); + decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); + } + } + get_bits1(gb); /* vps_extension_flag */ + + if (get_bits_left(gb) < 0) { + av_log(avctx, AV_LOG_ERROR, + "Overread VPS by %d bits\n", -get_bits_left(gb)); + if (ps->vps_list[vps_id]) + goto err; + } + + if (ps->vps_list[vps_id] && + !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { + av_buffer_unref(&vps_buf); + } else { + remove_vps(ps, vps_id); + ps->vps_list[vps_id] = vps_buf; + } + + return 0; + +err: + av_buffer_unref(&vps_buf); + return AVERROR_INVALIDDATA; +} + +static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, + const int apply_defdispwin, HEVCRpiSPS * const sps) +{ + VUI backup_vui, * const vui = &sps->vui; + GetBitContext backup; + int sar_present, alt = 0; + + av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); + + sar_present = get_bits1(gb); + if (sar_present) { + uint8_t sar_idx = get_bits(gb, 8); + if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) + vui->sar = vui_sar[sar_idx]; + else if (sar_idx == 255) { + vui->sar.num = get_bits(gb, 16); + vui->sar.den = get_bits(gb, 16); + } else + av_log(avctx, AV_LOG_WARNING, + "Unknown SAR index: %u.\n", sar_idx); + } + + vui->overscan_info_present_flag = get_bits1(gb); + if (vui->overscan_info_present_flag) + vui->overscan_appropriate_flag = get_bits1(gb); + + vui->video_signal_type_present_flag = get_bits1(gb); + if 
(vui->video_signal_type_present_flag) { + vui->video_format = get_bits(gb, 3); + vui->video_full_range_flag = get_bits1(gb); + vui->colour_description_present_flag = get_bits1(gb); + if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) + sps->pix_fmt = AV_PIX_FMT_YUVJ420P; + if (vui->colour_description_present_flag) { + vui->colour_primaries = get_bits(gb, 8); + vui->transfer_characteristic = get_bits(gb, 8); + vui->matrix_coeffs = get_bits(gb, 8); + + // Set invalid values to "unspecified" + if (!av_color_primaries_name(vui->colour_primaries)) + vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; + if (!av_color_transfer_name(vui->transfer_characteristic)) + vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; + if (!av_color_space_name(vui->matrix_coeffs)) + vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; + if (vui->matrix_coeffs == AVCOL_SPC_RGB) { + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV444P: + sps->pix_fmt = AV_PIX_FMT_GBRP; + break; + case AV_PIX_FMT_YUV444P10: + sps->pix_fmt = AV_PIX_FMT_GBRP10; + break; + case AV_PIX_FMT_YUV444P12: + sps->pix_fmt = AV_PIX_FMT_GBRP12; + break; + } + } + } + } + + vui->chroma_loc_info_present_flag = get_bits1(gb); + if (vui->chroma_loc_info_present_flag) { + vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); + vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); + } + + vui->neutra_chroma_indication_flag = get_bits1(gb); + vui->field_seq_flag = get_bits1(gb); + vui->frame_field_info_present_flag = get_bits1(gb); + + // Backup context in case an alternate header is detected + memcpy(&backup, gb, sizeof(backup)); + memcpy(&backup_vui, vui, sizeof(backup_vui)); + if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { + vui->default_display_window_flag = 0; + av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); + } else + vui->default_display_window_flag = get_bits1(gb); + + if (vui->default_display_window_flag) { + int vert_mult = 1 + (sps->chroma_format_idc 
< 2); + int horiz_mult = 1 + (sps->chroma_format_idc < 3); + vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; + vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; + vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; + vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; + + if (apply_defdispwin && + avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { + av_log(avctx, AV_LOG_DEBUG, + "discarding vui default display window, " + "original values are l:%u r:%u t:%u b:%u\n", + vui->def_disp_win.left_offset, + vui->def_disp_win.right_offset, + vui->def_disp_win.top_offset, + vui->def_disp_win.bottom_offset); + + vui->def_disp_win.left_offset = + vui->def_disp_win.right_offset = + vui->def_disp_win.top_offset = + vui->def_disp_win.bottom_offset = 0; + } + } + +timing_info: + vui->vui_timing_info_present_flag = get_bits1(gb); + + if (vui->vui_timing_info_present_flag) { + if( get_bits_left(gb) < 66 && !alt) { + // The alternate syntax seem to have timing info located + // at where def_disp_win is normally located + av_log(avctx, AV_LOG_WARNING, + "Strange VUI timing information, retrying...\n"); + memcpy(vui, &backup_vui, sizeof(backup_vui)); + memcpy(gb, &backup, sizeof(backup)); + alt = 1; + goto timing_info; + } + vui->vui_num_units_in_tick = get_bits_long(gb, 32); + vui->vui_time_scale = get_bits_long(gb, 32); + if (alt) { + av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n", + vui->vui_time_scale, vui->vui_num_units_in_tick); + } + vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); + if (vui->vui_poc_proportional_to_timing_flag) + vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); + vui->vui_hrd_parameters_present_flag = get_bits1(gb); + if (vui->vui_hrd_parameters_present_flag) + decode_hrd(gb, 1, sps->max_sub_layers); + } + + vui->bitstream_restriction_flag = get_bits1(gb); + if (vui->bitstream_restriction_flag) { + if (get_bits_left(gb) < 8 && !alt) { + av_log(avctx, 
AV_LOG_WARNING, + "Strange VUI bitstream restriction information, retrying" + " from timing information...\n"); + memcpy(vui, &backup_vui, sizeof(backup_vui)); + memcpy(gb, &backup, sizeof(backup)); + alt = 1; + goto timing_info; + } + vui->tiles_fixed_structure_flag = get_bits1(gb); + vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); + vui->restricted_ref_pic_lists_flag = get_bits1(gb); + vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); + vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); + vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); + vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); + vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); + } + + if (get_bits_left(gb) < 1 && !alt) { + // XXX: Alternate syntax when sps_range_extension_flag != 0? + av_log(avctx, AV_LOG_WARNING, + "Overread in VUI, retrying from timing information...\n"); + memcpy(vui, &backup_vui, sizeof(backup_vui)); + memcpy(gb, &backup, sizeof(backup)); + alt = 1; + goto timing_info; + } +} + +static void set_default_scaling_list_data(ScalingList * const sl) +{ + int matrixId; + + for (matrixId = 0; matrixId < 6; matrixId++) { + // 4x4 default is 16 + memset(sl->sl[0][matrixId], 16, 16); + sl->sl_dc[0][matrixId] = 16; // default for 16x16 + sl->sl_dc[1][matrixId] = 16; // default for 32x32 + } + + memcpy(sl->sl[1][0], default_scaling_list_intra, 64); + memcpy(sl->sl[1][1], default_scaling_list_intra, 64); + memcpy(sl->sl[1][2], default_scaling_list_intra, 64); + + memcpy(sl->sl[1][3], default_scaling_list_inter, 64); + memcpy(sl->sl[1][4], default_scaling_list_inter, 64); + memcpy(sl->sl[1][5], default_scaling_list_inter, 64); + + memcpy(sl->sl[2][0], default_scaling_list_intra, 64); + memcpy(sl->sl[2][1], default_scaling_list_intra, 64); + memcpy(sl->sl[2][2], default_scaling_list_intra, 64); + + memcpy(sl->sl[2][3], default_scaling_list_inter, 64); + memcpy(sl->sl[2][4], default_scaling_list_inter, 64); + memcpy(sl->sl[2][5], 
default_scaling_list_inter, 64); + + memcpy(sl->sl[3][0], default_scaling_list_intra, 64); + memcpy(sl->sl[3][1], default_scaling_list_intra, 64); + memcpy(sl->sl[3][2], default_scaling_list_intra, 64); + + memcpy(sl->sl[3][3], default_scaling_list_inter, 64); + memcpy(sl->sl[3][4], default_scaling_list_inter, 64); + memcpy(sl->sl[3][5], default_scaling_list_inter, 64); +} + +static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, + const HEVCRpiSPS * const sps) +{ + uint8_t scaling_list_pred_mode_flag; + int32_t scaling_list_dc_coef[2][6]; + int size_id, matrix_id, pos; + int i; + + for (size_id = 0; size_id < 4; size_id++) + for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) { + scaling_list_pred_mode_flag = get_bits1(gb); + if (!scaling_list_pred_mode_flag) { + unsigned int delta = get_ue_golomb_long(gb); + /* Only need to handle non-zero delta. Zero means default, + * which should already be in the arrays. */ + if (delta) { + // Copy from previous array. + delta *= (size_id == 3) ? 3 : 1; + if (matrix_id < delta) { + av_log(avctx, AV_LOG_ERROR, + "Invalid delta in scaling list data: %d.\n", delta); + return AVERROR_INVALIDDATA; + } + + memcpy(sl->sl[size_id][matrix_id], + sl->sl[size_id][matrix_id - delta], + size_id > 0 ? 
64 : 16); + if (size_id > 1) + sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; + } + } else { + int next_coef, coef_num; + int32_t scaling_list_delta_coef; + + next_coef = 8; + coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); + if (size_id > 1) { + scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; + next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; + sl->sl_dc[size_id - 2][matrix_id] = next_coef; + } + for (i = 0; i < coef_num; i++) { + if (size_id == 0) + pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + + ff_hevc_rpi_diag_scan4x4_x[i]; + else + pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + + ff_hevc_rpi_diag_scan8x8_x[i]; + + scaling_list_delta_coef = get_se_golomb(gb); + next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; + sl->sl[size_id][matrix_id][pos] = next_coef; + } + } + } + + if (sps->chroma_format_idc == 3) { + for (i = 0; i < 64; i++) { + sl->sl[3][1][i] = sl->sl[2][1][i]; + sl->sl[3][2][i] = sl->sl[2][2][i]; + sl->sl[3][4][i] = sl->sl[2][4][i]; + sl->sl[3][5][i] = sl->sl[2][5][i]; + } + sl->sl_dc[1][1] = sl->sl_dc[0][1]; + sl->sl_dc[1][2] = sl->sl_dc[0][2]; + sl->sl_dc[1][4] = sl->sl_dc[0][4]; + sl->sl_dc[1][5] = sl->sl_dc[0][5]; + } + + + return 0; +} + +static int map_pixel_format(HEVCRpiSPS * const sps) +{ + const int cfmt = sps->chroma_format_idc; + + sps->pix_fmt = AV_PIX_FMT_NONE; + switch (sps->bit_depth) { + case 8: + if (cfmt == 1) + sps->pix_fmt = AV_PIX_FMT_SAND128; + break; + case 10: + if (cfmt == 1) + sps->pix_fmt = AV_PIX_FMT_SAND64_10; + break; + default: + break; + } + + sps->hshift[0] = sps->vshift[0] = 0; + sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 + sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 + + sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; + + return 0; +} + +static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, + const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) +{ + HEVCRpiWindow *ow; + int ret = 0; + int log2_diff_max_min_transform_block_size; + int bit_depth_chroma, start, vui_present, sublayer_ordering_info; + int i; + + // Coded parameters + + sps->vps_id = get_bits(gb, 4); + if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { + av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); + return AVERROR_INVALIDDATA; + } + + if (vps_list && !vps_list[sps->vps_id]) { + av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", + sps->vps_id); + return AVERROR_INVALIDDATA; + } + + sps->max_sub_layers = get_bits(gb, 3) + 1; + if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { + av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", + sps->max_sub_layers); + return AVERROR_INVALIDDATA; + } + + sps->temporal_id_nesting_flag = get_bits(gb, 1); + + if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) + return ret; + + *sps_id = get_ue_golomb_long(gb); + if (*sps_id >= HEVC_MAX_SPS_COUNT) { + av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); + return AVERROR_INVALIDDATA; + } + + sps->chroma_format_idc = get_ue_golomb_long(gb); + if (sps->chroma_format_idc > 3U) { + av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); + return AVERROR_INVALIDDATA; + } + + if (sps->chroma_format_idc == 3) + sps->separate_colour_plane_flag = get_bits1(gb); + + if (sps->separate_colour_plane_flag) + sps->chroma_format_idc = 0; + + sps->width = get_ue_golomb_long(gb); + sps->height = get_ue_golomb_long(gb); + if ((ret = av_image_check_size(sps->width, + sps->height, 0, avctx)) < 0) + return ret; + + if (get_bits1(gb)) { // pic_conformance_flag + int vert_mult = 1 + (sps->chroma_format_idc < 2); + int horiz_mult = 1 + 
(sps->chroma_format_idc < 3); + sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; + sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; + sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; + sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; + + if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { + av_log(avctx, AV_LOG_DEBUG, + "discarding sps conformance window, " + "original values are l:%u r:%u t:%u b:%u\n", + sps->pic_conf_win.left_offset, + sps->pic_conf_win.right_offset, + sps->pic_conf_win.top_offset, + sps->pic_conf_win.bottom_offset); + + sps->pic_conf_win.left_offset = + sps->pic_conf_win.right_offset = + sps->pic_conf_win.top_offset = + sps->pic_conf_win.bottom_offset = 0; + } + sps->output_window = sps->pic_conf_win; + } + + sps->bit_depth = get_ue_golomb_long(gb) + 8; + bit_depth_chroma = get_ue_golomb_long(gb) + 8; + if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { + av_log(avctx, AV_LOG_ERROR, + "Luma bit depth (%d) is different from chroma bit depth (%d), " + "this is unsupported.\n", + sps->bit_depth, bit_depth_chroma); + return AVERROR_INVALIDDATA; + } + + ret = map_pixel_format(sps); + if (ret < 0) + return ret; + + sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; + if (sps->log2_max_poc_lsb > 16) { + av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", + sps->log2_max_poc_lsb - 4); + return AVERROR_INVALIDDATA; + } + + sublayer_ordering_info = get_bits1(gb); + start = sublayer_ordering_info ? 
0 : sps->max_sub_layers - 1; + for (i = start; i < sps->max_sub_layers; i++) { + sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; + sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); + sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; + if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { + av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", + sps->temporal_layer[i].max_dec_pic_buffering - 1U); + return AVERROR_INVALIDDATA; + } + if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { + av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", + sps->temporal_layer[i].num_reorder_pics); + if (avctx->err_recognition & AV_EF_EXPLODE || + sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { + return AVERROR_INVALIDDATA; + } + sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; + } + } + + if (!sublayer_ordering_info) { + for (i = 0; i < start; i++) { + sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; + sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; + sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; + } + } + + sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; + sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); + sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; + log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); + sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + + sps->log2_min_tb_size; + + if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { + av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); + return AVERROR_INVALIDDATA; + } + + if (sps->log2_diff_max_min_coding_block_size > 30) { + 
av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); + return AVERROR_INVALIDDATA; + } + + if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { + av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); + return AVERROR_INVALIDDATA; + } + + if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { + av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); + return AVERROR_INVALIDDATA; + } + + { + const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; + // Not a bitstream limitation, but all profiles + if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { + av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); + return AVERROR_INVALIDDATA; + } + + if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { + av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); + return AVERROR_INVALIDDATA; + } + + // Inferred parameters + sps->log2_ctb_size = CtbLog2SizeY; +// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; + } + + sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); + sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); + + sps->scaling_list_enable_flag = get_bits1(gb); + if (sps->scaling_list_enable_flag) { + set_default_scaling_list_data(&sps->scaling_list); + + if (get_bits1(gb)) { + ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); + if (ret < 0) + return ret; + } + } + + sps->amp_enabled_flag = get_bits1(gb); + sps->sao_enabled = get_bits1(gb); + + // Set pcm defaults (0) so we don't have to test _enabled when we + // want to use them + memset(&sps->pcm, 0, sizeof(sps->pcm)); + + if (get_bits1(gb)) // pcm_enabled_flag + { + const unsigned int limit_max_pcm = FFMIN(5, + 
sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); + sps->pcm.bit_depth = get_bits(gb, 4) + 1; + sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; + sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; + sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + + get_ue_golomb_long(gb); + if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { + av_log(avctx, AV_LOG_ERROR, + "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", + sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); + return AVERROR_INVALIDDATA; + } + if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || + sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { + av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", + sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); + return AVERROR_INVALIDDATA; + } + + sps->pcm.loop_filter_disable_flag = get_bits1(gb); + } + + // Could be based on min_pcm_cb_size but much easier logic if we just stick + // with 8 (and costs us little) + sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up + sps->pcm_height = (sps->height + 7) >> 3; + + sps->nb_st_rps = get_ue_golomb_long(gb); + if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { + av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", + sps->nb_st_rps); + return AVERROR_INVALIDDATA; + } + for (i = 0; i < sps->nb_st_rps; i++) { + if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], + sps, 0)) < 0) + return ret; + } + + sps->long_term_ref_pics_present_flag = get_bits1(gb); + if (sps->long_term_ref_pics_present_flag) { + sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); + if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { + av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", + sps->num_long_term_ref_pics_sps); + return AVERROR_INVALIDDATA; + } + for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { + 
sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); + sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); + } + } + + sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); + sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag + sps->vui.sar = (AVRational){0, 1}; + vui_present = get_bits1(gb); + if (vui_present) + decode_vui(gb, avctx, apply_defdispwin, sps); + + if (get_bits1(gb)) { // sps_extension_flag + int sps_extension_flag[1]; + for (i = 0; i < 1; i++) + sps_extension_flag[i] = get_bits1(gb); + skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); + if (sps_extension_flag[0]) { + int extended_precision_processing_flag; + int cabac_bypass_alignment_enabled_flag; + + sps->transform_skip_rotation_enabled_flag = get_bits1(gb); + sps->transform_skip_context_enabled_flag = get_bits1(gb); + sps->implicit_rdpcm_enabled_flag = get_bits1(gb); + + sps->explicit_rdpcm_enabled_flag = get_bits1(gb); + + extended_precision_processing_flag = get_bits1(gb); + if (extended_precision_processing_flag) + av_log(avctx, AV_LOG_WARNING, + "extended_precision_processing_flag not yet implemented\n"); + + if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag + sps->intra_filters_disable |= FILTER_EITHER; + sps->high_precision_offsets_enabled_flag = get_bits1(gb); + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + + cabac_bypass_alignment_enabled_flag = get_bits1(gb); + if (cabac_bypass_alignment_enabled_flag) + av_log(avctx, AV_LOG_WARNING, + "cabac_bypass_alignment_enabled_flag not yet implemented\n"); + } + } + if (apply_defdispwin) { + sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; + sps->output_window.right_offset += sps->vui.def_disp_win.right_offset; + sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; + sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; + } + + ow = &sps->output_window; + if (ow->left_offset >= INT_MAX - 
ow->right_offset || + ow->top_offset >= INT_MAX - ow->bottom_offset || + ow->left_offset + ow->right_offset >= sps->width || + ow->top_offset + ow->bottom_offset >= sps->height) { + av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", + ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); + if (avctx->err_recognition & AV_EF_EXPLODE) { + return AVERROR_INVALIDDATA; + } + av_log(avctx, AV_LOG_WARNING, + "Displaying the whole video surface.\n"); + memset(ow, 0, sizeof(*ow)); + memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); + } + + // Inferred parameters + + sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; + sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; + sps->ctb_size = sps->ctb_width * sps->ctb_height; + + sps->min_cb_width = sps->width >> sps->log2_min_cb_size; + sps->min_cb_height = sps->height >> sps->log2_min_cb_size; + sps->min_tb_width = sps->width >> sps->log2_min_tb_size; + sps->min_tb_height = sps->height >> sps->log2_min_tb_size; + sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; + sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; + sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; + + sps->qp_bd_offset = 6 * (sps->bit_depth - 8); + sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); + + if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || + av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { + av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); + return AVERROR_INVALIDDATA; + } + + if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { + av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", + sps->max_transform_hierarchy_depth_inter); + return AVERROR_INVALIDDATA; + } + if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { + av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", + sps->max_transform_hierarchy_depth_intra); + return AVERROR_INVALIDDATA; + } + if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { + av_log(avctx, AV_LOG_ERROR, + "max transform block size out of range: %d\n", + sps->log2_max_trafo_size); + return AVERROR_INVALIDDATA; + } + + if (get_bits_left(gb) < 0) { + av_log(avctx, AV_LOG_ERROR, + "Overread SPS by %d bits\n", -get_bits_left(gb)); + return AVERROR_INVALIDDATA; + } + + return 0; +} + +int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, + HEVCRpiParamSets *ps, int apply_defdispwin) +{ + HEVCRpiSPS *sps; + AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); + unsigned int sps_id; + int ret; + ptrdiff_t nal_size; + + if (!sps_buf) + return AVERROR(ENOMEM); + sps = (HEVCRpiSPS*)sps_buf->data; + + av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); + + nal_size = gb->buffer_end - gb->buffer; + if (nal_size > sizeof(sps->data)) { + av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " + "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", + nal_size, sizeof(sps->data)); + sps->data_size = sizeof(sps->data); + } else { + sps->data_size = nal_size; + } + memcpy(sps->data, gb->buffer, sps->data_size); + + ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, + apply_defdispwin, + ps->vps_list, avctx); + if (ret < 
0) { + av_buffer_unref(&sps_buf); + return ret; + } + + if (avctx->debug & FF_DEBUG_BITSTREAM) { + av_log(avctx, AV_LOG_DEBUG, + "Parsed SPS: id %d; coded wxh: %dx%d; " + "cropped wxh: %dx%d; pix_fmt: %s.\n", + sps_id, sps->width, sps->height, + sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), + sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), + av_get_pix_fmt_name(sps->pix_fmt)); + } + + /* check if this is a repeat of an already parsed SPS, then keep the + * original one. + * otherwise drop all PPSes that depend on it */ + if (ps->sps_list[sps_id] && + !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { + av_buffer_unref(&sps_buf); + } else { + remove_sps(ps, sps_id); + ps->sps_list[sps_id] = sps_buf; + } + + return 0; +} + +static void hevc_pps_free(void *opaque, uint8_t *data) +{ + HEVCRpiPPS *pps = (HEVCRpiPPS*)data; + + av_freep(&pps->column_width); + av_freep(&pps->row_height); + av_freep(&pps->col_bd); + av_freep(&pps->row_bd); + av_freep(&pps->col_idxX); + av_freep(&pps->ctb_addr_rs_to_ts); + av_freep(&pps->ctb_addr_ts_to_rs); + av_freep(&pps->tile_pos_ts); + av_freep(&pps->tile_size); + av_freep(&pps->tile_id); + av_freep(&pps->ctb_ts_flags); + + av_freep(&pps); +} + +static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) +{ + do + { + const int offset = get_se_golomb_long(gb); + if (offset < -12 || offset > 12) { + av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); + return AVERROR_INVALIDDATA; + } + *offsets++ = offset; + } while (n_minus_1-- != 0); + return 0; +} + +static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, + HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) +{ + if (pps->transform_skip_enabled_flag) { + pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; + } + pps->cross_component_prediction_enabled_flag = 
get_bits1(gb); + if (pps->cross_component_prediction_enabled_flag && + (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) + { + av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); + return AVERROR_INVALIDDATA; + } + pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); + if (pps->chroma_qp_offset_list_enabled_flag) { + int err; + + pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); + pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); + if (pps->chroma_qp_offset_list_len_minus1 > 5) { + av_log(avctx, AV_LOG_ERROR, + "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); + return AVERROR_INVALIDDATA; + } + av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); + + if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || + (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) + return err; + } + + { + const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; + + pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); + if (pps->log2_sao_offset_scale_luma > max_offset) { + av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); + return AVERROR_INVALIDDATA; + } + pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); + if (pps->log2_sao_offset_scale_chroma > max_offset) { + av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); + return AVERROR_INVALIDDATA; + } + } + + return(0); +} + +static inline int setup_pps(AVCodecContext * const avctx, + HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) +{ + int pic_area_in_ctbs; + int i, j, x, y, ctb_addr_rs, tile_id; + + // Inferred parameters + + // qp_y -> qp_u/qp_v tables + // The tables have at least -24,+24 overrun after adding offset here + // which should allow for clipless offseting + + pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code + pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; + + if (sps->chroma_format_idc == 1) { + pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; + pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; + pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; + pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; + } + else + { + pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; + pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; + pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; + pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; + } + + pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); + pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); + pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX)); + if (!pps->col_bd || !pps->row_bd || 
!pps->col_idxX) + return AVERROR(ENOMEM); + + if (pps->uniform_spacing_flag) { + if (!pps->column_width) { + pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); + pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); + } + if (!pps->column_width || !pps->row_height) + return AVERROR(ENOMEM); + + for (i = 0; i < pps->num_tile_columns; i++) { + pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - + (i * sps->ctb_width) / pps->num_tile_columns; + } + + for (i = 0; i < pps->num_tile_rows; i++) { + pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - + (i * sps->ctb_height) / pps->num_tile_rows; + } + } + + { + const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); + pps->col_bd[0] = 0; + pps->tile_wpp_inter_disable = 0; + for (i = 0; i < pps->num_tile_columns; i++) + { + pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; + + // Avoid trying tile parallel if the columns don't fall on cache boundries + // (this causes too much pain syncing flushes with the QPU) + // Ignore the final (RHS of pic) tile boundry + if ((pps->col_bd[i] & td_mask) != 0) { + pps->tile_wpp_inter_disable = 1; + } + } + + // If we can start the next row before finishing the first line of + // this one then we must wait at the end of the tile + // * if this happens a lot then there are better but more complicated + // conditions that we could apply + if (pps->tile_wpp_inter_disable) { + for (i = 0; i < pps->num_tile_rows; i++) + { + if (pps->row_height[i] <= RPI_MAX_JOBS) { + pps->tile_wpp_inter_disable = 2; + break; + } + } + } + } + + pps->row_bd[0] = 0; + for (i = 0; i < pps->num_tile_rows; i++) + pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; + + for (i = 0, j = 0; i < sps->ctb_width; i++) { + if (i >= pps->col_bd[j + 1]) + j++; + pps->col_idxX[i] = j; + } + + /** + * 6.5 + */ + pic_area_in_ctbs = sps->ctb_size; + + pps->ctb_addr_rs_to_ts = 
av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); + pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); + pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); + pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); + pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); + pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); + if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || + !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { + return AVERROR(ENOMEM); + } + + memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); + + for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { + int tb_x = ctb_addr_rs % sps->ctb_width; + int tb_y = ctb_addr_rs / sps->ctb_width; + int tile_x = 0; + int tile_y = 0; + int val = 0; + + for (i = 0; i < pps->num_tile_columns; i++) { + if (tb_x < pps->col_bd[i + 1]) { + tile_x = i; + break; + } + } + + for (i = 0; i < pps->num_tile_rows; i++) { + if (tb_y < pps->row_bd[i + 1]) { + tile_y = i; + break; + } + } + + for (i = 0; i < tile_x; i++) + val += pps->row_height[tile_y] * pps->column_width[i]; + for (i = 0; i < tile_y; i++) + val += sps->ctb_width * pps->row_height[i]; + + val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + + tb_x - pps->col_bd[tile_x]; + + pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; + pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; + } + + { + uint8_t * pflags = pps->ctb_ts_flags; + uint16_t * ptid = pps->tile_id; + + for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) + { + for (i = 0; i < pps->num_tile_columns; i++, tile_id++) + { + const unsigned int tile_w = pps->column_width[i]; + + pflags[0] |= CTB_TS_FLAGS_CIREQ; + + for (x = 0; x != tile_w; ++x) { + pflags[x] |= CTB_TS_FLAGS_TOT; + } + + for (y = pps->row_bd[j]; y < pps->row_bd[j + 
1]; y++) + { + pflags[0] |= CTB_TS_FLAGS_SOTL; + + if (pps->entropy_coding_sync_enabled_flag) + { + if (pps->column_width[i] != 1) + pflags[1] |= CTB_TS_FLAGS_CSAVE; + else + pflags[0] |= CTB_TS_FLAGS_CIREQ; + + if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) + pflags[0] |= CTB_TS_FLAGS_CLOAD; + } + + for (x = 0; x != tile_w; ++x) + *ptid++ = tile_id; + + pflags += tile_w; + pflags[-1] |= CTB_TS_FLAGS_EOTL; + if (i + 1 == pps->num_tile_columns) + pflags[-1] |= CTB_TS_FLAGS_EOL; + } + + pflags[-1] |= CTB_TS_FLAGS_EOT; + } + } + } + + { + unsigned int ts = 0; + for (j = 0; j < pps->num_tile_rows; j++) + for (i = 0; i < pps->num_tile_columns; i++) + { + const unsigned int size = pps->column_width[i] * pps->row_height[j]; + pps->tile_size[j * pps->num_tile_columns + i] = size; + pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; + ts += size; + } + } + + return 0; +} + +int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, + HEVCRpiParamSets * const ps) +{ + const HEVCRpiSPS *sps = NULL; + int i, ret = 0; + unsigned int pps_id = 0; + ptrdiff_t nal_size; + unsigned log2_parallel_merge_level_minus2; + + AVBufferRef *pps_buf; + HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); + + if (!pps) + return AVERROR(ENOMEM); + + pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), + hevc_pps_free, NULL, 0); + if (!pps_buf) { + av_freep(&pps); + return AVERROR(ENOMEM); + } + + av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); + + nal_size = gb->buffer_end - gb->buffer; + if (nal_size > sizeof(pps->data)) { + av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " + "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", + nal_size, sizeof(pps->data)); + pps->data_size = sizeof(pps->data); + } else { + pps->data_size = nal_size; + } + memcpy(pps->data, gb->buffer, pps->data_size); + + // Default values + pps->loop_filter_across_tiles_enabled_flag = 1; + pps->num_tile_columns = 1; + pps->num_tile_rows = 1; + pps->uniform_spacing_flag = 1; + pps->disable_dbf 
= 0; + pps->beta_offset = 0; + pps->tc_offset = 0; + pps->log2_max_transform_skip_block_size = 2; + + // Coded parameters + pps_id = get_ue_golomb_long(gb); + if (pps_id >= HEVC_MAX_PPS_COUNT) { + av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->sps_id = get_ue_golomb_long(gb); + if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { + av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); + ret = AVERROR_INVALIDDATA; + goto err; + } + if (!ps->sps_list[pps->sps_id]) { + av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); + ret = AVERROR_INVALIDDATA; + goto err; + } + sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; + + pps->dependent_slice_segments_enabled_flag = get_bits1(gb); + pps->output_flag_present_flag = get_bits1(gb); + pps->num_extra_slice_header_bits = get_bits(gb, 3); + + pps->sign_data_hiding_flag = get_bits1(gb); + + pps->cabac_init_present_flag = get_bits1(gb); + + pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; + if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { + av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; + if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { + av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); + ret = AVERROR_INVALIDDATA; + goto err; + } + + pps->pic_init_qp_minus26 = get_se_golomb(gb); + if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) { + av_log(avctx, AV_LOG_ERROR, + "init_qp_minus26 %d is outside the valid range " + "[%d, %d].\n", + pps->pic_init_qp_minus26, + -(26 + sps->qp_bd_offset), 25); + ret = AVERROR_INVALIDDATA; + goto err; + } + + pps->constrained_intra_pred_flag = get_bits1(gb); + pps->transform_skip_enabled_flag = 
get_bits1(gb); + + pps->cu_qp_delta_enabled_flag = get_bits1(gb); + pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; + if (pps->cu_qp_delta_enabled_flag) + { + const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); + + if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { + av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n", + diff_cu_qp_delta_depth); + ret = AVERROR_INVALIDDATA; + goto err; + } + + pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; + } + + pps->cb_qp_offset = get_se_golomb(gb); + if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { + av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", + pps->cb_qp_offset); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->cr_qp_offset = get_se_golomb(gb); + if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { + av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", + pps->cr_qp_offset); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); + + pps->weighted_pred_flag = get_bits1(gb); + pps->weighted_bipred_flag = get_bits1(gb); + + pps->transquant_bypass_enable_flag = get_bits1(gb); + pps->tiles_enabled_flag = get_bits1(gb); + pps->entropy_coding_sync_enabled_flag = get_bits1(gb); + + if (pps->tiles_enabled_flag) { + pps->num_tile_columns = get_ue_golomb_long(gb) + 1; + pps->num_tile_rows = get_ue_golomb_long(gb) + 1; + if (pps->num_tile_columns <= 0 || + pps->num_tile_columns >= sps->width) { + av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", + pps->num_tile_columns - 1); + ret = AVERROR_INVALIDDATA; + goto err; + } + if (pps->num_tile_rows <= 0 || + pps->num_tile_rows >= sps->height) { + av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", + pps->num_tile_rows - 1); + ret = AVERROR_INVALIDDATA; + goto err; + } + + pps->column_width = av_malloc_array(pps->num_tile_columns, 
sizeof(*pps->column_width)); + pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); + if (!pps->column_width || !pps->row_height) { + ret = AVERROR(ENOMEM); + goto err; + } + + pps->uniform_spacing_flag = get_bits1(gb); + if (!pps->uniform_spacing_flag) { + uint64_t sum = 0; + for (i = 0; i < pps->num_tile_columns - 1; i++) { + pps->column_width[i] = get_ue_golomb_long(gb) + 1; + sum += pps->column_width[i]; + } + if (sum >= sps->ctb_width) { + av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; + + sum = 0; + for (i = 0; i < pps->num_tile_rows - 1; i++) { + pps->row_height[i] = get_ue_golomb_long(gb) + 1; + sum += pps->row_height[i]; + } + if (sum >= sps->ctb_height) { + av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; + } + pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); + } + + pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); + + pps->deblocking_filter_control_present_flag = get_bits1(gb); + if (pps->deblocking_filter_control_present_flag) { + pps->deblocking_filter_override_enabled_flag = get_bits1(gb); + pps->disable_dbf = get_bits1(gb); + if (!pps->disable_dbf) { + int beta_offset_div2 = get_se_golomb(gb); + int tc_offset_div2 = get_se_golomb(gb) ; + if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { + av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n", + beta_offset_div2); + ret = AVERROR_INVALIDDATA; + goto err; + } + if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { + av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", + tc_offset_div2); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->beta_offset = 2 * beta_offset_div2; + pps->tc_offset = 2 * tc_offset_div2; + } + } + + pps->scaling_list_data_present_flag = get_bits1(gb); + if 
(pps->scaling_list_data_present_flag) { + set_default_scaling_list_data(&pps->scaling_list); + ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); + if (ret < 0) + goto err; + } + pps->lists_modification_present_flag = get_bits1(gb); + log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); + if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { + av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", + log2_parallel_merge_level_minus2); + ret = AVERROR_INVALIDDATA; + goto err; + } + pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; + + pps->slice_header_extension_present_flag = get_bits1(gb); + + if (get_bits1(gb)) { // pps_extension_present_flag + int pps_range_extensions_flag = get_bits1(gb); + skip_bits(gb, 7); // pps_extension_7bits + if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { + if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) + goto err; + } + } + + ret = setup_pps(avctx, pps, sps); + if (ret < 0) + goto err; + + if (get_bits_left(gb) < 0) { + av_log(avctx, AV_LOG_ERROR, + "Overread PPS by %d bits\n", -get_bits_left(gb)); + ret = AVERROR_INVALIDDATA; + goto err; + } + + remove_pps(ps, pps_id); + ps->pps_list[pps_id] = pps_buf; + + return 0; + +err: + av_buffer_unref(&pps_buf); + return ret; +} + +int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) +{ + int max_poc_lsb = 1 << sps->log2_max_poc_lsb; + int prev_poc_lsb = pocTid0 % max_poc_lsb; + int prev_poc_msb = pocTid0 - prev_poc_lsb; + int poc_msb; + + if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) + poc_msb = prev_poc_msb + max_poc_lsb; + else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) + poc_msb = prev_poc_msb - max_poc_lsb; + else + poc_msb = prev_poc_msb; + + // For BLA picture types, POCmsb is set to 0. 
+ if (nal_unit_type == HEVC_NAL_BLA_W_LP || + nal_unit_type == HEVC_NAL_BLA_W_RADL || + nal_unit_type == HEVC_NAL_BLA_N_LP) + poc_msb = 0; + + return poc_msb + poc_lsb; +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 index 0000000000..c725ebb9ca --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h @@ -0,0 +1,449 @@ +/* + * HEVC parameter set parsing + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVC_PS_H +#define AVCODEC_RPI_HEVC_PS_H + +#include + +#include "libavutil/buffer.h" +#include "libavutil/pixfmt.h" +#include "libavutil/rational.h" + +#include "avcodec.h" +#include "get_bits.h" +#include "hevc.h" + +typedef struct ShortTermRPS { + unsigned int num_negative_pics; + int num_delta_pocs; + int rps_idx_num_delta_pocs; + int32_t delta_poc[32]; + uint8_t used[32]; +} ShortTermRPS; + +typedef struct LongTermRPS { + int poc[32]; + uint8_t used[32]; + uint8_t nb_refs; +} LongTermRPS; + +typedef struct RpiSliceHeader { + unsigned int pps_id; + + ///< address (in raster order) of the first block in the current slice segment + unsigned int slice_segment_addr; + ///< address (in raster order) of the first block in the current slice + unsigned int slice_addr; + + enum HEVCSliceType slice_type; + 
+ int pic_order_cnt_lsb; + + uint8_t first_slice_in_pic_flag; + uint8_t dependent_slice_segment_flag; + uint8_t pic_output_flag; + uint8_t colour_plane_id; + + ///< RPS coded in the slice header itself is stored here + int short_term_ref_pic_set_sps_flag; + int short_term_ref_pic_set_size; + ShortTermRPS slice_rps; + const ShortTermRPS *short_term_rps; + int long_term_ref_pic_set_size; + LongTermRPS long_term_rps; + unsigned int list_entry_lx[2][32]; + + uint8_t rpl_modification_flag[2]; + uint8_t no_output_of_prior_pics_flag; + uint8_t slice_temporal_mvp_enabled_flag; + + unsigned int nb_refs[2]; + + uint8_t slice_sample_adaptive_offset_flag[3]; + uint8_t mvd_l1_zero_flag; + + uint8_t cabac_init_flag; + uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag + uint8_t slice_loop_filter_across_slices_enabled_flag; + uint8_t collocated_list; + + uint8_t no_dblk_boundary_flags; + + unsigned int collocated_ref_idx; + + int slice_qp_delta; + int slice_cb_qp_offset; // -12, +12 + int slice_cr_qp_offset; // -12, +12 + + uint8_t cu_chroma_qp_offset_enabled_flag; + + int beta_offset; ///< beta_offset_div2 * 2 + int tc_offset; ///< tc_offset_div2 * 2 + + unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand + + unsigned *entry_point_offset; + int * offset; + int * size; + int num_entry_point_offsets; + int offsets_allocated; + + uint8_t offload_wpp; + uint8_t offload_tiles; + + int8_t slice_qp; + + uint8_t luma_log2_weight_denom; + uint8_t chroma_log2_weight_denom; + + int16_t luma_weight_l0[16]; // -128, +255 + int16_t luma_offset_l0[16]; + int16_t chroma_weight_l0[16][2]; + int16_t chroma_offset_l0[16][2]; + + int16_t luma_weight_l1[16]; + int16_t luma_offset_l1[16]; + int16_t chroma_weight_l1[16][2]; + int16_t chroma_offset_l1[16][2]; + +} RpiSliceHeader; + +typedef struct HEVCRpiWindow { + uint16_t left_offset; + uint16_t right_offset; + uint16_t top_offset; + uint16_t bottom_offset; +} HEVCRpiWindow; + +typedef struct VUI 
/**
 * One set of profile / tier / level fields.
 *
 * PTL uses this type both for its general entry and for every sub-layer
 * entry.
 */
typedef struct PTLCommon {
    uint8_t profile_space;
    uint8_t tier_flag;
    uint8_t profile_idc;
    uint8_t profile_compatibility_flag[32];  // one flag per entry, 32 entries
    uint8_t level_idc;
    uint8_t progressive_source_flag;
    uint8_t interlaced_source_flag;
    uint8_t non_packed_constraint_flag;
    uint8_t frame_only_constraint_flag;
} PTLCommon;

/**
 * General PTLCommon entry plus one entry and two presence flags per
 * sub-layer. Embedded as member "ptl" in both HEVCRpiVPS and HEVCRpiSPS.
 */
typedef struct PTL {
    PTLCommon general_ptl;
    PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];

    // NOTE(review): presumably these say which parts of sub_layer_ptl[i]
    // were actually coded in the bitstream - confirm against the parser.
    uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
    uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
} PTL;
/**
 * Scaling list tables.
 *
 * Embedded as member "scaling_list" in both HEVCRpiSPS and HEVCRpiPPS.
 */
typedef struct ScalingList {
    /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
     * and size ID 3 only has 2 arrays, not 6. */
    // NOTE(review): going by the comment above the index order looks like
    // [sizeID][array][coefficient] - confirm against the parser.
    uint8_t sl[4][6][64];
    // NOTE(review): presumably the separately coded DC values for the two
    // largest size IDs - confirm against the parser.
    uint8_t sl_dc[2][6];
} ScalingList;
/*
 * Per-CTB flag bits.
 * NOTE(review): all eight values fit in one byte, which matches the uint8_t
 * ctb_ts_flags array in HEVCRpiPPS - presumably that is where they are
 * stored; confirm against the PPS setup code.
 */
#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
#define CTB_TS_FLAGS_EOTL       (1U << 1)       // Last CTB of a tile line
#define CTB_TS_FLAGS_EOL        (1U << 2)       // Last CTB of a complete line
#define CTB_TS_FLAGS_EOT        (1U << 3)       // Last CTB of a tile
#define CTB_TS_FLAGS_CSAVE      (1U << 4)       // NOTE(review): undocumented; presumably "cabac save" - confirm
#define CTB_TS_FLAGS_CIREQ      (1U << 5)       // Cabac init request
#define CTB_TS_FLAGS_TOT        (1U << 6)       // CTB on top row of a tile
#define CTB_TS_FLAGS_CLOAD      (1U << 7)       // NOTE(review): undocumented; presumably "cabac load" - confirm
num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 + int pic_init_qp_minus26; + + uint8_t constrained_intra_pred_flag; + uint8_t transform_skip_enabled_flag; + + uint8_t cu_qp_delta_enabled_flag; + uint8_t log2_min_cu_qp_delta_size; + int cb_qp_offset; // -12..12 + int cr_qp_offset; // -12..12 + const uint8_t * qp_dblk_x[3]; + const int8_t * qp_bd_x[3]; + + uint8_t pic_slice_level_chroma_qp_offsets_present_flag; + uint8_t weighted_pred_flag; + uint8_t weighted_bipred_flag; + uint8_t output_flag_present_flag; + uint8_t transquant_bypass_enable_flag; + + uint8_t dependent_slice_segments_enabled_flag; + uint8_t tiles_enabled_flag; + uint8_t entropy_coding_sync_enabled_flag; + + uint8_t tile_wpp_inter_disable; + int num_tile_columns; ///< num_tile_columns_minus1 + 1 + int num_tile_rows; ///< num_tile_rows_minus1 + 1 + uint8_t uniform_spacing_flag; + uint8_t loop_filter_across_tiles_enabled_flag; + + uint8_t seq_loop_filter_across_slices_enabled_flag; + + uint8_t deblocking_filter_control_present_flag; + uint8_t deblocking_filter_override_enabled_flag; + uint8_t disable_dbf; + int beta_offset; ///< beta_offset_div2 * 2 + int tc_offset; ///< tc_offset_div2 * 2 + + uint8_t scaling_list_data_present_flag; + ScalingList scaling_list; + + uint8_t lists_modification_present_flag; + int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 + int num_extra_slice_header_bits; + uint8_t slice_header_extension_present_flag; + uint8_t log2_max_transform_skip_block_size; + uint8_t cross_component_prediction_enabled_flag; + uint8_t chroma_qp_offset_list_enabled_flag; + uint8_t diff_cu_chroma_qp_offset_depth; + uint8_t chroma_qp_offset_list_len_minus1; + int8_t cb_qp_offset_list[6]; + int8_t cr_qp_offset_list[6]; + uint8_t log2_sao_offset_scale_luma; + uint8_t log2_sao_offset_scale_chroma; + + // Inferred parameters + uint16_t *column_width; ///< ColumnWidth + uint16_t *row_height; ///< RowHeight + uint16_t *col_bd; ///< ColBd + uint16_t 
*row_bd; ///< RowBd + uint16_t *col_idxX; + + // We can limit these to uint16_t given our other size limits + uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS + uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS + uint16_t *tile_id; ///< TileId + uint16_t *tile_pos_ts; ///< TilePosRS + uint16_t *tile_size; ///< TileSize + uint8_t * ctb_ts_flags; + + uint8_t data[4096]; + int data_size; +} HEVCRpiPPS; + +typedef struct HEVCRpiParamSets { + /* currently active parameter sets */ + const HEVCRpiVPS *vps; + const HEVCRpiSPS *sps; + const HEVCRpiPPS *pps; + + AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; + AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; + AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; +} HEVCRpiParamSets; + +int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, + HEVCRpiParamSets *ps); +int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, + HEVCRpiParamSets *ps, int apply_defdispwin); +int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, + HEVCRpiParamSets *ps); + +int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, + ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); + +int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, + uint8_t *buf, int buf_size); + +/** + * Compute POC of the current frame and return it. + */ +int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); + +#endif /* AVCODEC_RPI_HEVC_PS_H */ diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c new file mode 100644 index 0000000000..8cc5796cf0 --- /dev/null +++ b/libavcodec/rpi_hevc_refs.c @@ -0,0 +1,485 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2012 - 2013 Gildas Cocherel + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/rpi_sand_fns.h" +#include "internal.h" +#include "thread.h" +#include "hevc.h" +#include "rpi_hevcdec.h" + +void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) +{ + /* frame->frame can be NULL if context init failed */ + if (!frame->frame || !frame->frame->buf[0]) + return; + + frame->flags &= ~flags; + if (!frame->flags) { + ff_thread_release_buffer(s->avctx, &frame->tf); + + av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL + frame->col_mvf = NULL; + + frame->collocated_ref = NULL; + } +} + +void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) +{ + int i; + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) + ff_hevc_rpi_unref_frame(s, &s->DPB[i], + HEVC_FRAME_FLAG_SHORT_REF | + HEVC_FRAME_FLAG_LONG_REF); +} + +void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) +{ + int i; + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) + ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); +} + +static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) +{ + int i, ret; + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame * const frame = &s->DPB[i]; + if (frame->frame->buf[0]) + continue; + + ret = ff_thread_get_buffer(s->avctx, &frame->tf, + 
AV_GET_BUFFER_FLAG_REF); + if (ret < 0) + return NULL; + + frame->col_mvf = NULL; + frame->col_mvf_buf = NULL; + if (s->used_for_ref && !s->is_irap) + { + frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); + if (!frame->col_mvf_buf) + goto fail; + frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; + } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); + + return frame; + +fail: + ff_hevc_rpi_unref_frame(s, frame, ~0); + return NULL; + } + av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); + return NULL; +} + +int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) +{ + HEVCRpiFrame *ref; + int i; + + /* check that this POC doesn't already exist */ + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + + if (frame->frame->buf[0] && frame->sequence == s->seq_decode && + frame->poc == poc) { + av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", + poc); + return AVERROR_INVALIDDATA; + } + } + + ref = alloc_frame(s); + if (!ref) + return AVERROR(ENOMEM); + + *frame = ref->frame; + s->ref = ref; + + if (s->sh.pic_output_flag) + ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; + else + ref->flags = HEVC_FRAME_FLAG_SHORT_REF; + + ref->poc = poc; + ref->sequence = s->seq_decode; + ref->frame->crop_left = s->ps.sps->output_window.left_offset; + ref->frame->crop_right = s->ps.sps->output_window.right_offset; + ref->frame->crop_top = s->ps.sps->output_window.top_offset; + ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; + + return 0; +} + +int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) +{ + do { + int nb_output = 0; + int min_poc = INT_MAX; + int i, min_idx, ret; + + if 
(s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && + frame->sequence == s->seq_output) { + ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); + } + } + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && + frame->sequence == s->seq_output) { + nb_output++; + if (frame->poc < min_poc || nb_output == 1) { + min_poc = frame->poc; + min_idx = i; + } + } + } + + /* wait for more frames before output */ + if (!flush && s->seq_output == s->seq_decode && s->ps.sps && + nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) + return 0; + + if (nb_output) { + HEVCRpiFrame *frame = &s->DPB[min_idx]; + if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) + return 0; + + ret = av_frame_ref(out, frame->frame); + if (frame->flags & HEVC_FRAME_FLAG_BUMPING) + ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); + else + ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); + if (ret < 0) + return ret; + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); + return 1; + } + + if (s->seq_output != s->seq_decode) + s->seq_output = (s->seq_output + 1) & 0xff; + else + break; + } while (1); + + return 0; +} + +void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) +{ + int dpb = 0; + int min_poc = INT_MAX; + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + if ((frame->flags) && + frame->sequence == s->seq_output && + frame->poc != s->poc) { + dpb++; + } + } + + if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; 
+ if ((frame->flags) && + frame->sequence == s->seq_output && + frame->poc != s->poc) { + if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { + min_poc = frame->poc; + } + } + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + if (frame->flags & HEVC_FRAME_FLAG_OUTPUT && + frame->sequence == s->seq_output && + frame->poc <= min_poc) { + frame->flags |= HEVC_FRAME_FLAG_BUMPING; + } + } + + dpb--; + } +} + +static int init_slice_rpl(HEVCRpiContext *s) +{ + if (s->slice_idx >= s->rpl_tab_size) + return AVERROR_INVALIDDATA; + + s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; + return 0; +} + +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) +{ + RpiSliceHeader *sh = &s->sh; + + uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; + uint8_t list_idx; + int i, j, ret; + + ret = init_slice_rpl(s); + if (ret < 0) + return ret; + + if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + + s->rps[LT_CURR].nb_refs)) { + av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); + return AVERROR_INVALIDDATA; + } + + for (list_idx = 0; list_idx < nb_list; list_idx++) { + RefPicList rpl_tmp = { { 0 } }; + RefPicList *rpl = &s->refPicList[list_idx]; + + /* The order of the elements is + * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and + * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ + int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, + list_idx ? 
ST_CURR_BEF : ST_CURR_AFT, + LT_CURR }; + + /* concatenate the candidate lists for the current frame */ + while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { + for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { + RefPicList *rps = &s->rps[cand_lists[i]]; + for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { + rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; + rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; + rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; + rpl_tmp.nb_refs++; + } + } + } + + /* reorder the references if necessary */ + if (sh->rpl_modification_flag[list_idx]) { + for (i = 0; i < sh->nb_refs[list_idx]; i++) { + int idx = sh->list_entry_lx[list_idx][i]; + + if (idx >= rpl_tmp.nb_refs) { + av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); + return AVERROR_INVALIDDATA; + } + + rpl->list[i] = rpl_tmp.list[idx]; + rpl->ref[i] = rpl_tmp.ref[idx]; + rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; + rpl->nb_refs++; + } + } else { + memcpy(rpl, &rpl_tmp, sizeof(*rpl)); + rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); + } + + if (sh->collocated_list == list_idx && + sh->collocated_ref_idx < rpl->nb_refs) + s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; + } + + return 0; +} + +static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) +{ + int i; + int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *ref = &s->DPB[i]; + if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { + if ((ref->poc & LtMask) == poc) + return ref; + } + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *ref = &s->DPB[i]; + if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { + if (ref->poc == poc || (ref->poc & LtMask) == poc) + return ref; + } + } + + if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) + av_log(s->avctx, AV_LOG_ERROR, + "Could not find ref with POC %d\n", poc); + return NULL; +} + +static void mark_ref(HEVCRpiFrame *frame, 
int flag) +{ + frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); + frame->flags |= flag; +} + +static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) +{ + HEVCRpiFrame *frame; + int i, x, y; + + frame = alloc_frame(s); + if (!frame) + return NULL; + + if (!s->ps.sps->pixel_shift) { + for (i = 0; frame->frame->buf[i]; i++) + memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), + frame->frame->buf[i]->size); + } else { + for (i = 0; frame->frame->data[i]; i++) + for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) + for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { + AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, + 1 << (s->ps.sps->bit_depth - 1)); + } + } + + frame->poc = poc; + frame->sequence = s->seq_decode; + frame->flags = 0; + + ff_hevc_rpi_progress_set_all_done(frame); + + return frame; +} + +/* add a reference with the given poc to the list and mark it as used in DPB */ +static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, + int poc, int ref_flag) +{ + HEVCRpiFrame *ref = find_ref_idx(s, poc); + + if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) + return AVERROR_INVALIDDATA; + + if (!ref) { + ref = generate_missing_ref(s, poc); + if (!ref) + return AVERROR(ENOMEM); + } + + list->list[list->nb_refs] = ref->poc; + list->ref[list->nb_refs] = ref; + list->nb_refs++; + + mark_ref(ref, ref_flag); + return 0; +} + +int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) +{ + const ShortTermRPS *short_rps = s->sh.short_term_rps; + const LongTermRPS *long_rps = &s->sh.long_term_rps; + RefPicList *rps = s->rps; + int i, ret = 0; + + if (!short_rps) { + rps[0].nb_refs = rps[1].nb_refs = 0; + return 0; + } + + /* clear the reference flags on all frames except the current one */ + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + HEVCRpiFrame *frame = &s->DPB[i]; + + if (frame == s->ref) + continue; + + mark_ref(frame, 0); + } + + for (i = 0; i < 
NB_RPS_TYPE; i++) + rps[i].nb_refs = 0; + + /* add the short refs */ + for (i = 0; i < short_rps->num_delta_pocs; i++) { + int poc = s->poc + short_rps->delta_poc[i]; + int list; + + if (!short_rps->used[i]) + list = ST_FOLL; + else if (i < short_rps->num_negative_pics) + list = ST_CURR_BEF; + else + list = ST_CURR_AFT; + + ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); + if (ret < 0) + goto fail; + } + + /* add the long refs */ + for (i = 0; i < long_rps->nb_refs; i++) { + int poc = long_rps->poc[i]; + int list = long_rps->used[i] ? LT_CURR : LT_FOLL; + + ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); + if (ret < 0) + goto fail; + } + +fail: + /* release any frames that are now unused */ + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) + ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); + + return ret; +} + +int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) +{ + int ret = 0; + int i; + const ShortTermRPS *rps = s->sh.short_term_rps; + LongTermRPS *long_rps = &s->sh.long_term_rps; + + if (rps) { + for (i = 0; i < rps->num_negative_pics; i++) + ret += !!rps->used[i]; + for (; i < rps->num_delta_pocs; i++) + ret += !!rps->used[i]; + } + + if (long_rps) { + for (i = 0; i < long_rps->nb_refs; i++) + ret += !!long_rps->used[i]; + } + return ret; +} diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c new file mode 100644 index 0000000000..cd8149d58e --- /dev/null +++ b/libavcodec/rpi_hevc_sei.c @@ -0,0 +1,368 @@ +/* + * HEVC Supplementary Enhancement Information messages + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2013 Vittorio Giovara + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "golomb.h" +#include "rpi_hevc_ps.h" +#include "rpi_hevc_sei.h" + +static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) +{ + int cIdx, i; + uint8_t hash_type; + //uint16_t picture_crc; + //uint32_t picture_checksum; + hash_type = get_bits(gb, 8); + + for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) { + if (hash_type == 0) { + s->is_md5 = 1; + for (i = 0; i < 16; i++) + s->md5[cIdx][i] = get_bits(gb, 8); + } else if (hash_type == 1) { + // picture_crc = get_bits(gb, 16); + skip_bits(gb, 16); + } else if (hash_type == 2) { + // picture_checksum = get_bits_long(gb, 32); + skip_bits(gb, 32); + } + } + return 0; +} + +static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) +{ + int i; + // Mastering primaries + for (i = 0; i < 3; i++) { + s->display_primaries[i][0] = get_bits(gb, 16); + s->display_primaries[i][1] = get_bits(gb, 16); + } + // White point (x, y) + s->white_point[0] = get_bits(gb, 16); + s->white_point[1] = get_bits(gb, 16); + + // Max and min luminance of mastering display + s->max_luminance = get_bits_long(gb, 32); + s->min_luminance = get_bits_long(gb, 32); + + // As this SEI message comes before the first frame that references it, + // initialize the flag to 2 and decrement on IRAP access unit so it + // persists for the coded video sequence (e.g., between two IRAPs) + s->present = 2; + return 0; +} + +static int decode_nal_sei_content_light_info(HEVCSEIContentLight 
*s, GetBitContext *gb) +{ + // Max and average light levels + s->max_content_light_level = get_bits_long(gb, 16); + s->max_pic_average_light_level = get_bits_long(gb, 16); + // As this SEI message comes before the first frame that references it, + // initialize the flag to 2 and decrement on IRAP access unit so it + // persists for the coded video sequence (e.g., between two IRAPs) + s->present = 2; + return 0; +} + +static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) +{ + get_ue_golomb_long(gb); // frame_packing_arrangement_id + s->present = !get_bits1(gb); + + if (s->present) { + s->arrangement_type = get_bits(gb, 7); + s->quincunx_subsampling = get_bits1(gb); + s->content_interpretation_type = get_bits(gb, 6); + + // spatial_flipping_flag, frame0_flipped_flag, field_views_flag + skip_bits(gb, 3); + s->current_frame_is_frame0_flag = get_bits1(gb); + // frame0_self_contained_flag, frame1_self_contained_flag + skip_bits(gb, 2); + + if (!s->quincunx_subsampling && s->arrangement_type != 5) + skip_bits(gb, 16); // frame[01]_grid_position_[xy] + skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte + skip_bits1(gb); // frame_packing_arrangement_persistence_flag + } + skip_bits1(gb); // upsampled_aspect_ratio_flag + return 0; +} + +static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) +{ + s->present = !get_bits1(gb); + + if (s->present) { + s->hflip = get_bits1(gb); // hor_flip + s->vflip = get_bits1(gb); // ver_flip + + s->anticlockwise_rotation = get_bits(gb, 16); + skip_bits1(gb); // display_orientation_persistence_flag + } + + return 0; +} + +static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, + void *logctx, int size) +{ + HEVCSEIPictureTiming *h = &s->picture_timing; + HEVCRpiSPS *sps; + + if (!ps->sps_list[s->active_seq_parameter_set_id]) + return(AVERROR(ENOMEM)); + sps = 
(HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; + + if (sps->vui.frame_field_info_present_flag) { + int pic_struct = get_bits(gb, 4); + h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; + if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { + av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); + h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; + } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { + av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); + h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; + } + get_bits(gb, 2); // source_scan_type + get_bits(gb, 1); // duplicate_flag + skip_bits1(gb); + size--; + } + skip_bits_long(gb, 8 * size); + + return 0; +} + +static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, + int size) +{ + int flag; + int user_data_type_code; + int cc_count; + + if (size < 3) + return AVERROR(EINVAL); + + user_data_type_code = get_bits(gb, 8); + if (user_data_type_code == 0x3) { + skip_bits(gb, 1); // reserved + + flag = get_bits(gb, 1); // process_cc_data_flag + if (flag) { + skip_bits(gb, 1); + cc_count = get_bits(gb, 5); + skip_bits(gb, 8); // reserved + size -= 2; + + if (cc_count && size >= cc_count * 3) { + const uint64_t new_size = (s->a53_caption_size + cc_count + * UINT64_C(3)); + int i, ret; + + if (new_size > INT_MAX) + return AVERROR(EINVAL); + + /* Allow merging of the cc data from two fields. 
*/ + ret = av_reallocp(&s->a53_caption, new_size); + if (ret < 0) + return ret; + + for (i = 0; i < cc_count; i++) { + s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); + s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); + s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); + } + skip_bits(gb, 8); // marker_bits + } + } + } else { + int i; + for (i = 0; i < size - 1; i++) + skip_bits(gb, 8); + } + + return 0; +} + +static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, + int size) +{ + uint32_t country_code; + uint32_t user_identifier; + + if (size < 7) + return AVERROR(EINVAL); + size -= 7; + + country_code = get_bits(gb, 8); + if (country_code == 0xFF) { + skip_bits(gb, 8); + size--; + } + + skip_bits(gb, 8); + skip_bits(gb, 8); + + user_identifier = get_bits_long(gb, 32); + + switch (user_identifier) { + case MKBETAG('G', 'A', '9', '4'): + return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); + default: + skip_bits_long(gb, size * 8); + break; + } + return 0; +} + +static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) +{ + int num_sps_ids_minus1; + int i; + unsigned active_seq_parameter_set_id; + + get_bits(gb, 4); // active_video_parameter_set_id + get_bits(gb, 1); // self_contained_cvs_flag + get_bits(gb, 1); // num_sps_ids_minus1 + num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 + + if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { + av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); + return AVERROR_INVALIDDATA; + } + + active_seq_parameter_set_id = get_ue_golomb_long(gb); + if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { + av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); + return AVERROR_INVALIDDATA; + } + s->active_seq_parameter_set_id = active_seq_parameter_set_id; + + for (i = 1; i <= num_sps_ids_minus1; i++) + 
get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] + + return 0; +} + +static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) +{ + s->present = 1; + s->preferred_transfer_characteristics = get_bits(gb, 8); + return 0; +} + +static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, + int type, int size) +{ + switch (type) { + case 256: // Mismatched value from HM 8.1 + return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); + case HEVC_SEI_TYPE_FRAME_PACKING: + return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); + case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: + return decode_nal_sei_display_orientation(&s->display_orientation, gb); + case HEVC_SEI_TYPE_PICTURE_TIMING: + return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); + case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: + return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); + case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: + return decode_nal_sei_content_light_info(&s->content_light, gb); + case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: + return decode_nal_sei_active_parameter_sets(s, gb, logctx); + case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: + return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); + case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: + return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); + default: + av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); + skip_bits_long(gb, 8 * size); + return 0; + } +} + +static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, + int type, int size) +{ + switch (type) { + case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: + return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); + default: + av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); + skip_bits_long(gb, 8 * size); + return 0; + } +} + +static int 
decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, + const HEVCRpiParamSets * const ps, const int nal_unit_type) +{ /* Read one SEI message header (payload type, then payload size) and pass the payload to the prefix or suffix dispatcher; returns the dispatcher's result or AVERROR_INVALIDDATA when the header does not fit in the remaining bits. */ + int payload_type = 0; + int payload_size = 0; + int byte = 0xFF; + av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); + + while (byte == 0xFF) { /* payload_type is the sum of all bytes read, ending with the first byte that is not 0xFF */ + if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) /* second test keeps the running sum from overflowing int */ + return AVERROR_INVALIDDATA; + byte = get_bits(gb, 8); + payload_type += byte; + } + byte = 0xFF; + while (byte == 0xFF) { /* payload_size is accumulated the same way */ + if (get_bits_left(gb) < 8 + 8LL*payload_size) /* need the next size byte plus the payload announced so far; 8LL makes the product 64-bit */ + return AVERROR_INVALIDDATA; + byte = get_bits(gb, 8); + payload_size += byte; + } + if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { + return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); + } else { /* nal_unit_type == NAL_SEI_SUFFIX */ + return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); + } +} + +static int more_rbsp_data(GetBitContext *gb) +{ /* Non-zero while bits remain and the next byte is not 0x80 (presumably the RBSP trailing-bits byte - confirm against the spec). */ + return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; +} + +int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, + const HEVCRpiParamSets *ps, int type) +{ /* Decode SEI messages from gb until more_rbsp_data() reports none left; returns 1 on success or the first negative error code from a message. */ + int ret; + + do { + ret = decode_nal_sei_message(gb, logctx, s, ps, type); + if (ret < 0) + return ret; + } while (more_rbsp_data(gb)); + return 1; +} + +void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) +{ /* Discard buffered A53 caption data: zero the size and free the buffer. */ + s->a53_caption.a53_caption_size = 0; + av_freep(&s->a53_caption.a53_caption); +} diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h new file mode 100644 index 0000000000..d4ac348df9 --- /dev/null +++ b/libavcodec/rpi_hevc_sei.h @@ -0,0 +1,135 @@ +/* + * HEVC Supplementary Enhancement Information messages + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVC_SEI_H +#define AVCODEC_RPI_HEVC_SEI_H + +#include <stdint.h> + +#include "libavutil/md5.h" + +#include "get_bits.h" + +/** + * SEI message types + */ +typedef enum { + HEVC_SEI_TYPE_BUFFERING_PERIOD = 0, + HEVC_SEI_TYPE_PICTURE_TIMING = 1, + HEVC_SEI_TYPE_PAN_SCAN_RECT = 2, + HEVC_SEI_TYPE_FILLER_PAYLOAD = 3, + HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4, + HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5, + HEVC_SEI_TYPE_RECOVERY_POINT = 6, + HEVC_SEI_TYPE_SCENE_INFO = 9, + HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15, + HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, + HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, + HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19, + HEVC_SEI_TYPE_POST_FILTER_HINT = 22, + HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23, + HEVC_SEI_TYPE_FRAME_PACKING = 45, + HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47, + HEVC_SEI_TYPE_SOP_DESCRIPTION = 128, + HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129, + HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130, + HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131, + HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, + HEVC_SEI_TYPE_SCALABLE_NESTING = 133, + HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134, + HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137, + HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144, + HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, +} HEVC_SEI_Type; + +typedef struct HEVCSEIPictureHash { + uint8_t md5[3][16]; + uint8_t is_md5; +} HEVCSEIPictureHash; + +typedef struct HEVCSEIFramePacking { + int present; + int arrangement_type; + int
content_interpretation_type; + int quincunx_subsampling; + int current_frame_is_frame0_flag; +} HEVCSEIFramePacking; + +typedef struct HEVCSEIDisplayOrientation { + int present; + int anticlockwise_rotation; + int hflip, vflip; +} HEVCSEIDisplayOrientation; + +typedef struct HEVCSEIPictureTiming { + int picture_struct; +} HEVCSEIPictureTiming; + +typedef struct HEVCSEIA53Caption { + int a53_caption_size; /* number of bytes currently stored in a53_caption */ + uint8_t *a53_caption; /* heap buffer, grown with av_reallocp() and released by ff_hevc_rpi_reset_sei() */ +} HEVCSEIA53Caption; + +typedef struct HEVCSEIMasteringDisplay { + int present; + uint16_t display_primaries[3][2]; + uint16_t white_point[2]; + uint32_t max_luminance; + uint32_t min_luminance; +} HEVCSEIMasteringDisplay; + +typedef struct HEVCSEIContentLight { + int present; + uint16_t max_content_light_level; + uint16_t max_pic_average_light_level; +} HEVCSEIContentLight; + +typedef struct HEVCSEIAlternativeTransfer { + int present; /* set to 1 once an alternative-transfer SEI has been parsed */ + int preferred_transfer_characteristics; /* 8-bit value read from that SEI */ +} HEVCSEIAlternativeTransfer; + +typedef struct HEVCSEIContext { /* one sub-struct per SEI type the parser stores */ + HEVCSEIPictureHash picture_hash; + HEVCSEIFramePacking frame_packing; + HEVCSEIDisplayOrientation display_orientation; + HEVCSEIPictureTiming picture_timing; + HEVCSEIA53Caption a53_caption; + HEVCSEIMasteringDisplay mastering_display; + HEVCSEIContentLight content_light; + int active_seq_parameter_set_id; /* written by the active-parameter-sets SEI parser */ + HEVCSEIAlternativeTransfer alternative_transfer; +} HEVCSEIContext; + +struct HEVCRpiParamSets; + +int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, + const struct HEVCRpiParamSets *ps, int type); /* returns 1 on success, a negative error code otherwise */ + +/** + * Reset SEI values that are stored on the Context. + * e.g. Caption data that was extracted during NAL + * parsing. + * + * @param s HEVCSEIContext whose stored caption data is released.
+ */ +void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); + +#endif /* AVCODEC_RPI_HEVC_SEI_H */ diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c new file mode 100644 index 0000000000..23b49a99ae --- /dev/null +++ b/libavcodec/rpi_hevc_shader.c @@ -0,0 +1,1537 @@ +#include "rpi_hevc_shader.h" + +#ifdef _MSC_VER + #include + /* cast through uintptr_t to avoid warnings */ + #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) +#else + #define POINTER_TO_UINT(X) ((unsigned int)(X)) +#endif + +#ifdef __cplusplus +extern "C" { /* the types are probably wrong... */ +#endif +#ifdef __cplusplus +} +#endif + +#ifdef _MSC_VER +__declspec(align(8)) +#elif defined(__GNUC__) +__attribute__((aligned(8))) +#endif +unsigned int ff_hevc_rpi_shader[] = { +// ::mc_setup_c_q0 +// ::mc_start +/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_c_qn +/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif +/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 +/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift +/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 +/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask +/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) +/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) +/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif +/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +/* 
[0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch +/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num +/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num +/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x +/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 +/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a +/* 
[0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif +/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD +/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 +/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y +// :1 +/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 +/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_c_p +/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // 
add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 +/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif +/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif +/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val +/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add +/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 +/* [0x000002d0] */ 
0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 +/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 +/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 +/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch +/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask +/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 +/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 +/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 +/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b +/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a +/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub 
r1, r1, r0 ; mul24 r0, ra7, rb11 +/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height +/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add +/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b +/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c_p_l1 +/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +/* [0x00000438] */ 0x939c117f, 
0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 +/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif +/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif +/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val +/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add +/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 +/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 +/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 +/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz 
r3, vra_y +/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 +/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch +/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax +/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 +/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 +/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 +/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b +/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a +/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 +/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height +/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; 
mul24 r1, r1, ra_kmul_add +/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b +/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c_b +/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 +/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif +/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif +/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif +/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, 
r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height +/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next +/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 +/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif +/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif +/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a +/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b +/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif +/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif +/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif +/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y +/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add +/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif +/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 +/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif +/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val +/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, 
// add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 +/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 +/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 +/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d +/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif +/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d +// :1 +/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 +/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next +/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y +/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask +/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 +/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 +/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x000007a8] 
*/ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 +/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 +/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 +/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax +/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 +/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 +/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a +/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b +/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b +/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 +/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, 
ra2.8c +/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 +/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 +/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 +/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 +/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 +/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add +/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height +/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 +/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_sync_q0 +/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, 
vw_wait +/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) +/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q1 +/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q2 +/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) +/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q3 +/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) +/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync_q4 +/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a10] */ 
0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) +/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q5 +/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q6 +/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q7 +/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync_q8 +/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) +/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, 
srel(i) +// ::mc_sync_q9 +/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q10 +/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync_q11 +/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c_qn +// ::mc_exit_y_qn +/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 +/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop +/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c_q0 +// ::mc_exit_y_q0 +/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 +/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x00000bf0] 
*/ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 +/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop +// ::mc_setup_y_q0 +/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_y_qn +/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif +/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif +/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif +/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 +/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask +/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) +/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) +/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 +/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 +/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 +/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif +/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif +/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 +/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +/* [0x00000cc0] */ 
0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif +/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch +/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 +/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a +/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a +// :1 +/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x00000da0] */ 0x8c627c40, 
0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 +/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +// :per_block_setup_8 +/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif +/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, 
ra0.16a +/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif +/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 +/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a +/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif +/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif +/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init +/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul +/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 +/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height +/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) +/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add +/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val +/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif +/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif +/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 +/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 +/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d +/* [0x00000f38] */ 
0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c +/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d +/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c +/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 +/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 +/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif +/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 +/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d +/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c +/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d +/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c +/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d +/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 +/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif +/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +// ::mc_filter_y_pxx +/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x00000ff8] */ 
0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 +/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 +/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef +/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 +/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 +/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 +/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 +/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 +/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 +/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 +/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch +/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 +/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 +/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +/* 
[0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b +/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b +/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d +/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 +/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 +/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height +/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next +/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; 
mov.ifz ra_base, ra_base_next +/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add +/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b +/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_bxx +/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 +/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 
+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 +/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 +/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef +/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 +/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 +/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 +/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 +/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 +/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 +/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 +/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch +/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 +/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 +/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 
<< 11 @ "mul_used", 0 +/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b +/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b +/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d +/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 +/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 +/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 +/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off +/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 +/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00001350] */ 
0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add +/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next +/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 +/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height +/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch +/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 +/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b +/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_p00 +/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num +/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 +/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif +/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a +/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif +/* [0x00001408] */ 
0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif +/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif +/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init +/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift +/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 +/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif +/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base +// :1 +/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +/* [0x000014c0] */ 0x915c83f6, 
0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 +/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b +/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y_b00 +/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 +/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 +/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 +/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +// :1 +/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 +/* 
[0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next +/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax +/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 +/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 +/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00001638] */ 
0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b +/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_setup_c10_q0 +/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_c10_qn +/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif +/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 +/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift +/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 +/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask +/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) +/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) +/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif +/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch +/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num +/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num +/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 +/* 
[0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x +/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif +/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 +/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 +/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a +/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif +/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD +/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x000017d8] */ 
0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 +/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y +// :1 +/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 +/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_c10_p +/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +/* [0x000018a0] */ 0x54404077, 
0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif +/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif +/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val +/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add +/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 +/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 +/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 +/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, 
// min r3, r3, rb_max_y ; mov.ifnc r0, r2 +/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch +/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask +/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 +/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 +/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 +/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b +/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a +/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 +/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height +/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add +/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov 
-, vw_wait +/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b +/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c10_p_l1 +/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, 
r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif +/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif +/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val +/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add +/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 +/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif +// :1 +/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 +/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 +/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch +/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax +/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 +/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 +/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, 
ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 +/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b +/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a +/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 +/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height +/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add +/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, 
r1, i_shift23 +/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b +/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_c10_b +/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 +/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif +/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif +/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif +/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height +/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next +/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height +/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 +/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif +/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif +/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; 
mov ra_y2_next, ra3.16a +/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b +/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif +/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif +/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif +/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y +/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add +/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif +/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 +/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif +/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val +/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 +/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 +/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 +/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d +/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif +/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d +// :1 +/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 +/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next +/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +/* [0x00001d70] */ 
0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y +/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask +/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 +/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 +/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 +/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 +/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 +/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 +/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 +/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax +/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 +/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 
r3, ra1.8d, r1 +/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a +/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b +/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b +/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 +/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 +/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 +/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 +/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 +/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 +/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add +/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height +/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr 
r1, r1, ra_wt_den_p7 +/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_sync10_q0 +/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) +/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q1 +/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q2 +/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00001fb0] */ 
0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) +/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q3 +/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync10_q4 +/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) +/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q5 +/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q6 +/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov 
dst, sacq(i) +/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q7 +/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop +// ::mc_sync10_q8 +/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) +/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q9 +/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q10 +/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) +/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +// ::mc_sync10_q11 +/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, 
vw_wait +/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) +/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c10_q0 +// ::mc_exit_y10_q0 +/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 +/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 +/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit_c10_qn +// ::mc_exit_y10_qn +/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 +// :1 +/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop +/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop +// ::mc_setup_y10_q0 +/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +// ::mc_setup_y10_qn +/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif +/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif +/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl 
rb_ef, r0, i_shift30 ; mov ra11, unif +/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 +/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask +/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) +/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) +/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 +/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 +/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 +/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif +/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif +/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 +/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift +/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif +/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch +/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* 
[0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 +/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a +/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a +// :1 +/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 +/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 +/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +/* [0x00002428] */ 0x159e7040, 0x10020827, // or 
r0, r0, r1 +/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +// :per_block_setup_10 +/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x +/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif +/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif +/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 +/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a +/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif +/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 
+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif +/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init +/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul +/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 +/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height +/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) +/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add +/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val +/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif +/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif +/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 +/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 +/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d +/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c +/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d +/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c +/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 +/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 +/* [0x000025b8] 
*/ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif +/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 +/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d +/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c +/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d +/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c +/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d +/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 +/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif +/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +// ::mc_filter_y10_pxx +/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 +/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 +/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef +/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 +/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 +/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 
; mul24 r2, r2, rb_pitch ; ldtmu0 +/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 +/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 +/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 +/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 +/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch +/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 +/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 +/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub 
r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b +/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b +/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d +/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 +/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 +/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height +/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next +/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next +/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add +/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 +/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b +/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 +/* [0x000027c0] */ 0x925f23bf, 
0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b +/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_p00 +/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num +/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 +/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif +/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a +/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif +/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif +/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif +/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init +/* [0x00002880] */ 
0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift +/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 +/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif +/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base +// :1 +/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 +/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00002938] */ 
0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b +/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_bxx +/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 +/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 +/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 +/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 +// :1 +/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef +/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 +/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 +/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 +/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 +/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 +/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 +/* [0x000029f0] */ 
0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 +/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax +/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch +/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 +/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 +/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, 
// add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b +/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b +/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 +/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d +/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 +/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c +/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 +/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 +/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off +/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 +/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 +/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add +/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next +/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next +/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 +/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height +/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch +/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 +/* [0x00002b40] */ 0x956e7036, 0x10126431, // 
mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b +/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_filter_y10_b00 +/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 +/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 +/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 +/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 +/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +// :1 +/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 +/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, 
ra_base, r2 ; mov.ifz rb_base2, rb_base2_next +/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax +/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 +/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 +/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 +/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest +/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b +/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 +/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ +#pragma Align_to(8, ff_hevc_rpi_shader) +#endif diff --git a/libavcodec/rpi_hevc_shader.h 
b/libavcodec/rpi_hevc_shader.h new file mode 100644 index 0000000000..79651c9b6c --- /dev/null +++ b/libavcodec/rpi_hevc_shader.h @@ -0,0 +1,63 @@ +#ifndef rpi_hevc_shader_H +#define rpi_hevc_shader_H + +extern unsigned int ff_hevc_rpi_shader[]; + +#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) +#define mc_start (ff_hevc_rpi_shader + 0) +#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) +#define mc_filter_c_p (ff_hevc_rpi_shader + 134) +#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) +#define mc_filter_c_b (ff_hevc_rpi_shader + 386) +#define mc_sync_q0 (ff_hevc_rpi_shader + 580) +#define mc_sync_q1 (ff_hevc_rpi_shader + 598) +#define mc_sync_q2 (ff_hevc_rpi_shader + 610) +#define mc_sync_q3 (ff_hevc_rpi_shader + 622) +#define mc_sync_q4 (ff_hevc_rpi_shader + 634) +#define mc_sync_q5 (ff_hevc_rpi_shader + 652) +#define mc_sync_q6 (ff_hevc_rpi_shader + 664) +#define mc_sync_q7 (ff_hevc_rpi_shader + 676) +#define mc_sync_q8 (ff_hevc_rpi_shader + 688) +#define mc_sync_q9 (ff_hevc_rpi_shader + 706) +#define mc_sync_q10 (ff_hevc_rpi_shader + 718) +#define mc_sync_q11 (ff_hevc_rpi_shader + 730) +#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) +#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) +#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) +#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) +#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) +#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) +#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) +#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) +#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) +#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) +#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) +#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) +#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) +#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) +#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) +#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) +#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) +#define 
mc_sync10_q2 (ff_hevc_rpi_shader + 2026) +#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) +#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) +#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) +#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) +#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) +#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) +#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122) +#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) +#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) +#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) +#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) +#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) +#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) +#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) +#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) +#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) +#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566) +#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) +#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) +#define mc_end (ff_hevc_rpi_shader + 2860) + +#endif diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm new file mode 100644 index 0000000000..af5b59e181 --- /dev/null +++ b/libavcodec/rpi_hevc_shader.qasm @@ -0,0 +1,1850 @@ +# Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Written by Peter de Rivaz, John Cox + + + +# Inter pred asm +# +# Logic here should be good to 14 bits without modification +# but only 8 & 10 are currently instantiated & tested +# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow +# in _p00 & _b00 + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the warning that we are using rotation & ra/rb registers. r0..3 can be +# rotated through all 16 elems ra regs can only be rotated through their +# local 4. As it happens this is what is wanted here as we do not want the +# constants from the other half of the calc. + +# Number limits in P/B calculation +# +# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier +# we offset our intermediates s.t. they always end up +ve before the next +# multiply (may be -ve whilst summing but that doesn't matter). 
+# +# Range calc for up to 14 bits (Y-B pred): +# +# denom: [0, 7] +# bmax = (1 << bits) - 1 +# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] +# +# wt_mul: [-128, 255] +# wt_off = off * 2 + 1: [-bmax, bmax] +# +# pel: [0, bmax] +# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] +# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] +# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] +# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] +# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): +# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] +# +# This all looks good and is mostly bit depth independent - and as we manage +# to do unsigned multiplies everywhere (now) this should be good for any bit +# depth up to 14 (we could probably do 16 - but that requires a few tweaks +# to the shifts we don't currently have logic for) + +# PREREAD is the number of requests that we have sitting in the TMU request +# queue. +# +# There are 8 slots available in the TMU request Q for tm0s requests, but +# only 4 output FIFO entries and overflow is bad (corruption or crash) +# (If threaded then only 2 out FIFO entries, but we aren't.) +# In s/w we are effectively limited to the min vertical read which is >= 4 +# so output FIFO is the limit.
+# +# As the test for read-next is in the main part of the Luma loop (rather than +# the preload FIFO part) we are limited to min_luma_height - 1 +# Min_luma_height is 4 so we can only have a preload of 3 +# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick +# in chroma without abandoning preload pretty much entirely (which would be bad) +# +# Timing tests vs preload of 4 suggests this doesn't hurt us much +# Could have preread 4 for Chroma but when tested it didn't help + +.set PREREAD, 3 + +# Offset added (effectively) at the exit of the H FIR filter +# This is enough to force the result +ve +# Is good if it is a power of 2 as that allows for >> without loss +# +# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 +# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 +# Round up to next power of 2 + +.set FIR_OFFSET, 0x4000 + +# Block heights - 8 & 16 are the only numbers we currently support + +.set C_BLK_HEIGHT_8, 16 +.set C_BLK_HEIGHT_16, 8 +.set Y_BLK_HEIGHT_8, 16 +.set Y_BLK_HEIGHT_16, 8 + +# QPU counts - depend on block size +# If we have a 2-byte format & block_size > 8 then can only afford +# 8 QPUs +# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h + +.set N_QPU_8, 12 +.set N_QPU_16, 12 + +# Value to add to the weight multiplier to convert it into an unsigned value +# Should be power of two for convenience + +.set LOG2_MUL_ADD, 14 +.set MUL_ADD, (1 << LOG2_MUL_ADD) + +# Fixed denom (max that it can be set to) +.set DENOM, 7 + +# register allocation +# + +# ra0-3 +# Used as temp and may be loop filter coeffs (split into .8s) +# or temp in loop. Check usage on an individual basis.
+ +# ra4-11 +# V FIFO / temp / free + +# -- free -- ra12 + +# -- free -- ra13 + +# -- free -- ra14 + +# -- free -- ra15 + +# uniform: width:height +.set ra_width_height, ra16 +.set ra_width, ra16.16b +.set ra_height, ra16.16a + +# y:y2 same layout as y_y2_next so we can update both together +.set ra_y_y2, ra17 +.set ra_y2, ra17.16a +.set ra_y, ra17.16b + +# uniform: L1 weight (U on left, V on right) +# Only used in Y B +.set ra_wt_off_mul_l1, ra18 +.set ra_wt_off_l1, ra18.16b +.set ra_wt_mul_l1, ra18.16a + +# y_next:y2_next same layout as y_y2 so we can update both together +.set ra_y_y2_next, ra19 +.set ra_y_next, ra19.16b +.set ra_y2_next, ra19.16a + +# Setup: consts - subdivide a single register (loaded with 0xff800100 by the setup macros) +.set ra_kff800100, ra20 +.set ra_k256, ra20.16a +.set ra_k0, ra20.8a +.set ra_k1, ra20.8b +.set ra_k128, ra20.8c +.set ra_k255, ra20.8d + +# Loop: xshifts +.set ra_xshift, ra21.16a +.set ra_xshift_next, ra21.16b + +# Loop var: L0 weight (U on left, V on right) +# _off_ is not used in loop as we want to modify it before use +.set ra_wt_off_mul_l0, ra22 +.set ra_wt_mul_l0, ra22.16a +.set ra_wt_off_l0, ra22.16b + +# Max pel value (for 8 bit we can get away with sat ops but not 9+) +# * Could merge with rb_pmask.
For 10 bit Logically pmask needs 0xff in the +# 2nd byte but as the source should never be > 3 there 0x3ff should do +.set ra_blk_height_pmax, ra23 +.set ra_pmax, ra23.16a +.set ra_blk_height, ra23.8c +# --free -- ra23.8d + +# Loop: src frame base (L0) +.set ra_base, ra24 + +# Misc offsets +.set ra_fir_off_val_wt_den_p7, ra25 +.set ra_wt_den_p7, ra25.8a +# -- free -- ra25.8b +.set ra_fir_off_val, ra25.16b + +# As it happens these constants are the same +.if FIR_OFFSET == MUL_ADD +# Weight multiplier unsigned add +.set ra_kmul_add, ra_fir_off_val +.else +.error "FIR_OFFSET != MUL_ADD: Need new register & init" +.endif + +# Loop: next src frame base (L0) +.set ra_base_next, ra26 + +# Loop: height<<23 + width<<16 + vdw_setup_0 +.set ra_dma0, ra27 + +# Loop: destination address +.set ra_dest, ra28 + +# Setup: Dup of rb_ef +# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul +# (top bits are ignored by mul24) +.set ra_ef, ra29 + +# Use an even numbered register as a link register to avoid corrupting flags +.set ra_link, ra30 + +# -- free -- ra31 + +.set rb_xshift2, rb0 +.set rb_xshift2_next, rb1 + +# C: (elem & 1) == 0 ?
elem : elem + 5 (result doubled for 9+ bit, as computed in m_setup_c) +.set rb_elem_x, rb2 + +# El Flags +# After adding to self we have el even/odd on nc/c and lo/hi on nn/n +# Duped into ra_ef as sometimes that is easier to use +.set rb_ef, rb3 + +# rb4-11 +# Loop: V filter FIFO or V filter coeff + +# Loop var: offset to add before shift (round + weighting offsets) +# Exact value varies by loop +.set rb_wt_off, rb12 + +# -- free -- rb13 + +# -- free -- rb14 + +# Loop: src frame base (L1) +.set rb_base2, rb15 + +# Line pitch (128 for sand128) +.set rb_pitch, rb16 + +# Loop count - 2 (set up TMU for next xfer) +.set rb_i_tmu, rb17 + +# Loop count for min(height, 16) +# Y will reset & loop again if height > 16 +.set rb_lcount, rb18 + +# frame_base2_next +.set rb_base2_next, rb19 + +# Setup: Height of Y+C in sand, (x&mask)*xpitch will give +# offset to the slice +.set rb_xpitch, rb20 + +# These 3 consts each save 1 instruction in Y loop setup +# so whilst they are worthwhile they should be the 1st to die if we need +# another b reg +.set rb_y_coeffs_2, rb21 # 0x050b0a00 +.set rb_y_coeffs_3, rb22 # 0x11283a40 +.set rb_y_coeffs_5, rb23 # 0x0a0b0500 + +# Setup: 0xff (8-bit) / 0xffff (9+ bit) +.set rb_pmask, rb24 + +# vdw_setup_1(dst_pitch) +.set rb_dma1_base, rb25 + +# Setup: pic width - 1 +# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. +.set rb_max_x, rb26 + +# vdw_setup_0 (depends on QPU number) +.set rb_dma0_base, rb27 + +# Setup: vw_setup value to reset VPM write pointer +.set rb_vpm_init, rb28 + +# Loop: vdw_setup_1(dst_pitch-width) = stride +.set rb_dma1, rb29 + +# Setup: pic_height - 1 +.set rb_max_y, rb30 + +# Setup: FIR H offset +.set rb_fir_off_h, rb31 + + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16 +.set i_shift21, -11 +.set i_shift23, -9 +.set i_shift30, -2 + +# Much of the setup code is common between Y & C +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + +.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num +.if v_bit_depth <= 8 + # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 + or r0, r0, r1 + + mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit + add r_vpm, r0, r1 # VPM 8bit storage + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 + +.else + # 16 bit version + # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 +.if v_blk_height <= 8 + shl r1, r1, 4 +.else + shl r1, r1, 5 +.endif + and r0, r2, 1 + or r0, r0, r1 + + mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR + add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) + mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 +.endif + add r_dma, r0, r1 # DMA out +.endm + + +.macro m_setup_q0 + srel -, 12 +.endm + +# Code start label +::mc_start + +################################################################################ +# mc_setup_c +# +# typedef struct qpu_mc_pred_c_s_s { +# int16_t y; +# int16_t x; +# uint32_t base; +# uint32_t pic_cw; // C Width (== Y width / 2) +# uint32_t pic_ch; // C Height (== Y Height / 2) +# uint32_t stride2; +# uint32_t stride1; +# uint32_t wdenom; // NOTE(review): m_setup_c reads no uniform between stride1 and x2_y2 - confirm against the C-side struct +# int16_t y2; +# int16_t x2; +# uint32_t base2; +# uint32_t next_fn; +# } qpu_mc_pred_c_s_t; + +.macro m_setup_c, v_bit_depth + +# Cannot use mul24 on x as x might be -ve, so must use shift +.if v_bit_depth <= 8 +.set v_x_shift, 1 +.set v_pmask, 0xff +.set v_blk_height, C_BLK_HEIGHT_8 +.else +.set v_x_shift, 2 +.set v_pmask, 0xffff +.set
v_blk_height, C_BLK_HEIGHT_16 +.endif + + mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y + + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] + shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base + +# Read image dimensions + sub r0, unif, 1 # pic c width + shl rb_max_x, r0, v_x_shift # rb_max_x in bytes + sub rb_max_y, unif, 1 # pic c height + +# load constants + mov ra_kff800100, 0xff800100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) + mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) + +# get source pitch + mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 + mov rb_pitch, unif # stride1 + mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly + add rb_dma1_base, r1, rb_pitch # vdw_setup_1 + + and r0, 1, elem_num + nop ; mul24 r0, r0, 5 +.if v_bit_depth <= 8 + add rb_elem_x, r0, elem_num +.else + add r0, r0, elem_num + add rb_elem_x, r0, r0 +.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# rb_base2 ends up with t1s base + + shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] + add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift +# Shift will always calculate as 0 for 9+ bit +# Ideally we can optimize the shift out of the code in these cases but for now +# it is tidier to leave it in +.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 +.else + mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +.endif + +# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + +.if v_bit_depth <= 8 + and r0, r0, -4 +.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 + add ra_base, ra_base, r0 + +# Compute part of VPM to use for DMA output +# * We only get 8 QPUs if 16 bit
- maybe reduce height and auto-loop? + m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + +# And again for L1, but only worrying about frame2 stuff + +# Compute base address for first and second access +# ra_base ends up with t0s base +# rb_base2 ends up with t1s base + + shl r0, ra0.16b, v_x_shift + add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset + max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 + min r0, r0, rb_max_x + +# Get shift (already zero if 9+ bit so ignore) +.if v_bit_depth <= 8 + shl rb_xshift2_next, r0, 3 +.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + +.if v_bit_depth <= 8 + and r0, r0, -4 +.endif + sub r1, ra_k0, rb_pitch + and r1, r0, r1 ; mov r3, PREREAD + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov r2, ra_y2 + add rb_base2, rb_base2, r0 ; mov r0, ra_y + +# Do preloads +# r0 = ra_y, r2 = ra_y2, r3 = PREREAD + +:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y + add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 + brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, rb_base2, r1 ; mov ra_y2, r2 +# >>> .anynz 1b + + mov ra_link, unif # link +# touch registers to keep simulator happy (and fills in delay slots) + mov ra4, 0 ; mov rb4, 0 + bra -, ra_link + mov ra5, 0 ; mov rb5, 0 + mov ra6, 0 ; mov rb6, 0 + mov ra7, 0 ; mov rb7, 0 +# >>> ra_link +.endm + +::mc_setup_c_q0 + m_setup_q0 +::mc_setup_c_qn + m_setup_c 8 + +################################################################################ +# +# mc_filter_c_p +# +# typedef struct qpu_mc_pred_c_p_s { +# int16_t y; +# int16_t x; +# uint32_t base; +# uint16_t h; +# uint16_t w; +# uint32_t coeffs_x; +# uint32_t coeffs_y; +# uint32_t wo_u; +# uint32_t wo_v; +# uint32_t dst_addr_c; +# uint32_t next_fn; +# } qpu_mc_pred_c_p_t; + +.macro m_filter_c_p, v_tmu, v_bit_depth + +.if v_bit_depth <= 8 +.set v_x_shift,
1 +.set v_x_mul, 2 +.set v_v_shift, 8 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 2 +.set v_x_mul, 4 +.set v_v_shift, i_shift16 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif + +.if v_tmu == 0 +.set vrx_xshift, rb_xshift2 # b side more convenient +.set vrx_xshift_next, ra_xshift_next +.set vra_y_next, ra_y_next +.set vrx_base_next, ra_base_next +.set vra_y, ra_y +.set vra_base, ra_base +.set vr_txs, t0s +.else +.set vrx_xshift, ra_xshift # a side more convenient +.set vrx_xshift_next, rb_xshift2_next +.set vra_y_next, ra_y2_next +.set vrx_base_next, rb_base2_next +.set vra_y, ra_y2 +.set vra_base, rb_base2 +.set vr_txs, t1s +.endif + +# denom shift values +.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) +.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) + +# per-channel shifts were calculated on the *previous* invocation +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + + shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 + add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height + sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs + max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next + min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + +.if v_bit_depth <= 8 + shl vrx_xshift_next, r0, 3 + and r0, r0, -4 +.endif + and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs + add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write + sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight + add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height + add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# Misc final setup... + + shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr + add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) + shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register + add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=wt offset + shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 + sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add + add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) + mov rb11, ra3.8d ; mov ra_link, unif # ; Link + +# r5 = -4 (loop counter) +# ra_wt_mul_l0 = weight L0 + MUL_ADD (now unsigned) +# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) +# rb31 = FIR value offset + +# FIFO: rb4, ra5, rb6, ra7 +# Coeffs in ra3.8a, ra3.8b, rb10, rb11 + +# We want (r0r1) +# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... +# We fetch (after shift) +# C0 : C3 : C1 : C4 : C2 : C5 : ...
+ +:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + +.if v_tmu == 0 + sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 + shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next + shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y + add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +.else + sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 + shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next + shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y + add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] +.endif + + add vra_y, r3, ra_k1 ; mov r0, r1 << 15 + max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 + min r3, r3, rb_max_y ; mov.ifnc r0, r2 + + and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch +.if v_tmu == 0 + add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes +.else + add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes +.endif + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the +# Y case) so it doesn't matter which ra0 we get them from +# Also as the two halves are locked together we don't need to separate the 1st +# r0 mul or the last r1 mul as they are valid for all QPUs + + add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 + sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 + sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 + add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 + +# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) +# We would like to save the r5->r4 shift but we need a delay slot +# for both r7 & r6 which we can't find anything to put in if we have +# already multiplied r4 & r5!
+ brr.anyn -, r:1b + add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post + mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post + asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 +# >>> .anyn 1b + + add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] + sub r1, r1, r0 ; mul24 r0, ra7, rb11 + sub r1, r1, r0 + + asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop + sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 + add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add + sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop + brr.anyn -, r:1b + asr r1, r1, i_wt_den_p6 + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop +# >>> .anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed block_height (r3) lines last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new segment height + + mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - block_height (r3) so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer +# >>> 1b +.endm + +::mc_filter_c_p + m_filter_c_p 0, 8 + +::mc_filter_c_p_l1 + m_filter_c_p 1, 8 + +################################################################################ +# +# mc_filter_c_b +# +# typedef struct qpu_mc_pred_c_b_s { +# int16_t y; +# int16_t x; +# uint32_t base; +# uint16_t h; +# uint16_t w; +# uint32_t coeffs_x1; +# uint32_t coeffs_y1; +# int16_t weight_u1; +# int16_t weight_v1; +# int16_t y2; +# int16_t x2; +# uint32_t base2; +# uint32_t coeffs_x2; +# uint32_t coeffs_y2; +#
uint32_t wo_u2; +# uint32_t wo_v2; +# uint32_t dst_addr_c; +# uint32_t next_fn; +# } qpu_mc_pred_c_b_t; + +.macro m_filter_c_b, v_bit_depth + +.if v_bit_depth <= 8 +.set v_x_shift, 1 +.set v_v_shift, 8 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 2 +.set v_v_shift, i_shift16 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif +.set v_x_mul, (1 << v_x_shift) + +# denom shift values +.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) +.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation + mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + + shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 + add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a + sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height + max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs + +.if v_bit_depth <= 8 + shl ra_xshift_next, r0, 3 +.endif + + and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs + and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height + add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + + sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight + add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height + add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight + + shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 + add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base + 
shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register + add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x + +# L1 - uniform layout could possibly be optimized + + shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b + + sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 + sub.setf -, r5, rb_lcount ; mov r0, ra4 + sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c + add r1, r1, r0 ; mul24 r0, ra7, rb7 + + sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 + add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 + sub r2, r2, r0 + + shr r1, r1, 6 + shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 + add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 + add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add + sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop + add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop + + brr.anyn -, r:1b + asr r1, r1, ra_wt_den_p7 + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop +# >>> .anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed 16 height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new segment height + + mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer +# >>> 1b +.endm + +::mc_filter_c_b + m_filter_c_b 8 + +################################################################################ +# Exit code used by both Luma & 
Chroma so place between them to avoid I-cache +# conflicts + +.macro m_exit_drain +.if PREREAD == 2 +# Special case 2 as loop is wasteful + nop ; nop ; ldtmu0 + nop ; nop ; ldtmu1 + nop ; nop ; ldtmu0 + mov -, vw_wait ; nop ; ldtmu1 +.else + mov.setf r3, PREREAD - 1 +:1 + brr.anynz -, r:1b + nop ; nop ; ldtmu0 + nop ; nop ; ldtmu1 + sub.setf r3, r3, 1 + # >>> .anynz 1b + mov -, vw_wait +.endif +.endm + +# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) +# All qpus start at the beginning and after that (group - 1) must have finished +# before (group) can start +# +# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain +# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - +# lockup otherwise) +# +# There is some, currently ill-defined, potential lockup if we have the VDW active +# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDW pipe too ?? +# +# The code stalled when I had many waiters on a single sem so we have a +# "ripple" of srels to restart. Unsure why, may have been bug, but this works +# and we currently have both the memory & sems to support it.
+.macro m_sync_q, n_qpu, n_quads +# Do not generate code for qpu >= quads * 4 - fns should never be called +.if n_qpu < n_quads * 4 + mov ra_link, unif # Can only branch to an a reg (not r0) + mov -, vw_wait # [ra_link delay] + +.set n_sem_sync, n_qpu - (n_qpu % 4) +.set n_sem_in, n_qpu +.set n_sem_out, n_qpu + 1 + +.if n_qpu % 4 == 0 + +.set n_sem_quad_in, 12 + n_qpu / 4 +.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) + + sacq -, n_sem_sync # x3: one acquire per other QPU in this quad (each srels n_sem_sync in the .else path) + sacq -, n_sem_sync + sacq -, n_sem_sync + bra -, ra_link + sacq -, n_sem_quad_in # wait until this quad is allowed to start + srel -, n_sem_out # ripple: restart the next QPU in this quad + srel -, n_sem_quad_out # allow the next quad to start (last quad wraps back to sem 12) + +.else + bra -, ra_link + srel -, n_sem_sync # tell the first QPU of this quad that we have arrived + sacq -, n_sem_in # wait for the ripple from the previous QPU +.if n_sem_out % 4 != 0 + srel -, n_sem_out +.else + nop +.endif +.endif +.endif +.endm + +.set v_quads8, N_QPU_8 / 4 + +::mc_sync_q0 + m_sync_q 0, v_quads8 +::mc_sync_q1 + m_sync_q 1, v_quads8 +::mc_sync_q2 + m_sync_q 2, v_quads8 +::mc_sync_q3 + m_sync_q 3, v_quads8 +::mc_sync_q4 + m_sync_q 4, v_quads8 +::mc_sync_q5 + m_sync_q 5, v_quads8 +::mc_sync_q6 + m_sync_q 6, v_quads8 +::mc_sync_q7 + m_sync_q 7, v_quads8 +::mc_sync_q8 + m_sync_q 8, v_quads8 +::mc_sync_q9 + m_sync_q 9, v_quads8 +::mc_sync_q10 + m_sync_q 10, v_quads8 +::mc_sync_q11 + m_sync_q 11, v_quads8 + +# mc_exit() +# Chroma & Luma the same now + +.macro m_exit_qn + m_exit_drain + nop ; nop ; thrend + nop + nop +# >>> thrend <<< +.endm + +::mc_exit_c_qn +::mc_exit_y_qn + m_exit_qn + + + +# mc_interrupt_exit12() - QPU 0 exit: drains the TMUs, sacqs sem 12 (the one m_setup_q0 srels) and writes 1 to the interrupt register in the thrend delay slot + +.macro m_exit_q0 + m_exit_drain + sacq -, 12 + nop ; nop ; thrend + mov interrupt, 1 + nop +# >>> thrend <<< +.endm + +::mc_exit_c_q0 +::mc_exit_y_q0 + m_exit_q0 + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+# For P frames we make the second x,y coordinates offset by +8 + + +################################################################################ +# mc_setup_y +# +# typedef struct qpu_mc_pred_y_s_s { +# qpu_mc_src_t next_src1; +# qpu_mc_src_t next_src2; +# uint16_t pic_h; +# uint16_t pic_w; +# uint32_t stride2; +# uint32_t stride1; +# uint32_t wdenom; // NOTE(review): m_setup_y reads no uniform between stride1 and next_fn - confirm against the C-side struct +# uint32_t next_fn; +# } qpu_mc_pred_y_s_t; + +.macro m_setup_y, v_bit_depth + +# Cannot use mul24 on x as x might be -ve, so must use shift +.if v_bit_depth <= 8 +.set v_x_shift, 0 +.set v_pmask, 0xff +.set v_blk_height, Y_BLK_HEIGHT_8 +.else +.set v_x_shift, 1 +.set v_pmask, 0xffff +.set v_blk_height, Y_BLK_HEIGHT_16 +.endif + + + # Need to save these because we need to know the frame dimensions before computing texture coordinates + mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y + mov ra9, unif # ref_y_base + mov ra1, unif # x2_y2 + + +# load constants + mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] + shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base + + mov ra_kff800100, 0xff800100 + mov rb_pmask, v_pmask + mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) + mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) + mov rb_y_coeffs_2, 0x050b0a00 + mov rb_y_coeffs_3, 0x11283a40 + mov rb_y_coeffs_5, 0x0a0b0500 + +# (The part of VPM to use is computed by m_calc_dma_regs after the preloads) + +# Read image dimensions + mov ra3, unif # width_height + mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 +.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 +.else + sub r0, ra3.16b, 1 + shl rb_max_x, r0, v_x_shift +.endif + sub rb_max_y, ra3.16a, 1 + mov r3, elem_num ; mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) # [rb_pitch delay] + or rb_dma1_base, r1, rb_pitch + +# Compute base address for first and second access + add r0, ra0.16b, r3 # Load x + elem_num +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, 0
+ min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + +# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + + # r3 still contains elem_num + add r0, ra1.16b, r3 # Load x2 + elem_num +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 # Add stripe offsets + add rb_base2, ra11, r0 + +# Do preloads + nop ; mov r0, ra0.16a # ; r0 = y + mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 + +:1 + sub.setf r3, r3, 1 + max r1, r0, 0 + min r1, r1, rb_max_y + add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch + add t0s, ra_base, r1 ; mov ra_y, r0 + + max r1, r2, 0 + brr.anynz -, r:1b + min r1, r1, rb_max_y + add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch + add t1s, rb_base2, r1 ; mov ra_y2, r2 +# >>> .anynz 1b + + m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + mov ra_link, unif # Next fn + +# touch vertical context to keep simulator happy + mov ra8, 0 ; mov rb8, 0 # [ra_link delay] + bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 +# >>> ra_link +.endm + +::mc_setup_y_q0 + m_setup_q0 +::mc_setup_y_qn + m_setup_y 8 + +################################################################################ +# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space + +# get base addresses and per-channel shifts for *next* invocation +# per-channel shifts were calculated on the *previous* invocation + +# 1st 3 instructions of per_block-setup in branch delay +# +# typedef struct qpu_mc_pred_y_p_s { +# qpu_mc_src_t next_src1; +# qpu_mc_src_t next_src2; +# uint16_t h; +# uint16_t w; +# uint32_t mymx21;
+# uint32_t wo1; +# uint32_t wo2; +# uint32_t dst_addr; +# uint32_t next_fn; +# } qpu_mc_pred_y_p_t; +# + +.macro m_luma_setup, v_bit_depth +# Hack - QASM may well have label pasting but I have no idea how... NOTE(review): only depths 8 and 10 are handled; any other value emits no branch +.if v_bit_depth == 8 + brr ra_link, r:per_block_setup_8 +.elif v_bit_depth == 10 + brr ra_link, r:per_block_setup_10 +.endif + mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? + add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 + add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +.endm + +.macro m_per_block_setup, v_bit_depth + +.if v_bit_depth <= 8 +.set v_x_shift, 0 +.set v_x_mul, 1 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 1 +.set v_x_mul, 2 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif + +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts + and r0, r0, -4 + sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base + and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y + add ra_base_next, ra_base_next, r0 # [ra1 delay] + + add r0, ra1.16b, r3 # Load x2 +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + max r0, r0, r5 ; mov ra_y2_next, ra1.16a + min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base + shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 ; mov ra_width_height, unif # ; width_height + and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = width in bytes + add rb_base2_next, rb_base2_next, r0 + +# get width,height of block (unif load above), r1 = width * pel_size + sub
rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) + add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height + add rb_lcount, r0, (7-8) + shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val (= MUL_ADD) + add r0, r0, r1 # Combine width and height of destination area + shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val (= FIR_OFFSET) + add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values + shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight + shl ra8, r0, 3 ; mov rb5, ra_k255 + +# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) + +# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val +# but I can't see a way of doing that that is cheap enough to be worth it + +# Picked out in a slightly random order to space out uniform loads + + # 1 + mov r1, 0x01040400 # [ra8 delay] + ror ra2.8b, r1, ra8.8d + ror ra0.8b, r1, ra8.8c + # 2 + ror ra2.8c, rb_y_coeffs_2, ra8.8d + ror ra0.8c, rb_y_coeffs_2, ra8.8c + # 0 + mov r1,0x00010100 # -ve [ra8 delay] + ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset + ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 + # 7 + shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 + ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address + ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 + # 3 + ror ra2.8d, rb_y_coeffs_3, ra8.8d + ror ra0.8d, rb_y_coeffs_3, ra8.8c + # 5 + ror ra3.8b, rb_y_coeffs_5, ra8.8d + ror ra1.8b, rb_y_coeffs_5, ra8.8c + # 6 + mov r1,0x04040100 + ror ra3.8c, r1, ra8.8d + ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val + + bra -, ra_link + # 4 + mov r1,0x3a281100 + ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val + ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 +# >>> branch
ra_link + +# r5 = -8 +# r2 = fir_off_val +# r3 = MUL_ADD (ra_kmul_add) +.endm + +:per_block_setup_8 + m_per_block_setup 8 + + + +################################################################################ +# +# mc_filter_y_pxx +# +# Setup (& therefore uniform struct) shared with _bxx +# Struct in m_luma_setup +# +# We can have 2 separate P reqs here as long as they mate to generate a +# rectangular output block (i.e. h0 = h1, w0 = 8) +# +# At this point we have already issued PREREAD pairs of texture requests for the current block + +.macro m_filter_y_pxx, v_bit_depth + +# denom shift values +.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) +.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) + + m_luma_setup v_bit_depth + + shl r1, ra_wt_off_l0, i_wt_den_p5 + add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul + sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 + +# retrieve texture results and pick out bytes +# then submit two more texture requests + +# This loop is identical to the B loop from here ---> +:1 + add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef + + max r2, ra_y, 0 ; mov r1, 0 + min r2, r2, rb_max_y ; mov r3, ra_k1 + add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 + add t0s, ra_base, r2 ; mov rb5, rb6 + shr r0, r4, ra_xshift ; mov rb6, rb7 + + max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes + shr r1, r4, rb_xshift2 ; mov rb7, ra8 + min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax + add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch + add t1s, rb_base2, r2 ; mov ra8, ra9 + +# apply horizontal filter + add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 + mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 + sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24
r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 + add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + + brr.anyn -, r:1b + sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b + mov ra9, rb10 ; mul24 r0, rb10, ra3.8b + asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 + # >>> .anyn 1b (r5 + r5) + + # apply vertical filter and write to VPM + # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 + + sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb8 + add r1, r1, r0 ; mul24 r0, rb10, ra3.8c + add r1, r1, r0 ; mul24 r0, ra11, rb11 +# <--- to here + sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height + sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next + sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next + + asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next + sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 + add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add + sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) + + brr.anyn -, r:1b + asr r1, r1, i_wt_den_p6 + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop +# >>> branch.anyn 1b (r5 - rb_lcount) + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed block_height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new 
segment height + + mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer +# >>> 1b +.endm + +::mc_filter_y_pxx + m_filter_y_pxx 8 + + +################################################################################ + +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# +# Setup (& therefore uniform struct) shared with _pxx +# Struct in m_luma_setup +# +# l0 calc in els 0-7, L1 in 8-15 +# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) +# +# At this point we have already issued PREREAD pairs of texture requests for the current block + +.macro m_filter_y_bxx, v_bit_depth + +# denom shift values +.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) +.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) + + m_luma_setup v_bit_depth + + shl r1, ra_wt_off_l0, i_wt_den_p6 + add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 + sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 + sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 + +# This loop is identical to the P loop from here ---> +:1 + add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef + + max r2, ra_y, 0 ; mov r1, 0 + min r2, r2, rb_max_y ; mov r3, ra_k1 + add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 + add t0s, ra_base, r2 ; mov rb5, rb6 + shr r0, r4, ra_xshift ; mov rb6, rb7 + + max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes + shr r1, r4, rb_xshift2 ; mov rb7, ra8 + min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax + add ra_y2, ra_y2, 
r3 ; mul24 r2, r2, rb_pitch + add t1s, rb_base2, r2 ; mov ra8, ra9 + +# apply horizontal filter + add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 + mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 + sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 + nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 + sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 + nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 + add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 + add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + + brr.anyn -, r:1b + sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b + mov ra9, rb10 ; mul24 r0, rb10, ra3.8b + asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 + # >>> .anyn 1b (r5 + r5) + + # apply vertical filter and write to VPM + # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 + + sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c + sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d + add r1, r1, r0 ; mul24 r0, ra8, rb8 + add r1, r1, r0 ; mul24 r0, rb10, ra3.8c + add r1, r1, r0 ; mul24 r0, ra11, rb11 +# <--- to here + sub r1, r1, ra4 + sub r1, r1, r0 ; mov r2, rb_wt_off + + asr r1, r1, 6 + sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 + mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add + sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next + sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next + add r1, r1, r2 ; mov r0, r1 << 8 + add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: 
r3 = block height + + brr.anyn -, r:1b + asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) +# >>> branch.anyn 1b (r5 - rb_lcount) + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed block_height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new segment height + + mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link (ra_height - remaining height) + +# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer +# >>> 1b +.endm + +::mc_filter_y_bxx + m_filter_y_bxx 8 + +################################################################################ +# +# typedef struct qpu_mc_pred_y_p00_s { +# qpu_mc_src_t next_src1; +# uint16_t h; +# uint16_t w; +# uint32_t wo1; +# uint32_t dst_addr; +# uint32_t next_fn; +# } qpu_mc_pred_y_p00_t; + +.macro m_filter_y_p00, v_bit_depth + +.if v_bit_depth <= 8 +.set v_x_shift, 0 +.set v_x_mul, 1 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 7 +.set v_dma_wh_shift, i_shift16 +.else +.set v_x_shift, 1 +.set v_x_mul, 2 +# Shifts to get width & height in the right place in ra_dma0 +.set v_dma_h_shift, 8 +.set v_dma_wh_shift, 15 +.endif + + mov ra0, unif ; mov r0, elem_num # y_x + mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 + add r0, ra0.16b, r0 ; mov 
ra_base_next, unif # ; src1.base +.if v_x_shift != 0 + shl r0, r0, v_x_shift +.endif + + max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height + min r0, r0, rb_max_x ; mov ra_width_height, unif + + shl ra_xshift_next, r0, 3 # Compute shifts + and r0, r0, -4 + sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch + add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr + add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write + +# get width,height of block (unif load above) +# Compute vdw_setup1(dst_pitch-width) + shl r1, ra_width, v_x_shift + sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height + sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height + shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 + add r0, r0, r1 # Combine width and height of destination area + shl rb_wt_off, ra_wt_off_l0, DENOM + 7 + shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link + add ra_dma0, r0, rb_dma0_base + +:1 + sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 + nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y + min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 + add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask + + sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 + shl r1, r1, 8 ; mov r3, ra_blk_height + add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + brr.anyn -, r:1b + asr r1, r1, DENOM + 8 + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +# >>> branch.anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed block_height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new segment height + + mov.setf 
ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer +# >>> 1b +.endm + +::mc_filter_y_p00 + m_filter_y_p00 8 + +################################################################################ + +.macro m_filter_y_b00, v_bit_depth +# luma setup does a fair bit more than we need calculating filter coeffs +# that we will never use but it saves I-cache to use it (also simple!) + m_luma_setup v_bit_depth + +# Fix up vals that were expecting a filter (somewhat icky) + mov r2, 1 + add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want + shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero + nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 + +:1 + sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 + shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 + shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y + min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 + add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next + + max r2, ra_y2, 0 + min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 + add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8min masks out all but bottom byte + and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 + + sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 + add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 + + shl r1, r1, 8 ; mov r3, ra_blk_height + add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + brr.anyn -, r:1b + asr r1, r1, 
(DENOM + 9) - 32 # -32 to get valid shift immediate + min r1, r1, ra_pmax ; mov -, vw_wait + max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +# >>> branch.anyn 1b + +# r0 = remaining height (min 0) +# r2 = r3 * rb_pitch +# r3 = block_height + +# If looping again then we consumed block_height last loop +# rb_dma1 (stride) remains constant +# rb_i_tmu remains const (based on total height) +# recalc ra_dma0, rb_lcount based on new segment height + + mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 + +# DMA out + bra.anyz -, ra_link + min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride + sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW + shl r1, r1, i_shift23 +# >>> .anyz ra_link + +# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve +# We add to dma0 to reduce the number of output lines in the final block + brr -, r:1b + add rb_lcount, rb_lcount, r0 + add ra_dma0, ra_dma0, r1 + add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer +# >>> 1b +.endm + +::mc_filter_y_b00 + m_filter_y_b00 8 + +################################################################################ +################################################################################ +# 10 BIT + +::mc_setup_c10_q0 + m_setup_q0 +::mc_setup_c10_qn + m_setup_c 10 + +::mc_filter_c10_p + m_filter_c_p 0, 10 + +::mc_filter_c10_p_l1 + m_filter_c_p 1, 10 + + +::mc_filter_c10_b + m_filter_c_b 10 + +# Even if these fns are the same as for other bit depths we want our own copy +# to keep the code we are using in a single lump to avoid (direct map) cache +# thrashing +.set v_quads10, N_QPU_16 / 4 + +::mc_sync10_q0 + m_sync_q 0, v_quads10 +::mc_sync10_q1 + m_sync_q 1, v_quads10 +::mc_sync10_q2 + m_sync_q 2, v_quads10 +::mc_sync10_q3 + m_sync_q 3, v_quads10 +::mc_sync10_q4 + m_sync_q 4, v_quads10 +::mc_sync10_q5 + m_sync_q 5, v_quads10 +::mc_sync10_q6 + m_sync_q 6, v_quads10 +::mc_sync10_q7 + m_sync_q 7, v_quads10 +::mc_sync10_q8 + m_sync_q 8, v_quads10 +::mc_sync10_q9 + 
m_sync_q 9, v_quads10 +::mc_sync10_q10 + m_sync_q 10, v_quads10 +::mc_sync10_q11 + m_sync_q 11, v_quads10 + +::mc_exit_y10_q0 +::mc_exit_c10_q0 + m_exit_q0 + +::mc_exit_y10_qn +::mc_exit_c10_qn + m_exit_qn + +::mc_setup_y10_q0 + m_setup_q0 +::mc_setup_y10_qn + m_setup_y 10 + +:per_block_setup_10 + m_per_block_setup 10 + +::mc_filter_y10_pxx + m_filter_y_pxx 10 + +::mc_filter_y10_p00 + m_filter_y_p00 10 + +::mc_filter_y10_bxx + m_filter_y_bxx 10 + +::mc_filter_y10_b00 + m_filter_y_b00 10 + + + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h new file mode 100644 index 0000000000..89711d776b --- /dev/null +++ b/libavcodec/rpi_hevc_shader_cmd.h @@ -0,0 +1,165 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + +#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y +// If mixed then we are just confused and get a lot of warnings.... +typedef const uint8_t * qpu_mc_src_addr_t; +typedef uint8_t * qpu_mc_dst_addr_t; +#else +typedef uint32_t qpu_mc_src_addr_t; +typedef uint32_t qpu_mc_dst_addr_t; +#endif + +typedef struct qpu_mc_src_s +{ + int16_t y; + int16_t x; + qpu_mc_src_addr_t base; +} qpu_mc_src_t; + + +typedef struct qpu_mc_pred_c_p_s { + qpu_mc_src_t next_src; + uint16_t h; + uint16_t w; + uint32_t coeffs_x; + uint32_t coeffs_y; + uint32_t wo_u; + uint32_t wo_v; + qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_p_t; + +typedef struct qpu_mc_pred_c_b_s { + qpu_mc_src_t next_src1; + uint16_t h; + uint16_t w; + uint32_t coeffs_x1; + uint32_t coeffs_y1; + int16_t weight_u1; + int16_t weight_v1; + qpu_mc_src_t next_src2; + uint32_t coeffs_x2; + uint32_t coeffs_y2; + uint32_t wo_u2; + uint32_t wo_v2; + qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; +} qpu_mc_pred_c_b_t; + +typedef struct qpu_mc_pred_c_s_s { + qpu_mc_src_t next_src1; + uint32_t pic_cw; // C Width (== Y width / 2) + uint32_t pic_ch; // C Height (== Y Height / 2) + uint32_t stride2; + uint32_t stride1; + qpu_mc_src_t next_src2; + uint32_t next_fn; +} qpu_mc_pred_c_s_t; + +typedef struct qpu_mc_pred_c_s { + union { + qpu_mc_pred_c_p_t p; + qpu_mc_pred_c_b_t b; + qpu_mc_pred_c_s_t s; + }; +} qpu_mc_pred_c_t; + + 
+typedef struct qpu_mc_pred_y_p_s { + qpu_mc_src_t next_src1; + qpu_mc_src_t next_src2; + uint16_t h; + uint16_t w; + uint32_t mymx21; + uint32_t wo1; + uint32_t wo2; + qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p_t; + +typedef struct qpu_mc_pred_y_p00_s { + qpu_mc_src_t next_src1; + uint16_t h; + uint16_t w; + uint32_t wo1; + qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; +} qpu_mc_pred_y_p00_t; + +typedef struct qpu_mc_pred_y_s_s { + qpu_mc_src_t next_src1; + qpu_mc_src_t next_src2; + uint16_t pic_h; + uint16_t pic_w; + uint32_t stride2; + uint32_t stride1; + uint32_t next_fn; +} qpu_mc_pred_y_s_t; + +typedef struct qpu_mc_pred_sync_s { + uint32_t next_fn; +} qpu_mc_pred_sync_t; + +// Only a useful structure in that it allows us to return something other than a void * +typedef struct qpu_mc_pred_y_s { + union { + qpu_mc_pred_y_p_t p; + qpu_mc_pred_y_p00_t p00; + qpu_mc_pred_y_s_t s; + }; +} qpu_mc_pred_y_t; + +typedef union qpu_mc_pred_cmd_u { + qpu_mc_pred_y_t y; + qpu_mc_pred_c_t c; + qpu_mc_pred_sync_t sync; +} qpu_mc_pred_cmd_t; + +static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) +{ + // The link (next_fn) is the last 32-bit element of the previous cmd, i.e. the word just before cmd + ((uint32_t *)cmd)[-1] = fn; +} + +#define QPU_MC_PRED_N_Y8 12 +#define QPU_MC_PRED_N_C8 12 + +#define QPU_MC_PRED_N_Y10 12 +#define QPU_MC_PRED_N_C10 12 + +#define QPU_MC_DENOM 7 + +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c new file mode 100644 index 0000000000..77d8366eb8 --- /dev/null +++ b/libavcodec/rpi_hevc_shader_template.c @@ -0,0 +1,88 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "hevc.h" +#include "rpi_hevcdec.h" +#include "libavutil/rpi_sand_fns.h" +#include "rpi_hevc_shader_cmd.h" +#include "rpi_hevc_shader_template.h" + +typedef struct shader_track_s +{ + const union qpu_mc_pred_cmd_u *qpu_mc_curr; + const struct qpu_mc_src_s *last_l0; + const struct qpu_mc_src_s *last_l1; + uint32_t width; // pic_width * PW + uint32_t height; + uint32_t stride2; + uint32_t stride1; +} shader_track_t; + +static int wtoidx(const unsigned int w) // block width -> index into the hevcdsp function tables; w must be <= 64 +{ + static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; // widths not listed map to 0 + return pel_weight[w]; +} + +static int fctom(uint32_t x) +{ + int rv; + // As it happens we can take the 2nd filter term & divide it by 8 + // (dropping fractions) to get the fractional move + rv = 8 - ((x >> 11) & 0xf); + av_assert2(rv >= 0 && rv <= 7); + return rv; +} + +static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) // extract a signed bit-field: shift left by shl, then arithmetic shift right by shr +{ + return (int32_t)((uint32_t)x << shl) >> shr; // left shift done as unsigned: << on a negative int32_t is undefined behaviour +} + +static inline int woff_p(HEVCRpiContext *const s, int32_t x) // x arithmetically shifted right by 17 + (bit_depth - 8) +{ + return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); +} + +static inline int woff_b(HEVCRpiContext *const s, int32_t x) // (x - 0x10000) arithmetically shifted right by 16 + (bit_depth - 8) +{ + return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); +} + +static inline int wweight(int32_t x) // low 16 bits of x, sign-extended +{ + return ext(x, 16, 16); +} + + +#define PW 1 +#include "rpi_hevc_shader_template_fn.h" + +#undef PW +#define PW 2 +#include "rpi_hevc_shader_template_fn.h" + +diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h new file mode 100644 index 0000000000..0fc5a45e9f --- /dev/null +++ b/libavcodec/rpi_hevc_shader_template.h @@ -0,0 +1,49 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H +#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H + +struct HEVCRpiContext; +struct HEVCRpiInterPredEnv; + +void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, + const struct HEVCRpiInterPredEnv *const ipe_y, + const struct HEVCRpiInterPredEnv *const ipe_c); + +void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, + const struct HEVCRpiInterPredEnv *const ipe_y, + const struct HEVCRpiInterPredEnv *const ipe_c); + +void rpi_sand_dump8(const char * const name, + const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); + +void rpi_sand_dump16(const char * const name, + const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); + +#endif + diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h new file mode 100644 index 0000000000..10c163a4b9 --- /dev/null +++ b/libavcodec/rpi_hevc_shader_template_fn.h @@ -0,0 +1,502 @@ +/* +Copyright (c) 2017 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define STRCAT(x,y) x##y + +#if PW == 1 +#define pixel uint8_t +#define FUNC(f) STRCAT(f, 8) +#elif PW == 2 +#define pixel uint16_t +#define FUNC(f) STRCAT(f, 16) +#else +#error Unexpected PW +#endif + +#define PATCH_STRIDE (16 * PW) + +static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +{ + for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { + const pixel s = *(const pixel *)src; + pixel * d = (pixel *)dst; + for (unsigned int j = 0; j < w; j += PW) { + *d++ = s; + } + } +} + +static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +{ + for (unsigned int i = 0; i != h; ++i, dst += stride) { + memcpy(dst, src, w); + } +} + +static void FUNC(get_patch_y)(const shader_track_t * const st, + uint8_t * dst, const unsigned int dst_stride, + const qpu_mc_src_t *src, + unsigned int _w, unsigned int _h) +{ + int x = src->x * PW; + int y = src->y; + int w = _w * PW; + int h = _h; + int dl = 0; + int dr = 0; + int dt = 0; + int db = 0; + + if (x < 0) { + if (-x >= w) + x = PW - w; + dl = -x; + w += x; + x = 0; + } + if (x + w > st->width) { + if (x >= st->width) + x = st->width - 
PW; + dr = (x + w) - st->width; + w = st->width - x; + } + + // Y + if (y < 0) { + if (-y >= h) + y = 1 - h; + dt = -y; + h += y; + y = 0; + } + if (y + h > st->height) { + if (y >= st->height) + y = st->height - 1; + db = (y + h) - st->height; + h = st->height - y; + } + + dst += dl + dt * dst_stride; + FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); + + // Edge dup + if (dl != 0) + FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); + if (dr != 0) + FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); + w += dl + dr; + dst -= dl; + + if (dt != 0) + FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); + if (db != 0) + FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); +} + + + +static void FUNC(get_patch_c)(const shader_track_t * const st, + uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, + const qpu_mc_src_t *src, + unsigned int _w, unsigned int _h) +{ + int x = src->x * PW; + int y = src->y; + int w = _w * PW; + int h = _h; + int dl = 0; + int dr = 0; + int dt = 0; + int db = 0; + const int width = st->width; + const int height = st->height; + + if (x < 0) { + if (-x >= w) + x = PW - w; + dl = -x; + w += x; + x = 0; + } + if (x + w > width) { + if (x >= width) + x = width - PW; + dr = (x + w) - width; + w = width - x; + } + + // Y + if (y < 0) { + if (-y >= h) + y = 1 - h; + dt = -y; + h += y; + y = 0; + } + if (y + h > height) { + if (y >= height) + y = height - 1; + db = (y + h) - height; + h = height - y; + } + + dst_u += dl + dt * dst_stride; + dst_v += dl + dt * dst_stride; + FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); + + // Edge dup + if (dl != 0) + { + FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); + FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); + } + if (dr != 0) + { + FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); + 
FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); + } + w += dl + dr; + dst_u -= dl; + dst_v -= dl; + + if (dt != 0) + { + FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); + FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); + } + if (db != 0) + { + FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); + FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); + } +} + +// x, y, w, h in pixels +// stride1, stride2 in bytes +void FUNC(rpi_sand_dump)(const char * const name, + const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) +{ + const int mask = stride2 == 0 ? ~0 : stride1 - 1; + + printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); + + if (is_c) { + x *= 2; + w *= 2; + } + + for (int i = y; i != y + h; ++i) { + for (int j = x; j != x + w; ++j) { + const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; + char sep = is_c && (j & 1) == 0 ? ':' : ' '; +#if PW == 1 + if (j < 0 || i < 0) + printf("..%c", sep); + else + printf("%02x%c", *(const pixel*)p, sep); +#else + if (j < 0 || i < 0) + printf("...%c", sep); + else + printf("%03x%c", *(const pixel*)p, sep); +#endif + } + printf("\n"); + } +} + + +void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, + const HEVCRpiInterPredEnv *const ipe_y, + const HEVCRpiInterPredEnv *const ipe_c) +{ + for (int c_idx = 0; c_idx < 2; ++c_idx) + { + const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; + shader_track_t tracka[QPU_N_MAX] = {{NULL}}; + unsigned int exit_n = 0; + + if (ipe == NULL || !ipe->used) { + continue; + } + + do { + for (unsigned int i = 0; i != ipe->n; ++i) { + const HEVCRpiInterPredQ * const q = ipe->q + i; + shader_track_t * const st = tracka + i; + const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; + + for (;;) { + const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; + + if (link == q->code_setup) { + if (c_idx == 0) { + // Luma + const qpu_mc_pred_y_s_t *const c = &cmd->y.s; + + st->height = c->pic_h; + st->width = c->pic_w * PW; + st->stride1 = c->stride1; + st->stride2 = c->stride2; + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else { + // Chroma + const qpu_mc_pred_c_s_t *const c = &cmd->c.s; + + st->height = c->pic_ch; + st->width = c->pic_cw * PW; + st->stride1 = c->stride1; + st->stride2 = c->stride2; + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + } + else if (link == s->qpu.y_pxx) { + const qpu_mc_pred_y_p_t *const c = &cmd->y.p; + const int w1 = FFMIN(c->w, 8); + const int w2 = c->w - w1; + + uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + + FUNC(get_patch_y)(st, + patch_y1, PATCH_STRIDE, + st->last_l0, + 16, c->h + 7); + if (w2 > 0) { + FUNC(get_patch_y)(st, + patch_y2, PATCH_STRIDE, + st->last_l1, + 16, c->h + 7); + } + + // wo[offset] = offset*2+1 + s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); + if (w2 > 0) { + s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( + (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); + } + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.y_bxx) { + const qpu_mc_pred_y_p_t 
*const c = &cmd->y.p; + + uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; + + FUNC(get_patch_y)(st, + patch_y1, PATCH_STRIDE, + st->last_l0, + 16, c->h + 7); + FUNC(get_patch_y)(st, + patch_y2, PATCH_STRIDE, + st->last_l1, + 16, c->h + 7); + + s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( + patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, + c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); + + s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, + c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), + 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.y_p00) { + const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; + + uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + + FUNC(get_patch_y)(st, + patch_y1, PATCH_STRIDE, + st->last_l0, + 16, c->h + 7); + + // wo[offset] = offset*2+1 + s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); + + st->last_l0 = &c->next_src1; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.y_b00) { + const qpu_mc_pred_y_p_t *const c = &cmd->y.p; + + uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; + + av_assert0(c->w <= 16 && c->h <= 64); + + FUNC(get_patch_y)(st, + patch_y1, 
PATCH_STRIDE, + st->last_l0, + 16, c->h); + FUNC(get_patch_y)(st, + patch_y2, PATCH_STRIDE, + st->last_l1, + 16, c->h); + + s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( + patch_y3, patch_y1, PATCH_STRIDE, + c->h, 0, 0, c->w); + + s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( + (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, + c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), + 0, woff_b(s, c->wo2), 0, 0, c->w); + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.c_pxx) { + const qpu_mc_pred_c_p_t *const c = &cmd->c.p; + const int mx = fctom(c->coeffs_x); + const int my = fctom(c->coeffs_y); + + uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_u3[8 * 16 * PW]; + uint8_t patch_v3[8 * 16 * PW]; + + FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); + + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + st->last_l0 = &c->next_src; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.c_pxx_l1) { + const qpu_mc_pred_c_p_t *const c = &cmd->c.p; + const int mx = fctom(c->coeffs_x); + const int my = fctom(c->coeffs_y); + + uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + uint8_t 
patch_u3[8 * 16 * PW]; + uint8_t patch_v3[8 * 16 * PW]; + + FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); + + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); + s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( + patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + st->last_l1 = &c->next_src; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == s->qpu.c_bxx) { + const qpu_mc_pred_c_b_t *const c = &cmd->c.b; + const int mx1 = fctom(c->coeffs_x1); + const int my1 = fctom(c->coeffs_y1); + const int mx2 = fctom(c->coeffs_x2); + const int my2 = fctom(c->coeffs_y2); + + uint8_t patch_u1[PATCH_STRIDE * 72]; + uint8_t patch_v1[PATCH_STRIDE * 72]; + uint8_t patch_u2[PATCH_STRIDE * 72]; + uint8_t patch_v2[PATCH_STRIDE * 72]; + uint8_t patch_u3[8 * 16 * PW]; + uint8_t patch_v3[8 * 16 * PW]; + uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; + uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; + + FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); + FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); + + s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( + patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, mx1, my1, c->w); + s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( + patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, + c->h, mx1, my1, c->w); + + s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( + patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, + c->h, QPU_MC_DENOM, c->weight_u1, 
wweight(c->wo_u2), + 0, woff_b(s, c->wo_u2), mx2, my2, c->w); + s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( + patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, + c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), + 0, woff_b(s, c->wo_v2), mx2, my2, c->w); + + FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + st->last_l0 = &c->next_src1; + st->last_l1 = &c->next_src2; + cmd = (const qpu_mc_pred_cmd_t *)(c + 1); + } + else if (link == q->code_sync) { + cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); + break; + } + else if (link == q->code_exit) { + // We expect exit to occur without other sync + av_assert0(i == exit_n); + ++exit_n; + break; + } + else { + av_assert0(0); + } + } + + st->qpu_mc_curr = cmd; + } + } while (exit_n == 0); + } +} + +#undef FUNC +#undef pixel + diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 index 0000000000..3caef20137 --- /dev/null +++ b/libavcodec/rpi_hevc_transform.s @@ -0,0 +1,444 @@ +# ****************************************************************************** +# Argon Design Ltd. +# (c) Copyright 2015 Argon Design Ltd. All rights reserved. +# +# Module : HEVC +# Author : Peter de Rivaz +# ****************************************************************************** + +# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) +# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) +.set USE_STACK, 0 + +# Lines that fail to assemble start with #: +# The script insert_magic_opcodes.sh inserts the machine code directly for these. 
+# HEVC VPU Transform +# +# Transform matrix can be thought of as +# output row vector = input row vector * transMatrix2 +# +# The even rows of the matrix are symmetric +# The odd rows of the matrix are antisymmetric +# +# So only need to compute the first half of the results, then can compute the remainder with a butterfly +# +# EXAMPLE +# (a b c d) (1 2 2 1) +# (3 4 -4 -3) +# (5 6 6 5) +# (7 8 -8 -7) +# +# x=(a c)(1 2) = 1a+5c 2a+6c +# (5 6) +# +# y=(b d)(3 4) = 3b+7d 4b+8d +# (7 8) +# +# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d +# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d +# +# Final results are (u , v[::-1]) +# +# +# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) +# Apply the even matrix first and stop before rounding +# Then apply the odd matrix in a full manner: +# +# First step is to compute partial products with the first input (16 cycles) +# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output +# 2a 4b 6c 8d +# 2a -4b 6c -8d +# 1a -3b 5c -7d +# +# Second step is to sum partial products into final position (8 cycles) +# 1a+3b+5c+7d +# 2a+4b+6c+8d +# 2a-4b+6c-8d +# 1a-3b+5c-7d +# +# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) +# +# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) +# +# For 8x8 we could compute two in parallel. 
+# +# + +# Columns are transformed first +# +# Store top left half of transMatrix2 in HX(32,0) +# Store bottom left half of transMatrix2 in HX(32,32) +# +# For 16x16 +# HX(0:15,0) contains input data before transform +# HY(0:15,0) contains 32bit output data after transform +# HX(32,0) contains even rows of left half of transMatrix2 +# HX(32,32) contains odd rows of left half of transMatrix2 +# HY(48,0) contains partial products ready for summing +# + + +# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) +# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) +# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) +# num: number of 16x16 transforms to be done +# coeffs32 +# num32: number of 32x32 transforms +# command 0 for transform, 1 for memclear16(int16_t *dst,num16) +# + +.equ TRANS_SHIFT, 20 - BIT_DEPTH +.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) +.equ TRANS_ASL2, 16 - TRANS_SHIFT + + +hevc_trans_16x16: + push r6-r15, lr # TODO cut down number of used registers + mov r14,r3 # coeffs32 + mov r15,r4 # num32 + mov r3, 16*2 # Stride of transMatrix2 in bytes + vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix + + add r0, 16*16*2 # For 32x32 transforms we also need this matrix + vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix + + # Now use r0 to describe which matrix we are working on. + # Allows us to prefetch the next block of coefficients for efficiency. 
+ mov r0,0 # This describes the location where we read our coefficients from + mov r3,16*2 # Stride of coefficients in bytes (TODO remove) + mov r7,16*16*2 # Total block size + mov r8,64*16 # Value used to swap from current to next VRF location + mov r4,64 # Constant used for rounding first pass + mov r5,TRANS_RND2 # Constant used for rounding second pass + + sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack + + add r11,sp,64 # Space for 32 bytes before, and rounding + lsr r11,5 + lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 + + lsr r10, r2, 16 # Number of compressed blocks stored in top short + extu r2,16 + # At start of block r0,r1 point to the current block (that has already been loaded) + # r0 VRF location of current block + # r1 address of current block + # r2 number of 16*16 transforms to do + # r3 Stride of coefficients (==32) + # r4 TRANS_RND1 (64) + # r5 TRANS_RND2 + # r6 temporary used inside col_trans16 + # r7 16*16*2 total bytes in block + # r8 64*16 VRF switch locations + # r9 temporary in unpack_coeff for index + # r10 number of 16x16 transforms using compression + # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) + # r12 temporary counter in unpack_coeff + # r13 + # r14 Save information for 32 bit transform (coeffs location) + # r15 Save information for 32 bit transform (number of transforms) + cmp r2,0 + beq done16x16s +block_loop: + # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests + cmp r10,0 + mov r6, r1 + beq not_compressed + sub r10, 1 + bl unpack16x16 +not_compressed: + #mov r6,r1 # DEBUG without compress + vldh HX(0++,0)+r0,(r6 += r3) REP 16 + #eor r0,r8 + #add r1,r7 + # Prefetch the next block + #bl unpack16x16 + #vldh HX(0++,0)+r0,(r6 += r3) REP 16 + #vmov HX(0++,0)+r0,0 REP 16 # DEBUG + #eor r0,r8 + #sub r1,r7 + + # Transform the current block + bl col_trans_16 + vadd 
HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate + #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. + vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? + vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position + + bl col_trans_16 + vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate + #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. + vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) + + # Save results - note there has been a transposition during the processing so we save columns + vsth VX(0,32++)+r0, (r1 += r3) REP 16 + + # Move onto next block + eor r0,r8 + add r1,r7 + + addcmpbgt r2,-1,0,block_loop +done16x16s: + + add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack + # Now go and do any 32x32 transforms + b hevc_trans_32x32 + + pop r6-r15, pc +# This returns a value in r6 that says where to load the data from. +# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. +unpack16x16: +# Clear out destination + vmov HX(0,0)+r0,0 + mov r6, r11 + vsth HX(0,0)+r0,(r6 += r3) REP 16 + mov r5, r1 # Moving pointer to input coefficients +unpack_outer_loop: + # Loop until we find the end + vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? 
+ sub r6,r11,32 + #add r6,pc,packed_data-$ # Packed data + vsth HX(0,0)+r0,(r6) # Store into packed data + mov r12,0 +unpack_loop: + ld r4,(r6) + add r6,r6,4 + lsr r9,r4,16 # r9 is destination value + cmp r4,0 # {value,index} + extu r4,8 + beq done_unpack + sth r9,(r11, r4) + addcmpblt r12,1,8,unpack_loop +# # Read next 16 + add r5,32 + b unpack_outer_loop +done_unpack: +# # Set new load location + mov r6, r11 + #add r6,pc,unpacked_data-$ +# # Restore constants + mov r4,64 + mov r5,TRANS_RND2 +# pop r6-r15, pc + b lr + +# r1,r2,r3 r7,r8 should be preserved +# HX(0++,0)+r0 is the block to be transformed +# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients +# Use HY(48,0) for intermediate results +# r0 can be used, but should be returned to its original value at the end +col_trans_16: + add r6,r0,16 # Final value for this loop +col_trans_16_loop: + # First compute partial products for a single column + vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 + # Then sum up the results and place back + vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC + addcmpblt r0,1,r6,col_trans_16_loop + sub r0,16 # put r0 back to its original value + b lr + +col_trans_odd_16: + add r6,r0,16 # Final value for this loop +col_trans_odd_16_loop: + # First compute partial products for a single column + vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 + # Then sum up the results and place back + vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC + addcmpblt r0,1,r6,col_trans_odd_16_loop + sub r0,16 # put r0 back to its original value + b lr + +# r1/r10 input pointer +# r0,r4,r5,r6 free +# r8/r9 output storage +# +# Store packed coefficients at r9-32 +# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) +unpack32x32: +# Clear out destination + vmov HX(0,0),0 + add r0, r9, 32*32*2 # Unpacked buffer + mov r4, 32 + vsth HX(0,0),(r0 += r4) REP 64 +unpack_outer_loop32: + # Loop until we find the end + vldh HX(0,0),(r1) # TODO 
would prefetch help here while unpacking previous? + sub r6,r9,32 + #add r6,pc,packed_data-$ # Packed data + vsth HX(0,0),(r6) # Store into packed data + mov r8,0 +unpack_loop32: + ld r4,(r6) + add r6,r6,4 + lsr r5,r4,16 # r5 is destination value + cmp r4,0 # {value,index} + extu r4,10 + beq done_unpack + sth r5,(r0, r4) + addcmpblt r8,1,8,unpack_loop32 +# # Read next 16 + add r1,32 + b unpack_outer_loop32 +done_unpack32: + b lr +# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) +# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd +# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) +# num: number of 32x32 transforms to be done in low 16, number of packed in high 16 +# +# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! +hevc_trans_32x32: + mov r1,r14 # coeffs + mov r2,r15 # num + lsr r15,r15,16 # Number that are packed + extu r2,16 # Total number + + # Fetch odd transform matrix + #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) + #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix + #add r0, 16*16*2 + #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix + + mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer + mov r7, 16*16*2 # Total block size + +.if USE_STACK + # Stack base allocation + sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking + # set r8 to 32byte aligned stack pointer with 32 bytes of space before it + add r8,sp,63 + lsr r8,5 + lsl r8,5 +.else +#:version r8 + .half 0x00e8 #AUTOINSERTED + btst r8,16 +#:add r8,pc,intermediate_results-$ + .half 0xbfe8 + .half intermediate_results-($-2) + beq on_vpu1 + add r8,r8,32*32*2*2+16*2 # Move to secondary storage +on_vpu1: 
+.endif + mov r9,r8 # Backup of the temporary storage + mov r10,r1 # Backup of the coefficient buffer + + cmp r2,0 + beq done32x32s +block_loop32: + + # Transform the first 16 columns + mov r1,r10 # Input Coefficient buffer + mov r8,r9 # Output temporary storage + # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) + cmp r2,r15 + bgt not_compressed_32 + bl unpack32x32 + add r1,r9,32*32*2 # Uncompressed into temporary storage + mov r8,r9 # Transform into here +not_compressed_32: + # COLUMN TRANSFORM + mov r4, 64 # Constant used for rounding first pass + mov r5, 9 # left shift used for rounding first pass + + bl trans32 + # Transform the second 16 columns + add r8,32*16*2 + add r1,32 + bl trans32 + + # ROW TRANSFORM + mov r4, TRANS_RND2 # Constant used for rounding second pass + mov r5, TRANS_ASL2 # left shift used for rounding second pass + + mov r1,r9 # Input temporary storage + mov r8,r10 # Output Coefficient buffer + bl trans32 + # Transform the second 16 columns + add r8,32*16*2 + add r1,32 + bl trans32 + + add r10, 32*32*2 # move onto next block of coefficients + addcmpbgt r2,-1,0,block_loop32 +done32x32s: + +.if USE_STACK + add sp,sp,32*32*4+64# Restore stack +.endif + + pop r6-r15, pc + +trans32: + push lr + # We can no longer afford the VRF space to do prefetching when doing 32x32 + # Fetch the even rows + vldh HX(0++,0),(r1 += r3) REP 16 + # Fetch the odd rows + vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 + + # Transform the even rows using even matrix + mov r0, 0 # Even rows + bl col_trans_16 + + # Now transform the odd rows using odd matrix + mov r0, 64*16 # Odd rows + bl col_trans_odd_16 + + # Now apply butterfly to compute the first 16 results + vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 + vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, + vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate + # 16bit results now in HX(48,32) + mov r0,r8 + mov r6,32*2 + vsth 
VX(48,32++),(r0+=r6) REP 16 + + # Now apply butterfly to compute the second 16 results (in reverse order) + vsub HY(63,0),HY(0 ,0),HY(16,0) + vsub HY(62,0),HY(1 ,0),HY(17,0) + vsub HY(61,0),HY(2 ,0),HY(18,0) + vsub HY(60,0),HY(3 ,0),HY(19,0) + vsub HY(59,0),HY(4 ,0),HY(20,0) + vsub HY(58,0),HY(5 ,0),HY(21,0) + vsub HY(57,0),HY(6 ,0),HY(22,0) + vsub HY(56,0),HY(7 ,0),HY(23,0) + vsub HY(55,0),HY(8 ,0),HY(24,0) + vsub HY(54,0),HY(9 ,0),HY(25,0) + vsub HY(53,0),HY(10,0),HY(26,0) + vsub HY(52,0),HY(11,0),HY(27,0) + vsub HY(51,0),HY(12,0),HY(28,0) + vsub HY(50,0),HY(13,0),HY(29,0) + vsub HY(49,0),HY(14,0),HY(30,0) + vsub HY(48,0),HY(15,0),HY(31,0) + vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, + vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate + add r0,r8,32 + vsth VX(48,32++),(r0+=r6) REP 16 + pop pc + +.if USE_STACK == 0 + .balign 32 + +# .space directives generate 0's in the bin so avoid unnecessary padding by +# just setting to appropriate value +.equ intermediate_results, $+16*2 + +# Layout goes: +# +#packed_buffer: +# .space 16*2 +#intermediate_results: +# .space 32*32*2 +#unpacked_buffer: +# .space 32*32*2 +# +#packed_buffer2: +# .space 16*2 +#intermediate_results2: +# .space 32*32*2 +#unpacked_buffer2: +# .space 32*32*2 +.endif + + diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h new file mode 100644 index 0000000000..1c364492d0 --- /dev/null +++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,94 @@ +static const unsigned char rpi_hevc_transform10 [] = { +0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 +0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 +0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 +0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 +0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 +0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 +0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 +0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 
0038 +0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 +0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 +0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 +0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 +0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 +0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 +0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 +0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 +0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 +0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 +0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 +0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 +0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 +0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 +0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 +0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 +0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 +0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 +0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 +0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 +0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 +0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 +0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 +0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 +0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 +0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 +0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 +0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 +0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 +0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 +0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 +0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 +0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 +0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 +0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 
+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 +0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 +0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 +0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 +0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 +0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 +0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 +0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 +0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 +0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 +0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 +0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 +0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 +0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 +0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 +0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 +0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 +0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 +0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 +0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 +0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 +0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 +0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 +0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 +0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 +0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 +0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 +0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 +0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 +0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 +0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 +0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 +0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 +0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 +0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 
+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 +0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 +0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 +0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 +0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 +0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 +0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 +0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 +0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 +0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 +0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 +0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 +0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 +0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 +}; diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h new file mode 100644 index 0000000000..1128a2c054 --- /dev/null +++ b/libavcodec/rpi_hevc_transform8.h @@ -0,0 +1,94 @@ +static const unsigned char rpi_hevc_transform8 [] = { +0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 +0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 +0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 +0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 +0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 +0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 +0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 +0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 +0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 +0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 +0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 +0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 +0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 +0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 +0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 +0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 +0xc0, 0x0b, 0x02, 0x00, 
0x80, 0x90, 0x40, 0x00, // 0080 +0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 +0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 +0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 +0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 +0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 +0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 +0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 +0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 +0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 +0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 +0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 +0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 +0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 +0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 +0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 +0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 +0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 +0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 +0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 +0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 +0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 +0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 +0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 +0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 +0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 +0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 +0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 +0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 +0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 +0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 +0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 +0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 +0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 +0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 +0x07, 0xb0, 0x00, 0x02, 
0xe8, 0x00, 0x08, 0x6d, // 0198 +0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 +0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 +0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 +0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 +0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 +0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 +0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 +0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 +0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 +0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 +0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 +0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 +0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 +0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 +0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 +0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 +0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 +0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 +0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 +0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 +0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 +0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 +0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 +0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 +0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 +0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 +0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 +0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 +0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 +0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 +0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 +0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 +0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 +0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 +0x80, 0x03, 0xe0, 0xfb, 
0x10, 0x00, 0x4c, 0xfe, // 02b0 +0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 +0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 +0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 +0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 +0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 index 0000000000..e651e5c565 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c @@ -0,0 +1,6134 @@ +/* + * HEVC video Decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2012 - 2013 Mickael Raulet + * Copyright (C) 2012 - 2013 Gildas Cocherel + * Copyright (C) 2012 - 2013 Wassim Hamidouche + * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/common.h" +#include "libavutil/display.h" +#include "libavutil/internal.h" +#include "libavutil/mastering_display_metadata.h" +#include "libavutil/md5.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/stereo3d.h" + +#include "decode.h" +#include "bswapdsp.h" +#include "bytestream.h" +#include "golomb.h" +#include "hevc.h" +#include "rpi_hevc_data.h" +#include "rpi_hevc_parse.h" +#include "rpi_hevcdec.h" +#include "rpi_hevc_cabac_fns.h" +#include "profiles.h" +#include "hwconfig.h" + +#include "rpi_zc_frames.h" +#include "rpi_qpu.h" +#include "rpi_hevc_shader.h" +#include "rpi_hevc_shader_cmd.h" +#include "rpi_hevc_shader_template.h" +#include "rpi_zc.h" +#include "libavutil/rpi_sand_fns.h" + +#include "pthread.h" +#include + +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + +#ifndef av_mod_uintp2 +static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) +{ + return a & ((1 << p) - 1); +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif + +const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; +static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); + +#define MC_DUMMY_X (-32) +#define MC_DUMMY_Y (-32) + +// UV & Y both have min 4x4 pred (no 2x2 chroma) +// Allow for even spread +1 for setup, +1 for rounding +// As we have load sharing this can (in theory) be exceeded so we have to +// check after each CTU, but it is a good base size + +// Worst case (all 4x4) commands per CTU +#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) 
+#define QPU_C_CMD_PER_CTU_MAX (8 * 8) + +#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) + +#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) +#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) + +#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) +#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) + +// Total cmds to allocate - allow for slack & setup +#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) +#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) + +#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) +#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) + +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { + ENCODE_COEFFS( 0, 64, 0, 0), + ENCODE_COEFFS( 2, 58, 10, 2), + ENCODE_COEFFS( 4, 54, 16, 2), + ENCODE_COEFFS( 6, 46, 28, 4), + ENCODE_COEFFS( 4, 36, 36, 4), + ENCODE_COEFFS( 4, 28, 46, 6), + ENCODE_COEFFS( 2, 16, 54, 4), + ENCODE_COEFFS( 2, 10, 58, 2) +}; + +// Function arrays by QPU + +static const int * const inter_pred_setup_c_qpu[12] = { + mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, + mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, + mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn +}; + +static const int * const inter_pred_setup_c10_qpu[12] = { + mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, + mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, + mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn +}; + +static const int * const inter_pred_setup_y_qpu[12] = { + mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + 
mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, + mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn +}; + +static const int * const inter_pred_setup_y10_qpu[12] = { + mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, + mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, + mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn +}; + +static const int * const inter_pred_sync_qpu[12] = { + mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, + mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, + mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 +}; + +static const int * const inter_pred_sync10_qpu[12] = { + mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, + mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, + mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 +}; + +static const int * const inter_pred_exit_c_qpu[12] = { + mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, + mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, + mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn +}; + +static const int * const inter_pred_exit_c10_qpu[12] = { + mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, + mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, + mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn +}; + +static const int * const inter_pred_exit_y_qpu[12] = { + mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, + mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, + mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn +}; + +static const int * const inter_pred_exit_y10_qpu[12] = { + mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, + mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, + mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn +}; + +typedef struct ipe_chan_info_s +{ + const uint8_t bit_depth; + const uint8_t n; + const int * const * setup_fns; + const int * const * sync_fns; + const int 
* const * exit_fns; +} ipe_chan_info_t; + +typedef struct ipe_init_info_s +{ + ipe_chan_info_t luma; + ipe_chan_info_t chroma; +} ipe_init_info_t; + +static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) +{ + switch (ln) + { + default: // normally 0 + *b = a; + break; + case 1: + a |= a << 8; + *(uint16_t *)b = a; + b += stride; + *(uint16_t *)b = a; + break; + case 2: + a |= a << 8; + a |= a << 16; + *(uint32_t *)b = a; + b += stride; + *(uint32_t *)b = a; + b += stride; + *(uint32_t *)b = a; + b += stride; + *(uint32_t *)b = a; + break; + case 3: + { + unsigned int i; + uint64_t d; + a |= a << 8; + a |= a << 16; + d = ((uint64_t)a << 32) | a; + for (i = 0; i != 8; ++i, b += stride) + *(uint64_t *)b = d; + break; + } + case 4: + { + unsigned int i; + uint64_t d; + a |= a << 8; + a |= a << 16; + d = ((uint64_t)a << 32) | a; + for (i = 0; i != 16; ++i, b += stride) + { + *(uint64_t *)b = d; + *(uint64_t *)(b + 8) = d; + } + break; + } + } +} + +// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3 +// (4 not required) +static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) +{ + switch (ln) + { + default: // 0 or -1 + *b_u = a; + *b_l = a; + break; + case 1: + a |= a << 8; + *(uint16_t *)b_u = a; + *(uint16_t *)b_l = a; + break; + case 2: + a |= a << 8; + a |= a << 16; + *(uint32_t *)b_u = a; + *(uint32_t *)b_l = a; + break; + case 3: + a |= a << 8; + a |= a << 16; + *(uint32_t *)b_u = a; + *(uint32_t *)(b_u + 4) = a; + *(uint32_t *)b_l = a; + *(uint32_t *)(b_l + 4) = a; + break; + case 4: + a |= a << 8; + a |= a << 16; + *(uint32_t *)b_u = a; + *(uint32_t *)(b_u + 4) = a; + *(uint32_t *)(b_u + 8) = a; + *(uint32_t *)(b_u + 12) = a; + *(uint32_t *)b_l = a; + *(uint32_t *)(b_l + 4) = a; + *(uint32_t *)(b_l + 8) = a; + *(uint32_t *)(b_l + 12) = a; + break; + } +} + +static void zap_cabac_stash(uint8_t * b, const int ln) +{ + switch (ln) + { + default: // 0 + *b = 0; + break; + case 
1: + *(uint16_t *)b = 0; + break; + case 2: + *(uint32_t *)b = 0; + break; + case 3: + *(uint32_t *)b = 0; + *(uint32_t *)(b + 4) = 0; + break; + } +} + + + +// Set a small square block of bits in a bitmap +// Bits must be aligned on their size boundry (which will be true of all split CBs) +static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln) +{ + unsigned int n; + const unsigned int sh = (x & 7); + + f += (x >> 3); + + av_assert2(ln <= 3); + av_assert2((x & ((1 << ln) - 1)) == 0); + + switch (ln) + { + default: // 1 + f[0] |= 1 << sh; + break; + case 1: // 3 * 2 + n = 3 << sh; + f[0] |= n; + f[stride] |= n; + break; + case 2: // 0xf * 4 + n = 0xf << sh; + f[0] |= n; + f[stride] |= n; + f[stride * 2] |= n; + f[stride * 3] |= n; + break; + case 3: // 0xff * 8 + for (n = 0; n != 8; ++n, f += stride) + *f = 0xff; + break; + } +} + +static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 + { // 8 + .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, + .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} + }, + { // 9 + .luma = {0}, + .chroma = {0} + }, + { // 10 + .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, + .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} + } + +}; + +static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) +{ + const unsigned int n = ici->n; + const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word + + ipe->n = n; + ipe->max_fill = q1_size - ipe->min_gap; + for(unsigned int i = 0; i < n; i++) { + HEVCRpiInterPredQ * const q = ipe->q + i; + q->qpu_mc_curr = q->qpu_mc_base = + (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); + q->code_setup = qpu_fn(ici->setup_fns[i]); + q->code_sync = 
qpu_fn(ici->sync_fns[i]); + q->code_exit = qpu_fn(ici->exit_fns[i]); + } +} + +static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) +{ + av_assert0(bit_depth >= 8 && bit_depth <= 16); + + rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); +} + +// Unsigned Trivial MOD +static inline unsigned int utmod(const unsigned int x, const unsigned int n) +{ + return x >= n ? x - n : x; +} + +// returns pq->job_n++ +static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq) +{ + unsigned int const x2 = pq->job_n; + pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS); + return x2; +} + +static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) +{ + pq->terminate = 0; + pq->job_n = 0; + pq->context = s; + pq->worker = worker; + pq->psem_out = psem_out; + pq->pass_n = n; + pq->started = 0; + sem_init(&pq->sem_in, 0, 0); +} + +static void pass_queue_kill(HEVCRpiPassQueue * const pq) +{ + sem_destroy(&pq->sem_in); +} + +static inline void rpi_sem_wait(sem_t * const sem) +{ + while (sem_wait(sem) != 0) { + av_assert0(errno == EINTR); + } +} + +static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) +{ + sem_post(&pq->sem_in); +} + +static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + // Do the various passes - common with the worker code + for (unsigned int i = 0; i != RPI_PASSES; ++i) { + s->passq[i].worker(s, jb); + } +} + + +#if 0 +static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) +{ + int x; + sem_getvalue((sem_t *)&jbc->sem_out, &x); + printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); +} +#endif + + +static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc) +{ + HEVCRpiJob * jb; + HEVCRpiJobGlobal * const jbg = jbc->jbg; + + pthread_mutex_lock(&jbg->lock); + // Check local 1st + if ((jb = jbc->jb1) != NULL) + { + 
// Only 1 - very easy :-) + jbc->jb1 = NULL; + } + else + { + // Now look for global free chain + if ((jb = jbg->free1) != NULL) + { + // Found one - unlink it + jbg->free1 = jb->next; + jb->next = NULL; + } + else + { + // Out of places to look - wait for one to become free - add to Qs + + // Global + // If "good" lc then add after the last "good" el in the chain + // otherwise add to the tail + if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) + { + // Add to end as we had to wait last time or wait Q empty + if ((lc->jw_prev = jbg->wait_tail) == NULL) + jbg->wait_head = lc; + else + lc->jw_prev->jw_next = lc; + lc->jw_next = NULL; + jbg->wait_tail = lc; + } + else + { + // This is a "good" lc that we need to poke into the middle + // of the Q + // We know that the Q isn't empty and there is at least one + // !last_progess_good el in it from the previous test + + HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after + + if (p == NULL) + { + // No current good els - add to head + lc->jw_next = jbg->wait_head; + jbg->wait_head = lc; + } + else + { + lc->jw_next = p->jw_next; + p->jw_next = lc; + } + + lc->jw_next->jw_prev = lc; + lc->jw_prev = p; + } + + // If "good" then we are now the last good waiting el + if (lc->last_progress_good) + jbg->wait_good = lc; + + // Local + if ((lc->ljw_prev = jbc->lcw_tail) == NULL) + jbc->lcw_head = lc; + else + lc->ljw_prev->ljw_next = lc; + lc->ljw_next = NULL; + jbc->lcw_tail = lc; + } + } + + pthread_mutex_unlock(&jbg->lock); + + if (jb == NULL) // Need to wait + { + rpi_sem_wait(&lc->jw_sem); + jb = lc->jw_job; // Set by free code + } + + return jb; +} + + +static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) +{ + HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock + HEVCRpiJobCtl * jbc = jb->jbc_local; + HEVCRpiLocalContext * lc = NULL; + + pthread_mutex_lock(&jbg->lock); + + if (jbc != NULL) + { + 
av_assert1(jbc->jb1 == NULL); + + // Release to Local if nothing waiting there + if ((lc = jbc->lcw_head) == NULL) + jbc->jb1 = jb; + } + else + { + // Release to global if nothing waiting there + if ((lc = jbg->wait_head) == NULL) + { + jb->next = jbg->free1; + jbg->free1 = jb; + } + else + { + // ? seems somehow mildy ugly... + jbc = lc->context->jbc; + } + } + + if (lc != NULL) + { + // Something was waiting + + // Unlink + // Global + if (lc->jw_next == NULL) + jbg->wait_tail = lc->jw_prev; + else + lc->jw_next->jw_prev = lc->jw_prev; + + if (lc->jw_prev == NULL) + jbg->wait_head = lc->jw_next; + else + lc->jw_prev->jw_next = lc->jw_next; + + // Local + if (lc->ljw_next == NULL) + jbc->lcw_tail = lc->ljw_prev; + else + lc->ljw_next->ljw_prev = lc->ljw_prev; + + if (lc->ljw_prev == NULL) + jbc->lcw_head = lc->ljw_next; + else + lc->ljw_prev->ljw_next = lc->ljw_next; + + // Update good if required + if (jbg->wait_good == lc) + jbg->wait_good = lc->jw_prev; + + // Prod + lc->jw_job = jb; + sem_post(&lc->jw_sem); + } + + pthread_mutex_unlock(&jbg->lock); +} + +static void job_lc_kill(HEVCRpiLocalContext * const lc) +{ + sem_destroy(&lc->jw_sem); +} + +static void job_lc_init(HEVCRpiLocalContext * const lc) +{ + lc->jw_next = NULL; + lc->jw_prev = NULL; + lc->ljw_next = NULL; + lc->ljw_prev = NULL; + lc->jw_job = NULL; + sem_init(&lc->jw_sem, 0, 0); +} + +// Returns: +// 0 if we have waited for MV or expect to wait for recon +// 1 if we haven't waited for MV & do not need to wait for recon +static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) +{ + if (jb->waited) // reset by rpi_begin + return 0; + for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) + { + if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && + ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) + return 0; + } + return 1; +} + +// Submit job if it is full (indicated by having ctu_ts_last set >= 0) +static inline void 
worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) +{ + HEVCRpiJobCtl *const jbc = s->jbc; + HEVCRpiJob * const jb = lc->jb0; + + av_assert1(jb != NULL); + + if (jb->ctu_ts_last < 0) { + return; + } + + lc->last_progress_good = progress_good(s, jb); + jb->waited = !lc->last_progress_good; + lc->jb0 = NULL; + + if (s->offload_recon) + { + pthread_mutex_lock(&jbc->in_lock); + jbc->offloadq[jbc->offload_in] = jb; + jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); + pthread_mutex_unlock(&jbc->in_lock); + + pass_queue_submit_job(s->passq + 0); // Consumes job eventually + } + else + { + pass_queue_do_all(s, jb); // Consumes job before return + } +} + + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. +// +// Now safe against multiple callers - needed for tiles +// "normal" and WPP will only call here one at a time +static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + HEVCRpiJobCtl * const jbc = s->jbc; + + // It is legit for us to already have a job allocated - do nothing in this case + if (lc->jb0 != NULL) + return; + + if (s->offload_recon) + rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much + + lc->jb0 = job_alloc(jbc, lc); + + rpi_begin(s, lc->jb0, lc->ts); +} + +// Free up a job without submission +static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) +{ + HEVCRpiJobCtl * const jbc = s->jbc; + HEVCRpiJob * const jb = lc->jb0; + + if (jb == NULL) { + return; + } + + lc->jb0 = NULL; + + job_free(jbc, jb); + + // If offload then poke sem_out too + if (s->offload_recon) { + sem_post(&jbc->sem_out); + } +} + + +// Call this to wait for all jobs to have completed at the end of a frame +// Slightly icky as there is no clean way to wait for a sem to count up +// Not reentrant - call on main thread only +static void worker_wait(const HEVCRpiContext * const s, 
HEVCRpiLocalContext * const lc) +{ + HEVCRpiJobCtl * const jbc = s->jbc; + int i = 0; + + // We shouldn't reach here with an unsubmitted job + av_assert1(lc->jb0 == NULL); + + // If no offload then there can't be anything to wait for + if (!s->offload_recon) { + return; + } + + if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) + { + for (i = 0; i != RPI_MAX_JOBS; ++i) { + rpi_sem_wait(&jbc->sem_out); + } + for (i = 0; i != RPI_MAX_JOBS; ++i) { + sem_post(&jbc->sem_out); + } + } +} + +static void * pass_worker(void *arg) +{ + HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; + HEVCRpiContext *const s = pq->context; + + for (;;) + { + rpi_sem_wait(&pq->sem_in); + + if (pq->terminate) + break; + + pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); + // * should really set jb->passes_done here + + sem_post(pq->psem_out); + } + return NULL; +} + +static void pass_queues_start_all(HEVCRpiContext *const s) +{ + unsigned int i; + HEVCRpiPassQueue * const pqs = s->passq; + + for (i = 0; i != RPI_PASSES; ++i) + { + av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); + pqs[i].started = 1; + } +} + +static void pass_queues_term_all(HEVCRpiContext *const s) +{ + unsigned int i; + HEVCRpiPassQueue * const pqs = s->passq; + + for (i = 0; i != RPI_PASSES; ++i) + pqs[i].terminate = 1; + for (i = 0; i != RPI_PASSES; ++i) + { + if (pqs[i].started) + sem_post(&pqs[i].sem_in); + } + for (i = 0; i != RPI_PASSES; ++i) + { + if (pqs[i].started) { + pthread_join(pqs[i].thread, NULL); + pqs[i].started = 0; + } + } +} + +static void pass_queues_kill_all(HEVCRpiContext *const s) +{ + unsigned int i; + HEVCRpiPassQueue * const pqs = s->passq; + + for (i = 0; i != RPI_PASSES; ++i) + pass_queue_kill(pqs + i); +} + + +static void worker_pic_free_one(HEVCRpiJob * const jb) +{ + // Free coeff stuff - allocation not the same for all buffers + HEVCRpiCoeffsEnv * const cf = &jb->coeffs; + + if (cf->s[0].buf != NULL) + av_freep(&cf->mptr); + if 
(cf->s[2].buf != NULL) + gpu_free(&cf->gptr); + memset(cf, 0, sizeof(*cf)); +} + +static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) +{ + HEVCRpiCoeffsEnv * const cf = &jb->coeffs; + + if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) + goto fail; + cf->s[2].buf = (int16_t *)cf->gptr.arm; + cf->s[3].buf = cf->s[2].buf + coeff_count; + + // Must be 64 byte aligned for our zero zapping code so over-allocate & + // round + if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) + goto fail; + cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); + return 0; + +fail: + av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); + worker_pic_free_one(jb); + return -1; +} + +static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) +{ + unsigned int i; + for (i = 0; i != 4; ++i) { + cf->s[i].n = 0; +#if RPI_COMPRESS_COEFFS + cf->s[i].packed = 1; + cf->s[i].packed_n = 0; +#endif + } +} + +int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) +{ + HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; + int16_t * const coeffs = (buf_no != 3) ? 
cfe->buf + cfe->n : cfe->buf - (cfe->n + n); + cfe->n += n; + return coeffs; +} + +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const HEVCRpiFrame * const ref, const int val, const int field) +{ + if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { + HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; + HEVCRpiFrameProgressState * const pstate = fs->progress_states + field; + sem_t * sem = NULL; + + av_assert0(pthread_mutex_lock(&pstate->lock) == 0); + if (((volatile int *)ref->tf.progress->data)[field] < val) { + HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait; + + av_assert1(pwait->req == -1 && pwait->next == NULL); + jb->waited = 1; // Remember that we had to wait for later scheduling + + pwait->req = val; + pwait->next = NULL; + if (pstate->first == NULL) + pstate->first = pwait; + else + pstate->last->next = pwait; + pstate->last = pwait; + sem = &pwait->sem; + } + pthread_mutex_unlock(&pstate->lock); + + if (sem != NULL) { + rpi_sem_wait(sem); + } + } +} + +void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field) +{ + HEVCRpiFrameProgressState *const pstate = s->progress_states + field; + + ((int *)s->ref->tf.progress->data)[field] = val; + + av_assert0(pthread_mutex_lock(&pstate->lock) == 0); + { + HEVCRpiFrameProgressWait ** ppwait = &pstate->first; + HEVCRpiFrameProgressWait * pwait; + + while ((pwait = *ppwait) != NULL) { + if (pwait->req > val) + { + ppwait = &pwait->next; + pstate->last = pwait; + } + else + { + *ppwait = pwait->next; + pwait->req = -1; + pwait->next = NULL; + sem_post(&pwait->sem); + } + } + } + pthread_mutex_unlock(&pstate->lock); +} + +static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate) +{ + pstate->first = NULL; + pstate->last = NULL; + pthread_mutex_init(&pstate->lock, NULL); +} + +static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) 
+{ + pwait->req = -1; + pwait->next = NULL; + sem_init(&pwait->sem, 0, 0); +} + +static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) +{ + av_assert1(pstate->first == NULL); + pthread_mutex_destroy(&pstate->lock); +} + +static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) +{ + sem_destroy(&pwait->sem); +} + + +/** + * NOTE: Each function hls_foo correspond to the function foo in the + * specification (HLS stands for High Level Syntax). + */ + +/** + * Section 5.7 + */ + +// Realloc the entry point arrays +static int alloc_entry_points(RpiSliceHeader * const sh, const int n) +{ + if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) + { + // Round up alloc to multiple of 32 + int a = (n + 31) & ~31; + + // We don't care about the previous contents so probably fastest to simply discard + av_freep(&sh->entry_point_offset); + av_freep(&sh->offset); + av_freep(&sh->size); + + if (a != 0) + { + sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); + sh->offset = av_malloc_array(a, sizeof(int)); + sh->size = av_malloc_array(a, sizeof(int)); + + if (!sh->entry_point_offset || !sh->offset || !sh->size) { + sh->num_entry_point_offsets = 0; + sh->offsets_allocated = 0; + return AVERROR(ENOMEM); + } + } + + sh->offsets_allocated = a; + } + + return 0; +} + +/* free everything allocated by pic_arrays_init() */ +static void pic_arrays_free(HEVCRpiContext *s) +{ + av_freep(&s->sao); + av_freep(&s->deblock); + + av_freep(&s->cabac_stash_up); + s->cabac_stash_left = NULL; // freed with _up + + av_freep(&s->mvf_up); + av_freep(&s->mvf_left); + + av_freep(&s->is_pcm); + av_freep(&s->is_intra_store); + s->is_intra = NULL; + av_freep(&s->rpl_tab); + s->rpl_tab_size = 0; + + av_freep(&s->qp_y_tab); + av_freep(&s->tab_slice_address); + av_freep(&s->filter_slice_edges); + + av_freep(&s->bs_horizontal); + s->bs_vertical = NULL; // freed with H + av_freep(&s->bsf_stash_left); + 
av_freep(&s->bsf_stash_up); + + av_freep(&s->rpl_up); + av_freep(&s->rpl_left); + + alloc_entry_points(&s->sh, 0); + + av_buffer_pool_uninit(&s->col_mvf_pool); +} + +/* allocate arrays that depend on frame dimensions */ +static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) +{ + const unsigned int log2_min_cb_size = sps->log2_min_cb_size; + const unsigned int width = sps->width; + const unsigned int height = sps->height; + const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * + ((height >> log2_min_cb_size) + 1); + const unsigned int ctb_count = sps->ctb_size; + + { + unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); + unsigned int h = ((height + 15) & ~15); + + s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size + s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols + } + + s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly + s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); + if (!s->sao || !s->deblock) + goto fail; + + s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); + s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); + if (s->cabac_stash_up == NULL) + goto fail; + + // Round width up to max ctb size + s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); + // * Only needed if we have H tiles + s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); + + // We can overread by 1 line & one byte in deblock so alloc & zero + // We don't need to zero the extra @ start of frame as it will never be + // written + s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); + s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); + if (s->is_pcm == NULL || s->is_intra_store == NULL) + goto fail; + + s->filter_slice_edges = 
av_mallocz(ctb_count); + s->tab_slice_address = av_malloc_array(ctb_count, + sizeof(*s->tab_slice_address)); + s->qp_y_tab = av_malloc_array(pic_size_in_cb, + sizeof(*s->qp_y_tab)); + if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) + goto fail; + + s->bs_horizontal = av_mallocz(s->bs_size * 2); + s->bs_vertical = s->bs_horizontal + s->bs_size; + if (s->bs_horizontal == NULL) + goto fail; + + s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); + s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); + if (s->rpl_left == NULL || s->rpl_up == NULL) + goto fail; + + if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || + (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) + goto fail; + + s->col_mvf_stride = (width + 15) >> 4; + s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), + av_buffer_allocz); + if (s->col_mvf_pool == NULL) + goto fail; + + return 0; + +fail: + pic_arrays_free(s); + return AVERROR(ENOMEM); +} + +static void default_pred_weight_table(HEVCRpiContext * const s) +{ + unsigned int i; + const unsigned int wt = 1 << QPU_MC_DENOM; + s->sh.luma_log2_weight_denom = 0; + s->sh.chroma_log2_weight_denom = 0; + for (i = 0; i < s->sh.nb_refs[L0]; i++) { + s->sh.luma_weight_l0[i] = wt; + s->sh.luma_offset_l0[i] = 0; + s->sh.chroma_weight_l0[i][0] = wt; + s->sh.chroma_weight_l0[i][1] = wt; + s->sh.chroma_offset_l0[i][0] = 0; + s->sh.chroma_offset_l0[i][1] = 0; + } + for (i = 0; i < s->sh.nb_refs[L1]; i++) { + s->sh.luma_weight_l1[i] = wt; + s->sh.luma_offset_l1[i] = 0; + s->sh.chroma_weight_l1[i][0] = wt; + s->sh.chroma_weight_l1[i][1] = wt; + s->sh.chroma_offset_l1[i][0] = 0; + s->sh.chroma_offset_l1[i][1] = 0; + } +} + +static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, + const unsigned int refs, + int16_t * luma_weight, int16_t * luma_offset, + int16_t * chroma_weight, int16_t * chroma_offset) +{ + unsigned int 
luma_flags; + unsigned int chroma_flags; + unsigned int i; + const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); + const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; + const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; + const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; + const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); + const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); + + if (refs == 0) + return 0; + + luma_flags = get_bits(gb, refs); + chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs); + i = 1 << (refs - 1); + + do + { + if ((luma_flags & i) != 0) + { + const int delta_weight = get_se_golomb(gb); + const int offset = get_se_golomb(gb); + if (delta_weight < -128 || delta_weight > 127 || + offset < -wp_offset_half_range || offset >= wp_offset_half_range) + { + return AVERROR_INVALIDDATA; + } + *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); + *luma_offset++ = offset << wp_offset_bd_shift; + } + else + { + *luma_weight++ = luma_weight_base; + *luma_offset++ = 0; + } + + if ((chroma_flags & i) != 0) + { + unsigned int j; + for (j = 0; j != 2; ++j) + { + const int delta_weight = get_se_golomb(gb); + const int delta_offset = get_se_golomb(gb); + + if (delta_weight < -128 || delta_weight > 127 || + delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) + { + return AVERROR_INVALIDDATA; + } + + *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); + *chroma_offset++ = av_clip( + wp_offset_half_range + delta_offset - + ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), + -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; + } + } + else + { + *chroma_weight++ = chroma_weight_base; + *chroma_weight++ = chroma_weight_base; + 
*chroma_offset++ = 0; + *chroma_offset++ = 0; + } + } while ((i >>= 1) != 0); + + return 0; +} + +static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) +{ + int err; + const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); + const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); + + if (luma_log2_weight_denom > 7 || + chroma_log2_weight_denom > 7) + { + av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", + luma_log2_weight_denom, chroma_log2_weight_denom); + return AVERROR_INVALIDDATA; + } + + s->sh.luma_log2_weight_denom = luma_log2_weight_denom; + s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; + + if ((err = get_weights(s, gb, s->sh.nb_refs[L0], + s->sh.luma_weight_l0, s->sh.luma_offset_l0, + s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || + (err = get_weights(s, gb, s->sh.nb_refs[L1], + s->sh.luma_weight_l1, s->sh.luma_offset_l1, + s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) + { + av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); + return err; + } + + return 0; +} + +static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) +{ + const HEVCRpiSPS *sps = s->ps.sps; + int max_poc_lsb = 1 << sps->log2_max_poc_lsb; + int prev_delta_msb = 0; + unsigned int nb_sps = 0, nb_sh; + int i; + + rps->nb_refs = 0; + if (!sps->long_term_ref_pics_present_flag) + return 0; + + if (sps->num_long_term_ref_pics_sps > 0) + nb_sps = get_ue_golomb_long(gb); + nb_sh = get_ue_golomb_long(gb); + + if (nb_sps > sps->num_long_term_ref_pics_sps) + return AVERROR_INVALIDDATA; + if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) + return AVERROR_INVALIDDATA; + + rps->nb_refs = nb_sh + nb_sps; + + for (i = 0; i < rps->nb_refs; i++) { + uint8_t delta_poc_msb_present; + + if (i < nb_sps) { + uint8_t lt_idx_sps = 0; + + if (sps->num_long_term_ref_pics_sps > 1) + lt_idx_sps = 
get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); + + rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; + rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; + } else { + rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); + rps->used[i] = get_bits1(gb); + } + + delta_poc_msb_present = get_bits1(gb); + if (delta_poc_msb_present) { + int64_t delta = get_ue_golomb_long(gb); + int64_t poc; + + if (i && i != nb_sps) + delta += prev_delta_msb; + + poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; + if (poc != (int32_t)poc) + return AVERROR_INVALIDDATA; + rps->poc[i] = poc; + prev_delta_msb = delta; + } + } + + return 0; +} + +static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, + const HEVCRpiSPS *sps) +{ + const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; + const HEVCRpiWindow *ow = &sps->output_window; + unsigned int num = 0, den = 0; + + avctx->pix_fmt = sps->pix_fmt; + avctx->coded_width = sps->width; + avctx->coded_height = sps->height; + avctx->width = sps->width - ow->left_offset - ow->right_offset; + avctx->height = sps->height - ow->top_offset - ow->bottom_offset; + avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; + avctx->profile = sps->ptl.general_ptl.profile_idc; + avctx->level = sps->ptl.general_ptl.level_idc; + + ff_set_sar(avctx, sps->vui.sar); + + if (sps->vui.video_signal_type_present_flag) + avctx->color_range = sps->vui.video_full_range_flag ? 
AVCOL_RANGE_JPEG + : AVCOL_RANGE_MPEG; + else + avctx->color_range = AVCOL_RANGE_MPEG; + + if (sps->vui.colour_description_present_flag) { + avctx->color_primaries = sps->vui.colour_primaries; + avctx->color_trc = sps->vui.transfer_characteristic; + avctx->colorspace = sps->vui.matrix_coeffs; + } else { + avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; + avctx->color_trc = AVCOL_TRC_UNSPECIFIED; + avctx->colorspace = AVCOL_SPC_UNSPECIFIED; + } + + if (vps->vps_timing_info_present_flag) { + num = vps->vps_num_units_in_tick; + den = vps->vps_time_scale; + } else if (sps->vui.vui_timing_info_present_flag) { + num = sps->vui.vui_num_units_in_tick; + den = sps->vui.vui_time_scale; + } + + if (num != 0 && den != 0) + av_reduce(&avctx->framerate.den, &avctx->framerate.num, + num, den, 1 << 30); +} + +static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) +{ + enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; + + // Admit to no h/w formats + + *fmt++ = sps->pix_fmt; + *fmt = AV_PIX_FMT_NONE; + + return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); +} + +static int is_sps_supported(const HEVCRpiSPS * const sps) +{ + return av_rpi_is_sand_format(sps->pix_fmt) && + sps->width <= HEVC_RPI_MAX_WIDTH && + sps->height <= HEVC_RPI_MAX_HEIGHT; +} + +static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, + const enum AVPixelFormat pix_fmt) +{ + int ret; + + pic_arrays_free(s); + s->ps.sps = NULL; + s->ps.vps = NULL; + + if (sps == NULL) + return 0; + + if (!is_sps_supported(sps)) + return AVERROR_DECODER_NOT_FOUND; + + ret = pic_arrays_init(s, sps); + if (ret < 0) + goto fail; + + export_stream_params(s->avctx, &s->ps, sps); + + s->avctx->pix_fmt = pix_fmt; + + ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); + + // * We don't support cross_component_prediction_enabled_flag but as that + // must be 0 unless we have 4:4:4 there is no point testing for it as we + // only deal with sand which is never 4:4:4 + // [support wouldn't be hard] + + rpi_hevc_qpu_set_fns(s, sps->bit_depth); + + av_freep(&s->sao_pixel_buffer_h[0]); + av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled) + { + const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; + unsigned int c_idx; + size_t vsize[3] = {0}; + size_t hsize[3] = {0}; + + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> ctx_hshift(s, c_idx); + int h = sps->height >> ctx_vshift(s, c_idx); + // ctb height & width are a min of 8 so this must a multiple of 16 + // so no point rounding up! 
+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; + vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; + } + + // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] + // when we have plaited chroma + s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); + s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); + s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; + s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; + s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; + s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; + } + + s->ps.sps = sps; + s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; + + return 0; + +fail: + pic_arrays_free(s); + s->ps.sps = NULL; + return ret; +} + +static inline int qp_offset_valid(const int qp_offset) +{ + return qp_offset >= -12 && qp_offset <= 12; +} + +static int hls_slice_header(HEVCRpiContext * const s) +{ + GetBitContext * const gb = &s->HEVClc->gb; + RpiSliceHeader * const sh = &s->sh; + int i, ret; + + // Coded parameters + sh->first_slice_in_pic_flag = get_bits1(gb); + if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + if (IS_IDR(s)) + ff_hevc_rpi_clear_refs(s); + } + sh->no_output_of_prior_pics_flag = 0; + if (IS_IRAP(s)) + sh->no_output_of_prior_pics_flag = get_bits1(gb); + + sh->pps_id = get_ue_golomb_long(gb); + if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { + av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); + return AVERROR_INVALIDDATA; + } + if (!sh->first_slice_in_pic_flag && + s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { + av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); + return AVERROR_INVALIDDATA; + } + s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; + if (s->nal_unit_type == HEVC_NAL_CRA_NUT && 
s->last_eos == 1) + sh->no_output_of_prior_pics_flag = 1; + + if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { + const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; + const HEVCRpiSPS *last_sps = s->ps.sps; + enum AVPixelFormat pix_fmt; + + if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { + if (sps->width != last_sps->width || sps->height != last_sps->height || + sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != + last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) + sh->no_output_of_prior_pics_flag = 0; + } + ff_hevc_rpi_clear_refs(s); + + ret = set_sps(s, sps, sps->pix_fmt); + if (ret < 0) + return ret; + + pix_fmt = get_format(s, sps); + if (pix_fmt < 0) + return pix_fmt; + +// ret = set_sps(s, sps, pix_fmt); +// if (ret < 0) +// return ret; + + s->avctx->pix_fmt = pix_fmt; + + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + } + + sh->dependent_slice_segment_flag = 0; + if (!sh->first_slice_in_pic_flag) { + int slice_address_length; + + if (s->ps.pps->dependent_slice_segments_enabled_flag) + sh->dependent_slice_segment_flag = get_bits1(gb); + + slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); + sh->slice_segment_addr = get_bitsz(gb, slice_address_length); + if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid slice segment address: %u.\n", + sh->slice_segment_addr); + return AVERROR_INVALIDDATA; + } + + if (!sh->dependent_slice_segment_flag) { + sh->slice_addr = sh->slice_segment_addr; + s->slice_idx++; + } + } else { + sh->slice_segment_addr = sh->slice_addr = 0; + s->slice_idx = 0; + s->slice_initialized = 0; + } + + if (!sh->dependent_slice_segment_flag) { + s->slice_initialized = 0; + + for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++) + skip_bits(gb, 1); // slice_reserved_undetermined_flag[] + + sh->slice_type = get_ue_golomb_long(gb); + if (!(sh->slice_type 
== HEVC_SLICE_I || + sh->slice_type == HEVC_SLICE_P || + sh->slice_type == HEVC_SLICE_B)) { + av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", + sh->slice_type); + return AVERROR_INVALIDDATA; + } + if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { + av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); + return AVERROR_INVALIDDATA; + } + + // when flag is not present, picture is inferred to be output + sh->pic_output_flag = 1; + if (s->ps.pps->output_flag_present_flag) + sh->pic_output_flag = get_bits1(gb); + + if (s->ps.sps->separate_colour_plane_flag) + sh->colour_plane_id = get_bits(gb, 2); + + if (!IS_IDR(s)) { + int poc, pos; + + sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); + poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); + if (!sh->first_slice_in_pic_flag && poc != s->poc) { + av_log(s->avctx, AV_LOG_WARNING, + "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); + if (s->avctx->err_recognition & AV_EF_EXPLODE) + return AVERROR_INVALIDDATA; + poc = s->poc; + } + s->poc = poc; + + sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); + pos = get_bits_left(gb); + if (!sh->short_term_ref_pic_set_sps_flag) { + ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); + if (ret < 0) + return ret; + + sh->short_term_rps = &sh->slice_rps; + } else { + int numbits, rps_idx; + + if (!s->ps.sps->nb_st_rps) { + av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); + return AVERROR_INVALIDDATA; + } + + numbits = av_ceil_log2(s->ps.sps->nb_st_rps); + rps_idx = numbits > 0 ? 
get_bits(gb, numbits) : 0; + sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; + } + sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); + + pos = get_bits_left(gb); + ret = decode_lt_rps(s, &sh->long_term_rps, gb); + if (ret < 0) { + av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); + if (s->avctx->err_recognition & AV_EF_EXPLODE) + return AVERROR_INVALIDDATA; + } + sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); + + if (s->ps.sps->sps_temporal_mvp_enabled_flag) + sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); + else + sh->slice_temporal_mvp_enabled_flag = 0; + } else { + s->sh.short_term_rps = NULL; + s->poc = 0; + } + + /* 8.3.1 */ + if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && + s->nal_unit_type != HEVC_NAL_TRAIL_N && + s->nal_unit_type != HEVC_NAL_TSA_N && + s->nal_unit_type != HEVC_NAL_STSA_N && + s->nal_unit_type != HEVC_NAL_RADL_N && + s->nal_unit_type != HEVC_NAL_RADL_R && + s->nal_unit_type != HEVC_NAL_RASL_N && + s->nal_unit_type != HEVC_NAL_RASL_R) + s->pocTid0 = s->poc; + + if (s->ps.sps->sao_enabled) { + sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); + if (ctx_cfmt(s) != 0) { + sh->slice_sample_adaptive_offset_flag[1] = + sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); + } + } else { + sh->slice_sample_adaptive_offset_flag[0] = 0; + sh->slice_sample_adaptive_offset_flag[1] = 0; + sh->slice_sample_adaptive_offset_flag[2] = 0; + } + + sh->nb_refs[L0] = sh->nb_refs[L1] = 0; + if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { + int nb_refs; + + sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; + if (sh->slice_type == HEVC_SLICE_B) + sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; + + if (get_bits1(gb)) { // num_ref_idx_active_override_flag + sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; + if (sh->slice_type == HEVC_SLICE_B) + sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; + } + if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > 
HEVC_MAX_REFS) { + av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", + sh->nb_refs[L0], sh->nb_refs[L1]); + return AVERROR_INVALIDDATA; + } + + sh->rpl_modification_flag[0] = 0; + sh->rpl_modification_flag[1] = 0; + nb_refs = ff_hevc_rpi_frame_nb_refs(s); + if (!nb_refs) { + av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); + return AVERROR_INVALIDDATA; + } + + if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { + sh->rpl_modification_flag[0] = get_bits1(gb); + if (sh->rpl_modification_flag[0]) { + for (i = 0; i < sh->nb_refs[L0]; i++) + sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); + } + + if (sh->slice_type == HEVC_SLICE_B) { + sh->rpl_modification_flag[1] = get_bits1(gb); + if (sh->rpl_modification_flag[1] == 1) + for (i = 0; i < sh->nb_refs[L1]; i++) + sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); + } + } + + if (sh->slice_type == HEVC_SLICE_B) + sh->mvd_l1_zero_flag = get_bits1(gb); + + if (s->ps.pps->cabac_init_present_flag) + sh->cabac_init_flag = get_bits1(gb); + else + sh->cabac_init_flag = 0; + + sh->collocated_ref_idx = 0; + if (sh->slice_temporal_mvp_enabled_flag) { + sh->collocated_list = L0; + if (sh->slice_type == HEVC_SLICE_B) + sh->collocated_list = !get_bits1(gb); + + if (sh->nb_refs[sh->collocated_list] > 1) { + sh->collocated_ref_idx = get_ue_golomb_long(gb); + if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid collocated_ref_idx: %d.\n", + sh->collocated_ref_idx); + return AVERROR_INVALIDDATA; + } + } + } + + if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || + (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) + { + if ((ret = pred_weight_table(s, gb)) != 0) + return ret; + } + else + { + // Give us unit weights + default_pred_weight_table(s); + } + + sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); + if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 
5) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid number of merging MVP candidates: %d.\n", + sh->max_num_merge_cand); + return AVERROR_INVALIDDATA; + } + } + + sh->slice_qp_delta = get_se_golomb(gb); + + if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { + sh->slice_cb_qp_offset = get_se_golomb(gb); + sh->slice_cr_qp_offset = get_se_golomb(gb); + if (!qp_offset_valid(sh->slice_cb_qp_offset) || + !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || + !qp_offset_valid(sh->slice_cr_qp_offset) || + !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) + { + av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n", + s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset, + sh->slice_cb_qp_offset, sh->slice_cr_qp_offset); + return AVERROR_INVALIDDATA; + } + } else + { + sh->slice_cb_qp_offset = 0; + sh->slice_cr_qp_offset = 0; + } + + if (s->ps.pps->chroma_qp_offset_list_enabled_flag) + sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); + else + sh->cu_chroma_qp_offset_enabled_flag = 0; + + if (s->ps.pps->deblocking_filter_control_present_flag) { + int deblocking_filter_override_flag = 0; + + if (s->ps.pps->deblocking_filter_override_enabled_flag) + deblocking_filter_override_flag = get_bits1(gb); + + if (deblocking_filter_override_flag) { + sh->disable_deblocking_filter_flag = get_bits1(gb); + if (!sh->disable_deblocking_filter_flag) { + int beta_offset_div2 = get_se_golomb(gb); + int tc_offset_div2 = get_se_golomb(gb) ; + if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || + tc_offset_div2 < -6 || tc_offset_div2 > 6) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid deblock filter offsets: %d, %d\n", + beta_offset_div2, tc_offset_div2); + return AVERROR_INVALIDDATA; + } + sh->beta_offset = beta_offset_div2 * 2; + sh->tc_offset = tc_offset_div2 * 2; + } + } else { + sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; + sh->beta_offset = s->ps.pps->beta_offset; + sh->tc_offset = s->ps.pps->tc_offset; +
} + } else { + sh->disable_deblocking_filter_flag = 0; + sh->beta_offset = 0; + sh->tc_offset = 0; + } + + if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && + (sh->slice_sample_adaptive_offset_flag[0] || + sh->slice_sample_adaptive_offset_flag[1] || + !sh->disable_deblocking_filter_flag)) { + sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); + } else { + sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; + } + sh->no_dblk_boundary_flags = + (sh->slice_loop_filter_across_slices_enabled_flag ? 0 : + BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | + (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : + BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); + + + } else if (!s->slice_initialized) { + av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); + return AVERROR_INVALIDDATA; + } + + sh->num_entry_point_offsets = 0; + sh->offload_wpp = 0; + sh->offload_tiles = 0; + + if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { + unsigned num_entry_point_offsets = get_ue_golomb_long(gb); + // It would be possible to bound this tighter but this here is simpler + if (num_entry_point_offsets > get_bits_left(gb)) { + av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); + return AVERROR_INVALIDDATA; + } + + sh->num_entry_point_offsets = num_entry_point_offsets; + if (sh->num_entry_point_offsets > 0) { + int offset_len = get_ue_golomb_long(gb) + 1; + + if (offset_len < 1 || offset_len > 32) { + sh->num_entry_point_offsets = 0; + av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); + return AVERROR_INVALIDDATA; + } + + if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0) + { + av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); + return ret; + } + + for (i = 0; i < sh->num_entry_point_offsets; i++) { + uint32_t val_minus1 = get_bits_long(gb, offset_len); + if 
(val_minus1 > (1 << 28)) + { + // We can declare offsets of > 2^28 bad without loss of generality + // Will check actual bounds wrt NAL later, but this keeps + // the values within bounds we can deal with easily + av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); + return AVERROR_INVALIDDATA; + } + sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size + } + + // Do we want to offload this + if (s->threads_type != 0) + { + sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && + s->ps.pps->num_tile_columns > 1; + // * We only cope with WPP in a single column + // Probably want to deal with that case as tiles rather than WPP anyway + // ?? Not actually sure that the main code deals with WPP + multi-col correctly + sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && + s->ps.pps->num_tile_columns == 1; + } + } + } + + if (s->ps.pps->slice_header_extension_present_flag) { + unsigned int length = get_ue_golomb_long(gb); + if (length*8LL > get_bits_left(gb)) { + av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); + return AVERROR_INVALIDDATA; + } + for (i = 0; i < length; i++) + skip_bits(gb, 8); // slice_header_extension_data_byte + } + + // Inferred parameters + sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; + if (sh->slice_qp > 51 || + sh->slice_qp < -s->ps.sps->qp_bd_offset) { + av_log(s->avctx, AV_LOG_ERROR, + "The slice_qp %d is outside the valid range " + "[%d, 51].\n", + sh->slice_qp, + -s->ps.sps->qp_bd_offset); + return AVERROR_INVALIDDATA; + } + + if (get_bits_left(gb) < 0) { + av_log(s->avctx, AV_LOG_ERROR, + "Overread slice header by %d bits\n", -get_bits_left(gb)); + return AVERROR_INVALIDDATA; + } + + s->slice_initialized = 1; + return 0; +} + +static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) +{ + RpiSAOParams * const sao = s->sao + rx + ry * 
s->ps.sps->ctb_width; + int c_idx, i; + + if (s->sh.slice_sample_adaptive_offset_flag[0] || + s->sh.slice_sample_adaptive_offset_flag[1]) { + if ((lc->ctb_avail & AVAIL_L) != 0) + { + const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_left_flag) { + *sao = sao[-1]; + return; + } + } + if ((lc->ctb_avail & AVAIL_U) != 0) + { + const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); + if (sao_merge_up_flag) { + *sao = sao[-(int)s->ps.sps->ctb_width]; + return; + } + } + } + + for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { + const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma : + s->ps.pps->log2_sao_offset_scale_chroma; + int offset_abs[4]; + char offset_sign[4] = {0}; + + if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { + sao->type_idx[c_idx] = SAO_NOT_APPLIED; + continue; + } + + if (c_idx == 2) { + sao->type_idx[2] = sao->type_idx[1]; + sao->eo_class[2] = sao->eo_class[1]; + } else { + sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); + } + + // ** Could use BY22 here quite plausibly - this is all bypass stuff + // though only per CTB so not very timing critical + + if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) + continue; + + for (i = 0; i < 4; i++) + offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); + + if (sao->type_idx[c_idx] == SAO_BAND) { + for (i = 0; i < 4; i++) { + if (offset_abs[i] != 0) + offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); + } + sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); + } else if (c_idx != 2) { + sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc); + } + + // Inferred parameters + sao->offset_val[c_idx][0] = 0; + for (i = 0; i < 4; i++) { + sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; + if (sao->type_idx[c_idx] == SAO_EDGE) { + if (i > 1) + sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; + } else if (offset_sign[i]) { + 
sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; + } + } + } +} + +#if 0 +static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { + int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 + + if (log2_res_scale_abs_plus1 != 0) { + int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); + lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * + (1 - 2 * res_scale_sign_flag); + } else { + lc->tu.res_scale_val = 0; + } + + + return 0; +} +#endif + +static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) +{ + return jb->intra.cmds + jb->intra.n++; +} + +#define A0(x, y, U, L, UL, UR, DL) \ + [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0)) + +#define A1(x, y, U, L, UL, UR, DL) \ + A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ + A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) + +#define A2(x, y, U, L, UL, UR, DL) \ + A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ + A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) + +#define A3(x, y, U, L, UL, UR, DL) \ + A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ + A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) + +#define A4(x, y, U, L, UL, UR, DL) \ + A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ + A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) + +static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; + +unsigned int ff_hevc_rpi_tb_avail_flags( + const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, + const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) +{ + const 
unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; + const unsigned int tb_x = x & ~ctb_mask; + const unsigned int tb_y = y & ~ctb_mask; + const unsigned int ctb_avail = lc->ctb_avail; + + const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; + + unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); + + // This deals with both the U & L edges + if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) + f |= AVAIL_UL; + + if (x + w < lc->end_of_ctb_x) + f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; + else if (tb_y == 0) + f |= (ctb_avail & AVAIL_UR); +#if AVAIL_S_U - AVAIL_S_UR < 0 +#error Shift problem +#endif + + // Never any D if Y beyond eoctb + if (y + h < lc->end_of_ctb_y) + f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; +#if AVAIL_S_DL - AVAIL_S_L < 0 +#error Shift problem +#endif + +// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, +// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], +// lc->end_of_ctb_x, lc->end_of_ctb_y); + + return f; +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 + +static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, + unsigned int avail) +{ + // If rpi_enabled then sand - U & V done on U call + if (c_idx <= 1) + { + HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); + cmd->type = RPI_PRED_INTRA + c_idx; + cmd->size = log2_trafo_size; + cmd->avail = avail; + cmd->i_pred.x = x0; + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + +// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); + } +} + +#define CBF_CB0_S 0 +#define CBF_CB1_S 1 // CB1 must be CB0 + 1 +#define CBF_CR0_S 2 +#define CBF_CR1_S 3 + +#define CBF_CB0 (1 << CBF_CB0_S) +#define CBF_CR0 (1 << CBF_CR0_S) +#define CBF_CB1 (1 << CBF_CB1_S) +#define CBF_CR1 (1 << CBF_CR1_S) + +// * Only good for chroma_idx == 1 +static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_cb_size, const unsigned int log2_trafo_size, + const unsigned int blk_idx, const int cbf_luma, + const unsigned int cbf_chroma) +{ + const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); + const unsigned int x0_c = x0 & ~7; + const unsigned int y0_c = y0 & ~7; + + enum ScanType scan_idx = SCAN_DIAG; + enum ScanType scan_idx_c = SCAN_DIAG; + + if (lc->cu.pred_mode == MODE_INTRA) + { + const unsigned int trafo_size = 1 << log2_trafo_size; + const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); + + do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); + + if (log2_trafo_size > 2) + do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); + else if (blk_idx == 3) + do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, + ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); + + if (log2_trafo_size < 4) { + if (lc->tu.intra_pred_mode >= 6 && + lc->tu.intra_pred_mode <= 14) { + scan_idx = SCAN_VERT; + } else if (lc->tu.intra_pred_mode >= 22 && + lc->tu.intra_pred_mode <= 30) { + scan_idx = SCAN_HORIZ; + } + + if (lc->tu.intra_pred_mode_c >= 6 && + lc->tu.intra_pred_mode_c <= 14) { + scan_idx_c = SCAN_VERT; + } else if (lc->tu.intra_pred_mode_c >= 22 && + lc->tu.intra_pred_mode_c <= 30) { + scan_idx_c = SCAN_HORIZ; + } + } + } + + if (!cbf_luma && cbf_chroma == 0) + return 0; + + if 
(lc->tu.is_cu_qp_delta_wanted) + { + const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); + const unsigned int cb_mask = ~0U << log2_cb_size; + + if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || + qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) + { + av_log(s->avctx, AV_LOG_ERROR, + "The cu_qp_delta %d is outside the valid range " + "[%d, %d].\n", + qp_delta, + -(26 + (s->ps.sps->qp_bd_offset >> 1)), + (25 + (s->ps.sps->qp_bd_offset >> 1))); + return AVERROR_INVALIDDATA; + } + + lc->tu.is_cu_qp_delta_wanted = 0; + lc->tu.cu_qp_delta = qp_delta; + ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); + } + + // * Not main profile & untested due to no conform streams + if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && + !lc->cu.cu_transquant_bypass_flag) { + int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); + if (cu_chroma_qp_offset_flag) { + int cu_chroma_qp_offset_idx = 0; + if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { + cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); + } + lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; + lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; + } + lc->tu.cu_chroma_qp_offset_wanted = 0; + } + + if (cbf_luma) + ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); + + if (log2_trafo_size > 2 || blk_idx == 3) + { + if ((cbf_chroma & CBF_CB0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, + log2_trafo_size_c, scan_idx_c, 1); + if ((cbf_chroma & CBF_CR0) != 0) + ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, + log2_trafo_size_c, scan_idx_c, 2); + } + + return 0; +} + +static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) +{ + set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3); +} + + +static int hls_transform_tree(const HEVCRpiContext * const s, 
HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_trafo_size, + const unsigned int trafo_depth, const unsigned int blk_idx, + const unsigned int cbf_c0) +{ + // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 + unsigned int cbf_c1 = cbf_c0; + int split_transform_flag; + int ret; + + if (lc->cu.intra_split_flag) { + if (trafo_depth == 1) { + lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; + if (ctx_cfmt(s) == 3) { + lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; + lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; + } else { + lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; + lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; + } + } + } else { + lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; + lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; + lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; + } + + if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && + log2_trafo_size > s->ps.sps->log2_min_tb_size && + trafo_depth < lc->cu.max_trafo_depth && + !(lc->cu.intra_split_flag && trafo_depth == 0)) + { + split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); + } else { + int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && + lc->cu.pred_mode == MODE_INTER && + lc->cu.part_mode != PART_2Nx2N && + trafo_depth == 0; + + split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || + (lc->cu.intra_split_flag && trafo_depth == 0) || + inter_split; + } + + if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) + { + const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); + cbf_c1 = 0; + + if ((cbf_c0 & CBF_CB0) != 0) + { + cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; + if (wants_c1) + cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; + } + + if ((cbf_c0 & CBF_CR0) != 0) + { + cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << 
CBF_CR0_S; + if (wants_c1) + cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; + } + } + + if (split_transform_flag) { + const int trafo_size_split = 1 << (log2_trafo_size - 1); + const int x1 = x0 + trafo_size_split; + const int y1 = y0 + trafo_size_split; + +#define SUBDIVIDE(x, y, idx) \ +do { \ + ret = hls_transform_tree(s, lc, x, y, \ + log2_trafo_size - 1, trafo_depth + 1, idx, \ + cbf_c1); \ + if (ret < 0) \ + return ret; \ +} while (0) + + SUBDIVIDE(x0, y0, 0); + SUBDIVIDE(x1, y0, 1); + SUBDIVIDE(x0, y1, 2); + SUBDIVIDE(x1, y1, 3); + +#undef SUBDIVIDE + } else { + // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have + // trafo_size == 2 with depth == 0 the issue is moot + const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || + ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); + + ret = hls_transform_unit(s, lc, x0, y0, + log2_trafo_size + trafo_depth, log2_trafo_size, + blk_idx, cbf_luma, cbf_c1); + if (ret < 0) + return ret; + + if (!s->sh.disable_deblocking_filter_flag) { + ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); + } + } + return 0; +} + + +static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) +{ + GetBitContext gb; + int ret; + + ret = init_get_bits(&gb, pcm, length); + if (ret < 0) + return ret; + + s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + frame_stride1(s->frame, 0), + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + + s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), + s->frame->linesize[1], + cb_size >> ctx_hshift(s, 1), + cb_size >> ctx_vshift(s, 1), + &gb, s->ps.sps->pcm.bit_depth_chroma); + + return 0; +} + + +// x * 2^(y*2) +static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) +{ + return x << (y * 2); +} + +static int 
hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) +{ + // Length in bits + const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + + xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + + xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); + + const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); + + if (!s->sh.disable_deblocking_filter_flag) + ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); + + // Copy coeffs + { + const int blen = (length + 7) >> 3; + // Round allocated bytes up to nearest 32 to avoid alignment confusion + // Allocation is in int16_t s + // As we are only using 1 byte per sample and the coeff buffer allows 2 per + // sample this rounding doesn't affect the total size we need to allocate for + // the coeff buffer + int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); + memcpy(coeffs, pcm, blen); + + // Our coeff stash assumes that any partially allocated 64byte lump + // is zeroed so make that true. 
+ { + uint8_t * const eopcm = (uint8_t *)coeffs + blen; + if ((-(intptr_t)eopcm & 63) != 0) + memset(eopcm, 0, -(intptr_t)eopcm & 63); + } + + // Add command + { + HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; + cmd->i_pcm.x = x0; + cmd->i_pcm.y = y0; + cmd->i_pcm.src_len = length; + } + return 0; + } +} + + +static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, + const MvXY xy, const int y0, const int height) +{ + if (s->threads_type != 0) { + const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); + + // Progress has to be attached to current job as the actual wait + // is in worker_core which can't use lc + int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; + if (*pr < y) { + *pr = y; + } + } +} + +static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, const int nPbW, + const int nPbH, + HEVCRpiMvField * const mv) +{ + enum InterPredIdc inter_pred_idc = PRED_L0; + int mvp_flag; + const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); + + mv->pred_flag = 0; + if (s->sh.slice_type == HEVC_SLICE_B) + inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); + + if (inter_pred_idc != PRED_L1) { + MvXY mvd; + + if (s->sh.nb_refs[L0]) + mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); + + mv->pred_flag = PF_L0; + mvd = ff_hevc_rpi_hls_mvd_coding(lc); + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); + ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, + mv, mvp_flag, 0); + mv->xy[0] = mvxy_add(mv->xy[0], mvd); + } + + if (inter_pred_idc != PRED_L0) { + MvXY mvd = 0; + + if (s->sh.nb_refs[L1]) + mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); + + if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) + mvd = 
ff_hevc_rpi_hls_mvd_coding(lc); + + mv->pred_flag += PF_L1; + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); + ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, + mv, mvp_flag, 1); + mv->xy[1] = mvxy_add(mv->xy[1], mvd); + } +} + + +static HEVCRpiInterPredQ * +rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) +{ + HEVCRpiInterPredQ * yp = NULL; + HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; + const unsigned int max_fill = ipe->max_fill; + unsigned int load = UINT_MAX; + + for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { + // We will always have enough room between the Qs but if we are + // running critically low due to poor scheduling then use fill size + // rather than load to determine QPU. This has obvious dire + // performance implications but (a) it is better than crashing + // and (b) it should (almost) never happen + const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; + const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load; + + if (tload < load) + { + yp = ypt; + load = tload; + } + } + + yp->load += load_val; + ipe->used_grp = 1; + qpu_mc_link_set(yp->qpu_mc_curr, fn); + + return yp; +} + + +// Terminate every Q with its sync command and reseed each Q's load from its fill level +static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) +{ + for (unsigned int i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const q = ipe->q + i; + const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; + + qpu_mc_link_set(q->qpu_mc_curr, q->code_sync); + q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); + q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage + } +} + +// Returns 0 on success +// We no longer check for Q fullness as we have emergency code in ctu alloc +// * However it might be an idea to have some means of spotting that we've used it +static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) +{ + if (!ipe->used_grp) + return 0; + + if ((ipe->curr += ipe->n_grp) >= ipe->n) + { + ipe->curr = 0; + rpi_inter_pred_sync(ipe); + } + ipe->used = 1; + ipe->used_grp = 0; + + return 0; +} + +// Rewind every Q to its base and clear the per-Q load and last-source pointers +static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; + + ipe->curr = 0; + ipe->used = 0; + ipe->used_grp = 0; + for (i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const q = ipe->q + i; + q->qpu_mc_curr = q->qpu_mc_base; + q->load = 0; + q->last_l0 = NULL; + q->last_l1 = NULL; + } +} + +// Zero *ipe, allocate n_max zeroed Q structs and call gpu_malloc_cached for total_size. +// Returns 0 on success, AVERROR(ENOMEM) if the Q array cannot be allocated, otherwise +// the gpu_malloc_cached error (the Q array is freed again in that case). +static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, + const unsigned int n_max, const unsigned int n_grp, + const unsigned int total_size, const unsigned int min_gap) +{ + int rv; + + memset(ipe, 0, sizeof(*ipe)); + if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) + return AVERROR(ENOMEM); + + ipe->n_grp = n_grp; + ipe->min_gap = min_gap; + + if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) + av_freep(&ipe->q); + return rv; +} + + +#if RPI_QPU_EMU_Y +#define get_mc_address_y(f) ((f)->data[0]) +#else +#define get_mc_address_y(f)
get_vc_address_y(f) +#endif +#if RPI_QPU_EMU_C +#define get_mc_address_u(f) ((f)->data[1]) +#else +#define get_mc_address_u(f) get_vc_address_u(f) +#endif + +static inline uint32_t pack_wo_p(const int off, const int mul) +{ + return PACK2(off * 2 + 1, mul); +} + +static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) +{ + return PACK2(off0 + off1 + 1, mul); +} + + +static void +rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, + const int x0, const int y0, + const int nPbW, const int nPbH, + const MvXY mv_xy, + const int weight_mul, + const int weight_offset, + AVFrame *const src_frame) +{ + const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const unsigned int mx = MV_X(mv_xy) & 3; + const unsigned int my = MV_Y(mv_xy) & 3; + const unsigned int my_mx = (my << 8) | mx; + const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; + const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); + qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; + const uint32_t wo = pack_wo_p(weight_offset, weight_mul); + HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + + if (my_mx == 0) + { + const int x1 = x0 + (MV_X(mv_xy) >> 2); + const int y1 = y0 + (MV_Y(mv_xy) >> 2); + const int bh = nPbH; + + for (int start_x = 0; start_x < nPbW; start_x += 16) + { + const int bw = FFMIN(nPbW - start_x, 16); + HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); + qpu_mc_src_t *const src1 = yp->last_l0; + qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + +#if RPI_TSTATS + { + HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; + ++ts->y_pred1_x0y0; + + if (nPbW > 8) + ++ts->y_pred1_wgt8; + else + ++ts->y_pred1_wle8; + + if (nPbH > 16) + ++ts->y_pred1_hgt16; + else + ++ts->y_pred1_hle16; + } +#endif + + src1->x = x1 + start_x; + src1->y = y1; + src1->base = src_vc_address_y; + cmd_y->w = bw; + cmd_y->h = bh; + cmd_y->wo1 = wo; 
+ cmd_y->dst_addr = dst_addr + (start_x << xshl); + yp->last_l0 = &cmd_y->next_src1; + yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } + else + { + const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; + const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; + const unsigned int bh = nPbH; + int start_x = 0; + +#if 1 + // As Y-pred operates on two independent 8-wide src blocks we can merge + // this pred with the previous one if the previous one is 8 pel wide, + // the same height as the current block, immediately to the left of our + // current dest block and mono-pred. + + qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; + if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) + { + const int bw = FFMIN(nPbW, 8); + qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; + + last_y8_src2->x = x1_m3; + last_y8_src2->y = y1_m3; + last_y8_src2->base = src_vc_address_y; + last_y8_p->w += bw; + last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); + last_y8_p->wo2 = wo; + + jb->last_y8_p = NULL; + jb->last_y8_l1 = NULL; + start_x = bw; +#if RPI_TSTATS + ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; +#endif + } +#endif + + for (; start_x < nPbW; start_x += 16) + { + const int bw = FFMIN(nPbW - start_x, 16); + HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); + qpu_mc_src_t *const src1 = yp->last_l0; + qpu_mc_src_t *const src2 = yp->last_l1; + qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS + { + HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; + if (mx == 0 && my == 0) + ++ts->y_pred1_x0y0; + else if (mx == 0) + ++ts->y_pred1_x0; + else if (my == 0) + ++ts->y_pred1_y0; + else + ++ts->y_pred1_xy; + + if (nPbW > 8) + ++ts->y_pred1_wgt8; + else + ++ts->y_pred1_wle8; + + if (nPbH > 16) + ++ts->y_pred1_hgt16; + else + ++ts->y_pred1_hle16; + } +#endif + src1->x = x1_m3 + start_x; + src1->y = y1_m3; + src1->base = src_vc_address_y; + if (bw <= 8) + { + src2->x = MC_DUMMY_X; +
src2->y = MC_DUMMY_Y; +#if RPI_QPU_EMU_Y + src2->base = s->qpu_dummy_frame_emu; +#else + src2->base = s->qpu_dummy_frame_qpu; +#endif + } + else + { + src2->x = x1_m3 + start_x + 8; + src2->y = y1_m3; + src2->base = src_vc_address_y; + } + cmd_y->w = bw; + cmd_y->h = bh; + cmd_y->mymx21 = my2_mx2_my_mx; + cmd_y->wo1 = wo; + cmd_y->wo2 = wo; + cmd_y->dst_addr = dst_addr + (start_x << xshl); + yp->last_l0 = &cmd_y->next_src1; + yp->last_l1 = &cmd_y->next_src2; + yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + + if (bw == 8) { + jb->last_y8_l1 = src2; + jb->last_y8_p = cmd_y; + } + } + } +} + +static void +rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const int x0, const int y0, + const int nPbW, const int nPbH, + const struct HEVCRpiMvField *const mv_field, + const AVFrame *const src_frame, + const AVFrame *const src_frame2) +{ + const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const MvXY mv = mv_field->xy[0]; + const MvXY mv2 = mv_field->xy[1]; + + const unsigned int mx = MV_X(mv) & 3; + const unsigned int my = MV_Y(mv) & 3; + const unsigned int my_mx = (my<<8) | mx; + const unsigned int mx2 = MV_X(mv2) & 3; + const unsigned int my2 = MV_Y(mv2) & 3; + const unsigned int my2_mx2 = (my2<<8) | mx2; + const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; + const unsigned int ref_idx0 = mv_field->ref_idx[0]; + const unsigned int ref_idx1 = mv_field->ref_idx[1]; + const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); + const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); + + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; + const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); + const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); + HEVCRpiInterPredEnv * const ipe = 
&jb->luma_ip; + + if (my2_mx2_my_mx == 0) + { + const int x1 = x0 + (MV_X(mv) >> 2); + const int y1 = y0 + (MV_Y(mv) >> 2); + const int x2 = x0 + (MV_X(mv2) >> 2); + const int y2 = y0 + (MV_Y(mv2) >> 2); + const int bh = nPbH; + + // Can do chunks a full 16 wide if we don't want the H filter + for (int start_x=0; start_x < nPbW; start_x += 16) + { + HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); + qpu_mc_src_t *const src1 = yp->last_l0; + qpu_mc_src_t *const src2 = yp->last_l1; + qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS + { + HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; + ++ts->y_pred2_x0y0; + + if (nPbH > 16) + ++ts->y_pred2_hgt16; + else + ++ts->y_pred2_hle16; + } +#endif + src1->x = x1 + start_x; + src1->y = y1; + src1->base = src1_base; + src2->x = x2 + start_x; + src2->y = y2; + src2->base = src2_base; + cmd_y->w = FFMIN(nPbW - start_x, 16); + cmd_y->h = bh; + cmd_y->mymx21 = 0; + cmd_y->wo1 = wo1; + cmd_y->wo2 = wo2; + cmd_y->dst_addr = dst + (start_x << xshl); + yp->last_l0 = &cmd_y->next_src1; + yp->last_l1 = &cmd_y->next_src2; + yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } + else + { + // Filter requires a run-up of 3 + const int x1 = x0 + (MV_X(mv) >> 2) - 3; + const int y1 = y0 + (MV_Y(mv) >> 2) - 3; + const int x2 = x0 + (MV_X(mv2) >> 2) - 3; + const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; + const int bh = nPbH; + + for (int start_x=0; start_x < nPbW; start_x += 8) + { // B blocks work 8 at a time + // B weights aren't doubled as the QPU code does the same + // amount of work as it does for P + HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); + qpu_mc_src_t *const src1 = yp->last_l0; + qpu_mc_src_t *const src2 = yp->last_l1; + qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS + { + HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; + const unsigned int mmx = mx | mx2; + const unsigned int mmy = my | my2; + if (mmx == 0 && mmy == 0) + 
++ts->y_pred2_x0y0; + else if (mmx == 0) + ++ts->y_pred2_x0; + else if (mmy == 0) + ++ts->y_pred2_y0; + else + ++ts->y_pred2_xy; + + if (nPbH > 16) + ++ts->y_pred2_hgt16; + else + ++ts->y_pred2_hle16; + } +#endif + src1->x = x1 + start_x; + src1->y = y1; + src1->base = src1_base; + src2->x = x2 + start_x; + src2->y = y2; + src2->base = src2_base; + cmd_y->w = FFMIN(nPbW - start_x, 8); + cmd_y->h = bh; + cmd_y->mymx21 = my2_mx2_my_mx; + cmd_y->wo1 = wo1; + cmd_y->wo2 = wo2; + cmd_y->dst_addr = dst + (start_x << xshl); + yp->last_l0 = &cmd_y->next_src1; + yp->last_l1 = &cmd_y->next_src2; + yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } +} + +// h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const MvXY mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ + const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); + const int hshift = 1; // = s->ps.sps->hshift[1]; + const int vshift = 1; // = s->ps.sps->vshift[1]; + + const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; + const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; + const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); + const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; + const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; + const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); + const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); + qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; + HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; + const unsigned int bh = nPbH_c; + const uint32_t qfn = lx == 0 ? 
s->qpu.c_pxx : s->qpu.c_pxx_l1; + + for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { + HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); + qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; + qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; + qpu_mc_src_t * const last_lx = *plast_lx; + const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + + last_lx->x = x1_c + start_x; + last_lx->y = y1_c; + last_lx->base = src_base_u; + cmd_c->h = bh; + cmd_c->w = bw; + cmd_c->coeffs_x = x_coeffs; + cmd_c->coeffs_y = y_coeffs; + cmd_c->wo_u = wo_u; + cmd_c->wo_v = wo_v; + cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); + *plast_lx = &cmd_c->next_src; + cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } + return; +} + +// h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const struct HEVCRpiMvField * const mv_field, + const int16_t * const c_weights, + const int16_t * const c_offsets, + const int16_t * const c_weights2, + const int16_t * const c_offsets2, + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ + const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); + const int hshift = 1; // s->ps.sps->hshift[1]; + const int vshift = 1; // s->ps.sps->vshift[1]; + const MvXY mv = mv_field->xy[0]; + const MvXY mv2 = mv_field->xy[1]; + + const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift); + const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); + const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; + const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector + const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; + const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; + + const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); + const 
unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); + const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; + const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + + const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; + const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; + + const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); + const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); + + const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; + const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); + const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); + HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; + const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; + const unsigned int bh = nPbH_c; + + for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { + const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + + HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); + qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; + qpu_mc_src_t * const src_l0 = cp->last_l0; + qpu_mc_src_t * const src_l1 = cp->last_l1; + + src_l0->x = x1_c + start_x; + src_l0->y = y1_c; + src_l0->base = src1_base; + src_l1->x = x2_c + start_x; + src_l1->y = y2_c; + src_l1->base = src2_base; + + u[0].h = bh; + u[0].w = bw; + u[0].coeffs_x1 = coefs0_x; + u[0].coeffs_y1 = coefs0_y; + u[0].weight_u1 = c_weights[0]; // Weight L0 U + u[0].weight_v1 = c_weights[1]; // Weight L0 V + u[0].coeffs_x2 = coefs1_x; + u[0].coeffs_y2 = coefs1_y; + u[0].wo_u2 = wo_u2; + u[0].wo_v2 = wo_v2; + u[0].dst_addr_c = dst_base_u + (start_x << xshl); + + cp->last_l0 = &u[0].next_src1; + cp->last_l1 = &u[0].next_src2; + cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } +} + + +static inline void +col_stash(const HEVCRpiContext * const s, + const unsigned int x0, const unsigned int y0, const 
unsigned int w0, const unsigned int h0, + const HEVCRpiMvField * const mvf) +{ + ColMvField * const col_mvf = s->ref->col_mvf; + const unsigned int x = (x0 + 15) >> 4; + const unsigned int y = (y0 + 15) >> 4; + const unsigned int w = ((x0 + 15 + w0) >> 4) - x; + const unsigned int h = ((y0 + 15 + h0) >> 4) - y; + + if (col_mvf != NULL && w != 0 && h != 0) + { + // Only record MV from the top left of the 16x16 block + + const RefPicList * const rpl = s->refPicList; + const ColMvField cmv = { + .L = { + { + .poc = (mvf->pred_flag & PF_L0) == 0 ? + COL_POC_INTRA : + COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), + .xy = mvf->xy[0] + }, + { + .poc = (mvf->pred_flag & PF_L1) == 0 ? + COL_POC_INTRA : + COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), + .xy = mvf->xy[1] + } + } + }; + + ColMvField * p = col_mvf + y * s->col_mvf_stride + x; + const unsigned int stride = s->col_mvf_stride - w; + unsigned int j = h; + + do + { + unsigned int k = w; + do + { + *p++ = cmv; + } while (--k != 0); + p += stride; + } while (--j != 0); + } +} + +static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int nPbW, const unsigned int nPbH, + const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) +{ + HEVCRpiJob * const jb = lc->jb0; + + struct HEVCRpiMvField current_mv = {{0}}; + const RefPicList *const refPicList = s->refPicList; + const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; + + if (lc->cu.pred_mode != MODE_SKIP) + lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); + + if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { + const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : + ff_hevc_rpi_merge_idx_decode(s, lc); + + ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, + partIdx, merge_idx, ¤t_mv); + } else { + hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, ¤t_mv); + } + + { + HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); + unsigned int i, j; + + for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) + { + for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) + p[i] = current_mv; + p += MVF_STASH_WIDTH_PU; + } + } + + col_stash(s, x0, y0, nPbW, nPbH, ¤t_mv); + + if (current_mv.pred_flag & PF_L0) { + ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; + if (!ref0) + return; + hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); + } + if (current_mv.pred_flag & PF_L1) { + ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; + if (!ref1) + return; + hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); + } + + if (current_mv.pred_flag == PF_L0) { + const int x0_c = x0 >> ctx_hshift(s, 1); + const int y0_c = y0 >> ctx_vshift(s, 1); + const int nPbW_c = nPbW >> ctx_hshift(s, 1); + const int nPbH_c = nPbH >> ctx_vshift(s, 1); + + rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], + s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + + if (ctx_cfmt(s) != 0) { + rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; + } + } else if (current_mv.pred_flag == PF_L1) { + const int x0_c = x0 >> ctx_hshift(s, 1); + const int y0_c = y0 >> ctx_vshift(s, 1); + const int nPbW_c = nPbW >> ctx_hshift(s, 1); + const int nPbH_c = nPbH >> ctx_vshift(s, 1); + + rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], + s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + + if (ctx_cfmt(s) != 0) { + rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1], + 
s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; + } + } else if (current_mv.pred_flag == PF_BI) { + const int x0_c = x0 >> ctx_hshift(s, 1); + const int y0_c = y0 >> ctx_vshift(s, 1); + const int nPbW_c = nPbW >> ctx_hshift(s, 1); + const int nPbH_c = nPbH >> ctx_vshift(s, 1); + + rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); + + if (ctx_cfmt(s) != 0) { + rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, + ¤t_mv, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], + s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], + s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref0->frame, + ref1->frame); + return; + } + } +} + +static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_cb_size, + const unsigned int ipm) +{ + const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; + const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; + + { + const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); + set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); + } + + // If IRAP then everything is Intra & we avoid ever looking at these + // stashes so don't bother setting them + if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) + { + if (s->is_intra != NULL) + { + set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); + } + + { + HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); + const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 + unsigned int n = size_in_pus; + + do + { + memset(p, 0, size_in_pus * sizeof(*p)); + p += MVF_STASH_WIDTH_PU; + } while (--n != 0); + } + + + if (s->ref->col_mvf != NULL && ((x0 | y0) & 
0xf) == 0) + { + // Only record top left stuff + // Blocks should always be aligned on size boundaries + // so cannot have overflow from a small block + + ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); + const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); + const unsigned int stride = s->col_mvf_stride - size_in_col; + unsigned int j = size_in_col; + + do + { + unsigned int k = size_in_col; + do + { + p->L[0].poc = COL_POC_INTRA; + p->L[0].xy = 0; + p->L[1].poc = COL_POC_INTRA; + p->L[1].xy = 0; + ++p; + } while (--k != 0); + p += stride; + } while (--j != 0); + } + } +} + +// Stash INTRA_DC as the intra pred mode for this block (delegates to set_ipm) +static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_cb_size) +{ + set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); +} + + +/** + * 8.4.1 + */ +static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + int x0, int y0, int log2_pu_size, + int prev_intra_luma_pred_flag, + const unsigned int idx) +{ + const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); + const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; + const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; + + // Up does not cross boundaries so as we always scan 1 slice-tile-line in an + // lc we can just keep 1 CTB lR stashes + // Left is reset to DC @ Start of Line/Tile/Slice in fill_job + const unsigned int cand_up = yb_pu == 0 ?
INTRA_DC : lc->ipm_up[xb_pu]; + const unsigned int cand_left = lc->ipm_left[yb_pu]; + + unsigned int intra_pred_mode; + unsigned int a, b, c; + + if (cand_left == cand_up) { + if (cand_left < 2) { + a = INTRA_PLANAR; + b = INTRA_DC; + c = INTRA_ANGULAR_26; + } else { + a = cand_left; + b = 2 + ((cand_left - 2 - 1 + 32) & 31); + c = 2 + ((cand_left - 2 + 1) & 31); + } + } else { + a = cand_left; + b = cand_up; + c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? + INTRA_PLANAR : + (cand_left != INTRA_DC && cand_up != INTRA_DC) ? + INTRA_DC : + INTRA_ANGULAR_26; + } + + if (prev_intra_luma_pred_flag) { + intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c; + } else { + // Sort lowest 1st + if (a > b) + FFSWAP(int, a, b); + if (a > c) + FFSWAP(int, a, c); + if (b > c) + FFSWAP(int, b, c); + + intra_pred_mode = idx; + if (intra_pred_mode >= a) + intra_pred_mode++; + if (intra_pred_mode >= b) + intra_pred_mode++; + if (intra_pred_mode >= c) + intra_pred_mode++; + } + + /* write the intra prediction units into the mv array */ + set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); + return intra_pred_mode; +} + +static const uint8_t tab_mode_idx[] = { + 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, + 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; + +static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_cb_size) +{ + static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; + uint8_t prev_intra_luma_pred_flag[4]; + int split = lc->cu.part_mode == PART_NxN; + const unsigned int split_size = (1 << (log2_cb_size - 1)); + int chroma_mode; + const unsigned int n = split ? 
4 : 1; + unsigned int i; + + for (i = 0; i != n; i++) + prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); + + for (i = 0; i < n; i++) { + // depending on mode idx is mpm or luma_pred_mode + const unsigned int idx = prev_intra_luma_pred_flag[i] ? + ff_hevc_rpi_mpm_idx_decode(lc) : + ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); + + lc->pu.intra_pred_mode[i] = + luma_intra_pred_mode(s, lc, + x0 + ((i & 1) == 0 ? 0 : split_size), + y0 + ((i & 2) == 0 ? 0 : split_size), + log2_cb_size - split, + prev_intra_luma_pred_flag[i], idx); + } + + if (ctx_cfmt(s) == 3) { + for (i = 0; i < n; i++) { + lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); + if (chroma_mode != 4) { + if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) + lc->pu.intra_pred_mode_c[i] = 34; + else + lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; + } else { + lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; + } + } + } else if (ctx_cfmt(s) == 2) { + int mode_idx; + lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); + if (chroma_mode != 4) { + if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) + mode_idx = 34; + else + mode_idx = intra_chroma_table[chroma_mode]; + } else { + mode_idx = lc->pu.intra_pred_mode[0]; + } + lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; + } else if (ctx_cfmt(s) != 0) { + chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); + if (chroma_mode != 4) { + if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) + lc->pu.intra_pred_mode_c[0] = 34; + else + lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; + } else { + lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; + } + } +} + +static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size) +{ + const unsigned int cb_size = 1 
<< log2_cb_size; + const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; + const unsigned int min_cb_width = s->ps.sps->min_cb_width; + const unsigned int x_cb = x0 >> log2_min_cb_size; + const unsigned int y_cb = y0 >> log2_min_cb_size; + const unsigned int idx = log2_cb_size - 2; + const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; + int skip_flag = 0; + + lc->cu.x = x0; + lc->cu.y = y0; + lc->cu.x_split = x0; + lc->cu.y_split = y0; + + lc->cu.pred_mode = MODE_INTRA; + lc->cu.part_mode = PART_2Nx2N; + lc->cu.intra_split_flag = 0; + lc->cu.cu_transquant_bypass_flag = 0; + lc->pu.intra_pred_mode[0] = 1; + lc->pu.intra_pred_mode[1] = 1; + lc->pu.intra_pred_mode[2] = 1; + lc->pu.intra_pred_mode[3] = 1; + + if (s->ps.pps->transquant_bypass_enable_flag) { + lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); + if (lc->cu.cu_transquant_bypass_flag) + set_deblocking_bypass(s, x0, y0, log2_cb_size); + } + + if (s->sh.slice_type != HEVC_SLICE_I) { + lc->cu.pred_mode = MODE_INTER; + skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); + } + + if (skip_flag) { + lc->cu.pred_mode = MODE_SKIP; + + hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); + intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); + + if (!s->sh.disable_deblocking_filter_flag) + ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); + } else { + int pcm_flag = 0; + + if (s->sh.slice_type != HEVC_SLICE_I) + lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); + if (lc->cu.pred_mode != MODE_INTRA || + log2_cb_size == s->ps.sps->log2_min_cb_size) { + lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); + lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && + lc->cu.pred_mode == MODE_INTRA; + } + + if (lc->cu.pred_mode == MODE_INTRA) { + if (lc->cu.part_mode == PART_2Nx2N && + log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if 
not enabled + log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && + ff_hevc_rpi_pcm_flag_decode(lc) != 0) + { + int ret; + pcm_flag = 1; + intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); + if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) + return ret; + + if (s->ps.sps->pcm.loop_filter_disable_flag) + set_deblocking_bypass(s, x0, y0, log2_cb_size); + } else { + intra_prediction_unit(s, lc, x0, y0, log2_cb_size); + } + } else { + intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); + switch (lc->cu.part_mode) { + case PART_2Nx2N: + hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); + break; + case PART_2NxN: + hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); + lc->cu.y_split = y0 + cb_size / 2; + hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); + break; + case PART_Nx2N: + hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); + lc->cu.x_split = x0 + cb_size / 2; + hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); + break; + case PART_2NxnU: + hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); + lc->cu.y_split = y0 + cb_size / 4; + hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); + break; + case PART_2NxnD: + hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); + lc->cu.y_split = y0 + cb_size / 4 * 3; + hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); + break; + case PART_nLx2N: + hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); + lc->cu.x_split = x0 + cb_size / 4; + hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); + break; + case PART_nRx2N: + hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, 
log2_cb_size, 0, idx - 2); + lc->cu.x_split = x0 + cb_size / 4 * 3; + hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); + break; + case PART_NxN: + hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); + lc->cu.x_split = x0 + cb_size / 2; + hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); + lc->cu.y_split = y0 + cb_size / 2; + hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); + hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); + break; + } + } + + if (!pcm_flag) { + int rqt_root_cbf = 1; + + if (lc->cu.pred_mode != MODE_INTRA && + !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { + rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); + } + if (rqt_root_cbf) { + const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); + int ret; + + lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? 
+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : + s->ps.sps->max_transform_hierarchy_depth_inter; + // transform_tree does deblock_boundary_strengths + ret = hls_transform_tree(s, lc, x0, y0, + log2_cb_size, 0, 0, cbf_c); + if (ret < 0) + return ret; + } else { + if (!s->sh.disable_deblocking_filter_flag) + ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); + } + } + } + + // If the delta is still wanted then we haven't read the delta & therefore need to set qp here + if (lc->tu.is_cu_qp_delta_wanted) + ff_hevc_rpi_set_qPy(s, lc, x0, y0); + + if(((x0 + (1<qPy_pred = lc->qp_y; + } + + set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); + + set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); + + return 0; +} + +// Returns: +// < 0 Error +// 0 More data wanted +// 1 EoSlice / EoPicture +static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, + const int log2_cb_size, const unsigned int cb_depth) +{ + const int cb_size = 1 << log2_cb_size; + int ret; + int split_cu; + + lc->ct_depth = cb_depth; + split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); + if (x0 + cb_size <= s->ps.sps->width && + y0 + cb_size <= s->ps.sps->height && + split_cu) + { + split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); + } + + // Qp delta (and offset) need to remain wanted if cb_size < min until + // a coded block is found so we still initial state at depth 0 (outside + // this fn) and only reset here + if (s->ps.pps->cu_qp_delta_enabled_flag && + log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) + { + lc->tu.is_cu_qp_delta_wanted = 1; + lc->tu.cu_qp_delta = 0; + } + if (s->sh.cu_chroma_qp_offset_enabled_flag && + log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) + { + 
lc->tu.cu_chroma_qp_offset_wanted = 1; + } + + lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; + lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; + lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; + + if (split_cu) { + int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; + const int cb_size_split = cb_size >> 1; + const int x1 = x0 + cb_size_split; + const int y1 = y0 + cb_size_split; + + int more_data = 0; + + more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); + if (more_data < 0) + return more_data; + + if (more_data && x1 < s->ps.sps->width) { + more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); + if (more_data < 0) + return more_data; + } + if (more_data && y1 < s->ps.sps->height) { + more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); + if (more_data < 0) + return more_data; + } + if (more_data && x1 < s->ps.sps->width && + y1 < s->ps.sps->height) { + more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); + if (more_data < 0) + return more_data; + } + + if(((x0 + (1<qPy_pred = lc->qp_y; + + if (more_data) + return ((x1 + cb_size_split) < s->ps.sps->width || + (y1 + cb_size_split) < s->ps.sps->height); + else + return 0; + } else { + ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); + if (ret < 0) + return ret; + if ((!((x0 + cb_size) % + (1 << (s->ps.sps->log2_ctb_size))) || + (x0 + cb_size >= s->ps.sps->width)) && + (!((y0 + cb_size) % + (1 << (s->ps.sps->log2_ctb_size))) || + (y0 + cb_size >= s->ps.sps->height))) { + int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); + return !end_of_slice_flag; + } else { + return 1; + } + } + + return 0; // NEVER +} + +static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x_ctb, const int y_ctb, const int ctb_addr_ts) +{ + const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; 
+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice + const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; + const unsigned int line_w = s->ps.sps->ctb_width; + + s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; + + lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); + lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); + + lc->boundary_flags = 0; + + if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) + lc->boundary_flags |= BOUNDARY_LEFT_TILE; + if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) + lc->boundary_flags |= BOUNDARY_LEFT_SLICE; + if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) + lc->boundary_flags |= BOUNDARY_UPPER_TILE; + if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) + lc->boundary_flags |= BOUNDARY_UPPER_SLICE; + + // Use line width rather than tile width for addr_in_slice test as + // addr_in_slice is in raster units + + lc->ctb_avail = + ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | + ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | + ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && + (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | + ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && + (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); + // Down-left never avail at CTB level +} + + +static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, + (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); + + // Signal + if (y > 0) { + // Cast away const as progress is held in s, but this really shouldn't confuse anything + ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); + } + + // Job done now + // ? Move outside this fn + job_free(s->jbc, jb); +} + +// I-pred, transform_and_add for all blocks types done here +// All ARM +static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + unsigned int i; + HEVCRpiIntraPredEnv * const iap = &jb->intra; + const HEVCPredCmd *cmd = iap->cmds; + +#if !RPI_WORKER_WAIT_PASS_0 + rpi_sem_wait(&jb->sem); + rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 +#endif + + for (i = iap->n; i > 0; i--, cmd++) + { + switch (cmd->type) + { + case RPI_PRED_INTRA: + s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); + break; + case RPI_PRED_INTRA_C: + s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); + break; + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; + case RPI_PRED_ADD_DC: + s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); + break; + case RPI_PRED_ADD_RESIDUAL_U: + s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: + s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_C: + s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); + break; + case RPI_PRED_ADD_DC_U: + case 
RPI_PRED_ADD_DC_V: + s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); + break; + + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); + break; + + default: + av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); + abort(); + } + } + + // Mark done + iap->n = 0; +} + + +// Set initial uniform job values & zero ctu_count +static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) +{ + unsigned int i; + HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; + HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + const HEVCRpiSPS * const sps = s->ps.sps; + + const uint16_t pic_width_y = sps->width; + const uint16_t pic_height_y = sps->height; + + const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); + const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); + + // We expect the pointer to change if we use another sps + if (sps != jb->sps) + { + worker_pic_free_one(jb); + + set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); + set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); + + { + const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; + const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); + worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); + } + + jb->sps = sps; + } + + jb->waited = 0; + jb->ctu_ts_first = ctu_ts_first; + jb->ctu_ts_last = -1; + + rpi_inter_pred_reset(cipe); + for (i = 0; i < cipe->n; i++) { + HEVCRpiInterPredQ * const cp = cipe->q + i; + qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + + u->next_src1.x = 0; + u->next_src1.y = 0; + u->next_src1.base = 0; + u->pic_cw = pic_width_c; + u->pic_ch = pic_height_c; + u->stride2 = av_rpi_sand_frame_stride2(s->frame); + u->stride1 = av_rpi_sand_frame_stride1(s->frame); + cp->last_l0 = &u->next_src1; + + u->next_fn = 
0; + u->next_src2.x = 0; + u->next_src2.y = 0; + u->next_src2.base = 0; + cp->last_l1 = &u->next_src2; + + cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } + + rpi_inter_pred_reset(yipe); + for (i = 0; i < yipe->n; i++) { + HEVCRpiInterPredQ * const yp = yipe->q + i; + qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; + + y->next_src1.x = 0; + y->next_src1.y = 0; + y->next_src1.base = 0; + y->next_src2.x = 0; + y->next_src2.y = 0; + y->next_src2.base = 0; + y->pic_h = pic_height_y; + y->pic_w = pic_width_y; + y->stride2 = av_rpi_sand_frame_stride2(s->frame); + y->stride1 = av_rpi_sand_frame_stride1(s->frame); + y->next_fn = 0; + yp->last_l0 = &y->next_src1; + yp->last_l1 = &y->next_src2; + + yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } + + jb->last_y8_p = NULL; + jb->last_y8_l1 = NULL; + + for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { + jb->progress_req[i] = -1; + } + + worker_pic_reset(&jb->coeffs); +} + + +#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, + const vpu_qpu_job_h vqj, + rpi_cache_flush_env_t * const rfe, + HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; + uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; + unsigned int max_block = 0; + + if (!ipe->used) { + return 0; + } + + if (ipe->curr != 0) { + rpi_inter_pred_sync(ipe); + } + + // Add final commands to Q + for(i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const yp = ipe->q + i; + qpu_mc_src_t *const p0 = yp->last_l0; + qpu_mc_src_t *const p1 = yp->last_l1; + const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; + + if (block_size > max_block) + max_block = block_size; + + qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); + + // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched + p0->x = MC_DUMMY_X; + p0->y = MC_DUMMY_Y; + p0->base = s->qpu_dummy_frame_qpu; + p1->x = MC_DUMMY_X; + p1->y = MC_DUMMY_Y; + p1->base = s->qpu_dummy_frame_qpu; + + 
yp->last_l0 = NULL; + yp->last_l1 = NULL; + + // Add to mailbox list + mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); + mail[i][1] = yp->code_setup; + } + + // We don't need invalidate here as the uniforms aren't changed by the QPU + // and leaving them in ARM cache avoids (pointless) pre-reads when writing + // new values which seems to give us a small performance advantage + // + // In most cases we will not have a completely packed set of uniforms and as + // we have a 2d invalidate we writeback all uniform Qs to the depth of the + // fullest + rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, + (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, + ipe->n, ipe->max_fill + ipe->min_gap); + vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); + + return 1; +} +#endif + +#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, + const vpu_qpu_job_h vqj, + rpi_cache_flush_env_t * const rfe, + HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; + if (!ipe->used) { + return 0; + } + + if (ipe->curr != 0) { + rpi_inter_pred_sync(ipe); + } + + // Add final commands to Q + for(i = 0; i != ipe->n; ++i) { + HEVCRpiInterPredQ * const yp = ipe->q + i; + qpu_mc_src_t *const p0 = yp->last_l0; + qpu_mc_src_t *const p1 = yp->last_l1; + + yp->qpu_mc_curr->data[-1] = yp->code_exit; + + // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched + p0->x = MC_DUMMY_X; + p0->y = MC_DUMMY_Y; + p0->base = s->qpu_dummy_frame_emu; + p1->x = MC_DUMMY_X; + p1->y = MC_DUMMY_Y; + p1->base = s->qpu_dummy_frame_emu; + + yp->last_l0 = NULL; + yp->last_l1 = NULL; + } + + return 1; +} +#endif + + +#if RPI_QPU_EMU_Y +#define mc_terminate_add_y mc_terminate_add_emu +#else +#define mc_terminate_add_y mc_terminate_add_qpu +#endif +#if RPI_QPU_EMU_C +#define mc_terminate_add_c mc_terminate_add_emu +#else +#define mc_terminate_add_c mc_terminate_add_qpu +#endif 
+ + +static void flush_frame(HEVCRpiContext *s,AVFrame *frame) +{ + rpi_cache_buf_t cbuf; + rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); + rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); + rpi_cache_flush_finish(rfe); +} + +static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; + const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; + const unsigned int ctb_width = s->ps.sps->ctb_width; + RpiBlk *const bounds = &jb->bounds; + av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); + bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; + bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; + bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; + bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; + + bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); + bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); +} + +#if RPI_PASSES == 2 +static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s, jb); + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s, jb); +} +#endif + +// Core execution tasks +static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) +{ + int pred_y, pred_c; + vpu_qpu_job_env_t qvbuf; + const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); +#if RPI_WORKER_WAIT_PASS_0 + int do_wait; +#endif + + { + const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; + if (cf->s[3].n + cf->s[2].n != 0) + { + const unsigned int csize = sizeof(cf->s[3].buf[0]); + const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; + unsigned int n16 = (cf->s[2].n >> 8); + unsigned int n32 = (cf->s[3].n >> 10); +#if RPI_COMPRESS_COEFFS + if (cf->s[2].packed) { + n16 = n16 | (n16<<16); + } else { + const 
unsigned int npack16 = (cf->s[2].packed_n>>8); + n16 = n16 | (npack16<<16); + } + if (cf->s[3].packed) { + n32 = n32 | (n32<<16); + } else { + const unsigned int npack32 = (cf->s[3].packed_n>>10); + n32 = n32 | (npack32<<16); + } +#endif + vpu_qpu_job_add_vpu(vqj, + vpu_get_fn(s->ps.sps->bit_depth), + vpu_get_constants(), + cf->gptr.vc, + n16, + cf->gptr.vc + offset32, + n32, + 0); + + rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); + rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } + } + + pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); + +// We could take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity + + pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); + + // Returns 0 if nothing to do, 1 if sync added +#if RPI_WORKER_WAIT_PASS_0 + do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem); +#else + if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0) + sem_post(&jb->sem); +#endif + + rpi_cache_flush_execute(jb->rfe); + + // Await progress as required + // jb->waited will only be clear if we have already tested the progress values + // (in worker_submit_job) and found we don't have to wait + if (jb->waited) + { + unsigned int i; + for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { + if (jb->progress_req[i] >= 0) { + ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); + } + } + } + + vpu_qpu_job_finish(vqj); + + // We always work on a rectangular block + if (pred_y || pred_c) + { + rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, + jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, + ctx_vshift(s, 1), pred_y, pred_c); + } + + // If we have emulated VPU ops - do it here +#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C + if (av_rpi_is_sand8_frame(s->frame)) + { +#if 
RPI_QPU_EMU_Y && RPI_QPU_EMU_C + ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); +#elif RPI_QPU_EMU_Y + ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL); +#else + ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip); +#endif + } + else + { +#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C + ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); +#elif RPI_QPU_EMU_Y + ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL); +#else + ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip); +#endif + } +#endif + +#if RPI_WORKER_WAIT_PASS_0 + if (do_wait) + rpi_sem_wait(&jb->sem); + rpi_cache_flush_execute(jb->rfe); +#endif +} + + +static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) +{ + av_freep(&ipe->q); + gpu_free(&ipe->gptr); +} + +static HEVCRpiJob * job_new(void) +{ + HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); + + if (jb == NULL) + return NULL; + + sem_init(&jb->sem, 0, 0); + jb->rfe = rpi_cache_flush_init(&jb->flush_buf); + ff_hevc_rpi_progress_init_wait(&jb->progress_wait); + + jb->intra.n = 0; + if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL) + goto fail1; + + // * Sizeof the union structure might be overkill but at the moment it + // is correct (it certainly isn't going to be too small) + // Set max fill to slack/2 from the end of the Q + // If we exceed this in any Q then we will schedule by size (which should + // mean that we never use that Q again part from syncs) + // * Given how agressive the overflow resonse is we could maybe put the + // threshold even nearer the end, but I don't expect us to ever hit + // it on any real stream anyway. 
+ + if (rpi_inter_pred_alloc(&jb->chroma_ip, + QPU_N_MAX, QPU_N_GRP, + QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), + QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) + goto fail2; + if (rpi_inter_pred_alloc(&jb->luma_ip, + QPU_N_MAX, QPU_N_GRP, + QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), + QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) + goto fail3; + + return jb; + +fail3: + rpi_free_inter_pred(&jb->luma_ip); +fail2: + av_freep(&jb->intra.cmds); +fail1: + ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); + rpi_cache_flush_finish(jb->rfe); + sem_destroy(&jb->sem); + return NULL; +} + +static void job_delete(HEVCRpiJob * const jb) +{ + worker_pic_free_one(jb); + ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); + rpi_free_inter_pred(&jb->chroma_ip); + rpi_free_inter_pred(&jb->luma_ip); + av_freep(&jb->intra.cmds); + rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing + sem_destroy(&jb->sem); + av_free(jb); +} + +static void jbg_delete(HEVCRpiJobGlobal * const jbg) +{ + HEVCRpiJob * jb; + + if (jbg == NULL) + return; + + jb = jbg->free1; + while (jb != NULL) + { + HEVCRpiJob * const jb2 = jb; + jb = jb2->next; + job_delete(jb2); + } + + pthread_mutex_destroy(&jbg->lock); + av_free(jbg); +} + +static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) +{ + HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); + if (jbg == NULL) + return NULL; + + pthread_mutex_init(&jbg->lock, NULL); + + while (job_count-- != 0) + { + HEVCRpiJob * const jb = job_new(); + if (jb == NULL) + goto fail; + + jb->next = jbg->free1; + jbg->free1 = jb; + } + + return jbg; + +fail: + jbg_delete(jbg); + return NULL; +} + +static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) +{ + HEVCRpiJobGlobal * jbg; + + if (jbc == NULL) + return; + + jbg = jbc->jbg; + + if (jbc->jb1 != NULL) + job_delete(jbc->jb1); + + pthread_mutex_destroy(&jbc->in_lock); + 
sem_destroy(&jbc->sem_out); + av_free(jbc); + + // Deref the global job context + if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) + jbg_delete(jbg); +} + +static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) +{ + HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); + + if (jbc == NULL) + return NULL; + + jbc->jbg = jbg; + atomic_fetch_add(&jbg->ref_count, 1); + + sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); + pthread_mutex_init(&jbc->in_lock, NULL); + + if ((jbc->jb1 = job_new()) == NULL) + goto fail; + jbc->jb1->jbc_local = jbc; + + return jbc; + +fail: + rpi_job_ctl_delete(jbc); + return NULL; +} + + + +static av_cold void hevc_init_worker(HEVCRpiContext * const s) +{ +#if RPI_PASSES == 2 + pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); +#elif RPI_PASSES == 3 + pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); + pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); +#else +#error Passes confused +#endif + pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); + + pass_queues_start_all(s); +} + +static av_cold void hevc_exit_worker(HEVCRpiContext *s) +{ + pass_queues_term_all(s); + + pass_queues_kill_all(s); + + rpi_job_ctl_delete(s->jbc); + s->jbc = NULL; +} + + +static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) +{ + const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; + const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; + const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts]; + + // Check for obvious disasters + if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) { + av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); + return AVERROR_INVALIDDATA; + } + + // If dependant then ctb_addr_ts != 0 from previous check + if (s->sh.dependent_slice_segment_flag) { + int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; + if 
(s->tab_slice_address[prev_rs] != s->sh.slice_addr) { + av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); + return AVERROR_INVALIDDATA; + } + } + + if (!s->ps.pps->entropy_coding_sync_enabled_flag && + tile_id + s->sh.num_entry_point_offsets >= tiles) + { + av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); + return AVERROR_INVALIDDATA; + } + + // Tiled stuff must start at start of tile if it has multiple entry points + if (!s->ps.pps->entropy_coding_sync_enabled_flag && + s->sh.num_entry_point_offsets != 0 && + ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) + { + av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); + return AVERROR_INVALIDDATA; + } + + ff_hevc_rpi_cabac_init_decoder(lc); + + // Setup any required decode vars + lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; + +// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); + lc->qp_y = s->sh.slice_qp; + + // General setup + lc->bt_line_no = 0; + lc->ts = ctb_addr_ts; + return 0; +} + +static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) +{ + const GetBitContext * const gb = &s->HEVClc->gb; + RpiSliceHeader * const sh = &s->sh; + int i, j; + + const unsigned int length = nal->size; + unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte + unsigned int cmpt; + unsigned int startheader; + + if (sh->num_entry_point_offsets == 0) { + s->data = NULL; + return 0; + } + + // offset in slice header includes emulation prevention bytes. + // Unfortunately those have been removed by the time we get here so we + // have to compensate. The nal layer keeps a track of where they were. 
+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { + if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { + startheader--; + cmpt++; + } + } + + for (i = 1; i < sh->num_entry_point_offsets; i++) { + offset += (sh->entry_point_offset[i - 1] - cmpt); + for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { + if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { + startheader--; + cmpt++; + } + } + if (sh->entry_point_offset[i] <= cmpt) { + av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); + return AVERROR_INVALIDDATA; + } + sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; + sh->offset[i - 1] = offset; + } + + offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; + if (length < offset) { + av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); + return AVERROR_INVALIDDATA; + } + sh->size[sh->num_entry_point_offsets - 1] = length - offset; + sh->offset[sh->num_entry_point_offsets - 1] = offset; + + // Remember data start pointer as we won't have nal later + s->data = nal->data; + return 0; +} + + +// Return +// < 0 Error +// 0 OK +// +// jb->ctu_ts_last < 0 Job still filling +// jb->ctu_ts_last >= 0 Job ready + +static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) +{ + const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; + const unsigned int ctb_size = (1 << log2_ctb_size); + HEVCRpiJob * const jb = lc->jb0; + int more_data = 1; + unsigned int ctb_addr_ts = lc->ts; + unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; + const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size; + + lc->unit_done = 0; + + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) + { + int 
q_full; + const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; + + hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); + + ff_hevc_rpi_cabac_init(s, lc, ctb_flags); + + hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); + + s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; + s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; + s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; + + // Zap stashes if navail + if ((lc->ctb_avail & AVAIL_U) == 0) + zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); + if ((lc->ctb_avail & AVAIL_L) == 0) + { + memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); + zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); + } +#if MVF_STASH_WIDTH > 64 + // Restore left mvf stash at start of tile if not at start of line + if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) + { + unsigned int i; + HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); + const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); + for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) + { + *dst = *src++; + dst += MVF_STASH_WIDTH_PU; + } + } +#endif + + // Set initial tu states + lc->tu.cu_qp_delta = 0; + lc->tu.is_cu_qp_delta_wanted = 0; + lc->tu.cu_chroma_qp_offset_wanted = 0; + + // Decode + more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); + + if (ff_hevc_rpi_cabac_overflow(lc)) + { + av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); + more_data = AVERROR_INVALIDDATA; + } + + if (more_data < 0) { + s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken + return more_data; + } + + if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || + (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) + { + if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 || + ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL) + { + 
av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); + return -1; + } + } + + // --- Post CTB processing + + // Stash rpl top/left for deblock that needs to remember such things cross-slice + s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; + s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; + + if (!s->is_irap) + { + // Copy MVF up to up-left & stash to up + { + const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); + HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); + + // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); + + lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; + memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); + } + // Stash sideways if end of tile line but not end of line (no point) + // ** Could/should do this @ end of fn +#if MVF_STASH_WIDTH > 64 + if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) +#endif + { + unsigned int i; + const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); + HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); + for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) + { + *dst++ = *src; + src += MVF_STASH_WIDTH_PU; + } + } + } + + if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) + ff_hevc_rpi_save_states(s, lc); + + // Report progress so we can use our MVs in other frames + if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) + ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); + + // End of line || End of tile line || End of tile + // (EoL covers end of frame for our purposes here) + q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); + + // Allocate QPU chunks on fixed size 64 pel boundries rather than + // whatever ctb_size is today. 
+ // * We might quite like to continue to 64 pel vertical too but that + // currently confuses WPP + if (((x_ctb + ctb_size) & 63) == 0 || q_full) + { + int overflow = 0; + if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) + overflow = 1; + if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) + overflow = 1; + if (overflow) + { + // * This is very annoying (and slow) to cope with in WPP so + // we treat it as an error there (no known stream triggers this + // with the current buffer sizes). Non-wpp should cope fine. + av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); + q_full = 1; + } + } + + // Inc TS to next. + ctb_addr_ts++; + ctb_addr_rs++; + x_ctb += ctb_size; + + if (q_full) + { + // Do job + // Prep for submission + jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced + job_gen_bounds(s, jb); + break; + } + + // If max_blocks started as 0 then this will never be true + if (--max_blocks == 0) + break; + } + + lc->unit_done = (more_data <= 0); + lc->ts = ctb_addr_ts; + return 0; +} + +static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) +{ + lc->context = s; + lc->jb0 = NULL; + lc->lc_n = n; + lc->bt_terminate = 0; + lc->bt_psem_out = NULL; + sem_init(&lc->bt_sem_in, 0, 0); +} + +#define TRACE_WPP 0 +#if RPI_EXTRA_BIT_THREADS > 0 +static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) +{ + unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; + return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; +} + +// Move local context parameters from an aux bit thread back to the main +// thread at the end of a slice as processing is going to continue there. 
+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
+{
+    if (src_lc == dst_lc) {
+        return;
+    }
+
+    // Move the job
+    // We will still have an active job if the final line terminates early
+    // Dest should always be null by now
+    av_assert1(dst_lc->jb0 == NULL);
+    dst_lc->jb0 = src_lc->jb0;
+    src_lc->jb0 = NULL;
+
+    // Always need to store where we are in the bitstream
+    dst_lc->ts = src_lc->ts;
+    dst_lc->gb = src_lc->gb;
+    // Cabac init request will be built at start of next slice
+
+    // Need to store context if we might have a dependent seg
+    if (is_dep)
+    {
+        dst_lc->qPy_pred = src_lc->qPy_pred;
+        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
+        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
+        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
+    }
+}
+
+// Wait on this lc's input semaphore.
+// Returns the lc's terminate flag as it is once the wait completes
+// (non-zero => the thread has been asked to exit)
+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
+{
+    rpi_sem_wait(&lc->bt_sem_in);
+    return lc->bt_terminate;
+}
+
+// Do one WPP line
+// Will not work correctly over horizontal tile boundaries - vertical should be OK
+//
+// is_first is non-zero only for the call that starts a set of lines/tiles
+// (it has nothing to wait for).
+// Returns 0 on success, AVERROR_EXIT if termination was requested while
+// waiting, otherwise the (negative) decode error for this line.
+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
+{
+    const int is_tile = lc->bt_is_tile;
+    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
+    const unsigned int line = lc->bt_line_no;
+    const unsigned int line_inc = lc->bt_line_inc;
+    const int is_last = (line >= lc->bt_last_line);
+
+    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
+    const unsigned int ts_next =
+        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
+            INT_MAX :
+        is_tile ?
+            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
+            lc->ts + lc->bt_line_width * line_inc;
+    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
+    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
+    unsigned int ts_prev;
+    int loop_n = 0;
+    int err = 0;
+
+    av_assert1(line <= s->sh.num_entry_point_offsets);
+
+#if TRACE_WPP
+    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
+           lc->lc_n,  is_tile ? "Tile" : "WPP", tile_id,
+           line, lc->bt_last_line, s->sh.num_entry_point_offsets,
+           lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
+#endif
+    if (line != 0)
+    {
+        const uint8_t * const data = s->data + s->sh.offset[line - 1];
+        const unsigned int len = s->sh.size[line - 1];
+        if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
+            return err;
+
+        ff_init_cabac_decoder(&lc->cc, data, len);
+    }
+
+    // We should never be processing a dependent slice here so reset is good
+    // ?? These probably shouldn't be needed (as they should be set by later
+    //    logic) but do seem to be required
+    lc->qp_y = s->sh.slice_qp;
+
+    do
+    {
+        if (!is_last && loop_n > 1) {
+#if TRACE_WPP
+            printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
+#endif
+            sem_post(lc->bt_psem_out);
+        }
+        // The wait for loop_n == 0 has been done in bit_thread
+        if (!is_first && loop_n != 0)
+        {
+#if TRACE_WPP
+            printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
+#endif
+            if (wait_bt_sem_in(lc) != 0)
+                return AVERROR_EXIT;
+        }
+
+#if TRACE_WPP
+        {
+            int n;
+            sem_getvalue(&lc->bt_sem_in, &n);
+            printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
+        }
+#endif
+
+        ts_prev = lc->ts;
+
+        // If we have had an error - do no further decode but do continue
+        // moving signals around so the other threads continue to operate
+        // correctly (or at least as correctly as they can with this line missing)
+        //
+        // Errors in WPP/Tile are less fatal than normal as we have a good idea
+        // of how to restart on the next line so there is no need to give up totally
+        if (err != 0)
+        {
+            lc->unit_done = 0;
+            lc->ts += partial_size;
+        }
+        else
+        {
+            worker_pass0_ready(s, lc);
+
+            if ((err = fill_job(s, lc, partial_size)) < 0 ||
+                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
+            {
+                if (err == 0) {
+                    av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
+                    err = AVERROR_INVALIDDATA;
+                }
+                worker_free(s, lc);
+                lc->ts = ts_prev + partial_size;  // Pretend we did all that
+                lc->unit_done = 0;
+            }
+            else if (is_tile)
+            {
+                worker_submit_job(s, lc);
+            }
+        }
+
+        ++loop_n;
+    } while (lc->ts < ts_eol && !lc->unit_done);
+
+    // If we are on the last line & we didn't get a whole line we must wait for
+    // and sink the sem_posts from the line above / tile to the left.
+    while ((ts_prev += partial_size) < ts_eol)
+    {
+#if TRACE_WPP
+        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
+#endif
+        if (wait_bt_sem_in(lc) != 0)
+            return AVERROR_EXIT;
+    }
+
+    lc->bt_line_no += line_inc;
+
+    if (!is_tile && err == 0)
+        worker_submit_job(s, lc);
+
+    if (!is_last) {
+        lc->ts = ts_next;
+
+#if TRACE_WPP
+        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
+#endif
+        sem_post(lc->bt_psem_out);
+        if (loop_n > 1) {
+#if TRACE_WPP
+            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
+#endif
+            sem_post(lc->bt_psem_out);
+        }
+    }
+    else
+    {
+        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT
+#if MVF_STASH_WIDTH > 64
+        // Horrid calculations to work out what we want but luckily this should almost never execute
+        // **** Move to movlc
+        if (!s->is_irap)
+        {
+            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
+            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
+            {
+                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
+                unsigned int i;
+                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
+                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
+
+                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
+                {
+                    *d_mvf = *s_mvf;
+                    d_mvf += MVF_STASH_WIDTH_PU;
+                    s_mvf += MVF_STASH_WIDTH_PU;
+                }
+
+            }
+        }
+#endif
+        // When all done poke the thread 0 sem_in one final time
+#if TRACE_WPP
+        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
+#endif
+        sem_post(&s->HEVClcList[0]->bt_sem_in);
+    }
+
+#if TRACE_WPP
+    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
+#endif
+    return err;
+}
+
+// Set up the local contexts for WPP decode of the current slice:
+// lc[i] starts on line i and steps RPI_BIT_THREADS lines at a time
+static void wpp_setup_lcs(HEVCRpiContext * const s)
+{
+    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+    const unsigned int line_width = line_ts_width(s, ts);
+
+    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
+    {
+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
+        lc->ts = ts;
+        lc->bt_is_tile = 0;
+        lc->bt_line_no = i;
+        lc->bt_line_width = line_width;
+        lc->bt_last_line = s->sh.num_entry_point_offsets;
+        lc->bt_line_inc = RPI_BIT_THREADS;
+        ts += line_width;
+    }
+}
+
+
+// Can only process tile single row at once
+// Sets up the local contexts for the tiles of slice_row (rows counted from
+// the row holding the first tile of the slice)
+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
+{
+    const HEVCRpiPPS * const pps = s->ps.pps;
+    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+    const unsigned int tile0 = pps->tile_id[ts0];
+    const unsigned int col0 = tile0 % pps->num_tile_columns;
+
+    const unsigned int col = (slice_row == 0) ? col0 : 0;
+    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
+    const unsigned int last_line = FFMIN(
+        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
+
+    const unsigned int par =
+        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
+#if TRACE_WPP
+    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
+           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
+#endif
+    for (unsigned int i = 0; i != par; ++i, ++line)
+    {
+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
+        const unsigned int tile = tile0 + line;
+
+        lc->ts = pps->tile_pos_ts[tile];
+        lc->bt_line_no = line;
+        lc->bt_is_tile = 1;
+        lc->bt_line_width = line_ts_width(s, lc->ts);
+        lc->bt_last_line = last_line;
+        lc->bt_line_inc = par;
+    }
+}
+
+
+// Thread function for the aux bit threads.
+// Each post on the lc's input semaphore runs one line / tile; the thread
+// exits when the wait reports that termination has been requested.
+static void * bit_thread(void * v)
+{
+    HEVCRpiLocalContext * const lc = v;
+    HEVCRpiContext *const s = lc->context;
+
+    while (wait_bt_sem_in(lc) == 0)
+    {
+        int err;
+
+        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
+            if (lc->bt_terminate) {
+                av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
+                break;
+            }
+            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
+        }
+    }
+
+    return NULL;
+}
+
+// Allocate/init the aux local contexts, link the semaphores into a ring
+// and start the aux bit threads.  Does nothing if already started.
+//
+// Returns 0 on success or a negative AVERROR code on failure.  On failure
+// everything that was set up by this call is torn down again (threads
+// joined, semaphores destroyed) so the call may safely be retried later.
+static int bit_threads_start(HEVCRpiContext * const s)
+{
+    int n_lc = 0;       // Highest aux lc index initialised so far
+    int n_threads = 0;  // Number of aux threads successfully created
+    int err;
+
+    if (s->bt_started)
+        return 0;
+
+    for (int i = 1; i < RPI_BIT_THREADS; ++i)
+    {
+        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS-1]
+        if (s->HEVClcList[i] == NULL) {
+            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) {
+                err = AVERROR(ENOMEM);
+                goto fail;
+            }
+        }
+
+        bt_lc_init(s, s->HEVClcList[i], i);
+        job_lc_init(s->HEVClcList[i]);
+        n_lc = i;
+    }
+
+    // Link the sems in a circle
+    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
+        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
+    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
+
+    // Init all lc before starting any threads
+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
+    {
+        // pthread_create returns 0 on success and a *positive* error number
+        // on failure so testing for < 0 would never detect a failure
+        const int rv = pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]);
+        if (rv != 0) {
+            err = AVERROR(rv);
+            goto fail;
+        }
+        n_threads = i + 1;
+    }
+
+    s->bt_started = 1;
+    return 0;
+
+fail:
+    // bt_started is still 0 so bit_threads_kill would do nothing - undo
+    // our partial setup here using the same sequence that it uses
+    for (int i = 1; i <= n_lc; ++i)
+    {
+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
+
+        if (i <= n_threads) {
+            // Thread i-1 is running on this lc - ask it to exit & wait for it
+            lc->bt_terminate = 1;
+            sem_post(&lc->bt_sem_in);
+            pthread_join(s->bit_threads[i - 1], NULL);
+        }
+
+        sem_destroy(&lc->bt_sem_in);
+        job_lc_kill(lc);
+    }
+    return err;
+}
+
+// Ask all aux bit threads to exit, wait for them and release their
+// per-lc resources.  Does nothing if the threads are not running.
+static int bit_threads_kill(HEVCRpiContext * const s)
+{
+    if (!s->bt_started)
+        return 0;
+    s->bt_started = 0;
+
+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
+    {
+        HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
+        if (lc == NULL)
+            break;
+
+        lc->bt_terminate = 1;
+        sem_post(&lc->bt_sem_in);
+        pthread_join(s->bit_threads[i], NULL);
+
+        sem_destroy(&lc->bt_sem_in);
+        job_lc_kill(lc);
+    }
+    return 0;
+}
+#endif
+
+
+// If we are at EoT and the row is shorter than the number of jobs
+// we can Q we have to wait for it finish otherwise we risk cache/QPU
+// disasters
+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
+{
+    return
+        s->ps.pps->tile_wpp_inter_disable >= 2 &&
+        s->sh.slice_type != HEVC_SLICE_I &&
+        n >= 0 &&
+        (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
+}
+
+// Decode the data of the current slice (segment) using tile offload,
+// WPP offload or the single bit thread path as selected in the slice header.
+//
+// Returns the tile-scan address of the next CTU to decode (>= 0) on
+// success or a negative AVERROR code on failure.
+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+{
+    HEVCRpiContext * const s  = avctxt->priv_data;
+    HEVCRpiLocalContext * const lc = s->HEVClc;
+    int err;
+
+    // Start of slice
+    if ((err = slice_start(s, lc)) != 0)
+        return err;
+
+#if RPI_EXTRA_BIT_THREADS > 0
+
+    if (s->sh.offload_tiles)
+    {
+        unsigned int slice_row = 0;
+
+#if TRACE_WPP
+        printf("%s: Do Tiles\n", __func__);
+#endif
+        // Generate & start extra bit threads if they aren't already running
+        // Without them the aux lcs may not exist and nothing would ever post
+        // our semaphore so we must not carry on if this fails
+        // NOTE(review): the fail path calls worker_free() before any job has
+        // been claimed - assumes it copes with lc->jb0 == NULL - confirm
+        if ((err = bit_threads_start(s)) != 0)
+            goto fail;
+
+        do
+        {
+            // Reset lc lines etc.
+            tile_one_row_setup_lcs(s, slice_row);
+
+#if TRACE_WPP
+            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
+#endif
+
+            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
+#if TRACE_WPP
+            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
+#endif
+
+            while (lc->bt_line_no <= lc->bt_last_line) {
+                rpi_sem_wait(&lc->bt_sem_in);
+                rpi_run_one_line(s, lc, 0);
+            }
+#if TRACE_WPP
+            printf("%s: Done body\n", __func__);
+#endif
+
+            // Wait for everything else to finish
+            rpi_sem_wait(&lc->bt_sem_in);
+
+            ++slice_row;
+        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
+
+
+#if TRACE_WPP
+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
+#endif
+    }
+    else if (s->sh.offload_wpp)
+    {
+#if TRACE_WPP
+        printf("%s: Do WPP\n", __func__);
+#endif
+        // Generate & start extra bit threads if they aren't already running
+        // (see the tile case above for why failure must stop us here)
+        if ((err = bit_threads_start(s)) != 0)
+            goto fail;
+
+        // Reset lc lines etc.
+        wpp_setup_lcs(s);
+
+        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
+#if TRACE_WPP
+        printf("%s: Done 1st\n", __func__);
+#endif
+
+        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
+            rpi_sem_wait(&lc->bt_sem_in);
+            rpi_run_one_line(s, lc, 0);
+        }
+#if TRACE_WPP
+        printf("%s: Done body\n", __func__);
+#endif
+
+        // Wait for everything else to finish
+        rpi_sem_wait(&lc->bt_sem_in);
+
+#if TRACE_WPP
+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
+#endif
+    }
+    else
+#endif
+    {
+#if TRACE_WPP
+        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
+#endif
+        // Single bit thread
+        do {
+            // Make sure we have space to prepare the next job
+            worker_pass0_ready(s, lc);
+
+            if ((err = fill_job(s, lc, 0)) < 0)
+                goto fail;
+
+            worker_submit_job(s, lc);
+
+            if (tile_needs_wait(s, lc->ts - 1))
+                worker_wait(s, lc);
+
+        } while (!lc->unit_done);
+
+#if TRACE_WPP
+        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
+#endif
+    }
+
+    // If we have reached the end of the frame
+    // then wait for the worker to finish all its jobs
+    if (lc->ts >= s->ps.sps->ctb_size)
+        worker_wait(s, lc);
+
+#if RPI_TSTATS
+    {
+        HEVCRpiStats *const ts = &s->tstats;
+
+        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+               ts->y_pred2_hgt16, ts->y_pred2_hle16);
+        memset(ts, 0, sizeof(*ts));
+    }
+#endif
+
+    return lc->ts;
+
+fail:
+    // Cleanup
+    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
+    // Free our job & wait for termination
+    worker_free(s, lc);
+    worker_wait(s, lc);
+    return err;
+}
+
+
+static void set_no_backward_pred(HEVCRpiContext * const s)
+{
+    int i, j;
+    const RefPicList *const refPicList = s->refPicList;
+
+    s->no_backward_pred_flag = 0;
+    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
+        return;
+
+    for (j = 0; j < 2; j++) {
+        for (i = 0; i < refPicList[j].nb_refs; i++) {
+            if (refPicList[j].list[i] > s->poc) {
+                s->no_backward_pred_flag = 1;
+                return;
+            }
+        }
+    }
+}
+
+// Generate the entry point tables for the slice data in nal then decode it.
+// Returns what rpi_decode_entry returns, or the (negative) error from
+// gen_entry_points.
+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
+{
+    int err;
+    if ((err = gen_entry_points(s, nal)) < 0)
+        return err;
+
+    set_no_backward_pred(s);
+
+    return rpi_decode_entry(s->avctx, NULL);
+}
+
+// Attach side data derived from the SEI messages currently held in s->sei
+// to the frame being decoded (s->ref->frame).
+// Returns 0 on success or AVERROR(ENOMEM) if side data cannot be allocated.
+static int set_side_data(HEVCRpiContext *s)
+{
+    AVFrame *out = s->ref->frame;
+
+    if (s->sei.frame_packing.present &&
+        s->sei.frame_packing.arrangement_type >= 3 &&
+        s->sei.frame_packing.arrangement_type <= 5 &&
+        s->sei.frame_packing.content_interpretation_type > 0 &&
+        s->sei.frame_packing.content_interpretation_type < 3) {
+        AVStereo3D *stereo = av_stereo3d_create_side_data(out);
+        if (!stereo)
+            return AVERROR(ENOMEM);
+
+        switch (s->sei.frame_packing.arrangement_type) {
+        case 3:
+            if (s->sei.frame_packing.quincunx_subsampling)
+                stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
+            else
+                stereo->type = AV_STEREO3D_SIDEBYSIDE;
+            break;
+        case 4:
+            stereo->type = AV_STEREO3D_TOPBOTTOM;
+            break;
+        case 5:
+            stereo->type = AV_STEREO3D_FRAMESEQUENCE;
+            break;
+        }
+
+        if (s->sei.frame_packing.content_interpretation_type == 2)
+            stereo->flags = AV_STEREO3D_FLAG_INVERT;
+
+        if (s->sei.frame_packing.arrangement_type == 5) {
+            if (s->sei.frame_packing.current_frame_is_frame0_flag)
+                stereo->view = AV_STEREO3D_VIEW_LEFT;
+            else
+                stereo->view = AV_STEREO3D_VIEW_RIGHT;
+        }
+    }
+
+    if (s->sei.display_orientation.present &&
+        (s->sei.display_orientation.anticlockwise_rotation ||
+         s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
+        double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
+        AVFrameSideData *rotation = av_frame_new_side_data(out,
+                                                           AV_FRAME_DATA_DISPLAYMATRIX,
+                                                           sizeof(int32_t) * 9);
+        if (!rotation)
+            return AVERROR(ENOMEM);
+
+        av_display_rotation_set((int32_t *)rotation->data, angle);
+        av_display_matrix_flip((int32_t *)rotation->data,
+                               s->sei.display_orientation.hflip,
+                               s->sei.display_orientation.vflip);
+    }
+
+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
+    // so the side data persists for the entire coded video sequence.
+    if (s->sei.mastering_display.present > 0 &&
+        IS_IRAP(s) && s->no_rasl_output_flag) {
+        s->sei.mastering_display.present--;
+    }
+    if (s->sei.mastering_display.present) {
+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
+        const int mapping[3] = {2, 0, 1};
+        const int chroma_den = 50000;
+        const int luma_den = 10000;
+        int i;
+        AVMasteringDisplayMetadata *metadata =
+            av_mastering_display_metadata_create_side_data(out);
+        if (!metadata)
+            return AVERROR(ENOMEM);
+
+        for (i = 0; i < 3; i++) {
+            const int j = mapping[i];
+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
+            metadata->display_primaries[i][0].den = chroma_den;
+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
+            metadata->display_primaries[i][1].den = chroma_den;
+        }
+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
+        metadata->white_point[0].den = chroma_den;
+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
+        metadata->white_point[1].den = chroma_den;
+
+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
+        metadata->max_luminance.den = luma_den;
+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
+        metadata->min_luminance.den = luma_den;
+        metadata->has_luminance = 1;
+        metadata->has_primaries = 1;
+
+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
+               av_q2d(metadata->display_primaries[0][0]),
+               av_q2d(metadata->display_primaries[0][1]),
+               av_q2d(metadata->display_primaries[1][0]),
+               av_q2d(metadata->display_primaries[1][1]),
+               av_q2d(metadata->display_primaries[2][0]),
+               av_q2d(metadata->display_primaries[2][1]),
+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
+        av_log(s->avctx, AV_LOG_DEBUG,
+               "min_luminance=%f, max_luminance=%f\n",
+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
+    }
+    // Decrement the content light flag when IRAP frame has no_rasl_output_flag=1
+    // so the side data persists for the entire coded video sequence.
+    if (s->sei.content_light.present > 0 &&
+        IS_IRAP(s) && s->no_rasl_output_flag) {
+        s->sei.content_light.present--;
+    }
+    if (s->sei.content_light.present) {
+        AVContentLightMetadata *metadata =
+            av_content_light_metadata_create_side_data(out);
+        if (!metadata)
+            return AVERROR(ENOMEM);
+        metadata->MaxCLL  = s->sei.content_light.max_content_light_level;
+        metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
+
+        av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
+        av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
+               metadata->MaxCLL, metadata->MaxFALL);
+    }
+
+    if (s->sei.a53_caption.a53_caption) {
+        AVFrameSideData* sd = av_frame_new_side_data(out,
+                                                     AV_FRAME_DATA_A53_CC,
+                                                     s->sei.a53_caption.a53_caption_size);
+        if (sd)
+            memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
+        av_freep(&s->sei.a53_caption.a53_caption);
+        s->sei.a53_caption.a53_caption_size = 0;
+        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
+    }
+
+    if (s->sei.alternative_transfer.present &&
+        av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
+        s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
+        s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
+    }
+
+    return 0;
+}
+
+// Per-picture setup done when the first slice of a picture is seen:
+// clears the per-picture tables, makes sure rpl_tab is big enough for the
+// NALs in this packet, allocates the new reference frame, builds the frame
+// RPS, attaches side data and bumps/outputs frames.
+//
+// Returns 0 on success or a negative AVERROR code.  On failure the
+// current ref (if any) is unreffed and s->ref is set to NULL.
+static int hevc_frame_start(HEVCRpiContext * const s)
+{
+    int ret;
+
+    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
+    memset(s->is_pcm,        0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
+
+    // Only need to remember intra for CIP
+    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
+        s->is_intra = NULL;
+    else
+    {
+        s->is_intra = s->is_intra_store;
+        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
+    }
+
+    s->is_decoded        = 0;
+    s->first_nal_type    = s->nal_unit_type;
+
+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
+
+    if (s->pkt.nb_nals > s->rpl_tab_size)
+    {
+        // In most cases it will be faster to free & realloc as that doesn't
+        // require (an unwanted) copy
+        av_freep(&s->rpl_tab);
+        s->rpl_tab_size = 0;
+        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) {
+            // ret has not been assigned yet - without this the fail path
+            // would return an indeterminate value
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        s->rpl_tab_size = s->pkt.nb_nals;
+    }
+    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
+
+    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
+    if (ret < 0)
+        goto fail;
+
+    // Resize rpl_tab to max that we might want
+    ret = ff_hevc_rpi_frame_rps(s);
+    if (ret < 0) {
+        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
+        goto fail;
+    }
+
+    s->ref->frame->key_frame = IS_IRAP(s);
+
+    ret = set_side_data(s);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->pict_type = 3 - s->sh.slice_type;
+
+    if (!IS_IRAP(s))
+        ff_hevc_rpi_bump_frame(s);
+
+    av_frame_unref(s->output_frame);
+    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    ff_thread_finish_setup(s->avctx);
+
+    return 0;
+
+fail:
+    if (s->ref)
+        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
+    s->ref = NULL;
+    return ret;
+}
+
+// Non-zero if nal_unit_type is one of the even values 0..14
+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
+{
+    // From Table 7-1
+    return (nal_unit_type & ~0xe) == 0;  // True for 0, 2, 4, 6, 8, 10, 12, 14
+}
+
+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
+{
+    GetBitContext * const gb = &s->HEVClc->gb;
+    int ctb_addr_ts, ret;
+
+    *gb              = nal->gb;
+    s->nal_unit_type = nal->type;
+    s->temporal_id   = nal->temporal_id;
+
+    switch (s->nal_unit_type) {
+    case HEVC_NAL_VPS:
+        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
+        if (ret < 0)
+            goto fail;
+        break;
+    case HEVC_NAL_SPS:
+        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
+                                         s->apply_defdispwin);
+        if (ret < 0)
+            goto fail;
+        break;
+    case HEVC_NAL_PPS:
+        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
+        if (ret < 0)
+            goto fail;
+        break;
+    case HEVC_NAL_SEI_PREFIX:
+    case HEVC_NAL_SEI_SUFFIX:
+        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
+        if (ret < 0)
+            goto fail;
+        break;
+    case HEVC_NAL_TRAIL_R:
+    case HEVC_NAL_TRAIL_N:
+    case HEVC_NAL_TSA_N:
+    case HEVC_NAL_TSA_R:
+    case HEVC_NAL_STSA_N:
+    case HEVC_NAL_STSA_R:
+    case HEVC_NAL_BLA_W_LP:
+    case HEVC_NAL_BLA_W_RADL:
+    case HEVC_NAL_BLA_N_LP:
+    case HEVC_NAL_IDR_W_RADL:
+    case HEVC_NAL_IDR_N_LP:
+    case HEVC_NAL_CRA_NUT:
+    case HEVC_NAL_RADL_N:
+    case HEVC_NAL_RADL_R:
+    case HEVC_NAL_RASL_N:
+    case HEVC_NAL_RASL_R:
+        ret = hls_slice_header(s);
+        if (ret < 0)
+            return ret;
+
+        // The definition of _N unit types is "non-reference for other frames
+        // with the same temporal_id" so they may/will be ref frames for pics
+        // with a higher temporal_id.
+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || + !is_non_ref_unit_type(s->nal_unit_type); + s->offload_recon = s->threads_type != 0 && s->used_for_ref; + s->is_irap = IS_IRAP(s); + +#if DEBUG_DECODE_N + { + static int z = 0; + if (IS_IDR(s)) { + z = 1; + } + if (z != 0 && z++ > DEBUG_DECODE_N) { + s->is_decoded = 0; + break; + } + } +#endif + if ( + (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || + (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || + (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || + (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) + { + s->is_decoded = 0; + break; + } + + if (s->sh.first_slice_in_pic_flag) { + if (s->max_ra == INT_MAX) { + if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { + s->max_ra = s->poc; + } else { + if (IS_IDR(s)) + s->max_ra = INT_MIN; + } + } + + if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && + s->poc <= s->max_ra) { + s->is_decoded = 0; + break; + } else { + if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) + s->max_ra = INT_MIN; + } + + ret = hevc_frame_start(s); + if (ret < 0) + return ret; + } else if (!s->ref) { + av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); + goto fail; + } + + if (s->nal_unit_type != s->first_nal_type) { + av_log(s->avctx, AV_LOG_ERROR, + "Non-matching NAL types of the VCL NALUs: %d %d\n", + s->first_nal_type, s->nal_unit_type); + return AVERROR_INVALIDDATA; + } + + if (!s->sh.dependent_slice_segment_flag && + s->sh.slice_type != HEVC_SLICE_I) { + ret = ff_hevc_rpi_slice_rpl(s); + if (ret < 0) { + av_log(s->avctx, AV_LOG_WARNING, + "Error constructing the reference lists for the current slice.\n"); + goto fail; + } + } + + ctb_addr_ts = hls_slice_data(s, nal); + if (ctb_addr_ts >= s->ps.sps->ctb_size) { + s->is_decoded = 1; + } + + if (ctb_addr_ts < 0) { + ret = ctb_addr_ts; + goto fail; + } + break; + 
case HEVC_NAL_EOS_NUT: + case HEVC_NAL_EOB_NUT: + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + break; + case HEVC_NAL_AUD: + case HEVC_NAL_FD_NUT: + break; + default: + av_log(s->avctx, AV_LOG_INFO, + "Skipping NAL unit %d\n", s->nal_unit_type); + } + + return 0; +fail: + if (s->avctx->err_recognition & AV_EF_EXPLODE) + return ret; + return 0; +} + +static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) +{ + int i, ret = 0; + int eos_at_start = 1; + + s->ref = NULL; + s->last_eos = s->eos; + s->eos = 0; + + /* split the input packet into NAL units, so we know the upper bound on the + * number of slices in the frame */ + ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, + s->nal_length_size, s->avctx->codec_id, 0, 0); + if (ret < 0) { + av_log(s->avctx, AV_LOG_ERROR, + "Error splitting the input into NAL units.\n"); + return ret; + } + + for (i = 0; i < s->pkt.nb_nals; i++) { + if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || + s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { + if (eos_at_start) { + s->last_eos = 1; + } else { + s->eos = 1; + } + } else { + eos_at_start = 0; + } + } + + /* decode the NAL units */ + for (i = 0; i < s->pkt.nb_nals; i++) { + ret = decode_nal_unit(s, &s->pkt.nals[i]); + if (ret < 0) { + av_log(s->avctx, AV_LOG_WARNING, + "Error parsing NAL unit #%d.\n", i); + goto fail; + } + } + +fail: // Also success path + if (s->ref != NULL) { + if (s->used_for_ref && s->threads_type != 0) { + ff_hevc_rpi_progress_signal_all_done(s); + } + else { + // Flush frame to real memory as we expect to be able to pass + // it straight on to mmal + flush_frame(s, s->frame); + } + } + return ret; +} + +static void print_md5(void *log_ctx, int level, uint8_t md5[16]) +{ + int i; + for (i = 0; i < 16; i++) + av_log(log_ctx, level, "%02"PRIx8, md5[i]); +} + +static int verify_md5(HEVCRpiContext *s, AVFrame *frame) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + int 
pixel_shift; + int i, j; + + if (!desc) + return AVERROR(EINVAL); + + pixel_shift = desc->comp[0].depth > 8; + + av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", + s->poc); + + /* the checksums are LE, so we have to byteswap for >8bpp formats + * on BE arches */ +#if HAVE_BIGENDIAN + if (pixel_shift && !s->checksum_buf) { + av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, + FFMAX3(frame->linesize[0], frame->linesize[1], + frame->linesize[2])); + if (!s->checksum_buf) + return AVERROR(ENOMEM); + } +#endif + + for (i = 0; frame->data[i]; i++) { + int width = s->avctx->coded_width; + int height = s->avctx->coded_height; + int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; + int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; + uint8_t md5[16]; + + av_md5_init(s->md5_ctx); + for (j = 0; j < h; j++) { + const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); +#if HAVE_BIGENDIAN + if (pixel_shift) { + s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, + (const uint16_t *) src, w); + src = s->checksum_buf; + } +#endif + av_md5_update(s->md5_ctx, src, w << pixel_shift); + } + av_md5_final(s->md5_ctx, md5); + + if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { + av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); + print_md5(s->avctx, AV_LOG_DEBUG, md5); + av_log (s->avctx, AV_LOG_DEBUG, "; "); + } else { + av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); + print_md5(s->avctx, AV_LOG_ERROR, md5); + av_log (s->avctx, AV_LOG_ERROR, " != "); + print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); + av_log (s->avctx, AV_LOG_ERROR, "\n"); + return AVERROR_INVALIDDATA; + } + } + + av_log(s->avctx, AV_LOG_DEBUG, "\n"); + + return 0; +} + +static int all_sps_supported(const HEVCRpiContext * const s) +{ + for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { + if (s->ps.sps_list[i] != NULL) + { + const HEVCRpiSPS * const sps = (const 
HEVCRpiSPS*)s->ps.sps_list[i]->data; + if (!is_sps_supported(sps)) + return 0; + } + } + return 1; +} + +static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) +{ + int ret, i; + + ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, + &s->nal_length_size, s->avctx->err_recognition, + s->apply_defdispwin, s->avctx); + if (ret < 0) + return ret; + + /* export stream parameters from the first SPS */ + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { + if (first && s->ps.sps_list[i]) { + const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; + export_stream_params(s->avctx, &s->ps, sps); + break; + } + } + + return 0; +} + +static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, + AVPacket *avpkt) +{ + int ret; + int new_extradata_size; + uint8_t *new_extradata; + HEVCRpiContext *s = avctx->priv_data; + + if (!avpkt->size) { + ret = ff_hevc_rpi_output_frame(s, data, 1); + if (ret < 0) + return ret; + + *got_output = ret; + return 0; + } + + new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, + &new_extradata_size); + if (new_extradata && new_extradata_size > 0) { + ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); + if (ret < 0) + return ret; + } + + s->ref = NULL; + ret = decode_nal_units(s, avpkt->data, avpkt->size); + if (ret < 0) + return ret; + + /* verify the SEI checksum */ + if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && + s->sei.picture_hash.is_md5) { + ret = verify_md5(s, s->ref->frame); + if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { + ff_hevc_rpi_unref_frame(s, s->ref, ~0); + return ret; + } + } + s->sei.picture_hash.is_md5 = 0; + + if (s->is_decoded) { + av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); + s->is_decoded = 0; + } + + if (s->output_frame->buf[0]) { + av_frame_move_ref(data, s->output_frame); + *got_output = 1; + } + + return avpkt->size; +} 
+ +static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) +{ + int ret; + + ret = ff_thread_ref_frame(&dst->tf, &src->tf); + if (ret < 0) + return ret; + + if (src->col_mvf_buf != NULL) + { + dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); + if (!dst->col_mvf_buf) + goto fail; + } + dst->col_mvf = src->col_mvf; + + dst->poc = src->poc; + dst->flags = src->flags; + dst->sequence = src->sequence; + return 0; + +fail: + ff_hevc_rpi_unref_frame(s, dst, ~0); + return AVERROR(ENOMEM); +} + + +static av_cold int hevc_decode_free(AVCodecContext *avctx) +{ + HEVCRpiContext * const s = avctx->priv_data; + int i; + + pic_arrays_free(s); + + av_freep(&s->md5_ctx); + + av_freep(&s->cabac_save); + +#if RPI_EXTRA_BIT_THREADS + bit_threads_kill(s); +#endif + + hevc_exit_worker(s); + for (i = 0; i != 2; ++i) { + ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + job_lc_kill(s->HEVClc); + + av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] + av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); + av_frame_free(&s->DPB[i].frame); + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) + av_buffer_unref(&s->ps.vps_list[i]); + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) + av_buffer_unref(&s->ps.sps_list[i]); + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) + av_buffer_unref(&s->ps.pps_list[i]); + s->ps.sps = NULL; + s->ps.pps = NULL; + s->ps.vps = NULL; + + // Free separately from sLists as used that way by RPI WPP + for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { + av_freep(s->HEVClcList + i); + } + s->HEVClc = NULL; // Allocated as part of HEVClcList + + ff_h2645_packet_uninit(&s->pkt); + + if (s->qpu_init_ok) + vpu_qpu_term(); + s->qpu_init_ok = 0; + + return 0; +} + + +static av_cold int hevc_init_context(AVCodecContext *avctx) +{ + HEVCRpiContext *s = 
avctx->priv_data; + int i; + + s->avctx = avctx; + + s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); + if (!s->HEVClc) + goto fail; + s->HEVClcList[0] = s->HEVClc; + + if (vpu_qpu_init() != 0) + goto fail; + s->qpu_init_ok = 1; + +#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C + { + static const uint32_t dframe[1] = {0x80808080}; + s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } +#endif +#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C + s->qpu_dummy_frame_qpu = qpu_dummy(); +#endif + + bt_lc_init(s, s->HEVClc, 0); + job_lc_init(s->HEVClc); + + for (i = 0; i != 2; ++i) { + ff_hevc_rpi_progress_init_state(s->progress_states + i); + } + + if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) + goto fail; + + if ((s->output_frame = av_frame_alloc()) == NULL) + goto fail; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + s->DPB[i].frame = av_frame_alloc(); + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; + s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; + + if ((s->md5_ctx = av_md5_alloc()) == NULL) + goto fail; + + s->context_initialized = 1; + s->eos = 0; + + ff_hevc_rpi_reset_sei(&s->sei); + + return 0; + +fail: + av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); + hevc_decode_free(avctx); + return AVERROR(ENOMEM); +} + +#if HAVE_THREADS +static int hevc_update_thread_context(AVCodecContext *dst, + const AVCodecContext *src) +{ + HEVCRpiContext *s = dst->priv_data; + HEVCRpiContext *s0 = src->priv_data; + int i, ret; + + av_assert0(s->context_initialized); + + // dst == src can happen according to the comments and in that case + // there is nothing to do here + if (dst == src) + return 0; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { + ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); + if (s0->DPB[i].frame->buf[0]) { + ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); + if (ret < 0) + return ret; + } + } + + if (s->ps.sps != s0->ps.sps) + s->ps.sps = NULL; + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { + 
av_buffer_unref(&s->ps.vps_list[i]); + if (s0->ps.vps_list[i]) { + s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); + if (!s->ps.vps_list[i]) + return AVERROR(ENOMEM); + } + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { + av_buffer_unref(&s->ps.sps_list[i]); + if (s0->ps.sps_list[i]) { + s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); + if (!s->ps.sps_list[i]) + return AVERROR(ENOMEM); + } + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { + av_buffer_unref(&s->ps.pps_list[i]); + if (s0->ps.pps_list[i]) { + s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); + if (!s->ps.pps_list[i]) + return AVERROR(ENOMEM); + } + } + + if (s->ps.sps != s0->ps.sps) + if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) + return ret; + + s->seq_decode = s0->seq_decode; + s->seq_output = s0->seq_output; + s->pocTid0 = s0->pocTid0; + s->max_ra = s0->max_ra; + s->eos = s0->eos; + s->no_rasl_output_flag = s0->no_rasl_output_flag; + + s->is_nalff = s0->is_nalff; + s->nal_length_size = s0->nal_length_size; + + s->threads_type = s0->threads_type; + + if (s0->eos) { + s->seq_decode = (s->seq_decode + 1) & 0xff; + s->max_ra = INT_MAX; + } + + s->sei.frame_packing = s0->sei.frame_packing; + s->sei.display_orientation = s0->sei.display_orientation; + s->sei.mastering_display = s0->sei.mastering_display; + s->sei.content_light = s0->sei.content_light; + s->sei.alternative_transfer = s0->sei.alternative_transfer; + + // * We do this here as it allows us to easily locate our parents + // global job pool, but there really should be a less nasty way + if (s->jbc == NULL) + { + av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL); + hevc_init_worker(s); + } + + return 0; +} +#endif + +#include +static int qpu_ok(void) +{ + static int is_pi3 = -1; + if (is_pi3 == -1) + { + struct stat sb; + is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0); + } + return is_pi3; +} + +static av_cold int hevc_decode_init(AVCodecContext *avctx) +{ + HEVCRpiContext *s = 
avctx->priv_data; + int ret; + + if (!qpu_ok()) + return AVERROR_DECODER_NOT_FOUND; + + if ((ret = hevc_init_context(avctx)) < 0) + return ret; + + // If we are a child context then stop now + // Everything after this point is either 1st decode setup or global alloc + // that must not be repeated + // Global info will be copied into children in update_thread_context (we + // can't do it here as we have no way of finding the parent context) + if (avctx->internal->is_copy) + return 0; + + // Job allocation requires VCSM alloc to work so ensure that we have it + // initialised by this point + { + HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); + if (jbg == NULL) { + av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); + ret = AVERROR(ENOMEM); + goto fail; + } + + if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) { + av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); + ret = AVERROR(ENOMEM); + goto fail; + } + } + + hevc_init_worker(s); + + s->eos = 1; + + if (avctx->extradata_size > 0 && avctx->extradata) { + if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0) + goto fail; + + if (!all_sps_supported(s)) { + ret = AVERROR_DECODER_NOT_FOUND; + goto fail; + } + } + + if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) + s->threads_type = FF_THREAD_FRAME; + else + s->threads_type = 0; + + return 0; + +fail: + hevc_decode_free(avctx); + return ret; +} + +static void hevc_decode_flush(AVCodecContext *avctx) +{ + HEVCRpiContext *s = avctx->priv_data; + ff_hevc_rpi_flush_dpb(s); + s->max_ra = INT_MAX; + s->eos = 1; +} + +typedef struct hwaccel_rpi3_qpu_env_s { + const AVClass *av_class; + AVZcEnvPtr zc; +} hwaccel_rpi3_qpu_env_t; + +static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame) +{ + hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data; + int rv; + + if (av_rpi_zc_in_use(s)) + { + rv = s->get_buffer2(s, frame, 0); + } + 
else + { + rv = av_rpi_zc_get_buffer(r3->zc, frame); + if (rv == 0) + rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc + } + + if (rv == 0 && + (rv = ff_attach_decode_data(frame)) < 0) + { + av_frame_unref(frame); + } + + return rv; +} + +static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx) +{ + hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; + av_rpi_zc_int_env_freep(&r3->zc); + return 0; +} + +static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx) +{ + hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; + + if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL) + goto fail; + + return 0; + +fail: + av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n"); + hwaccel_rpi3_qpu_free(avctx); + return AVERROR(ENOMEM); +} + + +#define OFFSET(x) offsetof(HEVCRpiContext, x) +#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) + + +static const AVOption options[] = { + { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin), + AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, + { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), + AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, + { NULL }, +}; + +static const AVClass hevc_rpi_decoder_class = { + .class_name = "HEVC RPI decoder", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { + AV_PIX_FMT_SAND128, + AV_PIX_FMT_SAND64_10, + AV_PIX_FMT_NONE +}; + + +static const AVHWAccel hwaccel_rpi3_qpu = { + .name = "Pi3 QPU Hwaccel", + .alloc_frame = hwaccel_alloc_frame, + .init = hwaccel_rpi3_qpu_init, + .uninit = hwaccel_rpi3_qpu_free, + .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), + .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, +}; + +static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = +{ + .public = { + .pix_fmt = AV_PIX_FMT_SAND128, + 
.methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, + .device_type = AV_HWDEVICE_TYPE_NONE, + }, + .hwaccel = &hwaccel_rpi3_qpu +}; +static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = +{ + .public = { + .pix_fmt = AV_PIX_FMT_SAND64_10, + .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, + .device_type = AV_HWDEVICE_TYPE_NONE, + }, + .hwaccel = &hwaccel_rpi3_qpu +}; + + +static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { + &hevc_rpi_hw_config_sand128, + &hevc_rpi_hw_config_sand64_10, + NULL +}; + + +AVCodec ff_hevc_rpi_decoder = { + .name = "hevc_rpi", + .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .priv_data_size = sizeof(HEVCRpiContext), + .priv_class = &hevc_rpi_decoder_class, + .init = hevc_decode_init, + .close = hevc_decode_free, + .decode = hevc_rpi_decode_frame, + .flush = hevc_decode_flush, + .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | + AV_CODEC_CAP_HARDWARE | + AV_CODEC_CAP_AVOID_PROBING | +#if 0 + // Debugging is often easier without threads getting in the way + 0, +#warning H265 threading turned off +#else + // We only have decent optimisation for frame - so only admit to that + AV_CODEC_CAP_FRAME_THREADS, +#endif + .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | + FF_CODEC_CAP_EXPORTS_CROPPING | + FF_CODEC_CAP_ALLOCATE_PROGRESS, + .pix_fmts = hevc_rpi_pix_fmts, + .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), + .hw_configs = hevc_rpi_hw_configs, +// .wrapper_name = "hevc_rpi", +}; + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 index 0000000000..1f94d18673 --- /dev/null +++ b/libavcodec/rpi_hevcdec.h @@ -0,0 +1,1091 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVCDEC_H +#define AVCODEC_RPI_HEVCDEC_H + +#include "config.h" + +#include + +#include "libavutil/buffer.h" + +#include "avcodec.h" +#include "bswapdsp.h" +#include "cabac.h" +#include "get_bits.h" +#include "rpi_hevcpred.h" +#include "h2645_parse.h" +#include "hevc.h" +#include "rpi_hevc_mv.h" +#include "rpi_hevc_ps.h" +#include "rpi_hevc_sei.h" +#include "rpi_hevcdsp.h" +#include "internal.h" +#include "thread.h" +#include "videodsp.h" + +#if ARCH_ARM +#include "arm/rpi_hevc_misc_neon.h" +#endif + +#define MAX_NB_THREADS 16 +#define SHIFT_CTB_WPP 2 + +//TODO: check if this is really the maximum +#define MAX_TRANSFORM_DEPTH 5 + +#define MAX_TB_SIZE 32 +#define MAX_QP 51 +#define DEFAULT_INTRA_TC_OFFSET 2 + +#define HEVC_CONTEXTS 199 + +#define MRG_MAX_NUM_CANDS 5 + +#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 + +// Size of DPB array +#define HEVC_DPB_ELS 32 + +#define L0 0 +#define L1 1 + +#define EPEL_EXTRA_BEFORE 1 +#define EPEL_EXTRA_AFTER 2 +#define EPEL_EXTRA 3 +#define QPEL_EXTRA_BEFORE 3 +#define QPEL_EXTRA_AFTER 4 +#define QPEL_EXTRA 7 + +#define EDGE_EMU_BUFFER_STRIDE 80 + +#include +#include "rpi_qpu.h" + +// Max jobs per frame thread. 
Actual usage will be limited by the size +// of the global job pool +// ?? Limits +#define RPI_MAX_JOBS 8 + +// This is the number of _extra_ bit threads - we will have +// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing +// +// 0 is legitimate and will disable our WPP processing +//#define RPI_EXTRA_BIT_THREADS 0 +#define RPI_EXTRA_BIT_THREADS 2 + +// Number of separate threads/passes in worker +// 2 and 3 are the currently valid numbers +// At the moment 3 seems fractionally faster +//#define RPI_PASSES 2 +#define RPI_PASSES 3 + +// Print out various usage stats +#define RPI_TSTATS 0 + +// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form +#define RPI_COMPRESS_COEFFS 1 + +// Wait for VPU/QPU to finish in worker pass 0 +// If 0 then the wait is in pass 1 +// +// One might expect the better place to wait would be in pass 1 however +// testing shows that pass 0 produces overall faster decode. +// Interestingly it is QPU/VPU limited streams that seem to suffer +// from pass 1 waits, CPU limited ones tend to show a very mild gain. +// This define exists so it is easy to test this. +#define RPI_WORKER_WAIT_PASS_0 1 + +// Use ARM emulation of QPU pred +// These are for debug only as the emulation makes only limited +// effort to be fast +#define RPI_QPU_EMU_Y 0 +#define RPI_QPU_EMU_C 0 + +// Max width & height we are prepared to consider +// Sand frame shape calc becomes confused with large frames +// Some buffer alloc also depends on this +#define HEVC_RPI_MAX_WIDTH 2048 +#define HEVC_RPI_MAX_HEIGHT 1088 + + +// Min CTB size is 16 +#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) + +/** + * Value of the luma sample at position (x, y) in the 2D array tab. 
+ */ +#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) +#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) + +#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) +#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ + (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) +#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) + +enum RPSType { + ST_CURR_BEF = 0, + ST_CURR_AFT, + ST_FOLL, + LT_CURR, + LT_FOLL, + NB_RPS_TYPE, +}; + +enum SyntaxElement { + SAO_MERGE_FLAG = 0, + SAO_TYPE_IDX, + SAO_EO_CLASS, + SAO_BAND_POSITION, + SAO_OFFSET_ABS, + SAO_OFFSET_SIGN, + END_OF_SLICE_FLAG, + SPLIT_CODING_UNIT_FLAG, + CU_TRANSQUANT_BYPASS_FLAG, + SKIP_FLAG, + CU_QP_DELTA, + PRED_MODE_FLAG, + PART_MODE, + PCM_FLAG, + PREV_INTRA_LUMA_PRED_FLAG, + MPM_IDX, + REM_INTRA_LUMA_PRED_MODE, + INTRA_CHROMA_PRED_MODE, + MERGE_FLAG, + MERGE_IDX, + INTER_PRED_IDC, + REF_IDX_L0, + REF_IDX_L1, + ABS_MVD_GREATER0_FLAG, + ABS_MVD_GREATER1_FLAG, + ABS_MVD_MINUS2, + MVD_SIGN_FLAG, + MVP_LX_FLAG, + NO_RESIDUAL_DATA_FLAG, + SPLIT_TRANSFORM_FLAG, + CBF_LUMA, + CBF_CB_CR, + TRANSFORM_SKIP_FLAG, + EXPLICIT_RDPCM_FLAG, + EXPLICIT_RDPCM_DIR_FLAG, + LAST_SIGNIFICANT_COEFF_X_PREFIX, + LAST_SIGNIFICANT_COEFF_Y_PREFIX, + LAST_SIGNIFICANT_COEFF_X_SUFFIX, + LAST_SIGNIFICANT_COEFF_Y_SUFFIX, + SIGNIFICANT_COEFF_GROUP_FLAG, + SIGNIFICANT_COEFF_FLAG, + COEFF_ABS_LEVEL_GREATER1_FLAG, + COEFF_ABS_LEVEL_GREATER2_FLAG, + COEFF_ABS_LEVEL_REMAINING, + COEFF_SIGN_FLAG, + LOG2_RES_SCALE_ABS, + RES_SCALE_SIGN_FLAG, + CU_CHROMA_QP_OFFSET_FLAG, + CU_CHROMA_QP_OFFSET_IDX, +}; + +enum PartMode { + PART_2Nx2N = 0, + PART_2NxN = 1, + PART_Nx2N = 2, + PART_NxN = 3, + PART_2NxnU = 4, + PART_2NxnD = 5, + PART_nLx2N = 6, + PART_nRx2N = 7, +}; + +enum PredMode { + MODE_INTER = 0, + MODE_INTRA, + MODE_SKIP, +}; + +enum InterPredIdc { + PRED_L0 = 0, + PRED_L1, + PRED_BI, +}; + +enum PredFlag { 
+ PF_INTRA = 0, + PF_L0, + PF_L1, + PF_BI, +}; + +enum SAOType { + SAO_NOT_APPLIED = 0, + SAO_BAND, + SAO_EDGE, + SAO_APPLIED +}; + +enum SAOEOClass { + SAO_EO_HORIZ = 0, + SAO_EO_VERT, + SAO_EO_135D, + SAO_EO_45D, +}; + +enum ScanType { + SCAN_DIAG = 0, + SCAN_HORIZ, + SCAN_VERT, +}; + +typedef struct RefPicList { + struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; + int list[HEVC_MAX_REFS]; + uint8_t isLongTerm[HEVC_MAX_REFS]; + int nb_refs; +} RefPicList; + +typedef struct RefPicListTab { + RefPicList refPicList[2]; +} RefPicListTab; + +typedef struct RpiCodingUnit { + unsigned int x; // Passed to deblock + unsigned int y; + unsigned int x_split; + unsigned int y_split; + + enum PredMode pred_mode; ///< PredMode + enum PartMode part_mode; ///< PartMode + + // Inferred parameters + uint8_t intra_split_flag; ///< IntraSplitFlag + uint8_t max_trafo_depth; ///< MaxTrafoDepth + uint8_t cu_transquant_bypass_flag; +} RpiCodingUnit; + +typedef struct RpiPredictionUnit { + uint8_t intra_pred_mode[4]; + uint8_t intra_pred_mode_c[4]; + uint8_t chroma_mode_c[4]; + uint8_t merge_flag; +} RpiPredictionUnit; + +typedef struct HEVCRpiTransformUnit { + int8_t cu_qp_delta; + + // Inferred parameters; + uint8_t intra_pred_mode; + uint8_t intra_pred_mode_c; + uint8_t chroma_mode_c; + uint8_t is_cu_qp_delta_wanted; + uint8_t cu_chroma_qp_offset_wanted; + const int8_t * qp_divmod6[3]; +} HEVCRpiTransformUnit; + +typedef struct DBParams { + int8_t beta_offset; // -12 to +12 + int8_t tc_offset; // -12 to +12 +} DBParams; + +#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) +#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) +#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) +#define HEVC_FRAME_FLAG_BUMPING (1 << 3) + +struct HEVCRpiJob; + +typedef struct HEVCRpiFrame { + AVFrame *frame; + ThreadFrame tf; + ColMvField *col_mvf; + int poc; + struct HEVCRpiFrame *collocated_ref; + + AVBufferRef *col_mvf_buf; + + /** + * A sequence counter, so that old frames are output first + * after a POC reset + */ + uint16_t sequence; 
+ + /** + * A combination of HEVC_FRAME_FLAG_* + */ + uint8_t flags; + + // Entry no in DPB - can be used as a small unique + // frame identifier (within the current thread) + uint8_t dpb_no; +} HEVCRpiFrame; + +typedef struct HEVCRpiLocalContext { + HEVCRpiTransformUnit tu; + + CABACContext cc; + + // Vars that allow us to locate everything from just an lc + struct HEVCRpiContext * context; // ??? make const ??? + unsigned int lc_n; // lc list el no + + // Job wait links + struct HEVCRpiLocalContext * jw_next; + struct HEVCRpiLocalContext * jw_prev; + struct HEVCRpiLocalContext * ljw_next; + struct HEVCRpiLocalContext * ljw_prev; + struct HEVCRpiJob * volatile jw_job; + sem_t jw_sem; + + // ?? Wrap in structure ?? + sem_t bt_sem_in; + sem_t * bt_psem_out; + volatile int bt_terminate; + unsigned int ts; + unsigned int bt_last_line; // Last line in this bit_thread chunk + unsigned int bt_line_no; + unsigned int bt_line_width; + unsigned int bt_line_inc; + + struct HEVCRpiJob * jb0; + char unit_done; // Set once we have dealt with this slice + char bt_is_tile; + char last_progress_good; + char cabac_init_req; + + uint8_t cabac_state[HEVC_CONTEXTS]; + uint8_t stat_coeff[4]; + GetBitContext gb; + + uint8_t ct_depth; + int8_t qp_y; + int8_t curr_qp_y; + int8_t qPy_pred; + +// N.B. 
Used by asm (neon) - do not change +#define AVAIL_S_UR 0 +#define AVAIL_S_U 1 +#define AVAIL_S_UL 2 +#define AVAIL_S_L 3 +#define AVAIL_S_DL 4 + +#define AVAIL_U (1 << AVAIL_S_U) +#define AVAIL_L (1 << AVAIL_S_L) +#define AVAIL_UL (1 << AVAIL_S_UL) +#define AVAIL_UR (1 << AVAIL_S_UR) +#define AVAIL_DL (1 << AVAIL_S_DL) + +// Intra filters - same number space as avail +#define FILTER_LIGHT 0x40 +#define FILTER_STRONG 0x80 +#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) + + uint8_t ctb_avail; + int end_of_ctb_x; + int end_of_ctb_y; + + RpiCodingUnit cu; + RpiPredictionUnit pu; + +#define BOUNDARY_LEFT_SLICE (1 << 0) +#define BOUNDARY_LEFT_TILE (1 << 1) +#define BOUNDARY_UPPER_SLICE (1 << 2) +#define BOUNDARY_UPPER_TILE (1 << 3) + /* properties of the boundary of the current CTB for the purposes + * of the deblocking filter */ + unsigned int boundary_flags; + +#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) + uint8_t ipm_left[IPM_TAB_SIZE]; + uint8_t ipm_up[IPM_TAB_SIZE]; + +//#define MVF_STASH_WIDTH 128 +#define MVF_STASH_WIDTH 64 +#define MVF_STASH_HEIGHT 64 +#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) +#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) + HEVCRpiMvField mvf_ul[1]; + HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; + + /* +7 is for subpixel interpolation, *2 for high bit depths */ +// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; + /* The extended size between the new edge emu buffer is abused by SAO */ +// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; +// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); + +} HEVCRpiLocalContext; + +// Each block can have an intra prediction and an add_residual command +// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH + +// Sand only has 2 planes (Y/C) +#define RPI_MAX_PRED_CMDS 
(2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) + +// Command for intra prediction and transform_add of predictions to coefficients +enum rpi_pred_cmd_e +{ + RPI_PRED_ADD_RESIDUAL, + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_C, // Merged U+V + RPI_PRED_ADD_DC, + RPI_PRED_ADD_DC_U, // Both U & V are effectively C + RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_INTRA_C, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX +}; + +typedef struct HEVCPredCmd { + uint8_t type; + uint8_t size; // log2 "size" used by all variants + uint8_t avail; // i_pred - but left here as they pack well + uint8_t dummy; + union { + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; + uint16_t stride; // Should be good enough for all pic fmts we use + int16_t dc; + } ta; + struct { + uint8_t * dst; + uint32_t stride; + int dc; + } dc; + struct { // INTRA + uint16_t x; + uint16_t y; + enum IntraPredMode mode; + } i_pred; + struct { // I_PCM + uint16_t x; + uint16_t y; + const void * src; + uint32_t src_len; + } i_pcm; + }; +} HEVCPredCmd; + +union qpu_mc_pred_cmd_s; +struct qpu_mc_pred_y_p_s; +struct qpu_mc_src_s; + +typedef struct HEVCRpiInterPredQ +{ + union qpu_mc_pred_cmd_u *qpu_mc_base; + union qpu_mc_pred_cmd_u *qpu_mc_curr; + struct qpu_mc_src_s *last_l0; + struct qpu_mc_src_s *last_l1; + unsigned int load; + uint32_t code_setup; + uint32_t code_sync; + uint32_t code_exit; +} HEVCRpiInterPredQ; + +typedef struct HEVCRpiInterPredEnv +{ + HEVCRpiInterPredQ * q; + uint8_t n; // Number of Qs + uint8_t n_grp; // Number of Q in a group + uint8_t curr; // Current Q number (0..n-1) + uint8_t used; // 0 if nothing in any Q, 1 otherwise + uint8_t used_grp; // 0 if nothing in any Q in the current group + unsigned int max_fill; + unsigned int min_gap; + GPU_MEM_PTR_T gptr; +} HEVCRpiInterPredEnv; + +typedef struct HEVCRpiIntraPredEnv { + unsigned int n; // Number of commands + 
HEVCPredCmd * cmds; +} HEVCRpiIntraPredEnv; + +typedef struct HEVCRpiCoeffEnv { + unsigned int n; +#if RPI_COMPRESS_COEFFS + unsigned int packed; // Equal to 1 if coefficients should be being packed + unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 +#endif + int16_t * buf; +} HEVCRpiCoeffEnv; + +typedef struct HEVCRpiCoeffsEnv { + HEVCRpiCoeffEnv s[4]; + GPU_MEM_PTR_T gptr; + void * mptr; +} HEVCRpiCoeffsEnv; + +typedef struct HEVCRpiFrameProgressWait { + int req; + struct HEVCRpiFrameProgressWait * next; + sem_t sem; +} HEVCRpiFrameProgressWait; + +typedef struct HEVCRpiFrameProgressState { + struct HEVCRpiFrameProgressWait * first; + struct HEVCRpiFrameProgressWait * last; + pthread_mutex_t lock; +} HEVCRpiFrameProgressState; + +typedef struct RpiBlk +{ + unsigned int x; + unsigned int y; + unsigned int w; + unsigned int h; +} RpiBlk; + +typedef struct HEVCRpiJob { + struct HEVCRpiJob * next; // Free chain + struct HEVCRpiJobCtl * jbc_local; + const HEVCRpiSPS * sps; // sps used to set up this job + + int waited; + int ctu_ts_first; + int ctu_ts_last; + RpiBlk bounds; // Bounding box of job + + struct qpu_mc_pred_y_p_s * last_y8_p; + struct qpu_mc_src_s * last_y8_l1; + rpi_cache_flush_env_t * rfe; + + HEVCRpiInterPredEnv chroma_ip; + HEVCRpiInterPredEnv luma_ip; + int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no + HEVCRpiIntraPredEnv intra; + HEVCRpiCoeffsEnv coeffs; + HEVCRpiFrameProgressWait progress_wait; + sem_t sem; + rpi_cache_buf_t flush_buf; +} HEVCRpiJob; + +struct HEVCRpiContext; + +typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); + +typedef struct HEVCRpiPassQueue +{ +// int pending; + volatile int terminate; + sem_t sem_in; + sem_t * psem_out; + unsigned int job_n; + struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread + HEVCRpiWorkerFn * worker; + pthread_t 
thread; + uint8_t pass_n; // Pass number - debug + uint8_t started; +} HEVCRpiPassQueue; + + +struct HEVCRpiJobGlobal; + +typedef struct HEVCRpiJobCtl +{ + sem_t sem_out; + + HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated + struct HEVCRpiJobGlobal * jbg; + + HEVCRpiLocalContext * lcw_head; + HEVCRpiLocalContext * lcw_tail; + + pthread_mutex_t in_lock; + int offload_in; + + HEVCRpiJob *offloadq[RPI_MAX_JOBS]; +} HEVCRpiJobCtl; + + +typedef struct HEVCRpiJobGlobal +{ + intptr_t ref_count; + pthread_mutex_t lock; + HEVCRpiJob * free1; // Singly linked list of free jobs + HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job + HEVCRpiLocalContext * wait_good; // Last good tail + HEVCRpiLocalContext * wait_tail; + +} HEVCRpiJobGlobal; + +#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) + +#if RPI_TSTATS +typedef struct HEVCRpiStats { + int y_pred1_y8_merge; + int y_pred1_xy; + int y_pred1_x0; + int y_pred1_y0; + int y_pred1_x0y0; + int y_pred1_wle8; + int y_pred1_wgt8; + int y_pred1_hle16; + int y_pred1_hgt16; + int y_pred2_xy; + int y_pred2_x0; + int y_pred2_y0; + int y_pred2_x0y0; + int y_pred2_hle16; + int y_pred2_hgt16; +} HEVCRpiStats; +#endif + +typedef struct HEVCRpiCabacState +{ + uint8_t rice[4]; + uint8_t state[HEVC_CONTEXTS]; +} HEVCRpiCabacState; + +#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels +#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) +#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) +#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte +#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el +#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) +#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) +#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical 
pels per row +#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + +typedef struct HEVCRpiContext { + const AVClass *c; // needed by private avoptions + AVCodecContext *avctx; + + uint8_t threads_type; + char qpu_init_ok; + + /** 1 if the independent slice segment header was successfully parsed */ + uint8_t slice_initialized; + char used_for_ref; // rpi + char is_irap; + char offload_recon; + uint8_t eos; ///< current packet contains an EOS/EOB NAL + uint8_t last_eos; ///< last packet contains an EOS/EOB NAL + uint8_t no_backward_pred_flag; + uint8_t is_decoded; + uint8_t no_rasl_output_flag; + + + /** + * Sequence counters for decoded and output frames, so that old + * frames are output first after a POC reset + */ + uint16_t seq_decode; + uint16_t seq_output; + + int width; + int height; + + HEVCRpiJobCtl * jbc; + // cabac stash + // b0 skip flag + // b1+ ct_depth + uint8_t * cabac_stash_left; + uint8_t * cabac_stash_up; + + // Function pointers +#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C + const uint8_t * qpu_dummy_frame_emu; +#endif +#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C + uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory +#endif + HEVCRpiQpu qpu; + + HEVCRpiFrameProgressState progress_states[2]; + + HEVCRpiCabacState *cabac_save; + + AVFrame *frame; + AVFrame *output_frame; + uint8_t *sao_pixel_buffer_h[3]; + uint8_t *sao_pixel_buffer_v[3]; + + unsigned int col_mvf_stride; + AVBufferPool *col_mvf_pool; + + RpiSAOParams *sao; + DBParams *deblock; + enum HEVCNALUnitType nal_unit_type; + int temporal_id; ///< temporal_id_plus1 - 1 + HEVCRpiFrame *ref; + int poc; + int pocTid0; + int slice_idx; ///< number of the slice being currently decoded + int max_ra; + + int8_t *qp_y_tab; + + // Deblocking block strength bitmaps + unsigned int bs_stride2; + unsigned int bs_size; + uint8_t *bs_horizontal; + uint8_t *bs_vertical; + uint8_t *bsf_stash_up; + uint8_t *bsf_stash_left; + +#if HEVC_RPI_MAX_CTBS >= 0xffff +#define 
TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 + uint32_t *tab_slice_address; +#else +#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 + uint16_t *tab_slice_address; +#endif + + // Bitfield 1 bit per 8 pels (min pcm size) + uint8_t *is_pcm; + // Bitfield 1 bit per 8 pels (min cb size) + // Only needed for CIP as CIP processing is async to the main thread + uint8_t *is_intra; + + // PU + HEVCRpiMvField *mvf_up; + HEVCRpiMvField *mvf_left; + + const RefPicList **rpl_up; + const RefPicList **rpl_left; + RefPicList * refPicList; + + // CTB-level flags affecting loop filter operation + uint8_t *filter_slice_edges; + + /** used on BE to byteswap the lines for checksumming */ + uint8_t *checksum_buf; + int checksum_buf_size; + + const uint8_t *data; + + H2645Packet pkt; + // type of the first VCL NAL of the current frame + enum HEVCNALUnitType first_nal_type; + + uint8_t context_initialized; + int is_nalff; ///< this flag is != 0 if bitstream is encapsulated + ///< as a format defined in 14496-15 + int apply_defdispwin; + + int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) + int nuh_layer_id; + + struct AVMD5 *md5_ctx; + + RefPicListTab * rpl_tab; + unsigned int rpl_tab_size; + + uint8_t *is_intra_store; + + RpiSliceHeader sh; + + HEVCRpiParamSets ps; + + HEVCRpiLocalContext *HEVClc; + HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; + + HEVCRpiFrame DPB[HEVC_DPB_ELS]; + + ///< candidate references for the current frame + RefPicList rps[5]; + + HEVCRpiPredContext hpc; + HEVCDSPContext hevcdsp; + + HEVCSEIContext sei; + + // Put structures that allocate non-trivial storage at the end + // These are mostly used indirectly so position in the structure doesn't matter + HEVCRpiPassQueue passq[RPI_PASSES]; +#if RPI_EXTRA_BIT_THREADS > 0 + int bt_started; + // This simply contains thread descriptors - task setup is held elsewhere + pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; +#endif +#if RPI_TSTATS + HEVCRpiStats tstats; +#endif +} HEVCRpiContext; + +/** + * Mark all 
frames in DPB as unused for reference. + */ +void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); + +/** + * Drop all frames currently in DPB. + */ +void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); + +/** + * Construct the reference picture sets for the current frame. + */ +int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); + +/** + * Construct the reference picture list(s) for the current slice. + */ +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); + + +/** + * Get the number of candidate references for the current frame. + */ +int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); + +int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); + +/** + * Find next frame in output order and put a reference to it in frame. + * @return 1 if a frame was output, 0 otherwise + */ +int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); + +void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); + +void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); + +unsigned int ff_hevc_rpi_tb_avail_flags( + const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, + const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); + +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, + int merge_idx, HEVCRpiMvField * const mv); +void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int nPbW, const unsigned int nPbH, + const unsigned int avail, + HEVCRpiMvField * const mv, + const unsigned int mvp_lx_flag, const unsigned int LX); +void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); +void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, + const unsigned int x0, const 
unsigned int y0, + const unsigned int log2_trafo_size, const int is_coded_block); +int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); + +extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; +extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; +extern const uint8_t ff_hevc_rpi_qpel_extra[4]; + +int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); + +// arm/hevc_misc_neon.S +// Neon coeff zap fn +#if HAVE_NEON +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const HEVCRpiFrame * const ref, const int val, const int field); + +void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); + +// All of these expect that s->threads_type == FF_THREAD_FRAME + +static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const HEVCRpiFrame * const ref, const int y) +{ + if (s->threads_type != 0) + ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); +} + +static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) +{ + if (s->used_for_ref && s->threads_type != 0) + ff_hevc_rpi_progress_signal_field(s, y, 1); +} + +static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const HEVCRpiFrame * const ref, const int y) +{ + ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); +} + +static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) +{ + if (s->used_for_ref && s->threads_type != 0) + { + ff_hevc_rpi_progress_signal_field(s, y, 0); + } +} + +static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) +{ + ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); + ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); +} + + +// Set all done - signal nothing (used in 
missing refs) +// Works for both rpi & non-rpi +static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) +{ + if (ref->tf.progress != NULL) + { + int * const p = (int *)ref->tf.progress->data; + p[0] = INT_MAX; + p[1] = INT_MAX; + } +} + +#define HEVC_RPI_420_ONLY 1 +#define HEVC_RPI_SAND128_ONLY 1 + +static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) +{ +#if HEVC_RPI_420_ONLY + return cidx == 0 ? 0 : 1; +#else + return s->ps.sps->hshift[cidx]; +#endif +} + +static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) +{ +#if HEVC_RPI_420_ONLY + return cidx == 0 ? 0 : 1; +#else + return s->ps.sps->vshift[cidx]; +#endif +} + +static inline int ctx_cfmt(const HEVCRpiContext * const s) +{ +#if HEVC_RPI_420_ONLY + return 1; +#else + return s->ps.sps->chroma_format_idc; +#endif +} + +static inline int frame_stride1(const AVFrame * const frame, const int c_idx) +{ +#if HEVC_RPI_SAND128_ONLY + return 128; +#else + return frame->linesize[c_idx]; +#endif +} + +#if HEVC_RPI_SAND128_ONLY +// Propagate this decision to later zc includes +#define RPI_ZC_SAND128_ONLY 1 +#endif + +#ifndef ff_hevc_rpi_copy_vert +static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, + int pixel_shift, int height, + ptrdiff_t stride_dst, ptrdiff_t stride_src) +{ + int i; + switch (pixel_shift) + { + case 2: + for (i = 0; i < height; i++) { + *(uint32_t *)dst = *(uint32_t *)src; + dst += stride_dst; + src += stride_src; + } + break; + case 1: + for (i = 0; i < height; i++) { + *(uint16_t *)dst = *(uint16_t *)src; + dst += stride_dst; + src += stride_src; + } + break; + default: + for (i = 0; i < height; i++) { + *dst = *src; + dst += stride_dst; + src += stride_src; + } + break; + } +} +#endif + + +#if MVF_STASH_WIDTH == 64 +static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, + const unsigned int x, const unsigned int y) +{ + const 
unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); + return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); +} + +static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int x, const unsigned int y) +{ + const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); + const unsigned int x0_ctb = x0 & mask_cs_hi; + const unsigned int y0_ctb = y0 & mask_cs_hi; + + return (HEVCRpiMvField *)((y < y0_ctb) ? + (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : + (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : + lc->mvf_stash + + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); +} + +static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, + const unsigned int x0, + const unsigned int x) +{ + const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); + const unsigned int x0_ctb = x0 & mask_cs_hi; + return x < x0_ctb ? 
1 : MVF_STASH_WIDTH_PU; +} + +#else +static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, + const unsigned int x, const unsigned int y) +{ + const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); + return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); +} + +static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int x, const unsigned int y) +{ + const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); + + const unsigned int x0_ctb = x0 & mask_cs_hi; + const unsigned int y0_ctb = y0 & mask_cs_hi; + + // If not in the same CTB for Y assume up + if (y < y0_ctb) { + // If not in the same CTB for X too assume up-left + return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); + } + return mvf_stash_ptr(s, lc, x, y); +} + +static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, + const unsigned int x0, + const unsigned int x) +{ + return MVF_STASH_WIDTH_PU; +} +#endif + +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 index 0000000000..87f3cc9d14 --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c @@ -0,0 +1,450 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere + * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rpi_hevcdsp.h" +#include "rpi_hevc_mv.h" + +static const int8_t transform[32][32] = { + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, + -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, + { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, + -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, + { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, + 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, + { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, + 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, + { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, + -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, + { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, + -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, + { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, + 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, + { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, + 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, + { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, + -38, -88, 
-73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, + { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, + -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, + { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, + 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, + { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, + 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, + { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, + -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, + { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, + -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, + { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, + 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, + { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, + 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, + { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, + -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, + { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, + -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, + { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, + 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, + { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, + 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, + { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, + -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, + { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, + -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 
57, -90, 43 }, + { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, + 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, + { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, + 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, + { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, + -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, + { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, + -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, + { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, + 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, + { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, + 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, + { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, + -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, + { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, + -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, + { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, + 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, +}; + +DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { + { -2, 58, 10, -2}, + { -4, 54, 16, -2}, + { -6, 46, 28, -4}, + { -4, 36, 36, -4}, + { -4, 28, 46, -6}, + { -2, 16, 54, -4}, + { -2, 10, 58, -2}, +}; + +DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { + { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, + { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, + { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} +}; + +#define BIT_DEPTH 8 +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 9 +#include "rpi_hevcdsp_template.c" 
+#undef BIT_DEPTH + +#define BIT_DEPTH 10 +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 12 +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + +static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + int in_inc0, int in_inc1) +{ + int shift = 32; + uint32_t bs = 0; + for (; pus > 0; pus--) { + int strength, out; + int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; + int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; + int nr_idx0 = neigh->ref_idx[0]; + int nr_idx1 = neigh->ref_idx[1]; + int neigh_refL0 = neigh_rpl0[nr_idx0]; + int neigh_refL1 = neigh_rpl1[nr_idx1]; + + av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); + av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); + +#if 1 // This more directly matches the original implementation + if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { + // same L0 and L1 + if (curr_refL0 == neigh_refL0 && + curr_refL0 == curr_refL1 && + neigh_refL0 == neigh_refL1) { + if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || + FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && + (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || + FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) + strength = 1; + else + strength = 0; + } else if (neigh_refL0 == curr_refL0 && + neigh_refL1 == curr_refL1) { + if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || + FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) + strength = 1; + else + strength = 0; + } else if (neigh_refL1 == curr_refL0 && + neigh_refL0 == curr_refL1) { + if 
(FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || + FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) + strength = 1; + else + strength = 0; + } else { + strength = 1; + } + } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV + MvXY curr_mv0, neigh_mv0; + + if (curr->pred_flag & 1) { + curr_mv0 = curr->xy[0]; + } else { + curr_mv0 = curr->xy[1]; + curr_refL0 = curr_refL1; + } + + if (neigh->pred_flag & 1) { + neigh_mv0 = neigh->xy[0]; + } else { + neigh_mv0 = neigh->xy[1]; + neigh_refL0 = neigh_refL1; + } + + if (curr_refL0 == neigh_refL0) { + if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) + strength = 1; + else + strength = 0; + } else + strength = 1; + } else + strength = 1; +#else // This has exactly the same effect, but is more suitable for vectorisation + MvXY curr_mv[2]; + MvXY neigh_mv[2]; + memcpy(curr_mv, curr->xy, sizeof curr_mv); + memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); + + if (!(curr->pred_flag & 2)) { + curr_mv[1] = curr_mv[0]; + curr_refL1 = curr_refL0; + } + if (!(neigh->pred_flag & 2)) { + neigh_mv[1] = neigh_mv[0]; + neigh_refL1 = neigh_refL0; + } + if (!(curr->pred_flag & 1)) { + curr_mv[0] = curr_mv[1]; + curr_refL0 = curr_refL1; + } + if (!(neigh->pred_flag & 1)) { + neigh_mv[0] = neigh_mv[1]; + neigh_refL0 = neigh_refL1; + } + + strength = 1; + + strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | + (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | + (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); + + strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | + (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | + (FFABS(MV_X(neigh_mv[0]) - 
MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4); + + strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); +#endif + + curr += in_inc0 / sizeof (HEVCRpiMvField); + neigh += in_inc1 / sizeof (HEVCRpiMvField); + + for (out = dup; out > 0; out--) + { + bs = (bs >> 2) | (strength << 30); + shift -= 2; + } + } + return bs >> shift; +} + + +static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) +{ + unsigned int i, j; + + if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+=8) + AV_COPY64U(dst+j, src+j); + dst += stride_dst; + src += stride_src; + } + } else { + for (i = 0; i < height; i++) { + for (j = 0; j < width; j+=16) + AV_COPY128(dst+j, src+j); + dst += stride_dst; + src += stride_src; + } + } +} + + + +void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +{ +#undef FUNC +#define FUNC(a, depth) a ## _ ## depth + +#undef PEL_FUNC +#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ + for(i = 0 ; i < 10 ; i++) \ +{ \ + hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ +} + +#undef EPEL_FUNCS +#define EPEL_FUNCS(depth) \ + PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ + PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ + PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ + PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) + +#undef EPEL_UNI_FUNCS +#define EPEL_UNI_FUNCS(depth) \ + PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ + PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ + PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ + PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ + PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ + PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ + PEL_FUNC(put_hevc_epel_uni_w, 
1, 0, put_hevc_epel_uni_w_v, depth); \ + PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) + +#undef EPEL_BI_FUNCS +#define EPEL_BI_FUNCS(depth) \ + PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ + PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ + PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ + PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ + PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ + PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ + PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) + +#undef QPEL_FUNCS +#define QPEL_FUNCS(depth) \ + PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ + PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ + PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ + PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) + +#undef QPEL_UNI_FUNCS +#define QPEL_UNI_FUNCS(depth) \ + PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ + PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ + PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ + PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ + PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ + PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ + PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) + +#undef QPEL_BI_FUNCS +#define QPEL_BI_FUNCS(depth) \ + PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ + PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ + PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, 
depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) + +#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ + hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ + hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ + hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ + hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ + hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ + hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ + hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ + hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ + hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ + hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ + hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ + hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ + hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ + hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ + hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ + hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) +#define SLICED_LOOP_FILTERS(depth)\ + hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ + hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ + hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ + hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) +#define SLICED_SAO(depth)\ + for (i = 0; i != SAO_FILTER_N; ++i) { \ + hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ + hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ + } \ + hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, 
depth); \ + hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) + +#define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ + hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ + hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ + hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ + hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ + hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ + hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ + hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ + hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ + SLICED_ADD_RESIDUAL(depth); \ + hevcdsp->dequant = FUNC(dequant, depth); \ + hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ + hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ + hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ + hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ + hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ + hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ + \ + hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ + hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ + for (i = 0; i != SAO_FILTER_N; ++i) { \ + hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ + hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ + } \ + hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ + hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ + SLICED_SAO(depth); \ + \ + QPEL_FUNCS(depth); \ + QPEL_UNI_FUNCS(depth); \ + QPEL_BI_FUNCS(depth); \ + EPEL_FUNCS(depth); \ + EPEL_UNI_FUNCS(depth); \ + EPEL_BI_FUNCS(depth); \ + \ + SLICED_LOOP_FILTERS(depth); \ + hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ + 
hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ + hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ + hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ + hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ + hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) +int i = 0; + + switch (bit_depth) { + case 9: + HEVC_DSP(9); + break; + case 10: + HEVC_DSP(10); + break; + case 12: + HEVC_DSP(12); + break; + default: + HEVC_DSP(8); + break; + } + + hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; + hevcdsp->cpy_blk = cpy_blk; + + if (ARCH_PPC) + ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); + if (ARCH_X86) + ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); + if (ARCH_ARM) + ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); + if (ARCH_MIPS) + ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 index 0000000000..5a7cdeeb66 --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h @@ -0,0 +1,177 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere + * + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVCDSP_H +#define AVCODEC_RPI_HEVCDSP_H + +#include "hevc.h" +#include "get_bits.h" + +struct HEVCRpiMvField; + +#define MAX_PB_SIZE 64 + +#define RPI_HEVC_SAO_BUF_STRIDE 160 + + +typedef struct RpiSAOParams { + uint8_t band_position[3]; ///< sao_band_position (Y,U,V) + uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V) + uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V) + + int16_t offset_val[3][5]; ///> 16; + const int dc_u = (dc << 16) >> 16; + + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + dc_u); + dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); + } + dst += stride; + } +} + + +static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) +{ + FUNC(add_residual)(_dst, res, stride, 4); +} + +static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) +{ + FUNC(add_residual)(_dst, res, stride, 8); +} + +static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) +{ + FUNC(add_residual)(_dst, res, stride, 16); +} + +static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) +{ + FUNC(add_residual)(_dst, res, stride, 32); +} + +static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +{ + FUNC(add_residual_dc)(_dst, stride, dc, 4); +} + +static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +{ + FUNC(add_residual_dc)(_dst, stride, dc, 8); +} + +static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +{ + FUNC(add_residual_dc)(_dst, stride, dc, 16); +} + +static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +{ + FUNC(add_residual_dc)(_dst, stride, dc, 32); +} 
+ +// -- U -- (plaited) + +static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_u) +{ + FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); +} + +static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_u) +{ + FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); +} + +static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_u) +{ + FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); +} + +static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_u) +{ + // Should never occur for 420, which is all that sand supports + av_assert0(0); +} + +// -- V -- (plaited) + +static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_v) +{ + FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); +} + +static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_v) +{ + FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); +} + +static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_v) +{ + FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); +} + +static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride, int dc_v) +{ + // Should never occur for 420, which is all that sand supports + av_assert0(0); +} + +// -- C -- (plaited - both U & V) + +static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ + FUNC(add_residual_c)(_dst, res, stride, 4); +} + +static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ + FUNC(add_residual_c)(_dst, res, stride, 8); +} + +static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ + FUNC(add_residual_c)(_dst, res, stride, 16); +} + +static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) 
+{ + // Should never occur for 420, which is all that sand supports + av_assert0(0); +} + +static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +{ + FUNC(add_residual_dc_c)(_dst, stride, dc, 4); +} + +static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +{ + FUNC(add_residual_dc_c)(_dst, stride, dc, 8); +} + +static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +{ + FUNC(add_residual_dc_c)(_dst, stride, dc, 16); +} + +static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +{ + // Should never occur for 420, which is all that sand supports + av_assert0(0); +} + + +static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) +{ + int16_t *coeffs = (int16_t *) _coeffs; + int x, y; + int size = 1 << log2_size; + + if (mode) { + coeffs += size; + for (y = 0; y < size - 1; y++) { + for (x = 0; x < size; x++) + coeffs[x] += coeffs[x - size]; + coeffs += size; + } + } else { + for (y = 0; y < size; y++) { + for (x = 1; x < size; x++) + coeffs[x] += coeffs[x - 1]; + coeffs += size; + } + } +} + +static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) +{ + int shift = 15 - BIT_DEPTH - log2_size; + int x, y; + int size = 1 << log2_size; + + if (shift > 0) { + int offset = 1 << (shift - 1); + for (y = 0; y < size; y++) { + for (x = 0; x < size; x++) { + *coeffs = (*coeffs + offset) >> shift; + coeffs++; + } + } + } else { + for (y = 0; y < size; y++) { + for (x = 0; x < size; x++) { + *coeffs = *coeffs << -shift; + coeffs++; + } + } + } +} + +#define SET(dst, x) (dst) = (x) +#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) + +#define TR_4x4_LUMA(dst, src, step, assign) \ + do { \ + int c0 = src[0 * step] + src[2 * step]; \ + int c1 = src[2 * step] + src[3 * step]; \ + int c2 = src[0 * step] - src[3 * step]; \ + int c3 = 74 * src[1 * step]; \ + \ + assign(dst[2 * step], 74 * (src[0 * step] - \ + src[2 * step] + 
\ + src[3 * step])); \ + assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ + assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ + assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ + } while (0) + +static void FUNC(transform_4x4_luma)(int16_t *coeffs) +{ + int i; + int shift = 7; + int add = 1 << (shift - 1); + int16_t *src = coeffs; + + for (i = 0; i < 4; i++) { + TR_4x4_LUMA(src, src, 4, SCALE); + src++; + } + + shift = 20 - BIT_DEPTH; + add = 1 << (shift - 1); + for (i = 0; i < 4; i++) { + TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); + coeffs += 4; + } +} + +#undef TR_4x4_LUMA + +#define TR_4(dst, src, dstep, sstep, assign, end) \ + do { \ + const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ + const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ + const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ + const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ + \ + assign(dst[0 * dstep], e0 + o0); \ + assign(dst[1 * dstep], e1 + o1); \ + assign(dst[2 * dstep], e1 - o1); \ + assign(dst[3 * dstep], e0 - o0); \ + } while (0) + +#define TR_8(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int e_8[4]; \ + int o_8[4] = { 0 }; \ + for (i = 0; i < 4; i++) \ + for (j = 1; j < end; j += 2) \ + o_8[i] += transform[4 * j][i] * src[j * sstep]; \ + TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ + \ + for (i = 0; i < 4; i++) { \ + assign(dst[i * dstep], e_8[i] + o_8[i]); \ + assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ + } \ + } while (0) + +#define TR_16(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int e_16[8]; \ + int o_16[8] = { 0 }; \ + for (i = 0; i < 8; i++) \ + for (j = 1; j < end; j += 2) \ + o_16[i] += transform[2 * j][i] * src[j * sstep]; \ + TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ + \ + for (i = 0; i < 8; i++) { \ + assign(dst[i * dstep], e_16[i] + o_16[i]); \ + assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ + } \ + } while (0) + +#define TR_32(dst, src, dstep, sstep, assign, end) \ + do { \ + int i, j; \ + int 
e_32[16]; \ + int o_32[16] = { 0 }; \ + for (i = 0; i < 16; i++) \ + for (j = 1; j < end; j += 2) \ + o_32[i] += transform[j][i] * src[j * sstep]; \ + TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ + \ + for (i = 0; i < 16; i++) { \ + assign(dst[i * dstep], e_32[i] + o_32[i]); \ + assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ + } \ + } while (0) + +#define IDCT_VAR4(H) \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR8(H) \ + int limit = FFMIN(col_limit, H); \ + int limit2 = FFMIN(col_limit + 4, H) +#define IDCT_VAR16(H) IDCT_VAR8(H) +#define IDCT_VAR32(H) IDCT_VAR8(H) + +#define IDCT(H) \ +static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ + int col_limit) \ +{ \ + int i; \ + int shift = 7; \ + int add = 1 << (shift - 1); \ + int16_t *src = coeffs; \ + IDCT_VAR ## H(H); \ + \ + for (i = 0; i < H; i++) { \ + TR_ ## H(src, src, H, H, SCALE, limit2); \ + if (limit2 < H && i%4 == 0 && !!i) \ + limit2 -= 4; \ + src++; \ + } \ + \ + shift = 20 - BIT_DEPTH; \ + add = 1 << (shift - 1); \ + for (i = 0; i < H; i++) { \ + TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ + coeffs += H; \ + } \ +} + +#define IDCT_DC(H) \ +static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ +{ \ + int i, j; \ + int shift = 14 - BIT_DEPTH; \ + int add = 1 << (shift - 1); \ + int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ + \ + for (j = 0; j < H; j++) { \ + for (i = 0; i < H; i++) { \ + coeffs[i + j * H] = coeff; \ + } \ + } \ +} + +IDCT( 4) +IDCT( 8) +IDCT(16) +IDCT(32) + +IDCT_DC( 4) +IDCT_DC( 8) +IDCT_DC(16) +IDCT_DC(32) + +#undef TR_4 +#undef TR_8 +#undef TR_16 +#undef TR_32 + +#undef SET +#undef SCALE + +static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, + int width, int height) +{ + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int offset_table[32] = { 0 }; + int k, y, x; + int shift = BIT_DEPTH - 5; + + stride_dst /= 
sizeof(pixel); + stride_src /= sizeof(pixel); + + for (k = 0; k < 4; k++) + offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); + dst += stride_dst; + src += stride_src; + } +} + +#define CMP(a, b) (((a) > (b)) - ((a) < (b))) + +static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, + int eo, int width, int height) { + + static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; + static const int8_t pos[4][2][2] = { + { { -1, 0 }, { 1, 0 } }, // horizontal + { { 0, -1 }, { 0, 1 } }, // vertical + { { -1, -1 }, { 1, 1 } }, // 45 degree + { { 1, -1 }, { -1, 1 } }, // 135 degree + }; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int a_stride, b_stride; + int x, y; + const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); + stride_dst /= sizeof(pixel); + + a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + int diff0 = CMP(src[x], src[x + a_stride]); + int diff1 = CMP(src[x], src[x + b_stride]); + int offset_val = edge_idx[2 + diff0 + diff1]; + dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); + } + src += stride_src; + dst += stride_dst; + } +} + + +#if BIT_DEPTH == 10 +// We need a 32 bit variation for the _c restores so hijack bit depth 10 +#undef pixel +#undef BIT_DEPTH +#define pixel uint32_t +#define BIT_DEPTH 32 +// All 16 bit variations are the same +#define sao_edge_restore_0_10 sao_edge_restore_0_9 +#define sao_edge_restore_1_10 sao_edge_restore_1_9 +#define sao_edge_restore_0_11 sao_edge_restore_0_9 +#define sao_edge_restore_1_11 sao_edge_restore_1_9 +#define sao_edge_restore_0_12 sao_edge_restore_0_9 +#define sao_edge_restore_1_12 sao_edge_restore_1_9 +#define sao_edge_restore_0_13 sao_edge_restore_0_9 
+#define sao_edge_restore_1_13 sao_edge_restore_1_9 +#define sao_edge_restore_0_14 sao_edge_restore_0_9 +#define sao_edge_restore_1_14 sao_edge_restore_1_9 +#define sao_edge_restore_0_15 sao_edge_restore_0_9 +#define sao_edge_restore_1_15 sao_edge_restore_1_9 +#define sao_edge_restore_0_16 sao_edge_restore_0_9 +#define sao_edge_restore_1_16 sao_edge_restore_1_9 +#endif +#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 +static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, + int *borders, int _width, int _height, + int c_idx, uint8_t *vert_edge, + uint8_t *horiz_edge, uint8_t *diag_edge) +{ + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, width = _width, height = _height; + + stride_dst /= sizeof(pixel); + stride_src /= sizeof(pixel); + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { + for (y = 0; y < height; y++) { + dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { + int offset = width - 1; + for (x = 0; x < height; x++) { + dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { + for (x = init_x; x < width; x++) + dst[x] = src[x]; + } + if (borders[3]) { + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) + dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +} + +static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, + int *borders, int _width, int _height, + int c_idx, uint8_t *vert_edge, + uint8_t *horiz_edge, uint8_t *diag_edge) +{ + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, init_y = 0, width = _width, height = 
_height; + + stride_dst /= sizeof(pixel); + stride_src /= sizeof(pixel); + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { + for (y = 0; y < height; y++) { + dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { + int offset = width - 1; + for (x = 0; x < height; x++) { + dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { + for (x = init_x; x < width; x++) + dst[x] = src[x]; + init_y = 1; + } + if (borders[3]) { + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) + dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } + + { + int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; + int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; + int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; + int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; + + // Restore pixels that can't be modified + if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { + for(y = init_y+save_upper_left; y< height-save_lower_left; y++) + dst[y*stride_dst] = src[y*stride_src]; + } + if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { + for(y = init_y+save_upper_right; y< height-save_lower_right; y++) + dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; + } + + if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { + for(x = init_x+save_upper_left; x < width-save_upper_right; x++) + dst[x] = src[x]; + } + if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { + for(x = init_x+save_lower_left; x < width-save_lower_right; x++) + dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; + } + if(diag_edge[0] && sao_eo_class == SAO_EO_135D) + dst[0] = src[0]; + if(diag_edge[1] && sao_eo_class == 
SAO_EO_45D) + dst[width-1] = src[width-1]; + if(diag_edge[2] && sao_eo_class == SAO_EO_135D) + dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; + if(diag_edge[3] && sao_eo_class == SAO_EO_45D) + dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; + + } +} +#endif +#if BIT_DEPTH == 32 +#undef BIT_DEPTH +#undef pixel +#define BIT_DEPTH 10 +#define pixel uint16_t +#endif + +// --- Plaited chroma versions + +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int offset_table_u[32] = { 0 }; + int offset_table_v[32] = { 0 }; + int k, y, x; + int shift = BIT_DEPTH - 5; + + stride_dst /= sizeof(pixel); + stride_src /= sizeof(pixel); + width *= 2; + + for (k = 0; k < 4; k++) + { + offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; + offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; + } + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { +// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); +// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); + // *** & 31 shouldn't be wanted but just now we generate broken input that + // crashes us in 10-bit world + dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); + dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, + int eo, int width, int height) { + + static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; + static const int8_t pos[4][2][2] = { + { { -1, 0 }, { 1, 0 } }, // horizontal + { 
{ 0, -1 }, { 0, 1 } }, // vertical + { { -1, -1 }, { 1, 1 } }, // 45 degree + { { 1, -1 }, { -1, 1 } }, // 135 degree + }; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; + int a_stride, b_stride; + int x, y; + const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); + + stride_dst /= sizeof(pixel); + width *= 2; + + av_assert0(width <= 64); + + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + int diff0u = CMP(src[x], src[x + a_stride]); + int diff1u = CMP(src[x], src[x + b_stride]); + int offset_valu = edge_idx[2 + diff0u + diff1u]; + int diff0v = CMP(src[x+1], src[x+1 + a_stride]); + int diff1v = CMP(src[x+1], src[x+1 + b_stride]); + int offset_valv = edge_idx[2 + diff0v + diff1v]; + dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); + dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); + } + src += stride_src; + dst += stride_dst; + } +} + +// Do once +#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these +#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 +#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 +// We need 32 bit for 9 bit+ +#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 +#define 
sao_edge_restore_c_1_15 sao_edge_restore_1_32 +#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 +#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + +#undef CMP + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +static void FUNC(put_hevc_pel_pixels)(int16_t *dst, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = src[x] << (14 - BIT_DEPTH); + src += srcstride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + for (y = 0; y < height; y++) { + memcpy(dst, src, width * sizeof(pixel)); + src += srcstride; + dst += dststride; + } +} + +static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t 
_dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); + src += srcstride; + dst += dststride; + } +} + +static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); + } + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define QPEL_FILTER(src, stride) \ + (filter[0] * src[x - 3 * stride] + \ + filter[1] * src[x - 2 * stride] + \ + filter[2] * src[x - stride] + \ + filter[3] * src[x ] + \ + filter[4] * src[x + stride] + \ + filter[5] * src[x + 2 * stride] + \ + filter[6] * src[x + 3 * stride] + \ + filter[7] * src[x + 4 * stride]) + +static 
void FUNC(put_hevc_qpel_h)(int16_t *dst, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_v)(int16_t *dst, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); + src += srcstride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_hv)(int16_t *dst, + uint8_t *_src, + ptrdiff_t _srcstride, + int height, intptr_t mx, + intptr_t my, int width) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + + src -= QPEL_EXTRA_BEFORE * srcstride; + filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height + QPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_qpel_filters[my - 1]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; + tmp += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = 
(pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += srcstride; + dst += dststride; + } +} + +static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += 
srcstride; + dst += dststride; + } +} + + +static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 - BIT_DEPTH; + +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= QPEL_EXTRA_BEFORE * srcstride; + filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height + QPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_qpel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dststride; + } +} + +static void 
FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= QPEL_EXTRA_BEFORE * srcstride; + filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height + QPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_qpel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + src += srcstride; + dst += dststride; + } +} + +static void 
FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + src += srcstride; + dst += dststride; + } +} + +static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = 
_srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + + const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, + intptr_t mx, intptr_t my, int width) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= QPEL_EXTRA_BEFORE * srcstride; + filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height + QPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_qpel_filters[my - 1]; + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); + tmp += MAX_PB_SIZE; + dst += dststride; + } +} + +static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, 
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + const int8_t *filter; + pixel *src = (pixel*)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + src -= QPEL_EXTRA_BEFORE * srcstride; + filter = ff_hevc_rpi_qpel_filters[mx - 1]; + for (y = 0; y < height + QPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_qpel_filters[my - 1]; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); + tmp += MAX_PB_SIZE; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +//////////////////////////////////////////////////////////////////////////////// +#define EPEL_FILTER(src, stride) \ + (filter[0] * src[x - stride] + \ + filter[1] * src[x] + \ + filter[2] * src[x + stride] + \ + filter[3] * src[x + 2 * stride]) + +static void FUNC(put_hevc_epel_h)(int16_t *dst, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_v)(int16_t *dst, + uint8_t *_src, ptrdiff_t 
_srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); + src += srcstride; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_hv)(int16_t *dst, + uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + + src -= EPEL_EXTRA_BEFORE * srcstride; + + for (y = 0; y < height + EPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_epel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; + tmp += MAX_PB_SIZE; + dst += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += srcstride; + dst += dststride; + } +} + +static void 
FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); + } + dst += dststride; + src += srcstride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); + src += srcstride; + dst += dststride; + } +} + +static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); 
+#else + int offset = 0; +#endif + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); + dst += dststride; + src += srcstride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= EPEL_EXTRA_BEFORE * srcstride; + + for (y = 0; y < height + EPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_epel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dststride; + } +} + +static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 + 1 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); 
+#else + int offset = 0; +#endif + + src -= EPEL_EXTRA_BEFORE * srcstride; + + for (y = 0; y < height + EPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_epel_filters[my - 1]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); + tmp += MAX_PB_SIZE; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + } + dst += dststride; + src += srcstride; + } +} + +static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << 
(BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); + src += srcstride; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); + } + dst += dststride; + src += srcstride; + } +} + +static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); + src += srcstride; + dst += dststride; + src2 += 
MAX_PB_SIZE; + } +} + +static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = denom + 14 - BIT_DEPTH; +#if BIT_DEPTH < 14 + int offset = 1 << (shift - 1); +#else + int offset = 0; +#endif + + src -= EPEL_EXTRA_BEFORE * srcstride; + + for (y = 0; y < height + EPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_epel_filters[my - 1]; + + ox = ox * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); + tmp += MAX_PB_SIZE; + dst += dststride; + } +} + +static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, + int16_t *src2, + int height, int denom, int wx0, int wx1, + int ox0, int ox1, intptr_t mx, intptr_t my, int width) +{ + int x, y; + pixel *src = (pixel *)_src; + ptrdiff_t srcstride = _srcstride / sizeof(pixel); + pixel *dst = (pixel *)_dst; + ptrdiff_t dststride = _dststride / sizeof(pixel); + const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; + int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; + int16_t *tmp = tmp_array; + int shift = 14 + 1 - BIT_DEPTH; + int log2Wd = denom + shift - 1; + + src -= EPEL_EXTRA_BEFORE * srcstride; + + for (y = 0; y < height + EPEL_EXTRA; y++) { + for (x = 0; x < width; x++) + tmp[x] = 
EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; + filter = ff_hevc_rpi_epel_filters[my - 1]; + + ox0 = ox0 * (1 << (BIT_DEPTH - 8)); + ox1 = ox1 * (1 << (BIT_DEPTH - 8)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); + tmp += MAX_PB_SIZE; + dst += dststride; + src2 += MAX_PB_SIZE; + } +} + +// line zero +#define P3 pix[-4 * xstride] +#define P2 pix[-3 * xstride] +#define P1 pix[-2 * xstride] +#define P0 pix[-1 * xstride] +#define Q0 pix[0 * xstride] +#define Q1 pix[1 * xstride] +#define Q2 pix[2 * xstride] +#define Q3 pix[3 * xstride] + +// line three. used only for deblocking decision +#define TP3 pix[-4 * xstride + 3 * ystride] +#define TP2 pix[-3 * xstride + 3 * ystride] +#define TP1 pix[-2 * xstride + 3 * ystride] +#define TP0 pix[-1 * xstride + 3 * ystride] +#define TQ0 pix[0 * xstride + 3 * ystride] +#define TQ1 pix[1 * xstride + 3 * ystride] +#define TQ2 pix[2 * xstride + 3 * ystride] +#define TQ3 pix[3 * xstride + 3 * ystride] + +static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, + ptrdiff_t _xstride, ptrdiff_t _ystride, + int beta, int *_tc, + uint8_t *_no_p, uint8_t *_no_q) +{ + int d, j; + pixel *pix = (pixel *)_pix; + ptrdiff_t xstride = _xstride / sizeof(pixel); + ptrdiff_t ystride = _ystride / sizeof(pixel); + + beta <<= BIT_DEPTH - 8; + + for (j = 0; j < 2; j++) { + const int dp0 = abs(P2 - 2 * P1 + P0); + const int dq0 = abs(Q2 - 2 * Q1 + Q0); + const int dp3 = abs(TP2 - 2 * TP1 + TP0); + const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); + const int d0 = dp0 + dq0; + const int d3 = dp3 + dq3; + const int tc = _tc[j] << (BIT_DEPTH - 8); + const int no_p = _no_p[j]; + const int no_q = _no_q[j]; + + if (d0 + d3 >= beta) { + pix += 4 * ystride; + continue; + } else { + const int beta_3 = beta >> 3; + const int 
beta_2 = beta >> 2; + const int tc25 = ((tc * 5 + 1) >> 1); + + if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && + abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && + (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { + // strong filtering + const int tc2 = tc << 1; + for (d = 0; d < 4; d++) { + const int p3 = P3; + const int p2 = P2; + const int p1 = P1; + const int p0 = P0; + const int q0 = Q0; + const int q1 = Q1; + const int q2 = Q2; + const int q3 = Q3; + if (!no_p) { + P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); + P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); + P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); + } + if (!no_q) { + Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); + Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); + Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); + } + pix += ystride; + } + } else { // normal filtering + int nd_p = 1; + int nd_q = 1; + const int tc_2 = tc >> 1; + if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) + nd_p = 2; + if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) + nd_q = 2; + + for (d = 0; d < 4; d++) { + const int p2 = P2; + const int p1 = P1; + const int p0 = P0; + const int q0 = Q0; + const int q1 = Q1; + const int q2 = Q2; + int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; + if (abs(delta0) < 10 * tc) { + delta0 = av_clip(delta0, -tc, tc); + if (!no_p) + P0 = av_clip_pixel(p0 + delta0); + if (!no_q) + Q0 = av_clip_pixel(q0 - delta0); + if (!no_p && nd_p > 1) { + const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); + P1 = av_clip_pixel(p1 + deltap1); + } + if (!no_q && nd_q > 1) { + const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); + Q1 = av_clip_pixel(q1 + deltaq1); + } + } + pix += ystride; + } + } + } + } +} + +static void 
FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, + ptrdiff_t _ystride, int *_tc, + uint8_t *_no_p, uint8_t *_no_q) +{ + int d, j, no_p, no_q; + pixel *pix = (pixel *)_pix; + ptrdiff_t xstride = _xstride / sizeof(pixel); + ptrdiff_t ystride = _ystride / sizeof(pixel); + + for (j = 0; j < 2; j++) { + const int tc = _tc[j] << (BIT_DEPTH - 8); + if (tc <= 0) { + pix += 4 * ystride; + continue; + } + no_p = _no_p[j]; + no_q = _no_q[j]; + + for (d = 0; d < 4; d++) { + int delta0; + const int p1 = P1; + const int p0 = P0; + const int q0 = Q0; + const int q1 = Q1; + delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); + if (!no_p) + P0 = av_clip_pixel(p0 + delta0); + if (!no_q) + Q0 = av_clip_pixel(q0 - delta0); + pix += ystride; + } + } +} + +static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); +} + +static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, + int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); +} + +static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + int beta, int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), + beta, tc, no_p, no_q); +} + +static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + int beta, int32_t *tc, uint8_t *no_p, + uint8_t *no_q) +{ + FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, + beta, tc, no_p, no_q); +} + +#undef P3 +#undef P2 +#undef P1 +#undef P0 +#undef Q0 +#undef Q1 +#undef Q2 +#undef Q3 + +#undef TP3 +#undef TP2 +#undef TP1 +#undef TP0 +#undef TQ0 +#undef TQ1 +#undef TQ2 +#undef TQ3 + +// line zero +#define P3 pix_l[0 * xstride] +#define P2 pix_l[1 * xstride] +#define P1 pix_l[2 * xstride] +#define P0 pix_l[3 * xstride] +#define Q0 pix_r[0 * xstride] 
+#define Q1 pix_r[1 * xstride]
+#define Q2 pix_r[2 * xstride]
+#define Q3 pix_r[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 pix_l[0 * xstride + 3 * ystride]
+#define TP2 pix_l[1 * xstride + 3 * ystride]
+#define TP1 pix_l[2 * xstride + 3 * ystride]
+#define TP0 pix_l[3 * xstride + 3 * ystride]
+#define TQ0 pix_r[0 * xstride + 3 * ystride]
+#define TQ1 pix_r[1 * xstride + 3 * ystride]
+#define TQ2 pix_r[2 * xstride + 3 * ystride]
+#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
+// Same filtering decisions and arithmetic as hevc_loop_filter_luma, except
+// that the P and Q samples are reached through separate pointers:
+// P3..P0 are pix_l[0..3] and Q0..Q3 are pix_r[0..3] (xstride is fixed at 1).
+//
+// _stride is the distance between lines; it is divided by sizeof(pixel).
+// tc2 packs the clip threshold of segment 0 in bits 0-15 and of segment 1
+// in bits 16-31.  no_f bit 0 disables writes to the P side and bit 1
+// disables writes to the Q side; the same flags apply to both segments.
+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
+                                 uint8_t * _pix_l)
+{
+    int d, j;
+    pixel *pix_l = (pixel *)_pix_l;
+    pixel *pix_r = (pixel *)_pix_r;
+    const ptrdiff_t xstride = 1;
+    const ptrdiff_t ystride = _stride / sizeof(pixel);
+
+    beta <<= BIT_DEPTH - 8;
+
+    for (j = 0; j < 2; j++) {
+        const int dp0 = abs(P2 - 2 * P1 + P0);
+        const int dq0 = abs(Q2 - 2 * Q1 + Q0);
+        const int dp3 = abs(TP2 - 2 * TP1 + TP0);
+        const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
+        const int d0 = dp0 + dq0;
+        const int d3 = dp3 + dq3;
+        // Unpack this segment's 16-bit tc from the tc2 parameter.
+        const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8);
+        const int no_p = no_f & 1;
+        const int no_q = no_f & 2;
+
+        if (d0 + d3 >= beta) {
+            // Segment left unfiltered: step both pointers over its four lines.
+            pix_l += 4 * ystride;
+            pix_r += 4 * ystride;
+            continue;
+        } else {
+            const int beta_3 = beta >> 3;
+            const int beta_2 = beta >> 2;
+            const int tc25 = ((tc * 5 + 1) >> 1);
+
+            if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+                (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
+                // strong filtering
+                // NOTE: this local tc2 (twice the segment tc) shadows the
+                // packed tc2 parameter for the rest of this block.
+                const int tc2 = tc << 1;
+                for (d = 0; d < 4; d++) {
+                    // Snapshot all samples first: the macros below write
+                    // through pix_l / pix_r.
+                    const int p3 = P3;
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    const int q3 = Q3;
+                    if (!no_p) {
+                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+                    }
+                    if (!no_q) {
+                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+                    }
+                    pix_l += ystride;
+                    pix_r += ystride;
+                }
+            } else { // normal filtering
+                // nd_p / nd_q == 2 enables the additional P1 / Q1 correction.
+                int nd_p = 1;
+                int nd_q = 1;
+                const int tc_2 = tc >> 1;
+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+                    nd_p = 2;
+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+                    nd_q = 2;
+
+                for (d = 0; d < 4; d++) {
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+                    if (abs(delta0) < 10 * tc) {
+                        delta0 = av_clip(delta0, -tc, tc);
+                        if (!no_p)
+                            P0 = av_clip_pixel(p0 + delta0);
+                        if (!no_q)
+                            Q0 = av_clip_pixel(q0 - delta0);
+                        if (!no_p && nd_p > 1) {
+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+                            P1 = av_clip_pixel(p1 + deltap1);
+                        }
+                        if (!no_q && nd_q > 1) {
+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+                            Q1 = av_clip_pixel(q1 + deltaq1);
+                        }
+                    }
+                    pix_l += ystride;
+                    pix_r += ystride;
+                }
+            }
+        }
+    }
+}
+
+// Packed-argument front end for hevc_h_loop_filter_luma: tc2 carries two
+// 16-bit tc values (low half = segment 0, high half = segment 1) and no_f
+// carries the no_p (bit 0) / no_q (bit 1) flags, applied to both segments.
+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
+{
+    // Just call the non-2 function having massaged the parameters
+    int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16};
+    uint8_t no_p[2] = {no_f & 1, no_f & 1};
+    uint8_t no_q[2] = {no_f & 2, no_f & 2};
+    FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
+}
+
+#undef TP3
+#undef TP2
+#undef TP1
+#undef TP0
+#undef TQ0
+#undef TQ1
+#undef TQ2 +#undef TQ3 + +#undef P3 +#undef P2 +#undef P1 +#undef P0 +#undef Q0 +#undef Q1 +#undef Q2 +#undef Q3 + +#define P1 pix_l[0 * xstride] +#define P0 pix_l[1 * xstride] +#define Q0 pix_r[0 * xstride] +#define Q1 pix_r[1 * xstride] + +static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, + ptrdiff_t _ystride, const int32_t *_tc, + const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) +{ + int d, j, no_p, no_q; + pixel *pix_l = (pixel *)_pix_l; + pixel *pix_r = (pixel *)_pix_r; + ptrdiff_t xstride = _xstride / sizeof(pixel); + ptrdiff_t ystride = _ystride / sizeof(pixel); + + for (j = 0; j < 2; j++) { + const int tc = _tc[j] << (BIT_DEPTH - 8); + if (tc <= 0) { + pix_l += 4 * ystride; + pix_r += 4 * ystride; + continue; + } + no_p = _no_p[j]; + no_q = _no_q[j]; + + for (d = 0; d < 4; d++) { + int delta0; + const int p1 = P1; + const int p0 = P0; + const int q0 = Q0; + const int q1 = Q1; + delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); + if (!no_p) + P0 = av_clip_pixel(p0 + delta0); + if (!no_q) + Q0 = av_clip_pixel(q0 - delta0); + pix_l += ystride; + pix_r += ystride; + } + } +} + +static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, + unsigned int no_f) +{ + uint8_t no_p[2] = {no_f & 1, no_f & 2}; + uint8_t no_q[2] = {no_f & 4, no_f & 8}; + int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; + FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); + FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); +} + +static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f) +{ + uint8_t no_p[2] = {no_f & 1, no_f & 2}; + uint8_t no_q[2] = {no_f & 4, no_f & 8}; + int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; + FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, 
no_q, src_r); + FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); +} + +#undef P1 +#undef P0 +#undef Q0 +#undef Q1 + diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c new file mode 100644 index 0000000000..0aa8809a4b --- /dev/null +++ b/libavcodec/rpi_hevcpred.c @@ -0,0 +1,161 @@ +/* + * HEVC video Decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rpi_hevcdec.h"
+
+#include "rpi_hevcpred.h"
+#if (ARCH_ARM)
+#include "arm/rpi_hevcpred_arm.h"
+#endif
+
+/*
+ * The prediction template is included once per (PRED_C, BIT_DEPTH) pair:
+ * PRED_C 0 and PRED_C 1, each at bit depths 8, 9, 10 and 12.
+ */
+#define PRED_C 0
+#define BIT_DEPTH 8
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+
+#define PRED_C 1
+#define BIT_DEPTH 8
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 9
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 10
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+
+#define BIT_DEPTH 12
+#include "rpi_hevcpred_template.c"
+#undef BIT_DEPTH
+#undef PRED_C
+
+/*
+ * Fill the function-pointer table in hpc for the given bit depth.
+ *
+ * bit_depth values 9, 10 and 12 select the matching template instances;
+ * every other value falls through to the 8-bit instances.  Afterwards the
+ * architecture-specific initialiser (ARM or MIPS, if compiled in) is
+ * called with the same arguments.
+ *
+ * Note that the pred_vertical and pred_horizontal slots are bound to the
+ * same pred_angular_N functions as the pred_angular slots.
+ */
+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
+{
+/* FUNC(a, depth) names the instance a_<depth>; FUNCC(a, depth) names a_<depth>_c. */
+#undef FUNC
+#define FUNC(a, depth) a ## _ ## depth
+
+#undef FUNCC
+#define FUNCC(a, depth) a ## _ ## depth ## _c
+
+/* Assigns the table members without a _c suffix, using FUNC names. */
+#define HEVC_PRED_Y(depth)                                \
+    hpc->intra_pred      = FUNC(intra_pred, depth);       \
+    hpc->intra_filter[0] = FUNC(intra_filter_2, depth);   \
+    hpc->intra_filter[1] = FUNC(intra_filter_3, depth);   \
+    hpc->intra_filter[2] = FUNC(intra_filter_4, depth);   \
+    hpc->intra_filter[3] = FUNC(intra_filter_5, depth);   \
+    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);    \
+    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);    \
+    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);    \
+    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);    \
+    hpc->pred_dc[0]      = FUNC(pred_dc_0, depth);        \
+    hpc->pred_dc[1]      = FUNC(pred_dc_1, depth);        \
+    hpc->pred_dc[2]      = FUNC(pred_dc_2, depth);        \
+    hpc->pred_dc[3]      = FUNC(pred_dc_3, depth);        \
+    hpc->pred_vertical[0] = FUNC(pred_angular_0, depth);  \
+    hpc->pred_vertical[1] = FUNC(pred_angular_1, depth);  \
+    hpc->pred_vertical[2] = FUNC(pred_angular_2, depth);  \
+    hpc->pred_vertical[3] = FUNC(pred_angular_3, depth);  \
+    hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
+    hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
+    hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
+    hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth);   \
+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth);   \
+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth);   \
+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth);   \
+    hpc->pred_dc0[0]     = FUNC(pred_dc0_0, depth);       \
+    hpc->pred_dc0[1]     = FUNC(pred_dc0_1, depth);       \
+    hpc->pred_dc0[2]     = FUNC(pred_dc0_2, depth);       \
+    hpc->pred_dc0[3]     = FUNC(pred_dc0_3, depth);
+
+/* Assigns the table members with a _c suffix, using FUNCC names. */
+#define HEVC_PRED_C(depth)                                \
+    hpc->intra_pred_c      = FUNCC(intra_pred, depth);       \
+    hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth);   \
+    hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth);   \
+    hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth);   \
+    hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth);   \
+    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);    \
+    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);    \
+    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);    \
+    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);    \
+    hpc->pred_dc_c[0]      = FUNCC(pred_dc_0, depth);        \
+    hpc->pred_dc_c[1]      = FUNCC(pred_dc_1, depth);        \
+    hpc->pred_dc_c[2]      = FUNCC(pred_dc_2, depth);        \
+    hpc->pred_dc_c[3]      = FUNCC(pred_dc_3, depth);        \
+    hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth);  \
+    hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth);  \
+    hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth);  \
+    hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth);  \
+    hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
+    hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
+    hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
+    hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
+    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth);   \
+    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth);   \
+    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth);   \
+    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);   \
+    hpc->pred_dc0_c[0]     = FUNCC(pred_dc0_0, depth);       \
+    hpc->pred_dc0_c[1]     = FUNCC(pred_dc0_1, depth);       \
+    hpc->pred_dc0_c[2]     = FUNCC(pred_dc0_2, depth);       \
+    hpc->pred_dc0_c[3]     = FUNCC(pred_dc0_3, depth);
+
+/* Both tables for one bit depth. */
+#define HEVC_PRED(depth)                                \
+    HEVC_PRED_Y(depth);                                 \
+    HEVC_PRED_C(depth);
+
+    switch (bit_depth) {
+    case 9:
+        HEVC_PRED(9);
+        break;
+    case 10:
+        HEVC_PRED(10);
+        break;
+    case 12:
+        HEVC_PRED(12);
+        break;
+    default:
+        /* Any depth without a dedicated case uses the 8-bit functions. */
+        HEVC_PRED(8);
+        break;
+    }
+
+    /*
+     * Architecture-specific initialisers run last and receive the filled
+     * table.
+     * NOTE(review): no declaration of ff_hevc_rpi_pred_init_mips is visible
+     * in this part of the patch - confirm one exists for MIPS builds.
+     */
+#if (ARCH_ARM)
+    ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
+#elif (ARCH_MIPS)
+    ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
+#endif
+}
diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
new file mode 100644
index 0000000000..9f0edb8798
--- /dev/null
+++ b/libavcodec/rpi_hevcpred.h
@@ -0,0 +1,123 @@
+/*
+ * HEVC video Decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RPI_HEVCPRED_H +#define AVCODEC_RPI_HEVCPRED_H + +#include <stddef.h> /* ptrdiff_t */ +#include <stdint.h> /* uint8_t */ +#include "config.h" + +struct HEVCRpiContext; +struct HEVCRpiLocalContext; + +enum IntraPredMode { + INTRA_PLANAR = 0, + INTRA_DC, + INTRA_ANGULAR_2, + INTRA_ANGULAR_3, + INTRA_ANGULAR_4, + INTRA_ANGULAR_5, + INTRA_ANGULAR_6, + INTRA_ANGULAR_7, + INTRA_ANGULAR_8, + INTRA_ANGULAR_9, + INTRA_ANGULAR_10, + INTRA_ANGULAR_11, + INTRA_ANGULAR_12, + INTRA_ANGULAR_13, + INTRA_ANGULAR_14, + INTRA_ANGULAR_15, + INTRA_ANGULAR_16, + INTRA_ANGULAR_17, + INTRA_ANGULAR_18, + INTRA_ANGULAR_19, + INTRA_ANGULAR_20, + INTRA_ANGULAR_21, + INTRA_ANGULAR_22, + INTRA_ANGULAR_23, + INTRA_ANGULAR_24, + INTRA_ANGULAR_25, + INTRA_ANGULAR_26, + INTRA_ANGULAR_27, + INTRA_ANGULAR_28, + INTRA_ANGULAR_29, + INTRA_ANGULAR_30, + INTRA_ANGULAR_31, + INTRA_ANGULAR_32, + INTRA_ANGULAR_33, + INTRA_ANGULAR_34, +}; +#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 +#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 + /* Shared signature of the per-size neighbour fetch/filter routines; buffers are passed as bytes whatever the pel width */ +typedef void intra_filter_fn_t( + uint8_t * const left, uint8_t * const top, + const unsigned int req, const unsigned int avail, + const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, + const unsigned int stride, + const unsigned int top_right_size, const unsigned int down_left_size); + +typedef struct HEVCRpiPredContext { + void (*intra_pred)(const struct HEVCRpiContext * const s, + const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, + const unsigned int avail, const unsigned int log2_size); + + intra_filter_fn_t *intra_filter[4]; + void (*pred_planar[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, + ptrdiff_t stride);
+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); + + void (*intra_pred_c)(const struct HEVCRpiContext * const s, + const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, + const unsigned int avail, const unsigned int log2_size); + intra_filter_fn_t *intra_filter_c[4]; + void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride); + void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, + ptrdiff_t stride); + void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int mode); + void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); +} HEVCRpiPredContext; + +void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); + +#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 index 0000000000..f2ebcad332 --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c @@ -0,0 +1,1407 @@ +/* + * HEVC video decoder + * + * Copyright (C) 2012 - 2013 Guillaume Martres + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/pixdesc.h" +#include "libavutil/rpi_sand_fns.h" +#include "bit_depth_template.c" + +#include "rpi_hevcdec.h" +#include "rpi_hevcpred.h" + +#define DUMP_PRED 0 + +#define POS(x, y) src[(x) + stride * (y)] + +// INCLUDED_ONCE defined at EOF +#ifndef INCLUDED_ONCE +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; +typedef uint16_t (* c16_dst_ptr_t)[2]; +typedef const uint16_t (* c16_src_ptr_t)[2]; + +// *** On ARM make these NEON registers +typedef struct pixel4_16 { + uint16_t x[4]; +} pixel4_16; +typedef struct pixel4_32 { + uint32_t x[4]; +} pixel4_32; +static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) +{ + pixel4_16 t = {{x, x, x, x}}; + return t; +} +static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) +{ + pixel4_32 t = {{x, x, x, x}}; + return t; +} +#endif + +#if PRED_C +// For chroma we double pixel size so we copy pairs +#undef pixel +#undef pixel2 +#undef pixel4 +#undef dctcoef +#undef INIT_CLIP +#undef no_rnd_avg_pixel4 +#undef rnd_avg_pixel4 +#undef AV_RN2P +#undef AV_RN4P +#undef AV_RN4PA +#undef AV_WN2P +#undef AV_WN4P +#undef AV_WN4PA +#undef CLIP +#undef FUNC +#undef FUNCC +#undef av_clip_pixel +#undef PIXEL_SPLAT_X4 + +#if BIT_DEPTH == 8 +#define pixel uint16_t +#define pixel4 pixel4_16 +#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 +#define cpel uint8_t +#define c_src_ptr_t c8_src_ptr_t +#define c_dst_ptr_t c8_dst_ptr_t +#else +#define pixel uint32_t +#define pixel4 pixel4_32 +#define 
PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 +#define cpel uint16_t +#define c_src_ptr_t c16_src_ptr_t /* const view, mirroring the 8-bit branch; was aliased to the dst typedef */ +#define c_dst_ptr_t c16_dst_ptr_t +#endif +#define AV_RN4P(p) (*(pixel4*)(p)) +#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) +#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +#endif + + +// Get PW prior to horrid PRED_C trickery +#if BIT_DEPTH == 8 +#define PW 1 +#else +#define PW 2 +#endif + + +#if DUMP_PRED && !defined(INCLUDED_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { + for (unsigned int x = 0; x != size; x++) { + printf("%4d", data[x * 2]); + } + printf("\n"); + } + printf("\n"); +} +#endif + +#ifndef INCLUDED_ONCE +static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) // n counts 8-bit pels, rounded down to a multiple of 4; stored via a uint32_t pointer +{ + if ((n >>= 2) != 0) { + uint32_t v4 = v | (v << 8); + uint32_t * p = (uint32_t *)ptr; + v4 = v4 | (v4 << 16); + do { + *p++ = v4; + } while (--n != 0); + } +} + +static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) // n counts 16-bit pels, rounded down to a multiple of 4 +{ + if ((n >>= 2) != 0) { + uint32_t v2 = v | (v << 16); + uint32_t * p = (uint32_t *)ptr; + do { + *p++ = v2; + *p++ = v2; + } while (--n != 0); + } +} + +static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) // n counts 32-bit pels, rounded down to a multiple of 4 +{ + if ((n >>= 2) != 0) { + uint32_t * p = (uint32_t *)ptr; + do { + *p++ = v; + *p++ = v; + *p++ = v; + *p++ = v; + } while (--n != 0); + } +} + +// Beware that this inverts the avail ordering +// For CIP it seems easier this way round +static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, + const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, + unsigned int s0, unsigned int odd_s) +{ + const unsigned int n = 1 << log2_intra_bits; + unsigned int fa = 0; + unsigned int i; + + size >>= 2; // Now in 4-pel units + s0 >>= 2; + + if ((avail & AVAIL_DL) != 0) + fa |= ((1 << s0) - 1) << (size - s0); + if ((avail &
AVAIL_L) != 0) + fa |= ((1 << size) - 1) << size; + if ((avail & AVAIL_UL) != 0) + fa |= 1 << (size << 1); + + if (odd_s) { + if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) + fa &= ~1; + is_intra += i_stride; + } + + for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { + const unsigned int m = ((1 << n) - 1) << i; + if ((fa & m) != 0 && (*is_intra & i_mask) == 0) + fa &= ~m; + } + + return fa; +} + +static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, + const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, + unsigned int s1, unsigned int odd_s) +{ + if ((avail & (AVAIL_U | AVAIL_UR)) == 0) + { + return 0; + } + else + { + const unsigned int n = 1 << log2_intra_bits; + unsigned int fa = 0; + unsigned int i; + unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; + + size >>= 2; // Now in 4-pel units + s1 >>= 2; + + if ((avail & AVAIL_U) != 0) + fa |= ((1 << size) - 1); + if ((avail & AVAIL_UR) != 0) + fa |= ((1 << s1) - 1) << size; + + if (odd_s) { + fa &= im | ~1; + im >>= 1; + } + + for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { + const unsigned int m = ((1 << n) - 1) << i; + if ((im & 1) == 0) + fa &= ~m; + } + return fa; + } +} + + + +static inline unsigned int rmbd(unsigned int x) +{ +#if 1 + return __builtin_ctz(x); +#else + unsigned int n = 0; + if ((x & 0xffff) == 0) { + x >>= 16; + n += 16; + } + if ((x & 0xff) == 0) { + x >>= 8; + n += 8; + } + if ((x & 0xf) == 0) { + x >>= 4; + n += 4; + } + if ((x & 0x3) == 0) { + x >>= 2; + n += 2; + } + + return (x & 1) == 0 ? 
n + 1 : n; +#endif +} +#endif + + +static void FUNC(cip_fill)(pixel * const left, pixel * const top, + const unsigned int avail_l, const unsigned int avail_u, + const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, + const unsigned int stride, + const unsigned int size) +{ + pixel a; + unsigned int i; + + // 1st find DL value + if ((avail_l & 1) == 0) { + if (avail_l != 0) + a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; + else + { + // (avail_l | avail_u) != 0 so this must be good + const unsigned int n = rmbd(avail_u)*4; + a = (n >= size) ? src_ur[n - size] : src_u[n]; + } + } + + // L + { + pixel * d = left + size * 2 - 1; + const pixel * s = src_l + (size * 2 - 1) * stride; + unsigned int x = avail_l; + for (i = 0; i < size * 2; i += 4, x >>= 1) + { + if ((x & 1) != 0) { + // Avail + *d-- = *s; + s -= stride; + *d-- = *s; + s -= stride; + *d-- = *s; + s -= stride; + *d-- = a = *s; + s -= stride; + } + else + { + *d-- = a; + *d-- = a; + *d-- = a; + *d-- = a; + s -= stride * 4; + } + } + // UL + *d = a = (x & 1) != 0 ? 
*s : a; + } + + // U + { + pixel * d = top; + const pixel * s = src_u; + unsigned int x = avail_u; + + for (i = 0; i < size; i += 4, x >>= 1) + { + if ((x & 1) != 0) { + // Avail + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = a = *s++; + } + else + { + *d++ = a; + *d++ = a; + *d++ = a; + *d++ = a; + s += 4; + } + } + + // UR + s = src_ur; + for (i = 0; i < size; i += 4, x >>= 1) + { + if ((x & 1) != 0) { + // Avail + *d++ = *s++; + *d++ = *s++; + *d++ = *s++; + *d++ = a = *s++; + } + else + { + *d++ = a; + *d++ = a; + *d++ = a; + *d++ = a; + s += 4; + } + } + } +} + + +#if !PRED_C && PW == 1 +#define EXTEND(ptr, val, len) extend_8(ptr, val, len) +#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) +#define EXTEND(ptr, val, len) extend_16(ptr, val, len) +#else +#define EXTEND(ptr, val, len) extend_32(ptr, val, len) +#endif + +// Reqs: +// +// Planar: DL[0], L, ul, U, UR[0] +// DC: dl, L, ul, U, ur +// A2-9: DL, L, ul, u, ur +// A10: dl, L, ul, u, ur +// A11-17 dl, L, UL, U, ur +// A18-25 dl, L, Ul, U, ur +// A26 dl, l, ul, U, ur +// A27-34 dl, l, ul, U, UR + +#ifndef INCLUDED_ONCE + +intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; +intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; +intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; + +static const uint8_t req_avail_c[35] = +{ + AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) + AVAIL_L | 0 | AVAIL_U, // DC + AVAIL_DL | AVAIL_L, // 2 + AVAIL_DL | AVAIL_L, // 3 + AVAIL_DL | AVAIL_L, // 4 + AVAIL_DL | AVAIL_L, // 5 + AVAIL_DL | AVAIL_L, // 6 + AVAIL_DL | AVAIL_L, // 7 + AVAIL_DL | AVAIL_L, // 8 + AVAIL_DL | AVAIL_L, // 9 + AVAIL_L, // 10 (H) + AVAIL_L | AVAIL_UL | AVAIL_U, // 11 + AVAIL_L | AVAIL_UL | AVAIL_U, // 12 + AVAIL_L | AVAIL_UL | AVAIL_U, // 13 + AVAIL_L | AVAIL_UL | AVAIL_U, // 14 + AVAIL_L | AVAIL_UL | AVAIL_U, // 15 + AVAIL_L | AVAIL_UL | AVAIL_U, // 16 + AVAIL_L | AVAIL_UL | AVAIL_U, // 17 + AVAIL_L | AVAIL_UL | AVAIL_U, // 18 + AVAIL_L | AVAIL_UL | 
AVAIL_U, // 19 + AVAIL_L | AVAIL_UL | AVAIL_U, // 20 + AVAIL_L | AVAIL_UL | AVAIL_U, // 21 + AVAIL_L | AVAIL_UL | AVAIL_U, // 22 + AVAIL_L | AVAIL_UL | AVAIL_U, // 23 + AVAIL_L | AVAIL_UL | AVAIL_U, // 24 + AVAIL_L | AVAIL_UL | AVAIL_U, // 25 + AVAIL_U, // 26 (V) + AVAIL_U | AVAIL_UR, // 27 + AVAIL_U | AVAIL_UR, // 28 + AVAIL_U | AVAIL_UR, // 29 + AVAIL_U | AVAIL_UR, // 30 + AVAIL_U | AVAIL_UR, // 31 + AVAIL_U | AVAIL_UR, // 32 + AVAIL_U | AVAIL_UR, // 33 + AVAIL_U | AVAIL_UR // 34 +}; + +static const uint8_t req_avail[4][35] = { +{ + AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) + AVAIL_L | 0 | AVAIL_U, // DC + AVAIL_DL | AVAIL_L, // 2 + AVAIL_DL | AVAIL_L, // 3 + AVAIL_DL | AVAIL_L, // 4 + AVAIL_DL | AVAIL_L, // 5 + AVAIL_DL | AVAIL_L, // 6 + AVAIL_DL | AVAIL_L, // 7 + AVAIL_DL | AVAIL_L, // 8 + AVAIL_DL | AVAIL_L, // 9 + AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) + AVAIL_L | AVAIL_UL | AVAIL_U, // 11 + AVAIL_L | AVAIL_UL | AVAIL_U, // 12 + AVAIL_L | AVAIL_UL | AVAIL_U, // 13 + AVAIL_L | AVAIL_UL | AVAIL_U, // 14 + AVAIL_L | AVAIL_UL | AVAIL_U, // 15 + AVAIL_L | AVAIL_UL | AVAIL_U, // 16 + AVAIL_L | AVAIL_UL | AVAIL_U, // 17 + AVAIL_L | AVAIL_UL | AVAIL_U, // 18 + AVAIL_L | AVAIL_UL | AVAIL_U, // 19 + AVAIL_L | AVAIL_UL | AVAIL_U, // 20 + AVAIL_L | AVAIL_UL | AVAIL_U, // 21 + AVAIL_L | AVAIL_UL | AVAIL_U, // 22 + AVAIL_L | AVAIL_UL | AVAIL_U, // 23 + AVAIL_L | AVAIL_UL | AVAIL_U, // 24 + AVAIL_L | AVAIL_UL | AVAIL_U, // 25 + AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) + AVAIL_U | AVAIL_UR, // 27 + AVAIL_U | AVAIL_UR, // 28 + AVAIL_U | AVAIL_UR, // 29 + AVAIL_U | AVAIL_UR, // 30 + AVAIL_U | AVAIL_UR, // 31 + AVAIL_U | AVAIL_UR, // 32 + AVAIL_U | AVAIL_UR, // 33 + AVAIL_U | AVAIL_UR // 34 +}, +{ // 3 + AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) + AVAIL_L | 0 | AVAIL_U, // DC + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 + AVAIL_DL | AVAIL_L | 0, // 3 + AVAIL_DL | AVAIL_L | 0, // 4 + 
AVAIL_DL | AVAIL_L | 0, // 5 + AVAIL_DL | AVAIL_L | 0, // 6 + AVAIL_DL | AVAIL_L | 0, // 7 + AVAIL_DL | AVAIL_L | 0, // 8 + AVAIL_DL | AVAIL_L | 0, // 9 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) + AVAIL_U | AVAIL_UR | 0, // 27 + AVAIL_U | AVAIL_UR | 0, // 28 + AVAIL_U | AVAIL_UR | 0, // 29 + AVAIL_U | AVAIL_UR | 0, // 30 + AVAIL_U | AVAIL_UR | 0, // 31 + AVAIL_U | AVAIL_UR | 0, // 32 + AVAIL_U | AVAIL_UR | 0, // 33 + AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 +}, +{ // 4 + AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) + AVAIL_L | 0 | AVAIL_U, // DC + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 + AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 + AVAIL_DL | AVAIL_L | 0, // 9 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 + AVAIL_L | AVAIL_UL | 
AVAIL_U | FILTER_LIGHT, // 17 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 + AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) + AVAIL_U | AVAIL_UR | 0, // 27 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 + AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 + AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 +}, +{ // 5 + AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) + AVAIL_L | 0 | AVAIL_U, // DC + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 + AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 + AVAIL_L | 0, // 10 (H) + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 + AVAIL_L | 
AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 + AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 + AVAIL_U | 0, // 26 (V) + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 + AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 + AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 +} +}; + + +#endif + +#define filter_light1 FUNC(filter_light1) +static inline pixel filter_light1(pixel a, pixel b, pixel c) +{ + return (a + b*2 + c + 2) >> 2; +} + +#define filter_light FUNC(filter_light) +static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) +{ + pixel p0; + pixel p2 = *src; + // Allow for final pel - it is just clearer to have the call take the actual number of output pels. n must be >= 2: with n == 1 the pre-decremented counter below wraps + unsigned int n_minus_1 = n - 1; + + do + { + src += sstride; + p0 = p1; + p1 = p2; + p2 = *src; + *dst++ = filter_light1(p0, p1, p2); + } while (--n_minus_1 != 0); + *dst = filter_light1(p1, p2, pn); +} + +#define filter_strong FUNC(filter_strong) +static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) +{ + unsigned int a = 64 * p0 + 32; + const int v = p1 - p0; // per-pel step: dst[i] = (64*p0 + 32 + (i + 1)*v) >> 6 for i = 0..n-1 + + do + { + *dst++ = (a += v) >> 6; + } while (--n != 0); +} + +#define intra_filter FUNC(intra_filter) +static av_always_inline void intra_filter( + pixel * const left, pixel * const top, + const unsigned int req, const unsigned int avail, + const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, + const unsigned int stride, + const unsigned int top_right_size, const unsigned int down_left_size, + const unsigned int log2_size) +{ + const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); + const unsigned int size = 1 << log2_size; + + //
a_ is the first pel in a section working round dl -> ur + // b_ is the last + // Beware that top & left work out from UL so usage of a_ & b_ may + // swap between them. It is a bad naming scheme but I have found no + // better + const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; + const pixel * b_dl = src_l + size * stride; + const pixel * a_l = src_l + (size - 1) * stride; + const pixel * b_l = src_l; + const pixel * ab_ul = src_l - stride; + const pixel * a_u = src_u; + const pixel * b_u = src_u + size - 1; + const pixel * a_ur = src_ur; + const pixel * b_ur = src_ur + top_right_size - 1; + + const unsigned int want = req & ~avail; + const unsigned int have = req & avail; + unsigned int i; + + if ((avail & AVAIL_DL) == 0) + { + a_dl = a_ur; + if ((avail & AVAIL_U) != 0) + a_dl = a_u; + if ((avail & AVAIL_UL) != 0) + a_dl = ab_ul; + if ((avail & AVAIL_L) != 0) + a_dl = a_l; + b_dl = a_dl; + } + + if ((avail & AVAIL_L) == 0) + { + a_l = b_dl; + b_l = b_dl; + } + if ((avail & AVAIL_UL) == 0) + { + ab_ul = b_l; + } + if ((avail & AVAIL_U) == 0) + { + a_u = ab_ul; + b_u = ab_ul; + } + if ((avail & AVAIL_UR) == 0) + { + a_ur = b_u; + b_ur = b_u; + } + + if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints + { + if ((req & AVAIL_UL) != 0) + left[-1] = *ab_ul; + + if ((want & AVAIL_L) != 0) + EXTEND(left, *a_l, size); + if ((want & AVAIL_DL) != 0) + EXTEND(left + size, *a_dl, size); + if ((want & AVAIL_U) != 0) + EXTEND(top, *a_u, size); + if ((want & AVAIL_UR) != 0) + EXTEND(top + size, *a_ur, size); + + if ((have & AVAIL_U) != 0) + // Always good - even with sand + memcpy(top, a_u, size * sizeof(pixel)); + if ((have & AVAIL_UR) != 0) + { + memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); + EXTEND(top + size + top_right_size, *b_ur, + size - top_right_size); + } + if ((have & AVAIL_L) != 0) + { + for (i = 0; i < size; i++) + left[i] = b_l[stride * i]; + } + if ((have & AVAIL_DL) != 0) + { + for (i = 
0; i < down_left_size; i++) + left[i + size] = b_dl[stride * i]; + EXTEND(left + size + down_left_size, *a_dl, + size - down_left_size); + } + } + else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint + FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && + FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) + { + if ((req & (AVAIL_U | AVAIL_UR)) != 0) + filter_strong(top, *ab_ul, *b_ur, size * 2); + left[-1] = *ab_ul; + if ((req & (AVAIL_L | AVAIL_DL)) != 0) + filter_strong(left, *ab_ul, *a_dl, size*2); + } + else + { + // Same code for both have & want for UL + if ((req & AVAIL_UL) != 0) + { + left[-1] = filter_light1(*b_l, *ab_ul, *a_u); + } + + if ((want & AVAIL_L) != 0) + { + EXTEND(left, *a_l, size); + left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; + } + if ((want & AVAIL_DL) != 0) + { + // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding + EXTEND(left + size, *a_l, size); + } + if ((want & AVAIL_U) != 0) + { + EXTEND(top, *a_u, size); + top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; + } + if ((want & AVAIL_UR) != 0) + { + // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding + EXTEND(top + size, *a_ur, size); + } + + if ((have & AVAIL_U) != 0) + { + filter_light(top, *ab_ul, a_u, *a_ur, 1, size); + } + if ((have & AVAIL_UR) != 0) { + filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); + top[size*2 - 1] = *b_ur; + EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); + } + if ((have & AVAIL_L) != 0) + { + filter_light(left, *ab_ul, b_l, *b_dl, stride, size); + } + if ((have & AVAIL_DL) != 0) + { + filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); + left[size*2 - 1] = *a_dl; + EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); + } + } +} + +#define INTRA_FILTER(log2_size) \ +static void FUNC(intra_filter_ ## log2_size)( \ + uint8_t * const left, uint8_t * const top, \ + const unsigned int req, const unsigned int 
avail, \ + const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ + const unsigned int stride, \ + const unsigned int top_right_size, const unsigned int down_left_size) \ +{ \ + intra_filter((pixel *)left, (pixel *)top, req, avail, \ + (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ +} + +INTRA_FILTER(2) +INTRA_FILTER(3) +INTRA_FILTER(4) +INTRA_FILTER(5) + +#undef intra_filter +#undef INTRA_FILTER + +static void FUNC(intra_pred)(const HEVCRpiContext * const s, + const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, + const unsigned int log2_size) +{ + // c_idx will always be 1 for _c versions and 0 for y + const unsigned int c_idx = PRED_C; + const unsigned int hshift = ctx_hshift(s, c_idx); + const unsigned int vshift = ctx_vshift(s, c_idx); + const unsigned int size = (1 << log2_size); + const unsigned int x = x0 >> hshift; + const unsigned int y = y0 >> vshift; + + const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); // in units of pixel, not bytes + pixel *const src = c_idx == 0 ?
+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : + (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); + + // Align so we can do multiple loads in the asm + // Padded to 16 byte boundary so as not to confuse anything + DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); + DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); + + pixel * const left = left_array + 16 / sizeof(pixel); + const pixel * top_pred = top; + + const pixel * src_l = src - 1; + const pixel * src_u = src - stride; + const pixel * src_ur = src_u + size; +#if !PRED_C + const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; +#else + const unsigned int req = req_avail_c[mode]; +#endif + + // If we have nothing to pred from then fill with grey + // This isn't a common case but dealing with it here means we don't have to + // test for it later + if (avail == 0) + { +dc_only: +#if !PRED_C + s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); +#else + s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); +#endif + return; + } + + { + // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 + const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) + src_ur += stripe_adj; + } + + // Can deal with I-slices in 'normal' code even if CIP + // This also means that we don't need to generate (elsewhere) is_intra + // for IRAP frames + if (s->ps.pps->constrained_intra_pred_flag == 1 && + s->sh.slice_type != HEVC_SLICE_I) + { + // * If we ever actually care about CIP performance then we should + // special case out size 4 stuff (can be done by 'normal') and + // have 8-pel avail masks + unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), + -(int)(s->ps.sps->pcm_width), + 1 << (((x - 1) >> (3 - hshift)) & 7), + 1 - hshift, + avail, + size, + FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), + vshift != 0 ? 0 : (y >> 2) & 1); + + unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), + (x >> (3 - hshift)) & 7, + 1 - hshift, + avail, + size, + FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), + hshift != 0 ? 0 : (x >> 2) & 1); + + // Anything left? 
+ if ((avail_l | avail_u) == 0) + goto dc_only; + + FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); + +#if !PRED_C + if ((req & FILTER_LIGHT) != 0) + { + const unsigned threshold = 1 << (BIT_DEPTH - 5); + if ((req & FILTER_STRONG) != 0 && + (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && + (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) + { + filter_strong(top, left[-1], top[63], 64); + filter_strong(left, left[-1], left[63], 64); + } else + { + // LHS writes UL too so copy for top + const pixel p_ul = left[-1]; + filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); + filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); + } + } +#endif + } + else + { + const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); + if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && + ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) + { + top_pred = src_u; + } + else + { +#if !PRED_C + s->hpc.intra_filter[log2_size - 2] +#else + s->hpc.intra_filter_c[log2_size - 2] +#endif + ((uint8_t *)left, (uint8_t *)top, req, avail, + (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), + ur_size, + FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); + } + } + + +#if !PRED_C + switch (mode) { + case INTRA_PLANAR: + s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: + s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: + s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: + s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: + s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t 
*)top_pred, + (uint8_t *)left, stride, + mode); + break; + } +#else + switch (mode) { + case INTRA_PLANAR: + s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_DC: + s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride); + break; + case INTRA_ANGULAR_HORIZONTAL: + s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + case INTRA_ANGULAR_VERTICAL: + s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + default: + s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, + (uint8_t *)left, stride, + mode); + break; + } + +#if DUMP_PRED + printf("U pred @ %d, %d: mode=%d\n", x, y, mode); + dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); + printf("V pred @ %d, %d: mode=%d\n", x, y, mode); + dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); +#endif +#endif +} + +#if !PRED_C +static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, ptrdiff_t stride, + int trafo_size) +{ + int x, y; + pixel *src = (pixel *)_src; + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + int size = 1 << trafo_size; + for (y = 0; y < size; y++) + for (x = 0; x < size; x++) + POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); +} +#else +static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, + const uint8_t * _left, ptrdiff_t stride, + int trafo_size) +{ + int x, y; + int size = 1 << trafo_size; + c_dst_ptr_t src = (c_dst_ptr_t)_src; + const c_src_ptr_t top = (c_src_ptr_t)_top; + const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { + for (x = 0; x < size; x++) + { + src[x][0] = 
((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + + (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); + src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + + (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); + } + } +} +#endif + +#define PRED_PLANAR(size)\ +static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ +{ \ + FUNC(pred_planar)(src, top, left, stride, size + 2); \ +} + +PRED_PLANAR(0) +PRED_PLANAR(1) +PRED_PLANAR(2) +PRED_PLANAR(3) + +#undef PRED_PLANAR + +#if !PRED_C +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size) +{ + int i, j, x, y; + int size = (1 << log2_size); + pixel *src = (pixel *)_src; + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + int dc = size; + pixel4 a; + for (i = 0; i < size; i++) + dc += left[i] + top[i]; + + dc >>= log2_size + 1; + + a = PIXEL_SPLAT_X4(dc); + + for (i = 0; i < size; i++) + for (j = 0; j < size; j+=4) + AV_WN4P(&POS(j, i), a); + +// if (c_idx == 0 && size < 32) +// As we now have separate fns for y & c - no need to test that + if (size < 32) + { + POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; + for (x = 1; x < size; x++) + POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; + for (y = 1; y < size; y++) + POS(0, y) = (left[y] + 3 * dc + 2) >> 2; + } +} +#else +static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size) +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); + c_dst_ptr_t src = (c_dst_ptr_t)_src; + const c_src_ptr_t top = (c_src_ptr_t)_top; + const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + + for (i = 0; i < size; i++) + { + dc0 += left[i][0] + top[i][0]; + dc1 += left[i][1] + top[i][1]; + } + + dc0 >>= log2_size + 1; + dc1 >>= 
log2_size + 1; + + for (i = 0; i < size; i++, src += stride) + { + for (j = 0; j < size; ++j) + { + src[j][0] = dc0; + src[j][1] = dc1; + + } + } +} +#endif + +#define PRED_DC(size)\ +static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ +{ \ + FUNC(pred_dc)(src, top, left, stride, size + 2); \ +} + +PRED_DC(0) +PRED_DC(1) +PRED_DC(2) +PRED_DC(3) + +#undef PRED_DC + + + + +#if !PRED_C +static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) +{ + int i, j; + int size = (1 << log2_size); + pixel *src = (pixel *)_src; + pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); + + for (i = 0; i < size; i++) + for (j = 0; j < size; j+=4) + AV_WN4P(&POS(j, i), a); +} +#else +static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); + c_dst_ptr_t src = (c_dst_ptr_t)_src; + const pixel a = (1 << (BIT_DEPTH - 1)); + + for (i = 0; i < size; i++, src += stride) + { + for (j = 0; j < size; ++j) + { + src[j][0] = a; + src[j][1] = a; + } + } +} +#endif + +#define PRED_DC0(size)\ +static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ +{ \ + FUNC(pred_dc0)(src, stride, size + 2); \ +} + +PRED_DC0(0) +PRED_DC0(1) +PRED_DC0(2) +PRED_DC0(3) + +#undef PRED_DC0 + + + + +#ifndef ANGLE_CONSTS +#define ANGLE_CONSTS +static const int intra_pred_angle[] = { + 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, + -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +}; +static const int inv_angle[] = { + -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, + -630, -910, -1638, -4096 +}; +#endif + +#if !PRED_C +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, + int mode, int size) +{ + int x, y; + pixel *src = (pixel *)_src; + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel 
*)_left; + + int angle = intra_pred_angle[mode - 2]; + pixel ref_array[3 * MAX_TB_SIZE + 4]; + pixel *ref_tmp = ref_array + size; + const pixel *ref; + int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + + if (angle < 0) + { + memcpy(ref_tmp + 1, top, size * PW); + ref_tmp[0] = left[-1]; + + for (x = last; x <= -1; x++) + ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; + ref = ref_tmp; + } + + for (y = 0; y < size; y++) { + int idx = ((y + 1) * angle) >> 5; + int fact = ((y + 1) * angle) & 31; + if (fact) { + for (x = 0; x < size; x += 4) { + POS(x , y) = ((32 - fact) * ref[x + idx + 1] + + fact * ref[x + idx + 2] + 16) >> 5; + POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + + fact * ref[x + 1 + idx + 2] + 16) >> 5; + POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + + fact * ref[x + 2 + idx + 2] + 16) >> 5; + POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + + fact * ref[x + 3 + idx + 2] + 16) >> 5; + } + } else { + for (x = 0; x < size; x += 4) + AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); + } + } + if (mode == 26 && size < 32) { + for (y = 0; y < size; y++) + POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); + } + + } else { + ref = left - 1; + if (angle < 0 && last < -1) { + for (x = 0; x <= size; x += 4) + AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); + // Inv angle <= -256 so top offset >= 0 + for (x = last; x <= -1; x++) + ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; + ref = ref_tmp; + } + + for (x = 0; x < size; x++) { + int idx = ((x + 1) * angle) >> 5; + int fact = ((x + 1) * angle) & 31; + if (fact) { + for (y = 0; y < size; y++) { + POS(x, y) = ((32 - fact) * ref[y + idx + 1] + + fact * ref[y + idx + 2] + 16) >> 5; + } + } else { + for (y = 0; y < size; y++) + POS(x, y) = ref[y + idx + 1]; + } + } + if (mode == 10 && size < 32) { + for (x = 0; x < size; x += 4) { + POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); + POS(x + 1, 0) = 
av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); + POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); + POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); + } + } + } +} +#else +static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, + int mode, int size) +{ + int x, y; + c_dst_ptr_t src = (c_dst_ptr_t)_src; + c_src_ptr_t top = (c_src_ptr_t)_top; + c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; + cpel ref_array[3 * MAX_TB_SIZE + 4][2]; + c_dst_ptr_t ref_tmp = ref_array + size; + c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0) { + memcpy(ref_tmp + 1, top, size * 2 * PW); + ref_tmp[0][0] = left[-1][0]; + ref_tmp[0][1] = left[-1][1]; + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } + ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { + const int idx = ((y + 1) * angle) >> 5; + const int fact = ((y + 1) * angle) & 31; + if (fact) { + for (x = 0; x < size; ++x) { + src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + + fact * ref[x + idx + 2][0] + 16) >> 5; + src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { + memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { + memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } + ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { + const int idx = ((x + 1) * angle) >> 5; + const int fact = ((x + 1) * angle) & 31; + if (fact) { + for (y = 
0; y < size; y++) { + src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + + fact * ref[y + idx + 2][0] + 16) >> 5; + src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + + fact * ref[y + idx + 2][1] + 16) >> 5; + } + } else { + for (y = 0; y < size; y++) + { + src[y * stride][0] = ref[y + idx + 1][0]; + src[y * stride][1] = ref[y + idx + 1][1]; + } + } + } + } +} +#endif + +static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int mode) +{ + FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); +} + +static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int mode) +{ + FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); +} + +static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int mode) +{ + FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); +} + +static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + const uint8_t *left, + ptrdiff_t stride, int mode) +{ + FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); +} + +#undef cpel +#undef c_src_ptr_t +#undef c_dst_ptr_t + +#undef EXTEND +#undef POS +#undef PW + +#undef filter_light1 +#undef filter_light +#undef filter_strong +#undef ref_gen + +#ifndef INCLUDED_ONCE +#define INCLUDED_ONCE +#endif + diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 index 0000000000..98a0b104b7 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,155 @@ +/* +Copyright (c) 2012, Broadcom Europe Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAJOR_NUM 100 +#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) +#define DEVICE_FILE_NAME "/dev/vcio" + +#include "rpi_mailbox.h" +//#include + +/* + * use ioctl to send mbox property message + */ + +static int mbox_property(int file_desc, void *buf) +{ + int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); + + if (ret_val < 0) { + printf("ioctl_set_msg failed:%d\n", ret_val); + } + +#ifdef DEBUG + unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; + for (i=0; i +#include +#include +#include + +#include "config.h" + +#include "libavutil/avassert.h" +#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include +#include +#include +#pragma GCC diagnostic pop + +#include "rpi_mem.h" +#include "rpi_zc_frames.h" + + +#define OPT_PREFER_CMA 0 + +struct rpi_cache_flush_env_s { + struct vcsm_user_clean_invalid2_s v; +}; + + +// GPU memory alloc fns (internal) + +static void gpu_free_internal(GPU_MEM_PTR_T * const p) +{ + if (p->arm != NULL) + vcsm_unlock_ptr(p->arm); + if (p->vcsm_handle != 0) + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again +} + + +static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, + const int numbytes, const unsigned int cache_type, const char * const name) +{ + memset(p, 0, sizeof(*p)); + p->numbytes = (numbytes + 255) & ~255; // Round up + + if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); + goto fail; + } + if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name); + goto fail; + } + if ((p->arm = 
vcsm_lock(p->vcsm_handle)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); + goto fail; + } + if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); + goto fail; + } + + return 0; + +fail: + gpu_free_internal(p); + return AVERROR(ENOMEM); +} + +// Public gpu fns + +// Allocate memory on GPU +// Fills in structure

 containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success, AVERROR(ENOMEM) if any step of gpu_malloc_internal() fails.
+// This allocates memory that will not be cached in ARM's data cache.
+// Therefore safe to use without data cache flushing.
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); // gpu_malloc_internal() rounds numbytes up to a multiple of 256
+}
+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); // same return contract as gpu_malloc_uncached()
+}
+// Release *p via gpu_free_internal(), which also zeroes the structure.
+void gpu_free(GPU_MEM_PTR_T * const p) {
+    gpu_free_internal(p);
+}
+// Shut down in the reverse order of rpi_mem_gpu_init(): VCSM first, then bcm_host.
+void rpi_mem_gpu_uninit(void)
+{
+    vcsm_exit();
+    bcm_host_deinit();
+}
+// Initialise VCSM (the mode suggested by bcm_host_is_fkms_active() first, then the other) and then bcm_host.
+int rpi_mem_gpu_init(const unsigned int flags)
+{
+    const int wants_cma = bcm_host_is_fkms_active(); // selects which vcsm_init_ex() mode is tried first
+    int use_cma;
+
+    (void)flags; // currently unused
+
+    if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0)
+        use_cma = 1; // NOTE(review): set to 1 even when wants_cma == 0 - confirm this is intended
+    else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0) // first attempt failed: try the opposite mode
+        use_cma = 0;
+    else
+        return AVERROR(EINVAL); // neither mode could be initialised
+
+    bcm_host_init();
+
+    return use_cma + 1; // 1 or 2 on success; presumably meant to line up with GPU_INIT_GPU / GPU_INIT_CMA - verify
+}
+
+// ----------------------------------------------------------------------------
+//
+// Cache flush functions
+// CACHE_EL_MAX: number of block descriptors that fit in an rpi_cache_buf_t after the vcsm_user_clean_invalid2_s header.
+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
+// Turn caller-provided storage into an empty flush environment (op_count = 0) and return it; nothing is allocated.
+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
+{
+    rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
+    *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}};
+    return rfe;
+}
+// Drop an environment without flushing; this function performs no work.
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
+    // Nothing needed
+}
+// Submit all queued operations with vcsm_clean_invalid2() (skipped if none are queued) and empty the queue.
+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
+{
+    int rc = 0;
+    if (rfe->v.op_count != 0) {
+        if (vcsm_clean_invalid2(&rfe->v) != 0)
+        {
+            const int err = errno; // read errno before calling av_log()
+            av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
+            rc = AVERROR(err);
+        }
+        rfe->v.op_count = 0; // queue is emptied whether or not the flush succeeded
+    }
+    return rc; // 0 on success or when nothing was queued, AVERROR(errno) on failure
+}
+
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) // run the queued flush; the env lives in caller storage so there is nothing to free
+{
+    int rc = rpi_cache_flush_execute(rfe);; // the second ';' is just an empty statement
+
+    return rc; // whatever rpi_cache_flush_execute() returned
+}
+// Queue one operation: `blocks` blocks of `block_size` bytes starting at gm->arm + offset0, with inter-block stride `block_stride`.
+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
+{
+    struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; // take the next free descriptor slot
+
+    av_assert1(rfe->v.op_count <= CACHE_EL_MAX); // NOTE(review): capacity is only asserted, and only after the slot was taken - confirm callers cannot exceed CACHE_EL_MAX
+
+    b->invalidate_mode = mode;
+    b->block_count = blocks;
+    b->start_address = gm->arm + offset0;
+    b->block_size = block_size;
+    b->inter_block_stride = block_stride;
+}
+// Queue a flush of `size` bytes at byte `offset` within gm; a NULL gm or a zero size is silently ignored.
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+  const unsigned int offset, const unsigned int size)
+{
+    // Deal with empty pointer trivially
+    if (gm == NULL || size == 0)
+        return;
+
+    av_assert1(offset <= gm->numbytes);
+    av_assert1(size <= gm->numbytes);
+    av_assert1(offset + size <= gm->numbytes); // terms are also checked individually above so a wrapped sum cannot pass unnoticed
+
+    rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); // a single block, so the stride is irrelevant
+}
+// Queue a flush of the whole of gm (bytes 0..numbytes); unlike rpi_cache_flush_add_gm_range() gm is not NULL-checked here.
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+{
+    rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
+}
+
+// Queue a flush of every buffer behind frame: the single buffer if gpu_is_buf1(), otherwise gpu_buf3_gmem(frame, 0..2).
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+{
+#if !RPI_ONE_BUF // compilation is deliberately stopped unless RPI_ONE_BUF is non-zero
+#error Fixme! (NIF)
+#endif
+    if (gpu_is_buf1(frame)) {
+        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+    }
+    else
+    {
+        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+    }
+}
+
+// Flush an area of a frame (this only queues operations; they run on execute/finish)
+// Width, height, x0, y0 in luma pels; do_luma / do_chroma select which parts are queued
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+  const unsigned int uv_shift, const int do_luma, const int do_chroma)
+{
+    const unsigned int y_offset = frame->linesize[0] * y0; // bytes from the start of plane 0 to row y0
+    const unsigned int y_size = frame->linesize[0] * height;
+    // Round UV up/down to get everything
+    const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+    const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); // start row is rounded down by the shift
+    const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; // end row has uv_rnd added before the shift
+
+#if 0
+    // *** frame->height is cropped height so not good
+    // As all unsigned they will also reject -ve
+    // Test individually as well as added to reject overflow
+    av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
+    av_assert0(n <= (unsigned int)frame->height);
+    av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
+    if (!gpu_is_buf1(frame)) // separate buffers: offsets are used as-is within buffers 0, 1 and 2
+    {
+        if (do_luma) {
+            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+        }
+        if (do_chroma) {
+            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+        }
+    }
+    else if (!av_rpi_is_sand_frame(frame)) // one buffer, not sand: offsets are rebased from frame->data[n] to the buffer start (gm->arm)
+    {
+        const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+        if (do_luma) {
+            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+        }
+        if (do_chroma) {
+            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+        }
+    }
+    else // sand frame: descriptors are filled in directly rather than via rpi_cache_flush_add_gm_range()
+    {
+        const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+        const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+        const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
+        const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); // x0 rounded down to a multiple of (stride1 >> xshl); assumes that value is a power of two
+        const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
+        av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); // NOTE(review): strict '<' here vs '<=' after the increment in rpi_cache_flush_add_gm_blocks() - confirm which bound is intended
+
+        if (do_chroma)
+        {
+            struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+            b->invalidate_mode = mode;
+            b->block_count = block_count;
+            b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); // NOTE(review): fixed >> 1 rather than uv_shift - presumably sand chroma is always half resolution; confirm
+            b->block_size = uv_size;
+            b->inter_block_stride = stride1 * stride2;
+        }
+        if (do_luma)
+        {
+            struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+            b->invalidate_mode = mode;
+            b->block_count = block_count;
+            b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
+            b->block_size = y_size;
+            b->inter_block_stride = stride1 * stride2;
+        }
+    }
+}
+
+// Call this to flush the whole of one GPU allocation in a single step; mode selects clean and/or invalidate
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
+    rpi_cache_buf_t cbuf; // environment storage lives on this stack frame
+    rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
+    rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+    rpi_cache_flush_finish(rfe); // the returned error code is discarded
+}
+
diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h
new file mode 100644
index 0000000000..a451079806
--- /dev/null
+++ b/libavcodec/rpi_mem.h
@@ -0,0 +1,88 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+Authors: John Cox, Ben Avison
+*/
+
+#ifndef RPI_MEM_H
+#define RPI_MEM_H
+// One GPU memory allocation, holding both its ARM-side mapping and its VideoCore-side identifiers.
+typedef struct gpu_mem_ptr_s {
+  unsigned char *arm; // Pointer to memory mapped on ARM side
+  int vc_handle;   // Videocore handle of relocatable memory
+  int vcsm_handle; // Handle for use by VCSM
+  int vc;       // Address for use in GPU code
+  int numbytes; // Size of memory block (the allocator in rpi_mem.c rounds the request up to a multiple of 256)
+} GPU_MEM_PTR_T;
+
+// General GPU functions
+// Presumably the success return values of rpi_mem_gpu_init() (it returns use_cma + 1) - verify against callers.
+#define GPU_INIT_GPU 1
+#define GPU_INIT_CMA 2
+// Allocators fill in *p and return 0, or AVERROR(ENOMEM) on failure; gpu_free() releases and zeroes *p.
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+extern void gpu_free(GPU_MEM_PTR_T * const p);
+int rpi_mem_gpu_init(const unsigned int flags); // flags is currently ignored by the implementation; returns 1 or 2 on success, AVERROR(EINVAL) on failure
+void rpi_mem_gpu_uninit(void);
+
+// Cache flush stuff
+// The environment type is opaque to users of this header; it is defined in rpi_mem.c.
+struct rpi_cache_flush_env_s;
+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+// Caller-provided storage for one flush environment. NOTE(review): uses uint32_t but this header includes nothing - the includer must supply it.
+typedef struct {uint32_t t[33];} rpi_cache_buf_t;
+// Set up an empty env inside *buf and return it (no allocation is made)
+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
+// Free env without flushing
+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & clear but do not free the env
+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
+// Do the accumulated flush & free the env
+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+// Operation selector; the implementation stores it in each queued descriptor's invalidate_mode field.
+typedef enum
+{
+    RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
+    RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
+    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 // numerically INVALIDATE | WRITEBACK
+} rpi_cache_flush_mode_t;
+// The add_* functions below only queue work in the env; nothing is flushed until execute/finish is called.
+struct AVFrame;
+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+  const unsigned int offset, const unsigned int size);
+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); // NOTE(review): mode is unsigned int here but rpi_cache_flush_mode_t in the sibling prototypes (the .c definitions use unsigned int throughout) - confirm the mix is intended
+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
+  const unsigned int uv_shift, const int do_luma, const int do_chroma); // x0, y0, width, height are in luma pels
+
+// init, add, finish for one gm ptr
+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
+#endif
diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
new file mode 100644
index 0000000000..cb7b96119e
--- /dev/null
+++ b/libavcodec/rpi_qpu.c
@@ -0,0 +1,776 @@
+/*
+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox +*/ + + +#include +#include +#include +#include +#include +#include "libavutil/avassert.h" + +#include "config.h" + +#include +#include + +#include + +#include "rpi_mailbox.h" +#include "rpi_mem.h" +#include "rpi_qpu.h" +#include "rpi_hevc_shader.h" +#include "rpi_hevc_transform8.h" +#include "rpi_hevc_transform10.h" +#include "libavutil/rpi_sand_fns.h" + +// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) +#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + +// Add profile flags to all QPU requests - generates output in "vcdbg log msg" +// Beware this is expensive and will probably throw off all other timing by >10% +#define RPI_TRACE_QPU_PROFILE_ALL 0 + +// QPU "noflush" flags +// a mixture of flushing & profiling + +#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed +#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers +#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + +#define vcos_verify_ge0(x) ((x)>=0) + +// Size in 32bit words +#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 16384 + +static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, 
-80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, +{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, +{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, +{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, +{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, +{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, +{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, +{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, +{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, +{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, +{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, +{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, +{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, +{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9}, +// Odd rows +{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, +{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, +{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, +{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, +{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, +{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, +{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, +{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, +{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, +{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, +{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, +{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, +{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, 
-90, 67, -13, -46, 85}, +{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, +{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, +{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} +}; + +// Code/constants on GPU +struct GPU +{ +// unsigned int qpu_code[QPU_CODE_SIZE]; + unsigned int vpu_code8[VPU_CODE_SIZE]; + unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + +#define WAIT_COUNT_MAX 16 + +typedef struct trace_time_one_s +{ + int count; + int64_t start[WAIT_COUNT_MAX]; + int64_t total[WAIT_COUNT_MAX]; +} trace_time_one_t; + +typedef struct trace_time_wait_s +{ + unsigned int jcount; + int64_t start0; + int64_t last_update; + trace_time_one_t active; + trace_time_one_t wait; +} trace_time_wait_t; + +typedef struct vq_wait_s +{ + sem_t sem; + struct vq_wait_s * next; +} vq_wait_t; + +#define VQ_WAIT_POOL_SIZE 16 +typedef struct vq_wait_pool_s +{ + vq_wait_t * head; + vq_wait_t pool[VQ_WAIT_POOL_SIZE]; +} vq_wait_pool_t; + +static void vq_wait_pool_init(vq_wait_pool_t * const pool); +static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); + +typedef struct gpu_env_s +{ + int open_count; + int init_count; + int vpu_i_cache_flushed; + GPU_MEM_PTR_T qpu_code_gm_ptr; + GPU_MEM_PTR_T code_gm_ptr; + GPU_MEM_PTR_T dummy_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT + trace_time_wait_t ttw; +#endif +} gpu_env_t; + +// Stop more than one thread trying to allocate memory or use the processing resources at once +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; +static gpu_env_t * gpu = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT + +static int64_t ns_time(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; +} + + +#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 + +#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) +#define T_SEC(t) (unsigned 
int)((t)/(int64_t)1000000000) +#define T_ARG(t) T_SEC(t), T_MS(t) +#define T_FMT "%u.%03u" + +static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) +{ + // Update totals for levels that are still pending + for (int i = 0; i < tto->count; ++i) { + tto->total[i] += now - tto->start[i]; + tto->start[i] = now; + } + + printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", + prefix, + T_ARG(now - start0 - tto->total[0]), + T_ARG(tto->total[0]), + T_ARG(tto->total[1]), + T_ARG(tto->total[2]), + T_ARG(tto->total[3])); +} + + +static void tto_start(trace_time_one_t * const tto, const int64_t now) +{ + av_assert0(tto->count < WAIT_COUNT_MAX); + tto->start[tto->count++] = now; +} + +static void tto_end(trace_time_one_t * const tto, const int64_t now) +{ + const int n = --tto->count; + av_assert0(n >= 0); + tto->total[n] += now - tto->start[n]; +} + +static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) +{ + printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); + tto_print(&ttw->active, now, ttw->start0, "Active"); + tto_print(&ttw->wait, now, ttw->start0, " Wait"); +} + +#endif + +// GPU memory alloc fns (internal) + +static void gpu_free_internal(GPU_MEM_PTR_T * const p) +{ + if (p->arm != NULL) + vcsm_unlock_ptr(p->arm); + if (p->vcsm_handle != 0) + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again +} + + +static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, + const int numbytes, const unsigned int cache_type, const char * const name) +{ + memset(p, 0, sizeof(*p)); + p->numbytes = (numbytes + 255) & ~255; // Round up + + if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 || + (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 || + (p->arm = vcsm_lock(p->vcsm_handle)) == NULL || + (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) 
+ { + gpu_free_internal(p); + return AVERROR(ENOMEM); + } + return 0; +} + + +// GPU init, free, lock, unlock + +static void gpu_term(void) +{ + gpu_env_t * const ge = gpu; + + // We have to hope that eveything has terminated... + gpu = NULL; + + vc_gpuserv_deinit(); + + gpu_free_internal(&ge->code_gm_ptr); + gpu_free_internal(&ge->qpu_code_gm_ptr); + gpu_free_internal(&ge->dummy_gm_ptr); + + vcsm_exit(); + + vq_wait_pool_deinit(&ge->wait_pool); + + free(ge); +} + + +// Connect to QPU, returns 0 on success. +static int gpu_init(gpu_env_t ** const gpu) { + volatile struct GPU* ptr; + gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); + int rv; + *gpu = NULL; + + if (ge == NULL) + return -1; + + vq_wait_pool_init(&ge->wait_pool); + + vcsm_init(); + + // Now copy over the QPU code into GPU memory + if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0) + return rv; + + { + int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader; + av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); + memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes); + memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes); + } + + // And the VPU code + if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0) + return rv; + ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; + + // Zero everything so we have zeros between the code bits + memset((void *)ptr, 0, sizeof(*ptr)); + { + int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); + memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); + } + { + int num_bytes = sizeof(rpi_hevc_transform10); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); + memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 
sizeof(rpi_transMatrix2even)); + + // Generate a dummy "frame" & fill with 0x80 + // * Could reset to 1 <dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0) + return rv; + memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000); + + *gpu = ge; + return 0; +} + + + +static void gpu_unlock(void) { + pthread_mutex_unlock(&gpu_mutex); +} + +// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. +static gpu_env_t * gpu_lock(void) { + pthread_mutex_lock(&gpu_mutex); + + av_assert1(gpu != NULL); + return gpu; +} + +static gpu_env_t * gpu_lock_ref(void) +{ + pthread_mutex_lock(&gpu_mutex); + + if (gpu == NULL) { + int rv = gpu_init(&gpu); + if (rv != 0) { + gpu_unlock(); + return NULL; + } + } + + ++gpu->open_count; + return gpu; +} + +static void gpu_unlock_unref(gpu_env_t * const ge) +{ + if (--ge->open_count == 0) + gpu_term(); + + gpu_unlock(); +} + +static inline gpu_env_t * gpu_ptr(void) +{ + av_assert1(gpu != NULL); + return gpu; +} + +unsigned int vpu_get_fn(const unsigned int bit_depth) { + uint32_t a = 0; + + // Make sure that the gpu is initialized + av_assert1(gpu != NULL); + switch (bit_depth){ + case 8: + a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); + break; + case 10: + a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); + break; + default: + av_assert0(0); + } + return a; +} + +unsigned int vpu_get_constants(void) { + av_assert1(gpu != NULL); + return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even)); +} + +void gpu_ref(void) +{ + gpu_lock_ref(); + gpu_unlock(); +} + +void gpu_unref(void) +{ + gpu_env_t * const ge = gpu_lock(); + gpu_unlock_unref(ge); +} + +// ---------------------------------------------------------------------------- + + +// Wait abstractions - mostly so we can easily add profile code +static void vq_wait_pool_init(vq_wait_pool_t * const wp) +{ + unsigned int i; + for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { + sem_init(&wp->pool[i].sem, 0, 0); + wp->pool[i].next = 
wp->pool + i + 1; + } + wp->head = wp->pool + 0; + wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; +} + +static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) +{ + unsigned int i; + wp->head = NULL; + for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { + sem_destroy(&wp->pool[i].sem); + wp->pool[i].next = NULL; + } +} + + +// If sem_init actually takes time then maybe we want a pool... +static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_start(&ge->ttw.active, ns_time()); +#endif + + gpu_unlock(); + return wait; +} + +static void vq_wait_delete(vq_wait_t * const wait) +{ + gpu_env_t * const ge = gpu_lock(); + wait->next = ge->wait_pool.head; + ge->wait_pool.head = wait; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + trace_time_wait_t * const ttw = &ge->ttw; + const int64_t now = ns_time(); + ++ttw->jcount; + tto_end(&ttw->wait, now); + + if (ttw->start0 == 0) + { + ttw->start0 = ttw->active.start[0]; + ttw->last_update = ttw->start0; + } + if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) + { + ttw->last_update += WAIT_TIME_PRINT_PERIOD; + ttw_print(ttw, now); + } + } +#endif + gpu_unlock_unref(ge); +} + +static void vq_wait_wait(vq_wait_t * const wait) +{ +#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + const int64_t now = ns_time(); + gpu_env_t * const ge = gpu_lock(); + tto_start(&ge->ttw.wait, now); + gpu_unlock(); + } +#endif + + while (sem_wait(&wait->sem) == -1 && errno == EINTR) + /* loop */; +} + +static void vq_wait_post(vq_wait_t * const wait) +{ +#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); + tto_end(&ge->ttw.active, ns_time()); + gpu_unlock(); + } +#endif + + sem_post(&wait->sem); +} + + + +// Header comments were wrong for these two +#define VPU_QPU_MASK_QPU 1 +#define VPU_QPU_MASK_VPU 2 + +typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; + +vpu_qpu_job_env_t * 
// Append a QPU execute job to the job list in vqj.
//
// vqj   Job environment to add to
// n     Number of QPU jobs described by mail; nothing is added if this is 0
// mail  n * QPU_MAIL_EL_VALS uint32_t values that are copied into the job
void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
{
    struct gpu_job_s * job;

    // Nothing to run => leave the job list untouched
    if (n == 0)
        return;

    job = new_job(vqj);
    vqj->mask |= VPU_QPU_MASK_QPU;

    job->command = EXECUTE_QPU;
    // No completion callback unless a sync is attached later
    job->callback.func = 0;
    job->callback.cookie = NULL;

    job->u.q.jobs = n;
#if RPI_TRACE_QPU_PROFILE_ALL
    job->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
#else
    job->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
#endif
    job->u.q.timeout = 5000;
    memcpy(job->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
}
vpu_qpu_job_callback_wait(void * v) +{ + vq_wait_post(v); +} + +// Poke a user-supplied sem +static void vpu_qpu_job_callback_sem(void * v) +{ + sem_post((sem_t *)v); +} + +void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) +{ + vq_wait_t * wait; + + if (vqj->mask == 0) { + *wait_h = NULL; + return; + } + + // We are going to want a sync object + wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs + if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) + { + struct gpu_job_s * const j = vqj->j + (vqj->n - 1); + av_assert1(j->callback.func == 0); + + j->callback.func = vpu_qpu_job_callback_wait; + j->callback.cookie = wait; + } + else + { + struct gpu_job_s *const j = new_job(vqj); + + j->command = EXECUTE_SYNC; + j->u.s.mask = vqj->mask; + j->callback.func = vpu_qpu_job_callback_wait; + j->callback.cookie = wait; + } + + vqj->mask = 0; + *wait_h = wait; +} + +// Returns 0 if no sync added ('cos Q empty), 1 if sync added +int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) +{ + // If nothing on q then just return + if (vqj->mask == 0) + return 0; + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs + if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) + { + struct gpu_job_s * const j = vqj->j + (vqj->n - 1); + av_assert1(j->callback.func == 0); + + j->callback.func = vpu_qpu_job_callback_sem; + j->callback.cookie = sem; + } + else + { + struct gpu_job_s *const j = new_job(vqj); + + j->command = EXECUTE_SYNC; + j->u.s.mask = vqj->mask; + j->callback.func = vpu_qpu_job_callback_sem; + j->callback.cookie = sem; + } + + vqj->mask = 0; + return 1; +} + + +int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) +{ + if (vqj->n == 0) + return 0; + + return vc_gpuserv_execute_code(vqj->n, vqj->j); +} + +// Simple wrapper of start + delete +int vpu_qpu_job_finish(vpu_qpu_job_env_t 
* const vqj) +{ + int rv; + rv = vpu_qpu_job_start(vqj); + vpu_qpu_job_delete(vqj); + return rv; +} + +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) + { + vq_wait_t * const wait = *wait_h; + if (wait != NULL) { + *wait_h = NULL; + vq_wait_wait(wait); + vq_wait_delete(wait); + } + } +} + +int vpu_qpu_init() +{ + gpu_env_t * const ge = gpu_lock_ref(); + if (ge == NULL) + return -1; + + if (ge->init_count++ == 0) + { + vc_gpuserv_init(); + } + + gpu_unlock(); + return 0; +} + +void vpu_qpu_term() +{ + gpu_env_t * const ge = gpu_lock(); + + if (--ge->init_count == 0) { + vc_gpuserv_deinit(); + +#if RPI_TRACE_TIME_VPU_QPU_WAIT + ttw_print(&ge->ttw, ns_time()); +#endif + } + + gpu_unlock_unref(ge); +} + +uint32_t qpu_fn(const int * const mc_fn) +{ + return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader); +} + +uint32_t qpu_dummy(void) +{ + return gpu->dummy_gm_ptr.vc; +} + +int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) +{ + // Dummy values we can catch with emulation + qf->y_pxx = ~1U; + qf->y_bxx = ~2U; + qf->y_p00 = ~3U; + qf->y_b00 = ~4U; + qf->c_pxx = ~5U; + qf->c_bxx = ~6U; + + switch (bit_depth) { + case 8: + qf->y_pxx = qpu_fn(mc_filter_y_pxx); + qf->y_pxx = qpu_fn(mc_filter_y_pxx); + qf->y_bxx = qpu_fn(mc_filter_y_bxx); + qf->y_p00 = qpu_fn(mc_filter_y_p00); + qf->y_b00 = qpu_fn(mc_filter_y_b00); + qf->c_pxx = qpu_fn(mc_filter_c_p); + qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); + qf->c_bxx = qpu_fn(mc_filter_c_b); + break; + case 10: + qf->c_pxx = qpu_fn(mc_filter_c10_p); + qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); + qf->c_bxx = qpu_fn(mc_filter_c10_b); + qf->y_pxx = qpu_fn(mc_filter_y10_pxx); + qf->y_bxx = qpu_fn(mc_filter_y10_bxx); + qf->y_p00 = qpu_fn(mc_filter_y10_p00); + qf->y_b00 = qpu_fn(mc_filter_y10_b00); + break; + default: + return -1; + } + return 0; +} + diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 index 0000000000..8777687021 
--- /dev/null +++ b/libavcodec/rpi_qpu.h @@ -0,0 +1,103 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox, Ben Avison +*/ + +#ifndef RPI_QPU_H +#define RPI_QPU_H + +#include "rpi_mem.h" +#include "rpi_zc_frames.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#pragma GCC diagnostic ignored "-Wstrict-prototypes" +#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s +#pragma GCC diagnostic pop + +// QPU specific functions + +typedef struct HEVCRpiQpu { + uint32_t c_pxx; + uint32_t c_pxx_l1; + uint32_t c_bxx; + uint32_t y_pxx; + uint32_t y_bxx; + uint32_t y_p00; + uint32_t y_b00; +} HEVCRpiQpu; + +int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); + +uint32_t qpu_fn(const int * const mc_fn); +uint32_t qpu_dummy(void); + +#define QPU_N_GRP 4 +#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 + +struct vpu_qpu_wait_s; +typedef struct vq_wait_s * vpu_qpu_wait_h; + +// VPU specific functions + +struct vpu_qpu_job_env_s; +typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; + +#define VPU_QPU_JOB_MAX 4 +struct vpu_qpu_job_env_s +{ + unsigned int n; + unsigned int mask; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; +typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; + +vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); +void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + +extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int 
vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + +void gpu_ref(void); +void gpu_unref(void); + +#endif diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 index 0000000000..37be9a0f49 --- /dev/null +++ b/libavcodec/rpi_zc.c @@ -0,0 +1,1227 @@ +#include "config.h" + +#include "libavcodec/avcodec.h" +#include "rpi_mem.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" +#include + +#include "libavutil/buffer_internal.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include +#include +#pragma GCC diagnostic pop + +#define TRACE_ALLOC 0 +#define DEBUG_ALWAYS_KEEP_LOCKED 0 + +struct ZcPoolEnt; + +typedef struct ZcPool +{ + size_t numbytes; + struct ZcPoolEnt * head; + pthread_mutex_t lock; +} ZcPool; + +typedef struct ZcPoolEnt +{ + size_t numbytes; + + unsigned int vcsm_handle; + unsigned int vc_handle; + void * map_arm; + unsigned int map_vc; + + struct ZcPoolEnt * next; + struct ZcPool * pool; +} ZcPoolEnt; + +typedef struct ZcOldCtxVals +{ + int thread_safe_callbacks; + int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); + void * opaque; +} ZcOldCtxVals; + +typedef struct AVZcEnv +{ + unsigned int refcount; + ZcOldCtxVals old; + + void * pool_env; + av_rpi_zc_alloc_buf_fn_t * alloc_buf; + av_rpi_zc_free_pool_fn_t * free_pool; + + unsigned int pool_size; +} ZcEnv; + +typedef struct ZcUserBufEnv { + void * v; + const av_rpi_zc_buf_fn_tab_t * fn; + size_t numbytes; + int offset; +} ZcUserBufEnv; + +#define ZC_BUF_INVALID 0 +#define ZC_BUF_VALID 1 +#define ZC_BUF_NEVER 2 + +typedef struct ZcBufEnv { + GPU_MEM_PTR_T gmem; + AVZcEnvPtr zc; + int is_valid; + AVBufferRef * user; + AVRpiZcFrameGeometry geo; + size_t size_y; + size_t size_c; + size_t 
size_pic; + ssize_t offset; + pthread_mutex_t lock; + pthread_cond_t cond; +} ZcBufEnv; + + + + + + +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define STRIDE_ROUND 64 +#define STRIDE_OR 0 + +#define DEBUG_ZAP0_BUFFERS 0 + +static inline int av_rpi_is_sand_format(const int format) +{ + return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) || + (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10); +} + +static inline int av_rpi_is_sand_frame(const AVFrame * const frame) +{ + return av_rpi_is_sand_format(frame->format); +} + +//---------------------------------------------------------------------------- +// +// Internal pool stuff + +// Pool entry functions + +static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size) +{ + ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt)); + + // Round up to 4k & add 4k + const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); + + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); + goto fail0; + } + + // The 0x80 here maps all pages here rather than waiting for lazy mapping + // BEWARE that in GPU land a later unlock/lock pair will put us back into + // lazy mode - which will also break cache invalidate calls. 
+ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0) + { + av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); + goto fail1; + } + +#if TRACE_ALLOC + printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle); +#endif + + zp->numbytes = alloc_size; + zp->pool = pool; + return zp; + +fail1: + av_free(zp); +fail0: + return NULL; +} + +static void zc_pool_ent_free(ZcPoolEnt * const zp) +{ +#if TRACE_ALLOC + printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle); +#endif + + if (zp->vcsm_handle != 0) + { + // VC addr & handle need no dealloc + if (zp->map_arm != NULL) + vcsm_unlock_hdl(zp->vcsm_handle); + vcsm_free(zp->vcsm_handle); + } + av_free(zp); +} + +//---------------------------------------------------------------------------- +// +// Pool functions + +static void zc_pool_free_ent_list(ZcPoolEnt * p) +{ + while (p != NULL) + { + ZcPoolEnt * const zp = p; + p = p->next; + zc_pool_ent_free(zp); + } +} + +static void zc_pool_flush(ZcPool * const pool) +{ + ZcPoolEnt * p = pool->head; + pool->head = NULL; + pool->numbytes = ~0U; + zc_pool_free_ent_list(p); +} + +static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes) +{ + ZcPoolEnt * zp = NULL; + ZcPoolEnt * flush_list = NULL; + size_t numbytes; + + pthread_mutex_lock(&pool->lock); + + numbytes = pool->numbytes; + + // If size isn't close then dump the pool + // Close in this context means within 128k + if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) + { + flush_list = pool->head; + pool->head = NULL; + pool->numbytes = numbytes = req_bytes; + } + else if (pool->head != NULL) + { + zp = pool->head; + pool->head = zp->next; + } + + pthread_mutex_unlock(&pool->lock); + + zc_pool_free_ent_list(flush_list); + + if (zp == NULL) + zp = zc_pool_ent_alloc(pool, numbytes); + + return zp; +} + +static void zc_pool_put_ent(ZcPoolEnt * const zp) +{ + ZcPool * const pool 
= zp == NULL ? NULL : zp->pool; + if (zp != NULL) + { + pthread_mutex_lock(&pool->lock); +#if TRACE_ALLOC + printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes); +#endif + + if (pool->numbytes == zp->numbytes) + { + zp->next = pool->head; + pool->head = zp; + pthread_mutex_unlock(&pool->lock); + } + else + { + pthread_mutex_unlock(&pool->lock); + zc_pool_ent_free(zp); + } + } +} + +static ZcPool * +zc_pool_new(void) +{ + ZcPool * const pool = av_mallocz(sizeof(*pool)); + if (pool == NULL) + return NULL; + + pool->numbytes = -1; + pool->head = NULL; + pthread_mutex_init(&pool->lock, NULL); + return pool; +} + +static void +zc_pool_delete(ZcPool * const pool) +{ + if (pool != NULL) + { + pool->numbytes = -1; + zc_pool_flush(pool); + pthread_mutex_destroy(&pool->lock); + av_free(pool); + } +} + +//============================================================================ +// +// ZC implementation using above pool implementation +// +// Fn table fns... + +static void zc_pool_free_v(void * v) +{ + zc_pool_put_ent(v); +} + +static unsigned int zc_pool_ent_vcsm_handle_v(void * v) +{ + ZcPoolEnt * zp = v; + return zp->vcsm_handle; +} + +static unsigned int zc_pool_ent_vc_handle_v(void * v) +{ + ZcPoolEnt * zp = v; + if (zp->vc_handle == 0) + { + if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0) + av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n", + __func__, zp->vcsm_handle); + } + return zp->vc_handle; +} + +static void * zc_pool_ent_map_arm_v(void * v) +{ + ZcPoolEnt * zp = v; + if (zp->map_arm == NULL) + { + if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL) + av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n", + __func__, zp->vcsm_handle); + } + return zp->map_arm; +} + +static unsigned int zc_pool_ent_map_vc_v(void * v) +{ + ZcPoolEnt * zp = v; + if (zp->map_vc == 0) + { + if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0) + av_log(NULL, AV_LOG_ERROR, "%s: 
Failed to map VCSM handle %d to VC address\n", + __func__, zp->vcsm_handle); + } + return zp->map_vc; +} + +static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = { + .free = zc_pool_free_v, + .vcsm_handle = zc_pool_ent_vcsm_handle_v, + .vc_handle = zc_pool_ent_vc_handle_v, + .map_arm = zc_pool_ent_map_arm_v, + .map_vc = zc_pool_ent_map_vc_v, +}; + +// ZC Env fns + +// Delete pool +// All buffers guaranteed freed by now +static void +zc_pool_delete_v(void * v) +{ + zc_pool_delete((ZcPool *)v); + rpi_mem_gpu_uninit(); +} + +// Allocate a new ZC buffer +static AVBufferRef * +zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo) +{ + ZcPool * const pool = v; + ZcPoolEnt *const zp = zc_pool_get_ent(pool, size); + AVBufferRef * buf; + + (void)geo; // geo ignored here + + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); + goto fail0; + } + + if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n"); + goto fail2; + } + + return buf; + +fail2: + zc_pool_put_ent(zp); +fail0: + return NULL; +} + +// Init wrappers - the public fns + +AVZcEnvPtr +av_rpi_zc_int_env_alloc(void * logctx) +{ + ZcEnv * zc; + ZcPool * pool_env; + + if (rpi_mem_gpu_init(0) < 0) + return NULL; + + if ((pool_env = zc_pool_new()) == NULL) + goto fail1; + + if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL) + goto fail2; + + return zc; + +fail2: + zc_pool_delete(pool_env); +fail1: + rpi_mem_gpu_uninit(); + return NULL; +} + +void +av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp) +{ + const AVZcEnvPtr zc = *zcp; + *zcp = NULL; + if (zc != NULL) + av_rpi_zc_env_release(zc); +} + +//============================================================================ +// +// Geometry +// +// This is a separate chunck to the rest + +// Get mailbox fd - should be in a lock when called +// Rely on process close to close it +static int mbox_fd(void) +{ + static 
int fd = -1; + if (fd != -1) + return fd; + return (fd = mbox_open()); +} + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( + const int format, const unsigned int video_width, const unsigned int video_height) +{ + static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; + + AVRpiZcFrameGeometry geo = { + .format = format, + .video_width = video_width, + .video_height = video_height + }; + + switch (format) + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; + geo.stride_c = geo.stride_y / 2; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; + geo.bytes_per_pel = 1; + geo.stripe_is_yc = 1; + break; + + case AV_PIX_FMT_YUV420P10: + geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; + geo.stride_c = geo.stride_y / 2; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; + geo.bytes_per_pel = 2; + geo.stripe_is_yc = 1; + break; + + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + { + const unsigned int stripe_w = 128; + + static VC_IMAGE_T img = {0}; + + // Given the overhead of calling the mailbox keep a stashed + // copy as we will almost certainly just want the same numbers again + // but that means we need a lock + pthread_mutex_lock(&sand_lock); + + if (img.width != video_width || img.height != video_height) + { + VC_IMAGE_T new_img = { + .type = VC_IMAGE_YUV_UV, + .width = video_width, + .height = video_height + }; + + mbox_get_image_params(mbox_fd(), &new_img); + img = new_img; + } + + geo.stride_y = stripe_w; + geo.stride_c = stripe_w; + geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.stripe_is_yc = 1; + if (geo.height_y * stripe_w > img.pitch) + { + // "tall" sand - all C blocks now follow Y + geo.height_y = 
img.pitch / stripe_w; + geo.height_c = geo.height_y; + geo.stripe_is_yc = 0; + } + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; + geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); +#if 0 + printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", + video_width, video_height, + geo.stride_y, geo.stride_c, + geo.height_y, geo.height_c, + geo.stripes, img.pitch); +#endif + av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); + av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); + break; + } + + case AV_PIX_FMT_RPI4_10: + { + const unsigned int stripe_w = 128; // bytes + + static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; + static VC_IMAGE_T img = {0}; + + // Given the overhead of calling the mailbox keep a stashed + // copy as we will almost certainly just want the same numbers again + // but that means we need a lock + pthread_mutex_lock(&sand_lock); + + if (img.width != video_width || img.height != video_height) + { + VC_IMAGE_T new_img = { + .type = VC_IMAGE_YUV10COL, + .width = video_width, + .height = video_height + }; + + mbox_get_image_params(mbox_fd(), &new_img); + img = new_img; + } + + geo.stride_y = stripe_w; + geo.stride_c = stripe_w; + geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w; + geo.bytes_per_pel = 1; + geo.stripe_is_yc = 1; + + pthread_mutex_unlock(&sand_lock); + +#if 0 + printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", + video_width, video_height, + geo.stride_y, geo.stride_c, + geo.height_y, geo.height_c, + geo.stripes, img.pitch); +#endif + av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); + av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); + break; + } + + case AV_PIX_FMT_SAND64_16: + case 
AV_PIX_FMT_SAND64_10: + { + const unsigned int stripe_w = 128; // bytes + + static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; + static VC_IMAGE_T img = {0}; + + // Given the overhead of calling the mailbox keep a stashed + // copy as we will almost certainly just want the same numbers again + // but that means we need a lock + pthread_mutex_lock(&sand_lock); + + if (img.width != video_width || img.height != video_height) + { + VC_IMAGE_T new_img = { + .type = VC_IMAGE_YUV_UV_16, + .width = video_width, + .height = video_height + }; + + mbox_get_image_params(mbox_fd(), &new_img); + img = new_img; + } + + geo.stride_y = stripe_w; + geo.stride_c = stripe_w; + geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; + geo.bytes_per_pel = 2; + geo.stripe_is_yc = 1; + + pthread_mutex_unlock(&sand_lock); + break; + } + + default: + break; + } + return geo; +} + +//============================================================================ +// +// ZC Env fns +// +// Frame copy fns + +// Copy planes 0..2 of src line by line into a newly allocated ZC buffer +// (width bytes per luma line, width / 2 bytes per chroma line) +// Returns the new buffer ref, or NULL if the buffer could not be obtained +static AVBufferRef * zc_copy(const AVZcEnvPtr zc, + const AVFrame * const src) +{ + AVFrame dest_frame; + AVFrame * const dest = &dest_frame; + unsigned int i; + uint8_t * psrc, * pdest; + + dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + + if (av_rpi_zc_get_buffer(zc, dest) != 0 || + av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) + { + // get_buffer may have succeeded before resolve failed: drop that ref + // so the placeholder and its env reference are not leaked + // (av_buffer_unref is a no-op while buf[0] is still NULL) + av_buffer_unref(&dest->buf[0]); + return NULL; + } + + for (i = 0, psrc = src->data[0], pdest = dest->data[0]; + i != dest->height; + ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) + { + memcpy(pdest, psrc, dest->width); + } + for (i = 0, psrc = src->data[1], pdest = dest->data[1]; + i != dest->height / 2; + ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) + { + memcpy(pdest, psrc, dest->width / 2); + } + for (i = 0, psrc = src->data[2], pdest = 
dest->data[2]; + i != dest->height / 2; + ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) + { + memcpy(pdest, psrc, dest->width / 2); + } + + return dest->buf[0]; +} + + +static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc, + const AVFrame * const src) +{ + assert(0); + return NULL; +} + + +static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc, + const AVFrame * const src, const unsigned int src_bits) +{ + assert(0); + return NULL; +} + +//---------------------------------------------------------------------------- +// +// Public info extraction calls + +static void zc_buf_env_free_cb(void * opaque, uint8_t * data); + +static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf) +{ + // Kludge where we check the free fn to check this is really + // one of our buffers - can't think of a better way + return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL : + av_buffer_get_opaque(buf); +} + +static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) +{ + // As gmem is the first el NULL should be preserved + return &pic_zbe_ptr(buf)->gmem; +} + +unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); + return p == NULL ? 0 : p->vcsm_handle; +} + +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); + return p == NULL ? -1 : p->vc_handle; +} + +int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) +{ + const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); + return zbe == NULL ? 0 : zbe->offset; +} + +int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) +{ + const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); + return zbe == NULL ? 0 : zbe->size_pic; +} + +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); + return p == NULL ? 
0 : p->numbytes; +} + +const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref) +{ + const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); + return zbe == NULL ? NULL : &zbe->geo; +} + +AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc, + const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + av_assert0(!maycopy || zc != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && + frame->format != AV_PIX_FMT_YUV420P10 && + !av_rpi_is_sand_frame(frame)) + { + av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + + if (frame->buf[1] != NULL || frame->format != expected_format) + { +#if RPI_ZC_SAND_8_IN_10_BUF + if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) + { +// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); + return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); + } +#endif + + if (maycopy) + { + if (frame->buf[1] != NULL) + av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); + else + av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); + + switch (frame->format) + { + case AV_PIX_FMT_YUV420P10: + return zc_420p10_to_sand128(zc, frame); + + case AV_PIX_FMT_SAND64_10: + return zc_sand64_16_to_sand128(zc, frame, 10); + + default: + return zc_copy(zc, frame); + } + } + else + { + if (frame->buf[1] != NULL) + av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); + else + av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } + + if (pic_gm_ptr(frame->buf[0]) == NULL) + { + if (maycopy) + { + av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); + return zc_copy(zc, frame); + } + 
else + { + av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); + return NULL; + } + } + + return av_buffer_ref(frame->buf[0]); +} + +void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) +{ + if (fr_ref != NULL) + { + av_buffer_unref(&fr_ref); + } +} + +//---------------------------------------------------------------------------- + +// Extract user environment from an AVBufferRef +void * av_rpi_zc_buf_v(AVBufferRef * const buf) +{ + ZcBufEnv * const zbe = pic_zbe_ptr(buf); + if (zbe != NULL && zbe->user != NULL) + { + const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data; + return zub == NULL ? NULL : zub->v; + } + return NULL; +} + +// AV buffer pre-free callback +static void zc_user_buf_free_cb(void * opaque, uint8_t * data) +{ + if (opaque != NULL) + { + ZcUserBufEnv * const zub = opaque; + + if (zub->fn->free) + zub->fn->free(zub->v); + + av_free(zub); + } +} + +static void zc_buf_env_free_cb(void * opaque, uint8_t * data) +{ + if (opaque != NULL) + { + ZcBufEnv * const zbe = opaque; + + av_buffer_unref(&zbe->user); + + if (zbe->zc != NULL) + av_rpi_zc_env_release(zbe->zc); + + pthread_cond_destroy(&zbe->cond); + pthread_mutex_destroy(&zbe->lock); + av_free(zbe); + } +} + + +// Wrap the various ZC bits in an AV Buffer and resolve those things we want +// resolved now. 
+// Currently we resolve everything, but in future we might not +AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab) +{ + AVBufferRef *buf; + ZcUserBufEnv * zub; + + if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL) + return NULL; + + zub->fn = fn_tab; + zub->v = v; + zub->numbytes = numbytes; + zub->offset = addr_offset; + + if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n"); + av_free(zub); + return NULL; + } + + return buf; +} + +int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode) +{ + ZcBufEnv * const zbe = pic_zbe_ptr(buf); + + if (zbe == NULL) + return AVERROR(EINVAL); + + if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid) + return AVERROR(EAGAIN); + + if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid) + { + pthread_mutex_lock(&zbe->lock); + while (!zbe->is_valid) + pthread_cond_wait(&zbe->cond, &zbe->lock); + pthread_mutex_unlock(&zbe->lock); + } + + if (zbe->is_valid == ZC_BUF_NEVER) + return AVERROR(EINVAL); + + // Do alloc if we need it + if (zbe->user == NULL) + { + ZcEnv * const zc = zbe->zc; + const ZcUserBufEnv * zub; + + av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID); + + if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + goto fail; + } + zub = (const ZcUserBufEnv *)zbe->user->data; + + // Track + + zbe->offset = zub->offset; + zbe->gmem.numbytes = zub->numbytes; + if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle); + goto fail; + } + + if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n"); + goto fail; + } + + 
if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle); + goto fail; + } + if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0) + { + av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle); + goto fail; + } + + buf->buffer->data = zbe->gmem.arm + zbe->offset; + buf->buffer->size = zbe->size_pic; + + // In this mode we shouldn't have anyone waiting for us + // so no need to signal + if (alloc_mode == ZC_RESOLVE_ALLOC_VALID) + zbe->is_valid = 1; + } + + // Just overwrite - no point in testing + buf->data = zbe->gmem.arm + zbe->offset; + buf->size = zbe->size_pic; + return 0; + +fail: + av_buffer_unref(&zbe->user); + return AVERROR(ENOMEM); +} + +int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc) +{ + int rv; + + // Do alloc if we need it + if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0) + return rv; + + // If we are a framebuf copy then the alloc can be done but we haven't + // imported its results yet + if (frame->data[0] == NULL) + { + const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); + + frame->linesize[0] = zbe->geo.stride_y; + frame->linesize[1] = zbe->geo.stride_c; + frame->linesize[2] = zbe->geo.stride_c; + // abuse: linesize[3] = "stripe stride" + // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). + // In a general case this makes the calculation an xor and multiply rather + // than a divide and multiply + if (zbe->geo.stripes > 1) + frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y; + + frame->data[0] = frame->buf[0]->data; + frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? 
zbe->size_y : zbe->size_y * zbe->geo.stripes); + if (zbe->geo.planes_c > 1) + frame->data[2] = frame->data[1] + zbe->size_c; + + frame->extended_data = frame->data; + // Leave extended buf alone + } + + return 0; +} + +// Mark the frame's ZC buffer as valid and wake any thread waiting on it +// Returns 0 on success or AVERROR(EINVAL) if buf[0] is not one of our buffers +int av_rpi_zc_set_valid_frame(AVFrame * const frame) +{ + ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); + + if (zbe == NULL) + return AVERROR(EINVAL); + + // The waiter in av_rpi_zc_resolve_buffer tests is_valid with the lock + // held, so the flag must change under the same lock or the broadcast + // can land between its test and its pthread_cond_wait and be lost + pthread_mutex_lock(&zbe->lock); + zbe->is_valid = ZC_BUF_VALID; + pthread_cond_broadcast(&zbe->cond); + pthread_mutex_unlock(&zbe->lock); + + return 0; +} + +// Mark the frame's ZC buffer as never going to be valid and wake any waiter +// Returns 0 on success or AVERROR(EINVAL) if buf[0] is not one of our buffers +int av_rpi_zc_set_broken_frame(AVFrame * const frame) +{ + ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); + + if (zbe == NULL) + return AVERROR(EINVAL); + + // Same locking rule as av_rpi_zc_set_valid_frame: change the flag + // under the lock so a waiter cannot miss the wakeup + pthread_mutex_lock(&zbe->lock); + zbe->is_valid = ZC_BUF_NEVER; + pthread_cond_broadcast(&zbe->cond); + pthread_mutex_unlock(&zbe->lock); + + return 0; +} + +void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size) +{ + zc->pool_size = pool_size; +} + +unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc) +{ + return zc->pool_size; +} + +int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame) +{ +#if 1 + ZcBufEnv * zbe = av_mallocz(sizeof(*zbe)); + + for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) { + frame->buf[i] = NULL; + frame->data[i] = NULL; + frame->linesize[i] = 0; + } + + if (zbe == NULL) + return AVERROR(ENOMEM); + + if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL) + { + av_free(zbe); + return AVERROR(ENOMEM); + } + + pthread_mutex_init(&zbe->lock, NULL); + pthread_cond_init(&zbe->cond, NULL); + zbe->zc = zc; + atomic_fetch_add(&zc->refcount, 1); + + zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use + zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y; + zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c; + zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes; + +#else + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); + const unsigned int size_y = 
geo.stride_y * geo.height_y; + const unsigned int size_c = geo.stride_c * geo.height_c; + const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; + AVBufferRef * buf; + unsigned int i; + +// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); + + if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + return AVERROR(ENOMEM); + } + + // Track + atomic_fetch_add(&zc->refcount, 1); + pic_zbe_ptr(buf)->zc = zc; + + for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { + frame->buf[i] = NULL; + frame->data[i] = NULL; + frame->linesize[i] = 0; + } + + frame->buf[0] = buf; + + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; + // abuse: linesize[3] = "stripe stride" + // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). + // In a general case this makes the calculation an xor and multiply rather + // than a divide and multiply + if (geo.stripes > 1) + frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? 
size_y : size_y * geo.stripes); + if (geo.planes_c > 1) + frame->data[2] = frame->data[1] + size_c; + + frame->extended_data = frame->data; + // Leave extended buf alone + +#if RPI_ZC_SAND_8_IN_10_BUF != 0 + // *** If we intend to use this for real we will want a 2nd buffer pool + frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge +#endif +#endif + + return 0; +} + +void av_rpi_zc_env_release(const AVZcEnvPtr zc) +{ + const int n = atomic_fetch_add(&zc->refcount, -1); + if (n == 1) // was 1, now 0 + { + zc->free_pool(zc->pool_env); + av_free(zc); + } +} + +AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, + void * pool_env, + av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, + av_rpi_zc_free_pool_fn_t * free_pool_fn) +{ + ZcEnv * zc; + + if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL) + { + av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); + return NULL; + } + + *zc = (ZcEnv){ + .refcount = ATOMIC_VAR_INIT(1), + .pool_env = pool_env, + .alloc_buf = alloc_buf_fn, + .free_pool = free_pool_fn, + .pool_size = 0 + }; + + return zc; +} + +//============================================================================ +// +// External ZC initialisation + +#define RPI_GET_BUFFER2 1 + + +static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) +{ +#if !RPI_GET_BUFFER2 + return avcodec_default_get_buffer2(s, frame, flags); +#else + int rv; + + if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) + { +// printf("Do default alloc: format=%#x\n", frame->format); + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || + av_rpi_is_sand_frame(frame)) + { + if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0) + rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); + } + else + { + rv = avcodec_default_get_buffer2(s, frame, flags); + } + +#if 0 + printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p 
opaque[0]=%p\n", __func__, + frame->format, frame->width, frame->height, + frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], + frame->data[0], frame->data[1], frame->data[2], + frame->buf[0], frame->buf[1], frame->buf[2], + av_buffer_get_opaque(frame->buf[0])); +#endif + return rv; +#endif +} + +int av_rpi_zc_in_use(const struct AVCodecContext * const s) +{ + return s->get_buffer2 == zc_get_buffer2; +} + +int av_rpi_zc_init2(struct AVCodecContext * const s, + void * pool_env, + av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, + av_rpi_zc_free_pool_fn_t * free_pool_fn) +{ + ZcEnv * zc; + + av_assert0(!av_rpi_zc_in_use(s)); + + if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL) + return AVERROR(ENOMEM); + + zc->old = (ZcOldCtxVals){ + .opaque = s->opaque, + .get_buffer2 = s->get_buffer2, + .thread_safe_callbacks = s->thread_safe_callbacks + }; + + s->opaque = zc; + s->get_buffer2 = zc_get_buffer2; + s->thread_safe_callbacks = 1; + return 0; +} + +void av_rpi_zc_uninit2(struct AVCodecContext * const s) +{ + ZcEnv * const zc = s->opaque; + + av_assert0(av_rpi_zc_in_use(s)); + + s->get_buffer2 = zc->old.get_buffer2; + s->opaque = zc->old.opaque; + s->thread_safe_callbacks = zc->old.thread_safe_callbacks; + + av_rpi_zc_env_release(zc); +} + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 index 0000000000..f00a7c962c --- /dev/null +++ b/libavcodec/rpi_zc.h @@ -0,0 +1,228 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox +*/ + +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + +// Zero-Copy frame code for RPi +// RPi needs Y/U/V planes to be contiguous for display. By default +// ffmpeg will allocate separated planes so a memcpy is needed before +// display. This code provides a method a making ffmpeg allocate a single +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
+ +// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame +// 0 disables +// *** This option still in development +// Only works if SAO active +// Allocates buffers that are twice the required size +#define RPI_ZC_SAND_8_IN_10_BUF 0 + +struct AVBufferRef; +struct AVFrame; +struct AVCodecContext; +enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference +typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ + unsigned int stride_y; // Luma stride (bytes) + unsigned int height_y; // Luma height (lines) + unsigned int stride_c; // Chroma stride (bytes) + unsigned int height_c; // Chroma height (lines) + unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) + unsigned int stripes; // Number of stripes (sand) + unsigned int bytes_per_pel; // Bytes per pel as filled in by av_rpi_zc_frame_geometry + int stripe_is_yc; // A single stripe is Y then C (false for tall sand) + + int format; // Requested format + unsigned int video_width; // Requested width + unsigned int video_height; // Requested height +} AVRpiZcFrameGeometry; + +// Get expected MMAL geometry for a given format, width & height +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( + const int format, + const unsigned int video_width, const unsigned int video_height); + +//---------------------------------------------------------------------------- +// +// Calls that extract info from a ZC frame whether internally or externally +// allocated + +// Generate a ZC reference to the buffer(s) in this frame +// If the buffer doesn't appear to be one allocated by ZC +// then the behaviour depends on maycopy: +// If maycopy=0 then return NULL +// If maycopy=1 && the src frame is in a form where we can easily copy +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +// If maycopy == 0 then ZC may be NULL +AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc, + 
const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Unreference the buffer refed/allocated by _zc_ref +// If fr_ref is NULL then this will NOP +void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); +// Get the vcsm_handle from the frame ref +// Returns 0 if ref doesn't look valid +unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref); +// Get offset from the start of the memory referenced +// by the vc_handle to valid data +int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); +// Length of buffer data +int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); +// Get the number of bytes allocated from the frame ref +// Returns 0 if ref doesn't look valid +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); +// Geometry this frame was allocated with +const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref); + +//---------------------------------------------------------------------------- +// +// Calls for external frame allocation + +// Callbacks registered in av_rpi_zc_init2 + +// Callback to allocate a buf for a frame +// The frame itself is generated in the calling code +// +// Parameters: +// pool_env value passed to av-rpi_zc_init2 +// size size wanted +// geo geometry of the frame to be allocated +// Returns: +// NULL Alloc failed +// ptr AVBufferBuf* of allocated buffer +// In most cases av_rpi_zc_buf will be called by this function +// and this will be the buf returned by that. 
+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size, + const AVRpiZcFrameGeometry * geo); + +// Callback once ffmpeg is completely done with this pool +// Called once all allocated buffers have been derefed and ffmpegs ref to this +// pool has been dropped +typedef void av_rpi_zc_free_pool_fn_t(void * pool_env); + +// Init ZC into a context +// Sets opaque, get_buffer2, thread_safe_callbacks +// Use if you want to allocate your own pools and/or create ZC buffers for +// all decoders +// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken +// apart by av_rpi_zc_xxx calls without this +int av_rpi_zc_init2(struct AVCodecContext * const s, + void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, + av_rpi_zc_free_pool_fn_t * free_pool_fn); + +// Free ZC from a context +void av_rpi_zc_uninit2(struct AVCodecContext * const s); + +// Get minimum pool size in frames - valid by the time the first alloc request +// occurs. Takes into account thread requests and DPB sizes derived from SPS +// rather than just adding a worst case DPB size. +unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc); + +typedef struct av_rpi_zc_buf_fn_tab_s { + // This AVBuffer is being freed by ffmpeg - return memory + // to external pool. Memory may be, but need not be, unmapped. + // v is the ptr passed in av_rpi_zc_buf + void (* free)(void * v); + + // Return appropriate handles / mappings + // v is the ptr passed in av_rpi_zc_buf + unsigned int (* vcsm_handle)(void * v); + unsigned int (* vc_handle)(void * v); + void * (* map_arm)(void * v); + unsigned int (* map_vc)(void * v); +} av_rpi_zc_buf_fn_tab_t; + +// Allocate a ZC AVBufferRef and set its callback table +// Doesn't take a buffer address directly - relies on callbacks to return +// addresses as they are required. Mappings need not be generated until +// the map callbacks are called but they should persist from then until +// the buffer is freed. 
+// +// Parameters: +// numbytes Size of the buffer +// addr_offset Offset to first usable byte of buffer (for alignment) +// normally 0 +// v Pointer passed to callbacks +// fn_tab Function table +AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab); + +// Get v ptr set in in av_rpi_zc_buf +void * av_rpi_zc_buf_v(AVBufferRef * const buf); + +//---------------------------------------------------------------------------- +// +// Mostly internal calls but might possibly be wanted by outside code + +void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc); +AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx); +void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size); + +// Test to see if the context is using zc (checks get_buffer2) +int av_rpi_zc_in_use(const struct AVCodecContext * const s); + +// Get buffer generates placeholders for later alloc +int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame); +// Resolve actually does the alloc (noop if already alloced) +// Set data pointers on a buffer/frame that was copied before the alloc +// accured +#define ZC_RESOLVE_FAIL 0 // return error on invalid +#define ZC_RESOLVE_ALLOC 1 // alloc as invalid +#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid +#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid +int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc); +int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc); + +int av_rpi_zc_set_valid_frame(AVFrame * const frame); +int av_rpi_zc_set_broken_frame(AVFrame * const frame); + + + + +AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, + void * pool_env, + av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, + av_rpi_zc_free_pool_fn_t * free_pool_fn); +void av_rpi_zc_env_release(const AVZcEnvPtr zc); + + +#endif + diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h new file mode 100644 index 0000000000..9b7b6536a4 --- /dev/null +++ 
b/libavcodec/rpi_zc_frames.h @@ -0,0 +1,142 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox, Ben Avison +*/ + +#ifndef RPI_ZC_FRAMES_H +#define RPI_ZC_FRAMES_H + +#define RPI_ONE_BUF 1 + +#include "rpi_mem.h" // for GPU_MEM_PTR_T +#include "libavutil/frame.h" + +#if !RPI_ONE_BUF +static inline uint32_t get_vc_address_y(const AVFrame * const frame) { + GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]); + return p->vc; +} + +static inline uint32_t get_vc_address_u(const AVFrame * const frame) { + GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]); + return p->vc; +} + +static inline uint32_t get_vc_address_v(const AVFrame * const frame) { + GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]); + return p->vc; +} + +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { + return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]); +} + +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { + return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]); +} + +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { + return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]); +} + +#else + +static inline int gpu_is_buf1(const AVFrame * const frame) +{ + return frame->buf[1] == NULL; +} + +static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) +{ + return av_buffer_get_opaque(frame->buf[0]); +} + +static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) +{ + return av_buffer_pool_buffer_get_opaque(frame->buf[n]); +} + +static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) +{ + const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); + return gm->vc + (frame->data[n] - gm->arm); +} + + +static inline uint32_t get_vc_address_y(const AVFrame * const frame) { + return get_vc_address3(frame, 0); +} + +static inline uint32_t get_vc_address_u(const AVFrame * const frame) { + return get_vc_address3(frame, 1); +} + +static inline uint32_t get_vc_address_v(const AVFrame * const frame) { + return get_vc_address3(frame, 2); +} + +#if 0 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { + if (gpu_is_buf1(frame)) + { + GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); + g.numbytes = frame->data[1] - frame->data[0]; + return g; + } + else + return *gpu_buf3_gmem(frame, 0); +} + +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { + if (gpu_is_buf1(frame)) + { + GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); + g.arm += frame->data[1] - frame->data[0]; + g.vc += frame->data[1] - frame->data[0]; + g.numbytes = frame->data[2] - frame->data[1]; // chroma size + return g; + } + else + return *gpu_buf3_gmem(frame, 1); +} + +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { + if (gpu_is_buf1(frame)) + { + GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); + g.arm += frame->data[2] - frame->data[0]; + g.vc += frame->data[2] - frame->data[0]; + g.numbytes = frame->data[2] - frame->data[1]; // chroma size + return g; + } + else + return *gpu_buf3_gmem(frame, 2); +} +#endif +#endif + +#endif diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c new file mode 100644 index 0000000000..85c5b46d75 --- /dev/null +++ b/libavcodec/rpivid_hevc.c @@ -0,0 +1,2128 @@ +// FFMPEG HEVC decoder hardware accelerator +// Andrew Holme, Argon Design Ltd +// Copyright (c) June 2017 Raspberry Pi Ltd + +#include +#include +#include +#include +#include +#include + +#include "fftools/ffmpeg.h" +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "avcodec.h" +#include "hwconfig.h" +#include "decode.h" + 
+#include "hevc.h" +#include "hevcdec.h" +#include "rpi_zc.h" +#include "rpi_mem.h" +#include "rpi_zc_frames.h" +#include "rpi_mailbox.h" + + +#define OPT_PHASE_TIMING 0 // Generate stats for phase usage + +#define OPT_EMU 0 + +#define TRACE_DEV 0 +#define TRACE_ENTRY 0 + +#define NUM_SCALING_FACTORS 4064 + +#define AXI_BASE64 0 + +#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0)) +#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6)) + +#define RPIVID_COL_PICS 17 // 16 ref & current + +#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1) +#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size + +#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2) +#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size + +////////////////////////////////////////////////////////////////////////////// +// +// Register offsets + +#define RPI_SPS0 0 +#define RPI_SPS1 4 +#define RPI_PPS 8 +#define RPI_SLICE 12 +#define RPI_TILESTART 16 +#define RPI_TILEEND 20 +#define RPI_SLICESTART 24 +#define RPI_MODE 28 +#define RPI_LEFT0 32 +#define RPI_LEFT1 36 +#define RPI_LEFT2 40 +#define RPI_LEFT3 44 +#define RPI_QP 48 +#define RPI_CONTROL 52 +#define RPI_STATUS 56 +#define RPI_VERSION 60 +#define RPI_BFBASE 64 +#define RPI_BFNUM 68 +#define RPI_BFCONTROL 72 +#define RPI_BFSTATUS 76 +#define RPI_PUWBASE 80 +#define RPI_PUWSTRIDE 84 +#define RPI_COEFFWBASE 88 +#define RPI_COEFFWSTRIDE 92 +#define RPI_SLICECMDS 96 +#define RPI_BEGINTILEEND 100 +#define RPI_TRANSFER 104 +#define RPI_CFBASE 108 +#define RPI_CFNUM 112 +#define RPI_CFSTATUS 116 + +#define RPI_PURBASE 0x8000 +#define RPI_PURSTRIDE 0x8004 +#define RPI_COEFFRBASE 0x8008 +#define RPI_COEFFRSTRIDE 0x800C +#define RPI_NUMROWS 0x8010 +#define RPI_CONFIG2 0x8014 +#define RPI_OUTYBASE 0x8018 +#define RPI_OUTYSTRIDE 0x801C +#define RPI_OUTCBASE 0x8020 +#define RPI_OUTCSTRIDE 0x8024 +#define RPI_STATUS2 0x8028 +#define RPI_FRAMESIZE 0x802C +#define RPI_MVBASE 0x8030 +#define RPI_MVSTRIDE 0x8034 +#define RPI_COLBASE 0x8038 
+#define RPI_COLSTRIDE 0x803C +#define RPI_CURRPOC 0x8040 + +////////////////////////////////////////////////////////////////////////////// + +// Unused but left here to illustrate the diffrences between FFmpegs prob +// structure and the rpivid one + +struct FFM_PROB { + uint8_t sao_merge_flag [ 1]; + uint8_t sao_type_idx [ 1]; + uint8_t split_coding_unit_flag [ 3]; + uint8_t cu_transquant_bypass_flag [ 1]; + uint8_t skip_flag [ 3]; + uint8_t cu_qp_delta [ 3]; + uint8_t pred_mode_flag [ 1]; + uint8_t part_mode [ 4]; + uint8_t prev_intra_luma_pred_flag [ 1]; + uint8_t intra_chroma_pred_mode [ 2]; + uint8_t merge_flag [ 1]; + uint8_t merge_idx [ 1]; + uint8_t inter_pred_idc [ 5]; + uint8_t ref_idx_l0 [ 2]; + uint8_t ref_idx_l1 [ 2]; + uint8_t abs_mvd_greater0_flag [ 2]; + uint8_t abs_mvd_greater1_flag [ 2]; + uint8_t mvp_lx_flag [ 1]; + uint8_t no_residual_data_flag [ 1]; + uint8_t split_transform_flag [ 3]; + uint8_t cbf_luma [ 2]; + uint8_t cbf_cb_cr [ 4]; + uint8_t transform_skip_flag/*[][]*/ [ 2]; + uint8_t explicit_rdpcm_flag/*[][]*/ [ 2]; + uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2]; + uint8_t last_significant_coeff_x_prefix [18]; + uint8_t last_significant_coeff_y_prefix [18]; + uint8_t significant_coeff_group_flag [ 4]; + uint8_t significant_coeff_flag [44]; + uint8_t coeff_abs_level_greater1_flag [24]; + uint8_t coeff_abs_level_greater2_flag [ 6]; + uint8_t log2_res_scale_abs [ 8]; + uint8_t res_scale_sign_flag [ 2]; + uint8_t cu_chroma_qp_offset_flag [ 1]; + uint8_t cu_chroma_qp_offset_idx [ 1]; +} __attribute__((packed)); + +////////////////////////////////////////////////////////////////////////////// + +struct RPI_PROB { + uint8_t SAO_MERGE_FLAG [ 1]; + uint8_t SAO_TYPE_IDX [ 1]; + uint8_t SPLIT_FLAG [ 3]; + uint8_t CU_SKIP_FLAG [ 3]; + uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1]; + uint8_t PRED_MODE [ 1]; + uint8_t PART_SIZE [ 4]; + uint8_t INTRA_PRED_MODE [ 1]; + uint8_t CHROMA_PRED_MODE [ 1]; + uint8_t MERGE_FLAG_EXT [ 1]; + uint8_t MERGE_IDX_EXT [ 1]; 
+ uint8_t INTER_DIR [ 5]; + uint8_t REF_PIC [ 2]; + uint8_t MVP_IDX [ 1]; + uint8_t MVD [ 2]; + uint8_t QT_ROOT_CBF [ 1]; + uint8_t TRANS_SUBDIV_FLAG [ 3]; + uint8_t QT_CBF [ 6]; + uint8_t DQP [ 2]; + uint8_t ONE_FLAG [24]; + uint8_t LASTX [18]; + uint8_t LASTY [18]; + uint8_t SIG_CG_FLAG [ 4]; + uint8_t ABS_FLAG [ 6]; + uint8_t TRANSFORMSKIP_FLAG [ 2]; + uint8_t SIG_FLAG [42]; + uint8_t SIG_FLAG_unused [ 2]; +} __attribute__((packed)); + +////////////////////////////////////////////////////////////////////////////// + +struct RPI_CMD { + uint32_t addr; + uint32_t data; +} __attribute__((packed)); + +struct RPI_BIT { + int cmd; + const void *ptr; + int len; +}; + +////////////////////////////////////////////////////////////////////////////// + +struct RPI_T; + +// Actual addressability is 38bits but we can only alloc in the bottom 32 +// currently - when passed to rpivid h/w the address is always >> 6 so will +// fit in 32 bit there +// At some point we may weant to make this uint64_t +typedef uint32_t vid_vc_addr_t; + +typedef enum rpivid_decode_state_e { + RPIVID_DECODE_NEW = 0, + RPIVID_DECODE_START, + RPIVID_DECODE_SLICE, + RPIVID_DECODE_END, +} rpivid_decode_state_t; + +#define RPI_PROB_VALS 154U +#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3) + +typedef struct dec_env_s { + const AVCodecContext * avctx; + + rpivid_decode_state_t state; + unsigned int decode_order; + + int phase_no; // Current phase (i.e. 
the last one we waited for) + struct dec_env_s * phase_wait_q_next; + sem_t phase_wait; + + struct RPI_BIT *bit_fifo; + struct RPI_CMD *cmd_fifo; + unsigned int bit_len, bit_max; + unsigned int cmd_len, cmd_max; + unsigned int num_slice_msgs; + unsigned int PicWidthInCtbsY; + unsigned int PicHeightInCtbsY; + unsigned int dpbno_col; + uint32_t reg_slicestart; + unsigned int wpp_entry_x; + unsigned int wpp_entry_y; + + const uint8_t * nal_buffer; + size_t nal_size; + + uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3]; + uint8_t scaling_factors[NUM_SCALING_FACTORS]; +// unsigned int RefPicList[2][HEVC_MAX_REFS]; +} dec_env_t; + +#define RPIVID_PHASES 3 +#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order +#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order + +#if OPT_PHASE_TIMING +static const unsigned int time_thresholds[8] = { + 10, 15, 20, 30, 45, 60, 75, 90 +}; +#endif + +typedef struct phase_wait_env_s { + unsigned int last_order; + dec_env_t * q; +#if OPT_PHASE_TIMING + uint64_t phase_time; + uint64_t max_phase_time; + uint64_t time_in_phase; + uint64_t time_out_phase; + unsigned int max_time_decode_order; + unsigned int time_bins[9]; + unsigned int time_bins3[9]; + unsigned int time_bins5[9]; + uint64_t time_stash[16]; + unsigned int i3; +#endif +} phase_wait_env_t; // Single linked list of threads waiting for this phase + +typedef struct RPI_T { + atomic_int ref_count; + sem_t ref_zero; + + dec_env_t ** dec_envs; + AVZcEnvPtr zc; + + pthread_mutex_t phase_lock; + phase_wait_env_t phase_reqs[RPIVID_PHASES]; + + volatile uint32_t * regs; + volatile uint32_t * ints; + + GPU_MEM_PTR_T gcolbuf; + unsigned int col_stride; + size_t col_picsize; + + unsigned int bitbuf_no; + sem_t bitbuf_sem; + GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS]; + + unsigned int max_pu_msgs; + unsigned int coeffbuf_no; + sem_t coeffbuf_sem; + GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS]; + + unsigned int decode_order; + int mbox_fd; + int gpu_init_type; 
+} RPI_T; + +#if OPT_PHASE_TIMING +static uint64_t tus64(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; +} +#endif + +static inline unsigned int rnd64(unsigned int x) +{ + return (x + 63) & ~63; +} + +static inline int rpi_sem_wait(sem_t * const sem) +{ + int rv; + while ((rv = sem_wait(sem)) != 0 && errno == EINTR) + /* Loop */; + return rv; +} + +//============================================================================ + +#define REGS_NAME "/dev/rpivid-hevcmem" +#define REGS_SIZE 0x10000 +#define INTS_NAME "/dev/rpivid-intcmem" +#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway + +// Map the first size bytes of the device node dev_name read/write & shared +// Returns the mapping, or NULL (after logging a warning) if either the open +// or the mmap fails +static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size) +{ + void *gpio_map; + int mem_fd; + + /* open the device node */ + if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) { + av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name); + return NULL; + } + + // Now map it + gpio_map = mmap( + NULL, + size, + PROT_READ|PROT_WRITE, + MAP_SHARED, + mem_fd, + 0 + ); + + close(mem_fd); // No longer need the FD + + if (gpio_map == MAP_FAILED) { + av_log(avctx, AV_LOG_WARNING, "can't map %s\n", dev_name); // newline terminated like the open failure above + return NULL; + } + + return (volatile uint32_t *)gpio_map; +} + +static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size) +{ + volatile uint32_t * const gpio_map = *p_gpio_map; + if (gpio_map != NULL) { + *p_gpio_map = NULL; + munmap((void *)gpio_map, size); + } +} + +#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing! 
+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6) + +static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data) +{ +#if TRACE_DEV + printf("W %x %08x\n", addr, MANGLE64(data)); +#endif + + rpi->regs[addr >> 2] = MANGLE64(data); +} + +static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data) +{ +#if TRACE_DEV + printf("W %x %08x\n", addr, data >> 6); +#endif + + rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed +} + +static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data) +{ +#if TRACE_DEV + printf("W %x %08x\n", addr, data); +#endif + + rpi->regs[addr >> 2] = data; +} + +static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr) +{ + const uint32_t v = rpi->regs[addr >> 2]; +#if TRACE_DEV + printf("R %x (=%x)\n", addr, v); +#endif + return v; +} + +#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001 +#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002 +#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004 +#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008 +#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010 +#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020 +#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040 +#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080 + +static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) +{ + const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; + const uint32_t mask_done = phase == 1 ? 
ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET; + uint32_t ival; + while (((ival = rpi->ints[0]) & mask_done) == 0) { + usleep(1000); + } + rpi->ints[0] = ival & mask_reset; +} + +#if TRACE_DEV && 0 +static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) { + int i; + + for (i=0; iregs[(addr>>2)+i]); + + if ((i%4)==3 || i+1 == num) + printf("\n"); + else + printf(" "); + } +} + +static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) { + int i; + + for (i=0; i>2; i++) + { + if ((i%4)==0) + printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i); + + printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]); + + if ((i%4)==3 || i+1 == size>>2) + printf("\n"); + else + printf(" "); + } +} +#endif + +////////////////////////////////////////////////////////////////////////////// + +static inline size_t round_up_size(const size_t x) +{ + /* Admit no size < 256 */ + const unsigned int n = x < 256 ? 8 : av_log2(x) - 1; + + return x >= (3 << n) ? 
4 << n : (3 << n); +} + +////////////////////////////////////////////////////////////////////////////// +// Scaling factors + +static void expand_scaling_list( + const unsigned int sizeID, + const unsigned int matrixID, + uint8_t * const dst0, + const uint8_t * const src0, + uint8_t dc) +{ + switch (sizeID) { + case 0: + memcpy(dst0, src0, 16); + break; + case 1: + memcpy(dst0, src0, 64); + break; + case 2: + { + uint8_t * d = dst0; + for (unsigned int y=0; y != 16; y++) { + const uint8_t * s = src0 + (y >> 1) * 8; + for (unsigned int x = 0; x != 8; ++x) { + *d++ = *s; + *d++ = *s++; + } + } + dst0[0] = dc; + break; + } + default: + { + uint8_t * d = dst0; + for (unsigned int y=0; y != 32; y++) { + const uint8_t * s = src0 + (y >> 2) * 8; + for (unsigned int x = 0; x != 8; ++x) { + *d++ = *s; + *d++ = *s; + *d++ = *s; + *d++ = *s++; + } + } + dst0[0] = dc; + break; + } + } +} + +static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) { + // Array of constants for scaling factors + static const uint32_t scaling_factor_offsets[4][6] = { + // MID0 MID1 MID2 MID3 MID4 MID5 + {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4) + {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8) + {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16) + {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32) + + // ffmpeg places SID3,MID1 where matrixID 3 normally is + const ScalingList * const sl = + s->ps.pps->scaling_list_data_present_flag ? 
&s->ps.pps->scaling_list + : &s->ps.sps->scaling_list; + unsigned int mid; + + for (mid=0; mid<6; mid++) + expand_scaling_list(0, mid, + de->scaling_factors + scaling_factor_offsets[0][mid], + sl->sl[0][mid], 0); + for (mid=0; mid<6; mid++) + expand_scaling_list(1, mid, + de->scaling_factors + scaling_factor_offsets[1][mid], + sl->sl[1][mid], 0); + for (mid=0; mid<6; mid++) + expand_scaling_list(2, mid, + de->scaling_factors + scaling_factor_offsets[2][mid], + sl->sl[2][mid], + sl->sl_dc[0][mid]); + // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg + for (mid=0; mid<6; mid += 3) + expand_scaling_list(3, mid, + de->scaling_factors + scaling_factor_offsets[3][mid], + sl->sl[3][mid], + sl->sl_dc[1][mid]); +} + +////////////////////////////////////////////////////////////////////////////// +// Probabilities + +static const uint8_t prob_init[3][156] = { + { + 153, 200, 139, 141, 157, 154, 154, 154, + 154, 154, 184, 154, 154, 154, 184, 63, + 154, 154, 154, 154, 154, 154, 154, 154, + 154, 154, 154, 154, 154, 153, 138, 138, + 111, 141, 94, 138, 182, 154, 154, 154, + 140, 92, 137, 138, 140, 152, 138, 139, + 153, 74, 149, 92, 139, 107, 122, 152, + 140, 179, 166, 182, 140, 227, 122, 197, + 110, 110, 124, 125, 140, 153, 125, 127, + 140, 109, 111, 143, 127, 111, 79, 108, + 123, 63, 110, 110, 124, 125, 140, 153, + 125, 127, 140, 109, 111, 143, 127, 111, + 79, 108, 123, 63, 91, 171, 134, 141, + 138, 153, 136, 167, 152, 152, 139, 139, + 111, 111, 125, 110, 110, 94, 124, 108, + 124, 107, 125, 141, 179, 153, 125, 107, + 125, 141, 179, 153, 125, 107, 125, 141, + 179, 153, 125, 140, 139, 182, 182, 152, + 136, 152, 136, 153, 136, 139, 111, 136, + 139, 111, 0, 0, }, + { + 153, 185, 107, 139, 126, 197, 185, 201, + 154, 149, 154, 139, 154, 154, 154, 152, + 110, 122, 95, 79, 63, 31, 31, 153, + 153, 168, 140, 198, 79, 124, 138, 94, + 153, 111, 149, 107, 167, 154, 154, 154, + 154, 196, 196, 167, 154, 152, 167, 182, + 182, 134, 149, 136, 153, 121, 136, 137, + 169, 194, 166, 
167, 154, 167, 137, 182, + 125, 110, 94, 110, 95, 79, 125, 111, + 110, 78, 110, 111, 111, 95, 94, 108, + 123, 108, 125, 110, 94, 110, 95, 79, + 125, 111, 110, 78, 110, 111, 111, 95, + 94, 108, 123, 108, 121, 140, 61, 154, + 107, 167, 91, 122, 107, 167, 139, 139, + 155, 154, 139, 153, 139, 123, 123, 63, + 153, 166, 183, 140, 136, 153, 154, 166, + 183, 140, 136, 153, 154, 166, 183, 140, + 136, 153, 154, 170, 153, 123, 123, 107, + 121, 107, 121, 167, 151, 183, 140, 151, + 183, 140, 0, 0, }, + { + 153, 160, 107, 139, 126, 197, 185, 201, + 154, 134, 154, 139, 154, 154, 183, 152, + 154, 137, 95, 79, 63, 31, 31, 153, + 153, 168, 169, 198, 79, 224, 167, 122, + 153, 111, 149, 92, 167, 154, 154, 154, + 154, 196, 167, 167, 154, 152, 167, 182, + 182, 134, 149, 136, 153, 121, 136, 122, + 169, 208, 166, 167, 154, 152, 167, 182, + 125, 110, 124, 110, 95, 94, 125, 111, + 111, 79, 125, 126, 111, 111, 79, 108, + 123, 93, 125, 110, 124, 110, 95, 94, + 125, 111, 111, 79, 125, 126, 111, 111, + 79, 108, 123, 93, 121, 140, 61, 154, + 107, 167, 91, 107, 107, 167, 139, 139, + 170, 154, 139, 153, 139, 123, 123, 63, + 124, 166, 183, 140, 136, 153, 154, 166, + 183, 140, 136, 153, 154, 166, 183, 140, + 136, 153, 154, 170, 153, 138, 138, 122, + 121, 122, 121, 167, 151, 183, 140, 151, + 183, 140, 0, 0, }, +}; + + +////////////////////////////////////////////////////////////////////////////// +// Phase 1 command and bit FIFOs + +// ???? 
uint16_t addr - put in uint32_t +static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) { + if (de->cmd_len==de->cmd_max) + av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD))); + +#if TRACE_DEV + printf("[%02x] %x %x\n", de->cmd_len, addr, data); +#endif + + de->cmd_fifo[de->cmd_len].addr = addr; + de->cmd_fifo[de->cmd_len].data = data; + return de->cmd_len++; +} + +static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) { + if (de->bit_len==de->bit_max) + av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT))); + de->bit_fifo[de->bit_len].cmd = cmd_idx; + de->bit_fifo[de->bit_len].ptr = ptr; + de->bit_fifo[de->bit_len].len = len; + de->bit_len++; +} + +////////////////////////////////////////////////////////////////////////////// +// Write probability and scaling factor memories + +#if 0 +static void WriteProb(dec_env_t * const de) { + int i; + const uint8_t *p = (uint8_t *) &de->probabilities; + for (i=0; ish.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ? 
+ s->sh.slice_type + 1 : 2 - s->sh.slice_type; + const uint8_t * p = prob_init[init_type]; + const int q = av_clip(s->sh.slice_qp, 0, 51); + unsigned int i; + + for (i = 0; i < RPI_PROB_VALS; i++) { + int init_value = p[i]; + int m = (init_value >> 4) * 5 - 45; + int n = ((init_value & 15) << 3) - 16; + int pre = 2 * (((m * q) >> 4) + n) - 127; + + pre ^= pre >> 31; + if (pre > 124) + pre = 124 + (pre & 1); + dst[i] = pre; + } + for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) { + dst[i] = 0; + } + + for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4) + p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24)); + +} + + +static void WriteScalingFactors(dec_env_t * const de) { + int i; + const uint8_t *p = (uint8_t *) de->scaling_factors; + for (i=0; i= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c + return i-1; +} + +static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) { + if (ctb < bd[num-1]) return ctb_size; + else if (width % ctb_size) return width % ctb_size; + else return ctb_size; +} + +////////////////////////////////////////////////////////////////////////////// +// Handle PU and COEFF stream overflow + + +// Returns: +// -2 Other error +// -1 Out of coeff space +// 0 OK +// 1 Out of PU space + +static int check_status(const RPI_T * const rpi, dec_env_t * const de) { + uint32_t status; + + // this is the definition of successful completion of phase 1 + // it assures that status register is zero and all blocks in each tile have completed + if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM)) + return 0; + + status = apb_read(rpi, RPI_STATUS); + + if ((status & 8) != 0) + return -1; + + if ((status & 0x10) != 0) + return 1; + + return -2; +} + +////////////////////////////////////////////////////////////////////////////// +// Write STATUS register with expected end CTU address of previous slice + +static void end_previous_slice(dec_env_t * const de, const HEVCContext 
* const s, const int ctb_addr_ts) { + const HEVCPPS * const pps = s->ps.pps; + int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; + int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; + p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); +} + +static void wpp_pause(dec_env_t * const de, int ctb_row) { + p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25); + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); + p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000); + p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2); +} + +static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { + const HEVCPPS *pps = s->ps.pps; + int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; + int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; + int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; + int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; + if (de->wpp_entry_x<2 && (de->wpp_entry_y2) && de->PicWidthInCtbsY>2) + wpp_pause(de, last_y); + p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); + if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_yps.sps; + const HEVCPPS *pps = s->ps.pps; + + p1_apb_write(de, RPI_SPS0, + (sps->log2_min_cb_size << 0) + + (sps->log2_ctb_size << 4) + + (sps->log2_min_tb_size << 8) + + (sps->log2_max_trafo_size << 12) + + (sps->bit_depth << 16) + + (sps->bit_depth << 20) + + (sps->max_transform_hierarchy_depth_intra << 24) + + (sps->max_transform_hierarchy_depth_inter << 28)); + + p1_apb_write(de, RPI_SPS1, + (sps->pcm.bit_depth << 0) + + (sps->pcm.bit_depth_chroma << 4) + + (sps->pcm.log2_min_pcm_cb_size << 8) + + (sps->pcm.log2_max_pcm_cb_size << 12) + + (sps->separate_colour_plane_flag? 
0:sps->chroma_format_idc << 16) + + (sps->amp_enabled_flag << 18) + + (sps->pcm_enabled_flag << 19) + + (sps->scaling_list_enable_flag << 20) + + (sps->sps_strong_intra_smoothing_enable_flag << 21)); + + p1_apb_write(de, RPI_PPS, + (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) + + (pps->cu_qp_delta_enabled_flag << 4) + + (pps->transquant_bypass_enable_flag << 5) + + (pps->transform_skip_enabled_flag << 6) + + (pps->sign_data_hiding_flag << 7) + + (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) + + (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) + + (pps->constrained_intra_pred_flag << 24)); + + if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de); + + if (!s->sh.dependent_slice_segment_flag) { + int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; + int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; + de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16); + } + + p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart); +} + +////////////////////////////////////////////////////////////////////////////// + +static void write_slice(dec_env_t * const de, const HEVCContext * const s, + const unsigned int slice_w, const unsigned int slice_h) { + uint32_t u32 = + (s->sh.slice_type << 12) + + (s->sh.slice_sample_adaptive_offset_flag[0] << 14) + + (s->sh.slice_sample_adaptive_offset_flag[1] << 15) + + (slice_w << 17) + + (slice_h << 24); + + if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |= + (s->sh.max_num_merge_cand << 0) + + (s->sh.nb_refs[L0] << 4) + + (s->sh.nb_refs[L1] << 8); + + if (s->sh.slice_type==HEVC_SLICE_B) + u32 |= s->sh.mvd_l1_zero_flag<<16; + p1_apb_write(de, RPI_SLICE, u32); +} + +////////////////////////////////////////////////////////////////////////////// +// Wavefront mode + +static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s, + const int do_bte, const int resetQPY, const int ctb_addr_ts) { + const HEVCSPS * const sps = s->ps.sps; + const 
HEVCPPS * const pps = s->ps.pps; + + int ctb_size = 1<log2_ctb_size; + int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + + int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY; + int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY; + + int endx = de->PicWidthInCtbsY-1; + int endy = ctb_row; + + uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); + uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); + + p1_apb_write(de, RPI_TILESTART, 0); + p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); + + if (do_bte) + p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); + + write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size); + + if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); + + p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001); + p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); +} + +////////////////////////////////////////////////////////////////////////////// +// Tiles mode + +static void new_entry_point(dec_env_t * const de, const HEVCContext * const s, + const int do_bte, const int resetQPY, const int ctb_addr_ts) { + const HEVCSPS * const sps = s->ps.sps; + const HEVCPPS * const pps = s->ps.pps; + + int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY; + int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY; + + int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); + int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); + + int endx = pps->col_bd[tile_x+1] - 1; + int endy = pps->row_bd[tile_y+1] - 1; + + uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); + uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); + + p1_apb_write(de, RPI_TILESTART, 
pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16)); + p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); + + if (do_bte) + p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); + + write_slice(de, s, slice_w, slice_h); + + if (resetQPY) + p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); + + p1_apb_write(de, RPI_MODE, (0xFFFF << 0) + + (0x0 << 16) + + ((tile_x==pps->num_tile_columns-1) << 17) + + ((tile_y==pps->num_tile_rows-1) << 18)); + + p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); +} + +////////////////////////////////////////////////////////////////////////////// + +// Doesn't attempt to remove from context as we should only do this at the end +// of time or on create error +static void +dec_env_delete(dec_env_t * const de) +{ +// gpu_free(&de->gbuf); + + av_freep(&de->cmd_fifo); + av_freep(&de->bit_fifo); + + sem_destroy(&de->phase_wait); + av_free(de); +} + +static dec_env_t * +dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi) +{ + dec_env_t * const de = av_mallocz(sizeof(*de)); + int i; + + if (de == NULL) + return NULL; + + de->avctx = avctx; + de->phase_no = RPIVID_PHASE_NEW; + + sem_init(&de->phase_wait, 0, 0); + + if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL) + goto fail; + + if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL) + goto fail; + + pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this + for (i = 0; i != avctx->thread_count; ++i) { + if (rpi->dec_envs[i] == NULL) + { + rpi->dec_envs[i] = de; + break; + } + } + pthread_mutex_unlock(&rpi->phase_lock); + + if (i == avctx->thread_count) { + av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n"); + goto fail; + } + + return de; + +fail: + dec_env_delete(de); + return NULL; +} + + +static dec_env_t * +dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi) +{ + dec_env_t * de = NULL; + const int ref_count = atomic_fetch_add(&rpi->ref_count, 
1); + + if (ref_count <= 0) { + // Already dead + av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");; + return NULL; + } + + for (int i = 0; i != avctx->thread_count; ++i) { + if (rpi->dec_envs[i] == NULL) + { + de = dec_env_new(avctx, rpi); + break; + } + if (rpi->dec_envs[i]->avctx == avctx) + { + de = rpi->dec_envs[i]; + break; + } + } + return de; +} + +// Call at end of fn +// Used to ensure we aren't in a worker thead when killed +static void +dec_env_release(RPI_T * const rpi, dec_env_t * const de) +{ + const int n = atomic_fetch_sub(&rpi->ref_count, 1); + if (n == 1) { + sem_post(&rpi->ref_zero); + } +} + +//---------------------------------------------------------------------------- + +// Wait for a slot in the given phase +// Any error return is probably fatal +static int +wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) +{ + int needs_wait = 0; + phase_wait_env_t *const p = rpi->phase_reqs + phase_no; + + pthread_mutex_lock(&rpi->phase_lock); + if (p->last_order + 1 != de->decode_order) { + de->phase_wait_q_next = p->q; + p->q = de; + needs_wait = 1; + } + pthread_mutex_unlock(&rpi->phase_lock); + + if (needs_wait) { + while (sem_wait(&de->phase_wait) == -1) + { + int err; + if ((err = errno) != EINTR) + return AVERROR(err); + } + } + + de->phase_no = phase_no; + return 0; +} + +static void +post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) +{ + dec_env_t * next_de = NULL; + phase_wait_env_t *const p = rpi->phase_reqs + phase_no; + dec_env_t ** q = &p->q; + + pthread_mutex_lock(&rpi->phase_lock); + + p->last_order = de->decode_order; + while (*q != NULL) { + dec_env_t * const t_de = *q; + + if (t_de->decode_order == p->last_order + 1) { + // This is us - remove from Q + *q = t_de->phase_wait_q_next; + t_de->phase_wait_q_next = NULL; // Tidy + next_de = t_de; + break; + } + q = &t_de->phase_wait_q_next; + } + + pthread_mutex_unlock(&rpi->phase_lock); + + if (next_de != NULL) + 
sem_post(&next_de->phase_wait); +} + +// Wait & signal stuff s.t. threads in other phases can continue +static void +abort_phases(RPI_T * const rpi, dec_env_t * const de) +{ + for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) { + wait_phase(rpi, de, i); + post_phase(rpi, de, i); + } + de->phase_no = RPIVID_PHASE_NEW; +} + +// Start timing for phase +// Stats only - no actual effect +static inline void tstart_phase(RPI_T * const rpi, const int phase_no) +{ +#if OPT_PHASE_TIMING + phase_wait_env_t *const p = rpi->phase_reqs + phase_no; + const int64_t now = tus64(); + if (p->phase_time != 0) + p->time_out_phase += now - p->phase_time; + p->phase_time = now; +#endif +} + +#if OPT_PHASE_TIMING +static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n) +{ + uint64_t tsum = 0; + unsigned int i; + for (i = 0; i != avg_n; ++i) + tsum += p->time_stash[(p->i3 - i) & 15]; + for (i = 0; i != 9; ++i) { + if (time_thresholds[i] * 1000 * avg_n > tsum) + break; + } + return i; +} +#endif + +// End timing for phase +// Stats only - no actual effect +static inline void tend_phase(RPI_T * const rpi, const int phase_no) +{ +#if OPT_PHASE_TIMING + phase_wait_env_t *const p = rpi->phase_reqs + phase_no; + const uint64_t now = tus64(); + const uint64_t in_time = now - p->phase_time; + + p->time_in_phase += in_time; + p->phase_time = now; + p->time_stash[p->i3] = in_time; + if (in_time > p->max_phase_time) { + p->max_phase_time = in_time; + p->max_time_decode_order = p->last_order; + } + ++p->time_bins[tavg_bin_phase(p, 1)]; + ++p->time_bins3[tavg_bin_phase(p, 3)]; + ++p->time_bins5[tavg_bin_phase(p, 5)]; + + p->i3 = (p->i3 + 1) & 15; +#endif +} + +////////////////////////////////////////////////////////////////////////////// +// Start frame + +static int rpi_hevc_start_frame( + AVCodecContext * avctx, + const uint8_t *buffer, + uint32_t size) { + + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + dec_env_t * const de = dec_env_get(avctx, 
rpi); + const HEVCContext * const s = avctx->priv_data; + const HEVCSPS * const sps = s->ps.sps; + const unsigned int CtbSizeY = 1U << sps->log2_ctb_size; + +#if TRACE_ENTRY + printf("<<< %s[%p]\n", __func__, de); +#endif + + if (de == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); + return -1; + } + + de->phase_no = RPIVID_PHASE_START; + de->decode_order = ++rpi->decode_order; // *** atomic? + + ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame + + if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) { + av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); + return -1; + } + de->state = RPIVID_DECODE_START; + + de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15 + de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17 + de->bit_len = 0; + de->cmd_len = 0; + +#if TRACE_ENTRY + printf(">>> %s[%p]\n", __func__, de); +#endif + + dec_env_release(rpi, de); + return 0; +} + +////////////////////////////////////////////////////////////////////////////// +// Slice messages + +static void msg_slice(dec_env_t * const de, const uint16_t msg) { + de->slice_msgs[de->num_slice_msgs++] = msg; +} + +static void program_slicecmds(dec_env_t * const de, const int sliceid) { + int i; + p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8)); + for(i=0; i < de->num_slice_msgs; i++) { + p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff); + } +} + +static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { + const HEVCSPS * const sps = s->ps.sps; + const HEVCPPS * const pps = s->ps.pps; + const SliceHeader *sh = &s->sh; + + int weightedPredFlag, i, rIdx; + uint16_t cmd_slice; + unsigned int collocated_from_l0_flag; + + de->num_slice_msgs=0; + de->dpbno_col = 0; + cmd_slice = 0; + if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1; + if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2; + if 
(sh->slice_type==HEVC_SLICE_B) cmd_slice = 3; + + if (sh->slice_type!=HEVC_SLICE_I) { + cmd_slice += sh->nb_refs[L0]<<2; + cmd_slice += sh->nb_refs[L1]<<6; + } + + if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) + cmd_slice |= sh->max_num_merge_cand<<11; + + collocated_from_l0_flag = + !sh->slice_temporal_mvp_enabled_flag ? + 0 : + sh->slice_type == HEVC_SLICE_B ? + (sh->collocated_list == L0) : + (sh->slice_type==HEVC_SLICE_P); + cmd_slice |= collocated_from_l0_flag<<14; + + if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) { + + int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past + for(i=L0; i<=L1; i++) { + for(rIdx=0; rIdx nb_refs[i]; rIdx++) { + HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; + HEVCFrame *c = s->ref; // CurrentPicture + if (c->poc < f->poc) NoBackwardPredFlag = 0; + } + } + + if (sps->sps_temporal_mvp_enabled_flag) + { + const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ? + s->ref->refPicList + 0 : + s->ref->refPicList + 1; + de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; + } + + cmd_slice += NoBackwardPredFlag<<10; + msg_slice(de, cmd_slice); + + // Write reference picture descriptions + weightedPredFlag = sh->slice_type==HEVC_SLICE_P? 
pps->weighted_pred_flag : pps->weighted_bipred_flag; + + for(i=L0; i<=L1; i++) + for(rIdx=0; rIdx nb_refs[i]; rIdx++) { + HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; + HEVCFrame *c = s->ref; // CurrentPicture + int pic = f - s->DPB; + // Make sure pictures are in range 0 to 15 + int adjusted_pic = fref->refPicList[i].isLongTerm[rIdx]; + msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); + msg_slice(de, f->poc); + if (weightedPredFlag) { + msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3)); + msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff); + msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3)); + msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff); + msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3)); + msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff); + } + } + } + else + msg_slice(de, cmd_slice); + + msg_slice(de, ((sh->beta_offset/2)&15) + + (((sh->tc_offset/2)&15) << 4) + + (sh->disable_deblocking_filter_flag << 8) + + (sh->slice_loop_filter_across_slices_enabled_flag << 9) + + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK + + msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF +} + + +////////////////////////////////////////////////////////////////////////////// + +static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + dec_env_t * const de = dec_env_get(avctx, rpi); + +#if TRACE_ENTRY + printf("<<< %s[%p]\n", __func__, de); +#endif + + if (de == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); + return; + } + + switch (de->state) { + case RPIVID_DECODE_NEW: + case 
RPIVID_DECODE_END: + // Expected transition + break; + + case RPIVID_DECODE_SLICE: + // Error transition + av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n"); + break; + + case RPIVID_DECODE_START: + default: + av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); + break; + } + + abort_phases(rpi, de); + de->state = RPIVID_DECODE_NEW; + + dec_env_release(rpi, de); +} + +////////////////////////////////////////////////////////////////////////////// +// End frame + +static int rpi_hevc_end_frame(AVCodecContext * const avctx) { + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + const HEVCContext * const s = avctx->priv_data; + const HEVCPPS * const pps = s->ps.pps; + const HEVCSPS * const sps = s->ps.sps; + dec_env_t * const de = dec_env_get(avctx, rpi); + AVFrame * const f = s->ref->frame; + const unsigned int dpbno_cur = s->ref - s->DPB; + vid_vc_addr_t cmds_vc; + vid_vc_addr_t pu_base_vc; + unsigned int pu_stride; + vid_vc_addr_t coeff_base_vc; + unsigned int coeff_stride; + unsigned int i; + int rv = 0; + int status = 0; + int coeffbuf_sem_claimed = 0; + +#if TRACE_ENTRY + fprintf("<<< %s[%p]\n", __func__, de); +#endif + + if (de == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); + return AVERROR_BUG; // Should never happen + } + + if (de->state != RPIVID_DECODE_SLICE) { + av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); + rv = AVERROR_UNKNOWN; + goto fail; + } + de->state = RPIVID_DECODE_END; + + // End of command compilation + { + const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; + const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; + if (pps->entropy_coding_sync_enabled_flag) { + if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) + wpp_pause(de, last_y); + } + p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); + } + + // Phase 0 --------------------------------------------------------------- + + 
wait_phase(rpi, de, 0); + rpi_sem_wait(&rpi->bitbuf_sem); + tstart_phase(rpi, 0); + + // Copy cmds & bits into gpu side buffer + // Layout: CMDS, BITS + { + uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; + vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; + unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); + + uint8_t * p = armbase + rnd64(cmd_bytes); + uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; + + cmds_vc = vcbase; + + // Copy all the bits & update bitstream cmds to point at the right bits + for (i = 0; i < de->bit_len; ++i) + { + const unsigned int seg_len = de->bit_fifo[i].len; + + if (p + seg_len > eobits) { + status = -1; + break; + } + + memcpy(p, de->bit_fifo[i].ptr, seg_len); + de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); + + p += rnd64(seg_len); + } + + memcpy(armbase, de->cmd_fifo, cmd_bytes); + } + + if (status == 0) + { + if (++rpi->bitbuf_no >= RPIVID_BITBUFS) + rpi->bitbuf_no = 0; + } + else + { + sem_post(&rpi->bitbuf_sem); + av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); + rv = AVERROR_BUFFER_TOO_SMALL; + } + + tend_phase(rpi, 0); + post_phase(rpi, de, 0); + + if (status < 0) + goto fail; + + // Phase 1 --------------------------------------------------------------- + + wait_phase(rpi, de, 1); + rpi_sem_wait(&rpi->coeffbuf_sem); + coeffbuf_sem_claimed = 1; + tstart_phase(rpi, 1); + + status = 0; + for (;;) + { + // (Re-)allocate PU/COEFF stream space + const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; + unsigned int pu_size; + + pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; + pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); + pu_size = pu_stride * de->PicHeightInCtbsY; + + if (pu_size >= total_size || status == -1) { + GPU_MEM_PTR_T newbuf; + + if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0) + { + av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n"); + status = -1; 
+ break; + } + gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no); + rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf; + status = 0; + continue; + } + + // Allocate all remaining space to coeff + coeff_base_vc = pu_base_vc + pu_size; + coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 + + apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); + apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); + apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); + apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); + + // Trigger command FIFO + apb_write(rpi, RPI_CFNUM, de->cmd_len); +#if TRACE_DEV && 0 + apb_dump_regs(rpi, 0x0, 32); + apb_dump_regs(rpi, 0x8000, 24); + axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); +#endif + apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); + + int_wait(rpi, 1); + + status = check_status(rpi, de); + + if (status == -1) + continue; + else if (status != 1) + break; + + // Status 1 means out of PU space so try again with more + // If we ran out of Coeff space then we are out of memory - we could possibly realloc? 
+ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; + } + + // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we + // may reuse a live buffer when we kick the coeff sem + if (status == 0) + { + if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) + rpi->coeffbuf_no = 0; + } + else + { + if (status == -1) + { + av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); + rv = AVERROR_BUFFER_TOO_SMALL; + } + else + { + av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); + rv = AVERROR_INVALIDDATA; + } + } + + tend_phase(rpi, 1); + sem_post(&rpi->bitbuf_sem); + post_phase(rpi, de, 1); + + if (status != 0) + goto fail; + + // Phase 2 --------------------------------------------------------------- + + wait_phase(rpi, de, 2); + + if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) + { + // As we are in phase 2 already here we don't need to worry about + // ceoffbuf_no despite the early exit + post_phase(rpi, de, 2); + av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n"); + goto fail; + } + + tstart_phase(rpi, 2); + + apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); + apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); + apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); + apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); + + apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); + apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); + apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); + apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); + + // Keep the last thing we resolved as fallback for any ref we fail to + // resolve. As a final fallback use our current frame. The pels might + // not be there yet but at least the memory is valid. 
+ // + // Attempt to resolve the entire DPB - we could note what we have used + // in ref lists but probably simpler and more reliable to set the whole thing + { + AVFrame * fallback_frame = f; + for (i = 0; i != 16; ++i) { + // Avoid current frame + const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i; + AVFrame * fr = hevc_fr->frame; + + if (fr != NULL && + av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) + { + fallback_frame = fr; + } + else + { + fr = fallback_frame; + } + + apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); + apb_write(rpi, 0x9004+16*i, 0); + apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); + apb_write(rpi, 0x900C+16*i, 0); + } + } + + apb_write(rpi, RPI_CONFIG2, + (sps->bit_depth << 0) // BitDepthY + + (sps->bit_depth << 4) // BitDepthC + + ((sps->bit_depth>8) << 8) // BitDepthY + + ((sps->bit_depth>8) << 9) // BitDepthC + + (sps->log2_ctb_size <<10) + + (pps->constrained_intra_pred_flag <<13) + + (sps->sps_strong_intra_smoothing_enable_flag<<14) + + (sps->sps_temporal_mvp_enabled_flag <<15) + + (pps->log2_parallel_merge_level <<16) + + (s->sh.slice_temporal_mvp_enabled_flag <<19) + + (sps->pcm.loop_filter_disable_flag <<20) + + ((pps->cb_qp_offset&31) <<21) + + ((pps->cr_qp_offset&31) <<26)); + + apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); + apb_write(rpi, RPI_CURRPOC, s->poc); + + // collocated reads/writes + if (sps->sps_temporal_mvp_enabled_flag) { + av_assert0(de->dpbno_col < RPIVID_COL_PICS); + av_assert0(dpbno_cur < RPIVID_COL_PICS); + + apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); + apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); + apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); + apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); + } + +#if TRACE_DEV && 0 + apb_dump_regs(rpi, 0x0, 32); + apb_dump_regs(rpi, 0x8000, 24); +#endif + + apb_write(rpi, RPI_NUMROWS, 
#if TRACE_DEV
// Debug helper: hex-dump len bytes starting at p to stdout, 16 bytes per row.
//
// Each row starts with the (hex) offset of its first byte. Bytes are
// separated by a space, except that a '-' precedes the 9th byte of a row to
// split it visually into two groups of eight.
static void dump_data(const uint8_t * p, size_t len)
{
    size_t i;
    for (i = 0; i < len; i += 16) {
        size_t j;
        // i is a size_t so needs the 'z' length modifier
        printf("%04zx", i);
        // Stop at len so a short final row never reads past the buffer
        for (j = 0; j != 16 && i + j < len; ++j) {
            // The separator depends on the column (j), not the row offset (i)
            printf("%c%02x", j == 8 ? '-' : ' ', p[i+j]);
        }
        printf("\n");
    }
}
#endif
resetQPY, ctb_addr_ts); + for (i=0; ish.num_entry_point_offsets; i++) { + int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; + int last_x = de->PicWidthInCtbsY-1; + if (de->PicWidthInCtbsY>2) + wpp_pause(de, ctb_row); + p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); + if (de->PicWidthInCtbsY==2) + p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); + if (de->PicWidthInCtbsY==1) + WriteProb(de, s); + else + p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); + ctb_addr_ts += pps->column_width[0]; + wpp_entry_point(de, s, 0, 1, ctb_addr_ts); + } +} + +////////////////////////////////////////////////////////////////////////////// +// Tiles mode + +static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { + const HEVCPPS * const pps = s->ps.pps; + int i, resetQPY; + + if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); + pre_slice_decode(de, s); + WriteBitstream(de, s); + resetQPY = ctb_addr_ts==0 + || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] + || !s->sh.dependent_slice_segment_flag; + if (resetQPY) WriteProb(de, s); + program_slicecmds(de, s->slice_idx); + new_slice_segment(de, s); + new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); + for (i=0; ish.num_entry_point_offsets; i++) { + int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY; + int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; + int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); + int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); + int last_x = pps->col_bd[tile_x+1]-1; + int last_y = pps->row_bd[tile_y+1]-1; + p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); + WriteProb(de, s); + ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y]; + new_entry_point(de, s, 0, 1, ctb_addr_ts); + } +} + 
+////////////////////////////////////////////////////////////////////////////// + +static int cabac_start_align(HEVCContext *s) +{ + GetBitContext *gb = &s->HEVClc->gb; + skip_bits(gb, 1); + align_get_bits(gb); + // Should look at getting rid of this + return ff_init_cabac_decoder(&s->HEVClc->cc, + gb->buffer + get_bits_count(gb) / 8, + (get_bits_left(gb) + 7) / 8); +} + +static int rpi_hevc_decode_slice( + AVCodecContext *avctx, + const uint8_t *buffer, + uint32_t size) +{ + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + HEVCContext * const s = avctx->priv_data; + dec_env_t * const de = dec_env_get(avctx, rpi); + const HEVCPPS *pps = s->ps.pps; + int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; + +#if TRACE_ENTRY + printf("<<< %s[%p]\n", __func__, de); +#endif + if (de == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); + return -1; + } + + if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { + av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); + return -1; + } + de->state = RPIVID_DECODE_SLICE; + + de->nal_buffer = buffer; + de->nal_size = size; + +#if !OPT_EMU +// ff_hevc_cabac_init(s, ctb_addr_ts); + cabac_start_align(s); +#endif + if (s->ps.sps->scaling_list_enable_flag) + populate_scaling_factors(de, s); + pps->entropy_coding_sync_enabled_flag? 
wpp_decode_slice(de, s, ctb_addr_ts) + : decode_slice(de, s, ctb_addr_ts); +#if TRACE_ENTRY + printf(">>> %s[%p]\n", __func__, de); +#endif + dec_env_release(rpi, de); + return 0; +} + +////////////////////////////////////////////////////////////////////////////// + +static int rpivid_retrieve_data(void *logctx, AVFrame *frame) +{ + int rv; + if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) + av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); + return rv; +} + +static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) +{ + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + HEVCContext * const s = avctx->priv_data; + // Frame buffering + 1 output. Would need thread_count extra but we now + // alloc at the start of phase 2 so that is the only thread we need the + // extra buffer for. + const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; + int rv; + + if (av_rpi_zc_in_use(avctx)) + { + const AVZcEnvPtr zc = avctx->opaque; + av_rpi_zc_set_decoder_pool_size(zc, pool_req); + rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc + } + else + { + if (rpi->zc == NULL) { + pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this + // Alloc inside lock to make sure we only ever alloc one + if (rpi->zc == NULL) { + rpi->zc = av_rpi_zc_int_env_alloc(s); + } + pthread_mutex_unlock(&rpi->phase_lock); + } + av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) + rv = (rpi->zc == NULL) ? 
AVERROR(ENOMEM) : + av_rpi_zc_get_buffer(rpi->zc, frame); + } + + if (rv == 0 && + (rv = ff_attach_decode_data(frame)) < 0) + { + av_frame_unref(frame); + } + + if (rv == 0) + { + FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; + fdd->post_process = rpivid_retrieve_data; + } + + return rv; +} + +#if OPT_PHASE_TIMING +static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) +{ + av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", + bins[0], bins[1], bins[2], bins[3], + bins[4], bins[5], bins[6], bins[7], bins[8]); +} +#endif + +////////////////////////////////////////////////////////////////////////////// + +static int rpi_hevc_free(AVCodecContext *avctx) { + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; + +#if TRACE_ENTRY + printf("<<< %s\n", __func__); +#endif + + dec_env_release(rpi, NULL); + + // Wait for everything else to stop + { + struct timespec tt; + clock_gettime(CLOCK_REALTIME, &tt); + tt.tv_sec += 2; + while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { + const int err = errno; + if (err == ETIMEDOUT) { + av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); + return -1; + } + if (err != EINTR) { + av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); + break; + } + } + } + +#if OPT_PHASE_TIMING + { + unsigned int i; + for (i = 0; i != RPIVID_PHASES; ++i) { + const phase_wait_env_t * const p = rpi->phase_reqs + i; + av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, + (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), + (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); + av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", + time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], + time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]); + log_bin_phase(avctx, 
p->time_bins); + log_bin_phase(avctx, p->time_bins3); + log_bin_phase(avctx, p->time_bins5); + av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n", + (unsigned int)(p->max_phase_time / 1000), + p->max_time_decode_order); + } + av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); + } +#endif + + if (rpi->dec_envs != NULL) + { + for (int i; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { + dec_env_delete(rpi->dec_envs[i]); + } + av_freep(&rpi->dec_envs); + } + + av_rpi_zc_int_env_freep(&rpi->zc); + + gpu_free(&rpi->gcolbuf); + + for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { + gpu_free(rpi->gbitbufs + i); + } + for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { + gpu_free(rpi->gcoeffbufs + i); + } + + unmap_devp(&rpi->regs, REGS_SIZE); + unmap_devp(&rpi->ints, INTS_SIZE); + + if (rpi->gpu_init_type > 0) + rpi_mem_gpu_uninit(); + + if (rpi->mbox_fd >= 0) { + mbox_release_clock(rpi->mbox_fd); + mbox_close(rpi->mbox_fd); + } + + sem_destroy(&rpi->ref_zero); + sem_destroy(&rpi->coeffbuf_sem); + sem_destroy(&rpi->bitbuf_sem); + +#if TRACE_ENTRY + printf(">>> %s\n", __func__); +#endif + return 0; +} + +////////////////////////////////////////////////////////////////////////////// + +static int rpi_hevc_init(AVCodecContext *avctx) { + RPI_T * const rpi = avctx->internal->hwaccel_priv_data; +// const char *err; + +#if TRACE_ENTRY + printf("<<< %s\n", __func__); +#endif + + if (avctx->width>4096 || avctx->height>4096) { + av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); + return AVERROR(ENOTSUP); + } + + memset(rpi, 0, sizeof(*rpi)); + + rpi->mbox_fd = -1; + rpi->decode_order = 0; + + // Initial PU/COEFF stream buffer split chosen as worst case seen so far + rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU + + + atomic_store(&rpi->ref_count, 1); + sem_init(&rpi->ref_zero, 0, 0); + + sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); + 
//////////////////////////////////////////////////////////////////////////////

// hwaccel descriptor for HEVC with the AV_PIX_FMT_RPI4_8 output format.
// All callbacks are shared with the RPI4_10 descriptor; the two tables
// differ only in .name and .pix_fmt.
const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
    .name           = "hevc_rpi4_8",
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_HEVC,
    .pix_fmt        = AV_PIX_FMT_RPI4_8,
    .alloc_frame    = rpivid_hevc_alloc_frame,
    .start_frame    = rpi_hevc_start_frame,
    .end_frame      = rpi_hevc_end_frame,
    .abort_frame    = rpi_hevc_abort_frame,
    .decode_slice   = rpi_hevc_decode_slice,
    .init           = rpi_hevc_init,
    .uninit         = rpi_hevc_free,
    // The callbacks read this state back as an RPI_T from
    // avctx->internal->hwaccel_priv_data
    .priv_data_size = sizeof(RPI_T),
    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
};

// hwaccel descriptor for HEVC with the AV_PIX_FMT_RPI4_10 output format.
// Same callbacks as the RPI4_8 descriptor.
const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
    .name           = "hevc_rpi4_10",
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = AV_CODEC_ID_HEVC,
    .pix_fmt        = AV_PIX_FMT_RPI4_10,
    .alloc_frame    = rpivid_hevc_alloc_frame,
    .start_frame    = rpi_hevc_start_frame,
    .end_frame      = rpi_hevc_end_frame,
    .abort_frame    = rpi_hevc_abort_frame,
    .decode_slice   = rpi_hevc_decode_slice,
    .init           = rpi_hevc_init,
    .uninit         = rpi_hevc_free,
    // The callbacks read this state back as an RPI_T from
    // avctx->internal->hwaccel_priv_data
    .priv_data_size = sizeof(RPI_T),
    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
};
container_of(buf->context, V4L2m2mContext, output) : container_of(buf->context, V4L2m2mContext, capture); } -static inline AVCodecContext *logger(V4L2Buffer *buf) +static inline AVCodecContext *logger(const V4L2Buffer * const buf) { return buf_to_m2mctx(buf)->avctx; } -static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) +static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf) { - V4L2m2mContext *s = buf_to_m2mctx(avbuf); - - if (s->avctx->pkt_timebase.num) - return s->avctx->pkt_timebase; - return s->avctx->time_base; + const V4L2m2mContext *s = buf_to_m2mctx(avbuf); + const AVRational tb = s->avctx->pkt_timebase.num ? + s->avctx->pkt_timebase : + s->avctx->time_base; + return tb.num && tb.den ? tb : v4l2_timebase; } -static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) +static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts) { - int64_t v4l2_pts; - - if (pts == AV_NOPTS_VALUE) - pts = 0; - /* convert pts to v4l2 timebase */ - v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); + const int64_t v4l2_pts = + out->context->no_pts_rescale ? pts : + pts == AV_NOPTS_VALUE ? 0 : + av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase); out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC; out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC; } -static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf) +static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf) { - int64_t v4l2_pts; - /* convert pts back to encoder timebase */ - v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + + const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC + avbuf->buf.timestamp.tv_usec; - return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); + return + avbuf->context->no_pts_rescale ? v4l2_pts : + v4l2_pts == 0 ? 
AV_NOPTS_VALUE : + av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf)); +} + +static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { + out->planes[plane].bytesused = bytesused; + out->planes[plane].length = length; + } else { + out->buf.bytesused = bytesused; + out->buf.length = length; + } } static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) @@ -116,6 +129,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf) return AVCOL_PRI_UNSPECIFIED; } +static void v4l2_set_color(V4L2Buffer *buf, + const enum AVColorPrimaries avcp, + const enum AVColorSpace avcs, + const enum AVColorTransferCharacteristic avxc) +{ + enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT; + enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT; + enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT; + + switch (avcp) { + case AVCOL_PRI_BT709: + cs = V4L2_COLORSPACE_REC709; + ycbcr = V4L2_YCBCR_ENC_709; + break; + case AVCOL_PRI_BT470M: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + ycbcr = V4L2_YCBCR_ENC_601; + break; + case AVCOL_PRI_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_PRI_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case AVCOL_PRI_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_PRI_BT2020: + cs = V4L2_COLORSPACE_BT2020; + break; + case AVCOL_PRI_SMPTE428: + case AVCOL_PRI_SMPTE431: + case AVCOL_PRI_SMPTE432: + case AVCOL_PRI_EBU3213: + case AVCOL_PRI_RESERVED: + case AVCOL_PRI_FILM: + case AVCOL_PRI_UNSPECIFIED: + default: + break; + } + + switch (avcs) { + case AVCOL_SPC_RGB: + cs = V4L2_COLORSPACE_SRGB; + break; + case AVCOL_SPC_BT709: + cs = V4L2_COLORSPACE_REC709; + break; + case AVCOL_SPC_FCC: + cs = V4L2_COLORSPACE_470_SYSTEM_M; + break; + case AVCOL_SPC_BT470BG: + cs = V4L2_COLORSPACE_470_SYSTEM_BG; + break; + case AVCOL_SPC_SMPTE170M: + cs = V4L2_COLORSPACE_SMPTE170M; + break; + case 
AVCOL_SPC_SMPTE240M: + cs = V4L2_COLORSPACE_SMPTE240M; + break; + case AVCOL_SPC_BT2020_CL: + cs = V4L2_COLORSPACE_BT2020; + ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM; + break; + case AVCOL_SPC_BT2020_NCL: + cs = V4L2_COLORSPACE_BT2020; + break; + default: + break; + } + + switch (avxc) { /* map the AVFrame transfer characteristic; xfer is only the V4L2 result */ + case AVCOL_TRC_BT709: + xfer = V4L2_XFER_FUNC_709; + break; + case AVCOL_TRC_IEC61966_2_1: + xfer = V4L2_XFER_FUNC_SRGB; + break; + case AVCOL_TRC_SMPTE240M: + xfer = V4L2_XFER_FUNC_SMPTE240M; + break; + case AVCOL_TRC_SMPTE2084: + xfer = V4L2_XFER_FUNC_SMPTE2084; + break; + default: + break; + } + + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.colorspace = cs; + buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix_mp.xfer_func = xfer; + } else { + buf->context->format.fmt.pix.colorspace = cs; + buf->context->format.fmt.pix.ycbcr_enc = ycbcr; + buf->context->format.fmt.pix.xfer_func = xfer; + } +} + static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) { enum v4l2_quantization qt; @@ -134,6 +246,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) return AVCOL_RANGE_UNSPECIFIED; } +static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) +{ + const enum v4l2_quantization q = + avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE : + avcr == AVCOL_RANGE_JPEG ? 
V4L2_QUANTIZATION_FULL_RANGE : + V4L2_QUANTIZATION_DEFAULT; + + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { + buf->context->format.fmt.pix_mp.quantization = q; + } else { + buf->context->format.fmt.pix.quantization = q; + } +} + static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf) { enum v4l2_ycbcr_encoding ycbcr; @@ -210,73 +336,165 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) return AVCOL_TRC_UNSPECIFIED; } -static void v4l2_free_buffer(void *opaque, uint8_t *unused) +static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf) +{ + return V4L2_FIELD_IS_INTERLACED(buf->buf.field); +} + +static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) { - V4L2Buffer* avbuf = opaque; - V4L2m2mContext *s = buf_to_m2mctx(avbuf); + return buf->buf.field == V4L2_FIELD_INTERLACED_TB; +} - if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) { - atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel); +static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) +{ + buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : + is_tff ? 
V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT; +} - if (s->reinit) { - if (!atomic_load(&s->refcount)) - sem_post(&s->refsync); - } else { - if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) { - /* no need to queue more buffers to the driver */ - avbuf->status = V4L2BUF_AVAILABLE; - } - else if (avbuf->context->streamon) - ff_v4l2_buffer_enqueue(avbuf); - } +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) +{ + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; + + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; + + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; + + for (int i = 0; i < avbuf->num_planes; i++) { + layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + + switch (avbuf->context->av_pix_fmt) { + case AV_PIX_FMT_YUYV422: + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; + + break; + + case AV_PIX_FMT_NV12: + case AV_PIX_FMT_NV21: + + layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? 
+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; + break; + + case AV_PIX_FMT_YUV420P: - av_buffer_unref(&avbuf->context_ref); + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 3; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; + + layer->planes[2].object_index = 0; + layer->planes[2].offset = layer->planes[1].offset + + ((avbuf->plane_info[0].bytesperline * + avbuf->context->format.fmt.pix.height) >> 2); + layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; + break; + + default: + drm_desc->nb_layers = 0; + break; } + + return (uint8_t *) drm_desc; } -static int v4l2_buf_increase_ref(V4L2Buffer *in) +static void v4l2_free_bufref(void *opaque, uint8_t *data) { - V4L2m2mContext *s = buf_to_m2mctx(in); + AVBufferRef * bufref = (AVBufferRef *)data; + V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data; + struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl); - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { - in->context_ref = av_buffer_ref(s->self_ref); - if (!in->context_ref) - return AVERROR(ENOMEM); + if (ctx != NULL) { + // Buffer still attached to context + V4L2m2mContext *s = buf_to_m2mctx(avbuf); - in->context_refcount = 1; - } + ff_mutex_lock(&ctx->lock); - in->status = V4L2BUF_RET_USER; - atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed); + avbuf->status = V4L2BUF_AVAILABLE; - return 0; + if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name); + /* no need to queue 
more buffers to the driver */ + } + else if (ctx->streamon) { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name); + avbuf->buf.timestamp.tv_sec = 0; + avbuf->buf.timestamp.tv_usec = 0; + ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER + } + else { + av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name); + } + + ff_mutex_unlock(&ctx->lock); + } + + ff_weak_link_unlock(avbuf->context_wl); + av_buffer_unref(&bufref); } -static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) { - int ret; + struct v4l2_exportbuffer expbuf; + int i, ret; - if (plane >= in->num_planes) - return AVERROR(EINVAL); + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); - /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ - *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, - in->plane_info[plane].length, v4l2_free_buffer, in, 0); - if (!*buf) - return AVERROR(ENOMEM); + expbuf.index = avbuf->buf.index; + expbuf.type = avbuf->buf.type; + expbuf.plane = i; - ret = v4l2_buf_increase_ref(in); - if (ret) - av_buffer_unref(buf); + ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); + if (ret < 0) + return AVERROR(errno); - return ret; + if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = expbuf.fd; + avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buf.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; + avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + } + + return 0; } static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) { unsigned int bytesused, length; + int 
rv = 0; if (plane >= out->num_planes) return AVERROR(EINVAL); @@ -284,32 +502,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i length = out->plane_info[plane].length; bytesused = FFMIN(size+offset, length); - memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); - - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { - out->planes[plane].bytesused = bytesused; - out->planes[plane].length = length; - } else { - out->buf.bytesused = bytesused; - out->buf.length = length; + if (size > length - offset) { + size = length - offset; + rv = AVERROR(ENOMEM); } - return 0; + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size); + + set_buf_length(out, plane, bytesused, length); + + return rv; +} + +static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) +{ + AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]); + AVBufferRef * newbuf; + + if (!bufref) + return NULL; + + newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0); + if (newbuf == NULL) + av_buffer_unref(&bufref); + + avbuf->status = V4L2BUF_RET_USER; + return newbuf; } static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) { - int i, ret; + int i; frame->format = avbuf->context->av_pix_fmt; - for (i = 0; i < avbuf->num_planes; i++) { - ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); - if (ret) - return ret; + frame->buf[0] = wrap_avbuf(avbuf); + if (frame->buf[0] == NULL) + return AVERROR(ENOMEM); + if (buf_to_m2mctx(avbuf)->output_drm) { + /* 1. get references to the actual data */ + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); + frame->format = AV_PIX_FMT_DRM_PRIME; + frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + return 0; + } + + + /* 1. 
get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { + frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset; frame->linesize[i] = avbuf->plane_info[i].bytesperline; - frame->data[i] = frame->buf[i]->data; } /* fixup special cases */ @@ -318,17 +561,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) case AV_PIX_FMT_NV21: if (avbuf->num_planes > 1) break; - frame->linesize[1] = avbuf->plane_info[0].bytesperline; - frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; + frame->linesize[1] = frame->linesize[0]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); break; case AV_PIX_FMT_YUV420P: if (avbuf->num_planes > 1) break; - frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1; - frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1; - frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; - frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2); + frame->linesize[1] = frame->linesize[0] / 2; + frame->linesize[2] = frame->linesize[1]; + frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format); + frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2; break; default: @@ -338,68 +581,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) return 0; } +static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) +{ + if (dst_stride == src_stride && w + 32 >= dst_stride) { + memcpy(dst, src, dst_stride * h); + } + else { + while (--h >= 0) { + memcpy(dst, src, w); + dst += dst_stride; + src += src_stride; + } + } +} + +static int is_chroma(const 
AVPixFmtDescriptor *desc, int i, int num_planes) +{ + return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); +} + static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { - int i, ret; - struct v4l2_format fmt = out->context->format; - int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? - fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; - int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? - fmt.fmt.pix_mp.height : fmt.fmt.pix.height; - int is_planar_format = 0; - - switch (pixel_format) { - case V4L2_PIX_FMT_YUV420M: - case V4L2_PIX_FMT_YVU420M: -#ifdef V4L2_PIX_FMT_YUV422M - case V4L2_PIX_FMT_YUV422M: -#endif -#ifdef V4L2_PIX_FMT_YVU422M - case V4L2_PIX_FMT_YVU422M: -#endif -#ifdef V4L2_PIX_FMT_YUV444M - case V4L2_PIX_FMT_YUV444M: -#endif -#ifdef V4L2_PIX_FMT_YVU444M - case V4L2_PIX_FMT_YVU444M: -#endif - case V4L2_PIX_FMT_NV12M: - case V4L2_PIX_FMT_NV21M: - case V4L2_PIX_FMT_NV12MT_16X16: - case V4L2_PIX_FMT_NV12MT: - case V4L2_PIX_FMT_NV16M: - case V4L2_PIX_FMT_NV61M: - is_planar_format = 1; - } - - if (!is_planar_format) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); - int planes_nb = 0; - int offset = 0; - - for (i = 0; i < desc->nb_components; i++) - planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); - - for (i = 0; i < planes_nb; i++) { - int size, h = height; - if (i == 1 || i == 2) { + int i; + int num_planes = 0; + int pel_strides[4] = {0}; + + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + + if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { + av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); + return -1; + } + + for (i = 0; i != desc->nb_components; ++i) { + if (desc->comp[i].plane >= num_planes) + num_planes = desc->comp[i].plane + 1; + pel_strides[desc->comp[i].plane] = desc->comp[i].step; + } + + if (out->num_planes > 1) { + if (num_planes != out->num_planes) { + av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d 
!= %d\n", __func__, num_planes, out->num_planes); + return -1; + } + for (i = 0; i != num_planes; ++i) { + int w = frame->width; + int h = frame->height; + if (is_chroma(desc, i, num_planes)) { + w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); } - size = frame->linesize[i] * h; - ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset); - if (ret) - return ret; - offset += size; + + cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, + frame->data[i], frame->linesize[i], + w * pel_strides[i], h); + set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); } - return 0; } + else + { + unsigned int offset = 0; + + for (i = 0; i != num_planes; ++i) { + int w = frame->width; + int h = frame->height; + int dst_stride = out->plane_info[0].bytesperline; + uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; + + if (is_chroma(desc, i, num_planes)) { + // Is chroma + dst_stride >>= desc->log2_chroma_w; + offset += dst_stride * (out->context->height >> desc->log2_chroma_h); + w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + } + else { + // Is luma or alpha + offset += dst_stride * out->context->height; + } + if (offset > out->plane_info[0].length) { + av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length); + return -1; + } - for (i = 0; i < out->num_planes; i++) { - ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0); - if (ret) - return ret; + cpy_2d(dst, dst_stride, + frame->data[i], frame->linesize[i], + w * pel_strides[i], h); + } + set_buf_length(out, 0, offset, out->plane_info[0].length); } - return 0; } @@ -411,7 +681,16 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) { + out->buf.flags = frame->key_frame ? 
+ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : + (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); + // Beware that colour info is held in format rather than the actual + // v4l2 buffer struct so this may not be as useful as you might hope + v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc); + v4l2_set_color_range(out, frame->color_range); + // PTS & interlace are buffer vars v4l2_set_pts(out, frame->pts); + v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first); return v4l2_buffer_swframe_to_buf(frame, out); } @@ -419,6 +698,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out) int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) { int ret; + V4L2Context * const ctx = avbuf->context; av_frame_unref(frame); @@ -429,17 +709,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) /* 2. get frame information */ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); + frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I : + (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P : + (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? 
AV_PICTURE_TYPE_B : + AV_PICTURE_TYPE_NONE; frame->color_primaries = v4l2_get_color_primaries(avbuf); frame->colorspace = v4l2_get_color_space(avbuf); frame->color_range = v4l2_get_color_range(avbuf); frame->color_trc = v4l2_get_color_trc(avbuf); frame->pts = v4l2_get_pts(avbuf); frame->pkt_dts = AV_NOPTS_VALUE; + frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf); + frame->top_field_first = v4l2_buf_is_top_first(avbuf); /* these values are updated also during re-init in v4l2_process_driver_event */ - frame->height = avbuf->context->height; - frame->width = avbuf->context->width; - frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio; + frame->height = ctx->height; + frame->width = ctx->width; + frame->sample_aspect_ratio = ctx->sample_aspect_ratio; + + if (ctx->selection.height && ctx->selection.width) { + frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0; + frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0; + frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ? + frame->width - (ctx->selection.left + ctx->selection.width) : 0; + frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ? + frame->height - (ctx->selection.top + ctx->selection.height) : 0; + } /* 3. report errors upstream */ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) { @@ -452,15 +747,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) { - int ret; - av_packet_unref(pkt); - ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf); - if (ret) - return ret; + + pkt->buf = wrap_avbuf(avbuf); + if (pkt->buf == NULL) + return AVERROR(ENOMEM); pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; + pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset; if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) pkt->flags |= AV_PKT_FLAG_KEY; @@ -475,31 +769,85 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) return 0; } -int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + const void *extdata, size_t extlen) { int ret; - ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0); - if (ret) + if (extlen) { + ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); + if (ret) + return ret; + } + + ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); + if (ret && ret != AVERROR(ENOMEM)) return ret; v4l2_set_pts(out, pkt->pts); - if (pkt->flags & AV_PKT_FLAG_KEY) - out->flags = V4L2_BUF_FLAG_KEYFRAME; + out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ? + (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) : + (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME); - return 0; + return ret; } -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) +{ + return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); +} + + +static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) +{ + V4L2Buffer * const avbuf = (V4L2Buffer *)data; + int i; + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) { + struct V4L2Plane_info *p = avbuf->plane_info + i; + if (p->mm_addr != NULL) + munmap(p->mm_addr, p->length); + } + + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + if (avbuf->drm_frame.objects[i].fd != -1) + close(avbuf->drm_frame.objects[i].fd); + } + + ff_weak_link_unref(&avbuf->context_wl); + + av_free(avbuf); +} + + +int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx) { - V4L2Context *ctx = avbuf->context; int ret, i; + 
V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf)); + AVBufferRef * bufref; + + *pbufref = NULL; + if (avbuf == NULL) + return AVERROR(ENOMEM); + + bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0); + if (bufref == NULL) { + av_free(avbuf); + return AVERROR(ENOMEM); + } + avbuf->context = ctx; avbuf->buf.memory = V4L2_MEMORY_MMAP; avbuf->buf.type = ctx->type; avbuf->buf.index = index; + for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) { + avbuf->drm_frame.objects[i].fd = -1; + } + + avbuf->context_wl = ff_weak_link_ref(ctx->wl_master); + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.length = VIDEO_MAX_PLANES; avbuf->buf.m.planes = avbuf->planes; @@ -507,7 +855,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf); if (ret < 0) - return AVERROR(errno); + goto fail; if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->num_planes = 0; @@ -527,25 +875,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + + if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || + !buf_to_m2mctx(avbuf)->output_drm) { + avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); + } } else { avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, - buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + + if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || + !buf_to_m2mctx(avbuf)->output_drm) { + 
avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, + PROT_READ | PROT_WRITE, MAP_SHARED, + buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); + } } - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) - return AVERROR(ENOMEM); + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) { + avbuf->plane_info[i].mm_addr = NULL; + ret = AVERROR(ENOMEM); + goto fail; + } } avbuf->status = V4L2BUF_AVAILABLE; - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - return 0; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; @@ -555,20 +911,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } - return ff_v4l2_buffer_enqueue(avbuf); + if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { + if (buf_to_m2mctx(avbuf)->output_drm) { + ret = v4l2_buffer_export_drm(avbuf); + if (ret) + goto fail; + } + } + + *pbufref = bufref; + return 0; + +fail: + av_buffer_unref(&bufref); + return ret; } int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) { int ret; + int qc; - avbuf->buf.flags = avbuf->flags; + if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) { + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, + avbuf->context->q_count); + } ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf); - if (ret < 0) - return AVERROR(errno); + if (ret < 0) { + int err = errno; + av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n", + avbuf->context->name, avbuf->buf.index, + err, strerror(err)); + return AVERROR(err); + } + // Lock not wanted - if called from buffer free then lock already obtained + qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; avbuf->status = V4L2BUF_IN_DRIVER; + pthread_cond_broadcast(&avbuf->context->cond); + + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, 
ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); return 0; } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 8dbc7fc104..7d5fadcd3d 100644 --- a/libavcodec/v4l2_buffers.h +++ b/libavcodec/v4l2_buffers.h @@ -27,25 +27,34 @@ #include #include +#include "libavutil/hwcontext_drm.h" #include "avcodec.h" enum V4L2Buffer_status { V4L2BUF_AVAILABLE, V4L2BUF_IN_DRIVER, + V4L2BUF_IN_USE, V4L2BUF_RET_USER, }; /** * V4L2Buffer (wrapper for v4l2_buffer management) */ +struct V4L2Context; +struct ff_weak_link_client; + typedef struct V4L2Buffer { - /* each buffer needs to have a reference to its context */ + /* each buffer needs to have a reference to its context + * The pointer is good enough for most operation but once the buffer has + * been passed to the user the buffer may become orphaned so for free ops + * the weak link must be used to ensure that the context is actually + * there + */ struct V4L2Context *context; + struct ff_weak_link_client *context_wl; - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. 
*/ - AVBufferRef *context_ref; - atomic_uint context_refcount; + /* DRM descriptor */ + AVDRMFrameDescriptor drm_frame; /* keep track of the mmap address and mmap length */ struct V4L2Plane_info { @@ -60,7 +69,6 @@ typedef struct V4L2Buffer { struct v4l2_buffer buf; struct v4l2_plane planes[VIDEO_MAX_PLANES]; - int flags; enum V4L2Buffer_status status; } V4L2Buffer; @@ -98,6 +106,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); */ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); +int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, + const void *extdata, size_t extlen); + /** * Extracts the data from an AVFrame to a V4L2Buffer * @@ -116,7 +127,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out); * * @returns 0 in case of success, a negative AVERROR code otherwise */ -int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); +int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx); /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c index ff1ea8e57b..c0d257e5d3 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ #include #include #include +#include "libavutil/avassert.h" #include "libavcodec/avcodec.h" #include "libavcodec/internal.h" #include "v4l2_buffers.h" #include "v4l2_fmt.h" #include "v4l2_m2m.h" +#include "weak_link.h" struct v4l2_format_update { uint32_t v4l2_fmt; @@ -41,28 +43,18 @@ struct v4l2_format_update { int update_avfmt; }; -static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx) +static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) { return V4L2_TYPE_IS_OUTPUT(ctx->type) ? 
container_of(ctx, V4L2m2mContext, output) : container_of(ctx, V4L2m2mContext, capture); } -static inline AVCodecContext *logger(V4L2Context *ctx) +static inline AVCodecContext *logger(const V4L2Context *ctx) { return ctx_to_m2mctx(ctx)->avctx; } -static inline unsigned int v4l2_get_width(struct v4l2_format *fmt) -{ - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width; -} - -static inline unsigned int v4l2_get_height(struct v4l2_format *fmt) -{ - return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; -} - static AVRational v4l2_get_sar(V4L2Context *ctx) { struct AVRational sar = { 0, 1 }; @@ -81,21 +73,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx) return sar; } -static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2) +static inline int ctx_buffers_alloced(const V4L2Context * const ctx) +{ + return ctx->bufrefs != NULL; +} + +// Width/Height changed or we don't have an alloc in the first place? +static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2) { - struct v4l2_format *fmt1 = &ctx->format; - int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? - fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || - fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height - : - fmt1->fmt.pix.width != fmt2->fmt.pix.width || - fmt1->fmt.pix.height != fmt2->fmt.pix.height; + const struct v4l2_format *fmt1 = &ctx->format; + int ret = !ctx_buffers_alloced(ctx) || + (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? 
+ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width || + fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height + : + fmt1->fmt.pix.width != fmt2->fmt.pix.width || + fmt1->fmt.pix.height != fmt2->fmt.pix.height); if (ret) - av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n", + av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n", ctx->name, - v4l2_get_width(fmt1), v4l2_get_height(fmt1), - v4l2_get_width(fmt2), v4l2_get_height(fmt2)); + ctx_buffers_alloced(ctx), + ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1), + ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2)); return ret; } @@ -153,90 +153,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd } } -/** - * handle resolution change event and end of stream event - * returns 1 if reinit was successful, negative if it failed - * returns 0 if reinit was not executed - */ -static int v4l2_handle_event(V4L2Context *ctx) +static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); - struct v4l2_format cap_fmt = s->capture.format; - struct v4l2_format out_fmt = s->output.format; - struct v4l2_event evt = { 0 }; - int full_reinit, reinit, ret; + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + struct v4l2_selection selection = { + .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, + .target = V4L2_SEL_TGT_COMPOSE + }; - ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name); - return 0; - } + memset(r, 0, sizeof(*r)); + if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection)) + return AVERROR(errno); - if (evt.type == V4L2_EVENT_EOS) { - ctx->done = 1; - return 0; - } + *r = selection.r; + return 0; +} - if (evt.type != V4L2_EVENT_SOURCE_CHANGE) - return 0; +static int do_source_change(V4L2m2mContext * const s) +{ + AVCodecContext *const avctx = s->avctx; - ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt); - 
if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name); - return 0; - } + int ret; + int reinit; + struct v4l2_format cap_fmt = s->capture.format; + + s->capture.done = 0; ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt); if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name); + av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name); return 0; } - full_reinit = v4l2_resolution_changed(&s->output, &out_fmt); - if (full_reinit) { - s->output.height = v4l2_get_height(&out_fmt); - s->output.width = v4l2_get_width(&out_fmt); - s->output.sample_aspect_ratio = v4l2_get_sar(&s->output); - } + get_default_selection(&s->capture, &s->capture.selection); - reinit = v4l2_resolution_changed(&s->capture, &cap_fmt); + reinit = ctx_resolution_changed(&s->capture, &cap_fmt); + if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0) + reinit = 1; + + s->capture.format = cap_fmt; if (reinit) { - s->capture.height = v4l2_get_height(&cap_fmt); - s->capture.width = v4l2_get_width(&cap_fmt); - s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + s->capture.height = ff_v4l2_get_format_height(&cap_fmt); + s->capture.width = ff_v4l2_get_format_width(&cap_fmt); } - if (full_reinit || reinit) - s->reinit = 1; - - if (full_reinit) { - ret = ff_v4l2_m2m_codec_full_reinit(s); - if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n"); - return AVERROR(EINVAL); - } - goto reinit_run; + // If we don't support selection (or it is bust) and we obviously have HD then kludge + if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) && + (s->capture.height == 1088 && s->capture.width == 1920)) { + s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080}; } + s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture); + + av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n", + s->capture.sample_aspect_ratio.num, 
s->capture.sample_aspect_ratio.den, + s->capture.width, s->capture.height, + s->capture.selection.width, s->capture.selection.height, + s->capture.selection.left, s->capture.selection.top, reinit); + if (reinit) { - if (s->avctx) - ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height); + if (avctx) + ret = ff_set_dimensions(s->avctx, + s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width, + s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height); if (ret < 0) - av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n"); + av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n"); ret = ff_v4l2_m2m_codec_reinit(s); if (ret) { - av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n"); + av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n"); return AVERROR(EINVAL); } + + if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) || + s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) { + av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n", + s->capture.width, s->capture.height, + ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format)); + return AVERROR(EINVAL); + } + + // Update pixel format - should only actually do something on initial change + s->capture.av_pix_fmt = + ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO); + if (s->output_drm) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + avctx->sw_pix_fmt = s->capture.av_pix_fmt; + } + else + avctx->pix_fmt = s->capture.av_pix_fmt; + goto reinit_run; } - /* dummy event received */ - return 0; + /* Buffers are OK so just stream off to ack */ + av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__); + + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF 
failed\n"); + s->draining = 0; /* reinit executed */ reinit_run: + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON); return 1; } @@ -280,171 +300,275 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } -static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) +// DQ a buffer +// Amalgamates all the various ways there are of signalling EOS/Event to +// generate a consistant EPIPE. +// +// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped) +// +// Returns: +// 0 Success +// AVERROR(EPIPE) Nothing more to read +// AVERROR(ENOSPC) No buffers in Q to put result in +// * AVERROR(..) + + static int +dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf) { - struct v4l2_plane planes[VIDEO_MAX_PLANES]; - struct v4l2_buffer buf = { 0 }; - V4L2Buffer *avbuf; - struct pollfd pfd = { - .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */ - .fd = ctx_to_m2mctx(ctx)->fd, + V4L2m2mContext * const m = ctx_to_m2mctx(ctx); + AVCodecContext * const avctx = m->avctx; + V4L2Buffer * avbuf; + const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type); + + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + + struct v4l2_buffer buf = { + .type = ctx->type, + .memory = V4L2_MEMORY_MMAP, }; - int i, ret; - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) { - for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) - break; - } - if (i == ctx->num_buffers) - av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " - "userspace. 
Increase num_capture_buffers " - "to prevent device deadlock or dropped " - "packets/frames.\n"); - } - - /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { - /* capture buffer initialization happens during decode hence - * detection happens at runtime - */ - if (!ctx->buffers) - break; - - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) - goto start; + *ppavbuf = NULL; + + if (ctx->flag_last) + return AVERROR(EPIPE); + + if (is_mp) { + buf.length = VIDEO_MAX_PLANES; + buf.m.planes = planes; + } + + while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { + const int err = errno; + av_assert0(AVERROR(err) < 0); + if (err != EINTR) { + av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", + ctx->name, av_err2str(AVERROR(err))); + + if (err == EPIPE) + ctx->flag_last = 1; + + return AVERROR(err); } - ctx->done = 1; - return NULL; + } + atomic_fetch_sub(&ctx->q_count, 1); + + avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data; + avbuf->status = V4L2BUF_AVAILABLE; + avbuf->buf = buf; + if (is_mp) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buf.m.planes = avbuf->planes; } -start: - if (V4L2_TYPE_IS_OUTPUT(ctx->type)) - pfd.events = POLLOUT | POLLWRNORM; - else { - /* no need to listen to requests for more input while draining */ - if (ctx_to_m2mctx(ctx)->draining) - pfd.events = POLLIN | POLLRDNORM | POLLPRI; + if (V4L2_TYPE_IS_CAPTURE(ctx->type)) { + // Zero length cap buffer return == EOS + if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) { + av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n"); + + // Must reQ so we don't leak + // May not matter if the next thing we do is release all the + // buffers but better to be tidy. 
+ ff_v4l2_buffer_enqueue(avbuf); + + ctx->flag_last = 1; + return AVERROR(EPIPE); + } + +#ifdef V4L2_BUF_FLAG_LAST + // If flag_last set then this contains data but is the last frame + // so remember that but return OK + if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0) + ctx->flag_last = 1; +#endif } - for (;;) { - ret = poll(&pfd, 1, timeout); - if (ret > 0) - break; - if (errno == EINTR) + *ppavbuf = avbuf; + return 0; +} + +/** + * handle resolution change event and end of stream event + * Expects to be called after the stream has stopped + * + * returns 1 if reinit was successful, negative if it failed + * returns 0 if reinit was not executed + */ +static int +get_event(V4L2m2mContext * const m) +{ + AVCodecContext * const avctx = m->avctx; + struct v4l2_event evt = { 0 }; + + while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) { + const int rv = AVERROR(errno); + if (rv == AVERROR(EINTR)) continue; - return NULL; + if (rv == AVERROR(EAGAIN)) { + av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n"); + return AVERROR_EOF; + } + av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv)); + return rv; } - /* 0. handle errors */ - if (pfd.revents & POLLERR) { - /* if we are trying to get free buffers but none have been queued yet - no need to raise a warning */ - if (timeout == 0) { - for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status != V4L2BUF_AVAILABLE) - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); - } - } - else - av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name); + av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type); - return NULL; + if (evt.type == V4L2_EVENT_EOS) { + av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n"); + return AVERROR_EOF; } - /* 1. 
handle resolution changes */ - if (pfd.revents & POLLPRI) { - ret = v4l2_handle_event(ctx); - if (ret < 0) { - /* if re-init failed, abort */ - ctx->done = 1; - return NULL; - } - if (ret) { - /* if re-init was successful drop the buffer (if there was one) - * since we had to reconfigure capture (unmap all buffers) - */ - return NULL; + if (evt.type == V4L2_EVENT_SOURCE_CHANGE) + return do_source_change(m); + + return 0; +} + + +// Get a buffer +// If output then just gets the buffer in the expected way +// If capture then runs the capture state m/c to deal with res change etc. +// If return value == 0 then *ppavbuf != NULL + +static int +get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) +{ + V4L2m2mContext * const m = ctx_to_m2mctx(ctx); + AVCodecContext * const avctx = m->avctx; + const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type); + + const unsigned int poll_cap = (POLLIN | POLLRDNORM); + const unsigned int poll_out = (POLLOUT | POLLWRNORM); + const unsigned int poll_event = POLLPRI; + + *ppavbuf = NULL; + + for (;;) { + struct pollfd pfd = { + .fd = m->fd, + // If capture && stream not started then assume we are waiting for the initial event + .events = !is_cap ? poll_out : + !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap : + poll_event, + }; + int ret; + + if (ctx->done) { + av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); + return AVERROR_EOF; } - } - /* 2. 
dequeue the buffer */ - if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { + // If capture && timeout == -1 then also wait for rx buffer free + if (is_cap && timeout == -1 && m->output.streamon && !m->draining) + pfd.events |= poll_out; - if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { - /* there is a capture buffer ready */ - if (pfd.revents & (POLLIN | POLLRDNORM)) - goto dequeue; + // If nothing Qed all we will get is POLLERR - avoid that + if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); + return AVERROR(ENOSPC); + } - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ - if (pfd.revents & (POLLOUT | POLLWRNORM)) - return NULL; + // Timeout kludged s.t. "forever" eventually gives up & produces logging + // If waiting for an event when we have seen a last_frame then we expect + // it to be ready already so force a short timeout + ret = poll(&pfd, 1, + ff_v4l2_ctx_eos(ctx) ? 10 : + timeout == -1 ? 3000 : timeout); + if (ret < 0) { + ret = AVERROR(errno); // Remember errno before logging etc. 
+ av_assert0(ret < 0); } -dequeue: - memset(&buf, 0, sizeof(buf)); - buf.memory = V4L2_MEMORY_MMAP; - buf.type = ctx->type; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memset(planes, 0, sizeof(planes)); - buf.length = VIDEO_MAX_PLANES; - buf.m.planes = planes; + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", + ctx->name, ret, timeout, pfd.events, pfd.revents); + + if (ret < 0) { + if (ret == AVERROR(EINTR)) + continue; + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); + return ret; } - ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); - if (ret) { - if (errno != EAGAIN) { - ctx->done = 1; - if (errno != EPIPE) - av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", - ctx->name, av_err2str(AVERROR(errno))); + if (ret == 0) { + if (timeout == -1) + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); + if (ff_v4l2_ctx_eos(ctx)) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name); + ret = get_event(m); + if (ret < 0) { + ctx->done = 1; + return ret; + } } - return NULL; + return AVERROR(EAGAIN); } - if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { - int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
- buf.m.planes[0].bytesused : buf.bytesused; - if (bytesused == 0) { + if ((pfd.revents & POLLERR) != 0) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); + return AVERROR_UNKNOWN; + } + + if ((pfd.revents & poll_event) != 0) { + ret = get_event(m); + if (ret < 0) { ctx->done = 1; - return NULL; + return ret; } -#ifdef V4L2_BUF_FLAG_LAST - if (buf.flags & V4L2_BUF_FLAG_LAST) - ctx->done = 1; -#endif + continue; } - avbuf = &ctx->buffers[buf.index]; - avbuf->status = V4L2BUF_AVAILABLE; - avbuf->buf = buf; - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; + if ((pfd.revents & poll_cap) != 0) { + ret = dq_buf(ctx, ppavbuf); + if (ret == AVERROR(EPIPE)) + continue; + return ret; } - return avbuf; + + if ((pfd.revents & poll_out) != 0) { + if (is_cap) + return AVERROR(EAGAIN); + return dq_buf(ctx, ppavbuf); + } + + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; } +} - return NULL; +// Clear out flags and timestamps that should should be set by the user +// Returns the passed avbuf +static V4L2Buffer * +clean_v4l2_buffer(V4L2Buffer * const avbuf) +{ + struct v4l2_buffer *const buf = &avbuf->buf; + + buf->flags = 0; + buf->field = V4L2_FIELD_ANY; + buf->timestamp = (struct timeval){0}; + buf->timecode = (struct v4l2_timecode){0}; + buf->sequence = 0; + + return avbuf; } static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) { - int timeout = 0; /* return when no more buffers to dequeue */ int i; /* get back as many output buffers as possible */ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { - do { - } while (v4l2_dequeue_v4l2buf(ctx, timeout)); + V4L2Buffer * avbuf; + do { + get_qbuf(ctx, &avbuf, 0); + } while (avbuf); } for (i = 0; i < ctx->num_buffers; i++) { - if (ctx->buffers[i].status == V4L2BUF_AVAILABLE) - return &ctx->buffers[i]; + V4L2Buffer * const avbuf = (V4L2Buffer 
*)ctx->bufrefs[i]->data; + if (avbuf->status == V4L2BUF_AVAILABLE) + return clean_v4l2_buffer(avbuf); } return NULL; @@ -452,25 +576,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { - struct v4l2_requestbuffers req = { - .memory = V4L2_MEMORY_MMAP, - .type = ctx->type, - .count = 0, /* 0 -> unmaps buffers from the driver */ - }; - int i, j; + int i; + int ret = 0; + const int fd = ctx_to_m2mctx(ctx)->fd; - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; + // Orphan any buffers in the wild + ff_weak_link_break(&ctx->wl_master); + + if (ctx->bufrefs) { + for (i = 0; i < ctx->num_buffers; i++) + av_buffer_unref(ctx->bufrefs + i); + } + + if (fd != -1) { + struct v4l2_requestbuffers req = { + .memory = V4L2_MEMORY_MMAP, + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; + + while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { + if (errno == EINTR) + continue; - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); + ret = AVERROR(errno); + + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + + if (ctx_to_m2mctx(ctx)->output_drm) + av_log(logger(ctx), AV_LOG_ERROR, + "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" + "for all buffers: \n" + " 1. drmModeRmFB(..)\n" + " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); } } + atomic_store(&ctx->q_count, 0); - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); + return ret; } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) @@ -499,6 +643,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { + V4L2m2mContext* s = ctx_to_m2mctx(ctx); + V4L2m2mPriv *priv = s->avctx->priv_data; enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; @@ -517,6 +663,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); + if (priv->pix_fmt != AV_PIX_FMT_NONE) { + if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { + fdesc.index++; + continue; + } + } + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ @@ -569,18 +722,83 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ + +static void flush_all_buffers_status(V4L2Context* const ctx) +{ + int i; + + if (!ctx->bufrefs) + return; + + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_IN_DRIVER) + buf->status = V4L2BUF_AVAILABLE; + } + atomic_store(&ctx->q_count, 0); +} + +static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx) +{ + int i; + int rv; + + if (!ctx->bufrefs) { + rv = ff_v4l2_context_init(ctx); + if (rv) { + av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); + return rv; + } + } + + for (i = 0; i < ctx->num_buffers; ++i) { + struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data; + if (buf->status == V4L2BUF_AVAILABLE) { + rv = ff_v4l2_buffer_enqueue(buf); + if (rv < 0) + return rv; + } + } + return 0; +} + int 
ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) { int type = ctx->type; - int ret; + int ret = 0; + AVCodecContext * const avctx = logger(ctx); - ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); - if (ret < 0) - return AVERROR(errno); + // Avoid doing anything if there is nothing we can do + if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon) + return 0; - ctx->streamon = (cmd == VIDIOC_STREAMON); + ff_mutex_lock(&ctx->lock); - return 0; + if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type)) + stuff_all_buffers(avctx, ctx); + + if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) { + const int err = errno; + av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err); + ret = AVERROR(err); + } + else + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); + + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } + + // Both stream off & on effectively clear flag_last + ctx->flag_last = 0; + + ff_mutex_unlock(&ctx->lock); + + return ret; } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) @@ -608,7 +826,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, + const void * extdata, size_t extlen) { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; @@ -616,8 +835,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); + // Log but otherwise ignore stop failure if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name); + av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret); s->draining = 1; return 0; } @@ -626,8 +846,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); - ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); - if (ret) + ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); + if (ret == AVERROR(ENOMEM)) + av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n", + __func__, pkt->size, avbuf->planes[0].length); + else if (ret) return ret; return ff_v4l2_buffer_enqueue(avbuf); @@ -636,19 +859,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; + int rv; - /* - * timeout=-1 blocks until: - * 1. decoded frame available - * 2. 
an input buffer is ready to be dequeued - */ - avbuf = v4l2_dequeue_v4l2buf(ctx, timeout); - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; - - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0) + return rv; return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); } @@ -656,19 +870,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; + int rv; - /* - * blocks until: - * 1. encoded packet available - * 2. an input buffer ready to be dequeued - */ - avbuf = v4l2_dequeue_v4l2buf(ctx, -1); - if (!avbuf) { - if (ctx->done) - return AVERROR_EOF; - - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) + return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } @@ -702,78 +907,160 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { - return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + int ret; + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + if (ret != 0) + return ret; + + // Check returned size against min size and if smaller have another go + // Only worry about plane[0] as this is meant to enforce limits for + // encoded streams where we might know a bit more about the shape + // than the driver + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { + if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage) + return 0; + ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size; + } + else { + if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage) + return 0; + ctx->format.fmt.pix.sizeimage = ctx->min_buf_size; + } + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format); + return ret; } void ff_v4l2_context_release(V4L2Context* ctx) { int ret; - if (!ctx->buffers) + if (!ctx->bufrefs) 
return; ret = v4l2_release_buffers(ctx); if (ret) av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name); - av_freep(&ctx->buffers); + av_freep(&ctx->bufrefs); + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); + pthread_cond_destroy(&ctx->cond); } -int ff_v4l2_context_init(V4L2Context* ctx) + +static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers) { - V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); struct v4l2_requestbuffers req; - int ret, i; - - if (!v4l2_type_supported(ctx)) { - av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); - return AVERROR_PATCHWELCOME; - } + int ret; + int i; - ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); - if (ret) - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name); + av_assert0(ctx->bufrefs == NULL); memset(&req, 0, sizeof(req)); - req.count = ctx->num_buffers; + req.count = req_buffers; req.memory = V4L2_MEMORY_MMAP; req.type = ctx->type; - ret = ioctl(s->fd, VIDIOC_REQBUFS, &req); - if (ret < 0) { - av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno)); - return AVERROR(errno); + while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) { + if (errno != EINTR) { + ret = AVERROR(errno); + av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret)); + return ret; + } } ctx->num_buffers = req.count; - ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer)); - if (!ctx->buffers) { + ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs)); + if (!ctx->bufrefs) { av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name); - return AVERROR(ENOMEM); + goto fail_release; } - for (i = 0; i < req.count; i++) { - ctx->buffers[i].context = ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); - if (ret < 0) { + ctx->wl_master = ff_weak_link_new(ctx); + if 
(!ctx->wl_master) { + ret = AVERROR(ENOMEM); + goto fail_release; + } + + for (i = 0; i < ctx->num_buffers; i++) { + ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx); + if (ret) { av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); - goto error; + goto fail_release; } } av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat), req.count, - v4l2_get_width(&ctx->format), - v4l2_get_height(&ctx->format), + ff_v4l2_get_format_width(&ctx->format), + ff_v4l2_get_format_height(&ctx->format), V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage, V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); return 0; -error: +fail_release: v4l2_release_buffers(ctx); + av_freep(&ctx->bufrefs); + return ret; +} + +int ff_v4l2_context_init(V4L2Context* ctx) +{ + V4L2m2mContext * const s = ctx_to_m2mctx(ctx); + int ret; + + // It is not valid to reinit a context without a previous release + av_assert0(ctx->bufrefs == NULL); + + if (!v4l2_type_supported(ctx)) { + av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type); + return AVERROR_PATCHWELCOME; + } + + ff_mutex_init(&ctx->lock, NULL); + pthread_cond_init(&ctx->cond, NULL); + atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { + AVHWFramesContext *hwframes; + + ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref); + if (!ctx->frames_ref) { + ret = AVERROR(ENOMEM); + goto fail_unlock; + } + + hwframes = (AVHWFramesContext*)ctx->frames_ref->data; + hwframes->format = AV_PIX_FMT_DRM_PRIME; + hwframes->sw_format = ctx->av_pix_fmt; + hwframes->width = ctx->width != 0 ? 
ctx->width : s->avctx->width; + hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height; + ret = av_hwframe_ctx_init(ctx->frames_ref); + if (ret < 0) + goto fail_unref_hwframes; + } - av_freep(&ctx->buffers); + ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format); + if (ret) { + ret = AVERROR(errno); + av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret)); + goto fail_unref_hwframes; + } + + ret = create_buffers(ctx, ctx->num_buffers); + if (ret < 0) + goto fail_unref_hwframes; + + return 0; +fail_unref_hwframes: + av_buffer_unref(&ctx->frames_ref); +fail_unlock: + ff_mutex_destroy(&ctx->lock); return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h index 22a9532444..a56216e990 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ #include "libavutil/pixfmt.h" #include "libavutil/frame.h" #include "libavutil/buffer.h" +#include "libavutil/thread.h" #include "v4l2_buffers.h" typedef struct V4L2Context { @@ -70,11 +71,18 @@ typedef struct V4L2Context { */ int width, height; AVRational sample_aspect_ratio; + struct v4l2_rect selection; /** - * Indexed array of V4L2Buffers + * If the default size of buffer is less than this then try to + * set to this. */ - V4L2Buffer *buffers; + uint32_t min_buf_size; + + /** + * Indexed array of pointers to V4L2Buffers + */ + AVBufferRef **bufrefs; /** * Readonly after init. @@ -92,6 +100,21 @@ typedef struct V4L2Context { */ int done; + int flag_last; + + /** + * PTS rescale not wanted + * If the PTS is just a dummy frame count then rescale is + * actively harmful + */ + int no_pts_rescale; + + AVBufferRef *frames_ref; + atomic_int q_count; + struct ff_weak_link_master *wl_master; + + AVMutex lock; + pthread_cond_t cond; } V4L2Context; /** @@ -156,7 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. 
* @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. + * AVERROR(ENOSPC) if no buffer availible to put + * the frame in */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); @@ -170,7 +196,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. */ -int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); +int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); /** * Enqueues a buffer to a V4L2Context from an AVFrame diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c index cdfd579810..f14ed0b708 100644 --- a/libavcodec/v4l2_m2m.c +++ b/libavcodec/v4l2_m2m.c @@ -215,13 +215,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n"); /* 2. unmap the capture buffers (v4l2 and ffmpeg): - * we must wait for all references to be released before being allowed - * to queue new buffers. */ - av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n"); - if (atomic_load(&s->refcount)) - while(sem_wait(&s->refsync) == -1 && errno == EINTR); - ff_v4l2_context_release(&s->capture); /* 3. get the new capture format */ @@ -240,7 +234,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s) /* 5. 
complete reinit */ s->draining = 0; - s->reinit = 0; return 0; } @@ -274,7 +267,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s) /* start again now that we know the stream dimensions */ s->draining = 0; - s->reinit = 0; ret = ff_v4l2_context_get_format(&s->output, 0); if (ret) { @@ -328,10 +320,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context) ff_v4l2_context_release(&s->capture); sem_destroy(&s->refsync); - close(s->fd); + if (s->fd != -1) + close(s->fd); av_frame_unref(s->frame); av_frame_free(&s->frame); av_packet_unref(&s->buf_pkt); + av_freep(&s->extdata_data); + + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n"); av_free(s); } @@ -344,6 +340,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) if (!s) return 0; + av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n"); + + if (av_codec_is_decoder(s->avctx->codec)) + av_packet_unref(&s->buf_pkt); + if (s->fd >= 0) { ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); if (ret) @@ -356,7 +357,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) ff_v4l2_context_release(&s->output); + close(s->fd); + s->fd = -1; + s->self_ref = NULL; + // This is only called on avctx close so after this point we don't have that + // Crash sooner if we find we are using it (can still log with avctx = NULL) + s->avctx = NULL; + priv->context = NULL; av_buffer_unref(&priv->context_ref); return 0; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h index b67b216331..19d618698d 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ #include #include "libavcodec/avcodec.h" +#include "libavutil/pixfmt.h" #include "v4l2_context.h" #define container_of(ptr, type, member) ({ \ @@ -38,7 +39,38 @@ #define V4L_M2M_DEFAULT_OPTS \ { "num_output_buffers", "Number of buffers in the output context",\ - OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } + OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } + 
+#define FF_V4L2_M2M_TRACK_SIZE 128 +typedef struct V4L2m2mTrackEl { + int discard; // If we see this buffer its been flushed, so discard + int pending; + int pkt_size; + int64_t pts; + int64_t dts; + int64_t reordered_opaque; + int64_t pkt_pos; + int64_t pkt_duration; + int64_t track_pts; +} V4L2m2mTrackEl; + +typedef struct pts_stats_s +{ + void * logctx; + const char * name; // For debug + unsigned int last_count; + unsigned int last_interval; + int64_t last_pts; + int64_t guess; +} pts_stats_t; + +typedef struct xlat_track_s { + unsigned int track_no; + int64_t last_pts; + int64_t last_pkt_dts; + int64_t last_opaque; + V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; +} xlat_track_t; typedef struct V4L2m2mContext { char devname[PATH_MAX]; @@ -52,7 +84,6 @@ typedef struct V4L2m2mContext { AVCodecContext *avctx; sem_t refsync; atomic_uint refcount; - int reinit; /* null frame/packet received */ int draining; @@ -66,6 +97,33 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; + + AVBufferRef *device_ref; + + /* generate DRM frames */ + int output_drm; + + /* Frame tracking */ + xlat_track_t xlat; + int pending_hw; + int pending_n; + + pts_stats_t pts_stat; + + /* req pkt */ + int req_pkt; + + /* Ext data sent */ + int extdata_sent; + /* Ext data sent in packet - overrides ctx */ + uint8_t * extdata_data; + size_t extdata_size; + +#define FF_V4L2_QUIRK_REINIT_ALWAYS 1 +#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2 + /* Quirks */ + unsigned int quirks; + } V4L2m2mContext; typedef struct V4L2m2mPriv { @@ -76,6 +134,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; + enum AVPixelFormat pix_fmt; } V4L2m2mPriv; /** @@ -129,4 +188,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); + +static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? 
fmt->fmt.pix_mp.width : fmt->fmt.pix.width; +} + +static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height; +} + +static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt) +{ + return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat; +} + +static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx) +{ + return ctx->flag_last; +} + + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c index ab07c0a24a..dd383f31e5 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ #include #include + +#include "libavutil/avassert.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_drm.h" #include "libavutil/pixfmt.h" #include "libavutil/pixdesc.h" #include "libavutil/opt.h" @@ -30,75 +34,107 @@ #include "libavcodec/decode.h" #include "libavcodec/internal.h" +#include "libavcodec/hwaccels.h" +#include "libavcodec/internal.h" +#include "libavcodec/hwconfig.h" + #include "v4l2_context.h" #include "v4l2_m2m.h" #include "v4l2_fmt.h" -static int v4l2_try_start(AVCodecContext *avctx) +// Pick 64 for max last count - that is >1sec at 60fps +#define STATS_LAST_COUNT_MAX 64 +#define STATS_INTERVAL_MAX (1 << 30) + +static int64_t pts_stats_guess(const pts_stats_t * const stats) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; - struct v4l2_selection selection = { 0 }; - int ret; + if (stats->last_pts == AV_NOPTS_VALUE || + stats->last_interval == 0 || + stats->last_count >= STATS_LAST_COUNT_MAX) + return AV_NOPTS_VALUE; + return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; +} - /* 1. 
start the output process */ - if (!output->streamon) { - ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); - if (ret < 0) { - av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); - return ret; +static void pts_stats_add(pts_stats_t * const stats, int64_t pts) +{ + if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { + if (stats->last_count < STATS_LAST_COUNT_MAX) + ++stats->last_count; + return; + } + + if (stats->last_pts != AV_NOPTS_VALUE) { + const int64_t interval = pts - stats->last_pts; + + if (interval < 0 || interval >= STATS_INTERVAL_MAX || + stats->last_count >= STATS_LAST_COUNT_MAX) { + if (stats->last_interval != 0) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", + __func__, stats->name, interval, stats->last_count); + stats->last_interval = 0; + } + else { + const int64_t frame_time = interval / (int64_t)stats->last_count; + + if (frame_time != stats->last_interval) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", + __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); + stats->last_interval = frame_time; } } - if (capture->streamon) + stats->last_pts = pts; + stats->last_count = 1; +} + +static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) +{ + *stats = (pts_stats_t){ + .logctx = logctx, + .name = name, + .last_count = 1, + .last_interval = 0, + .last_pts = AV_NOPTS_VALUE + }; +} + +static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) +{ + int ret; + struct v4l2_decoder_cmd cmd = { + .cmd = V4L2_DEC_CMD_START, + .flags = 0, + }; + + if (s->output.streamon) return 0; - /* 2. 
get the capture format */ - capture->format.type = capture->type; - ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n"); + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); + if (ret != 0) { + av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret)); return ret; } - /* 2.1 update the AVCodecContext */ - avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); - capture->av_pix_fmt = avctx->pix_fmt; - - /* 3. set the crop parameters */ - selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; - selection.r.height = avctx->coded_height; - selection.r.width = avctx->coded_width; - ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection); - if (!ret) { - ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection); - if (ret) { - av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n"); - } else { - av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height); - /* update the size of the resulting frame */ - capture->height = selection.r.height; - capture->width = selection.r.width; - } + // STREAMON should do implicit START so this just for those that don't. + // It is optional so don't worry if it fails + if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) { + ret = AVERROR(errno); + av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret)); } - - /* 4. init the capture context now that we have the capture format */ - if (!capture->buffers) { - ret = ff_v4l2_context_init(capture); - if (ret) { - av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); - return AVERROR(ENOMEM); - } + else { + av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n"); } + return 0; +} - /* 5. 
start the capture process */ - ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); - if (ret) { - av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n"); - return ret; - } +static int v4l2_try_start(AVCodecContext *avctx) +{ + V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context; + int ret; + /* 1. start the output process */ + if ((ret = check_output_streamon(avctx, s)) != 0) + return ret; return 0; } @@ -133,58 +169,637 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } -static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) +{ + return (int64_t)n; +} + +static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) +{ + return (unsigned int)pts; +} + +// FFmpeg requires us to propagate a number of vars from the coded pkt into +// the decoded frame. The only thing that tracks like that in V4L2 stateful +// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no +// guarantees about PTS being unique or specified for every frame so replace +// the supplied PTS with a simple incrementing number and keep a circular +// buffer of all the things we want preserved (including the original PTS) +// indexed by the tracking no. 
+static void +xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt) +{ + int64_t track_pts; + + // Avoid 0 + if (++x->track_no == 0) + x->track_no = 1; + + track_pts = track_to_pts(avctx, x->track_no); + + av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no); + x->last_pkt_dts = avpkt->dts; + x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ + .discard = 0, + .pending = 1, + .pkt_size = avpkt->size, + .pts = avpkt->pts, + .dts = avpkt->dts, + .reordered_opaque = avctx->reordered_opaque, + .pkt_pos = avpkt->pos, + .pkt_duration = avpkt->duration, + .track_pts = track_pts + }; + avpkt->pts = track_pts; +} + +// Returns -1 if we should discard the frame +static int +xlat_pts_out(AVCodecContext *const avctx, + xlat_track_t * const x, + pts_stats_t * const ps, + AVFrame *const frame) +{ + unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; + V4L2m2mTrackEl *const t = x->track_els + n; + if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) + { + av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + frame->pts = AV_NOPTS_VALUE; + frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = x->last_opaque; + frame->pkt_pos = -1; + frame->pkt_duration = 0; + frame->pkt_size = -1; + } + else if (!t->discard) + { + frame->pts = t->pending ? 
t->pts : AV_NOPTS_VALUE; + frame->pkt_dts = x->last_pkt_dts; + frame->reordered_opaque = t->reordered_opaque; + frame->pkt_pos = t->pkt_pos; + frame->pkt_duration = t->pkt_duration; + frame->pkt_size = t->pkt_size; + + x->last_opaque = x->track_els[n].reordered_opaque; + if (frame->pts != AV_NOPTS_VALUE) + x->last_pts = frame->pts; + t->pending = 0; + } + else + { + av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); + return -1; + } + + pts_stats_add(ps, frame->pts); + +#if FF_API_PKT_PTS +FF_DISABLE_DEPRECATION_WARNINGS + frame->pkt_pts = frame->pts; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? + av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", + frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); + return 0; +} + +static void +xlat_flush(xlat_track_t * const x) +{ + unsigned int i; + for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) { + x->track_els[i].pending = 0; + x->track_els[i].discard = 1; + } + x->last_pts = AV_NOPTS_VALUE; +} + +static void +xlat_init(xlat_track_t * const x) +{ + memset(x, 0, sizeof(*x)); + x->last_pts = AV_NOPTS_VALUE; +} + +static int +xlat_pending(const xlat_track_t * const x) +{ + unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; + unsigned int i; + int r = 0; + int64_t now = AV_NOPTS_VALUE; + + for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { + const V4L2m2mTrackEl * const t = x->track_els + n; + + if (!t->pending) + continue; + + if (now == AV_NOPTS_VALUE) + now = t->dts; + + if (t->pts == AV_NOPTS_VALUE || + ((now == AV_NOPTS_VALUE || t->pts <= now) && + (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) + ++r; + } + + // If we never get any ideas about PTS vs DTS allow a lot more buffer + if (now == AV_NOPTS_VALUE) + r -= 
16; + + return r; +} + +static inline int stream_started(const V4L2m2mContext * const s) { + return s->output.streamon; +} + +#define NQ_OK 0 +#define NQ_Q_FULL 1 +#define NQ_SRC_EMPTY 2 +#define NQ_NONE 3 +#define NQ_DRAINING 4 +#define NQ_DEAD 5 + +#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING) +#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE) + +// do_not_get If true then no new packet will be got but status will +// be set appropriately + +// AVERROR_EOF Flushing an already flushed stream +// -ve Error (all errors except EOF are unexpected) +// NQ_OK (0) OK +// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now) +// NQ_SRC_EMPTY Src empty (do not retry) +// NQ_NONE Enqueue not attempted +// NQ_DRAINING At EOS, dQ dest until EOS there too +// NQ_DEAD Not running (do not retry, do not attempt capture dQ) + +static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; - V4L2Context *const capture = &s->capture; - V4L2Context *const output = &s->output; int ret; - if (!s->buf_pkt.size) { - ret = ff_decode_get_packet(avctx, &s->buf_pkt); - if (ret < 0 && ret != AVERROR_EOF) + // If we don't already have a coded packet - get a new one + // We will already have a coded pkt if the output Q was full last time we + // tried to Q it + if (!s->buf_pkt.size && !do_not_get) { + unsigned int i; + + for (i = 0; i < 256; ++i) { + uint8_t * side_data; + size_t side_size; + + ret = ff_decode_get_packet(avctx, &s->buf_pkt); + if (ret != 0) + break; + + // New extradata is the only side-data we understand + side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size); + if (side_data) { + av_log(avctx, AV_LOG_DEBUG, "New extradata\n"); + av_freep(&s->extdata_data); + if ((s->extdata_data = av_malloc(side_size ? 
side_size : 1)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size); + return AVERROR(ENOMEM); + } + memcpy(s->extdata_data, side_data, side_size); + s->extdata_size = side_size; + s->extdata_sent = 0; + } + + if (s->buf_pkt.size != 0) + break; + + if (s->buf_pkt.side_data_elems == 0) { + av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n"); + ret = AVERROR_EOF; + break; + } + + // Retry a side-data only pkt + } + // If i >= 256 something has gone wrong + if (i >= 256) { + av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n"); + return AVERROR(EIO); + } + + if (ret == AVERROR(EAGAIN)) { + if (!stream_started(s)) { + av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); + return NQ_DEAD; + } + return NQ_SRC_EMPTY; + } + + if (ret == AVERROR_EOF) { + // EOF - enter drain mode + av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", + ret, s->buf_pkt.size, stream_started(s), s->draining); + if (!stream_started(s)) { + av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); + s->draining = 1; + s->capture.done = 1; + return AVERROR_EOF; + } + + if (!s->draining) { + // Calling enqueue with an empty pkt starts drain + av_assert0(s->buf_pkt.size == 0); + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); + return ret; + } + } + return NQ_DRAINING; + } + + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret); return ret; + } + + xlat_pts_in(avctx, &s->xlat, &s->buf_pkt); + } + + if (s->draining) { + if (s->buf_pkt.size) { + av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n"); + av_packet_unref(&s->buf_pkt); + } + return NQ_DRAINING; } - if (s->draining) - goto dequeue; + if (!s->buf_pkt.size) + return NQ_NONE; + + if ((ret = check_output_streamon(avctx, s)) != 0) + 
return ret; - ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt); - if (ret < 0 && ret != AVERROR(EAGAIN)) - goto fail; + if (s->extdata_sent) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0); + else if (s->extdata_data) + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size); + else + ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size); - /* if EAGAIN don't unref packet and try to enqueue in the next iteration */ - if (ret != AVERROR(EAGAIN)) + if (ret == AVERROR(EAGAIN)) { + // Out of input buffers - keep packet + ret = NQ_Q_FULL; + } + else { + // In all other cases we are done with this packet av_packet_unref(&s->buf_pkt); + s->extdata_sent = 1; - if (!s->draining) { - ret = v4l2_try_start(avctx); if (ret) { - /* cant recover */ - if (ret != AVERROR(ENOMEM)) - ret = 0; - goto fail; + av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); + return ret; + } + } + + // Start if we haven't + { + const int ret2 = v4l2_try_start(avctx); + if (ret2) { + av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); + ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD; } } -dequeue: - return ff_v4l2_context_dequeue_frame(capture, frame, -1); -fail: - av_packet_unref(&s->buf_pkt); return ret; } +static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) // Waits on ctx->cond while q_count == 0 and streamon is set; returns 0, or an AVERROR if the wait itself fails +{ + int rv = 0; + + ff_mutex_lock(&ctx->lock); + + while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { + if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) { + rv = AVERROR(rv); // pthread_cond_wait() returns the error number directly; it does not set errno + av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); + break; + } + } + + ff_mutex_unlock(&ctx->lock); + return rv; +} + +// Number of frames over what xlat_pending returns that we keep *16 +// This is a min value - if it appears to be too small the threshold should +// adjust dynamically. 
+#define PENDING_HW_MIN (3 * 16) +// Offset to use when setting dynamically +// Set to %16 == 15 to avoid the threshold changing immediately as we relax +#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) +// Number of consecutive times we've failed to get a frame when we prefer it +// before we increase the prefer threshold (5ms * N = max expected decode +// time) +#define PENDING_N_THRESHOLD 6 + +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; + int src_rv = NQ_OK; + int dst_rv = 1; // Non-zero (done), non-negative (error) number + unsigned int i = 0; + + do { + const int pending = xlat_pending(&s->xlat); + const int prefer_dq = (pending > s->pending_hw / 16); + const int last_src_rv = src_rv; + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR + // (b) ... we (think we) do but we've failed to get a frame already OR + // (c) We've dequeued a lot of frames without asking for input + src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2)); + + // If we got a frame last time or we've already tried to get a frame and + // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN) + // indicating that we want more input. + // This should mean that once decode starts we enter a stable state where + // we alternately ask for input and produce output + if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY) + break; + + if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) { + av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n"); + break; + } + + // Try to get a new frame if + // (a) we haven't already got one AND + // (b) enqueue returned a status indicating that decode should be attempted + if (dst_rv != 0 && TRY_DQ(src_rv)) { + // Pick a timeout depending on state + const int t = + src_rv == NQ_DRAINING ? 300 : + prefer_dq ? 5 : + src_rv == NQ_Q_FULL ? 
-1 : 0; + + do { + // Dequeue frame will unref any previous contents of frame + // if it returns success so we don't need an explicit unref + // when discarding + // This returns AVERROR(EAGAIN) on timeout or if + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + + // Failure due to no buffer in Q? + if (dst_rv == AVERROR(ENOSPC)) { + // Wait & retry + if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + } + } + + // Adjust dynamic pending threshold + if (dst_rv == 0) { + if (--s->pending_hw < PENDING_HW_MIN) + s->pending_hw = PENDING_HW_MIN; + s->pending_n = 0; + } + else if (dst_rv == AVERROR(EAGAIN)) { + if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { + s->pending_hw = pending * 16 + PENDING_HW_OFFSET; + s->pending_n = 0; + } + } + + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF\n"); + dst_rv = AVERROR_EOF; + s->capture.done = 1; + } + else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done)) + av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n", + s->draining, s->capture.done); + else if (dst_rv && dst_rv != AVERROR(EAGAIN)) + av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", + s->draining, s->capture.done, dst_rv); + + // Go again if we got a frame that we need to discard + } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame)); + } + + ++i; + if (i >= 256) { + av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %u\n", i); + src_rv = AVERROR(EIO); + } + + // Continue trying to enqueue packets if either + // (a) we succeeded last time OR + // (b) we didn't ret a frame and we can retry the input + } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv))); + + // Ensure that the frame contains nothing if we aren't returning a frame + // 
(might happen when discarding) + if (dst_rv) + av_frame_unref(frame); + + // If we got a frame this time ask for a pkt next time + s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0; + +#if 0 + if (dst_rv == 0) + { + static int z = 0; + if (++z > 50) { + av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n"); + ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + return -1; + } + } +#endif + + return dst_rv == 0 ? 0 : + src_rv < 0 ? src_rv : + dst_rv < 0 ? dst_rv : + AVERROR(EAGAIN); +} + +#if 0 +#include +static int64_t us_time(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; +} + +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + int ret; + const int64_t now = us_time(); + int64_t done; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + ret = v4l2_receive_frame2(avctx, frame); + done = us_time(); + av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); + return ret; +} +#endif + +static int +check_size(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + unsigned int i; + const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format); + const uint32_t w = avctx->coded_width; + const uint32_t h = avctx->coded_height; + + if (w == 0 || h == 0 || fcc == 0) { + av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc)); + return 0; + } + if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { + av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc)); + return 0; + } + + for (i = 0;; ++i) { + struct v4l2_frmsizeenum fs = { + .index = i, + .pixel_format = fcc, + }; + + while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) { + const int err = AVERROR(errno); + if (err == AVERROR(EINTR)) + continue; + if (i == 0 && err == AVERROR(ENOTTY)) { + av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n"); + 
return 0; + } + if (err != AVERROR(EINVAL)) { + av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err)); + return err; + } + av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n", + w, h, av_fourcc2str(fcc), i); + return err; + } + + switch (fs.type) { + case V4L2_FRMSIZE_TYPE_DISCRETE: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i, + fs.discrete.width,fs.discrete.height); + if (w == fs.discrete.width && h == fs.discrete.height) + return 0; + break; + case V4L2_FRMSIZE_TYPE_STEPWISE: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, + fs.stepwise.min_width, fs.stepwise.min_height, + fs.stepwise.max_width, fs.stepwise.max_height, + fs.stepwise.step_width,fs.stepwise.step_height); + if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && + h >= fs.stepwise.min_height && h <= fs.stepwise.max_height && + (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 && + (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0) + return 0; + break; + case V4L2_FRMSIZE_TYPE_CONTINUOUS: + av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i, + fs.stepwise.min_width, fs.stepwise.min_height, + fs.stepwise.max_width, fs.stepwise.max_height, + fs.stepwise.step_width,fs.stepwise.step_height); + if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && + h >= fs.stepwise.min_height && h <= fs.stepwise.max_height) + return 0; + break; + default: + av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type); + return AVERROR(EINVAL); + } + } +} + +static int +get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s) +{ + struct v4l2_capability cap; + + memset(&cap, 0, sizeof(cap)); + while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) { + int err = errno; + if (err == EINTR) + continue; + av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err)); 
+ return AVERROR(err); + } + + // Could be made table driven if we have a few more but right now there + // seems no point + + // Meson (amlogic) always gives a resolution changed event after output + // streamon and userspace must (re)allocate capture buffers and streamon + // capture to clear the event even if the capture buffers were the right + // size in the first place. + if (strcmp(cap.driver, "meson-vdec") == 0) + s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; + + av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks); + return 0; +} + +// This heuristic is for H264 but use for everything +static uint32_t max_coded_size(const AVCodecContext * const avctx) +{ + uint32_t wxh = avctx->coded_width * avctx->coded_height; + uint32_t size; + + size = wxh * 3 / 2; + // H.264 Annex A table A-1 gives minCR which is either 2 or 4 + // unfortunately that doesn't yield an actually useful limit + // and it should be noted that frame 0 is special cased to allow + // a bigger number which really isn't helpful for us. 
So just pick + // frame_size / 2 + size /= 2; + // Add 64k to allow for any overheads and/or encoder hopefulness + // with small WxH + return size + (1 << 16); +} + static av_cold int v4l2_decode_init(AVCodecContext *avctx) { V4L2Context *capture, *output; V4L2m2mContext *s; V4L2m2mPriv *priv = avctx->priv_data; + int gf_pix_fmt; int ret; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + + if (avctx->codec_id == AV_CODEC_ID_H264) { + if (avctx->ticks_per_frame == 1) { + if(avctx->time_base.den < INT_MAX/2) { + avctx->time_base.den *= 2; + } else + avctx->time_base.num /= 2; + } + avctx->ticks_per_frame = 2; + } + + av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level); ret = ff_v4l2_m2m_create_context(priv, &s); if (ret < 0) return ret; + xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); + s->pending_hw = PENDING_HW_MIN; + capture = &s->capture; output = &s->output; @@ -192,14 +807,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. */ - output->height = capture->height = avctx->coded_height; - output->width = capture->width = avctx->coded_width; +// output->height = capture->height = avctx->coded_height; +// output->width = capture->width = avctx->coded_width; + output->height = capture->height = 0; + output->width = capture->width = 0; output->av_codec_id = avctx->codec_id; output->av_pix_fmt = AV_PIX_FMT_NONE; + output->min_buf_size = max_coded_size(avctx); + output->no_pts_rescale = 1; capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; capture->av_pix_fmt = avctx->pix_fmt; + capture->min_buf_size = 0; + capture->no_pts_rescale = 1; + + /* the client requests the codec to generate DRM frames: + * - data[0] will therefore point to the returned AVDRMFrameDescriptor + * check the ff_v4l2_buffer_to_avframe conversion function. + * - the DRM frame format is passed in the DRM frame descriptor layer. 
+ * check the v4l2_get_drm_frame function. + */ + + avctx->sw_pix_fmt = avctx->pix_fmt; + gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts); + av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n", + avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), + avctx->coded_width, avctx->coded_height, + gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt)); + + if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) { + avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; + s->output_drm = 1; + } + else { + capture->av_pix_fmt = gf_pix_fmt; + s->output_drm = 0; + } + + s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); + if (!s->device_ref) { + ret = AVERROR(ENOMEM); + return ret; + } + + ret = av_hwdevice_ctx_init(s->device_ref); + if (ret < 0) + return ret; s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); @@ -208,12 +862,74 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } - return v4l2_prepare_decoder(s); + if ((ret = v4l2_prepare_decoder(s)) < 0) + return ret; + + if ((ret = get_quirks(avctx, s)) != 0) + return ret; + + if ((ret = check_size(avctx, s)) != 0) + return ret; + + return 0; } static av_cold int v4l2_decode_close(AVCodecContext *avctx) { - return ff_v4l2_m2m_codec_end(avctx->priv_data); + int rv; + av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); + rv = ff_v4l2_m2m_codec_end(avctx->priv_data); + av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv); + return rv; +} + +static void v4l2_decode_flush(AVCodecContext *avctx) +{ + // An alternative and more drastic form of flush is to simply do this: + // v4l2_decode_close(avctx); + // v4l2_decode_init(avctx); + // The downside is that this keeps a decoder open until all the frames + // associated with it have been returned. This is a bit wasteful on + // possibly limited h/w resources and fails on a Pi for this reason unless + // more GPU mem is allocated than is the default. 
+ + V4L2m2mPriv * const priv = avctx->priv_data; + V4L2m2mContext * const s = priv->context; + V4L2Context * const output = &s->output; + V4L2Context * const capture = &s->capture; + + av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon); + + // Reflushing everything is benign, quick and avoids having to worry about + // states like EOS processing so don't try to optimize out (having got it + // wrong once) + + ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); + + // Clear any buffered input packet + av_packet_unref(&s->buf_pkt); + + // Clear a pending EOS + if (ff_v4l2_ctx_eos(capture)) { + // Arguably we could delay this but this is easy and doesn't require + // thought or extra vars + ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); + ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); + } + + // V4L2 makes no guarantees about whether decoded frames are flushed or not + // so mark all frames we are tracking to be discarded if they appear + xlat_flush(&s->xlat); + + // resend extradata + s->extdata_sent = 0; + // clear EOS status vars + s->draining = 0; + output->done = 0; + capture->done = 0; + + // Stream on will occur when we actually submit a new frame + av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); } #define OFFSET(x) offsetof(V4L2m2mPriv, x) @@ -222,10 +938,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", - OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, + OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, + { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, { NULL}, }; +static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + HW_CONFIG_INTERNAL(DRM_PRIME), + NULL +}; + #define 
M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ @@ -246,9 +968,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ + .flush = v4l2_decode_flush, \ .bsfs = bsf_name, \ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ + .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ + AV_PIX_FMT_NV12, \ + AV_PIX_FMT_YUV420P, \ + AV_PIX_FMT_NONE}, \ + .hw_configs = v4l2_m2m_hw_configs, \ .wrapper_name = "v4l2m2m", \ } diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c new file mode 100644 index 0000000000..5b3fb958fa --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.c @@ -0,0 +1,84 @@ +#include +#include +#include + +#include "v4l2_req_decode_q.h" + +int decode_q_in_q(const req_decode_ent * const d) +{ + return d->in_q; +} + +void decode_q_add(req_decode_q * const q, req_decode_ent * const d) +{ + pthread_mutex_lock(&q->q_lock); + if (!q->head) { + q->head = d; + q->tail = d; + d->prev = NULL; + } + else { + q->tail->next = d; + d->prev = q->tail; + q->tail = d; + } + d->next = NULL; + d->in_q = 1; + pthread_mutex_unlock(&q->q_lock); +} + +// Remove entry from Q - if head wake-up anything that was waiting +void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) +{ + int try_signal = 0; + + if (!d->in_q) + return; + + pthread_mutex_lock(&q->q_lock); + if (d->prev) + d->prev->next = d->next; + else { + try_signal = 1; // Only need to signal if we were head + q->head = d->next; + } + + if (d->next) + d->next->prev = d->prev; + else + q->tail = d->prev; + + // Not strictly needed but makes debug easier + d->next = NULL; + d->prev = NULL; + d->in_q = 0; + pthread_mutex_unlock(&q->q_lock); + + if (try_signal) + pthread_cond_broadcast(&q->q_cond); +} + +void 
decode_q_wait(req_decode_q * const q, req_decode_ent * const d) +{ + pthread_mutex_lock(&q->q_lock); + + while (q->head != d) + pthread_cond_wait(&q->q_cond, &q->q_lock); + + pthread_mutex_unlock(&q->q_lock); +} + +void decode_q_uninit(req_decode_q * const q) +{ + pthread_mutex_destroy(&q->q_lock); + pthread_cond_destroy(&q->q_cond); +} + +void decode_q_init(req_decode_q * const q) +{ + memset(q, 0, sizeof(*q)); + pthread_mutex_init(&q->q_lock, NULL); + pthread_cond_init(&q->q_cond, NULL); +} + + diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h new file mode 100644 index 0000000000..af7bbe1de4 --- /dev/null +++ b/libavcodec/v4l2_req_decode_q.h @@ -0,0 +1,25 @@ +#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H +#define AVCODEC_V4L2_REQ_DECODE_Q_H + +typedef struct req_decode_ent { + struct req_decode_ent * next; + struct req_decode_ent * prev; + int in_q; +} req_decode_ent; + +typedef struct req_decode_q { + pthread_mutex_t q_lock; + pthread_cond_t q_cond; + req_decode_ent * head; + req_decode_ent * tail; +} req_decode_q; + +int decode_q_in_q(const req_decode_ent * const d); +void decode_q_add(req_decode_q * const q, req_decode_ent * const d); +void decode_q_remove(req_decode_q * const q, req_decode_ent * const d); +void decode_q_wait(req_decode_q * const q, req_decode_ent * const d); +void decode_q_uninit(req_decode_q * const q); +void decode_q_init(req_decode_q * const q); + +#endif + diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c new file mode 100644 index 0000000000..cfa94d55c4 --- /dev/null +++ b/libavcodec/v4l2_req_devscan.c @@ -0,0 +1,449 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "v4l2_req_devscan.h" +#include "v4l2_req_utils.h" + +struct decdev { + enum v4l2_buf_type src_type; + uint32_t src_fmt_v4l2; + const char * vname; + const char * mname; +}; + +struct devscan { + struct decdev env; + unsigned int dev_size; + unsigned int dev_count; + 
struct decdev *devs; +}; + +static int video_src_pixfmt_supported(uint32_t fmt) +{ + return 1; +} + +static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, + unsigned int width, unsigned int height, + unsigned int pixelformat) +{ + unsigned int sizeimage; + + memset(format, 0, sizeof(*format)); + format->type = type; + + sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; + + if (V4L2_TYPE_IS_MULTIPLANAR(type)) { + format->fmt.pix_mp.width = width; + format->fmt.pix_mp.height = height; + format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage; + format->fmt.pix_mp.pixelformat = pixelformat; + } else { + format->fmt.pix.width = width; + format->fmt.pix.height = height; + format->fmt.pix.sizeimage = sizeimage; + format->fmt.pix.pixelformat = pixelformat; + } +} + +static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, + unsigned int width, unsigned int height) +{ + struct v4l2_format format; + + v4l2_setup_format(&format, type, width, height, pixelformat); + + return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0; +} + +static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) +{ + struct v4l2_capability capability = { 0 }; + int rc; + + rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability); + if (rc < 0) + return -errno; + + if (capabilities != NULL) { + if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0) + *capabilities = capability.device_caps; + else + *capabilities = capability.capabilities; + } + + return 0; +} + +static int devscan_add(struct devscan *const scan, + enum v4l2_buf_type src_type, + uint32_t src_fmt_v4l2, + const char * vname, + const char * mname) +{ + struct decdev *d; + + if (scan->dev_size <= scan->dev_count) { + unsigned int n = !scan->dev_size ? 
4 : scan->dev_size * 2; + d = realloc(scan->devs, n * sizeof(*d)); + if (!d) + return -ENOMEM; + scan->devs = d; + scan->dev_size = n; + } + + d = scan->devs + scan->dev_count; + d->src_type = src_type; + d->src_fmt_v4l2 = src_fmt_v4l2; + d->vname = strdup(vname); + if (!d->vname) + return -ENOMEM; + d->mname = strdup(mname); + if (!d->mname) { + free((char *)d->vname); + return -ENOMEM; + } + ++scan->dev_count; + return 0; +} + +void devscan_delete(struct devscan **const pScan) +{ + unsigned int i; + struct devscan * const scan = *pScan; + + if (!scan) + return; + *pScan = NULL; + + for (i = 0; i < scan->dev_count; ++i) { + free((char*)scan->devs[i].mname); + free((char*)scan->devs[i].vname); + } + free(scan->devs); + free(scan); +} + +#define REQ_BUF_CAPS (\ + V4L2_BUF_CAP_SUPPORTS_DMABUF |\ + V4L2_BUF_CAP_SUPPORTS_REQUESTS |\ + V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) + +static void probe_formats(void * const dc, + struct devscan *const scan, + const int fd, + const unsigned int type_v4l2, + const char *const mpath, + const char *const vpath) +{ + unsigned int i; + for (i = 0;; ++i) { + struct v4l2_fmtdesc fmtdesc = { + .index = i, + .type = type_v4l2 + }; + struct v4l2_requestbuffers rbufs = { + .count = 0, + .type = type_v4l2, + .memory = V4L2_MEMORY_MMAP + }; + while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { + if (errno == EINTR) + continue; + if (errno != EINVAL) + request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2); + return; + } + if (!video_src_pixfmt_supported(fmtdesc.pixelformat)) + continue; + + if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { + request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat); + continue; + } + + while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { + if (errno != EINTR) { + request_debug(dc, "%s: Reqbufs failed\n", vpath); + continue; + } + } + + if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { + request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, 
rbufs.capabilities); + continue; + } + + request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n", + mpath, vpath, fmtdesc.pixelformat, type_v4l2); + devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); + } +} + + +static int probe_video_device(void * const dc, + struct udev_device *const device, + struct devscan *const scan, + const char *const mpath) +{ + int ret; + unsigned int capabilities = 0; + int video_fd = -1; + + const char *path = udev_device_get_devnode(device); + if (!path) { + request_err(dc, "%s: get video device devnode failed\n", __func__); + ret = -EINVAL; + goto fail; + } + + video_fd = open(path, O_RDWR, 0); + if (video_fd == -1) { + ret = -errno; + request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); + goto fail; + } + + ret = v4l2_query_capabilities(video_fd, &capabilities); + if (ret < 0) { + request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities); + + if (!(capabilities & V4L2_CAP_STREAMING)) { + request_debug(dc, "%s: missing required streaming capability\n", __func__); + ret = -EINVAL; + goto fail; + } + + if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { + request_debug(dc, "%s: missing required mem2mem capability\n", __func__); + ret = -EINVAL; + goto fail; + } + + /* Should check capture formats too... 
*/ + if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0) + probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path); + if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) + probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path); + + close(video_fd); + return 0; + +fail: + if (video_fd >= 0) + close(video_fd); + return ret; +} + +static int probe_media_device(void * const dc, + struct udev_device *const device, + struct devscan *const scan) +{ + int ret; + int rv; + struct media_device_info device_info = { 0 }; + struct media_v2_topology topology = { 0 }; + struct media_v2_interface *interfaces = NULL; + struct udev *udev = udev_device_get_udev(device); + struct udev_device *video_device; + dev_t devnum; + int media_fd = -1; + + const char *path = udev_device_get_devnode(device); + if (!path) { + request_err(dc, "%s: get media device devnode failed\n", __func__); + ret = -EINVAL; + goto fail; + } + + media_fd = open(path, O_RDWR, 0); + if (media_fd < 0) { + ret = -errno; + request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret); + goto fail; + } + + rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + if (topology.num_interfaces <= 0) { + request_err(dc, "%s: media device has no interfaces\n", __func__); + ret = -EINVAL; + goto fail; + } + + interfaces = calloc(topology.num_interfaces, sizeof(*interfaces)); + if (!interfaces) { + request_err(dc, "%s: allocating media interface struct failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; + rv = ioctl(media_fd, 
MEDIA_IOC_G_TOPOLOGY, &topology); + if (rv < 0) { + ret = -errno; + request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret); + goto fail; + } + + for (int i = 0; i < topology.num_interfaces; i++) { + if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) + continue; + + devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); + video_device = udev_device_new_from_devnum(udev, 'c', devnum); + if (!video_device) { + ret = -errno; + request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device); + continue; + } + + ret = probe_video_device(dc, video_device, scan, path); + udev_device_unref(video_device); + + if (ret != 0) + goto fail; + } + +fail: + free(interfaces); + if (media_fd != -1) + close(media_fd); + return ret; +} + +const char *decdev_media_path(const struct decdev *const dev) +{ + return !dev ? NULL : dev->mname; +} + +const char *decdev_video_path(const struct decdev *const dev) +{ + return !dev ? NULL : dev->vname; +} + +enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) +{ + return !dev ? 0 : dev->src_type; +} + +uint32_t decdev_src_pixelformat(const struct decdev *const dev) +{ + return !dev ? 0 : dev->src_fmt_v4l2; +} + + +const struct decdev *devscan_find(struct devscan *const scan, + const uint32_t src_fmt_v4l2) +{ + unsigned int i; + + if (scan->env.mname && scan->env.vname) + return &scan->env; + + if (!src_fmt_v4l2) + return scan->dev_count ? 
scan->devs + 0 : NULL; + + for (i = 0; i != scan->dev_count; ++i) { + if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2) + return scan->devs + i; + } + return NULL; +} + +int devscan_build(void * const dc, struct devscan **pscan) +{ + int ret; + struct udev *udev; + struct udev_enumerate *enumerate; + struct udev_list_entry *devices; + struct udev_list_entry *entry; + struct udev_device *device; + struct devscan * scan; + + *pscan = NULL; + + scan = calloc(1, sizeof(*scan)); + if (!scan) { + ret = -ENOMEM; + goto fail; + } + + scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH"); + scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH"); + if (scan->env.mname && scan->env.vname) { + request_info(dc, "Media/video device env overrides found: %s,%s\n", + scan->env.mname, scan->env.vname); + *pscan = scan; + return 0; + } + + udev = udev_new(); + if (!udev) { + request_err(dc, "%s: allocating udev context failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + enumerate = udev_enumerate_new(udev); + if (!enumerate) { + request_err(dc, "%s: allocating udev enumerator failed\n", __func__); + ret = -ENOMEM; + goto fail; + } + + udev_enumerate_add_match_subsystem(enumerate, "media"); + udev_enumerate_scan_devices(enumerate); + + devices = udev_enumerate_get_list_entry(enumerate); + udev_list_entry_foreach(entry, devices) { + const char *path = udev_list_entry_get_name(entry); + if (!path) + continue; + + device = udev_device_new_from_syspath(udev, path); + if (!device) + continue; + + probe_media_device(dc, device, scan); + udev_device_unref(device); + } + + udev_enumerate_unref(enumerate); + + *pscan = scan; + return 0; + +fail: + udev_unref(udev); + devscan_delete(&scan); + return ret; +} + diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h new file mode 100644 index 0000000000..0baef36535 --- /dev/null +++ b/libavcodec/v4l2_req_devscan.h @@ -0,0 +1,21 @@ +#ifndef _DEVSCAN_H_ +#define _DEVSCAN_H_ + +struct devscan; +struct decdev; +enum 
v4l2_buf_type; + +/* These return pointers to data in the devscan structure and so are valid + * for the lifetime of that + */ +const char *decdev_media_path(const struct decdev *const dev); +const char *decdev_video_path(const struct decdev *const dev); +enum v4l2_buf_type decdev_src_type(const struct decdev *const dev); +uint32_t decdev_src_pixelformat(const struct decdev *const dev); + +const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2); + +int devscan_build(void * const dc, struct devscan **pscan); +void devscan_delete(struct devscan **const pScan); + +#endif diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c new file mode 100644 index 0000000000..ae6c648369 --- /dev/null +++ b/libavcodec/v4l2_req_dmabufs.c @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_utils.h" + +#define DMABUF_NAME1 "/dev/dma_heap/linux,cma" +#define DMABUF_NAME2 "/dev/dma_heap/reserved" + +#define TRACE_ALLOC 0 + +struct dmabufs_ctl { + int fd; + size_t page_size; +}; + +struct dmabuf_h { + int fd; + size_t size; + size_t len; + void * mapptr; +}; + +#if TRACE_ALLOC +static unsigned int total_bufs = 0; +static size_t total_size = 0; +#endif + +struct dmabuf_h * dmabuf_import(int fd, size_t size) +{ + struct dmabuf_h *dh; + + fd = dup(fd); + if (fd < 0 || size == 0) + return NULL; + + dh = malloc(sizeof(*dh)); + if (!dh) { + close(fd); + return NULL; + } + + *dh = (struct dmabuf_h) { + .fd = fd, + .size = size, + .mapptr = MAP_FAILED + }; + +#if TRACE_ALLOC + ++total_bufs; + total_size += dh->size; + request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + return dh; +} + +struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) +{ + struct dmabuf_h * dh; + struct dma_heap_allocation_data data = { 
+ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), + .fd = 0, + .fd_flags = O_RDWR, + .heap_flags = 0 + }; + + if (old != NULL) { + if (old->size == data.len) { + return old; + } + dmabuf_free(old); + } + + if (size == 0 || + (dh = malloc(sizeof(*dh))) == NULL) + return NULL; + + while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) { + int err = errno; + request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", + (uint64_t)data.len, + dbsc->fd, + err, + strerror(err)); + if (err == EINTR) + continue; + goto fail; + } + + *dh = (struct dmabuf_h){ + .fd = data.fd, + .size = (size_t)data.len, + .mapptr = MAP_FAILED + }; + +#if TRACE_ALLOC + ++total_bufs; + total_size += dh->size; + request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + return dh; + +fail: + free(dh); + return NULL; +} + +int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) +{ + struct dma_buf_sync sync = { + .flags = flags + }; + while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) { + const int err = errno; + if (errno == EINTR) + continue; + request_log("%s: ioctl failed: flags=%#x\n", __func__, flags); + return -err; + } + return 0; +} + +int dmabuf_write_start(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE); +} + +int dmabuf_write_end(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE); +} + +int dmabuf_read_start(struct dmabuf_h * const dh) +{ + if (!dmabuf_map(dh)) + return -1; + return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ); +} + +int dmabuf_read_end(struct dmabuf_h * const dh) +{ + return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ); +} + + +void * dmabuf_map(struct dmabuf_h * const dh) +{ + if (!dh) + return NULL; + if (dh->mapptr != MAP_FAILED) + return dh->mapptr; + dh->mapptr = mmap(NULL, dh->size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + dh->fd, 0); + if 
(dh->mapptr == MAP_FAILED) { + request_log("%s: Map failed\n", __func__); + return NULL; + } + return dh->mapptr; +} + +int dmabuf_fd(const struct dmabuf_h * const dh) +{ + if (!dh) + return -1; + return dh->fd; +} + +size_t dmabuf_size(const struct dmabuf_h * const dh) +{ + if (!dh) + return 0; + return dh->size; +} + +size_t dmabuf_len(const struct dmabuf_h * const dh) +{ + if (!dh) + return 0; + return dh->len; +} + +void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) +{ + dh->len = len; +} + + + +void dmabuf_free(struct dmabuf_h * dh) +{ + if (!dh) + return; + +#if TRACE_ALLOC + --total_bufs; + total_size -= dh->size; + request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs); +#endif + + if (dh->mapptr != MAP_FAILED) + munmap(dh->mapptr, dh->size); + while (close(dh->fd) == -1 && errno == EINTR) + /* loop */; + free(dh); +} + +struct dmabufs_ctl * dmabufs_ctl_new(void) +{ + struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc)); + + if (!dbsc) + return NULL; + + while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && + errno == EINTR) + /* Loop */; + + if (dbsc->fd == -1) { + while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 && + errno == EINTR) + /* Loop */; + if (dbsc->fd == -1) { + request_log("Unable to open either %s or %s\n", + DMABUF_NAME1, DMABUF_NAME2); + goto fail; + } + } + + dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return dbsc; + +fail: + free(dbsc); + return NULL; +} + +void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) +{ + struct dmabufs_ctl * const dbsc = *pDbsc; + + if (!dbsc) + return; + *pDbsc = NULL; + + while (close(dbsc->fd) == -1 && errno == EINTR) + /* loop */; + + free(dbsc); +} + + diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h new file mode 100644 index 0000000000..8d909c4297 --- /dev/null +++ b/libavcodec/v4l2_req_dmabufs.h @@ -0,0 +1,38 @@ +#ifndef DMABUFS_H +#define DMABUFS_H + +struct dmabufs_ctl; +struct dmabuf_h; + +struct 
dmabufs_ctl * dmabufs_ctl_new(void); +void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc); + +// Need not preserve old contents +// On NULL return old buffer is freed +struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size); + +static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) { + return dmabuf_realloc(dbsc, NULL, size); +} +/* Create from existing fd - dups(fd) */ +struct dmabuf_h * dmabuf_import(int fd, size_t size); +void * dmabuf_map(struct dmabuf_h * const dh); + +/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */ +int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags); + +int dmabuf_write_start(struct dmabuf_h * const dh); +int dmabuf_write_end(struct dmabuf_h * const dh); +int dmabuf_read_start(struct dmabuf_h * const dh); +int dmabuf_read_end(struct dmabuf_h * const dh); + +int dmabuf_fd(const struct dmabuf_h * const dh); +/* Allocated size */ +size_t dmabuf_size(const struct dmabuf_h * const dh); +/* Bytes in use */ +size_t dmabuf_len(const struct dmabuf_h * const dh); +/* Set bytes in use */ +void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len); +void dmabuf_free(struct dmabuf_h * dh); + +#endif diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c new file mode 100644 index 0000000000..169b532832 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v1.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 1 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c new file mode 100644 index 0000000000..42af98e156 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v2.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 2 +#include "v4l2_req_hevc_vx.c" + diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c new file mode 100644 index 0000000000..dcc8d95632 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_v3.c @@ -0,0 +1,3 @@ +#define HEVC_CTRLS_VERSION 3 +#include "v4l2_req_hevc_vx.c" + diff --git 
a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c new file mode 100644 index 0000000000..55c41ae679 --- /dev/null +++ b/libavcodec/v4l2_req_hevc_vx.c @@ -0,0 +1,1228 @@ +// File included by v4l2_req_hevc_v* - not compiled on its own + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" + +#include "v4l2_request_hevc.h" + +#if HEVC_CTRLS_VERSION == 1 +#include "hevc-ctrls-v1.h" + +// Fixup renamed entries +#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT + +#elif HEVC_CTRLS_VERSION == 2 +#include "hevc-ctrls-v2.h" +#elif HEVC_CTRLS_VERSION == 3 +#include "hevc-ctrls-v3.h" +#else +#error Unknown HEVC_CTRLS_VERSION +#endif + +#include "libavutil/hwcontext_drm.h" + +#include +#include + +#include "v4l2_req_devscan.h" +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_media.h" +#include "v4l2_req_utils.h" + +// Attached to buf[0] in frame +// Pooled in hwcontext so generally create once - 1/frame +typedef struct V4L2MediaReqDescriptor { + AVDRMFrameDescriptor drm; + + // Media + uint64_t timestamp; + struct qent_dst * qe_dst; + + // Decode only - should be NULL by the time we emit the frame + struct req_decode_ent decode_ent; + + struct media_request *req; + struct qent_src *qe_src; + +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params dec; +#endif + + size_t num_slices; + size_t alloced_slices; + struct v4l2_ctrl_hevc_slice_params * slice_params; + struct slice_info * slices; + +} V4L2MediaReqDescriptor; + +struct slice_info { + const uint8_t * ptr; + size_t len; // bytes +}; + +// Handy container for accumulating controls before setting +struct req_controls { + int has_scaling; + struct timeval tv; + struct v4l2_ctrl_hevc_sps sps; + struct v4l2_ctrl_hevc_pps pps; + struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; +}; + +//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; + + +// Get an FFmpeg format from the v4l2 format 
+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format) +{ + switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? + format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) { + case V4L2_PIX_FMT_YUV420: + return AV_PIX_FMT_YUV420P; + case V4L2_PIX_FMT_NV12: + return AV_PIX_FMT_NV12; +#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + return AV_PIX_FMT_RPI4_8; + case V4L2_PIX_FMT_NV12_10_COL128: + return AV_PIX_FMT_RPI4_10; +#endif + default: + break; + } + return AV_PIX_FMT_NONE; +} + +static inline uint64_t frame_capture_dpb(const AVFrame * const frame) +{ + const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; + return rd->timestamp; +} + +static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp) +{ + V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0]; + rd->timestamp = dpb_stamp; +} + +static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) +{ + int32_t luma_weight_denom, chroma_weight_denom; + const SliceHeader *sh = &h->sh; + + if (sh->slice_type == HEVC_SLICE_I || + (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || + (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) + return; + + table->luma_log2_weight_denom = sh->luma_log2_weight_denom; + + if (h->ps.sps->chroma_format_idc) + table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; + + luma_weight_denom = (1 << sh->luma_log2_weight_denom); + chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); + + for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { + table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; + table->luma_offset_l0[i] = sh->luma_offset_l0[i]; + table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; + table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; + 
table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; + table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; + } + + if (sh->slice_type != HEVC_SLICE_B) + return; + + for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { + table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; + table->luma_offset_l1[i] = sh->luma_offset_l1[i]; + table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; + table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; + table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; + table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; + } +} + +#if HEVC_CTRLS_VERSION <= 2 +static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) +{ + const HEVCFrame *frame; + int i; + + for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { + frame = h->rps[ST_CURR_BEF].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; + } + + for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { + frame = h->rps[ST_CURR_AFT].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; + } + + for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { + frame = h->rps[LT_CURR].ref[i]; + if (frame && timestamp == frame_capture_dpb(frame->frame)) + return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; + } + + return 0; +} +#endif + +static unsigned int +get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, + const struct v4l2_hevc_dpb_entry * const entries, + const unsigned int num_entries) +{ + uint64_t timestamp; + + if (!frame) + return 0; + + timestamp = frame_capture_dpb(frame->frame); + + for (unsigned int i = 0; i < num_entries; i++) { + if (entries[i].timestamp == timestamp) + return i; + } + + return 0; +} + +static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) +{ + unsigned int z = 0; + while (idx--) { + if (*b++ == 0) { + 
++z; + if (z >= 2 && *b == 3) { + ++b; + z = 0; + } + } + else { + z = 0; + } + } + return b; +} + +static int slice_add(V4L2MediaReqDescriptor * const rd) +{ + if (rd->num_slices >= rd->alloced_slices) { + struct v4l2_ctrl_hevc_slice_params * p2; + struct slice_info * s2; + size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2; + + p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2)); + if (p2 == NULL) + return AVERROR(ENOMEM); + rd->slice_params = p2; + + s2 = av_realloc_array(rd->slices, n2, sizeof(*s2)); + if (s2 == NULL) + return AVERROR(ENOMEM); + rd->slices = s2; + + rd->alloced_slices = n2; + } + ++rd->num_slices; + return 0; +} + +static unsigned int +fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries) +{ + unsigned int i; + unsigned int n = 0; + const HEVCFrame * const pic = h->ref; + + for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { + const HEVCFrame * const frame = &h->DPB[i]; + if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { + struct v4l2_hevc_dpb_entry * const entry = entries + n++; + + entry->timestamp = frame_capture_dpb(frame->frame); +#if HEVC_CTRLS_VERSION <= 2 + entry->rps = find_frame_rps_type(h, entry->timestamp); +#else + entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 : + V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE; +#endif + entry->field_pic = frame->frame->interlaced_frame; + + /* TODO: Interleaved: Get the POC for each field. 
*/ + entry->pic_order_cnt[0] = frame->poc; + entry->pic_order_cnt[1] = frame->poc; + } + } + return n; +} + +static void fill_slice_params(const HEVCContext * const h, +#if HEVC_CTRLS_VERSION >= 2 + const struct v4l2_ctrl_hevc_decode_params * const dec, +#endif + struct v4l2_ctrl_hevc_slice_params *slice_params, + uint32_t bit_size, uint32_t bit_offset) +{ + const SliceHeader * const sh = &h->sh; +#if HEVC_CTRLS_VERSION >= 2 + const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb; + const unsigned int dpb_n = dec->num_active_dpb_entries; +#else + struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb; + unsigned int dpb_n; +#endif + unsigned int i; + RefPicList *rpl; + + *slice_params = (struct v4l2_ctrl_hevc_slice_params) { + .bit_size = bit_size, + .data_bit_offset = bit_offset, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + .slice_segment_addr = sh->slice_segment_addr, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + .nal_unit_type = h->nal_unit_type, + .nuh_temporal_id_plus1 = h->temporal_id + 1, + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + .slice_type = sh->slice_type, + .colour_plane_id = sh->colour_plane_id, + .slice_pic_order_cnt = h->ref->poc, + .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, + .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, + .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, + .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, + .slice_qp_delta = sh->slice_qp_delta, + .slice_cb_qp_offset = sh->slice_cb_qp_offset, + .slice_cr_qp_offset = sh->slice_cr_qp_offset, + .slice_act_y_qp_offset = 0, + .slice_act_cb_qp_offset = 0, + .slice_act_cr_qp_offset = 0, + .slice_beta_offset_div2 = sh->beta_offset / 2, + .slice_tc_offset_div2 = sh->tc_offset / 2, + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture timing SEI message */ + .pic_struct = h->sei.picture_timing.picture_struct, + +#if HEVC_CTRLS_VERSION < 2 + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, + .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, + .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, +#endif + }; + + if (sh->slice_sample_adaptive_offset_flag[0]) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; + + if (sh->slice_sample_adaptive_offset_flag[1]) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; + + if (sh->slice_temporal_mvp_enabled_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; + + if (sh->mvd_l1_zero_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; + + if (sh->cabac_init_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; + + if (sh->collocated_list == L0) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; + + if (sh->disable_deblocking_filter_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; + + if (sh->slice_loop_filter_across_slices_enabled_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; + + if (sh->dependent_slice_segment_flag) + slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; + +#if HEVC_CTRLS_VERSION < 2 + dpb_n = fill_dpb_entries(h, dpb); + slice_params->num_active_dpb_entries = dpb_n; +#endif + + if (sh->slice_type != HEVC_SLICE_I) { + rpl = &h->ref->refPicList[0]; + for (i = 0; i < rpl->nb_refs; i++) + slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); + } + + if (sh->slice_type == HEVC_SLICE_B) { + rpl = &h->ref->refPicList[1]; + for (i = 0; i < rpl->nb_refs; i++) + slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n); + } + + fill_pred_table(h, 
&slice_params->pred_weight_table); + + slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; + if (slice_params->num_entry_point_offsets > 256) { + slice_params->num_entry_point_offsets = 256; + av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); + } + + for (i = 0; i < slice_params->num_entry_point_offsets; i++) + slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; +} + +#if HEVC_CTRLS_VERSION >= 2 +static void +fill_decode_params(const HEVCContext * const h, + struct v4l2_ctrl_hevc_decode_params * const dec) +{ + unsigned int i; + + *dec = (struct v4l2_ctrl_hevc_decode_params){ + .pic_order_cnt_val = h->poc, + .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, + .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, + .num_poc_lt_curr = h->rps[LT_CURR].nb_refs, + }; + + dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb); + + // The docn does seem to ask that we fit our 32 bit signed POC into + // a U8 so... (To be fair 16 bits would be enough) + // Luckily we (Pi) don't use these fields + for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i) + dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc; + for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i) + dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc; + for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i) + dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc; + + if (IS_IRAP(h)) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC; + if (IS_IDR(h)) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC; + if (h->sh.no_output_of_prior_pics_flag) + dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR; + +} +#endif + +static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps) +{ + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ + *ctrl = (struct v4l2_ctrl_hevc_sps) { + .chroma_format_idc = sps->chroma_format_idc, + .pic_width_in_luma_samples = sps->width, + .pic_height_in_luma_samples = sps->height, + .bit_depth_luma_minus8 = sps->bit_depth - 8, + .bit_depth_chroma_minus8 = sps->bit_depth - 8, + .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, + .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, + .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, + .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, + .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, + .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, + .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, + .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, + .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, + .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, + .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, + .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, + .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, + .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, + .num_short_term_ref_pic_sets = sps->nb_st_rps, + .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, + .chroma_format_idc = sps->chroma_format_idc, + .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1, + }; + + if (sps->separate_colour_plane_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; + + if (sps->scaling_list_enable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; + + if (sps->amp_enabled_flag) + ctrl->flags |= 
V4L2_HEVC_SPS_FLAG_AMP_ENABLED; + + if (sps->sao_enabled) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; + + if (sps->pcm_enabled_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; + + if (sps->pcm.loop_filter_disable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; + + if (sps->long_term_ref_pics_present_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; + + if (sps->sps_temporal_mvp_enabled_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; + + if (sps->sps_strong_intra_smoothing_enable_flag) + ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; +} + +static void fill_scaling_matrix(const ScalingList * const sl, + struct v4l2_ctrl_hevc_scaling_matrix * const sm) +{ + unsigned int i; + + for (i = 0; i < 6; i++) { + unsigned int j; + + for (j = 0; j < 16; j++) + sm->scaling_list_4x4[i][j] = sl->sl[0][i][j]; + for (j = 0; j < 64; j++) { + sm->scaling_list_8x8[i][j] = sl->sl[1][i][j]; + sm->scaling_list_16x16[i][j] = sl->sl[2][i][j]; + if (i < 2) + sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; + } + sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; + if (i < 2) + sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; + } +} + +static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps) +{ + uint64_t flags = 0; + + if (pps->dependent_slice_segments_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED; + + if (pps->output_flag_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; + + if (pps->sign_data_hiding_flag) + flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; + + if (pps->cabac_init_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; + + if (pps->constrained_intra_pred_flag) + flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; + + if (pps->transform_skip_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; + + if (pps->cu_qp_delta_enabled_flag) + flags |= 
V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; + + if (pps->pic_slice_level_chroma_qp_offsets_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; + + if (pps->weighted_pred_flag) + flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; + + if (pps->weighted_bipred_flag) + flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; + + if (pps->transquant_bypass_enable_flag) + flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; + + if (pps->tiles_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; + + if (pps->entropy_coding_sync_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; + + if (pps->loop_filter_across_tiles_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; + + if (pps->seq_loop_filter_across_slices_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; + + if (pps->deblocking_filter_override_enabled_flag) + flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; + + if (pps->disable_dbf) + flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; + + if (pps->lists_modification_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; + + if (pps->slice_header_extension_present_flag) + flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + *ctrl = (struct v4l2_ctrl_hevc_pps) { + .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, + .init_qp_minus26 = pps->pic_init_qp_minus26, + .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, + .pps_cb_qp_offset = pps->cb_qp_offset, + .pps_cr_qp_offset = pps->cr_qp_offset, + .pps_beta_offset_div2 = pps->beta_offset / 2, + .pps_tc_offset_div2 = pps->tc_offset / 2, + .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, + .flags = flags + }; + + + if (pps->tiles_enabled_flag) { + ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1; + ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1; + + for (int i = 0; i < pps->num_tile_columns; i++) + ctrl->column_width_minus1[i] = pps->column_width[i] - 1; + + for (int i = 0; i < pps->num_tile_rows; i++) + ctrl->row_height_minus1[i] = pps->row_height[i] - 1; + } +} + +// Called before finally returning the frame to the user +// Set corrupt flag here as this is actually the frame structure that +// is going to the user (in MT land each thread has its own pool) +static int frame_post_process(void *logctx, AVFrame *frame) +{ + V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; + +// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); + frame->flags &= ~AV_FRAME_FLAG_CORRUPT; + if (rd->qe_dst) { + MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__); + frame->flags |= AV_FRAME_FLAG_CORRUPT; + } + } + + return 0; +} + +static inline struct timeval cvt_dpb_to_tv(uint64_t t) +{ + t /= 1000; + return (struct timeval){ + .tv_usec = t % 1000000, + .tv_sec = t / 1000000 + }; +} + +static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t) +{ + return (uint64_t)t * 1000; +} + +static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + const HEVCContext *h = avctx->priv_data; + 
V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + +// av_log(NULL, AV_LOG_INFO, "%s\n", __func__); + decode_q_add(&ctx->decode_q, &rd->decode_ent); + + rd->num_slices = 0; + ctx->timestamp++; + rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp); + + { + FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data; + fdd->post_process = frame_post_process; + } + + // qe_dst needs to be bound to the data buffer and only returned when that is + if (!rd->qe_dst) + { + if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); + return AVERROR(ENOMEM); + } + } + + ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame + + return 0; +} + +// Object fd & size will be zapped by this & need setting later +static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format) +{ + AVDRMLayerDescriptor *layer = &desc->layers[0]; + unsigned int width; + unsigned int height; + unsigned int bpl; + uint32_t pixelformat; + + if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { + width = format->fmt.pix_mp.width; + height = format->fmt.pix_mp.height; + pixelformat = format->fmt.pix_mp.pixelformat; + bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline; + } + else { + width = format->fmt.pix.width; + height = format->fmt.pix.height; + pixelformat = format->fmt.pix.pixelformat; + bpl = format->fmt.pix.bytesperline; + } + + switch (pixelformat) { + case V4L2_PIX_FMT_NV12: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#if CONFIG_SAND + case V4L2_PIX_FMT_NV12_COL128: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); + break; + case V4L2_PIX_FMT_NV12_10_COL128: + layer->format = 
DRM_FORMAT_P030; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); + break; +#endif +#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED + case V4L2_PIX_FMT_SUNXI_TILED_NV12: + layer->format = DRM_FORMAT_NV12; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; + break; +#endif +#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) + case V4L2_PIX_FMT_NV15: + layer->format = DRM_FORMAT_NV15; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#endif + case V4L2_PIX_FMT_NV16: + layer->format = DRM_FORMAT_NV16; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) + case V4L2_PIX_FMT_NV20: + layer->format = DRM_FORMAT_NV20; + desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + break; +#endif + default: + return -1; + } + + desc->nb_objects = 1; + desc->objects[0].fd = -1; + desc->objects[0].size = 0; + + desc->nb_layers = 1; + layer->nb_planes = 2; + + layer->planes[0].object_index = 0; + layer->planes[0].offset = 0; + layer->planes[0].pitch = bpl; +#if CONFIG_SAND + if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; + layer->planes[0].pitch = width; + layer->planes[1].pitch = width; + } + else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { + layer->planes[1].object_index = 0; + layer->planes[1].offset = height * 128; + layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy + layer->planes[1].pitch = width * 2; + } + else +#endif + { + layer->planes[1].object_index = 0; + layer->planes[1].offset = layer->planes[0].pitch * height; + layer->planes[1].pitch = layer->planes[0].pitch; + } + + return 0; +} + +static int +set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq, + struct req_controls *const controls, +#if HEVC_CTRLS_VERSION >= 2 + struct v4l2_ctrl_hevc_decode_params * const dec, +#endif + struct 
v4l2_ctrl_hevc_slice_params * const slices, + const unsigned int slice_no, + const unsigned int slice_count) +{ + int rv; + + struct v4l2_ext_control control[] = { + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, + .ptr = &controls->sps, + .size = sizeof(controls->sps), + }, + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, +#if HEVC_CTRLS_VERSION >= 2 + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS, + .ptr = dec, + .size = sizeof(*dec), + }, +#endif + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, + .ptr = slices + slice_no, + .size = sizeof(*slices) * slice_count, + }, + // Optional + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, + .ptr = &controls->scaling_matrix, + .size = sizeof(controls->scaling_matrix), + }, + }; + + rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, + controls->has_scaling ? + FF_ARRAY_ELEMS(control) : + FF_ARRAY_ELEMS(control) - 1); + + return rv; +} + +static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + const HEVCContext * const h = avctx->priv_data; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; + int bcount = get_bits_count(&h->HEVClc->gb); + uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; + + int rv; + struct slice_info * si; + + if ((rv = slice_add(rd)) != 0) + return rv; + + si = rd->slices + rd->num_slices - 1; + si->ptr = buffer; + si->len = size; + + if (ctx->multi_slice && rd->num_slices > 1) { + struct slice_info *const si0 = rd->slices; + const size_t offset = (buffer - si0->ptr); + boff += offset * 8; + size += offset; + si0->len = si->len + offset; + } + +#if HEVC_CTRLS_VERSION >= 2 + if (rd->num_slices == 1) + fill_decode_params(h, &rd->dec); + fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff); +#else + 
fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff); +#endif + + return 0; +} + +static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) +{ + const HEVCContext * const h = avctx->priv_data; + if (h->ref != NULL) { + V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0]; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + media_request_abort(&rd->req); + mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src); + + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + } +} + +static int send_slice(AVCodecContext * const avctx, + V4L2MediaReqDescriptor * const rd, + struct req_controls *const controls, + const unsigned int i, const unsigned int j) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + struct slice_info *const si = rd->slices + i; + struct media_request * req = NULL; + struct qent_src * src = NULL; + MediaBufsStatus stat; + + if ((req = media_request_get(ctx->mpool)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__); + return AVERROR(ENOMEM); + } + + if (set_req_ctls(ctx, req, + controls, +#if HEVC_CTRLS_VERSION >= 2 + &rd->dec, +#endif + rd->slice_params, + i, j - i)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__); + goto fail1; + } + + if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__); + goto fail1; + } + + if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__); + goto fail2; + } + + if (qent_src_params_set(src, &controls->tv)) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__); + goto fail2; + } + +#warning ANNEX_B start code +// if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { +// } + + stat = mediabufs_start_request(ctx->mbufs, &req, &src, + i == 0 ? 
rd->qe_dst : NULL, + j == rd->num_slices); + + if (stat != MEDIABUFS_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__); + return AVERROR_UNKNOWN; + } + return 0; + +fail2: + mediabufs_src_qent_abort(ctx->mbufs, &src); +fail1: + media_request_abort(&req); + return AVERROR_UNKNOWN; +} + +static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) +{ + const HEVCContext * const h = avctx->priv_data; + V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0]; + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + struct req_controls rc; + unsigned int i; + int rv; + + // It is possible, though maybe a bug, to get an end_frame without + // a previous start_frame. If we do then give up. + if (!decode_q_in_q(&rd->decode_ent)) { + av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__); + return AVERROR_INVALIDDATA; + } + + { + const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? + &h->ps.pps->scaling_list : + h->ps.sps->scaling_list_enable_flag ? + &h->ps.sps->scaling_list : NULL; + + + memset(&rc, 0, sizeof(rc)); + rc.tv = cvt_dpb_to_tv(rd->timestamp); + fill_sps(&rc.sps, h->ps.sps); + fill_pps(&rc.pps, h->ps.pps); + if (sl) { + rc.has_scaling = 1; + fill_scaling_matrix(sl, &rc.scaling_matrix); + } + } + + decode_q_wait(&ctx->decode_q, &rd->decode_ent); + + // qe_dst needs to be bound to the data buffer and only returned when that is + // Alloc almost certainly wants to be serialised if there is any chance of blocking + // so we get the next frame to be free in the thread that needs it for decode first. 
+ // + // In our current world this probably isn't a concern but put it here anyway + if (!rd->qe_dst) + { + if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__); + rv = AVERROR(ENOMEM); + goto fail; + } + } + + // Send as slices + if (ctx->multi_slice) + { + if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0) + goto fail; + } + else + { + for (i = 0; i != rd->num_slices; ++i) { + if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0) + goto fail; + } + } + + // Set the drm_prime desriptor + drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); + rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0)); + rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0)); + + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + return 0; + +fail: + decode_q_remove(&ctx->decode_q, &rd->decode_ent); + return rv; +} + +// Initial check & init +static int +probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +{ + const HEVCContext *h = avctx->priv_data; + const HEVCSPS * const sps = h->ps.sps; + struct v4l2_ctrl_hevc_sps ctrl_sps; + unsigned int i; + + // Check for var slice array + struct v4l2_query_ext_ctrl qc[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX }, +#if HEVC_CTRLS_VERSION >= 2 + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS }, +#endif + }; + // Order & size must match! 
+ static const size_t ctrl_sizes[] = { + sizeof(struct v4l2_ctrl_hevc_slice_params), + sizeof(struct v4l2_ctrl_hevc_sps), + sizeof(struct v4l2_ctrl_hevc_pps), + sizeof(struct v4l2_ctrl_hevc_scaling_matrix), +#if HEVC_CTRLS_VERSION >= 2 + sizeof(struct v4l2_ctrl_hevc_decode_params), +#endif + }; + const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc); + +#if HEVC_CTRLS_VERSION == 2 + if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); +#elif HEVC_CTRLS_VERSION == 3 + if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) + return AVERROR(EINVAL); +#endif + + if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION); + return AVERROR(EINVAL); + } + for (i = 0; i != noof_ctrls; ++i) { + if (ctrl_sizes[i] != (size_t)qc[i].elem_size) { + av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n", + HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size); + return AVERROR(EINVAL); + } + } + + fill_sps(&ctrl_sps, sps); + + if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { + av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n"); + return AVERROR(EINVAL); + } + + ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0; + return 0; +} + +// Final init +static int +set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx) +{ + int ret; + + struct v4l2_query_ext_ctrl querys[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, }, + }; + + struct v4l2_ext_control ctrls[] = { + { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, + { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, + }; + + mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys)); + + ctx->decode_mode = 
querys[0].default_value; + + if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && + ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); + return AVERROR(EINVAL); + } + + ctx->start_code = querys[1].default_value; + if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && + ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); + return AVERROR(EINVAL); + } + + ctx->max_slices = querys[2].elems; + if (ctx->max_slices > MAX_SLICES) { + av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); + return AVERROR(EINVAL); + } + + ctrls[0].value = ctx->decode_mode; + ctrls[1].value = ctx->start_code; + + ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); + return !ret ? 0 : AVERROR(-ret); +} + +static void v4l2_req_frame_free(void *opaque, uint8_t *data) +{ + AVCodecContext *avctx = opaque; + V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data; + + av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data); + + qent_dst_unref(&rd->qe_dst); + + // We don't expect req or qe_src to be set + if (rd->req || rd->qe_src) + av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src); + + av_freep(&rd->slices); + av_freep(&rd->slice_params); + + av_free(rd); +} + +static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size) +{ + AVCodecContext *avctx = opaque; +// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; +// V4L2MediaReqDescriptor *req; + AVBufferRef *ref; + uint8_t *data; +// int ret; + + data = av_mallocz(size); + if (!data) + return NULL; + + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); + ref = av_buffer_create(data, size, 
v4l2_req_frame_free, avctx, 0); + if (!ref) { + av_freep(&data); + return NULL; + } + return ref; +} + +#if 0 +static void v4l2_req_pool_free(void *opaque) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); +} + +static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc) +{ + av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); + + av_buffer_pool_uninit(&hwfc->pool); +} +#endif + +static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data; + const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs); + + hwfc->format = AV_PIX_FMT_DRM_PRIME; + hwfc->sw_format = pixel_format_from_format(vfmt); + if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { + hwfc->width = vfmt->fmt.pix_mp.width; + hwfc->height = vfmt->fmt.pix_mp.height; + } else { + hwfc->width = vfmt->fmt.pix.width; + hwfc->height = vfmt->fmt.pix.height; + } +#if 0 + hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free); + if (!hwfc->pool) + return AVERROR(ENOMEM); + + hwfc->free = v4l2_req_hwframe_ctx_free; + + hwfc->initial_pool_size = 1; + + switch (avctx->codec_id) { + case AV_CODEC_ID_VP9: + hwfc->initial_pool_size += 8; + break; + case AV_CODEC_ID_VP8: + hwfc->initial_pool_size += 3; + break; + default: + hwfc->initial_pool_size += 2; + } +#endif + av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); + + return 0; +} + +static int alloc_frame(AVCodecContext * avctx, AVFrame *frame) +{ + int rv; + + frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); + if (!frame->buf[0]) + return AVERROR(ENOMEM); + + frame->data[0] = frame->buf[0]->data; + + 
frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); + + if ((rv = ff_attach_decode_data(frame)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n"); + av_frame_unref(frame); + return rv; + } + + return 0; +} + +const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { + .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE, + .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), + .probe = probe, + .set_controls = set_controls, + + .start_frame = v4l2_request_hevc_start_frame, + .decode_slice = v4l2_request_hevc_decode_slice, + .end_frame = v4l2_request_hevc_end_frame, + .abort_frame = v4l2_request_hevc_abort_frame, + .frame_params = frame_params, + .alloc_frame = alloc_frame, +}; + diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c new file mode 100644 index 0000000000..980b306b8a --- /dev/null +++ b/libavcodec/v4l2_req_media.c @@ -0,0 +1,1601 @@ +/* + * Copyright (C) 2018 Paul Kocialkowski + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_media.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_utils.h" +#include "weak_link.h" + + +/* floor(log2(x)) */ +static unsigned int log2_size(size_t x) +{ + unsigned int n = 0; + + if (x & ~0xffff) { + n += 16; + x >>= 16; + } + if (x & ~0xff) { + n += 8; + x >>= 8; + } + if (x & ~0xf) { + n += 4; + x >>= 4; + } + if (x & ~3) { + n += 2; + x >>= 2; + } + return (x & ~1) ? n + 1 : n; +} + +static size_t round_up_size(const size_t x) +{ + /* Admit no size < 256 */ + const unsigned int n = x < 256 ? 8 : log2_size(x) - 1; + + return x >= (3 << n) ? 
4 << n : (3 << n); +} + +struct media_request; + +struct media_pool { + int fd; + sem_t sem; + pthread_mutex_t lock; + struct media_request * free_reqs; + struct pollqueue * pq; +}; + +struct media_request { + struct media_request * next; + struct media_pool * mp; + int fd; + struct polltask * pt; +}; + + +static inline int do_trywait(sem_t *const sem) +{ + while (sem_trywait(sem)) { + if (errno != EINTR) + return -errno; + } + return 0; +} + +static inline int do_wait(sem_t *const sem) +{ + while (sem_wait(sem)) { + if (errno != EINTR) + return -errno; + } + return 0; +} + +static int request_buffers(int video_fd, unsigned int type, + enum v4l2_memory memory, unsigned int buffers_count) +{ + struct v4l2_requestbuffers buffers; + int rc; + + memset(&buffers, 0, sizeof(buffers)); + buffers.type = type; + buffers.memory = memory; + buffers.count = buffers_count; + + rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers); + if (rc < 0) { + rc = -errno; + request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc)); + return rc; + } + + return 0; +} + + +static int set_stream(int video_fd, unsigned int type, bool enable) +{ + enum v4l2_buf_type buf_type = type; + int rc; + + rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, + &buf_type); + if (rc < 0) { + rc = -errno; + request_log("Unable to %sable stream: %s\n", + enable ? 
"en" : "dis", strerror(-rc)); + return rc; + } + + return 0; +} + + + +struct media_request * media_request_get(struct media_pool * const mp) +{ + struct media_request *req = NULL; + + /* Timeout handled by poll code */ + if (do_wait(&mp->sem)) + return NULL; + + pthread_mutex_lock(&mp->lock); + req = mp->free_reqs; + if (req) { + mp->free_reqs = req->next; + req->next = NULL; + } + pthread_mutex_unlock(&mp->lock); + return req; +} + +int media_request_fd(const struct media_request * const req) +{ + return req->fd; +} + +int media_request_start(struct media_request * const req) +{ + while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1) + { + const int err = errno; + if (err == EINTR) + continue; + request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err)); + return -err; + } + + pollqueue_add_task(req->pt, 2000); + return 0; +} + +static void media_request_done(void *v, short revents) +{ + struct media_request *const req = v; + struct media_pool *const mp = req->mp; + + /* ** Not sure what to do about timeout */ + + if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0) + request_log("Unable to reinit media request: %s\n", + strerror(errno)); + + pthread_mutex_lock(&mp->lock); + req->next = mp->free_reqs; + mp->free_reqs = req; + pthread_mutex_unlock(&mp->lock); + sem_post(&mp->sem); +} + +int media_request_abort(struct media_request ** const preq) +{ + struct media_request * const req = *preq; + + if (req == NULL) + return 0; + *preq = NULL; + + media_request_done(req, 0); + return 0; +} + +static void delete_req_chain(struct media_request * const chain) +{ + struct media_request * next = chain; + while (next) { + struct media_request * const req = next; + next = req->next; + if (req->pt) + polltask_delete(&req->pt); + if (req->fd != -1) + close(req->fd); + free(req); + } +} + +struct media_pool * media_pool_new(const char * const media_path, + struct pollqueue * const pq, + const unsigned int n) +{ + struct media_pool * const mp = 
calloc(1, sizeof(*mp)); + unsigned int i; + + if (!mp) + goto fail0; + + mp->pq = pq; + pthread_mutex_init(&mp->lock, NULL); + mp->fd = open(media_path, O_RDWR | O_NONBLOCK); + if (mp->fd == -1) { + request_log("Failed to open '%s': %s\n", media_path, strerror(errno)); + goto fail1; + } + + for (i = 0; i != n; ++i) { + struct media_request * req = malloc(sizeof(*req)); + if (!req) + goto fail4; + + *req = (struct media_request){ + .next = mp->free_reqs, + .mp = mp, + .fd = -1 + }; + mp->free_reqs = req; + + if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) { + request_log("Failed to alloc request %d: %s\n", i, strerror(errno)); + goto fail4; + } + + req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req); + if (!req->pt) + goto fail4; + } + + sem_init(&mp->sem, 0, n); + + return mp; + +fail4: + delete_req_chain(mp->free_reqs); + close(mp->fd); + pthread_mutex_destroy(&mp->lock); +fail1: + free(mp); +fail0: + return NULL; +} + +void media_pool_delete(struct media_pool ** pMp) +{ + struct media_pool * const mp = *pMp; + + if (!mp) + return; + *pMp = NULL; + + delete_req_chain(mp->free_reqs); + close(mp->fd); + sem_destroy(&mp->sem); + pthread_mutex_destroy(&mp->lock); + free(mp); +} + + +#define INDEX_UNSET (~(uint32_t)0) + +enum qent_status { + QENT_NEW = 0, // Initial state - shouldn't last + QENT_FREE, // On free chain + QENT_PENDING, // User has ent + QENT_WAITING, // On inuse + QENT_DONE, // Frame rx + QENT_ERROR, // Error + QENT_IMPORT +}; + +struct qent_base { + atomic_int ref_count; + struct qent_base *next; + struct qent_base *prev; + enum qent_status status; + uint32_t index; + struct dmabuf_h *dh[VIDEO_MAX_PLANES]; + struct timeval timestamp; +}; + +struct qent_src { + struct qent_base base; + int fixed_size; +}; + +struct qent_dst { + struct qent_base base; + bool waiting; + pthread_mutex_t lock; + pthread_cond_t cond; + struct ff_weak_link_client * mbc_wl; +}; + +struct qe_list_head { + struct qent_base *head; + struct qent_base 
*tail; +}; + +struct buf_pool { + pthread_mutex_t lock; + sem_t free_sem; + enum v4l2_buf_type buf_type; + struct qe_list_head free; + struct qe_list_head inuse; +}; + + +static inline struct qent_dst *base_to_dst(struct qent_base *be) +{ + return (struct qent_dst *)be; +} + +static inline struct qent_src *base_to_src(struct qent_base *be) +{ + return (struct qent_src *)be; +} + + +#define QENT_BASE_INITIALIZER {\ + .ref_count = ATOMIC_VAR_INIT(0),\ + .status = QENT_NEW,\ + .index = INDEX_UNSET\ +} + +static void qe_base_uninit(struct qent_base *const be) +{ + unsigned int i; + for (i = 0; i != VIDEO_MAX_PLANES; ++i) { + dmabuf_free(be->dh[i]); + be->dh[i] = NULL; + } +} + +static void qe_src_free(struct qent_src *const be_src) +{ + if (!be_src) + return; + qe_base_uninit(&be_src->base); + free(be_src); +} + +static struct qent_src * qe_src_new(void) +{ + struct qent_src *const be_src = malloc(sizeof(*be_src)); + if (!be_src) + return NULL; + *be_src = (struct qent_src){ + .base = QENT_BASE_INITIALIZER + }; + return be_src; +} + +static void qe_dst_free(struct qent_dst *const be_dst) +{ + if (!be_dst) + return; + + ff_weak_link_unref(&be_dst->mbc_wl); + pthread_cond_destroy(&be_dst->cond); + pthread_mutex_destroy(&be_dst->lock); + qe_base_uninit(&be_dst->base); + free(be_dst); +} + +static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl) +{ + struct qent_dst *const be_dst = malloc(sizeof(*be_dst)); + if (!be_dst) + return NULL; + *be_dst = (struct qent_dst){ + .base = QENT_BASE_INITIALIZER, + .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .mbc_wl = ff_weak_link_ref(wl) + }; + return be_dst; +} + +static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be) +{ + if (ql->tail) + ql->tail->next = be; + else + ql->head = be; + be->prev = ql->tail; + be->next = NULL; + ql->tail = be; +} + +static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be) +{ + if (!be) + return NULL; 
+ + if (be->next) + be->next->prev = be->prev; + else + ql->tail = be->prev; + if (be->prev) + be->prev->next = be->next; + else + ql->head = be->next; + be->next = NULL; + be->prev = NULL; + return be; +} + + +static void bq_put_free(struct buf_pool *const bp, struct qent_base * be) +{ + ql_add_tail(&bp->free, be); +} + +static struct qent_base * bq_get_free(struct buf_pool *const bp) +{ + return ql_extract(&bp->free, bp->free.head); +} + +static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be) +{ + return ql_extract(&bp->inuse, be); +} + +static struct qent_base * bq_get_inuse(struct buf_pool *const bp) +{ + return ql_extract(&bp->inuse, bp->inuse.head); +} + +static void bq_free_all_free_src(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_free(bp)) != NULL) + qe_src_free(base_to_src(be)); +} + +static void bq_free_all_inuse_src(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_inuse(bp)) != NULL) + qe_src_free(base_to_src(be)); +} + +static void bq_free_all_free_dst(struct buf_pool *const bp) +{ + struct qent_base *be; + while ((be = bq_get_free(bp)) != NULL) + qe_dst_free(base_to_dst(be)); +} + +static void queue_put_free(struct buf_pool *const bp, struct qent_base *be) +{ + unsigned int i; + + pthread_mutex_lock(&bp->lock); + /* Clear out state vars */ + be->timestamp.tv_sec = 0; + be->timestamp.tv_usec = 0; + be->status = QENT_FREE; + for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) + dmabuf_len_set(be->dh[i], 0); + bq_put_free(bp, be); + pthread_mutex_unlock(&bp->lock); + sem_post(&bp->free_sem); +} + +static bool queue_is_inuse(const struct buf_pool *const bp) +{ + return bp->inuse.tail != NULL; +} + +static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be) +{ + if (!be) + return; + pthread_mutex_lock(&bp->lock); + ql_add_tail(&bp->inuse, be); + be->status = QENT_WAITING; + pthread_mutex_unlock(&bp->lock); +} + +static struct qent_base 
*queue_get_free(struct buf_pool *const bp) +{ + struct qent_base *buf; + + if (do_wait(&bp->free_sem)) + return NULL; + pthread_mutex_lock(&bp->lock); + buf = bq_get_free(bp); + pthread_mutex_unlock(&bp->lock); + return buf; +} + +static struct qent_base *queue_tryget_free(struct buf_pool *const bp) +{ + struct qent_base *buf; + + if (do_trywait(&bp->free_sem)) + return NULL; + pthread_mutex_lock(&bp->lock); + buf = bq_get_free(bp); + pthread_mutex_unlock(&bp->lock); + return buf; +} + +static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd) +{ + struct qent_base *be; + + pthread_mutex_lock(&bp->lock); + /* Expect 1st in Q, but allow anywhere */ + for (be = bp->inuse.head; be; be = be->next) { + if (dmabuf_fd(be->dh[0]) == fd) { + bq_extract_inuse(bp, be); + break; + } + } + pthread_mutex_unlock(&bp->lock); + + return be; +} + +static void queue_delete(struct buf_pool *const bp) +{ + sem_destroy(&bp->free_sem); + pthread_mutex_destroy(&bp->lock); + free(bp); +} + +static struct buf_pool* queue_new(const int vfd) +{ + struct buf_pool *bp = calloc(1, sizeof(*bp)); + if (!bp) + return NULL; + pthread_mutex_init(&bp->lock, NULL); + sem_init(&bp->free_sem, 0, 0); + return bp; +} + + +struct mediabufs_ctl { + atomic_int ref_count; /* 0 is single ref for easier atomics */ + void * dc; + int vfd; + bool stream_on; + bool polling; + bool dst_fixed; // Dst Q is fixed size + pthread_mutex_t lock; + struct buf_pool * src; + struct buf_pool * dst; + struct polltask * pt; + struct pollqueue * pq; + struct ff_weak_link_master * this_wlm; + + struct v4l2_format src_fmt; + struct v4l2_format dst_fmt; + struct v4l2_capability capability; +}; + +static int qe_v4l2_queue(struct qent_base *const be, + const int vfd, struct media_request *const mreq, + const struct v4l2_format *const fmt, + const bool is_dst, const bool hold_flag) +{ + struct v4l2_buffer buffer = { + .type = fmt->type, + .memory = V4L2_MEMORY_DMABUF, + .index = be->index + }; + struct 
v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + unsigned int i; + for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (is_dst) + dmabuf_len_set(be->dh[i], 0); + + /* *** Really need a pixdesc rather than a format so we can fill in data_offset */ + planes[i].length = dmabuf_size(be->dh[i]); + planes[i].bytesused = dmabuf_len(be->dh[i]); + planes[i].m.fd = dmabuf_fd(be->dh[i]); + } + buffer.m.planes = planes; + buffer.length = i; + } + else { + if (is_dst) + dmabuf_len_set(be->dh[0], 0); + + buffer.bytesused = dmabuf_len(be->dh[0]); + buffer.length = dmabuf_size(be->dh[0]); + buffer.m.fd = dmabuf_fd(be->dh[0]); + } + + if (!is_dst && mreq) { + buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD; + buffer.request_fd = media_request_fd(mreq); + if (hold_flag) + buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF; + } + + if (is_dst) + be->timestamp = (struct timeval){0,0}; + + buffer.timestamp = be->timestamp; + + while (ioctl(vfd, VIDIOC_QBUF, &buffer)) { + const int err = errno; + if (err != EINTR) { + request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err)); + return -err; + } + } + return 0; +} + +static struct qent_base * qe_dequeue(struct buf_pool *const bp, + const int vfd, + const struct v4l2_format * const f) +{ + int fd; + struct qent_base *be; + int rc; + const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type); + struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}}; + struct v4l2_buffer buffer = { + .type = f->type, + .memory = V4L2_MEMORY_DMABUF + }; + if (mp) { + buffer.length = f->fmt.pix_mp.num_planes; + buffer.m.planes = planes; + } + + while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 && + errno == EINTR) + /* Loop */; + if (rc) { + request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno)); + return NULL; + } + + fd = mp ? 
planes[0].m.fd : buffer.m.fd; + be = queue_find_extract_fd(bp, fd); + if (!be) { + request_log("Failed to find fd %d in Q\n", fd); + return NULL; + } + + be->timestamp = buffer.timestamp; + be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE; + return be; +} + +static void qe_dst_done(struct qent_dst * dst_be) +{ + pthread_mutex_lock(&dst_be->lock); + dst_be->waiting = false; + pthread_cond_broadcast(&dst_be->cond); + pthread_mutex_unlock(&dst_be->lock); + + qent_dst_unref(&dst_be); +} + +static bool qe_dst_waiting(struct qent_dst *const dst_be) +{ + bool waiting; + pthread_mutex_lock(&dst_be->lock); + waiting = dst_be->waiting; + dst_be->waiting = true; + pthread_mutex_unlock(&dst_be->lock); + return waiting; +} + + +static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc) +{ + return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst); +} + +static void mediabufs_poll_cb(void * v, short revents) +{ + struct mediabufs_ctl *mbc = v; + struct qent_src *src_be = NULL; + struct qent_dst *dst_be = NULL; + + if (!revents) + request_err(mbc->dc, "%s: Timeout\n", __func__); + + pthread_mutex_lock(&mbc->lock); + mbc->polling = false; + + if ((revents & POLLOUT) != 0) + src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt)); + if ((revents & POLLIN) != 0) + dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt)); + + /* Reschedule */ + if (mediabufs_wants_poll(mbc)) { + mbc->polling = true; + pollqueue_add_task(mbc->pt, 2000); + } + pthread_mutex_unlock(&mbc->lock); + + if (src_be) + queue_put_free(mbc->src, &src_be->base); + if (dst_be) + qe_dst_done(dst_be); +} + +int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp) +{ + struct qent_base *const be = &be_src->base; + + be->timestamp = *timestamp; + return 0; +} + +struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst) +{ + return be_dst->base.timestamp; +} + +static int qent_base_realloc(struct qent_base 
*const be, const size_t len, struct dmabufs_ctl * dbsc)
+{
+    /* Only (re)allocate when plane 0 is missing or smaller than len */
+    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
+        size_t newsize = round_up_size(len);
+        request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
+        if (!dbsc) {
+            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
+            return -ENOMEM;
+        }
+        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
+            request_log("%s: Realloc %zd failed\n", __func__, newsize);
+            return -ENOMEM;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Make sure plane 0 of the src qent can hold at least len bytes.
+ * Returns 0 on success, -ENOMEM if a (re)allocation was needed and failed
+ * or no dmabufs_ctl was supplied.
+ */
+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+    struct qent_base *const be = &be_src->base;
+    return qent_base_realloc(be, len, dbsc);
+}
+
+
+/*
+ * Copy len bytes from src into plane 0 of the src qent, starting at byte
+ * 'offset' within the buffer.
+ *
+ * The buffer is grown first if it is too small, but only when offset is 0
+ * and the qent is not fixed_size (a realloc does not preserve old contents).
+ * The copy is bracketed by dmabuf_write_start()/dmabuf_write_end(); the end
+ * call is now also made when mapping fails so the pair stays balanced.
+ *
+ * NOTE(review): the stored data length is set to len, not offset + len;
+ * kept as in the original - confirm against callers.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer could not be sized,
+ * -1 if the buffer could not be mapped.
+ */
+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
+{
+    void * dst;
+    struct qent_base *const be = &be_src->base;
+    int rv;
+
+    // Realloc doesn't copy so don't alloc if offset != 0
+    if ((rv = qent_base_realloc(be, offset + len,
+                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
+        return rv;
+
+    dmabuf_write_start(be->dh[0]);
+    dst = dmabuf_map(be->dh[0]);
+    if (!dst) {
+        /* Close the write bracket opened above before bailing out */
+        dmabuf_write_end(be->dh[0]);
+        return -1;
+    }
+    memcpy((char*)dst + offset, src, len);
+    dmabuf_len_set(be->dh[0], len);
+    dmabuf_write_end(be->dh[0]);
+    return 0;
+}
+
+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
+{
+    const struct qent_base *const be = &be_dst->base;
+
+    return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? 
NULL : be->dh[plane]; +} + +int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane) +{ + return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane))); +} + +MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, + struct media_request **const pmreq, + struct qent_src **const psrc_be, + struct qent_dst *const dst_be, + const bool is_final) +{ + struct media_request * mreq = *pmreq; + struct qent_src *const src_be = *psrc_be; + + // Req & src are always both "consumed" + *pmreq = NULL; + *psrc_be = NULL; + + pthread_mutex_lock(&mbc->lock); + + if (!src_be) + goto fail1; + + if (dst_be) { + if (qe_dst_waiting(dst_be)) { + request_info(mbc->dc, "Request buffer already waiting on start\n"); + goto fail1; + } + dst_be->base.timestamp = (struct timeval){0,0}; + if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false)) + goto fail1; + + qent_dst_ref(dst_be); + queue_put_inuse(mbc->dst, &dst_be->base); + } + + if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final)) + goto fail1; + queue_put_inuse(mbc->src, &src_be->base); + + if (!mbc->polling && mediabufs_wants_poll(mbc)) { + mbc->polling = true; + pollqueue_add_task(mbc->pt, 2000); + } + pthread_mutex_unlock(&mbc->lock); + + if (media_request_start(mreq)) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + return MEDIABUFS_STATUS_SUCCESS; + +fail1: + media_request_abort(&mreq); + if (src_be) + queue_put_free(mbc->src, &src_be->base); + +// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q + if (dst_be) { + dst_be->base.status = QENT_ERROR; + qe_dst_done(dst_be); + } + pthread_mutex_unlock(&mbc->lock); + return MEDIABUFS_ERROR_OPERATION_FAILED; +} + + +static int qe_alloc_from_fmt(struct qent_base *const be, + struct dmabufs_ctl *const dbsc, + const struct v4l2_format *const fmt) +{ + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + unsigned int i; + for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) { + be->dh[i] = 
dmabuf_realloc(dbsc, be->dh[i], + fmt->fmt.pix_mp.plane_fmt[i].sizeimage); + /* On failure tidy up and die */ + if (!be->dh[i]) { + while (i--) { + dmabuf_free(be->dh[i]); + be->dh[i] = NULL; + } + return -1; + } + } + } + else { +// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage); + size_t size = fmt->fmt.pix.sizeimage; + be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size); + if (!be->dh[0]) + return -1; + } + return 0; +} + +static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd, + const enum v4l2_buf_type buftype, + uint32_t pixfmt, + const unsigned int width, const unsigned int height, + const size_t bufsize) +{ + *fmt = (struct v4l2_format){.type = buftype}; + + if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { + fmt->fmt.pix_mp.width = width; + fmt->fmt.pix_mp.height = height; + fmt->fmt.pix_mp.pixelformat = pixfmt; + if (bufsize) { + fmt->fmt.pix_mp.num_planes = 1; + fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize; + } + } + else { + fmt->fmt.pix.width = width; + fmt->fmt.pix.height = height; + fmt->fmt.pix.pixelformat = pixfmt; + fmt->fmt.pix.sizeimage = bufsize; + } + + while (ioctl(fd, VIDIOC_S_FMT, fmt)) + if (errno != EINTR) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + // Treat anything where we don't get at least what we asked for as a fail + if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) { + if (fmt->fmt.pix_mp.width < width || + fmt->fmt.pix_mp.height < height || + fmt->fmt.pix_mp.pixelformat != pixfmt) { + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + } + else { + if (fmt->fmt.pix.width < width || + fmt->fmt.pix.height < height || + fmt->fmt.pix.pixelformat != pixfmt) { + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + } + + return MEDIABUFS_STATUS_SUCCESS; +} + +static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt, + const int fd, + const unsigned int type_v4l2, + const uint32_t flags_must, + const uint32_t flags_not, + const unsigned int width, + const unsigned int height, + mediabufs_dst_fmt_accept_fn *const 
accept_fn, + void *const accept_v) +{ + unsigned int i; + + for (i = 0;; ++i) { + struct v4l2_fmtdesc fmtdesc = { + .index = i, + .type = type_v4l2 + }; + while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) { + if (errno != EINTR) + return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE; + } + if ((fmtdesc.flags & flags_must) != flags_must || + (fmtdesc.flags & flags_not)) + continue; + if (!accept_fn(accept_v, &fmtdesc)) + continue; + + if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat, + width, height, 0) == MEDIABUFS_STATUS_SUCCESS) + return MEDIABUFS_STATUS_SUCCESS; + } + return 0; +} + + +/* Wait for qent done */ + +MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + enum qent_status estat; + + pthread_mutex_lock(&be_dst->lock); + while (be_dst->waiting && + !pthread_cond_wait(&be_dst->cond, &be_dst->lock)) + /* Loop */; + estat = be->status; + pthread_mutex_unlock(&be_dst->lock); + + return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS : + estat == QENT_ERROR ? 
MEDIABUFS_ERROR_DECODING_ERROR : + MEDIABUFS_ERROR_OPERATION_FAILED; +} + +const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no) +{ + struct qent_base *const be = &be_dst->base; + return dmabuf_map(be->dh[buf_no]); +} + +MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + unsigned int i; + for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (dmabuf_read_start(be->dh[i])) { + while (i--) + dmabuf_read_end(be->dh[i]); + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + } + } + return MEDIABUFS_STATUS_SUCCESS; +} + +MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst) +{ + struct qent_base *const be = &be_dst->base; + unsigned int i; + MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; + + for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) { + if (dmabuf_read_end(be->dh[i])) + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + return status; +} + +struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst) +{ + if (be_dst) + atomic_fetch_add(&be_dst->base.ref_count, 1); + return be_dst; +} + +void qent_dst_unref(struct qent_dst ** const pbe_dst) +{ + struct qent_dst * const be_dst = *pbe_dst; + struct mediabufs_ctl * mbc; + if (!be_dst) + return; + *pbe_dst = NULL; + + if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) + return; + + if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) { + queue_put_free(mbc->dst, &be_dst->base); + ff_weak_link_unlock(be_dst->mbc_wl); + } + else { + qe_dst_free(be_dst); + } +} + +MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, + unsigned int plane, + int fd, size_t size) +{ + struct qent_base *const be = &be_dst->base; + struct dmabuf_h * dh; + + if (be->status != QENT_IMPORT || be->dh[plane]) + return MEDIABUFS_ERROR_OPERATION_FAILED; + + dh = dmabuf_import(fd, size); + if (!dh) + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + + be->dh[plane] = dh; + return MEDIABUFS_STATUS_SUCCESS; +} + +// Returns 
noof buffers created, -ve for error
+/*
+ * Asks the driver (VIDIOC_CREATE_BUFS) for n DMABUF buffers in the current
+ * dst format and stores the V4L2 index of each created buffer in
+ * qes[0..count-1]. The driver may create fewer than n; that is only warned
+ * about here and the actual count is returned.
+ * On ioctl failure the return value is -errno (always negative).
+ */
+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
+{
+    unsigned int i;
+
+    struct v4l2_create_buffers cbuf = {
+        .count = n,
+        .memory = V4L2_MEMORY_DMABUF,
+        .format = mbc->dst_fmt,
+    };
+
+    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
+        /* Keep errno positive so the EINTR retry test works; negate on return */
+        const int err = errno;
+        if (err != EINTR) {
+            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
+            return -err;
+        }
+    }
+
+    if (cbuf.count != n)
+        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
+
+    /* Created buffers get consecutive indices starting at cbuf.index */
+    for (i = 0; i != cbuf.count; ++i)
+        qes[i]->base.index = cbuf.index + i;
+
+    return cbuf.count;
+}
+
+/*
+ * Obtain a dst qent.
+ *
+ * mbc == NULL: returns a fresh qent in QENT_IMPORT state (no V4L2 slot).
+ * Fixed dst Q: takes an entry from the free Q, waiting on its semaphore.
+ * Otherwise:   tries the free Q without waiting and, if it is empty,
+ *              creates a new qent plus one V4L2 buffer slot for it.
+ *
+ * Plane buffers are then (re)allocated from the dst format. On success the
+ * qent is returned with status QENT_PENDING and ref_count reset to 0.
+ * Returns NULL on any failure.
+ */
+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
+{
+    struct qent_dst * be_dst;
+
+    if (mbc == NULL) {
+        be_dst = qe_dst_new(NULL);
+        if (be_dst)
+            be_dst->base.status = QENT_IMPORT;
+        return be_dst;
+    }
+
+    if (mbc->dst_fixed) {
+        be_dst = base_to_dst(queue_get_free(mbc->dst));
+        if (!be_dst)
+            return NULL;
+    }
+    else {
+        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
+        if (!be_dst) {
+            be_dst = qe_dst_new(mbc->this_wlm);
+            if (!be_dst)
+                return NULL;
+
+            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
+                qe_dst_free(be_dst);
+                return NULL;
+            }
+        }
+    }
+
+    if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
+        /* Given how create buf works we can't uncreate it on alloc failure
+         * all we can do is put it on the free Q
+         */
+        queue_put_free(mbc->dst, &be_dst->base);
+        return NULL;
+    }
+
+    be_dst->base.status = QENT_PENDING;
+    atomic_store(&be_dst->base.ref_count, 0);
+    return be_dst;
+}
+
+/* Returns a pointer to the dst format currently held in mbc */
+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
+{
+    return &mbc->dst_fmt;
+}
+
+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
+                                      const unsigned int width,
+                                      const unsigned int height,
+                                      mediabufs_dst_fmt_accept_fn *const accept_fn,
+                                      void *const accept_v)
+{
+ 
MediaBufsStatus status; + unsigned int i; + const enum v4l2_buf_type buf_type = mbc->dst_fmt.type; + static const struct { + unsigned int flags_must; + unsigned int flags_not; + } trys[] = { + {0, V4L2_FMT_FLAG_EMULATED}, + {V4L2_FMT_FLAG_EMULATED, 0}, + }; + for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) { + status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd, + buf_type, + trys[i].flags_must, + trys[i].flags_not, + width, height, accept_fn, accept_v); + if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) + return status; + } + + if (status != MEDIABUFS_STATUS_SUCCESS) + return status; + + /* Try to create a buffer - don't alloc */ + return status; +} + +// ** This is a mess if we get partial alloc but without any way to remove +// individual V4L2 Q members we are somewhat stuffed +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed) +{ + unsigned int i; + int a = 0; + unsigned int qc; + struct qent_dst * qes[32]; + + if (n > 32) + return MEDIABUFS_ERROR_ALLOCATION_FAILED; + + // Create qents first as it is hard to get rid of the V4L2 buffers on error + for (qc = 0; qc != n; ++qc) + { + if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL) + goto fail; + } + + if ((a = create_dst_bufs(mbc, n, qes)) < 0) + goto fail; + + for (i = 0; i != a; ++i) + queue_put_free(mbc->dst, &qes[i]->base); + + if (a != n) + goto fail; + + mbc->dst_fixed = fixed; + return MEDIABUFS_STATUS_SUCCESS; + +fail: + for (i = (a < 0 ? 
0 : a); i != qc; ++i)
+        qe_dst_free(qes[i]);
+
+    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
+}
+
+/*
+ * Take a src qent from the src free Q, waiting on the Q's semaphore, and
+ * mark it QENT_PENDING.
+ * Returns NULL if no entry could be obtained (the wait failed or the free
+ * list was empty) instead of dereferencing a NULL entry.
+ */
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
+{
+    struct qent_base * buf = queue_get_free(mbc->src);
+    if (!buf)
+        return NULL;
+    buf->status = QENT_PENDING;
+    return base_to_src(buf);
+}
+
+/*
+ * Give back a src qent that will not be queued: the caller's pointer is
+ * set to NULL and the qent is returned to the src free Q.
+ * A NULL *pqe_src is a no-op.
+ */
+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
+{
+    struct qent_src *const qe_src = *pqe_src;
+    if (!qe_src)
+        return;
+    *pqe_src = NULL;
+    queue_put_free(mbc->src, &qe_src->base);
+}
+
+/* src format must have been set up before this */
+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
+                  struct dmabufs_ctl * const dbsc,
+                  unsigned int n)
+{
+    unsigned int i;
+    struct v4l2_requestbuffers req = {
+        .count = n,
+        .type = mbc->src_fmt.type,
+        .memory = V4L2_MEMORY_DMABUF
+    };
+
+    /* Drop any qents left on the free Q from a previous pool */
+    bq_free_all_free_src(mbc->src);
+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
+        if (errno != EINTR) {
+            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
+            return MEDIABUFS_ERROR_OPERATION_FAILED;
+        }
+    }
+
+    /* The driver may grant fewer slots than requested; use what we got */
+    if (n > req.count) {
+        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
+        n = req.count;
+    }
+
+    for (i = 0; i != n; ++i) {
+        struct qent_src *const be_src = qe_src_new();
+        if (!be_src) {
+            request_err(mbc->dc, "Failed to create src be %d\n", i);
+            goto fail;
+        }
+        if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
+            qe_src_free(be_src);
+            goto fail;
+        }
+        be_src->base.index = i;
+        be_src->fixed_size = !mediabufs_src_resizable(mbc);
+
+        queue_put_free(mbc->src, &be_src->base);
+    }
+
+    return MEDIABUFS_STATUS_SUCCESS;
+
+fail:
+    /* Free the qents made so far and release the V4L2 slots (count = 0) */
+    bq_free_all_free_src(mbc->src);
+    req.count = 0;
+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
+           errno == EINTR)
+        /* Loop */;
+
+    return MEDIABUFS_ERROR_OPERATION_FAILED;
+}
+
+
+
+/*
+ * Set stuff order:
+ * Set src fmt
+ * Set parameters (sps) on vfd
+ * Negotiate dst format (dst_fmt_set)
+ * Create src buffers
+ * 
Alloc a dst buffer or Create dst slots +*/ +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc) +{ + if (mbc->stream_on) + return MEDIABUFS_STATUS_SUCCESS; + + if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) { + request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + + if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) { + request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type); + set_stream(mbc->vfd, mbc->src_fmt.type, false); + return MEDIABUFS_ERROR_OPERATION_FAILED; + } + + mbc->stream_on = true; + return MEDIABUFS_STATUS_SUCCESS; +} + +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc) +{ + MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS; + + if (!mbc->stream_on) + return MEDIABUFS_STATUS_SUCCESS; + + if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) { + request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type); + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + + if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) { + request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type); + status = MEDIABUFS_ERROR_OPERATION_FAILED; + } + + mbc->stream_on = false; + return status; +} + +int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n) +{ + struct v4l2_ext_controls controls = { + .controls = control_array, + .count = n + }; + + if (mreq) { + controls.which = V4L2_CTRL_WHICH_REQUEST_VAL; + controls.request_fd = media_request_fd(mreq); + } + + while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls)) + { + const int err = errno; + if (err != EINTR) { + request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err)); + return -err; + } + } + + return 0; +} + +MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, + struct media_request * const mreq, + unsigned int id, void *data, + 
unsigned int size) +{ + struct v4l2_ext_control control = { + .id = id, + .ptr = data, + .size = size + }; + + int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1); + return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED; +} + +MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, + enum v4l2_buf_type buf_type, + const uint32_t pixfmt, + const uint32_t width, const uint32_t height, + const size_t bufsize) +{ + MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize); + if (rv != MEDIABUFS_STATUS_SUCCESS) + request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height); + + return rv; +} + +int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n) +{ + int rv = 0; + while (n--) { + while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) { + const int err = errno; + if (err != EINTR) { + // Often used for probing - errors are to be expected + request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err); + ctrls->type = 0; // 0 is invalid + rv = -err; + break; + } + } + ++ctrls; + } + return rv; +} + +int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc) +{ + // Single planar OUTPUT can only take exact size buffers + // Multiplanar will take larger than negotiated + return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type); +} + +static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc) +{ + if (!mbc) + return; + + // Break the weak link first + ff_weak_link_break(&mbc->this_wlm); + + polltask_delete(&mbc->pt); + + mediabufs_stream_off(mbc); + + // Empty v4l2 buffer stash + request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0); + request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0); + + bq_free_all_free_src(mbc->src); + bq_free_all_inuse_src(mbc->src); + bq_free_all_free_dst(mbc->dst); + + { + struct qent_dst *dst_be; + while ((dst_be 
= base_to_dst(bq_get_inuse(mbc->dst))) != NULL) { + dst_be->base.timestamp = (struct timeval){0}; + dst_be->base.status = QENT_ERROR; + qe_dst_done(dst_be); + } + } + + queue_delete(mbc->dst); + queue_delete(mbc->src); + close(mbc->vfd); + pthread_mutex_destroy(&mbc->lock); + + free(mbc); +} + +struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc) +{ + atomic_fetch_add(&mbc->ref_count, 1); + return mbc; +} + +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc) +{ + struct mediabufs_ctl *const mbc = *pmbc; + int n; + + if (!mbc) + return; + *pmbc = NULL; + n = atomic_fetch_sub(&mbc->ref_count, 1); + if (n) + return; + mediabufs_ctl_delete(mbc); +} + +unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc) +{ + return mbc->capability.version; +} + +static int set_capabilities(struct mediabufs_ctl *const mbc) +{ + uint32_t caps; + + if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) { + int err = errno; + request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err)); + return -err; + } + + caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ? 
+ mbc->capability.device_caps : + mbc->capability.capabilities; + + if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + } + else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) { + mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + } + else { + request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps); + return -EINVAL; + } + + return 0; +} + +/* One of these per context */ +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq) +{ + struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc)); + + if (!mbc) + return NULL; + + mbc->dc = dc; + // Default mono planar + mbc->pq = pq; + pthread_mutex_init(&mbc->lock, NULL); + + /* Pick a default - could we scan for this? */ + if (vpath == NULL) + vpath = "/dev/media0"; + + while ((mbc->vfd = open(vpath, O_RDWR)) == -1) + { + const int err = errno; + if (err != EINTR) { + request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err)); + goto fail0; + } + } + + if (set_capabilities(mbc)) { + request_err(dc, "Bad capabilities for video dev '%s'\n", vpath); + goto fail1; + } + + mbc->src = queue_new(mbc->vfd); + if (!mbc->src) + goto fail1; + mbc->dst = queue_new(mbc->vfd); + if (!mbc->dst) + goto fail2; + mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc); + if (!mbc->pt) + goto fail3; + mbc->this_wlm = ff_weak_link_new(mbc); + if (!mbc->this_wlm) + goto fail4; + + /* Cannot add polltask now - polling with nothing pending + * generates infinite error polls + */ + return mbc; + +fail4: + polltask_delete(&mbc->pt); +fail3: + queue_delete(mbc->dst); +fail2: + queue_delete(mbc->src); +fail1: + close(mbc->vfd); +fail0: + free(mbc); + request_info(dc, "%s: FAILED\n", __func__); + return NULL; +} + + + diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h new file mode 
100644 index 0000000000..0307a831de --- /dev/null +++ b/libavcodec/v4l2_req_media.h @@ -0,0 +1,154 @@ +/* +e.h +* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _MEDIA_H_ +#define _MEDIA_H_ + +#include +#include + +struct v4l2_format; +struct v4l2_fmtdesc; +struct v4l2_query_ext_ctrl; + +struct pollqueue; +struct media_request; +struct media_pool; + +typedef enum media_buf_status { + MEDIABUFS_STATUS_SUCCESS = 0, + MEDIABUFS_ERROR_OPERATION_FAILED, + MEDIABUFS_ERROR_DECODING_ERROR, + MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, + MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT, + MEDIABUFS_ERROR_ALLOCATION_FAILED, +} MediaBufsStatus; + +struct media_pool * media_pool_new(const char * const media_path, + struct pollqueue * const pq, + const unsigned int n); +void media_pool_delete(struct media_pool ** pmp); + +// Obtain a media request +// Will block if none availible - has a 2sec timeout +struct media_request * media_request_get(struct media_pool * const mp); +int media_request_fd(const struct media_request * const req); + +// Start this request +// Request structure is returned to pool once done +int media_request_start(struct media_request * const req); + +// Return an *unstarted* media_request to the pool +// May later be upgraded to allow for aborting a started req +int media_request_abort(struct media_request ** const preq); + + +struct mediabufs_ctl; +struct qent_src; +struct qent_dst; +struct dmabuf_h; +struct dmabufs_ctl; + +int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp); +struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst); + +// prealloc +int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc); +// dbsc may be NULL if realloc not required +int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc); +const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane); +int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane); +MediaBufsStatus qent_dst_wait(struct qent_dst *const be); +void 
qent_dst_delete(struct qent_dst *const be); +// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead +void qent_dst_unref(struct qent_dst ** const pbe_dst); +struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst); + +const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no); +MediaBufsStatus qent_dst_read_start(struct qent_dst *const be); +MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be); +/* Import an fd unattached to any mediabuf */ +MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst, + unsigned int plane, + int fd, size_t size); + +MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc, + struct media_request **const pmreq, + struct qent_src **const psrc_be, + struct qent_dst *const dst_be, + const bool is_final); +// Get / alloc a dst buffer & associate with a slot +// If the dst pool is empty then behaviour depends on the fixed flag passed to +// dst_slots_create. Default is !fixed = unlimited alloc +struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, + struct dmabufs_ctl *const dbsc); +// Create dst slots without alloc +// If fixed true then qent_alloc will only get slots from this pool and will +// block until a qent has been unrefed +MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed); + +MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc); +MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc); +const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc); + +typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc); + +MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc, + const unsigned int width, + const unsigned int height, + mediabufs_dst_fmt_accept_fn *const accept_fn, + void *const accept_v); +struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc); +void 
mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src); + +int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, + struct v4l2_ext_control control_array[], unsigned int n); +MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc, + struct media_request * const mreq, + unsigned int id, void *data, + unsigned int size); +int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n); + +int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc); + +MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc, + enum v4l2_buf_type buf_type, + const uint32_t pixfmt, + const uint32_t width, const uint32_t height, + const size_t bufsize); + +MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw, + struct dmabufs_ctl * const dbsc, + unsigned int n); + +#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c)) +unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc); + +struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, + const char *vpath, struct pollqueue *const pq); +void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc); +struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc); + + +#endif diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c new file mode 100644 index 0000000000..cc8a5d4001 --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.c @@ -0,0 +1,361 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_utils.h" + + +struct pollqueue; + +enum polltask_state { + POLLTASK_UNQUEUED = 0, + POLLTASK_QUEUED, + POLLTASK_RUNNING, + POLLTASK_Q_KILL, + POLLTASK_RUN_KILL, +}; + +struct polltask { + struct polltask *next; + struct polltask *prev; + struct pollqueue *q; + enum 
polltask_state state; + + int fd; + short events; + + void (*fn)(void *v, short revents); + void * v; + + uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */ + sem_t kill_sem; +}; + +struct pollqueue { + atomic_int ref_count; + pthread_mutex_t lock; + + struct polltask *head; + struct polltask *tail; + + bool kill; + bool no_prod; + int prod_fd; + struct polltask *prod_pt; + pthread_t worker; +}; + +struct polltask *polltask_new(struct pollqueue *const pq, + const int fd, const short events, + void (*const fn)(void *v, short revents), + void *const v) +{ + struct polltask *pt; + + if (!events) + return NULL; + + pt = malloc(sizeof(*pt)); + if (!pt) + return NULL; + + *pt = (struct polltask){ + .next = NULL, + .prev = NULL, + .q = pollqueue_ref(pq), + .fd = fd, + .events = events, + .fn = fn, + .v = v + }; + + sem_init(&pt->kill_sem, 0, 0); + + return pt; +} + +static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt) +{ + if (pt->prev) + pt->prev->next = pt->next; + else + pq->head = pt->next; + if (pt->next) + pt->next->prev = pt->prev; + else + pq->tail = pt->prev; + pt->next = NULL; + pt->prev = NULL; +} + +static void polltask_free(struct polltask * const pt) +{ + sem_destroy(&pt->kill_sem); + free(pt); +} + +static int pollqueue_prod(const struct pollqueue *const pq) +{ + static const uint64_t one = 1; + return write(pq->prod_fd, &one, sizeof(one)); +} + +void polltask_delete(struct polltask **const ppt) +{ + struct polltask *const pt = *ppt; + struct pollqueue * pq; + enum polltask_state state; + bool prodme; + + if (!pt) + return; + + pq = pt->q; + pthread_mutex_lock(&pq->lock); + state = pt->state; + pt->state = (state == POLLTASK_RUNNING) ? 
POLLTASK_RUN_KILL : POLLTASK_Q_KILL; + prodme = !pq->no_prod; + pthread_mutex_unlock(&pq->lock); + + if (state != POLLTASK_UNQUEUED) { + if (prodme) + pollqueue_prod(pq); + while (sem_wait(&pt->kill_sem) && errno == EINTR) + /* loop */; + } + + // Leave zapping the ref until we have DQed the PT as might well be + // legitimately used in it + *ppt = NULL; + polltask_free(pt); + pollqueue_unref(&pq); +} + +static uint64_t pollqueue_now(int timeout) +{ + struct timespec now; + uint64_t now_ms; + + if (clock_gettime(CLOCK_MONOTONIC, &now)) + return 0; + now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout; + return now_ms ? now_ms : (uint64_t)1; +} + +void pollqueue_add_task(struct polltask *const pt, const int timeout) +{ + bool prodme = false; + struct pollqueue * const pq = pt->q; + + pthread_mutex_lock(&pq->lock); + if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) { + if (pq->tail) + pq->tail->next = pt; + else + pq->head = pt; + pt->prev = pq->tail; + pt->next = NULL; + pt->state = POLLTASK_QUEUED; + pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout); + pq->tail = pt; + prodme = !pq->no_prod; + } + pthread_mutex_unlock(&pq->lock); + if (prodme) + pollqueue_prod(pq); +} + +static void *poll_thread(void *v) +{ + struct pollqueue *const pq = v; + struct pollfd *a = NULL; + size_t asize = 0; + + pthread_mutex_lock(&pq->lock); + do { + unsigned int i; + unsigned int n = 0; + struct polltask *pt; + struct polltask *pt_next; + uint64_t now = pollqueue_now(0); + int timeout = -1; + int rv; + + for (pt = pq->head; pt; pt = pt_next) { + int64_t t; + + pt_next = pt->next; + + if (pt->state == POLLTASK_Q_KILL) { + pollqueue_rem_task(pq, pt); + sem_post(&pt->kill_sem); + continue; + } + + if (n >= asize) { + asize = asize ? 
asize * 2 : 4; + a = realloc(a, asize * sizeof(*a)); + if (!a) { + request_log("Failed to realloc poll array to %zd\n", asize); + goto fail_locked; + } + } + + a[n++] = (struct pollfd){ + .fd = pt->fd, + .events = pt->events + }; + + t = (int64_t)(pt->timeout - now); + if (pt->timeout && t < INT_MAX && + (timeout < 0 || (int)t < timeout)) + timeout = (t < 0) ? 0 : (int)t; + } + pthread_mutex_unlock(&pq->lock); + + if ((rv = poll(a, n, timeout)) == -1) { + if (errno != EINTR) { + request_log("Poll error: %s\n", strerror(errno)); + goto fail_unlocked; + } + } + + pthread_mutex_lock(&pq->lock); + now = pollqueue_now(0); + + /* Prodding in this loop is pointless and might lead to + * infinite looping + */ + pq->no_prod = true; + for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { + pt_next = pt->next; + + /* Pending? */ + if (a[i].revents || + (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) { + pollqueue_rem_task(pq, pt); + if (pt->state == POLLTASK_QUEUED) + pt->state = POLLTASK_RUNNING; + if (pt->state == POLLTASK_Q_KILL) + pt->state = POLLTASK_RUN_KILL; + pthread_mutex_unlock(&pq->lock); + + /* This can add new entries to the Q but as + * those are added to the tail our existing + * chain remains intact + */ + pt->fn(pt->v, a[i].revents); + + pthread_mutex_lock(&pq->lock); + if (pt->state == POLLTASK_RUNNING) + pt->state = POLLTASK_UNQUEUED; + if (pt->state == POLLTASK_RUN_KILL) + sem_post(&pt->kill_sem); + } + } + pq->no_prod = false; + + } while (!pq->kill); + +fail_locked: + pthread_mutex_unlock(&pq->lock); +fail_unlocked: + free(a); + return NULL; +} + +static void prod_fn(void *v, short revents) +{ + struct pollqueue *const pq = v; + char buf[8]; + if (revents) + read(pq->prod_fd, buf, 8); + if (!pq->kill) + pollqueue_add_task(pq->prod_pt, -1); +} + +struct pollqueue * pollqueue_new(void) +{ + struct pollqueue *pq = malloc(sizeof(*pq)); + if (!pq) + return NULL; + *pq = (struct pollqueue){ + .ref_count = ATOMIC_VAR_INIT(0), + .lock = 
PTHREAD_MUTEX_INITIALIZER, + .head = NULL, + .tail = NULL, + .kill = false, + .prod_fd = -1 + }; + + pq->prod_fd = eventfd(0, EFD_NONBLOCK); + if (pq->prod_fd == 1) + goto fail1; + pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); + if (!pq->prod_pt) + goto fail2; + pollqueue_add_task(pq->prod_pt, -1); + if (pthread_create(&pq->worker, NULL, poll_thread, pq)) + goto fail3; + // Reset ref count which will have been inced by the add_task + atomic_store(&pq->ref_count, 0); + return pq; + +fail3: + polltask_free(pq->prod_pt); +fail2: + close(pq->prod_fd); +fail1: + free(pq); + return NULL; +} + +static void pollqueue_free(struct pollqueue *const pq) +{ + void *rv; + + pthread_mutex_lock(&pq->lock); + pq->kill = true; + pollqueue_prod(pq); + pthread_mutex_unlock(&pq->lock); + + pthread_join(pq->worker, &rv); + polltask_free(pq->prod_pt); + pthread_mutex_destroy(&pq->lock); + close(pq->prod_fd); + free(pq); +} + +struct pollqueue * pollqueue_ref(struct pollqueue *const pq) +{ + atomic_fetch_add(&pq->ref_count, 1); + return pq; +} + +void pollqueue_unref(struct pollqueue **const ppq) +{ + struct pollqueue * const pq = *ppq; + + if (!pq) + return; + *ppq = NULL; + + if (atomic_fetch_sub(&pq->ref_count, 1) != 0) + return; + + pollqueue_free(pq); +} + + + diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h new file mode 100644 index 0000000000..e1182cb2fc --- /dev/null +++ b/libavcodec/v4l2_req_pollqueue.h @@ -0,0 +1,18 @@ +#ifndef POLLQUEUE_H_ +#define POLLQUEUE_H_ + +struct polltask; +struct pollqueue; + +struct polltask *polltask_new(struct pollqueue *const pq, + const int fd, const short events, + void (*const fn)(void *v, short revents), + void *const v); +void polltask_delete(struct polltask **const ppt); + +void pollqueue_add_task(struct polltask *const pt, const int timeout); +struct pollqueue * pollqueue_new(void); +void pollqueue_unref(struct pollqueue **const ppq); +struct pollqueue * pollqueue_ref(struct pollqueue *const 
pq); + +#endif /* POLLQUEUE_H_ */ diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h new file mode 100644 index 0000000000..cb4bd164b4 --- /dev/null +++ b/libavcodec/v4l2_req_utils.h @@ -0,0 +1,22 @@ +#include "libavutil/log.h" + +#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__) + +#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__) +#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__) +#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__) +#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__) + +static inline char safechar(char c) { + return c > 0x20 && c < 0x7f ? c : '.'; +} + +static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { + tbuf[0] = safechar((fcc >> 0) & 0xff); + tbuf[1] = safechar((fcc >> 8) & 0xff); + tbuf[2] = safechar((fcc >> 16) & 0xff); + tbuf[3] = safechar((fcc >> 24) & 0xff); + tbuf[4] = '\0'; + return tbuf; +} + diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c new file mode 100644 index 0000000000..0ae14db90b --- /dev/null +++ b/libavcodec/v4l2_request_hevc.c @@ -0,0 +1,311 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + + +#include "decode.h" +#include "hevcdec.h" +#include "hwconfig.h" + +#include "v4l2_request_hevc.h" + +#include "libavutil/hwcontext_drm.h" + +#include "v4l2_req_devscan.h" +#include "v4l2_req_dmabufs.h" +#include "v4l2_req_pollqueue.h" +#include "v4l2_req_media.h" +#include "v4l2_req_utils.h" + +static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8) +{ + const size_t wxh = w * h; + size_t bits_alloc; + + /* Annex A gives a min compression of 2 @ lvl 3.1 + * (wxh <= 983040) and min 4 thereafter but avoid + * the odity of 983041 having a lower limit than + * 983040. + * Multiply by 3/2 for 4:2:0 + */ + bits_alloc = wxh < 983040 ? wxh * 3 / 4 : + wxh < 983040 * 2 ? 983040 * 3 / 4 : + wxh * 3 / 8; + /* Allow for bit depth */ + bits_alloc += (bits_alloc * bits_minus8) / 8; + /* Add a few bytes (16k) for overhead */ + bits_alloc += 0x4000; + return bits_alloc; +} + +static int v4l2_req_hevc_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->start_frame(avctx, buffer, size); +} + +static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->decode_slice(avctx, buffer, size); +} + +static int v4l2_req_hevc_end_frame(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->end_frame(avctx); +} + +static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + ctx->fns->abort_frame(avctx); +} + +static 
int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->frame_params(avctx, hw_frames_ctx); +} + +static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + return ctx->fns->alloc_frame(avctx, frame); +} + + +static int v4l2_request_hevc_uninit(AVCodecContext *avctx) +{ + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + + av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__); + + decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode + + mediabufs_ctl_unref(&ctx->mbufs); + media_pool_delete(&ctx->mpool); + pollqueue_unref(&ctx->pq); + dmabufs_ctl_delete(&ctx->dbufs); + devscan_delete(&ctx->devscan); + + decode_q_uninit(&ctx->decode_q); + +// if (avctx->hw_frames_ctx) { +// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; +// av_buffer_pool_flush(hwfc->pool); +// } + return 0; +} + +static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc) +{ + AVCodecContext *const avctx = v; + const HEVCContext *const h = avctx->priv_data; + + if (h->ps.sps->bit_depth == 8) { + if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 || + fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) { + return 1; + } + } + else if (h->ps.sps->bit_depth == 10) { + if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { + return 1; + } + } + return 0; +} + +static int v4l2_request_hevc_init(AVCodecContext *avctx) +{ + const HEVCContext *h = avctx->priv_data; + V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; + const HEVCSPS * const sps = h->ps.sps; + int ret; + const struct decdev * decdev; + const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes + size_t src_size; + + av_log(avctx, 
AV_LOG_DEBUG, "<<< %s\n", __func__); + + // Give up immediately if this is something that we have no code to deal with + if (h->ps.sps->chroma_format_idc != 1) { + av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc); + return AVERROR_PATCHWELCOME; + } + if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) || + h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) { + av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma); + return AVERROR_PATCHWELCOME; + } + + if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n"); + return (AVERROR(-ret)); + } + ret = AVERROR(ENOMEM); // Assume mem fail by default for these + + if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL) + { + av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n"); + ret = AVERROR(ENODEV); + goto fail0; + } + av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n", + decdev_media_path(decdev), decdev_video_path(decdev)); + + if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n"); + goto fail0; + } + + if ((ctx->pq = pollqueue_new()) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n"); + goto fail1; + } + + if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n"); + goto fail2; + } + + if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) { + av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n"); + goto fail3; + } + + // Ask for an initial bitbuf size of max size / 4 + // We will realloc if we need more + // Must use sps->h/w as avctx contains cropped size + src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8); + if (mediabufs_src_resizable(ctx->mbufs)) + 
src_size /= 4; + // Kludge for conformance tests which break Annex A limits + else if (src_size < 0x40000) + src_size = 0x40000; + + if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt, + sps->width, sps->height, src_size)) { + char tbuf1[5]; + av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); + goto fail4; + } + + if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 3); + } + else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 2); + } + else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) { + av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n"); + ctx->fns = &V2(ff_v4l2_req_hevc, 1); + } + else { + av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n"); + ret = AVERROR(EINVAL); + goto fail4; + } + + if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) { + char tbuf1[5]; + av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); + goto fail4; + } + + if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) { + av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n"); + goto fail4; + } + + { + unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering + + avctx->thread_count + (avctx->extra_hw_frames > 0 ? 
avctx->extra_hw_frames : 6); + av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots, + sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering, + avctx->thread_count, avctx->extra_hw_frames); + + // extra_hw_frames is -1 if unset + if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) { + av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n"); + goto fail4; + } + } + + if (mediabufs_stream_on(ctx->mbufs)) { + av_log(avctx, AV_LOG_ERROR, "Failed stream on\n"); + goto fail4; + } + + if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n"); + goto fail4; + } + + if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) { + av_log(avctx, AV_LOG_ERROR, "Failed set controls\n"); + goto fail5; + } + + decode_q_init(&ctx->decode_q); + + // Set our s/w format + avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format; + + av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n", + ctx->fns->name, + decdev_media_path(decdev), decdev_video_path(decdev)); + + return 0; + +fail5: + av_buffer_unref(&avctx->hw_frames_ctx); +fail4: + mediabufs_ctl_unref(&ctx->mbufs); +fail3: + media_pool_delete(&ctx->mpool); +fail2: + pollqueue_unref(&ctx->pq); +fail1: + dmabufs_ctl_delete(&ctx->dbufs); +fail0: + devscan_delete(&ctx->devscan); + return ret; +} + +const AVHWAccel ff_hevc_v4l2request_hwaccel = { + .name = "hevc_v4l2request", + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_HEVC, + .pix_fmt = AV_PIX_FMT_DRM_PRIME, + .alloc_frame = v4l2_req_hevc_alloc_frame, + .start_frame = v4l2_req_hevc_start_frame, + .decode_slice = v4l2_req_hevc_decode_slice, + .end_frame = v4l2_req_hevc_end_frame, + .abort_frame = v4l2_req_hevc_abort_frame, + .init = v4l2_request_hevc_init, + .uninit = v4l2_request_hevc_uninit, + .priv_data_size = sizeof(V4L2RequestContextHEVC), + .frame_params = v4l2_req_hevc_frame_params, 
+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, +}; diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h new file mode 100644 index 0000000000..b2cb8c8584 --- /dev/null +++ b/libavcodec/v4l2_request_hevc.h @@ -0,0 +1,102 @@ +#ifndef AVCODEC_V4L2_REQUEST_HEVC_H +#define AVCODEC_V4L2_REQUEST_HEVC_H + +#include +#include "v4l2_req_decode_q.h" + +#ifndef DRM_FORMAT_NV15 +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') +#endif + +#ifndef DRM_FORMAT_NV20 +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') +#endif + +// P030 should be defined in drm_fourcc.h and hopefully will be sometime +// in the future but until then... +#ifndef DRM_FORMAT_P030 +#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') +#endif + +#ifndef DRM_FORMAT_NV15 +#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') +#endif + +#ifndef DRM_FORMAT_NV20 +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') +#endif + +#include +#ifndef V4L2_CID_CODEC_BASE +#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE +#endif + +// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined +// in drm_fourcc.h hopefully will be sometime in the future but until then... +#ifndef V4L2_PIX_FMT_NV12_10_COL128 +#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') +#endif + +#ifndef V4L2_PIX_FMT_NV12_COL128 +#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ +#endif + +#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY +#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800 +#endif + +#define MAX_SLICES 128 + +#define VCAT(name, version) name##_v##version +#define V2(n,v) VCAT(n, v) +#define V(n) V2(n, HEVC_CTRLS_VERSION) + +#define S2(x) #x +#define STR(x) S2(x) + +// 1 per decoder +struct v4l2_req_decode_fns; + +typedef struct V4L2RequestContextHEVC { +// V4L2RequestContext base; + const struct v4l2_req_decode_fns * fns; + + unsigned int timestamp; // ?? 
maybe uint64_t + + int multi_slice; + int decode_mode; + int start_code; + int max_slices; + + req_decode_q decode_q; + + struct devscan *devscan; + struct dmabufs_ctl *dbufs; + struct pollqueue *pq; + struct media_pool * mpool; + struct mediabufs_ctl *mbufs; +} V4L2RequestContextHEVC; + +typedef struct v4l2_req_decode_fns { + int src_pix_fmt_v4l2; + const char * name; + + // Init setup + int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); + int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); + + // Passthrough of hwaccel fns + int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + int (*end_frame)(AVCodecContext *avctx); + void (*abort_frame)(AVCodecContext *avctx); + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); + int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame); +} v4l2_req_decode_fns; + + +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3); + +#endif diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index ea93e11588..a9e0c6323e 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) size = next - start - 4; if (size <= 0) continue; - buf2_size = vc1_unescape_buffer(start + 4, size, buf2); + buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); init_get_bits(&gb, buf2, buf2_size * 8); switch (AV_RB32(start)) { case VC1_CODE_SEQHDR: @@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, case VC1_CODE_FRAME: if (avctx->hwaccel) buf_start = start; - buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); + buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); break; case VC1_CODE_FIELD: { int 
buf_size3; @@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } - buf_size3 = vc1_unescape_buffer(start + 4, size, - slices[n_slices].buf); + buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, + slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; @@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, break; } case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ - buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); + buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); init_get_bits(&s->gb, buf2, buf_size2 * 8); ff_vc1_decode_entry_point(avctx, v, &s->gb); break; @@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } - buf_size3 = vc1_unescape_buffer(start + 4, size, - slices[n_slices].buf); + buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, + slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); @@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, ret = AVERROR(ENOMEM); goto err; } - buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, buf_size3 << 3); slices[n_slices].mby_start = s->mb_height + 1 >> 1; @@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, n_slices1 = n_slices - 1; n_slices++; } - buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); + buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); } else { - buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); + 
buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); } init_get_bits(&s->gb, buf2, buf_size2*8); } else diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c index c25a6f3adf..10182786b3 100644 --- a/libavcodec/vc1dsp.c +++ b/libavcodec/vc1dsp.c @@ -32,6 +32,7 @@ #include "rnd_avg.h" #include "vc1dsp.h" #include "startcode.h" +#include "vc1_common.h" /* Apply overlap transform to horizontal edge */ static void vc1_v_overlap_c(uint8_t *src, int stride) @@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ dsp->startcode_find_candidate = ff_startcode_find_candidate_c; + dsp->vc1_unescape_buffer = vc1_unescape_buffer; if (ARCH_AARCH64) ff_vc1dsp_init_aarch64(dsp); diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h index 75db62b1b4..e192b431be 100644 --- a/libavcodec/vc1dsp.h +++ b/libavcodec/vc1dsp.h @@ -80,6 +80,9 @@ typedef struct VC1DSPContext { * one or more further zero bytes and a one byte. 
*/ int (*startcode_find_candidate)(const uint8_t *buf, int size); + + /* Copy a buffer, removing startcode emulation escape bytes as we go */ + int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); } VC1DSPContext; void ff_vc1dsp_init(VC1DSPContext* c);
diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
new file mode 100644
index 0000000000..f234a985b9
--- /dev/null
+++ b/libavcodec/weak_link.c
@@ -0,0 +1,120 @@
+#include <stdlib.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include "weak_link.h"
+
+/* Shared control block of one weak link; master and client handles both point at it */
+struct ff_weak_link_master {
+    atomic_int ref_count;    /* 0 is single ref for easier atomics */
+    pthread_rwlock_t lock;   /* read-held while a client uses ptr; write-held to break */
+    void * ptr;              /* target object; NULL once the link has been broken */
+};
+
+/* A client handle is the master pointer under an opaque type; cast it back */
+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
+{
+    return (struct ff_weak_link_master *)c;
+}
+
+/*
+ * Create a weak link to p. The returned master handle is the single initial
+ * reference. Returns NULL if allocation or rwlock initialisation fails.
+ */
+struct ff_weak_link_master * ff_weak_link_new(void * p)
+{
+    struct ff_weak_link_master * w = malloc(sizeof(*w));
+    if (!w)
+        return NULL;
+    w->ptr = p;
+    /* malloc() does not zero memory: the count must be set explicitly because
+     * weak_link_do_unref() frees only when the pre-decrement value is 0 */
+    atomic_init(&w->ref_count, 0);
+    if (pthread_rwlock_init(&w->lock, NULL)) {
+        free(w);
+        return NULL;
+    }
+    return w;
+}
+
+/* Drop one reference; destroy the lock and free the block when it was the last */
+static void weak_link_do_unref(struct ff_weak_link_master * const w)
+{
+    /* fetch_sub returns the old value; 0 means we held the only reference */
+    int n = atomic_fetch_sub(&w->ref_count, 1);
+    if (n)
+        return;
+
+    pthread_rwlock_destroy(&w->lock);
+    free(w);
+}
+
+// Unref & break link
+// Takes the write lock, so it waits for any client that currently holds the
+// link locked; later ff_weak_link_lock() calls see NULL. Zeroes *ppLink.
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
+{
+    struct ff_weak_link_master * const w = *ppLink;
+    if (!w)
+        return;
+
+    *ppLink = NULL;
+    pthread_rwlock_wrlock(&w->lock);
+    w->ptr = NULL;
+    pthread_rwlock_unlock(&w->lock);
+
+    weak_link_do_unref(w);
+}
+
+// Take an extra reference for a client. Returns NULL if w is NULL.
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
+{
+    if (!w)
+        return NULL;
+    atomic_fetch_add(&w->ref_count, 1);
+    return (struct ff_weak_link_client*)w;
+}
+
+// Release a client reference and zero *ppLink. A NULL *ppLink is ignored.
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
+{
+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+    if (!w)
+        return;
+
+    *ppLink = NULL;
+    weak_link_do_unref(w);
+}
+
+// Returns the target pointer with the read lock held; the caller must then
+// call ff_weak_link_unlock(). If the link is broken (or the lock cannot be
+// taken) the client ref is dropped, *ppLink is zeroed and NULL is returned.
+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
+{
+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
+
+    if (!w)
+        return NULL;
+
+    if (pthread_rwlock_rdlock(&w->lock))
+        goto broken;
+
+    if (w->ptr)
+        return w->ptr;
+
+    pthread_rwlock_unlock(&w->lock);
+
+broken:
+    *ppLink = NULL;
+    weak_link_do_unref(w);
+    return NULL;
+}
+
+// Ignores a NULL c (so can be on the return path of both broken & live links)
+void ff_weak_link_unlock(struct ff_weak_link_client * c)
+{
+    struct ff_weak_link_master * const w = weak_link_x(c);
+    if (w)
+        pthread_rwlock_unlock(&w->lock);
+}
+
+
diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
new file mode 100644
index 0000000000..415b6a27a0
--- /dev/null
+++ b/libavcodec/weak_link.h
@@ -0,0 +1,23 @@
+struct ff_weak_link_master;
+struct ff_weak_link_client;
+
+struct ff_weak_link_master * ff_weak_link_new(void * p);
+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
+
+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
+
+// Returns NULL if link broken - in this case it will also zap
+// *ppLink and unref the weak_link.
+// Returns NULL if *ppLink is NULL (so a link once broken stays broken) +// +// The above does mean that there is a race if this is called simultainiously +// by two threads using the same weak_link_client (so don't do that) +void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink); +void ff_weak_link_unlock(struct ff_weak_link_client * c); + + + + + + diff --git a/libavdevice/Makefile b/libavdevice/Makefile index 0dfe47a1f4..ec7c7b4147 100644 --- a/libavdevice/Makefile +++ b/libavdevice/Makefile @@ -47,6 +47,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o +OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o +OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o +OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o OBJS-$(CONFIG_XV_OUTDEV) += xv.o diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c index 92b27a1d14..19d2a9de55 100644 --- a/libavdevice/alldevices.c +++ b/libavdevice/alldevices.c @@ -53,6 +53,9 @@ extern AVOutputFormat ff_sndio_muxer; extern AVInputFormat ff_v4l2_demuxer; extern AVOutputFormat ff_v4l2_muxer; extern AVInputFormat ff_vfwcap_demuxer; +extern AVOutputFormat ff_vout_drm_muxer; +extern AVOutputFormat ff_vout_egl_muxer; +extern AVOutputFormat ff_vout_rpi_muxer; extern AVInputFormat ff_xcbgrab_demuxer; extern AVOutputFormat ff_xv_muxer; diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c new file mode 100644 index 0000000000..4b25ec4344 --- /dev/null +++ b/libavdevice/drm_vout.c @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2020 John Cox for Raspberry Pi Trading + * + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+//     limited to testing.
+
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/hwcontext_drm.h"
+#include "libavformat/internal.h"
+#include "avdevice.h"
+
+#include "pthread.h"
+#include <semaphore.h>
+#include <unistd.h>
+
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+
+#define TRACE_ALL 0
+
+#define DRM_MODULE "vc4"
+
+#define ERRSTR strerror(errno)
+
+// Output routing chosen by find_crtc() / find_plane()
+struct drm_setup {
+    int conId;
+    uint32_t crtcId;
+    int crtcIdx;              // index of crtcId in the resource list (for possible_crtcs)
+    uint32_t planeId;
+    unsigned int out_fourcc;  // DRM format that planeId was selected for
+    struct {
+        int x, y, width, height;
+    } compose;                // destination rectangle, copied from the CRTC
+};
+
+// Per-frame DRM state that must stay alive while the frame may be on screen
+typedef struct drm_aux_s {
+    unsigned int fb_handle;
+    uint32_t bo_handles[AV_DRM_MAX_PLANES];
+    AVFrame * frame;
+} drm_aux_t;
+
+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
+// we get initial flicker probably due to dodgy drm timing
+#define AUX_SIZE 3
+typedef struct drm_display_env_s
+{
+    AVClass *class;
+
+    int drm_fd;
+    uint32_t con_id;
+    struct drm_setup setup;
+    enum AVPixelFormat avfmt;
+    int show_all;             // option: block for the display thread rather than drop frames
+
+    unsigned int ano;         // index of the aux slot the next frame will use
+    drm_aux_t aux[AUX_SIZE];
+
+    pthread_t q_thread;
+    sem_t q_sem_in;           // posted when q_next holds a frame (or on terminate)
+    sem_t q_sem_out;          // posted when the display thread can take another frame
+    int q_terminate;
+    int q_running;            // set once the thread + semaphores exist; guards deinit
+    AVFrame * q_next;
+
+} drm_display_env_t;
+
+
+// Nothing to flush: all teardown happens in drm_vout_deinit()
+static int drm_vout_write_trailer(AVFormatContext *s)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+    return 0;
+}
+
+// Accept exactly one wrapped-AVFrame video stream; AVERROR(EINVAL) otherwise
+static int drm_vout_write_header(AVFormatContext *s)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+    // Test the stream count first so that streams[0] is never read when absent
+    if (   s->nb_streams != 1
+        || s->streams[0]->codecpar->codec_type != AVMEDIA_TYPE_VIDEO
+        || s->streams[0]->codecpar->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+// Find a plane usable on CRTC index crtcidx that supports the given format.
+// On success stores its id in *pplane_id and returns 0; returns -1 if no
+// plane matches or a libdrm query fails (*pplane_id is then left untouched).
+static int find_plane(struct AVFormatContext * const avctx,
+                      const int drmfd, const int crtcidx, const uint32_t format,
+                      uint32_t * const pplane_id)
+{
+    drmModePlaneResPtr planes;
+    drmModePlanePtr plane;
+    unsigned int i;
+    unsigned int j;
+    int ret = -1;
+
+    planes = drmModeGetPlaneResources(drmfd);
+    if (!planes)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
+        return -1;
+    }
+
+    for (i = 0; i < planes->count_planes; ++i) {
+        plane = drmModeGetPlane(drmfd, planes->planes[i]);
+        // Check the plane we just fetched (not the resource list) before use
+        if (!plane)
+        {
+            av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
+            break;
+        }
+
+        if (!(plane->possible_crtcs & (1U << crtcidx))) {
+            drmModeFreePlane(plane);
+            continue;
+        }
+
+        for (j = 0; j < plane->count_formats; ++j) {
+            if (plane->formats[j] == format)
+                break;
+        }
+
+        if (j == plane->count_formats) {
+            drmModeFreePlane(plane);
+            continue;
+        }
+
+        *pplane_id = plane->plane_id;
+        drmModeFreePlane(plane);
+        ret = 0;
+        break;
+    }
+
+    drmModeFreePlaneResources(planes);
+    return ret;
+}
+
+// Release everything an aux slot holds: framebuffer, GEM handles and frame ref
+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
+{
+    if (da->fb_handle != 0) {
+        drmModeRmFB(de->drm_fd, da->fb_handle);
+        da->fb_handle = 0;
+    }
+
+    for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
+        if (da->bo_handles[i]) {
+            struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
+            drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+            da->bo_handles[i] = 0;
+        }
+    }
+    av_frame_free(&da->frame);
+}
+
+// Put one DRM_PRIME frame on screen. Takes ownership of frame: it is either
+// freed here or parked in the current aux slot until that slot is reused.
+// Returns 0 on success, non-zero on failure.
+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
+{
+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
+    drm_aux_t * da = de->aux + de->ano;
+    const uint32_t format = desc->layers[0].format;
+    int ret = 0;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
+#endif
+
+    // (Re)select the plane whenever the frame format changes
+    if (de->setup.out_fourcc != format) {
+        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
+            av_frame_free(&frame);
+            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
+            return -1;
+        }
+        de->setup.out_fourcc = format;
+    }
+
+    {
+        drmVBlank vbl = {
+            .request = {
+                .type = DRM_VBLANK_RELATIVE,
+                .sequence = 0
+            }
+        };
+
+        // Retry only when interrupted; any other failure is ignored
+        while (drmWaitVBlank(de->drm_fd, &vbl)) {
+            if (errno != EINTR) {
+//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
+                break;
+            }
+        }
+    }
+
+    // Free whatever this slot held from AUX_SIZE frames ago
+    da_uninit(de, da);
+
+    {
+        uint32_t pitches[4] = {0};
+        uint32_t offsets[4] = {0};
+        uint64_t modifiers[4] = {0};
+        uint32_t bo_handles[4] = {0};
+        int i, j, n;
+
+        // From here the slot owns the frame, so error returns need not free it
+        da->frame = frame;
+
+        for (i = 0; i < desc->nb_objects; ++i) {
+            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
+                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
+                return -1;
+            }
+        }
+
+        n = 0;
+        for (i = 0; i < desc->nb_layers; ++i) {
+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
+                // The arrays above have 4 entries; refuse descriptors that need more
+                if (n >= 4) {
+                    av_log(s, AV_LOG_WARNING, "Too many planes in DRM frame descriptor\n");
+                    return -1;
+                }
+                pitches[n] = p->pitch;
+                offsets[n] = p->offset;
+                modifiers[n] = obj->format_modifier;
+                bo_handles[n] = da->bo_handles[p->object_index];
+                ++n;
+            }
+        }
+
+#if 1 && TRACE_ALL
+        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
+               av_frame_cropped_width(frame),
+               av_frame_cropped_height(frame),
+               desc->layers[0].format,
+               bo_handles[0],
+               bo_handles[1],
+               bo_handles[2],
+               bo_handles[3],
+               pitches[0],
+               pitches[1],
+               pitches[2],
+               pitches[3],
+               offsets[0],
+               offsets[1],
+               offsets[2],
+               offsets[3],
+               (long long)modifiers[0],
+               (long long)modifiers[1],
+               (long long)modifiers[2],
+               (long long)modifiers[3]
+              );
+#endif
+
+        if (drmModeAddFB2WithModifiers(de->drm_fd,
+                                       av_frame_cropped_width(frame),
+                                       av_frame_cropped_height(frame),
+                                       desc->layers[0].format, bo_handles,
+                                       pitches, offsets, modifiers,
+                                       &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
+            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
+            return -1;
+        }
+    }
+
+    // Source rectangle is in 16.16 fixed point, hence the shifts
+    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
+                          da->fb_handle, 0,
+                          de->setup.compose.x, de->setup.compose.y,
+                          de->setup.compose.width,
+                          de->setup.compose.height,
+                          0, 0,
+                          av_frame_cropped_width(frame) << 16,
+                          av_frame_cropped_height(frame) << 16);
+
+    if (ret != 0) {
+        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
+    }
+
+    // Advance to the next aux slot, wrapping at AUX_SIZE
+    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
+
+    return ret;
+}
+
+// Wait on sem (or try once if nowait), retrying on EINTR.
+// Returns 0 on success or -errno on any other failure.
+static int do_sem_wait(sem_t * const sem, const int nowait)
+{
+    while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
+        if (errno != EINTR)
+            return -errno;
+    }
+    return 0;
+}
+
+// Display thread: takes frames handed over in q_next and shows them until
+// q_terminate is seen, then releases all aux slots and any pending frame.
+static void * display_thread(void * v)
+{
+    AVFormatContext * const s = v;
+    drm_display_env_t * const de = s->priv_data;
+    int i;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+#endif
+
+    // Signal that the first frame may be queued
+    sem_post(&de->q_sem_out);
+
+    for (;;) {
+        AVFrame * frame;
+
+        do_sem_wait(&de->q_sem_in, 0);
+
+        if (de->q_terminate)
+            break;
+
+        frame = de->q_next;
+        de->q_next = NULL;
+        sem_post(&de->q_sem_out);
+
+        do_display(s, de, frame);
+    }
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+#endif
+
+    for (i = 0; i != AUX_SIZE; ++i)
+        da_uninit(de, de->aux + i);
+
+    av_frame_free(&de->q_next);
+
+    return NULL;
+}
+
+// Queue one wrapped AVFrame for display. DRM_PRIME frames are referenced,
+// VAAPI frames are mapped to DRM_PRIME, anything else is rejected with
+// AVERROR(EINVAL). If the display thread is not ready the frame is dropped
+// (unless show_all is set, in which case this blocks until it is ready).
+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
+    AVFrame * frame;
+    drm_display_env_t * const de = s->priv_data;
+    int ret;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
+        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
+        return 0;
+    }
+
+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        if ((ret = av_frame_ref(frame, src_frame)) < 0) {
+            av_frame_free(&frame);
+            return ret;
+        }
+    }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        frame = av_frame_alloc();
+        if (!frame)
+            return AVERROR(ENOMEM);
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+        if (av_hwframe_map(frame, src_frame, 0) != 0)
+        {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
+            av_frame_free(&frame);
+            return AVERROR(EINVAL);
+        }
+    }
+    else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
+        return AVERROR(EINVAL);
+    }
+
+    // A failed (non-blocking) wait means the previous frame is still pending: drop this one
+    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
+    if (ret) {
+        av_frame_free(&frame);
+    }
+    else {
+        de->q_next = frame;
+        sem_post(&de->q_sem_in);
+    }
+
+    return 0;
+}
+
+// Uncoded-frame entry point: answers the capability query and otherwise does
+// nothing; both paths return 0.
+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+                                unsigned flags)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+#endif
+
+    /* drm_vout_write_header() should have accepted only supported formats */
+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
+        return 0;
+
+    return 0;
+}
+
+// Only the window-repaint message is accepted (as a no-op); others give ENOSYS
+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
+#endif
+    switch(type) {
+    case AV_APP_TO_DEV_WINDOW_REPAINT:
+        return 0;
+    default:
+        break;
+    }
+    return AVERROR(ENOSYS);
+}
+
+// Pick a connector + CRTC and fill in s (ids, CRTC index, compose rectangle).
+// If s->conId is 0 the first connector with an active CRTC is chosen.
+// *pConId (if non-NULL) receives the connector id. Returns 0 or -1; every
+// libdrm object fetched here is released before returning.
+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
+{
+    int ret = -1;
+    int i;
+    drmModeRes *res = drmModeGetResources(drmfd);
+    drmModeConnector *c;
+
+    if(!res)
+    {
+        av_log(avctx, AV_LOG_ERROR, "drmModeGetResources failed: %s\n", ERRSTR);
+        return -1;
+    }
+
+    if (res->count_crtcs <= 0)
+    {
+        av_log(avctx, AV_LOG_ERROR, "drm: no crts\n");
+        goto fail_res;
+    }
+
+    if (!s->conId) {
+        av_log(avctx, AV_LOG_DEBUG,
+               "No connector ID specified. Choosing default from list:\n");
+
+        for (i = 0; i < res->count_connectors; i++) {
+            drmModeConnector *con =
+                drmModeGetConnector(drmfd, res->connectors[i]);
+            drmModeEncoder *enc = NULL;
+            drmModeCrtc *crtc = NULL;
+
+            if (!con)
+                continue;
+
+            if (con->encoder_id) {
+                enc = drmModeGetEncoder(drmfd, con->encoder_id);
+                if (enc && enc->crtc_id) {
+                    crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
+                }
+            }
+
+            if (!s->conId && crtc) {
+                s->conId = con->connector_id;
+                s->crtcId = crtc->crtc_id;
+            }
+
+            av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
+                   con->connector_id,
+                   crtc ? crtc->crtc_id : 0,
+                   con->connector_type,
+                   crtc ? crtc->width : 0,
+                   crtc ? crtc->height : 0,
+                   (s->conId == (int)con->connector_id ?
+                    " (chosen)" : ""));
+
+            // Only the ids are kept, so the objects themselves can go now
+            if (crtc)
+                drmModeFreeCrtc(crtc);
+            if (enc)
+                drmModeFreeEncoder(enc);
+            drmModeFreeConnector(con);
+        }
+
+        if (!s->conId) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "No suitable enabled connector found.\n");
+            goto fail_res;
+        }
+    }
+
+    s->crtcIdx = -1;
+
+    for (i = 0; i < res->count_crtcs; ++i) {
+        if (s->crtcId == res->crtcs[i]) {
+            s->crtcIdx = i;
+            break;
+        }
+    }
+
+    if (s->crtcIdx == -1)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
+        goto fail_res;
+    }
+
+    if (res->count_connectors <= 0)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
+        goto fail_res;
+    }
+
+    c = drmModeGetConnector(drmfd, s->conId);
+    if (!c)
+    {
+        av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
+        goto fail_res;
+    }
+
+    if (!c->count_modes)
+    {
+        av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
+        goto fail_conn;
+    }
+
+    {
+        drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
+        if (!crtc)
+        {
+            av_log(avctx, AV_LOG_WARNING, "drmModeGetCrtc failed: %s\n", ERRSTR);
+            goto fail_conn;
+        }
+        s->compose.x = crtc->x;
+        s->compose.y = crtc->y;
+        s->compose.width = crtc->width;
+        s->compose.height = crtc->height;
+        drmModeFreeCrtc(crtc);
+    }
+
+    if (pConId)
+        *pConId = c->connector_id;
+    ret = 0;
+
+fail_conn:
+    drmModeFreeConnector(c);
+
+fail_res:
+    drmModeFreeResources(res);
+
+    return ret;
+}
+
+// Open the DRM device, choose the output and start the display thread.
+// deinit is called if init fails, so q_running records whether the thread
+// and semaphores exist and deinit only tears down what was actually created.
+static int drm_vout_init(struct AVFormatContext * s)
+{
+    drm_display_env_t * const de = s->priv_data;
+    int rv;
+    int err;
+    const char * drm_module = DRM_MODULE;
+
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+    de->drm_fd = -1;
+    de->con_id = 0;
+    de->setup = (struct drm_setup){0};
+    de->q_terminate = 0;
+    de->q_running = 0;
+
+    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
+    {
+        rv = AVERROR(errno);
+        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
+        return rv;
+    }
+
+    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
+    {
+        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
+        rv = AVERROR(EINVAL);
+        goto fail_close;
+    }
+
+    sem_init(&de->q_sem_in, 0, 0);
+    sem_init(&de->q_sem_out, 0, 0);
+    // pthread_create reports failure through its return value, not errno
+    if ((err = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
+        rv = AVERROR(err);
+        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
+        sem_destroy(&de->q_sem_in);
+        sem_destroy(&de->q_sem_out);
+        goto fail_close;
+    }
+    de->q_running = 1;
+
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+    return 0;
+
+fail_close:
+    close(de->drm_fd);
+    de->drm_fd = -1;
+    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
+
+    return rv;
+}
+
+// Stop the display thread (if it was started), release all frames and DRM
+// objects and close the device. Safe to call after a failed init.
+static void drm_vout_deinit(struct AVFormatContext * s)
+{
+    drm_display_env_t * const de = s->priv_data;
+
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+    if (de->q_running) {
+        de->q_terminate = 1;
+        sem_post(&de->q_sem_in);
+        pthread_join(de->q_thread, NULL);
+        sem_destroy(&de->q_sem_in);
+        sem_destroy(&de->q_sem_out);
+        de->q_running = 0;
+    }
+
+    for (unsigned int i = 0; i != AUX_SIZE; ++i)
+        da_uninit(de, de->aux + i);
+
+    av_frame_free(&de->q_next);
+
+    if (de->drm_fd >= 0) {
+        close(de->drm_fd);
+        de->drm_fd = -1;
+    }
+
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+
+#define OFFSET(x) offsetof(drm_display_env_t, x)
+static const AVOption options[] = {
+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+    { NULL }
+};
+
+static const AVClass drm_vout_class = {
+    .class_name = "drm vid outdev",
+    .item_name  = av_default_item_name,
+    .option     = options,
+    .version    = LIBAVUTIL_VERSION_INT,
+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+AVOutputFormat ff_vout_drm_muxer = {
+    .name           = "vout_drm",
+    .long_name      = NULL_IF_CONFIG_SMALL("Drm video output device"),
+    .priv_data_size = sizeof(drm_display_env_t),
+    .audio_codec    = AV_CODEC_ID_NONE,
+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
+    .write_header   = drm_vout_write_header,
+    .write_packet   = drm_vout_write_packet,
+    .write_uncoded_frame = drm_vout_write_frame,
+    .write_trailer  = drm_vout_write_trailer,
+    .control_message = drm_vout_control_message,
+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+    .priv_class     = &drm_vout_class,
+    .init           = drm_vout_init,
+    .deinit         = drm_vout_deinit,
+};
+
diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
new file mode 100644
index 0000000000..f666adc8e9
--- /dev/null
+++ b/libavdevice/egl_vout.c
@@ -0,0 +1,816 @@
+/*
+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+// *** This module is a work in progress and its utility is strictly
+//     limited to testing.
+//     Amongst other issues it doesn't wait for the pic to be displayed before
+//     returning the buffer so flickering does occur.
+ +#include +#include + +#include "libavutil/opt.h" +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/imgutils.h" +#include "libavutil/hwcontext_drm.h" +#include "libavformat/internal.h" +#include "avdevice.h" + +#include "pthread.h" +#include +#include +#include + +#include +#include + +#include "libavutil/rpi_sand_fns.h" + +#define TRACE_ALL 0 + +struct egl_setup { + int conId; + + Display *dpy; + EGLDisplay egl_dpy; + EGLContext ctx; + EGLSurface surf; + Window win; + + uint32_t crtcId; + int crtcIdx; + uint32_t planeId; + struct { + int x, y, width, height; + } compose; +}; + +typedef struct egl_aux_s { + int fd; + GLuint texture; + +} egl_aux_t; + +typedef struct egl_display_env_s +{ + AVClass *class; + + struct egl_setup setup; + enum AVPixelFormat avfmt; + + int show_all; + int window_width, window_height; + int window_x, window_y; + int fullscreen; + + egl_aux_t aux[32]; + + pthread_t q_thread; + pthread_mutex_t q_lock; + sem_t display_start_sem; + sem_t q_sem; + int q_terminate; + AVFrame * q_this; + AVFrame * q_next; + +} egl_display_env_t; + + +/** + * Remove window border/decorations. + */ +static void +no_border( Display *dpy, Window w) +{ + static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); + static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; + + typedef struct + { + unsigned long flags; + unsigned long functions; + unsigned long decorations; + long inputMode; + unsigned long status; + } PropMotifWmHints; + + PropMotifWmHints motif_hints; + Atom prop, proptype; + unsigned long flags = 0; + + /* setup the property */ + motif_hints.flags = MWM_HINTS_DECORATIONS; + motif_hints.decorations = flags; + + /* get the atom for the property */ + prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); + if (!prop) { + /* something went wrong! 
*/ + return; + } + + /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ + proptype = prop; + + XChangeProperty( dpy, w, /* display, window */ + prop, proptype, /* property, type */ + 32, /* format: 32-bit datums */ + PropModeReplace, /* mode */ + (unsigned char *) &motif_hints, /* data */ + PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ + ); +} + + +/* + * Create an RGB, double-buffered window. + * Return the window and context handles. + */ +static int +make_window(struct AVFormatContext * const s, + egl_display_env_t * const de, + Display *dpy, EGLDisplay egl_dpy, const char *name, + Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) +{ + int scrnum = DefaultScreen( dpy ); + XSetWindowAttributes attr; + unsigned long mask; + Window root = RootWindow( dpy, scrnum ); + Window win; + EGLContext ctx; + const int fullscreen = de->fullscreen; + EGLConfig config; + int x = de->window_x; + int y = de->window_y; + int width = de->window_width ? de->window_width : 1280; + int height = de->window_height ? 
de->window_height : 720; + + + if (fullscreen) { + int scrnum = DefaultScreen(dpy); + + x = 0; y = 0; + width = DisplayWidth(dpy, scrnum); + height = DisplayHeight(dpy, scrnum); + } + + { + EGLint num_configs; + static const EGLint attribs[] = { + EGL_RED_SIZE, 1, + EGL_GREEN_SIZE, 1, + EGL_BLUE_SIZE, 1, + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_NONE + }; + + if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { + av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); + return -1; + } + } + + { + EGLint vid; + if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { + av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); + return -1; + } + + { + XVisualInfo visTemplate = { + .visualid = vid, + }; + int num_visuals; + XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, + &visTemplate, &num_visuals); + + /* window attributes */ + attr.background_pixel = 0; + attr.border_pixel = 0; + attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); + attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; + /* XXX this is a bad way to get a borderless window! 
*/ + mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; + + win = XCreateWindow( dpy, root, x, y, width, height, + 0, visinfo->depth, InputOutput, + visinfo->visual, mask, &attr ); + XFree(visinfo); + } + } + + if (fullscreen) + no_border(dpy, win); + + /* set hints and properties */ + { + XSizeHints sizehints; + sizehints.x = x; + sizehints.y = y; + sizehints.width = width; + sizehints.height = height; + sizehints.flags = USSize | USPosition; + XSetNormalHints(dpy, win, &sizehints); + XSetStandardProperties(dpy, win, name, name, + None, (char **)NULL, 0, &sizehints); + } + + eglBindAPI(EGL_OPENGL_ES_API); + + { + static const EGLint ctx_attribs[] = { + EGL_CONTEXT_CLIENT_VERSION, 2, + EGL_NONE + }; + ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); + if (!ctx) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + } + + + XMapWindow(dpy, win); + + { + EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL); + if (!surf) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); + return -1; + } + + if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { + av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); + return -1; + } + + *winRet = win; + *ctxRet = ctx; + *surfRet = surf; + } + + return 0; +} + +static GLint +compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) +{ + GLuint s = glCreateShader(target); + + if (s == 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); + return 0; + } + + glShaderSource(s, 1, (const GLchar **) &source, NULL); + glCompileShader(s); + + { + GLint ok; + glGetShaderiv(s, GL_COMPILE_STATUS, &ok); + + if (!ok) { + GLchar *info; + GLint size; + + glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); + info = malloc(size); + + glGetShaderInfoLog(s, size, NULL, info); + av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); + + return 0; + } + } + + return s; 
+} + +static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) +{ + GLuint prog = glCreateProgram(); + + if (prog == 0) { + av_log(s, AV_LOG_ERROR, "Failed to create program\n"); + return 0; + } + + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glLinkProgram(prog); + + { + GLint ok; + glGetProgramiv(prog, GL_LINK_STATUS, &ok); + if (!ok) { + /* Some drivers return a size of 1 for an empty log. This is the size + * of a log that contains only a terminating NUL character. + */ + GLint size; + GLchar *info = NULL; + glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); + if (size > 1) { + info = malloc(size); + glGetProgramInfoLog(prog, size, NULL, info); + } + + av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", + (info != NULL) ? info : ""); + return 0; + } + } + + return prog; +} + +static int +gl_setup(struct AVFormatContext * const s) +{ + const char *vs = + "attribute vec4 pos;\n" + "varying vec2 texcoord;\n" + "\n" + "void main() {\n" + " gl_Position = pos;\n" + " texcoord.x = (pos.x + 1.0) / 2.0;\n" + " texcoord.y = (-pos.y + 1.0) / 2.0;\n" + "}\n"; + const char *fs = + "#extension GL_OES_EGL_image_external : enable\n" + "precision mediump float;\n" + "uniform samplerExternalOES s;\n" + "varying vec2 texcoord;\n" + "void main() {\n" + " gl_FragColor = texture2D(s, texcoord);\n" + "}\n"; + + GLuint vs_s; + GLuint fs_s; + GLuint prog; + + if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || + !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || + !(prog = link_program(s, vs_s, fs_s))) + return -1; + + glUseProgram(prog); + + { + static const float verts[] = { + -1, -1, + 1, -1, + 1, 1, + -1, 1, + }; + glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); + } + + glEnableVertexAttribArray(0); + return 0; +} + +static int egl_vout_write_trailer(AVFormatContext *s) +{ +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + + return 0; +} + +static int egl_vout_write_header(AVFormatContext *s) +{ + const 
AVCodecParameters * const par = s->streams[0]->codecpar; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + if ( s->nb_streams > 1 + || par->codec_type != AVMEDIA_TYPE_VIDEO + || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { + av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); + return AVERROR(EINVAL); + } + + return 0; +} + + +static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) +{ + const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; + egl_aux_t * da = NULL; + unsigned int i; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); +#endif + + for (i = 0; i != 32; ++i) { + if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { + da = de->aux + i; + break; + } + } + + if (da == NULL) { + av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); + return AVERROR(EINVAL); + } + + if (da->texture == 0) { + EGLint attribs[50]; + EGLint * a = attribs; + int i, j; + static const EGLint anames[] = { + EGL_DMA_BUF_PLANE0_FD_EXT, + EGL_DMA_BUF_PLANE0_OFFSET_EXT, + EGL_DMA_BUF_PLANE0_PITCH_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE1_FD_EXT, + EGL_DMA_BUF_PLANE1_OFFSET_EXT, + EGL_DMA_BUF_PLANE1_PITCH_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, + EGL_DMA_BUF_PLANE2_FD_EXT, + EGL_DMA_BUF_PLANE2_OFFSET_EXT, + EGL_DMA_BUF_PLANE2_PITCH_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, + EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, + }; + const EGLint * b = anames; + + *a++ = EGL_WIDTH; + *a++ = av_frame_cropped_width(frame); + *a++ = EGL_HEIGHT; + *a++ = av_frame_cropped_height(frame); + *a++ = EGL_LINUX_DRM_FOURCC_EXT; + *a++ = desc->layers[0].format; + + for (i = 0; i < desc->nb_layers; ++i) { + for (j = 0; j < desc->layers[i].nb_planes; ++j) { + const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; + const AVDRMObjectDescriptor * const obj = desc->objects 
+ p->object_index; + *a++ = *b++; + *a++ = obj->fd; + *a++ = *b++; + *a++ = p->offset; + *a++ = *b++; + *a++ = p->pitch; + if (obj->format_modifier == 0) { + b += 2; + } + else { + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); + *a++ = *b++; + *a++ = (EGLint)(obj->format_modifier >> 32); + } + } + } + + *a = EGL_NONE; + +#if TRACE_ALL + for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { + av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); + } +#endif + { + const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, + EGL_NO_CONTEXT, + EGL_LINUX_DMA_BUF_EXT, + NULL, attribs); + if (!image) { + av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd); + return -1; + } + + glGenTextures(1, &da->texture); + glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); + + eglDestroyImageKHR(de->setup.egl_dpy, image); + } + + da->fd = desc->objects[0].fd; + +#if 0 + av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," + " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", + av_frame_cropped_width(frame), + av_frame_cropped_height(frame), + desc->layers[0].format, + bo_plane_handles[0], + bo_plane_handles[1], + bo_plane_handles[2], + bo_plane_handles[3], + pitches[0], + pitches[1], + pitches[2], + pitches[3], + offsets[0], + offsets[1], + offsets[2], + offsets[3], + (long long)modifiers[0], + (long long)modifiers[1], + (long long)modifiers[2], + (long long)modifiers[3] + ); +#endif + } + + glClearColor(0.5, 0.5, 0.5, 0.5); + glClear(GL_COLOR_BUFFER_BIT); + + glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); + + glDeleteTextures(1, &da->texture); + da->texture = 0; + da->fd = -1; + + return 0; +} + 
+/*
+ * Display thread entry point (started by egl_vout_init()).
+ *
+ * Opens the X display, initialises EGL, creates the window/context/surface
+ * and the GL program, then posts display_start_sem so the creator can
+ * continue.  After that it loops: each q_sem post means "a frame is waiting
+ * in q_next (or q_terminate has been set)".
+ *
+ * On any setup failure q_terminate is set before display_start_sem is
+ * posted, which is how egl_vout_init() detects the failure.
+ */
+static void * display_thread(void * v)
+{
+    AVFormatContext * const s = v;
+    egl_display_env_t * const de = s->priv_data;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
+#endif
+    {
+        EGLint egl_major, egl_minor;
+
+        de->setup.dpy = XOpenDisplay(NULL);
+        if (!de->setup.dpy) {
+            av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
+            goto fail;
+        }
+
+        de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
+        if (!de->setup.egl_dpy) {
+            av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
+            goto fail;
+        }
+
+        if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
+            av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
+            goto fail;
+        }
+
+        av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
+
+        if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
+            av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
+            goto fail;
+        }
+    }
+
+    /* No window size requested: fall back to 1280x720 */
+    if (!de->window_width || !de->window_height) {
+        de->window_width = 1280;
+        de->window_height = 720;
+    }
+    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
+                    &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
+        av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
+        goto fail;
+    }
+
+    if (gl_setup(s)) {
+        av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
+        goto fail;
+    }
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
+#endif
+    sem_post(&de->display_start_sem);
+
+    for (;;) {
+        AVFrame * frame;
+
+        /* Retry if the wait was interrupted by a signal */
+        while (sem_wait(&de->q_sem) != 0) {
+            av_assert0(errno == EINTR);
+        }
+
+        if (de->q_terminate)
+            break;
+
+        /* Take ownership of the pending frame */
+        pthread_mutex_lock(&de->q_lock);
+        frame = de->q_next;
+        de->q_next = NULL;
+        pthread_mutex_unlock(&de->q_lock);
+
+        do_display(s, de, frame);
+
+        /* Keep the frame just shown in q_this; release the previous one */
+        av_frame_free(&de->q_this);
+        de->q_this = frame;
+    }
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
+#endif
+
+    return NULL;
+
+fail:
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
+#endif
+    de->q_terminate = 1;
+    sem_post(&de->display_start_sem);
+
+    return NULL;
+}
+
+/*
+ * Queue one wrapped AVFrame for display.
+ *
+ * DRM_PRIME frames are referenced as-is; VAAPI frames are mapped to
+ * DRM_PRIME with av_hwframe_map(); anything else is rejected.
+ * The new frame replaces whatever is in q_next: if the slot was empty the
+ * display thread is woken, otherwise the frame that was waiting is dropped.
+ *
+ * Returns 0 on success, AVERROR(ENOMEM) if the frame cannot be allocated,
+ * the av_frame_ref() error if referencing fails, or AVERROR(EINVAL) for an
+ * unsupported / unmappable frame.
+ */
+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
+    AVFrame * frame;
+    egl_display_env_t * const de = s->priv_data;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        int ret;
+
+        if ((frame = av_frame_alloc()) == NULL)
+            return AVERROR(ENOMEM);
+        if ((ret = av_frame_ref(frame, src_frame)) < 0) {
+            av_frame_free(&frame);
+            return ret;
+        }
+    }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        if ((frame = av_frame_alloc()) == NULL)
+            return AVERROR(ENOMEM);
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+        if (av_hwframe_map(frame, src_frame, 0) != 0)
+        {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
+            av_frame_free(&frame);
+            return AVERROR(EINVAL);
+        }
+    }
+    else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
+        return AVERROR(EINVAL);
+    }
+
+    // Really hacky sync
+    // With show_all set, poll until the display thread has taken q_next
+    while (de->show_all && de->q_next) {
+        usleep(3000);
+    }
+
+    /* Swap the new frame into the single-entry queue */
+    pthread_mutex_lock(&de->q_lock);
+    {
+        AVFrame * const t = de->q_next;
+        de->q_next = frame;
+        frame = t;
+    }
+    pthread_mutex_unlock(&de->q_lock);
+
+    /* Slot was empty: wake the display thread. Otherwise drop the old frame */
+    if (frame == NULL)
+        sem_post(&de->q_sem);
+    else
+        av_frame_free(&frame);
+
+    return 0;
+}
+
+/* write_uncoded_frame callback: always returns 0 and displays nothing itself */
+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
+                                unsigned flags)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
+#endif
+
+    /* egl_vout_write_header() should have accepted only supported formats */
+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
+        return 0;
+
+    return 0;
+}
+
+/* control_message callback: accepts WINDOW_REPAINT, ENOSYS for anything else */
+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
+{
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
+#endif
+    switch(type) {
+    case AV_APP_TO_DEV_WINDOW_REPAINT:
+        return 0;
+    default:
+        break;
+    }
+    return AVERROR(ENOSYS);
+}
+
+// deinit is called if init fails so no need to clean up explicitly here
+/*
+ * Muxer init: resets the EGL setup and the aux fd table, creates the queue
+ * lock and semaphores, starts display_thread() and waits for it to report
+ * that display setup finished.  Returns 0 on success, -1 if the display
+ * thread flagged a startup failure.
+ */
+static int egl_vout_init(struct AVFormatContext * s)
+{
+    egl_display_env_t * const de = s->priv_data;
+    unsigned int i;
+
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+    de->setup = (struct egl_setup){0};
+
+    /* fd == -1 marks an aux slot as unused */
+    for (i = 0; i != 32; ++i) {
+        de->aux[i].fd = -1;
+    }
+
+    de->q_terminate = 0;
+    pthread_mutex_init(&de->q_lock, NULL);
+    sem_init(&de->q_sem, 0, 0);
+    sem_init(&de->display_start_sem, 0, 0);
+    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
+
+    /* Block until the display thread has finished (or failed) its setup */
+    sem_wait(&de->display_start_sem);
+    if (de->q_terminate) {
+        av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
+        return -1;
+    }
+
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+    return 0;
+}
+
+/*
+ * Muxer deinit: asks the display thread to terminate, joins it, then
+ * destroys the synchronisation objects created by egl_vout_init() and frees
+ * any frames still held.
+ */
+static void egl_vout_deinit(struct AVFormatContext * s)
+{
+    egl_display_env_t * const de = s->priv_data;
+
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+    /* Set the flag first, then wake the thread so it sees it */
+    de->q_terminate = 1;
+    sem_post(&de->q_sem);
+    pthread_join(de->q_thread, NULL);
+    sem_destroy(&de->q_sem);
+    /* Thread has exited, so nothing can still be using the start semaphore */
+    sem_destroy(&de->display_start_sem);
+    pthread_mutex_destroy(&de->q_lock);
+
+    av_frame_free(&de->q_next);
+    av_frame_free(&de->q_this);
+
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+}
+
+#define OFFSET(x) offsetof(egl_display_env_t, x)
+static const AVOption options[] = {
+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+    { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
+    { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+    { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
+    { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
+    { NULL }
+
+};
+
+static const AVClass egl_vout_class = {
+    .class_name = "egl vid outdev",
+    .item_name = av_default_item_name,
+    .option = options,
+    .version = LIBAVUTIL_VERSION_INT,
+    .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
+};
+
+AVOutputFormat ff_vout_egl_muxer = {
+    .name = "vout_egl",
+    .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"),
+    .priv_data_size = sizeof(egl_display_env_t),
+    .audio_codec = AV_CODEC_ID_NONE,
+    .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
+    .write_header = egl_vout_write_header,
+    .write_packet = egl_vout_write_packet,
+    .write_uncoded_frame = egl_vout_write_frame,
+    .write_trailer = egl_vout_write_trailer,
+    .control_message = egl_vout_control_message,
+    .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
+    .priv_class = &egl_vout_class,
+    .init = egl_vout_init,
+    .deinit = egl_vout_deinit,
+};
+
diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c
new file mode 100644
index 0000000000..84723a34ad
--- /dev/null
+++ b/libavdevice/rpi_vout.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 2013 Jeff Moguillansky
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * XVideo output device + * + * TODO: + * - add support to more formats + */ + +#include "libavutil/opt.h" +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/imgutils.h" +#include "libavformat/internal.h" +#include "avdevice.h" + +#include +#include + +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#pragma GCC diagnostic pop +#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" + +#define TRACE_ALL 0 + +#define DISPLAY_PORT_DEPTH 4 + +typedef struct rpi_display_env_s +{ + AVClass *class; + + MMAL_COMPONENT_T* display; + MMAL_COMPONENT_T* isp; + MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup + MMAL_CONNECTION_T * conn; + + MMAL_POOL_T *rpi_pool; + volatile int rpi_display_count; + + MMAL_FOURCC_T req_fmt; + MMAL_VIDEO_FORMAT_T req_vfmt; + + AVZcEnvPtr zc; + + int window_width, window_height; + int window_x, window_y; + int layer, fullscreen; + int show_all; +} rpi_display_env_t; + + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { + mmal_buffer_header_release(buffer); +} + +static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { + mmal_buffer_header_release(buffer); +} + + +static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt) +{ + switch (fmt) { + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + return MMAL_ENCODING_YUVUV128; + case AV_PIX_FMT_RPI4_10: + return MMAL_ENCODING_YUV10_COL; + case AV_PIX_FMT_SAND64_10: + return MMAL_ENCODING_YUVUV64_10; + case AV_PIX_FMT_SAND64_16: + return 
MMAL_ENCODING_YUVUV64_16; + case AV_PIX_FMT_YUV420P: + return MMAL_ENCODING_I420; + + default: + break; + } + return 0; +} + + +static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt, + const AVFrame * const frame, const AVRpiZcRefPtr fr_ref) +{ + MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; + const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref); + if (av_rpi_is_sand_format(geo->format)) { + // Sand formats are a bit "special" + // stride1 implicit in format + // width = stride2 + vfmt->width = geo->stripe_is_yc ? + geo->height_y + geo->height_c : geo->height_y; +// es->height = geo->video_height; //*** When we get the FLAG this will change + vfmt->height = geo->height_y; + es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; + } + else { + vfmt->width = geo->stride_y / geo->bytes_per_pel; + vfmt->height = geo->height_y; + es_fmt->flags = 0; + } + + es_fmt->type = MMAL_ES_TYPE_VIDEO; + es_fmt->encoding = mmfmt_from_avfmt(geo->format); + es_fmt->encoding_variant = 0; + es_fmt->bitrate = 0; + + vfmt->crop.x = frame->crop_left; + vfmt->crop.y = frame->crop_top; + vfmt->crop.width = av_frame_cropped_width(frame); + vfmt->crop.height = av_frame_cropped_height(frame); + + vfmt->frame_rate.den = 0; // Don't think I know it here + vfmt->frame_rate.num = 0; + + vfmt->par.den = frame->sample_aspect_ratio.den; + vfmt->par.num = frame->sample_aspect_ratio.num; + + vfmt->color_space = 0; // Unknown currently +} + +static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) +{ + rpi_display_env_t * const de = userdata; + if (buf->user_data != NULL) { + av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); + buf->user_data = NULL; + } + atomic_fetch_add(&de->rpi_display_count, -1); + return MMAL_FALSE; +} + +static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt) +{ + return avfmt == AV_PIX_FMT_SAND64_10; +} + +static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) +{ + if (de->isp != 
NULL) + { + if (de->isp->input[0]->is_enabled) + mmal_port_disable(de->isp->input[0]); + if (de->isp->control->is_enabled) + mmal_port_disable(de->isp->control); + } + if (de->conn != NULL) { + mmal_connection_destroy(de->conn); + de->conn = NULL; + } + if (de->isp != NULL) { + mmal_component_destroy(de->isp); + de->isp = NULL; + } +} + +static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf = NULL; + AVRpiZcRefPtr fr_buf = NULL; + + if (de == NULL) + return; + + if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + + if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { + return; + } + + buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame (unexpected) + goto fail; + } + + buf->cmd = 0; + buf->offset = 0; + buf->flags = 0; + mmal_buffer_header_reset(buf); + + atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release + mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); + + buf->user_data = fr_buf; + buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal + buf->offset = av_rpi_zc_offset(fr_buf); + buf->length = av_rpi_zc_length(fr_buf); + buf->alloc_size = av_rpi_zc_numbytes(fr_buf); + + while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } + + { + MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; + MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; + MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; + + video_format_from_zc_frame(&new_es, fr, fr_buf); + if (de->req_fmt != new_es.encoding || + de->req_vfmt.width != new_vfmt->width || + de->req_vfmt.height != new_vfmt->height || + de->req_vfmt.crop.x != new_vfmt->crop.x || + de->req_vfmt.crop.y != new_vfmt->crop.y || + de->req_vfmt.crop.width != new_vfmt->crop.width || + 
de->req_vfmt.crop.height != new_vfmt->crop.height) { + // Something has changed + + // If we have an ISP tear it down + isp_remove(s, de); + de->port_in = de->display->input[0]; + + // If we still need an ISP create it now + if (avfmt_needs_isp(fr->format)) + { + if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) + { + av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); + goto fail; + } + de->port_in = de->isp->input[0]; + } + + mmal_format_copy(de->port_in->format, &new_es); + + if (mmal_port_format_commit(de->port_in)) { + av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); + goto fail; + } + + // If we have an ISP then we must want to use it + if (de->isp != NULL) { + MMAL_PORT_T * const port_out = de->isp->output[0]; + MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; + MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; + + port_out->format->type = MMAL_ES_TYPE_VIDEO; + port_out->format->encoding = MMAL_ENCODING_YUVUV128; + port_out->format->encoding_variant = 0; + port_out->format->bitrate = 0; + port_out->format->flags = 0; + port_out->format->extradata = NULL; + port_out->format->extradata_size = 0; + + vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; + vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; + vfmt_out->crop.x = 0; + vfmt_out->crop.y = 0; + vfmt_out->crop.width = vfmt_in->crop.width; + vfmt_out->crop.height = vfmt_in->crop.height; + vfmt_out->frame_rate = vfmt_in->frame_rate; + vfmt_out->par = vfmt_in->par; + vfmt_out->color_space = vfmt_in->color_space; + + if (mmal_port_format_commit(port_out)) { + av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); + goto fail; + } + + if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { + av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); + goto fail; + } + if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { + av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); + 
goto fail; + } + mmal_port_enable(de->isp->control,display_cb_control); + mmal_component_enable(de->isp); + } + + // Number of slots in my port Q + de->port_in->buffer_num = DISPLAY_PORT_DEPTH; + // Size to keep it happy - isn't used for anything other than error checking + de->port_in->buffer_size = buf->alloc_size; + if (!de->port_in->is_enabled) + { + mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? + if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { + av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); + goto fail; + } + } + + de->req_fmt = new_es.encoding; + de->req_vfmt = *new_vfmt; + } + } + + if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { + av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); + goto fail; + } + return; + +fail: + // If we have a buf then fr_buf is held by that + if (buf != NULL) + mmal_buffer_header_release(buf); + else if (fr_buf != NULL) + av_rpi_zc_unref(fr_buf); +} + + +static int xv_write_trailer(AVFormatContext *s) +{ + rpi_display_env_t * const de = s->priv_data; +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + if (de->port_in != NULL && de->port_in->is_enabled) { + mmal_port_disable(de->port_in); + } + + // The above disable should kick out all buffers - check that + if (atomic_load(&de->rpi_display_count) != 0) { + av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); + } + + isp_remove(s, de); + if (de->rpi_pool != NULL) { + mmal_pool_destroy(de->rpi_pool); + de->rpi_pool = NULL; + } + if (de->display != NULL) { + mmal_component_destroy(de->display); + de->display = NULL; + } + + return 0; +} + +static int xv_write_header(AVFormatContext *s) +{ + rpi_display_env_t * const de = s->priv_data; + const AVCodecParameters * const par = s->streams[0]->codecpar; + 
const unsigned int w = de->window_width ? de->window_width : par->width; + const unsigned int h = de->window_height ? de->window_height : par->height; + const unsigned int x = de->window_x; + const unsigned int y = de->window_y; + const int layer = de->layer ? de->layer : 2; + const MMAL_BOOL_T fullscreen = de->fullscreen; + +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); +#endif + if ( s->nb_streams > 1 + || par->codec_type != AVMEDIA_TYPE_VIDEO + || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { + av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); + return AVERROR(EINVAL); + } + + { + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, + .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | + MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, + .layer = layer, + .fullscreen = fullscreen, + .dest_rect = {x, y, w, h}, + .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, + }; + + bcm_host_init(); // Needs to be done by someone... 
+ + if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) + { + av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); + goto fail; + } + de->port_in = de->display->input[0]; + + mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + if (mmal_component_enable(de->display) != MMAL_SUCCESS) + { + av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); + goto fail; + } + if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) + { + av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); + goto fail; + } + + if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) + { + av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); + goto fail; + } + } + + return 0; + +fail: + xv_write_trailer(s); + return AVERROR_UNKNOWN; +} + +static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) +{ + AVFrame * const frame = (AVFrame *)pkt->data; +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s\n", __func__); +#endif + display_frame(s, s->priv_data, frame); + return 0; +} + +static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, + unsigned flags) +{ +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); +#endif + + /* xv_write_header() should have accepted only supported formats */ + if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) + return 0; +// return write_picture(s, (*frame)->data, (*frame)->linesize); + + display_frame(s, s->priv_data, *ppframe); + return 0; +} + +static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) +{ +#if TRACE_ALL + av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); +#endif + switch(type) { + case AV_APP_TO_DEV_WINDOW_REPAINT: + return 0; + default: + break; + } + return AVERROR(ENOSYS); +} + +// deinit is called if init fails so no need to clean up explicity here +static int rpi_vout_init(struct AVFormatContext * s) +{ + rpi_display_env_t * const 
de = s->priv_data; + + // Get a ZC context in case we need one - has little overhead if unused + if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) + return 1; + + return 0; +} + +static void rpi_vout_deinit(struct AVFormatContext * s) +{ + rpi_display_env_t * const de = s->priv_data; + + av_rpi_zc_int_env_freep(&de->zc); +} + + +#define OFFSET(x) offsetof(rpi_display_env_t, x) +static const AVOption options[] = { + { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, + { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, + { NULL } + +}; + +static const AVClass xv_class = { + .class_name = "rpi vid outdev", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, + .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, +}; + +AVOutputFormat ff_vout_rpi_muxer = { + .name = "vout_rpi", + .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"), + .priv_data_size = sizeof(rpi_display_env_t), + .audio_codec = AV_CODEC_ID_NONE, + .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, + .write_header = xv_write_header, + .write_packet = xv_write_packet, + .write_uncoded_frame = xv_write_frame, + .write_trailer = xv_write_trailer, + .control_message = xv_control_message, + .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, + .priv_class = 
&xv_class, + .init = rpi_vout_init, + .deinit = rpi_vout_deinit, +}; diff --git a/libavfilter/Makefile b/libavfilter/Makefile index b2c254ea67..144fbda652 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -233,6 +233,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o +OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o @@ -459,6 +460,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_TRIM_FILTER) += trim.o OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o +OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ opencl/unsharp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 0872c6e0f2..8b23df9323 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -218,6 +218,7 @@ extern AVFilter ff_vf_dedot; extern AVFilter ff_vf_deflate; extern AVFilter ff_vf_deflicker; extern AVFilter ff_vf_deinterlace_qsv; +extern AVFilter ff_vf_deinterlace_v4l2m2m; extern AVFilter ff_vf_deinterlace_vaapi; extern AVFilter ff_vf_dejudder; extern AVFilter ff_vf_delogo; @@ -438,6 +439,7 @@ extern AVFilter ff_vf_transpose_opencl; extern AVFilter ff_vf_transpose_vaapi; extern AVFilter ff_vf_trim; extern AVFilter ff_vf_unpremultiply; +extern AVFilter ff_vf_unsand; extern AVFilter ff_vf_unsharp; extern AVFilter ff_vf_unsharp_opencl; extern AVFilter ff_vf_untile; diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c index 
f6b572b3de..44fe8b679c 100644 --- a/libavfilter/avfiltergraph.c +++ b/libavfilter/avfiltergraph.c @@ -32,6 +32,9 @@ #include "libavutil/internal.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#if CONFIG_UNSAND_FILTER +#include "libavutil/rpi_sand_fns.h" +#endif #define FF_INTERNAL_FIELDS 1 #include "framequeue.h" @@ -422,6 +425,19 @@ static int formats_declared(AVFilterContext *f) return 1; } +#if CONFIG_UNSAND_FILTER +static int has_sand_format(const AVFilterFormats * const ff) +{ + int i; + for (i = 0; i != ff->nb_formats; ++i) { + if (av_rpi_is_sand_format(ff->formats[i])) { + return 1; + } + } + return 0; +} +#endif + /** * Perform one round of query_formats() and merging formats lists on the * filter graph. @@ -462,6 +478,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) for (j = 0; j < filter->nb_inputs; j++) { AVFilterLink *link = filter->inputs[j]; int convert_needed = 0; + unsigned int extra_convert_tried = 0; if (!link) continue; @@ -504,11 +521,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) link->outcfg.formats, link->type) #undef MERGE_DISPATCH - if (convert_needed) { + while (convert_needed) { AVFilterContext *convert; const AVFilter *filter; AVFilterLink *inlink, *outlink; char inst_name[30]; + int can_retry = 0; + + convert_needed = 0; if (graph->disable_auto_convert) { av_log(log_ctx, AV_LOG_ERROR, @@ -521,19 +541,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) /* couldn't merge format lists. 
auto-insert conversion filter */ switch (link->type) { case AVMEDIA_TYPE_VIDEO: - if (!(filter = avfilter_get_by_name("scale"))) { - av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " - "not present, cannot convert pixel formats.\n"); - return AVERROR(EINVAL); - } - - snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", - scaler_count++); +#if CONFIG_UNSAND_FILTER + // Only try each extra conversion once + // The unsand output pad should never trigger has_sand_format + // but it is better to be safe + if ((extra_convert_tried & 1) == 0 && has_sand_format(link->incfg.formats)) { + if (!(filter = avfilter_get_by_name("unsand"))) { + av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " + "not present, cannot convert pixel formats.\n"); + return AVERROR(EINVAL); + } + + snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", + scaler_count++); + + if ((ret = avfilter_graph_create_filter(&convert, filter, + inst_name, "", NULL, + graph)) < 0) + return ret; - if ((ret = avfilter_graph_create_filter(&convert, filter, - inst_name, graph->scale_sws_opts, NULL, - graph)) < 0) - return ret; + extra_convert_tried |= 1; + can_retry = 1; + } + else +#endif + { + if (!(filter = avfilter_get_by_name("scale"))) { + av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " + "not present, cannot convert pixel formats.\n"); + return AVERROR(EINVAL); + } + + snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", + scaler_count++); + + if ((ret = avfilter_graph_create_filter(&convert, filter, + inst_name, graph->scale_sws_opts, NULL, + graph)) < 0) + return ret; + } break; case AVMEDIA_TYPE_AUDIO: if (!(filter = avfilter_get_by_name("aresample"))) { @@ -589,6 +635,13 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) outlink->outcfg.samplerates) || CHECKED_MERGE(channel_layouts, outlink->incfg.channel_layouts, outlink->outcfg.channel_layouts))) { + // Try adding an unsand filter & see if that helps + if (ret < 0 && can_retry) { + link = outlink; + convert_needed = 1; + continue; 
+ } + if (ret < 0) return ret; av_log(log_ctx, AV_LOG_ERROR, diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c index da1cf9941e..c588ed23cb 100644 --- a/libavfilter/buffersrc.c +++ b/libavfilter/buffersrc.c @@ -188,7 +188,7 @@ int attribute_align_arg av_buffersrc_add_frame_flags(AVFilterContext *ctx, AVFra switch (ctx->outputs[0]->type) { case AVMEDIA_TYPE_VIDEO: - CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, + CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), frame->format, frame->pts); break; case AVMEDIA_TYPE_AUDIO: diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c new file mode 100644 index 0000000000..d1c714b805 --- /dev/null +++ b/libavfilter/vf_deinterlace_v4l2m2m.c @@ -0,0 +1,1282 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * deinterlace video filter - V4L2 M2M + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_drm.h" +#include "libavutil/internal.h" +#include "libavutil/mathematics.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/time.h" + +#define FF_INTERNAL_FIELDS 1 +#include "framequeue.h" +#include "filters.h" +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +typedef struct V4L2Queue V4L2Queue; +typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared; + +typedef struct V4L2PlaneInfo { + int bytesperline; + size_t length; +} V4L2PlaneInfo; + +typedef struct V4L2Buffer { + int enqueued; + int reenqueue; + int fd; + struct v4l2_buffer buffer; + AVFrame frame; + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + int num_planes; + V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES]; + AVDRMFrameDescriptor drm_frame; + V4L2Queue *q; +} V4L2Buffer; + +typedef struct V4L2Queue { + struct v4l2_format format; + int num_buffers; + V4L2Buffer *buffers; + DeintV4L2M2MContextShared *ctx; +} V4L2Queue; + +typedef struct pts_stats_s +{ + void * logctx; + const char * name; // For debug + unsigned int last_count; + unsigned int last_interval; + int64_t last_pts; +} pts_stats_t; + +#define PTS_TRACK_SIZE 32 +typedef struct pts_track_el_s +{ + uint32_t n; + unsigned int interval; + AVFrame * props; +} pts_track_el_t; + +typedef struct pts_track_s +{ + uint32_t n; + uint32_t last_n; + int got_2; + void * logctx; + pts_stats_t stats; + pts_track_el_t 
a[PTS_TRACK_SIZE]; +} pts_track_t; + +typedef struct DeintV4L2M2MContextShared { + void * logctx; // For logging - will be NULL when done + + int fd; + int done; + int width; + int height; + int orig_width; + int orig_height; + atomic_uint refcount; + + AVBufferRef *hw_frames_ctx; + + unsigned int field_order; + + pts_track_t track; + + V4L2Queue output; + V4L2Queue capture; +} DeintV4L2M2MContextShared; + +typedef struct DeintV4L2M2MContext { + const AVClass *class; + + DeintV4L2M2MContextShared *shared; +} DeintV4L2M2MContext; + +static unsigned int pts_stats_interval(const pts_stats_t * const stats) +{ + return stats->last_interval; +} + +// Pick 64 for max last count - that is >1sec at 60fps +#define STATS_LAST_COUNT_MAX 64 +#define STATS_INTERVAL_MAX (1 << 30) +static void pts_stats_add(pts_stats_t * const stats, int64_t pts) +{ + if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { + if (stats->last_count < STATS_LAST_COUNT_MAX) + ++stats->last_count; + return; + } + + if (stats->last_pts != AV_NOPTS_VALUE) { + const int64_t interval = pts - stats->last_pts; + + if (interval < 0 || interval >= STATS_INTERVAL_MAX || + stats->last_count >= STATS_LAST_COUNT_MAX) { + if (stats->last_interval != 0) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n", + __func__, stats->name, interval, stats->last_count); + stats->last_interval = 0; + } + else { + const int64_t frame_time = interval / (int64_t)stats->last_count; + + if (frame_time != stats->last_interval) + av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n", + __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time); + stats->last_interval = frame_time; + } + } + + stats->last_pts = pts; + stats->last_count = 1; +} + +static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) +{ + *stats = (pts_stats_t){ + .logctx = logctx, + .name = name, + .last_count = 1, + .last_interval = 0, + 
.last_pts = AV_NOPTS_VALUE + }; +} + +static inline uint32_t pts_track_next_n(pts_track_t * const trk) +{ + if (++trk->n == 0) + trk->n = 1; + return trk->n; +} + +static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst) +{ + uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); + pts_track_el_t * t; + + // As a first guess assume that n==0 means last frame + if (n == 0) { + n = trk->last_n; + if (n == 0) + goto fail; + } + + t = trk->a + (n & (PTS_TRACK_SIZE - 1)); + + if (t->n != n) { + av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n); + goto fail; + } + + // 1st frame is simple - just believe it + if (n != trk->last_n) { + trk->last_n = n; + trk->got_2 = 0; + return av_frame_copy_props(dst, t->props); + } + + // Only believe in a single interpolated frame + if (trk->got_2) + goto fail; + trk->got_2 = 1; + + av_frame_copy_props(dst, t->props); + + + // If we can't guess - don't + if (t->interval == 0) { + dst->best_effort_timestamp = AV_NOPTS_VALUE; + dst->pts = AV_NOPTS_VALUE; + dst->pkt_dts = AV_NOPTS_VALUE; + } + else { + if (dst->best_effort_timestamp != AV_NOPTS_VALUE) + dst->best_effort_timestamp += t->interval / 2; + if (dst->pts != AV_NOPTS_VALUE) + dst->pts += t->interval / 2; + if (dst->pkt_dts != AV_NOPTS_VALUE) + dst->pkt_dts += t->interval / 2; + } + + return 0; + +fail: + trk->last_n = 0; + trk->got_2 = 0; + dst->pts = AV_NOPTS_VALUE; + dst->pkt_dts = AV_NOPTS_VALUE; + return 0; +} + +static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src) +{ + const uint32_t n = pts_track_next_n(trk); + pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); + + pts_stats_add(&trk->stats, src->pts); + + t->n = n; + t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last + av_frame_unref(t->props); + av_frame_copy_props(t->props, src); + + // We now know what the previous 
interval was, rather than having to guess, + // so set it. There is a better than decent chance that this is before + // we use it. + if (t->interval != 0) { + pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1)); + prev_t->interval = t->interval; + } + + // In case deinterlace interpolates frames use every other usec + return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; +} + +static void pts_track_uninit(pts_track_t * const trk) +{ + unsigned int i; + for (i = 0; i != PTS_TRACK_SIZE; ++i) { + trk->a[i].n = 0; + av_frame_free(&trk->a[i].props); + } +} + +/* Reset the tracker and allocate one props frame per slot. + * Returns 0 on success or AVERROR(ENOMEM); on failure the frames that + * were already allocated have been freed again by pts_track_uninit(). */ +static int pts_track_init(pts_track_t * const trk, void *logctx) +{ + unsigned int i; + trk->n = 1; + trk->last_n = 0; + trk->got_2 = 0; + /* pts_track_get_frame() logs through trk->logctx, so it has to be set here */ + trk->logctx = logctx; + pts_stats_init(&trk->stats, logctx, "track"); + for (i = 0; i != PTS_TRACK_SIZE; ++i) { + trk->a[i].n = 0; + if ((trk->a[i].props = av_frame_alloc()) == NULL) { + pts_track_uninit(trk); + return AVERROR(ENOMEM); + } + } + return 0; +} + +/* Query the capabilities of the device open on ctx->fd and pick the + * single-planar or multi-planar buffer types for both queues. + * Returns 0 on success or a negative AVERROR code. */ +static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx) +{ + struct v4l2_capability cap; + int ret; + + memset(&cap, 0, sizeof(cap)); + ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap); + /* ioctl() only reports -1; the real reason is in errno */ + if (ret < 0) + return AVERROR(errno); + + if (!(cap.capabilities & V4L2_CAP_STREAMING)) + return AVERROR(EINVAL); + + if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { + ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; + + return 0; + } + + if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { + ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; + ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; + + return 0; + } + + return AVERROR(EINVAL); +} + +static int deint_v4l2m2m_try_format(V4L2Queue *queue) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + int ret, field; + + ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret); + + if (V4L2_TYPE_IS_OUTPUT(fmt->type))
+ field = V4L2_FIELD_INTERLACED_TB; + else + field = V4L2_FIELD_NONE; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420; + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = ctx->width; + fmt->fmt.pix_mp.height = ctx->height; + } else { + fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420; + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = ctx->width; + fmt->fmt.pix.height = ctx->height; + } + + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__, + fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, + fmt->fmt.pix_mp.pixelformat, + fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + + ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt); + if (ret) + return AVERROR(EINVAL); + + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__, + fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, + fmt->fmt.pix_mp.pixelformat, + fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline); + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 || + fmt->fmt.pix_mp.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + + return AVERROR(EINVAL); + } + } else { + if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 || + fmt->fmt.pix.field != field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type); + + return AVERROR(EINVAL); + } + } + + return 0; +} + +static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + int ret; + + struct v4l2_selection sel = { + .type = fmt->type, + .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS, + }; + + if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { + fmt->fmt.pix_mp.field = field; + fmt->fmt.pix_mp.width = width; + fmt->fmt.pix_mp.height = ysize / pitch; + fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch; + fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1); + } else { + fmt->fmt.pix.field = field; + fmt->fmt.pix.width = width; + fmt->fmt.pix.height = height; + fmt->fmt.pix.sizeimage = 0; + fmt->fmt.pix.bytesperline = 0; + } + + ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret); + + ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret); + + sel.r.width = width; + sel.r.height = height; + sel.r.left = 0; + sel.r.top = 0; + sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE, + sel.flags = V4L2_SEL_FLAG_LE; + + ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel); + if (ret) + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret); + + return ret; +} + +static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node) +{ + int ret; + + ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0); + if (ctx->fd < 0) + return AVERROR(errno); + + ret = deint_v4l2m2m_prepare_context(ctx); + if (ret) + goto fail; + + ret = deint_v4l2m2m_try_format(&ctx->capture); + if (ret) + goto fail; + + ret = deint_v4l2m2m_try_format(&ctx->output); + if (ret) + goto fail; + + return 0; + +fail: + close(ctx->fd); + ctx->fd = -1; + + return ret; +} + +static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx) +{ + int ret = AVERROR(EINVAL); + struct dirent *entry; + char node[PATH_MAX]; + DIR *dirp; + + dirp = opendir("/dev"); + if (!dirp) + return AVERROR(errno); + + for (entry = readdir(dirp); entry; entry = readdir(dirp)) { + + if (strncmp(entry->d_name, "video", 5)) + continue; + + snprintf(node, 
sizeof(node), "/dev/%s", entry->d_name); + av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node); + ret = deint_v4l2m2m_probe_device(ctx, node); + if (!ret) + break; + } + + closedir(dirp); + + if (ret) { + av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n"); + ctx->fd = -1; + + return ret; + } + + av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node); + + return 0; +} + +static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf) +{ + int ret; + + ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer); + if (ret < 0) + return AVERROR(errno); + + buf->enqueued = 1; + + return 0; +} + +static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) +{ + struct v4l2_exportbuffer expbuf; + int i, ret; + + for (i = 0; i < avbuf->num_planes; i++) { + memset(&expbuf, 0, sizeof(expbuf)); + + expbuf.index = avbuf->buffer.index; + expbuf.type = avbuf->buffer.type; + expbuf.plane = i; + + ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf); + if (ret < 0) + return AVERROR(errno); + + avbuf->fd = expbuf.fd; + + if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) { + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = expbuf.fd; + avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buffer.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; + avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + } + + return 0; +} + +static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue) +{ + struct v4l2_format *fmt = &queue->format; + DeintV4L2M2MContextShared *ctx = queue->ctx; + struct v4l2_requestbuffers req; + int ret, i, j, multiplanar; + uint32_t memory; + + memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ? 
+ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP; + + multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type); + + memset(&req, 0, sizeof(req)); + req.count = queue->num_buffers; + req.memory = memory; + req.type = fmt->type; + + ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req); + if (ret < 0) { + /* Capture errno first: the logging call may overwrite it */ + const int err = errno; + + av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(err)); + + return AVERROR(err); + } + + queue->num_buffers = req.count; + queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer)); + if (!queue->buffers) { + av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n"); + + return AVERROR(ENOMEM); + } + + /* The array is zero filled and 0 is a valid descriptor, so mark every + * slot as "no fd" before anything can jump to the fail path; otherwise + * the cleanup loop would close fd 0 for slots never reached */ + for (i = 0; i < queue->num_buffers; i++) + queue->buffers[i].fd = -1; + + for (i = 0; i < queue->num_buffers; i++) { + V4L2Buffer *buf = &queue->buffers[i]; + + buf->enqueued = 0; + buf->fd = -1; + buf->q = queue; + + buf->buffer.type = fmt->type; + buf->buffer.memory = memory; + buf->buffer.index = i; + + if (multiplanar) { + buf->buffer.length = VIDEO_MAX_PLANES; + buf->buffer.m.planes = buf->planes; + } + + ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer); + if (ret < 0) { + ret = AVERROR(errno); + + goto fail; + } + + if (multiplanar) + buf->num_planes = buf->buffer.length; + else + buf->num_planes = 1; + + for (j = 0; j < buf->num_planes; j++) { + V4L2PlaneInfo *info = &buf->plane_info[j]; + + if (multiplanar) { + info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline; + info->length = buf->buffer.m.planes[j].length; + } else { + info->bytesperline = fmt->fmt.pix.bytesperline; + info->length = buf->buffer.length; + } + } + + /* Only capture buffers are queued and exported up front */ + if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) { + ret = deint_v4l2m2m_enqueue_buffer(buf); + if (ret) + goto fail; + + ret = v4l2_buffer_export_drm(buf); + if (ret) + goto fail; + } + } + + return 0; + +fail: + for (i = 0; i < queue->num_buffers; i++) + if (queue->buffers[i].fd >= 0) + close(queue->buffers[i].fd); + av_free(queue->buffers); + queue->buffers = NULL; + + return ret; +} + +static int deint_v4l2m2m_streamon(V4L2Queue *queue) +{ + DeintV4L2M2MContextShared * const ctx = queue->ctx; + int type
= queue->format.type; + int ret; + + ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type); + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); + if (ret < 0) + return AVERROR(errno); + + return 0; +} + +static int deint_v4l2m2m_streamoff(V4L2Queue *queue) +{ + DeintV4L2M2MContextShared * const ctx = queue->ctx; + int type = queue->format.type; + int ret; + + ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type); + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno)); + if (ret < 0) + return AVERROR(errno); + + return 0; +} + +// timeout in ms +static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout) +{ + struct v4l2_plane planes[VIDEO_MAX_PLANES]; + DeintV4L2M2MContextShared *ctx = queue->ctx; + struct v4l2_buffer buf = { 0 }; + V4L2Buffer* avbuf = NULL; + struct pollfd pfd; + short events; + int ret; + + if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) + events = POLLOUT | POLLWRNORM; + else + events = POLLIN | POLLRDNORM; + + pfd.events = events; + pfd.fd = ctx->fd; + + for (;;) { + ret = poll(&pfd, 1, timeout); + if (ret > 0) + break; + if (errno == EINTR) + continue; + return NULL; + } + + if (pfd.revents & POLLERR) + return NULL; + + if (pfd.revents & events) { + memset(&buf, 0, sizeof(buf)); + buf.memory = V4L2_MEMORY_MMAP; + buf.type = queue->format.type; + if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { + memset(planes, 0, sizeof(planes)); + buf.length = VIDEO_MAX_PLANES; + buf.m.planes = planes; + } + + ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf); + if (ret) { + if (errno != EAGAIN) + av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n", + av_err2str(AVERROR(errno))); + return NULL; + } + + avbuf = &queue->buffers[buf.index]; + avbuf->enqueued = 0; + avbuf->buffer = buf; + if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) { + memcpy(avbuf->planes, planes, sizeof(planes)); + avbuf->buffer.m.planes = avbuf->planes; + } + return avbuf; + } + 
+ return NULL; +} + +static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue) +{ + int i; + V4L2Buffer *buf = NULL; + + for (i = 0; i < queue->num_buffers; i++) + if (!queue->buffers[i].enqueued) { + buf = &queue->buffers[i]; + break; + } + return buf; +} + +static void deint_v4l2m2m_unref_queued(V4L2Queue *queue) +{ + int i; + V4L2Buffer *buf = NULL; + + if (!queue || !queue->buffers) + return; + for (i = 0; i < queue->num_buffers; i++) { + buf = &queue->buffers[i]; + if (queue->buffers[i].enqueued) + av_frame_unref(&buf->frame); + } +} + +static void recycle_q(V4L2Queue * const queue) +{ + V4L2Buffer* avbuf; + while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) { + av_frame_unref(&avbuf->frame); + } +} + +static int count_enqueued(V4L2Queue *queue) +{ + int i; + int n = 0; + + if (queue->buffers == NULL) + return 0; + + for (i = 0; i < queue->num_buffers; i++) + if (queue->buffers[i].enqueued) + ++n; + return n; +} + +static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame) +{ + DeintV4L2M2MContextShared *const ctx = queue->ctx; + AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0]; + V4L2Buffer *buf; + int i; + + if (V4L2_TYPE_IS_OUTPUT(queue->format.type)) + recycle_q(queue); + + buf = deint_v4l2m2m_find_free_buf(queue); + if (!buf) { + av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0); + return AVERROR(EAGAIN); + } + if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) + for (i = 0; i < drm_desc->nb_objects; i++) + buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd; + else + buf->buffer.m.fd = drm_desc->objects[0].fd; + + buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE : + frame->top_field_first ? 
V4L2_FIELD_INTERLACED_TB : + V4L2_FIELD_INTERLACED_BT; + + if (ctx->field_order != buf->buffer.field) { + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field); + ctx->field_order = buf->buffer.field; + } + + buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame); + + buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd; + + av_frame_move_ref(&buf->frame, frame); + + return deint_v4l2m2m_enqueue_buffer(buf); +} + +static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx) +{ + if (atomic_fetch_sub(&ctx->refcount, 1) == 1) { + V4L2Queue *capture = &ctx->capture; + V4L2Queue *output = &ctx->output; + int i; + + av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__); + + if (ctx->fd >= 0) { + deint_v4l2m2m_streamoff(capture); + deint_v4l2m2m_streamoff(output); + } + + if (capture->buffers) + for (i = 0; i < capture->num_buffers; i++) { + capture->buffers[i].q = NULL; + if (capture->buffers[i].fd >= 0) + close(capture->buffers[i].fd); + } + + deint_v4l2m2m_unref_queued(output); + + av_buffer_unref(&ctx->hw_frames_ctx); + + if (capture->buffers) + av_free(capture->buffers); + + if (output->buffers) + av_free(output->buffers); + + if (ctx->fd >= 0) { + close(ctx->fd); + ctx->fd = -1; + } + + av_free(ctx); + } +} + +static void v4l2_free_buffer(void *opaque, uint8_t *unused) +{ + V4L2Buffer *buf = opaque; + DeintV4L2M2MContextShared *ctx = buf->q->ctx; + + if (!ctx->done) + deint_v4l2m2m_enqueue_buffer(buf); + + deint_v4l2m2m_destroy_context(ctx); +} + +static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height) +{ + int av_pix_fmt = AV_PIX_FMT_YUV420P; + AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; + AVDRMLayerDescriptor *layer; + + /* fill the DRM frame descriptor */ + drm_desc->nb_objects = avbuf->num_planes; + drm_desc->nb_layers = 1; + + layer = &drm_desc->layers[0]; + layer->nb_planes = avbuf->num_planes; + + for (int i = 0; i < avbuf->num_planes; i++) { + 
layer->planes[i].object_index = i; + layer->planes[i].offset = 0; + layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; + } + + switch (av_pix_fmt) { + case AV_PIX_FMT_YUYV422: + + layer->format = DRM_FORMAT_YUYV; + layer->nb_planes = 1; + + break; + + case AV_PIX_FMT_NV12: + case AV_PIX_FMT_NV21: + + layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ? + DRM_FORMAT_NV12 : DRM_FORMAT_NV21; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 2; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; + break; + + case AV_PIX_FMT_YUV420P: + + layer->format = DRM_FORMAT_YUV420; + + if (avbuf->num_planes > 1) + break; + + layer->nb_planes = 3; + + layer->planes[1].object_index = 0; + layer->planes[1].offset = avbuf->plane_info[0].bytesperline * + height; + layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; + + layer->planes[2].object_index = 0; + layer->planes[2].offset = layer->planes[1].offset + + ((avbuf->plane_info[0].bytesperline * + height) >> 2); + layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; + break; + + default: + drm_desc->nb_layers = 0; + break; + } + + return (uint8_t *) drm_desc; +} + +// timeout in ms +static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout) +{ + DeintV4L2M2MContextShared *ctx = queue->ctx; + V4L2Buffer* avbuf; + + av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__); + + avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout); + if (!avbuf) { + av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout); + return AVERROR(EAGAIN); + } + + // Fill in PTS and anciliary info from src frame + // we will want to overwrite some fields as only the pts/dts + // fields are updated with new timing in this fn + pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame); + + frame->buf[0] = av_buffer_create((uint8_t 
*) &avbuf->drm_frame, + sizeof(avbuf->drm_frame), v4l2_free_buffer, + avbuf, AV_BUFFER_FLAG_READONLY); + if (!frame->buf[0]) { + av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0); + return AVERROR(ENOMEM); + } + + atomic_fetch_add(&ctx->refcount, 1); + + frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height); + frame->format = AV_PIX_FMT_DRM_PRIME; + if (ctx->hw_frames_ctx) + frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); + frame->height = ctx->height; + frame->width = ctx->width; + + // Not interlaced now + frame->interlaced_frame = 0; + frame->top_field_first = 0; + // Pkt duration halved + frame->pkt_duration /= 2; + + if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) { + av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n"); + frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM; + } + + av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts); + return 0; +} + +static int deint_v4l2m2m_config_props(AVFilterLink *outlink) +{ + AVFilterLink *inlink = outlink->src->inputs[0]; + AVFilterContext *avctx = outlink->src; + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + int ret; + + ctx->height = avctx->inputs[0]->h; + ctx->width = avctx->inputs[0]->w; + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height); + + outlink->time_base = inlink->time_base; + outlink->w = inlink->w; + outlink->h = inlink->h; + outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + outlink->format = inlink->format; + outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate + + ret = deint_v4l2m2m_find_device(ctx); + if (ret) + return ret; + + if (inlink->hw_frames_ctx) { + ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx); + if (!ctx->hw_frames_ctx) + return AVERROR(ENOMEM); + } + return 0; +} + +static int deint_v4l2m2m_query_formats(AVFilterContext *avctx) +{ + static const enum AVPixelFormat 
pixel_formats[] = { + AV_PIX_FMT_DRM_PRIME, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE, + }; + + return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats)); +} + +static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterContext *avctx = link->dst; + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + V4L2Queue *capture = &ctx->capture; + V4L2Queue *output = &ctx->output; + int ret; + + av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n", + __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den); + av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__, + avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out); + + if (ctx->field_order == V4L2_FIELD_ANY) { + AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0]; + ctx->orig_width = drm_desc->layers[0].planes[0].pitch; + ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width; + + av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%d,%d)\n", __func__, ctx->width, ctx->height, + drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset); + + if (in->top_field_first) + ctx->field_order = V4L2_FIELD_INTERLACED_TB; + else + ctx->field_order = V4L2_FIELD_INTERLACED_BT; + + ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + + ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset); + if (ret) + return ret; + + ret = deint_v4l2m2m_allocate_buffers(capture); + if (ret) + return ret; + + ret = deint_v4l2m2m_streamon(capture); + if (ret) + return ret; + + ret = 
deint_v4l2m2m_allocate_buffers(output); + if (ret) + return ret; + + ret = deint_v4l2m2m_streamon(output); + if (ret) + return ret; + } + + ret = deint_v4l2m2m_enqueue_frame(output, in); + + av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret)); + return ret; +} + +static int deint_v4l2m2m_activate(AVFilterContext *avctx) +{ + DeintV4L2M2MContext * const priv = avctx->priv; + DeintV4L2M2MContextShared *const s = priv->shared; + AVFilterLink * const outlink = avctx->outputs[0]; + AVFilterLink * const inlink = avctx->inputs[0]; + int n = 0; + int cn = 99; + int instatus = 0; + int64_t inpts = 0; + int did_something = 0; + + av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__); + + FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx); + + ff_inlink_acknowledge_status(inlink, &instatus, &inpts); + + if (!ff_outlink_frame_wanted(outlink)) { + av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__); + } + else if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup! + { + AVFrame * frame = av_frame_alloc(); + int rv; + +again: + recycle_q(&s->output); + n = count_enqueued(&s->output); + + if (frame == NULL) { + av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__); + return AVERROR(ENOMEM); + } + + rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 
300 : 0); + if (rv != 0) { + av_frame_free(&frame); + if (rv != AVERROR(EAGAIN)) { + av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv)); + return rv; + } + } + else { + frame->interlaced_frame = 0; + // frame is always consumed by filter_frame - even on error despite + // a somewhat confusing comment in the header + rv = ff_filter_frame(outlink, frame); + + if (instatus != 0) { + av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__); + goto again; + } + + av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv)); + did_something = 1; + } + + cn = count_enqueued(&s->capture); + } + + if (instatus != 0) { + ff_outlink_set_status(outlink, instatus, inpts); + av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus)); + return 0; + } + + { + AVFrame * frame; + int rv; + + recycle_q(&s->output); + n = count_enqueued(&s->output); + + while (n < 6) { + if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) { + av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv)); + return rv; + } + + if (frame == NULL) { + av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__); + break; + } + + deint_v4l2m2m_filter_frame(inlink, frame); + av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__); + ++n; + } + } + + if (n < 6) { + ff_inlink_request_frame(inlink); + did_something = 1; + av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__); + } + + if (n > 4 && ff_outlink_frame_wanted(outlink)) { + ff_filter_set_ready(avctx, 1); + did_something = 1; + av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__); + } + + av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn); + return did_something ? 
0 : FFERROR_NOT_READY; +} + +static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx) +{ + DeintV4L2M2MContext * const priv = avctx->priv; + DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared)); + + if (!ctx) { + av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0); + return AVERROR(ENOMEM); + } + priv->shared = ctx; + ctx->logctx = priv; + ctx->fd = -1; + ctx->output.ctx = ctx; + ctx->output.num_buffers = 8; + ctx->capture.ctx = ctx; + ctx->capture.num_buffers = 12; + ctx->done = 0; + ctx->field_order = V4L2_FIELD_ANY; + + pts_track_init(&ctx->track, priv); + + atomic_init(&ctx->refcount, 1); + + return 0; +} + +static void deint_v4l2m2m_uninit(AVFilterContext *avctx) +{ + DeintV4L2M2MContext *priv = avctx->priv; + DeintV4L2M2MContextShared *ctx = priv->shared; + if (!ctx) return; // uninit also runs when init failed or never ran, so there may be no shared context + ctx->done = 1; + ctx->logctx = NULL; // Log to NULL works, log to missing crashes + pts_track_uninit(&ctx->track); + deint_v4l2m2m_destroy_context(ctx); +} + +static const AVOption deinterlace_v4l2m2m_options[] = { + { NULL }, +}; + +AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m); + +static const AVFilterPad deint_v4l2m2m_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, + { NULL } +}; + +static const AVFilterPad deint_v4l2m2m_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = deint_v4l2m2m_config_props, + }, + { NULL } +}; + +AVFilter ff_vf_deinterlace_v4l2m2m = { + .name = "deinterlace_v4l2m2m", + .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"), + .priv_size = sizeof(DeintV4L2M2MContext), + .init = &deint_v4l2m2m_init, + .uninit = &deint_v4l2m2m_uninit, + .query_formats = &deint_v4l2m2m_query_formats, + .inputs = deint_v4l2m2m_inputs, + .outputs = deint_v4l2m2m_outputs, + .priv_class = &deinterlace_v4l2m2m_class, + .activate = deint_v4l2m2m_activate, +}; diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c new file mode 100644 index 0000000000..61c03a385c --- 
/dev/null +++ b/libavfilter/vf_unsand.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2007 Bobby Bingham + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * unsand video filter: converts sand pixel formats to planar YUV + */ + +#include <string.h> + +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/pixdesc.h" +#include "libavutil/opt.h" +#include "libavutil/rpi_sand_fns.h" + +#include "avfilter.h" +#include "formats.h" +#include "internal.h" +#include "video.h" + +typedef struct UnsandContext { + const AVClass *class; +} UnsandContext; + +static av_cold void uninit(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; +} + +static av_cold int init(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; + + return 0; +} + +// Convert one frame to the negotiated output format (pass-through when the formats already match); always consumes in +static int filter_frame(AVFilterLink *link, AVFrame *in) +{ + AVFilterLink * const outlink = link->dst->outputs[0]; + AVFrame *out = NULL; + int rv = 0; + + if (outlink->format == in->format) { + // If nothing to do then do nothing + out = in; + } + else + { + if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) + { + rv = AVERROR(ENOMEM); + goto fail; + } + if (av_rpi_sand_to_planar_frame(out, in) != 0) + { + rv = AVERROR(EINVAL); + goto fail; + } + + av_frame_free(&in); + } + 
+ return ff_filter_frame(outlink, out); + +fail: + av_frame_free(&out); + av_frame_free(&in); + return rv; +} + +#if 0 +static void dump_fmts(const AVFilterFormats * fmts) +{ + int i; + if (fmts== NULL) { + printf("NULL\n"); + return; + } + for (i = 0; i < fmts->nb_formats; ++i) { + printf(" %d", fmts->formats[i]); + } + printf("\n"); +} +#endif + +static int query_formats(AVFilterContext *ctx) +{ +// UnsandContext *s = ctx->priv; + int ret; + + // If we aren't connected at both ends then just do nothing + if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) + return 0; + + // Our output formats depend on our input formats and we can't/don't + // want to convert between bit depths so we need to wait for the source + // to have an opinion before we do + if (ctx->inputs[0]->incfg.formats == NULL) + return AVERROR(EAGAIN); + + // Accept anything + if (ctx->inputs[0]->outcfg.formats == NULL && + (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0) + return ret; + + // Filter out sand formats + + // Generate a container if we don't already have one + if (ctx->outputs[0]->incfg.formats == NULL) + { + // Somewhat rubbish way of ensuring we have a good structure + static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; + AVFilterFormats *formats = ff_make_format_list(out_fmts); + + if (formats == NULL) + return AVERROR(ENOMEM); + if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) + return ret; + } + + // Replace old format list with new filtered list derived from what our + // input says it can do + { + const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats; + AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats; + enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); + int i; + int n = 0; + int seen_420p = 0; + int seen_420p10 = 0; + if (dst_fmts == NULL) return AVERROR(ENOMEM); // allocation failed: bail out before writing through dst_fmts, leaving the old list in place + for (i = 0; i < src_ff->nb_formats; ++i) { + const enum 
AVPixelFormat f = src_ff->formats[i]; + + switch (f){ + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + if (!seen_420p) { + seen_420p = 1; + dst_fmts[n++] = AV_PIX_FMT_YUV420P; + } + break; + case AV_PIX_FMT_SAND64_10: + case AV_PIX_FMT_YUV420P10: + case AV_PIX_FMT_RPI4_10: + if (!seen_420p10) { + seen_420p10 = 1; + dst_fmts[n++] = AV_PIX_FMT_YUV420P10; + } + break; + default: + dst_fmts[n++] = f; + break; + } + } + + av_freep(&dst_ff->formats); + dst_ff->formats = dst_fmts; + dst_ff->nb_formats = n; + } + +// printf("Unsand: %s calc: ", __func__); +// dump_fmts(ctx->outputs[0]->incfg.formats); + + return 0; +} + + +#define OFFSET(x) offsetof(UnsandContext, x) +static const AVOption unsand_options[] = { + { NULL } +}; + + +AVFILTER_DEFINE_CLASS(unsand); + +static const AVFilterPad avfilter_vf_unsand_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, + { NULL } +}; + +static const AVFilterPad avfilter_vf_unsand_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO + }, + { NULL } +}; + +AVFilter ff_vf_unsand = { + .name = "unsand", + .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), + + .init = init, + .uninit = uninit, + + .query_formats = query_formats, + + .priv_size = sizeof(UnsandContext), + .priv_class = &unsand_class, + + .inputs = avfilter_vf_unsand_inputs, + .outputs = avfilter_vf_unsand_outputs, +}; + diff --git a/libavformat/utils.c b/libavformat/utils.c index 1384b56771..27479e3c40 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -3011,6 +3011,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) return 1; } +#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER +// This should be quite general purpose but avoid possible conflicts +// by limiting usage to cases where we know it works. 
+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) +{ + // Only try fallback if we know it is supported (HEVC only) + const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL : + avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); + int err; + + // Failed to find fallback or we are already at the fallback + if (new_codec == NULL || new_codec == old_codec) + { + return AVERROR_DECODER_NOT_FOUND; + } + + // * This may be dodgy - header says to not use this fn, + // especially if we are going to reopen the context... + // (but it does seem to work for our cases) + if (avcodec_is_open(avctx)) { + avcodec_close(avctx); + } + + if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) + { + return err; + } + + return 0; +} +#else +#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) +#endif + /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ static int try_decode_frame(AVFormatContext *s, AVStream *st, const AVPacket *avpkt, AVDictionary **options) @@ -3049,7 +3083,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, av_dict_set(options ? options : &thread_opt, "lowres", "0", 0); if (s->codec_whitelist) av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); - ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); + if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) + { + // Try fallback if it looks worth a try + ret = try_fallback_decoder(avctx, codec, options ? 
options : &thread_opt); + } if (!options) av_dict_free(&thread_opt); if (ret < 0) { @@ -3080,6 +3118,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO) { ret = avcodec_send_packet(avctx, &pkt); + + // If we are going to want to fall back we should know here + if (ret == AVERROR_DECODER_NOT_FOUND) { + if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) + break; + continue; + } + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) break; if (ret >= 0) @@ -3708,9 +3754,20 @@ FF_ENABLE_DEPRECATION_WARNINGS // Try to just open decoders, in case this is enough to get parameters. if (!has_codec_parameters(st, NULL) && st->internal->request_probe <= 0) { if (codec && !avctx->codec) - if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) - av_log(ic, AV_LOG_WARNING, - "Failed to open codec in %s\n",__FUNCTION__); + { + int err; + + if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) + { + if (err == AVERROR_DECODER_NOT_FOUND) { + err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); + } + if (err < 0) { + av_log(ic, AV_LOG_WARNING, + "Failed to open codec in %s\n",__FUNCTION__); + } + } + } } if (!options) av_dict_free(&thread_opt); diff --git a/libavutil/Makefile b/libavutil/Makefile index 27bafe9e12..c9075ddf8a 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -68,6 +68,7 @@ HEADERS = adler32.h \ rational.h \ replaygain.h \ ripemd.h \ + rpi_sand_fns.h \ samplefmt.h \ sha.h \ sha512.h \ @@ -87,6 +88,7 @@ HEADERS = adler32.h \ film_grain_params.h \ HEADERS-$(CONFIG_LZO) += lzo.h +HEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h ARCH_HEADERS = bswap.h \ intmath.h \ @@ -182,6 +184,7 @@ OBJS-$(CONFIG_LZO) += lzo.o OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o OBJS-$(CONFIG_QSV) += hwcontext_qsv.o +OBJS-$(CONFIG_SAND) += rpi_sand_fns.o OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index 5613813ba8..ab8bcfcf34 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -1,4 +1,6 @@ OBJS += aarch64/cpu.o \ aarch64/float_dsp_init.o \ -NEON-OBJS += aarch64/float_dsp_neon.o +NEON-OBJS += aarch64/float_dsp_neon.o \ + aarch64/rpi_sand_neon.o \ + diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S new file mode 100644 index 0000000000..cdcf71ee67 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.S @@ -0,0 +1,676 @@ +/* +Copyright (c) 2021 Michael Eiler + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: Michael Eiler +*/ + +#include "asm.S" + +// void ff_rpi_sand8_lines_to_planar_y8( +// uint8_t * dest, : x0 +// unsigned int dst_stride, : w1 +// const uint8_t * src, : x2 +// unsigned int src_stride1, : w3, always 128 +// unsigned int src_stride2, : w4 +// unsigned int _x, : w5 +// unsigned int y, : w6 +// unsigned int _w, : w7 +// unsigned int h); : [sp, #0] + +function ff_rpi_sand8_lines_to_planar_y8, export=1 + // w15 contains the number of rows we need to process + ldr w15, [sp, #0] + + // w8 will contain the number of blocks per row + // w8 = floor(_w/stride1) - NOTE(review): taken from w1 (dst_stride), not w7 (_w); presumably the two are equal - confirm + // stride1 is assumed to always be 128 + mov w8, w1 + lsr w8, w8, #7 + + // in case the width of the image is not a multiple of 128, there will + // be an incomplete block at the end of every row + // w9 contains the number of pixels stored within this block + // w9 = _w - w8 * 128 + lsl w9, w8, #7 + sub w9, w7, w9 + + // this is the value we have to add to the src pointer after reading a complete block + // it will move the address to the start of the next block + // w10 = stride2 * stride1 - stride1 + mov w10, w4 + lsl w10, w10, #7 + sub w10, w10, #128 + + // w11 is the row offset, meaning the start offset of the first block of every column + // this will be increased with stride1 within every iteration of the row_loop + eor w11, w11, w11 + + // w12 = 0, processed row count + eor w12, w12, w12 +row_loop: + // start of the first block within the current row + // x13 = row offset + src + mov x13, x2 + add x13, x13, x11 + + // w14 = 0, processed block count + eor w14, w14, w14 + + cmp w8, #0 + beq no_main_y8 + +block_loop: + // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128 + // fortunately these aren't callee saved ones, meaning we don't need to back them up + ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64 + ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64 + + // write these registers back to the destination vector and increase the dst 
address by 128 + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 + st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64 + + // move the source register to the beginning of the next block (x13 = src + block offset) + add x13, x13, x10 + // increase the block counter + add w14, w14, #1 + + // continue with the block_loop if we haven't copied all full blocks yet + cmp w8, w14 + bgt block_loop + + // handle the last block at the end of each row + // at most 127 byte values copied from src to dst +no_main_y8: + eor w5, w5, w5 // i = 0 +incomplete_block_loop_y8: + cmp w5, w9 + bge incomplete_block_loop_end_y8 + + ldrb w6, [x13] + strb w6, [x0] + add x13, x13, #1 + add x0, x0, #1 + + add w5, w5, #1 + b incomplete_block_loop_y8 +incomplete_block_loop_end_y8: + + + // increase the row offset by 128 (stride1) + add w11, w11, #128 + // increment the row counter + add w12, w12, #1 + + // process the next row if we haven't finished yet + cmp w15, w12 + bgt row_loop + + ret +endfunc + + + +// void ff_rpi_sand8_lines_to_planar_c8( +// uint8_t * dst_u, : x0 +// unsigned int dst_stride_u, : w1 == width +// uint8_t * dst_v, : x2 +// unsigned int dst_stride_v, : w3 == width +// const uint8_t * src, : x4 +// unsigned int stride1, : w5 == 128 +// unsigned int stride2, : w6 +// unsigned int _x, : w7 +// unsigned int y, : [sp, #0] +// unsigned int _w, : [sp, #8] +// unsigned int h); : [sp, #16] + +function ff_rpi_sand8_lines_to_planar_c8, export=1 + // w7 = width + ldr w7, [sp, #8] + + // w15 contains the number of rows we need to process + // counts down + ldr w15, [sp, #16] + + // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6 + mov w8, w7 + lsr w8, w8, #6 + + // number of pixels in block at the end of every row + // w9 = _w - (w8 * 64) + lsl w9, w8, #6 + sub w9, w7, w9 + + // Skip at the end of the line to account for stride + sub w12, w1, w7 + + // address delta to the beginning of the next block + // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128 + 
lsl w10, w6, #7 + sub w10, w10, #128 + + // w11 = row address start offset = 0 + eor w11, w11, w11 + +row_loop_c8: + // start of the first block within the current row + // x13 = row offset + src + mov x13, x4 + add x13, x13, x11 + + // w14 = 0, processed block count + eor w14, w14, w14 + + cmp w8, #0 + beq no_main_c8 + +block_loop_c8: + // load the full block -> 128 bytes, the block contains 64 interleaved U and V values + ld2 { v0.16b, v1.16b }, [x13], #32 + ld2 { v2.16b, v3.16b }, [x13], #32 + ld2 { v4.16b, v5.16b }, [x13], #32 + ld2 { v6.16b, v7.16b }, [x13], #32 + + // swap registers so that we can write them out with a single instruction + mov v16.16b, v1.16b + mov v17.16b, v3.16b + mov v18.16b, v5.16b + mov v1.16b, v2.16b + mov v2.16b, v4.16b + mov v3.16b, v6.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + + st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64 + st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64 + + // increment block counter and move src to the beginning of the next block + add w14, w14, #1 + add x13, x13, x10 + + // jump to block_loop_c8 iff the block count is smaller than the number of full blocks + cmp w8, w14 + bgt block_loop_c8 + +no_main_c8: + // handle incomplete block at the end of every row + eor w5, w5, w5 // point counter (w5 held stride1, which this function never reads, so it is reused) +incomplete_block_loop_c8: + cmp w5, w9 + bge incomplete_block_loop_end_c8 + + ldrb w1, [x13] + strb w1, [x0] + add x13, x13, #1 + + ldrb w1, [x13] + strb w1, [x2] + add x13, x13, #1 + + add x0, x0, #1 + add x2, x2, #1 + + add w5, w5, #1 + b incomplete_block_loop_c8 +incomplete_block_loop_end_c8: + + // increase row_offset by stride1 + add w11, w11, #128 + add x0, x0, w12, sxtw + add x2, x2, w12, sxtw + + // jump to row_loop_c8 iff there are rows left to process (w15 counts down) + subs w15, w15, #1 + bgt row_loop_c8 + + ret +endfunc + +//void ff_rpi_sand30_lines_to_planar_y16( +// uint8_t * dest, // [x0] +// unsigned int dst_stride, // [w1] -> assumed to be equal to _w +// const uint8_t * 
src, // [x2] +// unsigned int src_stride1, // [w3] -> 128 +// unsigned int src_stride2, // [w4] +// unsigned int _x, // [w5] +// unsigned int y, // [w6] +// unsigned int _w, // [w7] +// unsigned int h); // [sp, #0] + +function ff_rpi_sand30_lines_to_planar_y16, export=1 + stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + // w6 = argument h + ldr w6, [sp, #48] + + // slice_inc = ((stride2 - 1) * stride1) + mov w5, w4 + sub w5, w5, #1 + lsl w5, w5, #7 + + // total number of bytes per row = (width / 3) * 4 + mov w8, w7 + mov w9, #3 + udiv w8, w8, w9 + lsl w8, w8, #2 + + // number of full 128 byte blocks to be processed + mov w9, #96 + udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96 + + // w10 = number of full integers to process (4 bytes) + // w11 = remaining zero to two 10bit values still to copy over + mov w12, #96 + mul w12, w9, w12 + sub w12, w7, w12 // width - blocks*96 = remaining points per row + mov w11, #3 + udiv w10, w12, w11 // full integers to process = w12 / 3 + mul w11, w10, w11 // #integers *3 + sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3 + + // increase w9 by one if w10+w11 is not zero, and decrease the row count by one + // this is to efficiently copy incomplete blocks at the end of the rows + // the last row is handled explicitly to avoid writing out of bounds + add w22, w10, w11 + cmp w22, #0 + cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise + add w9, w9, w22 + sub w6, w6, #1 + + // store the number of bytes in w20 which we copy too much for every row + // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values) + mov w20, #96*2 + mul w20, w20, w9 + sub w20, w1, w20 + + mov w23, #0 // flag to check whether the last line had already been processed + + // bitmask to clear the upper 6 bits of the result values + mov x19, #0x03ff03ff03ff03ff + dup v22.2d, x19 + + // row counter = 0 + eor w12, w12, w12 +row_loop_y16: + cmp w12, w6 // jump to 
row_loop_y16_fin if we processed all rows + bge row_loop_y16_fin + + mov x13, x2 // row src + eor w14, w14, w14 // full block counter +block_loop_y16: + cmp w14, w9 + bge block_loop_y16_fin + + // load 64 bytes + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 + + // process v0 and v1 + xtn v16.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v17.4h, v0.4s + ushr v0.4s, v0.4s, #10 + xtn v18.4h, v0.4s + + xtn2 v16.8h, v1.4s + and v16.16b, v16.16b, v22.16b + ushr v1.4s, v1.4s, #10 + xtn2 v17.8h, v1.4s + and v17.16b, v17.16b, v22.16b + ushr v1.4s, v1.4s, #10 + xtn2 v18.8h, v1.4s + and v18.16b, v18.16b, v22.16b + + st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 + + // process v2 and v3 + xtn v23.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v24.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v25.4h, v2.4s + + xtn2 v23.8h, v3.4s + and v23.16b, v23.16b, v22.16b + ushr v3.4s, v3.4s, #10 + xtn2 v24.8h, v3.4s + and v24.16b, v24.16b, v22.16b + ushr v3.4s, v3.4s, #10 + xtn2 v25.8h, v3.4s + and v25.16b, v25.16b, v22.16b + + st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 + + // load the second half of the block -> 64 bytes into registers v4-v7 + ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64 + + // process v4 and v5 + xtn v16.4h, v4.4s + ushr v4.4s, v4.4s, #10 + xtn v17.4h, v4.4s + ushr v4.4s, v4.4s, #10 + xtn v18.4h, v4.4s + + xtn2 v16.8h, v5.4s + and v16.16b, v16.16b, v22.16b + ushr v5.4s, v5.4s, #10 + xtn2 v17.8h, v5.4s + and v17.16b, v17.16b, v22.16b + ushr v5.4s, v5.4s, #10 + xtn2 v18.8h, v5.4s + and v18.16b, v18.16b, v22.16b + + st3 { v16.8h, v17.8h, v18.8h }, [x0], #48 + + // v6 and v7 + xtn v23.4h, v6.4s + ushr v6.4s, v6.4s, #10 + xtn v24.4h, v6.4s + ushr v6.4s, v6.4s, #10 + xtn v25.4h, v6.4s + + xtn2 v23.8h, v7.4s + and v23.16b, v23.16b, v22.16b + ushr v7.4s, v7.4s, #10 + xtn2 v24.8h, v7.4s + and v24.16b, v24.16b, v22.16b + ushr v7.4s, v7.4s, #10 + xtn2 v25.8h, v7.4s + and v25.16b, v25.16b, v22.16b + + st3 { v23.8h, v24.8h, v25.8h }, [x0], #48 + + add x13, x13, x5 // row src += slice_inc + add w14, 
w14, #1 + b block_loop_y16 +block_loop_y16_fin: + + + + + add x2, x2, #128 // src += stride1 (start of the next row) + add x0, x0, w20, sxtw // subtract the bytes we copied too much from dst + add w12, w12, #1 + b row_loop_y16 +row_loop_y16_fin: + + // check whether we have incomplete blocks at the end of every row + // in that case decrease row block count by one + // change height back to its original value (meaning increase it by 1) + // and jump back to another iteration of row_loop_y16 + + cmp w23, #1 + beq row_loop_y16_fin2 // don't continue here if we already processed the last row + add w6, w6, #1 // increase height to the original value + sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count + mov w23, #1 + b row_loop_y16 +row_loop_y16_fin2: + + sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr too far ahead, therefore re-add the difference + + // now we've got to handle the last block in the last row + eor w12, w12, w12 // w12 = 0 = counter +integer_loop_y16: + cmp w12, w10 + bge integer_loop_y16_fin + ldr w14, [x13], #4 + and w15, w14, #0x3ff + strh w15, [x0], #2 + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 + add w12, w12, #1 + b integer_loop_y16 +integer_loop_y16_fin: + +final_values_y16: + // remaining point count = w11 (NOTE(review): the ldr below executes even when w11 == 0 - confirm reading 4 more source bytes is safe) + ldr w14, [x13], #4 + cmp w11, #0 + beq final_values_y16_fin + and w15, w14, #0x3ff + strh w15, [x0], #2 + cmp w11, #1 + beq final_values_y16_fin + lsr w14, w14, #10 + and w15, w14, #0x3ff + strh w15, [x0], #2 +final_values_y16_fin: + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + ret +endfunc + +//void ff_rpi_sand30_lines_to_planar_c16( +// uint8_t * dst_u, // [x0] +// unsigned int dst_stride_u, // [w1] == _w*2 +// uint8_t * dst_v, // [x2] +// unsigned int dst_stride_v, // [w3] == _w*2 +// const uint8_t * src, // [x4] +// unsigned int stride1, // [w5] == 128 +// 
unsigned int stride2, // [w6] +// unsigned int _x, // [w7] == 0 +// unsigned int y, // [sp, #0] == 0 +// unsigned int _w, // [sp, #8] -> w3 +// unsigned int h); // [sp, #16] -> w7 + +.macro rpi_sand30_lines_to_planar_c16_block_half + ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64 // 16 packed 32-bit words; three 10-bit fields are extracted from each + + xtn v4.4h, v0.4s // field 0 (reduced to 10 bits by the v16 mask below) + ushr v0.4s, v0.4s, #10 + xtn v5.4h, v0.4s // field 1 + ushr v0.4s, v0.4s, #10 + xtn v6.4h, v0.4s // field 2 + xtn2 v4.8h, v1.4s + ushr v1.4s, v1.4s, #10 + xtn2 v5.8h, v1.4s + ushr v1.4s, v1.4s, #10 + xtn2 v6.8h, v1.4s + and v4.16b, v4.16b, v16.16b + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp], #48 // re-interleave the three fields into sample order as 16-bit values + + xtn v4.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v5.4h, v2.4s + ushr v2.4s, v2.4s, #10 + xtn v6.4h, v2.4s + xtn2 v4.8h, v3.4s + ushr v3.4s, v3.4s, #10 + xtn2 v5.8h, v3.4s + ushr v3.4s, v3.4s, #10 + xtn2 v6.8h, v3.4s + and v4.16b, v4.16b, v16.16b + and v5.16b, v5.16b, v16.16b + and v6.16b, v6.16b, v16.16b + st3 { v4.8h, v5.8h, v6.8h }, [sp] + sub sp, sp, #48 // sp is back at its entry value; 96 bytes of unpacked samples start at sp +.endm + +function ff_rpi_sand30_lines_to_planar_c16, export=1 + stp x19, x20, [sp, #-48]! 
+ stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + ldr w3, [sp, #48+8] // w3 = width + ldr w7, [sp, #48+16] // w7 = height + + // reserve space on the stack for intermediate results + sub sp, sp, #256 + + // number of 128byte blocks per row, w8 = width / 48 + mov w9, #48 + udiv w8, w3, w9 + + // remaining pixels (rem_pix) per row, w9 = width - w8 * 48 + mul w9, w8, w9 + sub w9, w3, w9 + + // row offset, the beginning of the next row to process + eor w10, w10, w10 + + // offset to the beginning of the next block, w11 = stride2 * 128 - 128 + lsl w11, w6, #7 + sub w11, w11, #128 + + // decrease the height by one and in case of remaining pixels increase the block count by one + sub w7, w7, #1 + cmp w9, #0 + cset w19, ne // w19 == 1 iff remaining pixels != 0 + add w8, w8, w19 + + // bytes we have to move dst back by at the end of every row + mov w21, #48*2 + mul w21, w21, w8 + sub w21, w1, w21 // computed from dst_stride_u only; the same value is applied to dst_v below + + mov w20, #0 // w20 = flag, last row processed + + mov x12, #0x03ff03ff03ff03ff + dup v16.2d, x12 // 10-bit mask in every 16-bit lane, used by the block_half macro + + // iterate through rows, row counter = w12 = 0 + eor w12, w12, w12 +row_loop_c16: + cmp w12, w7 + bge row_loop_c16_fin + + // address of row data = src + row_offset + mov x13, x4 + add x13, x13, x10 + + eor w14, w14, w14 +block_loop_c16: + cmp w14, w8 + bge block_loop_c16_fin + + rpi_sand30_lines_to_planar_c16_block_half + + ld2 { v0.8h, v1.8h }, [sp], #32 // de-interleave: even halfwords go to dst_u registers, odd ones to dst_v registers + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #64 + + st1 { v0.8h }, [x0], #16 + st1 { v2.8h }, [x0], #16 + st1 { v4.8h }, [x0], #16 + st1 { v1.8h }, [x2], #16 + st1 { v3.8h }, [x2], #16 + st1 { v5.8h }, [x2], #16 + + rpi_sand30_lines_to_planar_c16_block_half + + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #64 + + st1 { v0.8h }, [x0], #16 + st1 { v2.8h }, [x0], #16 + st1 { v4.8h }, [x0], #16 + st1 { v1.8h }, [x2], #16 + st1 { v3.8h }, [x2], #16 + st1 { v5.8h }, [x2], #16 + + add x13, x13, x11 // offset to next block + add w14, 
w14, #1 + b block_loop_c16 +block_loop_c16_fin: + + add w10, w10, #128 + add w12, w12, #1 + add x0, x0, w21, sxtw // move dst pointers back by w21 + add x2, x2, w21, sxtw + b row_loop_c16 +row_loop_c16_fin: + + cmp w20, #1 + beq row_loop_c16_fin2 + mov w20, #1 + sub w8, w8, w19 // decrease block count by w19 + add w7, w7, #1 // increase height + b row_loop_c16 + +row_loop_c16_fin2: + sub x0, x0, w21, sxtw // undo the end-of-row adjustment for the last row + sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels + + // last incomplete block to be finished + // read operations are fine, stride2 is more than large enough even if rem_pix is 0 + rpi_sand30_lines_to_planar_c16_block_half + ld2 { v0.8h, v1.8h }, [sp], #32 // NOTE(review): v0-v5 are not read afterwards; these loads appear to serve only to step sp + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp], #32 + rpi_sand30_lines_to_planar_c16_block_half + ld2 { v0.8h, v1.8h }, [sp], #32 + ld2 { v2.8h, v3.8h }, [sp], #32 + ld2 { v4.8h, v5.8h }, [sp] + sub sp, sp, #160 // back to the start of the unpacked samples + + mov x4, sp + eor w20, w20, w20 +rem_pix_c16_loop: + cmp w20, w9 + bge rem_pix_c16_fin + + ldr w22, [x4], #4 // one interleaved pair: U in bits 0-15, V in bits 16-31 + strh w22, [x0], #2 // halfword store; a 32-bit str here also wrote V into the next U slot and ran 2 bytes past the last sample + lsr w22, w22, #16 + strh w22, [x2], #2 // halfword store, same reason + + add w20, w20, #1 + b rem_pix_c16_loop +rem_pix_c16_fin: + + add sp, sp, #256 + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + ret +endfunc + + + +//void ff_rpi_sand30_lines_to_planar_p010( +// uint8_t * dest, +// unsigned int dst_stride, +// const uint8_t * src, +// unsigned int src_stride1, +// unsigned int src_stride2, +// unsigned int _x, +// unsigned int y, +// unsigned int _w, +// unsigned int h); + diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h new file mode 100644 index 0000000000..b3aa481ea4 --- /dev/null +++ b/libavutil/aarch64/rpi_sand_neon.h @@ -0,0 +1,55 @@ +/* +Copyright (c) 2021 Michael Eiler + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code 
must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: Michael Eiler +*/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride, + const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, + unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u, + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, + unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride, + const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2, + unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, + uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1, + unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); + +#ifdef __cplusplus +} +#endif + diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile index 5da44b0542..b74b7c4e2f 100644 --- a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ NEON-OBJS += arm/float_dsp_init_neon.o \ arm/float_dsp_neon.o \ + arm/rpi_sand_neon.o \ diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S new file mode 100644 index 0000000000..80890fe985 --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.S @@ -0,0 +1,768 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox +*/ + +#include "libavutil/arm/asm.S" + + +@ General notes: +@ Having done some timing on this in sand8->y8 (Pi4) +@ vst1 (680fps) is a bit faster than vstm (660fps) +@ vldm (680fps) is noticeably faster than vld1 (480fps) +@ (or it might be that a mix is what is required) +@ +@ At least on a Pi4 it is no more expensive to have a single auto-inc register +@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted +@ the latter was better) +@ +@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless +@ the memory is uncached. +@ As these are Sand -> planar we can assume that src is going to be aligned but +@ it is possible that dest isn't (converting to .yuv or other packed format). 
+@ Luckily vst1 is faster than vstm :-) so all is well +@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 +@ .8 stores would let us do non-word aligned stores into uncached but it +@ probably isn't worth it. + + + + +@ void ff_rpi_sand128b_stripe_to_8_10( +@ uint8_t * dest, // [r0] +@ const uint8_t * src1, // [r1] +@ const uint8_t * src2, // [r2] +@ unsigned int lines); // [r3] + +.macro stripe2_to_8, bit_depth + vpush {q4-q7} @ q4-q7 are overwritten below and restored by the vpop before returning +1: + vldm r1!, {q0-q7} @ 128 bytes from src1 + subs r3, #1 @ flags are consumed by the bne at the end; the body runs before lines is tested, so lines == 0 is not handled + vldm r2!, {q8-q15} @ 128 bytes from src2 + vqrshrn.u16 d0, q0, #\bit_depth - 8 @ narrow each 16-bit lane to 8 bits with a rounding, saturating right shift + vqrshrn.u16 d1, q1, #\bit_depth - 8 + vqrshrn.u16 d2, q2, #\bit_depth - 8 + vqrshrn.u16 d3, q3, #\bit_depth - 8 + vqrshrn.u16 d4, q4, #\bit_depth - 8 + vqrshrn.u16 d5, q5, #\bit_depth - 8 + vqrshrn.u16 d6, q6, #\bit_depth - 8 + vqrshrn.u16 d7, q7, #\bit_depth - 8 + vqrshrn.u16 d8, q8, #\bit_depth - 8 @ d8 upwards alias q4-q7, whose source data has already been narrowed above + vqrshrn.u16 d9, q9, #\bit_depth - 8 + vqrshrn.u16 d10, q10, #\bit_depth - 8 + vqrshrn.u16 d11, q11, #\bit_depth - 8 + vqrshrn.u16 d12, q12, #\bit_depth - 8 + vqrshrn.u16 d13, q13, #\bit_depth - 8 + vqrshrn.u16 d14, q14, #\bit_depth - 8 + vqrshrn.u16 d15, q15, #\bit_depth - 8 + vstm r0!, {q0-q7} @ 128 output bytes: the src1 half followed by the src2 half + bne 1b + vpop {q4-q7} + bx lr +.endm + +function ff_rpi_sand128b_stripe_to_8_10, export=1 + stripe2_to_8 10 @ 10-bit input, so every sample is shifted right by 2 +endfunc + +@ void ff_rpi_sand8_lines_to_planar_y8( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. 
However it does respect the dest size for writing + +function ff_rpi_sand8_lines_to_planar_y8, export=1 + push {r4-r8, lr} @ +24 L + ldr r3, [sp, #24] @ src_stride2 + ldr r6, [sp, #36] @ _w + ldr r7, [sp, #32] @ y + lsl r3, #7 @ r3 = src_stride2 * 128 = byte step to the next stripe + sub r1, r6 @ r1 = dst_stride - _w, added to dest after each row + add r8, r2, r7, lsl #7 @ r8 = src + y * 128 = start of the first row + ldr r7, [sp, #40] @ h, used as the row countdown + +10: + mov r2, r8 + add r4, r0, #24 @ NOTE(review): r4 is not read anywhere in this function + mov r5, r6 @ r5 = bytes still to write in this row + mov lr, #0 @ NOTE(review): lr is not read again before the final pop + +1: + vldm r2, {q8-q15} @ 128 bytes of the current row from one stripe + add r2, r3 + subs r5, #128 @ r5 = remaining - 128; negative means fewer than 128 bytes are left + blt 2f + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d20, d21, d22, d23}, [r0]! + vst1.8 {d24, d25, d26, d27}, [r0]! + vst1.8 {d28, d29, d30, d31}, [r0]! + bne 1b @ flags still come from the subs above +11: + subs r7, #1 + add r0, r1 + add r8, #128 @ next row within the stripe + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #64-128 @ r5 is biased by -128, so this tests remaining >= 64 + blt 1f + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d20, d21, d22, d23}, [r0]! + beq 11b @ exactly 64 were left: row finished + vmov q8, q12 @ shift the unwritten data down to q8 onwards for the next stage + vmov q9, q13 + sub r5, #64 + vmov q10, q14 + vmov q11, q15 +1: + cmp r5, #32-128 + blt 1f + vst1.8 {d16, d17, d18, d19}, [r0]! + beq 11b + vmov q8, q10 + sub r5, #32 + vmov q9, q11 +1: + cmp r5, #16-128 + blt 1f + vst1.8 {d16, d17}, [r0]! + beq 11b + sub r5, #16 + vmov q8, q9 +1: + cmp r5, #8-128 + blt 1f + vst1.8 {d16}, [r0]! + beq 11b + sub r5, #8 + vmov d16, d17 +1: + cmp r5, #4-128 + blt 1f + vst1.32 {d16[0]}, [r0]! + beq 11b + sub r5, #4 + vshr.u64 d16, #32 +1: + cmp r5, #2-128 + blt 1f + vst1.16 {d16[0]}, [r0]! + beq 11b + vst1.8 {d16[2]}, [r0]! @ three bytes were left: write the third + b 11b +1: + vst1.8 {d16[0]}, [r0]! @ a single byte was left + b 11b +endfunc + +@ void ff_rpi_sand8_lines_to_planar_c8( +@ uint8_t * dst_u, // [r0] +@ unsigned int dst_stride_u, // [r1] +@ uint8_t * dst_v, // [r2] +@ unsigned int dst_stride_v, // [r3] +@ const uint8_t * src, // [sp, #0] -> r4, r5 +@ unsigned int stride1, // [sp, #4] 128 +@ unsigned int stride2, // [sp, #8] -> r8 +@ unsigned int _x, // [sp, #12] 0 +@ unsigned int y, // [sp, #16] (r7 in prefix) +@ unsigned int _w, // [sp, #20] -> r12, r6 +@ unsigned int h); // [sp, #24] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. 
However it does respect the dest size for writing + +function ff_rpi_sand8_lines_to_planar_c8, export=1 + push {r4-r8, lr} @ +24 + + ldr r5, [sp, #24] + ldr r8, [sp, #32] + ldr r7, [sp, #40] + ldr r6, [sp, #44] + lsl r8, #7 + add r5, r5, r7, lsl #7 + sub r1, r1, r6 + sub r3, r3, r6 + ldr r7, [sp, #48] + vpush {q4-q7} + +10: + mov r4, r5 + mov r12, r6 +1: + subs r12, #64 + vldm r4, {q0-q7} + add r4, r8 + it gt + vldmgt r4, {q8-q15} + add r4, r8 + + vuzp.8 q0, q1 + vuzp.8 q2, q3 + vuzp.8 q4, q5 + vuzp.8 q6, q7 + + vuzp.8 q8, q9 + vuzp.8 q10, q11 + vuzp.8 q12, q13 + vuzp.8 q14, q15 + subs r12, #64 + + @ Rearrange regs so we can use vst1 with 4 regs + vswp q1, q2 + vswp q5, q6 + vswp q9, q10 + vswp q13, q14 + blt 2f + + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d8, d9, d10, d11}, [r0]! + vst1.8 {d16, d17, d18, d19}, [r0]! + vst1.8 {d24, d25, d26, d27}, [r0]! + + vst1.8 {d4, d5, d6, d7 }, [r2]! + vst1.8 {d12, d13, d14, d15}, [r2]! + vst1.8 {d20, d21, d22, d23}, [r2]! + vst1.8 {d28, d29, d30, d31}, [r2]! + bne 1b +11: + subs r7, #1 + add r5, #128 + add r0, r1 + add r2, r3 + bne 10b + vpop {q4-q7} + pop {r4-r8,pc} + +2: + cmp r12, #64-128 + blt 1f + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d8, d9, d10, d11}, [r0]! + vst1.8 {d4, d5, d6, d7 }, [r2]! + vst1.8 {d12, d13, d14, d15}, [r2]! + beq 11b + sub r12, #64 + vmov q0, q8 + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov q4, q12 + vmov q5, q13 + vmov q6, q14 + vmov q7, q15 +1: + cmp r12, #32-128 + blt 1f + vst1.8 {d0, d1, d2, d3 }, [r0]! + vst1.8 {d4, d5, d6, d7 }, [r2]! + beq 11b + sub r12, #32 + vmov q0, q4 + vmov q1, q5 + vmov q2, q6 + vmov q3, q7 +1: + cmp r12, #16-128 + blt 1f + vst1.8 {d0, d1 }, [r0]! + vst1.8 {d4, d5 }, [r2]! + beq 11b + sub r12, #16 + vmov q0, q1 + vmov q2, q3 +1: + cmp r12, #8-128 + blt 1f + vst1.8 {d0}, [r0]! + vst1.8 {d4}, [r2]! + beq 11b + sub r12, #8 + vmov d0, d1 + vmov d4, d5 +1: + cmp r12, #4-128 + blt 1f + vst1.32 {d0[0]}, [r0]! + vst1.32 {d4[0]}, [r2]! 
+ beq 11b + sub r12, #4 + vmov s0, s1 + vmov s8, s9 +1: + cmp r12, #2-128 + blt 1f + vst1.16 {d0[0]}, [r0]! + vst1.16 {d4[0]}, [r2]! + beq 11b + vst1.8 {d0[2]}, [r0]! + vst1.8 {d4[2]}, [r2]! + b 11b +1: + vst1.8 {d0[0]}, [r0]! + vst1.8 {d4[0]}, [r2]! + b 11b +endfunc + + + +@ void ff_rpi_sand30_lines_to_planar_y16( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_y16, export=1 + push {r4-r8, lr} @ +24 + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 + vmov.u16 q15, #0x3ff + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 + mov lr, #0 +1: + vldm r2!, {q10-q13} + add lr, #64 + + vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! 
+ ands lr, #127 + vshrn.u32 d2, q10, #10 + vmovn.u32 d0, q10 + vmovn.u32 d4, q14 + + vshr.u32 q14, q11, #20 + it eq + addeq r2, r3 + vshrn.u32 d3, q11, #10 + vmovn.u32 d1, q11 + vmovn.u32 d5, q14 + + subs r5, #48 + vand q0, q15 + vand q1, q15 + vand q2, q15 + + vshr.u32 q14, q12, #20 + vshrn.u32 d18, q12, #10 + vmovn.u32 d16, q12 + vmovn.u32 d20, q14 + + vshr.u32 q14, q13, #20 + vshrn.u32 d19, q13, #10 + vmovn.u32 d17, q13 + vmovn.u32 d21, q14 + + vand q8, q15 + vand q9, q15 + vand q10, q15 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4], r12 + vst3.16 {d16, d18, d20}, [r0], r12 + vst3.16 {d17, d19, d21}, [r4], r12 + + bne 1b + +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #24-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4] + beq 11b + vmov q0, q8 + sub r5, #24 + vmov q1, q9 + vmov q2, q10 +1: + cmp r5, #12-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0]! + beq 11b + vmov d0, d1 + sub r5, #12 + vmov d2, d3 + vmov d4, d5 +1: + cmp r5, #6-48 + add r4, r0, #6 @ avoid [r0]! on sequential instructions + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0] + vst3.16 {d0[1], d2[1], d4[1]}, [r4] + add r0, #12 + beq 11b + vmov s0, s1 + sub r5, #6 + vmov s4, s5 + vmov s8, s9 +1: + cmp r5, #3-48 + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0]! + beq 11b + sub r5, #3 + vshr.u32 d0, #16 + vshr.u32 d2, #16 +1: + cmp r5, #2-48 + blt 1f + vst2.16 {d0[0], d2[0]}, [r0]! + b 11b +1: + vst1.16 {d0[0]}, [r0]! 
+ b 11b + +endfunc + + +@ void ff_rpi_sand30_lines_to_planar_c16( +@ uint8_t * dst_u, // [r0] +@ unsigned int dst_stride_u, // [r1] +@ uint8_t * dst_v, // [r2] +@ unsigned int dst_stride_v, // [r3] +@ const uint8_t * src, // [sp, #0] -> r4, r5 +@ unsigned int stride1, // [sp, #4] 128 +@ unsigned int stride2, // [sp, #8] -> r8 +@ unsigned int _x, // [sp, #12] 0 +@ unsigned int y, // [sp, #16] (r7 in prefix) +@ unsigned int _w, // [sp, #20] -> r6, r9 +@ unsigned int h); // [sp, #24] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_c16, export=1 + push {r4-r10, lr} @ +32 + ldr r5, [sp, #32] + ldr r8, [sp, #40] + ldr r7, [sp, #48] + ldr r9, [sp, #52] + mov r12, #48 + vmov.u16 q15, #0x3ff + sub r8, #1 + lsl r8, #7 + add r5, r5, r7, lsl #7 + sub r1, r1, r9, lsl #1 + sub r3, r3, r9, lsl #1 + ldr r7, [sp, #56] +10: + mov lr, #0 + mov r4, r5 + mov r6, r9 +1: + vldm r4!, {q0-q3} + add lr, #64 + + @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 + vshr.u32 q14, q0, #20 + vshrn.u32 d16, q0, #10 + vmovn.u32 d18, q0 + ands lr, #127 + vmovn.u32 d20, q14 + + vshr.u32 q14, q1, #20 + vshrn.u32 d17, q1, #10 + vmovn.u32 d19, q1 + vmovn.u32 d21, q14 + + vshr.u32 q14, q2, #20 + vshrn.u32 d22, q2, #10 + vmovn.u32 d24, q2 + vmovn.u32 d26, q14 + + vshr.u32 q14, q3, #20 + vshrn.u32 d23, q3, #10 + vmovn.u32 d25, q3 + add r10, r0, #24 + vmovn.u32 d27, q14 + + it eq + addeq r4, r8 + vuzp.16 q8, q11 + vuzp.16 q9, q12 + vuzp.16 q10, q13 + + @ q8 V0, V3,.. -> q0 + @ q9 U0, U3... + @ q10 U1, U4... + @ q11 U2, U5,.. + @ q12 V1, V4,.. -> q1 + @ q13 V2, V5,.. 
-> q2 + + subs r6, #24 + vand q11, q15 + vand q9, q15 + vand q10, q15 + vand q0, q8, q15 + vand q1, q12, q15 + vand q2, q13, q15 + + blt 2f + + vst3.16 {d18, d20, d22}, [r0], r12 + vst3.16 {d19, d21, d23}, [r10] + add r10, r2, #24 + vst3.16 {d0, d2, d4}, [r2], r12 + vst3.16 {d1, d3, d5}, [r10] + + bne 1b + +11: + subs r7, #1 + add r5, #128 + add r0, r1 + add r2, r3 + bne 10b + + pop {r4-r10, pc} + +@ Partial final write +2: + cmp r6, #-12 + blt 1f + vst3.16 {d18, d20, d22}, [r0]! + vst3.16 {d0, d2, d4}, [r2]! + beq 11b + vmov d18, d19 + vmov d20, d21 + vmov d22, d23 + sub r6, #12 + vmov d0, d1 + vmov d2, d3 + vmov d4, d5 +1: + cmp r6, #-18 + @ Rezip here as it makes the remaining tail handling easier + vzip.16 d0, d18 + vzip.16 d2, d20 + vzip.16 d4, d22 + blt 1f + vst3.16 {d0[1], d2[1], d4[1]}, [r0]! + vst3.16 {d0[0], d2[0], d4[0]}, [r2]! + vst3.16 {d0[3], d2[3], d4[3]}, [r0]! + vst3.16 {d0[2], d2[2], d4[2]}, [r2]! + beq 11b + vmov d0, d18 + vmov d2, d20 + sub r6, #6 + vmov d4, d22 +1: + cmp r6, #-21 + blt 1f + vst3.16 {d0[1], d2[1], d4[1]}, [r0]! + vst3.16 {d0[0], d2[0], d4[0]}, [r2]! + beq 11b + vmov s4, s5 + sub r6, #3 + vmov s0, s1 +1: + cmp r6, #-22 + blt 1f + vst2.16 {d0[1], d2[1]}, [r0]! + vst2.16 {d0[0], d2[0]}, [r2]! + b 11b +1: + vst1.16 {d0[1]}, [r0]! + vst1.16 {d0[0]}, [r2]! + b 11b + +endfunc + +@ void ff_rpi_sand30_lines_to_planar_p010( +@ uint8_t * dest, // [r0] +@ unsigned int dst_stride, // [r1] +@ const uint8_t * src, // [r2] +@ unsigned int src_stride1, // [r3] Ignored - assumed 128 +@ unsigned int src_stride2, // [sp, #0] -> r3 +@ unsigned int _x, // [sp, #4] Ignored - 0 +@ unsigned int y, // [sp, #8] (r7 in prefix) +@ unsigned int _w, // [sp, #12] -> r6 (cur r5) +@ unsigned int h); // [sp, #16] -> r7 +@ +@ Assumes that we are starting on a stripe boundary and that overreading +@ within the stripe is OK. 
However it does respect the dest size for writing + +function ff_rpi_sand30_lines_to_planar_p010, export=1 + push {r4-r8, lr} @ +24 + ldr r3, [sp, #24] + ldr r6, [sp, #36] + ldr r7, [sp, #32] @ y + mov r12, #48 + vmov.u16 q15, #0xffc0 + sub r3, #1 + lsl r3, #7 + sub r1, r1, r6, lsl #1 + add r8, r2, r7, lsl #7 + ldr r7, [sp, #40] + +10: + mov r2, r8 + add r4, r0, #24 + mov r5, r6 + mov lr, #0 +1: + vldm r2!, {q10-q13} + add lr, #64 + + vshl.u32 q14, q10, #6 + ands lr, #127 + vshrn.u32 d4, q10, #14 + vshrn.u32 d2, q10, #4 + vmovn.u32 d0, q14 + + vshl.u32 q14, q11, #6 + it eq + addeq r2, r3 + vshrn.u32 d5, q11, #14 + vshrn.u32 d3, q11, #4 + vmovn.u32 d1, q14 + + subs r5, #48 + vand q2, q15 + vand q1, q15 + vand q0, q15 + + vshl.u32 q14, q12, #6 + vshrn.u32 d20, q12, #14 + vshrn.u32 d18, q12, #4 + vmovn.u32 d16, q14 + + vshl.u32 q14, q13, #6 + vshrn.u32 d21, q13, #14 + vshrn.u32 d19, q13, #4 + vmovn.u32 d17, q14 + + vand q10, q15 + vand q9, q15 + vand q8, q15 + blt 2f + + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4], r12 + vst3.16 {d16, d18, d20}, [r0], r12 + vst3.16 {d17, d19, d21}, [r4], r12 + + bne 1b + +11: + subs r7, #1 + add r0, r1 + add r8, #128 + bne 10b + + pop {r4-r8, pc} + +@ Partial final write +2: + cmp r5, #24-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0], r12 + vst3.16 {d1, d3, d5}, [r4] + beq 11b + vmov q0, q8 + sub r5, #24 + vmov q1, q9 + vmov q2, q10 +1: + cmp r5, #12-48 + blt 1f + vst3.16 {d0, d2, d4}, [r0]! + beq 11b + vmov d0, d1 + sub r5, #12 + vmov d2, d3 + vmov d4, d5 +1: + cmp r5, #6-48 + add r4, r0, #6 @ avoid [r0]! on sequential instructions + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0] + vst3.16 {d0[1], d2[1], d4[1]}, [r4] + add r0, #12 + beq 11b + vmov s0, s1 + sub r5, #6 + vmov s4, s5 + vmov s8, s9 +1: + cmp r5, #3-48 + blt 1f + vst3.16 {d0[0], d2[0], d4[0]}, [r0]! + beq 11b + sub r5, #3 + vshr.u32 d0, #16 + vshr.u32 d2, #16 +1: + cmp r5, #2-48 + blt 1f + vst2.16 {d0[0], d2[0]}, [r0]! + b 11b +1: + vst1.16 {d0[0]}, [r0]! 
+ b 11b + +endfunc + + + diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h new file mode 100644 index 0000000000..447f367bea --- /dev/null +++ b/libavutil/arm/rpi_sand_neon.h @@ -0,0 +1,99 @@ +/* +Copyright (c) 2020 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +#ifndef AVUTIL_ARM_SAND_NEON_H +#define AVUTIL_ARM_SAND_NEON_H + +void ff_rpi_sand128b_stripe_to_8_10( + uint8_t * dest, // [r0] + const uint8_t * src1, // [r1] + const uint8_t * src2, // [r2] + unsigned int lines); // [r3] + +void ff_rpi_sand8_lines_to_planar_y8( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +void ff_rpi_sand8_lines_to_planar_c8( + uint8_t * dst_u, // [r0] + unsigned int dst_stride_u, // [r1] + uint8_t * dst_v, // [r2] + unsigned int dst_stride_v, // [r3] + const uint8_t * src, // [sp, #0] -> r4, r5 + unsigned int stride1, // [sp, #4] 128 + unsigned int stride2, // [sp, #8] -> r8 + unsigned int _x, // [sp, #12] 0 + unsigned int y, // [sp, #16] (r7 in prefix) + unsigned int _w, // [sp, #20] -> r12, r6 + unsigned int h); // [sp, #24] -> r7 + +void ff_rpi_sand30_lines_to_planar_y16( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +void ff_rpi_sand30_lines_to_planar_c16( + uint8_t * dst_u, // [r0] + unsigned int dst_stride_u, // [r1] + uint8_t * dst_v, // [r2] + unsigned int dst_stride_v, // [r3] + const uint8_t * src, // [sp, #0] -> r4, r5 + unsigned int stride1, // [sp, #4] 128 + unsigned int stride2, // [sp, #8] -> r8 + unsigned int _x, // [sp, #12] 0 + unsigned int y, // [sp, #16] (r7 in prefix) + unsigned int _w, // [sp, #20] -> r6, r9 + unsigned int h); // [sp, #24] -> r7 + +void 
ff_rpi_sand30_lines_to_planar_p010( + uint8_t * dest, // [r0] + unsigned int dst_stride, // [r1] + const uint8_t * src, // [r2] + unsigned int src_stride1, // [r3] Ignored - assumed 128 + unsigned int src_stride2, // [sp, #0] -> r3 + unsigned int _x, // [sp, #4] Ignored - 0 + unsigned int y, // [sp, #8] (r7 in prefix) + unsigned int _w, // [sp, #12] -> r6 (cur r5) + unsigned int h); // [sp, #16] -> r7 + +#endif // AVUTIL_ARM_SAND_NEON_H + diff --git a/libavutil/frame.c b/libavutil/frame.c index 75e347bf2f..daa6477485 100644 --- a/libavutil/frame.c +++ b/libavutil/frame.c @@ -16,6 +16,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "config.h" + #include "channel_layout.h" #include "avassert.h" #include "buffer.h" @@ -26,6 +28,9 @@ #include "mem.h" #include "samplefmt.h" #include "hwcontext.h" +#if CONFIG_SAND +#include "rpi_sand_fns.h" +#endif #if FF_API_FRAME_GET_SET MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) @@ -903,6 +908,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); +#if CONFIG_SAND + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) + return 0; +#endif + desc = av_pix_fmt_desc_get(frame->format); if (!desc) return AVERROR_BUG; diff --git a/libavutil/frame.h b/libavutil/frame.h index 7d1f8e2935..a4e7dc915d 100644 --- a/libavutil/frame.h +++ b/libavutil/frame.h @@ -990,6 +990,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); */ const char *av_frame_side_data_name(enum AVFrameSideDataType type); + +static inline int av_frame_cropped_width(const AVFrame * const frame) +{ + return frame->width - (frame->crop_left + frame->crop_right); +} +static inline int av_frame_cropped_height(const AVFrame * const frame) +{ + return frame->height - (frame->crop_top + frame->crop_bottom); +} + /** * @} */ diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c 
index 7a9fdbd263..2a498f9b50 100644 --- a/libavutil/hwcontext_drm.c +++ b/libavutil/hwcontext_drm.c @@ -21,6 +21,7 @@ #include #include #include +#include /* This was introduced in version 4.6. And may not exist all without an * optional package. So to prevent a hard dependency on needing the Linux @@ -31,6 +32,7 @@ #endif #include +#include #include #include "avassert.h" @@ -38,7 +40,9 @@ #include "hwcontext_drm.h" #include "hwcontext_internal.h" #include "imgutils.h" - +#if CONFIG_SAND +#include "libavutil/rpi_sand_fns.h" +#endif static void drm_device_free(AVHWDeviceContext *hwdev) { @@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, AVDRMDeviceContext *hwctx = hwdev->hwctx; drmVersionPtr version; + if (device == NULL) { + hwctx->fd = -1; + return 0; + } + hwctx->fd = open(device, O_RDWR); if (hwctx->fd < 0) return AVERROR(errno); @@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc, if (flags & AV_HWFRAME_MAP_WRITE) mmap_prot |= PROT_WRITE; + if (dst->format == AV_PIX_FMT_NONE) + dst->format = hwfc->sw_format; #if HAVE_LINUX_DMA_BUF_H if (flags & AV_HWFRAME_MAP_READ) map->sync_flags |= DMA_BUF_SYNC_READ; @@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, dst->width = src->width; dst->height = src->height; + dst->crop_top = src->crop_top; + dst->crop_bottom = src->crop_bottom; + dst->crop_left = src->crop_left; + dst->crop_right = src->crop_right; + +#if CONFIG_SAND + // Rework for sand frames + if (av_rpi_is_sand_frame(dst)) { + // As it stands the sand formats hold stride2 in linesize[3] + // linesize[0] & [1] contain stride1 which is always 128 for everything we do + // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] + dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); + dst->linesize[0] = 128; + dst->linesize[1] = 128; + // *** Are we sure src->height is actually what we want ??? 
+ } +#endif err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &drm_unmap_frame, map); @@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, if (!pix_fmts) return AVERROR(ENOMEM); - pix_fmts[0] = ctx->sw_format; + // **** Offer native sand too ???? + pix_fmts[0] = +#if CONFIG_SAND + ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? + AV_PIX_FMT_YUV420P : + ctx->sw_format == AV_PIX_FMT_RPI4_10 ? + AV_PIX_FMT_YUV420P10LE : +#endif + ctx->sw_format; pix_fmts[1] = AV_PIX_FMT_NONE; *formats = pix_fmts; @@ -231,18 +267,80 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, map = av_frame_alloc(); if (!map) return AVERROR(ENOMEM); - map->format = dst->format; + // Map to default + map->format = AV_PIX_FMT_NONE; err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); if (err) goto fail; - map->width = dst->width; - map->height = dst->height; +#if 0 + av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, + hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, + map->width, map->height, + map->linesize[0], + map->linesize[1], + map->linesize[2], + map->linesize[3], + dst->width, dst->height, + dst->linesize[0], + dst->linesize[1], + dst->linesize[2]); +#endif +#if CONFIG_SAND + if (av_rpi_is_sand_frame(map)) { + // Preserve crop - later ffmpeg code assumes that we have in that it + // overwrites any crop that we create with the old values + unsigned int stride2 = map->linesize[3]; + const unsigned int w = FFMIN(dst->width, map->width); + const unsigned int h = FFMIN(dst->height, map->height); + + if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { + av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], + map->data[0], + 128, stride2, + 0, 0, w, h); + av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + map->data[1], + 128, stride2, + 0, 0, w / 2, h / 
2); + } + else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { + av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], + map->data[0], + 128, stride2, + 0, 0, w, h); + av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + map->data[1], + 128, stride2, + 0, 0, w / 2, h / 2); + } + else + { + av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); + err = AVERROR(EINVAL); + goto fail; + } + + dst->width = w; + dst->height = h; + } + else +#endif + { + // Kludge mapped h/w s.t. frame_copy works + map->width = dst->width; + map->height = dst->height; + err = av_frame_copy(dst, map); + } - err = av_frame_copy(dst, map); if (err) + { + av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); goto fail; + } err = 0; fail: @@ -257,7 +355,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, int err; if (src->width > hwfc->width || src->height > hwfc->height) + { + av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); return AVERROR(EINVAL); + } map = av_frame_alloc(); if (!map) diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c index 18c7a0efc8..cada39e92f 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c @@ -2395,6 +2395,38 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .name = "vulkan", .flags = AV_PIX_FMT_FLAG_HWACCEL, }, + [AV_PIX_FMT_SAND128] = { + .name = "sand128", + .nb_components = 3, + .log2_chroma_w = 1, + .log2_chroma_h = 1, + .comp = { + { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ + { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, + }, + [AV_PIX_FMT_SAND64_10] = { + .name = "sand64_10", + .nb_components = 3, + .log2_chroma_w = 1, + .log2_chroma_h = 1, + .comp = { + { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ + { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */ + { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */ + }, + .flags = 0, 
+ }, + [AV_PIX_FMT_RPI4_8] = { + .name = "rpi", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, + [AV_PIX_FMT_RPI4_10] = { + .name = "rpi", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h index 46ef211add..84b7c9dd88 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h @@ -357,6 +357,12 @@ enum AVPixelFormat { AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian +// RPI - not on ifdef so can be got at by calling progs + AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + AV_PIX_FMT_RPI4_8, + AV_PIX_FMT_RPI4_10, AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h new file mode 100644 index 0000000000..0d5d203dc3 --- /dev/null +++ b/libavutil/rpi_sand_fn_pw.h @@ -0,0 +1,227 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +// * Included twice from rpi_sand_fn with different PW + +#define STRCAT(x,y) x##y + +#if PW == 1 +#define pixel uint8_t +#define FUNC(f) STRCAT(f, 8) +#elif PW == 2 +#define pixel uint16_t +#define FUNC(f) STRCAT(f, 16) +#else +#error Unexpected PW +#endif + +// Fetches a single patch - offscreen fixup not done here +// w <= stride1 +// unclipped +void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x; + const unsigned int w = _w; + const unsigned int mask = stride1 - 1; + +#if PW == 1 && HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { + memcpy(dst, p, w); + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { + unsigned int j; + const uint8_t * p = p2; + uint8_t * d = dst; + memcpy(d, p1, w1); + d += w1; + for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { + memcpy(d, p, stride1); + } + memcpy(d, p, w3); + } + } +} + +// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) + +void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x * 2; + const unsigned int w = _w * 2; + const unsigned int mask = stride1 - 1; + +#if PW == 1 && HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { + pixel * du = (pixel *)dst_u; + pixel * dv = (pixel *)dst_v; + const pixel * p = (const pixel *)p1; + for (unsigned int k = 0; k < w; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const unsigned int sstride_p = (sstride - stride1) / PW; + + const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { + unsigned int j; + const pixel * p = (const pixel *)p1; + pixel * du = (pixel *)dst_u; + pixel * dv = (pixel *)dst_v; + for (unsigned int k = 0; k < w1; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { + for (unsigned int k = 0; k < stride1; k += 2 * PW) { + *du++ = *p++; + *dv++ = *p++; + } + } + for (unsigned int k = 0; k < w3; k += 2 * PW) { + *du++ 
= *p++; + *dv++ = *p++; + } + } + } +} + +void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x = _x * 2; + const unsigned int w = _w * 2; + const unsigned int mask = stride1 - 1; + if ((x & ~mask) == ((x + w) & ~mask)) { + // All in one sand stripe + uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; + for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { + const pixel * su = (const pixel *)src_u; + const pixel * sv = (const pixel *)src_v; + pixel * p = (pixel *)p1; + for (unsigned int k = 0; k < w; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + } + else + { + // Two+ stripe + const unsigned int sstride = stride1 * stride2; + const unsigned int sstride_p = (sstride - stride1) / PW; + + const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; + const uint8_t * p2 = p1 + sstride - (x & mask); + const unsigned int w1 = stride1 - (x & mask); + const unsigned int w3 = (x + w) & mask; + const unsigned int w2 = w - (w1 + w3); + + for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { + unsigned int j; + const pixel * su = (const pixel *)src_u; + const pixel * sv = (const pixel *)src_v; + pixel * p = (pixel *)p1; + for (unsigned int k = 0; k < w1; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { + for (unsigned int k = 0; k < stride1; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + for (unsigned int k = 0; k < w3; k += 2 * PW) { + *p++ = *su++; + *p++ = *sv++; + } + } + } +} + + +#undef pixel +#undef STRCAT +#undef FUNC + diff --git a/libavutil/rpi_sand_fns.c 
b/libavutil/rpi_sand_fns.c new file mode 100644 index 0000000000..1f543e9357 --- /dev/null +++ b/libavutil/rpi_sand_fns.c @@ -0,0 +1,356 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Authors: John Cox +*/ + +#include "config.h" +#include +#include +#include "rpi_sand_fns.h" +#include "avassert.h" +#include "frame.h" + +#if ARCH_ARM && HAVE_NEON +#include "arm/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#elif ARCH_AARCH64 && HAVE_NEON +#include "aarch64/rpi_sand_neon.h" +#define HAVE_SAND_ASM 1 +#else +#define HAVE_SAND_ASM 0 +#endif + +#define PW 1 +#include "rpi_sand_fn_pw.h" +#undef PW + +#define PW 2 +#include "rpi_sand_fn_pw.h" +#undef PW + +#if 1 +// Simple round +static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +{ + const unsigned int rnd = (1 << shr) >> 1; + const uint16_t * src = (const uint16_t *)_src; + + for (; n != 0; --n) { + *dst++ = (*src++ + rnd) >> shr; + } +} +#else +// Dithered variation +static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +{ + unsigned int rnd = (1 << shr) >> 1; + const unsigned int mask = ((1 << shr) - 1); + const uint16_t * src = (const uint16_t *)_src; + + for (; n != 0; --n) { + rnd = *src++ + (rnd & mask); + *dst++ = rnd >> shr; + } +} +#endif + +// Fetches a single patch - offscreen fixup not done here +// w <= stride1 +// unclipped +// _x & _w in pixels, strides in bytes +void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word + const unsigned int xskip0 = _x - (x0 >> 2) * 3; + const unsigned int x1 = ((_x + _w) / 3) * 4; + const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; + const unsigned int mask = stride1 - 1; + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM + if (_x == 0) { + 
ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if (x0 == x1) { + // ******************* + // Partial single word xfer + return; + } + + for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) + { + unsigned int x = x0; + const uint32_t * p = (const uint32_t *)p0; + uint16_t * d = (uint16_t *)dst; + + if (xskip0 != 0) { + const uint32_t p3 = *p++; + + if (xskip0 == 1) + *d++ = (p3 >> 10) & 0x3ff; + *d++ = (p3 >> 20) & 0x3ff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + while (x != x1) { + const uint32_t p3 = *p++; + *d++ = p3 & 0x3ff; + *d++ = (p3 >> 10) & 0x3ff; + *d++ = (p3 >> 20) & 0x3ff; + + if (((x += 4) & mask) == 0) + p += slice_inc; + } + + if (xrem1 != 0) { + const uint32_t p3 = *p; + + *d++ = p3 & 0x3ff; + if (xrem1 == 2) + *d++ = (p3 >> 10) & 0x3ff; + } + } +} + + +void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h) +{ + const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word + const unsigned int xskip0 = _x - (x0 >> 3) * 3; + const unsigned int x1 = ((_x + _w) / 3) * 8; + const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; + const unsigned int mask = stride1 - 1; + const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; + const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words + +#if HAVE_SAND_ASM + if (_x == 0) { + ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, + src, stride1, stride2, _x, y, _w, h); + return; + } +#endif + + if (x0 == x1) { + // ******************* + // Partial single word xfer + return; + } + + for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) + { + unsigned int x 
= x0; + const uint32_t * p = (const uint32_t *)p0; + uint16_t * du = (uint16_t *)dst_u; + uint16_t * dv = (uint16_t *)dst_v; + + if (xskip0 != 0) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + if (xskip0 == 1) + { + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = (p3b >> 0) & 0x3ff; + } + *du++ = (p3b >> 10) & 0x3ff; + *dv++ = (p3b >> 20) & 0x3ff; + + if (((x += 8) & mask) == 0) + p += slice_inc; + } + + while (x != x1) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + *du++ = p3a & 0x3ff; + *dv++ = (p3a >> 10) & 0x3ff; + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = p3b & 0x3ff; + *du++ = (p3b >> 10) & 0x3ff; + *dv++ = (p3b >> 20) & 0x3ff; + + if (((x += 8) & mask) == 0) + p += slice_inc; + } + + if (xrem1 != 0) { + const uint32_t p3a = *p++; + const uint32_t p3b = *p++; + + *du++ = p3a & 0x3ff; + *dv++ = (p3a >> 10) & 0x3ff; + if (xrem1 == 2) + { + *du++ = (p3a >> 20) & 0x3ff; + *dv++ = p3b & 0x3ff; + } + } + } +} + + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, + const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, + unsigned int w, unsigned int h, const unsigned int shr) +{ + const unsigned int n = dst_stride1 / 2; + unsigned int j; + + // This is true for our current layouts + av_assert0(dst_stride1 == src_stride1); + + // As we have the same stride1 for src & dest and src is wider than dest + // then if we loop on src we can always write contiguously to dest + // We make no effort to copy an exact width - round up to nearest src stripe + // as we will always have storage in dest for that + +#if ARCH_ARM && HAVE_NEON + if (shr == 3 && src_stride1 == 128) { + for (j = 0; j + n < w; j += dst_stride1) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + const uint8_t * s2 = s1 + src_stride1 * src_stride2; + + ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); + } + } + else +#endif + { + for (j = 0; j + 
n < w; j += dst_stride1) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + const uint8_t * s2 = s1 + src_stride1 * src_stride2; + + for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { + cpy16_to_8(d, s1, n, shr); + cpy16_to_8(d + n, s2, n, shr); + } + } + } + + // Fix up a trailing dest half stripe + if (j < w) { + uint8_t * d = dst + j * dst_stride2; + const uint8_t * s1 = src + j * 2 * src_stride2; + + for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { + cpy16_to_8(d, s1, n, shr); + } + } +} + +int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) +{ + const int w = av_frame_cropped_width(src); + const int h = av_frame_cropped_height(src); + const int x = src->crop_left; + const int y = src->crop_top; + + // We will crop as part of the conversion + dst->crop_top = 0; + dst->crop_left = 0; + dst->crop_bottom = 0; + dst->crop_right = 0; + + switch (src->format){ + case AV_PIX_FMT_SAND128: + case AV_PIX_FMT_RPI4_8: + switch (dst->format){ + case AV_PIX_FMT_YUV420P: + av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; + default: + return -1; + } + break; + case AV_PIX_FMT_SAND64_10: + switch (dst->format){ + case AV_PIX_FMT_YUV420P10: + av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x*2, y, w*2, h); + av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y/2, w, h/2); + break; + default: + return -1; + } + 
break; + case AV_PIX_FMT_RPI4_10: + switch (dst->format){ + case AV_PIX_FMT_YUV420P10: + av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], + src->data[0], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x, y, w, h); + av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], + dst->data[2], dst->linesize[2], + src->data[1], + av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), + x/2, y/2, w/2, h/2); + break; + default: + return -1; + } + break; + default: + return -1; + } + + return av_frame_copy_props(dst, src); +} diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h new file mode 100644 index 0000000000..634b55e800 --- /dev/null +++ b/libavutil/rpi_sand_fns.h @@ -0,0 +1,183 @@ +/* +Copyright (c) 2018 Raspberry Pi (Trading) Ltd. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Authors: John Cox +*/ + +#ifndef AVUTIL_RPI_SAND_FNS +#define AVUTIL_RPI_SAND_FNS + +#include "libavutil/frame.h" + +// For all these fns _x & _w are measured as coord * PW +// For the C fns coords are in chroma pels (so luma / 2) +// Strides are in bytes + +void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_planar_to_sand_c8(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, 
unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_planar_to_sand_c16(uint8_t * dst_c, + unsigned int stride1, unsigned int stride2, + const uint8_t * src_u, const unsigned int src_stride_u, + const uint8_t * src_v, const unsigned int src_stride_v, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + +void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); +void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, + uint8_t * dst_v, const unsigned int dst_stride_v, + const uint8_t * src, + unsigned int stride1, unsigned int stride2, + unsigned int _x, unsigned int y, + unsigned int _w, unsigned int h); + + +// w/h in pixels +void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, + const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, + unsigned int w, unsigned int h, const unsigned int shr); + + +// dst must contain required pixel format & allocated data buffers +// Cropping on the src buffer will be honoured and dst crop will be set to zero +int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); + + +static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) +{ +#ifdef RPI_ZC_SAND128_ONLY + // If we are sure we only only support 128 byte sand formats replace the + // var with a constant which should allow for better optimisation + return 128; +#else + return frame->linesize[0]; +#endif +} + +static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) +{ + return frame->linesize[3]; +} + + +static inline int av_rpi_is_sand_format(const int format) +{ + return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); +} + +static inline int av_rpi_is_sand_frame(const AVFrame * const frame) 
+{ + return av_rpi_is_sand_format(frame->format); +} + +static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) +{ + return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); +} + +static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) +{ + return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); +} + +static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) +{ + return (frame->format == AV_PIX_FMT_RPI4_10); +} + +static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) +{ + return av_rpi_is_sand8_frame(frame) ? 0 : 1; +} + +// If x is measured in bytes (not pixels) then this works for sand64_16 as +// well as sand128 - but in the general case we work that out + +static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) +{ + const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); + const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); + const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); + const unsigned int x1 = x & (stride1 - 1); + const unsigned int x2 = x ^ x1; + + return x1 + stride1 * y + stride2 * x2; +} + +static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) +{ + const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); + const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); + const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); + const unsigned int x1 = x & (stride1 - 1); + const unsigned int x2 = x ^ x1; + + return x1 + stride1 * y_c + stride2 * x2; +} + +static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +{ + return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); +} + +static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const 
unsigned int y) +{ + return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); +} + +#endif + diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 index 0000000000..b050971f63 --- /dev/null +++ b/pi-util/BUILD.txt @@ -0,0 +1,59 @@ +Building Pi FFmpeg +================== + +Currently only building on a Pi is supported. +This builds ffmpeg the way I've tested it + +Get all dependencies - the current package dependencies are good enough + +$ sudo apt-get build-dep ffmpeg + +Configure using the pi-util/conf_native.sh script +------------------------------------------------- + +This sets the normal release options and creates an output dir to build into +The directory name will depend on system and options but will be under out/ + +There are a few choices here + --mmal build including the legacy mmal-based decoders and zero-copy code + this requires appropriate libraries which currently will exist for + armv7 but not arm64 + --noshared + Build a static image rather than a shared library one. Static is + easier for testing as there is no need to worry about library + paths being confused and therefore running the wrong code. Shared + is what is needed, in most cases, when building for use by other + programs. + +So for a static build +--------------------- + +$ pi-util/conf_native.sh --noshared + +$ make -j8 -C out/<builddir> + +You can now run ffmpeg directly from where it was built + +For a shared build +------------------ + +$ pi-util/conf_native.sh + +You will normally want an install target if shared. Note that the script has +set this up to be generated in out/<builddir>/install, you don't have to worry +about overwriting your system libs. + +$ make -j8 -C out/<builddir> install + +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was +built or install the image on the system - you have to be careful to get rid +of all other ffmpeg libs or confusion may result. There is a little script +that wipes all other versions - obviously use with care! 
+ +$ sudo pi-util/clean_usr_libs.sh + +Then simply copying from the install to /usr works + +$ sudo cp -r out/<builddir>/install/* /usr + + diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt new file mode 100644 index 0000000000..fcce72226a --- /dev/null +++ b/pi-util/NOTES.txt @@ -0,0 +1,69 @@ +Notes on the hevc_rpi decoder & associated support code +------------------------------------------------------- + +There are 3 main parts to the existing code: + +1) The decoder - this is all in libavcodec as rpi_hevc*. + +2) A few filters to deal with Sand frames and a small patch to +automatically select the sand->i420 converter when required. + +3) A kludge in ffmpeg.c to display the decoded video. This could & should +be converted into a proper ffmpeg display module. + + +Decoder +------- + +The decoder is a modified version of the existing ffmpeg hevc decoder. +Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder. +More complex bitstreams can be up to ~200% faster but particularly easy +streams can cut its advantage down to ~50%. This means that a Pi3+ can +display nearly all 8-bit 1080p30 streams and with some overclocking it can +display most lower bitrate 10-bit 1080p30 streams - this latter case is +not helped by the requirement to downsample to 8-bit before display on a +Pi. + +It has had co-processor offload added for inter-pred and large block +residual transform. Various parts have had optimized ARM NEON assembler +added and the existing ARM asm sections have been profiled and +re-optimized for A53. The main C code has been substantially reworked at +its lower levels in an attempt to optimize it and minimize memory +bandwidth. To some extent code paths that deal with frame types that it +doesn't support have been pruned. + +It outputs frames in Broadcom Sand format. This is a somewhat annoying +layout that doesn't fit into ffmpeg's standard frame descriptions. 
It has +vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for +the stripe followed by interleaved U & V, that is then followed by the Y +for the next stripe, etc. The final stripe is always padded to +stripe-width. This is used in an attempt to help with cache locality and +cut down on the number of dram bank switches. It is annoying to use for +inter-pred with conventional processing but the way the Pi QPU (which is +used for inter-pred) works means that it has negligible downsides here and +the improved memory performance exceeds the overhead of the increased +complexity in the rest of the code. + +Frames must be allocated out of GPU memory (as otherwise they can't be +accessed by the co-processors). Utility functions (in rpi_zc.c) have been +written to make this easier. As the frames are already in GPU memory they +can be displayed by the Pi h/w without any further copying. + + +Known non-features +------------------ + +Frame allocation should probably be done in some other way in order to fit +into the standard framework better. + +Sand frames are currently declared as software frames, there is an +argument that they should be hardware frames but they aren't really. + +There must be a better way of auto-selecting the hevc_rpi decoder over the +normal s/w hevc decoder, but I became confused by the existing h/w +acceleration framework and what I wanted to do didn't seem to fit in +neatly. 
+ +Display should be a proper device rather than a kludge in ffmpeg.c + + diff --git a/pi-util/TESTMESA.txt b/pi-util/TESTMESA.txt new file mode 100644 index 0000000000..92bc13a3df --- /dev/null +++ b/pi-util/TESTMESA.txt @@ -0,0 +1,82 @@ +# Setup & Build instructions for testing Argon30 mesa support (on Pi4) + +# These assume that the drm_mmal test for Sand8 has been built on this Pi +# as the build relies on many of the same files + +# 1st get everything required to build ffmpeg +# If sources aren't already enabled on your Pi then enable them +sudo su +sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list +sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list +mv /tmp/sources.list /etc/apt/ +mv /tmp/raspi.list /etc/apt/sources.list.d/ +apt update + +# Get dependencies +sudo apt build-dep ffmpeg + +sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev + +# Enable H265 V4L2 request decoder +sudo su +echo dtoverlay=rpivid-v4l2 >> /boot/config.txt +# You may also want to add more CMA if you are going to try 4k videos +# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read +# dtoverlay=vc4-fkms-v3d,cma-512 +reboot +# Check it has turned up +ls -la /dev/video* +# This should include video19 +# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19 + +# Currently on the Pi the linux headers from the debian distro don't match +# the kernel that we ship and we need to update them - hopefully this step +# will be unneeded in the future +sudo apt install git bc bison flex libssl-dev make +git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y +cd linux +KERNEL=kernel7l +make bcm2711_defconfig +make headers_install +sudo cp -r usr/include/linux /usr/include +cd .. 
+ +# Config - this builds a statically linked ffmpeg which is easier for testing +pi-util/conf_native.sh --noshared + +# Build (this is a bit dull) +# If you want to poke at the source then libavdevice/egl_vout.c contains the +# output code +cd out/armv7-static-rel + +# Check that you have actually configured V4L2 request +grep HEVC_V4L2REQUEST config.h +# You are hoping for +# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1 +# if you get 0 then the config has failed + +make -j6 + +# Grab test streams +wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv +wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv +wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv + +# Test i420 output (works currently) +./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl - + +# Test Sand8 output - doesn't currently work but should once you have +# Sand8 working in drm_mmal. I can't guarantee that this will work as +# I can't test this path with a known working format, but the debug looks +# good. 
If this doesn't work & drm_mmal does with sand8 then come back to me +# The "show_all 1" forces vout to display every frame otherwise it drops any +# frame that would cause it to block +./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl - + +# Test Sand30 - doesn't currently work +# (Beware that when FFmpeg errors out it often leaves your terminal window +# in a state where you need to reset it) +./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl - + + + diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh new file mode 100755 index 0000000000..b3b2d5509d --- /dev/null +++ b/pi-util/clean_usr_libs.sh @@ -0,0 +1,26 @@ +set -e +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* +rm -f $U/libavutil.* +rm -f $U/libswresample.* +rm -f $U/libswscale.* +U=/usr/lib/arm-linux-gnueabihf/neon/vfp +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* +rm -f $U/libavutil.* +rm -f $U/libswresample.* +rm -f $U/libswscale.* +U=/usr/lib/aarch64-linux-gnu +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* +rm -f $U/libavutil.* +rm -f $U/libswresample.* +rm -f $U/libswscale.* + diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh new file mode 100644 index 0000000000..9e3bbfa190 --- /dev/null +++ b/pi-util/conf_arm64_native.sh @@ -0,0 +1,45 @@ +echo "Configure for ARM64 native build" + +#RPI_KEEPS="-save-temps=obj" + +SHARED_LIBS="--enable-shared" +if [ "$1" == "--noshared" ]; then + SHARED_LIBS="--disable-shared" + echo Static libs + OUT=out/arm64-static-rel +else + echo Shared libs + OUT=out/arm64-shared-rel +fi + +mkdir -p $OUT +cd $OUT + +A=aarch64-linux-gnu +USR_PREFIX=`pwd`/install +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + +../../configure \ + --prefix=$USR_PREFIX\ + 
--libdir=$LIB_PREFIX\ + --incdir=$INC_PREFIX\ + --disable-stripping\ + --disable-thumb\ + --disable-mmal\ + --enable-sand\ + --enable-v4l2-request\ + --enable-libdrm\ + --enable-epoxy\ + --enable-libudev\ + --enable-vout-drm\ + --enable-vout-egl\ + $SHARED_LIBS\ + --extra-cflags="-ggdb" + +# --enable-decoder=hevc_rpi\ +# --enable-extra-warnings\ +# --arch=armv71\ + +# gcc option for getting asm listing +# -Wa,-ahls diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv new file mode 100644 index 0000000000..4efd5d1c67 --- /dev/null +++ b/pi-util/conf_h265.2016.csv @@ -0,0 +1,195 @@ +1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8 +1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8 +1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8 +1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8 +1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8 +1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8 +1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8 +1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8 +1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8 +1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8 +1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8 +1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8 +1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8 +1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8 +1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8 +1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8 +1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8 +1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8 +1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8 
+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8 +1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8 +1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10 +1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8 +1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8 +1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8 +1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8 +1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8 +1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8 +1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8 +1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8 +1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8 +1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8 +1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8 +1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8 +1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8 +1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8 +1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8 +1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8 +1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8 +1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8 +1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8 +1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8 +1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10 +1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8 +1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8 +1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8 +1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8 
+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8 +1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8 +1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8 +1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8 +1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8 +1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8 +1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8 +1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8 +1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8 +1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8 +1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8 +1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8 +1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8 +1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8 +1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8 +1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8 +1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8 +1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8 +1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8 +1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8 +1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8 +1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8 +1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8 +1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8 +1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8 +1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8 +1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8 
+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8 +1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8 +1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8 +1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8 +1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8 +1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8 +1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8 +1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8 +1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8 +1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8 +1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8 +1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8 +1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8 +1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8 +1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8 +1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8 +1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8 +1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8 +1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8 +1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8 +1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8 +1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8 +1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8 +1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8 +1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8 +1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8 +1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8 +1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8 +1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8 
+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8 +1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8 +1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8 +1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8 +1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8 +1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8 +1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8 +1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8 +1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8 +1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8 +1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8 +1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8 +1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8 +1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8 +1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8 +1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8 +1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8 +1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8 +1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8 +1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8 +1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8 +1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8 +1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8 +1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8 +3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10 +1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8 
+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8 +3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8 +1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10 +1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8 +1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8 +1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10 +1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8 +1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8 +1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8 +1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8 +1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8 +1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10 +1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8 +1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0 +0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8 +0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8 +0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10 
+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8 +0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8 +1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0 +0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8 +0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 +0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 +0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 +0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 +0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 +0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 +0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 +0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 +1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10 +1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0 +1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0 
+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0 +1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0 +1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0 +1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0 +0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0 +0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8 +0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8 +1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0 +1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8 +1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0 +1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0 +1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0 +1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0 +1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0 +1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0 +1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0 +0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8 +0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10 +0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10 +0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8 
+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8 +0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8 +0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8 +0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8 +1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8 +1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8 +1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8 +1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8 +1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8 diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv new file mode 100644 index 0000000000..6082641271 --- /dev/null +++ b/pi-util/conf_h265.2016_HEVC_v1.csv @@ -0,0 +1,147 @@ +1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 +1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 
+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 +1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 +1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 +1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 
+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 +1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 +1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 +1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 +1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 
+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 +1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 +1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 +2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 
+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 +1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 +1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 +1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 +1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 +1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 
+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 +1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 +1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 
+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 +1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 +1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 +1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 +1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 +1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 +1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 
+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 +1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 +1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 
+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 
+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh new file mode 100755 index 0000000000..37cea71756 --- /dev/null +++ b/pi-util/conf_native.sh @@ -0,0 +1,106 @@ +echo "Configure for native build" + +FFSRC=`pwd` +MC=`dpkg --print-architecture` +BUILDBASE=$FFSRC/out + +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" + +NOSHARED= +MMAL= + +while [ "$1" != "" ] ; do + case $1 in + --noshared) + NOSHARED=1 + ;; + --mmal) + MMAL=1 + ;; + *) + echo "Usage $0: [--noshared] [--mmal]" + exit 1 + ;; + esac + shift +done + + +MCOPTS= +RPI_INCLUDES= +RPI_LIBDIRS= +RPI_DEFINES= +RPI_EXTRALIBS= + +if [ "$MC" == "arm64" ]; then + echo "M/C aarch64" + A=aarch64-linux-gnu + B=arm64 +elif [ "$MC" == "armhf" ]; then + echo "M/C armv7" + A=arm-linux-gnueabihf + B=armv7 + MCOPTS="--arch=armv6t2 --cpu=cortex-a7" + RPI_DEFINES=-mfpu=neon-vfpv4 +else + echo Unexpected architecture $MC + exit 1 +fi + +if [ $MMAL ]; then + RPI_OPT_VC=/opt/vc + RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" + RPI_LIBDIRS="-L$RPI_OPT_VC/lib" + RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000" + RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group" + RPIOPTS="--enable-mmal --enable-rpi" +else + 
RPIOPTS="--disable-mmal --enable-sand" +fi + +C=`lsb_release -sc` +V=`cat RELEASE` + +SHARED_LIBS="--enable-shared" +if [ $NOSHARED ]; then + SHARED_LIBS="--disable-shared" + OUT=$BUILDBASE/$B-$C-$V-static-rel + echo Static libs +else + echo Shared libs + OUT=$BUILDBASE/$B-$C-$V-shared-rel +fi + +USR_PREFIX=$OUT/install +LIB_PREFIX=$USR_PREFIX/lib/$A +INC_PREFIX=$USR_PREFIX/include/$A + +echo Destination directory: $OUT +mkdir -p $OUT +# Nothing under here need worry git - including this .gitignore! +echo "**" > $BUILDBASE/.gitignore +cd $OUT + +$FFSRC/configure \ + --prefix=$USR_PREFIX\ + --libdir=$LIB_PREFIX\ + --incdir=$INC_PREFIX\ + $MCOPTS\ + --disable-stripping\ + --disable-thumb\ + --enable-v4l2-request\ + --enable-libdrm\ + --enable-vout-egl\ + --enable-vout-drm\ + $SHARED_LIBS\ + $RPIOPTS\ + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ + --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ + --extra-ldflags="$RPI_LIBDIRS"\ + --extra-libs="$RPI_EXTRALIBS"\ + --extra-version="rpi" + + +# gcc option for getting asm listing +# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100755 index 0000000000..657568014e --- /dev/null +++ b/pi-util/ffconf.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +import string +import os +import subprocess +import re +import argparse +import sys +import csv +from stat import * + +CODEC_HEVC_RPI = 1 +HWACCEL_RPI = 2 +HWACCEL_DRM = 3 +HWACCEL_VAAPI = 4 + +def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec): + hwaccel = "" + if dectype == HWACCEL_RPI: + hwaccel = "rpi" + elif dectype == HWACCEL_DRM: + hwaccel = "drm" + elif dectype == HWACCEL_VAAPI: + hwaccel = "vaapi" + + pix_fmt = [] + if pix == "8": + pix_fmt = ["-pix_fmt", "yuv420p"] + elif pix == "10": + pix_fmt = ["-pix_fmt", "yuv420p10le"] + elif pix == "12": + pix_fmt = ["-pix_fmt", "yuv420p12le"] + + tmp_root = "/tmp" + + names = srcname.split('/') + while len(names) > 1: + tmp_root = os.path.join(tmp_root, 
names[0]) + del names[0] + name = names[0] + + if not os.path.exists(tmp_root): + os.makedirs(tmp_root) + + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) + except: + pass + + flog = open(os.path.join(tmp_root, name + ".log"), "wt") + + ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file] + + # Unaligned needed for cropping conformance + if hwaccel: + rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT) + else: + rstr = subprocess.call( + [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], + stdout=flog, stderr=subprocess.STDOUT) + + try: + m1 = None + m2 = None + with open(os.path.join(fileroot, md5_file)) as f: + for line in f: + m1 = re.search("[0-9a-f]{32}", line.lower()) + if m1: + break + + with open(dec_file) as f: + m2 = re.search("[0-9a-f]{32}", f.readline()) + except: + pass + + if m1 and m2 and m1.group() == m2.group(): + print("Match: " + m1.group(), file=flog) + rv = 0 + elif not m1: + print("****** Cannot find m1", file=flog) + rv = 3 + elif not m2: + print("****** Cannot find m2", file=flog) + rv = 2 + else: + print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog) + rv = 1 + flog.close() + return rv + +def scandir(root): + aconf = [] + ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: + test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" + md5_file = "?" 
def runtest(name, tests):
    """Return True when the test called *name* is selected by *tests*.

    An empty *tests* selects everything.  Otherwise *name* is selected when
    it starts with one of the given strings, or contains "/" followed by
    one of them.
    """
    if not tests:
        return True
    return any(name.startswith(t) or ("/" + t) in name for t in tests)


def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
    """Run every enabled, selected test in *csva* and print a summary.

    Each row is ``[expect, name, es_file, md5_file, (pix)]``.  ``expect`` is
    an integer: 0 = skip, 1 = expected to pass, 2 = expected to fail with a
    mismatch, 3 = expected to produce no output.  The fifth column is
    optional and is handed to testone() as its ``pix`` argument; rows
    without it pass "" (no forced pix_fmt).  Empty rows are ignored.

    Parameters:
        csva:        list of CSV rows (lists of strings)
        tests:       name filters, see runtest()
        test_root:   directory containing one sub-directory per test
        vcodec, dectype, ffmpeg_exec: passed through to testone()
    """
    # Text printed for an unexpected non-zero testone() result
    fail_text = {1: ": * FAIL *", 2: ": * CRASH *", 3: ": * MD5 MISSING *"}

    unx_failures = []
    unx_success = []
    failures = 0
    successes = 0
    for a in csva:
        if not a:
            # csv.reader yields [] for a blank line
            continue
        exp_test = int(a[0])
        if not (exp_test and runtest(a[1], tests)):
            continue

        name = a[1]
        # The pix column is optional; the 4-column CSVs do not have it
        pix = a[4] if len(a) > 4 else ""

        print ("==== ", name, end="")
        sys.stdout.flush()

        rv = testone(os.path.join(test_root, name), name, a[2], a[3], pix, dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
        if rv == 0:
            successes += 1
        else:
            failures += 1

        if rv == 0:
            if exp_test == 2:
                print(": * OK *")
                unx_success.append(name)
            else:
                print(": ok")
        elif exp_test == 2 and rv == 1:
            print(": fail")
        elif exp_test == 3 and rv == 2:
            # Call an expected "crash" an abort
            print(": abort")
        else:
            unx_failures.append(name)
            print(fail_text.get(rv, ": * BANG *"))

    if unx_failures or unx_success:
        print("Unexpected Failures:", unx_failures)
        print("Unexpected Success: ", unx_success)
    else:
        print("All tests normal:", successes, "ok,", failures, "failed")


class ConfCSVDialect(csv.Dialect):
    """CSV dialect of the conformance lists.

    Comma separated, '"' quoting, and whitespace after a delimiter is
    dropped so that ", # comment" columns parse without the leading space.
    """
    delimiter = ','
    doublequote = True
    lineterminator = '\n'
    quotechar='"'
    quoting = csv.QUOTE_MINIMAL
    skipinitialspace = True
    strict = True
argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line") + argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line") + argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line") + argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") + argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use") + argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name") + args = argp.parse_args() + + if args.csvgen: + csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + dectype = CODEC_HEVC_RPI + if os.path.exists("/dev/rpivid-hevcmem"): + dectype = HWACCEL_RPI + if args.drm or os.path.exists("/sys/module/rpivid_hevc"): + dectype = HWACCEL_DRM + + if args.pi4: + dectype = HWACCEL_RPI + elif args.drm: + dectype = HWACCEL_DRM + elif args.vaapi: + dectype = HWACCEL_VAAPI + + doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg) + diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py new file mode 100755 index 0000000000..65c5224cd8 --- /dev/null +++ b/pi-util/ffperf.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 + +import time +import string +import os +import tempfile +import subprocess +import re +import argparse +import sys +import csv +from stat import * + +class tstats: + close_threshold = 0.01 + + def __init__(self, stats_dict=None): + if stats_dict != None: + self.name = stats_dict["name"] + self.elapsed = float(stats_dict["elapsed"]) + self.user = float(stats_dict["user"]) + self.sys = float(stats_dict["sys"]) + + def times_str(self): + ctime = self.sys + self.user + return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 
def common_prefix(s1, s2):
    """Return the longest string that both *s1* and *s2* start with.

    Returns "" when the strings share no leading characters, including
    when either of them is empty.
    """
    n = 0
    for c1, c2 in zip(s1, s2):
        if c1 != c2:
            break
        n += 1
    return s1[:n]
+""") + + argp.add_argument("streams", nargs='*') + argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") + argp.add_argument("--csv_in", help="CSV input filename") + argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") + argp.add_argument("--repeat", default=3, type=int, help="Run repeat count") + argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable") + + args = argp.parse_args() + + csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) + csv_out.writeheader() + + stats_in = {} + if args.csv_in != None: + with open(args.csv_in, 'r', newline='') as f_in: + stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} + + flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") + + streams = args.streams + if not streams: + if not stats_in: + print ("No source streams specified") + return 1 + prefix = "" if args.prefix == None else args.prefix + streams = [k for k in stats_in] + elif args.prefix != None: + prefix = args.prefix + else: + prefix = streams[0] + for f in streams[1:]: + prefix = common_prefix(prefix, f) + pp = prefix.rpartition(os.sep) + prefix = pp[0] + pp[1] + streams = [s[len(prefix):] for s in streams] + + for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): + print ("====", f) + + t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) + for i in range(args.repeat): + t = tstats.time_file(f, prefix, args.ffmpeg) + print ("...", t.times_str()) + if t0 > t: + t0 = t + + if t0.name in stats_in: + pstat = stats_in[t0.name] + print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) + + csv_out.writerow(t0.dict()) + + print () + + return 0 + + +if __name__ == '__main__': + exit(main()) + diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh new file mode 100755 index 0000000000..0948a68a7a --- /dev/null +++ b/pi-util/genpatch.sh @@ -0,0 +1,35 @@ 
#!/usr/bin/env python3

# Usage
#   make_array file.bin
# Produces file.h with array of bytes.
#
import os
import sys


def make_array(path):
    """Write a C header next to *path* holding its bytes as an array.

    ``dir/name.bin`` produces ``dir/name.h`` containing
    ``static const unsigned char name [] = {...};`` with eight bytes per
    line; each full line is followed by a comment giving its offset.

    Parameters:
        path: file to convert; its extension must be ".bin"

    Returns:
        the path of the header that was written

    Raises:
        ValueError: if *path* does not end in ".bin"
    """
    # splitext only looks at the final extension, so directory names
    # containing dots (and "./x.bin") are handled
    prefix, suffix = os.path.splitext(path)
    if suffix != '.bin':
        raise ValueError("expected a .bin file, got %r" % (path,))
    name = os.path.basename(prefix)
    print('Converting', path)

    with open(path, 'rb') as fd:
        data = fd.read()

    out_path = prefix + '.h'
    with open(out_path, 'w') as out:
        print('static const unsigned char', name, '[] = {', file=out)
        for offset in range(0, len(data), 8):
            row = data[offset:offset + 8]
            text = ' '.join('0x%02x,' % b for b in row)
            if len(row) == 8:
                text += '  // %04x' % offset
            print(text, file=out)
        print('};', file=out)
    return out_path


def main(argv=None):
    """Convert every file named in *argv* (default: sys.argv[1:])."""
    for path in (sys.argv[1:] if argv is None else argv):
        make_array(path)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3

import time
import string
import os
import tempfile
import subprocess
import re
import argparse
import sys
import csv
from stat import *

class tstats:
    """Timing record for one stream: wall-clock, user and system seconds."""

    # Two elapsed times closer than this fraction are treated as equal
    close_threshold = 0.01

    def __init__(self, stats_dict=None):
        """Build from a CSV row dict with keys name/elapsed/user/sys.

        With no dict the attributes are left unset for the caller to fill.
        """
        if stats_dict is not None:
            self.name = stats_dict["name"]
            self.elapsed = float(stats_dict["elapsed"])
            self.user = float(stats_dict["user"])
            self.sys = float(stats_dict["sys"])

    def times_str(self):
        """Return elapsed time, cpu time and cpu as a percentage of elapsed."""
        ctime = self.sys + self.user
        # A zero elapsed time would otherwise raise ZeroDivisionError
        pcent = (ctime * 100.0) / self.elapsed if self.elapsed else 0.0
        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, pcent)

    def dict(self):
        """Return the record as a dict suitable for csv.DictWriter."""
        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}

    def is_close(self, other):
        """True when *other*'s elapsed time is within close_threshold of ours."""
        if not self.elapsed:
            # No relative difference can be formed against zero
            return not other.elapsed
        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold

    def __lt__(self, other):
        return self.elapsed < other.elapsed
    def __gt__(self, other):
        return self.elapsed > other.elapsed

    @staticmethod
    def time_file(name, prefix, log=None):
        """Time ./ffmpeg decoding the first 30s of ``prefix + name``.

        Parameters:
            name:   stream name, stored in the returned record
            prefix: string prepended to name to form the input path
            log:    file object (or subprocess constant) receiving ffmpeg's
                    stdout and stderr.  Defaults to a module-level ``flog``
                    if one exists, otherwise output is discarded.

        Returns:
            a tstats holding elapsed wall-clock time and the child's
            user/system cpu time.
        """
        if log is None:
            # This file never defines flog, so the bare global lookup used
            # previously raised NameError
            log = globals().get("flog", subprocess.DEVNULL)
        stats = tstats()
        stats.name = name
        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
                                  "-f", "null", os.devnull], bufsize=-1, stdout=log, stderr=log)
        # wait4 rather than Popen.wait so that the child's rusage is returned
        pinfo = os.wait4(cproc.pid, 0)
        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
        stats.elapsed = end_time - start_time
        stats.user = pinfo[2].ru_utime
        stats.sys = pinfo[2].ru_stime
        return stats


def common_prefix(s1, s2):
    """Return the longest string that both *s1* and *s2* start with.

    Returns "" when nothing is shared, including when either is empty.
    """
    n = 0
    for c1, c2 in zip(s1, s2):
        if c1 != c2:
            break
        n += 1
    return s1[:n]
#!/usr/bin/env python3

import sys
import argparse
import re

def do_logparse(logname):
    """Summarise VPU/QPU busy time from a VideoCore log and print it.

    Lines of the form ``<time>: [done ]<unit> <OP> cb:<hex> `` mark the
    start and (with "done ") the end of an operation on vpu0, vpu1 or
    qpu1.  Busy time is accumulated per unit and time with no operation
    outstanding is accumulated as "idle".  qpu1 operations are numbered
    within a batch and reported as qpu1.<n>.  "v3d:" counter lines (QPU
    cycles, TMU stall cycles, L2 cache totals) are summed against the
    current qpu1.<n>.

    Parameters:
        logname: path of the log text file

    Prints the summary to stdout; returns nothing.
    """
    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')

    def percent(part, whole):
        # Guard the ratios printed below against a zero denominator
        return (part * 100.0) / whole if whole else 0.0

    ttotal = {'idle':0.0}   # unit -> accumulated busy seconds
    tstart = {}             # unit -> start time of its outstanding op
    qctotal = {}            # unit -> QPU clock cycles
    qtstotal = {}           # unit -> cycles stalled on TMU
    l2hits = {}             # unit -> L2 cache hits
    l2total = {}            # unit -> L2 cache accesses
    time0 = None            # time of first counted record
    last_time = None        # time of most recent op record
    idle_start = None       # time everything went idle, None if busy
    qpu_op_no = 0
    op_count = 0

    with open(logname, "rt") as infile:
        for line in infile:
            match = rmatch.match(line)
            if match:
                last_time = float(match.group(1))
                unit = match.group(3)
                opstart = not match.group(2)
                optype = match.group(7)
                hascb = match.group(8) != "0"

                if unit == 'qpu1':
                    unit = unit + "." + str(qpu_op_no)
                    if not opstart:
                        # A callback or a sync ends the batch; otherwise
                        # the next qpu1 op is the next one in this batch
                        if hascb or optype == 'EXECUTE_SYNC':
                            qpu_op_no = 0
                        else:
                            qpu_op_no += 1

                # Ignore sync type
                if optype == 'EXECUTE_SYNC':
                    continue

                # Compare with None: a 0.000 timestamp is a valid time
                if time0 is None:
                    time0 = last_time

                if opstart:
                    tstart[unit] = last_time
                elif unit in tstart:
                    op_count += 1
                    ttotal[unit] = ttotal.get(unit, 0.0) + (last_time - tstart[unit])
                    del tstart[unit]

                if idle_start is None and not tstart:
                    idle_start = last_time
                elif idle_start is not None and tstart:
                    ttotal['idle'] += last_time - idle_start
                    idle_start = None

            match = rqcycle.match(line)
            if match:
                unit = "qpu1." + str(qpu_op_no)
                qctotal[unit] = qctotal.get(unit, 0) + int(match.group(2))

            match = rqtscycle.match(line)
            if match:
                unit = "qpu1." + str(qpu_op_no)
                qtstotal[unit] = qtstotal.get(unit, 0) + int(match.group(2))

            match = rl2hits.match(line)
            if match:
                unit = "qpu1." + str(qpu_op_no)
                if unit not in l2total:
                    l2total[unit] = 0
                    l2hits[unit] = 0
                l2total[unit] += int(match.group(3))
                if match.group(2) == "hits":
                    l2hits[unit] += int(match.group(3))

    if time0 is None:
        print("No v3d profile records found")
    else:
        tlogged = last_time - time0

        print("Logged time:", tlogged, " Op count:", op_count)
        for unit in sorted(ttotal):
            print('%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], percent(ttotal[unit], tlogged)))
        print()
        for unit in sorted(qctotal):
            stalls = qtstotal.get(unit, 0)
            print('%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], stalls, percent(stalls, qctotal[unit])))
            if unit in l2total:
                print(' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], percent(l2hits[unit], l2total[unit])))



if __name__ == '__main__':
    argp = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="QPU/VPU perf summary from VC logging",
        epilog = """
Will also summarise TMU stalls if logging requests set in qpu noflush param
in the profiled code.

Example use:
  vcgencmd set_logging level=0xc0

  sudo vcdbg log msg >& t.log
  v3dusage.py t.log
""")

    argp.add_argument("logfile")
    args = argp.parse_args()

    do_logparse(args.logfile)