diff --git a/.hgtags b/.hgtags index 0e65da4646..4a074b6f72 100644 --- a/.hgtags +++ b/.hgtags @@ -22,3 +22,4 @@ e27327f5da35c5feb660360336fdc94bd0afe719 1.8 981e3bfef16a997bce6f46ce1b15631a0e234747 2.1 be14a7e9755e54f0fd34911c72bdfa66981220bc 2.2 3037c1448549ca920967831482c653e5892fa8ed 2.3 +e7a4dd48293b7956d4a20df257d23904cc78e376 2.4 diff --git a/doc/reST/api.rst b/doc/reST/api.rst index df9f380357..1706ab731e 100644 --- a/doc/reST/api.rst +++ b/doc/reST/api.rst @@ -192,6 +192,12 @@ changes made to the parameters for auto-detection and other reasons:: * presets is not recommended without a more fine-grained breakdown of * parameters to take this into account. */ int x265_encoder_reconfig(x265_encoder *, x265_param *); +**x265_encoder_ctu_info** + /* x265_encoder_ctu_info: + * Copy CTU information such as ctu address and ctu partition structure of all + * CTUs in each frame. The function is invoked only if "--ctu-info" is enabled and + * the encoder will wait for this copy to complete if enabled. + */ Pictures ======== @@ -341,6 +347,14 @@ statistics from the encoder:: Cleanup ======= +At the end of the encode, the application will want to trigger logging +of the final encode statistics, if :option:`--csv` had been specified:: + + /* x265_encoder_log: + * write a line to the configured CSV file. If a CSV filename was not + * configured, or file open failed, this function will perform no write. */ + void x265_encoder_log(x265_encoder *encoder, int argc, char **argv); + Finally, the encoder must be closed in order to free all of its resources. An encoder that has been flushed cannot be restarted and reused. Once **x265_encoder_close()** has been called, the encoder diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 2543bfc41a..f0a5ff8a13 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -52,8 +52,7 @@ Command line executable return codes:: 2. unable to open encoder 3. unable to generate stream headers 4. encoder abort - 5. 
unable to open csv file - + Logging/Statistic Options ========================= @@ -83,9 +82,66 @@ Logging/Statistic Options it adds one line per run. If :option:`--csv-log-level` is greater than 0, it writes one line per frame. Default none - Several frame performance statistics are available when - :option:`--csv-log-level` is greater than or equal to 2: - + The following statistics are available when :option:`--csv-log-level` is + greater than or equal to 1: + + **Encode Order** The frame order in which the encoder encodes. + + **Type** Slice type of the frame. + + **POC** Picture Order Count - The display order of the frames. + + **QP** Quantization Parameter decided for the frame. + + **Bits** Number of bits consumed by the frame. + + **Scenecut** 1 if the frame is a scenecut, 0 otherwise. + + **RateFactor** Applicable only when CRF is enabled. The rate factor depends + on the CRF given by the user. This is used to determine the QP so as to + target a certain quality. + + **BufferFill** Bits available for the next frame. Includes bits carried + over from the current frame. + + **Latency** Latency in terms of number of frames between when the frame + was given in and when the frame is given out. + + **PSNR** Peak signal to noise ratio for Y, U and V planes. + + **SSIM** A quality metric that denotes the structural similarity between frames. + + **Ref lists** POC of references in lists 0 and 1 for the frame. + + Several statistics about the encoded bitstream and encoder performance are + available when :option:`--csv-log-level` is greater than or equal to 2: + + **I/P cost ratio:** The ratio between the cost when a frame is decided as an + I frame to that when it is decided as a P frame as computed from the + quarter-resolution frame in look-ahead. This, in combination with other parameters + such as position of the frame in the GOP, is used to decide scene transitions. + + **Analysis statistics:** + + **CU Statistics** percentage of CU modes. 
+ + **Distortion** Average luma and chroma distortion. Calculated as + SSE is done on fenc and recon(after quantization). + + **Psy Energy** Average psy energy calculated as the sum of absolute + difference between source and recon energy. Energy is measured by sa8d + minus SAD. + + **Residual Energy** Average residual energy. SSE is calculated on fenc + and pred(before quantization). + + **Luma/Chroma Values** minimum, maximum and average(averaged by area) + luma and chroma values of source for each frame. + + **PU Statistics** percentage of PU modes at each depth. + + **Performance statistics:** + **DecideWait ms** number of milliseconds the frame encoder had to wait, since the previous frame was retrieved by the API thread, before a new frame has been given to it. This is the latency @@ -111,6 +167,8 @@ Logging/Statistic Options **Stall Time ms** the number of milliseconds of the reported wall time that were spent with zero worker threads, aka all compression was completely stalled. + + **Total frame time** Total time spent to encode the frame. **Avg WPP** the average number of worker threads working on this frame, at any given time. This value is sampled at the completion of @@ -123,8 +181,6 @@ Logging/Statistic Options is more of a problem for P frames where some blocks are much more expensive than others. - **CLI ONLY** - .. option:: --csv-log-level <integer> Controls the level of detail (and size) of --csv log files @@ -133,8 +189,6 @@ Logging/Statistic Options 1. frame level logging 2. frame level logging with performance statistics - **CLI ONLY** - .. option:: --ssim, --no-ssim Calculate and report Structural Similarity values. It is @@ -795,33 +849,31 @@ the prediction quad-tree. Analysis re-use options, to improve performance when encoding the same sequence multiple times (presumably at varying bitrates). The encoder -will not reuse analysis if the resolution and slice type parameters do -not match. 
+will not reuse analysis if slice type parameters do not match. -.. option:: --analysis-mode <string|int> +.. option:: --analysis-reuse-mode <string|int> - Specify whether analysis information of each frame is output by encoder - or input for reuse. By reading the analysis data writen by an - earlier encode of the same sequence, substantial redundant work may - be avoided. - - The following data may be stored and reused: - I frames - split decisions and luma intra directions of all CUs. - P/B frames - motion vectors are dumped at each depth for all CUs. + This option allows reuse of analysis information from first pass to second pass. + :option:`--analysis-reuse-mode save` specifies that encoder outputs analysis information of each frame. + :option:`--analysis-reuse-mode load` specifies that encoder reuses analysis information from first pass. + There is no benefit using load mode without running encoder in save mode. Analysis data from save mode is + written to a file specified by :option:`--analysis-reuse-file`. The amount of analysis data stored/reused + is determined by :option:`--analysis-reuse-level`. By reading the analysis data written by an earlier encode + of the same sequence, substantial redundant work may be avoided. Requires cutree, pmode to be off. Default 0. **Values:** off(0), save(1): dump analysis data, load(2): read analysis data -.. option:: --analysis-file <filename> +.. option:: --analysis-reuse-file <filename> - Specify a filename for analysis data (see :option:`--analysis-mode`) + Specify a filename for analysis data (see :option:`--analysis-reuse-mode`) If no filename is specified, x265_analysis.dat is used. -.. option:: --refine-level <1..10> +.. option:: --analysis-reuse-level <1..10> - Amount of information stored/reused in :option:`--analysis-mode` is distributed across levels. + Amount of information stored/reused in :option:`--analysis-reuse-mode` is distributed across levels. 
Higher the value, higher the information stored/reused, faster the encode. Default 5. - Note that --refine-level must be paired with analysis-mode. + Note that --analysis-reuse-level must be paired with analysis-reuse-mode. +--------+-----------------------------------------+ | Level | Description | @@ -835,6 +887,41 @@ not match. | 10 | Level 5 + Full CU analysis-info | +--------+-----------------------------------------+ +.. option:: --scale-factor + + Factor by which input video is scaled down for analysis save mode. + This option should be coupled with analysis-reuse-mode option, --analysis-reuse-level 10. + The ctu size of load should be double the size of save. Default 0. + +.. option:: --refine-intra <0|1|2> + + Enables refinement of intra blocks in current encode. + + Level 0 - Forces both mode and depth from the previous encode. + + Level 1 - Evaluates all intra modes for blocks of size one smaller than + the min-cu-size of the incoming analysis data from the previous encode, + forces modes for blocks of larger size. + + Level 2 - Evaluates all intra modes for blocks of size one smaller than + the min-cu-size of the incoming analysis data from the previous encode. + For larger blocks, force only depth when angular mode is chosen by the + previous encode, force depth and mode when other intra modes are chosen. + + Default 0. + +.. option:: --refine-inter-depth + + Enables refinement of inter blocks in current encode. Evaluates all + inter modes for blocks of size one smaller than the min-cu-size of the + incoming analysis data from the previous encode. Default disabled. + +.. option:: --refine-mv + + Enables refinement of motion vector for scaled video. Evaluates the best + motion vector by searching the surrounding eight integer and subpel pixel + positions. + Options which affect the transform unit quad-tree, sometimes referred to as the residual quad-tree (RQT). 
@@ -1221,7 +1308,16 @@ Slice decision options intra cost of a frame used in scenecut detection. For example, a value of 5 indicates, if the inter cost of a frame is greater than or equal to 95 percent of the intra cost of the frame, then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5. - + +.. option:: --ctu-info <0, 1, 2, 4, 6> + + This value enables receiving CTU information asynchronously and determine reaction to the CTU information. Default 0. + 1: force the partitions if CTU information is present. + 2: functionality of (1) and reduce qp if CTU information has changed. + 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise. + This option should be enabled only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously. + If enabled without calling the API function, the encoder will wait indefinitely. + .. option:: --intra-refresh Enables Periodic Intra Refresh(PIR) instead of keyframe insertion. @@ -1491,7 +1587,11 @@ Quality, rate control and rate distortion options and also redundant steps are skipped. In pass 1 analysis information like motion vector, depth, reference and prediction modes of the final best CTU partition is stored for each CTU. - Default disabled. + Multipass analysis refinement cannot be enabled when 'analysis-save/analysis-load' option + is enabled and both will be disabled when enabled together. This feature requires 'pmode/pme' + to be disabled and hence pmode/pme will be disabled when enabled at the same time. + + Default: disabled. .. option:: --multi-pass-opt-distortion, --no-multi-pass-opt-distortion @@ -1499,7 +1599,11 @@ Quality, rate control and rate distortion options ratecontrol. In pass 1 distortion of best CTU partition is stored. CTUs with high distortion get lower(negative)qp offsets and vice-versa for low distortion CTUs in pass 2. This helps to improve the subjective quality. - Default disabled. 
+ Multipass refinement of qp cannot be enabled when 'analysis-save/analysis-load' option + is enabled and both will be disabled when enabled together. 'multi-pass-opt-distortion' + requires 'pmode/pme' to be disabled and hence pmode/pme will be disabled when enabled along with it. + + Default: disabled. .. option:: --strict-cbr, --no-strict-cbr @@ -1573,6 +1677,11 @@ Quality, rate control and rate distortion options that this option is used through the tune grain feature where a combination of param options are used to improve visual quality. +.. option:: --const-vbv, --no-const-vbv + + Enables VBV algorithm to be consistent across runs. Default disabled. + Enabled when :option:`--tune` grain is applied. + .. option:: --qblur <float> Temporally blur quants. Default 0.5 @@ -1879,7 +1988,12 @@ VUI fields must be manually specified. .. option:: --dhdr10-info <filename> - Inserts tone mapping information as an SEI message. + Inserts tone mapping information as an SEI message. It takes as input, + the path to the JSON file containing the Creative Intent Metadata + to be encoded as Dynamic Tone Mapping into the bitstream. + + Click `here <https://www.sra.samsung.com/assets/User-data-registered-itu-t-t35-SEI-message-for-ST-2094-40-v1.1.pdf>`_ + for the syntax of the metadata file. A sample JSON file is available in `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/DCIP3_4K_to_400_dynamic.json>`_ .. option:: --dhdr10-opt, --no-dhdr10-opt diff --git a/doc/reST/releasenotes.rst b/doc/reST/releasenotes.rst index 65264a101e..bf88bf2324 100644 --- a/doc/reST/releasenotes.rst +++ b/doc/reST/releasenotes.rst @@ -2,8 +2,33 @@ Release Notes ************* -Release Notes -************* +Version 2.5 +=========== + +Release date - 13th July, 2017. + +Encoder enhancements +-------------------- +1. Improved grain handling with :option:`--tune` grain option by throttling VBV operations to limit QP jumps. +2. 
Frame threads are now decided based on number of threads specified in the :option:`--pools`, as opposed to the number of hardware threads available. The mapping was also adjusted to improve quality of the encodes with minimal impact to performance. +3. CSV logging feature (enabled by :option:`--csv`) is now part of the library; it was previously part of the x265 application. Applications that integrate libx265 can now extract frame level statistics for their encodes by exercising this option in the library. +4. Globals that track min and max CU sizes, number of slices, and other parameters have now been moved into instance-specific variables. Consequently, applications that invoke multiple instances of x265 library are no longer restricted to use the same settings for these parameter options across the multiple instances. +5. x265 can now generate a separate library that exports the HDR10+ parsing API. Other libraries that wish to use this API may do so by linking against this library. Enable ENABLE_HDR10_PLUS in CMake options and build to generate this library. +6. SEA motion search receives a 10% performance boost from AVX2 optimization of its kernels. +7. The CSV log is now more elaborate with additional fields such as PU statistics, average-min-max luma and chroma values, etc. Refer to documentation of :option:`--csv` for details of all fields. +8. x86inc.asm cleaned-up for improved instruction handling. + +API changes +----------- +1. New API x265_encoder_ctu_info() introduced to specify suggested partition sizes for various CTUs in a frame. To be used in conjunction with :option:`--ctu-info` to react to the specified partitions appropriately. +2. Rate-control statistics passed through the x265_picture object for an incoming frame are now used by the encoder. +3. 
Options to scale, reuse, and refine analysis for incoming analysis shared through the x265_analysis_data field in x265_picture for runs that use :option:`--analysis-reuse-mode` load; use options :option:`--scale`, :option:`--refine-mv`, :option:`--refine-inter`, and :option:`--refine-intra` to explore. +4. VBV now has a deterministic mode. Use :option:`--const-vbv` to exercise. + +Bug fixes +--------- +1. Several fixes for HDR10+ parsing code including incompatibility with user-specific SEI, removal of warnings, linking issues in linux, etc. +2. SEI messages for HDR10 repeated every keyint when HDR options (:option:`--hdr-opt`, :option:`--master-display`) specified. Version 2.4 =========== diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index acdeb7b2ce..a012dd48a5 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 116) +set(X265_BUILD 130) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" @@ -182,12 +182,19 @@ if(CC STREQUAL "xlc") add_definitions(-O3 -qstrict -qhot -qaltivec) add_definitions(-qinline=level=10 -qpath=IL:/data/video_files/latest.tpo/) endif() - - +# this option is to enable the inclusion of dynamic HDR10 library to the libx265 compilation +option(ENABLE_HDR10_PLUS "Enable dynamic HDR10 compilation" OFF) if(GCC) add_definitions(-Wall -Wextra -Wshadow) add_definitions(-D__STDC_LIMIT_MACROS=1) - add_definitions(-std=gnu++98) + if(ENABLE_HDR10_PLUS) + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8") + message(FATAL_ERROR "gcc version above 4.8 required to support hdr10plus") + endif() + add_definitions(-std=gnu++11) + else() + add_definitions(-std=gnu++98) 
+ endif() if(ENABLE_PIC) add_definitions(-fPIC) endif(ENABLE_PIC) @@ -363,14 +370,12 @@ if(HIGH_BIT_DEPTH) else(HIGH_BIT_DEPTH) add_definitions(-DHIGH_BIT_DEPTH=0 -DX265_DEPTH=8) endif(HIGH_BIT_DEPTH) -# this option is to enable the inclusion of dynamic HDR10 library to the libx265 compilation -option(ENABLE_DYNAMIC_HDR10 "Enable dynamic HDR10 compilation" OFF) -if (ENABLE_DYNAMIC_HDR10) - add_subdirectory(dynamicHDR10) - include_directories(dynamicHDR10) - add_definitions(-DENABLE_DYNAMIC_HDR10) -endif(ENABLE_DYNAMIC_HDR10) +if (ENABLE_HDR10_PLUS) + include_directories(. dynamicHDR10 "${PROJECT_BINARY_DIR}") + add_subdirectory(dynamicHDR10) + add_definitions(-DENABLE_HDR10_PLUS) +endif(ENABLE_HDR10_PLUS) # this option can only be used when linking multiple libx265 libraries # together, and some alternate API access method is implemented. option(EXPORT_C_API "Implement public C programming interface" ON) @@ -510,8 +515,10 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) endif() endif() source_group(ASM FILES ${ASM_SRCS}) -if(ENABLE_DYNAMIC_HDR10) +if(ENABLE_HDR10_PLUS) add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10> ${ASM_OBJS} ${ASM_SRCS}) + add_library(hdr10plus-static STATIC $<TARGET_OBJECTS:dynamicHDR10>) + set_target_properties(hdr10plus-static PROPERTIES OUTPUT_NAME hdr10plus) else() add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS}) endif() @@ -524,6 +531,12 @@ endif() install(TARGETS x265-static LIBRARY DESTINATION ${LIB_INSTALL_DIR} ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) + +if(ENABLE_HDR10_PLUS) + install(TARGETS hdr10plus-static + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) +endif() install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include) if(CMAKE_RC_COMPILER) @@ -547,10 +560,16 @@ if(NOT (MSVC_IDE OR XCODE)) endif() option(ENABLE_SHARED "Build shared library" ON) 
if(ENABLE_SHARED) - - if(ENABLE_DYNAMIC_HDR10) + if(ENABLE_HDR10_PLUS) add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS} ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10>) + add_library(hdr10plus-shared SHARED $<TARGET_OBJECTS:dynamicHDR10>) + + if(MSVC) + set_target_properties(hdr10plus-shared PROPERTIES OUTPUT_NAME libhdr10plus) + else() + set_target_properties(hdr10plus-shared PROPERTIES OUTPUT_NAME hdr10plus) + endif() else() add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS} ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>) @@ -585,6 +604,11 @@ if(ENABLE_SHARED) ARCHIVE DESTINATION ${LIB_INSTALL_DIR} RUNTIME DESTINATION ${BIN_INSTALL_DIR}) endif() + if(ENABLE_HDR10_PLUS) + install(TARGETS hdr10plus-shared + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) + endif() if(LINKER_OPTIONS) # set_target_properties can't do list expansion string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}") @@ -646,18 +670,18 @@ if(ENABLE_CLI) endif(WIN32) if(XCODE) # Xcode seems unable to link the CLI with libs, so link as one targget - if(ENABLE_DYNAMIC_HDR10) + if(ENABLE_HDR10_PLUS) add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} - x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp + x265.cpp x265.h x265cli.h $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10> ${ASM_OBJS} ${ASM_SRCS}) else() add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} - x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp + x265.cpp x265.h x265cli.h $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS}) endif() else() add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE} - ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp) + ${ExportDefs} x265.cpp x265.h x265cli.h) if(WIN32 OR NOT ENABLE_SHARED OR 
INTEL_CXX) # The CLI cannot link to the shared library on Windows, it # requires internal APIs not exported from the DLL diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 102ef227a7..541abe6d51 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -57,10 +57,10 @@ if(ENABLE_ASSEMBLY AND X86) set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) source_group(Intrinsics FILES ${VEC_PRIMITIVES}) - set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm - pixeladd8.asm dct8.asm) + pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() diff --git a/source/common/common.h b/source/common/common.h index a7daf1d176..82f5ccd970 100644 --- a/source/common/common.h +++ b/source/common/common.h @@ -259,7 +259,6 @@ typedef int16_t coeff_t; // transform coefficient #define LOG2_RASTER_SIZE (MAX_LOG2_CU_SIZE - LOG2_UNIT_SIZE) #define RASTER_SIZE (1 << LOG2_RASTER_SIZE) #define MAX_NUM_PARTITIONS (RASTER_SIZE * RASTER_SIZE) -#define NUM_4x4_PARTITIONS (1U << (g_unitSizeDepth << 1)) // number of 4x4 units in max CU size #define MIN_PU_SIZE 4 #define MIN_TU_SIZE 4 diff --git a/source/common/constants.cpp b/source/common/constants.cpp index e360793e2f..be1c926493 100644 --- a/source/common/constants.cpp +++ b/source/common/constants.cpp @@ -161,7 +161,6 @@ const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] = 65535 }; -int g_ctuSizeConfigured = 0; uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE; uint32_t g_maxCUSize = MAX_CU_SIZE; uint32_t g_unitSizeDepth = NUM_CU_DEPTH; diff --git a/source/common/constants.h b/source/common/constants.h index f8b5d857d7..93731f470a 100644 --- 
a/source/common/constants.h +++ b/source/common/constants.h @@ -30,8 +30,6 @@ namespace X265_NS { // private namespace -extern int g_ctuSizeConfigured; - extern double x265_lambda_tab[QP_MAX_MAX + 1]; extern double x265_lambda2_tab[QP_MAX_MAX + 1]; extern const uint16_t x265_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET + 1]; diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp index 7d51abfb89..1f17778d60 100644 --- a/source/common/cpu.cpp +++ b/source/common/cpu.cpp @@ -69,6 +69,7 @@ const cpu_name_t cpu_names[] = { "SSE2Slow", SSE2 | X265_CPU_SSE2_IS_SLOW }, { "SSE2", SSE2 }, { "SSE2Fast", SSE2 | X265_CPU_SSE2_IS_FAST }, + { "LZCNT", X265_CPU_LZCNT }, { "SSE3", SSE2 | X265_CPU_SSE3 }, { "SSSE3", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 }, { "SSE4.1", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 }, @@ -78,16 +79,17 @@ const cpu_name_t cpu_names[] = { "AVX", AVX }, { "XOP", AVX | X265_CPU_XOP }, { "FMA4", AVX | X265_CPU_FMA4 }, - { "AVX2", AVX | X265_CPU_AVX2 }, { "FMA3", AVX | X265_CPU_FMA3 }, + { "BMI1", AVX | X265_CPU_LZCNT | X265_CPU_BMI1 }, + { "BMI2", AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 }, +#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2 + { "AVX2", AVX2}, +#undef AVX2 #undef AVX #undef SSE2 #undef MMX2 { "Cache32", X265_CPU_CACHELINE_32 }, { "Cache64", X265_CPU_CACHELINE_64 }, - { "LZCNT", X265_CPU_LZCNT }, - { "BMI1", X265_CPU_BMI1 }, - { "BMI2", X265_CPU_BMI1 | X265_CPU_BMI2 }, { "SlowCTZ", X265_CPU_SLOW_CTZ }, { "SlowAtom", X265_CPU_SLOW_ATOM }, { "SlowPshufb", X265_CPU_SLOW_PSHUFB }, diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp index 639f6d60cb..7e69d8788b 100644 --- a/source/common/cudata.cpp +++ b/source/common/cudata.cpp @@ -28,6 +28,7 @@ #include "picyuv.h" #include "mv.h" #include "cudata.h" +#define MAX_MV 1 << 14 using namespace X265_NS; @@ -110,25 +111,23 @@ inline MV scaleMv(MV mv, int scale) } -cubcast_t CUData::s_partSet[NUM_FULL_DEPTH] = { 
NULL, NULL, NULL, NULL, NULL }; -uint32_t CUData::s_numPartInCUSize; - CUData::CUData() { memset(this, 0, sizeof(*this)); } -void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance) +void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance) { + int csp = param.internalCsp; m_chromaFormat = csp; m_hChromaShift = CHROMA_H_SHIFT(csp); m_vChromaShift = CHROMA_V_SHIFT(csp); - m_numPartitions = NUM_4x4_PARTITIONS >> (depth * 2); + m_numPartitions = param.num4x4Partitions >> (depth * 2); if (!s_partSet[0]) { - s_numPartInCUSize = 1 << g_unitSizeDepth; - switch (g_maxLog2CUSize) + s_numPartInCUSize = 1 << param.unitSizeDepth; + switch (param.maxLog2CUSize) { case 6: s_partSet[0] = bcast256; @@ -220,7 +219,7 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, m_distortion = dataPool.distortionMemBlock + instance * m_numPartitions; - uint32_t cuSize = g_maxCUSize >> depth; + uint32_t cuSize = param.maxCUSize >> depth; m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize); m_trCoeff[1] = m_trCoeff[2] = 0; m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0; @@ -262,7 +261,7 @@ void CUData::initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, m_distortion = dataPool.distortionMemBlock + instance * m_numPartitions; - uint32_t cuSize = g_maxCUSize >> depth; + uint32_t cuSize = param.maxCUSize >> depth; uint32_t sizeL = cuSize * cuSize; uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2); @@ -278,17 +277,17 @@ void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t first m_encData = frame.m_encData; m_slice = m_encData->m_slice; m_cuAddr = cuAddr; - m_cuPelX = (cuAddr % m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; - m_cuPelY = (cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; + 
m_cuPelX = (cuAddr % m_slice->m_sps->numCuInWidth) << m_slice->m_param->maxLog2CUSize; + m_cuPelY = (cuAddr / m_slice->m_sps->numCuInWidth) << m_slice->m_param->maxLog2CUSize; m_absIdxInCTU = 0; - m_numPartitions = NUM_4x4_PARTITIONS; + m_numPartitions = m_encData->m_param->num4x4Partitions; m_bFirstRowInSlice = (uint8_t)firstRowInSlice; m_bLastRowInSlice = (uint8_t)lastRowInSlice; m_bLastCuInSlice = (uint8_t)lastCuInSlice; /* sequential memsets */ m_partSet((uint8_t*)m_qp, (uint8_t)qp); - m_partSet(m_log2CUSize, (uint8_t)g_maxLog2CUSize); + m_partSet(m_log2CUSize, (uint8_t)m_slice->m_param->maxLog2CUSize); m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX); m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX); m_partSet(m_tqBypass, (uint8_t)frame.m_encData->m_param->bLossless); @@ -390,7 +389,7 @@ void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t memcpy(m_distortion + offset, subCU.m_distortion, childGeom.numPartitions * sizeof(sse_t)); - uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2); + uint32_t tmp = 1 << ((m_slice->m_param->maxLog2CUSize - childGeom.depth) * 2); uint32_t tmp2 = subPartIdx * tmp; memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp); @@ -489,7 +488,7 @@ void CUData::copyToPic(uint32_t depth) const memcpy(ctu.m_distortion + m_absIdxInCTU, m_distortion, m_numPartitions * sizeof(sse_t)); - uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2); + uint32_t tmpY = 1 << ((m_slice->m_param->maxLog2CUSize - depth) * 2); uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2); memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY); @@ -568,7 +567,7 @@ void CUData::updatePic(uint32_t depth, int picCsp) const m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth); m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]); - uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2); + uint32_t tmpY = 1 << ((m_slice->m_param->maxLog2CUSize - depth) * 2); uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2); 
memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY); @@ -656,7 +655,7 @@ const CUData* CUData::getPUAboveLeft(uint32_t& alPartUnitIdx, uint32_t curPartUn return m_cuLeft; } - alPartUnitIdx = NUM_4x4_PARTITIONS - 1; + alPartUnitIdx = m_encData->m_param->num4x4Partitions - 1; return m_cuAboveLeft; } @@ -799,7 +798,7 @@ const CUData* CUData::getPUAboveRightAdi(uint32_t& arPartUnitIdx, uint32_t curPa /* Get left QpMinCu */ const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxInCTU) const { - uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (m_encData->m_param->unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; // check for left CTU boundary @@ -816,7 +815,7 @@ const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxI /* Get above QpMinCu */ const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdxInCTU) const { - uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (m_encData->m_param->unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; // check for top CTU boundary @@ -855,7 +854,7 @@ int CUData::getLastValidPartIdx(int absPartIdx) const int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const { - uint32_t quPartIdxMask = 0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2; + uint32_t quPartIdxMask = 0xFF << (m_encData->m_param->unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2; int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask); if (lastValidPartIdx >= 0) @@ -865,7 +864,7 @@ int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const if (m_absIdxInCTU) return 
m_encData->getPicCTU(m_cuAddr)->getLastCodedQP(m_absIdxInCTU); else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth))) - return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_4x4_PARTITIONS); + return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(m_encData->m_param->num4x4Partitions); else return (int8_t)m_slice->m_sliceQp; } @@ -997,7 +996,7 @@ uint32_t CUData::getCtxSkipFlag(uint32_t absPartIdx) const bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth) { - uint32_t curPartNumb = NUM_4x4_PARTITIONS >> (depth << 1); + uint32_t curPartNumb = m_encData->m_param->num4x4Partitions >> (depth << 1); uint32_t curPartNumQ = curPartNumb >> 2; if (m_cuDepth[absPartIdx] > depth) @@ -1623,6 +1622,11 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV dir |= (1 << list); candMvField[count][list].mv = colmv; candMvField[count][list].refIdx = refIdx; + if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_log2CUSize[0] < 4) + { + MV dist(MAX_MV, MAX_MV); + candMvField[count][list].mv = dist; + } } } @@ -1783,7 +1787,13 @@ int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refI int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; int curPOC = m_slice->m_poc; - pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC); + if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisReuseMode == X265_ANALYSIS_SAVE && (m_log2CUSize[0] < 4)) + { + MV dist(MAX_MV, MAX_MV); + pmv[numMvc++] = amvpCand[num++] = dist; + } + else + pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC); } } @@ -1905,10 +1915,10 @@ void CUData::clipMv(MV& outMV) const uint32_t offset = 8; int16_t xmax = (int16_t)((m_slice->m_sps->picWidthInLumaSamples + offset - 
m_cuPelX - 1) << mvshift); - int16_t xmin = -(int16_t)((g_maxCUSize + offset + m_cuPelX - 1) << mvshift); + int16_t xmin = -(int16_t)((m_encData->m_param->maxCUSize + offset + m_cuPelX - 1) << mvshift); int16_t ymax = (int16_t)((m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift); - int16_t ymin = -(int16_t)((g_maxCUSize + offset + m_cuPelY - 1) << mvshift); + int16_t ymin = -(int16_t)((m_encData->m_param->maxCUSize + offset + m_cuPelY - 1) << mvshift); outMV.x = X265_MIN(xmax, X265_MAX(xmin, outMV.x)); outMV.y = X265_MIN(ymax, X265_MAX(ymin, outMV.y)); @@ -2090,6 +2100,8 @@ void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uin void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) { + uint32_t num4x4Partition = (1U << ((g_log2Size[maxCUSize] - LOG2_UNIT_SIZE) << 1)); + // Initialize the coding blocks inside the CTB for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= g_log2Size[minCUSize]; log2CUSize--) { @@ -2118,7 +2130,7 @@ void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUS cu->log2CUSize = log2CUSize; cu->childOffset = childIdx - cuIdx; cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4; - cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2)); + cu->numPartitions = (num4x4Partition >> ((g_log2Size[maxCUSize] - cu->log2CUSize) * 2)); cu->depth = g_log2Size[maxCUSize] - log2CUSize; cu->geomRecurId = cuIdx; diff --git a/source/common/cudata.h b/source/common/cudata.h index adb30828d7..b3e6f302a9 100644 --- a/source/common/cudata.h +++ b/source/common/cudata.h @@ -161,8 +161,8 @@ class CUData { public: - static cubcast_t s_partSet[NUM_FULL_DEPTH]; // pointer to broadcast set functions per absolute depth - static uint32_t s_numPartInCUSize; + cubcast_t s_partSet[NUM_FULL_DEPTH]; // pointer to broadcast set functions per absolute depth + 
uint32_t s_numPartInCUSize; bool m_vbvAffected; @@ -225,7 +225,7 @@ class CUData CUData(); - void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance); + void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance); static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]); void initCTU(const Frame& frame, uint32_t cuAddr, int qp, uint32_t firstRowInSlice, uint32_t lastRowInSlice, uint32_t lastCUInSlice); @@ -271,7 +271,7 @@ class CUData void getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const; uint32_t getBestRefIdx(uint32_t subPartIdx) const { return ((m_interDir[subPartIdx] & 1) << m_refIdx[0][subPartIdx]) | (((m_interDir[subPartIdx] >> 1) & 1) << (m_refIdx[1][subPartIdx] + 16)); } - uint32_t getPUOffset(uint32_t puIdx, uint32_t absPartIdx) const { return (partAddrTable[(int)m_partSize[absPartIdx]][puIdx] << (g_unitSizeDepth - m_cuDepth[absPartIdx]) * 2) >> 4; } + uint32_t getPUOffset(uint32_t puIdx, uint32_t absPartIdx) const { return (partAddrTable[(int)m_partSize[absPartIdx]][puIdx] << (m_slice->m_param->unitSizeDepth - m_cuDepth[absPartIdx]) * 2) >> 4; } uint32_t getNumPartInter(uint32_t absPartIdx) const { return nbPartsTable[(int)m_partSize[absPartIdx]]; } bool isIntra(uint32_t absPartIdx) const { return m_predMode[absPartIdx] == MODE_INTRA; } @@ -285,7 +285,7 @@ class CUData void getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const; int getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const; - uint32_t getSCUAddr() const { return (m_cuAddr << g_unitSizeDepth * 2) + m_absIdxInCTU; } + uint32_t getSCUAddr() const { return (m_cuAddr << m_slice->m_param->unitSizeDepth * 2) + m_absIdxInCTU; } uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const; uint32_t getCtxSkipFlag(uint32_t absPartIdx) const; void 
getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const; @@ -350,10 +350,10 @@ struct CUDataMemPool CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL; } - bool create(uint32_t depth, uint32_t csp, uint32_t numInstances) + bool create(uint32_t depth, uint32_t csp, uint32_t numInstances, const x265_param& param) { - uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2); - uint32_t cuSize = g_maxCUSize >> depth; + uint32_t numPartition = param.num4x4Partitions >> (depth * 2); + uint32_t cuSize = param.maxCUSize >> depth; uint32_t sizeL = cuSize * cuSize; if (csp == X265_CSP_I400) { diff --git a/source/common/frame.cpp b/source/common/frame.cpp index aefe9a6c07..3111bb9643 100644 --- a/source/common/frame.cpp +++ b/source/common/frame.cpp @@ -48,6 +48,11 @@ Frame::Frame() m_rcData = NULL; m_encodeStartTime = 0; m_reconfigureRc = false; + m_ctuInfo = NULL; + m_prevCtuInfoChange = NULL; + m_addOnDepth = NULL; + m_addOnCtuInfo = NULL; + m_addOnPrevChange = NULL; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -56,11 +61,26 @@ bool Frame::create(x265_param *param, float* quantOffsets) m_param = param; CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1); - if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) && - m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize)) + if (param->bCTUInfo) + { + uint32_t widthInCTU = (m_param->sourceWidth + param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t heightInCTU = (m_param->sourceHeight + param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t numCTUsInFrame = widthInCTU * heightInCTU; + CHECKED_MALLOC_ZERO(m_addOnDepth, uint8_t *, numCTUsInFrame); + CHECKED_MALLOC_ZERO(m_addOnCtuInfo, uint8_t *, numCTUsInFrame); + CHECKED_MALLOC_ZERO(m_addOnPrevChange, int *, numCTUsInFrame); + for (uint32_t i = 0; i < 
numCTUsInFrame; i++) + { + CHECKED_MALLOC_ZERO(m_addOnDepth[i], uint8_t, uint32_t(param->num4x4Partitions)); + CHECKED_MALLOC_ZERO(m_addOnCtuInfo[i], uint8_t, uint32_t(param->num4x4Partitions)); + CHECKED_MALLOC_ZERO(m_addOnPrevChange[i], int, uint32_t(param->num4x4Partitions)); + } + } + + if (m_fencPic->create(param) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize)) { X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); - m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1) / param->maxCUSize; m_reconRowFlag = new ThreadSafeInteger[m_numRows]; m_reconColCount = new ThreadSafeInteger[m_numRows]; @@ -86,12 +106,12 @@ bool Frame::allocEncodeData(x265_param *param, const SPS& sps) m_reconPic = new PicYuv; m_param = param; m_encData->m_reconPic = m_reconPic; - bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp); + bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param); if (ok) { /* initialize right border of m_reconpicYuv as SAO may read beyond the * end of the picture accessing uninitialized pixels */ - int maxHeight = sps.numCuInHeight * g_maxCUSize; + int maxHeight = sps.numCuInHeight * param->maxCUSize; memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight); /* use pre-calculated cu/pu offsets cached in the SPS structure */ @@ -166,6 +186,35 @@ void Frame::destroy() delete[] m_userSEI.payloads; } + if (m_ctuInfo) + { + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t numCUsInFrame = widthInCU * heightInCU; + for (uint32_t i = 0; i < numCUsInFrame; i++) + { + X265_FREE((*m_ctuInfo + i)->ctuInfo); + 
(*m_ctuInfo + i)->ctuInfo = NULL; + X265_FREE(m_addOnDepth[i]); + m_addOnDepth[i] = NULL; + X265_FREE(m_addOnCtuInfo[i]); + m_addOnCtuInfo[i] = NULL; + X265_FREE(m_addOnPrevChange[i]); + m_addOnPrevChange[i] = NULL; + } + X265_FREE(*m_ctuInfo); + *m_ctuInfo = NULL; + X265_FREE(m_ctuInfo); + m_ctuInfo = NULL; + X265_FREE(m_prevCtuInfoChange); + m_prevCtuInfoChange = NULL; + X265_FREE(m_addOnDepth); + m_addOnDepth = NULL; + X265_FREE(m_addOnCtuInfo); + m_addOnCtuInfo = NULL; + X265_FREE(m_addOnPrevChange); + m_addOnPrevChange = NULL; + } m_lowres.destroy(); X265_FREE(m_rcData); } diff --git a/source/common/frame.h b/source/common/frame.h index 0eae3fd819..0ad1173223 100644 --- a/source/common/frame.h +++ b/source/common/frame.h @@ -66,6 +66,10 @@ struct RcStats double shortTermCplxCount; int64_t totalBits; int64_t encodedBits; + double coeff[4]; + double count[4]; + double offset[4]; + double bufferFillFinal; }; class Frame @@ -108,7 +112,14 @@ class Frame x265_analysis_2Pass m_analysis2Pass; RcStats* m_rcData; + x265_ctu_info_t** m_ctuInfo; + Event m_copied; + int* m_prevCtuInfoChange; int64_t m_encodeStartTime; + + uint8_t** m_addOnDepth; + uint8_t** m_addOnCtuInfo; + int** m_addOnPrevChange; Frame(); bool create(x265_param *param, float* quantOffsets); diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp index 00a74c17f4..6292b9f628 100644 --- a/source/common/framedata.cpp +++ b/source/common/framedata.cpp @@ -41,9 +41,9 @@ bool FrameData::create(const x265_param& param, const SPS& sps, int csp) if (param.rc.bStatWrite) m_spsrps = const_cast<RPS*>(sps.spsrps); - m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame); + m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param); for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++) - m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr); + m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr); CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, 
sps.numCUsInFrame); CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight); diff --git a/source/common/framedata.h b/source/common/framedata.h index 0004a46535..d17b53fa0b 100644 --- a/source/common/framedata.h +++ b/source/common/framedata.h @@ -62,6 +62,7 @@ struct FrameStats double percentMergeCu[NUM_CU_DEPTH]; double percentIntraDistribution[NUM_CU_DEPTH][INTRA_MODES]; double percentInterDistribution[NUM_CU_DEPTH][3]; // 2Nx2N, RECT, AMP modes percentage + double ipCostRatio; uint64_t cntIntraNxN; uint64_t totalCu; @@ -78,6 +79,15 @@ struct FrameStats uint64_t cuInterDistribution[NUM_CU_DEPTH][INTER_MODES]; uint64_t cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES]; + + uint64_t totalPu[NUM_CU_DEPTH + 1]; + uint64_t cntSkipPu[NUM_CU_DEPTH]; + uint64_t cntIntraPu[NUM_CU_DEPTH]; + uint64_t cntAmp[NUM_CU_DEPTH]; + uint64_t cnt4x4; + uint64_t cntInterPu[NUM_CU_DEPTH][INTER_MODES - 1]; + uint64_t cntMergePu[NUM_CU_DEPTH][INTER_MODES - 1]; + FrameStats() { memset(this, 0, sizeof(FrameStats)); diff --git a/source/common/ipfilter.cpp b/source/common/ipfilter.cpp index 842b4788bf..acfd7ce474 100644 --- a/source/common/ipfilter.cpp +++ b/source/common/ipfilter.cpp @@ -123,9 +123,8 @@ void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intpt const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC - headRoom; - int offset = -IF_INTERNAL_OFFS << shift; + int offset = (unsigned)-IF_INTERNAL_OFFS << shift; int blkheight = height; - src -= N / 2 - 1; if (isRowExt) @@ -209,10 +208,8 @@ void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr const int16_t* c = (N == 4) ? 
g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx]; int headRoom = IF_INTERNAL_PREC - X265_DEPTH; int shift = IF_FILTER_PREC - headRoom; - int offset = -IF_INTERNAL_OFFS << shift; - + int offset = (unsigned)-IF_INTERNAL_OFFS << shift; src -= (N / 2 - 1) * srcStride; - int row, col; for (row = 0; row < height; row++) { diff --git a/source/common/lowres.h b/source/common/lowres.h index 125f4e2da8..4cb4d00fcb 100644 --- a/source/common/lowres.h +++ b/source/common/lowres.h @@ -118,6 +118,8 @@ struct Lowres : public ReferencePlanes bool bKeyframe; bool bLastMiniGopBFrame; + double ipCostRatio; + /* lookahead output data */ int64_t costEst[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; int64_t costEstAq[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; diff --git a/source/common/param.cpp b/source/common/param.cpp index 70bb478f39..661ef5bb97 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -110,6 +110,7 @@ void x265_param_default(x265_param* param) param->frameNumThreads = 0; param->logLevel = X265_LOG_INFO; + param->csvLogLevel = 0; param->csvfn = NULL; param->rc.lambdaFileName = NULL; param->bLogCuStats = 0; @@ -194,10 +195,10 @@ void x265_param_default(x265_param* param) param->rdPenalty = 0; param->psyRd = 2.0; param->psyRdoq = 0.0; - param->analysisMode = 0; + param->analysisReuseMode = 0; param->analysisMultiPassRefine = 0; param->analysisMultiPassDistortion = 0; - param->analysisFileName = NULL; + param->analysisReuseFileName = NULL; param->bIntraInBFrames = 0; param->bLossless = 0; param->bCULossless = 0; @@ -236,6 +237,7 @@ void x265_param_default(x265_param* param) param->rc.bEnableGrain = 0; param->rc.qpMin = 0; param->rc.qpMax = QP_MAX_MAX; + param->rc.bEnableConstVbv = 0; /* Video Usability Information (VUI) */ param->vui.aspectRatioIdc = 0; @@ -271,10 +273,18 @@ void x265_param_default(x265_param* param) param->bOptCUDeltaQP = 0; param->bAQMotion = 0; param->bHDROpt = 0; - param->analysisRefineLevel = 5; + param->analysisReuseLevel = 5; 
param->toneMapFile = NULL; param->bDhdr10opt = 0; + param->bCTUInfo = 0; + param->bUseRcStats = 0; + param->scaleFactor = 0; + param->intraRefine = 0; + param->interRefine = 0; + param->mvRefine = 0; + param->bUseAnalysisFile = 1; + param->csvfpt = NULL; } int x265_param_default_preset(x265_param* param, const char* preset, const char* tune) @@ -494,6 +504,7 @@ int x265_param_default_preset(x265_param* param, const char* preset, const char* param->psyRd = 4.0; param->psyRdoq = 10.0; param->bEnableSAO = 0; + param->rc.bEnableConstVbv = 1; } else return -1; @@ -828,7 +839,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) p->rc.bStrictCbr = atobool(value); p->rc.pbFactor = 1.0; } - OPT("analysis-mode") p->analysisMode = parseName(value, x265_analysis_names, bError); + OPT("analysis-reuse-mode") p->analysisReuseMode = parseName(value, x265_analysis_names, bError); OPT("sar") { p->vui.aspectRatioIdc = parseName(value, x265_sar_names, bError); @@ -907,7 +918,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) OPT("scaling-list") p->scalingLists = strdup(value); OPT2("pools", "numa-pools") p->numaPools = strdup(value); OPT("lambda-file") p->rc.lambdaFileName = strdup(value); - OPT("analysis-file") p->analysisFileName = strdup(value); + OPT("analysis-reuse-file") p->analysisReuseFileName = strdup(value); OPT("qg-size") p->rc.qgSize = atoi(value); OPT("master-display") p->masteringDisplayColorVolume = strdup(value); OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2; @@ -921,6 +932,8 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) if (bExtraParams) { if (0) ; + OPT("csv") p->csvfn = strdup(value); + OPT("csv-log-level") p->csvLogLevel = atoi(value); OPT("qpmin") p->rc.qpMin = atoi(value); OPT("analyze-src-pics") p->bSourceReferenceEstimation = atobool(value); OPT("log2-max-poc-lsb") p->log2MaxPocLsb = atoi(value); @@ -938,7 +951,7 @@ int 
x265_param_parse(x265_param* p, const char* name, const char* value) OPT("multi-pass-opt-distortion") p->analysisMultiPassDistortion = atobool(value); OPT("aq-motion") p->bAQMotion = atobool(value); OPT("dynamic-rd") p->dynamicRd = atof(value); - OPT("refine-level") p->analysisRefineLevel = atoi(value); + OPT("analysis-reuse-level") p->analysisReuseLevel = atoi(value); OPT("ssim-rd") { int bval = atobool(value); @@ -954,6 +967,12 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) OPT("limit-sao") p->bLimitSAO = atobool(value); OPT("dhdr10-info") p->toneMapFile = strdup(value); OPT("dhdr10-opt") p->bDhdr10opt = atobool(value); + OPT("const-vbv") p->rc.bEnableConstVbv = atobool(value); + OPT("ctu-info") p->bCTUInfo = atoi(value); + OPT("scale-factor") p->scaleFactor = atoi(value); + OPT("refine-intra")p->intraRefine = atoi(value); + OPT("refine-inter")p->interRefine = atobool(value); + OPT("refine-mv")p->mvRefine = atobool(value); else return X265_PARAM_BAD_NAME; } @@ -1284,16 +1303,19 @@ int x265_check_params(x265_param* param) "Constant QP is incompatible with 2pass"); CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0), "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize"); - CHECK(param->analysisMode && (param->analysisMode < X265_ANALYSIS_OFF || param->analysisMode > X265_ANALYSIS_LOAD), + CHECK(param->analysisReuseMode && (param->analysisReuseMode < X265_ANALYSIS_OFF || param->analysisReuseMode > X265_ANALYSIS_LOAD), "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD"); - CHECK(param->analysisMode && (param->analysisRefineLevel < 1 || param->analysisRefineLevel > 10), + CHECK(param->analysisReuseMode && (param->analysisReuseLevel < 1 || param->analysisReuseLevel > 10), "Invalid analysis refine level. Value must be between 1 and 10 (inclusive)"); + CHECK(param->scaleFactor > 2, "Invalid scale-factor. 
Supports factor <= 2"); CHECK(param->rc.qpMax < QP_MIN || param->rc.qpMax > QP_MAX_MAX, "qpmax exceeds supported range (0 to 69)"); CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX, "qpmin exceeds supported range (0 to 69)"); CHECK(param->log2MaxPocLsb < 4 || param->log2MaxPocLsb > 16, "Supported range for log2MaxPocLsb is 4 to 16"); + CHECK(param->bCTUInfo < 0 || (param->bCTUInfo != 0 && param->bCTUInfo != 1 && param->bCTUInfo != 2 && param->bCTUInfo != 4 && param->bCTUInfo != 6) || param->bCTUInfo > 6, + "Supported values for bCTUInfo are 0, 1, 2, 4, 6"); #if !X86_64 CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480), "SEA motion search does not support resolutions greater than 480p in 32 bit build"); @@ -1322,42 +1344,6 @@ void x265_param_apply_fastfirstpass(x265_param* param) } } -int x265_set_globals(x265_param* param) -{ - uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize]; - uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize]; - - Lock gLock; - ScopedLock sLock(gLock); - - if (++g_ctuSizeConfigured > 1) - { - if (g_maxCUSize != param->maxCUSize) - { - x265_log(param, X265_LOG_WARNING, "maxCUSize must be the same for all encoders in a single process"); - } - if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize) - { - x265_log(param, X265_LOG_WARNING, "maxCUDepth must be the same for all encoders in a single process"); - } - param->maxCUSize = g_maxCUSize; - return x265_check_params(param); /* Check again, since param may have changed */ - } - else - { - // set max CU width & height - g_maxCUSize = param->maxCUSize; - g_maxLog2CUSize = maxLog2CUSize; - - // compute actual CU depth with respect to config depth and max transform size - g_maxCUDepth = maxLog2CUSize - minLog2CUSize; - g_unitSizeDepth = maxLog2CUSize - LOG2_UNIT_SIZE; - } - - g_maxSlices = param->maxSlices; - return 0; -} - static void appendtool(x265_param* param, char* buf, size_t size, const char* toolstr) { static 
const int overhead = (int)strlen("x265 [info]: tools: "); @@ -1457,6 +1443,7 @@ void x265_print_params(x265_param* param) TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing"); TOOLVAL(param->lookaheadSlices, "lslices=%d"); TOOLVAL(param->lookaheadThreads, "lthreads=%d") + TOOLVAL(param->bCTUInfo, "ctu-info=%d"); if (param->maxSlices > 1) TOOLVAL(param->maxSlices, "slices=%d"); if (param->bEnableLoopFilter) @@ -1473,8 +1460,8 @@ void x265_print_params(x265_param* param) TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao"); TOOLOPT(param->rc.bStatWrite, "stats-write"); TOOLOPT(param->rc.bStatRead, "stats-read"); -#if ENABLE_DYNAMIC_HDR10 - TOOLVAL(param->toneMapFile != NULL, "dhdr10-info"); +#if ENABLE_HDR10_PLUS + TOOLOPT(param->toneMapFile != NULL, "dhdr10-info"); #endif x265_log(param, X265_LOG_INFO, "tools:%s\n", buf); fflush(stderr); @@ -1501,6 +1488,8 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bEnablePsnr, "psnr"); BOOL(p->bEnableSsim, "ssim"); s += sprintf(s, " log-level=%d", p->logLevel); + if (p->csvfn) + s += sprintf(s, " csvfn=%s csv-log-level=%d", p->csvfn, p->csvLogLevel); s += sprintf(s, " bitdepth=%d", p->internalBitDepth); s += sprintf(s, " input-csp=%d", p->internalCsp); s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom); @@ -1573,7 +1562,7 @@ char *x265_param2string(x265_param* p, int padx, int pady) s += sprintf(s, " psy-rd=%.2f", p->psyRd); s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq); BOOL(p->bEnableRdRefine, "rd-refine"); - s += sprintf(s, " analysis-mode=%d", p->analysisMode); + s += sprintf(s, " analysis-reuse-mode=%d", p->analysisReuseMode); BOOL(p->bLossless, "lossless"); s += sprintf(s, " cbqpoffs=%d", p->cbQpOffset); s += sprintf(s, " crqpoffs=%d", p->crQpOffset); @@ -1630,6 +1619,7 @@ char *x265_param2string(x265_param* p, int padx, int pady) s += sprintf(s, " qg-size=%d", p->rc.qgSize); BOOL(p->rc.bEnableGrain, "rc-grain"); s += sprintf(s, " qpmax=%d qpmin=%d", p->rc.qpMax, 
p->rc.qpMin); + BOOL(p->rc.bEnableConstVbv, "const-vbv"); s += sprintf(s, " sar=%d", p->vui.aspectRatioIdc); if (p->vui.aspectRatioIdc == X265_EXTENDED_SAR) s += sprintf(s, " sar-width : sar-height=%d:%d", p->vui.sarWidth, p->vui.sarHeight); @@ -1668,8 +1658,13 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bEmitHDRSEI, "hdr"); BOOL(p->bHDROpt, "hdr-opt"); BOOL(p->bDhdr10opt, "dhdr10-opt"); - s += sprintf(s, " refine-level=%d", p->analysisRefineLevel); + s += sprintf(s, " analysis-reuse-level=%d", p->analysisReuseLevel); + s += sprintf(s, " scale-factor=%d", p->scaleFactor); + s += sprintf(s, " refine-intra=%d", p->intraRefine); + s += sprintf(s, " refine-inter=%d", p->interRefine); + s += sprintf(s, " refine-mv=%d", p->mvRefine); BOOL(p->bLimitSAO, "limit-sao"); + s += sprintf(s, " ctu-info=%d", p->bCTUInfo); #undef BOOL return buf; } diff --git a/source/common/param.h b/source/common/param.h index f6f03a169c..9424b44c41 100644 --- a/source/common/param.h +++ b/source/common/param.h @@ -28,7 +28,6 @@ namespace X265_NS { int x265_check_params(x265_param *param); -int x265_set_globals(x265_param *param); void x265_print_params(x265_param *param); void x265_param_apply_fastfirstpass(x265_param *p); char* x265_param2string(x265_param *param, int padx, int pady); diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp index ca5d3274a2..01eb955356 100644 --- a/source/common/picyuv.cpp +++ b/source/common/picyuv.cpp @@ -46,36 +46,62 @@ PicYuv::PicYuv() m_maxLumaLevel = 0; m_avgLumaLevel = 0; + + m_maxChromaULevel = 0; + m_avgChromaULevel = 0; + + m_maxChromaVLevel = 0; + m_avgChromaVLevel = 0; + +#if (X265_DEPTH > 8) + m_minLumaLevel = 0xFFFF; + m_minChromaULevel = 0xFFFF; + m_minChromaVLevel = 0xFFFF; +#else + m_minLumaLevel = 0xFF; + m_minChromaULevel = 0xFF; + m_minChromaVLevel = 0xFF; +#endif + m_stride = 0; m_strideC = 0; m_hChromaShift = 0; m_vChromaShift = 0; } -bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t 
picCsp) +bool PicYuv::create(x265_param* param, pixel *pixelbuf) { + m_param = param; + uint32_t picWidth = m_param->sourceWidth; + uint32_t picHeight = m_param->sourceHeight; + uint32_t picCsp = m_param->internalCsp; m_picWidth = picWidth; m_picHeight = picHeight; m_hChromaShift = CHROMA_H_SHIFT(picCsp); m_vChromaShift = CHROMA_V_SHIFT(picCsp); m_picCsp = picCsp; - uint32_t numCuInWidth = (m_picWidth + g_maxCUSize - 1) / g_maxCUSize; - uint32_t numCuInHeight = (m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + uint32_t numCuInWidth = (m_picWidth + param->maxCUSize - 1) / param->maxCUSize; + uint32_t numCuInHeight = (m_picHeight + param->maxCUSize - 1) / param->maxCUSize; - m_lumaMarginX = g_maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment - m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding - m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1); + m_lumaMarginX = param->maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment + m_lumaMarginY = param->maxCUSize + 16; // margin for 8-tap filter and infinite padding + m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1); - int maxHeight = numCuInHeight * g_maxCUSize; - CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); - m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX; + int maxHeight = numCuInHeight * param->maxCUSize; + if (pixelbuf) + m_picOrg[0] = pixelbuf; + else + { + CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); + m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX; + } if (picCsp != X265_CSP_I400) { m_chromaMarginX = m_lumaMarginX; // keep 16-byte alignment for chroma CTUs m_chromaMarginY = m_lumaMarginY >> m_vChromaShift; - m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2); + m_strideC = ((numCuInWidth * m_param->maxCUSize) >> 
m_hChromaShift) + (m_chromaMarginX * 2); CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2))); CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2))); @@ -94,12 +120,33 @@ bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp) return false; } +int PicYuv::getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp) +{ + m_picWidth = picWidth; + m_picHeight = picHeight; + m_hChromaShift = CHROMA_H_SHIFT(picCsp); + m_vChromaShift = CHROMA_V_SHIFT(picCsp); + m_picCsp = picCsp; + + uint32_t numCuInWidth = (m_picWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; + uint32_t numCuInHeight = (m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; + + m_lumaMarginX = m_param->maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment + m_lumaMarginY = m_param->maxCUSize + 16; // margin for 8-tap filter and infinite padding + m_stride = (numCuInWidth * m_param->maxCUSize) + (m_lumaMarginX << 1); + + int maxHeight = numCuInHeight * m_param->maxCUSize; + int bufLen = (int)(m_stride * (maxHeight + (m_lumaMarginY * 2))); + + return bufLen; +} + /* the first picture allocated by the encoder will be asked to generate these * offset arrays. Once generated, they will be provided to all future PicYuv * allocated by the same encoder. 
*/ bool PicYuv::createOffsets(const SPS& sps) { - uint32_t numPartitions = 1 << (g_unitSizeDepth * 2); + uint32_t numPartitions = 1 << (m_param->unitSizeDepth * 2); if (m_picCsp != X265_CSP_I400) { @@ -109,8 +156,8 @@ bool PicYuv::createOffsets(const SPS& sps) { for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++) { - m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize; - m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift); + m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * m_param->maxCUSize + cuCol * m_param->maxCUSize; + m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (m_param->maxCUSize >> m_vChromaShift) + cuCol * (m_param->maxCUSize >> m_hChromaShift); } } @@ -129,7 +176,7 @@ bool PicYuv::createOffsets(const SPS& sps) CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight); for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++) for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++) - m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize; + m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * m_param->maxCUSize + cuCol * m_param->maxCUSize; CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions); for (uint32_t idx = 0; idx < numPartitions; ++idx) @@ -184,6 +231,11 @@ void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, i X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure"); + uint64_t lumaSum; + uint64_t cbSum; + uint64_t crSum; + lumaSum = cbSum = crSum = 0; + if (pic.bitDepth == 8) { #if (X265_DEPTH > 8) @@ -288,6 +340,47 @@ void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, i pixel *U = m_picOrg[1]; pixel *V = m_picOrg[2]; + pixel *yPic = m_picOrg[0]; + pixel *uPic = m_picOrg[1]; + pixel *vPic = m_picOrg[2]; + + for (int 
r = 0; r < height; r++) + { + for (int c = 0; c < width; c++) + { + m_maxLumaLevel = X265_MAX(yPic[c], m_maxLumaLevel); + m_minLumaLevel = X265_MIN(yPic[c], m_minLumaLevel); + lumaSum += yPic[c]; + } + yPic += m_stride; + } + m_avgLumaLevel = (double)lumaSum / (m_picHeight * m_picWidth); + + if (param.csvLogLevel >= 2) + { + if (param.internalCsp != X265_CSP_I400) + { + for (int r = 0; r < height >> m_vChromaShift; r++) + { + for (int c = 0; c < width >> m_hChromaShift; c++) + { + m_maxChromaULevel = X265_MAX(uPic[c], m_maxChromaULevel); + m_minChromaULevel = X265_MIN(uPic[c], m_minChromaULevel); + cbSum += uPic[c]; + + m_maxChromaVLevel = X265_MAX(vPic[c], m_maxChromaVLevel); + m_minChromaVLevel = X265_MIN(vPic[c], m_minChromaVLevel); + crSum += vPic[c]; + } + + uPic += m_strideC; + vPic += m_strideC; + } + m_avgChromaULevel = (double)cbSum / ((height >> m_vChromaShift) * (width >> m_hChromaShift)); + m_avgChromaVLevel = (double)crSum / ((height >> m_vChromaShift) * (width >> m_hChromaShift)); + } + } + #if HIGH_BIT_DEPTH bool calcHDRParams = !!param.minLuma || (param.maxLuma != PIXEL_MAX); /* Apply min/max luma bounds for HDR pixel manipulations */ diff --git a/source/common/picyuv.h b/source/common/picyuv.h index c2e9238ac9..0c8dfa7298 100644 --- a/source/common/picyuv.h +++ b/source/common/picyuv.h @@ -60,14 +60,25 @@ class PicYuv uint32_t m_chromaMarginX; uint32_t m_chromaMarginY; - pixel m_maxLumaLevel; - double m_avgLumaLevel; + pixel m_maxLumaLevel; + pixel m_minLumaLevel; + double m_avgLumaLevel; + + pixel m_maxChromaULevel; + pixel m_minChromaULevel; + double m_avgChromaULevel; + + pixel m_maxChromaVLevel; + pixel m_minChromaVLevel; + double m_avgChromaVLevel; + x265_param *m_param; PicYuv(); - bool create(uint32_t picWidth, uint32_t picHeight, uint32_t csp); + bool create(x265_param* param, pixel *pixelbuf = NULL); bool createOffsets(const SPS& sps); void destroy(); + int getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp); void 
copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady); diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp index aa72496c70..211dc2f487 100644 --- a/source/common/primitives.cpp +++ b/source/common/primitives.cpp @@ -57,6 +57,7 @@ void setupFilterPrimitives_c(EncoderPrimitives &p); void setupIntraPrimitives_c(EncoderPrimitives &p); void setupLoopFilterPrimitives_c(EncoderPrimitives &p); void setupSaoPrimitives_c(EncoderPrimitives &p); +void setupSeaIntegralPrimitives_c(EncoderPrimitives &p); void setupCPrimitives(EncoderPrimitives &p) { @@ -66,6 +67,7 @@ void setupCPrimitives(EncoderPrimitives &p) setupIntraPrimitives_c(p); // intrapred.cpp setupLoopFilterPrimitives_c(p); // loopfilter.cpp setupSaoPrimitives_c(p); // sao.cpp + setupSeaIntegralPrimitives_c(p); // framefilter.cpp } void setupAliasPrimitives(EncoderPrimitives &p) diff --git a/source/common/primitives.h b/source/common/primitives.h index edee09729e..cf0bc29018 100644 --- a/source/common/primitives.h +++ b/source/common/primitives.h @@ -110,6 +110,17 @@ enum ChromaCU422 BLOCK_422_32x64 }; +enum IntegralSize +{ + INTEGRAL_4, + INTEGRAL_8, + INTEGRAL_12, + INTEGRAL_16, + INTEGRAL_24, + INTEGRAL_32, + NUM_INTEGRAL_SIZE +}; + typedef int (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned typedef int (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride); typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned @@ -203,6 +214,9 @@ typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ); typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ); +typedef void (*integralv_t)(uint32_t 
*sum, intptr_t stride); +typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride); + /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -342,6 +356,9 @@ struct EncoderPrimitives pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1 pelFilterChroma_t pelFilterChroma[2]; // EDGE_VER = 0, EDGE_HOR = 1 + integralv_t integral_initv[NUM_INTEGRAL_SIZE]; + integralh_t integral_inith[NUM_INTEGRAL_SIZE]; + /* There is one set of chroma primitives per color space. An encoder will * have just a single color space and thus it will only ever use one entry * in this array. However we always fill all entries in the array in case diff --git a/source/common/slice.cpp b/source/common/slice.cpp index 3d5a5c95fa..2335ce6340 100644 --- a/source/common/slice.cpp +++ b/source/common/slice.cpp @@ -185,22 +185,22 @@ void RPS::sortDeltaPOC() uint32_t Slice::realEndAddress(uint32_t endCUAddr) const { // Calculate end address - uint32_t internalAddress = (endCUAddr - 1) % NUM_4x4_PARTITIONS; - uint32_t externalAddress = (endCUAddr - 1) / NUM_4x4_PARTITIONS; - uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * g_maxCUSize; - uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * g_maxCUSize; + uint32_t internalAddress = (endCUAddr - 1) % m_param->num4x4Partitions; + uint32_t externalAddress = (endCUAddr - 1) / m_param->num4x4Partitions; + uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * m_param->maxCUSize; + uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * m_param->maxCUSize; while (g_zscanToPelX[internalAddress] >= xmax || g_zscanToPelY[internalAddress] >= ymax) internalAddress--; internalAddress++; - if (internalAddress == NUM_4x4_PARTITIONS) + if (internalAddress == 
m_param->num4x4Partitions) { internalAddress = 0; externalAddress++; } - return externalAddress * NUM_4x4_PARTITIONS + internalAddress; + return externalAddress * m_param->num4x4Partitions + internalAddress; } diff --git a/source/common/slice.h b/source/common/slice.h index 160ebf5390..d08da58e1f 100644 --- a/source/common/slice.h +++ b/source/common/slice.h @@ -360,6 +360,7 @@ class Slice int m_iPPSQpMinus26; int numRefIdxDefault[2]; int m_iNumRPSInSPS; + const x265_param *m_param; Slice() { diff --git a/source/common/threadpool.cpp b/source/common/threadpool.cpp index f6509b7018..a23ba7beb0 100644 --- a/source/common/threadpool.cpp +++ b/source/common/threadpool.cpp @@ -253,6 +253,7 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh int cpusPerNode[MAX_NODE_NUM + 1]; int threadsPerPool[MAX_NODE_NUM + 2]; uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2]; + int totalNumThreads = 0; memset(cpusPerNode, 0, sizeof(cpusPerNode)); memset(threadsPerPool, 0, sizeof(threadsPerPool)); @@ -388,9 +389,23 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh if (bNumaSupport) x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]); if (threadsPerPool[i]) + { numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS; + totalNumThreads += threadsPerPool[i]; + } } + if (!isThreadsReserved) + { + if (!numPools) + { + x265_log(p, X265_LOG_DEBUG, "No pool thread available. 
Deciding frame-threads based on detected CPU threads\n"); + totalNumThreads = ThreadPool::getCpuCount(); // auto-detect frame threads + } + if (!p->frameNumThreads) + ThreadPool::getFrameThreadsCount(p, totalNumThreads); + } + if (!numPools) return NULL; @@ -412,7 +427,7 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh node++; int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]); int origNumThreads = numThreads; - if (p->lookaheadThreads > numThreads / 2) + if (i == 0 && p->lookaheadThreads > numThreads / 2) { p->lookaheadThreads = numThreads / 2; x265_log(p, X265_LOG_DEBUG, "Setting lookahead threads to a maximum of half the total number of threads\n"); @@ -423,7 +438,7 @@ ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools, bool isTh maxProviders = 1; } - else + else if (i == 0) numThreads -= p->lookaheadThreads; if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node])) { @@ -643,4 +658,21 @@ int ThreadPool::getCpuCount() #endif } +void ThreadPool::getFrameThreadsCount(x265_param* p, int cpuCount) +{ + int rows = (p->sourceHeight + p->maxCUSize - 1) >> g_log2Size[p->maxCUSize]; + if (!p->bEnableWavefront) + p->frameNumThreads = X265_MIN3(cpuCount, (rows + 1) / 2, X265_MAX_FRAME_THREADS); + else if (cpuCount >= 32) + p->frameNumThreads = (p->sourceHeight > 2000) ? 
6 : 5; + else if (cpuCount >= 16) + p->frameNumThreads = 4; + else if (cpuCount >= 8) + p->frameNumThreads = 3; + else if (cpuCount >= 4) + p->frameNumThreads = 2; + else + p->frameNumThreads = 1; +} + } // end namespace X265_NS diff --git a/source/common/threadpool.h b/source/common/threadpool.h index 649716dfd1..6f58a70d5b 100644 --- a/source/common/threadpool.h +++ b/source/common/threadpool.h @@ -105,6 +105,7 @@ class ThreadPool static ThreadPool* allocThreadPools(x265_param* p, int& numPools, bool isThreadsReserved); static int getCpuCount(); static int getNumaNodeCount(); + static void getFrameThreadsCount(x265_param* p,int cpuCount); }; /* Any worker thread may enlist the help of idle worker threads from the same diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp index fad3c7a706..1546734c91 100644 --- a/source/common/x86/asm-primitives.cpp +++ b/source/common/x86/asm-primitives.cpp @@ -114,6 +114,7 @@ extern "C" { #include "blockcopy8.h" #include "intrapred.h" #include "dct8.h" +#include "seaintegral.h" } #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \ @@ -2157,6 +2158,17 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10 p.fix8Unpack = PFX(cutree_fix8_unpack_avx2); p.fix8Pack = PFX(cutree_fix8_pack_avx2); + p.integral_initv[INTEGRAL_4] = PFX(integral4v_avx2); + p.integral_initv[INTEGRAL_8] = PFX(integral8v_avx2); + p.integral_initv[INTEGRAL_12] = PFX(integral12v_avx2); + p.integral_initv[INTEGRAL_16] = PFX(integral16v_avx2); + p.integral_initv[INTEGRAL_24] = PFX(integral24v_avx2); + p.integral_initv[INTEGRAL_32] = PFX(integral32v_avx2); + p.integral_inith[INTEGRAL_4] = PFX(integral4h_avx2); + p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2); + p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2); + p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2); + /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only p.planeClipAndMax = PFX(planeClipAndMax_avx2); */ @@ 
-3695,6 +3707,19 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main p.fix8Unpack = PFX(cutree_fix8_unpack_avx2); p.fix8Pack = PFX(cutree_fix8_pack_avx2); + p.integral_initv[INTEGRAL_4] = PFX(integral4v_avx2); + p.integral_initv[INTEGRAL_8] = PFX(integral8v_avx2); + p.integral_initv[INTEGRAL_12] = PFX(integral12v_avx2); + p.integral_initv[INTEGRAL_16] = PFX(integral16v_avx2); + p.integral_initv[INTEGRAL_24] = PFX(integral24v_avx2); + p.integral_initv[INTEGRAL_32] = PFX(integral32v_avx2); + p.integral_inith[INTEGRAL_4] = PFX(integral4h_avx2); + p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2); + p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2); + p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2); + p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2); + p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2); + } #endif } diff --git a/source/common/x86/loopfilter.asm b/source/common/x86/loopfilter.asm index 04f67fef52..590652d130 100644 --- a/source/common/x86/loopfilter.asm +++ b/source/common/x86/loopfilter.asm @@ -1583,7 +1583,7 @@ cglobal saoCuOrgB0, 5,7,8 pshufb m1, m4, m0 pcmpgtb m0, [pb_15] ; m0 = [mask] - pblendvb m6, m6, m1, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug! + pblendvb m6, m1, m0 pmovsxbw m0, m6 ; offset punpckhbw m6, m6 @@ -1630,7 +1630,7 @@ cglobal saoCuOrgB0, 4, 7, 8 pshufb m6, m3, m1 pshufb m5, m4, m1 - pblendvb m6, m6, m5, m0 ; NOTE: don't use 3 parameters style, x264 macro have some bug! 
+ pblendvb m6, m5, m0 pmovzxbw m1, m2 ; rec punpckhbw m2, m7 @@ -1904,7 +1904,7 @@ cglobal calSign, 4,5,6 sub r3, r4 movu xmm0, [r3] movu m3, [r0] - pblendvb m5, m5, m3, xmm0 + pblendvb m5, m3, xmm0 movu [r0], m5 .end: diff --git a/source/common/x86/pixel-a.asm b/source/common/x86/pixel-a.asm index eaaee7756c..79b3bb5d3b 100644 --- a/source/common/x86/pixel-a.asm +++ b/source/common/x86/pixel-a.asm @@ -227,7 +227,7 @@ cextern pw_pixel_max ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 - %xdefine %%n n%1 + %xdefine %%n nn%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm index eaa3d0a169..50264c45c0 100644 --- a/source/common/x86/pixel-util8.asm +++ b/source/common/x86/pixel-util8.asm @@ -1597,7 +1597,7 @@ cglobal weight_sp, 6,7,8 .widthLess8: movu m6, [r1] - pblendvb m6, m6, m7, m0 + pblendvb m6, m7, m0 movu [r1], m6 .nextH: diff --git a/source/common/x86/seaintegral.asm b/source/common/x86/seaintegral.asm new file mode 100644 index 0000000000..cf79ca4478 --- /dev/null +++ b/source/common/x86/seaintegral.asm @@ -0,0 +1,1062 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Jayashri Murugan <jayashri@multicorewareinc.com> +;* Vignesh V Menon <vignesh@multicorewareinc.com> +;* Praveen Tiwari <praveen@multicorewareinc.com> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +;void integral_init4v_c(uint32_t *sum4, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral4v, 2, 3, 2 + mov r2, r1 + shl r2, 4 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +;----------------------------------------------------------------------------- +;void integral_init8v_c(uint32_t *sum8, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral8v, 2, 3, 2 + mov r2, r1 + shl r2, 5 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +;----------------------------------------------------------------------------- +;void integral_init12v_c(uint32_t *sum12, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral12v, 2, 4, 2 + mov r2, r1 + mov r3, r1 + shl r2, 5 + shl r3, 4 + add r2, r3 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +;----------------------------------------------------------------------------- +;void integral_init16v_c(uint32_t *sum16, intptr_t stride) 
+;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral16v, 2, 3, 2 + mov r2, r1 + shl r2, 6 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +;----------------------------------------------------------------------------- +;void integral_init24v_c(uint32_t *sum24, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral24v, 2, 4, 2 + mov r2, r1 + mov r3, r1 + shl r2, 6 + shl r3, 5 + add r2, r3 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +;----------------------------------------------------------------------------- +;void integral_init32v_c(uint32_t *sum32, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral32v, 2, 3, 2 + mov r2, r1 + shl r2, 7 + +.loop + movu m0, [r0] + movu m1, [r0 + r2] + psubd m1, m0 + movu [r0], m1 + add r0, 32 + sub r1, 8 + jnz .loop + RET + +%macro INTEGRAL_FOUR_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 +%endmacro + +%macro INTEGRAL_FOUR_HORIZONTAL_4 0 + movd xm0, [r1] + movd xm1, [r1 + 1] + pmovzxbw xm0, xm0 + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 2] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 3] + pmovzxbw xm1, xm1 + paddw xm0, xm1 +%endmacro + +%macro INTEGRAL_FOUR_HORIZONTAL_8_HBD 0 + pmovzxwd m0, [r1] + pmovzxwd m1, [r1 + 2] + paddd m0, m1 + pmovzxwd m1, [r1 + 4] + paddd m0, m1 + pmovzxwd m1, [r1 + 6] + paddd m0, m1 +%endmacro + +%macro INTEGRAL_FOUR_HORIZONTAL_4_HBD 0 + pmovzxwd xm0, [r1] + pmovzxwd xm1, [r1 + 2] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 4] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 6] + paddd xm0, xm1 +%endmacro + 
+;----------------------------------------------------------------------------- +;static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal integral4h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 4 ;stride - 4 + mov r4, r2 + shr r4, 3 + +.loop_8: + INTEGRAL_FOUR_HORIZONTAL_8_HBD + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + add r1, 16 + add r0, 32 + sub r2, 8 + sub r4, 1 + jnz .loop_8 + INTEGRAL_FOUR_HORIZONTAL_4_HBD + movu xm1, [r0] + paddd xm0, xm1 + movu [r0 + r3], xm0 + RET + +%else +cglobal integral4h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 4 ;stride - 4 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_FOUR_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 12 + je .loop_12 + cmp r2, 4 + je .loop_4 + +.loop_12: + INTEGRAL_FOUR_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd xm2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu xm1, [r0 + 32] + paddd xm2, xm1 + movu [r0 + r3 + 32], xm2 + jmp .end + +.loop_4: + INTEGRAL_FOUR_HORIZONTAL_4 + pmovzxwd xm0, xm0 + movu xm1, [r0] + paddd xm0, xm1 + movu [r0 + r3], xm0 + jmp .end + +.end + RET +%endif + +%macro INTEGRAL_EIGHT_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 + pmovzxbw m1, [r1 + 4] + paddw m0, m1 + pmovzxbw m1, [r1 + 5] + paddw m0, m1 + pmovzxbw m1, [r1 + 6] + paddw m0, m1 + pmovzxbw m1, [r1 + 7] + paddw m0, m1 +%endmacro + +%macro INTEGRAL_EIGHT_HORIZONTAL_8 0 + pmovzxbw xm0, [r1] + pmovzxbw xm1, [r1 + 1] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 2] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 3] + 
paddw xm0, xm1 + pmovzxbw xm1, [r1 + 4] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 5] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 6] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 7] + paddw xm0, xm1 +%endmacro + +%macro INTEGRAL_EIGHT_HORIZONTAL_8_HBD 0 + pmovzxwd m0, [r1] + pmovzxwd m1, [r1 + 2] + paddd m0, m1 + pmovzxwd m1, [r1 + 4] + paddd m0, m1 + pmovzxwd m1, [r1 + 6] + paddd m0, m1 + pmovzxwd m1, [r1 + 8] + paddd m0, m1 + pmovzxwd m1, [r1 + 10] + paddd m0, m1 + pmovzxwd m1, [r1 + 12] + paddd m0, m1 + pmovzxwd m1, [r1 + 14] + paddd m0, m1 +%endmacro + +;----------------------------------------------------------------------------- +;static void integral_init8h_c(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal integral8h, 3, 4, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 8 ;stride - 8 + +.loop: + INTEGRAL_EIGHT_HORIZONTAL_8_HBD + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + add r1, 16 + add r0, 32 + sub r2, 8 + jnz .loop + RET + +%else +cglobal integral8h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 8 ;stride - 8 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_EIGHT_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 8 + je .loop_8 + jmp .end + +.loop_8: + INTEGRAL_EIGHT_HORIZONTAL_8 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + jmp .end + +.end + RET +%endif + +%macro INTEGRAL_TWELVE_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 + pmovzxbw m1, [r1 + 4] + paddw m0, m1 + pmovzxbw m1, [r1 + 5] + paddw m0, m1 + pmovzxbw m1, [r1 + 6] + paddw m0, m1 + pmovzxbw m1, [r1 + 7] + paddw m0, m1 + pmovzxbw m1, [r1 
+ 8] + paddw m0, m1 + pmovzxbw m1, [r1 + 9] + paddw m0, m1 + pmovzxbw m1, [r1 + 10] + paddw m0, m1 + pmovzxbw m1, [r1 + 11] + paddw m0, m1 +%endmacro + +%macro INTEGRAL_TWELVE_HORIZONTAL_4 0 + movd xm0, [r1] + movd xm1, [r1 + 1] + pmovzxbw xm0, xm0 + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 2] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 3] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 4] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 5] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 6] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 7] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 8] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 9] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 10] + pmovzxbw xm1, xm1 + paddw xm0, xm1 + movd xm1, [r1 + 11] + pmovzxbw xm1, xm1 + paddw xm0, xm1 +%endmacro + +%macro INTEGRAL_TWELVE_HORIZONTAL_8_HBD 0 + pmovzxwd m0, [r1] + pmovzxwd m1, [r1 + 2] + paddd m0, m1 + pmovzxwd m1, [r1 + 4] + paddd m0, m1 + pmovzxwd m1, [r1 + 6] + paddd m0, m1 + pmovzxwd m1, [r1 + 8] + paddd m0, m1 + pmovzxwd m1, [r1 + 10] + paddd m0, m1 + pmovzxwd m1, [r1 + 12] + paddd m0, m1 + pmovzxwd m1, [r1 + 14] + paddd m0, m1 + pmovzxwd m1, [r1 + 16] + paddd m0, m1 + pmovzxwd m1, [r1 + 18] + paddd m0, m1 + pmovzxwd m1, [r1 + 20] + paddd m0, m1 + pmovzxwd m1, [r1 + 22] + paddd m0, m1 +%endmacro + +%macro INTEGRAL_TWELVE_HORIZONTAL_4_HBD 0 + pmovzxwd xm0, [r1] + pmovzxwd xm1, [r1 + 2] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 4] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 6] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 8] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 10] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 12] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 14] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 16] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 18] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 20] + paddd xm0, xm1 + pmovzxwd xm1, [r1 + 22] + paddd xm0, xm1 +%endmacro + 
+;----------------------------------------------------------------------------- +;static void integral_init12h_c(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal integral12h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 12 ;stride - 12 + mov r4, r2 + shr r4, 3 + +.loop: + INTEGRAL_TWELVE_HORIZONTAL_8_HBD + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + add r1, 16 + add r0, 32 + sub r2, 8 + sub r4, 1 + jnz .loop + INTEGRAL_TWELVE_HORIZONTAL_4_HBD + movu xm1, [r0] + paddd xm0, xm1 + movu [r0 + r3], xm0 + RET + +%else +cglobal integral12h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 12 ;stride - 12 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_TWELVE_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 12 + je .loop_12 + cmp r2, 4 + je .loop_4 + +.loop_12: + INTEGRAL_TWELVE_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd xm2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu xm1, [r0 + 32] + paddd xm2, xm1 + movu [r0 + r3 + 32], xm2 + jmp .end + +.loop_4: + INTEGRAL_TWELVE_HORIZONTAL_4 + pmovzxwd xm0, xm0 + movu xm1, [r0] + paddd xm0, xm1 + movu [r0 + r3], xm0 + jmp .end + +.end + RET +%endif + +%macro INTEGRAL_SIXTEEN_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 + pmovzxbw m1, [r1 + 4] + paddw m0, m1 + pmovzxbw m1, [r1 + 5] + paddw m0, m1 + pmovzxbw m1, [r1 + 6] + paddw m0, m1 + pmovzxbw m1, [r1 + 7] + paddw m0, m1 + pmovzxbw m1, [r1 + 8] + paddw m0, m1 + pmovzxbw m1, [r1 + 9] + paddw m0, m1 + pmovzxbw m1, [r1 + 10] + paddw m0, m1 + pmovzxbw m1, [r1 + 11] + paddw m0, m1 + 
pmovzxbw m1, [r1 + 12] + paddw m0, m1 + pmovzxbw m1, [r1 + 13] + paddw m0, m1 + pmovzxbw m1, [r1 + 14] + paddw m0, m1 + pmovzxbw m1, [r1 + 15] + paddw m0, m1 +%endmacro + +%macro INTEGRAL_SIXTEEN_HORIZONTAL_8 0 + pmovzxbw xm0, [r1] + pmovzxbw xm1, [r1 + 1] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 2] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 3] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 4] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 5] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 6] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 7] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 8] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 9] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 10] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 11] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 12] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 13] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 14] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 15] + paddw xm0, xm1 +%endmacro + +%macro INTEGRAL_SIXTEEN_HORIZONTAL_8_HBD 0 + pmovzxwd m0, [r1] + pmovzxwd m1, [r1 + 2] + paddd m0, m1 + pmovzxwd m1, [r1 + 4] + paddd m0, m1 + pmovzxwd m1, [r1 + 6] + paddd m0, m1 + pmovzxwd m1, [r1 + 8] + paddd m0, m1 + pmovzxwd m1, [r1 + 10] + paddd m0, m1 + pmovzxwd m1, [r1 + 12] + paddd m0, m1 + pmovzxwd m1, [r1 + 14] + paddd m0, m1 + pmovzxwd m1, [r1 + 16] + paddd m0, m1 + pmovzxwd m1, [r1 + 18] + paddd m0, m1 + pmovzxwd m1, [r1 + 20] + paddd m0, m1 + pmovzxwd m1, [r1 + 22] + paddd m0, m1 + pmovzxwd m1, [r1 + 24] + paddd m0, m1 + pmovzxwd m1, [r1 + 26] + paddd m0, m1 + pmovzxwd m1, [r1 + 28] + paddd m0, m1 + pmovzxwd m1, [r1 + 30] + paddd m0, m1 +%endmacro + +;----------------------------------------------------------------------------- +;static void integral_init16h_c(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +%if HIGH_BIT_DEPTH +cglobal integral16h, 3, 4, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 16 ;stride - 16 + +.loop: + INTEGRAL_SIXTEEN_HORIZONTAL_8_HBD + movu m1, [r0] + paddd m0, m1 + movu 
[r0 + r3], m0 + add r1, 16 + add r0, 32 + sub r2, 8 + jnz .loop + RET + +%else +cglobal integral16h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 16 ;stride - 16 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_SIXTEEN_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 8 + je .loop_8 + jmp .end + +.loop_8: + INTEGRAL_SIXTEEN_HORIZONTAL_8 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + jmp .end + +.end + RET +%endif + +%macro INTEGRAL_TWENTYFOUR_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 + pmovzxbw m1, [r1 + 4] + paddw m0, m1 + pmovzxbw m1, [r1 + 5] + paddw m0, m1 + pmovzxbw m1, [r1 + 6] + paddw m0, m1 + pmovzxbw m1, [r1 + 7] + paddw m0, m1 + pmovzxbw m1, [r1 + 8] + paddw m0, m1 + pmovzxbw m1, [r1 + 9] + paddw m0, m1 + pmovzxbw m1, [r1 + 10] + paddw m0, m1 + pmovzxbw m1, [r1 + 11] + paddw m0, m1 + pmovzxbw m1, [r1 + 12] + paddw m0, m1 + pmovzxbw m1, [r1 + 13] + paddw m0, m1 + pmovzxbw m1, [r1 + 14] + paddw m0, m1 + pmovzxbw m1, [r1 + 15] + paddw m0, m1 + pmovzxbw m1, [r1 + 16] + paddw m0, m1 + pmovzxbw m1, [r1 + 17] + paddw m0, m1 + pmovzxbw m1, [r1 + 18] + paddw m0, m1 + pmovzxbw m1, [r1 + 19] + paddw m0, m1 + pmovzxbw m1, [r1 + 20] + paddw m0, m1 + pmovzxbw m1, [r1 + 21] + paddw m0, m1 + pmovzxbw m1, [r1 + 22] + paddw m0, m1 + pmovzxbw m1, [r1 + 23] + paddw m0, m1 +%endmacro + +%macro INTEGRAL_TWENTYFOUR_HORIZONTAL_8 0 + pmovzxbw xm0, [r1] + pmovzxbw xm1, [r1 + 1] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 2] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 3] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 4] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 5] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 6] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 7] + 
paddw xm0, xm1 + pmovzxbw xm1, [r1 + 8] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 9] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 10] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 11] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 12] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 13] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 14] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 15] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 16] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 17] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 18] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 19] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 20] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 21] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 22] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 23] + paddw xm0, xm1 +%endmacro + +;----------------------------------------------------------------------------- +;static void integral_init24h_c(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral24h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 24 ;stride - 24 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_TWENTYFOUR_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 8 + je .loop_8 + jmp .end + +.loop_8: + INTEGRAL_TWENTYFOUR_HORIZONTAL_8 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + jmp .end + +.end + RET + +%macro INTEGRAL_THIRTYTWO_HORIZONTAL_16 0 + pmovzxbw m0, [r1] + pmovzxbw m1, [r1 + 1] + paddw m0, m1 + pmovzxbw m1, [r1 + 2] + paddw m0, m1 + pmovzxbw m1, [r1 + 3] + paddw m0, m1 + pmovzxbw m1, [r1 + 4] + paddw m0, m1 + pmovzxbw m1, [r1 + 5] + paddw m0, m1 + pmovzxbw m1, [r1 + 6] + paddw m0, m1 + pmovzxbw m1, [r1 + 7] + paddw m0, m1 + pmovzxbw m1, [r1 + 8] + paddw m0, m1 + pmovzxbw m1, [r1 + 9] + paddw m0, m1 + pmovzxbw m1, [r1 + 10] + 
paddw m0, m1 + pmovzxbw m1, [r1 + 11] + paddw m0, m1 + pmovzxbw m1, [r1 + 12] + paddw m0, m1 + pmovzxbw m1, [r1 + 13] + paddw m0, m1 + pmovzxbw m1, [r1 + 14] + paddw m0, m1 + pmovzxbw m1, [r1 + 15] + paddw m0, m1 + pmovzxbw m1, [r1 + 16] + paddw m0, m1 + pmovzxbw m1, [r1 + 17] + paddw m0, m1 + pmovzxbw m1, [r1 + 18] + paddw m0, m1 + pmovzxbw m1, [r1 + 19] + paddw m0, m1 + pmovzxbw m1, [r1 + 20] + paddw m0, m1 + pmovzxbw m1, [r1 + 21] + paddw m0, m1 + pmovzxbw m1, [r1 + 22] + paddw m0, m1 + pmovzxbw m1, [r1 + 23] + paddw m0, m1 + pmovzxbw m1, [r1 + 24] + paddw m0, m1 + pmovzxbw m1, [r1 + 25] + paddw m0, m1 + pmovzxbw m1, [r1 + 26] + paddw m0, m1 + pmovzxbw m1, [r1 + 27] + paddw m0, m1 + pmovzxbw m1, [r1 + 28] + paddw m0, m1 + pmovzxbw m1, [r1 + 29] + paddw m0, m1 + pmovzxbw m1, [r1 + 30] + paddw m0, m1 + pmovzxbw m1, [r1 + 31] + paddw m0, m1 +%endmacro + + +%macro INTEGRAL_THIRTYTWO_HORIZONTAL_8 0 + pmovzxbw xm0, [r1] + pmovzxbw xm1, [r1 + 1] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 2] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 3] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 4] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 5] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 6] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 7] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 8] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 9] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 10] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 11] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 12] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 13] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 14] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 15] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 16] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 17] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 18] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 19] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 20] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 21] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 22] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 23] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 24] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 25] + paddw xm0, xm1 + 
pmovzxbw xm1, [r1 + 26] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 27] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 28] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 29] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 30] + paddw xm0, xm1 + pmovzxbw xm1, [r1 + 31] + paddw xm0, xm1 +%endmacro + +;----------------------------------------------------------------------------- +;static void integral_init32h_c(uint32_t *sum, pixel *pix, intptr_t stride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal integral32h, 3, 5, 3 + lea r3, [4 * r2] + sub r0, r3 + sub r2, 32 ;stride - 32 + mov r4, r2 + shr r4, 4 + +.loop_16: + INTEGRAL_THIRTYTWO_HORIZONTAL_16 + vperm2i128 m2, m0, m0, 1 + pmovzxwd m2, xm2 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + movu m1, [r0 + 32] + paddd m2, m1 + movu [r0 + r3 + 32], m2 + add r1, 16 + add r0, 64 + sub r2, 16 + sub r4, 1 + jnz .loop_16 + cmp r2, 8 + je .loop_8 + jmp .end + +.loop_8: + INTEGRAL_THIRTYTWO_HORIZONTAL_8 + pmovzxwd m0, xm0 + movu m1, [r0] + paddd m0, m1 + movu [r0 + r3], m0 + jmp .end + +.end + RET diff --git a/source/common/x86/seaintegral.h b/source/common/x86/seaintegral.h new file mode 100644 index 0000000000..dc98dc41ae --- /dev/null +++ b/source/common/x86/seaintegral.h @@ -0,0 +1,42 @@ +/***************************************************************************** +* Copyright (C) 2013-2017 MulticoreWare, Inc +* +* Authors: Vignesh V Menon <vignesh@multicorewareinc.com> +* Jayashri Murugan <jayashri@multicorewareinc.com> +* Praveen Tiwari <praveen@multicorewareinc.com> +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. +*****************************************************************************/ + +#ifndef X265_SEAINTEGRAL_H +#define X265_SEAINTEGRAL_H + +void PFX(integral4v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral8v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral12v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral16v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral24v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral32v_avx2)(uint32_t *sum, intptr_t stride); +void PFX(integral4h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); +void PFX(integral8h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); +void PFX(integral12h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); +void PFX(integral16h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); +void PFX(integral24h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); +void PFX(integral32h_avx2)(uint32_t *sum, pixel *pix, intptr_t stride); + +#endif //X265_SEAINTEGRAL_H diff --git a/source/common/x86/x86inc.asm b/source/common/x86/x86inc.asm index e6d10a119d..1a4e6a24c1 100644 --- a/source/common/x86/x86inc.asm +++ b/source/common/x86/x86inc.asm @@ -85,10 +85,6 @@ SECTION .rodata align=%1 %endmacro -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - %if WIN64 %define PIC %elif ARCH_X86_64 == 0 @@ -152,6 +148,7 @@ %define r%1w %2w %define r%1b %2b %define r%1h %2h + %define %2q %2 
%if %0 == 2 %define r%1m %2d %define r%1mp %2 @@ -176,9 +173,9 @@ %define e%1h %3 %define r%1b %2 %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif %endmacro DECLARE_REG_SIZE ax, al, ah @@ -288,7 +285,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %macro ASSERT 1 %if (%1) == 0 - %error assert failed + %error assertion ``%1'' failed %endif %endmacro @@ -378,9 +375,19 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" + %if ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. + %assign regs_used 5 + UNIX64 * 3 %endif %endif %endif @@ -409,10 +416,10 @@ DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 @@ -458,45 +465,46 @@ DECLARE_REG 14, R15, 120 WIN64_PUSH_XMM %endmacro -%macro WIN64_RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %if xmm_regs_used > 8 %assign %%i xmm_regs_used %rep xmm_regs_used-8 %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else - add %1, stack_size_padded + add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 %macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif + %if mmsize == 32 + vzeroupper + %endif AUTO_REP_RET %endmacro @@ -513,10 +521,10 @@ DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 @@ -533,17 +541,17 @@ DECLARE_REG 14, R15, 72 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 %macro RET 0 -%if stack_size_padded > 0 -%if required_stack_alignment > STACK_ALIGNMENT - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif + %if mmsize == 32 + vzeroupper + %endif AUTO_REP_RET %endmacro @@ -589,29 +597,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 %macro RET 0 -%if stack_size_padded > 0 -%if required_stack_alignment > STACK_ALIGNMENT - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif + %if mmsize == 32 + vzeroupper + %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 0 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either @@ -628,10 +636,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %define last_branch_adr $$ %macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. - %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 
%endif ret %endmacro @@ -640,8 +646,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %rep %0 %macro %1 1-2 %1 %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif %endmacro %rotate 1 %endrep @@ -736,8 +744,8 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ; This is needed for ELF, otherwise the GNU linker assumes the stack is ; executable by default. -%if FORMAT_ELF -SECTION .note.GNU-stack noalloc noexec nowrite progbits +%ifidn __OUTPUT_FORMAT__,elf + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; cpuflags @@ -749,27 +757,28 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_avx (1<<12)| cpuflags_sse42 +%assign cpuflags_xop (1<<13)| cpuflags_avx +%assign cpuflags_fma4 (1<<14)| cpuflags_avx %assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_bmi1 (1<<16)| cpuflags_avx | cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<18)| cpuflags_fma3 | cpuflags_bmi2 -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign 
cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_cache32 (1<<19) +%assign cpuflags_cache64 (1<<20) +%assign cpuflags_slowctz (1<<21) +%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<23) -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. @@ -846,14 +855,14 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define movnta movntq %assign %%i 0 %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 %endrep %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro @@ -864,7 +873,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define mmsize 16 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu @@ -872,9 +881,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define movnta movntdq %assign %%i 0 %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro @@ -885,7 +894,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite 
progbits %define mmsize 32 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu @@ -893,9 +902,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define movnta movntdq %assign %%i 0 %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro @@ -912,8 +921,6 @@ INIT_XMM %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 - %define ymm%1xmm xmm%1 - %define xmm%1ymm ymm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %endmacro @@ -921,7 +928,7 @@ INIT_XMM %assign i 0 %rep 16 DECLARE_MMCAST i -%assign i i+1 + %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no @@ -939,23 +946,23 @@ INIT_XMM ; doesn't cost any cycles. %macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... - SWAP_INTERNAL_NAME %1, %2 -%endif + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... 
+ SWAP_INTERNAL_NAME %1, %2 + %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* @@ -963,17 +970,17 @@ INIT_XMM %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 + %xdefine %%args nn %+ %1 %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro @@ -990,7 +997,7 @@ INIT_XMM %assign %%i 0 %rep num_mmregs CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 + %assign %%i %%i+1 %endrep %endmacro @@ -999,21 +1006,25 @@ INIT_XMM %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 - call_internal %1, %1 %+ SUFFIX + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif %endmacro %macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 %endif %endif call %%i @@ -1056,7 +1067,7 @@ INIT_XMM %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 -%assign i i+1 + %assign i i+1 %endrep %undef i @@ -1074,7 +1085,7 @@ INIT_XMM ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ @@ -1098,6 +1109,8 @@ INIT_XMM %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function %endif %endif %endif @@ -1105,14 +1118,12 @@ INIT_XMM %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 - %ifnidn %6, %7 - %if %0 >= 9 - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 - %else - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 - %endif - %if %5 && %4 == 0 - %ifnid %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. @@ -1120,6 +1131,13 @@ INIT_XMM %xdefine __src2 %7 %endif %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 @@ -1147,9 +1165,9 @@ INIT_XMM ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-5 fnord, 0, 1, 0 +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 @@ -1169,8 +1187,8 @@ INIT_XMM ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 -AVX_INSTR addsd, sse2, 1, 0, 1 -AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, fnord, 0, 0, 0 @@ -1183,10 +1201,10 @@ AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 -AVX_INSTR blendpd, sse4, 1, 0, 0 -AVX_INSTR blendps, sse4, 1, 0, 0 -AVX_INSTR blendvpd, sse4, 1, 0, 0 -AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 @@ -1200,10 +1218,10 @@ AVX_INSTR cvtpd2ps, sse2 AVX_INSTR cvtps2dq, sse2 AVX_INSTR cvtps2pd, sse2 AVX_INSTR cvtsd2si, sse2 -AVX_INSTR cvtsd2ss, sse2 -AVX_INSTR cvtsi2sd, sse2 -AVX_INSTR cvtsi2ss, sse -AVX_INSTR cvtss2sd, sse2 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 AVX_INSTR cvtss2si, sse AVX_INSTR cvttpd2dq, sse2 AVX_INSTR cvttps2dq, sse2 @@ -1226,15 +1244,15 @@ AVX_INSTR ldmxcsr, sse AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 -AVX_INSTR maxsd, sse2, 1, 0, 1 -AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 -AVX_INSTR minsd, sse2, 1, 0, 1 -AVX_INSTR minss, sse, 1, 0, 1 +AVX_INSTR 
minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse -AVX_INSTR movd +AVX_INSTR movd, mmx AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 @@ -1250,18 +1268,18 @@ AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse -AVX_INSTR movq +AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR movsldup, sse3 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2 AVX_INSTR movups, sse -AVX_INSTR mpsadbw, sse4 +AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 -AVX_INSTR mulsd, sse2, 1, 0, 1 -AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 @@ -1279,14 +1297,18 @@ AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 -AVX_INSTR palignr, ssse3 +AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 -AVX_INSTR pblendvb, sse4, 0, 0, 0 -AVX_INSTR pblendw, sse4 -AVX_INSTR pclmulqdq +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpistri, sse42 @@ -1310,10 +1332,10 @@ AVX_INSTR phminposuw, sse4 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 -AVX_INSTR pinsrb, sse4 -AVX_INSTR pinsrd, sse4 -AVX_INSTR pinsrq, sse4 -AVX_INSTR pinsrw, mmx2 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 
AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaxsb, sse4, 0, 0, 1 @@ -1385,18 +1407,18 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 -AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpps, sse AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4 AVX_INSTR roundps, sse4 -AVX_INSTR roundsd, sse4 -AVX_INSTR roundss, sse4 -AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 -AVX_INSTR sqrtpd, sse2, 1, 0, 0 -AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtpd, sse2 +AVX_INSTR sqrtps, sse AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse @@ -1431,7 +1453,7 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1 %else CAT_XDEFINE q, j, i %endif -%assign i i+1 + %assign i i+1 %endrep %undef i %undef j @@ -1454,57 +1476,52 @@ FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmadcswd, pmaddwd, paddd -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. 
+%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. + %ifid %3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported + vbroadcastsd %1, %2 %endif %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd 
-FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%ifdef __YASM_VER__ - %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 - %endif - %endmacro - %endif %endif diff --git a/source/dynamicHDR10/BasicStructures.cpp b/source/dynamicHDR10/BasicStructures.cpp deleted file mode 100644 index 31a074f812..0000000000 --- a/source/dynamicHDR10/BasicStructures.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/** - * @file BasicStructures.cpp - * @brief Defines the structure of metadata parameters - * @author Daniel Maximiliano Valenzuela, Seongnam Oh. - * @create date 03/01/2017 - * @version 0.0.1 - * - * Copyright @ 2017 Samsung Electronics, DMS Lab, Samsung Research America and Samsung Research Tijuana - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. 
-**/ - -#include "BasicStructures.h" -#include "vector" - -struct PercentileLuminance{ - - float averageLuminance = 0.0; - float maxRLuminance = 0.0; - float maxGLuminance = 0.0; - float maxBLuminance = 0.0; - int order; - std::vector<unsigned int> percentiles; -}; - - - diff --git a/source/dynamicHDR10/BasicStructures.h b/source/dynamicHDR10/BasicStructures.h index e2451a9f9b..c139b226c7 100644 --- a/source/dynamicHDR10/BasicStructures.h +++ b/source/dynamicHDR10/BasicStructures.h @@ -35,16 +35,26 @@ struct LuminanceParameters float maxRLuminance = 0.0; float maxGLuminance = 0.0; float maxBLuminance = 0.0; - int order; + int order = 0; std::vector<unsigned int> percentiles; }; struct BezierCurveData { - int order; - int sPx; - int sPy; + int order = 0; + int sPx = 0; + int sPy = 0; std::vector<int> coeff; }; +struct PercentileLuminance{ + + float averageLuminance = 0.0; + float maxRLuminance = 0.0; + float maxGLuminance = 0.0; + float maxBLuminance = 0.0; + int order = 0; + std::vector<unsigned int> percentiles; +}; + #endif // BASICSTRUCTURES_H diff --git a/source/dynamicHDR10/CMakeLists.txt b/source/dynamicHDR10/CMakeLists.txt index 5e6eef2c1c..22fb79d447 100644 --- a/source/dynamicHDR10/CMakeLists.txt +++ b/source/dynamicHDR10/CMakeLists.txt @@ -1,8 +1,8 @@ # vim: syntax=cmake -if(ENABLE_DYNAMIC_HDR10) +if(ENABLE_HDR10_PLUS) add_library(dynamicHDR10 OBJECT - BasicStructures.cpp BasicStructures.h + BasicStructures.h json11/json11.cpp json11/json11.h JsonHelper.cpp JsonHelper.h metadataFromJson.cpp metadataFromJson.h @@ -10,7 +10,6 @@ add_library(dynamicHDR10 OBJECT hdr10plus.h api.cpp ) -else() cmake_minimum_required (VERSION 2.8.11) project(dynamicHDR10) include(CheckIncludeFiles) @@ -150,26 +149,5 @@ set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables") option(ENABLE_SHARED "Build shared library" OFF) -if(ENABLE_SHARED) - add_library(dynamicHDR10 SHARED - json11/json11.cpp json11/json11.h - BasicStructures.cpp BasicStructures.h - 
JsonHelper.cpp JsonHelper.h - metadataFromJson.cpp metadataFromJson.h - SeiMetadataDictionary.cpp SeiMetadataDictionary.h - hdr10plus.h api.cpp ) -else() - add_library(dynamicHDR10 STATIC - json11/json11.cpp json11/json11.h - BasicStructures.cpp BasicStructures.h - JsonHelper.cpp JsonHelper.h - metadataFromJson.cpp metadataFromJson.h - SeiMetadataDictionary.cpp SeiMetadataDictionary.h - hdr10plus.h api.cpp ) -endif() - -install (TARGETS dynamicHDR10 - LIBRARY DESTINATION ${LIB_INSTALL_DIR} - ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) install(FILES hdr10plus.h DESTINATION include) endif() \ No newline at end of file diff --git a/source/dynamicHDR10/json11/json11.cpp b/source/dynamicHDR10/json11/json11.cpp index 9cbb2d1753..3031fa9cea 100644 --- a/source/dynamicHDR10/json11/json11.cpp +++ b/source/dynamicHDR10/json11/json11.cpp @@ -26,6 +26,12 @@ #include <cstdio> #include <limits> +#if _MSC_VER +#pragma warning(disable: 4510) //const member cannot be default initialized +#pragma warning(disable: 4512) //assignment operator could not be generated +#pragma warning(disable: 4610) //const member cannot be default initialized +#endif + namespace json11 { static const int max_depth = 200; @@ -435,7 +441,7 @@ struct JsonParser final { char get_next_token() { consume_garbage(); if (i == str.size()) - return fail("unexpected end of input", 0); + return fail("unexpected end of input", '0'); return str[i++]; } @@ -472,7 +478,7 @@ struct JsonParser final { string parse_string() { string out; long last_escaped_codepoint = -1; - while (true) { + for (;;) { if (i == str.size()) return fail("unexpected end of input in string", ""); @@ -665,7 +671,7 @@ struct JsonParser final { if (ch == '}') return data; - while (1) { + for (;;) { if (ch != '"') return fail("expected '\"' in object, got " + esc(ch)); @@ -698,7 +704,7 @@ struct JsonParser final { if (ch == ']') return data; - while (1) { + for (;;) { i--; data.push_back(parse_json(depth + 1)); if (failed) diff --git 
a/source/dynamicHDR10/metadataFromJson.cpp b/source/dynamicHDR10/metadataFromJson.cpp index 9a2a437e37..f33067adbe 100644 --- a/source/dynamicHDR10/metadataFromJson.cpp +++ b/source/dynamicHDR10/metadataFromJson.cpp @@ -168,7 +168,7 @@ class metadataFromJson::DynamicMetaIO { int payloadBytes = 1; - for(;payload > 0xFF; payload -= 0xFF, ++payloadBytes); + for(;payload >= 0xFF; payload -= 0xFF, ++payloadBytes); if(payloadBytes > 1) { diff --git a/source/encoder/CMakeLists.txt b/source/encoder/CMakeLists.txt index d91af8dba7..0b079ae985 100644 --- a/source/encoder/CMakeLists.txt +++ b/source/encoder/CMakeLists.txt @@ -43,4 +43,5 @@ add_library(encoder OBJECT ../x265.h reference.cpp reference.h encoder.cpp encoder.h api.cpp - weightPrediction.cpp) + weightPrediction.cpp + ../x265-extras.cpp ../x265-extras.h) diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index 858a84d914..5dabe33eae 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -75,6 +75,7 @@ Analysis::Analysis() m_reuseInterDataCTU = NULL; m_reuseRef = NULL; m_bHD = false; + m_evaluateInter = 0; } bool Analysis::create(ThreadLocalData *tld) @@ -89,19 +90,19 @@ bool Analysis::create(ThreadLocalData *tld) cacheCost = X265_MALLOC(uint64_t, costArrSize); int csp = m_param->internalCsp; - uint32_t cuSize = g_maxCUSize; + uint32_t cuSize = m_param->maxCUSize; bool ok = true; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1) + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1) { ModeDepth &md = m_modeDepth[depth]; - md.cuMemPool.create(depth, csp, MAX_PRED_TYPES); + md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param); ok &= md.fencYuv.create(cuSize, csp); for (int j = 0; j < MAX_PRED_TYPES; j++) { - md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j); + md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j); ok &= md.pred[j].predYuv.create(cuSize, csp); ok &= md.pred[j].reconYuv.create(cuSize, csp); 
md.pred[j].fencYuv = &md.fencYuv; @@ -115,7 +116,7 @@ bool Analysis::create(ThreadLocalData *tld) void Analysis::destroy() { - for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) { m_modeDepth[i].cuMemPool.destroy(); m_modeDepth[i].fencYuv.destroy(); @@ -150,6 +151,41 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con calculateNormFactor(ctu, qp); uint32_t numPartition = ctu.m_numPartitions; + if (m_param->bCTUInfo && (*m_frame->m_ctuInfo + ctu.m_cuAddr)) + { + x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr; + if (ctuTemp->ctuPartitions) + { + int32_t depthIdx = 0; + uint32_t maxNum8x8Partitions = 64; + uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr]; + uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; + int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; + do + { + uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx]; + uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx)); + int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx]; + memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth); + memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth); + memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth); + for (uint32_t l = 0; l < numPartition >> 2 * depth; l++) + prevCtuInfoChangePtr[l] = prevCtuInfoChange; + depthInfoPtr += ctu.m_numPartitions >> 2 * depth; + contentInfoPtr += ctu.m_numPartitions >> 2 * depth; + prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth; + depthIdx++; + } while (ctuTemp->ctuPartitions[depthIdx] != 0); + + m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; + m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; + memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition); + //Calculate log2CUSize from depth + for (uint32_t 
i = 0; i < cuGeom.numPartitions; i++) + ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i]; + } + } + if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) { m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata; @@ -167,19 +203,19 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con } } - if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE && m_param->analysisRefineLevel > 1 && m_param->analysisRefineLevel < 10) + if (m_param->analysisReuseMode && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10) { int numPredDir = m_slice->isInterP() ? 1 : 2; m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir]; m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions]; m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions]; - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) { m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions]; m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions]; } - if (m_param->analysisMode == X265_ANALYSIS_SAVE) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE) for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++) m_reuseRef[i] = -1; } @@ -188,7 +224,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con if (m_slice->m_sliceType == I_SLICE) { analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData; - if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1) { memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], 
sizeof(uint8_t) * numPartition); memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition); @@ -200,8 +236,8 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con else { if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && - ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol - && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol) + ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol + && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol) compressIntraCU(ctu, cuGeom, qp); else if (!m_param->rdLevel) { @@ -214,7 +250,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con /* generate residual for entire CTU at once and copy to reconPic */ encodeResidue(ctu, cuGeom); } - else if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_param->analysisRefineLevel == 10) + else if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int posCTU = ctu.m_cuAddr * numPartition; @@ -229,7 +265,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con } //Calculate log2CUSize from depth for (uint32_t i = 0; i < cuGeom.numPartitions; i++) - ctu.m_log2CUSize[i] = (uint8_t)g_maxLog2CUSize - ctu.m_cuDepth[i]; + ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i]; qprdRefine (ctu, cuGeom, qp, qp); return *m_modeDepth[0].bestMode; @@ -245,9 +281,69 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con if (m_param->bEnableRdRefine || m_param->bOptCUDeltaQP) qprdRefine(ctu, cuGeom, qp, qp); + if (m_param->csvLogLevel >= 2) + collectPUStatistics(ctu, cuGeom); + return *m_modeDepth[0].bestMode; } +void Analysis::collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom) +{ + uint8_t depth = 0; + 
uint8_t partSize = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) + { + depth = ctu.m_cuDepth[absPartIdx]; + partSize = ctu.m_partSize[absPartIdx]; + uint32_t numPU = nbPartsTable[(int)partSize]; + int shift = 2 * (m_param->maxCUDepth + 1 - depth); + for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) + { + PredictionUnit pu(ctu, cuGeom, puIdx); + int puabsPartIdx = ctu.getPUOffset(puIdx, absPartIdx); + int mode = 1; + if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_Nx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxN) + mode = 2; + else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N) + mode = 3; + + if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP) + { + ctu.m_encData->m_frameStats.cntSkipPu[depth] += (uint64_t)(1 << shift); + ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift); + } + else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA) + { + if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_NxN) + { + ctu.m_encData->m_frameStats.cnt4x4++; + ctu.m_encData->m_frameStats.totalPu[4]++; + } + else + { + ctu.m_encData->m_frameStats.cntIntraPu[depth] += (uint64_t)(1 << shift); + ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift); + } + } + else if (mode == 3) + { + ctu.m_encData->m_frameStats.cntAmp[depth] += (uint64_t)(1 << shift); + ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift); + break; + } + else + { + if (ctu.m_mergeFlag[puabsPartIdx + absPartIdx]) + ctu.m_encData->m_frameStats.cntMergePu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode; + else + ctu.m_encData->m_frameStats.cntInterPu[depth][ctu.m_partSize[puabsPartIdx + absPartIdx]] += (1 << shift) / mode; + + 
ctu.m_encData->m_frameStats.totalPu[depth] += (1 << shift) / mode; + } + } + } +} + int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU) { float predDepth = 0; @@ -336,7 +432,7 @@ void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t int lambdaQP = lqp; bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth); - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) doQPRefine = false; if (doQPRefine) @@ -400,6 +496,13 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, in bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX; bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; + int split = 0; + if (m_param->intraRefine) + { + split = ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)) && bDecidedDepth); + if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth) + bAlreadyDecided = false; + } if (bAlreadyDecided) { @@ -408,8 +511,11 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, in Mode& mode = md.pred[0]; md.bestMode = &mode; mode.cu.initSubCU(parentCTU, cuGeom, qp); - memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); - memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + if (m_param->intraRefine != 2 || parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] <= 1) + { + memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + } checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]); if (m_bTryLossless) @@ -440,7 +546,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, in } 
// stop recursion if we reach the depth of previous analysis decision - mightSplit &= !(bAlreadyDecided && bDecidedDepth); + mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split; if (mightSplit) { @@ -501,7 +607,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, in } /* Save Intra CUs TU depth only when analysis mode is OFF */ - if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisMode) + if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisReuseMode) { CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr); int8_t maxTUDepth = -1; @@ -1017,11 +1123,21 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom); + bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; bool skipModes = false; /* Skip any remaining mode analyses at current depth */ bool skipRecursion = false; /* Skip recursion */ bool splitIntra = true; bool skipRectAmp = false; bool chooseMerge = false; + bool bCtuInfoCheck = false; + int sameContentRef = 0; + + if (m_evaluateInter == 1) + { + skipRectAmp = !!md.bestMode; + mightSplit &= false; + minDepth = depth; + } if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4) m_maxTUDepth = loadTUDepth(cuGeom, parentCTU); @@ -1040,7 +1156,54 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& md.pred[PRED_2Nx2N].sa8dCost = 0; } - if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_param->analysisRefineLevel > 1) + if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx]) + { + if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx]) + sameContentRef = findSameContentRefCount(parentCTU, cuGeom); + if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx]) + { + mightNotSplit &= 
bDecidedDepth; + bCtuInfoCheck = skipRecursion = false; + skipModes = true; + } + else if (mightNotSplit && bDecidedDepth) + { + if (m_additionalCtuInfo[cuGeom.absPartIdx]) + { + bCtuInfoCheck = skipRecursion = true; + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); + checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + if (!sameContentRef) + { + if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)) + { + qp -= int32_t(0.04 * qp); + setLambdaFromQP(parentCTU, qp); + } + if (m_param->bCTUInfo & 4) + skipModes = false; + } + if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4))) + { + if (m_param->rdLevel) + skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); + if ((m_param->bCTUInfo & 4) && sameContentRef) + skipModes = md.bestMode && true; + } + } + else + { + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); + checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + if (m_param->rdLevel) + skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); + } + mightSplit &= !bDecidedDepth; + } + } + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { @@ -1054,7 +1217,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& if (m_param->rdLevel) skipModes = m_param->bEnableEarlySkip && md.bestMode; } - if (m_param->analysisRefineLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N) + if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N) { if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA && m_reuseModes[cuGeom.absPartIdx] != 4) { @@ -1082,7 +1245,7 @@ SplitData 
Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& } /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */ - if (mightNotSplit && depth >= minDepth && !md.bestMode) /* TODO: Re-evaluate if analysis load/save still works */ + if (mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) /* TODO: Re-evaluate if analysis load/save still works */ { /* Compute Merge Cost */ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); @@ -1092,7 +1255,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth } - if (md.bestMode && m_param->bEnableRecursionSkip) + if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck) { skipRecursion = md.bestMode->cu.isSkipped(0); if (mightSplit && depth >= minDepth && !skipRecursion) @@ -1107,6 +1270,8 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& /* Step 2. Evaluate each of the 4 split sub-blocks in series */ if (mightSplit && !skipRecursion) { + if (bCtuInfoCheck && m_param->bCTUInfo & 2) + qp = int((1 / 0.96) * qp + 0.5); Mode* splitPred = &md.pred[PRED_SPLIT]; splitPred->initCosts(); CUData* splitCU = &splitPred->cu; @@ -1162,7 +1327,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& * 2 3 */ uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; /* Step 3. 
Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */ - if (mightNotSplit && depth >= minDepth) + if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode))) { if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0) setLambdaFromQP(parentCTU, qp); @@ -1346,7 +1511,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& } } } - bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE; + bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck); if (m_param->rdLevel >= 3) { /* Calculate RD cost of best inter option */ @@ -1584,10 +1749,19 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); + bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; bool skipRecursion = false; bool skipModes = false; bool splitIntra = true; bool skipRectAmp = false; + bool bCtuInfoCheck = false; + int sameContentRef = 0; + + if (m_evaluateInter == 1) + { + skipRectAmp = !!md.bestMode; + mightSplit &= false; + } // avoid uninitialize value in below reference if (m_param->limitModes) @@ -1607,7 +1781,58 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& splitData[3].initSplitCUData(); uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs; uint32_t refMasks[2]; - if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_param->analysisRefineLevel > 1) + if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx]) + { + if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx]) + sameContentRef = findSameContentRefCount(parentCTU, cuGeom); + 
if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx]) + { + mightNotSplit &= bDecidedDepth; + bCtuInfoCheck = skipRecursion = false; + skipModes = true; + } + else if (mightNotSplit && bDecidedDepth) + { + if (m_additionalCtuInfo[cuGeom.absPartIdx]) + { + bCtuInfoCheck = skipRecursion = true; + refMasks[0] = allSplitRefs; + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); + checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); + checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); + if (!sameContentRef) + { + if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)) + { + qp -= int32_t(0.04 * qp); + setLambdaFromQP(parentCTU, qp); + } + if (m_param->bCTUInfo & 4) + skipModes = false; + } + if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4))) + { + if (m_param->rdLevel) + skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); + if ((m_param->bCTUInfo & 4) && sameContentRef) + skipModes = md.bestMode && true; + } + } + else + { + md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); + md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + skipModes = !!m_param->bEnableEarlySkip && md.bestMode; + refMasks[0] = allSplitRefs; + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); + checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks); + checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); + } + mightSplit &= !bDecidedDepth; + } + } + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { @@ -1625,7 +1850,7 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode) skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0); } - 
if (m_param->analysisRefineLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N) + if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N) skipRectAmp = true && !!md.bestMode; } } @@ -1653,7 +1878,7 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& } /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */ - if (mightNotSplit && !md.bestMode) + if (mightNotSplit && !md.bestMode && !bCtuInfoCheck) { md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); @@ -1672,6 +1897,8 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& /* Step 2. Evaluate each of the 4 split sub-blocks in series */ if (mightSplit && !skipRecursion) { + if (bCtuInfoCheck && m_param->bCTUInfo & 2) + qp = int((1 / 0.96) * qp + 0.5); Mode* splitPred = &md.pred[PRED_SPLIT]; splitPred->initCosts(); CUData* splitCU = &splitPred->cu; @@ -1908,7 +2135,7 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& } } - if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) + if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck)) { if (!m_param->limitReferences || splitIntra) { @@ -2008,10 +2235,14 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q ModeDepth& md = m_modeDepth[depth]; md.bestMode = NULL; + m_evaluateInter = 0; bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; + int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) + && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP); + if (bDecidedDepth) { 
setLambdaFromQP(parentCTU, qp, lqp); @@ -2022,8 +2253,11 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]; if (parentCTU.isIntra(cuGeom.absPartIdx)) { - memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); - memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + if (m_param->intraRefine != 2 || parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] <= 1) + { + memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions); + } checkIntra(mode, cuGeom, size); } else @@ -2033,20 +2267,22 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q for (uint32_t part = 0; part < numPU; part++) { PredictionUnit pu(mode.cu, cuGeom, part); - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx; mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part]; mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part); - for (int dir = 0; dir < m_slice->isInterB() + 1; dir++) + for (int list = 0; list < m_slice->isInterB() + 1; list++) { - mode.cu.setPUMv(dir, interDataCTU->mv[dir][cuIdx + part], pu.puAbsPartIdx, part); - mode.cu.setPURefIdx(dir, interDataCTU->refIdx[dir][cuIdx + part], pu.puAbsPartIdx, part); - mode.cu.m_mvpIdx[dir][pu.puAbsPartIdx] = interDataCTU->mvpIdx[dir][cuIdx + part]; + mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part], pu.puAbsPartIdx, part); + mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part); + 
mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part]; } if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx]) { + if (m_param->mvRefine) + m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false); //AMVP MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2]; mode.cu.getNeighbourMV(part, pu.puAbsPartIdx, mode.interNeighbours); @@ -2057,14 +2293,31 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q continue; mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc); MV mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]; + if (m_param->mvRefine) + { + MV outmv; + searchMV(mode, pu, list, ref, outmv); + mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part); + } mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mvp; } } + else if(m_param->scaleFactor) + { + MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists + uint8_t candDir[MRG_MAX_NUM_CANDS]; + mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir); + uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx]; + mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part); + mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part); + mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part); + mode.cu.setPURefIdx(0, (int8_t)candMvField[mvpIdx][0].refIdx, pu.puAbsPartIdx, part); + mode.cu.setPURefIdx(1, (int8_t)candMvField[mvpIdx][1].refIdx, pu.puAbsPartIdx, part); + } } motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)); } - - if (parentCTU.isSkipped(cuGeom.absPartIdx)) + if (!m_param->interRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) encodeResAndCalcRdSkipCU(mode); else encodeResAndCalcRdInterCU(mode, cuGeom); @@ -2083,11 +2336,18 @@ void Analysis::recodeCU(const CUData& 
parentCTU, const CUGeom& cuGeom, int32_t q if (mightSplit && m_param->rdLevel < 5) checkDQPForSplitPred(*md.bestMode, cuGeom); + + if (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)) + { + m_evaluateInter = 1; + m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp); + } } - else + if (!bDecidedDepth || split) { Mode* splitPred = &md.pred[PRED_SPLIT]; - md.bestMode = splitPred; + if (!split) + md.bestMode = splitPred; splitPred->initCosts(); CUData* splitCU = &splitPred->cu; splitCU->initSubCU(parentCTU, cuGeom, qp); @@ -2109,8 +2369,12 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth) nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom)); - int lamdaQP = m_param->analysisRefineLevel == 10 ? nextQP : lqp; - qprdRefine(parentCTU, childGeom, nextQP, lamdaQP); + int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp; + + if (split) + m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP); + else + qprdRefine(parentCTU, childGeom, nextQP, lamdaQP); // Save best CU and pred data for this sub CU splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx); @@ -2131,6 +2395,14 @@ void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t q else updateModeCost(*splitPred); + if (m_param->interRefine) + { + if (m_param->rdLevel > 1) + checkBestMode(*splitPred, cuGeom.depth); + else if (splitPred->sa8dCost < md.bestMode->sa8dCost) + md.bestMode = splitPred; + } + checkDQPForSplitPred(*splitPred, cuGeom); /* Copy best data to encData CTU and recon */ @@ -2174,7 +2446,7 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe int safeX, maxSafeMv; if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE) { - safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3; + safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4; } for (uint32_t i = 0; i < numMergeCand; ++i) @@ -2200,7 +2472,7 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe } if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && - tempPred->cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirEndCol && + tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol && candMvField[i][0].mv.x > maxSafeMv) // skip merge candidates which reference beyond safe reference area continue; @@ -2304,7 +2576,7 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe int safeX, maxSafeMv; if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE) { - safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3; + safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; maxSafeMv = (safeX - 
tempPred->cu.m_cuPelX) * 4; } for (uint32_t i = 0; i < numMergeCand; i++) @@ -2345,7 +2617,7 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe triedBZero = true; } if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && - tempPred->cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirEndCol && + tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol && candMvField[i][0].mv.x > maxSafeMv) // skip merge candidates which reference beyond safe reference area continue; @@ -2420,7 +2692,7 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.cu.setPredModeSubParts(MODE_INTER); int numPredDir = m_slice->isInterP() ? 1 : 2; - if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2462,7 +2734,7 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize } interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits); - if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2484,7 +2756,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.cu.setPredModeSubParts(MODE_INTER); int numPredDir = m_slice->isInterP() ? 
1 : 2; - if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2518,7 +2790,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize /* predInterSearch sets interMode.sa8dBits, but this is ignored */ encodeResAndCalcRdInterCU(interMode, cuGeom); - if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2671,7 +2943,7 @@ void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) { - if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < g_maxCUDepth) + if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < ctu.m_encData->m_param->maxCUDepth) { for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { @@ -2970,7 +3242,7 @@ int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int3 uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr; - uint32_t blockSize = g_maxCUSize >> cuGeom.depth; + uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth; double qp_offset = 0; uint32_t cnt = 0; uint32_t idx; @@ -3064,3 +3336,22 @@ void Analysis::calculateNormFactor(CUData& ctu, int qp) normFactor(srcV, blockSizeC, ctu, qp, TEXT_CHROMA_V); } } + +int 
Analysis::findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom) +{ + int sameContentRef = 0; + int m_curPoc = parentCTU.m_slice->m_poc; + int prevChange = m_prevCtuInfoChange[cuGeom.absPartIdx]; + int numPredDir = m_slice->isInterP() ? 1 : 2; + for (int list = 0; list < numPredDir; list++) + { + for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++) + { + int refPoc = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_poc; + int refPrevChange = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_addOnPrevChange[parentCTU.m_cuAddr][cuGeom.absPartIdx]; + if ((refPoc < prevChange && refPoc < m_curPoc) || (refPoc > m_curPoc && prevChange < m_curPoc && refPrevChange > m_curPoc) || ((refPoc == prevChange) && (m_additionalCtuInfo[cuGeom.absPartIdx] == CTU_INFO_CHANGE))) + sameContentRef++; /* Content changed */ + } + } + return sameContentRef; +} diff --git a/source/encoder/analysis.h b/source/encoder/analysis.h index 44f38f1853..077db0ced7 100644 --- a/source/encoder/analysis.h +++ b/source/encoder/analysis.h @@ -137,6 +137,10 @@ class Analysis : public Search int* m_multipassMvpIdx[2]; int32_t* m_multipassRef[2]; uint8_t* m_multipassModes; + + uint8_t m_evaluateInter; + uint8_t* m_additionalCtuInfo; + int* m_prevCtuInfoChange; /* refine RD based on QP for rd-levels 5 and 6 */ void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp); @@ -178,6 +182,9 @@ class Analysis : public Search void calculateNormFactor(CUData& ctu, int qp); void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype); + + void collectPUStatistics(const CUData& ctu, const CUGeom& cuGeom); + /* check whether current mode is the new best */ inline void checkBestMode(Mode& mode, uint32_t depth) { @@ -190,6 +197,7 @@ class Analysis : public Search else md.bestMode = &mode; } + int findSameContentRefCount(const CUData& parentCTU, const CUGeom& cuGeom); }; struct ThreadLocalData diff --git 
a/source/encoder/api.cpp b/source/encoder/api.cpp index d38ba81d37..85fb8936d8 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -30,6 +30,7 @@ #include "level.h" #include "nal.h" #include "bitcost.h" +#include "x265-extras.h" /* multilib namespace reflectors */ #if LINKED_8BIT @@ -96,9 +97,6 @@ x265_encoder *x265_encoder_open(x265_param *p) if (x265_check_params(param)) goto fail; - if (x265_set_globals(param)) - goto fail; - encoder = new Encoder; if (!param->rc.bEnableSlowFirstPass) PARAM_NS::x265_param_apply_fastfirstpass(param); @@ -119,6 +117,17 @@ x265_encoder *x265_encoder_open(x265_param *p) } encoder->create(); + /* Try to open CSV file handle */ + if (encoder->m_param->csvfn) + { + encoder->m_param->csvfpt = x265_csvlog_open(*encoder->m_param, encoder->m_param->csvfn, encoder->m_param->csvLogLevel); + if (!encoder->m_param->csvfpt) + { + x265_log(encoder->m_param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", encoder->m_param->csvfn); + encoder->m_aborted = true; + } + } + encoder->m_latestParam = latestParam; memcpy(latestParam, param, sizeof(x265_param)); if (encoder->m_aborted) @@ -144,7 +153,10 @@ int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal) if (encoder->m_param->rc.bStatRead && encoder->m_param->bMultiPassOptRPS) { if (!encoder->computeSPSRPSIndex()) + { + encoder->m_aborted = true; return -1; + } } encoder->getStreamHeaders(encoder->m_nalList, sbacCoder, bs); *pp_nal = &encoder->m_nalList.m_nal[0]; @@ -152,6 +164,11 @@ int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal) return encoder->m_nalList.m_occupancy; } + if (enc) + { + Encoder *encoder = static_cast<Encoder*>(enc); + encoder->m_aborted = true; + } return -1; } @@ -251,6 +268,12 @@ int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, else if (pi_nal) *pi_nal = 0; + if (numEncoded && encoder->m_param->csvLogLevel) + x265_csvlog_frame(encoder->m_param->csvfpt, 
*encoder->m_param, *pic_out, encoder->m_param->csvLogLevel); + + if (numEncoded < 0) + encoder->m_aborted = true; + return numEncoded; } @@ -263,12 +286,17 @@ void x265_encoder_get_stats(x265_encoder *enc, x265_stats *outputStats, uint32_t } } -void x265_encoder_log(x265_encoder* enc, int, char **) +void x265_encoder_log(x265_encoder* enc, int argc, char **argv) { if (enc) { Encoder *encoder = static_cast<Encoder*>(enc); - x265_log(encoder->m_param, X265_LOG_WARNING, "x265_encoder_log is now deprecated\n"); + x265_stats stats; + int padx = encoder->m_sps.conformanceWindow.rightOffset; + int pady = encoder->m_sps.conformanceWindow.bottomOffset; + encoder->fetchStats(&stats, sizeof(stats)); + const x265_api * api = x265_api_get(0); + x265_csvlog_encode(encoder->m_param->csvfpt, api->version_str, *encoder->m_param, padx, pady, stats, encoder->m_param->csvLogLevel, argc, argv); } } @@ -282,7 +310,6 @@ void x265_encoder_close(x265_encoder *enc) encoder->printSummary(); encoder->destroy(); delete encoder; - ATOMIC_DEC(&g_ctuSizeConfigured); } } @@ -295,14 +322,18 @@ int x265_encoder_intra_refresh(x265_encoder *enc) encoder->m_bQueuedIntraRefresh = 1; return 0; } +int x265_encoder_ctu_info(x265_encoder *enc, int poc, x265_ctu_info_t** ctu) +{ + if (!ctu || !enc) + return -1; + Encoder* encoder = static_cast<Encoder*>(enc); + encoder->copyCtuInfo(ctu, poc); + return 0; +} void x265_cleanup(void) { - if (!g_ctuSizeConfigured) - { - BitCost::destroy(); - CUData::s_partSet[0] = NULL; /* allow CUData to adjust to new CTU size */ - } + BitCost::destroy(); } x265_picture *x265_picture_alloc() @@ -321,14 +352,14 @@ void x265_picture_init(x265_param *param, x265_picture *pic) pic->userSEI.payloads = NULL; pic->userSEI.numPayloads = 0; - if (param->analysisMode) + if (param->analysisReuseMode) { - uint32_t widthInCU = (param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; - uint32_t heightInCU = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t widthInCU = 
(param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize; + uint32_t heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; pic->analysisData.numCUsInFrame = numCUsInFrame; - pic->analysisData.numPartitions = NUM_4x4_PARTITIONS; + pic->analysisData.numPartitions = param->num4x4Partitions; } } @@ -372,6 +403,7 @@ static const x265_api libapi = sizeof(x265_frame_stats), &x265_encoder_intra_refresh, + &x265_encoder_ctu_info, }; typedef const x265_api* (*api_get_func)(int bitDepth); diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp index 3a8fef5235..c225cf3eff 100644 --- a/source/encoder/dpb.cpp +++ b/source/encoder/dpb.cpp @@ -105,6 +105,23 @@ void DPB::recycleUnreferenced() } } + if (curFrame->m_ctuInfo != NULL) + { + uint32_t widthInCU = (curFrame->m_param->sourceWidth + curFrame->m_param->maxCUSize - 1) >> curFrame->m_param->maxLog2CUSize; + uint32_t heightInCU = (curFrame->m_param->sourceHeight + curFrame->m_param->maxCUSize - 1) >> curFrame->m_param->maxLog2CUSize; + uint32_t numCUsInFrame = widthInCU * heightInCU; + for (uint32_t i = 0; i < numCUsInFrame; i++) + { + X265_FREE((*curFrame->m_ctuInfo + i)->ctuInfo); + (*curFrame->m_ctuInfo + i)->ctuInfo = NULL; + } + X265_FREE(*curFrame->m_ctuInfo); + *(curFrame->m_ctuInfo) = NULL; + X265_FREE(curFrame->m_ctuInfo); + curFrame->m_ctuInfo = NULL; + X265_FREE(curFrame->m_prevCtuInfoChange); + curFrame->m_prevCtuInfoChange = NULL; + } curFrame->m_encData = NULL; curFrame->m_reconPic = NULL; } @@ -187,7 +204,7 @@ void DPB::prepareEncode(Frame *newFrame) } // Disable Loopfilter in bound area, because we will do slice-parallelism in future - slice->m_sLFaseFlag = (g_maxSlices > 1) ? false : ((SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0); + slice->m_sLFaseFlag = (newFrame->m_param->maxSlices > 1) ? 
false : ((SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0); /* Increment reference count of all motion-referenced frames to prevent them * from being recycled. These counts are decremented at the end of diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 9aea032fd8..0709d0df5b 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -86,8 +86,10 @@ Encoder::Encoder() m_frameEncoder[i] = NULL; MotionEstimate::initScales(); -#if ENABLE_DYNAMIC_HDR10 +#if ENABLE_HDR10_PLUS m_hdr10plus_api = hdr10plus_api_get(); + numCimInfo = 0; + cim = NULL; #endif m_prevTonemapPayload.payload = NULL; @@ -132,26 +134,19 @@ void Encoder::create() if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation && !p->lookaheadSlices) allowPools = false; - if (!p->frameNumThreads) - { - // auto-detect frame threads - int cpuCount = ThreadPool::getCpuCount(); - if (!p->bEnableWavefront) - p->frameNumThreads = X265_MIN3(cpuCount, (rows + 1) / 2, X265_MAX_FRAME_THREADS); - else if (cpuCount >= 32) - p->frameNumThreads = (p->sourceHeight > 2000) ? 
8 : 6; // dual-socket 10-core IvyBridge or higher - else if (cpuCount >= 16) - p->frameNumThreads = 5; // 8 HT cores, or dual socket - else if (cpuCount >= 8) - p->frameNumThreads = 3; // 4 HT cores - else if (cpuCount >= 4) - p->frameNumThreads = 2; // Dual or Quad core - else - p->frameNumThreads = 1; - } m_numPools = 0; if (allowPools) m_threadPool = ThreadPool::allocThreadPools(p, m_numPools, 0); + else + { + if (!p->frameNumThreads) + { + // auto-detect frame threads + int cpuCount = ThreadPool::getCpuCount(); + ThreadPool::getFrameThreadsCount(p, cpuCount); + } + } + if (!m_numPools) { // issue warnings if any of these features were requested @@ -320,8 +315,8 @@ void Encoder::create() else m_scalingList.setupQuantMatrices(m_sps.chromaFormatIdc); - int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; - int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + int numRows = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; + int numCols = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; for (int i = 0; i < m_param->frameNumThreads; i++) { if (!m_frameEncoder[i]->init(this, numRows, numCols)) @@ -346,12 +341,12 @@ void Encoder::create() initRefIdx(); - if (m_param->analysisMode) + if (m_param->analysisReuseMode) { - const char* name = m_param->analysisFileName; + const char* name = m_param->analysisReuseFileName; if (!name) name = defaultAnalysisFileName; - const char* mode = m_param->analysisMode == X265_ANALYSIS_LOAD ? "rb" : "wb"; + const char* mode = m_param->analysisReuseMode == X265_ANALYSIS_LOAD ? 
"rb" : "wb"; m_analysisFile = x265_fopen(name, mode); if (!m_analysisFile) { @@ -362,7 +357,7 @@ void Encoder::create() if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion) { - const char* name = m_param->analysisFileName; + const char* name = m_param->analysisReuseFileName; if (!name) name = defaultAnalysisFileName; if (m_param->rc.bStatWrite) @@ -431,6 +426,10 @@ void Encoder::stopJobs() void Encoder::destroy() { +#if ENABLE_HDR10_PLUS + m_hdr10plus_api->hdr10plus_clear_movie(cim, numCimInfo); +#endif + if (m_exportedPic) { ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); @@ -482,7 +481,7 @@ void Encoder::destroy() { int bError = 1; fclose(m_analysisFileOut); - const char* name = m_param->analysisFileName; + const char* name = m_param->analysisReuseFileName; if (!name) name = defaultAnalysisFileName; char* temp = strcatFilename(name, ".temp"); @@ -499,11 +498,14 @@ void Encoder::destroy() } if (m_param) { + if (m_param->csvfpt) + fclose(m_param->csvfpt); /* release string arguments that were strdup'd */ free((char*)m_param->rc.lambdaFileName); free((char*)m_param->rc.statFileName); - free((char*)m_param->analysisFileName); + free((char*)m_param->analysisReuseFileName); free((char*)m_param->scalingLists); + free((char*)m_param->csvfn); free((char*)m_param->numaPools); free((char*)m_param->masteringDisplayColorVolume); free((char*)m_param->toneMapFile); @@ -518,7 +520,7 @@ void Encoder::updateVbvPlan(RateControl* rc) FrameEncoder *encoder = m_frameEncoder[i]; if (encoder->m_rce.isActive && encoder->m_rce.poc != rc->m_curSlice->m_poc) { - int64_t bits = (int64_t) X265_MAX(encoder->m_rce.frameSizeEstimated, encoder->m_rce.frameSizePlanned); + int64_t bits = m_param->rc.bEnableConstVbv ? 
(int64_t)encoder->m_rce.frameSizePlanned : (int64_t)X265_MAX(encoder->m_rce.frameSizeEstimated, encoder->m_rce.frameSizePlanned); rc->m_bufferFill -= bits; rc->m_bufferFill = X265_MAX(rc->m_bufferFill, 0); rc->m_bufferFill += encoder->m_rce.bufferRate; @@ -593,6 +595,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) if (m_exportedPic) { + if (!m_param->bUseAnalysisFile && m_param->analysisReuseMode == X265_ANALYSIS_SAVE) + freeAnalysis(&m_exportedPic->m_analysisData); ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); m_exportedPic = NULL; m_dpb->recycleUnreferenced(); @@ -601,16 +605,22 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) { x265_sei_payload toneMap; toneMap.payload = NULL; -#if ENABLE_DYNAMIC_HDR10 +#if ENABLE_HDR10_PLUS if (m_bToneMap) { - uint8_t *cim = NULL; - if (m_hdr10plus_api->hdr10plus_json_to_frame_cim(m_param->toneMapFile, pic_in->poc, cim)) + if (pic_in->poc == 0) + numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, cim); + if (pic_in->poc < numCimInfo) { - toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * cim[0]); - toneMap.payloadSize = cim[0]; + int32_t i = 0; + toneMap.payloadSize = 0; + while (cim[pic_in->poc][i] == 0xFF) + toneMap.payloadSize += cim[pic_in->poc][i++]; + toneMap.payloadSize += cim[pic_in->poc][i++]; + + toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize); toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35; - memcpy(toneMap.payload, cim, toneMap.payloadSize); + memcpy(toneMap.payload, cim[pic_in->poc] + i, toneMap.payloadSize); } } #endif @@ -708,7 +718,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) for (int i = 0; i < numPayloads; i++) { x265_sei_payload input; - if (i == (numPayloads - 1)) + if ((i == (numPayloads - 1)) && toneMapEnable) input = toneMap; else input = pic_in->userSEI.payloads[i]; @@ -754,24 +764,40 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* 
pic_out) /* In analysisSave mode, x265_analysis_data is allocated in pic_in and inFrame points to this */ /* Load analysis data before lookahead->addPicture, since sliceType has been decided */ - if (m_param->analysisMode == X265_ANALYSIS_LOAD) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD) { - x265_picture* inputPic = const_cast<x265_picture*>(pic_in); /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */ - readAnalysisFile(&inputPic->analysisData, inFrame->m_poc); - inFrame->m_analysisData.poc = inFrame->m_poc; - inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType; - inFrame->m_analysisData.bScenecut = inputPic->analysisData.bScenecut; - inFrame->m_analysisData.satdCost = inputPic->analysisData.satdCost; - inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame; - inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions; - inFrame->m_analysisData.wt = inputPic->analysisData.wt; - inFrame->m_analysisData.interData = inputPic->analysisData.interData; - inFrame->m_analysisData.intraData = inputPic->analysisData.intraData; - sliceType = inputPic->analysisData.sliceType; + readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in); + sliceType = inFrame->m_analysisData.sliceType; inFrame->m_lowres.bScenecut = !!inFrame->m_analysisData.bScenecut; inFrame->m_lowres.satdCost = inFrame->m_analysisData.satdCost; } + if (m_param->bUseRcStats && pic_in->rcData) + { + RcStats* rc = (RcStats*)pic_in->rcData; + m_rateControl->m_accumPQp = rc->cumulativePQp; + m_rateControl->m_accumPNorm = rc->cumulativePNorm; + m_rateControl->m_isNextGop = true; + for (int j = 0; j < 3; j++) + m_rateControl->m_lastQScaleFor[j] = rc->lastQScaleFor[j]; + m_rateControl->m_wantedBitsWindow = rc->wantedBitsWindow; + m_rateControl->m_cplxrSum = rc->cplxrSum; + m_rateControl->m_totalBits = rc->totalBits; + m_rateControl->m_encodedBits = rc->encodedBits; + 
m_rateControl->m_shortTermCplxSum = rc->shortTermCplxSum; + m_rateControl->m_shortTermCplxCount = rc->shortTermCplxCount; + if (m_rateControl->m_isVbv) + { + m_rateControl->m_bufferFillFinal = rc->bufferFillFinal; + for (int i = 0; i < 4; i++) + { + m_rateControl->m_pred[i].coeff = rc->coeff[i]; + m_rateControl->m_pred[i].count = rc->count[i]; + m_rateControl->m_pred[i].offset = rc->offset[i]; + } + } + m_param->bUseRcStats = 0; + } if (m_reconfigureRc) inFrame->m_reconfigureRc = true; @@ -805,7 +831,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) x265_frame_stats* frameData = NULL; /* Free up pic_in->analysisData since it has already been used */ - if (m_param->analysisMode == X265_ANALYSIS_LOAD) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD) freeAnalysis(&outFrame->m_analysisData); if (pic_out) @@ -819,20 +845,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) pic_out->pts = outFrame->m_pts; pic_out->dts = outFrame->m_dts; - - switch (slice->m_sliceType) - { - case I_SLICE: - pic_out->sliceType = outFrame->m_lowres.bKeyframe ? 
X265_TYPE_IDR : X265_TYPE_I; - break; - case P_SLICE: - pic_out->sliceType = X265_TYPE_P; - break; - case B_SLICE: - pic_out->sliceType = X265_TYPE_B; - break; - } - + pic_out->sliceType = outFrame->m_lowres.sliceType; pic_out->planes[0] = recpic->m_picOrg[0]; pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel)); if (m_param->internalCsp != X265_CSP_I400) @@ -844,7 +857,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) } /* Dump analysis data from pic_out to file in save mode and free */ - if (m_param->analysisMode == X265_ANALYSIS_SAVE) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE) { pic_out->analysisData.poc = pic_out->poc; pic_out->analysisData.sliceType = pic_out->sliceType; @@ -856,7 +869,8 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) pic_out->analysisData.interData = outFrame->m_analysisData.interData; pic_out->analysisData.intraData = outFrame->m_analysisData.intraData; writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData); - freeAnalysis(&pic_out->analysisData); + if (m_param->bUseAnalysisFile) + freeAnalysis(&pic_out->analysisData); } } if (m_param->rc.bStatWrite && (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)) @@ -1012,16 +1026,17 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) Slice* slice = frameEnc->m_encData->m_slice; slice->m_sps = &m_sps; slice->m_pps = &m_pps; + slice->m_param = m_param; slice->m_maxNumMergeCand = m_param->maxNumMergeCand; - slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS); + slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * m_param->num4x4Partitions); } if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != X265_TYPE_B) { - int padX = g_maxCUSize + 32; - int padY = g_maxCUSize + 16; - uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; - int maxHeight = numCuInHeight * 
g_maxCUSize; + int padX = m_param->maxCUSize + 32; + int padY = m_param->maxCUSize + 16; + uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; + int maxHeight = numCuInHeight * m_param->maxCUSize; for (int i = 0; i < INTEGRAL_PLANE_NUM; i++) { frameEnc->m_encData->m_meBuffer[i] = X265_MALLOC(uint32_t, frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY))); @@ -1080,17 +1095,17 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) frameEnc->m_dts = frameEnc->m_reorderedPts; /* Allocate analysis data before encode in save mode. This is allocated in frameEnc */ - if (m_param->analysisMode == X265_ANALYSIS_SAVE) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE) { x265_analysis_data* analysis = &frameEnc->m_analysisData; analysis->poc = frameEnc->m_poc; analysis->sliceType = frameEnc->m_lowres.sliceType; - uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; - uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; analysis->numCUsInFrame = numCUsInFrame; - analysis->numPartitions = NUM_4x4_PARTITIONS; + analysis->numPartitions = m_param->num4x4Partitions; allocAnalysis(analysis); } /* determine references, setup RPS, etc */ @@ -1157,6 +1172,120 @@ int Encoder::reconfigureParam(x265_param* encParam, x265_param* param) return x265_check_params(encParam); } +void Encoder::copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc) +{ + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + Frame* curFrame; + Frame* prevFrame = NULL; + int32_t* 
frameCTU; + uint32_t numCUsInFrame = widthInCU * heightInCU; + uint32_t maxNum8x8Partitions = 64; + bool copied = false; + do + { + curFrame = m_lookahead->m_inputQueue.getPOC(poc); + if (!curFrame) + curFrame = m_lookahead->m_outputQueue.getPOC(poc); + + if (poc > 0) + { + prevFrame = m_lookahead->m_inputQueue.getPOC(poc - 1); + if (!prevFrame) + prevFrame = m_lookahead->m_outputQueue.getPOC(poc - 1); + if (!prevFrame) + { + FrameEncoder* prevEncoder; + for (int i = 0; i < m_param->frameNumThreads; i++) + { + prevEncoder = m_frameEncoder[i]; + prevFrame = prevEncoder->m_frame; + if (prevFrame && (prevEncoder->m_frame->m_poc == poc - 1)) + { + prevFrame = prevEncoder->m_frame; + break; + } + } + } + } + x265_ctu_info_t* ctuTemp, *prevCtuTemp; + if (curFrame) + { + if (!curFrame->m_ctuInfo) + CHECKED_MALLOC(curFrame->m_ctuInfo, x265_ctu_info_t*, 1); + CHECKED_MALLOC(*curFrame->m_ctuInfo, x265_ctu_info_t, numCUsInFrame); + CHECKED_MALLOC_ZERO(curFrame->m_prevCtuInfoChange, int, numCUsInFrame * maxNum8x8Partitions); + for (uint32_t i = 0; i < numCUsInFrame; i++) + { + ctuTemp = *curFrame->m_ctuInfo + i; + CHECKED_MALLOC(frameCTU, int32_t, maxNum8x8Partitions); + ctuTemp->ctuInfo = (int32_t*)frameCTU; + ctuTemp->ctuAddress = frameCtuInfo[i]->ctuAddress; + memcpy(ctuTemp->ctuPartitions, frameCtuInfo[i]->ctuPartitions, sizeof(int32_t) * maxNum8x8Partitions); + memcpy(ctuTemp->ctuInfo, frameCtuInfo[i]->ctuInfo, sizeof(int32_t) * maxNum8x8Partitions); + if (prevFrame && curFrame->m_poc > 1) + { + prevCtuTemp = *prevFrame->m_ctuInfo + i; + for (uint32_t j = 0; j < maxNum8x8Partitions; j++) + curFrame->m_prevCtuInfoChange[i * maxNum8x8Partitions + j] = (*((int32_t *)prevCtuTemp->ctuInfo + j) == 2) ? 
(poc - 1) : prevFrame->m_prevCtuInfoChange[i * maxNum8x8Partitions + j]; + } + } + copied = true; + curFrame->m_copied.trigger(); + } + else + { + FrameEncoder* curEncoder; + for (int i = 0; i < m_param->frameNumThreads; i++) + { + curEncoder = m_frameEncoder[i]; + curFrame = curEncoder->m_frame; + if (curFrame) + { + if (poc == curFrame->m_poc) + { + if (!curFrame->m_ctuInfo) + CHECKED_MALLOC(curFrame->m_ctuInfo, x265_ctu_info_t*, 1); + CHECKED_MALLOC(*curFrame->m_ctuInfo, x265_ctu_info_t, numCUsInFrame); + CHECKED_MALLOC_ZERO(curFrame->m_prevCtuInfoChange, int, numCUsInFrame * maxNum8x8Partitions); + for (uint32_t l = 0; l < numCUsInFrame; l++) + { + ctuTemp = *curFrame->m_ctuInfo + l; + CHECKED_MALLOC(frameCTU, int32_t, maxNum8x8Partitions); + ctuTemp->ctuInfo = (int32_t*)frameCTU; + ctuTemp->ctuAddress = frameCtuInfo[l]->ctuAddress; + memcpy(ctuTemp->ctuPartitions, frameCtuInfo[l]->ctuPartitions, sizeof(int32_t) * maxNum8x8Partitions); + memcpy(ctuTemp->ctuInfo, frameCtuInfo[l]->ctuInfo, sizeof(int32_t) * maxNum8x8Partitions); + if (prevFrame && curFrame->m_poc > 1) + { + prevCtuTemp = *prevFrame->m_ctuInfo + l; + for (uint32_t j = 0; j < maxNum8x8Partitions; j++) + curFrame->m_prevCtuInfoChange[l * maxNum8x8Partitions + j] = (*((int32_t *)prevCtuTemp->ctuInfo + j) == CTU_INFO_CHANGE) ? 
(poc - 1) : prevFrame->m_prevCtuInfoChange[l * maxNum8x8Partitions + j]; + } + } + copied = true; + curFrame->m_copied.trigger(); + break; + } + } + } + } + } while (!copied); + return; +fail: + for (uint32_t i = 0; i < numCUsInFrame; i++) + { + X265_FREE((*curFrame->m_ctuInfo + i)->ctuInfo); + (*curFrame->m_ctuInfo + i)->ctuInfo = NULL; + } + X265_FREE(*curFrame->m_ctuInfo); + *(curFrame->m_ctuInfo) = NULL; + X265_FREE(curFrame->m_ctuInfo); + curFrame->m_ctuInfo = NULL; + X265_FREE(curFrame->m_prevCtuInfoChange); + curFrame->m_prevCtuInfoChange = NULL; +} + void EncStats::addPsnr(double psnrY, double psnrU, double psnrV) { m_psnrSumY += psnrY; @@ -1286,7 +1415,7 @@ void Encoder::printSummary() /* Summarize stats from all frame encoders */ CUStats cuStats; for (int i = 0; i < m_param->frameNumThreads; i++) - cuStats.accumulate(m_frameEncoder[i]->m_cuStats); + cuStats.accumulate(m_frameEncoder[i]->m_cuStats, *m_param); if (!cuStats.totalCTUTime) return; @@ -1307,7 +1436,7 @@ void Encoder::printSummary() int64_t interRDOTotalTime = 0, intraRDOTotalTime = 0; uint64_t interRDOTotalCount = 0, intraRDOTotalCount = 0; - for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) { interRDOTotalTime += cuStats.interRDOElapsedTime[i]; intraRDOTotalTime += cuStats.intraRDOElapsedTime[i]; @@ -1417,7 +1546,7 @@ void Encoder::printSummary() } x265_log(m_param, X265_LOG_INFO, "CU: " X265_LL " %dX%d CTUs compressed in %.3lf seconds, %.3lf CTUs per worker-second\n", - cuStats.totalCTUs, g_maxCUSize, g_maxCUSize, + cuStats.totalCTUs, m_param->maxCUSize, m_param->maxCUSize, ELAPSED_SEC(totalWorkerTime), cuStats.totalCTUs / ELAPSED_SEC(totalWorkerTime)); @@ -1578,6 +1707,8 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_f frameStats->qp = curEncData.m_avgQpAq; frameStats->bits = bits; frameStats->bScenecut = curFrame->m_lowres.bScenecut; + if (m_param->csvLogLevel >= 2) + frameStats->ipCostRatio = 
curFrame->m_lowres.ipCostRatio; frameStats->bufferFill = m_rateControl->m_bufferFillActual; frameStats->frameLatency = inPoc - poc; if (m_param->rc.rateControlMode == X265_RC_CRF) @@ -1602,35 +1733,83 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_f #define ELAPSED_MSEC(start, end) (((double)(end) - (start)) / 1000) - frameStats->decideWaitTime = ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime); - frameStats->row0WaitTime = ELAPSED_MSEC(curEncoder->m_startCompressTime, curEncoder->m_row0WaitTime); - frameStats->wallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_endCompressTime); - frameStats->refWaitWallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_allRowsAvailableTime); - frameStats->totalCTUTime = ELAPSED_MSEC(0, curEncoder->m_totalWorkerElapsedTime); - frameStats->stallTime = ELAPSED_MSEC(0, curEncoder->m_totalNoWorkerTime); - frameStats->totalFrameTime = ELAPSED_MSEC(curFrame->m_encodeStartTime, x265_mdate()); - if (curEncoder->m_totalActiveWorkerCount) - frameStats->avgWPP = (double)curEncoder->m_totalActiveWorkerCount / curEncoder->m_activeWorkerCountSamples; - else - frameStats->avgWPP = 1; - frameStats->countRowBlocks = curEncoder->m_countRowBlocks; - - frameStats->cuStats.percentIntraNxN = curFrame->m_encData->m_frameStats.percentIntraNxN; - frameStats->avgChromaDistortion = curFrame->m_encData->m_frameStats.avgChromaDistortion; - frameStats->avgLumaDistortion = curFrame->m_encData->m_frameStats.avgLumaDistortion; - frameStats->avgPsyEnergy = curFrame->m_encData->m_frameStats.avgPsyEnergy; - frameStats->avgResEnergy = curFrame->m_encData->m_frameStats.avgResEnergy; - frameStats->avgLumaLevel = curFrame->m_fencPic->m_avgLumaLevel; - frameStats->maxLumaLevel = curFrame->m_fencPic->m_maxLumaLevel; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) - { - frameStats->cuStats.percentSkipCu[depth] = curFrame->m_encData->m_frameStats.percentSkipCu[depth]; - 
frameStats->cuStats.percentMergeCu[depth] = curFrame->m_encData->m_frameStats.percentMergeCu[depth]; - frameStats->cuStats.percentInterDistribution[depth][0] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][0]; - frameStats->cuStats.percentInterDistribution[depth][1] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][1]; - frameStats->cuStats.percentInterDistribution[depth][2] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][2]; - for (int n = 0; n < INTRA_MODES; n++) - frameStats->cuStats.percentIntraDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentIntraDistribution[depth][n]; + frameStats->maxLumaLevel = curFrame->m_fencPic->m_maxLumaLevel; + frameStats->minLumaLevel = curFrame->m_fencPic->m_minLumaLevel; + frameStats->avgLumaLevel = curFrame->m_fencPic->m_avgLumaLevel; + + if (m_param->csvLogLevel >= 2) + { + frameStats->decideWaitTime = ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime); + frameStats->row0WaitTime = ELAPSED_MSEC(curEncoder->m_startCompressTime, curEncoder->m_row0WaitTime); + frameStats->wallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_endCompressTime); + frameStats->refWaitWallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_allRowsAvailableTime); + frameStats->totalCTUTime = ELAPSED_MSEC(0, curEncoder->m_totalWorkerElapsedTime); + frameStats->stallTime = ELAPSED_MSEC(0, curEncoder->m_totalNoWorkerTime); + frameStats->totalFrameTime = ELAPSED_MSEC(curFrame->m_encodeStartTime, x265_mdate()); + if (curEncoder->m_totalActiveWorkerCount) + frameStats->avgWPP = (double)curEncoder->m_totalActiveWorkerCount / curEncoder->m_activeWorkerCountSamples; + else + frameStats->avgWPP = 1; + frameStats->countRowBlocks = curEncoder->m_countRowBlocks; + + frameStats->avgChromaDistortion = curFrame->m_encData->m_frameStats.avgChromaDistortion; + frameStats->avgLumaDistortion = curFrame->m_encData->m_frameStats.avgLumaDistortion; + frameStats->avgPsyEnergy = 
curFrame->m_encData->m_frameStats.avgPsyEnergy; + frameStats->avgResEnergy = curFrame->m_encData->m_frameStats.avgResEnergy; + + frameStats->maxChromaULevel = curFrame->m_fencPic->m_maxChromaULevel; + frameStats->minChromaULevel = curFrame->m_fencPic->m_minChromaULevel; + frameStats->avgChromaULevel = curFrame->m_fencPic->m_avgChromaULevel; + + frameStats->maxChromaVLevel = curFrame->m_fencPic->m_maxChromaVLevel; + frameStats->minChromaVLevel = curFrame->m_fencPic->m_minChromaVLevel; + frameStats->avgChromaVLevel = curFrame->m_fencPic->m_avgChromaVLevel; + + if (curFrame->m_encData->m_frameStats.totalPu[4] == 0) + frameStats->puStats.percentNxN = 0; + else + frameStats->puStats.percentNxN = (double)(curFrame->m_encData->m_frameStats.cnt4x4 / (double)curFrame->m_encData->m_frameStats.totalPu[4]) * 100; + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) + { + if (curFrame->m_encData->m_frameStats.totalPu[depth] == 0) + { + frameStats->puStats.percentSkipPu[depth] = 0; + frameStats->puStats.percentIntraPu[depth] = 0; + frameStats->puStats.percentAmpPu[depth] = 0; + for (int i = 0; i < INTER_MODES - 1; i++) + { + frameStats->puStats.percentInterPu[depth][i] = 0; + frameStats->puStats.percentMergePu[depth][i] = 0; + } + } + else + { + frameStats->puStats.percentSkipPu[depth] = (double)(curFrame->m_encData->m_frameStats.cntSkipPu[depth] / (double)curFrame->m_encData->m_frameStats.totalPu[depth]) * 100; + frameStats->puStats.percentIntraPu[depth] = (double)(curFrame->m_encData->m_frameStats.cntIntraPu[depth] / (double)curFrame->m_encData->m_frameStats.totalPu[depth]) * 100; + frameStats->puStats.percentAmpPu[depth] = (double)(curFrame->m_encData->m_frameStats.cntAmp[depth] / (double)curFrame->m_encData->m_frameStats.totalPu[depth]) * 100; + for (int i = 0; i < INTER_MODES - 1; i++) + { + frameStats->puStats.percentInterPu[depth][i] = (double)(curFrame->m_encData->m_frameStats.cntInterPu[depth][i] / (double)curFrame->m_encData->m_frameStats.totalPu[depth]) * 
100; + frameStats->puStats.percentMergePu[depth][i] = (double)(curFrame->m_encData->m_frameStats.cntMergePu[depth][i] / (double)curFrame->m_encData->m_frameStats.totalPu[depth]) * 100; + } + } + } + } + + if (m_param->csvLogLevel >= 1) + { + frameStats->cuStats.percentIntraNxN = curFrame->m_encData->m_frameStats.percentIntraNxN; + + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) + { + frameStats->cuStats.percentSkipCu[depth] = curFrame->m_encData->m_frameStats.percentSkipCu[depth]; + frameStats->cuStats.percentMergeCu[depth] = curFrame->m_encData->m_frameStats.percentMergeCu[depth]; + frameStats->cuStats.percentInterDistribution[depth][0] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][0]; + frameStats->cuStats.percentInterDistribution[depth][1] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][1]; + frameStats->cuStats.percentInterDistribution[depth][2] = curFrame->m_encData->m_frameStats.percentInterDistribution[depth][2]; + for (int n = 0; n < INTRA_MODES; n++) + frameStats->cuStats.percentIntraDistribution[depth][n] = curFrame->m_encData->m_frameStats.percentIntraDistribution[depth][n]; + } } } } @@ -1803,16 +1982,16 @@ void Encoder::initSPS(SPS *sps) sps->chromaFormatIdc = m_param->internalCsp; sps->picWidthInLumaSamples = m_param->sourceWidth; sps->picHeightInLumaSamples = m_param->sourceHeight; - sps->numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; - sps->numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + sps->numCuInWidth = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; + sps->numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; sps->numCUsInFrame = sps->numCuInWidth * sps->numCuInHeight; - sps->numPartitions = NUM_4x4_PARTITIONS; - sps->numPartInCUSize = 1 << g_unitSizeDepth; + sps->numPartitions = m_param->num4x4Partitions; + sps->numPartInCUSize = 1 << m_param->unitSizeDepth; - 
sps->log2MinCodingBlockSize = g_maxLog2CUSize - g_maxCUDepth; - sps->log2DiffMaxMinCodingBlockSize = g_maxCUDepth; + sps->log2MinCodingBlockSize = m_param->maxLog2CUSize - m_param->maxCUDepth; + sps->log2DiffMaxMinCodingBlockSize = m_param->maxCUDepth; uint32_t maxLog2TUSize = (uint32_t)g_log2Size[m_param->maxTUSize]; - sps->quadtreeTULog2MaxSize = X265_MIN(g_maxLog2CUSize, maxLog2TUSize); + sps->quadtreeTULog2MaxSize = X265_MIN((uint32_t)m_param->maxLog2CUSize, maxLog2TUSize); sps->quadtreeTULog2MinSize = 2; sps->quadtreeTUMaxDepthInter = m_param->tuQTMaxInterDepth; sps->quadtreeTUMaxDepthIntra = m_param->tuQTMaxIntraDepth; @@ -1820,7 +1999,7 @@ void Encoder::initSPS(SPS *sps) sps->bUseSAO = m_param->bEnableSAO; sps->bUseAMP = m_param->bEnableAMP; - sps->maxAMPDepth = m_param->bEnableAMP ? g_maxCUDepth : 0; + sps->maxAMPDepth = m_param->bEnableAMP ? m_param->maxCUDepth : 0; sps->maxTempSubLayers = m_param->bEnableTemporalSubLayers ? 2 : 1; sps->maxDecPicBuffering = m_vps.maxDecPicBuffering; @@ -2034,7 +2213,7 @@ void Encoder::configure(x265_param *p) p->lookaheadDepth = p->totalFrames; if (p->bIntraRefresh) { - int numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + int numCuInWidth = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; if (p->maxNumReferences > 1) { x265_log(p, X265_LOG_WARNING, "Max References > 1 + intra-refresh is not supported , setting max num references = 1\n"); @@ -2070,23 +2249,68 @@ void Encoder::configure(x265_param *p) p->rc.rfConstantMin = 0; } - if (p->analysisMode && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation)) + if (p->analysisReuseMode && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation)) { x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n"); p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0; } - if (p->analysisMode && p->rc.cuTree) + if (p->analysisReuseMode && p->rc.cuTree) { x265_log(p, 
X265_LOG_WARNING, "Analysis load/save options works only with cu-tree off, Disabling cu-tree\n"); p->rc.cuTree = 0; } - if (p->analysisMode && (p->analysisMultiPassRefine || p->analysisMultiPassDistortion)) + if (p->analysisReuseMode && (p->analysisMultiPassRefine || p->analysisMultiPassDistortion)) { x265_log(p, X265_LOG_WARNING, "Cannot use Analysis load/save option and multi-pass-opt-analysis/multi-pass-opt-distortion together," "Disabling Analysis load/save and multi-pass-opt-analysis/multi-pass-opt-distortion\n"); - p->analysisMode = p->analysisMultiPassRefine = p->analysisMultiPassDistortion = 0; + p->analysisReuseMode = p->analysisMultiPassRefine = p->analysisMultiPassDistortion = 0; + } + if (p->scaleFactor) + { + if (p->scaleFactor == 1) + { + p->scaleFactor = 0; + } + else if (!p->analysisReuseMode || p->analysisReuseLevel < 10) + { + x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis-reuse-mode, analysis-reuse-level 10. Disabling scale-factor.\n"); + p->scaleFactor = 0; + } + } + + if (p->intraRefine) + { + if (p->analysisReuseMode!= X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + { + x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling intra refine.\n"); + p->intraRefine = 0; + } + } + + if (p->interRefine) + { + if (p->analysisReuseMode != X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + { + x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling inter refine.\n"); + p->interRefine = 0; + } + } + + if (p->limitTU && p->interRefine) + { + x265_log(p, X265_LOG_WARNING, "Inter refinement does not support limitTU. 
Disabling limitTU.\n"); + p->limitTU = 0; + } + + if (p->mvRefine) + { + if (p->analysisReuseMode != X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + { + x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling MV refine.\n"); + p->mvRefine = 0; + } } if ((p->analysisMultiPassRefine || p->analysisMultiPassDistortion) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation)) @@ -2177,9 +2401,17 @@ void Encoder::configure(x265_param *p) m_conformanceWindow.topOffset = 0; m_conformanceWindow.bottomOffset = 0; m_conformanceWindow.leftOffset = 0; - /* set pad size if width is not multiple of the minimum CU size */ - if (p->sourceWidth & (p->minCUSize - 1)) + if (p->scaleFactor == 2 && ((p->sourceWidth / 2) & (p->minCUSize - 1)) && p->analysisReuseMode == X265_ANALYSIS_LOAD) + { + uint32_t rem = (p->sourceWidth / 2) & (p->minCUSize - 1); + uint32_t padsize = p->minCUSize - rem; + p->sourceWidth += padsize * 2; + + m_conformanceWindow.bEnabled = true; + m_conformanceWindow.rightOffset = padsize * 2; + } + else if(p->sourceWidth & (p->minCUSize - 1)) { uint32_t rem = p->sourceWidth & (p->minCUSize - 1); uint32_t padsize = p->minCUSize - rem; @@ -2228,7 +2460,7 @@ void Encoder::configure(x265_param *p) p->dynamicRd = 0; x265_log(p, X265_LOG_WARNING, "Dynamic-rd disabled, requires RD <= 4, VBV and aq-mode enabled\n"); } -#ifdef ENABLE_DYNAMIC_HDR10 +#ifdef ENABLE_HDR10_PLUS if (m_param->bDhdr10opt && m_param->toneMapFile == NULL) { x265_log(p, X265_LOG_WARNING, "Disabling dhdr10-opt. dhdr10-info must be enabled.\n"); @@ -2252,7 +2484,7 @@ void Encoder::configure(x265_param *p) #else if (m_param->toneMapFile) { - x265_log(p, X265_LOG_WARNING, "--dhdr10-info disabled. Enable dynamic HDR in cmake.\n"); + x265_log(p, X265_LOG_WARNING, "--dhdr10-info disabled. 
Enable HDR10_PLUS in cmake.\n"); m_bToneMap = 0; m_param->toneMapFile = NULL; } @@ -2358,9 +2590,16 @@ void Encoder::configure(x265_param *p) x265_log(p, X265_LOG_ERROR, "uhd-bd: Disabled\n"); } } - /* set pad size if height is not multiple of the minimum CU size */ - if (p->sourceHeight & (p->minCUSize - 1)) + if (p->scaleFactor == 2 && ((p->sourceHeight / 2) & (p->minCUSize - 1)) && p->analysisReuseMode == X265_ANALYSIS_LOAD) + { + uint32_t rem = (p->sourceHeight / 2) & (p->minCUSize - 1); + uint32_t padsize = p->minCUSize - rem; + p->sourceHeight += padsize * 2; + m_conformanceWindow.bEnabled = true; + m_conformanceWindow.bottomOffset = padsize * 2; + } + else if(p->sourceHeight & (p->minCUSize - 1)) { uint32_t rem = p->sourceHeight & (p->minCUSize - 1); uint32_t padsize = p->minCUSize - rem; @@ -2372,9 +2611,6 @@ void Encoder::configure(x265_param *p) if (p->bLogCuStats) x265_log(p, X265_LOG_WARNING, "--cu-stats option is now deprecated\n"); - if (p->csvfn) - x265_log(p, X265_LOG_WARNING, "libx265 no longer supports CSV file statistics\n"); - if (p->log2MaxPocLsb < 4) { x265_log(p, X265_LOG_WARNING, "maximum of the picture order count can not be less than 4\n"); @@ -2406,6 +2642,20 @@ void Encoder::configure(x265_param *p) p->bHDROpt = 0; } } + + if (m_param->toneMapFile || p->bHDROpt || p->bEmitHDRSEI) + { + if (!p->bRepeatHeaders) + { + p->bRepeatHeaders = 1; + x265_log(p, X265_LOG_WARNING, "Turning on repeat-headers for HDR compatibility\n"); + } + } + + p->maxLog2CUSize = g_log2Size[p->maxCUSize]; + p->maxCUDepth = p->maxLog2CUSize - g_log2Size[p->minCUSize]; + p->unitSizeDepth = p->maxLog2CUSize - LOG2_UNIT_SIZE; + p->num4x4Partitions = (1U << (p->unitSizeDepth << 1)); } void Encoder::allocAnalysis(x265_analysis_data* analysis) @@ -2414,7 +2664,7 @@ void Encoder::allocAnalysis(x265_analysis_data* analysis) analysis->interData = analysis->intraData = NULL; if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { - if 
(m_param->analysisRefineLevel < 2) + if (m_param->analysisReuseLevel < 2) return; analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData; @@ -2430,27 +2680,27 @@ void Encoder::allocAnalysis(x265_analysis_data* analysis) int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3; CHECKED_MALLOC_ZERO(analysis->wt, WeightParam, numPlanes * numDir); - if (m_param->analysisRefineLevel < 2) + if (m_param->analysisReuseLevel < 2) return; analysis_inter_data *interData = (analysis_inter_data*)analysis->interData; CHECKED_MALLOC_ZERO(interData, analysis_inter_data, 1); CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) { CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); for (int dir = 0; dir < numDir; dir++) { CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame); - CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame); } /* Allocate intra in inter */ @@ -2480,51 +2730,56 @@ void Encoder::freeAnalysis(x265_analysis_data* analysis) /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */ if (analysis->sliceType > X265_TYPE_I && analysis->wt) 
X265_FREE(analysis->wt); - if (m_param->analysisRefineLevel < 2) + if (m_param->analysisReuseLevel < 2) return; - if (analysis->intraData) + if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { - if (m_param->analysisRefineLevel < 2) - return; - - X265_FREE(((analysis_intra_data*)analysis->intraData)->depth); - X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); - X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes); - X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes); - X265_FREE(analysis->intraData); + if (analysis->intraData) + { + X265_FREE(((analysis_intra_data*)analysis->intraData)->depth); + X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); + X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes); + X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes); + X265_FREE(analysis->intraData); + analysis->intraData = NULL; + } } - else if (analysis->interData) + else { - X265_FREE(((analysis_inter_data*)analysis->interData)->depth); - X265_FREE(((analysis_inter_data*)analysis->interData)->modes); - if (m_param->analysisRefineLevel > 4) + if (analysis->intraData) { - X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag); - X265_FREE(((analysis_inter_data*)analysis->interData)->partSize); + X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); + X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes); + X265_FREE(analysis->intraData); + analysis->intraData = NULL; } - - if (m_param->analysisRefineLevel == 10) + if (analysis->interData) { - X265_FREE(((analysis_inter_data*)analysis->interData)->interDir); - int numDir = analysis->sliceType == X265_TYPE_P ? 
1 : 2; - for (int dir = 0; dir < numDir; dir++) + X265_FREE(((analysis_inter_data*)analysis->interData)->depth); + X265_FREE(((analysis_inter_data*)analysis->interData)->modes); + if (m_param->analysisReuseLevel > 4) { - X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]); - X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]); - X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]); + X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag); + X265_FREE(((analysis_inter_data*)analysis->interData)->partSize); } - if (analysis->sliceType == P_SLICE || m_param->bIntraInBFrames) + if (m_param->analysisReuseLevel == 10) { - X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); - X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes); - X265_FREE(analysis->intraData); + X265_FREE(((analysis_inter_data*)analysis->interData)->interDir); + int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; + for (int dir = 0; dir < numDir; dir++) + { + X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]); + X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]); + X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]); + } } - } - else - X265_FREE(((analysis_inter_data*)analysis->interData)->ref); + else + X265_FREE(((analysis_inter_data*)analysis->interData)->ref); - X265_FREE(analysis->interData); + X265_FREE(analysis->interData); + analysis->interData = NULL; + } } } @@ -2532,13 +2787,13 @@ void Encoder::allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType) { analysis->analysisFramedata = NULL; analysis2PassFrameData *analysisFrameData = (analysis2PassFrameData*)analysis->analysisFramedata; - uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; - uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + 
uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; CHECKED_MALLOC_ZERO(analysisFrameData, analysis2PassFrameData, 1); - CHECKED_MALLOC_ZERO(analysisFrameData->depth, uint8_t, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->distortion, sse_t, NUM_4x4_PARTITIONS * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->depth, uint8_t, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->distortion, sse_t, m_param->num4x4Partitions * numCUsInFrame); if (m_param->rc.bStatRead) { CHECKED_MALLOC_ZERO(analysisFrameData->ctuDistortion, sse_t, numCUsInFrame); @@ -2548,13 +2803,13 @@ void Encoder::allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType) } if (!IS_X265_TYPE_I(sliceType)) { - CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[0], MV, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[1], MV, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[0], int, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[1], int, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->ref[0], int32_t, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC_ZERO(analysisFrameData->ref[1], int32_t, NUM_4x4_PARTITIONS * numCUsInFrame); - CHECKED_MALLOC(analysisFrameData->modes, uint8_t, NUM_4x4_PARTITIONS * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[0], MV, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[1], MV, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[0], int, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[1], int, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->ref[0], int32_t, m_param->num4x4Partitions * 
numCUsInFrame); + CHECKED_MALLOC_ZERO(analysisFrameData->ref[1], int32_t, m_param->num4x4Partitions * numCUsInFrame); + CHECKED_MALLOC(analysisFrameData->modes, uint8_t, m_param->num4x4Partitions * numCUsInFrame); } analysis->analysisFramedata = analysisFrameData; @@ -2593,11 +2848,15 @@ void Encoder::freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType) } } -void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) +void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn) { -#define X265_FREAD(val, size, readSize, fileOffset)\ - if (fread(val, size, readSize, fileOffset) != readSize)\ +#define X265_FREAD(val, size, readSize, fileOffset, src)\ + if (!m_param->bUseAnalysisFile)\ + {\ + memcpy(val, src, (size * readSize));\ + }\ + else if (fread(val, size, readSize, fileOffset) != readSize)\ {\ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\ freeAnalysis(analysis);\ @@ -2610,67 +2869,98 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) uint32_t depthBytes = 0; fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET); - int poc; uint32_t frameRecordSize; - X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); - X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile); - X265_FREAD(&poc, sizeof(int), 1, m_analysisFile); + const x265_analysis_data *picData = &(picIn->analysisData); + analysis_intra_data *intraPic = (analysis_intra_data *)picData->intraData; + analysis_inter_data *interPic = (analysis_inter_data *)picData->interData; - uint64_t currentOffset = totalConsumedBytes; + int poc; uint32_t frameRecordSize; + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile, &(picData->frameRecordSize)); + X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile, &(picData->depthBytes)); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFile, &(picData->poc)); - /* Seeking to the right frame Record */ - while (poc != curPoc && 
!feof(m_analysisFile)) + if (m_param->bUseAnalysisFile) { - currentOffset += frameRecordSize; - fseeko(m_analysisFile, currentOffset, SEEK_SET); - X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); - X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile); - X265_FREAD(&poc, sizeof(int), 1, m_analysisFile); - } + uint64_t currentOffset = totalConsumedBytes; - if (poc != curPoc || feof(m_analysisFile)) - { - x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc); - freeAnalysis(analysis); - return; + /* Seeking to the right frame Record */ + while (poc != curPoc && !feof(m_analysisFile)) + { + currentOffset += frameRecordSize; + fseeko(m_analysisFile, currentOffset, SEEK_SET); + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile, &(picData->frameRecordSize)); + X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile, &(picData->depthBytes)); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFile, &(picData->poc)); + } + if (poc != curPoc || feof(m_analysisFile)) + { + x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc); + freeAnalysis(analysis); + return; + } } /* Now arrived at the right frame, read the record */ analysis->poc = poc; analysis->frameRecordSize = frameRecordSize; - X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile); - X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFile); - X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile); - X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile); - X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile); + X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile, &(picData->sliceType)); + X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFile, &(picData->bScenecut)); + X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile, &(picData->satdCost)); + X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, 
m_analysisFile, &(picData->numCUsInFrame)); + X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile, &(picData->numPartitions)); + int scaledNumPartition = analysis->numPartitions; + int factor = 1 << m_param->scaleFactor; + + if (m_param->scaleFactor) + analysis->numPartitions *= factor; /* Memory is allocated for inter and intra analysis data based on the slicetype */ allocAnalysis(analysis); if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { - analysis->sliceType = X265_TYPE_I; - if (m_param->analysisRefineLevel < 2) + if (m_param->analysisReuseLevel < 2) return; uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL; tempBuf = X265_MALLOC(uint8_t, depthBytes * 3); - X265_FREAD(tempBuf, sizeof(uint8_t), depthBytes * 3, m_analysisFile); - depthBuf = tempBuf; modeBuf = tempBuf + depthBytes; partSizes = tempBuf + 2 * depthBytes; + X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->depth); + X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->chromaModes); + X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->partSizes); + size_t count = 0; for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); + if (m_param->scaleFactor) + { + if (depthBuf[d] == 0) + depthBuf[d] = 1; + if (partSizes[d] == SIZE_NxN) + partSizes[d] = SIZE_2Nx2N; + } memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes); memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes); memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes); count += bytes; } - X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + + if (!m_param->scaleFactor) + { + X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, 
sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile, intraPic->modes); + } + else + { + uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); + X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFile, intraPic->modes); + for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + X265_FREE(tempLumaBuf); + } X265_FREE(tempBuf); consumedBytes += frameRecordSize; } @@ -2679,8 +2969,8 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) { uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3; - X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile); - if (m_param->analysisRefineLevel < 2) + X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile, (picIn->analysisData.wt)); + if (m_param->analysisReuseLevel < 2) return; uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL; @@ -2688,9 +2978,9 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) MV* mv[2]; int8_t* refIdx[2]; - int numBuf = m_param->analysisRefineLevel > 4 ? 4 : 2; + int numBuf = m_param->analysisReuseLevel > 4 ? 
4 : 2; bool bIntraInInter = false; - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { numBuf++; bIntraInInter = (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames); @@ -2698,26 +2988,36 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) } tempBuf = X265_MALLOC(uint8_t, depthBytes * numBuf); - X265_FREAD(tempBuf, sizeof(uint8_t), depthBytes * numBuf, m_analysisFile); - depthBuf = tempBuf; modeBuf = tempBuf + depthBytes; - if (m_param->analysisRefineLevel > 4) + + X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->depth); + X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->modes); + + if (m_param->analysisReuseLevel > 4) { partSize = modeBuf + depthBytes; mergeFlag = partSize + depthBytes; - if (m_param->analysisRefineLevel == 10) + X265_FREAD(partSize, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->partSize); + X265_FREAD(mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->mergeFlag); + + if (m_param->analysisReuseLevel == 10) { interDir = mergeFlag + depthBytes; - if (bIntraInInter) chromaDir = interDir + depthBytes; + X265_FREAD(interDir, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->interDir); + if (bIntraInInter) + { + chromaDir = interDir + depthBytes; + X265_FREAD(chromaDir, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->chromaModes); + } for (uint32_t i = 0; i < numDir; i++) { - mvpIdx[i] = X265_MALLOC(uint8_t, depthBytes * 3); - X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFile); + mvpIdx[i] = X265_MALLOC(uint8_t, depthBytes); refIdx[i] = X265_MALLOC(int8_t, depthBytes); - X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, m_analysisFile); mv[i] = X265_MALLOC(MV, depthBytes); - X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFile); + X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFile, interPic->mvpIdx[i]); + X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, 
m_analysisFile, interPic->refIdx[i]); + X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFile, interPic->mv[i]); } } } @@ -2726,28 +3026,37 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) for (uint32_t d = 0; d < depthBytes; d++) { int bytes = analysis->numPartitions >> (depthBuf[d] * 2); + if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && depthBuf[d] == 0) + depthBuf[d] = 1; memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes); memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes); - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) { + if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN) + partSize[d] = SIZE_2Nx2N; memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes); - int numPU = nbPartsTable[(int)partSize[d]]; + int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]]; for (int pu = 0; pu < numPU; pu++) { if (pu) d++; ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d]; - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d]; for (uint32_t i = 0; i < numDir; i++) { ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d]; ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d]; + if (m_param->scaleFactor) + { + mv[i][d].x *= (int16_t)m_param->scaleFactor; + mv[i][d].y *= (int16_t)m_param->scaleFactor; + } memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV)); } } } - if (m_param->analysisRefineLevel == 10 && bIntraInInter) + if (m_param->analysisReuseLevel == 10 && bIntraInInter) memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes); } count += bytes; @@ -2755,7 +3064,7 @@ 
void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) X265_FREE(tempBuf); - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { for (uint32_t i = 0; i < numDir; i++) { @@ -2764,10 +3073,23 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc) X265_FREE(mv[i]); } if (bIntraInInter) - X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + { + if (!m_param->scaleFactor) + { + X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile, intraPic->modes); + } + else + { + uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); + X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFile, intraPic->modes); + for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) + memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); + X265_FREE(tempLumaBuf); + } + } } else - X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile); + X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile, interPic->ref); consumedBytes += frameRecordSize; if (numDir == 1) @@ -2789,8 +3111,8 @@ void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curP }\ uint32_t depthBytes = 0; - uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; - uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + 
uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; int poc; uint32_t frameRecordSize; @@ -2820,12 +3142,12 @@ void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curP double sum = 0, sqrSum = 0; for (uint32_t d = 0; d < depthBytes; d++) { - int bytes = NUM_4x4_PARTITIONS >> (depthBuf[d] * 2); + int bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2); memset(&analysisFrameData->depth[count], depthBuf[d], bytes); analysisFrameData->distortion[count] = distortionBuf[d]; analysisFrameData->ctuDistortion[ctuCount] += analysisFrameData->distortion[count]; count += bytes; - if ((count % (size_t)NUM_4x4_PARTITIONS) == 0) + if ((count % (unsigned)m_param->num4x4Partitions) == 0) { analysisFrameData->scaledDistortion[ctuCount] = X265_LOG2(X265_MAX(analysisFrameData->ctuDistortion[ctuCount], 1)); sum += analysisFrameData->scaledDistortion[ctuCount]; @@ -2873,7 +3195,7 @@ void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curP count = 0; for (uint32_t d = 0; d < depthBytes; d++) { - size_t bytes = NUM_4x4_PARTITIONS >> (depthBuf[d] * 2); + size_t bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2); for (int i = 0; i < numDir; i++) { for (size_t j = count, k = 0; k < bytes; j++, k++) @@ -2927,7 +3249,7 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD analysis->frameRecordSize += sizeof(WeightParam) * numPlanes * numDir; } - if (m_param->analysisRefineLevel > 1) + if (m_param->analysisReuseLevel > 1) { if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { @@ -2975,25 +3297,25 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD interDataCTU->depth[depthBytes] = depth; predMode = ctu->m_predMode[absPartIdx]; - if (m_param->analysisRefineLevel != 10 && ctu->m_refIdx[1][absPartIdx] != -1) + if (m_param->analysisReuseLevel != 10 && 
ctu->m_refIdx[1][absPartIdx] != -1) predMode = 4; // used as indiacator if the block is coded as bidir interDataCTU->modes[depthBytes] = predMode; - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) { partSize = ctu->m_partSize[absPartIdx]; interDataCTU->partSize[depthBytes] = partSize; /* Store per PU data */ - uint32_t numPU = nbPartsTable[(int)partSize]; + uint32_t numPU = (predMode == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize]; for (uint32_t puIdx = 0; puIdx < numPU; puIdx++) { uint32_t puabsPartIdx = ctu->getPUOffset(puIdx, absPartIdx) + absPartIdx; if (puIdx) depthBytes++; interDataCTU->mergeFlag[depthBytes] = ctu->m_mergeFlag[puabsPartIdx]; - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { interDataCTU->interDir[depthBytes] = ctu->m_interDir[puabsPartIdx]; for (uint32_t dir = 0; dir < numDir; dir++) @@ -3004,12 +3326,12 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD } } } - if (m_param->analysisRefineLevel == 10 && bIntraInInter) + if (m_param->analysisReuseLevel == 10 && bIntraInInter) intraDataCTU->chromaModes[depthBytes] = ctu->m_chromaIntraDir[absPartIdx]; } absPartIdx += ctu->m_numPartitions >> (depth * 2); } - if (m_param->analysisRefineLevel == 10 && bIntraInInter) + if (m_param->analysisReuseLevel == 10 && bIntraInInter) memcpy(&intraDataCTU->modes[ctu->m_cuAddr * ctu->m_numPartitions], ctu->m_lumaIntraDir, sizeof(uint8_t)* ctu->m_numPartitions); } } @@ -3020,10 +3342,10 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD { /* Add sizeof depth, modes, partSize, mergeFlag */ analysis->frameRecordSize += depthBytes * 2; - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) analysis->frameRecordSize += (depthBytes * 2); - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { /* Add Size of interDir, mvpIdx, refIdx, mv, luma and chroma modes */ analysis->frameRecordSize 
+= depthBytes; @@ -3036,7 +3358,12 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD else analysis->frameRecordSize += sizeof(int32_t)* analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir; } + analysis->depthBytes = depthBytes; } + + if (!m_param->bUseAnalysisFile) + return; + X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFile); X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile); @@ -3048,7 +3375,7 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD if (analysis->sliceType > X265_TYPE_I) X265_FWRITE((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile); - if (m_param->analysisRefineLevel < 2) + if (m_param->analysisReuseLevel < 2) return; if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) @@ -3062,11 +3389,11 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD { X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFile); X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFile); - if (m_param->analysisRefineLevel > 4) + if (m_param->analysisReuseLevel > 4) { X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFile); X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFile); - if (m_param->analysisRefineLevel == 10) + if (m_param->analysisReuseLevel == 10) { X265_FWRITE(((analysis_inter_data*)analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFile); if (bIntraInInter) X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFile); @@ -3080,7 +3407,7 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData 
&curEncD X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); } } - if (m_param->analysisRefineLevel != 10) + if (m_param->analysisReuseLevel != 10) X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile); } @@ -3099,8 +3426,8 @@ void Encoder::writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameDa }\ uint32_t depthBytes = 0; - uint32_t widthInCU = (m_param->sourceWidth + g_maxCUSize - 1) >> g_maxLog2CUSize; - uint32_t heightInCU = (m_param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize; + uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; + uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize; uint32_t numCUsInFrame = widthInCU * heightInCU; analysis2PassFrameData* analysisFrameData = (analysis2PassFrameData*)analysis2Pass->analysisFramedata; diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h index 659d977f5d..d456a89fd2 100644 --- a/source/encoder/encoder.h +++ b/source/encoder/encoder.h @@ -31,11 +31,9 @@ #include "x265.h" #include "nal.h" #include "framedata.h" - -#ifdef ENABLE_DYNAMIC_HDR10 - #include "dynamicHDR10\hdr10plus.h" +#ifdef ENABLE_HDR10_PLUS + #include "dynamicHDR10/hdr10plus.h" #endif - struct x265_encoder {}; namespace X265_NS { // private namespace @@ -178,8 +176,10 @@ class Encoder : public x265_encoder int m_bToneMap; // Enables tone-mapping -#ifdef ENABLE_DYNAMIC_HDR10 +#ifdef ENABLE_HDR10_PLUS const hdr10plus_api *m_hdr10plus_api; + uint8_t **cim; + int numCimInfo; #endif x265_sei_payload m_prevTonemapPayload; @@ -187,7 +187,7 @@ class Encoder : public x265_encoder Encoder(); ~Encoder() { -#ifdef ENABLE_DYNAMIC_HDR10 +#ifdef ENABLE_HDR10_PLUS if (m_prevTonemapPayload.payload != NULL) X265_FREE(m_prevTonemapPayload.payload); #endif 
@@ -201,6 +201,8 @@ class Encoder : public x265_encoder int reconfigureParam(x265_param* encParam, x265_param* param); + void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc); + void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs); void fetchStats(x265_stats* stats, size_t statsSizeBytes); @@ -223,7 +225,7 @@ class Encoder : public x265_encoder void freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType); - void readAnalysisFile(x265_analysis_data* analysis, int poc); + void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn); void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData); void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType); diff --git a/source/encoder/entropy.cpp b/source/encoder/entropy.cpp index 190365b3ac..ba591d007a 100644 --- a/source/encoder/entropy.cpp +++ b/source/encoder/entropy.cpp @@ -700,7 +700,7 @@ void Entropy::codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t s // TODO: Enable when pps_loop_filter_across_slices_enabled_flag==1 // We didn't support filter across slice board, so disable it now - if (g_maxSlices <= 1) + if (encData.m_param->maxSlices <= 1) { bool isSAOEnabled = slice.m_sps->bUseSAO ? 
saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false; bool isDBFEnabled = !slice.m_pps->bPicDisableDeblockingFilter; @@ -783,7 +783,7 @@ void Entropy::encodeCU(const CUData& ctu, const CUGeom& cuGeom, uint32_t absPart if (cuSplitFlag) codeSplitFlag(ctu, absPartIdx, depth); - if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth) + if (depth < ctu.m_cuDepth[absPartIdx] && depth < ctu.m_encData->m_param->maxCUDepth) { uint32_t qNumParts = cuGeom.numPartitions >> 2; if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) @@ -863,7 +863,7 @@ uint32_t Entropy::bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t case SIZE_nRx2N: bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); bits += bitsCodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]); - if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) + if (depth == cu.m_encData->m_param->maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) bits += bitsCodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]); if (cu.m_slice->m_sps->maxAMPDepth > depth) { @@ -888,7 +888,7 @@ void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth, b uint32_t cuAddr = ctu.getSCUAddr() + absPartIdx; X265_CHECK(realEndAddress == slice->realEndAddress(slice->m_endCUAddr), "real end address expected\n"); - uint32_t granularityMask = g_maxCUSize - 1; + uint32_t granularityMask = ctu.m_encData->m_param->maxCUSize - 1; uint32_t cuSize = 1 << ctu.m_log2CUSize[absPartIdx]; uint32_t rpelx = ctu.m_cuPelX + g_zscanToPelX[absPartIdx] + cuSize; uint32_t bpely = ctu.m_cuPelY + g_zscanToPelY[absPartIdx] + cuSize; @@ -902,7 +902,7 @@ void Entropy::finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth, b { // Encode slice finish uint32_t bTerminateSlice = ctu.m_bLastCuInSlice; - if (cuAddr + (NUM_4x4_PARTITIONS >> (depth << 1)) == realEndAddress) + if (cuAddr + (slice->m_param->num4x4Partitions >> (depth << 1)) == realEndAddress) bTerminateSlice = 1; // The 1-terminating bit is added to 
all streams, so don't add it here when it's 1. @@ -1512,7 +1512,7 @@ void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth if (cu.isIntra(absPartIdx)) { - if (depth == g_maxCUDepth) + if (depth == cu.m_encData->m_param->maxCUDepth) encodeBin(partSize == SIZE_2Nx2N ? 1 : 0, m_contextState[OFF_PART_SIZE_CTX]); return; } @@ -1541,7 +1541,7 @@ void Entropy::codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth case SIZE_nRx2N: encodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 0]); encodeBin(0, m_contextState[OFF_PART_SIZE_CTX + 1]); - if (depth == g_maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) + if (depth == cu.m_encData->m_param->maxCUDepth && !(cu.m_log2CUSize[absPartIdx] == 3)) encodeBin(1, m_contextState[OFF_PART_SIZE_CTX + 2]); if (cu.m_slice->m_sps->maxAMPDepth > depth) { diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp index 3d04f9abbc..f354fbef09 100644 --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -124,7 +124,7 @@ bool FrameEncoder::init(Encoder *top, int numRows, int numCols) range += !!(m_param->searchMethod < 2); /* diamond/hex range check lag */ range += NTAPS_LUMA / 2; /* subpel filter half-length */ range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */ - m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + g_maxCUSize - 1) / g_maxCUSize); + m_refLagRows = /*(m_param->maxSlices > 1 ? 
1 : 0) +*/ 1 + ((range + m_param->maxCUSize - 1) / m_param->maxCUSize); // NOTE: 2 times of numRows because both Encoder and Filter in same queue if (!WaveFront::init(m_numRows * 2)) @@ -295,6 +295,11 @@ void FrameEncoder::threadMain() while (m_threadActive) { + if (m_param->bCTUInfo) + { + while (!m_frame->m_ctuInfo) + m_frame->m_copied.wait(); + } compressFrame(); m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */ m_enable.wait(); @@ -383,7 +388,7 @@ void FrameEncoder::compressFrame() bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; WeightParam* reuseWP = NULL; - if (m_param->analysisMode && (bUseWeightP || bUseWeightB)) + if (m_param->analysisReuseMode && (bUseWeightP || bUseWeightB)) reuseWP = (WeightParam*)m_frame->m_analysisData.wt; if (bUseWeightP || bUseWeightB) @@ -392,7 +397,7 @@ void FrameEncoder::compressFrame() m_cuStats.countWeightAnalyze++; ScopedElapsedTime time(m_cuStats.weightAnalyzeTime); #endif - if (m_param->analysisMode == X265_ANALYSIS_LOAD) + if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD) { for (int list = 0; list < slice->isInterB() + 1; list++) { @@ -431,7 +436,7 @@ void FrameEncoder::compressFrame() slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic; m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param); } - if (m_param->analysisMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB)) + if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB)) { for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 
3 : 1); i++) *(reuseWP++) = slice->m_weightPredTable[l][0][i]; @@ -664,7 +669,7 @@ void FrameEncoder::compressFrame() if (writeSei) { SEICreativeIntentMeta sei; - sei.cim = payload->payload; + sei.m_payload = payload->payload; m_bs.resetBits(); sei.setSize(payload->payloadSize); sei.write(m_bs, *slice->m_sps); @@ -832,7 +837,7 @@ void FrameEncoder::compressFrame() } else if (m_param->decodedPictureHashSEI == 3) { - uint32_t cuHeight = g_maxCUSize; + uint32_t cuHeight = m_param->maxCUSize; m_checksum[0] = 0; @@ -872,43 +877,52 @@ void FrameEncoder::compressFrame() m_frame->m_encData->m_frameStats.percent8x8Inter = (double)totalP / totalCuCount; m_frame->m_encData->m_frameStats.percent8x8Skip = (double)totalSkip / totalCuCount; } - for (uint32_t i = 0; i < m_numRows; i++) + + if (m_param->csvLogLevel >= 1) { - m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN; - m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu; - m_frame->m_encData->m_frameStats.totalCtu += m_rows[i].rowStats.totalCtu; - m_frame->m_encData->m_frameStats.lumaDistortion += m_rows[i].rowStats.lumaDistortion; - m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion; - m_frame->m_encData->m_frameStats.psyEnergy += m_rows[i].rowStats.psyEnergy; - m_frame->m_encData->m_frameStats.ssimEnergy += m_rows[i].rowStats.ssimEnergy; - m_frame->m_encData->m_frameStats.resEnergy += m_rows[i].rowStats.resEnergy; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t i = 0; i < m_numRows; i++) { - m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth]; - m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth]; - for (int m = 0; m < INTER_MODES; m++) - m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m]; + m_frame->m_encData->m_frameStats.cntIntraNxN += m_rows[i].rowStats.cntIntraNxN; + 
m_frame->m_encData->m_frameStats.totalCu += m_rows[i].rowStats.totalCu; + m_frame->m_encData->m_frameStats.totalCtu += m_rows[i].rowStats.totalCtu; + m_frame->m_encData->m_frameStats.lumaDistortion += m_rows[i].rowStats.lumaDistortion; + m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion; + m_frame->m_encData->m_frameStats.psyEnergy += m_rows[i].rowStats.psyEnergy; + m_frame->m_encData->m_frameStats.ssimEnergy += m_rows[i].rowStats.ssimEnergy; + m_frame->m_encData->m_frameStats.resEnergy += m_rows[i].rowStats.resEnergy; + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) + { + m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth]; + m_frame->m_encData->m_frameStats.cntMergeCu[depth] += m_rows[i].rowStats.cntMergeCu[depth]; + for (int m = 0; m < INTER_MODES; m++) + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][m] += m_rows[i].rowStats.cuInterDistribution[depth][m]; + for (int n = 0; n < INTRA_MODES; n++) + m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n]; + } + } + m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu; + + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) + { + m_frame->m_encData->m_frameStats.percentSkipCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu; + m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu; for (int n = 0; n < INTRA_MODES; n++) - m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] += m_rows[i].rowStats.cuIntraDistribution[depth][n]; + m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = 
(double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu; + uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts + cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2]; + m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu; + m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu; + m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu; } } - m_frame->m_encData->m_frameStats.avgLumaDistortion = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu; - m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu; - m_frame->m_encData->m_frameStats.avgPsyEnergy = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu; - m_frame->m_encData->m_frameStats.avgSsimEnergy = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu; - m_frame->m_encData->m_frameStats.avgResEnergy = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu; - m_frame->m_encData->m_frameStats.percentIntraNxN = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + + if (m_param->csvLogLevel >= 2) { - m_frame->m_encData->m_frameStats.percentSkipCu[depth] = 
(double)(m_frame->m_encData->m_frameStats.cntSkipCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu; - m_frame->m_encData->m_frameStats.percentMergeCu[depth] = (double)(m_frame->m_encData->m_frameStats.cntMergeCu[depth] * 100) / m_frame->m_encData->m_frameStats.totalCu; - for (int n = 0; n < INTRA_MODES; n++) - m_frame->m_encData->m_frameStats.percentIntraDistribution[depth][n] = (double)(m_frame->m_encData->m_frameStats.cuIntraDistribution[depth][n] * 100) / m_frame->m_encData->m_frameStats.totalCu; - uint64_t cuInterRectCnt = 0; // sum of Nx2N, 2NxN counts - cuInterRectCnt += m_frame->m_encData->m_frameStats.cuInterDistribution[depth][1] + m_frame->m_encData->m_frameStats.cuInterDistribution[depth][2]; - m_frame->m_encData->m_frameStats.percentInterDistribution[depth][0] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][0] * 100) / m_frame->m_encData->m_frameStats.totalCu; - m_frame->m_encData->m_frameStats.percentInterDistribution[depth][1] = (double)(cuInterRectCnt * 100) / m_frame->m_encData->m_frameStats.totalCu; - m_frame->m_encData->m_frameStats.percentInterDistribution[depth][2] = (double)(m_frame->m_encData->m_frameStats.cuInterDistribution[depth][3] * 100) / m_frame->m_encData->m_frameStats.totalCu; + m_frame->m_encData->m_frameStats.avgLumaDistortion = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu; + m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu; + m_frame->m_encData->m_frameStats.avgPsyEnergy = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu; + m_frame->m_encData->m_frameStats.avgSsimEnergy = (double)(m_frame->m_encData->m_frameStats.ssimEnergy) / m_frame->m_encData->m_frameStats.totalCtu; + m_frame->m_encData->m_frameStats.avgResEnergy = (double)(m_frame->m_encData->m_frameStats.resEnergy) / 
m_frame->m_encData->m_frameStats.totalCtu; } m_bs.resetBits(); @@ -1096,7 +1110,7 @@ void FrameEncoder::compressFrame() /* Accumulate CU statistics from each worker thread, we could report * per-frame stats here, but currently we do not. */ for (int i = 0; i < numTLD; i++) - m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId]); + m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param); #endif m_endFrameTime = x265_mdate(); @@ -1106,7 +1120,7 @@ void FrameEncoder::encodeSlice(uint32_t sliceAddr) { Slice* slice = m_frame->m_encData->m_slice; const uint32_t widthInLCUs = slice->m_sps->numCuInWidth; - const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_4x4_PARTITIONS - 1) / NUM_4x4_PARTITIONS; + const uint32_t lastCUAddr = (slice->m_endCUAddr + m_param->num4x4Partitions - 1) / m_param->num4x4Partitions; const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL; @@ -1208,7 +1222,6 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) const uint32_t row = (uint32_t)intRow; CTURow& curRow = m_rows[row]; - tld.analysis.m_param = m_param; if (m_param->bEnableWavefront) { ScopedLock self(curRow.lock); @@ -1241,7 +1254,7 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16; uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16; - uint32_t noOfBlocks = g_maxCUSize / 16; + uint32_t noOfBlocks = m_param->maxCUSize / 16; const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0; const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 
1 : 0; const uint32_t sliceId = curRow.sliceId; @@ -1320,8 +1333,8 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) // TODO: specially case handle on first and last row // Initialize restrict on MV range in slices - tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * g_maxCUSize * 4) + 3 * 4; - tld.analysis.m_sliceMaxY = (int16_t)((endRowInSlicePlus1 - 1 - row) * (g_maxCUSize * 4) - 4 * 4); + tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4; + tld.analysis.m_sliceMaxY = (int16_t)((endRowInSlicePlus1 - 1 - row) * (m_param->maxCUSize * 4) - 4 * 4); // Handle single row slice if (tld.analysis.m_sliceMaxY < tld.analysis.m_sliceMinY) @@ -1361,8 +1374,8 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) cuStat.baseQp = curEncData.m_rowStat[row].rowQp; /* TODO: use defines from slicetype.h for lowres block size */ - uint32_t block_y = (ctu->m_cuPelY >> g_maxLog2CUSize) * noOfBlocks; - uint32_t block_x = (ctu->m_cuPelX >> g_maxLog2CUSize) * noOfBlocks; + uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks; + uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks; cuStat.vbvCost = 0; cuStat.intraVbvCost = 0; @@ -1473,11 +1486,11 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) curRow.rowStats.coeffBits += best.coeffBits; curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits); - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) { /* 1 << shift == number of 8x8 blocks at current depth */ - int shift = 2 * (g_maxCUDepth - depth); - int cuSize = g_maxCUSize >> depth; + int shift = 2 * (m_param->maxCUDepth - depth); + int cuSize = m_param->maxCUSize >> depth; if (cuSize == 8) curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN); @@ -1496,7 +1509,7 @@ void FrameEncoder::processRowEncoder(int intRow, 
ThreadLocalData& tld) curRow.rowStats.resEnergy += best.resEnergy; curRow.rowStats.cntIntraNxN += frameLog.cntIntraNxN; curRow.rowStats.totalCu += frameLog.totalCu; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++) { curRow.rowStats.cntSkipCu[depth] += frameLog.cntSkipCu[depth]; curRow.rowStats.cntMergeCu[depth] += frameLog.cntMergeCu[depth]; @@ -1510,14 +1523,17 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) x265_emms(); if (bIsVbv) - { - // Update encoded bits, satdCost, baseQP for each CU - curEncData.m_rowStat[row].rowSatd += curEncData.m_cuStat[cuAddr].vbvCost; - curEncData.m_rowStat[row].rowIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost; - curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits; - curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp; - curEncData.m_rowStat[row].numEncodedCUs = cuAddr; - + { + // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled + if ((m_param->bEnableWavefront && (!cuAddr || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront) + { + curEncData.m_rowStat[row].rowSatd += curEncData.m_cuStat[cuAddr].vbvCost; + curEncData.m_rowStat[row].rowIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost; + curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits; + curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp; + curEncData.m_rowStat[row].numEncodedCUs = cuAddr; + } + // If current block is at row end checkpoint, call vbv ratecontrol. 
if (!m_param->bEnableWavefront && col == numCols - 1) @@ -1553,6 +1569,24 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) else if (m_param->bEnableWavefront && row == col && row) { + if (m_param->rc.bEnableConstVbv) + { + int32_t startCuAddr = numCols * row; + int32_t EndCuAddr = startCuAddr + col; + for (int32_t r = row; r >= 0; r--) + { + for (int32_t c = startCuAddr; c <= EndCuAddr && c <= (int32_t)numCols * (r + 1) - 1; c++) + { + curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost; + curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost; + curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits; + curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp; + curEncData.m_rowStat[r].numEncodedCUs = c; + } + startCuAddr = EndCuAddr - numCols; + EndCuAddr = startCuAddr + 1; + } + } double qpBase = curEncData.m_cuStat[cuAddr].baseQp; int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase); qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase); @@ -1648,6 +1682,23 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) } /** this row of CTUs has been compressed **/ + if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv) + { + if (row == m_numRows - 1) + { + for (int32_t r = 0; r < (int32_t)m_numRows; r++) + { + for (int32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < (int32_t)numCols * (r + 1); c++) + { + curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost; + curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost; + curEncData.m_rowStat[r].encodedBits += curEncData.m_cuStat[c].totalBits; + curEncData.m_rowStat[r].sumQpRc += curEncData.m_cuStat[c].baseQp; + curEncData.m_rowStat[r].numEncodedCUs = c; + } + } + } + } /* If encoding with ABR, update update bits and complexity in rate control * after a number of rows so the next frame's rateControlStart has more @@ 
-1729,7 +1780,6 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) } } - tld.analysis.m_param = NULL; curRow.busy = false; // CHECK_ME: Does it always FALSE condition? @@ -1741,73 +1791,36 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log) { int totQP = 0; - if (ctu.m_slice->m_sliceType == I_SLICE) + uint32_t depth = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) { - uint32_t depth = 0; - for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) - { - depth = ctu.m_cuDepth[absPartIdx]; - - log->totalCu++; - log->cntIntra[depth]++; - totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2)); - - if (ctu.m_predMode[absPartIdx] == MODE_NONE) - { - log->totalCu--; - log->cntIntra[depth]--; - } - else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) - { - /* TODO: log intra modes at absPartIdx +0 to +3 */ - X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n"); - log->cntIntraNxN++; - log->cntIntra[depth]--; - } - else if (ctu.m_lumaIntraDir[absPartIdx] > 1) - log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; - else - log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; - } + depth = ctu.m_cuDepth[absPartIdx]; + totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2)); } - else + + if (m_param->csvLogLevel >= 1 || m_param->rc.bStatWrite) { - uint32_t depth = 0; - for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) + if (ctu.m_slice->m_sliceType == I_SLICE) { - depth = ctu.m_cuDepth[absPartIdx]; - - log->totalCu++; - totQP += ctu.m_qp[absPartIdx] * (ctu.m_numPartitions >> (depth * 2)); - - if (ctu.m_predMode[absPartIdx] == MODE_NONE) - 
log->totalCu--; - else if (ctu.isSkipped(absPartIdx)) + depth = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) { - if (ctu.m_mergeFlag[0]) - log->cntMergeCu[depth]++; - else - log->cntSkipCu[depth]++; - } - else if (ctu.isInter(absPartIdx)) - { - log->cntInter[depth]++; + depth = ctu.m_cuDepth[absPartIdx]; - if (ctu.m_partSize[absPartIdx] < AMP_ID) - log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++; - else - log->cuInterDistribution[depth][AMP_ID]++; - } - else if (ctu.isIntra(absPartIdx)) - { + log->totalCu++; log->cntIntra[depth]++; - if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) + if (ctu.m_predMode[absPartIdx] == MODE_NONE) + { + log->totalCu--; + log->cntIntra[depth]--; + } + else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) { + /* TODO: log intra modes at absPartIdx +0 to +3 */ X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n"); log->cntIntraNxN++; log->cntIntra[depth]--; - /* TODO: log intra modes at absPartIdx +0 to +3 */ } else if (ctu.m_lumaIntraDir[absPartIdx] > 1) log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; @@ -1815,6 +1828,51 @@ int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log) log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; } } + else + { + depth = 0; + for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) + { + depth = ctu.m_cuDepth[absPartIdx]; + + log->totalCu++; + + if (ctu.m_predMode[absPartIdx] == MODE_NONE) + log->totalCu--; + else if (ctu.isSkipped(absPartIdx)) + { + if (ctu.m_mergeFlag[0]) + log->cntMergeCu[depth]++; + else + log->cntSkipCu[depth]++; + } + else if (ctu.isInter(absPartIdx)) + { + log->cntInter[depth]++; + + if (ctu.m_partSize[absPartIdx] < AMP_ID) + log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++; + else + 
log->cuInterDistribution[depth][AMP_ID]++; + } + else if (ctu.isIntra(absPartIdx)) + { + log->cntIntra[depth]++; + + if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) + { + X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n"); + log->cntIntraNxN++; + log->cntIntra[depth]--; + /* TODO: log intra modes at absPartIdx +0 to +3 */ + } + else if (ctu.m_lumaIntraDir[absPartIdx] > 1) + log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; + else + log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; + } + } + } } return totQP; diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp index d685f27384..37605e1a8a 100644 --- a/source/encoder/framefilter.cpp +++ b/source/encoder/framefilter.cpp @@ -35,107 +35,126 @@ using namespace X265_NS; static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height); static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt); -static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride) +namespace X265_NS { - int32_t v = pix[0] + pix[1] + pix[2] + pix[3]; - for (int16_t x = 0; x < stride - 4; x++) + static void integral_init4h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 4] - pix[x]; + int32_t v = pix[0] + pix[1] + pix[2] + pix[3]; + for (int16_t x = 0; x < stride - 4; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 4] - pix[x]; + } } -} -static void integral_init8h(uint32_t *sum, pixel *pix, intptr_t stride) -{ - int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7]; - for (int16_t x = 0; x < stride - 8; x++) + static void integral_init8h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 8] - pix[x]; + int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] 
+ pix[6] + pix[7]; + for (int16_t x = 0; x < stride - 8; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 8] - pix[x]; + } } -} -static void integral_init12h(uint32_t *sum, pixel *pix, intptr_t stride) -{ - int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + - pix[8] + pix[9] + pix[10] + pix[11]; - for (int16_t x = 0; x < stride - 12; x++) + static void integral_init12h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 12] - pix[x]; + int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + + pix[8] + pix[9] + pix[10] + pix[11]; + for (int16_t x = 0; x < stride - 12; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 12] - pix[x]; + } } -} -static void integral_init16h(uint32_t *sum, pixel *pix, intptr_t stride) -{ - int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + - pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15]; - for (int16_t x = 0; x < stride - 16; x++) + static void integral_init16h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 16] - pix[x]; + int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + + pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15]; + for (int16_t x = 0; x < stride - 16; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 16] - pix[x]; + } } -} -static void integral_init24h(uint32_t *sum, pixel *pix, intptr_t stride) -{ - int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + - pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] + - pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23]; - for (int16_t x = 0; x < stride - 24; x++) + static void integral_init24h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 24] - pix[x]; + int32_t v = 
pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + + pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] + + pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23]; + for (int16_t x = 0; x < stride - 24; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 24] - pix[x]; + } } -} -static void integral_init32h(uint32_t *sum, pixel *pix, intptr_t stride) -{ - int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + - pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] + - pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23] + - pix[24] + pix[25] + pix[26] + pix[27] + pix[28] + pix[29] + pix[30] + pix[31]; - for (int16_t x = 0; x < stride - 32; x++) + static void integral_init32h_c(uint32_t *sum, pixel *pix, intptr_t stride) { - sum[x] = v + sum[x - stride]; - v += pix[x + 32] - pix[x]; + int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] + + pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] + + pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23] + + pix[24] + pix[25] + pix[26] + pix[27] + pix[28] + pix[29] + pix[30] + pix[31]; + for (int16_t x = 0; x < stride - 32; x++) + { + sum[x] = v + sum[x - stride]; + v += pix[x + 32] - pix[x]; + } } -} -static void integral_init4v(uint32_t *sum4, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum4[x] = sum4[x + 4 * stride] - sum4[x]; -} + static void integral_init4v_c(uint32_t *sum4, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum4[x] = sum4[x + 4 * stride] - sum4[x]; + } -static void integral_init8v(uint32_t *sum8, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum8[x] = sum8[x + 8 * stride] - sum8[x]; -} + static void integral_init8v_c(uint32_t *sum8, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum8[x] = sum8[x + 8 * stride] - sum8[x]; + } -static void 
integral_init12v(uint32_t *sum12, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum12[x] = sum12[x + 12 * stride] - sum12[x]; -} + static void integral_init12v_c(uint32_t *sum12, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum12[x] = sum12[x + 12 * stride] - sum12[x]; + } -static void integral_init16v(uint32_t *sum16, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum16[x] = sum16[x + 16 * stride] - sum16[x]; -} + static void integral_init16v_c(uint32_t *sum16, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum16[x] = sum16[x + 16 * stride] - sum16[x]; + } -static void integral_init24v(uint32_t *sum24, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum24[x] = sum24[x + 24 * stride] - sum24[x]; -} + static void integral_init24v_c(uint32_t *sum24, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum24[x] = sum24[x + 24 * stride] - sum24[x]; + } -static void integral_init32v(uint32_t *sum32, intptr_t stride) -{ - for (int x = 0; x < stride; x++) - sum32[x] = sum32[x + 32 * stride] - sum32[x]; + static void integral_init32v_c(uint32_t *sum32, intptr_t stride) + { + for (int x = 0; x < stride; x++) + sum32[x] = sum32[x + 32 * stride] - sum32[x]; + } + + void setupSeaIntegralPrimitives_c(EncoderPrimitives &p) + { + p.integral_initv[INTEGRAL_4] = integral_init4v_c; + p.integral_initv[INTEGRAL_8] = integral_init8v_c; + p.integral_initv[INTEGRAL_12] = integral_init12v_c; + p.integral_initv[INTEGRAL_16] = integral_init16v_c; + p.integral_initv[INTEGRAL_24] = integral_init24v_c; + p.integral_initv[INTEGRAL_32] = integral_init32v_c; + p.integral_inith[INTEGRAL_4] = integral_init4h_c; + p.integral_inith[INTEGRAL_8] = integral_init8h_c; + p.integral_inith[INTEGRAL_12] = integral_init12h_c; + p.integral_inith[INTEGRAL_16] = integral_init16h_c; + p.integral_inith[INTEGRAL_24] = integral_init24h_c; + p.integral_inith[INTEGRAL_32] = integral_init32h_c; + } } void FrameFilter::destroy() @@ -166,8 +185,8 @@ void 
FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t m_pad[0] = top->m_sps.conformanceWindow.rightOffset; m_pad[1] = top->m_sps.conformanceWindow.bottomOffset; m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0; - m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize; - m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; + m_lastHeight = (m_param->sourceHeight % m_param->maxCUSize) ? (m_param->sourceHeight % m_param->maxCUSize) : m_param->maxCUSize; + m_lastWidth = (m_param->sourceWidth % m_param->maxCUSize) ? (m_param->sourceWidth % m_param->maxCUSize) : m_param->maxCUSize; integralCompleted.set(0); if (m_param->bEnableSsim) @@ -195,7 +214,7 @@ void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t for(int row = 0; row < numRows; row++) { // Setting maximum bound information - m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize; + m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : m_param->maxCUSize; m_parallelFilter[row].m_row = row; m_parallelFilter[row].m_rowAddr = row * numCols; m_parallelFilter[row].m_frameFilter = this; @@ -281,7 +300,7 @@ static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Fram void FrameFilter::ParallelFilter::copySaoAboveRef(const CUData *ctu, PicYuv* reconPic, uint32_t cuAddr, int col) { // Copy SAO Top Reference Pixels - int ctuWidth = g_maxCUSize; + int ctuWidth = ctu->m_encData->m_param->maxCUSize; const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (ctu->m_bFirstRowInSlice ? 
0 : reconPic->m_stride); // Luma @@ -682,8 +701,8 @@ void FrameFilter::processPostRow(int row) intptr_t stride2 = m_frame->m_fencPic->m_stride; uint32_t bEnd = ((row) == (this->m_numRows - 1)); uint32_t bStart = (row == 0); - uint32_t minPixY = row * g_maxCUSize - 4 * !bStart; - uint32_t maxPixY = X265_MIN((row + 1) * g_maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight); + uint32_t minPixY = row * m_param->maxCUSize - 4 * !bStart; + uint32_t maxPixY = X265_MIN((row + 1) * m_param->maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight); uint32_t ssim_cnt; x265_emms(); @@ -749,7 +768,7 @@ void FrameFilter::processPostRow(int row) uint32_t width = reconPic->m_picWidth; uint32_t height = m_parallelFilter[row].getCUHeight(); intptr_t stride = reconPic->m_stride; - uint32_t cuHeight = g_maxCUSize; + uint32_t cuHeight = m_param->maxCUSize; if (!row) m_frameEncoder->m_checksum[0] = 0; @@ -793,18 +812,18 @@ void FrameFilter::computeMEIntegral(int row) } int stride = (int)m_frame->m_reconPic->m_stride; - int padX = g_maxCUSize + 32; - int padY = g_maxCUSize + 16; + int padX = m_param->maxCUSize + 32; + int padY = m_param->maxCUSize + 16; int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight; - int maxHeight = numCuInHeight * g_maxCUSize; + int maxHeight = numCuInHeight * m_param->maxCUSize; int startRow = 0; if (m_param->interlaceMode) - startRow = (row * g_maxCUSize >> 1); + startRow = (row * m_param->maxCUSize >> 1); else - startRow = row * g_maxCUSize; + startRow = row * m_param->maxCUSize; - int height = lastRow ? (maxHeight + g_maxCUSize * m_param->interlaceMode) : (((row + m_param->interlaceMode) * g_maxCUSize) + g_maxCUSize); + int height = lastRow ? 
(maxHeight + m_param->maxCUSize * m_param->interlaceMode) : (((row + m_param->interlaceMode) * m_param->maxCUSize) + m_param->maxCUSize); if (!row) { @@ -833,47 +852,47 @@ void FrameFilter::computeMEIntegral(int row) uint32_t *sum4x4 = m_frame->m_encData->m_meIntegral[11] + (y + 1) * stride - padX; /*For width = 32 */ - integral_init32h(sum32x32, pix, stride); + primitives.integral_inith[INTEGRAL_32](sum32x32, pix, stride); if (y >= 32 - padY) - integral_init32v(sum32x32 - 32 * stride, stride); - integral_init32h(sum32x24, pix, stride); + primitives.integral_initv[INTEGRAL_32](sum32x32 - 32 * stride, stride); + primitives.integral_inith[INTEGRAL_32](sum32x24, pix, stride); if (y >= 24 - padY) - integral_init24v(sum32x24 - 24 * stride, stride); - integral_init32h(sum32x8, pix, stride); + primitives.integral_initv[INTEGRAL_24](sum32x24 - 24 * stride, stride); + primitives.integral_inith[INTEGRAL_32](sum32x8, pix, stride); if (y >= 8 - padY) - integral_init8v(sum32x8 - 8 * stride, stride); + primitives.integral_initv[INTEGRAL_8](sum32x8 - 8 * stride, stride); /*For width = 24 */ - integral_init24h(sum24x32, pix, stride); + primitives.integral_inith[INTEGRAL_24](sum24x32, pix, stride); if (y >= 32 - padY) - integral_init32v(sum24x32 - 32 * stride, stride); + primitives.integral_initv[INTEGRAL_32](sum24x32 - 32 * stride, stride); /*For width = 16 */ - integral_init16h(sum16x16, pix, stride); + primitives.integral_inith[INTEGRAL_16](sum16x16, pix, stride); if (y >= 16 - padY) - integral_init16v(sum16x16 - 16 * stride, stride); - integral_init16h(sum16x12, pix, stride); + primitives.integral_initv[INTEGRAL_16](sum16x16 - 16 * stride, stride); + primitives.integral_inith[INTEGRAL_16](sum16x12, pix, stride); if (y >= 12 - padY) - integral_init12v(sum16x12 - 12 * stride, stride); - integral_init16h(sum16x4, pix, stride); + primitives.integral_initv[INTEGRAL_12](sum16x12 - 12 * stride, stride); + primitives.integral_inith[INTEGRAL_16](sum16x4, pix, stride); if (y >= 4 - padY) 
- integral_init4v(sum16x4 - 4 * stride, stride); + primitives.integral_initv[INTEGRAL_4](sum16x4 - 4 * stride, stride); /*For width = 12 */ - integral_init12h(sum12x16, pix, stride); + primitives.integral_inith[INTEGRAL_12](sum12x16, pix, stride); if (y >= 16 - padY) - integral_init16v(sum12x16 - 16 * stride, stride); + primitives.integral_initv[INTEGRAL_16](sum12x16 - 16 * stride, stride); /*For width = 8 */ - integral_init8h(sum8x32, pix, stride); + primitives.integral_inith[INTEGRAL_8](sum8x32, pix, stride); if (y >= 32 - padY) - integral_init32v(sum8x32 - 32 * stride, stride); - integral_init8h(sum8x8, pix, stride); + primitives.integral_initv[INTEGRAL_32](sum8x32 - 32 * stride, stride); + primitives.integral_inith[INTEGRAL_8](sum8x8, pix, stride); if (y >= 8 - padY) - integral_init8v(sum8x8 - 8 * stride, stride); + primitives.integral_initv[INTEGRAL_8](sum8x8 - 8 * stride, stride); /*For width = 4 */ - integral_init4h(sum4x16, pix, stride); + primitives.integral_inith[INTEGRAL_4](sum4x16, pix, stride); if (y >= 16 - padY) - integral_init16v(sum4x16 - 16 * stride, stride); - integral_init4h(sum4x4, pix, stride); + primitives.integral_initv[INTEGRAL_16](sum4x16 - 16 * stride, stride); + primitives.integral_inith[INTEGRAL_4](sum4x4, pix, stride); if (y >= 4 - padY) - integral_init4v(sum4x4 - 4 * stride, stride); + primitives.integral_initv[INTEGRAL_4](sum4x4 - 4 * stride, stride); } m_parallelFilter[row].m_frameFilter->integralCompleted.set(1); } diff --git a/source/encoder/framefilter.h b/source/encoder/framefilter.h index 1bbcabb5b5..19a6d64f4a 100644 --- a/source/encoder/framefilter.h +++ b/source/encoder/framefilter.h @@ -123,7 +123,7 @@ class FrameFilter uint32_t getCUWidth(int colNum) const { - return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize; + return (colNum == (int)m_numCols - 1) ? 
m_lastWidth : m_param->maxCUSize; } void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols); diff --git a/source/encoder/motion.cpp b/source/encoder/motion.cpp index fba2419082..0cd7de5317 100644 --- a/source/encoder/motion.cpp +++ b/source/encoder/motion.cpp @@ -598,6 +598,139 @@ void MotionEstimate::StarPatternSearch(ReferencePlanes *ref, } } +void MotionEstimate::refineMV(ReferencePlanes* ref, + const MV& mvmin, + const MV& mvmax, + const MV& qmvp, + MV& outQMv) +{ + ALIGN_VAR_16(int, costs[16]); + if (ctuAddr >= 0) + blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0); + intptr_t stride = ref->lumaStride; + pixel* fenc = fencPUYuv.m_buf[0]; + pixel* fref = ref->fpelPlane[0] + blockOffset; + + setMVP(qmvp); + + MV qmvmin = mvmin.toQPel(); + MV qmvmax = mvmax.toQPel(); + + /* The term cost used here means satd/sad values for that particular search. + * The costs used in ME integer search only includes the SAD cost of motion + * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD + * cost of residual and sqrtLambda * MVD bits. 
+ */ + + // measure SATD cost at clipped QPEL MVP + MV pmv = qmvp.clipped(qmvmin, qmvmax); + MV bestpre = pmv; + int bprecost; + + bprecost = subpelCompare(ref, pmv, sad); + + /* re-measure full pel rounded MVP with SAD as search start point */ + MV bmv = pmv.roundToFPel(); + int bcost = bprecost; + if (pmv.isSubpel()) + bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); + + /* square refine */ + int dir = 0; + COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); + if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[0], dir, 1); + if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[1], dir, 2); + COPY2_IF_LT(bcost, costs[2], dir, 3); + COPY2_IF_LT(bcost, costs[3], dir, 4); + COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs); + if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[0], dir, 5); + if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[1], dir, 6); + if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[2], dir, 7); + if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y)) + COPY2_IF_LT(bcost, costs[3], dir, 8); + bmv += square1[dir]; + + if (bprecost < bcost) + { + bmv = bestpre; + bcost = bprecost; + } + else + bmv = bmv.toQPel(); // promote search bmv to qpel + + // TO DO: Change SubpelWorkload to fine tune MV + // Now it is set to 5 for experiment. 
+ // const SubpelWorkload& wl = workload[this->subpelRefine]; + const SubpelWorkload& wl = workload[5]; + + pixelcmp_t hpelcomp; + + if (wl.hpel_satd) + { + bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); + hpelcomp = satd; + } + else + hpelcomp = sad; + + for (int iter = 0; iter < wl.hpel_iters; iter++) + { + int bdir = 0; + for (int i = 1; i <= wl.hpel_dirs; i++) + { + MV qmv = bmv + square1[i] * 2; + + // check mv range for slice bound + if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)) + continue; + + int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + if (bdir) + bmv += square1[bdir] * 2; + else + break; + } + + /* if HPEL search used SAD, remeasure with SATD before QPEL */ + if (!wl.hpel_satd) + bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); + + for (int iter = 0; iter < wl.qpel_iters; iter++) + { + int bdir = 0; + for (int i = 1; i <= wl.qpel_dirs; i++) + { + MV qmv = bmv + square1[i]; + + // check mv range for slice bound + if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y)) + continue; + + int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); + COPY2_IF_LT(bcost, cost, bdir, i); + } + + if (bdir) + bmv += square1[bdir]; + else + break; + } + + // check mv range for slice bound + X265_CHECK(((pmv.y >= qmvmin.y) & (pmv.y <= qmvmax.y)), "mv beyond range!"); + + x265_emms(); + outQMv = bmv; +} + int MotionEstimate::motionEstimate(ReferencePlanes *ref, const MV & mvmin, const MV & mvmax, @@ -606,6 +739,7 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref, const MV * mvc, int merange, MV & outQMv, + uint32_t maxSlices, pixel * srcReferencePlane) { ALIGN_VAR_16(int, costs[16]); @@ -1306,7 +1440,7 @@ int MotionEstimate::motionEstimate(ReferencePlanes *ref, const SubpelWorkload& wl = workload[this->subpelRefine]; // check mv range for slice bound - if ((g_maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y))) + if ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y))) { bmv.y = 
x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y); bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); diff --git a/source/encoder/motion.h b/source/encoder/motion.h index 866b977d58..7d3653e80e 100644 --- a/source/encoder/motion.h +++ b/source/encoder/motion.h @@ -92,7 +92,8 @@ class MotionEstimate : public BitCost chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize); } - int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, pixel *srcReferencePlane = 0); + void refineMV(ReferencePlanes* ref, const MV& mvmin, const MV& mvmax, const MV& qmvp, MV& outQMv); + int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv, uint32_t maxSlices, pixel *srcReferencePlane = 0); int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t); diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp index c6346d7ab8..77c66cf137 100644 --- a/source/encoder/ratecontrol.cpp +++ b/source/encoder/ratecontrol.cpp @@ -2272,7 +2272,7 @@ double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, d uint32_t refRowSatdCost = 0, refRowBits = 0, intraCostForPendingCus = 0; double refQScale = 0; - if (picType != I_SLICE) + if (picType != I_SLICE && !m_param->rc.bEnableConstVbv) { FrameData& refEncData = *refFrame->m_encData; uint32_t endCuAddr = maxCols * (row + 1); @@ -2301,7 +2301,8 @@ double RateControl::predictRowsSizeSum(Frame* curFrame, RateControlEntry* rce, d && refFrame && refFrame->m_encData->m_slice->m_sliceType == picType && refQScale > 0 - && refRowSatdCost > 0) + && refRowBits > 0 + && !m_param->rc.bEnableConstVbv) { if (abs((int32_t)(refRowSatdCost - satdCostForPendingCus)) < (int32_t)satdCostForPendingCus / 2) { @@ -2343,7 +2344,7 @@ int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, 
RateControlEnt } rowSatdCost >>= X265_DEPTH - 8; updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost, encodedBits); - if (curEncData.m_slice->m_sliceType != I_SLICE) + if (curEncData.m_slice->m_sliceType != I_SLICE && !m_param->rc.bEnableConstVbv) { Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0]; if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp) @@ -2613,7 +2614,7 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++) avgQpAq += curEncData.m_rowStat[i].sumQpAq; - avgQpAq /= (slice->m_sps->numCUsInFrame * NUM_4x4_PARTITIONS); + avgQpAq /= (slice->m_sps->numCUsInFrame * m_param->num4x4Partitions); curEncData.m_avgQpAq = avgQpAq; } else @@ -2711,6 +2712,13 @@ int RateControl::rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* { *filler = updateVbv(actualBits, rce); + curFrame->m_rcData->bufferFillFinal = m_bufferFillFinal; + for (int i = 0; i < 4; i++) + { + curFrame->m_rcData->coeff[i] = m_pred[i].coeff; + curFrame->m_rcData->count[i] = m_pred[i].count; + curFrame->m_rcData->offset[i] = m_pred[i].offset; + } if (m_param->bEmitHRDSEI) { const VUI *vui = &curEncData.m_slice->m_sps->vuiParameters; diff --git a/source/encoder/reference.cpp b/source/encoder/reference.cpp index e843061e1f..f99a1795b2 100644 --- a/source/encoder/reference.cpp +++ b/source/encoder/reference.cpp @@ -72,12 +72,12 @@ int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p) if (wp) { - uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; + uint32_t numCUinHeight = (reconPic->m_picHeight + p.maxCUSize - 1) / p.maxCUSize; int marginX = reconPic->m_lumaMarginX; int marginY = reconPic->m_lumaMarginY; intptr_t stride = reconPic->m_stride; - int cuHeight = g_maxCUSize; + int cuHeight = p.maxCUSize; for (int c = 0; c < (p.internalCsp != X265_CSP_I400 && recPic->m_picCsp != X265_CSP_I400 ? 
numInterpPlanes : 1); c++) { @@ -127,15 +127,15 @@ void MotionReference::applyWeight(uint32_t finishedRows, uint32_t maxNumRows, ui int marginY = reconPic->m_lumaMarginY; intptr_t stride = reconPic->m_stride; int width = reconPic->m_picWidth; - int height = (finishedRows - numWeightedRows) * g_maxCUSize; + int height = (finishedRows - numWeightedRows) * reconPic->m_param->maxCUSize; /* the last row may be partial height */ if (finishedRows == maxNumRows - 1) { - const int leftRows = (reconPic->m_picHeight & (g_maxCUSize - 1)); + const int leftRows = (reconPic->m_picHeight & (reconPic->m_param->maxCUSize - 1)); - height += leftRows ? leftRows : g_maxCUSize; + height += leftRows ? leftRows : reconPic->m_param->maxCUSize; } - int cuHeight = g_maxCUSize; + int cuHeight = reconPic->m_param->maxCUSize; for (int c = 0; c < numInterpPlanes; c++) { diff --git a/source/encoder/sao.cpp b/source/encoder/sao.cpp index 2530bb83d0..a74db48ab4 100644 --- a/source/encoder/sao.cpp +++ b/source/encoder/sao.cpp @@ -98,8 +98,8 @@ bool SAO::create(x265_param* param, int initCommon) m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp); m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp); - m_numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; - m_numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + m_numCuInWidth = (m_param->sourceWidth + m_param->maxCUSize - 1) / m_param->maxCUSize; + m_numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; const pixel maxY = (1 << X265_DEPTH) - 1; const pixel rangeExt = maxY >> 1; @@ -107,12 +107,12 @@ bool SAO::create(x265_param* param, int initCommon) for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 
3 : 1); i++) { - CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1); - CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1); + CHECKED_MALLOC(m_tmpL1[i], pixel, m_param->maxCUSize + 1); + CHECKED_MALLOC(m_tmpL2[i], pixel, m_param->maxCUSize + 1); // SAO asm code will read 1 pixel before and after, so pad by 2 // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here - CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32); + CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * m_param->maxCUSize + 2 + 32); m_tmpU[i] += 1; } @@ -279,8 +279,8 @@ void SAO::applyPixelOffsets(int addr, int typeIdx, int plane) uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; const CUData* cu = m_frame->m_encData->getPicCTU(addr); - int ctuWidth = g_maxCUSize; - int ctuHeight = g_maxCUSize; + int ctuWidth = m_param->maxCUSize; + int ctuHeight = m_param->maxCUSize; uint32_t lpelx = cu->m_cuPelX; uint32_t tpely = cu->m_cuPelY; const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; @@ -573,8 +573,8 @@ void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX) { PicYuv* reconPic = m_frame->m_reconPic; intptr_t stride = reconPic->m_stride; - int ctuWidth = g_maxCUSize; - int ctuHeight = g_maxCUSize; + int ctuWidth = m_param->maxCUSize; + int ctuHeight = m_param->maxCUSize; int addr = idxY * m_numCuInWidth + idxX; pixel* rec = reconPic->getLumaAddr(addr); @@ -633,8 +633,8 @@ void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX) { PicYuv* reconPic = m_frame->m_reconPic; intptr_t stride = reconPic->m_strideC; - int ctuWidth = g_maxCUSize; - int ctuHeight = g_maxCUSize; + int ctuWidth = m_param->maxCUSize; + int ctuHeight = m_param->maxCUSize; { ctuWidth >>= m_hChromaShift; @@ -744,8 +744,8 @@ void SAO::calcSaoStatsCTU(int addr, int plane) intptr_t stride = plane ? 
reconPic->m_strideC : reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; - int ctuWidth = g_maxCUSize; - int ctuHeight = g_maxCUSize; + int ctuWidth = m_param->maxCUSize; + int ctuHeight = m_param->maxCUSize; uint32_t lpelx = cu->m_cuPelX; uint32_t tpely = cu->m_cuPelY; const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; @@ -791,9 +791,9 @@ void SAO::calcSaoStatsCTU(int addr, int plane) // WARNING: *) May read beyond bound on video than ctuWidth or ctuHeight is NOT multiple of cuSize X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n"); if (plane) - primitives.chroma[m_chromaFormat].cu[g_maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); + primitives.chroma[m_chromaFormat].cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); else - primitives.cu[g_maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); + primitives.cu[m_param->maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride); } else { @@ -928,8 +928,8 @@ void SAO::calcSaoStatsCu_BeforeDblk(Frame* frame, int idxX, int idxY) intptr_t stride = reconPic->m_stride; uint32_t picWidth = m_param->sourceWidth; uint32_t picHeight = m_param->sourceHeight; - int ctuWidth = g_maxCUSize; - int ctuHeight = g_maxCUSize; + int ctuWidth = m_param->maxCUSize; + int ctuHeight = m_param->maxCUSize; uint32_t lpelx = cu->m_cuPelX; uint32_t tpely = cu->m_cuPelY; const uint32_t firstRowInSlice = cu->m_bFirstRowInSlice; @@ -1553,14 +1553,17 @@ void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& r } // Estimate Best Position - int64_t bestRDCostBO = MAX_INT64; int32_t bestClassBO = 0; + int64_t currentRDCost = costClasses[0]; + currentRDCost += costClasses[1]; + currentRDCost += costClasses[2]; + currentRDCost += costClasses[3]; + int64_t bestRDCostBO = currentRDCost; - for (int i = 0; i < MAX_NUM_SAO_CLASS 
- SAO_NUM_OFFSET + 1; i++) + for (int i = 1; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++) { - int64_t currentRDCost = 0; - for (int j = i; j < i + SAO_NUM_OFFSET; j++) - currentRDCost += costClasses[j]; + currentRDCost -= costClasses[i - 1]; + currentRDCost += costClasses[i + 3]; if (currentRDCost < bestRDCostBO) { diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index e5e7ff1a0c..21a0ed8ffc 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -120,8 +120,8 @@ bool Search::initSearch(const x265_param& param, ScalingList& scalingList) CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2); m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL; m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC; - ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp); - ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp); + ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp); + ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp); } } else @@ -130,15 +130,15 @@ bool Search::initSearch(const x265_param& param, ScalingList& scalingList) { CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL); m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL; - ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp); - ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp); + ok &= m_rqt[i].reconQtYuv.create(param.maxCUSize, param.internalCsp); + ok &= m_rqt[i].resiQtYuv.create(param.maxCUSize, param.internalCsp); } } /* the rest of these buffers are indexed per-depth */ - for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) { - int cuSize = g_maxCUSize >> i; + int cuSize = param.maxCUSize >> i; ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp); ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp); ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp); @@ -186,7 +186,7 @@ Search::~Search() 
m_rqt[i].resiQtYuv.destroy(); } - for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t i = 0; i <= m_param->maxCUDepth; i++) { m_rqt[i].tmpResiYuv.destroy(); m_rqt[i].tmpPredYuv.destroy(); @@ -2073,7 +2073,7 @@ void Search::singleMotionEstimation(Search& master, Mode& interMode, const Predi int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref); MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; - if (!m_param->analysisMode) /* Prevents load/save outputs from diverging if lowresMV is not available */ + if (!m_param->analysisReuseMode) /* Prevents load/save outputs from diverging if lowresMV is not available */ { MV lmv = getLowresMV(interMode.cu, pu, list, ref); if (lmv.notZero()) @@ -2082,7 +2082,7 @@ void Search::singleMotionEstimation(Search& master, Mode& interMode, const Predi setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax); - int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, + int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_param->bSourceReferenceEstimation ? 
m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); /* Get total cost of partition, but only include MV bit cost once */ @@ -2108,6 +2108,17 @@ void Search::singleMotionEstimation(Search& master, Mode& interMode, const Predi } } +void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv) +{ + CUData& cu = interMode.cu; + const Slice *slice = m_slice; + MV mv = cu.m_mv[list][pu.puAbsPartIdx]; + cu.clipMv(mv); + MV mvmin, mvmax; + setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax); + m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv); +} + /* find the best inter prediction for each PU of specified mode */ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2]) { @@ -2150,7 +2161,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours); /* Uni-directional prediction */ - if ((m_param->analysisMode == X265_ANALYSIS_LOAD && m_param->analysisRefineLevel > 1) + if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead)) { for (int list = 0; list < numPredDir; list++) @@ -2180,7 +2191,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx) mvpIn = bestME[list].mv; - int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, + int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_param->bSourceReferenceEstimation ? 
m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); /* Get total cost of partition, but only include MV bit cost once */ @@ -2286,7 +2297,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma int mvpIdx = selectMVP(cu, pu, amvp, list, ref); MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; - if (!m_param->analysisMode) /* Prevents load/save outputs from diverging when lowresMV is not available */ + if (!m_param->analysisReuseMode) /* Prevents load/save outputs from diverging when lowresMV is not available */ { MV lmv = getLowresMV(cu, pu, list, ref); if (lmv.notZero()) @@ -2300,7 +2311,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride; } setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax); - int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, + int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_param->bSourceReferenceEstimation ? 
m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0); /* Get total cost of partition, but only include MV bit cost once */ @@ -2582,11 +2593,11 @@ void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mv cu.clipMv(mvmax); if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE && - cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirStartCol && + cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirStartCol && m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth) { int safeX, maxSafeMv; - safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3; + safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3; maxSafeMv = (safeX - cu.m_cuPelX) * 4; mvmax.x = X265_MIN(mvmax.x, maxSafeMv); mvmin.x = X265_MIN(mvmin.x, maxSafeMv); diff --git a/source/encoder/search.h b/source/encoder/search.h index 2f9805b495..f6cc651992 100644 --- a/source/encoder/search.h +++ b/source/encoder/search.h @@ -204,9 +204,9 @@ struct CUStats memset(this, 0, sizeof(*this)); } - void accumulate(CUStats& other) + void accumulate(CUStats& other, x265_param& param) { - for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t i = 0; i <= param.maxCUDepth; i++) { intraRDOElapsedTime[i] += other.intraRDOElapsedTime[i]; interRDOElapsedTime[i] += other.interRDOElapsedTime[i]; @@ -311,6 +311,7 @@ class Search : public Predict // estimation inter prediction (non-skip) void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]); + void searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv); // encode residual and compute rd-cost for inter mode void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); void encodeResAndCalcRdSkipCU(Mode& interMode); diff --git a/source/encoder/sei.cpp b/source/encoder/sei.cpp index 52bed8441e..24c9d4a627 100644 --- 
a/source/encoder/sei.cpp +++ b/source/encoder/sei.cpp @@ -54,21 +54,23 @@ void SEI::write(Bitstream& bs, const SPS& sps) } WRITE_CODE(type, 8, "payload_type"); uint32_t payloadSize; - if (hrdTypes || m_payloadType == USER_DATA_UNREGISTERED) + if (hrdTypes || m_payloadType == USER_DATA_UNREGISTERED || m_payloadType == USER_DATA_REGISTERED_ITU_T_T35) { if (hrdTypes) { X265_CHECK(0 == (count.getNumberOfWrittenBits() & 7), "payload unaligned\n"); payloadSize = count.getNumberOfWrittenBits() >> 3; } - else + else if (m_payloadType == USER_DATA_UNREGISTERED) payloadSize = m_payloadSize + 16; + else + payloadSize = m_payloadSize; for (; payloadSize >= 0xff; payloadSize -= 0xff) WRITE_CODE(0xff, 8, "payload_size"); WRITE_CODE(payloadSize, 8, "payload_size"); } - else if(m_payloadType != USER_DATA_REGISTERED_ITU_T_T35) + else WRITE_CODE(m_payloadSize, 8, "payload_size"); /* virtual writeSEI method, write to bs */ writeSEI(sps); diff --git a/source/encoder/sei.h b/source/encoder/sei.h index b87688e9e6..ac7a9132ef 100644 --- a/source/encoder/sei.h +++ b/source/encoder/sei.h @@ -276,27 +276,17 @@ class SEICreativeIntentMeta : public SEI m_payloadSize = 0; } - uint8_t *cim; + uint8_t *m_payload; // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com ) void writeSEI(const SPS&) { - if (!cim) + if (!m_payload) return; - int i = 0; - int payloadSize = m_payloadSize; - while (cim[i] == 0xFF) - { - i++; - payloadSize += cim[i]; - WRITE_CODE(0xFF, 8, "payload_size"); - } - WRITE_CODE(payloadSize, 8, "payload_size"); - i++; - payloadSize += i; - for (; i < payloadSize; ++i) - WRITE_CODE(cim[i], 8, "creative_intent_metadata"); + uint32_t i = 0; + for (; i < m_payloadSize; ++i) + WRITE_CODE(m_payload[i], 8, "creative_intent_metadata"); } }; } diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index d3f62f4f75..d7638a4902 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -893,7 +893,7 @@ void 
Lookahead::getEstimatedPictureCost(Frame *curFrame) if (m_param->rc.cuTree && !m_param->rc.bStatRead) /* update row satds based on cutree offsets */ curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); - else if (m_param->analysisMode != X265_ANALYSIS_LOAD) + else if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || m_param->scaleFactor) { if (m_param->rc.aqMode) curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; @@ -907,7 +907,7 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame) curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b]; uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0; uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); - uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize; uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height; double *qp_offset = 0; /* Factor in qpoffsets based on Aq/Cutree in CU costs */ @@ -1638,6 +1638,13 @@ bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, in m_isSceneTransition = false; /* Signal end of scene transitioning */ } + if (m_param->csvLogLevel >= 2) + { + int64_t icost = frames[p1]->costEst[0][0]; + int64_t pcost = frames[p1]->costEst[p1 - p0][0]; + frames[p1]->ipCostRatio = (double)icost / pcost; + } + /* A frame is always analysed with bRealScenecut = true first, and then bRealScenecut = false, the former for I decisions and the latter for P/B decisions. 
It's possible that the first analysis detected scenecuts which were later nulled due to scene transitioning, in which @@ -1812,7 +1819,8 @@ void Lookahead::calcMotionAdaptiveQuantFrame(Lowres **frames, int p0, int p1, in MV *mvs = frames[b]->lowresMvs[list][listDist[list]]; int32_t x = mvs[cuIndex].x; int32_t y = mvs[cuIndex].y; - displacement += sqrt(pow(abs(x), 2) + pow(abs(y), 2)); + // NOTE: the dynamic range of abs(x) and abs(y) is 15-bits + displacement += sqrt((double)(abs(x) * abs(x)) + (double)(abs(y) * abs(y))); } else displacement += 0.0; @@ -2400,7 +2408,7 @@ void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int /* ME will never return a cost larger than the cost @MVP, so we do not * have to check that ME cost is more than the estimated merge cost */ - fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV); + fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices); if (skipCost < 64 && skipCost < fencCost && bBidir) { fencCost = skipCost; diff --git a/source/test/ipfilterharness.cpp b/source/test/ipfilterharness.cpp index 312a878a25..95ac9c77b4 100644 --- a/source/test/ipfilterharness.cpp +++ b/source/test/ipfilterharness.cpp @@ -38,10 +38,8 @@ IPFilterHarness::IPFilterHarness() { pixel_test_buff[0][i] = rand() & PIXEL_MAX; short_test_buff[0][i] = (rand() % (2 * SMAX)) - SMAX; - pixel_test_buff[1][i] = PIXEL_MIN; - short_test_buff[1][i] = SMIN; - + short_test_buff[1][i] = (int16_t)SMIN; pixel_test_buff[2][i] = PIXEL_MAX; short_test_buff[2][i] = SMAX; } diff --git a/source/test/ipfilterharness.h b/source/test/ipfilterharness.h index 3edbd6a3a8..fcf4360483 100644 --- a/source/test/ipfilterharness.h +++ b/source/test/ipfilterharness.h @@ -39,8 +39,7 @@ class IPFilterHarness : public TestHarness enum { ITERS = 100 }; enum { TEST_CASES = 3 }; enum { SMAX = 1 << 12 }; - enum { SMIN = -1 << 12 }; - + enum { SMIN = (unsigned)-1 << 12 }; 
ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]); int16_t short_buff[TEST_BUF_SIZE]; int16_t IPF_vec_output_s[TEST_BUF_SIZE]; diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp index 8727d2e771..4feee58dcc 100644 --- a/source/test/pixelharness.cpp +++ b/source/test/pixelharness.cpp @@ -44,9 +44,8 @@ PixelHarness::PixelHarness() uchar_test_buff[0][i] = rand() % ((1 << 8) - 1); residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only double_test_buff[0][i] = (double)(short_test_buff[0][i]) / 256.0; - pixel_test_buff[1][i] = PIXEL_MIN; - short_test_buff[1][i] = SMIN; + short_test_buff[1][i] = (int16_t)SMIN; short_test_buff1[1][i] = PIXEL_MIN; short_test_buff2[1][i] = -16384; int_test_buff[1][i] = SHORT_MIN; @@ -2003,6 +2002,76 @@ bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChrom return true; } +bool PixelHarness::check_integral_initv(integralv_t ref, integralv_t opt) +{ + intptr_t srcStep = 64; + int j = 0; + uint32_t dst_ref[BUFFSIZE] = { 0 }; + uint32_t dst_opt[BUFFSIZE] = { 0 }; + + for (int i = 0; i < 64; i++) + { + dst_ref[i] = pixel_test_buff[0][i]; + dst_opt[i] = pixel_test_buff[0][i]; + } + + for (int i = 0, k = 0; i < BUFFSIZE; i++) + { + if (i % 64 == 0) + k++; + dst_ref[i] = dst_ref[i % 64] + k; + dst_opt[i] = dst_opt[i % 64] + k; + } + + int padx = 4; + int pady = 4; + uint32_t *dst_ref_ptr = dst_ref + srcStep * pady + padx; + uint32_t *dst_opt_ptr = dst_opt + srcStep * pady + padx; + for (int i = 0; i < ITERS; i++) + { + ref(dst_ref_ptr, srcStep); + checked(opt, dst_opt_ptr, srcStep); + + if (memcmp(dst_ref, dst_opt, sizeof(uint32_t) * BUFFSIZE)) + return false; + + reportfail() + j += INCR; + } + return true; +} + +bool PixelHarness::check_integral_inith(integralh_t ref, integralh_t opt) +{ + /* Since stride is always a multiple of 8 and data movement in AVX2 is 16 elements at a time for 8 bit pixel, we need + * to check correctness for two cases: stride multiple of 16 and 
stride not a multiple of 16; fine for High bit depth + * where data movement in AVX2 is 8 elements at a time */ + intptr_t srcStep[2] = { 56, 64 }; + int j = 0; + uint32_t dst_ref[BUFFSIZE] = { 0 }; + uint32_t dst_opt[BUFFSIZE] = { 0 }; + + int padx = 4; + int pady = 4; + for (int l = 0; l < 2; l++) + { + uint32_t *dst_ref_ptr = dst_ref + srcStep[l] * pady + padx; + uint32_t *dst_opt_ptr = dst_opt + srcStep[l] * pady + padx; + for (int k = 0; k < ITERS; k++) + { + ref(dst_ref_ptr, pixel_test_buff[0], srcStep[l]); + checked(opt, dst_opt_ptr, pixel_test_buff[0], srcStep[l]); + + if (memcmp(dst_ref, dst_opt, sizeof(uint32_t) * BUFFSIZE)) + return false; + + reportfail() + j += INCR; + } + } + return true; +} + bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt) { if (opt.pu[part].satd) @@ -2688,6 +2757,64 @@ bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr } } + for (int k = 0; k < NUM_INTEGRAL_SIZE; k++) + { + if (opt.integral_initv[k] && !check_integral_initv(ref.integral_initv[k], opt.integral_initv[k])) + { + switch (k) + { + case 0: + printf("Integral4v failed!\n"); + break; + case 1: + printf("Integral8v failed!\n"); + break; + case 2: + printf("Integral12v failed!\n"); + break; + case 3: + printf("Integral16v failed!\n"); + break; + case 4: + printf("Integral24v failed!\n"); + break; + case 5: + printf("Integral32v failed!\n"); + break; + } + return false; + } + } + + + for (int k = 0; k < NUM_INTEGRAL_SIZE; k++) + { + if (opt.integral_inith[k] && !check_integral_inith(ref.integral_inith[k], opt.integral_inith[k])) + { + switch (k) + { + case 0: + printf("Integral4h failed!\n"); + break; + case 1: + printf("Integral8h failed!\n"); + break; + case 2: + printf("Integral12h failed!\n"); + break; + case 3: + printf("Integral16h failed!\n"); + break; + case 4: + printf("Integral24h failed!\n"); + break; + case 5: + printf("Integral32h failed!\n"); + break; + } + return false; + } + } return 
true; } @@ -3210,4 +3337,67 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi HEADER0("pelFilterChroma_Horizontal"); REPORT_SPEEDUP(opt.pelFilterChroma[1], ref.pelFilterChroma[1], pbuf1, 1, STRIDE, tc, maskP, maskQ); } + + for (int k = 0; k < NUM_INTEGRAL_SIZE; k++) + { + if (opt.integral_initv[k]) + { + switch (k) + { + case 0: + HEADER0("integral_init4v"); + break; + case 1: + HEADER0("integral_init8v"); + break; + case 2: + HEADER0("integral_init12v"); + break; + case 3: + HEADER0("integral_init16v"); + break; + case 4: + HEADER0("integral_init24v"); + break; + case 5: + HEADER0("integral_init32v"); + break; + default: + break; + } + REPORT_SPEEDUP(opt.integral_initv[k], ref.integral_initv[k], (uint32_t*)pbuf1, STRIDE); + } + } + + for (int k = 0; k < NUM_INTEGRAL_SIZE; k++) + { + if (opt.integral_inith[k]) + { + uint32_t dst_buf[BUFFSIZE] = { 0 }; + switch (k) + { + case 0: + HEADER0("integral_init4h"); + break; + case 1: + HEADER0("integral_init8h"); + break; + case 2: + HEADER0("integral_init12h"); + break; + case 3: + HEADER0("integral_init16h"); + break; + case 4: + HEADER0("integral_init24h"); + break; + case 5: + HEADER0("integral_init32h"); + break; + default: + break; + } + REPORT_SPEEDUP(opt.integral_inith[k], ref.integral_inith[k], dst_buf, pbuf1, STRIDE); + } + } } diff --git a/source/test/pixelharness.h b/source/test/pixelharness.h index e67edb450b..08eac39983 100644 --- a/source/test/pixelharness.h +++ b/source/test/pixelharness.h @@ -40,7 +40,7 @@ class PixelHarness : public TestHarness enum { BUFFSIZE = STRIDE * (MAX_HEIGHT + PAD_ROWS) + INCR * ITERS }; enum { TEST_CASES = 3 }; enum { SMAX = 1 << 12 }; - enum { SMIN = -1 << 12 }; + enum { SMIN = (unsigned)-1 << 12 }; enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max) enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min) @@ -126,6 +126,8 @@ class 
PixelHarness : public TestHarness bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt); bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt); bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt); + bool check_integral_initv(integralv_t ref, integralv_t opt); + bool check_integral_inith(integralh_t ref, integralh_t opt); public: diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index 2e3df7cce3..1e35dc0499 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -17,17 +17,17 @@ BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-tempo BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao -BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --refine-level 2 --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --refine-level 2 --bitrate 7000 --limit-modes +BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 2 --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 2 --bitrate 7000 --limit-modes BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4 -BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --refine-level 10 --bitrate 7000 --limit-tu 0,--preset slower --no-cutree --analysis-mode=load --refine-level 10 --bitrate 7000 --limit-tu 0 +BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree 
--analysis-reuse-mode=save --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0,--preset slower --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3 -BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast --limit-tu 4,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --tskip-fast --limit-tu 4 +BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-reuse-mode=save --bitrate 7000 --tskip-fast --limit-tu 4,--preset veryslow --no-cutree --analysis-reuse-mode=load --bitrate 7000 --tskip-fast --limit-tu 4 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190 -Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --refine-level 1 --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --refine-level 1 --bitrate 15000 +Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 1 --bitrate 15000,--preset veryfast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 1 --bitrate 15000 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16 @@ -51,7 +51,7 @@ DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4 
DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3 -DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1 +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-reuse-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1,--preset fast --no-cutree --analysis-reuse-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd @@ -68,8 +68,8 @@ KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16 KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2 -NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --rd 5 --refine-level 10 --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --rd 5 --refine-level 10 --bitrate 9000 -News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --refine-level 2 --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --refine-level 2 --bitrate 15000 +NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-reuse-mode=save --rd 5 --analysis-reuse-level 10 --bitrate 9000,--preset slow --no-cutree --analysis-reuse-mode=load --rd 5 --analysis-reuse-level 10 --bitrate 9000 
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 2 --bitrate 15000,--preset ultrafast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 2 --bitrate 15000 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16 @@ -123,7 +123,7 @@ old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32 old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency -old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --refine-level 1 --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --refine-level 1 --bitrate 3000 --early-skip +old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 1 --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 1 --bitrate 3000 --early-skip old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless diff --git a/source/x265-extras.cpp b/source/x265-extras.cpp index e488ab68bc..58cf0d4c35 100644 --- a/source/x265-extras.cpp +++ b/source/x265-extras.cpp @@ -25,7 +25,7 @@ #include "x265.h" #include "x265-extras.h" - +#include "param.h" #include "common.h" using namespace X265_NS; @@ -38,14 +38,8 @@ static const char* summaryCSVHeader = "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " "MaxCLL, MaxFALL, Version\n"; -FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level) +FILE* x265_csvlog_open(const x265_param& param, 
const char* fname, int level) { - if (sizeof(x265_stats) != api.sizeof_stats || sizeof(x265_picture) != api.sizeof_picture) - { - fprintf(stderr, "extras [error]: structure size skew, unable to create CSV logfile\n"); - return NULL; - } - FILE *csvfp = x265_fopen(fname, "r"); if (csvfp) { @@ -62,6 +56,8 @@ FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* if (level) { fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, "); + if (level >= 2) + fprintf(csvfp, "I/P cost ratio, "); if (param.rc.rateControlMode == X265_RC_CRF) fprintf(csvfp, "RateFactor, "); if (param.rc.vbvBufferSize) @@ -73,7 +69,7 @@ FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fprintf(csvfp, "Latency, "); fprintf(csvfp, "List 0, List 1"); uint32_t size = param.maxCUSize; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) { fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size); size /= 2; @@ -82,7 +78,7 @@ FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* size = param.maxCUSize; if (param.bEnableRectInter) { - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) { fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size); if (param.bEnableAMP) @@ -92,29 +88,56 @@ FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* } else { - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) { fprintf(csvfp, ", Inter %dx%d", size, size); size /= 2; } } size = param.maxCUSize; - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) { fprintf(csvfp, ", Skip %dx%d", size, size); size /= 2; } size = param.maxCUSize; - for (uint32_t depth = 0; depth 
<= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) { fprintf(csvfp, ", Merge %dx%d", size, size); size /= 2; } - fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy"); - /* detailed performance statistics */ if (level >= 2) - fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks"); + { + fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Residual Energy," + " Min Luma Level, Max Luma Level, Avg Luma Level"); + + if (param.internalCsp != X265_CSP_I400) + fprintf(csvfp, ", Min Cb Level, Max Cb Level, Avg Cb Level, Min Cr Level, Max Cr Level, Avg Cr Level"); + + /* PU statistics */ + size = param.maxCUSize; + for (uint32_t i = 0; i< param.maxLog2CUSize - (uint32_t)g_log2Size[param.minCUSize] + 1; i++) + { + fprintf(csvfp, ", Intra %dx%d", size, size); + fprintf(csvfp, ", Skip %dx%d", size, size); + fprintf(csvfp, ", AMP %d", size); + fprintf(csvfp, ", Inter %dx%d", size, size); + fprintf(csvfp, ", Merge %dx%d", size, size); + fprintf(csvfp, ", Inter %dx%d", size, size / 2); + fprintf(csvfp, ", Merge %dx%d", size, size / 2); + fprintf(csvfp, ", Inter %dx%d", size / 2, size); + fprintf(csvfp, ", Merge %dx%d", size / 2, size); + size /= 2; + } + + if ((uint32_t)g_log2Size[param.minCUSize] == 3) + fprintf(csvfp, ", 4x4"); + + /* detailed performance statistics */ + fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms)," + "Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks"); + } fprintf(csvfp, "\n"); } else @@ -131,7 +154,10 @@ void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& return; const x265_frame_stats* frameStats = &pic.frameData; - fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, 
frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut); + fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, + frameStats->qp, (int)frameStats->bits, frameStats->bScenecut); + if (level >= 2) + fprintf(csvfp, "%.2f,", frameStats->ipCostRatio); if (param.rc.rateControlMode == X265_RC_CRF) fprintf(csvfp, "%.3lf,", frameStats->rateFactor); if (param.rc.vbvBufferSize) @@ -159,39 +185,76 @@ void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& else fputs(" -,", csvfp); } - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) - fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]); - fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN); - if (param.bEnableRectInter) + + if (level) { - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) + fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], + frameStats->cuStats.percentIntraDistribution[depth][1], + frameStats->cuStats.percentIntraDistribution[depth][2]); + fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN); + if (param.bEnableRectInter) { - fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]); - if (param.bEnableAMP) - fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]); + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) + { + fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], + frameStats->cuStats.percentInterDistribution[depth][1]); + if (param.bEnableAMP) + fprintf(csvfp, ", %5.2lf%%", 
frameStats->cuStats.percentInterDistribution[depth][2]); + } } + else + { + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) + fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]); + } + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) + fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]); + for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++) + fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]); } - else - { - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) - fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]); - } - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) - fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]); - for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) - fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]); - fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy); if (level >= 2) { - fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime, frameStats->totalFrameTime); + fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf ", frameStats->avgLumaDistortion, + frameStats->avgChromaDistortion, + frameStats->avgPsyEnergy, + frameStats->avgResEnergy); + + fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minLumaLevel, frameStats->maxLumaLevel, frameStats->avgLumaLevel); + + if (param.internalCsp != X265_CSP_I400) + { + fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minChromaULevel, frameStats->maxChromaULevel, frameStats->avgChromaULevel); + fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minChromaVLevel, frameStats->maxChromaVLevel, 
frameStats->avgChromaVLevel); + } + + for (uint32_t i = 0; i < param.maxLog2CUSize - (uint32_t)g_log2Size[param.minCUSize] + 1; i++) + { + fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentIntraPu[i]); + fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentSkipPu[i]); + fprintf(csvfp, ",%.2lf%%", frameStats->puStats.percentAmpPu[i]); + for (uint32_t j = 0; j < 3; j++) + { + fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentInterPu[i][j]); + fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentMergePu[i][j]); + } + } + if ((uint32_t)g_log2Size[param.minCUSize] == 3) + fprintf(csvfp, ",%.2lf%%", frameStats->puStats.percentNxN); + + fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, + frameStats->wallTime, frameStats->refWaitWallTime, + frameStats->totalCTUTime, frameStats->stallTime, + frameStats->totalFrameTime); + fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks); } fprintf(csvfp, "\n"); fflush(stderr); } -void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv) +void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, int padx, int pady, const x265_stats& stats, int level, int argc, char** argv) { if (!csvfp) return; @@ -204,13 +267,27 @@ void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& para } // CLI arguments or other - fputc('"', csvfp); - for (int i = 1; i < argc; i++) + if (argc) { - fputc(' ', csvfp); - fputs(argv[i], csvfp); + fputc('"', csvfp); + for (int i = 1; i < argc; i++) + { + fputc(' ', csvfp); + fputs(argv[i], csvfp); + } + fputc('"', csvfp); + } + else + { + const x265_param* paramTemp = ¶m; + char *opts = x265_param2string((x265_param*)paramTemp, padx, pady); + if (opts) + { + fputc('"', csvfp); + fputs(opts, csvfp); + fputc('"', csvfp); + } } - fputc('"', csvfp); // current date and time time_t now; 
diff --git a/source/x265-extras.h b/source/x265-extras.h index d4b10eb3b6..5b29345830 100644 --- a/source/x265-extras.h +++ b/source/x265-extras.h @@ -44,7 +44,7 @@ extern "C" { * closed by the caller using fclose(). If level is 0, then no frame logging * header is written to the file. This function will return NULL if it is unable * to open the file for write or if it detects a structure size skew */ -LIBAPI FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level); +LIBAPI FILE* x265_csvlog_open(const x265_param& param, const char* fname, int level); /* Log frame statistics to the CSV file handle. level should have been non-zero * in the call to x265_csvlog_open() if this function is called. */ @@ -53,7 +53,7 @@ LIBAPI void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_p /* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are * intended to be command line arguments passed to the encoder. Encode * statistics should be queried from the encoder just prior to closing it. */ -LIBAPI void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv); +LIBAPI void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, int padx, int pady, const x265_stats& stats, int level, int argc, char** argv); /* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using * the residual bits to dither each row. 
*/ diff --git a/source/x265.cpp b/source/x265.cpp index 9f61cd71ca..2703109832 100644 --- a/source/x265.cpp +++ b/source/x265.cpp @@ -73,15 +73,12 @@ struct CLIOptions ReconFile* recon; OutputFile* output; FILE* qpfile; - FILE* csvfpt; - const char* csvfn; const char* reconPlayCmd; const x265_api* api; x265_param* param; bool bProgress; bool bForceY4m; bool bDither; - int csvLogLevel; uint32_t seek; // number of frames to skip from the beginning uint32_t framesToBeEncoded; // number of frames to encode uint64_t totalbytes; @@ -97,8 +94,6 @@ struct CLIOptions recon = NULL; output = NULL; qpfile = NULL; - csvfpt = NULL; - csvfn = NULL; reconPlayCmd = NULL; api = NULL; param = NULL; @@ -109,7 +104,6 @@ struct CLIOptions startTime = x265_mdate(); prevUpdateTime = 0; bDither = false; - csvLogLevel = 0; } void destroy(); @@ -129,9 +123,6 @@ void CLIOptions::destroy() if (qpfile) fclose(qpfile); qpfile = NULL; - if (csvfpt) - fclose(csvfpt); - csvfpt = NULL; if (output) output->release(); output = NULL; @@ -292,8 +283,6 @@ bool CLIOptions::parse(int argc, char **argv) if (0) ; OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError); OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError); - OPT("csv") this->csvfn = optarg; - OPT("csv-log-level") this->csvLogLevel = x265_atoi(optarg, bError); OPT("no-progress") this->bProgress = false; OPT("output") outputfn = optarg; OPT("input") inputfn = optarg; @@ -530,8 +519,7 @@ static int get_argv_utf8(int *argc_ptr, char ***argv_ptr) * 1 - unable to parse command line * 2 - unable to open encoder * 3 - unable to generate stream headers - * 4 - encoder abort - * 5 - unable to open csv file */ + * 4 - encoder abort */ int main(int argc, char **argv) { @@ -586,28 +574,15 @@ int main(int argc, char **argv) /* get the encoder parameters post-initialization */ api->encoder_parameters(encoder, param); - if (cliopt.csvfn) - { - cliopt.csvfpt = x265_csvlog_open(*api, *param, cliopt.csvfn, 
cliopt.csvLogLevel); - if (!cliopt.csvfpt) - { - x265_log_file(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn); - cliopt.destroy(); - if (cliopt.api) - cliopt.api->param_free(cliopt.param); - exit(5); - } - } - - /* Control-C handler */ + /* Control-C handler */ if (signal(SIGINT, sigint_handler) == SIG_ERR) x265_log(param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s\n", strerror(errno)); x265_picture pic_orig, pic_out; x265_picture *pic_in = &pic_orig; - /* Allocate recon picture if analysisMode is enabled */ + /* Allocate recon picture if analysisReuseMode is enabled */ std::priority_queue<int64_t>* pts_queue = cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL; - x265_picture *pic_recon = (cliopt.recon || !!param->analysisMode || pts_queue || reconPlay || cliopt.csvLogLevel) ? &pic_out : NULL; + x265_picture *pic_recon = (cliopt.recon || !!param->analysisReuseMode || pts_queue || reconPlay || param->csvLogLevel) ? &pic_out : NULL; uint32_t inFrameCount = 0; uint32_t outFrameCount = 0; x265_nal *p_nal; @@ -698,8 +673,6 @@ int main(int argc, char **argv) } cliopt.printStatus(outFrameCount); - if (numEncoded && cliopt.csvLogLevel) - x265_csvlog_frame(cliopt.csvfpt, *param, *pic_recon, cliopt.csvLogLevel); } /* Flush the encoder */ @@ -730,8 +703,6 @@ int main(int argc, char **argv) } cliopt.printStatus(outFrameCount); - if (numEncoded && cliopt.csvLogLevel) - x265_csvlog_frame(cliopt.csvfpt, *param, *pic_recon, cliopt.csvLogLevel); if (!numEncoded) break; @@ -746,8 +717,8 @@ int main(int argc, char **argv) delete reconPlay; api->encoder_get_stats(encoder, &stats, sizeof(stats)); - if (cliopt.csvfpt && !b_ctrl_c) - x265_csvlog_encode(cliopt.csvfpt, api->version_str, *param, stats, cliopt.csvLogLevel, argc, argv); + if (param->csvfn && !b_ctrl_c) + api->encoder_log(encoder, argc, argv); api->encoder_close(encoder); int64_t second_largest_pts = 0; diff --git a/source/x265.h b/source/x265.h index 
f2ab68bb35..a242461682 100644 --- a/source/x265.h +++ b/source/x265.h @@ -24,10 +24,9 @@ #ifndef X265_H #define X265_H - #include <stdint.h> +#include <stdio.h> #include "x265_config.h" - #ifdef __cplusplus extern "C" { #endif @@ -98,6 +97,7 @@ typedef struct x265_analysis_data uint32_t sliceType; uint32_t numCUsInFrame; uint32_t numPartitions; + uint32_t depthBytes; int bScenecut; void* wt; void* interData; @@ -117,6 +117,20 @@ typedef struct x265_cu_stats } x265_cu_stats; +/* pu statistics */ +typedef struct x265_pu_stats +{ + double percentSkipPu[4]; // Percentage of skip cu in all depths + double percentIntraPu[4]; // Percentage of intra modes in all depths + double percentAmpPu[4]; // Percentage of amp modes in all depths + double percentInterPu[4][3]; // Percentage of inter 2nx2n, 2nxn and nx2n in all depths + double percentMergePu[4][3]; // Percentage of merge 2nx2n, 2nxn and nx2n in all depth + double percentNxN; + + /* All the above values will add up to 100%. */ +} x265_pu_stats; + + typedef struct x265_analysis_2Pass { uint32_t poc; @@ -154,13 +168,41 @@ typedef struct x265_frame_stats int list0POC[16]; int list1POC[16]; uint16_t maxLumaLevel; + uint16_t minLumaLevel; + + uint16_t maxChromaULevel; + uint16_t minChromaULevel; + double avgChromaULevel; + + + uint16_t maxChromaVLevel; + uint16_t minChromaVLevel; + double avgChromaVLevel; + char sliceType; int bScenecut; + double ipCostRatio; int frameLatency; x265_cu_stats cuStats; + x265_pu_stats puStats; double totalFrameTime; } x265_frame_stats; +typedef struct x265_ctu_info_t +{ + int32_t ctuAddress; + int32_t ctuPartitions[64]; + void* ctuInfo; +} x265_ctu_info_t; + +typedef enum +{ + NO_CTU_INFO = 0, + HAS_CTU_INFO = 1, + CTU_INFO_CHANGE = 2, +}CTUInfo; + + /* Arbitrary User SEI * Payload size is in bytes and the payload pointer must be non-NULL. * Payload types and syntax can be found in Annex D of the H.265 Specification. 
@@ -258,15 +300,15 @@ typedef struct x265_picture * to allow the encoder to determine base QP */ int forceqp; - /* If param.analysisMode is X265_ANALYSIS_OFF this field is ignored on input + /* If param.analysisReuseMode is X265_ANALYSIS_OFF this field is ignored on input * and output. Else the user must call x265_alloc_analysis_data() to * allocate analysis buffers for every picture passed to the encoder. * - * On input when param.analysisMode is X265_ANALYSIS_LOAD and analysisData + * On input when param.analysisReuseMode is X265_ANALYSIS_LOAD and analysisData * member pointers are valid, the encoder will use the data stored here to * reduce encoder work. * - * On output when param.analysisMode is X265_ANALYSIS_SAVE and analysisData + * On output when param.analysisReuseMode is X265_ANALYSIS_SAVE and analysisData * member pointers are valid, the encoder will write output analysis into * this data structure */ x265_analysis_data analysisData; @@ -612,7 +654,14 @@ typedef struct x265_param * X265_LOG_FULL, default is X265_LOG_INFO */ int logLevel; - /* Filename of CSV log. Now deprecated */ + /* Level of csv logging. 0 is summary, 1 is frame level logging, + * 2 is frame level logging with performance statistics */ + int csvLogLevel; + + /* filename of CSV log. If csvLogLevel is non-zero, the encoder will emit + * per-slice statistics to this log file in encode order. Otherwise the + * encoder will emit per-stream statistics into the log file when + * x265_encoder_log is called (presumably at the end of the encode) */ const char* csvfn; /*== Internal Picture Specification ==*/ @@ -1057,10 +1106,10 @@ typedef struct x265_param * buffers. if X265_ANALYSIS_LOAD, read analysis information into analysis * buffer and use this analysis information to reduce the amount of work * the encoder must perform. Default X265_ANALYSIS_OFF */ - int analysisMode; + int analysisReuseMode; - /* Filename for analysisMode save/load. 
Default name is "x265_analysis.dat" */ - const char* analysisFileName; + /* Filename for analysisReuseMode save/load. Default name is "x265_analysis.dat" */ + const char* analysisReuseFileName; /*== Rate Control ==*/ @@ -1194,6 +1243,9 @@ typedef struct x265_param /* sets a hard lower limit on QP */ int qpMin; + + /* internally enable if tune grain is set */ + int bEnableConstVbv; } rc; /*== Video Usability Information ==*/ @@ -1376,9 +1428,9 @@ typedef struct x265_param int bHDROpt; /* A value between 1 and 10 (both inclusive) determines the level of - * information stored/reused in save/load analysis-mode. Higher the refine - * level higher the informtion stored/reused. Default is 5 */ - int analysisRefineLevel; + * information stored/reused in save/load analysis-reuse-mode. Higher the refine + * level higher the information stored/reused. Default is 5 */ + int analysisReuseLevel; /* Limit Sample Adaptive Offset filter computation by early terminating SAO * process based on inter prediction mode, CTU spatial-domain correlations, @@ -1391,7 +1443,44 @@ typedef struct x265_param /* Insert tone mapping information only for IDR frames and when the * tone mapping information changes. */ int bDhdr10opt; + + /* Determine how x265 reacts to the content information received through the API */ + int bCTUInfo; + + /* Use ratecontrol statistics from pic_in, if available*/ + int bUseRcStats; + + /* Factor by which input video is scaled down for analysis save mode.
Default is 0 */ + int scaleFactor; + + /* Enable intra refinement in load mode*/ + int intraRefine; + + /* Enable inter refinement in load mode*/ + int interRefine; + + /* Enable motion vector refinement in load mode*/ + int mvRefine; + + /* Log of maximum CTU size */ + uint32_t maxLog2CUSize; + + /* Actual CU depth with respect to config depth */ + uint32_t maxCUDepth; + + /* CU depth with respect to maximum transform size */ + uint32_t unitSizeDepth; + + /* Number of 4x4 units in maximum CU size */ + uint32_t num4x4Partitions; + + /* Specify if analysis mode uses file for data reuse */ + int bUseAnalysisFile; + + /* File pointer for csv log */ + FILE* csvfpt; } x265_param; + /* x265_param_alloc: * Allocates an x265_param instance. The returned param structure is not * special in any way, but using this method together with x265_param_free() @@ -1558,7 +1647,8 @@ int x265_encoder_reconfig(x265_encoder *, x265_param *); void x265_encoder_get_stats(x265_encoder *encoder, x265_stats *, uint32_t statsSizeBytes); /* x265_encoder_log: - * This function is deprecated */ + * write a line to the configured CSV file. If a CSV filename was not + * configured, or file open failed, this function will perform no write. */ void x265_encoder_log(x265_encoder *encoder, int argc, char **argv); /* x265_encoder_close: @@ -1581,6 +1671,12 @@ void x265_encoder_close(x265_encoder *); int x265_encoder_intra_refresh(x265_encoder *); +/* x265_encoder_ctu_info: + * Copy CTU information such as ctu address and ctu partition structure of all + * CTUs in each frame. The function is invoked only if "--ctu-info" is enabled and + * the encoder will wait for this copy to complete if enabled. 
+ */ +int x265_encoder_ctu_info(x265_encoder *, int poc, x265_ctu_info_t** ctu); /* x265_cleanup: * release library static allocations, reset configured CTU size */ void x265_cleanup(void); @@ -1629,6 +1725,7 @@ typedef struct x265_api int sizeof_frame_stats; /* sizeof(x265_frame_stats) */ int (*encoder_intra_refresh)(x265_encoder*); + int (*encoder_ctu_info)(x265_encoder*, int, x265_ctu_info_t**); /* add new pointers to the end, or increment X265_MAJOR_VERSION */ } x265_api; diff --git a/source/x265cli.h b/source/x265cli.h index 7b85d952f7..14fd6ce7d9 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -122,6 +122,7 @@ static const struct option long_options[] = { "scenecut", required_argument, NULL, 0 }, { "no-scenecut", no_argument, NULL, 0 }, { "scenecut-bias", required_argument, NULL, 0 }, + { "ctu-info", required_argument, NULL, 0 }, { "intra-refresh", no_argument, NULL, 0 }, { "rc-lookahead", required_argument, NULL, 0 }, { "lookahead-slices", required_argument, NULL, 0 }, @@ -158,6 +159,8 @@ static const struct option long_options[] = { "qpstep", required_argument, NULL, 0 }, { "qpmin", required_argument, NULL, 0 }, { "qpmax", required_argument, NULL, 0 }, + { "const-vbv", no_argument, NULL, 0 }, + { "no-const-vbv", no_argument, NULL, 0 }, { "ratetol", required_argument, NULL, 0 }, { "cplxblur", required_argument, NULL, 0 }, { "qblur", required_argument, NULL, 0 }, @@ -247,9 +250,13 @@ static const struct option long_options[] = { "no-slow-firstpass", no_argument, NULL, 0 }, { "multi-pass-opt-rps", no_argument, NULL, 0 }, { "no-multi-pass-opt-rps", no_argument, NULL, 0 }, - { "analysis-mode", required_argument, NULL, 0 }, - { "analysis-file", required_argument, NULL, 0 }, - { "refine-level", required_argument, NULL, 0 }, + { "analysis-reuse-mode", required_argument, NULL, 0 }, + { "analysis-reuse-file", required_argument, NULL, 0 }, + { "analysis-reuse-level", required_argument, NULL, 0 }, + { "scale-factor", required_argument, NULL, 0 }, + { 
"refine-intra", required_argument, NULL, 0 }, + { "refine-inter", no_argument, NULL, 0 }, + { "no-refine-inter",no_argument, NULL, 0 }, { "strict-cbr", no_argument, NULL, 0 }, { "temporal-layers", no_argument, NULL, 0 }, { "no-temporal-layers", no_argument, NULL, 0 }, @@ -271,6 +278,8 @@ static const struct option long_options[] = { "dhdr10-info", required_argument, NULL, 0 }, { "dhdr10-opt", no_argument, NULL, 0}, { "no-dhdr10-opt", no_argument, NULL, 0}, + { "refine-mv", no_argument, NULL, 0 }, + { "no-refine-mv", no_argument, NULL, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, @@ -316,9 +325,9 @@ static void showHelp(x265_param *param) H1(" 1 - i420 (4:2:0 default)\n"); H1(" 2 - i422 (4:2:2)\n"); H1(" 3 - i444 (4:4:4)\n"); -#if ENABLE_DYNAMIC_HDR10 - H0(" --dhdr10-info <filename> JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping \n"); - H0(" --[no-]dhdr10-opt Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled"); +#if ENABLE_HDR10_PLUS + H0(" --dhdr10-info <filename> JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n"); + H0(" --[no-]dhdr10-opt Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n"); #endif H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n"); H0(" --seek <integer> First frame to encode\n"); @@ -367,6 +376,11 @@ static void showHelp(x265_param *param) H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast)); H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n"); H1(" --nr-inter <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. 
Default 0\n"); + H0(" --ctu-info <integer> Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n" + " - 1: force the partitions if CTU information is present\n" + " - 2: functionality of (1) and reduce qp if CTU information has changed\n" + " - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n" + " Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n"); H0("\nCoding tools:\n"); H0("-w/--[no-]weightp Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred)); H0(" --[no-]weightb Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred)); @@ -431,9 +445,13 @@ static void showHelp(x265_param *param) H0(" --[no-]analyze-src-pics Motion estimation uses source frame planes. Default disable\n"); H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass)); H0(" --[no-]strict-cbr Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr)); - H0(" --analysis-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisMode); - H0(" --analysis-file <filename> Specify file name used for either dumping or reading analysis data.\n"); - H0(" --refine-level <1..10> Level of analysis refinement indicates amount of info stored/reused in save/load mode, 1:least....10:most. Default %d\n", param->analysisRefineLevel); + H0(" --analysis-reuse-mode <string|int> save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisReuseMode); + H0(" --analysis-reuse-file <filename> Specify file name used for either dumping or reading analysis data. 
Default x265_analysis.dat\n"); + H0(" --analysis-reuse-level <1..10> Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel); + H0(" --scale-factor <int> Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor); + H0(" --refine-intra <int> Enable intra refinement for load mode. Default %d\n", param->intraRefine); + H0(" --[no-]refine-inter Enable inter refinement for load mode. Default %s\n", OPT(param->interRefine)); + H0(" --[no-]refine-mv Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine)); H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode); H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength); H0(" --[no-]aq-motion Adaptive Quantization based on the relative motion of each CU w.r.t., frame. Default %s\n", OPT(param->bOptCUDeltaQP)); @@ -446,6 +464,7 @@ static void showHelp(x265_param *param) H1(" --qpstep <integer> The maximum single adjustment in QP allowed to rate control. Default %d\n", param->rc.qpStep); H1(" --qpmin <integer> sets a hard lower limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMin); H1(" --qpmax <integer> sets a hard upper limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMax); + H0(" --[no-]const-vbv Enable consistent vbv. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableConstVbv)); H1(" --cbqpoffs <integer> Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset); H1(" --crqpoffs <integer> Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset); H1(" --scaling-list <string> Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");