From d5e8e10947951dde5b01f53751f7fa1ce1b0fe7f Mon Sep 17 00:00:00 2001
From: Kyle Xiao <xiaoguoqiao@bytedance.com>
Date: Tue, 6 Aug 2024 22:24:04 +0800
Subject: [PATCH] refactor: finalize reflect & README.md

---
 README.md                   | 79 ++++++++++++++++++++----------------
 README_cn.md                | 81 +++++++++++++++++++++----------------
 frugal_jit.go               | 10 +++++
 internal/reflect/decoder.go | 12 +++---
 options.go                  | 14 -------
 5 files changed, 109 insertions(+), 87 deletions(-)

diff --git a/README.md b/README.md
index e1e9fe5..f5e5ff3 100644
--- a/README.md
+++ b/README.md
@@ -2,50 +2,61 @@
 
 English | [中文](README_cn.md)
 
-A very fast dynamic Thrift serializer &amp; deserializer based on just-in-time compilation.
+A very fast dynamic Thrift serializer & deserializer without generating code.
+
+It implements a pure Go version (or reflect version) and a just-in-time (JIT) compilation version.
+Since the reflect version performs better than the JIT version in most cases and works on different CPU architectures,
+we plan to deprecate the JIT version starting with Go 1.24.
+
+Before Go 1.24:
+* The JIT version is the default serializer & deserializer on `amd64`.
+* The reflect version is enabled by default only when running on `!amd64` or `windows`.
+* You can enable the reflect version by setting the environment variable `FRUGAL_NO_JIT=1`.
+
+Since Go 1.24:
+* The reflect version is the default serializer & deserializer.
+* JIT code will be excluded from the build via Go build tags.
+
 
 ## Features
 
 ### Code Generation Free
 
-Traditional Thrift serializer and deserializer are based on generated code which is no longer needed since we can use JIT compilation to dynamically generate machine code.
+Traditional Thrift serializers and deserializers are based on generated code, which is no longer needed since Frugal makes use of struct field tags.
 
 ### High Performance
 
-Thanks to JIT compilation, Frugal can generate better machine code than Go language compiler. In multi-core scenarios, Frugal's performance is about 5 times higher than that of traditional serializer and deserializer.
+Based on the test cases in `frugal/tests`, Frugal's performance is 1 to 4 times better than Apache Thrift (TBinaryProtocol).
+
+There may be variations between different test cases. Feel free to share your test cases with us.
 
 ```text
-name                                 old time/op    new time/op     delta
-MarshalAllSize_Parallel/small-16       78.8ns ± 0%     14.9ns ± 0%    -81.10%
-MarshalAllSize_Parallel/medium-16      1.34µs ± 0%     0.32µs ± 0%    -76.32%
-MarshalAllSize_Parallel/large-16       37.7µs ± 0%      9.4µs ± 0%    -75.02%
-UnmarshalAllSize_Parallel/small-16      368ns ± 0%       30ns ± 0%    -91.90%
-UnmarshalAllSize_Parallel/medium-16    11.9µs ± 0%      0.8µs ± 0%    -92.98%
-UnmarshalAllSize_Parallel/large-16      233µs ± 0%       21µs ± 0%    -90.99%
-
-name                                 old speed      new speed       delta
-MarshalAllSize_Parallel/small-16     7.31GB/s ± 0%  38.65GB/s ± 0%   +428.84%
-MarshalAllSize_Parallel/medium-16    12.9GB/s ± 0%   54.7GB/s ± 0%   +322.10%
-MarshalAllSize_Parallel/large-16     11.7GB/s ± 0%   46.8GB/s ± 0%   +300.26%
-UnmarshalAllSize_Parallel/small-16   1.56GB/s ± 0%  19.31GB/s ± 0%  +1134.41%
-UnmarshalAllSize_Parallel/medium-16  1.46GB/s ± 0%  20.80GB/s ± 0%  +1324.55%
-UnmarshalAllSize_Parallel/large-16   1.89GB/s ± 0%  20.98GB/s ± 0%  +1009.73%
-
-name                                 old alloc/op   new alloc/op    delta
-MarshalAllSize_Parallel/small-16         112B ± 0%         0B        -100.00%
-MarshalAllSize_Parallel/medium-16        112B ± 0%         0B        -100.00%
-MarshalAllSize_Parallel/large-16         779B ± 0%        57B ± 0%    -92.68%
-UnmarshalAllSize_Parallel/small-16     1.31kB ± 0%     0.10kB ± 0%    -92.76%
-UnmarshalAllSize_Parallel/medium-16      448B ± 0%      3022B ± 0%   +574.55%
-UnmarshalAllSize_Parallel/large-16     1.13MB ± 0%     0.07MB ± 0%    -93.54%
-
-name                                 old allocs/op  new allocs/op   delta
-MarshalAllSize_Parallel/small-16         1.00 ± 0%       0.00        -100.00%
-MarshalAllSize_Parallel/medium-16        1.00 ± 0%       0.00        -100.00%
-MarshalAllSize_Parallel/large-16         1.00 ± 0%       0.00        -100.00%
-UnmarshalAllSize_Parallel/small-16       6.00 ± 0%       1.00 ± 0%    -83.33%
-UnmarshalAllSize_Parallel/medium-16      6.00 ± 0%      30.00 ± 0%   +400.00%
-UnmarshalAllSize_Parallel/large-16      4.80k ± 0%      0.76k ± 0%    -84.10%
+go version go1.22.4 darwin/arm64
+
+goos: linux
+goarch: amd64
+pkg: github.com/cloudwego/frugal/tests
+cpu: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
+
+BenchmarkAllSize_Marshal_ApacheThrift/small-4          	 2070745	     584.0 ns/op	 998.32 MB/s	     112 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_ApacheThrift/medium-4         	   78729	     13680 ns/op	1280.57 MB/s	     112 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_ApacheThrift/large-4          	    3097	    376184 ns/op	1179.75 MB/s	     620 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/small-4            	 4939591	     242.1 ns/op	2407.83 MB/s	      13 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/medium-4           	  160820	      7485 ns/op	2340.29 MB/s	      54 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/large-4            	    5370	    214258 ns/op	2071.35 MB/s	     338 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/small-4        	10171197	     117.3 ns/op	4970.90 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/medium-4       	  180207	      6644 ns/op	2636.73 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/large-4        	    6312	    185534 ns/op	2392.04 MB/s	       0 B/op	       0 allocs/op
+
+BenchmarkAllSize_Unmarshal_ApacheThrift/small-4        	  768525	      1443 ns/op	 403.94 MB/s	    1232 B/op	       5 allocs/op
+BenchmarkAllSize_Unmarshal_ApacheThrift/medium-4       	   24463	     47067 ns/op	 372.19 MB/s	   44816 B/op	     176 allocs/op
+BenchmarkAllSize_Unmarshal_ApacheThrift/large-4        	    1053	   1155725 ns/op	 384.00 MB/s	 1135540 B/op	    4433 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/small-4          	 2575767	     466.3 ns/op	1250.36 MB/s	     547 B/op	       2 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/medium-4         	   62128	     19333 ns/op	 906.12 MB/s	   19404 B/op	      89 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/large-4          	    2328	    496431 ns/op	 893.99 MB/s	  495906 B/op	    2283 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/small-4      	 2770252	     437.2 ns/op	1333.60 MB/s	     544 B/op	       1 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/medium-4     	   64232	     18183 ns/op	 963.45 MB/s	   19945 B/op	      57 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/large-4      	    2325	    496415 ns/op	 894.02 MB/s	  511876 B/op	    1467 allocs/op
 ```
 
 ## What can you do with Frugal ?
diff --git a/README_cn.md b/README_cn.md
index dc4d179..6daae84 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -2,50 +2,63 @@
 
 [English](README.md) | 中文
 
-一款基于 JIT 编译的高性能动态 Thrift 编解码器。
+一种无需生成代码、高性能的动态 Thrift 编解码器。
+
+它实现了纯 Go 版本（或 reflect 版本）和即时编译（JIT）版本。
+由于在大多数情况下，reflect版本的性能优于 JIT 版本，并且它可以在不同的 CPU 架构上运行，
+因此我们计划从 Go 1.24 开始废弃 JIT 版本。
+
+在 Go 1.24 之前：
+
+* JIT 版本是 `amd64` 的默认编解码器。
+* reflect 版本仅在运行在 `!amd64` 或 `windows` 上时启用
+* 您可以通过指定操作系统环境变量 `FRUGAL_NO_JIT=1` 来启用reflect版本
+
+自 Go 1.24 起：
+
+* reflect 版本成为默认编解码器。
+* JIT 代码将通过 go build tags 在构建时被跳过。
 
 ## 特点
 
 ### 无需生成代码
 
-传统的 Thrift 编解码方式，要求用户必须要先生成编解码代码，Frugal 通过 JIT 编译技术在运行时动态生成编解码机器代码，避免了这一过程。
+传统的 Thrift 编解码方式，要求用户必须要先生成编解码代码，Frugal 通过反射 struct field tag 动态生成编解码器避免了这一过程。
 
 ### 高性能
 
-基于 JIT 技术 Frugal 可以生成比 Go 语言编译器性能更好的机器代码，在多核场景下，Frugal 的性能可以达到传统编解码方式的 5 倍左右。
+基于 `frugal/tests` 的测试用例，Frugal 的性能比 Apache Thrift (TBinaryProtocol) 好 1 到 4 倍。
+
+不同的测试用例，结果可能会有些差异。欢迎给我们分享你的测试数据。
+
 
 ```text
-name                                 old time/op    new time/op     delta
-MarshalAllSize_Parallel/small-16       78.8ns ± 0%     14.9ns ± 0%    -81.10%
-MarshalAllSize_Parallel/medium-16      1.34µs ± 0%     0.32µs ± 0%    -76.32%
-MarshalAllSize_Parallel/large-16       37.7µs ± 0%      9.4µs ± 0%    -75.02%
-UnmarshalAllSize_Parallel/small-16      368ns ± 0%       30ns ± 0%    -91.90%
-UnmarshalAllSize_Parallel/medium-16    11.9µs ± 0%      0.8µs ± 0%    -92.98%
-UnmarshalAllSize_Parallel/large-16      233µs ± 0%       21µs ± 0%    -90.99%
-
-name                                 old speed      new speed       delta
-MarshalAllSize_Parallel/small-16     7.31GB/s ± 0%  38.65GB/s ± 0%   +428.84%
-MarshalAllSize_Parallel/medium-16    12.9GB/s ± 0%   54.7GB/s ± 0%   +322.10%
-MarshalAllSize_Parallel/large-16     11.7GB/s ± 0%   46.8GB/s ± 0%   +300.26%
-UnmarshalAllSize_Parallel/small-16   1.56GB/s ± 0%  19.31GB/s ± 0%  +1134.41%
-UnmarshalAllSize_Parallel/medium-16  1.46GB/s ± 0%  20.80GB/s ± 0%  +1324.55%
-UnmarshalAllSize_Parallel/large-16   1.89GB/s ± 0%  20.98GB/s ± 0%  +1009.73%
-
-name                                 old alloc/op   new alloc/op    delta
-MarshalAllSize_Parallel/small-16         112B ± 0%         0B        -100.00%
-MarshalAllSize_Parallel/medium-16        112B ± 0%         0B        -100.00%
-MarshalAllSize_Parallel/large-16         779B ± 0%        57B ± 0%    -92.68%
-UnmarshalAllSize_Parallel/small-16     1.31kB ± 0%     0.10kB ± 0%    -92.76%
-UnmarshalAllSize_Parallel/medium-16      448B ± 0%      3022B ± 0%   +574.55%
-UnmarshalAllSize_Parallel/large-16     1.13MB ± 0%     0.07MB ± 0%    -93.54%
-
-name                                 old allocs/op  new allocs/op   delta
-MarshalAllSize_Parallel/small-16         1.00 ± 0%       0.00        -100.00%
-MarshalAllSize_Parallel/medium-16        1.00 ± 0%       0.00        -100.00%
-MarshalAllSize_Parallel/large-16         1.00 ± 0%       0.00        -100.00%
-UnmarshalAllSize_Parallel/small-16       6.00 ± 0%       1.00 ± 0%    -83.33%
-UnmarshalAllSize_Parallel/medium-16      6.00 ± 0%      30.00 ± 0%   +400.00%
-UnmarshalAllSize_Parallel/large-16      4.80k ± 0%      0.76k ± 0%    -84.10%
+go version go1.22.4 darwin/arm64
+
+goos: linux
+goarch: amd64
+pkg: github.com/cloudwego/frugal/tests
+cpu: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
+
+BenchmarkAllSize_Marshal_ApacheThrift/small-4          	 2070745	     584.0 ns/op	 998.32 MB/s	     112 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_ApacheThrift/medium-4         	   78729	     13680 ns/op	1280.57 MB/s	     112 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_ApacheThrift/large-4          	    3097	    376184 ns/op	1179.75 MB/s	     620 B/op	       1 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/small-4            	 4939591	     242.1 ns/op	2407.83 MB/s	      13 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/medium-4           	  160820	      7485 ns/op	2340.29 MB/s	      54 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_JIT/large-4            	    5370	    214258 ns/op	2071.35 MB/s	     338 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/small-4        	10171197	     117.3 ns/op	4970.90 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/medium-4       	  180207	      6644 ns/op	2636.73 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAllSize_Marshal_Frugal_Reflect/large-4        	    6312	    185534 ns/op	2392.04 MB/s	       0 B/op	       0 allocs/op
+
+BenchmarkAllSize_Unmarshal_ApacheThrift/small-4        	  768525	      1443 ns/op	 403.94 MB/s	    1232 B/op	       5 allocs/op
+BenchmarkAllSize_Unmarshal_ApacheThrift/medium-4       	   24463	     47067 ns/op	 372.19 MB/s	   44816 B/op	     176 allocs/op
+BenchmarkAllSize_Unmarshal_ApacheThrift/large-4        	    1053	   1155725 ns/op	 384.00 MB/s	 1135540 B/op	    4433 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/small-4          	 2575767	     466.3 ns/op	1250.36 MB/s	     547 B/op	       2 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/medium-4         	   62128	     19333 ns/op	 906.12 MB/s	   19404 B/op	      89 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_JIT/large-4          	    2328	    496431 ns/op	 893.99 MB/s	  495906 B/op	    2283 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/small-4      	 2770252	     437.2 ns/op	1333.60 MB/s	     544 B/op	       1 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/medium-4     	   64232	     18183 ns/op	 963.45 MB/s	   19945 B/op	      57 allocs/op
+BenchmarkAllSize_Unmarshal_Frugal_Reflect/large-4      	    2325	    496415 ns/op	 894.02 MB/s	  511876 B/op	    1467 allocs/op
 ```
 
 ## 用 Frugal 可以做什么？
diff --git a/frugal_jit.go b/frugal_jit.go
index 416af02..6f65eae 100644
--- a/frugal_jit.go
+++ b/frugal_jit.go
@@ -19,7 +19,9 @@
 package frugal
 
 import (
+	"os"
 	"reflect"
+	"strconv"
 	"sync"
 
 	"github.com/cloudwego/frugal/internal/jit/decoder"
@@ -30,6 +32,14 @@ import (
 	"github.com/cloudwego/gopkg/protocol/thrift"
 )
 
+var nojit bool // set from FRUGAL_NO_JIT in init; per the README, a true value selects the reflect implementation over JIT
+
+func init() { // reads the FRUGAL_NO_JIT environment variable once at package load
+	if v, err := strconv.ParseBool(os.Getenv("FRUGAL_NO_JIT")); err == nil { // unset or unparsable values fail ParseBool and leave nojit false
+		nojit = v
+	}
+}
+
 func jitEncodedSize(val interface{}) int {
 	return encoder.EncodedSize(val)
 }
diff --git a/internal/reflect/decoder.go b/internal/reflect/decoder.go
index 343e388..94f392f 100644
--- a/internal/reflect/decoder.go
+++ b/internal/reflect/decoder.go
@@ -225,7 +225,6 @@ func (d *tDecoder) decodeType(t *tType, b []byte, p unsafe.Pointer, maxdepth int
 		// tmpv = decode(b)
 		// map[tmpk] = tmpv
 		tmp := t.MapTmpVarsPool.Get().(*tmpMapVars)
-		defer t.MapTmpVarsPool.Put(tmp)
 		k := tmp.k
 		v := tmp.v
 		kp := tmp.kp
@@ -247,6 +246,8 @@ func (d *tDecoder) decodeType(t *tType, b []byte, p unsafe.Pointer, maxdepth int
 			sliceV = d.Malloc(l*vt.V.Size, vt.V.Align, vt.V.MallocAbiType)
 		}
 
+		var n int
+		var err error
 		i := 6
 		for j := 0; j < l; j++ {
 			p = kp
@@ -260,8 +261,8 @@ func (d *tDecoder) decodeType(t *tType, b []byte, p unsafe.Pointer, maxdepth int
 			if kt.FixedSize > 0 {
 				i += decodeFixedSizeTypes(kt.T, b[i:], p)
 			} else {
-				if n, err := d.decodeType(kt, b[i:], p, maxdepth-1); err != nil {
-					return i, err
+				if n, err = d.decodeType(kt, b[i:], p, maxdepth-1); err != nil {
+					break
 				} else {
 					i += n
 				}
@@ -277,14 +278,15 @@ func (d *tDecoder) decodeType(t *tType, b []byte, p unsafe.Pointer, maxdepth int
 			if vt.FixedSize > 0 {
 				i += decodeFixedSizeTypes(vt.T, b[i:], p)
 			} else {
-				if n, err := d.decodeType(vt, b[i:], p, maxdepth-1); err != nil {
-					return i, err
+				if n, err = d.decodeType(vt, b[i:], p, maxdepth-1); err != nil {
+					break
 				} else {
 					i += n
 				}
 			}
 			m.SetMapIndex(k, v)
 		}
+		t.MapTmpVarsPool.Put(tmp) // no defer, it may be in hot path
 		return i, nil
 	case tLIST, tSET: // NOTE: for tSET, it may be map in the future
 		// list header
diff --git a/options.go b/options.go
index 4e49b76..304c871 100644
--- a/options.go
+++ b/options.go
@@ -18,24 +18,10 @@ package frugal
 
 import (
 	"fmt"
-	"os"
-	"runtime"
-	"strconv"
 
 	"github.com/cloudwego/frugal/internal/opts"
 )
 
-// Frugal supports `reflect` mode which faster than jit version in many scenarios,
-// and performs more stable in concurrency, also  works in different CPU architectures.
-// It will become a default option soon.
-var nojit = runtime.GOARCH != "amd64" // always nojit=true under non-amd64 env
-
-func init() {
-	if v, err := strconv.ParseBool(os.Getenv("FRUGAL_NO_JIT")); err == nil {
-		nojit = v
-	}
-}
-
 const (
 	_MinILSize = 1024
 )