From 561b4d863eec9c21f8a85ca8c0393bc43a7a359c Mon Sep 17 00:00:00 2001 From: "Jonathan A. Sternberg" Date: Thu, 11 Jan 2024 10:12:35 -0600 Subject: [PATCH] metrics: add build duration metric This adds a build duration metric with attributes related to the build id, build ref, driver name, and the resulting status of the build. This also modifies some aspects of how the resource is configured by including `service.instance.id`. This id is used to uniquely identify the CLI invocation for use in downstream aggregators. This allows downstream aggregators to know that the metric for an instance has not reset and that it is a unique invocation for future aggregation. Some work will have to be done in the aggregator to prevent the storage of this instance id as it can cause issues with cardinality limitations, but it's necessary for the initial reporter to include this. The temporality selector is still the same, but has been removed from the otlp exporter options since that one doesn't seem to do anything. I also recently learned the temporality didn't do what I thought it did. I thought it would allow for multiple different invocations to be treated as the same instance for the purposes of aggregation. It does not work this way as the metric sdk considers each of these a gap reset. That's the reason the instance id was added. This makes the difference between cumulative and delta mostly cosmetic, but delta temporality has some benefits for downstream consumers for aggregation so it's likely good to keep. The cli count has been removed as it doesn't contain anything new and wasn't a useful metric. The naming of metrics has also been changed from using `docker` as a prefix and instead relying on the instrumentation name. Signed-off-by: Jonathan A. Sternberg --- build/build.go | 61 + commands/bake.go | 8 +- commands/build.go | 39 +- go.mod | 4 +- go.sum | 6 + internal/env/env.go | 15 + {util => internal}/metrics/metrics.go | 111 +- {util => internal}/metrics/otlp.go | 8 +- internal/metrics/util.go | 33 + .../github.com/klauspost/cpuid/v2/.gitignore | 24 + .../klauspost/cpuid/v2/.goreleaser.yml | 74 + .../klauspost/cpuid/v2/CONTRIBUTING.txt | 35 + vendor/github.com/klauspost/cpuid/v2/LICENSE | 22 + .../github.com/klauspost/cpuid/v2/README.md | 137 ++ vendor/github.com/klauspost/cpuid/v2/cpuid.go | 1070 ++++++++++++++ .../github.com/klauspost/cpuid/v2/cpuid_386.s | 47 + .../klauspost/cpuid/v2/cpuid_amd64.s | 72 + .../klauspost/cpuid/v2/cpuid_arm64.s | 26 + .../klauspost/cpuid/v2/detect_arm64.go | 246 ++++ .../klauspost/cpuid/v2/detect_ref.go | 14 + .../klauspost/cpuid/v2/detect_x86.go | 35 + .../klauspost/cpuid/v2/featureid_string.go | 185 +++ .../klauspost/cpuid/v2/os_darwin_arm64.go | 19 + .../klauspost/cpuid/v2/os_linux_arm64.go | 130 ++ .../klauspost/cpuid/v2/os_other_arm64.go | 17 + .../klauspost/cpuid/v2/os_safe_linux_arm64.go | 7 + .../cpuid/v2/os_unsafe_linux_arm64.go | 10 + .../klauspost/cpuid/v2/test-architectures.sh | 15 + vendor/github.com/zeebo/xxh3/.gitignore | 6 + vendor/github.com/zeebo/xxh3/LICENSE | 25 + vendor/github.com/zeebo/xxh3/Makefile | 27 + vendor/github.com/zeebo/xxh3/README.md | 38 + vendor/github.com/zeebo/xxh3/_compat.c | 39 + vendor/github.com/zeebo/xxh3/accum_generic.go | 542 ++++++++ .../zeebo/xxh3/accum_stubs_amd64.go | 40 + .../zeebo/xxh3/accum_stubs_other.go | 25 + .../zeebo/xxh3/accum_vector_avx512_amd64.s | 379 +++++ .../zeebo/xxh3/accum_vector_avx_amd64.s | 586 ++++++++ .../zeebo/xxh3/accum_vector_sse_amd64.s | 1236 +++++++++++++++++ vendor/github.com/zeebo/xxh3/consts.go | 97 ++ vendor/github.com/zeebo/xxh3/hash128.go | 253 ++++ vendor/github.com/zeebo/xxh3/hash128_seed.go | 264 ++++ vendor/github.com/zeebo/xxh3/hash64.go | 126 ++ vendor/github.com/zeebo/xxh3/hash64_seed.go | 134 ++ vendor/github.com/zeebo/xxh3/hasher.go | 239 ++++ vendor/github.com/zeebo/xxh3/utils.go | 129 ++ vendor/modules.txt | 6 + 47 files changed, 6594 insertions(+), 67 deletions(-) create mode 100644 internal/env/env.go rename {util => internal}/metrics/metrics.go (62%) rename {util => internal}/metrics/otlp.go (87%) create mode 100644 internal/metrics/util.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/.gitignore create mode 100644 vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml create mode 100644 vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt create mode 100644 vendor/github.com/klauspost/cpuid/v2/LICENSE create mode 100644 vendor/github.com/klauspost/cpuid/v2/README.md create mode 100644 vendor/github.com/klauspost/cpuid/v2/cpuid.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/cpuid_386.s create mode 100644 vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s create mode 100644 vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s create mode 100644 vendor/github.com/klauspost/cpuid/v2/detect_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/detect_ref.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/detect_x86.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/featureid_string.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go create mode 100644 vendor/github.com/klauspost/cpuid/v2/test-architectures.sh create mode 100644 vendor/github.com/zeebo/xxh3/.gitignore create mode 100644 vendor/github.com/zeebo/xxh3/LICENSE create mode 100644 vendor/github.com/zeebo/xxh3/Makefile create mode 100644 vendor/github.com/zeebo/xxh3/README.md create mode 100644 vendor/github.com/zeebo/xxh3/_compat.c create mode 100644 vendor/github.com/zeebo/xxh3/accum_generic.go create mode 100644 vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go create mode 100644 vendor/github.com/zeebo/xxh3/accum_stubs_other.go create mode 100644 vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s create mode 100644 vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s create mode 100644 vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s create mode 100644 vendor/github.com/zeebo/xxh3/consts.go create mode 100644 vendor/github.com/zeebo/xxh3/hash128.go create mode 100644 vendor/github.com/zeebo/xxh3/hash128_seed.go create mode 100644 vendor/github.com/zeebo/xxh3/hash64.go create mode 100644 vendor/github.com/zeebo/xxh3/hash64_seed.go create mode 100644 vendor/github.com/zeebo/xxh3/hasher.go create mode 100644 vendor/github.com/zeebo/xxh3/utils.go diff --git a/build/build.go b/build/build.go index 2ae0fbd8e783..b2c8e03e7d99 100644 --- a/build/build.go +++ b/build/build.go @@ -26,6 +26,7 @@ import ( "github.com/distribution/reference" "github.com/docker/buildx/builder" "github.com/docker/buildx/driver" + "github.com/docker/buildx/internal/metrics" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/dockerutil" "github.com/docker/buildx/util/imagetools" @@ -55,6 +56,9 @@ import ( specs "github.com/opencontainers/image-spec/specs-go/v1" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "github.com/zeebo/xxh3" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" ) @@ -686,6 +690,14 @@ func BuildWithResultHandler(ctx context.Context, nodes []builder.Node, opt map[s return err } + record := metrics.Measure(ctx, "build.duration", "Measures the total build duration.", + metric.WithAttributes( + backendAttribute(dp), + buildIDAttribute(so), + buildRefAttribute(so), + ), + ) + frontendInputs := make(map[string]*pb.Definition) for key, st := range so.FrontendInputs { def, err := st.Marshal(ctx) @@ -785,6 +797,11 @@ func BuildWithResultHandler(ctx context.Context, nodes []builder.Node, opt map[s } else { rr, err = c.Build(ctx, *so, "buildx", buildFunc, ch) } + + record(ctx, metric.WithAttributes( + statusAttribute(err), + )) + if desktop.BuildBackendEnabled() && node.Driver.HistoryAPISupported(ctx) { buildRef := fmt.Sprintf("%s/%s/%s", node.Builder, node.Name, so.Ref) if err != nil { @@ -1523,3 +1540,47 @@ func ReadSourcePolicy() (*spb.Policy, error) { return &pol, nil } + +// backendAttribute is a utility to retrieve the backend attribute from a resolvedNode. +func backendAttribute(dp *resolvedNode) attribute.KeyValue { + driverName := dp.Node().Driver.Factory().Name() + return attribute.String("backend", driverName) +} + +// buildIDAttribute is a utility to retrieve the build id attribute from the solve options. +// This value should be consistent between builds. +func buildIDAttribute(so *client.SolveOpt) attribute.KeyValue { + vcs := so.FrontendAttrs["vcs:source"] + target := so.FrontendAttrs["target"] + context := so.FrontendAttrs["context"] + filename := so.FrontendAttrs["filename"] + + buildID := "" + if vcs != "" || target != "" || context != "" || filename != "" { + h := xxh3.New() + for _, s := range []string{vcs, target, context, filename} { + h.WriteString(s) + h.Write([]byte{0}) + } + buildID = hex.EncodeToString(h.Sum(nil)) + } + return attribute.String("build.id", buildID) +} + +// buildRefAttribute is a utility to retrieve the build ref attribute from the solve options. +// This value should be unique to each build. +func buildRefAttribute(so *client.SolveOpt) attribute.KeyValue { + return attribute.String("build.ref", so.Ref) +} + +// statusAttribute is a utility to retrieve the status attribute from an error. +func statusAttribute(err error) attribute.KeyValue { + status := "completed" + if err != nil { + status = "error" + if errors.Is(err, context.Canceled) { + status = "canceled" + } + } + return attribute.String("status", status) +} diff --git a/commands/bake.go b/commands/bake.go index 5cfa397991f4..983396c34ddf 100644 --- a/commands/bake.go +++ b/commands/bake.go @@ -13,13 +13,13 @@ import ( "github.com/docker/buildx/bake" "github.com/docker/buildx/build" "github.com/docker/buildx/builder" + "github.com/docker/buildx/internal/metrics" "github.com/docker/buildx/localstate" "github.com/docker/buildx/util/buildflags" "github.com/docker/buildx/util/cobrautil/completion" "github.com/docker/buildx/util/confutil" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/dockerutil" - "github.com/docker/buildx/util/metrics" "github.com/docker/buildx/util/progress" "github.com/docker/buildx/util/tracing" "github.com/docker/cli/cli/command" @@ -44,16 +44,12 @@ type bakeOptions struct { } func runBake(dockerCli command.Cli, targets []string, in bakeOptions, cFlags commonFlags) (err error) { - ctx := appcontext.Context() - - mp, report, err := metrics.MeterProvider(dockerCli) + ctx, report, err := metrics.Initialize(appcontext.Context(), dockerCli) if err != nil { return err } defer report() - recordVersionInfo(mp, "bake") - ctx, end, err := tracing.TraceCurrentCommand(ctx, "bake") if err != nil { return err diff --git a/commands/build.go b/commands/build.go index f290de67ea78..de5ffdf9153f 100644 --- a/commands/build.go +++ b/commands/build.go @@ -23,16 +23,15 @@ import ( "github.com/docker/buildx/controller/control" controllererrors "github.com/docker/buildx/controller/errdefs" controllerapi "github.com/docker/buildx/controller/pb" + "github.com/docker/buildx/internal/metrics" "github.com/docker/buildx/monitor" "github.com/docker/buildx/store" "github.com/docker/buildx/store/storeutil" "github.com/docker/buildx/util/buildflags" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/ioset" - "github.com/docker/buildx/util/metrics" "github.com/docker/buildx/util/progress" "github.com/docker/buildx/util/tracing" - "github.com/docker/buildx/version" "github.com/docker/cli-docs-tool/annotation" "github.com/docker/cli/cli" "github.com/docker/cli/cli/command" @@ -53,9 +52,6 @@ import ( "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/pflag" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" "google.golang.org/grpc/codes" ) @@ -216,16 +212,12 @@ func (o *buildOptions) toDisplayMode() (progressui.DisplayMode, error) { } func runBuild(dockerCli command.Cli, options buildOptions) (err error) { - ctx := appcontext.Context() - - mp, report, err := metrics.MeterProvider(dockerCli) + ctx, report, err := metrics.Initialize(appcontext.Context(), dockerCli) if err != nil { return err } defer report() - recordVersionInfo(mp, "build") - ctx, end, err := tracing.TraceCurrentCommand(ctx, "build") if err != nil { return err @@ -940,30 +932,3 @@ func maybeJSONArray(v string) []string { } return []string{v} } - -func recordVersionInfo(mp metric.MeterProvider, command string) { - // Still in the process of testing/stabilizing these counters. - if !isExperimental() { - return - } - - meter := mp.Meter("github.com/docker/buildx", - metric.WithInstrumentationVersion(version.Version), - ) - - counter, err := meter.Int64Counter("docker.cli.count", - metric.WithDescription("Number of invocations of the docker buildx command."), - ) - if err != nil { - otel.Handle(err) - } - - counter.Add(context.Background(), 1, - metric.WithAttributes( - attribute.String("command", command), - attribute.String("package", version.Package), - attribute.String("version", version.Version), - attribute.String("revision", version.Revision), - ), - ) -} diff --git a/go.mod b/go.mod index 65f2f1ec777b..5267f2784c9d 100644 --- a/go.mod +++ b/go.mod @@ -38,10 +38,12 @@ require ( github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.4 github.com/zclconf/go-cty v1.14.1 + github.com/zeebo/xxh3 v1.0.2 go.opentelemetry.io/otel v1.19.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0 go.opentelemetry.io/otel/metric v1.19.0 + go.opentelemetry.io/otel/sdk v1.19.0 go.opentelemetry.io/otel/sdk/metric v1.19.0 go.opentelemetry.io/otel/trace v1.19.0 golang.org/x/mod v0.11.0 @@ -107,6 +109,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.4 // indirect + github.com/klauspost/cpuid/v2 v2.0.9 // indirect github.com/mailru/easyjson v0.7.6 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mattn/go-shellwords v1.0.12 // indirect @@ -146,7 +149,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect go.opentelemetry.io/otel/exporters/prometheus v0.42.0 // indirect - go.opentelemetry.io/otel/sdk v1.19.0 // indirect go.opentelemetry.io/proto/otlp v1.0.0 // indirect golang.org/x/crypto v0.17.0 // indirect golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect diff --git a/go.sum b/go.sum index 9d232707b805..1b60b2056664 100644 --- a/go.sum +++ b/go.sum @@ -280,6 +280,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= +github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -472,6 +474,10 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/zclconf/go-cty v1.4.0/go.mod h1:nHzOclRkoj++EU9ZjSrZvRG0BXIWt8c7loYc0qXAFGQ= github.com/zclconf/go-cty v1.14.1 h1:t9fyA35fwjjUMcmL5hLER+e/rEPqrbCK1/OSE4SI9KA= github.com/zclconf/go-cty v1.14.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.45.0 h1:RsQi0qJ2imFfCvZabqzM9cNXBG8k6gXMv1A0cXRmH6A= diff --git a/internal/env/env.go b/internal/env/env.go new file mode 100644 index 000000000000..95fca0c868ba --- /dev/null +++ b/internal/env/env.go @@ -0,0 +1,15 @@ +package env + +import ( + "os" + "strconv" +) + +// IsExperimental checks if the experimental flag has been configured. +func IsExperimental() bool { + if v, ok := os.LookupEnv("BUILDX_EXPERIMENTAL"); ok { + vv, _ := strconv.ParseBool(v) + return vv + } + return false +} diff --git a/util/metrics/metrics.go b/internal/metrics/metrics.go similarity index 62% rename from util/metrics/metrics.go rename to internal/metrics/metrics.go index 36e1cc62c2e9..61d0735992a9 100644 --- a/util/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -4,21 +4,34 @@ import ( "context" "fmt" "net/url" + "os" "path" + "path/filepath" + "sync" "time" + "github.com/docker/buildx/internal/env" + "github.com/docker/buildx/version" "github.com/docker/cli/cli/command" - "github.com/moby/buildkit/util/tracing/detect" + "github.com/google/uuid" "github.com/pkg/errors" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/metric/noop" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.21.0" "golang.org/x/sync/errgroup" ) +type contextKey int + const ( + meterProviderKey contextKey = iota + otelConfigFieldName = "otel" shutdownTimeout = 2 * time.Second ) @@ -27,40 +40,61 @@ const ( // desired endpoint. It should be invoked on application shutdown. type ReportFunc func() -// MeterProvider returns a MeterProvider suitable for CLI usage. +// Initialize returns a context.Context with a MeterProvider suitable for CLI usage. // The primary difference between this metric reader and a more typical // usage is that metric reporting only happens once when ReportFunc // is invoked. -func MeterProvider(cli command.Cli) (metric.MeterProvider, ReportFunc, error) { +func Initialize(ctx context.Context, cli command.Cli) (context.Context, ReportFunc, error) { var exps []sdkmetric.Exporter - if exp, err := dockerOtelExporter(cli); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) - } + // Only metric exporters if the experimental flag is set. + if env.IsExperimental() { + if exp, err := dockerOtelExporter(cli); err != nil { + return nil, nil, err + } else if exp != nil { + exps = append(exps, exp) + } - if exp, err := detectOtlpExporter(context.Background()); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) + if exp, err := detectOtlpExporter(context.Background()); err != nil { + return nil, nil, err + } else if exp != nil { + exps = append(exps, exp) + } } if len(exps) == 0 { // No exporters are configured so use a noop provider. - return noop.NewMeterProvider(), func() {}, nil + return ctx, func() {}, nil } - // Use delta temporality because, since this is a CLI program, we can never - // know the cumulative value. reader := sdkmetric.NewManualReader( sdkmetric.WithTemporalitySelector(deltaTemporality), ) mp := sdkmetric.NewMeterProvider( - sdkmetric.WithResource(detect.Resource()), + sdkmetric.WithResource(Resource()), sdkmetric.WithReader(reader), ) - return mp, reportFunc(reader, exps), nil + return withMeterProvider(ctx, mp), reportFunc(reader, exps), nil +} + +func withMeterProvider(ctx context.Context, mp metric.MeterProvider) context.Context { + return context.WithValue(ctx, meterProviderKey, mp) +} + +func MeterProvider(ctx context.Context) metric.MeterProvider { + mp, ok := ctx.Value(meterProviderKey).(metric.MeterProvider) + if !ok { + mp = noop.NewMeterProvider() + } + return mp +} + +// Meter returns a Meter from the MetricProvider that indicates the measurement +// comes from buildx with the appropriate version. +func Meter(ctx context.Context) metric.Meter { + mp := MeterProvider(ctx) + return mp.Meter(version.Package, + metric.WithInstrumentationVersion(version.Version)) } // reportFunc returns a ReportFunc for collecting ResourceMetrics and then @@ -184,6 +218,49 @@ func otelExporterOtlpEndpoint(cli command.Cli) (string, error) { } // deltaTemporality sets the Temporality of every instrument to delta. +// +// This isn't really needed since we create a unique resource on each invocation, +// but it can help with cardinality concerns for downstream processors since they can +// perform aggregation for a time interval and then discard the data once that time +// period has passed. Cumulative temporality would imply to the downstream processor +// that they might receive a successive point and they may unnecessarily keep state +// they really shouldn't. func deltaTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + +var ( + res *resource.Resource + resOnce sync.Once +) + +// Resource retrieves the OTEL resource for the buildx CLI. +func Resource() *resource.Resource { + resOnce.Do(func() { + var err error + res, err = resource.New(context.Background(), + resource.WithDetectors(serviceNameDetector{}), + resource.WithAttributes( + attribute.Stringer("service.instance.id", uuid.New()), + ), + resource.WithFromEnv(), + resource.WithTelemetrySDK(), + ) + if err != nil { + otel.Handle(err) + } + }) + return res +} + +type serviceNameDetector struct{} + +func (serviceNameDetector) Detect(ctx context.Context) (*resource.Resource, error) { + return resource.StringDetector( + semconv.SchemaURL, + semconv.ServiceNameKey, + func() (string, error) { + return filepath.Base(os.Args[0]), nil + }, + ).Detect(ctx) +} diff --git a/util/metrics/otlp.go b/internal/metrics/otlp.go similarity index 87% rename from util/metrics/otlp.go rename to internal/metrics/otlp.go index b121ac3cd7a3..a28cd49992ca 100644 --- a/util/metrics/otlp.go +++ b/internal/metrics/otlp.go @@ -35,13 +35,9 @@ func detectOtlpExporter(ctx context.Context) (sdkmetric.Exporter, error) { switch proto { case "grpc": - return otlpmetricgrpc.New(ctx, - otlpmetricgrpc.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetricgrpc.New(ctx) case "http/protobuf": - return otlpmetrichttp.New(ctx, - otlpmetrichttp.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetrichttp.New(ctx) // case "http/json": // unsupported by library default: return nil, errors.Errorf("unsupported otlp protocol %v", proto) diff --git a/internal/metrics/util.go b/internal/metrics/util.go new file mode 100644 index 000000000000..25f3f64004dd --- /dev/null +++ b/internal/metrics/util.go @@ -0,0 +1,33 @@ +package metrics + +import ( + "context" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +const TimeUnit string = "ms" + +// RecordFunc is used to record the measurement from Measure. +type RecordFunc func(ctx context.Context, opts ...metric.RecordOption) + +// Measure is a utility for measuring a time duration with certain attributes. +func Measure(ctx context.Context, name, desc string, opts ...metric.RecordOption) RecordFunc { + histogram, err := Meter(ctx).Int64Histogram(name, + metric.WithDescription(desc), + metric.WithUnit(TimeUnit)) + if err != nil { + otel.Handle(err) + } + + start := time.Now() + return func(ctx context.Context, newOpts ...metric.RecordOption) { + if len(newOpts) > 0 { + opts = append(opts, newOpts...) + } + dur := int64(time.Since(start) / time.Millisecond) + histogram.Record(ctx, dur, opts...) + } +} diff --git a/vendor/github.com/klauspost/cpuid/v2/.gitignore b/vendor/github.com/klauspost/cpuid/v2/.gitignore new file mode 100644 index 000000000000..daf913b1b347 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/.gitignore @@ -0,0 +1,24 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof diff --git a/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml new file mode 100644 index 000000000000..944cc0007504 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml @@ -0,0 +1,74 @@ +# This is an example goreleaser.yaml file with some sane defaults. +# Make sure to check the documentation at http://goreleaser.com + +builds: + - + id: "cpuid" + binary: cpuid + main: ./cmd/cpuid/main.go + env: + - CGO_ENABLED=0 + flags: + - -ldflags=-s -w + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - darwin + goarch: + - 386 + - amd64 + - arm64 + goarm: + - 7 + +archives: + - + id: cpuid + name_template: "cpuid-{{ .Os }}_{{ .Arch }}_{{ .Version }}" + replacements: + aix: AIX + darwin: OSX + linux: Linux + windows: Windows + 386: i386 + amd64: x86_64 + freebsd: FreeBSD + netbsd: NetBSD + format_overrides: + - goos: windows + format: zip + files: + - LICENSE +checksum: + name_template: 'checksums.txt' +snapshot: + name_template: "{{ .Tag }}-next" +changelog: + sort: asc + filters: + exclude: + - '^doc:' + - '^docs:' + - '^test:' + - '^tests:' + - '^Update\sREADME.md' + +nfpms: + - + file_name_template: "cpuid_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}" + vendor: Klaus Post + homepage: https://github.com/klauspost/cpuid + maintainer: Klaus Post + description: CPUID Tool + license: BSD 3-Clause + formats: + - deb + - rpm + replacements: + darwin: Darwin + linux: Linux + freebsd: FreeBSD + amd64: x86_64 diff --git a/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt new file mode 100644 index 000000000000..2ef4714f7165 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt @@ -0,0 +1,35 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2015- Klaus Post & Contributors. +Email: klauspost@gmail.com + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/vendor/github.com/klauspost/cpuid/v2/LICENSE b/vendor/github.com/klauspost/cpuid/v2/LICENSE new file mode 100644 index 000000000000..5cec7ee949b1 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md new file mode 100644 index 000000000000..465f4b77cb77 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/README.md @@ -0,0 +1,137 @@ +# cpuid +Package cpuid provides information about the CPU running the current program. + +CPU features are detected on startup, and kept for fast access through the life of the application. +Currently x86 / x64 (AMD64/i386) and ARM (ARM64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. + +You can access the CPU information by accessing the shared CPU variable of the cpuid library. + +Package home: https://github.com/klauspost/cpuid + +[![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2) +[![Build Status][3]][4] + +[3]: https://travis-ci.org/klauspost/cpuid.svg?branch=master +[4]: https://travis-ci.org/klauspost/cpuid + +## installing + +`go get -u github.com/klauspost/cpuid/v2` using modules. + +Drop `v2` for others. + +## example + +```Go +package main + +import ( + "fmt" + "strings" + + . "github.com/klauspost/cpuid/v2" +) + +func main() { + // Print basic CPU information: + fmt.Println("Name:", CPU.BrandName) + fmt.Println("PhysicalCores:", CPU.PhysicalCores) + fmt.Println("ThreadsPerCore:", CPU.ThreadsPerCore) + fmt.Println("LogicalCores:", CPU.LogicalCores) + fmt.Println("Family", CPU.Family, "Model:", CPU.Model, "Vendor ID:", CPU.VendorID) + fmt.Println("Features:", fmt.Sprintf(strings.Join(CPU.FeatureSet(), ","))) + fmt.Println("Cacheline bytes:", CPU.CacheLine) + fmt.Println("L1 Data Cache:", CPU.Cache.L1D, "bytes") + fmt.Println("L1 Instruction Cache:", CPU.Cache.L1D, "bytes") + fmt.Println("L2 Cache:", CPU.Cache.L2, "bytes") + fmt.Println("L3 Cache:", CPU.Cache.L3, "bytes") + fmt.Println("Frequency", CPU.Hz, "hz") + + // Test if we have these specific features: + if CPU.Supports(SSE, SSE2) { + fmt.Println("We have Streaming SIMD 2 Extensions") + } +} +``` + +Sample output: +``` +>go run main.go +Name: AMD Ryzen 9 3950X 16-Core Processor +PhysicalCores: 16 +ThreadsPerCore: 2 +LogicalCores: 32 +Family 23 Model: 113 Vendor ID: AMD +Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CMOV,CX16,F16C,FMA3,HTT,HYPERVISOR,LZCNT,MMX,MMXEXT,NX,POPCNT,RDRAND,RDSEED,RDTSCP,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3 +Cacheline bytes: 64 +L1 Data Cache: 32768 bytes +L1 Instruction Cache: 32768 bytes +L2 Cache: 524288 bytes +L3 Cache: 16777216 bytes +Frequency 0 hz +We have Streaming SIMD 2 Extensions +``` + +# usage + +The `cpuid.CPU` provides access to CPU features. Use `cpuid.CPU.Supports()` to check for CPU features. +A faster `cpuid.CPU.Has()` is provided which will usually be inlined by the gc compiler. + +Note that for some cpu/os combinations some features will not be detected. +`amd64` has rather good support and should work reliably on all platforms. + +Note that hypervisors may not pass through all CPU features. + +## arm64 feature detection + +Not all operating systems provide ARM features directly +and there is no safe way to do so for the rest. + +Currently `arm64/linux` and `arm64/freebsd` should be quite reliable. +`arm64/darwin` adds features expected from the M1 processor, but a lot remains undetected. + +A `DetectARM()` can be used if you are able to control your deployment, +it will detect CPU features, but may crash if the OS doesn't intercept the calls. +A `-cpu.arm` flag for detecting unsafe ARM features can be added. See below. + +Note that currently only features are detected on ARM, +no additional information is currently available. + +## flags + +It is possible to add flags that affects cpu detection. + +For this the `Flags()` command is provided. + +This must be called *before* `flag.Parse()` AND after the flags have been parsed `Detect()` must be called. + +This means that any detection used in `init()` functions will not contain these flags. + +Example: + +```Go +package main + +import ( + "flag" + "fmt" + "strings" + + "github.com/klauspost/cpuid/v2" +) + +func main() { + cpuid.Flags() + flag.Parse() + cpuid.Detect() + + // Test if we have these specific features: + if cpuid.CPU.Supports(cpuid.SSE, cpuid.SSE2) { + fmt.Println("We have Streaming SIMD 2 Extensions") + } +} +``` + +# license + +This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go new file mode 100644 index 000000000000..1d88736b68a2 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go @@ -0,0 +1,1070 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// Package cpuid provides information about the CPU running the current program. +// +// CPU features are detected on startup, and kept for fast access through the life of the application. +// Currently x86 / x64 (AMD64) as well as arm64 is supported. +// +// You can access the CPU information by accessing the shared CPU variable of the cpuid library. +// +// Package home: https://github.com/klauspost/cpuid +package cpuid + +import ( + "flag" + "fmt" + "math" + "os" + "runtime" + "strings" +) + +// AMD refererence: https://www.amd.com/system/files/TechDocs/25481.pdf +// and Processor Programming Reference (PPR) + +// Vendor is a representation of a CPU vendor. +type Vendor int + +const ( + VendorUnknown Vendor = iota + Intel + AMD + VIA + Transmeta + NSC + KVM // Kernel-based Virtual Machine + MSVM // Microsoft Hyper-V or Windows Virtual PC + VMware + XenHVM + Bhyve + Hygon + SiS + RDC + + Ampere + ARM + Broadcom + Cavium + DEC + Fujitsu + Infineon + Motorola + NVIDIA + AMCC + Qualcomm + Marvell + + lastVendor +) + +//go:generate stringer -type=FeatureID,Vendor + +// FeatureID is the ID of a specific cpu feature. +type FeatureID int + +const ( + // Keep index -1 as unknown + UNKNOWN = -1 + + // Add features + ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + AESNI // Advanced Encryption Standard New Instructions + AMD3DNOW // AMD 3DNOW + AMD3DNOWEXT // AMD 3DNowExt + AMXBF16 // Tile computational operations on BFLOAT16 numbers + AMXINT8 // Tile computational operations on 8-bit integers + AMXTILE // Tile architecture + AVX // AVX functions + AVX2 // AVX2 functions + AVX512BF16 // AVX-512 BFLOAT16 Instructions + AVX512BITALG // AVX-512 Bit Algorithms + AVX512BW // AVX-512 Byte and Word Instructions + AVX512CD // AVX-512 Conflict Detection Instructions + AVX512DQ // AVX-512 Doubleword and Quadword Instructions + AVX512ER // AVX-512 Exponential and Reciprocal Instructions + AVX512F // AVX-512 Foundation + AVX512FP16 // AVX-512 FP16 Instructions + AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF // AVX-512 Prefetch Instructions + AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions + AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 + AVX512VL // AVX-512 Vector Length Extensions + AVX512VNNI // AVX-512 Vector Neural Network Instructions + AVX512VP2INTERSECT // AVX-512 Intersect for D/Q + AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword + AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one. + BMI1 // Bit Manipulation Instruction Set 1 + BMI2 // Bit Manipulation Instruction Set 2 + CLDEMOTE // Cache Line Demote + CLMUL // Carry-less Multiplication + CLZERO // CLZERO instruction supported + CMOV // i686 CMOV + CPBOOST // Core Performance Boost + CX16 // CMPXCHG16B Instruction + ENQCMD // Enqueue Command + ERMS // Enhanced REP MOVSB/STOSB + F16C // Half-precision floating-point conversion + FMA3 // Intel FMA 3. Does not imply AVX. + FMA4 // Bulldozer FMA4 functions + GFNI // Galois Field New Instructions + HLE // Hardware Lock Elision + HTT // Hyperthreading (enabled) + HWA // Hardware assert supported. Indicates support for MSRC001_10 + HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors + IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) + IBS // Instruction Based Sampling (AMD) + IBSBRNTRGT // Instruction Based Sampling Feature (AMD) + IBSFETCHSAM // Instruction Based Sampling Feature (AMD) + IBSFFV // Instruction Based Sampling Feature (AMD) + IBSOPCNT // Instruction Based Sampling Feature (AMD) + IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) + IBSOPSAM // Instruction Based Sampling Feature (AMD) + IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) + IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) + INT_WBINVD // WBINVD/WBNOINVD are interruptible. + INVLPGB // NVLPGB and TLBSYNC instruction supported + LZCNT // LZCNT instruction + MCAOVERFLOW // MCA overflow recovery support. + MCOMMIT // MCOMMIT instruction supported + MMX // standard MMX + MMXEXT // SSE integer functions or AMD MMX ext + MOVDIR64B // Move 64 Bytes as Direct Store + MOVDIRI // Move Doubleword as Direct Store + MPX // Intel MPX (Memory Protection Extensions) + MSRIRC // Instruction Retired Counter MSR available + NX // NX (No-Execute) bit + POPCNT // POPCNT instruction + RDPRU // RDPRU instruction supported + RDRAND // RDRAND instruction is available + RDSEED // RDSEED instruction is available + RDTSCP // RDTSCP Instruction + RTM // Restricted Transactional Memory + RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. + SERIALIZE // Serialize Instruction Execution + SGX // Software Guard Extensions + SGXLC // Software Guard Extensions Launch Control + SHA // Intel SHA Extensions + SSE // SSE functions + SSE2 // P4 SSE functions + SSE3 // Prescott SSE3 functions + SSE4 // Penryn SSE4.1 functions + SSE42 // Nehalem SSE4.2 functions + SSE4A // AMD Barcelona microarchitecture SSE4a instructions + SSSE3 // Conroe SSSE3 functions + STIBP // Single Thread Indirect Branch Predictors + SUCCOR // Software uncorrectable error containment and recovery capability. + TBM // AMD Trailing Bit Manipulation + TSXLDTRK // Intel TSX Suspend Load Address Tracking + VAES // Vector AES + VMX // Virtual Machine Extensions + VPCLMULQDQ // Carry-Less Multiplication Quadword + WAITPKG // TPAUSE, UMONITOR, UMWAIT + WBNOINVD // Write Back and Do Not Invalidate Cache + XOP // Bulldozer XOP functions + + // ARM features: + AESARM // AES instructions + ARMCPUID // Some CPU ID registers readable at user-level + ASIMD // Advanced SIMD + ASIMDDP // SIMD Dot Product + ASIMDHP // Advanced SIMD half-precision floating point + ASIMDRDM // Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH) + ATOMICS // Large System Extensions (LSE) + CRC32 // CRC32/CRC32C instructions + DCPOP // Data cache clean to Point of Persistence (DC CVAP) + EVTSTRM // Generic timer + FCMA // Floatin point complex number addition and multiplication + FP // Single-precision and double-precision floating point + FPHP // Half-precision floating point + GPA // Generic Pointer Authentication + JSCVT // Javascript-style double->int convert (FJCVTZS) + LRCPC // Weaker release consistency (LDAPR, etc) + PMULL // Polynomial Multiply instructions (PMULL/PMULL2) + SHA1 // SHA-1 instructions (SHA1C, etc) + SHA2 // SHA-2 instructions (SHA256H, etc) + SHA3 // SHA-3 instructions (EOR3, RAXI, XAR, BCAX) + SHA512 // SHA512 instructions + SM3 // SM3 instructions + SM4 // SM4 instructions + SVE // Scalable Vector Extension + + // Keep it last. It automatically defines the size of []flagSet + lastID + + firstID FeatureID = UNKNOWN + 1 +) + +// CPUInfo contains information about the detected system CPU. +type CPUInfo struct { + BrandName string // Brand name reported by the CPU + VendorID Vendor // Comparable CPU vendor ID + VendorString string // Raw vendor string. + featureSet flagSet // Features of the CPU + PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. + ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable. + LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. + Family int // CPU family number + Model int // CPU model number + CacheLine int // Cache line size in bytes. Will be 0 if undetectable. + Hz int64 // Clock speed, if known, 0 otherwise. Will attempt to contain base clock speed. + BoostFreq int64 // Max clock speed, if known, 0 otherwise + Cache struct { + L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected + L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected + L2 int // L2 Cache (per core or shared). Will be -1 if undetected + L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected + } + SGX SGXSupport + maxFunc uint32 + maxExFunc uint32 +} + +var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) +var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) +var xgetbv func(index uint32) (eax, edx uint32) +var rdtscpAsm func() (eax, ebx, ecx, edx uint32) +var darwinHasAVX512 = func() bool { return false } + +// CPU contains information about the CPU as detected on startup, +// or when Detect last was called. +// +// Use this as the primary entry point to you data. +var CPU CPUInfo + +func init() { + initCPU() + Detect() +} + +// Detect will re-detect current CPU info. +// This will replace the content of the exported CPU variable. +// +// Unless you expect the CPU to change while you are running your program +// you should not need to call this function. +// If you call this, you must ensure that no other goroutine is accessing the +// exported CPU variable. +func Detect() { + // Set defaults + CPU.ThreadsPerCore = 1 + CPU.Cache.L1I = -1 + CPU.Cache.L1D = -1 + CPU.Cache.L2 = -1 + CPU.Cache.L3 = -1 + safe := true + if detectArmFlag != nil { + safe = !*detectArmFlag + } + addInfo(&CPU, safe) + if displayFeats != nil && *displayFeats { + fmt.Println("cpu features:", strings.Join(CPU.FeatureSet(), ",")) + // Exit with non-zero so tests will print value. + os.Exit(1) + } + if disableFlag != nil { + s := strings.Split(*disableFlag, ",") + for _, feat := range s { + feat := ParseFeature(strings.TrimSpace(feat)) + if feat != UNKNOWN { + CPU.featureSet.unset(feat) + } + } + } +} + +// DetectARM will detect ARM64 features. +// This is NOT done automatically since it can potentially crash +// if the OS does not handle the command. +// If in the future this can be done safely this function may not +// do anything. +func DetectARM() { + addInfo(&CPU, false) +} + +var detectArmFlag *bool +var displayFeats *bool +var disableFlag *string + +// Flags will enable flags. +// This must be called *before* flag.Parse AND +// Detect must be called after the flags have been parsed. +// Note that this means that any detection used in init() functions +// will not contain these flags. +func Flags() { + disableFlag = flag.String("cpu.disable", "", "disable cpu features; comma separated list") + displayFeats = flag.Bool("cpu.features", false, "lists cpu features and exits") + detectArmFlag = flag.Bool("cpu.arm", false, "allow ARM features to be detected; can potentially crash") +} + +// Supports returns whether the CPU supports all of the requested features. +func (c CPUInfo) Supports(ids ...FeatureID) bool { + for _, id := range ids { + if !c.featureSet.inSet(id) { + return false + } + } + return true +} + +// Has allows for checking a single feature. +// Should be inlined by the compiler. +func (c CPUInfo) Has(id FeatureID) bool { + return c.featureSet.inSet(id) +} + +// Disable will disable one or several features. +func (c *CPUInfo) Disable(ids ...FeatureID) bool { + for _, id := range ids { + c.featureSet.unset(id) + } + return true +} + +// Enable will disable one or several features even if they were undetected. +// This is of course not recommended for obvious reasons. +func (c *CPUInfo) Enable(ids ...FeatureID) bool { + for _, id := range ids { + c.featureSet.set(id) + } + return true +} + +// IsVendor returns true if vendor is recognized as Intel +func (c CPUInfo) IsVendor(v Vendor) bool { + return c.VendorID == v +} + +func (c CPUInfo) FeatureSet() []string { + s := make([]string, 0) + for _, f := range c.featureSet.Strings() { + s = append(s, f) + } + return s +} + +// RTCounter returns the 64-bit time-stamp counter +// Uses the RDTSCP instruction. The value 0 is returned +// if the CPU does not support the instruction. +func (c CPUInfo) RTCounter() uint64 { + if !c.Supports(RDTSCP) { + return 0 + } + a, _, _, d := rdtscpAsm() + return uint64(a) | (uint64(d) << 32) +} + +// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. +// This variable is OS dependent, but on Linux contains information +// about the current cpu/core the code is running on. +// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. +func (c CPUInfo) Ia32TscAux() uint32 { + if !c.Supports(RDTSCP) { + return 0 + } + _, _, ecx, _ := rdtscpAsm() + return ecx +} + +// LogicalCPU will return the Logical CPU the code is currently executing on. +// This is likely to change when the OS re-schedules the running thread +// to another CPU. +// If the current core cannot be detected, -1 will be returned. +func (c CPUInfo) LogicalCPU() int { + if c.maxFunc < 1 { + return -1 + } + _, ebx, _, _ := cpuid(1) + return int(ebx >> 24) +} + +// frequencies tries to compute the clock speed of the CPU. If leaf 15 is +// supported, use it, otherwise parse the brand string. Yes, really. +func (c *CPUInfo) frequencies() { + c.Hz, c.BoostFreq = 0, 0 + mfi := maxFunctionID() + if mfi >= 0x15 { + eax, ebx, ecx, _ := cpuid(0x15) + if eax != 0 && ebx != 0 && ecx != 0 { + c.Hz = (int64(ecx) * int64(ebx)) / int64(eax) + } + } + if mfi >= 0x16 { + a, b, _, _ := cpuid(0x16) + // Base... + if a&0xffff > 0 { + c.Hz = int64(a&0xffff) * 1_000_000 + } + // Boost... + if b&0xffff > 0 { + c.BoostFreq = int64(b&0xffff) * 1_000_000 + } + } + if c.Hz > 0 { + return + } + + // computeHz determines the official rated speed of a CPU from its brand + // string. This insanity is *actually the official documented way to do + // this according to Intel*, prior to leaf 0x15 existing. The official + // documentation only shows this working for exactly `x.xx` or `xxxx` + // cases, e.g., `2.50GHz` or `1300MHz`; this parser will accept other + // sizes. + model := c.BrandName + hz := strings.LastIndex(model, "Hz") + if hz < 3 { + return + } + var multiplier int64 + switch model[hz-1] { + case 'M': + multiplier = 1000 * 1000 + case 'G': + multiplier = 1000 * 1000 * 1000 + case 'T': + multiplier = 1000 * 1000 * 1000 * 1000 + } + if multiplier == 0 { + return + } + freq := int64(0) + divisor := int64(0) + decimalShift := int64(1) + var i int + for i = hz - 2; i >= 0 && model[i] != ' '; i-- { + if model[i] >= '0' && model[i] <= '9' { + freq += int64(model[i]-'0') * decimalShift + decimalShift *= 10 + } else if model[i] == '.' { + if divisor != 0 { + return + } + divisor = decimalShift + } else { + return + } + } + // we didn't find a space + if i < 0 { + return + } + if divisor != 0 { + c.Hz = (freq * multiplier) / divisor + return + } + c.Hz = freq * multiplier +} + +// VM Will return true if the cpu id indicates we are in +// a virtual machine. +func (c CPUInfo) VM() bool { + return CPU.featureSet.inSet(HYPERVISOR) +} + +// flags contains detected cpu features and characteristics +type flags uint64 + +// log2(bits_in_uint64) +const flagBitsLog2 = 6 +const flagBits = 1 << flagBitsLog2 +const flagMask = flagBits - 1 + +// flagSet contains detected cpu features and characteristics in an array of flags +type flagSet [(lastID + flagMask) / flagBits]flags + +func (s flagSet) inSet(feat FeatureID) bool { + return s[feat>>flagBitsLog2]&(1<<(feat&flagMask)) != 0 +} + +func (s *flagSet) set(feat FeatureID) { + s[feat>>flagBitsLog2] |= 1 << (feat & flagMask) +} + +// setIf will set a feature if boolean is true. +func (s *flagSet) setIf(cond bool, features ...FeatureID) { + if cond { + for _, offset := range features { + s[offset>>flagBitsLog2] |= 1 << (offset & flagMask) + } + } +} + +func (s *flagSet) unset(offset FeatureID) { + bit := flags(1 << (offset & flagMask)) + s[offset>>flagBitsLog2] = s[offset>>flagBitsLog2] & ^bit +} + +// or with another flagset. +func (s *flagSet) or(other flagSet) { + for i, v := range other[:] { + s[i] |= v + } +} + +// ParseFeature will parse the string and return the ID of the matching feature. +// Will return UNKNOWN if not found. +func ParseFeature(s string) FeatureID { + s = strings.ToUpper(s) + for i := firstID; i < lastID; i++ { + if i.String() == s { + return i + } + } + return UNKNOWN +} + +// Strings returns an array of the detected features for FlagsSet. +func (s flagSet) Strings() []string { + if len(s) == 0 { + return []string{""} + } + r := make([]string, 0) + for i := firstID; i < lastID; i++ { + if s.inSet(i) { + r = append(r, i.String()) + } + } + return r +} + +func maxExtendedFunction() uint32 { + eax, _, _, _ := cpuid(0x80000000) + return eax +} + +func maxFunctionID() uint32 { + a, _, _, _ := cpuid(0) + return a +} + +func brandName() string { + if maxExtendedFunction() >= 0x80000004 { + v := make([]uint32, 0, 48) + for i := uint32(0); i < 3; i++ { + a, b, c, d := cpuid(0x80000002 + i) + v = append(v, a, b, c, d) + } + return strings.Trim(string(valAsString(v...)), " ") + } + return "unknown" +} + +func threadsPerCore() int { + mfi := maxFunctionID() + vend, _ := vendorID() + + if mfi < 0x4 || (vend != Intel && vend != AMD) { + return 1 + } + + if mfi < 0xb { + if vend != Intel { + return 1 + } + _, b, _, d := cpuid(1) + if (d & (1 << 28)) != 0 { + // v will contain logical core count + v := (b >> 16) & 255 + if v > 1 { + a4, _, _, _ := cpuid(4) + // physical cores + v2 := (a4 >> 26) + 1 + if v2 > 0 { + return int(v) / int(v2) + } + } + } + return 1 + } + _, b, _, _ := cpuidex(0xb, 0) + if b&0xffff == 0 { + if vend == AMD { + // Workaround for AMD returning 0, assume 2 if >= Zen 2 + // It will be more correct than not. + fam, _ := familyModel() + _, _, _, d := cpuid(1) + if (d&(1<<28)) != 0 && fam >= 23 { + return 2 + } + } + return 1 + } + return int(b & 0xffff) +} + +func logicalCores() int { + mfi := maxFunctionID() + v, _ := vendorID() + switch v { + case Intel: + // Use this on old Intel processors + if mfi < 0xb { + if mfi < 1 { + return 0 + } + // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) + // that can be assigned to logical processors in a physical package. + // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. + _, ebx, _, _ := cpuid(1) + logical := (ebx >> 16) & 0xff + return int(logical) + } + _, b, _, _ := cpuidex(0xb, 1) + return int(b & 0xffff) + case AMD, Hygon: + _, b, _, _ := cpuid(1) + return int((b >> 16) & 0xff) + default: + return 0 + } +} + +func familyModel() (int, int) { + if maxFunctionID() < 0x1 { + return 0, 0 + } + eax, _, _, _ := cpuid(1) + family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) + model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) + return int(family), int(model) +} + +func physicalCores() int { + v, _ := vendorID() + switch v { + case Intel: + return logicalCores() / threadsPerCore() + case AMD, Hygon: + lc := logicalCores() + tpc := threadsPerCore() + if lc > 0 && tpc > 0 { + return lc / tpc + } + + // The following is inaccurate on AMD EPYC 7742 64-Core Processor + if maxExtendedFunction() >= 0x80000008 { + _, _, c, _ := cpuid(0x80000008) + if c&0xff > 0 { + return int(c&0xff) + 1 + } + } + } + return 0 +} + +// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID +var vendorMapping = map[string]Vendor{ + "AMDisbetter!": AMD, + "AuthenticAMD": AMD, + "CentaurHauls": VIA, + "GenuineIntel": Intel, + "TransmetaCPU": Transmeta, + "GenuineTMx86": Transmeta, + "Geode by NSC": NSC, + "VIA VIA VIA ": VIA, + "KVMKVMKVMKVM": KVM, + "Microsoft Hv": MSVM, + "VMwareVMware": VMware, + "XenVMMXenVMM": XenHVM, + "bhyve bhyve ": Bhyve, + "HygonGenuine": Hygon, + "Vortex86 SoC": SiS, + "SiS SiS SiS ": SiS, + "RiseRiseRise": SiS, + "Genuine RDC": RDC, +} + +func vendorID() (Vendor, string) { + _, b, c, d := cpuid(0) + v := string(valAsString(b, d, c)) + vend, ok := vendorMapping[v] + if !ok { + return VendorUnknown, v + } + return vend, v +} + +func cacheLine() int { + if maxFunctionID() < 0x1 { + return 0 + } + + _, ebx, _, _ := cpuid(1) + cache := (ebx & 0xff00) >> 5 // cflush size + if cache == 0 && maxExtendedFunction() >= 0x80000006 { + _, _, ecx, _ := cpuid(0x80000006) + cache = ecx & 0xff // cacheline size + } + // TODO: Read from Cache and TLB Information + return int(cache) +} + +func (c *CPUInfo) cacheSize() { + c.Cache.L1D = -1 + c.Cache.L1I = -1 + c.Cache.L2 = -1 + c.Cache.L3 = -1 + vendor, _ := vendorID() + switch vendor { + case Intel: + if maxFunctionID() < 4 { + return + } + for i := uint32(0); ; i++ { + eax, ebx, ecx, _ := cpuidex(4, i) + cacheType := eax & 15 + if cacheType == 0 { + break + } + cacheLevel := (eax >> 5) & 7 + coherency := int(ebx&0xfff) + 1 + partitions := int((ebx>>12)&0x3ff) + 1 + associativity := int((ebx>>22)&0x3ff) + 1 + sets := int(ecx) + 1 + size := associativity * partitions * coherency * sets + switch cacheLevel { + case 1: + if cacheType == 1 { + // 1 = Data Cache + c.Cache.L1D = size + } else if cacheType == 2 { + // 2 = Instruction Cache + c.Cache.L1I = size + } else { + if c.Cache.L1D < 0 { + c.Cache.L1I = size + } + if c.Cache.L1I < 0 { + c.Cache.L1I = size + } + } + case 2: + c.Cache.L2 = size + case 3: + c.Cache.L3 = size + } + } + case AMD, Hygon: + // Untested. + if maxExtendedFunction() < 0x80000005 { + return + } + _, _, ecx, edx := cpuid(0x80000005) + c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) + c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) + + if maxExtendedFunction() < 0x80000006 { + return + } + _, _, ecx, _ = cpuid(0x80000006) + c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) + + // CPUID Fn8000_001D_EAX_x[N:0] Cache Properties + if maxExtendedFunction() < 0x8000001D { + return + } + for i := uint32(0); i < math.MaxUint32; i++ { + eax, ebx, ecx, _ := cpuidex(0x8000001D, i) + + level := (eax >> 5) & 7 + cacheNumSets := ecx + 1 + cacheLineSize := 1 + (ebx & 2047) + cachePhysPartitions := 1 + ((ebx >> 12) & 511) + cacheNumWays := 1 + ((ebx >> 22) & 511) + + typ := eax & 15 + size := int(cacheNumSets * cacheLineSize * cachePhysPartitions * cacheNumWays) + if typ == 0 { + return + } + + switch level { + case 1: + switch typ { + case 1: + // Data cache + c.Cache.L1D = size + case 2: + // Inst cache + c.Cache.L1I = size + default: + if c.Cache.L1D < 0 { + c.Cache.L1I = size + } + if c.Cache.L1I < 0 { + c.Cache.L1I = size + } + } + case 2: + c.Cache.L2 = size + case 3: + c.Cache.L3 = size + } + } + } + + return +} + +type SGXEPCSection struct { + BaseAddress uint64 + EPCSize uint64 +} + +type SGXSupport struct { + Available bool + LaunchControl bool + SGX1Supported bool + SGX2Supported bool + MaxEnclaveSizeNot64 int64 + MaxEnclaveSize64 int64 + EPCSections []SGXEPCSection +} + +func hasSGX(available, lc bool) (rval SGXSupport) { + rval.Available = available + + if !available { + return + } + + rval.LaunchControl = lc + + a, _, _, d := cpuidex(0x12, 0) + rval.SGX1Supported = a&0x01 != 0 + rval.SGX2Supported = a&0x02 != 0 + rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 + rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 + rval.EPCSections = make([]SGXEPCSection, 0) + + for subleaf := uint32(2); subleaf < 2+8; subleaf++ { + eax, ebx, ecx, edx := cpuidex(0x12, subleaf) + leafType := eax & 0xf + + if leafType == 0 { + // Invalid subleaf, stop iterating + break + } else if leafType == 1 { + // EPC Section subleaf + baseAddress := uint64(eax&0xfffff000) + (uint64(ebx&0x000fffff) << 32) + size := uint64(ecx&0xfffff000) + (uint64(edx&0x000fffff) << 32) + + section := SGXEPCSection{BaseAddress: baseAddress, EPCSize: size} + rval.EPCSections = append(rval.EPCSections, section) + } + } + + return +} + +func support() flagSet { + var fs flagSet + mfi := maxFunctionID() + vend, _ := vendorID() + if mfi < 0x1 { + return fs + } + family, model := familyModel() + + _, _, c, d := cpuid(1) + fs.setIf((d&(1<<15)) != 0, CMOV) + fs.setIf((d&(1<<23)) != 0, MMX) + fs.setIf((d&(1<<25)) != 0, MMXEXT) + fs.setIf((d&(1<<25)) != 0, SSE) + fs.setIf((d&(1<<26)) != 0, SSE2) + fs.setIf((c&1) != 0, SSE3) + fs.setIf((c&(1<<5)) != 0, VMX) + fs.setIf((c&0x00000200) != 0, SSSE3) + fs.setIf((c&0x00080000) != 0, SSE4) + fs.setIf((c&0x00100000) != 0, SSE42) + fs.setIf((c&(1<<25)) != 0, AESNI) + fs.setIf((c&(1<<1)) != 0, CLMUL) + fs.setIf(c&(1<<23) != 0, POPCNT) + fs.setIf(c&(1<<30) != 0, RDRAND) + + // This bit has been reserved by Intel & AMD for use by hypervisors, + // and indicates the presence of a hypervisor. + fs.setIf(c&(1<<31) != 0, HYPERVISOR) + fs.setIf(c&(1<<29) != 0, F16C) + fs.setIf(c&(1<<13) != 0, CX16) + + if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { + fs.setIf(threadsPerCore() > 1, HTT) + } + if vend == AMD && (d&(1<<28)) != 0 && mfi >= 4 { + fs.setIf(threadsPerCore() > 1, HTT) + } + // Check XGETBV/XSAVE (26), OXSAVE (27) and AVX (28) bits + const avxCheck = 1<<26 | 1<<27 | 1<<28 + if c&avxCheck == avxCheck { + // Check for OS support + eax, _ := xgetbv(0) + if (eax & 0x6) == 0x6 { + fs.set(AVX) + switch vend { + case Intel: + // Older than Haswell. + fs.setIf(family == 6 && model < 60, AVXSLOW) + case AMD: + // Older than Zen 2 + fs.setIf(family < 23 || (family == 23 && model < 49), AVXSLOW) + } + } + } + // FMA3 can be used with SSE registers, so no OS support is strictly needed. + // fma3 and OSXSAVE needed. + const fma3Check = 1<<12 | 1<<27 + fs.setIf(c&fma3Check == fma3Check, FMA3) + + // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. + if mfi >= 7 { + _, ebx, ecx, edx := cpuidex(7, 0) + eax1, _, _, _ := cpuidex(7, 1) + if fs.inSet(AVX) && (ebx&0x00000020) != 0 { + fs.set(AVX2) + } + // CPUID.(EAX=7, ECX=0).EBX + if (ebx & 0x00000008) != 0 { + fs.set(BMI1) + fs.setIf((ebx&0x00000100) != 0, BMI2) + } + fs.setIf(ebx&(1<<2) != 0, SGX) + fs.setIf(ebx&(1<<4) != 0, HLE) + fs.setIf(ebx&(1<<9) != 0, ERMS) + fs.setIf(ebx&(1<<11) != 0, RTM) + fs.setIf(ebx&(1<<14) != 0, MPX) + fs.setIf(ebx&(1<<18) != 0, RDSEED) + fs.setIf(ebx&(1<<19) != 0, ADX) + fs.setIf(ebx&(1<<29) != 0, SHA) + // CPUID.(EAX=7, ECX=0).ECX + fs.setIf(ecx&(1<<5) != 0, WAITPKG) + fs.setIf(ecx&(1<<25) != 0, CLDEMOTE) + fs.setIf(ecx&(1<<27) != 0, MOVDIRI) + fs.setIf(ecx&(1<<28) != 0, MOVDIR64B) + fs.setIf(ecx&(1<<29) != 0, ENQCMD) + fs.setIf(ecx&(1<<30) != 0, SGXLC) + // CPUID.(EAX=7, ECX=0).EDX + fs.setIf(edx&(1<<11) != 0, RTM_ALWAYS_ABORT) + fs.setIf(edx&(1<<14) != 0, SERIALIZE) + fs.setIf(edx&(1<<16) != 0, TSXLDTRK) + fs.setIf(edx&(1<<26) != 0, IBPB) + fs.setIf(edx&(1<<27) != 0, STIBP) + + // Only detect AVX-512 features if XGETBV is supported + if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { + // Check for OS support + eax, _ := xgetbv(0) + + // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and + // ZMM16-ZMM31 state are enabled by OS) + /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). + hasAVX512 := (eax>>5)&7 == 7 && (eax>>1)&3 == 3 + if runtime.GOOS == "darwin" { + hasAVX512 = fs.inSet(AVX) && darwinHasAVX512() + } + if hasAVX512 { + fs.setIf(ebx&(1<<16) != 0, AVX512F) + fs.setIf(ebx&(1<<17) != 0, AVX512DQ) + fs.setIf(ebx&(1<<21) != 0, AVX512IFMA) + fs.setIf(ebx&(1<<26) != 0, AVX512PF) + fs.setIf(ebx&(1<<27) != 0, AVX512ER) + fs.setIf(ebx&(1<<28) != 0, AVX512CD) + fs.setIf(ebx&(1<<30) != 0, AVX512BW) + fs.setIf(ebx&(1<<31) != 0, AVX512VL) + // ecx + fs.setIf(ecx&(1<<1) != 0, AVX512VBMI) + fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2) + fs.setIf(ecx&(1<<8) != 0, GFNI) + fs.setIf(ecx&(1<<9) != 0, VAES) + fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) + fs.setIf(ecx&(1<<11) != 0, AVX512VNNI) + fs.setIf(ecx&(1<<12) != 0, AVX512BITALG) + fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ) + // edx + fs.setIf(edx&(1<<8) != 0, AVX512VP2INTERSECT) + fs.setIf(edx&(1<<22) != 0, AMXBF16) + fs.setIf(edx&(1<<23) != 0, AVX512FP16) + fs.setIf(edx&(1<<24) != 0, AMXTILE) + fs.setIf(edx&(1<<25) != 0, AMXINT8) + // eax1 = CPUID.(EAX=7, ECX=1).EAX + fs.setIf(eax1&(1<<5) != 0, AVX512BF16) + } + } + } + + if maxExtendedFunction() >= 0x80000001 { + _, _, c, d := cpuid(0x80000001) + if (c & (1 << 5)) != 0 { + fs.set(LZCNT) + fs.set(POPCNT) + } + fs.setIf((c&(1<<10)) != 0, IBS) + fs.setIf((d&(1<<31)) != 0, AMD3DNOW) + fs.setIf((d&(1<<30)) != 0, AMD3DNOWEXT) + fs.setIf((d&(1<<23)) != 0, MMX) + fs.setIf((d&(1<<22)) != 0, MMXEXT) + fs.setIf((c&(1<<6)) != 0, SSE4A) + fs.setIf(d&(1<<20) != 0, NX) + fs.setIf(d&(1<<27) != 0, RDTSCP) + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. */ + if fs.inSet(AVX) { + fs.setIf((c&0x00000800) != 0, XOP) + fs.setIf((c&0x00010000) != 0, FMA4) + } + + } + if maxExtendedFunction() >= 0x80000007 { + _, b, _, d := cpuid(0x80000007) + fs.setIf((b&(1<<0)) != 0, MCAOVERFLOW) + fs.setIf((b&(1<<1)) != 0, SUCCOR) + fs.setIf((b&(1<<2)) != 0, HWA) + fs.setIf((d&(1<<9)) != 0, CPBOOST) + } + + if maxExtendedFunction() >= 0x80000008 { + _, b, _, _ := cpuid(0x80000008) + fs.setIf((b&(1<<9)) != 0, WBNOINVD) + fs.setIf((b&(1<<8)) != 0, MCOMMIT) + fs.setIf((b&(1<<13)) != 0, INT_WBINVD) + fs.setIf((b&(1<<4)) != 0, RDPRU) + fs.setIf((b&(1<<3)) != 0, INVLPGB) + fs.setIf((b&(1<<1)) != 0, MSRIRC) + fs.setIf((b&(1<<0)) != 0, CLZERO) + } + + if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) { + eax, _, _, _ := cpuid(0x8000001b) + fs.setIf((eax>>0)&1 == 1, IBSFFV) + fs.setIf((eax>>1)&1 == 1, IBSFETCHSAM) + fs.setIf((eax>>2)&1 == 1, IBSOPSAM) + fs.setIf((eax>>3)&1 == 1, IBSRDWROPCNT) + fs.setIf((eax>>4)&1 == 1, IBSOPCNT) + fs.setIf((eax>>5)&1 == 1, IBSBRNTRGT) + fs.setIf((eax>>6)&1 == 1, IBSOPCNTEXT) + fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK) + } + + return fs +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s new file mode 100644 index 000000000000..8587c3a1fc55 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s @@ -0,0 +1,47 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build 386,!gccgo,!noasm,!appengine + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORL CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+4(FP) + MOVL BX, ebx+8(FP) + MOVL CX, ecx+12(FP) + MOVL DX, edx+16(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmDarwinHasAVX512() bool +TEXT ·asmDarwinHasAVX512(SB), 7, $0 + MOVL $0, eax+0(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s new file mode 100644 index 000000000000..bc11f8942193 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s @@ -0,0 +1,72 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build amd64,!gccgo,!noasm,!appengine + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmXgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET + +// From https://go-review.googlesource.com/c/sys/+/285572/ +// func asmDarwinHasAVX512() bool +TEXT ·asmDarwinHasAVX512(SB), 7, $0-1 + MOVB $0, ret+0(FP) // default to false + +#ifdef GOOS_darwin // return if not darwin +#ifdef GOARCH_amd64 // return if not amd64 +// These values from: +// https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/osfmk/i386/cpu_capabilities.h +#define commpage64_base_address 0x00007fffffe00000 +#define commpage64_cpu_capabilities64 (commpage64_base_address+0x010) +#define commpage64_version (commpage64_base_address+0x01E) +#define hasAVX512F 0x0000004000000000 + MOVQ $commpage64_version, BX + MOVW (BX), AX + CMPW AX, $13 // versions < 13 do not support AVX512 + JL no_avx512 + MOVQ $commpage64_cpu_capabilities64, BX + MOVQ (BX), AX + MOVQ $hasAVX512F, CX + ANDQ CX, AX + JZ no_avx512 + MOVB $1, ret+0(FP) + +no_avx512: +#endif +#endif + RET + diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s new file mode 100644 index 000000000000..b31d6aec43f6 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s @@ -0,0 +1,26 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build arm64,!gccgo,!noasm,!appengine + +// See https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt + +// func getMidr +TEXT ·getMidr(SB), 7, $0 + WORD $0xd5380000 // mrs x0, midr_el1 /* Main ID Register */ + MOVD R0, midr+0(FP) + RET + +// func getProcFeatures +TEXT ·getProcFeatures(SB), 7, $0 + WORD $0xd5380400 // mrs x0, id_aa64pfr0_el1 /* Processor Feature Register 0 */ + MOVD R0, procFeatures+0(FP) + RET + +// func getInstAttributes +TEXT ·getInstAttributes(SB), 7, $0 + WORD $0xd5380600 // mrs x0, id_aa64isar0_el1 /* Instruction Set Attribute Register 0 */ + WORD $0xd5380621 // mrs x1, id_aa64isar1_el1 /* Instruction Set Attribute Register 1 */ + MOVD R0, instAttrReg0+0(FP) + MOVD R1, instAttrReg1+8(FP) + RET + diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go new file mode 100644 index 000000000000..9bf9f77f3733 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go @@ -0,0 +1,246 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build arm64,!gccgo,!noasm,!appengine + +package cpuid + +import "runtime" + +func getMidr() (midr uint64) +func getProcFeatures() (procFeatures uint64) +func getInstAttributes() (instAttrReg0, instAttrReg1 uint64) + +func initCPU() { + cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 } + cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 } + xgetbv = func(uint32) (a, b uint32) { return 0, 0 } + rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 } +} + +func addInfo(c *CPUInfo, safe bool) { + // Seems to be safe to assume on ARM64 + c.CacheLine = 64 + detectOS(c) + + // ARM64 disabled since it may crash if interrupt is not intercepted by OS. + if safe && !c.Supports(ARMCPUID) && runtime.GOOS != "freebsd" { + return + } + midr := getMidr() + + // MIDR_EL1 - Main ID Register + // https://developer.arm.com/docs/ddi0595/h/aarch64-system-registers/midr_el1 + // x--------------------------------------------------x + // | Name | bits | visible | + // |--------------------------------------------------| + // | Implementer | [31-24] | y | + // |--------------------------------------------------| + // | Variant | [23-20] | y | + // |--------------------------------------------------| + // | Architecture | [19-16] | y | + // |--------------------------------------------------| + // | PartNum | [15-4] | y | + // |--------------------------------------------------| + // | Revision | [3-0] | y | + // x--------------------------------------------------x + + switch (midr >> 24) & 0xff { + case 0xC0: + c.VendorString = "Ampere Computing" + c.VendorID = Ampere + case 0x41: + c.VendorString = "Arm Limited" + c.VendorID = ARM + case 0x42: + c.VendorString = "Broadcom Corporation" + c.VendorID = Broadcom + case 0x43: + c.VendorString = "Cavium Inc" + c.VendorID = Cavium + case 0x44: + c.VendorString = "Digital Equipment Corporation" + c.VendorID = DEC + case 0x46: + c.VendorString = "Fujitsu Ltd" + c.VendorID = Fujitsu + case 0x49: + c.VendorString = "Infineon Technologies AG" + c.VendorID = Infineon + case 0x4D: + c.VendorString = "Motorola or Freescale Semiconductor Inc" + c.VendorID = Motorola + case 0x4E: + c.VendorString = "NVIDIA Corporation" + c.VendorID = NVIDIA + case 0x50: + c.VendorString = "Applied Micro Circuits Corporation" + c.VendorID = AMCC + case 0x51: + c.VendorString = "Qualcomm Inc" + c.VendorID = Qualcomm + case 0x56: + c.VendorString = "Marvell International Ltd" + c.VendorID = Marvell + case 0x69: + c.VendorString = "Intel Corporation" + c.VendorID = Intel + } + + // Lower 4 bits: Architecture + // Architecture Meaning + // 0b0001 Armv4. + // 0b0010 Armv4T. + // 0b0011 Armv5 (obsolete). + // 0b0100 Armv5T. + // 0b0101 Armv5TE. + // 0b0110 Armv5TEJ. + // 0b0111 Armv6. + // 0b1111 Architectural features are individually identified in the ID_* registers, see 'ID registers'. + // Upper 4 bit: Variant + // An IMPLEMENTATION DEFINED variant number. + // Typically, this field is used to distinguish between different product variants, or major revisions of a product. + c.Family = int(midr>>16) & 0xff + + // PartNum, bits [15:4] + // An IMPLEMENTATION DEFINED primary part number for the device. + // On processors implemented by Arm, if the top four bits of the primary + // part number are 0x0 or 0x7, the variant and architecture are encoded differently. + // Revision, bits [3:0] + // An IMPLEMENTATION DEFINED revision number for the device. + c.Model = int(midr) & 0xffff + + procFeatures := getProcFeatures() + + // ID_AA64PFR0_EL1 - Processor Feature Register 0 + // x--------------------------------------------------x + // | Name | bits | visible | + // |--------------------------------------------------| + // | DIT | [51-48] | y | + // |--------------------------------------------------| + // | SVE | [35-32] | y | + // |--------------------------------------------------| + // | GIC | [27-24] | n | + // |--------------------------------------------------| + // | AdvSIMD | [23-20] | y | + // |--------------------------------------------------| + // | FP | [19-16] | y | + // |--------------------------------------------------| + // | EL3 | [15-12] | n | + // |--------------------------------------------------| + // | EL2 | [11-8] | n | + // |--------------------------------------------------| + // | EL1 | [7-4] | n | + // |--------------------------------------------------| + // | EL0 | [3-0] | n | + // x--------------------------------------------------x + + var f flagSet + // if procFeatures&(0xf<<48) != 0 { + // fmt.Println("DIT") + // } + f.setIf(procFeatures&(0xf<<32) != 0, SVE) + if procFeatures&(0xf<<20) != 15<<20 { + f.set(ASIMD) + // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64pfr0_el1 + // 0b0001 --> As for 0b0000, and also includes support for half-precision floating-point arithmetic. + f.setIf(procFeatures&(0xf<<20) == 1<<20, FPHP, ASIMDHP) + } + f.setIf(procFeatures&(0xf<<16) != 0, FP) + + instAttrReg0, instAttrReg1 := getInstAttributes() + + // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1 + // + // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 + // x--------------------------------------------------x + // | Name | bits | visible | + // |--------------------------------------------------| + // | TS | [55-52] | y | + // |--------------------------------------------------| + // | FHM | [51-48] | y | + // |--------------------------------------------------| + // | DP | [47-44] | y | + // |--------------------------------------------------| + // | SM4 | [43-40] | y | + // |--------------------------------------------------| + // | SM3 | [39-36] | y | + // |--------------------------------------------------| + // | SHA3 | [35-32] | y | + // |--------------------------------------------------| + // | RDM | [31-28] | y | + // |--------------------------------------------------| + // | ATOMICS | [23-20] | y | + // |--------------------------------------------------| + // | CRC32 | [19-16] | y | + // |--------------------------------------------------| + // | SHA2 | [15-12] | y | + // |--------------------------------------------------| + // | SHA1 | [11-8] | y | + // |--------------------------------------------------| + // | AES | [7-4] | y | + // x--------------------------------------------------x + + // if instAttrReg0&(0xf<<52) != 0 { + // fmt.Println("TS") + // } + // if instAttrReg0&(0xf<<48) != 0 { + // fmt.Println("FHM") + // } + f.setIf(instAttrReg0&(0xf<<44) != 0, ASIMDDP) + f.setIf(instAttrReg0&(0xf<<40) != 0, SM4) + f.setIf(instAttrReg0&(0xf<<36) != 0, SM3) + f.setIf(instAttrReg0&(0xf<<32) != 0, SHA3) + f.setIf(instAttrReg0&(0xf<<28) != 0, ASIMDRDM) + f.setIf(instAttrReg0&(0xf<<20) != 0, ATOMICS) + f.setIf(instAttrReg0&(0xf<<16) != 0, CRC32) + f.setIf(instAttrReg0&(0xf<<12) != 0, SHA2) + // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1 + // 0b0010 --> As 0b0001, plus SHA512H, SHA512H2, SHA512SU0, and SHA512SU1 instructions implemented. + f.setIf(instAttrReg0&(0xf<<12) == 2<<12, SHA512) + f.setIf(instAttrReg0&(0xf<<8) != 0, SHA1) + f.setIf(instAttrReg0&(0xf<<4) != 0, AESARM) + // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1 + // 0b0010 --> As for 0b0001, plus PMULL/PMULL2 instructions operating on 64-bit data quantities. + f.setIf(instAttrReg0&(0xf<<4) == 2<<4, PMULL) + + // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar1_el1 + // + // ID_AA64ISAR1_EL1 - Instruction set attribute register 1 + // x--------------------------------------------------x + // | Name | bits | visible | + // |--------------------------------------------------| + // | GPI | [31-28] | y | + // |--------------------------------------------------| + // | GPA | [27-24] | y | + // |--------------------------------------------------| + // | LRCPC | [23-20] | y | + // |--------------------------------------------------| + // | FCMA | [19-16] | y | + // |--------------------------------------------------| + // | JSCVT | [15-12] | y | + // |--------------------------------------------------| + // | API | [11-8] | y | + // |--------------------------------------------------| + // | APA | [7-4] | y | + // |--------------------------------------------------| + // | DPB | [3-0] | y | + // x--------------------------------------------------x + + // if instAttrReg1&(0xf<<28) != 0 { + // fmt.Println("GPI") + // } + f.setIf(instAttrReg1&(0xf<<28) != 24, GPA) + f.setIf(instAttrReg1&(0xf<<20) != 0, LRCPC) + f.setIf(instAttrReg1&(0xf<<16) != 0, FCMA) + f.setIf(instAttrReg1&(0xf<<12) != 0, JSCVT) + // if instAttrReg1&(0xf<<8) != 0 { + // fmt.Println("API") + // } + // if instAttrReg1&(0xf<<4) != 0 { + // fmt.Println("APA") + // } + f.setIf(instAttrReg1&(0xf<<0) != 0, DCPOP) + + // Store + c.featureSet.or(f) +} diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_ref.go b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go new file mode 100644 index 000000000000..e9c8606ab920 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go @@ -0,0 +1,14 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build !amd64,!386,!arm64 gccgo noasm appengine + +package cpuid + +func initCPU() { + cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 } + cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 } + xgetbv = func(uint32) (a, b uint32) { return 0, 0 } + rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 } +} + +func addInfo(info *CPUInfo, safe bool) {} diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go new file mode 100644 index 000000000000..367c35c88c23 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go @@ -0,0 +1,35 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build 386,!gccgo,!noasm,!appengine amd64,!gccgo,!noasm,!appengine + +package cpuid + +func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +func asmXgetbv(index uint32) (eax, edx uint32) +func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +func asmDarwinHasAVX512() bool + +func initCPU() { + cpuid = asmCpuid + cpuidex = asmCpuidex + xgetbv = asmXgetbv + rdtscpAsm = asmRdtscpAsm + darwinHasAVX512 = asmDarwinHasAVX512 +} + +func addInfo(c *CPUInfo, safe bool) { + c.maxFunc = maxFunctionID() + c.maxExFunc = maxExtendedFunction() + c.BrandName = brandName() + c.CacheLine = cacheLine() + c.Family, c.Model = familyModel() + c.featureSet = support() + c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC)) + c.ThreadsPerCore = threadsPerCore() + c.LogicalCores = logicalCores() + c.PhysicalCores = physicalCores() + c.VendorID, c.VendorString = vendorID() + c.cacheSize() + c.frequencies() +} diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go new file mode 100644 index 000000000000..b1fe42e467bb --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go @@ -0,0 +1,185 @@ +// Code generated by "stringer -type=FeatureID,Vendor"; DO NOT EDIT. + +package cpuid + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[ADX-1] + _ = x[AESNI-2] + _ = x[AMD3DNOW-3] + _ = x[AMD3DNOWEXT-4] + _ = x[AMXBF16-5] + _ = x[AMXINT8-6] + _ = x[AMXTILE-7] + _ = x[AVX-8] + _ = x[AVX2-9] + _ = x[AVX512BF16-10] + _ = x[AVX512BITALG-11] + _ = x[AVX512BW-12] + _ = x[AVX512CD-13] + _ = x[AVX512DQ-14] + _ = x[AVX512ER-15] + _ = x[AVX512F-16] + _ = x[AVX512FP16-17] + _ = x[AVX512IFMA-18] + _ = x[AVX512PF-19] + _ = x[AVX512VBMI-20] + _ = x[AVX512VBMI2-21] + _ = x[AVX512VL-22] + _ = x[AVX512VNNI-23] + _ = x[AVX512VP2INTERSECT-24] + _ = x[AVX512VPOPCNTDQ-25] + _ = x[AVXSLOW-26] + _ = x[BMI1-27] + _ = x[BMI2-28] + _ = x[CLDEMOTE-29] + _ = x[CLMUL-30] + _ = x[CLZERO-31] + _ = x[CMOV-32] + _ = x[CPBOOST-33] + _ = x[CX16-34] + _ = x[ENQCMD-35] + _ = x[ERMS-36] + _ = x[F16C-37] + _ = x[FMA3-38] + _ = x[FMA4-39] + _ = x[GFNI-40] + _ = x[HLE-41] + _ = x[HTT-42] + _ = x[HWA-43] + _ = x[HYPERVISOR-44] + _ = x[IBPB-45] + _ = x[IBS-46] + _ = x[IBSBRNTRGT-47] + _ = x[IBSFETCHSAM-48] + _ = x[IBSFFV-49] + _ = x[IBSOPCNT-50] + _ = x[IBSOPCNTEXT-51] + _ = x[IBSOPSAM-52] + _ = x[IBSRDWROPCNT-53] + _ = x[IBSRIPINVALIDCHK-54] + _ = x[INT_WBINVD-55] + _ = x[INVLPGB-56] + _ = x[LZCNT-57] + _ = x[MCAOVERFLOW-58] + _ = x[MCOMMIT-59] + _ = x[MMX-60] + _ = x[MMXEXT-61] + _ = x[MOVDIR64B-62] + _ = x[MOVDIRI-63] + _ = x[MPX-64] + _ = x[MSRIRC-65] + _ = x[NX-66] + _ = x[POPCNT-67] + _ = x[RDPRU-68] + _ = x[RDRAND-69] + _ = x[RDSEED-70] + _ = x[RDTSCP-71] + _ = x[RTM-72] + _ = x[RTM_ALWAYS_ABORT-73] + _ = x[SERIALIZE-74] + _ = x[SGX-75] + _ = x[SGXLC-76] + _ = x[SHA-77] + _ = x[SSE-78] + _ = x[SSE2-79] + _ = x[SSE3-80] + _ = x[SSE4-81] + _ = x[SSE42-82] + _ = x[SSE4A-83] + _ = x[SSSE3-84] + _ = x[STIBP-85] + _ = x[SUCCOR-86] + _ = x[TBM-87] + _ = x[TSXLDTRK-88] + _ = x[VAES-89] + _ = x[VMX-90] + _ = x[VPCLMULQDQ-91] + _ = x[WAITPKG-92] + _ = x[WBNOINVD-93] + _ = x[XOP-94] + _ = x[AESARM-95] + _ = x[ARMCPUID-96] + _ = x[ASIMD-97] + _ = x[ASIMDDP-98] + _ = x[ASIMDHP-99] + _ = x[ASIMDRDM-100] + _ = x[ATOMICS-101] + _ = x[CRC32-102] + _ = x[DCPOP-103] + _ = x[EVTSTRM-104] + _ = x[FCMA-105] + _ = x[FP-106] + _ = x[FPHP-107] + _ = x[GPA-108] + _ = x[JSCVT-109] + _ = x[LRCPC-110] + _ = x[PMULL-111] + _ = x[SHA1-112] + _ = x[SHA2-113] + _ = x[SHA3-114] + _ = x[SHA512-115] + _ = x[SM3-116] + _ = x[SM4-117] + _ = x[SVE-118] + _ = x[lastID-119] + _ = x[firstID-0] +} + +const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXSLOWBMI1BMI2CLDEMOTECLMULCLZEROCMOVCPBOOSTCX16ENQCMDERMSF16CFMA3FMA4GFNIHLEHTTHWAHYPERVISORIBPBIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKINT_WBINVDINVLPGBLZCNTMCAOVERFLOWMCOMMITMMXMMXEXTMOVDIR64BMOVDIRIMPXMSRIRCNXPOPCNTRDPRURDRANDRDSEEDRDTSCPRTMRTM_ALWAYS_ABORTSERIALIZESGXSGXLCSHASSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSUCCORTBMTSXLDTRKVAESVMXVPCLMULQDQWAITPKGWBNOINVDXOPAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" + +var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 58, 62, 72, 84, 92, 100, 108, 116, 123, 133, 143, 151, 161, 172, 180, 190, 208, 223, 230, 234, 238, 246, 251, 257, 261, 268, 272, 278, 282, 286, 290, 294, 298, 301, 304, 307, 317, 321, 324, 334, 345, 351, 359, 370, 378, 390, 406, 416, 423, 428, 439, 446, 449, 455, 464, 471, 474, 480, 482, 488, 493, 499, 505, 511, 514, 530, 539, 542, 547, 550, 553, 557, 561, 565, 570, 575, 580, 585, 591, 594, 602, 606, 609, 619, 626, 634, 637, 643, 651, 656, 663, 670, 678, 685, 690, 695, 702, 706, 708, 712, 715, 720, 725, 730, 734, 738, 742, 748, 751, 754, 757, 763} + +func (i FeatureID) String() string { + if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) { + return "FeatureID(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _FeatureID_name[_FeatureID_index[i]:_FeatureID_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[VendorUnknown-0] + _ = x[Intel-1] + _ = x[AMD-2] + _ = x[VIA-3] + _ = x[Transmeta-4] + _ = x[NSC-5] + _ = x[KVM-6] + _ = x[MSVM-7] + _ = x[VMware-8] + _ = x[XenHVM-9] + _ = x[Bhyve-10] + _ = x[Hygon-11] + _ = x[SiS-12] + _ = x[RDC-13] + _ = x[Ampere-14] + _ = x[ARM-15] + _ = x[Broadcom-16] + _ = x[Cavium-17] + _ = x[DEC-18] + _ = x[Fujitsu-19] + _ = x[Infineon-20] + _ = x[Motorola-21] + _ = x[NVIDIA-22] + _ = x[AMCC-23] + _ = x[Qualcomm-24] + _ = x[Marvell-25] + _ = x[lastVendor-26] +} + +const _Vendor_name = "VendorUnknownIntelAMDVIATransmetaNSCKVMMSVMVMwareXenHVMBhyveHygonSiSRDCAmpereARMBroadcomCaviumDECFujitsuInfineonMotorolaNVIDIAAMCCQualcommMarvelllastVendor" + +var _Vendor_index = [...]uint8{0, 13, 18, 21, 24, 33, 36, 39, 43, 49, 55, 60, 65, 68, 71, 77, 80, 88, 94, 97, 104, 112, 120, 126, 130, 138, 145, 155} + +func (i Vendor) String() string { + if i < 0 || i >= Vendor(len(_Vendor_index)-1) { + return "Vendor(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _Vendor_name[_Vendor_index[i]:_Vendor_index[i+1]] +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go new file mode 100644 index 000000000000..8d2cb0368bcf --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go @@ -0,0 +1,19 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. + +package cpuid + +import "runtime" + +func detectOS(c *CPUInfo) bool { + // There are no hw.optional sysctl values for the below features on Mac OS 11.0 + // to detect their supported state dynamically. Assume the CPU features that + // Apple Silicon M1 supports to be available as a minimal set of features + // to all Go programs running on darwin/arm64. + // TODO: Add more if we know them. + c.featureSet.setIf(runtime.GOOS != "ios", AESARM, PMULL, SHA1, SHA2) + c.PhysicalCores = runtime.NumCPU() + // For now assuming 1 thread per core... + c.ThreadsPerCore = 1 + c.LogicalCores = c.PhysicalCores + return true +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go new file mode 100644 index 000000000000..ee278b9e4bcf --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go @@ -0,0 +1,130 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. + +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file located +// here https://github.com/golang/sys/blob/master/LICENSE + +package cpuid + +import ( + "encoding/binary" + "io/ioutil" + "runtime" +) + +// HWCAP bits. +const ( + hwcap_FP = 1 << 0 + hwcap_ASIMD = 1 << 1 + hwcap_EVTSTRM = 1 << 2 + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 + hwcap_FPHP = 1 << 9 + hwcap_ASIMDHP = 1 << 10 + hwcap_CPUID = 1 << 11 + hwcap_ASIMDRDM = 1 << 12 + hwcap_JSCVT = 1 << 13 + hwcap_FCMA = 1 << 14 + hwcap_LRCPC = 1 << 15 + hwcap_DCPOP = 1 << 16 + hwcap_SHA3 = 1 << 17 + hwcap_SM3 = 1 << 18 + hwcap_SM4 = 1 << 19 + hwcap_ASIMDDP = 1 << 20 + hwcap_SHA512 = 1 << 21 + hwcap_SVE = 1 << 22 + hwcap_ASIMDFHM = 1 << 23 +) + +func detectOS(c *CPUInfo) bool { + // For now assuming no hyperthreading is reasonable. + c.LogicalCores = runtime.NumCPU() + c.PhysicalCores = c.LogicalCores + c.ThreadsPerCore = 1 + if hwcap == 0 { + // We did not get values from the runtime. + // Try reading /proc/self/auxv + + // From https://github.com/golang/sys + const ( + _AT_HWCAP = 16 + _AT_HWCAP2 = 26 + + uintSize = int(32 << (^uint(0) >> 63)) + ) + + buf, err := ioutil.ReadFile("/proc/self/auxv") + if err != nil { + // e.g. on android /proc/self/auxv is not accessible, so silently + // ignore the error and leave Initialized = false. On some + // architectures (e.g. arm64) doinit() implements a fallback + // readout and will set Initialized = true again. + return false + } + bo := binary.LittleEndian + for len(buf) >= 2*(uintSize/8) { + var tag, val uint + switch uintSize { + case 32: + tag = uint(bo.Uint32(buf[0:])) + val = uint(bo.Uint32(buf[4:])) + buf = buf[8:] + case 64: + tag = uint(bo.Uint64(buf[0:])) + val = uint(bo.Uint64(buf[8:])) + buf = buf[16:] + } + switch tag { + case _AT_HWCAP: + hwcap = val + case _AT_HWCAP2: + // Not used + } + } + if hwcap == 0 { + return false + } + } + + // HWCap was populated by the runtime from the auxiliary vector. + // Use HWCap information since reading aarch64 system registers + // is not supported in user space on older linux kernels. + c.featureSet.setIf(isSet(hwcap, hwcap_AES), AESARM) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMD), ASIMD) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDDP), ASIMDDP) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDHP), ASIMDHP) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDRDM), ASIMDRDM) + c.featureSet.setIf(isSet(hwcap, hwcap_CPUID), ARMCPUID) + c.featureSet.setIf(isSet(hwcap, hwcap_CRC32), CRC32) + c.featureSet.setIf(isSet(hwcap, hwcap_DCPOP), DCPOP) + c.featureSet.setIf(isSet(hwcap, hwcap_EVTSTRM), EVTSTRM) + c.featureSet.setIf(isSet(hwcap, hwcap_FCMA), FCMA) + c.featureSet.setIf(isSet(hwcap, hwcap_FP), FP) + c.featureSet.setIf(isSet(hwcap, hwcap_FPHP), FPHP) + c.featureSet.setIf(isSet(hwcap, hwcap_JSCVT), JSCVT) + c.featureSet.setIf(isSet(hwcap, hwcap_LRCPC), LRCPC) + c.featureSet.setIf(isSet(hwcap, hwcap_PMULL), PMULL) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA1), SHA1) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA2), SHA2) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA3), SHA3) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA512), SHA512) + c.featureSet.setIf(isSet(hwcap, hwcap_SM3), SM3) + c.featureSet.setIf(isSet(hwcap, hwcap_SM4), SM4) + c.featureSet.setIf(isSet(hwcap, hwcap_SVE), SVE) + + // The Samsung S9+ kernel reports support for atomics, but not all cores + // actually support them, resulting in SIGILL. See issue #28431. + // TODO(elias.naur): Only disable the optimization on bad chipsets on android. + c.featureSet.setIf(isSet(hwcap, hwcap_ATOMICS) && runtime.GOOS != "android", ATOMICS) + + return true +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go new file mode 100644 index 000000000000..1a951e6ca00e --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go @@ -0,0 +1,17 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. + +// +build arm64 +// +build !linux +// +build !darwin + +package cpuid + +import "runtime" + +func detectOS(c *CPUInfo) bool { + c.PhysicalCores = runtime.NumCPU() + // For now assuming 1 thread per core... + c.ThreadsPerCore = 1 + c.LogicalCores = c.PhysicalCores + return false +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go new file mode 100644 index 000000000000..4d0b8b465b3a --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go @@ -0,0 +1,7 @@ +// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. + +//+build nounsafe + +package cpuid + +var hwcap uint diff --git a/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go new file mode 100644 index 000000000000..329800286e6e --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go @@ -0,0 +1,10 @@ +// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. + +//+build !nounsafe + +package cpuid + +import _ "unsafe" // needed for go:linkname + +//go:linkname hwcap internal/cpu.HWCap +var hwcap uint diff --git a/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh new file mode 100644 index 000000000000..471d986d2488 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +set -e + +go tool dist list | while IFS=/ read os arch; do + echo "Checking $os/$arch..." + echo " normal" + GOARCH=$arch GOOS=$os go build -o /dev/null . + echo " noasm" + GOARCH=$arch GOOS=$os go build -tags noasm -o /dev/null . + echo " appengine" + GOARCH=$arch GOOS=$os go build -tags appengine -o /dev/null . + echo " noasm,appengine" + GOARCH=$arch GOOS=$os go build -tags 'appengine noasm' -o /dev/null . +done diff --git a/vendor/github.com/zeebo/xxh3/.gitignore b/vendor/github.com/zeebo/xxh3/.gitignore new file mode 100644 index 000000000000..928e12f5306f --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/.gitignore @@ -0,0 +1,6 @@ +upstream +*.pprof +xxh3.test +.vscode +*.txt +_compat diff --git a/vendor/github.com/zeebo/xxh3/LICENSE b/vendor/github.com/zeebo/xxh3/LICENSE new file mode 100644 index 000000000000..477f8e5e1ea7 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/LICENSE @@ -0,0 +1,25 @@ +xxHash Library +Copyright (c) 2012-2014, Yann Collet +Copyright (c) 2019, Jeff Wendling +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/zeebo/xxh3/Makefile b/vendor/github.com/zeebo/xxh3/Makefile new file mode 100644 index 000000000000..8bd78c48246e --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/Makefile @@ -0,0 +1,27 @@ +.PHONY: all vet +all: genasm _compat + +genasm: avo/avx.go avo/sse.go + cd ./avo; go generate gen.go + +clean: + rm accum_vector_avx_amd64.s + rm accum_vector_sse_amd64.s + rm _compat + +upstream/xxhash.o: upstream/xxhash.h + ( cd upstream && make ) + +_compat: _compat.c upstream/xxhash.o + gcc -o _compat _compat.c ./upstream/xxhash.o + +vet: + GOOS=linux GOARCH=386 GO386=softfloat go vet ./... + GOOS=windows GOARCH=386 GO386=softfloat go vet ./... + GOOS=linux GOARCH=amd64 go vet ./... + GOOS=windows GOARCH=amd64 go vet ./... + GOOS=darwin GOARCH=amd64 go vet ./... + GOOS=linux GOARCH=arm go vet ./... + GOOS=linux GOARCH=arm64 go vet ./... + GOOS=windows GOARCH=arm64 go vet ./... + GOOS=darwin GOARCH=arm64 go vet ./... \ No newline at end of file diff --git a/vendor/github.com/zeebo/xxh3/README.md b/vendor/github.com/zeebo/xxh3/README.md new file mode 100644 index 000000000000..4633fc03a84b --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/README.md @@ -0,0 +1,38 @@ +# XXH3 +[![GoDoc](https://godoc.org/github.com/zeebo/xxh3?status.svg)](https://godoc.org/github.com/zeebo/xxh3) +[![Sourcegraph](https://sourcegraph.com/github.com/zeebo/xxh3/-/badge.svg)](https://sourcegraph.com/github.com/zeebo/xxh3?badge) +[![Go Report Card](https://goreportcard.com/badge/github.com/zeebo/xxh3)](https://goreportcard.com/report/github.com/zeebo/xxh3) + +This package is a port of the [xxh3](https://github.com/Cyan4973/xxHash) library to Go. + +Upstream has fixed the output as of v0.8.0, and this package matches that. + +--- + +# Benchmarks + +Run on my `i7-8850H CPU @ 2.60GHz` + +## Small Sizes + +| Bytes | Rate | +|-----------|--------------------------------------| +|` 0 ` |` 0.74 ns/op ` | +|` 1-3 ` |` 4.19 ns/op (0.24 GB/s - 0.71 GB/s) `| +|` 4-8 ` |` 4.16 ns/op (0.97 GB/s - 1.98 GB/s) `| +|` 9-16 ` |` 4.46 ns/op (2.02 GB/s - 3.58 GB/s) `| +|` 17-32 ` |` 6.22 ns/op (2.76 GB/s - 5.15 GB/s) `| +|` 33-64 ` |` 8.00 ns/op (4.13 GB/s - 8.13 GB/s) `| +|` 65-96 ` |` 11.0 ns/op (5.91 GB/s - 8.84 GB/s) `| +|` 97-128 ` |` 12.8 ns/op (7.68 GB/s - 10.0 GB/s) `| + +## Large Sizes + +| Bytes | Rate | SSE2 Rate | AVX2 Rate | +|---------|--------------------------|--------------------------|--------------------------| +|` 129 ` |` 13.6 ns/op (9.45 GB/s) `| | | +|` 240 ` |` 23.8 ns/op (10.1 GB/s) `| | | +|` 241 ` |` 40.5 ns/op (5.97 GB/s) `|` 23.3 ns/op (10.4 GB/s) `|` 20.1 ns/op (12.0 GB/s) `| +|` 512 ` |` 69.8 ns/op (7.34 GB/s) `|` 30.4 ns/op (16.9 GB/s) `|` 24.7 ns/op (20.7 GB/s) `| +|` 1024 ` |` 132 ns/op (7.77 GB/s) `|` 48.9 ns/op (20.9 GB/s) `|` 37.7 ns/op (27.2 GB/s) `| +|` 100KB `|` 13.0 us/op (7.88 GB/s) `|` 4.05 us/op (25.3 GB/s) `|` 2.31 us/op (44.3 GB/s) `| diff --git a/vendor/github.com/zeebo/xxh3/_compat.c b/vendor/github.com/zeebo/xxh3/_compat.c new file mode 100644 index 000000000000..fda9f36ff013 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/_compat.c @@ -0,0 +1,39 @@ +#include "upstream/xxhash.h" +#include + +int main() { + unsigned char buf[4096]; + for (int i = 0; i < 4096; i++) { + buf[i] = (unsigned char)((i+1)%251); + } + + printf("var testVecs64 = []uint64{\n"); + for (int i = 0; i < 4096; i++) { + if (i % 4 == 0) { + printf("\t"); + } + + uint64_t h = XXH3_64bits(buf, (size_t)i); + printf("0x%lx, ", h); + + if (i % 4 == 3) { + printf("\n\t"); + } + } + printf("}\n\n"); + + printf("var testVecs128 = [][2]uint64{\n"); + for (int i = 0; i < 4096; i++) { + if (i % 4 == 0) { + printf("\t"); + } + + XXH128_hash_t h = XXH3_128bits(buf, (size_t)i); + printf("{0x%lx, 0x%lx}, ", h.high64, h.low64); + + if (i % 4 == 3) { + printf("\n"); + } + } + printf("}\n\n"); +} diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go new file mode 100644 index 000000000000..b1be78507175 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_generic.go @@ -0,0 +1,542 @@ +package xxh3 + +// avx512Switch is the size at which the avx512 code is used. +// Bigger blocks benefit more. +const avx512Switch = 1 << 10 + +func accumScalar(accs *[8]u64, p, secret ptr, l u64) { + if secret != key { + accumScalarSeed(accs, p, secret, l) + return + } + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ key64_121 + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ key64_129 + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ key64_137 + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ key64_145 + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ key64_153 + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ key64_161 + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ key64_169 + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ key64_177 + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +func accumBlockScalar(accs *[8]u64, p, secret ptr) { + if secret != key { + accumBlockScalarSeed(accs, p, secret) + return + } + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 +} + +// accumScalarSeed should be used with custom key. +func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) { + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 121) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 129) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 137) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 145) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 153) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 161) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 169) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 177) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +// accumBlockScalarSeed should be used with custom key. +func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) { + // accs + { + secret := secret + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 +} diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go new file mode 100644 index 000000000000..9baff6c41c7a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go @@ -0,0 +1,40 @@ +package xxh3 + +import ( + "unsafe" + + "github.com/klauspost/cpuid/v2" +) + +var ( + hasAVX2 = cpuid.CPU.Has(cpuid.AVX2) + hasSSE2 = cpuid.CPU.Has(cpuid.SSE2) // Always true on amd64 + hasAVX512 = cpuid.CPU.Has(cpuid.AVX512F) +) + +//go:noescape +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) + +//go:noescape +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) + +func withOverrides(avx512, avx2, sse2 bool, cb func()) { + avx512Orig, avx2Orig, sse2Orig := hasAVX512, hasAVX2, hasSSE2 + hasAVX512, hasAVX2, hasSSE2 = avx512, avx2, sse2 + defer func() { hasAVX512, hasAVX2, hasSSE2 = avx512Orig, avx2Orig, sse2Orig }() + cb() +} + +func withAVX512(cb func()) { withOverrides(hasAVX512, false, false, cb) } +func withAVX2(cb func()) { withOverrides(false, hasAVX2, false, cb) } +func withSSE2(cb func()) { withOverrides(false, false, hasSSE2, cb) } +func withGeneric(cb func()) { withOverrides(false, false, false, cb) } diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_other.go b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go new file mode 100644 index 000000000000..93bf6258a828 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go @@ -0,0 +1,25 @@ +//go:build !amd64 +// +build !amd64 + +package xxh3 + +import ( + "unsafe" +) + +const ( + hasAVX2 = false + hasSSE2 = false + hasAVX512 = false +) + +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } + +func withAVX512(cb func()) { cb() } +func withAVX2(cb func()) { cb() } +func withSSE2(cb func()) { cb() } +func withGeneric(cb func()) { cb() } diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s new file mode 100644 index 000000000000..cfaf9f0a77d2 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s @@ -0,0 +1,379 @@ +// Code generated by command: go run gen.go -avx512 -out ../accum_vector_avx512_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_avx512<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+24(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+32(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+40(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+48(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+56(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx512<>(SB), RODATA|NOPTR, $64 + +// func accumAVX512(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX512F, MMX+ +TEXT ·accumAVX512(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ len+24(FP), BX + VMOVDQU64 (AX), Z1 + VMOVDQU64 prime_avx512<>+0(SB), Z0 + VMOVDQU64 (DX), Z2 + VMOVDQU64 8(DX), Z3 + VMOVDQU64 16(DX), Z4 + VMOVDQU64 24(DX), Z5 + VMOVDQU64 32(DX), Z6 + VMOVDQU64 40(DX), Z7 + VMOVDQU64 48(DX), Z8 + VMOVDQU64 56(DX), Z9 + VMOVDQU64 64(DX), Z10 + VMOVDQU64 72(DX), Z11 + VMOVDQU64 80(DX), Z12 + VMOVDQU64 88(DX), Z13 + VMOVDQU64 96(DX), Z14 + VMOVDQU64 104(DX), Z15 + VMOVDQU64 112(DX), Z16 + VMOVDQU64 120(DX), Z17 + VMOVDQU64 128(DX), Z18 + VMOVDQU64 121(DX), Z19 + +accum_large: + CMPQ BX, $0x00000400 + JLE accum + VMOVDQU64 (CX), Z20 + PREFETCHT0 1024(CX) + VPXORD Z2, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 64(CX), Z20 + PREFETCHT0 1088(CX) + VPXORD Z3, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 128(CX), Z20 + PREFETCHT0 1152(CX) + VPXORD Z4, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 192(CX), Z20 + PREFETCHT0 1216(CX) + VPXORD Z5, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 256(CX), Z20 + PREFETCHT0 1280(CX) + VPXORD Z6, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 320(CX), Z20 + PREFETCHT0 1344(CX) + VPXORD Z7, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 384(CX), Z20 + PREFETCHT0 1408(CX) + VPXORD Z8, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 448(CX), Z20 + PREFETCHT0 1472(CX) + VPXORD Z9, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 512(CX), Z20 + PREFETCHT0 1536(CX) + VPXORD Z10, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 576(CX), Z20 + PREFETCHT0 1600(CX) + VPXORD Z11, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 640(CX), Z20 + PREFETCHT0 1664(CX) + VPXORD Z12, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 704(CX), Z20 + PREFETCHT0 1728(CX) + VPXORD Z13, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 768(CX), Z20 + PREFETCHT0 1792(CX) + VPXORD Z14, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 832(CX), Z20 + PREFETCHT0 1856(CX) + VPXORD Z15, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 896(CX), Z20 + PREFETCHT0 1920(CX) + VPXORD Z16, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 960(CX), Z20 + PREFETCHT0 1984(CX) + VPXORD Z17, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + ADDQ $0x00000400, CX + SUBQ $0x00000400, BX + VPSRLQ $0x2f, Z1, Z20 + VPTERNLOGD $0x96, Z1, Z18, Z20 + VPMULUDQ Z0, Z20, Z1 + VPSHUFD $0xf5, Z20, Z20 + VPMULUDQ Z0, Z20, Z20 + VPSLLQ $0x20, Z20, Z20 + VPADDQ Z1, Z20, Z1 + JMP accum_large + +accum: + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z2, Z0, Z2 + VPSHUFD $0x31, Z2, Z18 + VPMULUDQ Z2, Z18, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z3, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z4, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z5, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z6, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z7, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z8, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z9, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z10, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z11, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z12, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z13, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z14, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z15, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z16, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z17, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + +finalize: + CMPQ BX, $0x00 + JE return + SUBQ $0x40, CX + ADDQ BX, CX + VMOVDQU64 (CX), Z0 + VPXORD Z19, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + +return: + VMOVDQU64 Z1, (AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s new file mode 100644 index 000000000000..b53c1521f76c --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s @@ -0,0 +1,586 @@ +// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx<>(SB), RODATA|NOPTR, $32 + +// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX2, MMX+ +TEXT ·accumAVX2(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + PREFETCHT0 512(CX) + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + PREFETCHT0 576(CX) + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + PREFETCHT0 640(CX) + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + PREFETCHT0 704(CX) + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + PREFETCHT0 768(CX) + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + PREFETCHT0 832(CX) + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + PREFETCHT0 896(CX) + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + PREFETCHT0 960(CX) + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + PREFETCHT0 1024(CX) + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + PREFETCHT0 1088(CX) + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + PREFETCHT0 1152(CX) + VPXOR 80(DX), Y3, Y4 + VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + PREFETCHT0 1216(CX) + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + PREFETCHT0 1280(CX) + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + PREFETCHT0 1344(CX) + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + PREFETCHT0 1408(CX) + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + PREFETCHT0 1472(CX) + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR (BX), Y0, Y3 + VPXOR 32(BX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + VPADDQ Y2, Y6, Y2 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR 121(DX), Y0, Y3 + VPXOR 153(DX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + VPADDQ Y2, Y6, Y2 + +return: + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET + +// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte) +// Requires: AVX, AVX2 +TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + VPXOR 80(DX), Y3, Y4 + VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s new file mode 100644 index 000000000000..ba670e560226 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s @@ -0,0 +1,1236 @@ +// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_sse<>+0(SB)/4, $0x9e3779b1 +DATA prime_sse<>+4(SB)/4, $0x9e3779b1 +DATA prime_sse<>+8(SB)/4, $0x9e3779b1 +DATA prime_sse<>+12(SB)/4, $0x9e3779b1 +GLOBL prime_sse<>(SB), RODATA|NOPTR, $16 + +// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: SSE2 +TEXT ·accumSSE(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + MOVOU (CX), X0 + MOVOU (BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 16(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 32(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 48(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + MOVOU (CX), X0 + MOVOU 121(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 137(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 153(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 169(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + +return: + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET + +// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte) +// Requires: SSE2 +TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET diff --git a/vendor/github.com/zeebo/xxh3/consts.go b/vendor/github.com/zeebo/xxh3/consts.go new file mode 100644 index 000000000000..39ef6e179910 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/consts.go @@ -0,0 +1,97 @@ +package xxh3 + +const ( + _stripe = 64 + _block = 1024 + + prime32_1 = 2654435761 + prime32_2 = 2246822519 + prime32_3 = 3266489917 + + prime64_1 = 11400714785074694791 + prime64_2 = 14029467366897019727 + prime64_3 = 1609587929392839161 + prime64_4 = 9650029242287828579 + prime64_5 = 2870177450012600261 +) + +var key = ptr(&[...]u8{ + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe /* 8 */, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, /* 16 */ + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb /* 24 */, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, /* 32 */ + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78 /* 40 */, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, /* 48 */ + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e /* 56 */, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, /* 64 */ + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb /* 72 */, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, /* 80 */ + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e /* 88 */, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, /* 96 */ + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f /* 104 */, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, /* 112 */ + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31 /* 120 */, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, /* 128 */ + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3 /* 136 */, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, /* 144 */ + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49 /* 152 */, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, /* 160 */ + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc /* 168 */, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, /* 176 */ + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28 /* 184 */, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, /* 192 */ +}) + +const ( + key64_000 u64 = 0xbe4ba423396cfeb8 + key64_008 u64 = 0x1cad21f72c81017c + key64_016 u64 = 0xdb979083e96dd4de + key64_024 u64 = 0x1f67b3b7a4a44072 + key64_032 u64 = 0x78e5c0cc4ee679cb + key64_040 u64 = 0x2172ffcc7dd05a82 + key64_048 u64 = 0x8e2443f7744608b8 + key64_056 u64 = 0x4c263a81e69035e0 + key64_064 u64 = 0xcb00c391bb52283c + key64_072 u64 = 0xa32e531b8b65d088 + key64_080 u64 = 0x4ef90da297486471 + key64_088 u64 = 0xd8acdea946ef1938 + key64_096 u64 = 0x3f349ce33f76faa8 + key64_104 u64 = 0x1d4f0bc7c7bbdcf9 + key64_112 u64 = 0x3159b4cd4be0518a + key64_120 u64 = 0x647378d9c97e9fc8 + key64_128 u64 = 0xc3ebd33483acc5ea + key64_136 u64 = 0xeb6313faffa081c5 + key64_144 u64 = 0x49daf0b751dd0d17 + key64_152 u64 = 0x9e68d429265516d3 + key64_160 u64 = 0xfca1477d58be162b + key64_168 u64 = 0xce31d07ad1b8f88f + key64_176 u64 = 0x280416958f3acb45 + key64_184 u64 = 0x7e404bbbcafbd7af + + key64_103 u64 = 0x4f0bc7c7bbdcf93f + key64_111 u64 = 0x59b4cd4be0518a1d + key64_119 u64 = 0x7378d9c97e9fc831 + key64_127 u64 = 0xebd33483acc5ea64 + + key64_121 u64 = 0xea647378d9c97e9f + key64_129 u64 = 0xc5c3ebd33483acc5 + key64_137 u64 = 0x17eb6313faffa081 + key64_145 u64 = 0xd349daf0b751dd0d + key64_153 u64 = 0x2b9e68d429265516 + key64_161 u64 = 0x8ffca1477d58be16 + key64_169 u64 = 0x45ce31d07ad1b8f8 + key64_177 u64 = 0xaf280416958f3acb + + key64_011 = 0x6dd4de1cad21f72c + key64_019 = 0xa44072db979083e9 + key64_027 = 0xe679cb1f67b3b7a4 + key64_035 = 0xd05a8278e5c0cc4e + key64_043 = 0x4608b82172ffcc7d + key64_051 = 0x9035e08e2443f774 + key64_059 = 0x52283c4c263a81e6 + key64_067 = 0x65d088cb00c391bb + + key64_117 = 0xd9c97e9fc83159b4 + key64_125 = 0x3483acc5ea647378 + key64_133 = 0xfaffa081c5c3ebd3 + key64_141 = 0xb751dd0d17eb6313 + key64_149 = 0x29265516d349daf0 + key64_157 = 0x7d58be162b9e68d4 + key64_165 = 0x7ad1b8f88ffca147 + key64_173 = 0x958f3acb45ce31d0 +) + +const ( + key32_000 u32 = 0xbe4ba423 + key32_004 u32 = 0x396cfeb8 + key32_008 u32 = 0x1cad21f7 + key32_012 u32 = 0x2c81017c +) diff --git a/vendor/github.com/zeebo/xxh3/hash128.go b/vendor/github.com/zeebo/xxh3/hash128.go new file mode 100644 index 000000000000..0040a21bbce9 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128.go @@ -0,0 +1,253 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128 returns the 128-bit hash of the byte slice. +func Hash128(b []byte) Uint128 { + return hashAny128(*(*str)(ptr(&b))) +} + +// HashString128 returns the 128-bit hash of the string slice. +func HashString128(s string) Uint128 { + return hashAny128(*(*str)(ptr(&s))) +} + +func hashAny128(s str) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + const bitflipl = key64_032 ^ key64_040 + const bitfliph = key64_048 ^ key64_056 + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + const bitflip = key64_016 ^ key64_024 + + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return u128{0x99aa06d3014798d8, 0x6001c324468d497f} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000 ^ key32_004) + acc.Hi ^= uint64(key32_008 ^ key32_012) + + acc.Lo = xxh64AvalancheSmall(acc.Lo) + acc.Hi = xxh64AvalancheSmall(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(in8^key64_112, in7^key64_120) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^key64_096, i7^key64_104) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^key64_080, in5^key64_088) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^key64_064, i5^key64_072) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^key64_048, in3^key64_056) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_032, i3^key64_040) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^key64_016, in1^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^key64_016, i3^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^key64_048, i3^key64_056) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_032, i1^key64_040) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^key64_080, i3^key64_088) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_064, i1^key64_072) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += mulFold64(i2^key64_112, i3^key64_120) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_096, i1^key64_104) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125), readU64(key, i-117), readU64(key, i-109), readU64(key, i-101) + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + acc.Hi += mulFold64(i0^key64_119, i1^key64_127) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_103, i3^key64_111) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash128_seed.go b/vendor/github.com/zeebo/xxh3/hash128_seed.go new file mode 100644 index 000000000000..358009be320a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128_seed.go @@ -0,0 +1,264 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128Seed returns the 128-bit hash of the byte slice. +func Hash128Seed(b []byte, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&b)), seed) +} + +// HashString128Seed returns the 128-bit hash of the string slice. +func HashString128Seed(s string, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&s)), seed) +} + +func hashAny128Seed(s str, seed uint64) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + bitflipl := (key64_032 ^ key64_040) - seed + bitfliph := (key64_048 ^ key64_056) + seed + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + bitflip := (key64_016 ^ key64_024) + seed + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + bitflipl := key64_064 ^ key64_072 ^ seed + bitfliph := key64_080 ^ key64_088 ^ seed + return u128{Lo: xxh64AvalancheFull(bitflipl), Hi: xxh64AvalancheFull(bitfliph)} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000^key32_004) + seed + acc.Hi ^= uint64(key32_008^key32_012) - seed + + acc.Lo = xxh64AvalancheFull(acc.Lo) + acc.Hi = xxh64AvalancheFull(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(in8^(key64_112+seed), in7^(key64_120-seed)) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^(key64_096+seed), i7^(key64_104-seed)) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^(key64_080+seed), in5^(key64_088-seed)) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^(key64_064+seed), i5^(key64_072-seed)) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^(key64_048+seed), in3^(key64_056-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_032+seed), i3^(key64_040-seed)) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^(key64_016+seed), in1^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^(key64_016+seed), i3^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^(key64_048+seed), i3^(key64_056-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_032+seed), i1^(key64_040-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^(key64_080+seed), i3^(key64_088-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_064+seed), i1^(key64_072-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += mulFold64(i2^(key64_112+seed), i3^(key64_120-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_096+seed), i1^(key64_104-seed)) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125)+seed, readU64(key, i-117)-seed, readU64(key, i-109)+seed, readU64(key, i-101)-seed + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + seed := 0 - seed + acc.Hi += mulFold64(i0^(key64_119+seed), i1^(key64_127-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_103+seed), i3^(key64_111-seed)) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalar(&accs, p, secret, u64(l)) + } + + // merge accs + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64.go b/vendor/github.com/zeebo/xxh3/hash64.go new file mode 100644 index 000000000000..13aab9585607 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64.go @@ -0,0 +1,126 @@ +package xxh3 + +import "math/bits" + +// Hash returns the hash of the byte slice. +func Hash(b []byte) uint64 { + return hashAny(*(*str)(ptr(&b))) +} + +// Hash returns the hash of the string slice. +func HashString(s string) uint64 { + return hashAny(*(*str)(ptr(&s))) +} + +func hashAny(s str) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: // 4-8 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return 0x2d06800538d394c2 // xxh_avalanche(key64_056 ^ key64_064) + } + + acc ^= u64(key32_000 ^ key32_004) + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^key64_096, readU64(p, 7*8)^key64_104) + acc += mulFold64(readU64(p, ui(l)-8*8)^key64_112, readU64(p, ui(l)-7*8)^key64_120) + } // 96 + acc += mulFold64(readU64(p, 4*8)^key64_064, readU64(p, 5*8)^key64_072) + acc += mulFold64(readU64(p, ui(l)-6*8)^key64_080, readU64(p, ui(l)-5*8)^key64_088) + } // 64 + acc += mulFold64(readU64(p, 2*8)^key64_032, readU64(p, 3*8)^key64_040) + acc += mulFold64(readU64(p, ui(l)-4*8)^key64_048, readU64(p, ui(l)-3*8)^key64_056) + } // 32 + acc += mulFold64(readU64(p, 0*8)^key64_000, readU64(p, 1*8)^key64_008) + acc += mulFold64(readU64(p, ui(l)-2*8)^key64_016, readU64(p, ui(l)-1*8)^key64_024) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^key64_000, readU64(p, 0*16+8)^key64_008) + acc += mulFold64(readU64(p, 1*16+0)^key64_016, readU64(p, 1*16+8)^key64_024) + acc += mulFold64(readU64(p, 2*16+0)^key64_032, readU64(p, 2*16+8)^key64_040) + acc += mulFold64(readU64(p, 3*16+0)^key64_048, readU64(p, 3*16+8)^key64_056) + acc += mulFold64(readU64(p, 4*16+0)^key64_064, readU64(p, 4*16+8)^key64_072) + acc += mulFold64(readU64(p, 5*16+0)^key64_080, readU64(p, 5*16+8)^key64_088) + acc += mulFold64(readU64(p, 6*16+0)^key64_096, readU64(p, 6*16+8)^key64_104) + acc += mulFold64(readU64(p, 7*16+0)^key64_112, readU64(p, 7*16+8)^key64_120) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^readU64(key, i-125), readU64(p, i+8)^readU64(key, i-117)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^key64_119, readU64(p, ui(l)-8)^key64_127) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64_seed.go b/vendor/github.com/zeebo/xxh3/hash64_seed.go new file mode 100644 index 000000000000..429994c363ed --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64_seed.go @@ -0,0 +1,134 @@ +package xxh3 + +import "math/bits" + +// HashSeed returns the hash of the byte slice with given seed. +func HashSeed(b []byte, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&b)), seed) + +} + +// HashStringSeed returns the hash of the string slice with given seed. +func HashStringSeed(s string, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&s)), seed) +} + +func hashAnySeed(s str, seed uint64) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032 + seed) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048 - seed) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016 - seed) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: + return xxhAvalancheSmall(seed ^ key64_056 ^ key64_064) + } + + acc ^= u64(key32_000^key32_004) + seed + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^(key64_096+seed), readU64(p, 7*8)^(key64_104-seed)) + acc += mulFold64(readU64(p, ui(l)-8*8)^(key64_112+seed), readU64(p, ui(l)-7*8)^(key64_120-seed)) + } // 96 + acc += mulFold64(readU64(p, 4*8)^(key64_064+seed), readU64(p, 5*8)^(key64_072-seed)) + acc += mulFold64(readU64(p, ui(l)-6*8)^(key64_080+seed), readU64(p, ui(l)-5*8)^(key64_088-seed)) + } // 64 + acc += mulFold64(readU64(p, 2*8)^(key64_032+seed), readU64(p, 3*8)^(key64_040-seed)) + acc += mulFold64(readU64(p, ui(l)-4*8)^(key64_048+seed), readU64(p, ui(l)-3*8)^(key64_056-seed)) + } // 32 + acc += mulFold64(readU64(p, 0*8)^(key64_000+seed), readU64(p, 1*8)^(key64_008-seed)) + acc += mulFold64(readU64(p, ui(l)-2*8)^(key64_016+seed), readU64(p, ui(l)-1*8)^(key64_024-seed)) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^(key64_000+seed), readU64(p, 0*16+8)^(key64_008-seed)) + acc += mulFold64(readU64(p, 1*16+0)^(key64_016+seed), readU64(p, 1*16+8)^(key64_024-seed)) + acc += mulFold64(readU64(p, 2*16+0)^(key64_032+seed), readU64(p, 2*16+8)^(key64_040-seed)) + acc += mulFold64(readU64(p, 3*16+0)^(key64_048+seed), readU64(p, 3*16+8)^(key64_056-seed)) + acc += mulFold64(readU64(p, 4*16+0)^(key64_064+seed), readU64(p, 4*16+8)^(key64_072-seed)) + acc += mulFold64(readU64(p, 5*16+0)^(key64_080+seed), readU64(p, 5*16+8)^(key64_088-seed)) + acc += mulFold64(readU64(p, 6*16+0)^(key64_096+seed), readU64(p, 6*16+8)^(key64_104-seed)) + acc += mulFold64(readU64(p, 7*16+0)^(key64_112+seed), readU64(p, 7*16+8)^(key64_120-seed)) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^(readU64(key, i-125)+seed), readU64(p, i+8)^(readU64(key, i-117)-seed)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^(key64_119+seed), readU64(p, ui(l)-8)^(key64_127-seed)) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalarSeed(&accs, p, secret, u64(l)) + } + + // merge accs + acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hasher.go b/vendor/github.com/zeebo/xxh3/hasher.go new file mode 100644 index 000000000000..d9789980a3f0 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hasher.go @@ -0,0 +1,239 @@ +package xxh3 + +import ( + "encoding/binary" + "hash" +) + +// Hasher implements the hash.Hash interface +type Hasher struct { + acc [8]u64 + blk u64 + len u64 + key ptr + buf [_block + _stripe]byte + seed u64 +} + +var ( + _ hash.Hash = (*Hasher)(nil) + _ hash.Hash64 = (*Hasher)(nil) +) + +// New returns a new Hasher that implements the hash.Hash interface. +func New() *Hasher { + return new(Hasher) +} + +// NewSeed returns a new Hasher that implements the hash.Hash interface. +func NewSeed(seed uint64) *Hasher { + var h Hasher + h.Reset() + h.seed = seed + h.key = key + + // Only initiate once, not on reset. + if seed != 0 { + h.key = ptr(&[secretSize]byte{}) + initSecret(h.key, seed) + } + return &h +} + +// Reset resets the Hash to its initial state. +func (h *Hasher) Reset() { + h.acc = [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + h.blk = 0 + h.len = 0 +} + +// BlockSize returns the hash's underlying block size. +// The Write method will accept any amount of data, but +// it may operate more efficiently if all writes are a +// multiple of the block size. +func (h *Hasher) BlockSize() int { return _stripe } + +// Size returns the number of bytes Sum will return. +func (h *Hasher) Size() int { return 8 } + +// Sum appends the current hash to b and returns the resulting slice. +// It does not change the underlying hash state. +func (h *Hasher) Sum(b []byte) []byte { + var tmp [8]byte + binary.BigEndian.PutUint64(tmp[:], h.Sum64()) + return append(b, tmp[:]...) +} + +// Write adds more data to the running hash. +// It never returns an error. +func (h *Hasher) Write(buf []byte) (int, error) { + h.update(buf) + return len(buf), nil +} + +// WriteString adds more data to the running hash. +// It never returns an error. +func (h *Hasher) WriteString(buf string) (int, error) { + h.updateString(buf) + return len(buf), nil +} + +func (h *Hasher) update(buf []byte) { + // relies on the data pointer being the first word in the string header + h.updateString(*(*string)(ptr(&buf))) +} + +func (h *Hasher) updateString(buf string) { + if h.key == nil { + h.key = key + h.Reset() + } + + // On first write, if more than 1 block, process without copy. + for h.len == 0 && len(buf) > len(h.buf) { + if hasAVX2 { + accumBlockAVX2(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else { + accumBlockScalar(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } + buf = buf[_block:] + h.blk++ + } + + for len(buf) > 0 { + if h.len < u64(len(h.buf)) { + n := copy(h.buf[h.len:], buf) + h.len += u64(n) + buf = buf[n:] + continue + } + + if hasAVX2 { + accumBlockAVX2(&h.acc, ptr(&h.buf), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, ptr(&h.buf), h.key) + } else { + accumBlockScalar(&h.acc, ptr(&h.buf), h.key) + } + + h.blk++ + h.len = _stripe + copy(h.buf[:_stripe], h.buf[_block:]) + } +} + +// Sum64 returns the 64-bit hash of the written data. +func (h *Hasher) Sum64() uint64 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash(h.buf[:h.len]) + } + return HashSeed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := l * prime64_1 + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. + if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + } else { + secret := h.key + acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + } + + acc = xxh3Avalanche(acc) + + return acc +} + +// Sum128 returns the 128-bit hash of the written data. +func (h *Hasher) Sum128() Uint128 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash128(h.buf[:h.len]) + } + return Hash128Seed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := Uint128{Lo: l * prime64_1, Hi: ^(l * prime64_2)} + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. + if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173) + } else { + secret := h.key + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + } + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc +} diff --git a/vendor/github.com/zeebo/xxh3/utils.go b/vendor/github.com/zeebo/xxh3/utils.go new file mode 100644 index 000000000000..a837e68a6216 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/utils.go @@ -0,0 +1,129 @@ +package xxh3 + +import ( + "math/bits" + "unsafe" +) + +// Uint128 is a 128 bit value. +// The actual value can be thought of as u.Hi<<64 | u.Lo. +type Uint128 struct { + Hi, Lo uint64 +} + +// Bytes returns the uint128 as an array of bytes in canonical form (big-endian encoded). +func (u Uint128) Bytes() [16]byte { + return [16]byte{ + byte(u.Hi >> 0x38), byte(u.Hi >> 0x30), byte(u.Hi >> 0x28), byte(u.Hi >> 0x20), + byte(u.Hi >> 0x18), byte(u.Hi >> 0x10), byte(u.Hi >> 0x08), byte(u.Hi), + byte(u.Lo >> 0x38), byte(u.Lo >> 0x30), byte(u.Lo >> 0x28), byte(u.Lo >> 0x20), + byte(u.Lo >> 0x18), byte(u.Lo >> 0x10), byte(u.Lo >> 0x08), byte(u.Lo), + } +} + +type ( + ptr = unsafe.Pointer + ui = uintptr + + u8 = uint8 + u32 = uint32 + u64 = uint64 + u128 = Uint128 +) + +type str struct { + p ptr + l uint +} + +func readU8(p ptr, o ui) uint8 { + return *(*uint8)(ptr(ui(p) + o)) +} + +func readU16(p ptr, o ui) uint16 { + b := (*[2]byte)(ptr(ui(p) + o)) + return uint16(b[0]) | uint16(b[1])<<8 +} + +func readU32(p ptr, o ui) uint32 { + b := (*[4]byte)(ptr(ui(p) + o)) + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func readU64(p ptr, o ui) uint64 { + b := (*[8]byte)(ptr(ui(p) + o)) + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func writeU64(p ptr, o ui, v u64) { + b := (*[8]byte)(ptr(ui(p) + o)) + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} + +const secretSize = 192 + +func initSecret(secret ptr, seed u64) { + for i := ui(0); i < secretSize/16; i++ { + lo := readU64(key, 16*i) + seed + hi := readU64(key, 16*i+8) - seed + writeU64(secret, 16*i, lo) + writeU64(secret, 16*i+8, hi) + } +} + +func xxh64AvalancheSmall(x u64) u64 { + // x ^= x >> 33 // x must be < 32 bits + // x ^= u64(key32_000 ^ key32_004) // caller must do this + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxhAvalancheSmall(x u64) u64 { + x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh64AvalancheFull(x u64) u64 { + x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh3Avalanche(x u64) u64 { + x ^= x >> 37 + x *= 0x165667919e3779f9 + x ^= x >> 32 + return x +} + +func rrmxmx(h64 u64, len u64) u64 { + h64 ^= bits.RotateLeft64(h64, 49) ^ bits.RotateLeft64(h64, 24) + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 35) + len + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 28) + return h64 +} + +func mulFold64(x, y u64) u64 { + hi, lo := bits.Mul64(x, y) + return hi ^ lo +} diff --git a/vendor/modules.txt b/vendor/modules.txt index e8bb3ac0feae..2ab973d4ddac 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -463,6 +463,9 @@ github.com/klauspost/compress/internal/cpuinfo github.com/klauspost/compress/internal/snapref github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd/internal/xxhash +# github.com/klauspost/cpuid/v2 v2.0.9 +## explicit; go 1.13 +github.com/klauspost/cpuid/v2 # github.com/mailru/easyjson v0.7.6 ## explicit; go 1.12 github.com/mailru/easyjson/buffer @@ -705,6 +708,9 @@ github.com/zclconf/go-cty/cty/function/stdlib github.com/zclconf/go-cty/cty/gocty github.com/zclconf/go-cty/cty/json github.com/zclconf/go-cty/cty/set +# github.com/zeebo/xxh3 v1.0.2 +## explicit; go 1.17 +github.com/zeebo/xxh3 # go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.45.0 ## explicit; go 1.19 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc