diff --git a/build/build.go b/build/build.go index 2ae0fbd8e783..b2c8e03e7d99 100644 --- a/build/build.go +++ b/build/build.go @@ -26,6 +26,7 @@ import ( "github.com/distribution/reference" "github.com/docker/buildx/builder" "github.com/docker/buildx/driver" + "github.com/docker/buildx/internal/metrics" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/dockerutil" "github.com/docker/buildx/util/imagetools" @@ -55,6 +56,9 @@ import ( specs "github.com/opencontainers/image-spec/specs-go/v1" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "github.com/zeebo/xxh3" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" ) @@ -686,6 +690,14 @@ func BuildWithResultHandler(ctx context.Context, nodes []builder.Node, opt map[s return err } + record := metrics.Measure(ctx, "build.duration", "Measures the total build duration.", + metric.WithAttributes( + backendAttribute(dp), + buildIDAttribute(so), + buildRefAttribute(so), + ), + ) + frontendInputs := make(map[string]*pb.Definition) for key, st := range so.FrontendInputs { def, err := st.Marshal(ctx) @@ -785,6 +797,11 @@ func BuildWithResultHandler(ctx context.Context, nodes []builder.Node, opt map[s } else { rr, err = c.Build(ctx, *so, "buildx", buildFunc, ch) } + + record(ctx, metric.WithAttributes( + statusAttribute(err), + )) + if desktop.BuildBackendEnabled() && node.Driver.HistoryAPISupported(ctx) { buildRef := fmt.Sprintf("%s/%s/%s", node.Builder, node.Name, so.Ref) if err != nil { @@ -1523,3 +1540,47 @@ func ReadSourcePolicy() (*spb.Policy, error) { return &pol, nil } + +// backendAttribute is a utility to retrieve the backend attribute from a resolvedNode. +func backendAttribute(dp *resolvedNode) attribute.KeyValue { + driverName := dp.Node().Driver.Factory().Name() + return attribute.String("backend", driverName) +} + +// buildIDAttribute is a utility to retrieve the build id attribute from the solve options. 
+// This value should be consistent between builds: it is the hex-encoded xxh3 hash of the vcs:source, target, context and filename frontend attributes, or empty when none of them is set.
+func buildIDAttribute(so *client.SolveOpt) attribute.KeyValue {
+	vcs := so.FrontendAttrs["vcs:source"]
+	target := so.FrontendAttrs["target"]
+	contextPath := so.FrontendAttrs["context"]
+	filename := so.FrontendAttrs["filename"]
+
+	buildID := ""
+	if vcs != "" || target != "" || contextPath != "" || filename != "" {
+		h := xxh3.New()
+		for _, s := range []string{vcs, target, contextPath, filename} {
+			h.WriteString(s)
+			h.Write([]byte{0})
+		}
+		buildID = hex.EncodeToString(h.Sum(nil))
+	}
+	return attribute.String("build.id", buildID)
+}
+
+// buildRefAttribute is a utility to retrieve the build ref attribute from the solve options.
+// This value should be unique to each build.
+func buildRefAttribute(so *client.SolveOpt) attribute.KeyValue {
+	return attribute.String("build.ref", so.Ref)
+}
+
+// statusAttribute is a utility to retrieve the status attribute from an error.
+func statusAttribute(err error) attribute.KeyValue {
+	status := "completed"
+	if err != nil {
+		status = "error"
+		if errors.Is(err, context.Canceled) {
+			status = "canceled"
+		}
+	}
+	return attribute.String("status", status)
+}
diff --git a/commands/bake.go b/commands/bake.go
index 5cfa397991f4..983396c34ddf 100644
--- a/commands/bake.go
+++ b/commands/bake.go
@@ -13,13 +13,13 @@ import (
 	"github.com/docker/buildx/bake"
 	"github.com/docker/buildx/build"
 	"github.com/docker/buildx/builder"
+	"github.com/docker/buildx/internal/metrics"
 	"github.com/docker/buildx/localstate"
 	"github.com/docker/buildx/util/buildflags"
 	"github.com/docker/buildx/util/cobrautil/completion"
 	"github.com/docker/buildx/util/confutil"
 	"github.com/docker/buildx/util/desktop"
 	"github.com/docker/buildx/util/dockerutil"
-	"github.com/docker/buildx/util/metrics"
 	"github.com/docker/buildx/util/progress"
 	"github.com/docker/buildx/util/tracing"
 	"github.com/docker/cli/cli/command"
@@ -44,16 +44,12 @@ type bakeOptions struct {
 }
 
 func runBake(dockerCli command.Cli, targets []string, in 
bakeOptions, cFlags commonFlags) (err error) { - ctx := appcontext.Context() - - mp, report, err := metrics.MeterProvider(dockerCli) + ctx, report, err := metrics.Initialize(appcontext.Context(), dockerCli) if err != nil { return err } defer report() - recordVersionInfo(mp, "bake") - ctx, end, err := tracing.TraceCurrentCommand(ctx, "bake") if err != nil { return err diff --git a/commands/build.go b/commands/build.go index f290de67ea78..de5ffdf9153f 100644 --- a/commands/build.go +++ b/commands/build.go @@ -23,16 +23,15 @@ import ( "github.com/docker/buildx/controller/control" controllererrors "github.com/docker/buildx/controller/errdefs" controllerapi "github.com/docker/buildx/controller/pb" + "github.com/docker/buildx/internal/metrics" "github.com/docker/buildx/monitor" "github.com/docker/buildx/store" "github.com/docker/buildx/store/storeutil" "github.com/docker/buildx/util/buildflags" "github.com/docker/buildx/util/desktop" "github.com/docker/buildx/util/ioset" - "github.com/docker/buildx/util/metrics" "github.com/docker/buildx/util/progress" "github.com/docker/buildx/util/tracing" - "github.com/docker/buildx/version" "github.com/docker/cli-docs-tool/annotation" "github.com/docker/cli/cli" "github.com/docker/cli/cli/command" @@ -53,9 +52,6 @@ import ( "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/pflag" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/metric" "google.golang.org/grpc/codes" ) @@ -216,16 +212,12 @@ func (o *buildOptions) toDisplayMode() (progressui.DisplayMode, error) { } func runBuild(dockerCli command.Cli, options buildOptions) (err error) { - ctx := appcontext.Context() - - mp, report, err := metrics.MeterProvider(dockerCli) + ctx, report, err := metrics.Initialize(appcontext.Context(), dockerCli) if err != nil { return err } defer report() - recordVersionInfo(mp, "build") - ctx, end, err := tracing.TraceCurrentCommand(ctx, "build") if err != nil { return err @@ -940,30 
+932,3 @@ func maybeJSONArray(v string) []string { } return []string{v} } - -func recordVersionInfo(mp metric.MeterProvider, command string) { - // Still in the process of testing/stabilizing these counters. - if !isExperimental() { - return - } - - meter := mp.Meter("github.com/docker/buildx", - metric.WithInstrumentationVersion(version.Version), - ) - - counter, err := meter.Int64Counter("docker.cli.count", - metric.WithDescription("Number of invocations of the docker buildx command."), - ) - if err != nil { - otel.Handle(err) - } - - counter.Add(context.Background(), 1, - metric.WithAttributes( - attribute.String("command", command), - attribute.String("package", version.Package), - attribute.String("version", version.Version), - attribute.String("revision", version.Revision), - ), - ) -} diff --git a/go.mod b/go.mod index 65f2f1ec777b..5267f2784c9d 100644 --- a/go.mod +++ b/go.mod @@ -38,10 +38,12 @@ require ( github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.4 github.com/zclconf/go-cty v1.14.1 + github.com/zeebo/xxh3 v1.0.2 go.opentelemetry.io/otel v1.19.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.42.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v0.42.0 go.opentelemetry.io/otel/metric v1.19.0 + go.opentelemetry.io/otel/sdk v1.19.0 go.opentelemetry.io/otel/sdk/metric v1.19.0 go.opentelemetry.io/otel/trace v1.19.0 golang.org/x/mod v0.11.0 @@ -107,6 +109,7 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.4 // indirect + github.com/klauspost/cpuid/v2 v2.0.9 // indirect github.com/mailru/easyjson v0.7.6 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mattn/go-shellwords v1.0.12 // indirect @@ -146,7 +149,6 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 // indirect 
go.opentelemetry.io/otel/exporters/prometheus v0.42.0 // indirect - go.opentelemetry.io/otel/sdk v1.19.0 // indirect go.opentelemetry.io/proto/otlp v1.0.0 // indirect golang.org/x/crypto v0.17.0 // indirect golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 // indirect diff --git a/go.sum b/go.sum index 9d232707b805..1b60b2056664 100644 --- a/go.sum +++ b/go.sum @@ -280,6 +280,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4= github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM= +github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= @@ -472,6 +474,10 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/zclconf/go-cty v1.4.0/go.mod h1:nHzOclRkoj++EU9ZjSrZvRG0BXIWt8c7loYc0qXAFGQ= github.com/zclconf/go-cty v1.14.1 h1:t9fyA35fwjjUMcmL5hLER+e/rEPqrbCK1/OSE4SI9KA= github.com/zclconf/go-cty v1.14.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= go.opencensus.io v0.24.0 
h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.45.0 h1:RsQi0qJ2imFfCvZabqzM9cNXBG8k6gXMv1A0cXRmH6A= diff --git a/internal/env/env.go b/internal/env/env.go new file mode 100644 index 000000000000..95fca0c868ba --- /dev/null +++ b/internal/env/env.go @@ -0,0 +1,15 @@ +package env + +import ( + "os" + "strconv" +) + +// IsExperimental checks if the experimental flag has been configured. +func IsExperimental() bool { + if v, ok := os.LookupEnv("BUILDX_EXPERIMENTAL"); ok { + vv, _ := strconv.ParseBool(v) + return vv + } + return false +} diff --git a/util/metrics/metrics.go b/internal/metrics/metrics.go similarity index 62% rename from util/metrics/metrics.go rename to internal/metrics/metrics.go index 36e1cc62c2e9..61d0735992a9 100644 --- a/util/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -4,21 +4,34 @@ import ( "context" "fmt" "net/url" + "os" "path" + "path/filepath" + "sync" "time" + "github.com/docker/buildx/internal/env" + "github.com/docker/buildx/version" "github.com/docker/cli/cli/command" - "github.com/moby/buildkit/util/tracing/detect" + "github.com/google/uuid" "github.com/pkg/errors" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/metric/noop" sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/metric/metricdata" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.21.0" "golang.org/x/sync/errgroup" ) +type contextKey int + const ( + meterProviderKey contextKey = iota + otelConfigFieldName = "otel" shutdownTimeout = 2 * time.Second ) @@ -27,40 +40,61 @@ const ( // desired endpoint. It should be invoked on application shutdown. 
type ReportFunc func() -// MeterProvider returns a MeterProvider suitable for CLI usage. +// Initialize returns a context.Context with a MeterProvider suitable for CLI usage. // The primary difference between this metric reader and a more typical // usage is that metric reporting only happens once when ReportFunc // is invoked. -func MeterProvider(cli command.Cli) (metric.MeterProvider, ReportFunc, error) { +func Initialize(ctx context.Context, cli command.Cli) (context.Context, ReportFunc, error) { var exps []sdkmetric.Exporter - if exp, err := dockerOtelExporter(cli); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) - } + // Only configure metric exporters if the experimental flag is set. + if env.IsExperimental() { + if exp, err := dockerOtelExporter(cli); err != nil { + return nil, nil, err + } else if exp != nil { + exps = append(exps, exp) + } - if exp, err := detectOtlpExporter(context.Background()); err != nil { - return nil, nil, err - } else if exp != nil { - exps = append(exps, exp) + if exp, err := detectOtlpExporter(context.Background()); err != nil { + return nil, nil, err + } else if exp != nil { + exps = append(exps, exp) + } } if len(exps) == 0 { // No exporters are configured so use a noop provider. - return noop.NewMeterProvider(), func() {}, nil + return ctx, func() {}, nil } - // Use delta temporality because, since this is a CLI program, we can never - // know the cumulative value. 
reader := sdkmetric.NewManualReader( sdkmetric.WithTemporalitySelector(deltaTemporality), ) mp := sdkmetric.NewMeterProvider( - sdkmetric.WithResource(detect.Resource()), + sdkmetric.WithResource(Resource()), sdkmetric.WithReader(reader), ) - return mp, reportFunc(reader, exps), nil + return withMeterProvider(ctx, mp), reportFunc(reader, exps), nil +} + +func withMeterProvider(ctx context.Context, mp metric.MeterProvider) context.Context { + return context.WithValue(ctx, meterProviderKey, mp) +} + +func MeterProvider(ctx context.Context) metric.MeterProvider { + mp, ok := ctx.Value(meterProviderKey).(metric.MeterProvider) + if !ok { + mp = noop.NewMeterProvider() + } + return mp +} + +// Meter returns a Meter from the MeterProvider that indicates the measurement +// comes from buildx with the appropriate version. +func Meter(ctx context.Context) metric.Meter { + mp := MeterProvider(ctx) + return mp.Meter(version.Package, + metric.WithInstrumentationVersion(version.Version)) } // reportFunc returns a ReportFunc for collecting ResourceMetrics and then @@ -184,6 +218,49 @@ func otelExporterOtlpEndpoint(cli command.Cli) (string, error) { } // deltaTemporality sets the Temporality of every instrument to delta. +// +// This isn't really needed since we create a unique resource on each invocation, +// but it can help with cardinality concerns for downstream processors since they can +// perform aggregation for a time interval and then discard the data once that time +// period has passed. Cumulative temporality would imply to the downstream processor +// that they might receive a successive point and they may unnecessarily keep state +// they really shouldn't. func deltaTemporality(_ sdkmetric.InstrumentKind) metricdata.Temporality { return metricdata.DeltaTemporality } + +var ( + res *resource.Resource + resOnce sync.Once +) + +// Resource retrieves the OTEL resource for the buildx CLI. 
+func Resource() *resource.Resource { + resOnce.Do(func() { + var err error + res, err = resource.New(context.Background(), + resource.WithDetectors(serviceNameDetector{}), + resource.WithAttributes( + attribute.Stringer("service.instance.id", uuid.New()), + ), + resource.WithFromEnv(), + resource.WithTelemetrySDK(), + ) + if err != nil { + otel.Handle(err) + } + }) + return res +} + +type serviceNameDetector struct{} + +func (serviceNameDetector) Detect(ctx context.Context) (*resource.Resource, error) { + return resource.StringDetector( + semconv.SchemaURL, + semconv.ServiceNameKey, + func() (string, error) { + return filepath.Base(os.Args[0]), nil + }, + ).Detect(ctx) +} diff --git a/util/metrics/otlp.go b/internal/metrics/otlp.go similarity index 87% rename from util/metrics/otlp.go rename to internal/metrics/otlp.go index b121ac3cd7a3..a28cd49992ca 100644 --- a/util/metrics/otlp.go +++ b/internal/metrics/otlp.go @@ -35,13 +35,9 @@ func detectOtlpExporter(ctx context.Context) (sdkmetric.Exporter, error) { switch proto { case "grpc": - return otlpmetricgrpc.New(ctx, - otlpmetricgrpc.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetricgrpc.New(ctx) case "http/protobuf": - return otlpmetrichttp.New(ctx, - otlpmetrichttp.WithTemporalitySelector(deltaTemporality), - ) + return otlpmetrichttp.New(ctx) // case "http/json": // unsupported by library default: return nil, errors.Errorf("unsupported otlp protocol %v", proto) diff --git a/internal/metrics/util.go b/internal/metrics/util.go new file mode 100644 index 000000000000..25f3f64004dd --- /dev/null +++ b/internal/metrics/util.go @@ -0,0 +1,33 @@ +package metrics + +import ( + "context" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/metric" +) + +const TimeUnit string = "ms" + +// RecordFunc is used to record the measurement from Measure. 
+type RecordFunc func(ctx context.Context, opts ...metric.RecordOption)
+
+// Measure starts a timer and returns a RecordFunc that records the elapsed milliseconds to the histogram name, tagged with opts plus any options given at record time.
+func Measure(ctx context.Context, name, desc string, opts ...metric.RecordOption) RecordFunc {
+	histogram, err := Meter(ctx).Int64Histogram(name,
+		metric.WithDescription(desc),
+		metric.WithUnit(TimeUnit))
+	if err != nil {
+		otel.Handle(err)
+	}
+
+	start := time.Now()
+	return func(ctx context.Context, newOpts ...metric.RecordOption) {
+		// Merge into a fresh slice (capacity-capped): appending to the captured opts
+		// would pile up options on repeated calls and could write into the caller's array.
+		merged := append(opts[:len(opts):len(opts)], newOpts...)
+		dur := int64(time.Since(start) / time.Millisecond)
+		histogram.Record(ctx, dur, merged...)
+	}
+}
diff --git a/vendor/github.com/klauspost/cpuid/v2/.gitignore b/vendor/github.com/klauspost/cpuid/v2/.gitignore
new file mode 100644
index 000000000000..daf913b1b347
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/.gitignore
@@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
diff --git a/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml
new file mode 100644
index 000000000000..944cc0007504
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml
@@ -0,0 +1,74 @@
+# This is an example goreleaser.yaml file with some sane defaults. 
+# Make sure to check the documentation at http://goreleaser.com + +builds: + - + id: "cpuid" + binary: cpuid + main: ./cmd/cpuid/main.go + env: + - CGO_ENABLED=0 + flags: + - -ldflags=-s -w + goos: + - aix + - linux + - freebsd + - netbsd + - windows + - darwin + goarch: + - 386 + - amd64 + - arm64 + goarm: + - 7 + +archives: + - + id: cpuid + name_template: "cpuid-{{ .Os }}_{{ .Arch }}_{{ .Version }}" + replacements: + aix: AIX + darwin: OSX + linux: Linux + windows: Windows + 386: i386 + amd64: x86_64 + freebsd: FreeBSD + netbsd: NetBSD + format_overrides: + - goos: windows + format: zip + files: + - LICENSE +checksum: + name_template: 'checksums.txt' +snapshot: + name_template: "{{ .Tag }}-next" +changelog: + sort: asc + filters: + exclude: + - '^doc:' + - '^docs:' + - '^test:' + - '^tests:' + - '^Update\sREADME.md' + +nfpms: + - + file_name_template: "cpuid_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}" + vendor: Klaus Post + homepage: https://github.com/klauspost/cpuid + maintainer: Klaus Post + description: CPUID Tool + license: BSD 3-Clause + formats: + - deb + - rpm + replacements: + darwin: Darwin + linux: Linux + freebsd: FreeBSD + amd64: x86_64 diff --git a/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt new file mode 100644 index 000000000000..2ef4714f7165 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt @@ -0,0 +1,35 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2015- Klaus Post & Contributors. +Email: klauspost@gmail.com + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. 
+ + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/vendor/github.com/klauspost/cpuid/v2/LICENSE b/vendor/github.com/klauspost/cpuid/v2/LICENSE new file mode 100644 index 000000000000..5cec7ee949b1 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md new file mode 100644 index 000000000000..465f4b77cb77 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/README.md @@ -0,0 +1,137 @@ +# cpuid +Package cpuid provides information about the CPU running the current program. + +CPU features are detected on startup, and kept for fast access through the life of the application. +Currently x86 / x64 (AMD64/i386) and ARM (ARM64) is supported, and no external C (cgo) code is used, which should make the library very easy to use. + +You can access the CPU information by accessing the shared CPU variable of the cpuid library. + +Package home: https://github.com/klauspost/cpuid + +[![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2) +[![Build Status][3]][4] + +[3]: https://travis-ci.org/klauspost/cpuid.svg?branch=master +[4]: https://travis-ci.org/klauspost/cpuid + +## installing + +`go get -u github.com/klauspost/cpuid/v2` using modules. + +Drop `v2` for others. + +## example + +```Go +package main + +import ( + "fmt" + "strings" + + . 
"github.com/klauspost/cpuid/v2" +) + +func main() { + // Print basic CPU information: + fmt.Println("Name:", CPU.BrandName) + fmt.Println("PhysicalCores:", CPU.PhysicalCores) + fmt.Println("ThreadsPerCore:", CPU.ThreadsPerCore) + fmt.Println("LogicalCores:", CPU.LogicalCores) + fmt.Println("Family", CPU.Family, "Model:", CPU.Model, "Vendor ID:", CPU.VendorID) + fmt.Println("Features:", fmt.Sprintf(strings.Join(CPU.FeatureSet(), ","))) + fmt.Println("Cacheline bytes:", CPU.CacheLine) + fmt.Println("L1 Data Cache:", CPU.Cache.L1D, "bytes") + fmt.Println("L1 Instruction Cache:", CPU.Cache.L1D, "bytes") + fmt.Println("L2 Cache:", CPU.Cache.L2, "bytes") + fmt.Println("L3 Cache:", CPU.Cache.L3, "bytes") + fmt.Println("Frequency", CPU.Hz, "hz") + + // Test if we have these specific features: + if CPU.Supports(SSE, SSE2) { + fmt.Println("We have Streaming SIMD 2 Extensions") + } +} +``` + +Sample output: +``` +>go run main.go +Name: AMD Ryzen 9 3950X 16-Core Processor +PhysicalCores: 16 +ThreadsPerCore: 2 +LogicalCores: 32 +Family 23 Model: 113 Vendor ID: AMD +Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CMOV,CX16,F16C,FMA3,HTT,HYPERVISOR,LZCNT,MMX,MMXEXT,NX,POPCNT,RDRAND,RDSEED,RDTSCP,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3 +Cacheline bytes: 64 +L1 Data Cache: 32768 bytes +L1 Instruction Cache: 32768 bytes +L2 Cache: 524288 bytes +L3 Cache: 16777216 bytes +Frequency 0 hz +We have Streaming SIMD 2 Extensions +``` + +# usage + +The `cpuid.CPU` provides access to CPU features. Use `cpuid.CPU.Supports()` to check for CPU features. +A faster `cpuid.CPU.Has()` is provided which will usually be inlined by the gc compiler. + +Note that for some cpu/os combinations some features will not be detected. +`amd64` has rather good support and should work reliably on all platforms. + +Note that hypervisors may not pass through all CPU features. + +## arm64 feature detection + +Not all operating systems provide ARM features directly +and there is no safe way to do so for the rest. 
+ +Currently `arm64/linux` and `arm64/freebsd` should be quite reliable. +`arm64/darwin` adds features expected from the M1 processor, but a lot remains undetected. + +A `DetectARM()` can be used if you are able to control your deployment, +it will detect CPU features, but may crash if the OS doesn't intercept the calls. +A `-cpu.arm` flag for detecting unsafe ARM features can be added. See below. + +Note that currently only features are detected on ARM, +no additional information is currently available. + +## flags + +It is possible to add flags that affects cpu detection. + +For this the `Flags()` command is provided. + +This must be called *before* `flag.Parse()` AND after the flags have been parsed `Detect()` must be called. + +This means that any detection used in `init()` functions will not contain these flags. + +Example: + +```Go +package main + +import ( + "flag" + "fmt" + "strings" + + "github.com/klauspost/cpuid/v2" +) + +func main() { + cpuid.Flags() + flag.Parse() + cpuid.Detect() + + // Test if we have these specific features: + if cpuid.CPU.Supports(cpuid.SSE, cpuid.SSE2) { + fmt.Println("We have Streaming SIMD 2 Extensions") + } +} +``` + +# license + +This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go new file mode 100644 index 000000000000..1d88736b68a2 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go @@ -0,0 +1,1070 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// Package cpuid provides information about the CPU running the current program. +// +// CPU features are detected on startup, and kept for fast access through the life of the application. +// Currently x86 / x64 (AMD64) as well as arm64 is supported. +// +// You can access the CPU information by accessing the shared CPU variable of the cpuid library. 
+// +// Package home: https://github.com/klauspost/cpuid +package cpuid + +import ( + "flag" + "fmt" + "math" + "os" + "runtime" + "strings" +) + +// AMD refererence: https://www.amd.com/system/files/TechDocs/25481.pdf +// and Processor Programming Reference (PPR) + +// Vendor is a representation of a CPU vendor. +type Vendor int + +const ( + VendorUnknown Vendor = iota + Intel + AMD + VIA + Transmeta + NSC + KVM // Kernel-based Virtual Machine + MSVM // Microsoft Hyper-V or Windows Virtual PC + VMware + XenHVM + Bhyve + Hygon + SiS + RDC + + Ampere + ARM + Broadcom + Cavium + DEC + Fujitsu + Infineon + Motorola + NVIDIA + AMCC + Qualcomm + Marvell + + lastVendor +) + +//go:generate stringer -type=FeatureID,Vendor + +// FeatureID is the ID of a specific cpu feature. +type FeatureID int + +const ( + // Keep index -1 as unknown + UNKNOWN = -1 + + // Add features + ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + AESNI // Advanced Encryption Standard New Instructions + AMD3DNOW // AMD 3DNOW + AMD3DNOWEXT // AMD 3DNowExt + AMXBF16 // Tile computational operations on BFLOAT16 numbers + AMXINT8 // Tile computational operations on 8-bit integers + AMXTILE // Tile architecture + AVX // AVX functions + AVX2 // AVX2 functions + AVX512BF16 // AVX-512 BFLOAT16 Instructions + AVX512BITALG // AVX-512 Bit Algorithms + AVX512BW // AVX-512 Byte and Word Instructions + AVX512CD // AVX-512 Conflict Detection Instructions + AVX512DQ // AVX-512 Doubleword and Quadword Instructions + AVX512ER // AVX-512 Exponential and Reciprocal Instructions + AVX512F // AVX-512 Foundation + AVX512FP16 // AVX-512 FP16 Instructions + AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF // AVX-512 Prefetch Instructions + AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions + AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 + AVX512VL // AVX-512 Vector Length Extensions + AVX512VNNI // AVX-512 Vector Neural Network 
Instructions + AVX512VP2INTERSECT // AVX-512 Intersect for D/Q + AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword + AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one. + BMI1 // Bit Manipulation Instruction Set 1 + BMI2 // Bit Manipulation Instruction Set 2 + CLDEMOTE // Cache Line Demote + CLMUL // Carry-less Multiplication + CLZERO // CLZERO instruction supported + CMOV // i686 CMOV + CPBOOST // Core Performance Boost + CX16 // CMPXCHG16B Instruction + ENQCMD // Enqueue Command + ERMS // Enhanced REP MOVSB/STOSB + F16C // Half-precision floating-point conversion + FMA3 // Intel FMA 3. Does not imply AVX. + FMA4 // Bulldozer FMA4 functions + GFNI // Galois Field New Instructions + HLE // Hardware Lock Elision + HTT // Hyperthreading (enabled) + HWA // Hardware assert supported. Indicates support for MSRC001_10 + HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors + IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) + IBS // Instruction Based Sampling (AMD) + IBSBRNTRGT // Instruction Based Sampling Feature (AMD) + IBSFETCHSAM // Instruction Based Sampling Feature (AMD) + IBSFFV // Instruction Based Sampling Feature (AMD) + IBSOPCNT // Instruction Based Sampling Feature (AMD) + IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) + IBSOPSAM // Instruction Based Sampling Feature (AMD) + IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) + IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) + INT_WBINVD // WBINVD/WBNOINVD are interruptible. + INVLPGB // NVLPGB and TLBSYNC instruction supported + LZCNT // LZCNT instruction + MCAOVERFLOW // MCA overflow recovery support. 
+ MCOMMIT // MCOMMIT instruction supported + MMX // standard MMX + MMXEXT // SSE integer functions or AMD MMX ext + MOVDIR64B // Move 64 Bytes as Direct Store + MOVDIRI // Move Doubleword as Direct Store + MPX // Intel MPX (Memory Protection Extensions) + MSRIRC // Instruction Retired Counter MSR available + NX // NX (No-Execute) bit + POPCNT // POPCNT instruction + RDPRU // RDPRU instruction supported + RDRAND // RDRAND instruction is available + RDSEED // RDSEED instruction is available + RDTSCP // RDTSCP Instruction + RTM // Restricted Transactional Memory + RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. + SERIALIZE // Serialize Instruction Execution + SGX // Software Guard Extensions + SGXLC // Software Guard Extensions Launch Control + SHA // Intel SHA Extensions + SSE // SSE functions + SSE2 // P4 SSE functions + SSE3 // Prescott SSE3 functions + SSE4 // Penryn SSE4.1 functions + SSE42 // Nehalem SSE4.2 functions + SSE4A // AMD Barcelona microarchitecture SSE4a instructions + SSSE3 // Conroe SSSE3 functions + STIBP // Single Thread Indirect Branch Predictors + SUCCOR // Software uncorrectable error containment and recovery capability. 
+ TBM // AMD Trailing Bit Manipulation + TSXLDTRK // Intel TSX Suspend Load Address Tracking + VAES // Vector AES + VMX // Virtual Machine Extensions + VPCLMULQDQ // Carry-Less Multiplication Quadword + WAITPKG // TPAUSE, UMONITOR, UMWAIT + WBNOINVD // Write Back and Do Not Invalidate Cache + XOP // Bulldozer XOP functions + + // ARM features: + AESARM // AES instructions + ARMCPUID // Some CPU ID registers readable at user-level + ASIMD // Advanced SIMD + ASIMDDP // SIMD Dot Product + ASIMDHP // Advanced SIMD half-precision floating point + ASIMDRDM // Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH) + ATOMICS // Large System Extensions (LSE) + CRC32 // CRC32/CRC32C instructions + DCPOP // Data cache clean to Point of Persistence (DC CVAP) + EVTSTRM // Generic timer + FCMA // Floatin point complex number addition and multiplication + FP // Single-precision and double-precision floating point + FPHP // Half-precision floating point + GPA // Generic Pointer Authentication + JSCVT // Javascript-style double->int convert (FJCVTZS) + LRCPC // Weaker release consistency (LDAPR, etc) + PMULL // Polynomial Multiply instructions (PMULL/PMULL2) + SHA1 // SHA-1 instructions (SHA1C, etc) + SHA2 // SHA-2 instructions (SHA256H, etc) + SHA3 // SHA-3 instructions (EOR3, RAXI, XAR, BCAX) + SHA512 // SHA512 instructions + SM3 // SM3 instructions + SM4 // SM4 instructions + SVE // Scalable Vector Extension + + // Keep it last. It automatically defines the size of []flagSet + lastID + + firstID FeatureID = UNKNOWN + 1 +) + +// CPUInfo contains information about the detected system CPU. +type CPUInfo struct { + BrandName string // Brand name reported by the CPU + VendorID Vendor // Comparable CPU vendor ID + VendorString string // Raw vendor string. + featureSet flagSet // Features of the CPU + PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. + ThreadsPerCore int // Number of threads per physical core. 
Will be 1 if undetectable. + LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. + Family int // CPU family number + Model int // CPU model number + CacheLine int // Cache line size in bytes. Will be 0 if undetectable. + Hz int64 // Clock speed, if known, 0 otherwise. Will attempt to contain base clock speed. + BoostFreq int64 // Max clock speed, if known, 0 otherwise + Cache struct { + L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected + L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected + L2 int // L2 Cache (per core or shared). Will be -1 if undetected + L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected + } + SGX SGXSupport + maxFunc uint32 + maxExFunc uint32 +} + +var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) +var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) +var xgetbv func(index uint32) (eax, edx uint32) +var rdtscpAsm func() (eax, ebx, ecx, edx uint32) +var darwinHasAVX512 = func() bool { return false } + +// CPU contains information about the CPU as detected on startup, +// or when Detect last was called. +// +// Use this as the primary entry point to you data. +var CPU CPUInfo + +func init() { + initCPU() + Detect() +} + +// Detect will re-detect current CPU info. +// This will replace the content of the exported CPU variable. +// +// Unless you expect the CPU to change while you are running your program +// you should not need to call this function. +// If you call this, you must ensure that no other goroutine is accessing the +// exported CPU variable. 
+func Detect() { + // Set defaults + CPU.ThreadsPerCore = 1 + CPU.Cache.L1I = -1 + CPU.Cache.L1D = -1 + CPU.Cache.L2 = -1 + CPU.Cache.L3 = -1 + safe := true + if detectArmFlag != nil { + safe = !*detectArmFlag + } + addInfo(&CPU, safe) + if displayFeats != nil && *displayFeats { + fmt.Println("cpu features:", strings.Join(CPU.FeatureSet(), ",")) + // Exit with non-zero so tests will print value. + os.Exit(1) + } + if disableFlag != nil { + s := strings.Split(*disableFlag, ",") + for _, feat := range s { + feat := ParseFeature(strings.TrimSpace(feat)) + if feat != UNKNOWN { + CPU.featureSet.unset(feat) + } + } + } +} + +// DetectARM will detect ARM64 features. +// This is NOT done automatically since it can potentially crash +// if the OS does not handle the command. +// If in the future this can be done safely this function may not +// do anything. +func DetectARM() { + addInfo(&CPU, false) +} + +var detectArmFlag *bool +var displayFeats *bool +var disableFlag *string + +// Flags will enable flags. +// This must be called *before* flag.Parse AND +// Detect must be called after the flags have been parsed. +// Note that this means that any detection used in init() functions +// will not contain these flags. +func Flags() { + disableFlag = flag.String("cpu.disable", "", "disable cpu features; comma separated list") + displayFeats = flag.Bool("cpu.features", false, "lists cpu features and exits") + detectArmFlag = flag.Bool("cpu.arm", false, "allow ARM features to be detected; can potentially crash") +} + +// Supports returns whether the CPU supports all of the requested features. +func (c CPUInfo) Supports(ids ...FeatureID) bool { + for _, id := range ids { + if !c.featureSet.inSet(id) { + return false + } + } + return true +} + +// Has allows for checking a single feature. +// Should be inlined by the compiler. +func (c CPUInfo) Has(id FeatureID) bool { + return c.featureSet.inSet(id) +} + +// Disable will disable one or several features. 
+func (c *CPUInfo) Disable(ids ...FeatureID) bool { + for _, id := range ids { + c.featureSet.unset(id) + } + return true +} + +// Enable will disable one or several features even if they were undetected. +// This is of course not recommended for obvious reasons. +func (c *CPUInfo) Enable(ids ...FeatureID) bool { + for _, id := range ids { + c.featureSet.set(id) + } + return true +} + +// IsVendor returns true if vendor is recognized as Intel +func (c CPUInfo) IsVendor(v Vendor) bool { + return c.VendorID == v +} + +func (c CPUInfo) FeatureSet() []string { + s := make([]string, 0) + for _, f := range c.featureSet.Strings() { + s = append(s, f) + } + return s +} + +// RTCounter returns the 64-bit time-stamp counter +// Uses the RDTSCP instruction. The value 0 is returned +// if the CPU does not support the instruction. +func (c CPUInfo) RTCounter() uint64 { + if !c.Supports(RDTSCP) { + return 0 + } + a, _, _, d := rdtscpAsm() + return uint64(a) | (uint64(d) << 32) +} + +// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. +// This variable is OS dependent, but on Linux contains information +// about the current cpu/core the code is running on. +// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. +func (c CPUInfo) Ia32TscAux() uint32 { + if !c.Supports(RDTSCP) { + return 0 + } + _, _, ecx, _ := rdtscpAsm() + return ecx +} + +// LogicalCPU will return the Logical CPU the code is currently executing on. +// This is likely to change when the OS re-schedules the running thread +// to another CPU. +// If the current core cannot be detected, -1 will be returned. +func (c CPUInfo) LogicalCPU() int { + if c.maxFunc < 1 { + return -1 + } + _, ebx, _, _ := cpuid(1) + return int(ebx >> 24) +} + +// frequencies tries to compute the clock speed of the CPU. If leaf 15 is +// supported, use it, otherwise parse the brand string. Yes, really. 
+func (c *CPUInfo) frequencies() { + c.Hz, c.BoostFreq = 0, 0 + mfi := maxFunctionID() + if mfi >= 0x15 { + eax, ebx, ecx, _ := cpuid(0x15) + if eax != 0 && ebx != 0 && ecx != 0 { + c.Hz = (int64(ecx) * int64(ebx)) / int64(eax) + } + } + if mfi >= 0x16 { + a, b, _, _ := cpuid(0x16) + // Base... + if a&0xffff > 0 { + c.Hz = int64(a&0xffff) * 1_000_000 + } + // Boost... + if b&0xffff > 0 { + c.BoostFreq = int64(b&0xffff) * 1_000_000 + } + } + if c.Hz > 0 { + return + } + + // computeHz determines the official rated speed of a CPU from its brand + // string. This insanity is *actually the official documented way to do + // this according to Intel*, prior to leaf 0x15 existing. The official + // documentation only shows this working for exactly `x.xx` or `xxxx` + // cases, e.g., `2.50GHz` or `1300MHz`; this parser will accept other + // sizes. + model := c.BrandName + hz := strings.LastIndex(model, "Hz") + if hz < 3 { + return + } + var multiplier int64 + switch model[hz-1] { + case 'M': + multiplier = 1000 * 1000 + case 'G': + multiplier = 1000 * 1000 * 1000 + case 'T': + multiplier = 1000 * 1000 * 1000 * 1000 + } + if multiplier == 0 { + return + } + freq := int64(0) + divisor := int64(0) + decimalShift := int64(1) + var i int + for i = hz - 2; i >= 0 && model[i] != ' '; i-- { + if model[i] >= '0' && model[i] <= '9' { + freq += int64(model[i]-'0') * decimalShift + decimalShift *= 10 + } else if model[i] == '.' { + if divisor != 0 { + return + } + divisor = decimalShift + } else { + return + } + } + // we didn't find a space + if i < 0 { + return + } + if divisor != 0 { + c.Hz = (freq * multiplier) / divisor + return + } + c.Hz = freq * multiplier +} + +// VM Will return true if the cpu id indicates we are in +// a virtual machine. 
+func (c CPUInfo) VM() bool { + return CPU.featureSet.inSet(HYPERVISOR) +} + +// flags contains detected cpu features and characteristics +type flags uint64 + +// log2(bits_in_uint64) +const flagBitsLog2 = 6 +const flagBits = 1 << flagBitsLog2 +const flagMask = flagBits - 1 + +// flagSet contains detected cpu features and characteristics in an array of flags +type flagSet [(lastID + flagMask) / flagBits]flags + +func (s flagSet) inSet(feat FeatureID) bool { + return s[feat>>flagBitsLog2]&(1<<(feat&flagMask)) != 0 +} + +func (s *flagSet) set(feat FeatureID) { + s[feat>>flagBitsLog2] |= 1 << (feat & flagMask) +} + +// setIf will set a feature if boolean is true. +func (s *flagSet) setIf(cond bool, features ...FeatureID) { + if cond { + for _, offset := range features { + s[offset>>flagBitsLog2] |= 1 << (offset & flagMask) + } + } +} + +func (s *flagSet) unset(offset FeatureID) { + bit := flags(1 << (offset & flagMask)) + s[offset>>flagBitsLog2] = s[offset>>flagBitsLog2] & ^bit +} + +// or with another flagset. +func (s *flagSet) or(other flagSet) { + for i, v := range other[:] { + s[i] |= v + } +} + +// ParseFeature will parse the string and return the ID of the matching feature. +// Will return UNKNOWN if not found. +func ParseFeature(s string) FeatureID { + s = strings.ToUpper(s) + for i := firstID; i < lastID; i++ { + if i.String() == s { + return i + } + } + return UNKNOWN +} + +// Strings returns an array of the detected features for FlagsSet. 
+func (s flagSet) Strings() []string { + if len(s) == 0 { + return []string{""} + } + r := make([]string, 0) + for i := firstID; i < lastID; i++ { + if s.inSet(i) { + r = append(r, i.String()) + } + } + return r +} + +func maxExtendedFunction() uint32 { + eax, _, _, _ := cpuid(0x80000000) + return eax +} + +func maxFunctionID() uint32 { + a, _, _, _ := cpuid(0) + return a +} + +func brandName() string { + if maxExtendedFunction() >= 0x80000004 { + v := make([]uint32, 0, 48) + for i := uint32(0); i < 3; i++ { + a, b, c, d := cpuid(0x80000002 + i) + v = append(v, a, b, c, d) + } + return strings.Trim(string(valAsString(v...)), " ") + } + return "unknown" +} + +func threadsPerCore() int { + mfi := maxFunctionID() + vend, _ := vendorID() + + if mfi < 0x4 || (vend != Intel && vend != AMD) { + return 1 + } + + if mfi < 0xb { + if vend != Intel { + return 1 + } + _, b, _, d := cpuid(1) + if (d & (1 << 28)) != 0 { + // v will contain logical core count + v := (b >> 16) & 255 + if v > 1 { + a4, _, _, _ := cpuid(4) + // physical cores + v2 := (a4 >> 26) + 1 + if v2 > 0 { + return int(v) / int(v2) + } + } + } + return 1 + } + _, b, _, _ := cpuidex(0xb, 0) + if b&0xffff == 0 { + if vend == AMD { + // Workaround for AMD returning 0, assume 2 if >= Zen 2 + // It will be more correct than not. + fam, _ := familyModel() + _, _, _, d := cpuid(1) + if (d&(1<<28)) != 0 && fam >= 23 { + return 2 + } + } + return 1 + } + return int(b & 0xffff) +} + +func logicalCores() int { + mfi := maxFunctionID() + v, _ := vendorID() + switch v { + case Intel: + // Use this on old Intel processors + if mfi < 0xb { + if mfi < 1 { + return 0 + } + // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) + // that can be assigned to logical processors in a physical package. + // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. 
+ _, ebx, _, _ := cpuid(1) + logical := (ebx >> 16) & 0xff + return int(logical) + } + _, b, _, _ := cpuidex(0xb, 1) + return int(b & 0xffff) + case AMD, Hygon: + _, b, _, _ := cpuid(1) + return int((b >> 16) & 0xff) + default: + return 0 + } +} + +func familyModel() (int, int) { + if maxFunctionID() < 0x1 { + return 0, 0 + } + eax, _, _, _ := cpuid(1) + family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) + model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) + return int(family), int(model) +} + +func physicalCores() int { + v, _ := vendorID() + switch v { + case Intel: + return logicalCores() / threadsPerCore() + case AMD, Hygon: + lc := logicalCores() + tpc := threadsPerCore() + if lc > 0 && tpc > 0 { + return lc / tpc + } + + // The following is inaccurate on AMD EPYC 7742 64-Core Processor + if maxExtendedFunction() >= 0x80000008 { + _, _, c, _ := cpuid(0x80000008) + if c&0xff > 0 { + return int(c&0xff) + 1 + } + } + } + return 0 +} + +// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID +var vendorMapping = map[string]Vendor{ + "AMDisbetter!": AMD, + "AuthenticAMD": AMD, + "CentaurHauls": VIA, + "GenuineIntel": Intel, + "TransmetaCPU": Transmeta, + "GenuineTMx86": Transmeta, + "Geode by NSC": NSC, + "VIA VIA VIA ": VIA, + "KVMKVMKVMKVM": KVM, + "Microsoft Hv": MSVM, + "VMwareVMware": VMware, + "XenVMMXenVMM": XenHVM, + "bhyve bhyve ": Bhyve, + "HygonGenuine": Hygon, + "Vortex86 SoC": SiS, + "SiS SiS SiS ": SiS, + "RiseRiseRise": SiS, + "Genuine RDC": RDC, +} + +func vendorID() (Vendor, string) { + _, b, c, d := cpuid(0) + v := string(valAsString(b, d, c)) + vend, ok := vendorMapping[v] + if !ok { + return VendorUnknown, v + } + return vend, v +} + +func cacheLine() int { + if maxFunctionID() < 0x1 { + return 0 + } + + _, ebx, _, _ := cpuid(1) + cache := (ebx & 0xff00) >> 5 // cflush size + if cache == 0 && maxExtendedFunction() >= 0x80000006 { + _, _, ecx, _ := cpuid(0x80000006) + cache = ecx & 0xff // cacheline size + } + // TODO: Read from 
Cache and TLB Information + return int(cache) +} + +func (c *CPUInfo) cacheSize() { + c.Cache.L1D = -1 + c.Cache.L1I = -1 + c.Cache.L2 = -1 + c.Cache.L3 = -1 + vendor, _ := vendorID() + switch vendor { + case Intel: + if maxFunctionID() < 4 { + return + } + for i := uint32(0); ; i++ { + eax, ebx, ecx, _ := cpuidex(4, i) + cacheType := eax & 15 + if cacheType == 0 { + break + } + cacheLevel := (eax >> 5) & 7 + coherency := int(ebx&0xfff) + 1 + partitions := int((ebx>>12)&0x3ff) + 1 + associativity := int((ebx>>22)&0x3ff) + 1 + sets := int(ecx) + 1 + size := associativity * partitions * coherency * sets + switch cacheLevel { + case 1: + if cacheType == 1 { + // 1 = Data Cache + c.Cache.L1D = size + } else if cacheType == 2 { + // 2 = Instruction Cache + c.Cache.L1I = size + } else { + if c.Cache.L1D < 0 { + c.Cache.L1I = size + } + if c.Cache.L1I < 0 { + c.Cache.L1I = size + } + } + case 2: + c.Cache.L2 = size + case 3: + c.Cache.L3 = size + } + } + case AMD, Hygon: + // Untested. + if maxExtendedFunction() < 0x80000005 { + return + } + _, _, ecx, edx := cpuid(0x80000005) + c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) + c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) + + if maxExtendedFunction() < 0x80000006 { + return + } + _, _, ecx, _ = cpuid(0x80000006) + c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) + + // CPUID Fn8000_001D_EAX_x[N:0] Cache Properties + if maxExtendedFunction() < 0x8000001D { + return + } + for i := uint32(0); i < math.MaxUint32; i++ { + eax, ebx, ecx, _ := cpuidex(0x8000001D, i) + + level := (eax >> 5) & 7 + cacheNumSets := ecx + 1 + cacheLineSize := 1 + (ebx & 2047) + cachePhysPartitions := 1 + ((ebx >> 12) & 511) + cacheNumWays := 1 + ((ebx >> 22) & 511) + + typ := eax & 15 + size := int(cacheNumSets * cacheLineSize * cachePhysPartitions * cacheNumWays) + if typ == 0 { + return + } + + switch level { + case 1: + switch typ { + case 1: + // Data cache + c.Cache.L1D = size + case 2: + // Inst cache + c.Cache.L1I = size + default: + if 
c.Cache.L1D < 0 { + c.Cache.L1I = size + } + if c.Cache.L1I < 0 { + c.Cache.L1I = size + } + } + case 2: + c.Cache.L2 = size + case 3: + c.Cache.L3 = size + } + } + } + + return +} + +type SGXEPCSection struct { + BaseAddress uint64 + EPCSize uint64 +} + +type SGXSupport struct { + Available bool + LaunchControl bool + SGX1Supported bool + SGX2Supported bool + MaxEnclaveSizeNot64 int64 + MaxEnclaveSize64 int64 + EPCSections []SGXEPCSection +} + +func hasSGX(available, lc bool) (rval SGXSupport) { + rval.Available = available + + if !available { + return + } + + rval.LaunchControl = lc + + a, _, _, d := cpuidex(0x12, 0) + rval.SGX1Supported = a&0x01 != 0 + rval.SGX2Supported = a&0x02 != 0 + rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 + rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 + rval.EPCSections = make([]SGXEPCSection, 0) + + for subleaf := uint32(2); subleaf < 2+8; subleaf++ { + eax, ebx, ecx, edx := cpuidex(0x12, subleaf) + leafType := eax & 0xf + + if leafType == 0 { + // Invalid subleaf, stop iterating + break + } else if leafType == 1 { + // EPC Section subleaf + baseAddress := uint64(eax&0xfffff000) + (uint64(ebx&0x000fffff) << 32) + size := uint64(ecx&0xfffff000) + (uint64(edx&0x000fffff) << 32) + + section := SGXEPCSection{BaseAddress: baseAddress, EPCSize: size} + rval.EPCSections = append(rval.EPCSections, section) + } + } + + return +} + +func support() flagSet { + var fs flagSet + mfi := maxFunctionID() + vend, _ := vendorID() + if mfi < 0x1 { + return fs + } + family, model := familyModel() + + _, _, c, d := cpuid(1) + fs.setIf((d&(1<<15)) != 0, CMOV) + fs.setIf((d&(1<<23)) != 0, MMX) + fs.setIf((d&(1<<25)) != 0, MMXEXT) + fs.setIf((d&(1<<25)) != 0, SSE) + fs.setIf((d&(1<<26)) != 0, SSE2) + fs.setIf((c&1) != 0, SSE3) + fs.setIf((c&(1<<5)) != 0, VMX) + fs.setIf((c&0x00000200) != 0, SSSE3) + fs.setIf((c&0x00080000) != 0, SSE4) + fs.setIf((c&0x00100000) != 0, SSE42) + fs.setIf((c&(1<<25)) != 0, AESNI) + fs.setIf((c&(1<<1)) != 0, 
CLMUL) + fs.setIf(c&(1<<23) != 0, POPCNT) + fs.setIf(c&(1<<30) != 0, RDRAND) + + // This bit has been reserved by Intel & AMD for use by hypervisors, + // and indicates the presence of a hypervisor. + fs.setIf(c&(1<<31) != 0, HYPERVISOR) + fs.setIf(c&(1<<29) != 0, F16C) + fs.setIf(c&(1<<13) != 0, CX16) + + if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { + fs.setIf(threadsPerCore() > 1, HTT) + } + if vend == AMD && (d&(1<<28)) != 0 && mfi >= 4 { + fs.setIf(threadsPerCore() > 1, HTT) + } + // Check XGETBV/XSAVE (26), OXSAVE (27) and AVX (28) bits + const avxCheck = 1<<26 | 1<<27 | 1<<28 + if c&avxCheck == avxCheck { + // Check for OS support + eax, _ := xgetbv(0) + if (eax & 0x6) == 0x6 { + fs.set(AVX) + switch vend { + case Intel: + // Older than Haswell. + fs.setIf(family == 6 && model < 60, AVXSLOW) + case AMD: + // Older than Zen 2 + fs.setIf(family < 23 || (family == 23 && model < 49), AVXSLOW) + } + } + } + // FMA3 can be used with SSE registers, so no OS support is strictly needed. + // fma3 and OSXSAVE needed. + const fma3Check = 1<<12 | 1<<27 + fs.setIf(c&fma3Check == fma3Check, FMA3) + + // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. 
+ if mfi >= 7 { + _, ebx, ecx, edx := cpuidex(7, 0) + eax1, _, _, _ := cpuidex(7, 1) + if fs.inSet(AVX) && (ebx&0x00000020) != 0 { + fs.set(AVX2) + } + // CPUID.(EAX=7, ECX=0).EBX + if (ebx & 0x00000008) != 0 { + fs.set(BMI1) + fs.setIf((ebx&0x00000100) != 0, BMI2) + } + fs.setIf(ebx&(1<<2) != 0, SGX) + fs.setIf(ebx&(1<<4) != 0, HLE) + fs.setIf(ebx&(1<<9) != 0, ERMS) + fs.setIf(ebx&(1<<11) != 0, RTM) + fs.setIf(ebx&(1<<14) != 0, MPX) + fs.setIf(ebx&(1<<18) != 0, RDSEED) + fs.setIf(ebx&(1<<19) != 0, ADX) + fs.setIf(ebx&(1<<29) != 0, SHA) + // CPUID.(EAX=7, ECX=0).ECX + fs.setIf(ecx&(1<<5) != 0, WAITPKG) + fs.setIf(ecx&(1<<25) != 0, CLDEMOTE) + fs.setIf(ecx&(1<<27) != 0, MOVDIRI) + fs.setIf(ecx&(1<<28) != 0, MOVDIR64B) + fs.setIf(ecx&(1<<29) != 0, ENQCMD) + fs.setIf(ecx&(1<<30) != 0, SGXLC) + // CPUID.(EAX=7, ECX=0).EDX + fs.setIf(edx&(1<<11) != 0, RTM_ALWAYS_ABORT) + fs.setIf(edx&(1<<14) != 0, SERIALIZE) + fs.setIf(edx&(1<<16) != 0, TSXLDTRK) + fs.setIf(edx&(1<<26) != 0, IBPB) + fs.setIf(edx&(1<<27) != 0, STIBP) + + // Only detect AVX-512 features if XGETBV is supported + if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { + // Check for OS support + eax, _ := xgetbv(0) + + // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and + // ZMM16-ZMM31 state are enabled by OS) + /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). 
+ hasAVX512 := (eax>>5)&7 == 7 && (eax>>1)&3 == 3 + if runtime.GOOS == "darwin" { + hasAVX512 = fs.inSet(AVX) && darwinHasAVX512() + } + if hasAVX512 { + fs.setIf(ebx&(1<<16) != 0, AVX512F) + fs.setIf(ebx&(1<<17) != 0, AVX512DQ) + fs.setIf(ebx&(1<<21) != 0, AVX512IFMA) + fs.setIf(ebx&(1<<26) != 0, AVX512PF) + fs.setIf(ebx&(1<<27) != 0, AVX512ER) + fs.setIf(ebx&(1<<28) != 0, AVX512CD) + fs.setIf(ebx&(1<<30) != 0, AVX512BW) + fs.setIf(ebx&(1<<31) != 0, AVX512VL) + // ecx + fs.setIf(ecx&(1<<1) != 0, AVX512VBMI) + fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2) + fs.setIf(ecx&(1<<8) != 0, GFNI) + fs.setIf(ecx&(1<<9) != 0, VAES) + fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) + fs.setIf(ecx&(1<<11) != 0, AVX512VNNI) + fs.setIf(ecx&(1<<12) != 0, AVX512BITALG) + fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ) + // edx + fs.setIf(edx&(1<<8) != 0, AVX512VP2INTERSECT) + fs.setIf(edx&(1<<22) != 0, AMXBF16) + fs.setIf(edx&(1<<23) != 0, AVX512FP16) + fs.setIf(edx&(1<<24) != 0, AMXTILE) + fs.setIf(edx&(1<<25) != 0, AMXINT8) + // eax1 = CPUID.(EAX=7, ECX=1).EAX + fs.setIf(eax1&(1<<5) != 0, AVX512BF16) + } + } + } + + if maxExtendedFunction() >= 0x80000001 { + _, _, c, d := cpuid(0x80000001) + if (c & (1 << 5)) != 0 { + fs.set(LZCNT) + fs.set(POPCNT) + } + fs.setIf((c&(1<<10)) != 0, IBS) + fs.setIf((d&(1<<31)) != 0, AMD3DNOW) + fs.setIf((d&(1<<30)) != 0, AMD3DNOWEXT) + fs.setIf((d&(1<<23)) != 0, MMX) + fs.setIf((d&(1<<22)) != 0, MMXEXT) + fs.setIf((c&(1<<6)) != 0, SSE4A) + fs.setIf(d&(1<<20) != 0, NX) + fs.setIf(d&(1<<27) != 0, RDTSCP) + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. 
*/ + if fs.inSet(AVX) { + fs.setIf((c&0x00000800) != 0, XOP) + fs.setIf((c&0x00010000) != 0, FMA4) + } + + } + if maxExtendedFunction() >= 0x80000007 { + _, b, _, d := cpuid(0x80000007) + fs.setIf((b&(1<<0)) != 0, MCAOVERFLOW) + fs.setIf((b&(1<<1)) != 0, SUCCOR) + fs.setIf((b&(1<<2)) != 0, HWA) + fs.setIf((d&(1<<9)) != 0, CPBOOST) + } + + if maxExtendedFunction() >= 0x80000008 { + _, b, _, _ := cpuid(0x80000008) + fs.setIf((b&(1<<9)) != 0, WBNOINVD) + fs.setIf((b&(1<<8)) != 0, MCOMMIT) + fs.setIf((b&(1<<13)) != 0, INT_WBINVD) + fs.setIf((b&(1<<4)) != 0, RDPRU) + fs.setIf((b&(1<<3)) != 0, INVLPGB) + fs.setIf((b&(1<<1)) != 0, MSRIRC) + fs.setIf((b&(1<<0)) != 0, CLZERO) + } + + if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) { + eax, _, _, _ := cpuid(0x8000001b) + fs.setIf((eax>>0)&1 == 1, IBSFFV) + fs.setIf((eax>>1)&1 == 1, IBSFETCHSAM) + fs.setIf((eax>>2)&1 == 1, IBSOPSAM) + fs.setIf((eax>>3)&1 == 1, IBSRDWROPCNT) + fs.setIf((eax>>4)&1 == 1, IBSOPCNT) + fs.setIf((eax>>5)&1 == 1, IBSBRNTRGT) + fs.setIf((eax>>6)&1 == 1, IBSOPCNTEXT) + fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK) + } + + return fs +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s new file mode 100644 index 000000000000..8587c3a1fc55 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s @@ -0,0 +1,47 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. 
+ +//+build 386,!gccgo,!noasm,!appengine + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORL CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+4(FP) + MOVL BX, ebx+8(FP) + MOVL CX, ecx+12(FP) + MOVL DX, edx+16(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmDarwinHasAVX512() bool +TEXT ·asmDarwinHasAVX512(SB), 7, $0 + MOVL $0, eax+0(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s new file mode 100644 index 000000000000..bc11f8942193 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s @@ -0,0 +1,72 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. 
+ +//+build amd64,!gccgo,!noasm,!appengine + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmXgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET + +// From https://go-review.googlesource.com/c/sys/+/285572/ +// func asmDarwinHasAVX512() bool +TEXT ·asmDarwinHasAVX512(SB), 7, $0-1 + MOVB $0, ret+0(FP) // default to false + +#ifdef GOOS_darwin // return if not darwin +#ifdef GOARCH_amd64 // return if not amd64 +// These values from: +// https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/osfmk/i386/cpu_capabilities.h +#define commpage64_base_address 0x00007fffffe00000 +#define commpage64_cpu_capabilities64 (commpage64_base_address+0x010) +#define commpage64_version (commpage64_base_address+0x01E) +#define hasAVX512F 0x0000004000000000 + MOVQ $commpage64_version, BX + MOVW (BX), AX + CMPW AX, $13 // versions < 13 do not support AVX512 + JL no_avx512 + MOVQ $commpage64_cpu_capabilities64, BX + MOVQ (BX), AX + MOVQ $hasAVX512F, CX + ANDQ CX, AX + JZ no_avx512 + MOVB $1, ret+0(FP) + +no_avx512: +#endif +#endif + RET + diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s new file mode 100644 index 000000000000..b31d6aec43f6 --- /dev/null +++ 
b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s
@@ -0,0 +1,26 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build arm64,!gccgo,!noasm,!appengine
+
+// See https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt
+
+// func getMidr
+TEXT ·getMidr(SB), 7, $0
+	WORD $0xd5380000 // mrs x0, midr_el1 /* Main ID Register */
+	MOVD R0, midr+0(FP)
+	RET
+
+// func getProcFeatures
+TEXT ·getProcFeatures(SB), 7, $0
+	WORD $0xd5380400 // mrs x0, id_aa64pfr0_el1 /* Processor Feature Register 0 */
+	MOVD R0, procFeatures+0(FP)
+	RET
+
+// func getInstAttributes
+TEXT ·getInstAttributes(SB), 7, $0
+	WORD $0xd5380600 // mrs x0, id_aa64isar0_el1 /* Instruction Set Attribute Register 0 */
+	WORD $0xd5380621 // mrs x1, id_aa64isar1_el1 /* Instruction Set Attribute Register 1 */
+	MOVD R0, instAttrReg0+0(FP)
+	MOVD R1, instAttrReg1+8(FP)
+	RET
+
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go
new file mode 100644
index 000000000000..9bf9f77f3733
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build arm64,!gccgo,!noasm,!appengine
+
+package cpuid
+
+import "runtime"
+
+func getMidr() (midr uint64)
+func getProcFeatures() (procFeatures uint64)
+func getInstAttributes() (instAttrReg0, instAttrReg1 uint64)
+
+func initCPU() {
+	cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
+	cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
+	xgetbv = func(uint32) (a, b uint32) { return 0, 0 }
+	rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 }
+}
+
+// addInfo fills c with ARM64 details. It sets a 64-byte cache line and runs
+// detectOS, then returns early when safe is set, ARMCPUID is not reported and
+// the OS is not FreeBSD. Otherwise it reads MIDR_EL1, ID_AA64PFR0_EL1 and
+// ID_AA64ISAR{0,1}_EL1 and ORs the decoded feature flags into c.featureSet.
+func addInfo(c *CPUInfo, safe bool) {
+	// Seems to be safe to assume on ARM64
+	c.CacheLine = 64
+	detectOS(c)
+
+	// ARM64 disabled since it may crash if interrupt is not intercepted by OS.
+	if safe && !c.Supports(ARMCPUID) && runtime.GOOS != "freebsd" {
+		return
+	}
+	midr := getMidr()
+
+	// MIDR_EL1 - Main ID Register
+	// https://developer.arm.com/docs/ddi0595/h/aarch64-system-registers/midr_el1
+	// x--------------------------------------------------x
+	// | Name | bits | visible |
+	// |--------------------------------------------------|
+	// | Implementer | [31-24] | y |
+	// |--------------------------------------------------|
+	// | Variant | [23-20] | y |
+	// |--------------------------------------------------|
+	// | Architecture | [19-16] | y |
+	// |--------------------------------------------------|
+	// | PartNum | [15-4] | y |
+	// |--------------------------------------------------|
+	// | Revision | [3-0] | y |
+	// x--------------------------------------------------x
+
+	switch (midr >> 24) & 0xff {
+	case 0xC0:
+		c.VendorString = "Ampere Computing"
+		c.VendorID = Ampere
+	case 0x41:
+		c.VendorString = "Arm Limited"
+		c.VendorID = ARM
+	case 0x42:
+		c.VendorString = "Broadcom Corporation"
+		c.VendorID = Broadcom
+	case 0x43:
+		c.VendorString = "Cavium Inc"
+		c.VendorID = Cavium
+	case 0x44:
+		c.VendorString = "Digital Equipment Corporation"
+		c.VendorID = DEC
+	case 0x46:
+		c.VendorString = "Fujitsu Ltd"
+		c.VendorID = Fujitsu
+	case 0x49:
+		c.VendorString = "Infineon Technologies AG"
+		c.VendorID = Infineon
+	case 0x4D:
+		c.VendorString = "Motorola or Freescale Semiconductor Inc"
+		c.VendorID = Motorola
+	case 0x4E:
+		c.VendorString = "NVIDIA Corporation"
+		c.VendorID = NVIDIA
+	case 0x50:
+		c.VendorString = "Applied Micro Circuits Corporation"
+		c.VendorID = AMCC
+	case 0x51:
+		c.VendorString = "Qualcomm Inc"
+		c.VendorID = Qualcomm
+	case 0x56:
+		c.VendorString = "Marvell International Ltd"
+		c.VendorID = Marvell
+	case 0x69:
+		c.VendorString = "Intel Corporation"
+		c.VendorID = Intel
+	}
+
+	// Lower 4 bits: Architecture
+	// Architecture Meaning
+	// 0b0001 Armv4.
+	// 0b0010 Armv4T.
+	// 0b0011 Armv5 (obsolete).
+	// 0b0100 Armv5T.
+	// 0b0101 Armv5TE.
+	// 0b0110 Armv5TEJ.
+	// 0b0111 Armv6.
+	// 0b1111 Architectural features are individually identified in the ID_* registers, see 'ID registers'.
+	// Upper 4 bit: Variant
+	// An IMPLEMENTATION DEFINED variant number.
+	// Typically, this field is used to distinguish between different product variants, or major revisions of a product.
+	c.Family = int(midr>>16) & 0xff
+
+	// PartNum, bits [15:4]
+	// An IMPLEMENTATION DEFINED primary part number for the device.
+	// On processors implemented by Arm, if the top four bits of the primary
+	// part number are 0x0 or 0x7, the variant and architecture are encoded differently.
+	// Revision, bits [3:0]
+	// An IMPLEMENTATION DEFINED revision number for the device.
+	c.Model = int(midr) & 0xffff
+
+	procFeatures := getProcFeatures()
+
+	// ID_AA64PFR0_EL1 - Processor Feature Register 0
+	// x--------------------------------------------------x
+	// | Name | bits | visible |
+	// |--------------------------------------------------|
+	// | DIT | [51-48] | y |
+	// |--------------------------------------------------|
+	// | SVE | [35-32] | y |
+	// |--------------------------------------------------|
+	// | GIC | [27-24] | n |
+	// |--------------------------------------------------|
+	// | AdvSIMD | [23-20] | y |
+	// |--------------------------------------------------|
+	// | FP | [19-16] | y |
+	// |--------------------------------------------------|
+	// | EL3 | [15-12] | n |
+	// |--------------------------------------------------|
+	// | EL2 | [11-8] | n |
+	// |--------------------------------------------------|
+	// | EL1 | [7-4] | n |
+	// |--------------------------------------------------|
+	// | EL0 | [3-0] | n |
+	// x--------------------------------------------------x
+
+	var f flagSet
+	// if procFeatures&(0xf<<48) != 0 {
+	// 	fmt.Println("DIT")
+	// }
+	f.setIf(procFeatures&(0xf<<32) != 0, SVE)
+	if procFeatures&(0xf<<20) != 15<<20 {
+		f.set(ASIMD)
+		// https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64pfr0_el1
+		// 0b0001 --> As for 0b0000, and also includes support for half-precision floating-point arithmetic.
+		f.setIf(procFeatures&(0xf<<20) == 1<<20, FPHP, ASIMDHP)
+	}
+	// The FP field uses the same encoding as AdvSIMD: 0b1111 means "not
+	// implemented" and 0b0000 is the baseline, so compare against 15, not 0.
+	f.setIf(procFeatures&(0xf<<16) != 15<<16, FP)
+
+	instAttrReg0, instAttrReg1 := getInstAttributes()
+
+	// https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
+	//
+	// ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
+	// x--------------------------------------------------x
+	// | Name | bits | visible |
+	// |--------------------------------------------------|
+	// | TS | [55-52] | y |
+	// |--------------------------------------------------|
+	// | FHM | [51-48] | y |
+	// |--------------------------------------------------|
+	// | DP | [47-44] | y |
+	// |--------------------------------------------------|
+	// | SM4 | [43-40] | y |
+	// |--------------------------------------------------|
+	// | SM3 | [39-36] | y |
+	// |--------------------------------------------------|
+	// | SHA3 | [35-32] | y |
+	// |--------------------------------------------------|
+	// | RDM | [31-28] | y |
+	// |--------------------------------------------------|
+	// | ATOMICS | [23-20] | y |
+	// |--------------------------------------------------|
+	// | CRC32 | [19-16] | y |
+	// |--------------------------------------------------|
+	// | SHA2 | [15-12] | y |
+	// |--------------------------------------------------|
+	// | SHA1 | [11-8] | y |
+	// |--------------------------------------------------|
+	// | AES | [7-4] | y |
+	// x--------------------------------------------------x
+
+	// if instAttrReg0&(0xf<<52) != 0 {
+	// 	fmt.Println("TS")
+	// }
+	// if instAttrReg0&(0xf<<48) != 0 {
+	// 	fmt.Println("FHM")
+	// }
+	f.setIf(instAttrReg0&(0xf<<44) != 0, ASIMDDP)
+	f.setIf(instAttrReg0&(0xf<<40) != 0, SM4)
+	f.setIf(instAttrReg0&(0xf<<36) != 0, SM3)
+	f.setIf(instAttrReg0&(0xf<<32) != 0, SHA3)
+	f.setIf(instAttrReg0&(0xf<<28) != 0, ASIMDRDM)
+	f.setIf(instAttrReg0&(0xf<<20) != 0, ATOMICS)
+	f.setIf(instAttrReg0&(0xf<<16) != 0, CRC32)
+	f.setIf(instAttrReg0&(0xf<<12) != 0, SHA2)
+	// https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
+	// 0b0010 --> As 0b0001, plus SHA512H, SHA512H2, SHA512SU0, and SHA512SU1 instructions implemented.
+	f.setIf(instAttrReg0&(0xf<<12) == 2<<12, SHA512)
+	f.setIf(instAttrReg0&(0xf<<8) != 0, SHA1)
+	f.setIf(instAttrReg0&(0xf<<4) != 0, AESARM)
+	// https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
+	// 0b0010 --> As for 0b0001, plus PMULL/PMULL2 instructions operating on 64-bit data quantities.
+	f.setIf(instAttrReg0&(0xf<<4) == 2<<4, PMULL)
+
+	// https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar1_el1
+	//
+	// ID_AA64ISAR1_EL1 - Instruction set attribute register 1
+	// x--------------------------------------------------x
+	// | Name | bits | visible |
+	// |--------------------------------------------------|
+	// | GPI | [31-28] | y |
+	// |--------------------------------------------------|
+	// | GPA | [27-24] | y |
+	// |--------------------------------------------------|
+	// | LRCPC | [23-20] | y |
+	// |--------------------------------------------------|
+	// | FCMA | [19-16] | y |
+	// |--------------------------------------------------|
+	// | JSCVT | [15-12] | y |
+	// |--------------------------------------------------|
+	// | API | [11-8] | y |
+	// |--------------------------------------------------|
+	// | APA | [7-4] | y |
+	// |--------------------------------------------------|
+	// | DPB | [3-0] | y |
+	// x--------------------------------------------------x
+
+	// if instAttrReg1&(0xf<<28) != 0 {
+	// 	fmt.Println("GPI")
+	// }
+	// GPA lives in bits [27-24] (see table); any non-zero value means implemented.
+	f.setIf(instAttrReg1&(0xf<<24) != 0, GPA)
+	f.setIf(instAttrReg1&(0xf<<20) != 0, LRCPC)
+	f.setIf(instAttrReg1&(0xf<<16) != 0, FCMA)
+	f.setIf(instAttrReg1&(0xf<<12) != 0, JSCVT)
+	// if instAttrReg1&(0xf<<8) != 0 {
+	// 	fmt.Println("API")
+	// }
+	// if instAttrReg1&(0xf<<4) != 0 {
+	// 	fmt.Println("APA")
+	// }
+	f.setIf(instAttrReg1&(0xf<<0) != 0, DCPOP)
+
+	// Store
+	c.featureSet.or(f)
+}
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_ref.go b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go
new file mode 100644
index 000000000000..e9c8606ab920
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go
@@ -0,0 +1,14 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build !amd64,!386,!arm64 gccgo noasm appengine
+
+package cpuid
+
+func initCPU() {
+	cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
+	cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
+	xgetbv = func(uint32) (a, b uint32) { return 0, 0 }
+	rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 }
+}
+
+func addInfo(info *CPUInfo, safe bool) {}
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
new file mode 100644
index 000000000000..367c35c88c23
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
@@ -0,0 +1,35 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+ +//+build 386,!gccgo,!noasm,!appengine amd64,!gccgo,!noasm,!appengine + +package cpuid + +func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +func asmXgetbv(index uint32) (eax, edx uint32) +func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +func asmDarwinHasAVX512() bool + +func initCPU() { + cpuid = asmCpuid + cpuidex = asmCpuidex + xgetbv = asmXgetbv + rdtscpAsm = asmRdtscpAsm + darwinHasAVX512 = asmDarwinHasAVX512 +} + +func addInfo(c *CPUInfo, safe bool) { + c.maxFunc = maxFunctionID() + c.maxExFunc = maxExtendedFunction() + c.BrandName = brandName() + c.CacheLine = cacheLine() + c.Family, c.Model = familyModel() + c.featureSet = support() + c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC)) + c.ThreadsPerCore = threadsPerCore() + c.LogicalCores = logicalCores() + c.PhysicalCores = physicalCores() + c.VendorID, c.VendorString = vendorID() + c.cacheSize() + c.frequencies() +} diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go new file mode 100644 index 000000000000..b1fe42e467bb --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go @@ -0,0 +1,185 @@ +// Code generated by "stringer -type=FeatureID,Vendor"; DO NOT EDIT. + +package cpuid + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[ADX-1] + _ = x[AESNI-2] + _ = x[AMD3DNOW-3] + _ = x[AMD3DNOWEXT-4] + _ = x[AMXBF16-5] + _ = x[AMXINT8-6] + _ = x[AMXTILE-7] + _ = x[AVX-8] + _ = x[AVX2-9] + _ = x[AVX512BF16-10] + _ = x[AVX512BITALG-11] + _ = x[AVX512BW-12] + _ = x[AVX512CD-13] + _ = x[AVX512DQ-14] + _ = x[AVX512ER-15] + _ = x[AVX512F-16] + _ = x[AVX512FP16-17] + _ = x[AVX512IFMA-18] + _ = x[AVX512PF-19] + _ = x[AVX512VBMI-20] + _ = x[AVX512VBMI2-21] + _ = x[AVX512VL-22] + _ = x[AVX512VNNI-23] + _ = x[AVX512VP2INTERSECT-24] + _ = x[AVX512VPOPCNTDQ-25] + _ = x[AVXSLOW-26] + _ = x[BMI1-27] + _ = x[BMI2-28] + _ = x[CLDEMOTE-29] + _ = x[CLMUL-30] + _ = x[CLZERO-31] + _ = x[CMOV-32] + _ = x[CPBOOST-33] + _ = x[CX16-34] + _ = x[ENQCMD-35] + _ = x[ERMS-36] + _ = x[F16C-37] + _ = x[FMA3-38] + _ = x[FMA4-39] + _ = x[GFNI-40] + _ = x[HLE-41] + _ = x[HTT-42] + _ = x[HWA-43] + _ = x[HYPERVISOR-44] + _ = x[IBPB-45] + _ = x[IBS-46] + _ = x[IBSBRNTRGT-47] + _ = x[IBSFETCHSAM-48] + _ = x[IBSFFV-49] + _ = x[IBSOPCNT-50] + _ = x[IBSOPCNTEXT-51] + _ = x[IBSOPSAM-52] + _ = x[IBSRDWROPCNT-53] + _ = x[IBSRIPINVALIDCHK-54] + _ = x[INT_WBINVD-55] + _ = x[INVLPGB-56] + _ = x[LZCNT-57] + _ = x[MCAOVERFLOW-58] + _ = x[MCOMMIT-59] + _ = x[MMX-60] + _ = x[MMXEXT-61] + _ = x[MOVDIR64B-62] + _ = x[MOVDIRI-63] + _ = x[MPX-64] + _ = x[MSRIRC-65] + _ = x[NX-66] + _ = x[POPCNT-67] + _ = x[RDPRU-68] + _ = x[RDRAND-69] + _ = x[RDSEED-70] + _ = x[RDTSCP-71] + _ = x[RTM-72] + _ = x[RTM_ALWAYS_ABORT-73] + _ = x[SERIALIZE-74] + _ = x[SGX-75] + _ = x[SGXLC-76] + _ = x[SHA-77] + _ = x[SSE-78] + _ = x[SSE2-79] + _ = x[SSE3-80] + _ = x[SSE4-81] + _ = x[SSE42-82] + _ = x[SSE4A-83] + _ = x[SSSE3-84] + _ = x[STIBP-85] + _ = x[SUCCOR-86] + _ = x[TBM-87] + _ = x[TSXLDTRK-88] + _ = x[VAES-89] + _ = x[VMX-90] + _ = x[VPCLMULQDQ-91] + _ = x[WAITPKG-92] + _ = x[WBNOINVD-93] + _ = x[XOP-94] + _ = x[AESARM-95] + _ = x[ARMCPUID-96] + _ = x[ASIMD-97] + _ = x[ASIMDDP-98] + _ = x[ASIMDHP-99] + _ = x[ASIMDRDM-100] + _ = 
x[ATOMICS-101] + _ = x[CRC32-102] + _ = x[DCPOP-103] + _ = x[EVTSTRM-104] + _ = x[FCMA-105] + _ = x[FP-106] + _ = x[FPHP-107] + _ = x[GPA-108] + _ = x[JSCVT-109] + _ = x[LRCPC-110] + _ = x[PMULL-111] + _ = x[SHA1-112] + _ = x[SHA2-113] + _ = x[SHA3-114] + _ = x[SHA512-115] + _ = x[SM3-116] + _ = x[SM4-117] + _ = x[SVE-118] + _ = x[lastID-119] + _ = x[firstID-0] +} + +const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXSLOWBMI1BMI2CLDEMOTECLMULCLZEROCMOVCPBOOSTCX16ENQCMDERMSF16CFMA3FMA4GFNIHLEHTTHWAHYPERVISORIBPBIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKINT_WBINVDINVLPGBLZCNTMCAOVERFLOWMCOMMITMMXMMXEXTMOVDIR64BMOVDIRIMPXMSRIRCNXPOPCNTRDPRURDRANDRDSEEDRDTSCPRTMRTM_ALWAYS_ABORTSERIALIZESGXSGXLCSHASSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSUCCORTBMTSXLDTRKVAESVMXVPCLMULQDQWAITPKGWBNOINVDXOPAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" + +var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 58, 62, 72, 84, 92, 100, 108, 116, 123, 133, 143, 151, 161, 172, 180, 190, 208, 223, 230, 234, 238, 246, 251, 257, 261, 268, 272, 278, 282, 286, 290, 294, 298, 301, 304, 307, 317, 321, 324, 334, 345, 351, 359, 370, 378, 390, 406, 416, 423, 428, 439, 446, 449, 455, 464, 471, 474, 480, 482, 488, 493, 499, 505, 511, 514, 530, 539, 542, 547, 550, 553, 557, 561, 565, 570, 575, 580, 585, 591, 594, 602, 606, 609, 619, 626, 634, 637, 643, 651, 656, 663, 670, 678, 685, 690, 695, 702, 706, 708, 712, 715, 720, 725, 730, 734, 738, 742, 748, 751, 754, 757, 763} + +func (i FeatureID) String() string { + if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) { + return "FeatureID(" + strconv.FormatInt(int64(i), 10) + ")" + } + return 
_FeatureID_name[_FeatureID_index[i]:_FeatureID_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[VendorUnknown-0] + _ = x[Intel-1] + _ = x[AMD-2] + _ = x[VIA-3] + _ = x[Transmeta-4] + _ = x[NSC-5] + _ = x[KVM-6] + _ = x[MSVM-7] + _ = x[VMware-8] + _ = x[XenHVM-9] + _ = x[Bhyve-10] + _ = x[Hygon-11] + _ = x[SiS-12] + _ = x[RDC-13] + _ = x[Ampere-14] + _ = x[ARM-15] + _ = x[Broadcom-16] + _ = x[Cavium-17] + _ = x[DEC-18] + _ = x[Fujitsu-19] + _ = x[Infineon-20] + _ = x[Motorola-21] + _ = x[NVIDIA-22] + _ = x[AMCC-23] + _ = x[Qualcomm-24] + _ = x[Marvell-25] + _ = x[lastVendor-26] +} + +const _Vendor_name = "VendorUnknownIntelAMDVIATransmetaNSCKVMMSVMVMwareXenHVMBhyveHygonSiSRDCAmpereARMBroadcomCaviumDECFujitsuInfineonMotorolaNVIDIAAMCCQualcommMarvelllastVendor" + +var _Vendor_index = [...]uint8{0, 13, 18, 21, 24, 33, 36, 39, 43, 49, 55, 60, 65, 68, 71, 77, 80, 88, 94, 97, 104, 112, 120, 126, 130, 138, 145, 155} + +func (i Vendor) String() string { + if i < 0 || i >= Vendor(len(_Vendor_index)-1) { + return "Vendor(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _Vendor_name[_Vendor_index[i]:_Vendor_index[i+1]] +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go new file mode 100644 index 000000000000..8d2cb0368bcf --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go @@ -0,0 +1,19 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. + +package cpuid + +import "runtime" + +func detectOS(c *CPUInfo) bool { + // There are no hw.optional sysctl values for the below features on Mac OS 11.0 + // to detect their supported state dynamically. 
Assume the CPU features that + // Apple Silicon M1 supports to be available as a minimal set of features + // to all Go programs running on darwin/arm64. + // TODO: Add more if we know them. + c.featureSet.setIf(runtime.GOOS != "ios", AESARM, PMULL, SHA1, SHA2) + c.PhysicalCores = runtime.NumCPU() + // For now assuming 1 thread per core... + c.ThreadsPerCore = 1 + c.LogicalCores = c.PhysicalCores + return true +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go new file mode 100644 index 000000000000..ee278b9e4bcf --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go @@ -0,0 +1,130 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. + +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file located +// here https://github.com/golang/sys/blob/master/LICENSE + +package cpuid + +import ( + "encoding/binary" + "io/ioutil" + "runtime" +) + +// HWCAP bits. +const ( + hwcap_FP = 1 << 0 + hwcap_ASIMD = 1 << 1 + hwcap_EVTSTRM = 1 << 2 + hwcap_AES = 1 << 3 + hwcap_PMULL = 1 << 4 + hwcap_SHA1 = 1 << 5 + hwcap_SHA2 = 1 << 6 + hwcap_CRC32 = 1 << 7 + hwcap_ATOMICS = 1 << 8 + hwcap_FPHP = 1 << 9 + hwcap_ASIMDHP = 1 << 10 + hwcap_CPUID = 1 << 11 + hwcap_ASIMDRDM = 1 << 12 + hwcap_JSCVT = 1 << 13 + hwcap_FCMA = 1 << 14 + hwcap_LRCPC = 1 << 15 + hwcap_DCPOP = 1 << 16 + hwcap_SHA3 = 1 << 17 + hwcap_SM3 = 1 << 18 + hwcap_SM4 = 1 << 19 + hwcap_ASIMDDP = 1 << 20 + hwcap_SHA512 = 1 << 21 + hwcap_SVE = 1 << 22 + hwcap_ASIMDFHM = 1 << 23 +) + +func detectOS(c *CPUInfo) bool { + // For now assuming no hyperthreading is reasonable. + c.LogicalCores = runtime.NumCPU() + c.PhysicalCores = c.LogicalCores + c.ThreadsPerCore = 1 + if hwcap == 0 { + // We did not get values from the runtime. 
+ // Try reading /proc/self/auxv + + // From https://github.com/golang/sys + const ( + _AT_HWCAP = 16 + _AT_HWCAP2 = 26 + + uintSize = int(32 << (^uint(0) >> 63)) + ) + + buf, err := ioutil.ReadFile("/proc/self/auxv") + if err != nil { + // e.g. on android /proc/self/auxv is not accessible, so silently + // ignore the error and leave Initialized = false. On some + // architectures (e.g. arm64) doinit() implements a fallback + // readout and will set Initialized = true again. + return false + } + bo := binary.LittleEndian + for len(buf) >= 2*(uintSize/8) { + var tag, val uint + switch uintSize { + case 32: + tag = uint(bo.Uint32(buf[0:])) + val = uint(bo.Uint32(buf[4:])) + buf = buf[8:] + case 64: + tag = uint(bo.Uint64(buf[0:])) + val = uint(bo.Uint64(buf[8:])) + buf = buf[16:] + } + switch tag { + case _AT_HWCAP: + hwcap = val + case _AT_HWCAP2: + // Not used + } + } + if hwcap == 0 { + return false + } + } + + // HWCap was populated by the runtime from the auxiliary vector. + // Use HWCap information since reading aarch64 system registers + // is not supported in user space on older linux kernels. 
+ c.featureSet.setIf(isSet(hwcap, hwcap_AES), AESARM) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMD), ASIMD) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDDP), ASIMDDP) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDHP), ASIMDHP) + c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDRDM), ASIMDRDM) + c.featureSet.setIf(isSet(hwcap, hwcap_CPUID), ARMCPUID) + c.featureSet.setIf(isSet(hwcap, hwcap_CRC32), CRC32) + c.featureSet.setIf(isSet(hwcap, hwcap_DCPOP), DCPOP) + c.featureSet.setIf(isSet(hwcap, hwcap_EVTSTRM), EVTSTRM) + c.featureSet.setIf(isSet(hwcap, hwcap_FCMA), FCMA) + c.featureSet.setIf(isSet(hwcap, hwcap_FP), FP) + c.featureSet.setIf(isSet(hwcap, hwcap_FPHP), FPHP) + c.featureSet.setIf(isSet(hwcap, hwcap_JSCVT), JSCVT) + c.featureSet.setIf(isSet(hwcap, hwcap_LRCPC), LRCPC) + c.featureSet.setIf(isSet(hwcap, hwcap_PMULL), PMULL) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA1), SHA1) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA2), SHA2) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA3), SHA3) + c.featureSet.setIf(isSet(hwcap, hwcap_SHA512), SHA512) + c.featureSet.setIf(isSet(hwcap, hwcap_SM3), SM3) + c.featureSet.setIf(isSet(hwcap, hwcap_SM4), SM4) + c.featureSet.setIf(isSet(hwcap, hwcap_SVE), SVE) + + // The Samsung S9+ kernel reports support for atomics, but not all cores + // actually support them, resulting in SIGILL. See issue #28431. + // TODO(elias.naur): Only disable the optimization on bad chipsets on android. + c.featureSet.setIf(isSet(hwcap, hwcap_ATOMICS) && runtime.GOOS != "android", ATOMICS) + + return true +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go new file mode 100644 index 000000000000..1a951e6ca00e --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go @@ -0,0 +1,17 @@ +// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. 
+ +// +build arm64 +// +build !linux +// +build !darwin + +package cpuid + +import "runtime" + +func detectOS(c *CPUInfo) bool { + c.PhysicalCores = runtime.NumCPU() + // For now assuming 1 thread per core... + c.ThreadsPerCore = 1 + c.LogicalCores = c.PhysicalCores + return false +} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go new file mode 100644 index 000000000000..4d0b8b465b3a --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go @@ -0,0 +1,7 @@ +// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. + +//+build nounsafe + +package cpuid + +var hwcap uint diff --git a/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go new file mode 100644 index 000000000000..329800286e6e --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go @@ -0,0 +1,10 @@ +// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. + +//+build !nounsafe + +package cpuid + +import _ "unsafe" // needed for go:linkname + +//go:linkname hwcap internal/cpu.HWCap +var hwcap uint diff --git a/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh new file mode 100644 index 000000000000..471d986d2488 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +set -e + +go tool dist list | while IFS=/ read os arch; do + echo "Checking $os/$arch..." + echo " normal" + GOARCH=$arch GOOS=$os go build -o /dev/null . + echo " noasm" + GOARCH=$arch GOOS=$os go build -tags noasm -o /dev/null . + echo " appengine" + GOARCH=$arch GOOS=$os go build -tags appengine -o /dev/null . + echo " noasm,appengine" + GOARCH=$arch GOOS=$os go build -tags 'appengine noasm' -o /dev/null . 
+done diff --git a/vendor/github.com/zeebo/xxh3/.gitignore b/vendor/github.com/zeebo/xxh3/.gitignore new file mode 100644 index 000000000000..928e12f5306f --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/.gitignore @@ -0,0 +1,6 @@ +upstream +*.pprof +xxh3.test +.vscode +*.txt +_compat diff --git a/vendor/github.com/zeebo/xxh3/LICENSE b/vendor/github.com/zeebo/xxh3/LICENSE new file mode 100644 index 000000000000..477f8e5e1ea7 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/LICENSE @@ -0,0 +1,25 @@ +xxHash Library +Copyright (c) 2012-2014, Yann Collet +Copyright (c) 2019, Jeff Wendling +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/zeebo/xxh3/Makefile b/vendor/github.com/zeebo/xxh3/Makefile
new file mode 100644
index 000000000000..8bd78c48246e
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/Makefile
@@ -0,0 +1,27 @@
+.PHONY: all genasm clean vet
+all: genasm _compat
+
+genasm: avo/avx.go avo/sse.go
+	cd ./avo && go generate gen.go
+
+clean:
+	rm -f accum_vector_avx_amd64.s
+	rm -f accum_vector_sse_amd64.s
+	rm -f _compat
+
+upstream/xxhash.o: upstream/xxhash.h
+	( cd upstream && make )
+
+_compat: _compat.c upstream/xxhash.o
+	gcc -o _compat _compat.c ./upstream/xxhash.o
+
+vet:
+	GOOS=linux GOARCH=386 GO386=softfloat go vet ./...
+	GOOS=windows GOARCH=386 GO386=softfloat go vet ./...
+	GOOS=linux GOARCH=amd64 go vet ./...
+	GOOS=windows GOARCH=amd64 go vet ./...
+	GOOS=darwin GOARCH=amd64 go vet ./...
+	GOOS=linux GOARCH=arm go vet ./...
+	GOOS=linux GOARCH=arm64 go vet ./...
+	GOOS=windows GOARCH=arm64 go vet ./...
+	GOOS=darwin GOARCH=arm64 go vet ./...
\ No newline at end of file
diff --git a/vendor/github.com/zeebo/xxh3/README.md b/vendor/github.com/zeebo/xxh3/README.md
new file mode 100644
index 000000000000..4633fc03a84b
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/README.md
@@ -0,0 +1,38 @@
+# XXH3
+[![GoDoc](https://godoc.org/github.com/zeebo/xxh3?status.svg)](https://godoc.org/github.com/zeebo/xxh3)
+[![Sourcegraph](https://sourcegraph.com/github.com/zeebo/xxh3/-/badge.svg)](https://sourcegraph.com/github.com/zeebo/xxh3?badge)
+[![Go Report Card](https://goreportcard.com/badge/github.com/zeebo/xxh3)](https://goreportcard.com/report/github.com/zeebo/xxh3)
+
+This package is a port of the [xxh3](https://github.com/Cyan4973/xxHash) library to Go.
+
+Upstream has fixed the output as of v0.8.0, and this package matches that.
+
+---
+
+# Benchmarks
+
+Run on my `i7-8850H CPU @ 2.60GHz`
+
+## Small Sizes
+
+| Bytes | Rate |
+|-----------|--------------------------------------|
+|` 0 ` |` 0.74 ns/op ` |
+|` 1-3 ` |` 4.19 ns/op (0.24 GB/s - 0.71 GB/s) `|
+|` 4-8 ` |` 4.16 ns/op (0.97 GB/s - 1.98 GB/s) `|
+|` 9-16 ` |` 4.46 ns/op (2.02 GB/s - 3.58 GB/s) `|
+|` 17-32 ` |` 6.22 ns/op (2.76 GB/s - 5.15 GB/s) `|
+|` 33-64 ` |` 8.00 ns/op (4.13 GB/s - 8.13 GB/s) `|
+|` 65-96 ` |` 11.0 ns/op (5.91 GB/s - 8.84 GB/s) `|
+|` 97-128 ` |` 12.8 ns/op (7.68 GB/s - 10.0 GB/s) `|
+
+## Large Sizes
+
+| Bytes | Rate | SSE2 Rate | AVX2 Rate |
+|---------|--------------------------|--------------------------|--------------------------|
+|` 129 ` |` 13.6 ns/op (9.45 GB/s) `| | |
+|` 240 ` |` 23.8 ns/op (10.1 GB/s) `| | |
+|` 241 ` |` 40.5 ns/op (5.97 GB/s) `|` 23.3 ns/op (10.4 GB/s) `|` 20.1 ns/op (12.0 GB/s) `|
+|` 512 ` |` 69.8 ns/op (7.34 GB/s) `|` 30.4 ns/op (16.9 GB/s) `|` 24.7 ns/op (20.7 GB/s) `|
+|` 1024 ` |` 132 ns/op (7.77 GB/s) `|` 48.9 ns/op (20.9 GB/s) `|` 37.7 ns/op (27.2 GB/s) `|
+|` 100KB `|` 13.0 us/op (7.88 GB/s) `|` 4.05 us/op (25.3 GB/s) `|` 2.31 us/op (44.3 GB/s) `|
diff --git a/vendor/github.com/zeebo/xxh3/_compat.c b/vendor/github.com/zeebo/xxh3/_compat.c
new file mode 100644
index 000000000000..fda9f36ff013
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/_compat.c
@@ -0,0 +1,42 @@
+#include "upstream/xxhash.h"
+#include <stdio.h>
+
+/* Emits Go source for the reference test vectors: the XXH3 64-bit and
+ * 128-bit hashes of the first i bytes of buf, for every i in [0, 4096). */
+int main() {
+	unsigned char buf[4096];
+	for (int i = 0; i < 4096; i++) {
+		buf[i] = (unsigned char)((i+1)%251);
+	}
+
+	printf("var testVecs64 = []uint64{\n");
+	for (int i = 0; i < 4096; i++) {
+		if (i % 4 == 0) {
+			printf("\t");
+		}
+
+		uint64_t h = XXH3_64bits(buf, (size_t)i);
+		/* uint64_t is not unsigned long on every platform; cast for a portable %llx. */
+		printf("0x%llx, ", (unsigned long long)h);
+
+		if (i % 4 == 3) {
+			printf("\n");
+		}
+	}
+	printf("}\n\n");
+
+	printf("var testVecs128 = [][2]uint64{\n");
+	for (int i = 0; i < 4096; i++) {
+		if (i % 4 == 0) {
+			printf("\t");
+		}
+
+		XXH128_hash_t h = XXH3_128bits(buf, (size_t)i);
+		printf("{0x%llx, 0x%llx}, ", (unsigned long long)h.high64, (unsigned long long)h.low64);
+
+		if (i % 4 == 3) {
+			printf("\n");
+		}
+	}
+	printf("}\n\n");
+}
diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go
new file mode 100644
index 000000000000..b1be78507175
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_generic.go
@@ -0,0 +1,542 @@
+package xxh3
+
+// avx512Switch is the size at which the avx512 code is used.
+// Bigger blocks benefit more.
+const avx512Switch = 1 << 10
+
+func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
+	if secret != key {
+		accumScalarSeed(accs, p, secret, l)
+		return
+	}
+	for l > _block {
+		k := secret
+
+		// accs
+		for i := 0; i < 16; i++ {
+			dv0 := readU64(p, 8*0)
+			dk0 := dv0 ^ readU64(k, 8*0)
+			accs[1] += dv0
+			accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+			dv1 := readU64(p, 8*1)
+			dk1 := dv1 ^ readU64(k, 8*1)
+			accs[0] += dv1
+			accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+			dv2 := readU64(p, 8*2)
+			dk2 := dv2 ^ readU64(k, 8*2)
+			accs[3] += dv2
+			accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+			dv3 := readU64(p, 8*3)
+			dk3 := dv3 ^ readU64(k, 8*3)
+			accs[2] += dv3
+			accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+			dv4 := readU64(p, 8*4)
+			dk4 := dv4 ^ readU64(k, 8*4)
+			accs[5] += dv4
+			accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+			dv5 := readU64(p, 8*5)
+			dk5 := dv5 ^ readU64(k, 8*5)
+			accs[4] += dv5
+			accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+			dv6 := readU64(p, 8*6)
+			dk6 := dv6 ^ readU64(k, 8*6)
+			accs[7] += dv6
+			accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+			dv7 := readU64(p, 8*7)
+			dk7 := dv7 ^ readU64(k, 8*7)
+			accs[6] += dv7
+			accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+			l -= _stripe
+			if l > 0 {
+				p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+			}
+		}
+
+		// scramble accs
+		accs[0] ^= accs[0] >> 47
+		accs[0] ^= key64_128
+		accs[0] *= prime32_1
+
+		accs[1] ^= accs[1] >> 47
+		accs[1] ^= key64_136
+		accs[1] *= prime32_1
+
+		accs[2] ^= accs[2] >> 47
+		accs[2] ^= key64_144
+		accs[2] *= prime32_1
+
+		accs[3] ^=
accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ key64_121 + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ key64_129 + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ key64_137 + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ key64_145 + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 
32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ key64_153 + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ key64_161 + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ key64_169 + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ key64_177 + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +func accumBlockScalar(accs *[8]u64, p, secret ptr) { + if secret != key { + accumBlockScalarSeed(accs, p, secret) + return + } + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= 
key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 +} + +// accumScalarSeed should be used with custom key. +func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) { + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + 
accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 121) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 129) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 137) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 145) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 
:= dv4 ^ readU64(secret, 153) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 161) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 169) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 177) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +// accumBlockScalarSeed should be used with custom key. +func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) { + // accs + { + secret := secret + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= 
prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 +} diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go new file mode 100644 index 000000000000..9baff6c41c7a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go @@ -0,0 +1,40 @@ +package xxh3 + +import ( + "unsafe" + + "github.com/klauspost/cpuid/v2" +) + +var ( + hasAVX2 = cpuid.CPU.Has(cpuid.AVX2) + hasSSE2 = cpuid.CPU.Has(cpuid.SSE2) // Always true on amd64 + hasAVX512 = cpuid.CPU.Has(cpuid.AVX512F) +) + +//go:noescape +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) + +//go:noescape +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) + +func withOverrides(avx512, avx2, sse2 bool, cb func()) { + avx512Orig, avx2Orig, sse2Orig := hasAVX512, hasAVX2, hasSSE2 + hasAVX512, hasAVX2, hasSSE2 = avx512, avx2, sse2 + defer func() { hasAVX512, hasAVX2, hasSSE2 = avx512Orig, avx2Orig, sse2Orig }() + cb() +} + +func withAVX512(cb func()) { withOverrides(hasAVX512, false, false, cb) } +func withAVX2(cb func()) { withOverrides(false, hasAVX2, false, cb) } +func withSSE2(cb func()) { withOverrides(false, false, hasSSE2, cb) } +func withGeneric(cb func()) { withOverrides(false, false, false, cb) } diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_other.go b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go 
new file mode 100644 index 000000000000..93bf6258a828 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go @@ -0,0 +1,25 @@ +//go:build !amd64 +// +build !amd64 + +package xxh3 + +import ( + "unsafe" +) + +const ( // SIMD feature flags; constant false because this file only builds when GOARCH is not amd64. + hasAVX2 = false + hasSSE2 = false + hasAVX512 = false +) + +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } // Go-bodied stand-in; this and the four stubs below panic if ever called. NOTE(review): presumably callers check the has* flags first, confirm against the dispatch code. +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } + +func withAVX512(cb func()) { cb() } // the flags above are constants, so this and the three helpers below have nothing to override and simply run cb. +func withAVX2(cb func()) { cb() } +func withSSE2(cb func()) { cb() } +func withGeneric(cb func()) { cb() } diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s new file mode 100644 index 000000000000..cfaf9f0a77d2 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s @@ -0,0 +1,379 @@ +// Code generated by command: go run gen.go -avx512 -out ../accum_vector_avx512_amd64.s -pkg xxh3. DO NOT EDIT. 
+ +#include "textflag.h" + +DATA prime_avx512<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+24(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+32(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+40(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+48(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+56(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx512<>(SB), RODATA|NOPTR, $64 + +// func accumAVX512(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX512F, MMX+ +TEXT ·accumAVX512(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ len+24(FP), BX + VMOVDQU64 (AX), Z1 + VMOVDQU64 prime_avx512<>+0(SB), Z0 + VMOVDQU64 (DX), Z2 + VMOVDQU64 8(DX), Z3 + VMOVDQU64 16(DX), Z4 + VMOVDQU64 24(DX), Z5 + VMOVDQU64 32(DX), Z6 + VMOVDQU64 40(DX), Z7 + VMOVDQU64 48(DX), Z8 + VMOVDQU64 56(DX), Z9 + VMOVDQU64 64(DX), Z10 + VMOVDQU64 72(DX), Z11 + VMOVDQU64 80(DX), Z12 + VMOVDQU64 88(DX), Z13 + VMOVDQU64 96(DX), Z14 + VMOVDQU64 104(DX), Z15 + VMOVDQU64 112(DX), Z16 + VMOVDQU64 120(DX), Z17 + VMOVDQU64 128(DX), Z18 + VMOVDQU64 121(DX), Z19 + +accum_large: + CMPQ BX, $0x00000400 + JLE accum + VMOVDQU64 (CX), Z20 + PREFETCHT0 1024(CX) + VPXORD Z2, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 64(CX), Z20 + PREFETCHT0 1088(CX) + VPXORD Z3, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 128(CX), Z20 + PREFETCHT0 1152(CX) + VPXORD Z4, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 192(CX), Z20 + PREFETCHT0 1216(CX) + VPXORD Z5, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + 
VMOVDQU64 256(CX), Z20 + PREFETCHT0 1280(CX) + VPXORD Z6, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 320(CX), Z20 + PREFETCHT0 1344(CX) + VPXORD Z7, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 384(CX), Z20 + PREFETCHT0 1408(CX) + VPXORD Z8, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 448(CX), Z20 + PREFETCHT0 1472(CX) + VPXORD Z9, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 512(CX), Z20 + PREFETCHT0 1536(CX) + VPXORD Z10, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 576(CX), Z20 + PREFETCHT0 1600(CX) + VPXORD Z11, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 640(CX), Z20 + PREFETCHT0 1664(CX) + VPXORD Z12, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 704(CX), Z20 + PREFETCHT0 1728(CX) + VPXORD Z13, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 768(CX), Z20 + PREFETCHT0 1792(CX) + VPXORD Z14, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 832(CX), Z20 + PREFETCHT0 1856(CX) + VPXORD Z15, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 896(CX), Z20 + PREFETCHT0 1920(CX) + VPXORD Z16, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, 
Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 960(CX), Z20 + PREFETCHT0 1984(CX) + VPXORD Z17, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + ADDQ $0x00000400, CX + SUBQ $0x00000400, BX + VPSRLQ $0x2f, Z1, Z20 + VPTERNLOGD $0x96, Z1, Z18, Z20 + VPMULUDQ Z0, Z20, Z1 + VPSHUFD $0xf5, Z20, Z20 + VPMULUDQ Z0, Z20, Z20 + VPSLLQ $0x20, Z20, Z20 + VPADDQ Z1, Z20, Z1 + JMP accum_large + +accum: + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z2, Z0, Z2 + VPSHUFD $0x31, Z2, Z18 + VPMULUDQ Z2, Z18, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z3, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z4, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z5, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z6, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z7, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z8, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ 
Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z9, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z10, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z11, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z12, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z13, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z14, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z15, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z16, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + 
VMOVDQU64 (CX), Z0 + VPXORD Z17, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + +finalize: + CMPQ BX, $0x00 + JE return + SUBQ $0x40, CX + ADDQ BX, CX + VMOVDQU64 (CX), Z0 + VPXORD Z19, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + +return: + VMOVDQU64 Z1, (AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s new file mode 100644 index 000000000000..b53c1521f76c --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s @@ -0,0 +1,586 @@ +// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx<>(SB), RODATA|NOPTR, $32 + +// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX2, MMX+ +TEXT ·accumAVX2(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + PREFETCHT0 512(CX) + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + PREFETCHT0 576(CX) + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, 
Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + PREFETCHT0 640(CX) + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + PREFETCHT0 704(CX) + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + PREFETCHT0 768(CX) + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + PREFETCHT0 832(CX) + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + PREFETCHT0 896(CX) + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + PREFETCHT0 960(CX) + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD 
$0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + PREFETCHT0 1024(CX) + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + PREFETCHT0 1088(CX) + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + PREFETCHT0 1152(CX) + VPXOR 80(DX), Y3, Y4 + VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + PREFETCHT0 1216(CX) + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + PREFETCHT0 1280(CX) + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + PREFETCHT0 1344(CX) + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD 
$0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + PREFETCHT0 1408(CX) + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + PREFETCHT0 1472(CX) + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR (BX), Y0, Y3 + VPXOR 32(BX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + VPADDQ Y2, Y6, Y2 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR 121(DX), Y0, Y3 + VPXOR 153(DX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + 
VPADDQ Y2, Y6, Y2 + +return: + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET + +// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte) +// Requires: AVX, AVX2 +TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD 
$0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + VPXOR 80(DX), Y3, Y4 + VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + 
VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s new file mode 100644 index 000000000000..ba670e560226 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s @@ -0,0 
+1,1236 @@ +// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_sse<>+0(SB)/4, $0x9e3779b1 +DATA prime_sse<>+4(SB)/4, $0x9e3779b1 +DATA prime_sse<>+8(SB)/4, $0x9e3779b1 +DATA prime_sse<>+12(SB)/4, $0x9e3779b1 +GLOBL prime_sse<>(SB), RODATA|NOPTR, $16 + +// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: SSE2 +TEXT ·accumSSE(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, 
X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD 
$0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 
576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD 
$0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + 
PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + MOVOU (CX), X0 + MOVOU (BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 16(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 32(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 48(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + MOVOU (CX), X0 + MOVOU 121(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 137(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 153(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + 
PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 169(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + +return: + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET + +// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte) +// Requires: SSE2 +TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + 
PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, 
X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + 
MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + 
PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU X1, X5 + PSRLQ $0x2f, X5 + 
PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET diff --git a/vendor/github.com/zeebo/xxh3/consts.go b/vendor/github.com/zeebo/xxh3/consts.go new file mode 100644 index 000000000000..39ef6e179910 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/consts.go @@ -0,0 +1,97 @@ +package xxh3 + +const ( + _stripe = 64 + _block = 1024 + + prime32_1 = 2654435761 + prime32_2 = 2246822519 + prime32_3 = 3266489917 + + prime64_1 = 11400714785074694791 + prime64_2 = 14029467366897019727 + prime64_3 = 1609587929392839161 + prime64_4 = 9650029242287828579 + prime64_5 = 2870177450012600261 +) + +var key = ptr(&[...]u8{ + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe /* 8 */, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, /* 16 */ + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb /* 24 */, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, /* 32 */ + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78 /* 40 */, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, /* 48 */ + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e /* 56 */, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, /* 64 */ + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb /* 72 */, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, /* 80 */ + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e /* 88 */, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, /* 96 */ + 0xa8, 0xfa, 
0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f /* 104 */, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, /* 112 */ + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31 /* 120 */, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, /* 128 */ + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3 /* 136 */, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, /* 144 */ + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49 /* 152 */, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, /* 160 */ + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc /* 168 */, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, /* 176 */ + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28 /* 184 */, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, /* 192 */ +}) + +const ( + key64_000 u64 = 0xbe4ba423396cfeb8 + key64_008 u64 = 0x1cad21f72c81017c + key64_016 u64 = 0xdb979083e96dd4de + key64_024 u64 = 0x1f67b3b7a4a44072 + key64_032 u64 = 0x78e5c0cc4ee679cb + key64_040 u64 = 0x2172ffcc7dd05a82 + key64_048 u64 = 0x8e2443f7744608b8 + key64_056 u64 = 0x4c263a81e69035e0 + key64_064 u64 = 0xcb00c391bb52283c + key64_072 u64 = 0xa32e531b8b65d088 + key64_080 u64 = 0x4ef90da297486471 + key64_088 u64 = 0xd8acdea946ef1938 + key64_096 u64 = 0x3f349ce33f76faa8 + key64_104 u64 = 0x1d4f0bc7c7bbdcf9 + key64_112 u64 = 0x3159b4cd4be0518a + key64_120 u64 = 0x647378d9c97e9fc8 + key64_128 u64 = 0xc3ebd33483acc5ea + key64_136 u64 = 0xeb6313faffa081c5 + key64_144 u64 = 0x49daf0b751dd0d17 + key64_152 u64 = 0x9e68d429265516d3 + key64_160 u64 = 0xfca1477d58be162b + key64_168 u64 = 0xce31d07ad1b8f88f + key64_176 u64 = 0x280416958f3acb45 + key64_184 u64 = 0x7e404bbbcafbd7af + + key64_103 u64 = 0x4f0bc7c7bbdcf93f + key64_111 u64 = 0x59b4cd4be0518a1d + key64_119 u64 = 0x7378d9c97e9fc831 + key64_127 u64 = 0xebd33483acc5ea64 + + key64_121 u64 = 0xea647378d9c97e9f + key64_129 u64 = 0xc5c3ebd33483acc5 + key64_137 u64 = 0x17eb6313faffa081 + key64_145 u64 = 0xd349daf0b751dd0d + key64_153 u64 = 0x2b9e68d429265516 + key64_161 u64 = 0x8ffca1477d58be16 + key64_169 u64 = 
0x45ce31d07ad1b8f8 + key64_177 u64 = 0xaf280416958f3acb + + key64_011 = 0x6dd4de1cad21f72c + key64_019 = 0xa44072db979083e9 + key64_027 = 0xe679cb1f67b3b7a4 + key64_035 = 0xd05a8278e5c0cc4e + key64_043 = 0x4608b82172ffcc7d + key64_051 = 0x9035e08e2443f774 + key64_059 = 0x52283c4c263a81e6 + key64_067 = 0x65d088cb00c391bb + + key64_117 = 0xd9c97e9fc83159b4 + key64_125 = 0x3483acc5ea647378 + key64_133 = 0xfaffa081c5c3ebd3 + key64_141 = 0xb751dd0d17eb6313 + key64_149 = 0x29265516d349daf0 + key64_157 = 0x7d58be162b9e68d4 + key64_165 = 0x7ad1b8f88ffca147 + key64_173 = 0x958f3acb45ce31d0 +) + +const ( + key32_000 u32 = 0xbe4ba423 + key32_004 u32 = 0x396cfeb8 + key32_008 u32 = 0x1cad21f7 + key32_012 u32 = 0x2c81017c +) diff --git a/vendor/github.com/zeebo/xxh3/hash128.go b/vendor/github.com/zeebo/xxh3/hash128.go new file mode 100644 index 000000000000..0040a21bbce9 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128.go @@ -0,0 +1,253 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128 returns the 128-bit hash of the byte slice. +func Hash128(b []byte) Uint128 { + return hashAny128(*(*str)(ptr(&b))) +} + +// HashString128 returns the 128-bit hash of the string slice. 
+func HashString128(s string) Uint128 { + return hashAny128(*(*str)(ptr(&s))) +} + +func hashAny128(s str) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + const bitflipl = key64_032 ^ key64_040 + const bitfliph = key64_048 ^ key64_056 + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + const bitflip = key64_016 ^ key64_024 + + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return u128{0x99aa06d3014798d8, 0x6001c324468d497f} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000 ^ key32_004) + acc.Hi ^= uint64(key32_008 ^ key32_012) + + acc.Lo = xxh64AvalancheSmall(acc.Lo) + acc.Hi = xxh64AvalancheSmall(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 
7*8) + + acc.Hi += mulFold64(in8^key64_112, in7^key64_120) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^key64_096, i7^key64_104) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^key64_080, in5^key64_088) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^key64_064, i5^key64_072) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^key64_048, in3^key64_056) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_032, i3^key64_040) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^key64_016, in1^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^key64_016, i3^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^key64_048, i3^key64_056) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_032, i1^key64_040) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^key64_080, i3^key64_088) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_064, i1^key64_072) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += 
mulFold64(i2^key64_112, i3^key64_120) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_096, i1^key64_104) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125), readU64(key, i-117), readU64(key, i-109), readU64(key, i-101) + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + acc.Hi += mulFold64(i0^key64_119, i1^key64_127) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_103, i3^key64_111) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, 
accs[7]^key64_173) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash128_seed.go b/vendor/github.com/zeebo/xxh3/hash128_seed.go new file mode 100644 index 000000000000..358009be320a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128_seed.go @@ -0,0 +1,264 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128Seed returns the 128-bit hash of the byte slice. +func Hash128Seed(b []byte, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&b)), seed) +} + +// HashString128Seed returns the 128-bit hash of the string slice. +func HashString128Seed(s string, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&s)), seed) +} + +func hashAny128Seed(s str, seed uint64) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + bitflipl := (key64_032 ^ key64_040) - seed + bitfliph := (key64_048 ^ key64_056) + seed + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + bitflip := (key64_016 ^ key64_024) + seed + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := 
u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + bitflipl := key64_064 ^ key64_072 ^ seed + bitfliph := key64_080 ^ key64_088 ^ seed + return u128{Lo: xxh64AvalancheFull(bitflipl), Hi: xxh64AvalancheFull(bitfliph)} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000^key32_004) + seed + acc.Hi ^= uint64(key32_008^key32_012) - seed + + acc.Lo = xxh64AvalancheFull(acc.Lo) + acc.Hi = xxh64AvalancheFull(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(in8^(key64_112+seed), in7^(key64_120-seed)) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^(key64_096+seed), i7^(key64_104-seed)) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^(key64_080+seed), in5^(key64_088-seed)) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^(key64_064+seed), i5^(key64_072-seed)) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^(key64_048+seed), in3^(key64_056-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_032+seed), i3^(key64_040-seed)) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^(key64_016+seed), in1^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = 
(acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^(key64_016+seed), i3^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^(key64_048+seed), i3^(key64_056-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_032+seed), i1^(key64_040-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^(key64_080+seed), i3^(key64_088-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_064+seed), i1^(key64_072-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += mulFold64(i2^(key64_112+seed), i3^(key64_120-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_096+seed), i1^(key64_104-seed)) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125)+seed, readU64(key, i-117)-seed, readU64(key, i-109)+seed, readU64(key, i-101)-seed + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + seed := 0 - seed + acc.Hi += mulFold64(i0^(key64_119+seed), 
i1^(key64_127-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_103+seed), i3^(key64_111-seed)) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalar(&accs, p, secret, u64(l)) + } + + // merge accs + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64.go b/vendor/github.com/zeebo/xxh3/hash64.go new file mode 100644 index 000000000000..13aab9585607 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64.go @@ -0,0 +1,126 @@ +package xxh3 + +import "math/bits" + +// Hash returns the hash of the byte 
slice. +func Hash(b []byte) uint64 { + return hashAny(*(*str)(ptr(&b))) +} + +// Hash returns the hash of the string slice. +func HashString(s string) uint64 { + return hashAny(*(*str)(ptr(&s))) +} + +func hashAny(s str) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: // 4-8 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return 0x2d06800538d394c2 // xxh_avalanche(key64_056 ^ key64_064) + } + + acc ^= u64(key32_000 ^ key32_004) + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^key64_096, readU64(p, 7*8)^key64_104) + acc += mulFold64(readU64(p, ui(l)-8*8)^key64_112, readU64(p, ui(l)-7*8)^key64_120) + } // 96 + acc += mulFold64(readU64(p, 4*8)^key64_064, readU64(p, 5*8)^key64_072) + acc += mulFold64(readU64(p, ui(l)-6*8)^key64_080, readU64(p, ui(l)-5*8)^key64_088) + } // 64 + acc += mulFold64(readU64(p, 2*8)^key64_032, readU64(p, 3*8)^key64_040) + acc += mulFold64(readU64(p, ui(l)-4*8)^key64_048, readU64(p, ui(l)-3*8)^key64_056) + } // 32 + acc += mulFold64(readU64(p, 0*8)^key64_000, readU64(p, 1*8)^key64_008) + acc += mulFold64(readU64(p, ui(l)-2*8)^key64_016, readU64(p, ui(l)-1*8)^key64_024) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * 
prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^key64_000, readU64(p, 0*16+8)^key64_008) + acc += mulFold64(readU64(p, 1*16+0)^key64_016, readU64(p, 1*16+8)^key64_024) + acc += mulFold64(readU64(p, 2*16+0)^key64_032, readU64(p, 2*16+8)^key64_040) + acc += mulFold64(readU64(p, 3*16+0)^key64_048, readU64(p, 3*16+8)^key64_056) + acc += mulFold64(readU64(p, 4*16+0)^key64_064, readU64(p, 4*16+8)^key64_072) + acc += mulFold64(readU64(p, 5*16+0)^key64_080, readU64(p, 5*16+8)^key64_088) + acc += mulFold64(readU64(p, 6*16+0)^key64_096, readU64(p, 6*16+8)^key64_104) + acc += mulFold64(readU64(p, 7*16+0)^key64_112, readU64(p, 7*16+8)^key64_120) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^readU64(key, i-125), readU64(p, i+8)^readU64(key, i-117)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^key64_119, readU64(p, ui(l)-8)^key64_127) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64_seed.go b/vendor/github.com/zeebo/xxh3/hash64_seed.go new file mode 100644 index 000000000000..429994c363ed --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64_seed.go @@ -0,0 +1,134 @@ +package xxh3 + +import "math/bits" + +// HashSeed returns the 
hash of the byte slice with given seed. +func HashSeed(b []byte, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&b)), seed) + +} + +// HashStringSeed returns the hash of the string slice with given seed. +func HashStringSeed(s string, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&s)), seed) +} + +func hashAnySeed(s str, seed uint64) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032 + seed) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048 - seed) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016 - seed) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: + return xxhAvalancheSmall(seed ^ key64_056 ^ key64_064) + } + + acc ^= u64(key32_000^key32_004) + seed + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^(key64_096+seed), readU64(p, 7*8)^(key64_104-seed)) + acc += mulFold64(readU64(p, ui(l)-8*8)^(key64_112+seed), readU64(p, ui(l)-7*8)^(key64_120-seed)) + } // 96 + acc += mulFold64(readU64(p, 4*8)^(key64_064+seed), readU64(p, 5*8)^(key64_072-seed)) + acc += mulFold64(readU64(p, ui(l)-6*8)^(key64_080+seed), readU64(p, ui(l)-5*8)^(key64_088-seed)) + } // 64 + acc += mulFold64(readU64(p, 2*8)^(key64_032+seed), readU64(p, 3*8)^(key64_040-seed)) + acc += mulFold64(readU64(p, ui(l)-4*8)^(key64_048+seed), readU64(p, 
ui(l)-3*8)^(key64_056-seed)) + } // 32 + acc += mulFold64(readU64(p, 0*8)^(key64_000+seed), readU64(p, 1*8)^(key64_008-seed)) + acc += mulFold64(readU64(p, ui(l)-2*8)^(key64_016+seed), readU64(p, ui(l)-1*8)^(key64_024-seed)) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^(key64_000+seed), readU64(p, 0*16+8)^(key64_008-seed)) + acc += mulFold64(readU64(p, 1*16+0)^(key64_016+seed), readU64(p, 1*16+8)^(key64_024-seed)) + acc += mulFold64(readU64(p, 2*16+0)^(key64_032+seed), readU64(p, 2*16+8)^(key64_040-seed)) + acc += mulFold64(readU64(p, 3*16+0)^(key64_048+seed), readU64(p, 3*16+8)^(key64_056-seed)) + acc += mulFold64(readU64(p, 4*16+0)^(key64_064+seed), readU64(p, 4*16+8)^(key64_072-seed)) + acc += mulFold64(readU64(p, 5*16+0)^(key64_080+seed), readU64(p, 5*16+8)^(key64_088-seed)) + acc += mulFold64(readU64(p, 6*16+0)^(key64_096+seed), readU64(p, 6*16+8)^(key64_104-seed)) + acc += mulFold64(readU64(p, 7*16+0)^(key64_112+seed), readU64(p, 7*16+8)^(key64_120-seed)) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^(readU64(key, i-125)+seed), readU64(p, i+8)^(readU64(key, i-117)-seed)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^(key64_119+seed), readU64(p, ui(l)-8)^(key64_127-seed)) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalarSeed(&accs, p, secret, u64(l)) + } + + // merge accs + acc += 
mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hasher.go b/vendor/github.com/zeebo/xxh3/hasher.go new file mode 100644 index 000000000000..d9789980a3f0 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hasher.go @@ -0,0 +1,239 @@ +package xxh3 + +import ( + "encoding/binary" + "hash" +) + +// Hasher implements the hash.Hash interface +type Hasher struct { + acc [8]u64 + blk u64 + len u64 + key ptr + buf [_block + _stripe]byte + seed u64 +} + +var ( + _ hash.Hash = (*Hasher)(nil) + _ hash.Hash64 = (*Hasher)(nil) +) + +// New returns a new Hasher that implements the hash.Hash interface. +func New() *Hasher { + return new(Hasher) +} + +// NewSeed returns a new Hasher that implements the hash.Hash interface. +func NewSeed(seed uint64) *Hasher { + var h Hasher + h.Reset() + h.seed = seed + h.key = key + + // Only initiate once, not on reset. + if seed != 0 { + h.key = ptr(&[secretSize]byte{}) + initSecret(h.key, seed) + } + return &h +} + +// Reset resets the Hash to its initial state. +func (h *Hasher) Reset() { + h.acc = [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + h.blk = 0 + h.len = 0 +} + +// BlockSize returns the hash's underlying block size. +// The Write method will accept any amount of data, but +// it may operate more efficiently if all writes are a +// multiple of the block size. +func (h *Hasher) BlockSize() int { return _stripe } + +// Size returns the number of bytes Sum will return. +func (h *Hasher) Size() int { return 8 } + +// Sum appends the current hash to b and returns the resulting slice. +// It does not change the underlying hash state. 
+func (h *Hasher) Sum(b []byte) []byte { + var tmp [8]byte + binary.BigEndian.PutUint64(tmp[:], h.Sum64()) + return append(b, tmp[:]...) +} + +// Write adds more data to the running hash. +// It never returns an error. +func (h *Hasher) Write(buf []byte) (int, error) { + h.update(buf) + return len(buf), nil +} + +// WriteString adds more data to the running hash. +// It never returns an error. +func (h *Hasher) WriteString(buf string) (int, error) { + h.updateString(buf) + return len(buf), nil +} + +func (h *Hasher) update(buf []byte) { + // relies on the data pointer being the first word in the string header + h.updateString(*(*string)(ptr(&buf))) +} + +func (h *Hasher) updateString(buf string) { + if h.key == nil { + h.key = key + h.Reset() + } + + // On first write, if more than 1 block, process without copy. + for h.len == 0 && len(buf) > len(h.buf) { + if hasAVX2 { + accumBlockAVX2(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else { + accumBlockScalar(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } + buf = buf[_block:] + h.blk++ + } + + for len(buf) > 0 { + if h.len < u64(len(h.buf)) { + n := copy(h.buf[h.len:], buf) + h.len += u64(n) + buf = buf[n:] + continue + } + + if hasAVX2 { + accumBlockAVX2(&h.acc, ptr(&h.buf), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, ptr(&h.buf), h.key) + } else { + accumBlockScalar(&h.acc, ptr(&h.buf), h.key) + } + + h.blk++ + h.len = _stripe + copy(h.buf[:_stripe], h.buf[_block:]) + } +} + +// Sum64 returns the 64-bit hash of the written data. +func (h *Hasher) Sum64() uint64 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash(h.buf[:h.len]) + } + return HashSeed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := l * prime64_1 + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. 
+ if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + } else { + secret := h.key + acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + } + + acc = xxh3Avalanche(acc) + + return acc +} + +// Sum128 returns the 128-bit hash of the written data. +func (h *Hasher) Sum128() Uint128 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash128(h.buf[:h.len]) + } + return Hash128Seed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := Uint128{Lo: l * prime64_1, Hi: ^(l * prime64_2)} + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. 
+ if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173) + } else { + secret := h.key + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + } + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc +} diff --git a/vendor/github.com/zeebo/xxh3/utils.go b/vendor/github.com/zeebo/xxh3/utils.go new file mode 100644 index 000000000000..a837e68a6216 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/utils.go @@ -0,0 +1,129 @@ +package xxh3 + +import ( + "math/bits" + "unsafe" +) + +// Uint128 is a 128 bit value. +// The actual value can be thought of as u.Hi<<64 | u.Lo. 
+type Uint128 struct { + Hi, Lo uint64 +} + +// Bytes returns the uint128 as an array of bytes in canonical form (big-endian encoded). +func (u Uint128) Bytes() [16]byte { + return [16]byte{ + byte(u.Hi >> 0x38), byte(u.Hi >> 0x30), byte(u.Hi >> 0x28), byte(u.Hi >> 0x20), + byte(u.Hi >> 0x18), byte(u.Hi >> 0x10), byte(u.Hi >> 0x08), byte(u.Hi), + byte(u.Lo >> 0x38), byte(u.Lo >> 0x30), byte(u.Lo >> 0x28), byte(u.Lo >> 0x20), + byte(u.Lo >> 0x18), byte(u.Lo >> 0x10), byte(u.Lo >> 0x08), byte(u.Lo), + } +} + +type ( + ptr = unsafe.Pointer + ui = uintptr + + u8 = uint8 + u32 = uint32 + u64 = uint64 + u128 = Uint128 +) + +type str struct { + p ptr + l uint +} + +func readU8(p ptr, o ui) uint8 { + return *(*uint8)(ptr(ui(p) + o)) +} + +func readU16(p ptr, o ui) uint16 { + b := (*[2]byte)(ptr(ui(p) + o)) + return uint16(b[0]) | uint16(b[1])<<8 +} + +func readU32(p ptr, o ui) uint32 { + b := (*[4]byte)(ptr(ui(p) + o)) + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func readU64(p ptr, o ui) uint64 { + b := (*[8]byte)(ptr(ui(p) + o)) + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func writeU64(p ptr, o ui, v u64) { + b := (*[8]byte)(ptr(ui(p) + o)) + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} + +const secretSize = 192 + +func initSecret(secret ptr, seed u64) { + for i := ui(0); i < secretSize/16; i++ { + lo := readU64(key, 16*i) + seed + hi := readU64(key, 16*i+8) - seed + writeU64(secret, 16*i, lo) + writeU64(secret, 16*i+8, hi) + } +} + +func xxh64AvalancheSmall(x u64) u64 { + // x ^= x >> 33 // x must be < 32 bits + // x ^= u64(key32_000 ^ key32_004) // caller must do this + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxhAvalancheSmall(x u64) u64 { + 
x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh64AvalancheFull(x u64) u64 { + x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh3Avalanche(x u64) u64 { + x ^= x >> 37 + x *= 0x165667919e3779f9 + x ^= x >> 32 + return x +} + +func rrmxmx(h64 u64, len u64) u64 { + h64 ^= bits.RotateLeft64(h64, 49) ^ bits.RotateLeft64(h64, 24) + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 35) + len + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 28) + return h64 +} + +func mulFold64(x, y u64) u64 { + hi, lo := bits.Mul64(x, y) + return hi ^ lo +} diff --git a/vendor/modules.txt b/vendor/modules.txt index e8bb3ac0feae..2ab973d4ddac 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -463,6 +463,9 @@ github.com/klauspost/compress/internal/cpuinfo github.com/klauspost/compress/internal/snapref github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd/internal/xxhash +# github.com/klauspost/cpuid/v2 v2.0.9 +## explicit; go 1.13 +github.com/klauspost/cpuid/v2 # github.com/mailru/easyjson v0.7.6 ## explicit; go 1.12 github.com/mailru/easyjson/buffer @@ -705,6 +708,9 @@ github.com/zclconf/go-cty/cty/function/stdlib github.com/zclconf/go-cty/cty/gocty github.com/zclconf/go-cty/cty/json github.com/zclconf/go-cty/cty/set +# github.com/zeebo/xxh3 v1.0.2 +## explicit; go 1.17 +github.com/zeebo/xxh3 # go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.45.0 ## explicit; go 1.19 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc