decoder.go

package csvutil

import (
	"errors"
	"io"
	"reflect"
)

// decField describes how a single matched struct field is decoded. It couples
// the embedded field metadata and decodeFunc with the header column the field
// was matched to.
//
// columnIndex is used to index both the current record and the header. zero is
// the value handed to Decoder.Map as its v argument; it is only populated when
// Decoder.Map is non-nil at the time the field list is built.
type decField struct {
	columnIndex int
	field
	decodeFunc
	zero any
}

// A Decoder reads and decodes string records into structs.
type Decoder struct {
	// Tag defines which key in the struct field's tag to scan for names and
	// options (Default: 'csv').
	Tag string

	// If true, Decoder will return a MissingColumnsError if it discovers
	// that any of the columns are missing. This means that a CSV input
	// will be required to contain all columns that were defined in the
	// provided struct.
	DisallowMissingColumns bool

	// AlignRecord will cause Decoder to align the returned record slice to the
	// header in case Reader returns records of different lengths.
	//
	// This flag is supposed to work with csv.Reader.FieldsPerRecord set to -1
	// which may cause this behavior.
	//
	// When the header is longer than the record, the missing trailing values
	// are populated with empty strings.
	//
	// When the header is shorter than the record, the record is sliced to match
	// the header's length.
	//
	// When this flag is used, Decoder will never return ErrFieldCount.
	AlignRecord bool

	// If not nil, Map is a function that is called for each field in the csv
	// record before decoding the data. It allows mapping certain string values
	// for specific columns or types to a known format. Decoder calls Map with
	// the current column name (taken from header) and a zero non-pointer value
	// of a type to which it is going to decode data into. Implementations
	// should use type assertions to recognize the type.
	//
	// A good use case for Map is when NaN values are represented by a string
	// such as 'n/a': a Map function that handles all floats could map 'n/a'
	// back into 'NaN' to allow successful decoding.
	//
	// Use Map with caution. If the requirements of column or type are not met
	// Map should return 'field', since it is the original value that was
	// read from the csv input; this indicates no change.
	//
	// If the struct field is an interface, v will be of type string, unless the
	// struct field contains a settable pointer value - then v will be a zero
	// value of that type.
	//
	// Map must be set before the first call to Decode and not changed after it.
	Map func(field, col string, v any) string

	// Unexported state:
	//   - r is the source of records (and of the header when none was given).
	//   - typeKey identifies the tag/type pair that cache was built for; cache
	//     holds the matched decode fields for that pair.
	//   - hmap maps a header column name to its index in header.
	//   - header is the Decoder's own copy of the header columns.
	//   - record is the most recently read record, after optional alignment.
	//   - unused lists header column indexes that matched no struct field the
	//     last time the field list was built.
	//   - funcMap and ifaceFuncs hold custom decode functions, keyed by argument
	//     type and, for interface argument types, in registration order.
	r          Reader
	typeKey    typeKey
	hmap       map[string]int
	header     []string
	record     []string
	cache      []decField
	unused     []int
	funcMap    map[reflect.Type]func([]byte, any) error
	ifaceFuncs []ifaceDecodeFunc
}

// ifaceDecodeFunc pairs a custom decode function with the interface type it
// was registered for. Entries are only created for functions whose argument
// type is an interface.
type ifaceDecodeFunc struct {
	f       func([]byte, any) error
	argType reflect.Type
}

// NewDecoder creates a Decoder that pulls records from r.
//
// Struct fields are matched against the given header. When no header is
// supplied, NewDecoder reads a single record from r and uses it as the header.
//
// The header is copied, so later modifications of the caller's slice do not
// affect the Decoder.
//
// Records coming from r must be of the same length as the header.
//
// NewDecoder may return io.EOF if there is no data in r and no header was
// provided by the caller.
func NewDecoder(r Reader, header ...string) (dec *Decoder, err error) {
	if len(header) == 0 {
		if header, err = r.Read(); err != nil {
			return nil, err
		}
	}

	// Keep a private copy so that neither the caller nor the Reader can
	// mutate the header behind the Decoder's back.
	columns := append(make([]string, 0, len(header)), header...)

	positions := make(map[string]int, len(columns))
	for pos := range columns {
		positions[columns[pos]] = pos
	}

	dec = &Decoder{
		r:      r,
		header: columns,
		hmap:   positions,
		unused: make([]int, 0, len(columns)),
	}
	return dec, nil
}

// Decode reads the next string record or records from its input and stores it
// in the value pointed to by v which must be a non-nil pointer to a struct,
// struct slice or struct array. For any other v Decode returns an
// *InvalidDecodeError.
//
// Decode matches all exported struct fields based on the header. Struct fields
// can be adjusted by using tags.
//
// The "omitempty" option specifies that the field should be omitted from
// the decoding if record's field is an empty string.
//
// Examples of struct field tags and their meanings:
//
//	// Decode matches this field with "myName" header column.
//	Field int `csv:"myName"`
//
//	// Decode matches this field with "Field" header column.
//	Field int
//
//	// Decode matches this field with "myName" header column and decoding is not
//	// called if record's field is an empty string.
//	Field int `csv:"myName,omitempty"`
//
//	// Decode matches this field with "Field" header column and decoding is not
//	// called if record's field is an empty string.
//	Field int `csv:",omitempty"`
//
//	// Decode ignores this field.
//	Field int `csv:"-"`
//
//	// Decode treats this field exactly as if it was an embedded field and
//	// matches header columns that start with "my_prefix_" to all fields of this
//	// type.
//	Field Struct `csv:"my_prefix_,inline"`
//
//	// Decode treats this field exactly as if it was an embedded field.
//	Field Struct `csv:",inline"`
//
// By default Decode looks for the "csv" tag, but this can be changed by setting
// the Decoder.Tag field.
//
// To Decode into a custom type v must implement csvutil.Unmarshaler or
// encoding.TextUnmarshaler.
//
// Anonymous struct fields with tags are treated like normal fields and they
// must implement csvutil.Unmarshaler or encoding.TextUnmarshaler unless inline
// tag is specified.
//
// Anonymous struct fields without tags are populated just as if they were
// part of the main struct. However, fields in the main struct have bigger
// priority and they are populated first. If main struct and anonymous struct
// field have the same fields, the main struct's fields will be populated.
//
// Fields of type []byte expect the data to be base64 encoded strings.
//
// Float fields are decoded to NaN if a string value is 'NaN'. This check
// is case insensitive.
//
// Interface fields are decoded to strings unless they contain settable pointer
// value.
//
// Pointer fields are decoded to nil if a string value is empty.
//
// If v is a slice, Decode resets it and reads the input until EOF, storing all
// decoded values in the given slice. Decode returns nil on EOF unless there
// were no records decoded, in which case it returns io.EOF.
//
// If v is an array, Decode reads the input until EOF or until it decodes all
// corresponding array elements. If the input contains less elements than the
// array, the additional Go array elements are set to zero values. Decode
// returns nil on EOF unless there were no records decoded.
//
// Fields with inline tags that have a non-empty prefix must not be cyclic
// structures. Passing such values to Decode will result in an infinite loop.
func (d *Decoder) Decode(v any) (err error) {
	ptr := reflect.ValueOf(v)
	if ptr.Kind() != reflect.Ptr || ptr.IsNil() {
		return &InvalidDecodeError{Type: reflect.TypeOf(v)}
	}

	// Follow (and allocate) pointers until the value that actually receives
	// the decoded data is reached.
	target := indirect(ptr.Elem())

	switch kind := target.Kind(); kind {
	case reflect.Struct:
		return d.decodeStruct(target)
	case reflect.Slice:
		return d.decodeSlice(target)
	case reflect.Array:
		return d.decodeArray(target)
	case reflect.Interface, reflect.Invalid:
		// Report the innermost type that can still be determined; fall back
		// to the type of v itself when there is none.
		if inner := walkValue(target); inner.Kind() != reflect.Invalid {
			return &InvalidDecodeError{Type: inner.Type()}
		}
		return &InvalidDecodeError{Type: ptr.Type()}
	default:
		return &InvalidDecodeError{Type: reflect.PtrTo(target.Type())}
	}
}

// Record returns the most recently read record. The slice is valid until the
// next call to Decode.
//
// If AlignRecord is set and the record's length differed from the header's,
// the returned slice is the aligned record: truncated to the header's length
// or padded with empty strings.
func (d *Decoder) Record() []string {
	return d.record
}

// Header returns a copy of the header used by the Decoder: either the first
// record that came from the reader, or the header supplied by the caller.
// Modifying the returned slice does not affect the Decoder.
func (d *Decoder) Header() []string {
	return append(make([]string, 0, len(d.header)), d.header...)
}

// NormalizeHeader rewrites every header column with the result of f. It
// returns an error, leaving the header untouched, if two or more columns
// normalize to the same name.
//
// NormalizeHeader must be called before Decode.
func (d *Decoder) NormalizeHeader(f func(string) string) error {
	normalized := make(map[string]int, len(d.header))
	for pos := range d.header {
		normalized[f(d.header[pos])] = pos
	}

	// Fewer distinct names than columns means at least two columns collided.
	if len(d.header) != len(normalized) {
		return errors.New("csvutil: normalize header results in conflicting columns")
	}

	for name, pos := range normalized {
		d.header[pos] = name
	}
	d.hmap = normalized
	return nil
}

// Unused reports the indexes of header columns that were not used during
// decoding because no struct field matched them. The returned slice is a copy;
// it is nil when there are no unused columns.
func (d *Decoder) Unused() []int {
	n := len(d.unused)
	if n == 0 {
		return nil
	}
	return append(make([]int, 0, n), d.unused...)
}

// Register adds a custom decoding function for a concrete type or interface.
// The argument f must be of type:
//
//	func([]byte, T) error
//
// T must be a concrete type such as *time.Time, or interface that has at least one
// method.
//
// During decoding, fields are matched by the concrete type first. If match is not
// found then Decoder looks if field implements any of the registered interfaces
// in order they were registered.
//
// Register panics if:
//   - f does not match the right signature
//   - T is an empty interface
//   - a function for T was already registered
//
// Register is based on the encoding/json proposal:
// https://github.com/golang/go/issues/5901.
//
// Deprecated: use UnmarshalFunc function with type parameter instead. The benefits
// are type safety and much better performance.
func (d *Decoder) Register(f any) {
	fnValue := reflect.ValueOf(f)
	fnType := fnValue.Type()

	// The checks are ordered so that NumIn/NumOut/In/Out are only consulted
	// once f is known to be a function.
	validSignature := fnType.Kind() == reflect.Func &&
		fnType.NumIn() == 2 && fnType.NumOut() == 1 &&
		fnType.In(0) == _bytes && fnType.Out(0) == _error
	if !validSignature {
		panic("csvutil: func must be of type func([]byte, T) error")
	}

	argType := fnType.In(1)
	argKind := argType.Kind()

	if argKind == reflect.Interface && argType.NumMethod() == 0 {
		panic("csvutil: func argument type must not be an empty interface")
	}

	if d.funcMap == nil {
		d.funcMap = make(map[reflect.Type]func([]byte, any) error)
	}

	if _, registered := d.funcMap[argType]; registered {
		panic("csvutil: func " + fnType.String() + " already registered")
	}

	// adapter bridges the untyped func([]byte, any) error used internally to
	// the typed function provided by the caller.
	adapter := func(data []byte, in any) error {
		arg := reflect.ValueOf(in)

		switch {
		case argKind == reflect.Interface && !arg.IsValid():
			// A nil value cannot satisfy the registered interface.
			return &UnmarshalTypeError{Value: string(data), Type: argType}
		case argKind == reflect.Ptr && arg.Kind() != reflect.Pointer:
			// NOTE(review): values produced by reflect.ValueOf are not
			// addressable, so Addr would panic if this branch is reached;
			// presumably a pointer is always passed here - confirm.
			arg = arg.Addr()
		}

		results := fnValue.Call([]reflect.Value{reflect.ValueOf(data), arg})
		err, _ := results[0].Interface().(error)
		return err
	}

	d.funcMap[argType] = adapter

	if argKind == reflect.Interface {
		d.ifaceFuncs = append(d.ifaceFuncs, ifaceDecodeFunc{
			f:       adapter,
			argType: argType,
		})
	}
}

// WithUnmarshalers makes the decoder use the custom decoding functions stored
// in u, replacing any functions it held before.
//
// WithUnmarshalers is based on the encoding/json proposal:
// https://github.com/golang/go/issues/5901.
func (d *Decoder) WithUnmarshalers(u *Unmarshalers) {
	d.funcMap, d.ifaceFuncs = u.funcMap, u.ifaceFuncs
}

// decodeSlice resets slice to zero length and then decodes records into it
// until the reader reports io.EOF. The slice's elements must be structs
// according to walkType, otherwise an *InvalidDecodeError is returned.
//
// It returns io.EOF when EOF is hit before any record was decoded, and nil
// when EOF is hit after at least one record. On any other error the partially
// decoded element is still appended before the error is returned.
func (d *Decoder) decodeSlice(slice reflect.Value) error {
	elemType := slice.Type().Elem()
	if walkType(elemType).Kind() != reflect.Struct {
		return &InvalidDecodeError{Type: reflect.PtrTo(slice.Type())}
	}

	slice.SetLen(0)

	decoded := 0
	for {
		item := reflect.New(elemType)

		err := d.decodeStruct(indirect(item))
		if err == io.EOF {
			if decoded == 0 {
				return io.EOF
			}
			// Clip the capacity to the number of decoded elements.
			slice.Set(slice.Slice3(0, decoded, decoded))
			return nil
		}

		// we want to ensure that we append this element to the slice even if it
		// was partially decoded due to error. This is how JSON pkg does it.
		slice.Set(reflect.Append(slice, item.Elem()))
		if err != nil {
			return err
		}
		decoded++
	}
}

// decodeArray decodes one record into each element of the array v, stopping at
// io.EOF or when the array is full. The array's elements must be structs
// according to walkType, otherwise an *InvalidDecodeError is returned.
//
// Elements that did not receive a record are reset to their zero value. It
// returns io.EOF when EOF is hit before the first element was decoded.
func (d *Decoder) decodeArray(v reflect.Value) error {
	elemType := v.Type().Elem()
	if walkType(elemType).Kind() != reflect.Struct {
		return &InvalidDecodeError{Type: reflect.PtrTo(v.Type())}
	}

	size := v.Len()
	filled := 0
	for filled < size {
		err := d.decodeStruct(indirect(v.Index(filled)))
		if err == io.EOF {
			if filled == 0 {
				return io.EOF
			}
			break
		}
		if err != nil {
			return err
		}
		filled++
	}

	// Clear whatever is left over, including an element that indirect may
	// have allocated right before EOF was reported.
	blank := reflect.Zero(elemType)
	for rest := filled; rest < size; rest++ {
		v.Index(rest).Set(blank)
	}
	return nil
}

// decodeStruct reads a single record from the reader, stores it in d.record
// and unmarshals it into the struct value v.
//
// A record whose length differs from the header's results in ErrFieldCount,
// unless AlignRecord is set; in that case the record is truncated or padded
// with empty strings to the header's length first.
func (d *Decoder) decodeStruct(v reflect.Value) (err error) {
	if d.record, err = d.r.Read(); err != nil {
		return err
	}

	want, got := len(d.header), len(d.record)
	switch {
	case got == want:
		// nothing to align
	case !d.AlignRecord:
		return ErrFieldCount
	case got > want:
		d.record = d.record[:want]
	default:
		d.record = append(d.record, make([]string, want-got)...)
	}

	return d.unmarshal(d.record, v)
}

// unmarshal decodes record into the struct value v, one matched field at a
// time. Nil pointers on the path to a field are allocated as needed, and the
// optional Map function is applied to the raw value before it is decoded.
// Decoding errors are wrapped with wrapDecodeError.
func (d *Decoder) unmarshal(record []string, v reflect.Value) error {
	fields, err := d.fields(typeKey{d.tag(), v.Type()})
	if err != nil {
		return err
	}

nextField:
	for _, f := range fields {
		raw := record[f.columnIndex]
		blank := raw == ""
		if blank && f.tag.omitEmpty {
			continue
		}

		leaf := len(f.index) - 1
		fv := v
		for depth, idx := range f.index {
			fv = fv.Field(idx)
			if fv.Kind() != reflect.Ptr {
				continue
			}

			// An empty value only short-circuits decoding on the leaf.
			blankLeaf := blank && depth == leaf

			if fv.IsNil() {
				if blankLeaf {
					continue nextField
				}
				// this can happen if a field is an unexported embedded
				// pointer type. In Go prior to 1.10 it was possible to
				// set such value because of a bug in the reflect package
				// https://github.com/golang/go/issues/21353
				if !fv.CanSet() {
					return errPtrUnexportedStruct(fv.Type())
				}
				fv.Set(reflect.New(fv.Type().Elem()))
			}

			if blankLeaf {
				// Empty value for a pointer leaf: reset it to nil.
				fv.Set(reflect.Zero(fv.Type()))
				continue nextField
			}

			if depth != leaf {
				fv = fv.Elem() // walk pointer until we are on the leaf.
			}
		}

		if d.Map != nil && f.zero != nil {
			mapArg := f.zero
			if target := walkPtr(fv); target.Kind() == reflect.Interface && !target.IsNil() {
				if inner := walkValue(target); inner.CanSet() {
					mapArg = reflect.Zero(inner.Type()).Interface()
				}
			}
			raw = d.Map(raw, d.header[f.columnIndex], mapArg)
		}

		if err := f.decodeFunc(raw, fv); err != nil {
			return wrapDecodeError(d.r, d.header[f.columnIndex], f.columnIndex, err)
		}
	}
	return nil
}

// wrapDecodeError wraps err in a *DecodeError that carries more context:
//   - column name (field)
//   - line number
//   - column within record
//
// Line and Column are filled in only if r has a 'FieldPos' method, which is
// available e.g. in csv.Reader (since Go1.17).
//
// The caller should use errors.As in order to fetch the original error.
func wrapDecodeError(r Reader, field string, fieldIndex int, err error) error {
	type fieldPositioner interface {
		FieldPos(fieldIndex int) (line, column int)
	}

	wrapped := &DecodeError{
		Field: field,
		Err:   err,
	}
	if fp, ok := r.(fieldPositioner); ok {
		wrapped.Line, wrapped.Column = fp.FieldPos(fieldIndex)
	}
	return wrapped
}

// fields returns the decode fields for the given type key, i.e. the struct
// fields that have a matching header column together with their decode
// functions. The result for the most recent key is cached on the Decoder.
//
// Building the list also refreshes d.unused. If DisallowMissingColumns is set
// and some struct fields have no matching column, a *MissingColumnsError is
// returned and neither the cache nor d.unused is modified.
func (d *Decoder) fields(k typeKey) ([]decField, error) {
	if d.typeKey == k {
		return d.cache, nil
	}

	structFields := cachedFields(k)
	matched := make([]decField, 0, len(structFields))
	columnUsed := make([]bool, len(d.header))
	var missing []string

	for _, sf := range structFields {
		col, found := d.hmap[sf.name]
		if !found {
			if d.DisallowMissingColumns {
				missing = append(missing, sf.name)
			}
			continue
		}

		decode, err := decodeFn(sf.baseType, d.funcMap, d.ifaceFuncs)
		if err != nil {
			return nil, err
		}

		entry := decField{
			columnIndex: col,
			field:       sf,
			decodeFunc:  decode,
		}

		// The zero value is only needed as an argument for Map.
		if d.Map != nil {
			if sf.typ.Kind() == reflect.Interface {
				entry.zero = "" // interface values are decoded to strings
			} else {
				entry.zero = reflect.Zero(walkType(sf.typ)).Interface()
			}
		}

		matched = append(matched, entry)
		columnUsed[col] = true
	}

	if len(missing) > 0 {
		return nil, &MissingColumnsError{
			Columns: missing,
		}
	}

	d.unused = d.unused[:0]
	for col, isUsed := range columnUsed {
		if isUsed {
			continue
		}
		d.unused = append(d.unused, col)
	}

	d.cache, d.typeKey = matched, k
	return d.cache, nil
}

// tag returns the struct tag key to scan: Decoder.Tag when it is set,
// defaultTag otherwise.
func (d *Decoder) tag() string {
	if name := d.Tag; name != "" {
		return name
	}
	return defaultTag
}

// indirect follows pointers starting at v until it reaches a non-pointer
// value, allocating a new value for every nil pointer on the way.
//
// Interfaces are only looked through when they hold a non-nil pointer; a nil
// interface, or an interface holding anything else, is returned as is.
func indirect(v reflect.Value) reflect.Value {
	for {
		kind := v.Kind()

		if kind == reflect.Ptr {
			if v.IsNil() {
				v.Set(reflect.New(v.Type().Elem()))
			}
			v = v.Elem()
			continue
		}

		if kind != reflect.Interface || v.IsNil() {
			return v
		}

		inner := v.Elem()
		if inner.Kind() != reflect.Ptr || inner.IsNil() {
			return v
		}
		v = inner
	}
}

// Unmarshalers stores custom unmarshal functions. Unmarshalers is immutable.
//
// Values are created with UnmarshalFunc and combined with NewUnmarshalers.
// funcMap holds the functions keyed by their argument type; ifaceFuncs
// additionally lists the functions whose argument type is an interface.
//
// Unmarshalers are based on the encoding/json proposal:
// https://github.com/golang/go/issues/5901.
type Unmarshalers struct {
	funcMap    map[reflect.Type]func([]byte, any) error
	ifaceFuncs []ifaceDecodeFunc
}

// NewUnmarshalers combines the provided Unmarshalers into a single new value.
// If Unmarshalers contain duplicate function signatures, the one that was
// provided first wins.
func NewUnmarshalers(us ...*Unmarshalers) *Unmarshalers {
	merged := make(map[reflect.Type]func([]byte, any) error)
	var ifaces []ifaceDecodeFunc

	for _, u := range us {
		for argType, fn := range u.funcMap {
			// Keep the function from the earliest Unmarshalers.
			if _, taken := merged[argType]; !taken {
				merged[argType] = fn
			}
		}
		ifaces = append(ifaces, u.ifaceFuncs...)
	}

	return &Unmarshalers{
		funcMap:    merged,
		ifaceFuncs: ifaces,
	}
}

// UnmarshalFunc wraps the provided function in a new Unmarshalers value and
// returns it.
//
// Type Parameter T must be a concrete type such as *time.Time, or interface
// that has at least one method.
//
// During decoding, fields are matched by the concrete type first. If match is not
// found then Decoder looks if field implements any of the registered interfaces
// in order they were registered.
//
// UnmarshalFunc panics if T is an empty interface.
func UnmarshalFunc[T any](f func([]byte, T) error) *Unmarshalers {
	argType := reflect.TypeOf(f).In(1)
	isIface := argType.Kind() == reflect.Interface

	// adapter converts the untyped value used internally back to T.
	adapter := func(data []byte, v any) error {
		if !isIface {
			return f(data, v.(T))
		}
		arg, ok := v.(T)
		if !ok {
			// The value does not implement the interface T.
			return &UnmarshalTypeError{Value: string(data), Type: argType}
		}
		return f(data, arg)
	}

	out := &Unmarshalers{
		funcMap: map[reflect.Type]func([]byte, any) error{argType: adapter},
	}

	if isIface {
		if argType.NumMethod() == 0 {
			panic("csvutil: func argument type must not be an empty interface")
		}
		out.ifaceFuncs = append(out.ifaceFuncs, ifaceDecodeFunc{
			f:       adapter,
			argType: argType,
		})
	}

	return out
}