Skip to content

Commit

Permalink
seqGetData()
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengxwen committed Mar 21, 2024
1 parent 33766e5 commit 5dc2dc0
Show file tree
Hide file tree
Showing 4 changed files with 217 additions and 16 deletions.
10 changes: 9 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,15 @@ UTILITIES
o tweak the display of progress information in `seqVCF2GDS()`

o `seqVCF_Header(, getnum=TRUE, verbose=TRUE)` to show the progress
information for scanning the file
information for scanning the VCF file

o new `seqGetData(, "$dosage_alt2")` and `seqGetData(, "$dosage_sp2")` for
sex chromosomes, when the alleles are partially missing

BUG FIXES

o `seqGetData(, "$dosage_alt")` and `seqGetData(, "$dosage_sp")` work
correctly when the ploidy is >2 and there are missing alleles


CHANGES IN VERSION 1.42.1
Expand Down
143 changes: 142 additions & 1 deletion src/GetData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// GetData.cpp: Get data from a SeqArray GDS file
//
// Copyright (C) 2015-2022 Xiuwen Zheng
// Copyright (C) 2015-2024 Xiuwen Zheng
//
// This file is part of SeqArray.
//
Expand Down Expand Up @@ -61,7 +61,9 @@ static const string VAR_PHASE("phase");
// variable list: internally generated
static const string VAR_DOSAGE("$dosage");
static const string VAR_DOSAGE_ALT("$dosage_alt");
static const string VAR_DOSAGE_ALT2("$dosage_alt2");
static const string VAR_DOSAGE_SP("$dosage_sp");
static const string VAR_DOSAGE_SP2("$dosage_sp2");
static const string VAR_NUM_ALLELE("$num_allele");
static const string VAR_REF_ALLELE("$ref");
static const string VAR_ALT_ALLELE("$alt");
Expand Down Expand Up @@ -297,6 +299,40 @@ static SEXP get_dosage_alt(CFileInfo &File, TVarMap &Var, void *param)
return rv_ans;
}

/// get dosage of alternative allele(s) from 'genotype/data'
static SEXP get_dosage_alt2(CFileInfo &File, TVarMap &Var, void *param)
{
const TParam *P = (const TParam*)param;
SEXP rv_ans = R_NilValue;
const ssize_t nSample = File.SampleSelNum();
const ssize_t nVariant = File.VariantSelNum();
if ((nSample > 0) && (nVariant > 0))
{
// initialize GDS genotype Node
CApply_Variant_Dosage NodeVar(File, false, true);
if (!get_geno_is_i32(P, NodeVar))
{
rv_ans = PROTECT(allocMatrix(RAWSXP, nSample, nVariant));
C_UInt8 *base = (C_UInt8 *)RAW(rv_ans);
do {
NodeVar.ReadDosageAlt_p(base);
base += nSample;
} while (NodeVar.Next());
} else {
rv_ans = PROTECT(allocMatrix(INTSXP, nSample, nVariant));
int *base = INTEGER(rv_ans);
do {
NodeVar.ReadDosageAlt_p(base);
base += nSample;
} while (NodeVar.Next());
}
SET_DIMNAMES(rv_ans, R_Dosage_Name);
UNPROTECT(1);
}
// output
return rv_ans;
}

inline static void check_vector_size(size_t n)
{
#ifdef COREARRAY_REGISTER_BIT64
Expand Down Expand Up @@ -404,6 +440,105 @@ static SEXP get_dosage_sp(CFileInfo &File, TVarMap &Var, void *param)
return rv_ans;
}

/// get sparse form of dosage of alternative allele(s), allow partial missing
static SEXP get_dosage_sp2(CFileInfo &File, TVarMap &Var, void *param)
{
SEXP rv_ans = R_NilValue;
const ssize_t nSample = File.SampleSelNum();
const ssize_t nVariant = File.VariantSelNum();
if ((nSample > 0) && (nVariant > 0))
{
// initialize GDS genotype Node
CApply_Variant_Dosage NodeVar(File, false, true);
const bool isI32 = NodeVar.NeedIntType();
// dgCMatrix@x, @i, @p
SEXP x_r, i_r;
SEXP p_r = PROTECT(NEW_INTEGER(nVariant+1));
int *p = INTEGER(p_r), i_col;
p[0] = i_col = 0;
// use RAW or integer
if (!isI32 && nSample<16777216) // 2^24
{
SEXP buffer = PROTECT(NEW_RAW(nSample));
C_UInt8 *g_buf = (C_UInt8*)RAW(buffer);
// dgCMatrix@i & @x (i << 8 | x)
vector<C_UInt32> i_x;
i_x.reserve(nSample);
do {
NodeVar.ReadDosageAlt_p(g_buf);
// get # of nonzero
size_t nnzero = vec_i8_count((char *)g_buf, nSample, 0);
i_x.reserve(i_x.size() + nnzero);
// fill i & x
for (ssize_t j=0; j < nSample; j++)
{
C_UInt8 g = g_buf[j];
if (g != 0)
i_x.push_back((C_UInt32(j) << 8) | g);
}
// update p
p[++i_col] = i_x.size();
} while (NodeVar.Next());
UNPROTECT(1); // free buffer
const size_t n = i_x.size();
check_vector_size(n);
// dgCMatrix@x & @i
x_r = PROTECT(NEW_NUMERIC(n));
i_r = PROTECT(NEW_INTEGER(n));
double *x_r_p = REAL(x_r);
int *i_r_p = INTEGER(i_r);
for (size_t i=0; i < n; i++)
{
C_UInt32 v = i_x[i];
C_UInt8 g = v;
x_r_p[i] = (g != NA_RAW) ? g : NA_REAL;
i_r_p[i] = v >> 8;
}
} else {
SEXP buffer = PROTECT(NEW_INTEGER(nSample));
int *g_buf = INTEGER(buffer);
// dgCMatrix@i & @x (i << 32 | x)
vector<C_UInt64> i_x;
i_x.reserve(nSample);
do {
NodeVar.ReadDosageAlt_p(g_buf);
// get # of nonzero
size_t nnzero = vec_i32_count(g_buf, nSample, 0);
i_x.reserve(i_x.size() + nnzero);
// fill i & x
for (ssize_t j=0; j < nSample; j++)
{
int g = g_buf[j];
if (g != 0)
i_x.push_back((C_UInt64(j) << 32) | C_UInt32(g));
}
// update p
p[++i_col] = i_x.size();
} while (NodeVar.Next());
UNPROTECT(1); // free buffer
const size_t n = i_x.size();
check_vector_size(n);
// dgCMatrix@x & @i
x_r = PROTECT(NEW_NUMERIC(n));
i_r = PROTECT(NEW_INTEGER(n));
double *x_r_p = REAL(x_r);
int *i_r_p = INTEGER(i_r);
for (size_t i=0; i < n; i++)
{
C_UInt64 v = i_x[i];
int g = v;
x_r_p[i] = (g != NA_INTEGER) ? g : NA_REAL;
i_r_p[i] = v >> 32;
}
}
// new dgCMatrix
rv_ans = GDS_New_SpCMatrix2(x_r, i_r, p_r, nSample, nVariant);
UNPROTECT(3);
}
// output
return rv_ans;
}

/// get the number of alleles for each variant
static SEXP get_num_allele(CFileInfo &File, TVarMap &Var, void *param)
{
Expand Down Expand Up @@ -1013,9 +1148,15 @@ COREARRAY_DLL_LOCAL TVarMap &VarGetStruct(CFileInfo &File, const string &name)
} else if (name == VAR_DOSAGE_ALT)
{
vm.Func = get_dosage_alt;
} else if (name == VAR_DOSAGE_ALT2)
{
vm.Func = get_dosage_alt2;
} else if (name == VAR_DOSAGE_SP)
{
vm.Func = get_dosage_sp;
} else if (name == VAR_DOSAGE_SP2)
{
vm.Func = get_dosage_sp2;
} else if (name == VAR_NUM_ALLELE)
{
vm.Init(File, VAR_ALLELE, get_num_allele);
Expand Down
74 changes: 61 additions & 13 deletions src/ReadByVariant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// ReadByVariant.cpp: Read data variant by variant
//
// Copyright (C) 2013-2022 Xiuwen Zheng
// Copyright (C) 2013-2024 Xiuwen Zheng
//
// This file is part of SeqArray.
//
Expand Down Expand Up @@ -391,7 +391,6 @@ void CApply_Variant_Dosage::ReadDosage(int *Base)
{
int *p = (int *)ExtPtr2.get();
int missing = _ReadGenoData(p);

// count the number of reference allele
if (Ploidy == 2) // diploid
{
Expand All @@ -417,7 +416,6 @@ void CApply_Variant_Dosage::ReadDosageAlt(int *Base)
{
int *p = (int *)ExtPtr2.get();
int missing = _ReadGenoData(p);

// count the number of reference allele
if (Ploidy == 2) // diploid
{
Expand All @@ -428,29 +426,54 @@ void CApply_Variant_Dosage::ReadDosageAlt(int *Base)
int cnt = 0;
for (int m=Ploidy; m > 0; m--, p++)
{
if (*p != 0)
if (*p == missing)
{
if (cnt != NA_INTEGER) cnt ++;
} else if (*p == missing)
cnt = NA_INTEGER;
} else if (*p != 0)
{
if (cnt != NA_INTEGER) cnt ++;
}
}
*Base ++ = cnt;
}
}
}

void CApply_Variant_Dosage::ReadDosageAlt_p(int *Base)
{
int *p = (int *)ExtPtr2.get();
int missing = _ReadGenoData(p);
// count the number of reference allele
/*
if (Ploidy == 2) // diploid
{
vec_i32_cnt_dosage_alt2_p(p, Base, SampNum, 0, missing, NA_INTEGER);
} else */ {
for (int n=SampNum; n > 0; n--)
{
int cnt = 0, non_miss = Ploidy;
for (int m=Ploidy; m > 0; m--, p++)
{
if (*p == missing)
non_miss --;
else if (*p != 0) cnt ++;
}
*Base ++ = (non_miss > 0) ? cnt : NA_INTEGER;
}
}
}

void CApply_Variant_Dosage::ReadDosage(C_UInt8 *Base)
{
C_UInt8 *p = (C_UInt8 *)ExtPtr2.get();
C_UInt8 missing = _ReadGenoData(p);

// count the number of reference allele
if (Ploidy == 2) // diploid
{
vec_i8_cnt_dosage2((int8_t *)p, (int8_t *)Base, SampNum, 0,
missing, NA_RAW);
} else {
C_UInt8 *p = (C_UInt8 *)ExtPtr.get();
const C_UInt8 *p = (const C_UInt8 *)ExtPtr.get();
for (int n=SampNum; n > 0; n--)
{
C_UInt8 cnt = 0;
Expand All @@ -471,30 +494,55 @@ void CApply_Variant_Dosage::ReadDosageAlt(C_UInt8 *Base)
{
C_UInt8 *p = (C_UInt8 *)ExtPtr2.get();
C_UInt8 missing = _ReadGenoData(p);

// count the number of reference allele
if (Ploidy == 2) // diploid
{
vec_i8_cnt_dosage_alt2((int8_t *)p, (int8_t *)Base, SampNum, 0,
missing, NA_RAW);
} else {
C_UInt8 *p = (C_UInt8 *)ExtPtr.get();
for (int n=SampNum; n > 0; n--)
{
C_UInt8 cnt = 0;
for (int m=Ploidy; m > 0; m--, p++)
{
if (*p != 0)
if (*p == missing)
{
if (cnt != NA_RAW) cnt ++;
} else if (*p == missing)
cnt = NA_RAW;
} else if (*p != 0)
{
if (cnt != NA_RAW) cnt ++;
}
}
*Base ++ = cnt;
}
}
}

void CApply_Variant_Dosage::ReadDosageAlt_p(C_UInt8 *Base)
{
C_UInt8 *p = (C_UInt8 *)ExtPtr2.get();
C_UInt8 missing = _ReadGenoData(p);
// count the number of reference allele
/*
if (Ploidy == 2) // diploid
{
vec_i8_cnt_dosage_alt2_p((int8_t *)p, (int8_t *)Base, SampNum, 0,
missing, NA_RAW);
} else */ {
for (int n=SampNum; n > 0; n--)
{
C_UInt8 cnt = 0, non_miss = Ploidy;
for (int m=Ploidy; m > 0; m--, p++)
{
if (*p == missing)
non_miss --;
else if (*p != 0) cnt ++;
}
*Base ++ = (non_miss > 0) ? cnt : NA_RAW;
}
}
}



// =====================================================================
Expand Down
6 changes: 5 additions & 1 deletion src/ReadByVariant.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// ReadByVariant.h: Read data variant by variant
//
// Copyright (C) 2013-2022 Xiuwen Zheng
// Copyright (C) 2013-2024 Xiuwen Zheng
//
// This file is part of SeqArray.
//
Expand Down Expand Up @@ -131,10 +131,14 @@ class COREARRAY_DLL_LOCAL CApply_Variant_Dosage: public CApply_Variant_Geno
void ReadDosage(int *Base);
/// read dosages of alternative alleles in 32-bit integer
void ReadDosageAlt(int *Base);
/// read dosages of alternative alleles in 32-bit integer, allowing partial missing
void ReadDosageAlt_p(int *Base);
/// read dosages in unsigned 8-bit intetger
void ReadDosage(C_UInt8 *Base);
/// read dosages of alternative alleles in unsigned 8-bit intetger
void ReadDosageAlt(C_UInt8 *Base);
/// read dosages of alternative alleles in unsigned 8-bit intetger, allowing partial missing
void ReadDosageAlt_p(C_UInt8 *Base);
};


Expand Down

0 comments on commit 5dc2dc0

Please sign in to comment.