Skip to content

Commit 3067771

Browse files
committed
update for low coverge data
1 parent 4889f1c commit 3067771

8 files changed

+1073
-154
lines changed

CommandLines.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <pthread.h>
66
#include <stdint.h>
77

8-
#define HA_VERSION "0.23.0-r691"
8+
#define HA_VERSION "0.24.0-r702"
99

1010
#define VERBOSE 0
1111

Correct.h

+2
Original file line numberDiff line numberDiff line change
@@ -1415,4 +1415,6 @@ void get_wqual(uint64_t zid, uint64_t zpos, uint64_t zrev, asg8_v *v, uint8_t *v
14151415
#define HPC_RR 4
14161416
#define HPC_CC 2
14171417

1418+
// #define FORCE_CUT 1
1419+
14181420
#endif

Overlaps.cpp

+804-133
Large diffs are not rendered by default.

Overlaps.h

+5-2
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ static inline int count_out_without_del(const asg_t *g, uint32_t v)
632632

633633
void build_string_graph_without_clean(
634634
int min_dp, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources,
635-
long long n_read, uint64_t* readLen, long long mini_overlap_length,
635+
uint64_t n_read, uint64_t* readLen, long long mini_overlap_length,
636636
long long max_hang_length, long long clean_round, long long gap_fuzz,
637637
float min_ovlp_drop_ratio, float max_ovlp_drop_ratio, char* output_file_name,
638638
long long bubble_dist, int read_graph, int write);
@@ -927,7 +927,7 @@ ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, R_to_U* ruIndex, int m
927927
void rescue_bubble_by_chain(asg_t *sg, ma_sub_t *coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources,
928928
long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex,
929929
float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, uint32_t chainLenThres, long long gap_fuzz,
930-
bub_label_t* b_mask_t, long long no_trio_recover);
930+
bub_label_t* b_mask_t, long long no_trio_recover, uint8_t *cmk);
931931

932932
typedef struct{
933933
double weight;
@@ -1242,6 +1242,9 @@ uint64_t infer_mmhap_copy(ma_ug_t *ug, asg_t *sg, ma_hit_t_alloc *src, uint8_t *
12421242
uint64_t trans_sec_cut0(kv_u_trans_t *ta, asg64_v *srt, uint32_t id, double sec_rate, uint64_t bd, ma_ug_t *ug);
12431243
void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src, ug_rid_cov_t *in);
12441244
void gen_ug_rid_cov_t_by_ovlp(kv_u_trans_t *ta, ug_rid_cov_t *cc);
1245+
void rescue_chimeric_reads_aggressive(ma_ug_t *i_ug, asg_t *rg, ma_hit_t_alloc* sources, ma_sub_t *coverage_cut,
1246+
R_to_U* ruIndex, int max_hang, int min_ovlp, uint32_t chainLenThres, uint32_t is_bubble_check, uint32_t is_primary_check, kvec_asg_arc_t_warp* new_rtg_edges,
1247+
kvec_t_u32_warp* new_rtg_nodes, bub_label_t* b_mask_t, uint8_t *cmk);
12451248

12461249
#define UC_Read_resize(v, s) do {\
12471250
if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\

ecovlp.cpp

+143-10
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ typedef struct {
6060
typedef struct {
6161
ec_ovec_buf_t0 *a;
6262
uint32_t n, rev;
63+
uint8_t *cr;
6364
} ec_ovec_buf_t;
6465

6566
ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n);
@@ -180,7 +181,7 @@ void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p)
180181
destroy_cns_gfa(&(z->cns));
181182

182183
}
183-
free(p->a); free(p);
184+
free(p->a); free(p->cr); free(p);
184185

185186
// fprintf(stderr, "[M::%s-chains] #->%lld\n", __func__, asm_opt.num_bases);
186187
// fprintf(stderr, "[M::%s-passed-chains-0] #->%lld\n", __func__, asm_opt.num_corrected_bases);
@@ -3931,10 +3932,11 @@ uint32_t is_chemical_r(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t co
39313932
}
39323933

39333934

3934-
uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cov, int64_t cut_len, double dup_rate)
3935+
uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cov, int64_t cut_len, double dup_rate, uint64_t is_del)
39353936
{
39363937
uint64_t k, s, e; int64_t dp, old_dp, st = 0, ed, s0, e0, rr, lt;
39373938
for (k = idx->n = 0; k < ov->length; k++) {
3939+
if(is_del && ov->buffer[k].del) continue;
39383940
s0 = (uint32_t)ov->buffer[k].qns; e0 = ov->buffer[k].qe;
39393941
if(s0 > 0) s0 += cut_len;
39403942
if(e0 < len) e0 -= cut_len;
@@ -3988,6 +3990,64 @@ uint32_t is_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_
39883990
return 0;
39893991
}
39903992

3993+
int64_t cal_chemical_r_adv(ma_hit_t_alloc *ov, asg64_v *idx, int64_t len, int64_t cut_len, double dup_rate, uint64_t is_del)
3994+
{
3995+
uint64_t k, s, e; int64_t dp, old_dp, st = 0, ed, s0, e0, rr, lt, min_cov;
3996+
for (k = idx->n = 0; k < ov->length; k++) {
3997+
if(is_del && ov->buffer[k].del) continue;
3998+
s0 = (uint32_t)ov->buffer[k].qns; e0 = ov->buffer[k].qe;
3999+
if(s0 > 0) s0 += cut_len;
4000+
if(e0 < len) e0 -= cut_len;
4001+
if(e0 <= s0) continue;
4002+
s = s0; e = e0;
4003+
4004+
lt = Get_READ_LENGTH((R_INF), ov->buffer[k].tn);
4005+
rr = (lt >= len)?(lt - len):(len - lt);
4006+
if((rr <= (len*dup_rate)) && (rr <= (lt*dup_rate)) && (ov->buffer[k].rev)) {
4007+
dp = (ov->buffer[k].qe) - ((uint32_t)ov->buffer[k].qns); dp = len - dp;
4008+
old_dp = ov->buffer[k].te - ov->buffer[k].ts; old_dp = lt - old_dp;
4009+
if((dp <= (len*dup_rate)) && (old_dp <= (lt*dup_rate))) continue;
4010+
}
4011+
4012+
kv_push(uint64_t, (*idx), (s<<1));
4013+
kv_push(uint64_t, (*idx), (e<<1)|1);
4014+
}
4015+
4016+
radix_sort_ec64(idx->a, idx->a + idx->n); s0 = e0 = rr = -1; min_cov = INT64_MAX;
4017+
for (k = 0, dp = 0, st = ed = 0; k < idx->n; ++k) {
4018+
old_dp = dp;
4019+
///if a[j] is qe
4020+
if (idx->a[k]&1) --dp;
4021+
else ++dp;
4022+
4023+
ed = idx->a[k]>>1;
4024+
if(ed > st) {
4025+
// if(ov->length && ((ov->buffer[0].qns>>32) == 5045637)) {
4026+
// fprintf(stderr, "[M::%s]\tmd::[%ld,%ld)\tcov::%ld\tlen::%ld\tid::%lu\n", __func__, st, ed, old_dp, len, ov->buffer[0].qns>>32);
4027+
// }
4028+
if(old_dp <= min_cov) {
4029+
// if(ov->length && (ov->buffer[0].qns>>32) == 22344) fprintf(stderr, "[M::%s]\tmd::[%ld,%ld)\tcov::%ld\tlen::%ld\n", __func__, st, ed, old_dp, len);
4030+
min_cov = old_dp;
4031+
}
4032+
}
4033+
st = ed;
4034+
}
4035+
4036+
4037+
ed = len; old_dp = dp;
4038+
if(ed > st) {
4039+
// if(ov->length && ((ov->buffer[0].qns>>32) == 5045637)) {
4040+
// fprintf(stderr, "[M::%s]\tmd::[%ld,%ld)\tcov::%ld\tlen::%ld\tid::%lu\n", __func__, st, ed, old_dp, len, ov->buffer[0].qns>>32);
4041+
// }
4042+
if(old_dp <= min_cov) {
4043+
// if(ov->length && (ov->buffer[0].qns>>32) == 22344) fprintf(stderr, "[M::%s]\tmd::[%ld,%ld)\tcov::%ld\tlen::%ld\n", __func__, st, ed, old_dp, len);
4044+
min_cov = old_dp;
4045+
}
4046+
}
4047+
4048+
return min_cov;
4049+
}
4050+
39914051
void prt_dbg_rid_paf(ma_hit_t_alloc *ov, UC_Read *ra, asg8_v *qa)
39924052
{
39934053
if(!(ov->length)) return;
@@ -4051,7 +4111,7 @@ static void worker_hap_dc_ec_chemical_r(void *data, long i, int tid)
40514111
if(b->cnt[1] == 0) {
40524112
// if(i == 6204620) prt_dbg_rid_paf(&(R_INF.paf[i]), &(b->self_read), &(b->v8q));
40534113
// if(is_chemical_r(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), 3, 16)) {
4054-
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02)) {
4114+
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02, 0)) {
40554115
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
40564116
R_INF.paf[i].length = 0; b->cnt[0]++;
40574117
}
@@ -4075,11 +4135,56 @@ static void worker_hap_dc_ec_chemical_r(void *data, long i, int tid)
40754135
static void worker_hap_dc_ec_chemical_arc(void *data, long i, int tid)
40764136
{
40774137
ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]);
4078-
ma_hit_t_alloc *paf = &(R_INF.paf[i]); uint64_t k;
4138+
ma_hit_t_alloc *paf = &(R_INF.paf[i]), *rev; uint64_t k, z;
4139+
4140+
if(b->cnt[1] == 0) {
4141+
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02, 1)) {
4142+
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
4143+
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
4144+
}
4145+
} else if(b->cnt[1] == 1) {
4146+
for (k = 0; k < paf->length; k++) {
4147+
if((Get_qn(paf->buffer[k])) > (Get_tn(paf->buffer[k]))) continue;
4148+
rev = &(R_INF.paf[paf->buffer[k].tn]);
4149+
for (z = 0; z < rev->length; z++) {
4150+
if((rev->buffer[z].tn == (Get_qn(paf->buffer[k])))) {
4151+
if(paf->buffer[k].del != rev->buffer[z].del) {
4152+
paf->buffer[k].del = rev->buffer[z].del = 1;
4153+
}
4154+
}
4155+
}
4156+
}
4157+
}
40794158

4080-
if(is_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_cov, asm_opt.chemical_flank, 0.02)) {
4081-
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
4082-
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
4159+
refresh_ec_ovec_buf_t0(b, REFRESH_N);
4160+
}
4161+
4162+
static void worker_hap_dc_ec_chemical_arc_mark(void *data, long i, int tid)
4163+
{
4164+
ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]);
4165+
ma_hit_t_alloc *paf = &(R_INF.paf[i]), *rev; uint64_t k, z; int64_t cov, msk_cut = asm_opt.chemical_cov;
4166+
uint8_t *msk = ((ec_ovec_buf_t*)data)->cr;
4167+
4168+
if(b->cnt[1] == 0) {
4169+
msk[i] = (uint8_t)-1;
4170+
cov = cal_chemical_r_adv(&(R_INF.paf[i]), &b->v64, Get_READ_LENGTH((R_INF), i), asm_opt.chemical_flank, 0.02, 1);
4171+
if(cov <= msk_cut) msk[i] = cov;
4172+
if(cov <= msk_cut/**FORCE_CUT**/) {
4173+
// fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i));
4174+
for (k = 0; k < paf->length; k++) paf->buffer[k].del = 1; b->cnt[0]++;
4175+
}
4176+
} else if(b->cnt[1] == 1) {
4177+
for (k = 0; k < paf->length; k++) {
4178+
if((Get_qn(paf->buffer[k])) > (Get_tn(paf->buffer[k]))) continue;
4179+
rev = &(R_INF.paf[paf->buffer[k].tn]);
4180+
for (z = 0; z < rev->length; z++) {
4181+
if((rev->buffer[z].tn == (Get_qn(paf->buffer[k])))) {
4182+
if((paf->buffer[k].del != rev->buffer[z].del) || (msk[Get_qn(paf->buffer[k])] <= msk_cut/**FORCE_CUT**/) || (msk[Get_tn(paf->buffer[k])] <= msk_cut/**FORCE_CUT**/)) {
4183+
paf->buffer[k].del = rev->buffer[z].del = 1;
4184+
}
4185+
}
4186+
}
4187+
}
40834188
}
40844189

40854190
refresh_ec_ovec_buf_t0(b, REFRESH_N);
@@ -6066,7 +6171,7 @@ void handle_chemical_r(uint64_t n_thre, uint64_t n_a)
60666171

60676172
kt_for(n_thre, worker_hap_dc_ec_chemical_r, b, n_a);
60686173

6069-
fprintf(stderr, "[M::%s] # chemical reads: %lu, # arcs:: %lu\n", __func__, chem_n, dedup);
6174+
fprintf(stderr, "[M::%s] # chimeric reads: %lu, # arcs:: %lu\n", __func__, chem_n, dedup);
60706175

60716176
destroy_ec_ovec_buf_t(b);
60726177
}
@@ -6076,16 +6181,44 @@ void handle_chemical_arc(uint64_t n_thre, uint64_t n_a)
60766181
ec_ovec_buf_t *b = NULL; uint64_t k, chem_n = 0;
60776182
b = gen_ec_ovec_buf_t(n_thre);
60786183
for (k = 0; k < n_thre; ++k) {
6079-
b->a[k].cnt[0] = 0;
6184+
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 0;
60806185
}
60816186

60826187
kt_for(n_thre, worker_hap_dc_ec_chemical_arc, b, n_a);
60836188

60846189
for (k = 0; k < n_thre; ++k) {
60856190
chem_n += b->a[k].cnt[0];
6191+
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 1;
60866192
}
60876193

6088-
fprintf(stderr, "[M::%s] # chemical reads: %lu\n", __func__, chem_n);
6194+
kt_for(n_thre, worker_hap_dc_ec_chemical_arc, b, n_a);
6195+
6196+
fprintf(stderr, "[M::%s] # chimeric reads: %lu\n", __func__, chem_n);
60896197

60906198
destroy_ec_ovec_buf_t(b);
6199+
}
6200+
6201+
uint8_t* gen_chemical_arc_rf(uint64_t n_thre, uint64_t n_a)
6202+
{
6203+
ec_ovec_buf_t *b = NULL; uint64_t k, chem_n = 0; uint8_t *ra = NULL;
6204+
b = gen_ec_ovec_buf_t(n_thre);
6205+
for (k = 0; k < n_thre; ++k) {
6206+
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 0;
6207+
}
6208+
MALLOC(ra, n_a); ///memset(ra, -1, sizeof((*ra))*n_a);
6209+
b->cr = ra;
6210+
6211+
kt_for(n_thre, worker_hap_dc_ec_chemical_arc_mark, b, n_a);
6212+
6213+
for (k = 0; k < n_thre; ++k) {
6214+
chem_n += b->a[k].cnt[0];
6215+
b->a[k].cnt[0] = 0; b->a[k].cnt[1] = 1;
6216+
}
6217+
6218+
kt_for(n_thre, worker_hap_dc_ec_chemical_arc_mark, b, n_a);
6219+
6220+
fprintf(stderr, "[M::%s] # chimeric reads: %lu\n", __func__, chem_n);
6221+
6222+
b->cr = NULL; destroy_ec_ovec_buf_t(b);
6223+
return ra;
60916224
}

ecovlp.h

+1
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ void sl_ec_r(uint64_t n_thre, uint64_t n_a);
1414
void cal_ov_r(uint64_t n_thre, uint64_t n_a, uint64_t new_idx);
1515
void handle_chemical_r(uint64_t n_thre, uint64_t n_a);
1616
void handle_chemical_arc(uint64_t n_thre, uint64_t n_a);
17+
uint8_t* gen_chemical_arc_rf(uint64_t n_thre, uint64_t n_a);
1718

1819
#endif

0 commit comments

Comments
 (0)