/**
* @file gaba.c
*
* @brief libgaba (libsea3) DP routine implementation
*
* @author Hajime Suzuki
* @date 2016/1/11
* @license Apache v2
*/
// #define DEBUG
// #define DEBUG_MEM
// #define DEBUG_OVERFLOW
// #define DEBUG_ALL
/*
* debug print configuration: -DDEBUG to enable debug print, -DDEBUG_ALL to print all the vectors, arrays, and bitmasks
* NOTE: dumping all the vectors sometimes raises SEGV due to a stack shortage. use `ulimit -s 65536' to avoid it.
*/
#if defined(DEBUG_ALL) && !defined(DEBUG)
# define DEBUG
#endif
#ifdef DEBUG
# define REDEFINE_DEBUG
#endif
/* make sure POSIX APIs are properly activated */
#if defined(__linux__) && !defined(_POSIX_C_SOURCE)
# define _POSIX_C_SOURCE 200112L
#endif
#if defined(__darwin__) && !defined(_BSD_SOURCE)
# define _BSD_SOURCE
#endif
/* import general headers */
#include <stdio.h> /* sprintf in dump_path */
#include <stdint.h> /* uint32_t, uint64_t, ... */
#include <stddef.h> /* offsetof */
#include <string.h> /* memset, memcpy */
#include <inttypes.h>
#define _GABA_PARSE_EXPORT_LEVEL static inline
#include "gaba.h"
#include "gaba_parse.h"
#include "log.h"
#include "arch/arch.h" /* architecture dependents */
/**
* gap penalty model configuration: choose one of the following three by a define macro
* linear: g(k) = ge * k where ge > 0
* affine: g(k) = gi + ge * k where gi > 0, ge > 0
* combined: g(k) = min(gi + ge * k, gf * k) where gi > 0, gf > ge > 0
*/
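/*
* worked example (illustrative values, not defaults): with gi = 5, ge = 1, gf = 2
* (satisfying gi > 0 and gf > ge > 0), a gap of length k = 3 costs
*   linear:   ge * k = 3
*   affine:   gi + ge * k = 8
*   combined: min(gi + ge * k, gf * k) = min(8, 6) = 6
* i.e. the combined model follows the flat gf rate for short gaps and switches to the
* affine curve once k exceeds gi / (gf - ge) = 5.
*/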
#define LINEAR 0
#define AFFINE 1
#define COMBINED 2
#ifdef MODEL
# if !(MODEL == LINEAR || MODEL == AFFINE || MODEL == COMBINED)
# error "MODEL must be LINEAR (1), AFFINE (2), or COMBINED (3)."
# endif
#else
# define MODEL AFFINE
#endif
#if MODEL == LINEAR
# define MODEL_LABEL linear
# define MODEL_STR "linear"
#elif MODEL == AFFINE
# define MODEL_LABEL affine
# define MODEL_STR "affine"
#else
# define MODEL_LABEL combined
# define MODEL_STR "combined"
#endif
#ifdef BIT
# if !(BIT == 2 || BIT == 4)
# error "BIT must be 2 or 4."
# endif
#else
# define BIT 4
#endif
/* define ENABLE_FILTER to enable gapless alignment filter */
// #define ENABLE_FILTER
/* bandwidth-specific configurations aliasing vector macros */
#define BW_MAX 64
#ifndef BW
# define BW 64
#endif
#define _W BW
#if _W == 16
# define _NVEC_ALIAS_PREFIX v16i8
# define _WVEC_ALIAS_PREFIX v16i16
# define DP_CTX_INDEX 2
#elif _W == 32
# define _NVEC_ALIAS_PREFIX v32i8
# define _WVEC_ALIAS_PREFIX v32i16
# define DP_CTX_INDEX 1
#elif _W == 64
# define _NVEC_ALIAS_PREFIX v64i8
# define _WVEC_ALIAS_PREFIX v64i16
# define DP_CTX_INDEX 0
#else
# error "BW must be one of 16, 32, or 64."
#endif
#include "arch/vector_alias.h"
#define DP_CTX_MAX ( 3 )
#define _dp_ctx_index(_bw) ( ((_bw) == 64) ? 0 : (((_bw) == 32) ? 1 : 2) )
// _static_assert(_dp_ctx_index(BW) == DP_CTX_INDEX);
/* add suffix for gap-model- and bandwidth-wrapper (see gaba_wrap.h) */
#ifdef SUFFIX
# define _suffix_cat3_2(a, b, c) a##_##b##_##c
# define _suffix_cat3(a, b, c) _suffix_cat3_2(a, b, c)
# define _suffix(_base) _suffix_cat3(_base, MODEL_LABEL, BW)
#else
# define _suffix(_base) _base
#endif
/* add namespace for arch wrapper (see main.c) */
#ifdef NAMESPACE
# define _export_cat(x, y) x##_##y
# define _export_cat2(x, y) _export_cat(x, y)
# define _export(_base) _export_cat2(_suffix(_base), NAMESPACE)
#else
# define _export(_base) _suffix(_base)
#endif
/* import unittest */
#ifndef UNITTEST_UNIQUE_ID
# if MODEL == LINEAR
# if BW == 16
# define UNITTEST_UNIQUE_ID 31
# elif BW == 32
# define UNITTEST_UNIQUE_ID 32
# else
# define UNITTEST_UNIQUE_ID 33
# endif
# elif MODEL == AFFINE
# if BW == 16
# define UNITTEST_UNIQUE_ID 34
# elif BW == 32
# define UNITTEST_UNIQUE_ID 35
# else
# define UNITTEST_UNIQUE_ID 36
# endif
# else
# if BW == 16
# define UNITTEST_UNIQUE_ID 37
# elif BW == 32
# define UNITTEST_UNIQUE_ID 38
# else
# define UNITTEST_UNIQUE_ID 39
# endif
# endif
#endif
#include "unittest.h"
/* static assertion macros */
#define _sa_cat_intl(x, y) x##y
#define _sa_cat(x, y) _sa_cat_intl(x, y)
#define _static_assert(expr) typedef char _sa_cat(_st, __LINE__)[(expr) ? 1 : -1]
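/* expands to a typedef of a char array whose length is -1 (a compile-time error) when expr is false; __LINE__ keeps the generated typedef names unique */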
/* internal constants */
#define BLK_BASE ( 5 )
#define BLK ( 0x01<<BLK_BASE )
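/* BLK == 32: the number of vector updates recorded in a single block (see struct gaba_block_s below) */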
#ifdef DEBUG_MEM
# define MIN_BULK_BLOCKS ( 0 )
# define MEM_ALIGN_SIZE ( 32 ) /* 32byte aligned for AVX2 environments */
# define MEM_INIT_SIZE ( (uint64_t)4 * 1024 )
# define MEM_MARGIN_SIZE ( 4096 ) /* head and tail margins of internal memory blocks */
# define MEM_GC_INTV ( 32 )
#else
# define MIN_BULK_BLOCKS ( 32 )
# define MEM_ALIGN_SIZE ( 32 ) /* 32byte aligned for AVX2 environments */
# define MEM_INIT_SIZE ( (uint64_t)16 * 1024 * 1024 )
# define MEM_MARGIN_SIZE ( 4096 ) /* head and tail margins of internal memory blocks */
# define MEM_GC_INTV ( 4096 )
#endif
#define INIT_FETCH_APOS ( -1 )
#define INIT_FETCH_BPOS ( -1 )
/* test consistency of exported macros */
_static_assert(V2I32_MASK_01 == GABA_UPDATE_A);
_static_assert(V2I32_MASK_10 == GABA_UPDATE_B);
/**
* @macro _likely, _unlikely
* @brief branch prediction hint for gcc-compatible compilers
*/
#define _likely(x) __builtin_expect(!!(x), 1)
#define _unlikely(x) __builtin_expect(!!(x), 0)
/**
* @macro _force_inline
* @brief inline directive for gcc-compatible compilers
*/
#define _force_inline inline
// #define _force_inline /* */
/** assume 64bit little-endian system */
_static_assert(sizeof(void *) == 8);
/** check size of structs declared in gaba.h */
_static_assert(sizeof(struct gaba_params_s) == 40);
_static_assert(sizeof(struct gaba_section_s) == 16);
_static_assert(sizeof(struct gaba_fill_s) == 64);
_static_assert(sizeof(struct gaba_segment_s) == 32);
_static_assert(sizeof(struct gaba_alignment_s) == 64);
_static_assert(sizeof(nvec_masku_t) == _W / 8);
/**
* @macro _plen
* @brief extract plen from gaba_segment_s
*/
#define _plen(seg) ( (seg)->alen + (seg)->blen )
/* forward declarations */
static int64_t gaba_dp_add_stack(struct gaba_dp_context_s *self, uint64_t size);
static void *gaba_dp_malloc(struct gaba_dp_context_s *self, uint64_t size);
static void gaba_dp_free(struct gaba_dp_context_s *self, void *ptr); /* do nothing */
struct gaba_dp_context_s;
/**
* @struct gaba_drop_s
*/
struct gaba_drop_s {
int8_t drop[_W]; /** (32) max */
};
_static_assert(sizeof(struct gaba_drop_s) == _W);
/**
* @struct gaba_middle_delta_s
*/
struct gaba_middle_delta_s {
int16_t delta[_W]; /** (64) middle delta */
};
_static_assert(sizeof(struct gaba_middle_delta_s) == sizeof(int16_t) * _W);
/**
* @struct gaba_mask_pair_s
*/
#if MODEL == LINEAR
struct gaba_mask_pair_s {
nvec_masku_t h; /** (4) horizontal mask vector */
nvec_masku_t v; /** (4) vertical mask vector */
};
_static_assert(sizeof(struct gaba_mask_pair_s) == _W / 4);
#else /* affine and combined */
struct gaba_mask_pair_s {
nvec_masku_t h; /** (4) horizontal mask vector */
nvec_masku_t v; /** (4) vertical mask vector */
nvec_masku_t e; /** (4) e mask vector */
nvec_masku_t f; /** (4) f mask vector */
};
_static_assert(sizeof(struct gaba_mask_pair_s) == _W / 2);
#endif
#define _mask_u64(_m) ( ((nvec_masku_t){ .mask = (_m) }).all )
/**
* @struct gaba_diff_vec_s
*/
#if MODEL == LINEAR
struct gaba_diff_vec_s {
uint8_t dh[_W]; /** (32) dh */
uint8_t dv[_W]; /** (32) dv */
};
_static_assert(sizeof(struct gaba_diff_vec_s) == 2 * _W);
#else /* affine and combined gap penalty */
struct gaba_diff_vec_s {
uint8_t dh[_W]; /** (32) dh */
uint8_t dv[_W]; /** (32) dv */
uint8_t de[_W]; /** (32) de */
uint8_t df[_W]; /** (32) df */
};
_static_assert(sizeof(struct gaba_diff_vec_s) == 4 * _W);
#endif
/**
* @struct gaba_char_vec_s
*/
struct gaba_char_vec_s {
uint8_t w[_W]; /** (32) a in the lower 4bit, b in the higher 4bit */
};
_static_assert(sizeof(struct gaba_char_vec_s) == _W);
/**
* @struct gaba_block_s
* @brief a unit of banded matrix, 32 vector updates will be recorded in a single block object.
* phantom is an alias of the block struct as a head cap of contiguous blocks.
*/
struct gaba_block_s {
struct gaba_mask_pair_s mask[BLK]; /** (256 / 512) traceback capability flag vectors (set if transition to the adjacent cell is possible) */
struct gaba_diff_vec_s diff; /** (64, 128, 256) diff variables of the last vector */
int8_t acc, xstat; /** (2) accumulator, and xdrop status (term detected when xstat < 0) */
int8_t acnt, bcnt; /** (2) forwarded lengths */
uint32_t dir_mask; /** (4) extension direction bit array */
uint64_t max_mask; /** (8) lanewise update mask (set if the lane contains the current max) */
};
struct gaba_phantom_s {
struct gaba_diff_vec_s diff; /** (64, 128, 256) diff variables of the last (just before the head) vector */
int8_t acc, xstat; /** (2) accumulator, and xdrop status (term detected when xstat < 0) */
int8_t acnt, bcnt; /** (2) prefetched sequence lengths (only effective at the root, otherwise zero) */
uint32_t reserved; /** (4) overlaps with dir_mask */
struct gaba_block_s const *blk; /** (8) link to the previous block (overlaps with max_mask) */
};
_static_assert(sizeof(struct gaba_block_s) % 16 == 0);
_static_assert(sizeof(struct gaba_phantom_s) % 16 == 0);
#define _last_block(x) ( (struct gaba_block_s *)(x) - 1 )
#define _last_phantom(x) ( (struct gaba_phantom_s *)(x) - 1 )
#define _phantom(x) ( _last_phantom((struct gaba_block_s *)(x) + 1) )
/**
* @struct gaba_merge_s
*/
struct gaba_merge_s {
uint64_t reserved1; /** (8) keep aligned to 16byte boundary */
struct gaba_block_s const *blk[1]; /** (8) addressed by [tail_idx[vec][q]] */
int8_t tidx[2][_W]; /** (32, 64, 128) lanewise index array */
struct gaba_diff_vec_s diff; /** (64, 128, 256) diff variables of the last (just before the head) vector */
int8_t acc, xstat; /** (2) acc and xstat are reserved for block_s */
uint8_t reserved2[13], qofs[1]; /** (14) displacement of vectors in the q-direction */
};
#define MERGE_TAIL_OFFSET ( BLK * sizeof(struct gaba_mask_pair_s) - 2 * _W - 2 * sizeof(void *) )
#define _merge(x) ( (struct gaba_merge_s *)((uint8_t *)(x) + MERGE_TAIL_OFFSET) )
_static_assert(sizeof(struct gaba_merge_s) % 16 == 0);
_static_assert(sizeof(struct gaba_merge_s) + MERGE_TAIL_OFFSET == sizeof(struct gaba_block_s));
_static_assert(MAX_MERGE_COUNT <= 14);
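/*
* note: gaba_merge_s is overlaid on the tail of a gaba_block_s (the assertion above pins
* sizeof(struct gaba_merge_s) + MERGE_TAIL_OFFSET == sizeof(struct gaba_block_s)), so
* _merge() recovers the merge header from a block pointer without a separate allocation.
*/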
/**
* @struct gaba_joint_tail_s
* @brief (internal) tail cap of a contiguous matrix blocks, contains a context of the blocks
* (band) and can be connected to the next blocks.
*/
struct gaba_joint_tail_s {
/* char vector and delta vectors */
struct gaba_char_vec_s ch; /** (16, 32, 64) char vector */
struct gaba_drop_s xd; /** (16, 32, 64) */
struct gaba_middle_delta_s md; /** (32, 64, 128) */
int16_t mdrop; /** (2) drop from m.max (offset) */
uint16_t istat; /** (2) 1 if bridge */
uint32_t pridx; /** (4) remaining p-length */
uint32_t aridx, bridx; /** (8) reverse indices for the tails */
uint32_t aadv, badv; /** (8) advanced lengths */
struct gaba_joint_tail_s const *tail;/** (8) the previous tail */
uint64_t abrk, bbrk; /** (16) breakpoint masks */
uint8_t const *atptr, *btptr; /** (16) tail of the current section */
struct gaba_fill_s f; /** (24) */
};
_static_assert((sizeof(struct gaba_joint_tail_s) % 32) == 0);
#define TAIL_BASE ( offsetof(struct gaba_joint_tail_s, f) )
#define BRIDGE_TAIL_OFFSET ( offsetof(struct gaba_joint_tail_s, mdrop) )
#define BRIDGE_TAIL_SIZE ( sizeof(struct gaba_joint_tail_s) - BRIDGE_TAIL_OFFSET )
#define _bridge(x) ( (struct gaba_joint_tail_s *)((uint8_t *)(x) - BRIDGE_TAIL_OFFSET) )
#define _tail(x) ( (struct gaba_joint_tail_s *)((uint8_t *)(x) - TAIL_BASE) )
#define _fill(x) ( (struct gaba_fill_s *)((uint8_t *)(x) + TAIL_BASE) )
#define _offset(x) ( (x)->f.max - (x)->mdrop )
#define _mem_blocks(n) ( sizeof(struct gaba_phantom_s) + (n + 1) * sizeof(struct gaba_block_s) + sizeof(struct gaba_joint_tail_s) )
#define MEM_INIT_VACANCY ( _mem_blocks(MIN_BULK_BLOCKS) )
_static_assert(2 * sizeof(struct gaba_block_s) < MEM_MARGIN_SIZE);
_static_assert(MEM_INIT_VACANCY < MEM_INIT_SIZE);
/**
* @struct gaba_root_block_s
*/
struct gaba_root_block_s {
uint8_t _pad1[288 - sizeof(struct gaba_phantom_s)];
struct gaba_phantom_s blk;
struct gaba_joint_tail_s tail;
#if _W != 64
uint8_t _pad2[352 + 32 - sizeof(struct gaba_joint_tail_s)];
#endif
};
_static_assert(sizeof(struct gaba_root_block_s) == 640 + 32);
_static_assert(sizeof(struct gaba_root_block_s) >= sizeof(struct gaba_phantom_s) + sizeof(struct gaba_joint_tail_s));
/**
* @struct gaba_reader_work_s
* @brief (internal) working buffer for fill-in functions, contains sequence prefetch buffer
* and middle/max-small deltas.
*/
struct gaba_reader_work_s {
/** 64byte aligned */
uint8_t bufa[BW_MAX + BLK]; /** (128) */
uint8_t bufb[BW_MAX + BLK]; /** (128) */
/** 256 */
/** 64byte aligned */
uint32_t arlim, brlim; /** (8) asridx - aadv = aridx + arlim */
uint32_t aid, bid; /** (8) ids */
uint8_t const *atptr, *btptr; /** (16) tail of the current section */
uint32_t pridx; /** (4) remaining p-length (unsigned!) */
int32_t ofsd; /** (4) delta of large offset */
uint32_t arem, brem; /** (8) current ridx (redefined as rem, ridx == rem + rlim) */
uint32_t asridx, bsridx; /** (8) start ridx (converted to (badv, aadv) in fill_create_tail) */
struct gaba_joint_tail_s const *tail; /** (8) previous tail */
/** 64 */
/** 64byte aligned */
struct gaba_drop_s xd; /** (16, 32, 64) current drop from max */
#if _W != 64
uint8_t _pad[_W == 16 ? 16 : 32]; /** padding to align to 64-byte boundary */
#endif
struct gaba_middle_delta_s md; /** (32, 64, 128) */
};
_static_assert((sizeof(struct gaba_reader_work_s) % 64) == 0);
/**
* @struct gaba_merge_work_s
*/
#define MERGE_BUFFER_LENGTH ( 2 * _W )
struct gaba_merge_work_s {
uint8_t qofs[16]; /** (16) q-offset array */
uint32_t qw, _pad1; /** (8) */
uint32_t lidx, uidx; /** (8) */
#if _W != 16
uint8_t _pad2[32]; /** padding to align to 64-byte boundary */
#endif
uint64_t abrk[4]; /** (32) */
uint64_t bbrk[4]; /** (32) */
uint8_t buf[MERGE_BUFFER_LENGTH + 7 * sizeof(int16_t) * MERGE_BUFFER_LENGTH]; /** (32, 64, 128) + (320, 640, 1280) */
};
_static_assert((sizeof(struct gaba_merge_work_s) % 64) == 0);
/**
* @struct gaba_aln_intl_s
* @brief internal alias of gaba_alignment_t, allocated in the local context and used as a working buffer.
*/
struct gaba_aln_intl_s {
/* memory management */
void *opaque; /** (8) opaque (context pointer) */
gaba_lfree_t lfree; /** (8) local free */
int64_t score; /** (8) score (save) */
uint32_t aicnt, bicnt; /** (8) gap region counters */
uint32_t aecnt, becnt; /** (8) gap base counters */
uint32_t dcnt; /** (4) unused in the loop */
uint32_t slen; /** (4) section length (counter) */
struct gaba_segment_s *seg; /** (8) section ptr */
uint32_t plen, padding; /** (8) path length (psum; save) */
};
_static_assert(sizeof(struct gaba_alignment_s) == sizeof(struct gaba_aln_intl_s));
_static_assert(offsetof(struct gaba_alignment_s, score) == offsetof(struct gaba_aln_intl_s, score));
_static_assert(offsetof(struct gaba_alignment_s, dcnt) == offsetof(struct gaba_aln_intl_s, dcnt));
_static_assert(offsetof(struct gaba_alignment_s, slen) == offsetof(struct gaba_aln_intl_s, slen));
_static_assert(offsetof(struct gaba_alignment_s, seg) == offsetof(struct gaba_aln_intl_s, seg));
_static_assert(offsetof(struct gaba_alignment_s, plen) == offsetof(struct gaba_aln_intl_s, plen));
_static_assert(offsetof(struct gaba_alignment_s, agcnt) == offsetof(struct gaba_aln_intl_s, aecnt));
_static_assert(offsetof(struct gaba_alignment_s, bgcnt) == offsetof(struct gaba_aln_intl_s, becnt));
/**
* @struct gaba_leaf_s
* @brief working buffer for max score search
*/
struct gaba_leaf_s {
struct gaba_joint_tail_s const *tail;
struct gaba_block_s const *blk;
uint32_t p, q; /** (8) local p (to restore mask pointer), local q */
uint64_t ppos; /** (8) global p (== resulting path length) */
uint32_t aridx, bridx;
};
/**
* @struct gaba_writer_work_s
* @brief working buffer for traceback (allocated in the thread-local context)
*/
struct gaba_writer_work_s {
/** local context */
struct gaba_aln_intl_s a; /** (64) working buffer, copied to the result object */
/** work */
uint32_t afcnt, bfcnt; /** (8) */
uint32_t *path; /** (8) path array pointer */
struct gaba_block_s const *blk; /** (8) current block */
uint8_t p, q, ofs, state, _pad[4]; /** (8) local p, q-coordinate, [0, BW), path offset, state */
/** save */
uint32_t aofs, bofs; /** (8) ofs for bridge */
uint32_t agidx, bgidx; /** (8) grid indices of the current trace */
uint32_t asgidx, bsgidx; /** (8) base indices of the current trace */
uint32_t aid, bid; /** (8) section ids */
/** section info */
struct gaba_joint_tail_s const *atail;/** (8) */
struct gaba_joint_tail_s const *btail;/** (8) */
struct gaba_alignment_s *aln; /** (8) */
struct gaba_leaf_s leaf; /** (40) working buffer for max pos search */
/** 64, 192 */
};
_static_assert((sizeof(struct gaba_writer_work_s) % 64) == 0);
/**
* @struct gaba_score_vec_s
*/
struct gaba_score_vec_s {
int8_t v1[16];
int8_t v2[16];
int8_t v3[16];
int8_t v4[16];
int8_t v5[16];
};
_static_assert(sizeof(struct gaba_score_vec_s) == 80);
/**
* @struct gaba_mem_block_s
*/
struct gaba_mem_block_s {
struct gaba_mem_block_s *next;
// struct gaba_mem_block_s *prev;
uint64_t size;
};
_static_assert(sizeof(struct gaba_mem_block_s) == 16);
/**
* @struct gaba_stack_s
* @brief save stack pointer
*/
struct gaba_stack_s {
struct gaba_mem_block_s *mem;
uint8_t *top;
uint8_t *end;
/* memory usage tracker */
uint16_t curr_depth, max_depth;
uint32_t flush_cnt;
};
_static_assert(sizeof(struct gaba_stack_s) == 32);
#define _stack_size(_s) ( (uint64_t)((_s)->end - (_s)->top) )
/**
* @macro _init_bar, _test_bar
* @brief memory barrier for debugging
*/
#ifdef DEBUG_MEM
#define _init_bar(_name) { \
memset(self->_barrier_##_name, 0xa5, 256); \
}
#define _test_bar(_name) { \
for(uint64_t __i = 0; __i < 256; __i++) { \
if(self->_barrier_##_name[__i] != 0xa5) { \
fprintf(stderr, "barrier broken, i(%lu), m(%u)\n", __i, self->_barrier_##_name[__i]); *((volatile uint8_t *)NULL); \
} \
} \
}
#define _barrier(_name) uint8_t _barrier_##_name[256];
#else
#define _init_bar(_name) ;
#define _test_bar(_name) ;
#define _barrier(_name)
#endif
/**
* @struct gaba_dp_context_s
*
* @brief (internal) container for dp implementations
*/
struct gaba_dp_context_s {
_barrier(head);
/** loaded on init */
struct gaba_joint_tail_s const *root[4]; /** (32) root tail (phantom vectors) */
/* memory management */
struct gaba_mem_block_s mem; /** (16) root memory block */
struct gaba_stack_s stack; /** (32) current stack */
// uint64_t _pad2;
/* score constants */
double imx, xmx; /** (16) 1 / (M - X), X / (M - X) (precalculated constants) */
struct gaba_score_vec_s scv; /** (80) substitution matrix and gaps */
/* scores */
int8_t tx; /** (1) xdrop threshold */
int8_t tf; /** (1) filter threshold */
int8_t gi, ge, gfa, gfb; /** (4) negative integers */
uint8_t aflen, bflen; /** (2) short-gap length thresholds */
uint8_t ofs, _pad1[7]; /** (16) */
/** 192; 64byte aligned */
_barrier(mid);
/* working buffers */
union gaba_work_s {
struct gaba_reader_work_s r; /** (192) */
struct gaba_merge_work_s m; /** (2048?) */
struct gaba_writer_work_s l; /** (192) */
} w;
/** 64byte aligned */
_barrier(tail);
};
_static_assert((sizeof(struct gaba_dp_context_s) % 64) == 0);
#define GABA_DP_CONTEXT_LOAD_OFFSET ( 0 )
#define GABA_DP_CONTEXT_LOAD_SIZE ( offsetof(struct gaba_dp_context_s, w) )
_static_assert((GABA_DP_CONTEXT_LOAD_OFFSET % 64) == 0);
_static_assert((GABA_DP_CONTEXT_LOAD_SIZE % 64) == 0);
#define _root(_t) ( (_t)->root[_dp_ctx_index(BW)] )
/**
* @struct gaba_opaque_s
*/
struct gaba_opaque_s {
void *api[8];
};
#define _export_dp_context(_t) ( \
(struct gaba_dp_context_s *)(((struct gaba_opaque_s *)(_t)) - DP_CTX_MAX + _dp_ctx_index(BW)) \
)
#define _restore_dp_context(_t) ( \
(struct gaba_dp_context_s *)(((struct gaba_opaque_s *)(_t)) - _dp_ctx_index(BW) + DP_CTX_MAX) \
)
#define _export_dp_context_global(_t) ( \
(struct gaba_dp_context_s *)(((struct gaba_opaque_s *)(_t)) - DP_CTX_MAX + _dp_ctx_index(BW)) \
)
#define _restore_dp_context_global(_t) ( \
(struct gaba_dp_context_s *)(((struct gaba_opaque_s *)(_t)) - _dp_ctx_index(BW) + DP_CTX_MAX) \
)
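/*
* note (sketch of the arithmetic): the exported handle sits (DP_CTX_MAX - _dp_ctx_index(BW))
* opaque slots before the internal context, i.e. at the api slot assigned to this bandwidth
* in the dispatcher table; _export_dp_context() and _restore_dp_context() apply the same
* shift in opposite directions and are exact inverses of each other.
*/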
/**
* @struct gaba_context_s
*
* @brief (API) an algorithmic context.
*
* @sa gaba_init, gaba_close
*/
#ifdef DEBUG_MEM
# define ROOT_BLOCK_OFFSET ( 4096 + 2048 )
#else
# define ROOT_BLOCK_OFFSET ( 4096 )
#endif
struct gaba_context_s {
/** opaque pointers for function dispatch */
struct gaba_opaque_s api[4]; /** function dispatcher, used in gaba_wrap.h */
/** 64byte aligned */
/** templates */
struct gaba_dp_context_s dp; /** template of thread-local context */
uint8_t _pad[ROOT_BLOCK_OFFSET - sizeof(struct gaba_dp_context_s) - 4 * sizeof(struct gaba_opaque_s)];
/** 64byte aligned */
/** phantom vectors */
struct gaba_root_block_s ph[3]; /** (768) template of root vectors, [0] for 16-cell, [1] for 32-cell, [2] for 64-cell */
/** 64byte aligned */
};
_static_assert(offsetof(struct gaba_context_s, ph) == ROOT_BLOCK_OFFSET);
#define _proot(_c, _bw) ( &(_c)->ph[_dp_ctx_index(_bw)] )
/**
* @enum GABA_BLK_STATUS
* head states and intermediate states are mutually exclusive
*/
enum GABA_BLK_STATUS {
/* intermediate states */
CONT = 0, /* continue */
ZERO = 0x01, /* internal use */
TERM = 0x80, /* sign bit */
STAT_MASK = ZERO | TERM | CONT,
/* head states */
HEAD = 0x20,
MERGE = 0x40, /* merged head; the corresponding block contains no actual vector (DP cell) */
ROOT = HEAD | MERGE
};
_static_assert((int8_t)TERM < 0); /* make sure TERM is recognized as a negative value */
_static_assert((int32_t)CONT<<8 == (int32_t)GABA_CONT);
_static_assert((int32_t)TERM<<8 == (int32_t)GABA_TERM);
/**
* coordinate conversion macros
*/
// #define _rev(pos, len) ( (len) + (uint64_t)(len) - (uint64_t)(pos) - 1 )
#define _rev(pos) ( GABA_EOU + (uint64_t)GABA_EOU - (uint64_t)(pos) - 1 )
#define _roundup(x, base) ( ((x) + (base) - 1) & ~((base) - 1) )
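/*
* illustrative values: _roundup(13, 8) == 16 and _roundup(16, 8) == 16; base must be a
* power of two for the mask arithmetic to hold. _rev() mirrors a position pointer around
* the GABA_EOU boundary, converting between forward and reverse-complement coordinates
* (see fill_fetch_seq_a below).
*/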
/**
* max and min
*/
#define MAX2(x,y) ( (x) > (y) ? (x) : (y) )
#define MAX3(x,y,z) ( MAX2(x, MAX2(y, z)) )
#define MAX4(w,x,y,z) ( MAX2(MAX2(w, x), MAX2(y, z)) )
#define MIN2(x,y) ( (x) < (y) ? (x) : (y) )
#define MIN3(x,y,z) ( MIN2(x, MIN2(y, z)) )
#define MIN4(w,x,y,z) ( MIN2(MIN2(w, x), MIN2(y, z)) )
/**
* @fn gaba_malloc, gaba_free
* @brief a pair of malloc and free, aligned and margined.
* any pointer created by gaba_malloc MUST be freed by gaba_free.
*/
static _force_inline
void *gaba_malloc(
size_t size)
{
void *ptr = NULL;
/* roundup to align boundary, add margin at the head and tail */
size = _roundup(size, MEM_ALIGN_SIZE);
if(posix_memalign(&ptr, MEM_ALIGN_SIZE, size + 2 * MEM_MARGIN_SIZE) != 0) {
debug("posix_memalign failed");
trap(); return(NULL);
}
debug("posix_memalign(%p), size(%lu)", ptr, size);
return(ptr + MEM_MARGIN_SIZE);
}
static _force_inline
void gaba_free(
void *ptr)
{
debug("free(%p)", ptr - MEM_MARGIN_SIZE);
free(ptr - MEM_MARGIN_SIZE);
return;
}
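/*
* resulting layout (illustrative): the raw allocation is
*   [ MEM_MARGIN_SIZE ][ rounded-up user block ][ MEM_MARGIN_SIZE ]
* and the pointer returned to the caller points just past the head margin.
* gaba_free() undoes the head offset before calling free(), which is why the
* two functions must always be used as a pair.
*/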
/* matrix fill functions */
/* direction macros */
/**
* @struct gaba_dir_s
*/
struct gaba_dir_s {
uint32_t mask;
int32_t acc; /* use 32bit int to avoid (sometimes inefficient) 8bit and 16bit operations on x86_64 GP registers */
};
/**
* @macro _dir_init
*/
#define _dir_init(_blk) ((struct gaba_dir_s){ .mask = 0, .acc = (_blk)->acc })
/**
* @macro _dir_fetch
*/
#define _dir_fetch(_d) { \
(_d).mask <<= 1; (_d).mask |= (uint32_t)((_d).acc < 0); \
debug("fetched dir(%x), %s", (_d).mask, _dir_is_down(_d) ? "go down" : "go right"); \
}
/**
* @macro _dir_update
* @brief update direction determiner for the next band
*/
#define _dir_update(_d, _vector) { \
(_d).acc += _ext_n(_vector, 0) - _ext_n(_vector, _W-1); \
/*debug("acc(%d), (%d, %d)", _dir_acc, _ext_n(_vector, 0), _ext_n(_vector, _W-1));*/ \
}
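/*
* heuristic behind the accumulator (as implemented above): acc accumulates the score
* difference between the two edge lanes of the band (lane 0 minus lane _W-1). when
* lane _W-1 repeatedly dominates, acc drifts negative and _dir_fetch selects the down
* direction, otherwise right; this keeps the band centered on the score maximum.
*/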
/**
* @macro _dir_adjust_remainder
* @brief adjust direction array when termination is detected in the middle of the block
*/
#define _dir_adjust_remainder(_d, _filled_count) { \
debug("adjust remainder, array(%x), shifted array(%x)", (_d).mask, (_d).mask<<(BLK - (_filled_count))); \
(_d).mask <<= (BLK - (_filled_count)); \
}
/**
* @macro _dir_is_down, _dir_is_right
* @brief direction indicator (_dir_is_down returns true if dir == down)
*/
#define _dir_mask_is_down(_mask) ( (_mask) & 0x01 )
#define _dir_mask_is_right(_mask) ( ~(_mask) & 0x01 )
#define _dir_is_down(_d) ( _dir_mask_is_down((_d).mask) )
#define _dir_is_right(_d) ( _dir_mask_is_right((_d).mask) )
#define _dir_bcnt(_d) ( popcnt((_d).mask) ) /* count vertical transitions */
#define _dir_mask_windback(_mask) { (_mask) >>= 1; }
#define _dir_windback(_d) { (_d).mask >>= 1; }
/**
* @macro _dir_save
*/
#define _dir_save(_blk, _d) { \
(_blk)->dir_mask = (_d).mask; /* store mask */ \
(_blk)->acc = (_d).acc; /* store accumulator */ \
}
/**
* @macro _dir_load
*/
#define _dir_mask_load(_blk, _cnt) ( (_blk)->dir_mask>>(BLK - (_cnt)) )
#define _dir_load(_blk, _cnt) ( \
(struct gaba_dir_s){ \
.mask = _dir_mask_load(_blk, _cnt), \
.acc = (_blk)->acc \
} \
/*debug("load dir cnt(%d), mask(%x), shifted mask(%x)", (int32_t)_filled_count, _d.mask, _d.mask>>(BLK - (_filled_count)));*/ \
)
/**
* seqreader macros
*/
#define _rd_bufa_base(k) ( (k)->w.r.bufa + BLK + _W )
#define _rd_bufb_base(k) ( (k)->w.r.bufb )
#define _rd_bufa(k, pos, len) ( _rd_bufa_base(k) - (pos) - (len) )
#define _rd_bufb(k, pos, len) ( _rd_bufb_base(k) + (pos) )
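/*
* layout note: _rd_bufa() descends from its base as pos grows while _rd_bufb() ascends,
* so reading the two buffers at a common offset yields the character pairs on the
* current antidiagonal.
*/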
#define _lo64(v) _ext_v2i64(v, 0)
#define _hi64(v) _ext_v2i64(v, 1)
#define _lo32(v) _ext_v2i32(v, 0)
#define _hi32(v) _ext_v2i32(v, 1)
/* scoring function and sequence encoding */
/**
* @macro _max_match, _gap_h, _gap_v
* @brief calculate scores (_gap: 0 for horizontal gap, 1 for vertical gap)
*/
#define _max_match(_p) ( _hmax_v16i8(_loadu_v16i8((_p)->score_matrix)) )
#define _min_match(_p) ( -_hmax_v16i8(_sub_v16i8(_zero_v16i8(), _loadu_v16i8((_p)->score_matrix))) )
#if MODEL == LINEAR
#define _gap_h(_p, _l) ( -1 * ((_p)->gi + (_p)->ge) * (_l) )
#define _gap_v(_p, _l) ( -1 * ((_p)->gi + (_p)->ge) * (_l) )
#define _gap(_p, _d, _l) _gap_h(_p, _l)
#elif MODEL == AFFINE
#define _gap_h(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#define _gap_v(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#define _gap(_p, _d, _l) _gap_h(_p, _l)
#define _gap_e(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#define _gap_f(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#else /* MODEL == COMBINED */
#define _gap_h(_p, _l) ( MAX2(-1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l), -1 * (_p)->gfb * (_l)) )
#define _gap_v(_p, _l) ( MAX2(-1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l), -1 * (_p)->gfa * (_l)) )
#define _gap(_p, _d, _l) ( MAX2(-1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l), -1 * (&(_p)->gfb)[-(_d)] * (_l)) )
#define _gap_e(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#define _gap_f(_p, _l) ( -1 * ((_l) > 0) * (_p)->gi - (_p)->ge * (_l) )
#endif
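/*
* note on the COMBINED _gap() macro: (&(_p)->gfb)[-(_d)] evaluates to gfb when _d == 0
* (horizontal gap) and gfa when _d == 1 (vertical gap), assuming gfa is laid out
* immediately before gfb in the params struct (mirroring the gi, ge, gfa, gfb order
* in gaba_dp_context_s).
*/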
#define _ofs_h(_p) ( (_p)->gi + (_p)->ge )
#define _ofs_v(_p) ( (_p)->gi + (_p)->ge )
#define _ofs_e(_p) ( (_p)->gi )
#define _ofs_f(_p) ( (_p)->gi )
/**
* @enum BASES
*/
#if BIT == 2
enum BASES { A = 0x00, C = 0x01, G = 0x02, T = 0x03, N = 0x04 };
# define _max_match_base_a(_p) ( 0x0c )
# define _max_match_base_b(_p) ( 0x03 )
#else
enum BASES { A = 0x01, C = 0x02, G = 0x04, T = 0x08, N = 0x00 };
# define _max_match_base_a(_p) ( 0x01 )
# define _max_match_base_b(_p) ( 0x01 )
#endif
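/*
* matching strategy (sketch): with 2bit encoding, b is pre-shifted left by two bits
* (see shift_mask_b below), so _match (OR) packs an (a, b) pair into a 4bit index into
* the 16-entry score matrix. with 4bit encoding, bases are one-hot IUPAC codes
* (e.g. A | G == R == 0x05), so _match (AND) is nonzero exactly when the two codes
* share a base.
*/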
/**
* @macro _adjust_BLK, _comp_BLK, _match_BW
* NOTE: _load_v16i8 and _load_v32i8 will be moved out of the loop when loading static uint8_t const[16].
*/
#if BIT == 2
/* 2bit encoding */
static uint8_t const decode_table[16] __attribute__(( aligned(16) )) = {
'A', 'C', 'G', 'T', 'N', 'N', 'N', 'N',
'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
};
static uint8_t const comp_mask_a[16] __attribute__(( aligned(16) )) = {
0x03, 0x02, 0x01, 0x00, 0x04, 0x04, 0x04, 0x04,
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04
};
static uint8_t const shift_mask_b[16] __attribute__(( aligned(16) )) = {
0x00, 0x04, 0x08, 0x0c, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
};
static uint8_t const compshift_mask_b[16] __attribute__(( aligned(16) )) = {
0x0c, 0x08, 0x04, 0x00, 0x02, 0x02, 0x02, 0x02,
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
};
/* q-fetch (anti-diagonal matching for vector calculation) */
#ifdef UNSAFE_FETCH
# define _fwaq_v16i8(_v) ( _shuf_v16i8((_load_v16i8(comp_mask_a)), (_v)) )
# define _fwaq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask_a))), (_v)) )
# define _rvaq_v16i8(_v) ( _swap_v16i8((_v)) )
# define _rvaq_v32i8(_v) ( _swap_v32i8((_v)) )
# define _fwbq_v16i8(_v) ( _shuf_v16i8((_load_v16i8(shift_mask_b)), (_v)) )
# define _fwbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(shift_mask_b))), (_v)) )
# define _rvbq_v16i8(_v) ( _shuf_v16i8((_load_v16i8(compshift_mask_b)), _swap_v16i8(_v)) )
# define _rvbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(compshift_mask_b))), _swap_v32i8(_v)) )
#else
# define _fwaq_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask_a)), (_v)) ) /* _l is ignored */
# define _fwaq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask_a))), (_v)) )
# define _rvaq_v16i8(_v, _l) ( _swapn_v16i8((_v), (_l)) )
# define _rvaq_v32i8(_v) ( _swap_v32i8((_v)) )
# define _fwbq_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(shift_mask_b)), (_v)) ) /* _l is ignored */
# define _fwbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(shift_mask_b))), (_v)) )
# define _rvbq_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(compshift_mask_b)), _swapn_v16i8((_v), (_l))) )
# define _rvbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(compshift_mask_b))), _swap_v32i8(_v)) )
#endif
/* p-fetch (diagonal matching for alignment refinement) */
# define _fwap_v16i8(_v, _l) ( (_v) )
# define _rvap_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask_a)), _swapn_v16i8((_v), (_l))) )
# define _fwbp_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(shift_mask_b)), (_v)) )
# define _rvbp_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(compshift_mask_b)), _swapn_v16i8((_v), (_l))) )
# define _match_n(_a, _b) _or_n(_a, _b)
# define _match_v16i8(_a, _b) _or_v16i8(_a, _b)
#else
/* 4bit encoding */
static uint8_t const decode_table[16] __attribute__(( aligned(16) )) = {
'N', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'
};
static uint8_t const comp_mask[16] __attribute__(( aligned(16) )) = {
0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e,
0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
};
#define comp_mask_a comp_mask
#define compshift_mask_b comp_mask
/* q-fetch (anti-diagonal matching for vector calculation) */
#ifdef UNSAFE_FETCH
# define _fwaq_v16i8(_v) ( _shuf_v16i8((_load_v16i8(comp_mask)), (_v)) )
# define _fwaq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask))), (_v)) )
# define _rvaq_v16i8(_v) ( _swap_v16i8((_v)) )
# define _rvaq_v32i8(_v) ( _swap_v32i8((_v)) )
# define _fwbq_v16i8(_v) ( (_v) )
# define _fwbq_v32i8(_v) ( (_v) )
# define _rvbq_v16i8(_v) ( _shuf_v16i8((_load_v16i8(comp_mask)), _swap_v16i8(_v)) )
# define _rvbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask))), _swap_v32i8(_v)) )
#else
# define _fwaq_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask)), (_v)) ) /* _l is ignored */
# define _fwaq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask))), (_v)) )
# define _rvaq_v16i8(_v, _l) ( _swapn_v16i8((_v), (_l)) )
# define _rvaq_v32i8(_v) ( _swap_v32i8((_v)) )
# define _fwbq_v16i8(_v, _l) ( (_v) ) /* id(x); _l is ignored */
# define _fwbq_v32i8(_v) ( (_v) )
# define _rvbq_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask)), _swapn_v16i8((_v), (_l))) )
# define _rvbq_v32i8(_v) ( _shuf_v32i8((_from_v16i8_v32i8(_load_v16i8(comp_mask))), _swap_v32i8(_v)) )
#endif
/* p-fetch (diagonal matching for alignment refinement) */
# define _fwap_v16i8(_v, _l) ( (_v) )
# define _rvap_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask)), _swapn_v16i8((_v), (_l))) )
# define _fwbp_v16i8(_v, _l) ( (_v) )
# define _rvbp_v16i8(_v, _l) ( _shuf_v16i8((_load_v16i8(comp_mask)), _swapn_v16i8((_v), (_l))) )
# define _match_n(_a, _b) _and_n(_a, _b)
# define _match_v16i8(_a, _b) _and_v16i8(_a, _b)
#endif
/**
* @fn fill_fetch_seq_a
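* @brief load a segment of sequence a into bufa, reversed for antidiagonal matching:
* forward pointers (below GABA_EOU) are loaded and stored reversed, mirrored pointers
* (at or above GABA_EOU) are translated back with _rev() and complemented on load.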
*/
static _force_inline
void fill_fetch_seq_a(
struct gaba_dp_context_s *self,
uint8_t const *pos,
uint64_t len)
{
if(pos < GABA_EOU) {
debug("reverse fetch a: pos(%p), len(%lu)", pos, len);
/* reverse fetch: 2 * alen - (2 * alen - pos) + (len - 32) */
#ifdef UNSAFE_FETCH
v32i8_t ach = _loadu_v32i8(pos + (len - BLK)); /* this may touch the space before the array */
_storeu_v32i8(_rd_bufa(self, _W, len), _rvaq_v32i8(ach)); /* reverse */
#else
v32i8_t ach = _loadu_v32i8(pos); /* this will not touch the space before the array, but will touch at most 31bytes after the array */
_storeu_v32i8(_rd_bufa(self, _W, BLK), _rvaq_v32i8(ach)); /* reverse; will not invade any buf */
#endif
} else {
debug("forward fetch a: pos(%p), len(%lu), p(%p)", pos, len, _rev(pos + (len - 1)));
/* forward fetch: 2 * alen - pos */
#ifdef UNSAFE_FETCH
v32i8_t ach = _loadu_v32i8(_rev(pos + (len - 1)));
_storeu_v32i8(_rd_bufa(self, _W, len), _fwaq_v32i8(ach)); /* complement */
#else
v32i8_t ach = _loadu_v32i8(_rev(pos + (len - 1)));
// _print_v32i8(ach); _print_v32i8(_fwaq_v32i8(ach));
_storeu_v32i8(_rd_bufa(self, _W, len), _fwaq_v32i8(ach)); /* complement; will invade bufa[BLK..BLK+_W] */
#endif
}
return;
}
/**
* @fn fill_fetch_seq_a_n
* FIXME: this function invades 16bytes before bufa.
*/
static _force_inline
void fill_fetch_seq_a_n(
struct gaba_dp_context_s *self,
uint64_t ofs,
uint8_t const *pos,
uint64_t len)
{
#ifdef DEBUG
if(len > _W + BLK) {